def _add_options(self):
    """Add the 'build' tool parser and its options to the given ArgumentParser.

    Notes
    -----
        Uses the class constructor's subparser object for appending the
        tool's parser and options.

    """
    # Create the description and options for the parser. The option values
    # (e.g. 'type': 'str') are strings on purpose: dynamic_cli_options
    # resolves them when building the argparse arguments.
    description = "Create a VDJ or VJ model by executing IGoR's commandline tool via a python subprocess using default " \
                  "model parameters."
    parser_options = {
        '-seqs': {
            'metavar': '<fasta/separated>',
            'required': 'True',
            'type': 'str',
            'help': "An input FASTA or separated data file with sequences for training the model."
        },
        '-ref': {
            'metavar': ('<gene>', '<fasta>'),
            'type': 'str',
            'action': 'append',
            'nargs': 2,
            'required': 'True',
            # Fixed typo: the annotation standard is IMGT (was "IGMT").
            'help': "A gene (V, D or J) followed by a reference genome FASTA file. Note: the FASTA reference genome files "
                    "needs to conform to IMGT annotation (separated by '|' character)."
        },
        '-type': {
            'type': 'str.lower',
            'choices': ['alpha', 'beta', 'light', 'heavy'],
            'required': 'True',
            'help': 'The type of model to create. (select one: %(choices)s).'
        },
        '-n-iter': {
            'type': 'int',
            'nargs': '?',
            'help': 'The number of inference iterations to perform when creating the model (default: {}).'
                    .format(get_config_data('BUILD', 'NUM_ITERATIONS', 'int'))
        }
    }

    # Add the options to the parser and return the updated parser.
    parser_tool = self.subparsers.add_parser('build', help=description, description=description)
    parser_tool = dynamic_cli_options(parser=parser_tool, options=parser_options)
def _add_options(self):
    """Add the 'locate' tool parser and its options to the input ArgumentParser.

    Notes
    -----
        Uses the class constructor's subparser object for appending the
        tool's parser and options.

    """
    # Create the description and options for the parser.
    # Fixed typos in user-facing text: "seach" -> "search",
    # "respectivly" -> "respectively", "IGMT" -> "IMGT".
    description = "Create an alignment for the given reference genome FASTA files and search the given alignment for " \
                  "conserved motif regions. The located CDR3 anchors can be used for the other tools."
    parser_options = {
        '-ref': {
            'metavar': ('<gene>', '<fasta>'),
            'type': 'str',
            'action': 'append',
            'nargs': 2,
            'required': 'True',
            'help': "A gene (V or J) followed by a reference genome FASTA file. Note: the FASTA reference genome files "
                    "needs to conform to IMGT annotation (separated by '|' character)."
        },
        '-motif': {
            'type': 'str.upper',
            'action': 'append',
            'help': "The motifs to look for (default: 'V' {} and 'J' {} respectively)."
                    .format(
                        get_config_data('LOCATE', 'V_MOTIFS').split(','),
                        get_config_data('LOCATE', 'J_MOTIFS').split(','))
        }
    }

    # Add the options to the parser and return the updated parser.
    parser_tool = self.subparsers.add_parser('locate', help=description, description=description)
    parser_tool = dynamic_cli_options(parser=parser_tool, options=parser_options)
def main():
    """Function to create the ArgumentParser containing the sub-options.

    Sets up logging, builds the top-level parser with general options,
    registers every tool subparser, applies the parsed general options to
    the global configuration, prepares a (temporary) working directory and
    finally dispatches to the selected tool's ``run`` method.
    """
    # Setting up the logger. LOGLEVEL can be overridden via the environment.
    logging.basicConfig(
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        level=os.environ.get("LOGLEVEL", "INFO")
    )
    logger = logging.getLogger(__name__)

    # Create the parser with general commands and set the subparser.
    # Defaults shown in the help text come from the current configuration.
    description = 'Create IGoR models and calculate the generation probability of V(D)J and CDR3 sequences.'
    parser_general_options = {
        '-separator': {
            'type': 'str.lower',
            'choices': ['tab', 'semi-colon', 'comma'],
            'help': 'The separator character used for input files and for writing new files (select one: %(choices)s) '
                    '(default: {}).'.format(
                        {'\t': 'tab', ';': 'semi-colon', ',': 'comma'}[get_config_data('COMMON', 'SEPARATOR')])
        },
        '-threads': {
            'type': 'int',
            'nargs': '?',
            'help': 'The number of threads the program is allowed to use (default: {}).'
                    .format(get_config_data('COMMON', 'NUM_THREADS', 'int'))
        },
        '-set-wd': {
            'type': 'str',
            'nargs': '?',
            'help': 'An optional location for writing files (default: {}).'.format(get_config_data('COMMON', 'WORKING_DIR'))
        },
        '-out-name': {
            'type': 'str',
            'nargs': '?',
            'help': 'An optional output file name. If multiple files are created, the value is used as a prefix for the file '
                    '(default: {}).'.format(get_config_data('COMMON', 'OUT_NAME'))
        },
        '-config-file': {
            'type': 'str',
            'nargs': '?',
            'help': 'An optional configuration file path for ImmunoProbs. This file is always combined with the default '
                    'configuration to make up missing values.'
        },
    }
    parser = argparse.ArgumentParser(prog='immuno-probs', description=description)
    parser = dynamic_cli_options(parser=parser, options=parser_general_options)
    subparsers = parser.add_subparsers(
        help='Supported immuno-probs options, command plus help displays more information for the option.',
        dest='subparser_name'
    )

    # Add main- and suboptions to the subparser. Each tool registers its own
    # subcommand parser through the shared 'subparsers' object.
    logger.info('Setting up ImmunoProbs commandline tools')
    try:
        cas = ConvertAdaptiveSequences(subparsers=subparsers)
        lca = LocateCdr3Anchors(subparsers=subparsers)
        bim = BuildIgorModel(subparsers=subparsers)
        ges = GenerateSequences(subparsers=subparsers)
        evs = EvaluateSequences(subparsers=subparsers)
    except (TypeError) as err:
        logger.error(str(err))
        return

    # Parse the commandline arguments and set variables.
    # NOTE: the config file is applied first, so explicit commandline options
    # afterwards override values coming from that file.
    logger.info('Parsing/formatting commandline arguments')
    try:
        parsed_arguments = parser.parse_args()
        if parsed_arguments.config_file is not None:
            set_config_data(parsed_arguments.config_file)
        if parsed_arguments.separator is not None:
            set_separator(parsed_arguments.separator)
        if parsed_arguments.threads is not None:
            set_num_threads(parsed_arguments.threads)
        if parsed_arguments.set_wd is not None:
            set_working_dir(parsed_arguments.set_wd)
        if parsed_arguments.out_name is not None:
            set_out_name(parsed_arguments.out_name)
    except (TypeError, ValueError, IOError) as err:
        logger.error(str(err))
        return

    # Create the directory paths for temporary files. The original working
    # directory is kept as 'output_dir' for final results, while the global
    # working directory is redirected to the temp location for the tools.
    logger.info('Setting up temporary system directory')
    try:
        output_dir = get_config_data('COMMON', 'WORKING_DIR')
        if get_config_data('EXPERT', 'USE_SYSTEM_TEMP', 'bool'):
            temp_dir = create_directory_path(os.path.join(tempfile.gettempdir(), get_config_data('EXPERT', 'TEMP_DIR')))
        else:
            temp_dir = create_directory_path(os.path.join(output_dir, get_config_data('EXPERT', 'TEMP_DIR')))
        set_working_dir(temp_dir)
    except (IOError, AttributeError) as err:
        logger.error(str(err))
        return

    # Execute the correct tool based on given subparser name.
    logger.info('Executing selected ImmunoProbs tool (%s)', parsed_arguments.subparser_name)
    if parsed_arguments.subparser_name == 'convert':
        cas.run(args=parsed_arguments, output_dir=output_dir)
    elif parsed_arguments.subparser_name == 'locate':
        lca.run(args=parsed_arguments, output_dir=output_dir)
    elif parsed_arguments.subparser_name == 'build':
        bim.run(args=parsed_arguments, output_dir=output_dir)
    elif parsed_arguments.subparser_name == 'generate':
        ges.run(args=parsed_arguments, output_dir=output_dir)
    elif parsed_arguments.subparser_name == 'evaluate':
        evs.run(args=parsed_arguments, output_dir=output_dir)
    else:
        logger.error('No tool selected, run help command to show all supported tools')

    # Finally, delete the temporary directory if specified.
    if get_config_data('EXPERT', 'REMOVE_TEMP_DIR', 'bool'):
        logger.info('Cleaning up working directory')
        rmtree(temp_dir, ignore_errors=True)
def _add_options(self):
    """Add the 'evaluate' tool parser and its options to the given ArgumentParser.

    Notes
    -----
        Uses the class constructor's subparser object for appending the
        tool's parser and options. The 'required' flags of several options
        are computed from sys.argv so that interdependent options (e.g.
        -custom-model with -ref/-type/-anchor) enforce each other.

    """
    # Create the description and options for the parser.
    # Fixed typos in user-facing text: "build-in" -> "built-in",
    # "A IGoR" -> "An IGoR", "IGMT" -> "IMGT".
    description = "Evaluate VDJ or VJ sequences given a custom IGoR model (or built-in) through IGoR's commandline " \
                  "tool via python subprocess. Or evaluate CDR3 sequences with the model by using OLGA."
    parser_options = {
        '-seqs': {
            'metavar': '<fasta/separated>',
            'required': 'True',
            'type': 'str',
            'help': "An input FASTA or separated data file with sequences to evaluate."
        },
        '-model': {
            'type': 'str.lower',
            'choices': get_default_model_file_paths(),
            'required': '-custom-model' not in sys.argv,
            'help': "Specify a pre-installed model for evaluation. (required if -custom-model NOT specified) "
                    "(select one: %(choices)s)."
        },
        '-ref': {
            'metavar': ('<gene>', '<fasta>'),
            'type': 'str',
            'action': 'append',
            'nargs': 2,
            'required': ('-cdr3' not in sys.argv and '-custom-model' in sys.argv),
            'help': "A gene (V, D or J) followed by a reference genome FASTA file. Note: the FASTA reference genome files "
                    "needs to conform to IMGT annotation (separated by '|' character). (required for -custom-model "
                    "without -cdr3)"
        },
        '-type': {
            'type': 'str.lower',
            'choices': ['alpha', 'beta', 'light', 'heavy'],
            'required': ('-custom-model' in sys.argv),
            'help': 'The type of the custom model to use. (select one: %(choices)s) (required for -custom-model).'
        },
        '-custom-model': {
            'metavar': ('<parameters>', '<marginals>'),
            'type': 'str',
            'nargs': 2,
            'help': 'An IGoR parameters file followed by an IGoR marginals file.'
        },
        '-anchor': {
            'metavar': ('<gene>', '<separated>'),
            'type': 'str',
            'action': 'append',
            'nargs': 2,
            'required': ('-cdr3' in sys.argv and '-custom-model' in sys.argv),
            'help': 'A gene (V or J) followed by a CDR3 anchor separated data file. Note: need to contain gene in the '
                    'first column, anchor index in the second and gene function in the third (required for -cdr3 and '
                    '-custom-model).'
        },
        '-cdr3': {
            'action': 'store_true',
            'help': 'If specified (True), CDR3 sequences should be evaluated, otherwise V(D)J sequences (default: {}).'
                    .format(get_config_data('EVALUATE', 'EVAL_CDR3', 'bool'))
        },
        '-use-allele': {
            'action': 'store_true',
            'help': "If specified (True), in combination with the '-cdr3' flag, the allele information from the gene "
                    "choice fields is used to calculate the generation probability (default: {})."
                    .format(get_config_data('EVALUATE', 'USE_ALLELE', 'bool'))
        },
    }

    # Add the options to the parser and return the updated parser.
    parser_tool = self.subparsers.add_parser('evaluate', help=description, description=description)
    parser_tool = dynamic_cli_options(parser=parser_tool, options=parser_options)
def run(self, args, output_dir):
    """Function to execute the commandline tool.

    Dispatches on the -cdr3 flag: full V(D)J sequences are evaluated by
    building and running an IGoR subprocess command, while CDR3 sequences
    are evaluated through an OLGA container. In both branches the computed
    generation probabilities are merged with the input sequences and
    written to a separated output file.

    Parameters
    ----------
    args : Namespace
        Object containing our parsed commandline arguments.
    output_dir : str
        A directory path for writing output files to.

    """
    # The commandline flag (when given) overrides the configured default.
    eval_cdr3 = get_config_data('EVALUATE', 'EVAL_CDR3', 'bool')
    if args.cdr3:
        eval_cdr3 = args.cdr3

    # If the given type of sequences evaluation is VDJ, use IGoR.
    if not eval_cdr3:

        # Add general IGoR commands.
        self.logger.info('Setting up initial IGoR command (1/4)')
        command_list = []
        working_dir = get_config_data('COMMON', 'WORKING_DIR')
        command_list.append(['set_wd', working_dir])
        command_list.append(['threads', str(get_config_data('COMMON', 'NUM_THREADS', 'int'))])

        # Add the model (built-in or custom) command depending on given.
        # NOTE(review): when neither args.model nor args.custom_model is set,
        # model_type stays unbound — presumably argparse 'required' rules
        # prevent that case; confirm against _add_options.
        self.logger.info('Processing genomic reference templates (2/4)')
        try:
            if args.model:
                files = get_default_model_file_paths(name=args.model)
                model_type = files['type']
                command_list.append([
                    'set_custom_model', files['parameters'], files['marginals']
                ])
                ref_list = ['set_genomic']
                for gene, filename in files['reference'].items():
                    ref_list.append([gene, filename])
                command_list.append(ref_list)
            elif args.custom_model:
                model_type = args.type
                command_list.append([
                    'set_custom_model',
                    copy_to_dir(working_dir, str(args.custom_model[0]), 'txt'),
                    copy_to_dir(working_dir, str(args.custom_model[1]), 'txt'),
                ])
                ref_list = ['set_genomic']
                for i in args.ref:
                    # i is a [gene, fasta path] pair from the -ref option.
                    filename = preprocess_reference_file(
                        os.path.join(working_dir, 'genomic_templates'),
                        copy_to_dir(working_dir, i[1], 'fasta'),
                        1
                    )
                    ref_list.append([i[0], filename])
                command_list.append(ref_list)
        except IOError as err:
            self.logger.error(str(err))
            return

        # Add the sequence command after pre-processing of the input file.
        self.logger.info('Pre-processing input sequence file (3/4)')
        try:
            if is_fasta(args.seqs):
                self.logger.info('FASTA input file extension detected')
                command_list.append([
                    'read_seqs',
                    copy_to_dir(working_dir, str(args.seqs), 'fasta')
                ])
            elif is_separated(args.seqs, get_config_data('COMMON', 'SEPARATOR')):
                self.logger.info('Separated input file type detected')
                # Re-write the input with ';' as separator for IGoR.
                input_seqs = preprocess_separated_file(
                    os.path.join(working_dir, 'input'),
                    copy_to_dir(working_dir, str(args.seqs), 'csv'),
                    get_config_data('COMMON', 'SEPARATOR'),
                    ';',
                    get_config_data('COMMON', 'I_COL'),
                    [get_config_data('COMMON', 'NT_COL')]
                )
                command_list.append(['read_seqs', input_seqs])
            else:
                self.logger.error(
                    'Given input sequence file could not be detected as '
                    'FASTA file or separated data type')
                return
        except (IOError, KeyError, ValueError) as err:
            self.logger.error(str(err))
            return

        # Add alignment and evaluation commands.
        self.logger.info('Adding additional variables to IGoR command (4/4)')
        command_list.append(['align', ['all']])
        command_list.append(['evaluate'])
        command_list.append(['output', ['Pgen']])

        # Execute IGoR through command line and catch error code.
        self.logger.info('Executing IGoR (this might take a while)')
        try:
            igor_cline = IgorInterface(command=command_list)
            exit_code, _, stderr, _ = igor_cline.call()
            if exit_code != 0:
                self.logger.error(
                    "An error occurred during execution of IGoR command "
                    "(exit code %s):\n%s", exit_code, stderr)
                return
        except OSError as err:
            self.logger.error(str(err))
            return

        # Read in all data frame files, based on input file type.
        self.logger.info('Processing generation probabilities')
        try:
            if is_fasta(args.seqs):
                seqs_df = read_fasta_as_dataframe(
                    file=args.seqs,
                    col=get_config_data('COMMON', 'NT_COL'))
            elif is_separated(args.seqs, get_config_data('COMMON', 'SEPARATOR')):
                seqs_df = read_separated_to_dataframe(
                    file=args.seqs,
                    separator=get_config_data('COMMON', 'SEPARATOR'),
                    index_col=get_config_data('COMMON', 'I_COL'))
            # IGoR writes its Pgen output with ';' separator and a
            # 'seq_index'/'Pgen_estimate' layout; rename to project columns.
            full_pgen_df = read_separated_to_dataframe(
                file=os.path.join(working_dir, 'output', 'Pgen_counts.csv'),
                separator=';',
                index_col='seq_index',
                cols=['Pgen_estimate'])
            full_pgen_df.index.names = [get_config_data('COMMON', 'I_COL')]
            full_pgen_df.rename(
                columns={'Pgen_estimate': get_config_data('COMMON', 'NT_P_COL')},
                inplace=True)
            # No amino acid Pgen is produced in this branch.
            full_pgen_df.loc[:, get_config_data('COMMON', 'AA_P_COL')] = numpy.nan
        except (IOError, KeyError, ValueError) as err:
            self.logger.error(str(err))
            return

        # Insert amino acid sequence column if not existent.
        self.logger.info('Formatting output dataframe')
        if (get_config_data('COMMON', 'NT_COL') in seqs_df.columns
                and not get_config_data('COMMON', 'AA_COL') in seqs_df.columns):
            seqs_df.insert(
                seqs_df.columns.get_loc(get_config_data('COMMON', 'NT_COL')) + 1,
                get_config_data('COMMON', 'AA_COL'), numpy.nan)
            seqs_df[get_config_data('COMMON', 'AA_COL')] = seqs_df[get_config_data('COMMON', 'NT_COL')].apply(nucleotides_to_aminoacids)

        # Merge IGoR generated sequence output dataframes (index join).
        full_pgen_df = seqs_df.merge(full_pgen_df, left_index=True, right_index=True)

        # Write the pandas dataframe to a separated file.
        try:
            self.logger.info('Writing evaluated data to file system')
            output_filename = get_config_data('COMMON', 'OUT_NAME')
            if not output_filename:
                output_filename = 'pgen_estimate_{}'.format(model_type)
            _, filename = write_dataframe_to_separated(
                dataframe=full_pgen_df,
                filename=output_filename,
                directory=output_dir,
                separator=get_config_data('COMMON', 'SEPARATOR'),
                index_name=get_config_data('COMMON', 'I_COL'))
            self.logger.info("Written '%s'", filename)
        except IOError as err:
            self.logger.error(str(err))
            return

    # If the given type of sequences evaluation is CDR3, use OLGA.
    elif eval_cdr3:

        # Create the directory for the output files.
        working_dir = os.path.join(get_config_data('COMMON', 'WORKING_DIR'), 'output')
        if not os.path.isdir(working_dir):
            os.makedirs(os.path.join(get_config_data('COMMON', 'WORKING_DIR'), 'output'))

        # Load the model and create the sequence evaluator.
        # For pre-installed models, anchors ship with the model (tab
        # separated); for custom models the user-supplied -anchor files are
        # read with the configured separator.
        self.logger.info('Loading the IGoR model files')
        try:
            if args.model:
                files = get_default_model_file_paths(name=args.model)
                model_type = files['type']
                model = IgorLoader(model_type=model_type,
                                   model_params=files['parameters'],
                                   model_marginals=files['marginals'])
                args.anchor = [['V', files['v_anchors']], ['J', files['j_anchors']]]
                separator = '\t'
            elif args.custom_model:
                model_type = args.type
                model = IgorLoader(model_type=model_type,
                                   model_params=args.custom_model[0],
                                   model_marginals=args.custom_model[1])
                separator = get_config_data('COMMON', 'SEPARATOR')
            for gene in args.anchor:
                anchor_file = preprocess_separated_file(
                    os.path.join(working_dir, 'cdr3_anchors'), str(gene[1]),
                    separator, ','
                )
                model.set_anchor(gene=gene[0], file=anchor_file)
            model.initialize_model()
        except (TypeError, OSError, IOError, KeyError, ValueError) as err:
            self.logger.error(str(err))
            return

        # Based on input file type, load in input file.
        self.logger.info('Pre-processing input sequence file')
        try:
            if is_fasta(args.seqs):
                self.logger.info('FASTA input file extension detected')
                seqs_df = read_fasta_as_dataframe(
                    file=args.seqs,
                    col=get_config_data('COMMON', 'NT_COL'))
            elif is_separated(args.seqs, get_config_data('COMMON', 'SEPARATOR')):
                self.logger.info('Separated input file type detected')
                seqs_df = read_separated_to_dataframe(
                    file=args.seqs,
                    separator=get_config_data('COMMON', 'SEPARATOR'),
                    index_col=get_config_data('COMMON', 'I_COL'))
            else:
                self.logger.error('Given input sequence file could not be detected as FASTA file or separated data type')
                return
        except (IOError, KeyError, ValueError) as err:
            self.logger.error(str(err))
            return

        # Evaluate the sequences.
        self.logger.info('Evaluating sequences')
        try:
            # The commandline flag (when given) overrides the config default.
            use_allele = get_config_data('EVALUATE', 'USE_ALLELE', 'bool')
            if args.use_allele:
                use_allele = args.use_allele
            seq_evaluator = OlgaContainer(
                igor_model=model,
                nt_col=get_config_data('COMMON', 'NT_COL'),
                nt_p_col=get_config_data('COMMON', 'NT_P_COL'),
                aa_col=get_config_data('COMMON', 'AA_COL'),
                aa_p_col=get_config_data('COMMON', 'AA_P_COL'),
                v_gene_choice_col=get_config_data('COMMON', 'V_GENE_CHOICE_COL'),
                j_gene_choice_col=get_config_data('COMMON', 'J_GENE_CHOICE_COL'))
            cdr3_pgen_df = seq_evaluator.evaluate(
                seqs=seqs_df,
                num_threads=get_config_data('COMMON', 'NUM_THREADS', 'int'),
                use_allele=use_allele,
                default_allele=get_config_data('EVALUATE', 'DEFAULT_ALLELE'))

            # Merge IGoR generated sequence output dataframes (index join).
            cdr3_pgen_df = seqs_df.merge(cdr3_pgen_df, left_index=True, right_index=True)
        except (TypeError, IOError) as err:
            self.logger.error(str(err))
            return

        # Write the pandas dataframe to a separated file.
        try:
            self.logger.info('Writing evaluated data to file system')
            output_filename = get_config_data('COMMON', 'OUT_NAME')
            if not output_filename:
                output_filename = 'pgen_estimate_{}_CDR3'.format(model_type)
            _, filename = write_dataframe_to_separated(
                dataframe=cdr3_pgen_df,
                filename=output_filename,
                directory=output_dir,
                separator=get_config_data('COMMON', 'SEPARATOR'),
                index_name=get_config_data('COMMON', 'I_COL'))
            self.logger.info("Written '%s'", filename)
        except IOError as err:
            self.logger.error(str(err))
            return
def run(self, args, output_dir):
    """Function to execute the commandline tool.

    For each -ref gene/FASTA pair: pre-processes the reference file, builds
    a MUSCLE alignment, locates the CDR3 anchor motifs, reformats the
    resulting dataframe to be OLGA compliant and writes it out.

    Parameters
    ----------
    args : Namespace
        Object containing our parsed commandline arguments.
    output_dir : str
        A directory path for writing output files to.

    """
    # Get the working directory.
    working_dir = get_config_data('COMMON', 'WORKING_DIR')

    # Create the alignment and locate the motifs.
    for gene in args.ref:
        self.logger.info(
            'Processing genomic reference template for %s and building MUSCLE alignment',
            gene[0])
        try:
            filename = preprocess_reference_file(
                os.path.join(working_dir, 'genomic_templates'),
                copy_to_dir(working_dir, gene[1], 'fasta'),
            )
            aligner = MuscleAligner(infile=filename)
            locator = AnchorLocator(
                alignment=aligner.get_muscle_alignment(),
                gene=gene[0])
        except (OSError, ValueError, IOError) as err:
            self.logger.error(str(err))
            return

        try:
            self.logger.info('Locating CDR3 anchors for %s', gene[0])
            # User-supplied motifs take precedence over the configured
            # per-gene defaults.
            if args.motif is not None:
                anchors_df = locator.get_indices_motifs(
                    get_config_data('COMMON', 'NUM_THREADS', 'int'),
                    *args.motif)
            else:
                if gene[0] == 'V':
                    anchors_df = locator.get_indices_motifs(
                        get_config_data('COMMON', 'NUM_THREADS', 'int'),
                        *get_config_data('LOCATE', 'V_MOTIFS').split(','))
                elif gene[0] == 'J':
                    anchors_df = locator.get_indices_motifs(
                        get_config_data('COMMON', 'NUM_THREADS', 'int'),
                        *get_config_data('LOCATE', 'J_MOTIFS').split(','))
        except ValueError as err:
            self.logger.error(str(err))
            return

        # Modify the dataframe to make it OLGA compliant: split the IMGT
        # style FASTA header ('|' separated) into gene name and function.
        self.logger.info('Formatting CDR3 anchor dataframe')
        try:
            anchors_df.insert(2, 'function', numpy.nan)
            anchors_df.rename(columns={'name': 'gene'}, inplace=True)
            anchors_df['gene'], anchors_df['function'] = zip(
                *anchors_df['gene'].apply(lambda value: (value.split('|')[
                    1], value.split('|')[3])))
        except (IndexError, ValueError):
            self.logger.error(
                "FASTA header needs to be separated by '|', needs to have gene name on index position 1 and function "
                "on index position 3: '%s'", anchors_df['gene'])
            return

        # Write the pandas dataframe to a separated file with prefix.
        try:
            # Fixed typo in log message: "acnhors" -> "anchors".
            self.logger.info('Writing CDR3 anchors for %s to system', gene[0])
            output_prefix = get_config_data('COMMON', 'OUT_NAME')
            if not output_prefix:
                output_prefix = 'gene_CDR3_anchors'
            _, filename = write_dataframe_to_separated(
                dataframe=anchors_df,
                filename='{}_{}'.format(gene[0], output_prefix),
                directory=output_dir,
                separator=get_config_data('COMMON', 'SEPARATOR'))
            self.logger.info("Written '%s' for %s gene", filename, gene[0])
        except IOError as err:
            self.logger.error(str(err))
            return
def _add_options(self):
    """Add the 'generate' tool parser and its options to the given ArgumentParser.

    Notes
    -----
        Uses the class constructor's subparser object for appending the
        tool's parser and options. The 'required' flags of several options
        are computed from sys.argv so that interdependent options (e.g.
        -custom-model with -type/-anchor) enforce each other.

    """
    # Create the description and options for the parser.
    # Fixed typos in user-facing text: "build-in" -> "built-in",
    # "A IGoR" -> "An IGoR".
    description = "Generate VDJ or VJ sequences given a custom IGoR model (or built-in) by executing IGoR's " \
                  "commandline tool via python subprocess. Or generate CDR3 sequences from the model by using OLGA."
    parser_options = {
        '-model': {
            'type': 'str.lower',
            'choices': get_default_model_file_paths(),
            'required': '-custom-model' not in sys.argv,
            'help': "Specify a pre-installed model for generation. (required if -custom-model NOT specified) "
                    "(select one: %(choices)s)."
        },
        '-type': {
            'type': 'str.lower',
            'choices': ['alpha', 'beta', 'light', 'heavy'],
            'required': ('-custom-model' in sys.argv),
            'help': 'The type of the custom model to use. (select one: %(choices)s) (required for -custom-model).'
        },
        '-anchor': {
            'metavar': ('<gene>', '<separated>'),
            'type': 'str',
            'action': 'append',
            'nargs': 2,
            'required': ('-cdr3' in sys.argv and '-custom-model' in sys.argv),
            'help': 'A gene (V or J) followed by a CDR3 anchor separated data file. Note: need to contain gene in the '
                    'first column, anchor index in the second and gene function in the third (required for -cdr3 and '
                    '-custom-model).'
        },
        '-custom-model': {
            'metavar': ('<parameters>', '<marginals>'),
            'type': 'str',
            'nargs': 2,
            'help': 'An IGoR parameters file followed by an IGoR marginals file.'
        },
        '-n-gen': {
            'type': 'int',
            'nargs': '?',
            'help': 'The number of sequences to generate (default: {}).'.format(
                get_config_data('GENERATE', 'NUM_GENERATE', 'int'))
        },
        '-cdr3': {
            'action': 'store_true',
            'help': 'If specified (True), CDR3 sequences are generated, otherwise V(D)J sequences (default: {}).'
                    .format(get_config_data('GENERATE', 'EVAL_CDR3', 'bool'))
        },
    }

    # Add the options to the parser and return the updated parser.
    parser_tool = self.subparsers.add_parser('generate', help=description, description=description)
    parser_tool = dynamic_cli_options(parser=parser_tool, options=parser_options)
def run(self, args, output_dir):
    """Function to execute the commandline tool.

    Dispatches on the -cdr3 flag: full V(D)J sequences are generated by
    building and running an IGoR subprocess command and post-processing the
    realization files, while CDR3 sequences are generated through an OLGA
    container. In both branches the generated sequences are written to a
    separated output file.

    Parameters
    ----------
    args : Namespace
        Object containing our parsed commandline arguments.
    output_dir : str
        A directory path for writing output files to.

    """
    # The commandline flag (when given) overrides the configured default.
    eval_cdr3 = get_config_data('GENERATE', 'EVAL_CDR3', 'bool')
    if args.cdr3:
        eval_cdr3 = args.cdr3

    # If the given type of sequences generation is not CDR3, use IGoR.
    if not eval_cdr3:

        # Add general igor commands.
        self.logger.info('Setting up initial IGoR command (1/3)')
        command_list = []
        working_dir = get_config_data('COMMON', 'WORKING_DIR')
        command_list.append(['set_wd', working_dir])
        command_list.append([
            'threads',
            str(get_config_data('COMMON', 'NUM_THREADS', 'int'))
        ])

        # Add the model (built-in or custom) command.
        self.logger.info('Processing IGoR model files (2/3)')
        try:
            if args.model:
                files = get_default_model_file_paths(name=args.model)
                command_list.append([
                    'set_custom_model', files['parameters'],
                    files['marginals']
                ])
            elif args.custom_model:
                command_list.append([
                    'set_custom_model',
                    copy_to_dir(working_dir, str(args.custom_model[0]), 'txt'),
                    copy_to_dir(working_dir, str(args.custom_model[1]), 'txt')
                ])
        except IOError as err:
            self.logger.error(str(err))
            return

        # Add generate command. The commandline count (when given)
        # overrides the configured default; 'noerr' skips error modelling.
        self.logger.info(
            'Adding additional variables to IGoR command (3/3)')
        if args.n_gen:
            command_list.append(['generate', str(args.n_gen), ['noerr']])
        else:
            command_list.append([
                'generate',
                str(get_config_data('GENERATE', 'NUM_GENERATE', 'int')),
                ['noerr']
            ])

        # Execute IGoR through command line and catch error code.
        self.logger.info('Executing IGoR (this might take a while)')
        try:
            igor_cline = IgorInterface(command=command_list)
            exit_code, _, stderr, _ = igor_cline.call()
            if exit_code != 0:
                self.logger.error(
                    "An error occurred during execution of IGoR command (exit code %s):\n%s",
                    exit_code, stderr)
                return
        except OSError as err:
            self.logger.error(str(err))
            return

        # Merge the generated output files together (translated).
        # IGoR writes ';' separated files under 'generated/' with a
        # 'seq_index' column; rename index/columns to project conventions.
        self.logger.info('Processing sequence realizations')
        try:
            seqs_df = read_separated_to_dataframe(file=os.path.join(
                working_dir, 'generated', 'generated_seqs_noerr.csv'),
                                                  separator=';',
                                                  index_col='seq_index',
                                                  cols=['nt_sequence'])
            seqs_df.index.names = [get_config_data('COMMON', 'I_COL')]
            seqs_df.columns = [get_config_data('COMMON', 'NT_COL')]
            seqs_df[get_config_data('COMMON', 'AA_COL')] = \
                seqs_df[get_config_data('COMMON', 'NT_COL')].apply(nucleotides_to_aminoacids)
            real_df = read_separated_to_dataframe(file=os.path.join(
                working_dir, 'generated', 'generated_realizations_noerr.csv'),
                                                  separator=';',
                                                  index_col='seq_index')
            real_df.index.names = [get_config_data('COMMON', 'I_COL')]
            # A model is needed to translate the realization indices into
            # gene choice names.
            if args.model:
                files = get_default_model_file_paths(name=args.model)
                model_type = files['type']
                model = IgorLoader(model_type=model_type,
                                   model_params=files['parameters'],
                                   model_marginals=files['marginals'])
            elif args.custom_model:
                model_type = args.type
                model = IgorLoader(model_type=model_type,
                                   model_params=args.custom_model[0],
                                   model_marginals=args.custom_model[1])
            real_df = self._process_realizations(
                data=real_df,
                model=model,
                v_gene_choice_col=get_config_data('COMMON', 'V_GENE_CHOICE_COL'),
                d_gene_choice_col=get_config_data('COMMON', 'D_GENE_CHOICE_COL'),
                j_gene_choice_col=get_config_data('COMMON', 'J_GENE_CHOICE_COL'))
            full_seqs_df = seqs_df.merge(real_df, left_index=True, right_index=True)
        except (IOError, KeyError, ValueError) as err:
            self.logger.error(str(err))
            return

        # Write the pandas dataframe to a separated file.
        try:
            self.logger.info('Writing generated sequences to file system')
            output_filename = get_config_data('COMMON', 'OUT_NAME')
            if not output_filename:
                output_filename = 'generated_seqs_{}'.format(model_type)
            _, filename = write_dataframe_to_separated(
                dataframe=full_seqs_df,
                filename=output_filename,
                directory=output_dir,
                separator=get_config_data('COMMON', 'SEPARATOR'),
                index_name=get_config_data('COMMON', 'I_COL'))
            self.logger.info("Written '%s'", filename)
        except IOError as err:
            self.logger.error(str(err))
            return

    # If the given type of sequences generation is CDR3, use OLGA.
    elif eval_cdr3:

        # Get the working directory.
        working_dir = get_config_data('COMMON', 'WORKING_DIR')

        # Load the model, create the sequence generator and generate the sequences.
        # For pre-installed models, anchors ship with the model (tab
        # separated); for custom models the user-supplied -anchor files are
        # read with the configured separator.
        self.logger.info('Loading the IGoR model files')
        try:
            if args.model:
                files = get_default_model_file_paths(name=args.model)
                model_type = files['type']
                model = IgorLoader(model_type=model_type,
                                   model_params=files['parameters'],
                                   model_marginals=files['marginals'])
                args.anchor = [['V', files['v_anchors']], ['J', files['j_anchors']]]
                separator = '\t'
            elif args.custom_model:
                model_type = args.type
                model = IgorLoader(model_type=model_type,
                                   model_params=args.custom_model[0],
                                   model_marginals=args.custom_model[1])
                separator = get_config_data('COMMON', 'SEPARATOR')
            for gene in args.anchor:
                anchor_file = preprocess_separated_file(
                    os.path.join(working_dir, 'cdr3_anchors'), str(gene[1]),
                    separator, ',')
                model.set_anchor(gene=gene[0], file=anchor_file)
            model.initialize_model()
        except (TypeError, OSError, IOError, KeyError, ValueError) as err:
            self.logger.error(str(err))
            return

        # Setup the sequence generator and generate sequences.
        self.logger.info('Generating sequences')
        try:
            seq_generator = OlgaContainer(
                igor_model=model,
                nt_col=get_config_data('COMMON', 'NT_COL'),
                nt_p_col=get_config_data('COMMON', 'NT_P_COL'),
                aa_col=get_config_data('COMMON', 'AA_COL'),
                aa_p_col=get_config_data('COMMON', 'AA_P_COL'),
                v_gene_choice_col=get_config_data('COMMON', 'V_GENE_CHOICE_COL'),
                j_gene_choice_col=get_config_data('COMMON', 'J_GENE_CHOICE_COL'))
            # The commandline count (when given) overrides the config default.
            n_generate = get_config_data('GENERATE', 'NUM_GENERATE', 'int')
            if args.n_gen:
                n_generate = args.n_gen
            if n_generate > 0:
                cdr3_seqs_df = seq_generator.generate(num_seqs=n_generate)
            else:
                self.logger.error(
                    'Number of sequences to generate should be higher 0')
                return
        except (TypeError, IOError) as err:
            self.logger.error(str(err))
            return

        # Write the pandas dataframe to a separated file.
        try:
            self.logger.info('Writing generated sequences to file system')
            output_filename = get_config_data('COMMON', 'OUT_NAME')
            if not output_filename:
                output_filename = 'generated_seqs_{}_CDR3'.format(
                    model_type)
            _, filename = write_dataframe_to_separated(
                dataframe=cdr3_seqs_df,
                filename=output_filename,
                directory=output_dir,
                separator=get_config_data('COMMON', 'SEPARATOR'),
                index_name=get_config_data('COMMON', 'I_COL'))
            self.logger.info("Written '%s'", filename)
        except IOError as err:
            self.logger.error(str(err))
            return
def run(self, args, output_dir):
    """Function to execute the commandline tool.

    Builds an IGoR inference command (working dir, threads, genomic
    references, initial model parameters, input sequences, alignment and
    inference iterations), executes it as a subprocess and copies the
    resulting model files to the output directory.

    Parameters
    ----------
    args : Namespace
        Object containing our parsed commandline arguments.
    output_dir : str
        A directory path for writing output files to.

    """
    # Add general igor commands.
    self.logger.info('Setting up initial IGoR command (1/5)')
    command_list = []
    working_dir = get_config_data('COMMON', 'WORKING_DIR')
    command_list.append(['set_wd', working_dir])
    command_list.append(
        ['threads', str(get_config_data('COMMON', 'NUM_THREADS', 'int'))])

    # Add sequence and file paths commands.
    self.logger.info('Processing genomic reference templates (2/5)')
    try:
        ref_list = ['set_genomic']
        for i in args.ref:
            # i is a [gene, fasta path] pair from the -ref option.
            filename = preprocess_reference_file(
                os.path.join(working_dir, 'genomic_templates'),
                copy_to_dir(working_dir, i[1], 'fasta'),
                1)
            ref_list.append([i[0], filename])
        command_list.append(ref_list)
    except IOError as err:
        self.logger.error(str(err))
        return

    # Set the initial model parameters using a built-in model matching the
    # requested chain type.
    self.logger.info('Setting initial model parameters (3/5)')
    if args.type in ['beta', 'heavy']:
        command_list.append([
            'set_custom_model',
            get_default_model_file_paths(name='human-t-beta')['parameters']
        ])
    elif args.type in ['alpha', 'light']:
        command_list.append([
            'set_custom_model',
            get_default_model_file_paths(
                name='human-t-alpha')['parameters']
        ])

    # Add the sequence command after pre-processing of the input file.
    self.logger.info('Pre-processing input sequence file (4/5)')
    try:
        if is_fasta(args.seqs):
            self.logger.info('FASTA input file extension detected')
            command_list.append([
                'read_seqs',
                copy_to_dir(working_dir, str(args.seqs), 'fasta')
            ])
        elif is_separated(args.seqs, get_config_data('COMMON', 'SEPARATOR')):
            self.logger.info('Separated input file type detected')
            try:
                # Re-write the input with ';' as separator for IGoR.
                input_seqs = preprocess_separated_file(
                    os.path.join(working_dir, 'input'),
                    copy_to_dir(working_dir, str(args.seqs), 'csv'),
                    get_config_data('COMMON', 'SEPARATOR'),
                    ';',
                    get_config_data('COMMON', 'I_COL'),
                    [get_config_data('COMMON', 'NT_COL')])
                command_list.append(['read_seqs', input_seqs])
            except (KeyError, ValueError) as err:
                self.logger.error(
                    "Given input sequence file does not have a '%s' column",
                    get_config_data('COMMON', 'NT_COL'))
                return
        else:
            self.logger.error(
                'Given input sequence file could not be detected as '
                'FASTA file or separated data type')
            return
    except (IOError, KeyError) as err:
        self.logger.error(str(err))
        return

    # Add alignment command and inference commands. The commandline
    # iteration count (when given) overrides the configured default.
    self.logger.info('Adding additional variables to IGoR command (5/5)')
    command_list.append(['align', ['all']])
    if args.n_iter:
        command_list.append(['infer', ['N_iter', str(args.n_iter)]])
    else:
        command_list.append([
            'infer',
            [
                'N_iter',
                str(get_config_data('BUILD', 'NUM_ITERATIONS', 'int'))
            ]
        ])

    # Execute IGoR through command line and catch error code.
    self.logger.info('Executing IGoR (this might take a while)')
    try:
        igor_cline = IgorInterface(command=command_list)
        exit_code, _, stderr, _ = igor_cline.call()
        if exit_code != 0:
            self.logger.error(
                "An error occurred during execution of IGoR command "
                "(exit code %s):\n%s", exit_code, stderr)
            return
    except OSError as err:
        self.logger.error(str(err))
        return

    # Copy the output files to the output directory with prefix.
    # NOTE(review): 'final_marginals.txt' and 'final_parms.txt' are copied
    # explicitly and then again by the loop over the whole 'inference'
    # directory (under different names) — looks intentional but confirm.
    try:
        self.logger.info('Writing model files to file system')
        output_prefix = get_config_data('COMMON', 'OUT_NAME')
        if not output_prefix:
            output_prefix = 'model'
        _, filename_1 = self._copy_file_to_output(
            file=os.path.join(working_dir, 'inference', 'final_marginals.txt'),
            filename='{}_marginals'.format(output_prefix),
            directory=output_dir)
        self.logger.info("Written '%s'", filename_1)
        _, filename_2 = self._copy_file_to_output(
            file=os.path.join(working_dir, 'inference', 'final_parms.txt'),
            filename='{}_params'.format(output_prefix),
            directory=output_dir)
        self.logger.info("Written '%s'", filename_2)
        for file in os.listdir(os.path.join(working_dir, 'inference')):
            _, filename = self._copy_file_to_output(
                file=os.path.join(working_dir, 'inference', file),
                filename='{}_{}'.format(output_prefix, file.split('.')[0]),
                directory=output_dir)
            self.logger.info("Written '%s'", filename)
    except IOError as err:
        self.logger.error(str(err))
        return
def _add_options(self):
    """Function for adding the parser/options to the input ArgumentParser.

    Notes
    -----
        Uses the class constructor's subparser object for appending the
        tool's parser and options.

    """
    # Create the description and options for the parser.
    description = "Converts the full length (VDJ for productive, unproductive and the total) and CDR3 sequences from a " \
        "given adaptive input sequence file. The VDJ sequences can be used to build a new IGoR model and the CDR3 " \
        "sequences can be evaluated."
    # NOTE: option attribute values are intentionally strings ('True', 'str',
    # 'int', ...) — dynamic_cli_options resolves them when constructing the
    # actual argparse arguments.
    parser_options = {
        '-seqs': {
            'metavar': '<separated>',
            'required': 'True',
            'type': 'str',
            'help': "An input separated data file with sequences to convert using the defined column names."
        },
        '-ref': {
            'metavar': ('<gene>', '<fasta>'),
            'type': 'str',
            'action': 'append',
            'nargs': 2,
            'required': 'True',
            # Fixed help text: "needs" -> "need" and "IGMT" -> "IMGT" (the
            # international ImMunoGeneTics information system annotation).
            'help': "A gene (V or J) followed by a reference genome FASTA file. Note: the FASTA reference genome files "
                    "need to conform to IMGT annotation (separated by '|' character)."
        },
        '-n-random': {
            'type': 'int',
            'nargs': '?',
            'help': "Number of random sequences (subset) to convert from the given file (default: {})."
                    .format(get_config_data('CONVERT', 'NUM_RANDOM', 'int'))
        },
        '-use-allele': {
            'action': 'store_true',
            # Fixed help text: "are used to when" -> "are used when".
            'help': "If specified (True), the allele information from the resolved gene fields are used when "
                    "reconstructing the gene choices (default: {}).".format(
                        get_config_data('CONVERT', 'USE_ALLELE', 'bool'))
        },
    }

    # Add the options to the parser and return the updated parser.
    parser_tool = self.subparsers.add_parser('convert', help=description, description=description)
    parser_tool = dynamic_cli_options(parser=parser_tool, options=parser_options)
def run(self, args, output_dir):
    """Function to execute the commandline tool.

    Pipeline: (1) pre-process the V/J reference genome FASTA files, (2) read
    and optionally subsample the adaptive input sequence file, (3) convert it
    to CDR3 and full-length (productive/unproductive/total) data frames and
    (4) write the four resulting files to the output directory. Each stage
    logs and returns early on failure.

    Parameters
    ----------
    args : Namespace
        Object containing our parsed commandline arguments.
    output_dir : str
        A directory path for writing output files to.

    """
    # Get the working directory.
    working_dir = get_config_data('COMMON', 'WORKING_DIR')

    # Collect and read in the corresponding reference genomic templates.
    self.logger.info('Processing genomic reference templates')
    try:
        # args.ref is a list of [gene, fasta-path] pairs (argparse
        # action='append' with nargs=2).
        for gene in args.ref:
            filename = preprocess_reference_file(
                os.path.join(working_dir, 'genomic_templates'),
                copy_to_dir(working_dir, gene[1], 'fasta'),
            )
            # NOTE(review): if the user supplies no 'V' (or no 'J') entry,
            # v_gene_df (or j_gene_df) stays unbound and the asc.convert()
            # call below raises NameError instead of a clear error message —
            # confirm whether upstream validation guarantees both genes.
            if gene[0] == 'V':
                v_gene_df = self._process_gene_df(
                    filename=filename,
                    nt_col=get_config_data('COMMON', 'NT_COL'),
                    resolved_col=get_config_data('COMMON', 'V_RESOLVED_COL'))
            if gene[0] == 'J':
                j_gene_df = self._process_gene_df(
                    filename=filename,
                    nt_col=get_config_data('COMMON', 'NT_COL'),
                    resolved_col=get_config_data('COMMON', 'J_RESOLVED_COL'))
    except (IOError, KeyError, ValueError) as err:
        self.logger.error(str(err))
        return

    # Read in the sequence data.
    self.logger.info('Pre-processing input sequence file')
    try:
        # Only the columns needed for conversion are loaded.
        seqs_df = read_separated_to_dataframe(
            file=args.seqs,
            separator=get_config_data('COMMON', 'SEPARATOR'),
            cols=[
                get_config_data('COMMON', 'NT_COL'),
                get_config_data('COMMON', 'AA_COL'),
                get_config_data('COMMON', 'FRAME_TYPE_COL'),
                get_config_data('COMMON', 'CDR3_LENGTH_COL'),
                get_config_data('COMMON', 'V_RESOLVED_COL'),
                get_config_data('COMMON', 'J_RESOLVED_COL')
            ])

        # Take a random subsample of sequences in the file.
        # Commandline value overrides the configured default; 0 means "no
        # subsampling".
        n_random = get_config_data('CONVERT', 'NUM_RANDOM', 'int')
        if args.n_random:
            n_random = args.n_random
        if n_random != 0:
            if len(seqs_df) < n_random:
                # NOTE(review): the warning text says "all rows are used",
                # but the following 'return' aborts the tool instead of
                # falling through with the full file — message and behavior
                # contradict each other (also "higher then" should read
                # "higher than"); confirm intended behavior before changing.
                self.logger.warning(
                    'Number of random sequences is higher then number of '
                    'rows in file, all rows are used')
                return
    except (IOError, KeyError, ValueError) as err:
        self.logger.error(str(err))
        return

    # Setup the data convertor class and convert data.
    self.logger.info('Converting adaptive file format')
    try:
        # Commandline flag (store_true) overrides the configured default.
        use_allele = get_config_data('CONVERT', 'USE_ALLELE', 'bool')
        if args.use_allele:
            use_allele = args.use_allele
        asc = AdaptiveSequenceConvertor()
        # Returns four data frames: CDR3, full-length productive,
        # full-length unproductive and full-length total.
        cdr3_df, full_prod_df, full_unprod_df, full_df = asc.convert(
            num_threads=get_config_data('COMMON', 'NUM_THREADS', 'int'),
            seqs=seqs_df,
            ref_v_genes=v_gene_df,
            ref_j_genes=j_gene_df,
            row_id_col=get_config_data('COMMON', 'ROW_ID_COL'),
            nt_col=get_config_data('COMMON', 'NT_COL'),
            aa_col=get_config_data('COMMON', 'AA_COL'),
            frame_type_col=get_config_data('COMMON', 'FRAME_TYPE_COL'),
            cdr3_length_col=get_config_data('COMMON', 'CDR3_LENGTH_COL'),
            v_resolved_col=get_config_data('COMMON', 'V_RESOLVED_COL'),
            v_gene_choice_col=get_config_data('COMMON', 'V_GENE_CHOICE_COL'),
            j_resolved_col=get_config_data('COMMON', 'J_RESOLVED_COL'),
            j_gene_choice_col=get_config_data('COMMON', 'J_GENE_CHOICE_COL'),
            use_allele=use_allele,
            default_allele=get_config_data('CONVERT', 'DEFAULT_ALLELE'),
            n_random=n_random)
        # Prefix every output frame with a column holding the input file's
        # base name (without extension) so rows stay traceable to their
        # source file.
        cdr3_df.insert(0, get_config_data('COMMON', 'FILE_NAME_ID_COL'),
                       os.path.splitext(os.path.basename(args.seqs))[0])
        full_prod_df.insert(
            0, get_config_data('COMMON', 'FILE_NAME_ID_COL'),
            os.path.splitext(os.path.basename(args.seqs))[0])
        full_unprod_df.insert(
            0, get_config_data('COMMON', 'FILE_NAME_ID_COL'),
            os.path.splitext(os.path.basename(args.seqs))[0])
        full_df.insert(0, get_config_data('COMMON', 'FILE_NAME_ID_COL'),
                       os.path.splitext(os.path.basename(args.seqs))[0])
    except KeyError as err:
        self.logger.error(str(err))
        return

    # Copy the output files to the output directory with prefix.
    try:
        self.logger.info('Writing converted files to file system')
        # Fall back to a generic prefix when no output name is configured.
        output_prefix = get_config_data('COMMON', 'OUT_NAME')
        if not output_prefix:
            output_prefix = 'converted'
        _, filename_1 = write_dataframe_to_separated(
            dataframe=cdr3_df,
            filename='{}_CDR3'.format(output_prefix),
            directory=output_dir,
            separator=get_config_data('COMMON', 'SEPARATOR'),
            index_name=get_config_data('COMMON', 'I_COL'))
        self.logger.info("Written '%s'", filename_1)
        _, filename_2 = write_dataframe_to_separated(
            dataframe=full_prod_df,
            filename='{}_full_length_productive'.format(output_prefix),
            directory=output_dir,
            separator=get_config_data('COMMON', 'SEPARATOR'),
            index_name=get_config_data('COMMON', 'I_COL'))
        self.logger.info("Written '%s'", filename_2)
        _, filename_3 = write_dataframe_to_separated(
            dataframe=full_unprod_df,
            filename='{}_full_length_unproductive'.format(output_prefix),
            directory=output_dir,
            separator=get_config_data('COMMON', 'SEPARATOR'),
            index_name=get_config_data('COMMON', 'I_COL'))
        self.logger.info("Written '%s'", filename_3)
        _, filename_4 = write_dataframe_to_separated(
            dataframe=full_df,
            filename='{}_full_length'.format(output_prefix),
            directory=output_dir,
            separator=get_config_data('COMMON', 'SEPARATOR'),
            index_name=get_config_data('COMMON', 'I_COL'))
        self.logger.info("Written '%s'", filename_4)
    except IOError as err:
        self.logger.error(str(err))
        return