def correctIMGTFields(receptor, references):
    """
    Re-gap the IMGT fields of a Receptor and rebuild its germline sequence

    Arguments:
      receptor (changeo.Receptor.Receptor): Receptor object supplying the IMGT fields.
      references (dict): dictionary of IMGT-gapped references sequences.

    Returns:
      dict: the values returned by gapV, plus the rebuilt 'germline_imgt', laid over
            a template holding the keys sequence_imgt, v_germ_start_imgt,
            v_germ_length_imgt and germline_imgt. None is returned when a required
            field is missing or empty, when gapV raises a KeyError, when the junction
            does not match the gapped sequence, or when no germline could be built.
    """
    # Every one of these fields must exist and be non-empty before gapping
    try:
        complete = all((receptor.sequence_imgt,
                        receptor.v_germ_start_imgt,
                        receptor.v_germ_length_imgt,
                        receptor.v_call))
    except AttributeError:
        return None
    if not complete:
        return None

    # Insert the IMGT gaps into the V segment
    try:
        gapped = gapV(receptor.sequence_imgt,
                      receptor.v_germ_start_imgt,
                      receptor.v_germ_length_imgt,
                      receptor.v_call,
                      references)
    except KeyError as err:
        printWarning(err)
        return None

    # The junction has to reappear at offset 309 of the gapped sequence;
    # a missing junction length (TypeError) counts as a mismatch
    try:
        junction = receptor.junction
        gapped_seq = gapped['sequence_imgt']
        matched = (junction == gapped_seq[309:309 + receptor.junction_length])
    except TypeError:
        matched = False
    if not matched:
        return None

    # Rebuild the germline; only the full germline is retained
    _, germlines, _ = buildGermline(receptor, references)
    if germlines is None:
        return None
    gapped['germline_imgt'] = germlines['full']

    # Lay the gapped values over a template of empty fields
    updated = dict.fromkeys(('sequence_imgt', 'v_germ_start_imgt',
                             'v_germ_length_imgt', 'germline_imgt'))
    updated.update(gapped)
    return updated
def parseCommonArgs(args, in_arg=None, in_types=None):
    """
    Checks common arguments from getCommonArgParser and transforms output options to a dictionary

    Arguments:
      args : Argument Namespace defined by ArgumentParser.parse_args
      in_arg : String defining a non-standard input file argument to verify. The input
               argument is taken from the first of 'seq_files', the pair
               'seq_files_1'/'seq_files_2', 'db_files', 'primer_file' and in_arg
               that is present in args.
      in_types : List of types (file extensions as strings) to allow for files in in_arg;
                 compared in lower case. If None do not check type.

    Returns:
      dict : Dictionary copy of args with output arguments embedded in the dictionary out_args
    """
    # NOTE(review): printError is assumed to terminate the program; the code
    # after each call relies on that - confirm against its definition.

    def _toList(value):
        # Normalize None, a single file name, or a sequence of names to a list
        if not value:
            return []
        if isinstance(value, (list, tuple)):
            return list(value)
        return [value]

    db_types = ['tab']
    seq_types = ['fasta', 'fastq']
    primer_types = ['fasta']
    if in_types is not None:
        # Call lower(); storing the bound method made every type check fail
        in_types = [f.lower() for f in in_types]
    args_dict = args.__dict__.copy()

    # Count input files. input_files is always a list so that the membership
    # tests against output names below never hit None or a substring match.
    if 'seq_files' in args_dict:
        input_files = _toList(args_dict['seq_files'])
        input_count = len(input_files)
    elif all(k in args_dict for k in ('seq_files_1', 'seq_files_2')):
        input_count = len(_toList(args_dict['seq_files_1']))
        input_files = _toList(args_dict['seq_files_1']) + _toList(args_dict['seq_files_2'])
    elif 'db_files' in args_dict:
        input_files = _toList(args_dict['db_files'])
        input_count = len(input_files)
    elif 'primer_file' in args_dict:
        input_count = 1
        input_files = _toList(args_dict['primer_file'])
    elif in_arg is not None and in_arg in args_dict:
        # in_arg may hold a single path; count files, not characters
        input_files = _toList(args_dict[in_arg])
        input_count = len(input_files)
    else:
        printError('Cannot determine input file argument.')

    # Exit if output names or log files are specified with multiple input files
    if args_dict.get('out_name', None) is not None and input_count > 1:
        printError('The --outname argument may not be specified with multiple input files.')
    if args_dict.get('log_file', None) is not None and input_count > 1:
        printError('The --log argument may not be specified with multiple input files.')

    # Verify single-end sequence files
    if 'seq_files' in args_dict and args_dict['seq_files']:
        for f in args_dict['seq_files']:
            if not os.path.isfile(f):
                printError('Sequence file %s does not exist.' % f)
            if getFileType(f) not in seq_types:
                printError('Sequence file %s is not a supported type. Must be one: %s.'
                           % (f, ', '.join(seq_types)))

    # Verify paired-end sequence files
    if all([k in args_dict and args_dict[k] for k in ('seq_files_1', 'seq_files_2')]):
        if len(args_dict['seq_files_1']) != len(args_dict['seq_files_2']):
            printError('The -1 and -2 arguments must contain the same number of files.')
        for f1, f2 in zip(args_dict['seq_files_1'], args_dict['seq_files_2']):
            if getFileType(f1) != getFileType(f2):
                printError('Each pair of files in the -1 and -2 arguments must be the same file type.')
        for f in (args_dict['seq_files_1'] + args_dict['seq_files_2']):
            if not os.path.isfile(f):
                printError('Sequence file %s does not exist.' % f)
            if getFileType(f) not in seq_types:
                printError('Sequence file %s is not a supported type. Must be one: %s.'
                           % (f, ', '.join(seq_types)))

    # Verify database files
    if 'db_files' in args_dict and args_dict['db_files']:
        for f in args_dict['db_files']:
            if not os.path.isfile(f):
                printError('Database file %s does not exist.' % f)
            if getFileType(f) not in db_types:
                printError('Database file %s is not a supported type. Must be one: %s.'
                           % (f, ', '.join(db_types)))

    # Verify primer file
    if 'primer_file' in args_dict and args_dict['primer_file']:
        primer_file = args_dict['primer_file']
        if not os.path.isfile(primer_file):
            printError('Primer file %s does not exist.' % primer_file)
        if getFileType(primer_file) not in primer_types:
            printError('Primer file %s is not a supported type. Must be one: %s.'
                       % (primer_file, ', '.join(primer_types)))

    # Verify non-standard input files (may be files or directories)
    if in_arg is not None and in_arg in args_dict and args_dict[in_arg]:
        files = args_dict[in_arg] if isinstance(args_dict[in_arg], list) \
                else [args_dict[in_arg]]
        for f in files:
            if not os.path.exists(f):
                printError('Input %s does not exist.' % f)
            if in_types is not None and getFileType(f) not in in_types:
                printError('Input %s is not a supported type. Must be one: %s.'
                           % (f, ', '.join(in_types)))

    # Verify output file arguments and exit if anything is hinky
    if args_dict.get('out_files', None) is not None \
            or args_dict.get('out_file', None) is not None:
        if args_dict.get('out_dir', None) is not None:
            printError('The -o argument may not be specified with the --outdir argument.')
        if args_dict.get('out_name', None) is not None:
            printError('The -o argument may not be specified with the --outname argument.')
        if args_dict.get('failed', False):
            printError('The -o argument may not be specified with the --failed argument.')
        if args_dict.get('out_files', None) is not None:
            if len(args_dict['out_files']) != input_count:
                printError('The -o argument requires one output file name per input file.')
            for f in args_dict['out_files']:
                if f in input_files:
                    printError('Output files and input files cannot have the same names.')
            for f in args_dict['out_files']:
                if os.path.isfile(f):
                    printWarning('Output file %s already exists and will be overwritten.' % f)
        if args_dict.get('out_file', None) is not None:
            if args_dict['out_file'] in input_files:
                printError('Output files and input files cannot have the same names.')
            if os.path.isfile(args_dict['out_file']):
                printWarning('Output file %s already exists and will be overwritten.' % args_dict['out_file'])

    # Verify output directory
    if 'out_dir' in args_dict and args_dict['out_dir']:
        if os.path.exists(args_dict['out_dir']) and not os.path.isdir(args_dict['out_dir']):
            printError('Directory %s exists but it is not a directory.' % args_dict['out_dir'])

    # Redefine common output options as out_args dictionary; setdefault makes
    # sure each key exists so the deletion below cannot fail
    out_args = ['log_file', 'delimiter', 'separator', 'out_dir', 'out_name', 'out_type', 'failed']
    args_dict['out_args'] = {k: args_dict.setdefault(k, None) for k in out_args}
    for k in out_args:
        del args_dict[k]

    return args_dict
def insertGaps(db_file, references=None, format=default_format,
               out_file=None, out_args=default_out_args):
    """
    Inserts IMGT numbering into V fields

    Arguments:
      db_file : the database file name.
      references : folder with germline repertoire files.
                   NOTE(review): this value is passed to readGermlines unconditionally;
                   behavior for None depends on that helper - confirm.
      format : input format.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name
    """
    log = OrderedDict()
    log['START'] = 'ConvertDb'
    log['COMMAND'] = 'imgt'
    log['FILE'] = os.path.basename(db_file)
    printLog(log)

    # Define format operators
    try:
        reader, writer, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s.' % format)

    # Open input; both handles are released in the finally clause below so
    # an error while processing records does not leak them
    db_handle = open(db_file, 'rt')
    pass_handle = None
    try:
        db_iter = reader(db_handle)

        # Check for required columns
        try:
            required = ['sequence_imgt', 'v_germ_start_imgt']
            checkFields(required, db_iter.fields, schema=schema)
        except LookupError as e:
            printError(e)

        # Load references
        reference_dict = readGermlines(references)

        # Check for IMGT-gaps in germlines
        if all('...' not in x for x in reference_dict.values()):
            printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')

        # Open output writer
        if out_file is not None:
            pass_handle = open(out_file, 'w')
        else:
            pass_handle = getOutputHandle(db_file, out_label='gap',
                                          out_dir=out_args['out_dir'],
                                          out_name=out_args['out_name'],
                                          out_type=schema.out_type)
        pass_writer = writer(pass_handle, fields=db_iter.fields)

        # Count records
        result_count = countDbFile(db_file)

        # Iterate over records
        start_time = time()
        rec_count = pass_count = 0
        for rec in db_iter:
            # Print progress for previous iteration
            printProgress(rec_count, result_count, 0.05, start_time=start_time)
            rec_count += 1

            # Update IMGT fields
            imgt_dict = correctIMGTFields(rec, reference_dict)

            # Write records; records that could not be gapped are dropped
            if imgt_dict is not None:
                pass_count += 1
                rec.setDict(imgt_dict, parse=False)
                pass_writer.writeReceptor(rec)

        # Print counts
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['RECORDS'] = rec_count
        log['PASS'] = pass_count
        log['FAIL'] = rec_count - pass_count
        log['END'] = 'ConvertDb'
        printLog(log)

        output_name = pass_handle.name
    finally:
        # Close file handles
        if pass_handle is not None:
            pass_handle.close()
        db_handle.close()

    return output_name
def createGermlines(db_file, references, seq_field=default_seq_field, v_field=default_v_field,
                    d_field=default_d_field, j_field=default_j_field,
                    cloned=False, clone_field=default_clone_field, germ_types=default_germ_types,
                    format=default_format, out_file=None, out_args=default_out_args):
    """
    Write germline sequences to tab-delimited database file

    Arguments:
      db_file : input tab-delimited database file.
      references : folders and/or files containing germline repertoire data in FASTA format.
      seq_field : field in which to look for sequence.
      v_field : field in which to look for V call.
      d_field : field in which to look for D call.
      j_field : field in which to look for J call.
      cloned : if True build germlines by clone, otherwise build individual germlines.
      clone_field : field containing clone identifiers; ignored if cloned=False.
      germ_types : list of germline sequence types to be output from the set of 'full', 'dmask', 'vonly', 'regions'
      format : input and output format.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : arguments for output preferences. The dictionary is copied, not modified.

    Returns:
      dict: names of the 'pass' and 'fail' output files.
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'CreateGermlines'
    log['FILE'] = os.path.basename(db_file)
    log['GERM_TYPES'] = ','.join(germ_types)
    log['SEQ_FIELD'] = seq_field
    log['V_FIELD'] = v_field
    log['D_FIELD'] = d_field
    log['J_FIELD'] = j_field
    log['CLONED'] = cloned
    if cloned:
        log['CLONE_FIELD'] = clone_field
    printLog(log)

    # Define format operators
    try:
        reader, writer, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s' % format)
    # Work on a copy so the shared default (or the caller's dictionary) is not altered
    out_args = dict(out_args)
    out_args['out_type'] = schema.out_type

    # TODO: this won't work for AIRR necessarily
    # Define output germline fields
    germline_fields = OrderedDict()
    seq_type = seq_field.split('_')[-1]
    if 'full' in germ_types:
        germline_fields['full'] = 'germline_' + seq_type
    if 'dmask' in germ_types:
        germline_fields['dmask'] = 'germline_' + seq_type + '_d_mask'
    if 'vonly' in germ_types:
        germline_fields['vonly'] = 'germline_' + seq_type + '_v_region'
    if 'regions' in germ_types:
        germline_fields['regions'] = 'germline_regions'
    if cloned:
        germline_fields['v'] = 'germline_v_call'
        germline_fields['d'] = 'germline_d_call'
        germline_fields['j'] = 'germline_j_call'
    out_fields = getDbFields(db_file,
                             add=[schema.fromReceptor(f) for f in germline_fields.values()],
                             reader=reader)

    # Get repertoire and open Db reader
    reference_dict = readGermlines(references)
    db_handle = open(db_file, 'rt')

    # Handles are created lazily; all of them are released in the finally clause
    log_handle = None
    pass_handle, pass_writer = None, None
    fail_handle, fail_writer = None, None
    try:
        db_iter = reader(db_handle)

        # Check for required columns
        try:
            required = ['v_germ_start_imgt', 'd_germ_start', 'j_germ_start',
                        'np1_length', 'np2_length']
            checkFields(required, db_iter.fields, schema=schema)
        except LookupError as e:
            printError(e)

        # Check for IMGT-gaps in germlines
        if all('...' not in x for x in reference_dict.values()):
            printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')

        # Count input
        total_count = countDbFile(db_file)

        # Check for existence of fields
        for f in [v_field, d_field, j_field, seq_field]:
            if f not in db_iter.fields:
                printError('%s field does not exist in input database file.' % f)

        # Translate to Receptor attribute names
        v_field = schema.toReceptor(v_field)
        d_field = schema.toReceptor(d_field)
        j_field = schema.toReceptor(j_field)
        seq_field = schema.toReceptor(seq_field)
        clone_field = schema.toReceptor(clone_field)

        # Define Receptor iterator
        if cloned:
            start_time = time()
            printMessage('Sorting by clone', start_time=start_time, width=20)
            # groupby only merges adjacent records, so sort by the same key first
            sorted_records = sorted(db_iter, key=lambda x: x.getField(clone_field))
            printMessage('Done', start_time=start_time, end=True, width=20)
            receptor_iter = groupby(sorted_records, lambda x: x.getField(clone_field))
        else:
            receptor_iter = ((x.sequence_id, [x]) for x in db_iter)

        # Define log handle
        if out_args['log_file'] is not None:
            log_handle = open(out_args['log_file'], 'w')

        # Initialize counters
        rec_count, pass_count, fail_count = 0, 0, 0
        start_time = time()

        # Iterate over rows
        for key, records in receptor_iter:
            # Print progress
            printProgress(rec_count, total_count, 0.05, start_time=start_time)

            # Define iteration variables
            records = list(records)
            rec_log = OrderedDict([('ID', key)])
            rec_count += len(records)

            # Build germline for records
            if len(records) == 1:
                germ_log, germlines, genes = buildGermline(records[0], reference_dict,
                                                           seq_field=seq_field, v_field=v_field,
                                                           d_field=d_field, j_field=j_field)
            else:
                germ_log, germlines, genes = buildClonalGermline(records, reference_dict,
                                                                 seq_field=seq_field, v_field=v_field,
                                                                 d_field=d_field, j_field=j_field)
            rec_log.update(germ_log)

            # Write row to pass or fail file
            if germlines is not None:
                pass_count += len(records)

                # Add germlines to Receptor record
                annotations = {}
                if 'full' in germ_types:
                    annotations[germline_fields['full']] = germlines['full']
                if 'dmask' in germ_types:
                    annotations[germline_fields['dmask']] = germlines['dmask']
                if 'vonly' in germ_types:
                    annotations[germline_fields['vonly']] = germlines['vonly']
                if 'regions' in germ_types:
                    annotations[germline_fields['regions']] = germlines['regions']
                if cloned:
                    annotations[germline_fields['v']] = genes['v']
                    annotations[germline_fields['d']] = genes['d']
                    annotations[germline_fields['j']] = genes['j']

                # Create the pass writer on first use. An explicit check is used
                # instead of catching AttributeError so that a genuine
                # AttributeError cannot reopen (and truncate) the output file.
                if pass_writer is None:
                    if out_file is not None:
                        pass_handle = open(out_file, 'w')
                    else:
                        pass_handle = getOutputHandle(db_file,
                                                      out_label='germ-pass',
                                                      out_dir=out_args['out_dir'],
                                                      out_name=out_args['out_name'],
                                                      out_type=out_args['out_type'])
                    pass_writer = writer(pass_handle, fields=out_fields)

                # Write records
                for r in records:
                    r.setDict(annotations)
                    pass_writer.writeReceptor(r)
            else:
                fail_count += len(records)
                if out_args['failed']:
                    # Create the fail writer on first use
                    if fail_writer is None:
                        fail_handle = getOutputHandle(db_file,
                                                      out_label='germ-fail',
                                                      out_dir=out_args['out_dir'],
                                                      out_name=out_args['out_name'],
                                                      out_type=out_args['out_type'])
                        fail_writer = writer(fail_handle, fields=out_fields)
                    # The whole group is handed to the writer in one call
                    fail_writer.writeReceptor(records)

            # Write log
            printLog(rec_log, handle=log_handle)

        # Print log
        printProgress(rec_count, total_count, 0.05, start_time=start_time)
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None
        log['RECORDS'] = rec_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        log['END'] = 'CreateGermlines'
        printLog(log)

        output = {'pass': None, 'fail': None}
        if pass_handle is not None:
            output['pass'] = pass_handle.name
        if fail_handle is not None:
            output['fail'] = fail_handle.name
    finally:
        # Close file handles
        for handle in (db_handle, pass_handle, fail_handle, log_handle):
            if handle is not None:
                handle.close()

    return output
def parseIHMM(aligner_file, seq_file, repo, cellranger_file=None, partial=False, asis_id=True,
              extended=False, format=default_format, out_file=None, out_args=default_out_args):
    """
    Main for iHMMuneAlign aligned sample sequences.

    Arguments:
      aligner_file : iHMMune-Align output file to process.
      seq_file : fasta file input to iHMMuneAlign (from which to get sequence).
      repo : folder with germline repertoire files.
      cellranger_file : supplementary annotation file read with readCellRanger;
                        no supplementary annotations are added if None.
      partial : If True put incomplete alignments in the pass file.
      asis_id : if ID is to be parsed for pRESTO output with default delimiters.
      extended : if True parse alignment scores, FWR and CDR region fields.
      format : output format. One of 'changeo' or 'airr'.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
                 The dictionary is copied, not modified.

    Returns:
      dict : names of the 'pass' and 'fail' output files.
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'MakeDB'
    log['COMMAND'] = 'ihmm'
    log['ALIGNER_FILE'] = os.path.basename(aligner_file)
    log['SEQ_FILE'] = os.path.basename(seq_file)
    log['ASIS_ID'] = asis_id
    log['PARTIAL'] = partial
    log['EXTENDED'] = extended
    printLog(log)

    start_time = time()
    printMessage('Loading files', start_time=start_time, width=20)

    # Count records in sequence file
    total_count = countSeqFile(seq_file)

    # Get input sequence dictionary
    seq_dict = getSeqDict(seq_file)

    # Create germline repo dictionary
    references = readGermlines(repo)

    # Load supplementary annotation table
    if cellranger_file is not None:
        f = cellranger_extended if extended else cellranger_base
        annotations = readCellRanger(cellranger_file, fields=f)
    else:
        annotations = None

    printMessage('Done', start_time=start_time, end=True, width=20)

    # Check for IMGT-gaps in germlines
    if all('...' not in x for x in references.values()):
        printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')

    # Define format operators
    try:
        __, writer, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s.' % format)
    # Work on a copy so the shared default (or the caller's dictionary) is not altered
    out_args = dict(out_args)
    out_args['out_type'] = schema.out_type

    # Define output fields
    fields = list(schema.required)
    if extended:
        custom = IHMMuneReader.customFields(scores=True, regions=True, schema=schema)
        fields.extend(custom)

    # Parse and write output; the generators are consumed by writeDb while
    # the aligner file is still open
    with open(aligner_file, 'r') as f:
        parse_iter = IHMMuneReader(f, seq_dict, references)
        germ_iter = (addGermline(x, references) for x in parse_iter)
        output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file,
                         total_count=total_count, annotations=annotations,
                         asis_id=asis_id, partial=partial,
                         writer=writer, out_file=out_file, out_args=out_args)

    return output
def parseIgBLAST(aligner_file, seq_file, repo, amino_acid=False, cellranger_file=None, partial=False,
                 asis_id=True, asis_calls=False, extended=False, regions='default',
                 format='changeo', out_file=None, out_args=default_out_args):
    """
    Main for IgBLAST aligned sample sequences.

    Arguments:
      aligner_file (str): IgBLAST output file to process.
      seq_file (str): fasta file input to IgBlast (from which to get sequence).
      repo (str): folder with germline repertoire files.
      amino_acid (bool): if True then the IgBLAST output files are results from igblastp.
                         igblastn is assumed if False.
      cellranger_file (str): supplementary annotation file read with readCellRanger;
                             no supplementary annotations are added if None.
      partial : If True put incomplete alignments in the pass file.
      asis_id (bool): if ID is to be parsed for pRESTO output with default delimiters.
      asis_calls (bool): if True do not parse gene calls for allele names.
      extended (bool): if True add alignment scores, FWR regions, and CDR regions to the output.
      regions (str): name of the IMGT FWR/CDR region definitions to use.
      format (str): output format. one of 'changeo' or 'airr'.
      out_file (str): output file name. Automatically generated from the input file if None.
      out_args (dict): common output argument dictionary from parseCommonArgs.
                       The dictionary is copied, not modified.

    Returns:
      dict : names of the 'pass' and 'fail' output files.
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'MakeDB'
    log['COMMAND'] = 'igblast-aa' if amino_acid else 'igblast'
    log['ALIGNER_FILE'] = os.path.basename(aligner_file)
    log['SEQ_FILE'] = os.path.basename(seq_file)
    log['ASIS_ID'] = asis_id
    log['ASIS_CALLS'] = asis_calls
    log['PARTIAL'] = partial
    log['EXTENDED'] = extended
    printLog(log)

    # Set amino acid conditions; the '-aa' suffix selects the amino acid
    # variant of the format operators
    if amino_acid:
        format = '%s-aa' % format
        parser = IgBLASTReaderAA
    else:
        parser = IgBLASTReader

    # Start
    start_time = time()
    printMessage('Loading files', start_time=start_time, width=20)

    # Count records in sequence file
    total_count = countSeqFile(seq_file)

    # Get input sequence dictionary
    seq_dict = getSeqDict(seq_file)

    # Create germline repo dictionary
    references = readGermlines(repo, asis=asis_calls)

    # Load supplementary annotation table
    if cellranger_file is not None:
        f = cellranger_extended if extended else cellranger_base
        annotations = readCellRanger(cellranger_file, fields=f)
    else:
        annotations = None

    printMessage('Done', start_time=start_time, end=True, width=20)

    # Check for IMGT-gaps in germlines
    if all('...' not in x for x in references.values()):
        printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')

    # Define format operators
    try:
        __, writer, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s.' % format)
    # Work on a copy so the shared default (or the caller's dictionary) is not altered
    out_args = dict(out_args)
    out_args['out_type'] = schema.out_type

    # Define output fields
    fields = list(schema.required)
    if extended:
        custom = parser.customFields(schema=schema)
        fields.extend(custom)

    # Parse and write output; the generators are consumed by writeDb while
    # the aligner file is still open
    with open(aligner_file, 'r') as f:
        parse_iter = parser(f, seq_dict, references, regions=regions, asis_calls=asis_calls)
        germ_iter = (addGermline(x, references, amino_acid=amino_acid) for x in parse_iter)
        output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file,
                         total_count=total_count, annotations=annotations,
                         amino_acid=amino_acid, partial=partial, asis_id=asis_id,
                         regions=regions, writer=writer, out_file=out_file, out_args=out_args)

    return output
def parseIMGT(aligner_file, seq_file=None, repo=None, cellranger_file=None, partial=False, asis_id=True,
              extended=False, format=default_format, out_file=None, out_args=default_out_args):
    """
    Main for IMGT aligned sample sequences.

    Arguments:
      aligner_file : zipped file or unzipped folder output by IMGT.
      seq_file : FASTA file input to IMGT (from which to get seqID).
      repo : folder with germline repertoire files. If None germlines are not added.
      cellranger_file : supplementary annotation file read with readCellRanger;
                        no supplementary annotations are added if None.
      partial : If True put incomplete alignments in the pass file.
      asis_id : if ID is to be parsed for pRESTO output with default delimiters.
      extended : if True add alignment score, FWR, CDR and junction fields to output file.
      format : output format. one of 'changeo' or 'airr'.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.
                 The dictionary is copied, not modified.

    Returns:
      dict : names of the 'pass' and 'fail' output files.
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'MakeDb'
    log['COMMAND'] = 'imgt'
    log['ALIGNER_FILE'] = aligner_file
    log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
    log['ASIS_ID'] = asis_id
    log['PARTIAL'] = partial
    log['EXTENDED'] = extended
    printLog(log)

    start_time = time()
    printMessage('Loading files', start_time=start_time, width=20)

    # Extract IMGT files
    temp_dir, imgt_files = extractIMGT(aligner_file)

    # Everything after extraction runs under try/finally so the temporary
    # directory is removed even when parsing or writing fails
    try:
        # Count records in IMGT files
        total_count = countDbFile(imgt_files['summary'])

        # Get (parsed) IDs from fasta file submitted to IMGT
        id_dict = getIDforIMGT(seq_file) if seq_file else {}

        # Load supplementary annotation table
        if cellranger_file is not None:
            f = cellranger_extended if extended else cellranger_base
            annotations = readCellRanger(cellranger_file, fields=f)
        else:
            annotations = None

        printMessage('Done', start_time=start_time, end=True, width=20)

        # Define format operators
        try:
            __, writer, schema = getFormatOperators(format)
        except ValueError:
            printError('Invalid format %s.' % format)
        # Work on a copy so the shared default (or the caller's dictionary) is not altered
        out_args = dict(out_args)
        out_args['out_type'] = schema.out_type

        # Define output fields
        fields = list(schema.required)
        if extended:
            custom = IMGTReader.customFields(scores=True, regions=True, junction=True, schema=schema)
            fields.extend(custom)

        # Parse IMGT output and write db
        with open(imgt_files['summary'], 'r') as summary_handle, \
                open(imgt_files['gapped'], 'r') as gapped_handle, \
                open(imgt_files['ntseq'], 'r') as ntseq_handle, \
                open(imgt_files['junction'], 'r') as junction_handle:

            # Open parser
            parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle, junction_handle)

            # Add germline sequence
            if repo is None:
                germ_iter = parse_iter
            else:
                references = readGermlines(repo)
                # Check for IMGT-gaps in germlines
                if all('...' not in x for x in references.values()):
                    printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')
                germ_iter = (addGermline(x, references) for x in parse_iter)

            # Write db
            output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file,
                             total_count=total_count, annotations=annotations,
                             id_dict=id_dict, asis_id=asis_id, partial=partial,
                             writer=writer, out_file=out_file, out_args=out_args)
    finally:
        # Cleanup temp directory
        temp_dir.cleanup()

    return output
def writeDb(records, fields, aligner_file, total_count, id_dict=None, annotations=None,
            amino_acid=False, partial=False, asis_id=True, regions='default',
            writer=AIRRWriter, out_file=None, out_args=default_out_args):
    """
    Writes parsed records to an output file

    Arguments:
      records : a iterator of Receptor objects containing alignment data.
      fields : a list of ordered field names to write. The list is extended in place
               when annotation columns are added.
      aligner_file : input file name.
      total_count : number of records (for progress bar).
      id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID.
      annotations : additional annotation dictionary, indexed by sequence ID.
      amino_acid : if True do verification on amino acid fields.
      partial : if True put incomplete alignments in the pass file.
      asis_id : if ID is to be parsed for pRESTO output with default delimiters.
      regions (str): name of the IMGT FWR/CDR region definitions to use.
      writer : writer class.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      dict : names of the 'pass' and 'fail' output files; a value is None
             when the corresponding file was not created.
    """
    # Wrapper for opening handles and writers
    def _open(x, f, writer=writer, out_file=out_file):
        if out_file is not None and x == 'pass':
            handle = open(out_file, 'w')
        else:
            handle = getOutputHandle(aligner_file,
                                     out_label='db-%s' % x,
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_args['out_type'])
        return handle, writer(handle, fields=f)

    # Function to convert fasta header annotations to changeo columns
    def _changeo(f, header):
        h = [ChangeoSchema.fromReceptor(x) for x in header if x.upper() not in f]
        f.extend(h)
        return f

    # Function to convert fasta header annotations to AIRR columns
    def _airr(f, header):
        h = [AIRRSchema.fromReceptor(x) for x in header if x.lower() not in f]
        f.extend(h)
        return f

    # Function to verify IMGT-gapped sequence and junction concur
    def _imgt_check(rec):
        try:
            if amino_acid:
                rd = RegionDefinition(rec.junction_aa_length, amino_acid=amino_acid, definition=regions)
                x, y = rd.positions['junction']
                check = (rec.junction_aa == rec.sequence_aa_imgt[x:y])
            else:
                rd = RegionDefinition(rec.junction_length, amino_acid=amino_acid, definition=regions)
                x, y = rd.positions['junction']
                check = (rec.junction == rec.sequence_imgt[x:y])
        except (TypeError, AttributeError):
            check = False
        return check

    # Function to check for valid records strictly
    def _strict(rec):
        if amino_acid:
            valid = [rec.v_call and rec.v_call != 'None',
                     rec.j_call and rec.j_call != 'None',
                     rec.functional is not None,
                     rec.sequence_aa_imgt,
                     rec.junction_aa,
                     _imgt_check(rec)]
        else:
            valid = [rec.v_call and rec.v_call != 'None',
                     rec.j_call and rec.j_call != 'None',
                     rec.functional is not None,
                     rec.sequence_imgt,
                     rec.junction,
                     _imgt_check(rec)]
        return all(valid)

    # Function to check for valid records loosely
    def _gentle(rec):
        valid = [rec.v_call and rec.v_call != 'None',
                 rec.d_call and rec.d_call != 'None',
                 rec.j_call and rec.j_call != 'None']
        return any(valid)

    # Set writer class and annotation conversion function
    if writer == ChangeoWriter:
        _annotate = _changeo
    elif writer == AIRRWriter:
        _annotate = _airr
    else:
        printError('Invalid output writer.')

    # Set pass criteria
    _pass = _gentle if partial else _strict

    # Handles are created lazily; all of them are released in the finally clause
    log_handle = None
    pass_handle, pass_writer = None, None
    fail_handle, fail_writer = None, None
    try:
        # Define log handle
        if out_args['log_file'] is not None:
            log_handle = open(out_args['log_file'], 'w')

        # Initialize counters
        pass_count, fail_count = 0, 0
        start_time = time()

        # Validate and write output
        printProgress(0, total_count, 0.05, start_time=start_time)
        for i, record in enumerate(records, start=1):
            # Replace sequence description with full string, if required
            if id_dict is not None and record.sequence_id in id_dict:
                record.sequence_id = id_dict[record.sequence_id]

            # Parse sequence description into new columns
            if not asis_id:
                try:
                    ann_raw = parseAnnotation(record.sequence_id)
                    record.sequence_id = ann_raw.pop('ID')

                    # Convert to Receptor fields
                    ann_parsed = OrderedDict()
                    for k, v in ann_raw.items():
                        ann_parsed[ChangeoSchema.toReceptor(k)] = v

                    # Add annotations to Receptor and update field list;
                    # the output columns are fixed by the first record
                    record.setDict(ann_parsed, parse=True)
                    if i == 1:
                        fields = _annotate(fields, ann_parsed.keys())
                except IndexError:
                    # Could not parse pRESTO-style annotations so fall back to no parse
                    asis_id = True
                    printWarning('Sequence annotation format not recognized. Sequence headers will not be parsed.')

            # Add supplemental annotation fields
            if annotations is not None:
                record.setDict(annotations[record.sequence_id], parse=True)
                if i == 1:
                    fields = _annotate(fields, annotations[record.sequence_id].keys())

            # Count pass or fail and write to appropriate file. Writers are
            # created with an explicit None check rather than by catching
            # AttributeError, so a genuine AttributeError raised while writing
            # cannot reopen (and truncate) the output file.
            if _pass(record):
                pass_count += 1
                if pass_writer is None:
                    # Open pass file and writer
                    pass_handle, pass_writer = _open('pass', fields)
                pass_writer.writeReceptor(record)
            else:
                fail_count += 1
                # Write row to fail file if specified
                if out_args['failed']:
                    if fail_writer is None:
                        # Open fail file and writer
                        fail_handle, fail_writer = _open('fail', fields)
                    fail_writer.writeReceptor(record)

            # Write log
            if log_handle is not None:
                log = OrderedDict([('ID', record.sequence_id),
                                   ('V_CALL', record.v_call),
                                   ('D_CALL', record.d_call),
                                   ('J_CALL', record.j_call),
                                   ('PRODUCTIVE', record.functional)])
                if not _imgt_check(record) and not amino_acid:
                    log['ERROR'] = 'Junction does not match the sequence starting at position 310 in the IMGT numbered V(D)J sequence.'
                printLog(log, log_handle)

            # Print progress
            printProgress(i, total_count, 0.05, start_time=start_time)

        # Print console log
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        log['END'] = 'MakeDb'
        printLog(log)

        output = {'pass': None, 'fail': None}
        if pass_handle is not None:
            output['pass'] = pass_handle.name
        if fail_handle is not None:
            output['fail'] = fail_handle.name
    finally:
        # Close file handles, including the log file
        for handle in (pass_handle, fail_handle, log_handle):
            if handle is not None:
                handle.close()

    return output