def parseIMGT(aligner_output, seq_file=None, no_parse=True, partial=False,
              parse_scores=False, parse_regions=False, parse_junction=False,
              out_args=default_out_args):
    """
    Main for IMGT aligned sample sequences.

    Arguments:
      aligner_output : zipped file or unzipped folder output by IMGT.
      seq_file : FASTA file input to IMGT (from which to get seqID).
      no_parse : if ID is to be parsed for pRESTO output with default delimiters.
      partial : If True put incomplete alignments in the pass file.
      parse_scores : if True add alignment score fields to output file.
      parse_regions : if True add FWR and CDR region fields to output file.
      parse_junction : if True add junction subregion fields to output file.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      None
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'MakeDb'
    log['ALIGNER'] = 'IMGT'
    log['ALIGNER_OUTPUT'] = aligner_output
    log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
    log['NO_PARSE'] = no_parse
    log['PARTIAL'] = partial
    log['SCORES'] = parse_scores
    log['REGIONS'] = parse_regions
    log['JUNCTION'] = parse_junction
    printLog(log)

    start_time = time()
    printMessage('Loading sequence files', start_time=start_time, width=25)

    # Extract IMGT files
    temp_dir, imgt_files = extractIMGT(aligner_output)

    # Count records in IMGT files
    total_count = countDbFile(imgt_files['summary'])

    # Get (parsed) IDs from fasta file submitted to IMGT
    id_dict = getIDforIMGT(seq_file) if seq_file else {}
    printMessage('Done', start_time=start_time, end=True, width=25)

    # Parse IMGT output and write db.
    # All four IMGT table files are read in lockstep by IMGTReader.
    with open(imgt_files['summary'], 'r') as summary_handle, \
            open(imgt_files['gapped'], 'r') as gapped_handle, \
            open(imgt_files['ntseq'], 'r') as ntseq_handle, \
            open(imgt_files['junction'], 'r') as junction_handle:
        parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle,
                                junction_handle, parse_scores=parse_scores,
                                parse_regions=parse_regions,
                                parse_junction=parse_junction)
        file_prefix = getFilePrefix(aligner_output, out_args)
        writeDb(parse_iter, parse_iter.fields, file_prefix, total_count,
                id_dict=id_dict, no_parse=no_parse, partial=partial,
                out_args=out_args)

    # Cleanup temp directory
    # NOTE(review): temp_dir exposes .cleanup(), so extractIMGT presumably
    # returns a TemporaryDirectory-like object here — confirm against extractIMGT.
    temp_dir.cleanup()

    return None
def parseIMGT(imgt_output, seq_file=None, no_parse=True, score_fields=False, region_fields=False, out_args=default_out_args): """ Main for IMGT aligned sample sequences Arguments: imgt_output = zipped file or unzipped folder output by IMGT seq_file = FASTA file input to IMGT (from which to get seqID) no_parse = if ID is to be parsed for pRESTO output with default delimiters score_fields = if True add alignment score fields to output file region_fields = if True add FWR and CDR region fields to output file out_args = common output argument dictionary from parseCommonArgs Returns: None """ # Print parameter info log = OrderedDict() log['START'] = 'MakeDb' log['ALIGNER'] = 'IMGT' log['ALIGN_RESULTS'] = imgt_output log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else '' log['NO_PARSE'] = no_parse log['SCORE_FIELDS'] = score_fields log['REGION_FIELDS'] = region_fields printLog(log) # Get individual IMGT result files temp_dir, imgt_files = extractIMGT(imgt_output) # Formalize out_dir and file-prefix if not out_args['out_dir']: out_dir = os.path.dirname(os.path.abspath(imgt_output)) else: out_dir = os.path.abspath(out_args['out_dir']) if not os.path.exists(out_dir): os.mkdir(out_dir) if out_args['out_name']: file_prefix = out_args['out_name'] else: file_prefix = os.path.splitext(os.path.split(os.path.abspath(imgt_output))[1])[0] file_prefix = os.path.join(out_dir, file_prefix) total_count = countDbFile(imgt_files[0]) # Get (parsed) IDs from fasta file submitted to IMGT id_dict = getIDforIMGT(seq_file) if seq_file else {} # Create imgt_dict = readIMGT(imgt_files, score_fields=score_fields, region_fields=region_fields) writeDb(imgt_dict, file_prefix, total_count, id_dict=id_dict, no_parse=no_parse, score_fields=score_fields, region_fields=region_fields, out_args=out_args) # Delete temp directory rmtree(temp_dir)
def addDbFile(db_file, fields, values, out_args=default_out_args):
    """
    Adds field and value pairs to a database file

    Arguments:
      db_file = the database file name
      fields = a list of fields to add
      values = a list of values to assign to all rows of each field
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Report invocation parameters
    header_log = OrderedDict([('START', 'ParseDb'),
                              ('COMMAND', 'add'),
                              ('FILE', os.path.basename(db_file)),
                              ('FIELDS', ','.join(fields)),
                              ('VALUES', ','.join(values))])
    printLog(header_log)

    # Open the input reader and an output writer carrying the new fields
    reader = readDbFile(db_file, ig=False)
    out_handle = getOutputHandle(db_file, out_label='parse-add',
                                 out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'],
                                 out_type='tab')
    writer = getDbWriter(out_handle, db_file, add_fields=fields)

    # Total record count for progress reporting
    result_count = countDbFile(db_file)

    # Only append field/value pairs not already present in the input header
    new_columns = dict((f, v) for f, v in zip(fields, values)
                       if f not in reader.fieldnames)

    # Copy every record, appending the new columns
    start_time = time()
    rec_count = 0
    for row in reader:
        # Progress reflects the previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1
        row.update(new_columns)
        writer.writerow(row)

    # Final progress and summary log
    printProgress(rec_count, result_count, 0.05, start_time)
    footer_log = OrderedDict([('OUTPUT', os.path.basename(out_handle.name)),
                              ('RECORDS', rec_count),
                              ('END', 'ParseDb')])
    printLog(footer_log)

    # Close output handle
    out_handle.close()

    return out_handle.name
def indexDbFile(db_file, field=default_index_field, out_args=default_out_args):
    """
    Adds an index column to a database file

    Arguments:
      db_file = the database file name
      field = the name of the index field to add
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Report invocation parameters
    run_log = OrderedDict([('START', 'ParseDb'),
                           ('COMMAND', 'index'),
                           ('FILE', os.path.basename(db_file)),
                           ('FIELD', field)])
    printLog(run_log)

    # Open the input reader and a writer with the extra index column
    reader = readDbFile(db_file, ig=False)
    out_handle = getOutputHandle(db_file, out_label='parse-index',
                                 out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'],
                                 out_type='tab')
    writer = getDbWriter(out_handle, db_file, add_fields=field)

    # Total record count for progress reporting
    result_count = countDbFile(db_file)

    # Write each record with a 1-based running index in the new field
    start_time = time()
    rec_count = 0
    for idx, row in enumerate(reader, start=1):
        # Progress reflects the previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count = idx
        row.update({field: idx})
        writer.writerow(row)

    # Final progress and summary log
    printProgress(rec_count, result_count, 0.05, start_time)
    end_log = OrderedDict([('OUTPUT', os.path.basename(out_handle.name)),
                           ('RECORDS', rec_count),
                           ('END', 'ParseDb')])
    printLog(end_log)

    # Close output handle
    out_handle.close()

    return out_handle.name
def dropDbFile(db_file, fields, out_args=default_out_args):
    """
    Deletes entire fields from a database file

    Arguments:
      db_file = the database file name
      fields = a list of fields to drop
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    # Fixed copy-paste error: this is the 'drop' subcommand, not 'add'
    # (matches the 'index'/'rename'/'update' labels in sibling functions).
    log['COMMAND'] = 'drop'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    printLog(log)

    # Open file handles; the writer excludes the dropped fields from its header
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-drop',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file, exclude_fields=fields)

    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1
        # Write row
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
def indexDbFile(db_file, field=default_index_field, out_args=default_out_args):
    """
    Adds an index column to a database file

    Arguments:
      db_file = the database file name
      field = the name of the index field to add
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Log the run parameters
    info = OrderedDict()
    info['START'] = 'ParseDb'
    info['COMMAND'] = 'index'
    info['FILE'] = os.path.basename(db_file)
    info['FIELD'] = field
    printLog(info)

    # Input reader, plus an output writer extended with the index column
    records = readDbFile(db_file, ig=False)
    handle_out = getOutputHandle(db_file, out_label='parse-index',
                                 out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'],
                                 out_type='tab')
    writer_out = getDbWriter(handle_out, db_file, add_fields=field)

    # Record total for the progress bar
    total = countDbFile(db_file)

    # Stamp each record with its 1-based position and write it out
    started = time()
    n = 0
    for entry in records:
        # Progress reflects the previous iteration
        printProgress(n, total, 0.05, started)
        n += 1
        entry.update({field: n})
        writer_out.writerow(entry)

    # Final progress and summary
    printProgress(n, total, 0.05, started)
    summary = OrderedDict()
    summary['OUTPUT'] = os.path.basename(handle_out.name)
    summary['RECORDS'] = n
    summary['END'] = 'ParseDb'
    printLog(summary)

    # Close output handle
    handle_out.close()

    return handle_out.name
def convertDbFasta(db_file, id_field=default_id_field, seq_field=default_seq_field,
                   meta_fields=None, out_args=default_out_args):
    """
    Builds fasta files from database records

    Arguments:
      db_file = the database file name
      id_field = the field containing identifiers
      seq_field = the field containing sequences
      meta_fields = a list of fields to add to sequence annotations
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Log the run parameters
    run_log = OrderedDict()
    run_log['START'] = 'ParseDb'
    run_log['COMMAND'] = 'fasta'
    run_log['FILE'] = os.path.basename(db_file)
    run_log['ID_FIELD'] = id_field
    run_log['SEQ_FIELD'] = seq_field
    if meta_fields is not None:
        run_log['META_FIELDS'] = ','.join(meta_fields)
    printLog(run_log)

    # Output format is fixed to FASTA for this command
    out_type = 'fasta'
    reader = readDbFile(db_file, ig=False)
    seq_handle = getOutputHandle(db_file, out_label='sequences',
                                 out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'],
                                 out_type=out_type)

    # Record total for the progress bar
    total = countDbFile(db_file)

    # Convert each record to a SeqRecord and write it; records that fail
    # conversion (None) are counted but not written
    started = time()
    rec_count = pass_count = fail_count = 0
    for row in reader:
        # Progress reflects the previous iteration
        printProgress(rec_count, total, 0.05, started)
        rec_count += 1
        record = getDbSeqRecord(row, id_field, seq_field, meta_fields,
                                out_args['delimiter'])
        if record is None:
            fail_count += 1
        else:
            pass_count += 1
            SeqIO.write(record, seq_handle, out_type)

    # Final progress and summary log
    printProgress(rec_count, total, 0.05, started)
    end_log = OrderedDict()
    end_log['OUTPUT'] = os.path.basename(seq_handle.name)
    end_log['RECORDS'] = rec_count
    end_log['PASS'] = pass_count
    end_log['FAIL'] = fail_count
    end_log['END'] = 'ParseDb'
    printLog(end_log)

    # Close output handle
    seq_handle.close()

    return seq_handle.name
def collectDbQueue(alive, result_queue, collect_queue, db_file, task_label, out_args,
                   add_fields=None):
    """
    Pulls from results queue, assembles results and manages log and file IO

    Arguments:
      alive : multiprocessing.Value boolean controlling whether processing
              continues; when False function returns
      result_queue : multiprocessing.Queue holding worker results
      collect_queue : multiprocessing.Queue to store collector return values
      db_file : Database file name
      task_label : Task label used to tag the output files
      out_args : Common output argument dictionary from parseCommonArgs
      add_fields : List of fields added to the writer not present in the in_file;
                   if None do not add fields

    Returns:
      None : Adds a dictionary with key value pairs to collect_queue containing
            'log' defining a log object, 'out_files' defining the output file names
    """
    try:
        result_count = countDbFile(db_file)

        # Define output format
        out_type = getFileType(db_file) if out_args['out_type'] is None \
                   else out_args['out_type']

        # Defined valid alignment output handle
        pass_handle = getOutputHandle(db_file, '%s-pass' % task_label,
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_type)
        pass_writer = getDbWriter(pass_handle, db_file, add_fields=add_fields)

        # Defined failed alignment output handle
        if out_args['failed']:
            fail_handle = getOutputHandle(db_file, '%s-fail' % task_label,
                                          out_dir=out_args['out_dir'],
                                          out_name=out_args['out_name'],
                                          out_type=out_type)
            fail_writer = getDbWriter(fail_handle, db_file)
        else:
            fail_handle = None
            # Fixed: fail_writer was previously undefined on this branch
            fail_writer = None

        # Define log handle
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        # Signal sibling processes to stop before propagating the error
        alive.value = False
        raise

    try:
        # Iterator over results queue until sentinel object reached
        start_time = time()
        set_count = rec_count = pass_count = fail_count = 0
        while alive.value:
            # Get result from queue; spin while empty
            if result_queue.empty():
                continue
            else:
                result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None:
                break

            # Print progress for previous iteration
            printProgress(pass_count, result_count, 0.05, start_time)

            # Update counts for current iteration
            set_count += 1
            rec_count += result.data_count

            # Write log
            printLog(result.log, handle=log_handle)

            # Write alignments
            if result:
                pass_count += result.data_count
                for rec in result.results:
                    pass_writer.writerow(rec.toDict())
            else:
                fail_count += result.data_count
                # Fixed: failed records were previously written with
                # pass_writer; they belong in the fail file.
                if fail_writer is not None:
                    for rec in result.data:
                        fail_writer.writerow(rec.toDict())
        else:
            # Loop exited without break: alive flipped False by a sibling error
            sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Print total counts
        printProgress(pass_count, result_count, 0.05, start_time)

        # Update return values
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['RECORDS'] = rec_count
        log['GROUPS'] = set_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        collect_dict = {'log':log, 'out_files': [pass_handle.name]}
        collect_queue.put(collect_dict)

        # Close file handles
        pass_handle.close()
        if fail_handle is not None:
            fail_handle.close()
        if log_handle is not None:
            log_handle.close()
    except:
        alive.value = False
        raise

    return None
def updateDbFile(db_file, field, values, updates, out_file=None, out_args=default_out_args):
    """
    Updates field and value pairs to a database file

    Arguments:
      db_file : the database file name.
      field : the field to update.
      values : a list of values to specifying which rows to update.
      updates : a list of values to update each value with.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name
    """
    # Log the run parameters
    run_log = OrderedDict()
    run_log['START'] = 'ParseDb'
    run_log['COMMAND'] = 'update'
    run_log['FILE'] = os.path.basename(db_file)
    run_log['FIELD'] = field
    run_log['VALUES'] = ','.join(values)
    run_log['UPDATES'] = ','.join(updates)
    printLog(run_log)

    # Open the input reader; output keeps the same field set
    in_handle = open(db_file, 'rt')
    reader = TSVReader(in_handle)
    out_fields = reader.fields
    __, __, out_args['out_type'] = splitName(db_file)

    # Open the output writer
    if out_file is None:
        out_handle = getOutputHandle(db_file, out_label='parse-update',
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_args['out_type'])
    else:
        out_handle = open(out_file, 'w')
    writer = TSVWriter(out_handle, out_fields)

    # Record total for the progress bar
    total = countDbFile(db_file)

    # Process each record
    started = time()
    seen, changed = 0, 0
    for row in reader:
        # Progress reflects the previous iteration
        printProgress(seen, total, 0.05, start_time=started)
        seen += 1
        # Apply each replacement pair in order; a later pair can match the
        # result of an earlier replacement, and each match is counted
        for old, new in zip(values, updates):
            if row[field] == old:
                row[field] = new
                changed += 1
        writer.writeDict(row)

    # Final progress and summary log
    printProgress(seen, total, 0.05, start_time=started)
    end_log = OrderedDict()
    end_log['OUTPUT'] = os.path.basename(out_handle.name)
    end_log['RECORDS'] = seen
    end_log['UPDATED'] = changed
    end_log['END'] = 'ParseDb'
    printLog(end_log)

    # Close file handles
    out_handle.close()
    in_handle.close()

    return out_handle.name
def collectQueue(alive, result_queue, collect_queue, db_file, out_args,
                 cluster_func=None, cluster_args={}):
    """
    Assembles results from a queue of individual sequence results and manages log/file I/O

    Arguments:
      alive = a multiprocessing.Value boolean controlling whether processing
              continues; if False exit process
      result_queue = a multiprocessing.Queue holding processQueue results
      collect_queue = a multiprocessing.Queue to store collector return values
      db_file = the input database file name
      out_args = common output argument dictionary from parseCommonArgs
      cluster_func = the function to call for carrying out clustering on distance matrix
      cluster_args = a dictionary of arguments to pass to cluster_func

    Returns:
      None (adds 'log' and 'out_files' to collect_dict)
    """
    # NOTE(review): cluster_func and cluster_args are accepted but never used in
    # this collector body — presumably consumed by the worker side; confirm.
    # Open output files
    try:
        # Count records and define output format
        out_type = getFileType(db_file) if out_args['out_type'] is None \
                   else out_args['out_type']
        result_count = countDbFile(db_file)

        # Defined successful output handle; the writer gains a CLONE column
        pass_handle = getOutputHandle(db_file, out_label='clone-pass',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_type)
        pass_writer = getDbWriter(pass_handle, db_file, add_fields='CLONE')

        # Defined failed alignment output handle
        if out_args['failed']:
            fail_handle = getOutputHandle(db_file, out_label='clone-fail',
                                          out_dir=out_args['out_dir'],
                                          out_name=out_args['out_name'],
                                          out_type=out_type)
            fail_writer = getDbWriter(fail_handle, db_file)
        else:
            fail_handle = None
            fail_writer = None

        # Define log handle
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        #sys.stderr.write('Exception in collector file opening step\n')
        # Signal sibling processes to stop before propagating the error
        alive.value = False
        raise

    # Get results from queue and write to files
    try:
        #print 'START COLLECT', alive.value
        # Iterator over results queue until sentinel object reached
        start_time = time()
        rec_count = clone_count = pass_count = fail_count = 0
        while alive.value:
            # Get result from queue; spin while empty
            if result_queue.empty():
                continue
            else:
                result = result_queue.get()
            # Exit upon reaching sentinel (None)
            if result is None:
                break
            #print "COLLECT", alive.value, result['id']

            # Print progress for previous iteration and update record count
            if rec_count == 0:
                print('PROGRESS> Assigning clones')
            printProgress(rec_count, result_count, 0.05, start_time)
            rec_count += len(result.data)

            # Write passed and failed records. A truthy result carries clone
            # groups in result.results; each group gets the next clone number.
            if result:
                for clone in result.results.values():
                    clone_count += 1
                    for i, rec in enumerate(clone):
                        rec.annotations['CLONE'] = clone_count
                        pass_writer.writerow(rec.toDict())
                        pass_count += 1
                        result.log['CLONE%i-%i' % (clone_count, i + 1)] = str(rec.junction)
            else:
                # Failed results are logged under the pseudo-clone number 0
                for i, rec in enumerate(result.data):
                    if fail_writer is not None:
                        fail_writer.writerow(rec.toDict())
                    fail_count += 1
                    result.log['CLONE0-%i' % (i + 1)] = str(rec.junction)

            # Write log
            printLog(result.log, handle=log_handle)
        else:
            # Loop exited without break: alive flipped False by a sibling error
            sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Print total counts
        printProgress(rec_count, result_count, 0.05, start_time)

        # Close file handles
        pass_handle.close()
        if fail_handle is not None:
            fail_handle.close()
        if log_handle is not None:
            log_handle.close()

        # Update return list
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['CLONES'] = clone_count
        log['RECORDS'] = rec_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        collect_dict = {'log':log, 'out_files': [pass_handle.name]}
        collect_queue.put(collect_dict)
    except:
        #sys.stderr.write('Exception in collector result processing step\n')
        alive.value = False
        raise

    return None
def dropDbFile(db_file, fields, out_file=None, out_args=default_out_args):
    """
    Deletes entire fields from a database file

    Arguments:
      db_file : the database file name.
      fields : a list of fields to drop.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs

    Returns:
      str : output file name.
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    # Fixed copy-paste error: this is the 'drop' subcommand, not 'add'
    # (matches the 'rename'/'update' labels in sibling functions).
    log['COMMAND'] = 'drop'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    printLog(log)

    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    __, __, out_args['out_type'] = splitName(db_file)

    # Exclude dropped field from output
    out_fields = [f for f in db_iter.fields if f not in fields]

    # Open output
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file, out_label='parse-drop',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    pass_writer = TSVWriter(pass_handle, out_fields)

    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1
        # Write row
        pass_writer.writeDict(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    # Fixed: close the input handle as well, consistent with the other
    # TSVReader-based ParseDb operations in this file.
    db_handle.close()

    return pass_handle.name
def renameDbFile(db_file, fields, names, out_file=None, out_args=default_out_args):
    """
    Renames fields in a database file

    Arguments:
      db_file : the database file name.
      fields : a list of fields to rename.
      names : a list of new names for fields.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name.
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'rename'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['NAMES'] = ','.join(names)
    printLog(log)

    # Open file handles
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    __, __, out_args['out_type'] = splitName(db_file)

    # Get header and rename fields, preserving column order
    out_fields = list(db_iter.fields)
    for f, n in zip(fields, names):
        i = out_fields.index(f)
        out_fields[i] = n

    # Open writer
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file, out_label='parse-rename',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    pass_writer = TSVWriter(pass_handle, out_fields)

    # Count records
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1
        # TODO: repeating renaming is unnecessary.
        # Rename fields: move each value from the old key to the new key
        for f, n in zip(fields, names):
            rec[n] = rec.pop(f)
        # Write
        pass_writer.writeDict(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
def assembleEachGermline(db_file, repo, germ_types, v_field, seq_field, out_args=default_out_args):
    """
    Write germline sequences to tab-delimited database file

    Arguments:
      db_file = input tab-delimited database file
      repo = folder with germline repertoire files
      germ_types = types of germline sequences to be output
                   (full germline, D-region masked, only V-region germline)
      v_field = field in which to look for V call
      seq_field = field in which to look for sequence
      out_args = arguments for output preferences

    Returns:
      None
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'CreateGermlines'
    log['DB_FILE'] = os.path.basename(db_file)
    log['GERM_TYPES'] = germ_types if isinstance(germ_types, str) else ','.join(germ_types)
    log['CLONED'] = 'False'
    log['V_FIELD'] = v_field
    log['SEQ_FIELD'] = seq_field
    printLog(log)

    # Get repertoire and open Db reader
    repo_dict = getRepo(repo)
    reader = readDbFile(db_file, ig=False)

    # Exit if V call field does not exist in reader
    if v_field not in reader.fieldnames:
        sys.exit('Error: V field does not exist in input database file.')

    # Define log handle
    if out_args['log_file'] is None:
        log_handle = None
    else:
        log_handle = open(out_args['log_file'], 'w')

    # Output columns depend on which germline types were requested; the
    # column suffix is derived from the sequence field name (e.g. its _NT/_AA part)
    add_fields = []
    seq_type = seq_field.split('_')[-1]
    if 'full' in germ_types:
        add_fields += ['GERMLINE_' + seq_type]
    if 'dmask' in germ_types:
        add_fields += ['GERMLINE_' + seq_type + '_D_MASK']
    if 'vonly' in germ_types:
        add_fields += ['GERMLINE_' + seq_type + '_V_REGION']

    # Create output file handle and Db writer
    pass_handle = getOutputHandle(db_file, 'germ-pass',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type=out_args['out_type'])
    pass_writer = getDbWriter(pass_handle, db_file, add_fields=add_fields)

    if out_args['failed']:
        fail_handle = getOutputHandle(db_file, 'germ-fail',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
        fail_writer = getDbWriter(fail_handle, db_file, add_fields=add_fields)
    else:
        fail_handle = None
        fail_writer = None

    # Initialize time and total count for progress bar
    start_time = time()
    rec_count = countDbFile(db_file)
    pass_count = fail_count = 0

    # Iterate over rows
    # NOTE(review): if the input file has zero records, 'i' is unbound at the
    # final printProgress call below — TODO confirm inputs are always non-empty.
    for i,row in enumerate(reader):
        # Print progress
        printProgress(i, rec_count, 0.05, start_time)

        result_log, germlines = joinGermline(row, repo_dict, germ_types, v_field, seq_field)

        # Add germline field(s) to dictionary
        if 'full' in germ_types:
            row['GERMLINE_' + seq_type] = germlines['full']
        if 'dmask' in germ_types:
            row['GERMLINE_' + seq_type + '_D_MASK'] = germlines['dmask']
        if 'vonly' in germ_types:
            row['GERMLINE_' + seq_type + '_V_REGION'] = germlines['vonly']

        # Write row to pass or fail file; joinGermline flags failures by
        # placing an 'ERROR' key in its result log
        if 'ERROR' in result_log:
            fail_count += 1
            if fail_writer is not None:
                fail_writer.writerow(row)
        else:
            result_log['SEQUENCE'] = row[seq_field]
            result_log['GERMLINE'] = germlines['full']
            result_log['REGIONS'] = germlines['regions']
            pass_count += 1
            pass_writer.writerow(row)
        printLog(result_log, handle=log_handle)

    # Print log
    printProgress(i+1, rec_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'CreateGermlines'
    printLog(log)

    # Close file handles
    pass_handle.close()
    if fail_handle is not None:
        fail_handle.close()
    if log_handle is not None:
        log_handle.close()
def addDbFile(db_file, fields, values, out_file=None, out_args=default_out_args):
    """
    Adds field and value pairs to a database file

    Arguments:
      db_file : the database file name.
      fields : a list of fields to add.
      values : a list of values to assign to all rows of each field.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name.
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'add'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    printLog(log)

    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    __, __, out_args['out_type'] = splitName(db_file)

    # Add fields to the output header
    out_fields = list(db_iter.fields)
    out_fields.extend(fields)

    # Open output
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(db_file, out_label='parse-add',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    pass_writer = TSVWriter(pass_handle, out_fields)

    # Count records
    result_count = countDbFile(db_file)

    # Define fields and values to append; fields already present in the
    # input are left untouched
    add_dict = { k: v for k, v in zip(fields, values) if k not in db_iter.fields }

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1
        # Write updated row
        rec.update(add_dict)
        pass_writer.writeDict(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
def addDbFile(db_file, fields, values, out_args=default_out_args):
    """
    Adds field and value pairs to a database file

    Arguments:
      db_file = the database file name
      fields = a list of fields to add
      values = a list of values to assign to all rows of each field
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Log the run parameters
    info = OrderedDict()
    info['START'] = 'ParseDb'
    info['COMMAND'] = 'add'
    info['FILE'] = os.path.basename(db_file)
    info['FIELDS'] = ','.join(fields)
    info['VALUES'] = ','.join(values)
    printLog(info)

    # Input reader, plus an output writer extended with the new fields
    records = readDbFile(db_file, ig=False)
    handle_out = getOutputHandle(db_file, out_label='parse-add',
                                 out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'],
                                 out_type='tab')
    writer_out = getDbWriter(handle_out, db_file, add_fields=fields)

    # Record total for the progress bar
    total = countDbFile(db_file)

    # Pair up new fields with their constant values, skipping any field
    # that already exists in the input header
    extras = {}
    for key, val in zip(fields, values):
        if key not in records.fieldnames:
            extras[key] = val

    # Copy every record, merging in the constant columns
    started = time()
    n = 0
    for entry in records:
        # Progress reflects the previous iteration
        printProgress(n, total, 0.05, started)
        n += 1
        entry.update(extras)
        writer_out.writerow(entry)

    # Final progress and summary
    printProgress(n, total, 0.05, started)
    summary = OrderedDict()
    summary['OUTPUT'] = os.path.basename(handle_out.name)
    summary['RECORDS'] = n
    summary['END'] = 'ParseDb'
    printLog(summary)

    # Close output handle
    handle_out.close()

    return handle_out.name
def assembleCloneGermline(db_file, repo, germ_types, v_field, seq_field, out_args=default_out_args):
    """
    Assemble one germline sequence for each clone in a tab-delimited database file.

    Arguments:
      db_file = input tab-delimited database file
      repo = folder with germline repertoire files
      germ_types = types of germline sequences to be output
                   (full germline, D-region masked, only V-region germline)
      v_field = field in which to look for V call
      seq_field = field in which to look for sequence
      out_args = arguments for output preferences

    Returns:
      None
    """
    # NOTE(review): a second, newer definition of assembleCloneGermline exists
    # in this module; the later definition shadows this one at import time.
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'CreateGermlines'
    log['DB_FILE'] = os.path.basename(db_file)
    log['GERM_TYPES'] = germ_types if isinstance(germ_types, str) else ','.join(germ_types)
    log['CLONED'] = 'True'
    log['V_FIELD'] = v_field
    log['SEQ_FIELD'] = seq_field
    printLog(log)

    # Get repertoire and open Db reader
    repo_dict = getRepo(repo)
    reader = readDbFile(db_file, ig=False)

    # Exit if V call field does not exist in reader
    if v_field not in reader.fieldnames:
        sys.exit('Error: V field does not exist in input database file.')

    # Define log handle
    if out_args['log_file'] is None:
        log_handle = None
    else:
        log_handle = open(out_args['log_file'], 'w')

    # Germline output columns depend on the requested germline types;
    # seq_type is the suffix of seq_field (e.g. 'IMGT' from 'SEQUENCE_IMGT')
    add_fields = []
    seq_type = seq_field.split('_')[-1]
    if 'full' in germ_types:
        add_fields += ['GERMLINE_' + seq_type]
    if 'dmask' in germ_types:
        add_fields += ['GERMLINE_' + seq_type + '_D_MASK']
    if 'vonly' in germ_types:
        add_fields += ['GERMLINE_' + seq_type + '_V_REGION']

    # Create output file handle and Db writer
    writers = {}
    pass_handle = getOutputHandle(db_file, 'germ-pass',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type=out_args['out_type'])
    writers['pass'] = getDbWriter(pass_handle, db_file, add_fields=add_fields)

    if out_args['failed']:
        fail_handle = getOutputHandle(db_file, 'germ-fail',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
        writers['fail'] = getDbWriter(fail_handle, db_file, add_fields=add_fields)
    else:
        fail_handle = None
        writers['fail'] = None

    # Initialize time and total count for progress bar
    start_time = time()
    rec_count = countDbFile(db_file)
    counts = {}
    clone_count = counts['pass'] = counts['fail'] = 0

    # Iterate over rows; assumes the input is already grouped by CLONE —
    # a clone ends when the CLONE value changes
    clone = 'initial'
    clone_dict = OrderedDict()
    for i, row in enumerate(reader):
        # Print progress
        printProgress(i, rec_count, 0.05, start_time)
        # Clone isn't over yet
        if row.get('CLONE', '') == clone:
            # NOTE(review): keying by SEQUENCE_ID silently drops rows with
            # duplicate IDs within a clone; the newer variant of this function
            # keys by row index instead. Confirm which behavior is intended.
            clone_dict[row["SEQUENCE_ID"]] = row
        # Clone just finished
        elif clone_dict:
            clone_count += 1
            result_log = makeCloneGermline(clone, clone_dict, repo_dict, germ_types,
                                           v_field, seq_field, counts, writers, out_args)
            printLog(result_log, handle=log_handle)
            # Now deal with current row (first of next clone)
            clone = row['CLONE']
            clone_dict = OrderedDict([(row['SEQUENCE_ID'], row)])
        # Last case is only for first row of file
        else:
            clone = row['CLONE']
            clone_dict = OrderedDict([(row['SEQUENCE_ID'], row)])

    # Flush the final clone.
    # NOTE(review): on an empty input file this runs with an empty clone_dict
    # and `i` is undefined at the printProgress call below — confirm empty
    # inputs are rejected upstream.
    clone_count += 1
    result_log = makeCloneGermline(clone, clone_dict, repo_dict, germ_types,
                                   v_field, seq_field, counts, writers, out_args)
    printLog(result_log, handle=log_handle)

    # Print log
    printProgress(i + 1, rec_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['CLONES'] = clone_count
    log['RECORDS'] = rec_count
    log['PASS'] = counts['pass']
    log['FAIL'] = counts['fail']
    log['END'] = 'CreateGermlines'
    printLog(log)

    # Close file handles
    pass_handle.close()
    if fail_handle is not None:
        fail_handle.close()
    if log_handle is not None:
        log_handle.close()
def assembleCloneGermline(db_file, repo, seq_field=default_seq_field, v_field=default_v_field,
                          germ_types=default_germ_types, out_args=default_out_args):
    """
    Assemble one germline sequence for each clone in a tab-delimited database file.

    Arguments:
      db_file = input tab-delimited database file
      repo = folder with germline repertoire files
      germ_types = types of germline sequences to be output
                   (full germline, D-region masked, only V-region germline)
      v_field = field in which to look for V call
      seq_field = field in which to look for sequence
      out_args = arguments for output preferences

    Returns:
      None
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'CreateGermlines'
    log['DB_FILE'] = os.path.basename(db_file)
    log['GERM_TYPES'] = germ_types if isinstance(germ_types, str) else ','.join(germ_types)
    log['CLONED'] = 'True'
    log['V_FIELD'] = v_field
    log['SEQ_FIELD'] = seq_field
    printLog(log)

    # Get repertoire and open Db reader
    references = readRepo(repo)
    reader = readDbFile(db_file, ig=False)

    # Exit if V call field does not exist in reader
    if v_field not in reader.fieldnames:
        sys.exit('Error: V field does not exist in input database file.')

    # Define log handle
    if out_args['log_file'] is None:
        log_handle = None
    else:
        log_handle = open(out_args['log_file'], 'w')

    # Germline output columns depend on the requested germline types;
    # the germline gene-call columns are always added
    add_fields = []
    seq_type = seq_field.split('_')[-1]
    if 'full' in germ_types:
        add_fields += ['GERMLINE_' + seq_type]
    if 'dmask' in germ_types:
        add_fields += ['GERMLINE_' + seq_type + '_D_MASK']
    if 'vonly' in germ_types:
        add_fields += ['GERMLINE_' + seq_type + '_V_REGION']
    if 'regions' in germ_types:
        add_fields += ['GERMLINE_REGIONS']
    add_fields += ['GERMLINE_V_CALL']
    add_fields += ['GERMLINE_D_CALL']
    add_fields += ['GERMLINE_J_CALL']

    # Create output file handle and Db writer
    writers = {}
    pass_handle = getOutputHandle(db_file, 'germ-pass',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type=out_args['out_type'])
    writers['pass'] = getDbWriter(pass_handle, db_file, add_fields=add_fields)

    if out_args['failed']:
        fail_handle = getOutputHandle(db_file, 'germ-fail',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
        writers['fail'] = getDbWriter(fail_handle, db_file, add_fields=add_fields)
    else:
        fail_handle = None
        writers['fail'] = None

    # Initialize time and total count for progress bar
    start_time = time()
    rec_count = countDbFile(db_file)
    counts = {}
    clone_count = counts['pass'] = counts['fail'] = 0

    # Iterate over rows; assumes the input is already grouped by CLONE —
    # a clone ends when the CLONE value changes. Rows are keyed by index so
    # duplicate SEQUENCE_IDs within a clone are preserved.
    clone = 'initial'
    clone_dict = OrderedDict()
    for i, row in enumerate(reader):
        # Print progress
        printProgress(i, rec_count, 0.05, start_time)
        # Clone isn't over yet
        if row.get('CLONE', '') == clone:
            clone_dict[i] = row
        # Clone just finished
        elif clone_dict:
            clone_count += 1
            result_log = makeCloneGermline(clone, clone_dict, references, germ_types,
                                           v_field, seq_field, counts, writers, out_args)
            printLog(result_log, handle=log_handle)
            # Now deal with current row (first of next clone)
            clone = row['CLONE']
            clone_dict = OrderedDict([(i, row)])
        # Last case is only for first row of file
        else:
            clone = row['CLONE']
            clone_dict = OrderedDict([(i, row)])

    # Flush the final clone.
    # NOTE(review): on an empty input file this runs with an empty clone_dict
    # and `i` is undefined at the printProgress call below — confirm empty
    # inputs are rejected upstream.
    clone_count += 1
    result_log = makeCloneGermline(clone, clone_dict, references, germ_types,
                                   v_field, seq_field, counts, writers, out_args)
    printLog(result_log, handle=log_handle)

    # Print log
    printProgress(i + 1, rec_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['CLONES'] = clone_count
    log['RECORDS'] = rec_count
    log['PASS'] = counts['pass']
    log['FAIL'] = counts['fail']
    log['END'] = 'CreateGermlines'
    printLog(log)

    # Close file handles
    pass_handle.close()
    if fail_handle is not None:
        fail_handle.close()
    if log_handle is not None:
        log_handle.close()
def assembleEachGermline(db_file, repo, germ_types, v_field, seq_field, out_args=default_out_args):
    """
    Write germline sequences to tab-delimited database file, one per record.

    Arguments:
      db_file = input tab-delimited database file
      repo = folder with germline repertoire files
      germ_types = types of germline sequences to be output
                   (full germline, D-region masked, only V-region germline)
      v_field = field in which to look for V call
      seq_field = field in which to look for sequence
      out_args = arguments for output preferences

    Returns:
      None
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'CreateGermlines'
    log['DB_FILE'] = os.path.basename(db_file)
    log['GERM_TYPES'] = germ_types if isinstance(germ_types, str) else ','.join(germ_types)
    log['CLONED'] = 'False'
    log['V_FIELD'] = v_field
    log['SEQ_FIELD'] = seq_field
    printLog(log)

    # Get repertoire and open Db reader
    references = readRepo(repo)
    reader = readDbFile(db_file, ig=False)

    # Exit if V call field does not exist in reader
    if v_field not in reader.fieldnames:
        sys.exit('Error: V field does not exist in input database file.')

    # Define log handle
    if out_args['log_file'] is None:
        log_handle = None
    else:
        log_handle = open(out_args['log_file'], 'w')

    # Germline output columns depend on the requested germline types;
    # seq_type is the suffix of seq_field (e.g. 'IMGT' from 'SEQUENCE_IMGT')
    add_fields = []
    seq_type = seq_field.split('_')[-1]
    if 'full' in germ_types:
        add_fields += ['GERMLINE_' + seq_type]
    if 'dmask' in germ_types:
        add_fields += ['GERMLINE_' + seq_type + '_D_MASK']
    if 'vonly' in germ_types:
        add_fields += ['GERMLINE_' + seq_type + '_V_REGION']
    if 'regions' in germ_types:
        add_fields += ['GERMLINE_REGIONS']

    # Create output file handle and Db writer
    pass_handle = getOutputHandle(db_file, 'germ-pass',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type=out_args['out_type'])
    pass_writer = getDbWriter(pass_handle, db_file, add_fields=add_fields)

    if out_args['failed']:
        fail_handle = getOutputHandle(db_file, 'germ-fail',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
        fail_writer = getDbWriter(fail_handle, db_file, add_fields=add_fields)
    else:
        fail_handle = None
        fail_writer = None

    # Initialize time and total count for progress bar
    start_time = time()
    rec_count = countDbFile(db_file)
    pass_count = fail_count = 0

    # Iterate over rows
    for i, row in enumerate(reader):
        # Print progress
        printProgress(i, rec_count, 0.05, start_time)

        result_log, germlines = joinGermline(row, references, seq_field=seq_field,
                                             v_field=v_field, germ_types=germ_types)

        # Add germline field(s) to dictionary
        if 'full' in germ_types:
            row['GERMLINE_' + seq_type] = germlines['full']
        if 'dmask' in germ_types:
            row['GERMLINE_' + seq_type + '_D_MASK'] = germlines['dmask']
        if 'vonly' in germ_types:
            row['GERMLINE_' + seq_type + '_V_REGION'] = germlines['vonly']
        if 'regions' in germ_types:
            row['GERMLINE_REGIONS'] = germlines['regions']

        # Write row to pass or fail file
        if 'ERROR' in result_log:
            fail_count += 1
            if fail_writer is not None:
                fail_writer.writerow(row)
        else:
            # NOTE(review): germlines['full'] and germlines['regions'] are read
            # here even when those germ_types were not requested — confirm
            # joinGermline always populates these keys.
            result_log['SEQUENCE'] = row[seq_field]
            result_log['GERMLINE'] = germlines['full']
            result_log['REGIONS'] = germlines['regions']
            pass_count += 1
            pass_writer.writerow(row)
        printLog(result_log, handle=log_handle)

    # Print log.
    # NOTE(review): `i` is undefined here if the input file was empty.
    printProgress(i + 1, rec_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'CreateGermlines'
    printLog(log)

    # Close file handles
    pass_handle.close()
    if fail_handle is not None:
        fail_handle.close()
    if log_handle is not None:
        log_handle.close()
def convertToFasta(db_file, id_field=default_id_field, seq_field=default_seq_field,
                   meta_fields=None, out_file=None, out_args=default_out_args):
    """
    Builds fasta files from database records.

    Arguments:
      db_file : the database file name.
      id_field : the field containing identifiers.
      seq_field : the field containing sequences.
      meta_fields : a list of fields to add to sequence annotations.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name.
    """
    # Start console log
    header_log = OrderedDict([('START', 'ConvertDb'),
                              ('COMMAND', 'fasta'),
                              ('FILE', os.path.basename(db_file)),
                              ('ID_FIELD', id_field),
                              ('SEQ_FIELD', seq_field)])
    if meta_fields is not None:
        header_log['META_FIELDS'] = ','.join(meta_fields)
    printLog(header_log)

    # Open the input database and count its records for the progress bar
    out_type = 'fasta'
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    result_count = countDbFile(db_file)

    # Open the output handle; an explicit out_file wins over the generated name
    if out_file is None:
        pass_handle = getOutputHandle(db_file, out_label='sequences',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_type)
    else:
        pass_handle = open(out_file, 'w')

    # Convert each database row into a SeqRecord and write it out
    start_time = time()
    rec_count = pass_count = fail_count = 0
    for rec in db_iter:
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1
        seq = buildSeqRecord(rec, id_field, seq_field, meta_fields)
        if seq is None:
            # Row could not be converted (e.g. missing id/sequence)
            fail_count += 1
        else:
            pass_count += 1
            SeqIO.write(seq, pass_handle, out_type)

    # Final progress and summary log
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    footer_log = OrderedDict([('OUTPUT', os.path.basename(pass_handle.name)),
                              ('RECORDS', rec_count),
                              ('PASS', pass_count),
                              ('FAIL', fail_count),
                              ('END', 'ConvertDb')])
    printLog(footer_log)

    # Close handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
def createGermlines(db_file, references, seq_field=default_seq_field, v_field=default_v_field,
                    d_field=default_d_field, j_field=default_j_field, cloned=False,
                    clone_field=default_clone_field, germ_types=default_germ_types,
                    format=default_format, out_file=None, out_args=default_out_args):
    """
    Write germline sequences to tab-delimited database file.

    Arguments:
      db_file : input tab-delimited database file.
      references : folders and/or files containing germline repertoire data in FASTA format.
      seq_field : field in which to look for sequence.
      v_field : field in which to look for V call.
      d_field : field in which to look for D call.
      j_field : field in which to look for J call.
      cloned : if True build germlines by clone, otherwise build individual germlines.
      clone_field : field containing clone identifiers; ignored if cloned=False.
      germ_types : list of germline sequence types to be output from the set of
                   'full', 'dmask', 'vonly', 'regions'.
      format : input and output format.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : arguments for output preferences.

    Returns:
      dict : names of the 'pass' and 'fail' output files.
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'CreateGermlines'
    log['FILE'] = os.path.basename(db_file)
    log['GERM_TYPES'] = ','.join(germ_types)
    log['SEQ_FIELD'] = seq_field
    log['V_FIELD'] = v_field
    log['D_FIELD'] = d_field
    log['J_FIELD'] = j_field
    log['CLONED'] = cloned
    if cloned:
        log['CLONE_FIELD'] = clone_field
    printLog(log)

    # Define format operators
    # NOTE(review): execution continues past printError here — assumes
    # printError terminates the process; confirm.
    try:
        reader, writer, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s' % format)
    out_args['out_type'] = schema.out_type

    # TODO: this won't work for AIRR necessarily
    # Define output germline fields (receptor-attribute names)
    germline_fields = OrderedDict()
    seq_type = seq_field.split('_')[-1]
    if 'full' in germ_types:
        germline_fields['full'] = 'germline_' + seq_type
    if 'dmask' in germ_types:
        germline_fields['dmask'] = 'germline_' + seq_type + '_d_mask'
    if 'vonly' in germ_types:
        germline_fields['vonly'] = 'germline_' + seq_type + '_v_region'
    if 'regions' in germ_types:
        germline_fields['regions'] = 'germline_regions'
    if cloned:
        germline_fields['v'] = 'germline_v_call'
        germline_fields['d'] = 'germline_d_call'
        germline_fields['j'] = 'germline_j_call'
    out_fields = getDbFields(db_file,
                             add=[schema.fromReceptor(f) for f in germline_fields.values()],
                             reader=reader)

    # Get repertoire and open Db reader
    reference_dict = readGermlines(references)
    db_handle = open(db_file, 'rt')
    db_iter = reader(db_handle)

    # Check for required columns
    try:
        required = ['v_germ_start_imgt', 'd_germ_start', 'j_germ_start',
                    'np1_length', 'np2_length']
        checkFields(required, db_iter.fields, schema=schema)
    except LookupError as e:
        printError(e)

    # Check for IMGT-gaps in germlines ('...' spacers indicate IMGT numbering)
    if all('...' not in x for x in reference_dict.values()):
        printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')

    # Count input
    total_count = countDbFile(db_file)

    # Check for existence of fields
    for f in [v_field, d_field, j_field, seq_field]:
        if f not in db_iter.fields:
            printError('%s field does not exist in input database file.' % f)

    # Translate to Receptor attribute names
    v_field = schema.toReceptor(v_field)
    d_field = schema.toReceptor(d_field)
    j_field = schema.toReceptor(j_field)
    seq_field = schema.toReceptor(seq_field)
    clone_field = schema.toReceptor(clone_field)

    # Define Receptor iterator: grouped by clone when cloned, otherwise one
    # record per group keyed by sequence_id
    if cloned:
        start_time = time()
        printMessage('Sorting by clone', start_time=start_time, width=20)
        # groupby requires adjacent equal keys, hence the sort first
        sorted_records = sorted(db_iter, key=lambda x: x.getField(clone_field))
        printMessage('Done', start_time=start_time, end=True, width=20)
        receptor_iter = groupby(sorted_records, lambda x: x.getField(clone_field))
    else:
        receptor_iter = ((x.sequence_id, [x]) for x in db_iter)

    # Define log handle
    if out_args['log_file'] is None:
        log_handle = None
    else:
        log_handle = open(out_args['log_file'], 'w')

    # Initialize handles, writers and counters.
    # Output handles are opened lazily (on first passing/failing record) so no
    # empty files are created when everything passes or everything fails.
    pass_handle, pass_writer = None, None
    fail_handle, fail_writer = None, None
    rec_count, pass_count, fail_count = 0, 0, 0
    start_time = time()

    # Iterate over rows
    for key, records in receptor_iter:
        # Print progress
        printProgress(rec_count, total_count, 0.05, start_time=start_time)

        # Define iteration variables
        records = list(records)
        rec_log = OrderedDict([('ID', key)])
        rec_count += len(records)

        # Build germline for records: single records use buildGermline,
        # multi-record clones use the clonal consensus builder
        if len(records) == 1:
            germ_log, germlines, genes = buildGermline(records[0], reference_dict,
                                                       seq_field=seq_field, v_field=v_field,
                                                       d_field=d_field, j_field=j_field)
        else:
            germ_log, germlines, genes = buildClonalGermline(records, reference_dict,
                                                             seq_field=seq_field, v_field=v_field,
                                                             d_field=d_field, j_field=j_field)
        rec_log.update(germ_log)

        # Write row to pass or fail file
        if germlines is not None:
            pass_count += len(records)
            # Add germlines to Receptor record
            annotations = {}
            if 'full' in germ_types:
                annotations[germline_fields['full']] = germlines['full']
            if 'dmask' in germ_types:
                annotations[germline_fields['dmask']] = germlines['dmask']
            if 'vonly' in germ_types:
                annotations[germline_fields['vonly']] = germlines['vonly']
            if 'regions' in germ_types:
                annotations[germline_fields['regions']] = germlines['regions']
            if cloned:
                annotations[germline_fields['v']] = genes['v']
                annotations[germline_fields['d']] = genes['d']
                annotations[germline_fields['j']] = genes['j']
            # Write records; AttributeError on the None writer triggers the
            # lazy open of the pass output on first use.
            # NOTE(review): if writeReceptor itself raises AttributeError
            # mid-group, the except branch re-writes the whole group,
            # duplicating already-written records — confirm this cannot occur.
            try:
                for r in records:
                    r.setDict(annotations)
                    pass_writer.writeReceptor(r)
            except AttributeError:
                # Create output file handle and writer
                if out_file is not None:
                    pass_handle = open(out_file, 'w')
                else:
                    pass_handle = getOutputHandle(db_file, out_label='germ-pass',
                                                  out_dir=out_args['out_dir'],
                                                  out_name=out_args['out_name'],
                                                  out_type=out_args['out_type'])
                pass_writer = writer(pass_handle, fields=out_fields)
                for r in records:
                    r.setDict(annotations)
                    pass_writer.writeReceptor(r)
        else:
            fail_count += len(records)
            if out_args['failed']:
                # NOTE(review): the fail path passes the whole list to a single
                # writeReceptor call, unlike the per-record pass path — confirm
                # writeReceptor accepts a list of records.
                try:
                    fail_writer.writeReceptor(records)
                except AttributeError:
                    fail_handle = getOutputHandle(db_file, out_label='germ-fail',
                                                  out_dir=out_args['out_dir'],
                                                  out_name=out_args['out_name'],
                                                  out_type=out_args['out_type'])
                    fail_writer = writer(fail_handle, fields=out_fields)
                    fail_writer.writeReceptor(records)

        # Write log
        printLog(rec_log, handle=log_handle)

    # Print log
    printProgress(rec_count, total_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None
    log['RECORDS'] = rec_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'CreateGermlines'
    printLog(log)

    # Close file handles
    db_handle.close()
    output = {'pass': None, 'fail': None}
    if pass_handle is not None:
        output['pass'] = pass_handle.name
        pass_handle.close()
    if fail_handle is not None:
        output['fail'] = fail_handle.name
        fail_handle.close()
    if log_handle is not None:
        log_handle.close()

    return output
def convertToGenbank(db_file, inference=None, db_xref=None, molecule=default_molecule,
                     product=default_product, features=None, c_field=None, label=None,
                     count_field=None, index_field=None, allow_stop=False, asis_id=False,
                     asis_calls=False, allele_delim=default_allele_delim, build_asn=False,
                     asn_template=None, tbl2asn_exec=default_tbl2asn_exec,
                     format=default_format, out_file=None, out_args=default_out_args):
    """
    Builds GenBank submission fasta and table files.

    Arguments:
      db_file : the database file name.
      inference : reference alignment tool.
      db_xref : reference database link.
      molecule : source molecule (eg, "mRNA", "genomic DNA").
      product : Product (protein) name.
      features : dictionary of sample features (BioSample attributes) to add
                 to the description of each record.
      c_field : column containing the C region gene call.
      label : a string to use as a label for the ID. if None do not add a field label.
      count_field : field name to populate the AIRR_READ_COUNT note.
      index_field : field name to populate the AIRR_CELL_INDEX note.
      allow_stop : if True retain records with junctions having stop codons.
      asis_id : if True use the original sequence ID for the output IDs.
      asis_calls : if True do not parse gene calls for IMGT nomenclature.
      allele_delim : delimiter separating the gene name from the allele number
                     when asis_calls=True.
      build_asn : if True run tbl2asn on the generated .tbl and .fsa files.
      asn_template : template file (.sbt) to pass to tbl2asn.
      tbl2asn_exec : name of or path to the tbl2asn executable.
      format : input and output format.
      out_file : output file name without extension. Automatically generated
                 from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      tuple : the output (feature table, fasta) file names.
    """
    # Start console log
    log = OrderedDict()
    log['START'] = 'ConvertDb'
    log['COMMAND'] = 'genbank'
    log['FILE'] = os.path.basename(db_file)
    printLog(log)

    # Define format operators
    # NOTE(review): execution continues past printError — assumes printError
    # terminates the process; confirm.
    try:
        reader, __, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s.' % format)

    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = reader(db_handle)

    # Check for required columns
    try:
        required = ['sequence_input', 'v_call', 'd_call', 'j_call',
                    'v_seq_start', 'd_seq_start', 'j_seq_start']
        checkFields(required, db_iter.fields, schema=schema)
    except LookupError as e:
        printError(e)

    # Open output: paired .fsa (sequence) and .tbl (feature table) files
    if out_file is not None:
        out_name, __ = os.path.splitext(out_file)
        fsa_handle = open('%s.fsa' % out_name, 'w')
        tbl_handle = open('%s.tbl' % out_name, 'w')
    else:
        fsa_handle = getOutputHandle(db_file, out_label='genbank',
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'], out_type='fsa')
        tbl_handle = getOutputHandle(db_file, out_label='genbank',
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'], out_type='tbl')

    # Count records
    result_count = countDbFile(db_file)

    # Define writer; QUOTE_NONE because the feature table format is raw
    # tab-delimited text
    writer = csv.writer(tbl_handle, delimiter='\t', quoting=csv.QUOTE_NONE)

    # Iterate over records
    start_time = time()
    rec_count, pass_count, fail_count = 0, 0, 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1

        # Extract table dictionary; sequential numeric IDs unless asis_id
        name = None if asis_id else rec_count
        seq = makeGenbankSequence(rec, name=name, label=label, count_field=count_field,
                                  index_field=index_field, molecule=molecule,
                                  features=features)
        tbl = makeGenbankFeatures(rec, start=seq['start'], end=seq['end'],
                                  product=product, db_xref=db_xref,
                                  inference=inference, c_field=c_field,
                                  allow_stop=allow_stop, asis_calls=asis_calls,
                                  allele_delim=allele_delim)

        if tbl is not None:
            pass_count += 1
            # Write table: a '>Features' header, then one row per feature with
            # its qualifiers indented by three empty columns
            writer.writerow(['>Features', seq['record'].id])
            for feature, qualifiers in tbl.items():
                writer.writerow(feature)
                if qualifiers:
                    for x in qualifiers:
                        writer.writerow(list(chain(['', '', ''], x)))
            # Write sequence
            SeqIO.write(seq['record'], fsa_handle, 'fasta')
        else:
            fail_count += 1

    # Final progress bar
    printProgress(rec_count, result_count, 0.05, start_time=start_time)

    # Run tbl2asn
    if build_asn:
        start_time = time()
        printMessage('Running tbl2asn', start_time=start_time, width=25)
        # NOTE(review): the runASN return value is never used — confirm
        # failures are reported inside runASN itself.
        result = runASN(fsa_handle.name, template=asn_template, exec=tbl2asn_exec)
        printMessage('Done', start_time=start_time, end=True, width=25)

    # Print ending console log
    log = OrderedDict()
    log['OUTPUT_TBL'] = os.path.basename(tbl_handle.name)
    log['OUTPUT_FSA'] = os.path.basename(fsa_handle.name)
    log['RECORDS'] = rec_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ConvertDb'
    printLog(log)

    # Close file handles
    tbl_handle.close()
    fsa_handle.close()
    db_handle.close()

    return (tbl_handle.name, fsa_handle.name)
def deleteDbFile(db_file, fields, values, logic='any', regex=False, out_args=default_out_args):
    """
    Deletes records from a database file.

    Arguments:
      db_file = the database file name
      fields = a list of fields to check for deletion criteria
      values = a list of values defining deletion targets
      logic = one of 'any' or 'all' defining whether one or all fields must have a match.
      regex = if False do exact full string matches; if True allow partial regex matches.
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name

    Raises:
      ValueError : if logic is not 'any' or 'all'.
    """
    # Define string match function
    if regex:
        def _match_func(x, patterns):
            return any(re.search(p, x) for p in patterns)
    else:
        def _match_func(x, patterns):
            return x in patterns

    # Define logic function; fail fast on an invalid value instead of
    # surfacing a NameError on the first record
    if logic == 'any':
        _logic_func = any
    elif logic == 'all':
        _logic_func = all
    else:
        raise ValueError("logic must be one of 'any' or 'all', not %r" % logic)

    # Start console log
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'delete'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-delete',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)

    # Count records for the progress bar
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = pass_count = fail_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Check for deletion values in all fields.
        # Missing fields default to '' (not False, which made re.search
        # raise a TypeError in regex mode).
        delete = _logic_func([_match_func(rec.get(f, ''), values) for f in fields])

        # Write sequences that do not match the deletion criteria
        if not delete:
            pass_count += 1
            pass_writer.writerow(rec)
        else:
            fail_count += 1

    # Print final progress and summary
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['KEPT'] = pass_count
    log['DELETED'] = fail_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
def indexDbFile(db_file, field=default_index_field, out_file=None, out_args=default_out_args):
    """
    Adds an index column to a database file.

    Arguments:
      db_file : the database file name.
      field : the name of the index field to add.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name.
    """
    # Start console log
    header_log = OrderedDict([('START', 'ParseDb'),
                              ('COMMAND', 'index'),
                              ('FILE', os.path.basename(db_file)),
                              ('FIELD', field)])
    printLog(header_log)

    # Open the input database and derive the output type from its extension
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    __, __, out_args['out_type'] = splitName(db_file)

    # Output columns are the input columns plus the new index field
    out_fields = list(db_iter.fields) + [field]

    # Open the output handle; an explicit out_file wins over the generated name
    if out_file is None:
        pass_handle = getOutputHandle(db_file, out_label='parse-index',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    else:
        pass_handle = open(out_file, 'w')
    pass_writer = TSVWriter(pass_handle, out_fields)

    # Total records for the progress bar
    result_count = countDbFile(db_file)

    # Copy rows, stamping each with its 1-based position in the file
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1
        rec.update({field: rec_count})
        pass_writer.writeDict(rec)

    # Final progress and summary log
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    footer_log = OrderedDict([('OUTPUT', os.path.basename(pass_handle.name)),
                              ('RECORDS', rec_count),
                              ('END', 'ParseDb')])
    printLog(footer_log)

    # Close handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
def renameDbFile(db_file, fields, names, out_args=default_out_args):
    """
    Renames fields in a database file.

    Arguments:
      db_file = the database file name
      fields = a list of fields to rename
      names = a list of new names for fields
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Start console log
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'rename'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['NAMES'] = ','.join(names)
    printLog(log)

    # Open file handles
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file, out_label='parse-rename',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type='tab')

    # Get header and rename fields.
    # NOTE(review): the file is opened a second time just to read the header,
    # and header.index(f) raises an unhandled ValueError if a requested field
    # is absent — confirm inputs are validated upstream.
    header = (readDbFile(db_file, ig=False)).fieldnames
    for f, n in zip(fields, names):
        i = header.index(f)
        header[i] = n

    # Open writer and write new header
    # TODO: should modify getDbWriter to take a list of fields
    pass_writer = csv.DictWriter(pass_handle, fieldnames=header, dialect='excel-tab')
    pass_writer.writeheader()

    # Count records for the progress bar
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Print progress for previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1
        # TODO: repeated renaming is unnecessary. should add a non-dict reader/writer to DbCore
        # Rename fields per row; pop moves the value to the new key.
        # NOTE(review): renaming onto an existing column name silently
        # overwrites that column's value — confirm this is intended.
        for f, n in zip(fields, names):
            rec[n] = rec.pop(f)
        # Write
        pass_writer.writerow(rec)

    # Print final progress and summary
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
def splitDbFile(db_file, field, num_split=None, out_args=default_out_args):
    """
    Divides a tab-delimited database file into segments by description tags.

    Arguments:
      db_file : filename of the tab-delimited database file to split.
      field : the field name by which to split db_file.
      num_split : the numerical threshold by which to group sequences;
                  if None treat field as textual.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      list : a list of output file names.
    """
    # Start console log
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'split'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['NUM_SPLIT'] = num_split
    printLog(log)

    # Open input and derive the output type from the input file's extension
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    out_fields = db_iter.fields
    __, __, out_args['out_type'] = splitName(db_file)

    # Determine total number of records for the progress bar
    rec_count = countDbFile(db_file)

    start_time = time()
    count = 0
    # Sort records into files based on textual field
    if num_split is None:
        # Create set of unique field tags with a separate pass over the file
        with open(db_file, 'rt') as tmp_handle:
            tmp_iter = TSVReader(tmp_handle)
            tag_list = list(set([row[field] for row in tmp_iter]))

        # Characters forbidden in filenames and their replacements.
        # Fixed: the previous mapping used regex-style escapes ('\/', '\%',
        # '\|') as plain-string keys, so '/', '%' and '|' were never actually
        # replaced; the '<' and '>' labels were also swapped.
        no_good = {'/': 'f', '\\': 'b', '?': 'q', '%': 'p', '*': 's',
                   ':': 'c', '|': 'pi', '"': 'dq', '\'': 'sq',
                   '<': 'lt', '>': 'gt', ' ': '_'}

        # Build a filename-safe label for each tag
        tag_dict = {}
        for tag in tag_list:
            label = tag
            for c, r in no_good.items():
                label = label.replace(c, r)
            tag_dict[tag] = label

        # Create output handles, one per unique tag
        handles_dict = {tag: getOutputHandle(db_file,
                                             out_label='%s-%s' % (field, label),
                                             out_name=out_args['out_name'],
                                             out_dir=out_args['out_dir'],
                                             out_type=out_args['out_type'])
                        for tag, label in tag_dict.items()}

        # Create Db writer instances
        writers_dict = {tag: TSVWriter(handles_dict[tag], fields=out_fields)
                        for tag in tag_dict}

        # Iterate over records, routing each to its tag's file
        for row in db_iter:
            printProgress(count, rec_count, 0.05, start_time=start_time)
            count += 1
            tag = row[field]
            writers_dict[tag].writeDict(row)

    # Sort records into files based on numeric num_split threshold
    else:
        num_split = float(num_split)

        # Create output handles: below-threshold and at-or-above-threshold
        handles_dict = {'under': getOutputHandle(db_file,
                                                 out_label='under-%.1f' % num_split,
                                                 out_name=out_args['out_name'],
                                                 out_dir=out_args['out_dir'],
                                                 out_type=out_args['out_type']),
                        'atleast': getOutputHandle(db_file,
                                                   out_label='atleast-%.1f' % num_split,
                                                   out_name=out_args['out_name'],
                                                   out_dir=out_args['out_dir'],
                                                   out_type=out_args['out_type'])}

        # Create Db writer instances
        writers_dict = {'under': TSVWriter(handles_dict['under'], fields=out_fields),
                        'atleast': TSVWriter(handles_dict['atleast'], fields=out_fields)}

        # Iterate over records, comparing the field value to the threshold
        for row in db_iter:
            printProgress(count, rec_count, 0.05, start_time=start_time)
            count += 1
            tag = row[field]
            tag = 'under' if float(tag) < num_split else 'atleast'
            writers_dict[tag].writeDict(row)

    # Write final progress and summary log
    printProgress(count, rec_count, 0.05, start_time=start_time)
    log = OrderedDict()
    for i, k in enumerate(handles_dict):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(handles_dict[k].name)
    log['RECORDS'] = rec_count
    log['PARTS'] = len(handles_dict)
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    db_handle.close()
    for t in handles_dict:
        handles_dict[t].close()

    return [handles_dict[t].name for t in handles_dict]
def collectQueue(alive, result_queue, collect_queue, db_file, out_args, cluster_func=None, cluster_args={}):
    """
    Assembles results from a queue of individual sequence results and manages log/file I/O

    Arguments:
      alive = a multiprocessing.Value boolean controlling whether processing
              continues; if False exit process
      result_queue = a multiprocessing.Queue holding processQueue results
      collect_queue = a multiprocessing.Queue to store collector return values
      db_file = the input database file name
      out_args = common output argument dictionary from parseCommonArgs
      cluster_func = the function to call for carrying out clustering on distance matrix
      cluster_args = a dictionary of arguments to pass to cluster_func

    Returns:
      None (adds 'log' and 'out_files' to collect_dict)
    """
    # Open output files
    try:
        # Count records and define output format
        out_type = getFileType(db_file) if out_args['out_type'] is None \
                   else out_args['out_type']
        result_count = countDbFile(db_file)

        # Defined successful output handle; passing records gain a CLONE field
        pass_handle = getOutputHandle(db_file,
                                      out_label='clone-pass',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_type)
        pass_writer = getDbWriter(pass_handle, db_file, add_fields='CLONE')

        # Defined failed alignment output handle (only when requested)
        if out_args['failed']:
            fail_handle = getOutputHandle(db_file,
                                          out_label='clone-fail',
                                          out_dir=out_args['out_dir'],
                                          out_name=out_args['out_name'],
                                          out_type=out_type)
            fail_writer = getDbWriter(fail_handle, db_file)
        else:
            fail_handle = None
            fail_writer = None

        # Define log handle
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        #sys.stderr.write('Exception in collector file opening step\n')
        # Signal sibling processes to stop before propagating the error
        alive.value = False
        raise

    # Get results from queue and write to files
    try:
        #print 'START COLLECT', alive.value
        # Iterator over results queue until sentinel object reached
        start_time = time()
        rec_count = clone_count = pass_count = fail_count = 0
        while alive.value:
            # Get result from queue; busy-wait when empty
            if result_queue.empty():  continue
            else:  result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None:  break
            #print "COLLECT", alive.value, result['id']

            # Print progress for previous iteration and update record count
            if rec_count == 0:  print('PROGRESS> Assigning clones')
            printProgress(rec_count, result_count, 0.05, start_time)
            rec_count += len(result.data)

            # Write passed and failed records
            if result:
                # Write passing sequences; clones are numbered sequentially
                # and every member record carries its clone number
                for clone in result.results.values():
                    clone_count += 1
                    for i, rec in enumerate(clone, start=1):
                        rec.annotations['CLONE'] = clone_count
                        pass_writer.writerow(rec.toDict())
                        pass_count += 1
                        result.log['CLONE%i-%i' % (clone_count, i)] = str(rec.junction)
                # Write failed sequences from otherwise-passing sets
                if result.failed:
                    for i, rec in enumerate(result.failed, start=1):
                        fail_count += 1
                        if fail_writer is not None:  fail_writer.writerow(rec.toDict())
                        result.log['FAIL%i-%i' % (clone_count, i)] = str(rec.junction)
            else:
                # Write failed sets (logged under the pseudo-clone CLONE0)
                for i, rec in enumerate(result.data, start=1):
                    fail_count += 1
                    if fail_writer is not None:  fail_writer.writerow(rec.toDict())
                    result.log['CLONE0-%i' % (i)] = str(rec.junction)

            # Write log
            printLog(result.log, handle=log_handle)
        else:
            # while/else: reached only when alive became False without a
            # sentinel, i.e. a sibling process failed
            sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Print total counts
        printProgress(rec_count, result_count, 0.05, start_time)

        # Close file handles
        pass_handle.close()
        if fail_handle is not None:  fail_handle.close()
        if log_handle is not None:  log_handle.close()

        # Update return list
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['CLONES'] = clone_count
        log['RECORDS'] = rec_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        collect_dict = {'log': log, 'out_files': [pass_handle.name]}
        collect_queue.put(collect_dict)
    except:
        #sys.stderr.write('Exception in collector result processing step\n')
        alive.value = False
        raise

    return None
def selectDbFile(db_file, fields, values, logic='any', regex=False, out_file=None,
                 out_args=default_out_args):
    """
    Selects records from a database file

    Arguments:
      db_file : the database file name
      fields : a list of fields to check for selection criteria
      values : a list of values defining selection targets
      logic : one of 'any' or 'all' defining whether one or all fields must have a match.
      regex : if False do exact full string matches; if True allow partial regex matches.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs

    Returns:
      str : output file name.
    """
    # Build the per-field match predicate
    if regex:
        def _match_func(x, patterns):
            return any(re.search(p, x) for p in patterns)
    else:
        def _match_func(x, patterns):
            return x in patterns

    # Pick the aggregation over field matches
    if logic == 'any':
        _logic_func = any
    elif logic == 'all':
        _logic_func = all

    # Print console log
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'select'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    log['REGEX'] = regex
    printLog(log)

    # Open input and derive the output type from the file name
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    out_fields = db_iter.fields
    __, __, out_args['out_type'] = splitName(db_file)

    # Open output
    if out_file is None:
        pass_handle = getOutputHandle(db_file,
                                      out_label='parse-select',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    else:
        pass_handle = open(out_file, 'w')
    pass_writer = TSVWriter(pass_handle, out_fields)

    # Total record count for progress reporting
    result_count = countDbFile(db_file)

    # Iterate over records, keeping those selected by the logic function
    start_time = time()
    rec_count, pass_count, fail_count = 0, 0, 0
    for rec in db_iter:
        # Progress reflects the previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1

        # Evaluate selection criteria against every requested field
        matched = [_match_func(rec.get(f, False), values) for f in fields]
        if _logic_func(matched):
            pass_count += 1
            pass_writer.writeDict(rec)
        else:
            fail_count += 1

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['SELECTED'] = pass_count
    log['DISCARDED'] = fail_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
def mergeDbFiles(db_files, drop=False, out_file=None, out_args=default_out_args):
    """
    Updates field and value pairs to a database file

    Arguments:
      db_files : list of database file names.
      drop : if True drop columns not present in all files.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name.
    """
    # Console log of run parameters
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'merge'
    log['FILES'] = ','.join([os.path.basename(f) for f in db_files])
    log['DROP'] = drop
    printLog(log)

    # Open every input and total the record counts
    db_handles = [open(f, 'rt') for f in db_files]
    db_iters = [TSVReader(h) for h in db_handles]
    result_count = sum(countDbFile(f) for f in db_files)

    # Output columns: intersection (drop=True) or union of all inputs,
    # ordered by first appearance across the files
    field_list = [x.fields for x in db_iters]
    if drop:
        field_set = set.intersection(*map(set, field_list))
    else:
        field_set = set.union(*map(set, field_list))
    field_order = OrderedDict((f, None) for f in chain(*field_list))
    out_fields = [f for f in field_order if f in field_set]

    # Open output file
    if out_file is None:
        __, __, out_args['out_type'] = splitName(db_files[0])
        pass_handle = getOutputHandle(db_files[0],
                                      out_label='parse-merge',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])
    else:
        pass_handle = open(out_file, 'w')
    pass_writer = TSVWriter(pass_handle, out_fields)

    # Copy every record from every input in order
    start_time = time()
    rec_count = 0
    for reader in db_iters:
        for rec in reader:
            # Progress reflects the previous iteration
            printProgress(rec_count, result_count, 0.05, start_time=start_time)
            rec_count += 1
            pass_writer.writeDict(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    for handle in db_handles:
        handle.close()

    return pass_handle.name
def splitDbFile(db_file, field, num_split=None, out_args=default_out_args):
    """
    Divides a tab-delimited database file into segments by description tags

    Arguments:
      db_file = filename of the tab-delimited database file to split
      field = the field name by which to split db_file
      num_split = the numerical threshold by which to group sequences;
                  if None treat field as textual
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      a list of output file names
    """
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'split'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['NUM_SPLIT'] = num_split
    printLog(log)

    # Open IgRecord reader iter object (plain rows, not IgRecord objects)
    reader = readDbFile(db_file, ig=False)

    # Determine total numbers of records
    rec_count = countDbFile(db_file)

    start_time = time()
    count = 0
    # Sort records into files based on textual field
    if num_split is None:
        # Create set of unique field tags; requires a second pass over the file
        tmp_iter = readDbFile(db_file, ig=False)
        tag_list = list(set([row[field] for row in tmp_iter]))

        # Forbidden characters in filename and replacements
        # NOTE(review): keys like '\/' and '\%' are two-character strings
        # (backslash + char), so a bare '/' or '%' in a tag is not replaced —
        # confirm this is the intended matching behavior
        noGood = {'\/':'f','\\':'b','?':'q','\%':'p','*':'s',':':'c',
                  '\|':'pi','\"':'dq','\'':'sq','<':'gt','>':'lt',' ':'_'}
        # Replace forbidden characters in tag_list; each tag maps to its
        # sanitized filename label
        tag_dict = {}
        for tag in tag_list:
            for c,r in noGood.items():
                tag_dict[tag] = (tag_dict.get(tag, tag).replace(c,r) \
                                 if c in tag else tag_dict.get(tag, tag))

        # Create output handles, one per sanitized tag
        handles_dict = {tag:getOutputHandle(db_file,
                                            '%s-%s' % (field, label),
                                            out_type = out_args['out_type'],
                                            out_name = out_args['out_name'],
                                            out_dir = out_args['out_dir'])
                        for tag, label in tag_dict.items()}

        # Create Db writer instances
        writers_dict = {tag:getDbWriter(handles_dict[tag], db_file)
                        for tag in tag_dict}

        # Iterate over IgRecords
        for row in reader:
            printProgress(count, rec_count, 0.05, start_time)
            count += 1
            # Write row to appropriate file
            tag = row[field]
            writers_dict[tag].writerow(row)

    # Sort records into files based on numeric num_split
    else:
        num_split = float(num_split)

        # Create output handles: one for values below the threshold,
        # one for values at or above it
        handles_dict = {'under':getOutputHandle(db_file,
                                                'under-%.1f' % num_split,
                                                out_type = out_args['out_type'],
                                                out_name = out_args['out_name'],
                                                out_dir = out_args['out_dir']),
                        'atleast':getOutputHandle(db_file,
                                                  'atleast-%.1f' % num_split,
                                                  out_type = out_args['out_type'],
                                                  out_name = out_args['out_name'],
                                                  out_dir = out_args['out_dir'])}

        # Create Db writer instances
        writers_dict = {'under':getDbWriter(handles_dict['under'], db_file),
                        'atleast':getDbWriter(handles_dict['atleast'], db_file)}

        # Iterate over IgRecords
        for row in reader:
            printProgress(count, rec_count, 0.05, start_time)
            count += 1
            tag = row[field]
            tag = 'under' if float(tag) < num_split else 'atleast'
            writers_dict[tag].writerow(row)

    # Write log
    printProgress(count, rec_count, 0.05, start_time)
    log = OrderedDict()
    for i, k in enumerate(handles_dict):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(handles_dict[k].name)
    log['RECORDS'] = rec_count
    log['PARTS'] = len(handles_dict)
    log['END'] = 'ParseDb'
    printLog(log)

    # Close output file handles
    for t in handles_dict:
        handles_dict[t].close()

    return [handles_dict[t].name for t in handles_dict]
def convertDbClip(db_file, id_field=default_id_field, seq_field=default_seq_field,
                  germ_field=default_germ_field, cluster_field=None,
                  meta_fields=None, out_args=default_out_args):
    """
    Builds fasta files from database records

    Arguments:
      db_file = the database file name
      id_field = the field containing identifiers
      seq_field = the field containing sample sequences
      germ_field = the field containing germline sequences
      cluster_field = the field containing clonal groupings;
                      if None write the germline for each record
      meta_fields = a list of fields to add to sequence annotations
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Console log of run parameters
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'fasta'
    log['FILE'] = os.path.basename(db_file)
    log['ID_FIELD'] = id_field
    log['SEQ_FIELD'] = seq_field
    log['GERM_FIELD'] = germ_field
    log['CLUSTER_FIELD'] = cluster_field
    if meta_fields is not None:
        log['META_FIELDS'] = ','.join(meta_fields)
    printLog(log)

    # Input reader and clip-fasta output handle
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file,
                                  out_label='sequences',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type='clip')

    # Total record count for progress reporting
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count, germ_count, pass_count, fail_count = 0, 0, 0, 0
    cluster_last = None
    for rec in db_iter:
        # Progress reflects the previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Current clonal grouping (None when no cluster field was given)
        cluster = rec.get(cluster_field, None)

        # Emit a germline per record, or once per cluster change;
        # the '>' prefix on the id marks germlines in clip-fasta output
        if cluster_field is None:
            germ = getDbSeqRecord(rec, id_field, germ_field, meta_fields,
                                  delimiter=out_args['delimiter'])
            germ.id = '>' + germ.id
        elif cluster != cluster_last:
            germ = getDbSeqRecord(rec, cluster_field, germ_field,
                                  delimiter=out_args['delimiter'])
            germ.id = '>' + germ.id
        else:
            germ = None

        # Sample sequence record
        seq = getDbSeqRecord(rec, id_field, seq_field, meta_fields,
                             delimiter=out_args['delimiter'])

        # Write germline (when one was produced this iteration)
        if germ is not None:
            germ_count += 1
            SeqIO.write(germ, pass_handle, 'fasta')

        # Write sample sequence
        if seq is not None:
            pass_count += 1
            SeqIO.write(seq, pass_handle, 'fasta')
        else:
            fail_count += 1

        # Remember the cluster for change detection next iteration
        cluster_last = cluster

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['GERMLINES'] = germ_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
def parseIMGT(aligner_file, seq_file=None, repo=None, cellranger_file=None, partial=False,
              asis_id=True, extended=False, format=default_format, out_file=None,
              out_args=default_out_args):
    """
    Main for IMGT aligned sample sequences.

    Arguments:
      aligner_file : zipped file or unzipped folder output by IMGT.
      seq_file : FASTA file input to IMGT (from which to get seqID).
      repo : folder with germline repertoire files.
      cellranger_file : 10x Genomics annotation file to merge in; None to skip.
      partial : If True put incomplete alignments in the pass file.
      asis_id : if ID is to be parsed for pRESTO output with default delimiters.
      extended : if True add alignment score, FWR, CDR and junction fields to output file.
      format : output format. one of 'changeo' or 'airr'.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      dict : names of the 'pass' and 'fail' output files.
    """
    # Print parameter info
    log = OrderedDict()
    log['START'] = 'MakeDb'
    log['COMMAND'] = 'imgt'
    log['ALIGNER_FILE'] = aligner_file
    log['SEQ_FILE'] = os.path.basename(seq_file) if seq_file else ''
    log['ASIS_ID'] = asis_id
    log['PARTIAL'] = partial
    log['EXTENDED'] = extended
    printLog(log)

    start_time = time()
    printMessage('Loading files', start_time=start_time, width=20)

    # Extract IMGT files into a temporary directory
    temp_dir, imgt_files = extractIMGT(aligner_file)

    # Count records in IMGT files (drives progress reporting downstream)
    total_count = countDbFile(imgt_files['summary'])

    # Get (parsed) IDs from fasta file submitted to IMGT
    id_dict = getIDforIMGT(seq_file) if seq_file else {}

    # Load supplementary annotation table from 10x Cell Ranger output
    if cellranger_file is not None:
        f = cellranger_extended if extended else cellranger_base
        annotations = readCellRanger(cellranger_file, fields=f)
    else:
        annotations = None

    printMessage('Done', start_time=start_time, end=True, width=20)

    # Define format operators; printError is expected to terminate on failure
    try:
        __, writer, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s.' % format)
    out_args['out_type'] = schema.out_type

    # Define output fields: schema-required columns, plus optional extras
    fields = list(schema.required)
    if extended:
        custom = IMGTReader.customFields(scores=True, regions=True, junction=True,
                                         schema=schema)
        fields.extend(custom)

    # Parse IMGT output and write db
    with open(imgt_files['summary'], 'r') as summary_handle, \
            open(imgt_files['gapped'], 'r') as gapped_handle, \
            open(imgt_files['ntseq'], 'r') as ntseq_handle, \
            open(imgt_files['junction'], 'r') as junction_handle:

        # Open parser over the four IMGT output tables
        parse_iter = IMGTReader(summary_handle, gapped_handle, ntseq_handle,
                                junction_handle)

        # Add germline sequence lazily via a generator when a repertoire is given
        if repo is None:
            germ_iter = parse_iter
        else:
            references = readGermlines(repo)
            # Check for IMGT-gaps ('...' spacers) in germlines
            if all('...' not in x for x in references.values()):
                printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')
            germ_iter = (addGermline(x, references) for x in parse_iter)

        # Write db; must happen inside the with-block since germ_iter is lazy
        output = writeDb(germ_iter, fields=fields, aligner_file=aligner_file,
                         total_count=total_count, annotations=annotations,
                         id_dict=id_dict, asis_id=asis_id, partial=partial,
                         writer=writer, out_file=out_file, out_args=out_args)

    # Cleanup temp directory
    temp_dir.cleanup()

    return output
def updateDbFile(db_file, field, values, updates, out_args=default_out_args):
    """
    Updates field and value pairs to a database file

    Arguments:
      db_file = the database file name
      field = the field to update
      values = a list of values to specifying which rows to update
      updates = a list of values to update each value with
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Console log of run parameters
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'update'
    log['FILE'] = os.path.basename(db_file)
    log['FIELD'] = field
    log['VALUES'] = ','.join(values)
    log['UPDATES'] = ','.join(updates)
    printLog(log)

    # Open input reader and tab-delimited output
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file,
                                  out_label='parse-update',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)

    # Total record count for progress reporting
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count, pass_count = 0, 0
    for rec in db_iter:
        # Progress reflects the previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Apply each value->update pair in order; no break, so an updated
        # value may itself match a later pair (chained updates)
        for old_value, new_value in zip(values, updates):
            if rec[field] == old_value:
                rec[field] = new_value
                pass_count += 1

        # Every record is written, updated or not
        pass_writer.writerow(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['UPDATED'] = pass_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
def collectDbQueue(alive, result_queue, collect_queue, db_file, task_label, out_args,
                   add_fields=None):
    """
    Pulls from results queue, assembles results and manages log and file IO

    Arguments:
      alive : multiprocessing.Value boolean controlling whether processing
              continues; when False function returns
      result_queue : multiprocessing.Queue holding worker results
      collect_queue : multiprocessing.Queue to store collector return values
      db_file : Database file name
      task_label : Task label used to tag the output files
      out_args : Common output argument dictionary from parseCommonArgs
      add_fields : List of fields added to the writer not present in the in_file;
                   if None do not add fields

    Returns:
      None : Adds a dictionary with key value pairs to collect_queue containing
            'log' defining a log object, 'out_files' defining the output file names
    """
    try:
        result_count = countDbFile(db_file)

        # Define output format
        out_type = getFileType(db_file) if out_args['out_type'] is None \
                   else out_args['out_type']

        # Defined valid alignment output handle
        pass_handle = getOutputHandle(db_file, '%s-pass' % task_label,
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_type)
        pass_writer = getDbWriter(pass_handle, db_file, add_fields=add_fields)

        # Defined failed alignment output handle (only when requested)
        if out_args['failed']:
            fail_handle = getOutputHandle(db_file, '%s-fail' % task_label,
                                          out_dir=out_args['out_dir'],
                                          out_name=out_args['out_name'],
                                          out_type=out_type)
            fail_writer = getDbWriter(fail_handle, db_file)
        else:
            # Fix: also initialize fail_writer so the fail branch below never
            # references an undefined name
            fail_handle = None
            fail_writer = None

        # Define log handle
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        # Signal sibling processes to stop before propagating the error
        alive.value = False
        raise

    try:
        # Iterator over results queue until sentinel object reached
        start_time = time()
        set_count = rec_count = pass_count = fail_count = 0
        while alive.value:
            # Get result from queue; busy-wait when empty
            if result_queue.empty():  continue
            else:  result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None:  break

            # Print progress for previous iteration
            printProgress(pass_count, result_count, 0.05, start_time)

            # Update counts for current iteration
            set_count += 1
            rec_count += result.data_count

            # Write log
            if result.log is not None:
                printLog(result.log, handle=log_handle)

            # Write alignments
            if result:
                pass_count += result.data_count
                if isinstance(result.results, IgRecord):
                    pass_writer.writerow(result.results.toDict())
                else:
                    for rec in result.results:
                        pass_writer.writerow(rec.toDict())
            else:
                fail_count += result.data_count
                if fail_handle is not None:
                    # Fix: single failed records were previously written with
                    # pass_writer, contaminating the pass file
                    if isinstance(result.data, IgRecord):
                        fail_writer.writerow(result.data.toDict())
                    else:
                        for rec in result.data:
                            fail_writer.writerow(rec.toDict())
        else:
            # while/else: reached only when alive became False without a
            # sentinel, i.e. a sibling process failed
            sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Print total counts
        printProgress(pass_count, result_count, 0.05, start_time)

        # Update return values
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name)
        log['RECORDS'] = rec_count
        log['GROUPS'] = set_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count
        collect_dict = {'log': log, 'out_files': [pass_handle.name]}
        collect_queue.put(collect_dict)

        # Close file handles
        pass_handle.close()
        if fail_handle is not None:  fail_handle.close()
        if log_handle is not None:  log_handle.close()
    except:
        alive.value = False
        raise

    return None
def insertGaps(db_file, references=None, format=default_format, out_file=None,
               out_args=default_out_args):
    """
    Inserts IMGT numbering into V fields

    Arguments:
      db_file : the database file name.
      references : folder with germline repertoire files. If None, do not
                   updated alignment columns wtih IMGT gaps.
      format : input format.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name
    """
    # Console log of run parameters
    log = OrderedDict()
    log['START'] = 'ConvertDb'
    log['COMMAND'] = 'imgt'
    log['FILE'] = os.path.basename(db_file)
    printLog(log)

    # Resolve reader/writer/schema; printError is expected to terminate on failure
    try:
        reader, writer, schema = getFormatOperators(format)
    except ValueError:
        printError('Invalid format %s.' % format)

    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = reader(db_handle)

    # Verify the columns needed for gap correction are present
    try:
        required = ['sequence_imgt', 'v_germ_start_imgt']
        checkFields(required, db_iter.fields, schema=schema)
    except LookupError as e:
        printError(e)

    # Load germline references
    # NOTE(review): references=None is passed straight to readGermlines despite
    # the docstring's "If None" wording — confirm readGermlines handles None
    reference_dict = readGermlines(references)
    # Warn when germlines lack IMGT '...' spacers
    if all('...' not in x for x in reference_dict.values()):
        printWarning('Germline reference sequences do not appear to contain IMGT-numbering spacers. Results may be incorrect.')

    # Open output writer
    if out_file is None:
        pass_handle = getOutputHandle(db_file,
                                      out_label='gap',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=schema.out_type)
    else:
        pass_handle = open(out_file, 'w')
    pass_writer = writer(pass_handle, fields=db_iter.fields)

    # Total record count for progress reporting
    result_count = countDbFile(db_file)

    # Iterate over records
    start_time = time()
    rec_count, pass_count = 0, 0
    for rec in db_iter:
        # Progress reflects the previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1

        # Correct IMGT-numbered fields against the germline references;
        # records that cannot be corrected are silently dropped (counted as FAIL)
        imgt_dict = correctIMGTFields(rec, reference_dict)
        if imgt_dict is not None:
            pass_count += 1
            rec.setDict(imgt_dict, parse=False)
            pass_writer.writeReceptor(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['PASS'] = pass_count
    log['FAIL'] = rec_count - pass_count
    log['END'] = 'ConvertDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
def convertToChangeo(db_file, out_file=None, out_args=default_out_args):
    """
    Converts an AIRR formatted file into an Change-O formatted file

    Arguments:
      db_file: the database file name.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name.
    """
    # Console log of run parameters
    log = OrderedDict()
    log['START'] = 'ConvertDb'
    log['COMMAND'] = 'changeo'
    log['FILE'] = os.path.basename(db_file)
    printLog(log)

    # Open input
    db_handle = open(db_file, 'rt')
    db_iter = AIRRReader(db_handle)

    # Build the output column list: translate AIRR fields to Receptor names,
    # inserting the paired end field after any field that has a start field
    # present in the input, then de-duplicate and map to Change-O names
    in_fields = [AIRRSchema.toReceptor(f) for f in db_iter.fields]
    out_fields = []
    for f in in_fields:
        out_fields.append(f)
        if f in ReceptorData.end_fields and ReceptorData.end_fields[f][0] in in_fields:
            out_fields.append(ReceptorData.end_fields[f][1])
    out_fields = list(OrderedDict.fromkeys(out_fields))
    out_fields = [ChangeoSchema.fromReceptor(f) for f in out_fields]

    # Open output writer
    if out_file is None:
        pass_handle = getOutputHandle(db_file,
                                      out_label='changeo',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=ChangeoSchema.out_type)
    else:
        pass_handle = open(out_file, 'w')
    pass_writer = ChangeoWriter(pass_handle, fields=out_fields)

    # Total record count for progress reporting
    result_count = countDbFile(db_file)

    # Copy every record across formats
    start_time = time()
    rec_count = 0
    for rec in db_iter:
        # Progress reflects the previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1
        pass_writer.writeReceptor(rec)

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['END'] = 'ConvertDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
def selectDbFile(db_file, fields, values, logic='any', regex=False,
                 out_args=default_out_args):
    """
    Selects records from a database file

    Arguments:
      db_file = the database file name
      fields = a list of fields to check for selection criteria
      values = a list of values defining selection targets
      logic = one of 'any' or 'all' defining whether one or all fields
              must have a match.
      regex = if False do exact full string matches; if True allow partial regex matches.
      out_args = common output argument dictionary from parseCommonArgs

    Returns:
      the output file name
    """
    # Build the per-field match predicate
    if regex:
        def _match_func(x, patterns):
            return any(re.search(p, x) for p in patterns)
    else:
        def _match_func(x, patterns):
            return x in patterns

    # Pick the aggregation over field matches
    if logic == 'any':
        _logic_func = any
    elif logic == 'all':
        _logic_func = all

    # Print console log
    log = OrderedDict()
    log['START'] = 'ParseDb'
    log['COMMAND'] = 'select'
    log['FILE'] = os.path.basename(db_file)
    log['FIELDS'] = ','.join(fields)
    log['VALUES'] = ','.join(values)
    log['REGEX'] = regex
    printLog(log)

    # Open input reader and tab-delimited output
    db_iter = readDbFile(db_file, ig=False)
    pass_handle = getOutputHandle(db_file,
                                  out_label='parse-select',
                                  out_dir=out_args['out_dir'],
                                  out_name=out_args['out_name'],
                                  out_type='tab')
    pass_writer = getDbWriter(pass_handle, db_file)

    # Total record count for progress reporting
    result_count = countDbFile(db_file)

    # Iterate over records, keeping those selected by the logic function
    start_time = time()
    rec_count, pass_count, fail_count = 0, 0, 0
    for rec in db_iter:
        # Progress reflects the previous iteration
        printProgress(rec_count, result_count, 0.05, start_time)
        rec_count += 1

        # Evaluate selection criteria against every requested field
        matched = [_match_func(rec.get(f, False), values) for f in fields]
        if _logic_func(matched):
            pass_count += 1
            pass_writer.writerow(rec)
        else:
            fail_count += 1

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['SELECTED'] = pass_count
    log['DISCARDED'] = fail_count
    log['END'] = 'ParseDb'
    printLog(log)

    # Close file handles
    pass_handle.close()

    return pass_handle.name
def convertToBaseline(db_file, id_field=default_id_field, seq_field=default_seq_field,
                      germ_field=default_germ_field, cluster_field=None,
                      meta_fields=None, out_file=None, out_args=default_out_args):
    """
    Builds fasta files from database records

    Arguments:
      db_file : the database file name.
      id_field : the field containing identifiers.
      seq_field : the field containing sample sequences.
      germ_field : the field containing germline sequences.
      cluster_field : the field containing clonal groupings;
                      if None write the germline for each record.
      meta_fields : a list of fields to add to sequence annotations.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str : output file name
    """
    # Console log of run parameters
    log = OrderedDict()
    log['START'] = 'ConvertDb'
    log['COMMAND'] = 'fasta'
    log['FILE'] = os.path.basename(db_file)
    log['ID_FIELD'] = id_field
    log['SEQ_FIELD'] = seq_field
    log['GERM_FIELD'] = germ_field
    log['CLUSTER_FIELD'] = cluster_field
    if meta_fields is not None:
        log['META_FIELDS'] = ','.join(meta_fields)
    printLog(log)

    # Open input and count records for progress reporting
    db_handle = open(db_file, 'rt')
    db_iter = TSVReader(db_handle)
    result_count = countDbFile(db_file)

    # Open clip-fasta output
    if out_file is None:
        pass_handle = getOutputHandle(db_file,
                                      out_label='sequences',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type='clip')
    else:
        pass_handle = open(out_file, 'w')

    # Iterate over records
    start_time = time()
    rec_count = germ_count = pass_count = fail_count = 0
    cluster_last = None
    for rec in db_iter:
        # Progress reflects the previous iteration
        printProgress(rec_count, result_count, 0.05, start_time=start_time)
        rec_count += 1

        # Current clonal grouping (None when no cluster field was given)
        cluster = rec.get(cluster_field, None)

        # Emit a germline per record, or once per cluster change;
        # the '>' prefix on the id marks germlines in clip-fasta output
        if cluster_field is None:
            germ = buildSeqRecord(rec, id_field, germ_field, meta_fields)
            germ.id = '>' + germ.id
        elif cluster != cluster_last:
            germ = buildSeqRecord(rec, cluster_field, germ_field)
            germ.id = '>' + germ.id
        else:
            germ = None

        # Sample sequence record
        seq = buildSeqRecord(rec, id_field, seq_field, meta_fields)

        # Write germline (when one was produced this iteration)
        if germ is not None:
            germ_count += 1
            SeqIO.write(germ, pass_handle, 'fasta')

        # Write sample sequence
        if seq is not None:
            pass_count += 1
            SeqIO.write(seq, pass_handle, 'fasta')
        else:
            fail_count += 1

        # Remember the cluster for change detection next iteration
        cluster_last = cluster

    # Print counts
    printProgress(rec_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['RECORDS'] = rec_count
    log['GERMLINES'] = germ_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ConvertDb'
    printLog(log)

    # Close file handles
    pass_handle.close()
    db_handle.close()

    return pass_handle.name
def collectQueue(alive, result_queue, collect_queue, db_file, fields,
                 writer=AIRRWriter, out_file=None, out_args=default_out_args):
    """
    Assembles results from a queue of individual sequence results and manages log/file I/O

    Arguments:
      alive = a multiprocessing.Value boolean controlling whether processing
              continues if False exit process
      result_queue : a multiprocessing.Queue holding processQueue results
      collect_queue : a multiprocessing.Queue to store collector return values
      db_file : the input database file name
      fields : list of output field names
      writer : writer class.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs

    Returns:
      None : Adds a dictionary with key value pairs to collect_queue containing
            'log' defining a log object along with the 'pass' and 'fail' output file names.
    """
    # Wrapper for opening handles and writers.
    # NOTE: only the 'pass' file honors an explicit out_file; the 'fail'
    # file name is always derived from db_file.
    def _open(x, f, writer=writer, out_file=out_file):
        if out_file is not None and x == 'pass':
            handle = open(out_file, 'w')
        else:
            handle = getOutputHandle(db_file,
                                     out_label='clone-%s' % x,
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_args['out_type'])
        return handle, writer(handle, fields=f)

    # Open log file
    try:
        # Count input records
        result_count = countDbFile(db_file)

        # Define log handle
        if out_args['log_file'] is None:
            log_handle = None
        else:
            log_handle = open(out_args['log_file'], 'w')
    except:
        # Bare except is deliberate here: any failure (including interrupts)
        # must flip the shared alive flag so sibling processes shut down,
        # then re-raise so the error is not swallowed.
        #sys.stderr.write('Exception in collector file opening step\n')
        alive.value = False
        raise

    # Get results from queue and write to files
    try:
        # Initialize handles, writers and counters.
        # Writers start as None and are opened lazily on first write via the
        # AttributeError fallback below, so no empty output files are created.
        pass_handle, pass_writer = None, None
        fail_handle, fail_writer = None, None
        rec_count, clone_count, pass_count, fail_count = 0, 0, 0, 0
        start_time = time()

        # Iterator over results queue until sentinel object reached.
        # Busy-polls the queue so the shared alive flag is re-checked each
        # pass; the loop's else-clause (below) runs only when alive goes
        # False without the sentinel break, i.e. a sibling process failed.
        while alive.value:
            # Get result from queue
            if result_queue.empty():
                continue
            else:
                result = result_queue.get()
            # Exit upon reaching sentinel
            if result is None:
                break

            # Print progress for previous iteration and update record count
            printProgress(rec_count, result_count, 0.05, start_time=start_time,
                          task='Assigning clones')
            rec_count += len(result.data)

            # Write passed and failed records
            if result:
                # Writing passing sequences; clone numbers are assigned
                # sequentially across the whole run (1, 2, 3, ...)
                for clone in result.results.values():
                    clone_count += 1
                    for i, rec in enumerate(clone, start=1):
                        pass_count += 1
                        rec.setField('clone', str(clone_count))
                        result.log['CLONE%i-%i' % (clone_count, i)] = rec.junction
                        try:
                            pass_writer.writeReceptor(rec)
                        except AttributeError:
                            # Open pass file and define writer object
                            # (first write only; pass_writer was None)
                            pass_handle, pass_writer = _open('pass', fields)
                            pass_writer.writeReceptor(rec)

                # Write failed sequences from passing sets
                if result.data_fail:
                    # Write failed sequences
                    for i, rec in enumerate(result.data_fail, start=1):
                        fail_count += 1
                        result.log['FAIL%i-%i' % (clone_count, i)] = rec.junction
                        if out_args['failed']:
                            try:
                                fail_writer.writeReceptor(rec)
                            except AttributeError:
                                # Open fail file and define writer object
                                fail_handle, fail_writer = _open('fail', fields)
                                fail_writer.writeReceptor(rec)
            else:
                # Write failing records (whole result set failed; logged
                # under the pseudo-clone 'CLONE0')
                for i, rec in enumerate(result.data, start=1):
                    fail_count += 1
                    result.log['CLONE0-%i' % (i)] = rec.junction
                    if out_args['failed']:
                        try:
                            fail_writer.writeReceptor(rec)
                        except AttributeError:
                            # Open fail file and define writer object
                            fail_handle, fail_writer = _open('fail', fields)
                            fail_writer.writeReceptor(rec)

            # Write log
            printLog(result.log, handle=log_handle)
        else:
            # while/else: reached only when alive became False before the
            # sentinel arrived, meaning a sibling process raised.
            sys.stderr.write('PID %s> Error in sibling process detected. Cleaning up.\n' \
                             % os.getpid())
            return None

        # Print total counts
        printProgress(rec_count, result_count, 0.05, start_time=start_time,
                      task='Assigning clones')

        # Update return list
        log = OrderedDict()
        log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None
        log['CLONES'] = clone_count
        log['RECORDS'] = rec_count
        log['PASS'] = pass_count
        log['FAIL'] = fail_count

        # Close file handles and generate return data
        collect_dict = {'log': log, 'pass': None, 'fail': None}
        if pass_handle is not None:
            collect_dict['pass'] = pass_handle.name
            pass_handle.close()
        if fail_handle is not None:
            collect_dict['fail'] = fail_handle.name
            fail_handle.close()
        if log_handle is not None:
            log_handle.close()
        collect_queue.put(collect_dict)
    except:
        # Same deliberate bare except: signal siblings, then propagate.
        alive.value = False
        raise

    return None