def countSeqSets(seq_file, field=default_barcode_field, delimiter=default_delimiter):
    """
    Counts the number of unique sets of sequences sharing the same ID field

    Arguments:
      seq_file : FASTA or FASTQ file containing sample sequences.
      field : annotation field containing set IDs.
      delimiter : tuple of delimiters for (fields, values, value lists).

    Returns:
      int: count of unique set IDs in the sequence file.
    """
    # Count records and check file
    try:
        id_set = set()
        for seq in readSeqFile(seq_file):
            id_set.add(parseAnnotation(seq.description, delimiter=delimiter)[field])
        result_count = len(id_set)
    except IOError:
        printError('File %s cannot be read.' % seq_file)
    except Exception as e:
        printError('File %s is invalid with exception %s.' % (seq_file, e))
    else:
        if result_count == 0:
            printError('File %s is empty.' % seq_file)

    return result_count

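# Example usage: a minimal sketch counting barcode sets in a FASTQ whose headers
# carry pRESTO-style annotations such as '@READ1|BARCODE=ACGT'. The file name
# and the BARCODE field are hypothetical.
n_sets = countSeqSets('reads.fastq', field='BARCODE')
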
def _header(seq, cluster, field=cluster_field, prefix=cluster_prefix,
            delimiter=out_args['delimiter']):
    # Annotate the sequence header with the cluster label and flatten it back
    label = '%s%i' % (prefix, cluster)
    header = parseAnnotation(seq.description, delimiter=delimiter)
    header = mergeAnnotation(header, {field: label}, delimiter=delimiter)
    seq.id = seq.name = flattenAnnotation(header, delimiter=delimiter)
    seq.description = ''
    return seq

def offsetSeqSet(seq_list, offset_dict, field=default_primer_field,
                 mode='pad', delimiter=default_delimiter):
    """
    Pads or cuts the head of a set of sequences according to an offset list

    Arguments:
      seq_list : a list of SeqRecord objects to offset.
      offset_dict : a dictionary of {set ID: offset value}.
      field : the field in the sequence description containing set IDs.
      mode : defines the action taken; one of 'pad' or 'cut'.
      delimiter : a tuple of delimiters for (annotations, field/values, value lists).

    Returns:
      Bio.Align.MultipleSeqAlignment: object containing the alignment.
    """
    ann_list = [parseAnnotation(s.description, delimiter=delimiter) for s in seq_list]
    tag_list = [a[field] for a in ann_list]

    # Pad sequences with offsets
    align_list = []
    if mode == 'pad':
        max_len = max(len(s) + offset_dict[t] for s, t in zip(seq_list, tag_list))
        for rec, tag in zip(seq_list, tag_list):
            new_rec = rec[:]
            new_rec.letter_annotations = {}
            new_rec.seq = '-' * offset_dict[tag] + new_rec.seq
            new_rec.seq += '-' * (max_len - len(new_rec.seq))
            align_list.append(new_rec)
    # Cut sequences to common start position
    elif mode == 'cut':
        max_offset = max(offset_dict.values())
        cut_dict = {k: (max_offset - v) for k, v in offset_dict.items()}
        max_len = max(len(s) - cut_dict[t] for s, t in zip(seq_list, tag_list))
        for rec, tag in zip(seq_list, tag_list):
            new_rec = rec[:]
            new_rec.letter_annotations = {}
            new_rec.seq = new_rec.seq[cut_dict[tag]:]
            new_rec.seq += '-' * (max_len - len(new_rec.seq))
            align_list.append(new_rec)
    else:
        printError('Invalid offset mode.')

    # Convert list to MultipleSeqAlignment object
    align = MultipleSeqAlignment(align_list)

    return align

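# Example: a minimal sketch of the two modes, assuming pRESTO's default
# ('|', '=', ',') annotation delimiters and a hypothetical PRIMER field.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

seqs = [SeqRecord(Seq('ACGT'), id='S1|PRIMER=P1', description='S1|PRIMER=P1'),
        SeqRecord(Seq('CGTA'), id='S2|PRIMER=P2', description='S2|PRIMER=P2')]
offsets = {'P1': 0, 'P2': 1}

# 'pad' prepends gaps: S1 -> ACGT-, S2 -> -CGTA
aln = offsetSeqSet(seqs, offsets, field='PRIMER', mode='pad')
# 'cut' trims to a common start: S1 -> CGT-, S2 -> CGTA
aln = offsetSeqSet(seqs, offsets, field='PRIMER', mode='cut')
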
def indexSeqSets(seq_dict, field=default_barcode_field, delimiter=default_delimiter):
    """
    Identifies sets of sequences with the same ID field

    Arguments:
      seq_dict : a dictionary index of sequences returned from SeqIO.index().
      field : the annotation field containing set IDs.
      delimiter : a tuple of delimiters for (fields, values, value lists).

    Returns:
      dict: dictionary mapping each set name to a list of record names.
    """
    set_dict = {}
    for key, rec in seq_dict.items():
        tag = parseAnnotation(rec.description, delimiter=delimiter)[field]
        set_dict.setdefault(tag, []).append(key)

    return set_dict

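# Example: group record names by a (hypothetical) BARCODE annotation. A sketch
# assuming 'reads.fastq' holds pRESTO-style headers such as '@READ1|BARCODE=ACGT'.
seq_index = readSeqFile('reads.fastq', index=True)
barcode_sets = indexSeqSets(seq_index, field='BARCODE')
# e.g. {'ACGT': ['READ1|BARCODE=ACGT', ...], 'TTTT': [...]}
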
def subsetSeqIndex(seq_dict, field, values, delimiter=default_delimiter):
    """
    Subsets a sequence set by annotation value

    Arguments:
      seq_dict : dictionary index of sequences returned from SeqIO.index().
      field : annotation field to select keys by.
      values : list of annotation values that define the retained keys.
      delimiter : tuple of delimiters for (annotations, field/values, value lists).

    Returns:
      list: list of keys.
    """
    # Parse annotations from seq_dict and subset keys
    key_subset = [k for k in seq_dict
                  if parseAnnotation(seq_dict[k].description,
                                     delimiter=delimiter)[field] in values]

    return key_subset

def subsetSeqSet(seq_iter, field, values, delimiter=default_delimiter):
    """
    Subsets a sequence set by annotation value

    Arguments:
      seq_iter : iterator or list of SeqRecord objects.
      field : annotation field to select by.
      values : list of annotation values that define the retained sequences.
      delimiter : tuple of delimiters for (annotations, field/values, value lists).

    Returns:
      list: modified list of SeqRecord objects.
    """
    # Convert to a list so that iterators can be indexed after parsing
    seq_list = list(seq_iter)

    # Parse annotations from records
    ann_list = [parseAnnotation(s.description, delimiter=delimiter) for s in seq_list]

    # Subset records by annotation
    if not isinstance(values, list):
        values = [values]
    seq_subset = [seq_list[i] for i, a in enumerate(ann_list) if a[field] in values]

    return seq_subset

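# Example: keep only records whose (hypothetical) SAMPLE annotation equals 'A',
# assuming the default ('|', '=', ',') delimiters.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

seqs = [SeqRecord(Seq('ACGT'), id='S1|SAMPLE=A', description='S1|SAMPLE=A'),
        SeqRecord(Seq('CGTA'), id='S2|SAMPLE=B', description='S2|SAMPLE=B')]
subset = subsetSeqSet(seqs, field='SAMPLE', values='A')  # [seqs[0]]
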
def convertGenericHeader(desc, delimiter=default_delimiter):
    """
    Converts any header to the pRESTO format

    Arguments:
      desc : a sequence description string.
      delimiter : a tuple of delimiters for (fields, values, value lists).

    Returns:
      dict: a dictionary of header {field: value} pairs.
    """
    # Replace whitespace and delimiter characters
    sub_regex = r'[%s\s]+' % re.escape(''.join(delimiter))
    conv = re.sub(sub_regex, '_', desc)

    try:
        # Check if modified header is valid
        header = parseAnnotation(conv, delimiter=delimiter)
    except Exception:
        # Assign header to None if header cannot be converted
        header = None

    return header

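# Example: an Illumina-style description is sanitized into a single ID token.
# A sketch assuming the default ('|', '=', ',') pRESTO delimiters; the space is
# replaced by '_' while colons, which are not delimiters, are preserved.
header = convertGenericHeader('M01234:42:000000000-A1B2C 1:N:0:1')
# e.g. OrderedDict([('ID', 'M01234:42:000000000-A1B2C_1:N:0:1')])
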
def groupSeqFile(seq_file, field, threshold=None, out_args=default_out_args):
    """
    Divides a sequence file into segments by description tags

    Arguments:
      seq_file : filename of the sequence file to split.
      field : the annotation field to split seq_file by.
      threshold : the numerical threshold for grouping sequences;
                  if None treat the field as textual.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      list: output file names.
    """
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'group'
    log['FILE'] = os.path.basename(seq_file)
    log['FIELD'] = field
    log['THRESHOLD'] = threshold
    printLog(log)

    # Open file handles
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None:
        out_args['out_type'] = in_type

    # Determine total number of records
    rec_count = countSeqFile(seq_file)

    # Process sequences
    start_time = time()
    seq_count = 0
    if threshold is None:
        # Sort records into files based on textual field
        # Create set of unique field tags
        temp_iter = readSeqFile(seq_file)
        tag_list = getAnnotationValues(temp_iter, field, unique=True,
                                       delimiter=out_args['delimiter'])

        if sys.platform != 'win32':
            import resource
            # Increase open file handle limit if needed
            file_limit = resource.getrlimit(resource.RLIMIT_NOFILE)[0]
            file_count = len(tag_list) + 256
            if file_limit < file_count and file_count <= 8192:
                resource.setrlimit(resource.RLIMIT_NOFILE, (file_count, file_count))
            elif file_count > 8192:
                e = '''
                    OS file limit would need to be set to %i.
                    If you are sure you want to do this, then increase the
                    file limit in the OS (via ulimit) and rerun this tool.
                    ''' % file_count
                printError(dedent(e))

        # Create output handles
        handles_dict = {tag: getOutputHandle(seq_file,
                                             '%s-%s' % (field, tag),
                                             out_dir=out_args['out_dir'],
                                             out_name=out_args['out_name'],
                                             out_type=out_args['out_type'])
                        for tag in tag_list}

        # Iterate over sequences
        for seq in seq_iter:
            printProgress(seq_count, rec_count, 0.05, start_time=start_time)
            seq_count += 1
            # Write sequences
            tag = parseAnnotation(seq.description, delimiter=out_args['delimiter'])[field]
            SeqIO.write(seq, handles_dict[tag], out_args['out_type'])
    else:
        # Sort records into files based on numeric threshold
        threshold = float(threshold)

        # Create output handles
        handles_dict = {'under': getOutputHandle(seq_file,
                                                 'under-%.1g' % threshold,
                                                 out_dir=out_args['out_dir'],
                                                 out_name=out_args['out_name'],
                                                 out_type=out_args['out_type']),
                        'atleast': getOutputHandle(seq_file,
                                                   'atleast-%.1g' % threshold,
                                                   out_dir=out_args['out_dir'],
                                                   out_name=out_args['out_name'],
                                                   out_type=out_args['out_type'])}

        # Iterate over sequences
        for seq in seq_iter:
            printProgress(seq_count, rec_count, 0.05, start_time=start_time)
            seq_count += 1
            # Write sequences
            tag = parseAnnotation(seq.description, delimiter=out_args['delimiter'])[field]
            tag = 'under' if float(tag) < threshold else 'atleast'
            SeqIO.write(seq, handles_dict[tag], out_args['out_type'])

    # Print log
    printProgress(seq_count, rec_count, 0.05, start_time=start_time)
    log = OrderedDict()
    for i, k in enumerate(handles_dict):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(handles_dict[k].name)
    log['SEQUENCES'] = rec_count
    log['PARTS'] = len(handles_dict)
    log['END'] = 'SplitSeq'
    printLog(log)

    # Close output file handles
    for k in handles_dict:
        handles_dict[k].close()

    return [handles_dict[k].name for k in handles_dict]

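# Example usage (hypothetical file and fields): split by a textual SAMPLE tag,
# or by a numeric DUPCOUNT threshold into 'under'/'atleast' output files.
parts = groupSeqFile('reads.fastq', field='SAMPLE')
parts = groupSeqFile('reads.fastq', field='DUPCOUNT', threshold=2)
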
def sortSeqFile(seq_file, field, numeric=False, max_count=None, out_args=default_out_args):
    """
    Sorts a sequence file by annotation fields

    Arguments:
      seq_file : filename of the sequence file to sort.
      field : position of the field in the sequence description to sort by.
      numeric : if True sort the field numerically;
                if False sort the field alphabetically.
      max_count : maximum number of records in each output file;
                  if None do not create multiple files.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      list: output file names.
    """
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'sort'
    log['FILE'] = os.path.basename(seq_file)
    log['FIELD'] = field
    log['NUMERIC'] = numeric
    log['MAX_COUNT'] = max_count
    printLog(log)

    # Open file handles
    in_type = getFileType(seq_file)
    seq_dict = readSeqFile(seq_file, index=True)
    if out_args['out_type'] is None:
        out_args['out_type'] = in_type

    # Get annotations and sort seq_dict by annotation values
    tag_dict = {k: parseAnnotation(seq_dict[k].description,
                                   delimiter=out_args['delimiter'])[field]
                for k in seq_dict}
    if numeric:
        tag_dict = {k: float(v or 0) for k, v in tag_dict.items()}
    sorted_keys = sorted(tag_dict, key=tag_dict.get)

    # Determine total number of records; guard against comparing None to int
    rec_count = len(seq_dict)
    if max_count is not None and max_count >= rec_count:
        max_count = None

    # Open initial output file handle
    file_count = 1
    if max_count is None:
        out_label = 'sorted'
    else:
        out_label = 'sorted-part%06i' % file_count
    out_handle = getOutputHandle(seq_file, out_label,
                                 out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'],
                                 out_type=out_args['out_type'])
    out_files = [out_handle.name]

    # Loop through sorted sequence dictionary keys
    start_time = time()
    last_tag = None
    saved_keys = []
    seq_count = chunk_count = 0
    for key in sorted_keys:
        # Print progress for previous iteration and update count
        printProgress(seq_count, rec_count, 0.05, start_time=start_time)
        seq_count += 1

        # Write saved group of sequences when tag changes
        if last_tag is not None and tag_dict[key] != last_tag:
            # Open new output file if needed
            if max_count is not None and chunk_count + len(saved_keys) > max_count:
                # Update partition counts
                file_count += 1
                chunk_count = 0
                # Open new file handle
                out_handle.close()
                out_handle = getOutputHandle(seq_file,
                                             'sorted-part%06i' % file_count,
                                             out_dir=out_args['out_dir'],
                                             out_name=out_args['out_name'],
                                             out_type=out_args['out_type'])
                # Append output file name to out_files
                out_files.append(out_handle.name)

            # Write saved sequences
            for k in saved_keys:
                chunk_count += 1
                SeqIO.write(seq_dict[k], out_handle, out_args['out_type'])

            # Reset saved keys to current key only
            saved_keys = [key]
        else:
            # Update list of saved keys if tag is unchanged
            saved_keys.append(key)

        # Check if total records reached, write all saved keys, and exit loop
        if seq_count == rec_count:
            for k in saved_keys:
                chunk_count += 1
                SeqIO.write(seq_dict[k], out_handle, out_args['out_type'])
            out_handle.close()
            break

        # Update tag tracker
        last_tag = tag_dict[key]

    # Print log
    printProgress(seq_count, rec_count, 0.05, start_time=start_time)
    log = OrderedDict()
    for i, f in enumerate(out_files):
        log['OUTPUT%i' % (i + 1)] = os.path.basename(f)
    log['SEQUENCES'] = seq_count
    log['PARTS'] = len(out_files)
    log['END'] = 'SplitSeq'
    printLog(log)

    # Close file handles
    out_handle.close()

    return out_files

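# Example usage (hypothetical file and fields): sort alphabetically by barcode,
# or numerically by duplicate count while capping each output file at 10000 records.
sorted_files = sortSeqFile('reads.fastq', field='BARCODE')
sorted_files = sortSeqFile('reads.fastq', field='DUPCOUNT', numeric=True, max_count=10000)
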
def getMaskedSeq(align, mode='mask', barcode=False, delimiter=default_delimiter):
    """
    Creates an output sequence with primers masked or cut

    Arguments:
      align : a PrimerAlignment object returned from alignPrimers or scorePrimers.
      mode : defines the action taken; one of 'cut', 'mask', 'tag' or 'trim'.
      barcode : if True add the sequence preceding the primer to the description.
      delimiter : a tuple of delimiters for (annotations, field/values, value lists).

    Returns:
      SeqRecord: the output sequence.
    """
    seq = align.seq

    # Build output sequence
    if mode == 'tag' or not align.align_primer:
        # Do not modify sequence
        out_seq = seq
    elif mode == 'trim':
        # Remove region before primer
        if not align.rev_primer:
            out_seq = seq[align.start:]
        else:
            out_seq = seq[:align.end]
    elif mode == 'cut':
        # Remove primer and preceding region
        if not align.rev_primer:
            out_seq = seq[align.end:]
        else:
            out_seq = seq[:align.start]
    elif mode == 'mask':
        # Mask primer with Ns and remove preceding region
        if not align.rev_primer:
            mask_len = align.end - align.start + align.gaps
            out_seq = 'N' * mask_len + seq[align.end:]
            if hasattr(seq, 'letter_annotations') and \
                    'phred_quality' in seq.letter_annotations:
                out_seq.letter_annotations['phred_quality'] = \
                    [0] * mask_len + \
                    seq.letter_annotations['phred_quality'][align.end:]
        else:
            mask_len = min(align.end, len(seq)) - align.start + align.gaps
            out_seq = seq[:align.start] + 'N' * mask_len
            if hasattr(seq, 'letter_annotations') and \
                    'phred_quality' in seq.letter_annotations:
                out_seq.letter_annotations['phred_quality'] = \
                    seq.letter_annotations['phred_quality'][:align.start] + \
                    [0] * mask_len

    # Add alignment annotations to output SeqRecord
    out_seq.annotations = seq.annotations
    out_seq.annotations['primer'] = align.primer
    out_seq.annotations['prstart'] = align.start
    out_seq.annotations['error'] = align.error

    # Parse seq annotation and create output annotation
    seq_ann = parseAnnotation(seq.description, delimiter=delimiter)
    out_ann = OrderedDict([('SEQORIENT', seq.annotations['seqorient']),
                           ('PRIMER', align.primer)])

    # Add ID sequence to description
    if barcode:
        seq_code = seq[:align.start].seq if not align.rev_primer \
                   else seq[align.end:].seq
        out_seq.annotations['barcode'] = seq_code
        out_ann['BARCODE'] = seq_code

    out_ann = mergeAnnotation(seq_ann, out_ann, delimiter=delimiter)
    out_seq.id = flattenAnnotation(out_ann, delimiter=delimiter)
    out_seq.description = ''

    return out_seq

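# Example: a minimal sketch of the masking modes using a stand-in for the
# PrimerAlignment attributes this function reads (seq, align_primer, rev_primer,
# start, end, gaps, primer, error). All names and values are hypothetical; a
# real PrimerAlignment comes from alignPrimers or scorePrimers.
from types import SimpleNamespace
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

rec = SeqRecord(Seq('AAAATTTTCCCCGGGG'), id='READ1', description='READ1')
rec.annotations['seqorient'] = 'F'
align = SimpleNamespace(seq=rec, align_primer=True, rev_primer=False,
                        start=4, end=8, gaps=0, primer='P1', error=0.0)

str(getMaskedSeq(align, mode='mask').seq)  # 'NNNNCCCCGGGG'
str(getMaskedSeq(align, mode='cut').seq)   # 'CCCCGGGG'
str(getMaskedSeq(align, mode='trim').seq)  # 'TTTTCCCCGGGG'
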
def modifyHeaders(seq_file, modify_func, modify_args, out_file=None, out_args=default_out_args):
    """
    Modifies sequence headers

    Arguments:
      seq_file : the sequence file name.
      modify_func : the function defining the modification operation.
      modify_args : a dictionary of arguments to pass to modify_func.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str: output file name.
    """
    # Define subcommand label dictionary
    cmd_dict = {addHeader: 'add', copyHeader: 'copy', collapseHeader: 'collapse',
                deleteHeader: 'delete', expandHeader: 'expand', renameHeader: 'rename'}

    # Print parameter info
    log = OrderedDict()
    log['START'] = 'ParseHeaders'
    log['COMMAND'] = cmd_dict.get(modify_func, modify_func.__name__)
    log['FILE'] = os.path.basename(seq_file)
    for k in sorted(modify_args):
        v = modify_args[k]
        log[k.upper()] = ','.join(v) if isinstance(v, list) else v
    printLog(log)

    # Open file handles
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None:
        out_args['out_type'] = in_type
    if out_file is not None:
        out_handle = open(out_file, 'w')
    else:
        out_handle = getOutputHandle(seq_file, 'reheader',
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_args['out_type'])

    # Count records
    result_count = countSeqFile(seq_file)

    # Iterate over sequences
    start_time = time()
    seq_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration
        printProgress(seq_count, result_count, 0.05, start_time=start_time)

        # Update counts
        seq_count += 1

        # Modify header
        header = parseAnnotation(seq.description, delimiter=out_args['delimiter'])
        header = modify_func(header, delimiter=out_args['delimiter'], **modify_args)

        # Write new sequence
        seq.id = seq.name = flattenAnnotation(header, delimiter=out_args['delimiter'])
        seq.description = ''
        SeqIO.write(seq, out_handle, out_args['out_type'])

    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time=start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(out_handle.name)
    log['SEQUENCES'] = seq_count
    log['END'] = 'ParseHeaders'
    printLog(log)

    # Close file handles
    out_handle.close()

    return out_handle.name

def writeDb(db_gen, file_prefix, total_count, id_dict={}, no_parse=True,
            score_fields=False, region_fields=False, out_args=default_out_args):
    """
    Writes a tab-delimited database file in the output directory

    Arguments:
      db_gen : a generator of IgRecord objects containing alignment data.
      file_prefix : directory and prefix for the CLIP tab-delimited file.
      total_count : number of records (for progress bar).
      id_dict : a dictionary of {IMGT ID: full seq description}.
      no_parse : if True do not parse IDs for pRESTO output with default delimiters.
      score_fields : if True add alignment score fields to the output file.
      region_fields : if True add FWR and CDR region fields to the output file.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      None
    """
    pass_file = '%s_db-pass.tab' % file_prefix
    fail_file = '%s_db-fail.tab' % file_prefix
    ordered_fields = ['SEQUENCE_ID', 'SEQUENCE_INPUT', 'FUNCTIONAL', 'IN_FRAME',
                      'STOP', 'MUTATED_INVARIANT', 'INDELS',
                      'V_CALL', 'D_CALL', 'J_CALL',
                      'SEQUENCE_VDJ', 'SEQUENCE_IMGT',
                      'V_SEQ_START', 'V_SEQ_LENGTH',
                      'V_GERM_START_VDJ', 'V_GERM_LENGTH_VDJ',
                      'V_GERM_START_IMGT', 'V_GERM_LENGTH_IMGT',
                      'N1_LENGTH', 'D_SEQ_START', 'D_SEQ_LENGTH',
                      'D_GERM_START', 'D_GERM_LENGTH', 'N2_LENGTH',
                      'J_SEQ_START', 'J_SEQ_LENGTH',
                      'J_GERM_START', 'J_GERM_LENGTH',
                      'JUNCTION_LENGTH', 'JUNCTION']
    if score_fields:
        ordered_fields.extend(['V_SCORE', 'V_IDENTITY', 'V_EVALUE', 'V_BTOP',
                               'J_SCORE', 'J_IDENTITY', 'J_EVALUE', 'J_BTOP'])
    if region_fields:
        ordered_fields.extend(['FWR1_IMGT', 'FWR2_IMGT', 'FWR3_IMGT', 'FWR4_IMGT',
                               'CDR1_IMGT', 'CDR2_IMGT', 'CDR3_IMGT'])

    # TODO: This is not the best approach. Should pass in output fields.
    # Initiate passed handle
    pass_handle = None

    # Open failed file
    if out_args['failed']:
        fail_handle = open(fail_file, 'wt')
        fail_writer = getDbWriter(fail_handle, add_fields=['SEQUENCE_ID', 'SEQUENCE_INPUT'])
    else:
        fail_handle = None
        fail_writer = None

    # Initialize counters and file
    pass_writer = None
    start_time = time()
    rec_count = pass_count = fail_count = 0
    for record in db_gen:
        printProgress(rec_count, total_count, 0.05, start_time)
        rec_count += 1

        # Count pass or fail
        if (record.v_call == 'None' and record.j_call == 'None') or \
                record.functional is None or \
                not record.seq_vdj or \
                not record.junction:
            fail_count += 1
            if fail_writer is not None:
                fail_writer.writerow(record.toDict())
            continue
        else:
            pass_count += 1

        # Build sample sequence description
        if record.id in id_dict:
            record.id = id_dict[record.id]

        # Parse sequence description into new columns
        if not no_parse:
            record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter'])
            record.id = record.annotations['ID']
            del record.annotations['ID']

        # If this is the first sequence, use the parsed description to create
        # new columns and initialize the writer
        if pass_writer is None:
            if not no_parse:
                ordered_fields.extend(list(record.annotations.keys()))
            pass_handle = open(pass_file, 'wt')
            pass_writer = getDbWriter(pass_handle, add_fields=ordered_fields)

        # Write row to tab-delimited CLIP file
        pass_writer.writerow(record.toDict())

    # Print log
    printProgress(rec_count, total_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = pass_file
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'MakeDb'
    printLog(log)

    if pass_handle is not None:
        pass_handle.close()
    if fail_handle is not None:
        fail_handle.close()

def _barcode(seq, field=barcode_field, delimiter=out_args['delimiter']):
    # Build a SeqRecord whose sequence is the barcode annotation value
    header = parseAnnotation(seq.description, delimiter=delimiter)
    return SeqRecord(Seq(header[field]), id=seq.id)

def processASQueue(alive, data_queue, result_queue, align_func, align_args={},
                   calc_div=False, delimiter=default_delimiter):
    """
    Pulls from the data queue, performs calculations, and feeds the results queue

    Arguments:
      alive : a multiprocessing.Value boolean controlling whether processing
              continues; when False the function returns.
      data_queue : a multiprocessing.Queue holding data to process.
      result_queue : a multiprocessing.Queue to hold processed results.
      align_func : the function to use for alignment.
      align_args : a dictionary of optional arguments for the alignment function.
      calc_div : if True perform diversity calculation.
      delimiter : a tuple of delimiters for (annotations, field/values, value lists).

    Returns:
      None
    """
    try:
        # Iterate over the data queue until the sentinel object is reached
        while alive.value:
            # Get data from queue
            if data_queue.empty():
                continue
            else:
                data = data_queue.get()
            # Exit upon reaching sentinel
            if data is None:
                break

            # Define result object
            result = SeqResult(data.id, data.data)
            result.log['BARCODE'] = data.id
            result.log['SEQCOUNT'] = len(data)

            # Perform alignment
            seq_list = data.data
            align_list = align_func(seq_list, **align_args)

            # Process alignment
            if align_list is not None:
                # Calculate diversity
                if calc_div:
                    diversity = calculateDiversity(align_list)
                    result.log['DIVERSITY'] = diversity

                # Restore quality scores
                has_quality = hasattr(seq_list[0], 'letter_annotations') and \
                              'phred_quality' in seq_list[0].letter_annotations
                if has_quality:
                    qual_dict = {seq.id: seq.letter_annotations['phred_quality']
                                 for seq in seq_list}
                    for seq in align_list:
                        qual = deque(qual_dict[seq.id])
                        qual_new = [0 if c == '-' else qual.popleft() for c in seq.seq]
                        seq.letter_annotations['phred_quality'] = qual_new

                # Add alignment to log
                if 'field' in align_args:
                    for i, seq in enumerate(align_list):
                        ann = parseAnnotation(seq.description, delimiter=delimiter)
                        primer = ann[align_args['field']]
                        result.log['ALIGN%i:%s' % (i + 1, primer)] = seq.seq
                else:
                    for i, seq in enumerate(align_list):
                        result.log['ALIGN%i' % (i + 1)] = seq.seq

                # Add alignment to results
                result.results = align_list
                result.valid = True

            # Feed results to result queue
            result_queue.put(result)
        else:
            sys.stderr.write('PID %s: Error in sibling process detected. Cleaning up.\n'
                             % os.getpid())
            return None
    except:
        alive.value = False
        sys.stderr.write('Error processing sequence set with ID: %s.\n' % data.id)
        raise

    return None

def tableHeaders(seq_file, fields, out_args=default_out_args):
    """
    Builds a table of sequence header annotations

    Arguments:
      seq_file : the sequence file name.
      fields : the list of fields to output.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str: the output table file name.
    """
    log = OrderedDict()
    log['START'] = 'ParseHeaders'
    log['COMMAND'] = 'table'
    log['FILE'] = os.path.basename(seq_file)
    printLog(log)

    # Open file handles
    seq_iter = readSeqFile(seq_file)
    out_handle = getOutputHandle(seq_file, out_label='headers',
                                 out_dir=out_args['out_dir'],
                                 out_name=out_args['out_name'],
                                 out_type='tab')

    # Count records
    result_count = countSeqFile(seq_file)

    # Open csv writer and write header
    out_writer = csv.DictWriter(out_handle, extrasaction='ignore', restval='',
                                delimiter='\t', fieldnames=fields)
    out_writer.writeheader()

    # Iterate over sequences
    start_time = time()
    seq_count = pass_count = fail_count = 0
    for seq in seq_iter:
        # Print progress for previous iteration
        printProgress(seq_count, result_count, 0.05, start_time)

        # Get annotations
        seq_count += 1
        ann = parseAnnotation(seq.description, fields, delimiter=out_args['delimiter'])

        # Write records
        if ann:
            pass_count += 1
            out_writer.writerow(ann)
        else:
            fail_count += 1

    # Print counts
    printProgress(seq_count, result_count, 0.05, start_time)
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(out_handle.name)
    log['SEQUENCES'] = seq_count
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'ParseHeaders'
    printLog(log)

    # Close file handles
    out_handle.close()

    return out_handle.name

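# Example usage (hypothetical file and fields): write a tab-delimited table of
# the BARCODE and PRIMER annotations found in each header.
table_file = tableHeaders('reads.fastq', fields=['BARCODE', 'PRIMER'])
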
def collapseSeq(seq_file, max_missing=default_max_missing, uniq_fields=None,
                copy_fields=None, copy_actions=None, max_field=None, min_field=None,
                inner=False, keep_missing=False, out_file=None, out_args=default_out_args):
    """
    Removes duplicate sequences from a file

    Arguments:
      seq_file : filename of the sequence file to sample from.
      max_missing : number of ambiguous characters to allow in a unique sequence.
      uniq_fields : a list of annotations that define a sequence as unique if they differ.
      copy_fields : a list of annotations to copy into unique sequence annotations.
      copy_actions : the list of collapseAnnotation actions to take on copy_fields.
      max_field : a numeric field whose maximum value determines the retained sequence.
      min_field : a numeric field whose minimum value determines the retained sequence.
      inner : if True exclude consecutive outer ambiguous characters from iterations
              and matching.
      keep_missing : if True retain sequences with more ambiguous characters than
                     max_missing as unique.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str: the collapsed output file name.
    """
    log = OrderedDict()
    log['START'] = 'CollapseSeq'
    log['FILE'] = os.path.basename(seq_file)
    log['MAX_MISSING'] = max_missing
    log['UNIQ_FIELDS'] = ','.join([str(x) for x in uniq_fields]) \
                         if uniq_fields is not None else None
    log['COPY_FIELDS'] = ','.join([str(x) for x in copy_fields]) \
                         if copy_fields is not None else None
    log['COPY_ACTIONS'] = ','.join([str(x) for x in copy_actions]) \
                          if copy_actions is not None else None
    log['MAX_FIELD'] = max_field
    log['MIN_FIELD'] = min_field
    log['INNER'] = inner
    log['KEEP_MISSING'] = keep_missing
    printLog(log)

    # Read input file
    in_type = getFileType(seq_file)
    seq_dict = SeqIO.to_dict(readSeqFile(seq_file, index=False))
    if out_args['out_type'] is None:
        out_args['out_type'] = in_type

    # Count total sequences
    rec_count = len(seq_dict)

    # Open unique record output handle
    if out_file is not None:
        pass_handle = open(out_file, 'w')
    else:
        pass_handle = getOutputHandle(seq_file, 'collapse-unique',
                                      out_dir=out_args['out_dir'],
                                      out_name=out_args['out_name'],
                                      out_type=out_args['out_type'])

    # Define log handle
    if out_args['log_file'] is None:
        log_handle = None
    else:
        log_handle = open(out_args['log_file'], 'w')

    # Find sequences with duplicates
    uniq_dict = {}
    # Use list typing for compatibility with Python 2.7.5 on OS X
    # (TypeError: object of type 'dictionary-keyiterator' has no len())
    search_keys = list(seq_dict.keys())
    dup_keys = []
    for n in range(0, max_missing + 1):
        # Find unique sequences
        uniq_dict, search_keys, dup_list = findUniqueSeq(uniq_dict, search_keys, seq_dict, n,
                                                         uniq_fields, copy_fields,
                                                         max_field, min_field, inner,
                                                         out_args['delimiter'])
        # Update list of duplicates
        dup_keys.extend(dup_list)

        # Break if no keys to search remain
        if len(search_keys) == 0:
            break

    # Write unique sequences
    for val in uniq_dict.values():
        # Define output sequence
        out_seq = val.seq
        out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter'])

        # Define copied fields and DUPCOUNT annotation
        out_app = OrderedDict()
        if copy_fields is not None and copy_actions is not None:
            for f, a in zip(copy_fields, copy_actions):
                x = collapseAnnotation(val.annotations, a, f, delimiter=out_args['delimiter'])
                out_app[f] = x[f]
                out_ann.pop(f, None)
        out_app['DUPCOUNT'] = val.count
        out_ann = mergeAnnotation(out_ann, out_app, delimiter=out_args['delimiter'])
        out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter'])
        out_seq.description = ''

        # Write unique sequence
        SeqIO.write(out_seq, pass_handle, out_args['out_type'])

        # Update log
        log = OrderedDict()
        log['HEADER'] = out_seq.id
        log['DUPCOUNT'] = val.count
        for i, k in enumerate(val.keys, start=1):
            log['ID%i' % i] = k
        for i, k in enumerate(val.keys, start=1):
            log['SEQ%i' % i] = str(seq_dict[k].seq)
        printLog(log, handle=log_handle)

    # Write retained sequences with high missing character counts
    if keep_missing:
        for k in search_keys:
            out_seq = seq_dict[k]
            out_ann = parseAnnotation(out_seq.description, delimiter=out_args['delimiter'])
            out_ann = mergeAnnotation(out_ann, {'DUPCOUNT': 1}, delimiter=out_args['delimiter'])
            out_seq.id = out_seq.name = flattenAnnotation(out_ann, delimiter=out_args['delimiter'])
            out_seq.description = ''
            SeqIO.write(out_seq, pass_handle, out_args['out_type'])

    # Write failed sequences with high missing character counts
    if out_args['failed'] and not keep_missing:
        with getOutputHandle(seq_file, 'collapse-undetermined',
                             out_dir=out_args['out_dir'],
                             out_name=out_args['out_name'],
                             out_type=out_args['out_type']) as missing_handle:
            for k in search_keys:
                SeqIO.write(seq_dict[k], missing_handle, out_args['out_type'])

    # Write duplicate sequences
    if out_args['failed']:
        with getOutputHandle(seq_file, 'collapse-duplicate',
                             out_dir=out_args['out_dir'],
                             out_name=out_args['out_name'],
                             out_type=out_args['out_type']) as dup_handle:
            for k in dup_keys:
                SeqIO.write(seq_dict[k], dup_handle, out_args['out_type'])

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(pass_handle.name)
    log['SEQUENCES'] = rec_count
    log['UNIQUE'] = len(uniq_dict)
    log['DUPLICATE'] = len(dup_keys)
    log['UNDETERMINED'] = len(search_keys)
    log['END'] = 'CollapseSeq'
    printLog(log)

    # Close file handles
    pass_file = pass_handle.name
    pass_handle.close()
    if log_handle is not None:
        log_handle.close()

    return pass_file

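# Example usage (hypothetical file and fields): collapse exact duplicates,
# treating records with different SAMPLE values as distinct, and combining their
# DUPCOUNT annotations with the 'sum' action (assumed to be one of the available
# collapseAnnotation actions).
unique_file = collapseSeq('reads.fastq', max_missing=0, uniq_fields=['SAMPLE'],
                          copy_fields=['DUPCOUNT'], copy_actions=['sum'])
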
def findUniqueSeq(uniq_dict, search_keys, seq_dict, max_missing=default_max_missing,
                  uniq_fields=None, copy_fields=None, max_field=None, min_field=None,
                  inner=False, delimiter=default_delimiter):
    """
    Finds unique sequences

    Arguments:
      uniq_dict : a dictionary of unique sequences generated by findUniqueSeq().
      search_keys : a list containing the subset of dictionary keys to be checked.
      seq_dict : a SeqRecord dictionary generated by SeqIO.index().
      max_missing : the number of missing characters to allow in a unique sequence.
      uniq_fields : a list of annotations that define a sequence as unique if they differ.
      copy_fields : a list of annotations to copy into unique sequence annotations.
      max_field : a numeric field whose maximum value determines the retained sequence.
      min_field : a numeric field whose minimum value determines the retained sequence.
      inner : if True exclude consecutive outer ambiguous characters from iterations
              and matching.
      delimiter : description field delimiter.

    Returns:
      tuple: (uniq_dict, search_keys, dup_keys) modified from the passed values.
    """
    # Define local variables
    ambig_re = re.compile(r'[\.\-N]')
    score = (max_missing > 0)
    dup_keys = []
    to_remove = []

    start_time = time()
    result_count = len(search_keys)
    # Iterate over search keys and update uniq_dict and dup_keys
    for idx, key in enumerate(search_keys):
        # Print progress of previous iteration
        printProgress(idx, result_count, 0.05, start_time=start_time,
                      task='%i missing' % max_missing)

        # Define sequence to process
        seq = seq_dict[key]
        seq_str = str(seq.seq)
        if inner:
            seq_str = seq_str.strip('.-N')

        # Skip processing of ambiguous sequences over max_missing threshold
        ambig_count = len(ambig_re.findall(seq_str))
        if ambig_count > max_missing:
            continue

        # Parse annotation and define unique identifiers (uid)
        if uniq_fields is not None:
            ann = parseAnnotation(seq_dict[key].description, uniq_fields,
                                  delimiter=delimiter)
            uid = tuple(chain([seq_str], list(ann.values())))
        else:
            uid = (seq_str, None)

        # Parse annotation and define copied identifiers (cid)
        if copy_fields is not None:
            ann = parseAnnotation(seq.description, copy_fields, delimiter=delimiter)
            cid = {k: [ann.get(k)] for k in copy_fields}
        else:
            cid = {}

        # Store new unique sequences and process duplicates
        match = findUID(uid, uniq_dict, score)
        if match is None:
            uniq_dict[uid] = DuplicateSet(seq, key=key, missing=ambig_count,
                                          annotations=cid)
        else:
            # Update sequence, count, ambiguous character count, and count sets
            dup_key = key
            uniq_dict[match].count += 1
            uniq_dict[match].keys.append(key)
            for k, v in cid.items():
                uniq_dict[match].annotations[k].extend(v)

            # Check whether to replace previous unique sequence with current sequence
            if ambig_count <= uniq_dict[match].missing:
                swap = False
                seq_last = uniq_dict[match].seq
                if max_field is not None:
                    swap = float(parseAnnotation(seq.description,
                                                 delimiter=delimiter)[max_field]) > \
                           float(parseAnnotation(seq_last.description,
                                                 delimiter=delimiter)[max_field])
                elif min_field is not None:
                    # Retain the sequence with the smaller min_field value
                    swap = float(parseAnnotation(seq.description,
                                                 delimiter=delimiter)[min_field]) < \
                           float(parseAnnotation(seq_last.description,
                                                 delimiter=delimiter)[min_field])
                # TODO: quality evaluation is a bottleneck
                else:
                    if hasattr(seq, 'letter_annotations') and \
                            'phred_quality' in seq.letter_annotations:
                        q_this = float(sum(seq.letter_annotations['phred_quality'])) / len(seq)
                        q_last = float(sum(seq_last.letter_annotations['phred_quality'])) / len(seq_last)
                        swap = q_this > q_last

                # Replace old sequence if criteria passed
                if swap:
                    dup_key = seq_last.id
                    uniq_dict[match].seq = seq
                    uniq_dict[match].missing = ambig_count

            # Update duplicate list
            dup_keys.append(dup_key)

        # Mark sequence for removal from later steps
        to_remove.append(idx)

    # Remove matched sequences from search_keys
    for j in reversed(to_remove):
        del search_keys[j]

    # Update progress
    printProgress(result_count, result_count, 0.05, start_time=start_time,
                  task='%i missing' % max_missing)

    return (uniq_dict, search_keys, dup_keys)

def processAssembly(data, assemble_func, assemble_args={}, rc=None,
                    fields_1=None, fields_2=None, delimiter=default_delimiter):
    """
    Performs assembly of a sequence pair

    Arguments:
      data : a SeqData object with a list of exactly two SeqRecords.
      assemble_func : the function to use to assemble paired ends.
      assemble_args : a dictionary of arguments to pass to the assembly function.
      rc : defines which sequences ('head', 'tail', 'both') to reverse complement
           before assembly; if None do not reverse complement sequences.
      fields_1 : list of annotations in the head SeqRecord to copy to the
                 assembled record; if None do not copy an annotation.
      fields_2 : list of annotations in the tail SeqRecord to copy to the
                 assembled record; if None do not copy an annotation.
      delimiter : a tuple of delimiters for (fields, values, value lists).

    Returns:
      SeqResult: the assembly result.
    """
    # Reverse complement sequences if required
    head_seq = data.data[0] if rc not in ('head', 'both') \
               else reverseComplement(data.data[0])
    tail_seq = data.data[1] if rc not in ('tail', 'both') \
               else reverseComplement(data.data[1])

    # Define result object
    result = SeqResult(data.id, [head_seq, tail_seq])

    # Define stitched sequence annotation
    stitch_ann = OrderedDict([('ID', data.id)])
    if fields_1 is not None:
        head_ann = parseAnnotation(head_seq.description, fields_1,
                                   delimiter=delimiter)
        stitch_ann = mergeAnnotation(stitch_ann, head_ann, delimiter=delimiter)
        result.log['FIELDS1'] = '|'.join(['%s=%s' % (k, v)
                                          for k, v in head_ann.items()])
    if fields_2 is not None:
        tail_ann = parseAnnotation(tail_seq.description, fields_2,
                                   delimiter=delimiter)
        stitch_ann = mergeAnnotation(stitch_ann, tail_ann, delimiter=delimiter)
        result.log['FIELDS2'] = '|'.join(['%s=%s' % (k, v)
                                          for k, v in tail_ann.items()])

    # Assemble sequences
    stitch = assemble_func(head_seq, tail_seq, **assemble_args)
    ab = stitch.head_pos
    xy = stitch.tail_pos
    result.valid = stitch.valid

    # Add reference to log
    if stitch.ref_seq is not None and stitch.ref_pos is not None:
        result.log['REFID'] = stitch.ref_seq.id
        result.log['REFSEQ'] = ' ' * stitch.ref_pos[0] + stitch.ref_seq.seq

    if ab is not None and xy is not None:
        result.log['SEQ1'] = ' ' * xy[0] + head_seq.seq
        result.log['SEQ2'] = ' ' * ab[0] + tail_seq.seq
    else:
        result.log['SEQ1'] = head_seq.seq
        result.log['SEQ2'] = ' ' * (len(head_seq) + (stitch.gap or 0)) + tail_seq.seq

    # Define stitching log
    if stitch.seq is not None:
        # Update stitch annotation
        stitch.seq.id = flattenAnnotation(stitch_ann, delimiter=delimiter)
        stitch.seq.name = stitch.seq.id
        stitch.seq.description = ''
        result.results = stitch.seq
        # Add assembly to log
        result.log['ASSEMBLY'] = stitch.seq.seq
        if 'phred_quality' in stitch.seq.letter_annotations:
            result.log['QUALITY'] = ''.join([chr(q + 33) for q in
                                             stitch.seq.letter_annotations['phred_quality']])
        result.log['LENGTH'] = len(stitch)
        result.log['OVERLAP'] = stitch.overlap
    else:
        result.log['ASSEMBLY'] = None

    # Add mode specific log results
    if stitch.gap is not None:
        result.log['GAP'] = stitch.gap
    if stitch.error is not None:
        result.log['ERROR'] = '%.4f' % stitch.error
    if stitch.pvalue is not None:
        result.log['PVALUE'] = '%.4e' % stitch.pvalue
    if stitch.evalue is not None:
        result.log['EVALUE1'] = '%.4e' % stitch.evalue[0]
        result.log['EVALUE2'] = '%.4e' % stitch.evalue[1]
    if stitch.ident is not None:
        result.log['IDENTITY'] = '%.4f' % stitch.ident

    return result

def selectSeqFile(seq_file, field, value_list=None, value_file=None, negate=False,
                  out_file=None, out_args=default_out_args):
    """
    Selects records from a sequence file

    Arguments:
      seq_file : filename of the sequence file to sample from.
      field : the annotation field to check for required values.
      value_list : a list of annotation values that a sample must contain one of.
      value_file : a tab-delimited file containing values to select.
      negate : if True select entries that do not contain the specified values.
      out_file : output file name. Automatically generated from the input file if None.
      out_args : common output argument dictionary from parseCommonArgs.

    Returns:
      str: output file name.
    """
    # Reads value_file
    def _read_file(value_file, field):
        field_list = []
        try:
            with open(value_file, 'rt') as handle:
                reader_dict = csv.DictReader(handle, dialect='excel-tab')
                for row in reader_dict:
                    field_list.append(row[field])
        except IOError:
            printError('File %s cannot be read.' % value_file)
        except Exception:
            printError('File %s is invalid.' % value_file)

        return field_list

    # Print console log
    log = OrderedDict()
    log['START'] = 'SplitSeq'
    log['COMMAND'] = 'select'
    log['FILE'] = os.path.basename(seq_file)
    log['FIELD'] = field
    if value_list is not None:
        log['VALUE_LIST'] = ','.join([str(x) for x in value_list])
    if value_file is not None:
        log['VALUE_FILE'] = os.path.basename(value_file)
    log['NOT'] = negate
    printLog(log)

    # Read value_file
    if value_list is not None and value_file is not None:
        printError('Specify only one of value_list and value_file.')
    elif value_file is not None:
        value_list = _read_file(value_file, field)

    # Read sequence file
    in_type = getFileType(seq_file)
    seq_iter = readSeqFile(seq_file)
    if out_args['out_type'] is None:
        out_args['out_type'] = in_type

    # Open output handle
    if out_file is not None:
        out_handle = open(out_file, 'w')
    else:
        out_handle = getOutputHandle(seq_file, 'selected',
                                     out_dir=out_args['out_dir'],
                                     out_name=out_args['out_name'],
                                     out_type=out_args['out_type'])

    # Generate subset of records
    start_time = time()
    pass_count, fail_count, rec_count = 0, 0, 0
    value_set = set(value_list)
    for rec in seq_iter:
        printCount(rec_count, 1e5, start_time=start_time)
        rec_count += 1

        # Parse annotation into a list of values
        ann = parseAnnotation(rec.description, delimiter=out_args['delimiter'])[field]
        ann = ann.split(out_args['delimiter'][2])

        # Write records containing (or, with negate=True, lacking) the values
        if xor(negate, not value_set.isdisjoint(ann)):
            SeqIO.write(rec, out_handle, out_args['out_type'])
            pass_count += 1
        else:
            fail_count += 1
    printCount(rec_count, 1e5, start_time=start_time, end=True)

    # Print log
    log = OrderedDict()
    log['OUTPUT'] = os.path.basename(out_handle.name)
    log['PASS'] = pass_count
    log['FAIL'] = fail_count
    log['END'] = 'SplitSeq'
    printLog(log)

    # Close file handles
    out_handle.close()

    return out_handle.name

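# Example usage (hypothetical file, field, and values): keep records whose
# SAMPLE annotation is A or B, then invert the selection with negate=True.
selected = selectSeqFile('reads.fastq', field='SAMPLE', value_list=['A', 'B'])
excluded = selectSeqFile('reads.fastq', field='SAMPLE', value_list=['A', 'B'], negate=True)
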
def writeDb(records, fields, aligner_file, total_count, id_dict=None, annotations=None, amino_acid=False, partial=False, asis_id=True, regions='default', writer=AIRRWriter, out_file=None, out_args=default_out_args): """ Writes parsed records to an output file Arguments: records : a iterator of Receptor objects containing alignment data. fields : a list of ordered field names to write. aligner_file : input file name. total_count : number of records (for progress bar). id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID. annotations : additional annotation dictionary. amino_acid : if True do verification on amino acid fields. partial : if True put incomplete alignments in the pass file. asis_id : if ID is to be parsed for pRESTO output with default delimiters. regions (str): name of the IMGT FWR/CDR region definitions to use. writer : writer class. out_file : output file name. Automatically generated from the input file if None. out_args : common output argument dictionary from parseCommonArgs. Returns: None """ # Wrapper for opening handles and writers def _open(x, f, writer=writer, out_file=out_file): if out_file is not None and x == 'pass': handle = open(out_file, 'w') else: handle = getOutputHandle(aligner_file, out_label='db-%s' % x, out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) return handle, writer(handle, fields=f) # Function to convert fasta header annotations to changeo columns def _changeo(f, header): h = [ChangeoSchema.fromReceptor(x) for x in header if x.upper() not in f] f.extend(h) return f def _airr(f, header): h = [AIRRSchema.fromReceptor(x) for x in header if x.lower() not in f] f.extend(h) return f # Function to verify IMGT-gapped sequence and junction concur def _imgt_check(rec): try: if amino_acid: rd = RegionDefinition(rec.junction_aa_length, amino_acid=amino_acid, definition=regions) x, y = rd.positions['junction'] check = (rec.junction_aa == rec.sequence_aa_imgt[x:y]) else: rd = RegionDefinition(rec.junction_length, amino_acid=amino_acid, definition=regions) x, y = rd.positions['junction'] check = (rec.junction == rec.sequence_imgt[x:y]) except (TypeError, AttributeError): check = False return check # Function to check for valid records strictly def _strict(rec): if amino_acid: valid = [rec.v_call and rec.v_call != 'None', rec.j_call and rec.j_call != 'None', rec.functional is not None, rec.sequence_aa_imgt, rec.junction_aa, _imgt_check(rec)] else: valid = [rec.v_call and rec.v_call != 'None', rec.j_call and rec.j_call != 'None', rec.functional is not None, rec.sequence_imgt, rec.junction, _imgt_check(rec)] return all(valid) # Function to check for valid records loosely def _gentle(rec): valid = [rec.v_call and rec.v_call != 'None', rec.d_call and rec.d_call != 'None', rec.j_call and rec.j_call != 'None'] return any(valid) # Set writer class and annotation conversion function if writer == ChangeoWriter: _annotate = _changeo elif writer == AIRRWriter: _annotate = _airr else: printError('Invalid output writer.') # Additional annotation (e.g. 
10X cell calls) # _append_table = None # if cellranger_file is not None: # with open(cellranger_file) as csv_file: # # Read in annotation file (use Sniffer to discover file delimiters) # dialect = csv.Sniffer().sniff(csv_file.readline()) # csv_file.seek(0) # csv_reader = csv.DictReader(csv_file, dialect = dialect) # # # Generate annotation dictionary # anntab_dict = {entry['contig_id']: {cellranger_map[field]: entry[field] \ # for field in cellranger_map.keys()} for entry in csv_reader} # # fields = _annotate(fields, cellranger_map.values()) # _append_table = lambda sequence_id: anntab_dict[sequence_id] # Set pass criteria _pass = _gentle if partial else _strict # Define log handle if out_args['log_file'] is None: log_handle = None else: log_handle = open(out_args['log_file'], 'w') # Initialize handles, writers and counters pass_handle, pass_writer = None, None fail_handle, fail_writer = None, None pass_count, fail_count = 0, 0 start_time = time() # Validate and write output printProgress(0, total_count, 0.05, start_time=start_time) for i, record in enumerate(records, start=1): # Replace sequence description with full string, if required if id_dict is not None and record.sequence_id in id_dict: record.sequence_id = id_dict[record.sequence_id] # Parse sequence description into new columns if not asis_id: try: ann_raw = parseAnnotation(record.sequence_id) record.sequence_id = ann_raw.pop('ID') # Convert to Receptor fields ann_parsed = OrderedDict() for k, v in ann_raw.items(): ann_parsed[ChangeoSchema.toReceptor(k)] = v # Add annotations to Receptor and update field list record.setDict(ann_parsed, parse=True) if i == 1: fields = _annotate(fields, ann_parsed.keys()) except IndexError: # Could not parse pRESTO-style annotations so fall back to no parse asis_id = True printWarning('Sequence annotation format not recognized. Sequence headers will not be parsed.') # Add supplemental annotation fields # if _append_table is not None: # record.setDict(_append_table(record.sequence_id), parse=True) if annotations is not None: record.setDict(annotations[record.sequence_id], parse=True) if i == 1: fields = _annotate(fields, annotations[record.sequence_id].keys()) # Count pass or fail and write to appropriate file if _pass(record): pass_count += 1 # Write row to pass file try: pass_writer.writeReceptor(record) except AttributeError: # Open pass file and writer pass_handle, pass_writer = _open('pass', fields) pass_writer.writeReceptor(record) else: fail_count += 1 # Write row to fail file if specified if out_args['failed']: try: fail_writer.writeReceptor(record) except AttributeError: # Open fail file and writer fail_handle, fail_writer = _open('fail', fields) fail_writer.writeReceptor(record) # Write log if log_handle is not None: log = OrderedDict([('ID', record.sequence_id), ('V_CALL', record.v_call), ('D_CALL', record.d_call), ('J_CALL', record.j_call), ('PRODUCTIVE', record.functional)]) if not _imgt_check(record) and not amino_acid: log['ERROR'] = 'Junction does not match the sequence starting at position 310 in the IMGT numbered V(D)J sequence.' 
printLog(log, log_handle) # Print progress printProgress(i, total_count, 0.05, start_time=start_time) # Print console log log = OrderedDict() log['OUTPUT'] = os.path.basename(pass_handle.name) if pass_handle is not None else None log['PASS'] = pass_count log['FAIL'] = fail_count log['END'] = 'MakeDb' printLog(log) # Close file handles output = {'pass': None, 'fail': None} if pass_handle is not None: output['pass'] = pass_handle.name pass_handle.close() if fail_handle is not None: output['fail'] = fail_handle.name fail_handle.close() return output
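A note on the output handling above: the pass and fail writers start as None, and the first write triggers the AttributeError branch that opens the file, so empty pass/fail files are never created. Below is a minimal, self-contained sketch of the same lazy-open pattern; all names here are illustrative and not part of the Change-O API.

import csv

def write_records(records, predicate, pass_file='pass.tsv', fail_file='fail.tsv'):
    # Writers start unset; files are created only on the first matching record
    pass_writer = fail_writer = None
    handles = []
    try:
        for rec in records:
            if predicate(rec):
                if pass_writer is None:
                    handle = open(pass_file, 'w', newline='')
                    handles.append(handle)
                    pass_writer = csv.DictWriter(handle, fieldnames=sorted(rec), delimiter='\t')
                    pass_writer.writeheader()
                pass_writer.writerow(rec)
            else:
                if fail_writer is None:
                    handle = open(fail_file, 'w', newline='')
                    handles.append(handle)
                    fail_writer = csv.DictWriter(handle, fieldnames=sorted(rec), delimiter='\t')
                    fail_writer.writeheader()
                fail_writer.writerow(rec)
    finally:
        for handle in handles:
            handle.close()

The try/except form used by writeDb and the explicit None check above are equivalent; in both cases the point is that file creation is deferred until a record of that class actually arrives.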
def modifyHeaders(seq_file, modify_func, modify_args, out_args=default_out_args): """ Modifies sequence headers Arguments: seq_file = the sequence file name modify_func = the function defining the modification operation modify_args = a dictionary of arguments to pass to modify_func out_args = common output argument dictionary from parseCommonArgs Returns: the output file name """ # Define subcommand label dictionary cmd_dict = {addHeader: 'add', copyHeader: 'copy', collapseHeader: 'collapse', deleteHeader: 'delete', expandHeader: 'expand', renameHeader: 'rename'} # Print parameter info log = OrderedDict() log['START'] = 'ParseHeaders' log['COMMAND'] = cmd_dict.get(modify_func, modify_func.__name__) log['FILE'] = os.path.basename(seq_file) for k in sorted(modify_args): v = modify_args[k] log[k.upper()] = ','.join(v) if isinstance(v, list) else v printLog(log) # Open file handles in_type = getFileType(seq_file) seq_iter = readSeqFile(seq_file) if out_args['out_type'] is None: out_args['out_type'] = in_type out_handle = getOutputHandle(seq_file, 'reheader', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type=out_args['out_type']) # Count records result_count = countSeqFile(seq_file) # Iterate over sequences start_time = time() seq_count = 0 for seq in seq_iter: # Print progress for previous iteration printProgress(seq_count, result_count, 0.05, start_time) # Update counts seq_count += 1 # Modify header header = parseAnnotation(seq.description, delimiter=out_args['delimiter']) header = modify_func(header, delimiter=out_args['delimiter'], **modify_args) # Write new sequence seq.id = seq.name = flattenAnnotation(header, delimiter=out_args['delimiter']) seq.description = '' SeqIO.write(seq, out_handle, out_args['out_type']) # Print counts printProgress(seq_count, result_count, 0.05, start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(out_handle.name) log['SEQUENCES'] = seq_count log['END'] = 'ParseHeaders' printLog(log) # Close file handles out_handle.close() return out_handle.name
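A hypothetical invocation, assuming renameHeader accepts fields and names keyword arguments as the log block above suggests; the file and annotation names are illustrative only.

# Rename the PRIMER annotation to BARCODE in every header of a FASTQ file.
# 'sample_reads.fastq', 'PRIMER', and 'BARCODE' are placeholder values.
out_file = modifyHeaders('sample_reads.fastq', renameHeader,
                         {'fields': ['PRIMER'], 'names': ['BARCODE']})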
def processQueue(alive, data_queue, result_queue, cluster_func, cluster_args={}, cluster_field=default_cluster_field, cluster_prefix=default_cluster_prefix, delimiter=default_delimiter): """ Pulls from data queue, performs calculations, and feeds results queue Arguments: alive : a multiprocessing.Value boolean controlling whether processing continues; when False function returns. data_queue : a multiprocessing.Queue holding data to process. result_queue : a multiprocessing.Queue to hold processed results. cluster_func : the function to use for clustering. cluster_args : a dictionary of optional arguments for the clustering function. cluster_field : string defining the output cluster field name. cluster_prefix : string defining a prefix for the cluster identifier. delimiter : a tuple of delimiters for (annotations, field/values, value lists). Returns: None """ try: # Iterator over data queue until sentinel object reached while alive.value: # Get data from queue if data_queue.empty(): continue else: data = data_queue.get() # Exit upon reaching sentinel if data is None: break # Define result object result = SeqResult(data.id, data.data) result.log['BARCODE'] = data.id result.log['SEQCOUNT'] = len(data) # Perform clustering cluster_dict = cluster_func(data.data, **cluster_args) # Process failed result if cluster_dict is None: # Update log result.log['CLUSTERS'] = 0 for i, seq in enumerate(data.data, start=1): result.log['CLUST0-%i' % i] = str(seq.seq) # Feed results queue and continue result_queue.put(result) continue # Get number of clusters result.log['CLUSTERS'] = len(cluster_dict) # Update sequence annotations with cluster assignments results = list() seq_dict = {s.id: s for s in data.data} for cluster, id_list in cluster_dict.items(): for i, seq_id in enumerate(id_list, start=1): # Add cluster annotation seq = seq_dict[seq_id] label = '%s%i' % (cluster_prefix, cluster) header = parseAnnotation(seq.description, delimiter=delimiter) header = mergeAnnotation(header, {cluster_field: label}, delimiter=delimiter) seq.id = seq.name = flattenAnnotation(header, delimiter=delimiter) seq.description = '' # Update log and results result.log['CLUST%i-%i' % (cluster, i)] = str(seq.seq) results.append(seq) # Check results result.results = results result.valid = (len(results) == len(seq_dict)) # Feed results to result queue result_queue.put(result) else: sys.stderr.write('PID %s> Error in sibling process detected. Cleaning up.\n' \ % os.getpid()) return None except: alive.value = False printError('Error processing sequence set with ID: %s.' % data.id, exit=False) raise return None
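The loop above is the sentinel-terminated consumer used throughout this codebase: poll the queue while the shared alive flag holds (so a failure in a sibling process can abort the loop), and exit cleanly on a None sentinel. A minimal, self-contained sketch of that pattern with a stand-in computation:

import multiprocessing as mp

def consumer(alive, data_queue, result_queue):
    while alive.value:
        if data_queue.empty():
            continue                  # busy-wait so the alive flag stays responsive
        data = data_queue.get()
        if data is None:              # sentinel marks the end of the work
            break
        result_queue.put(data * 2)    # stand-in for the real clustering step

if __name__ == '__main__':
    alive = mp.Value('b', True)
    data_queue, result_queue = mp.Queue(), mp.Queue()
    worker = mp.Process(target=consumer, args=(alive, data_queue, result_queue))
    worker.start()
    for x in (1, 2, 3):
        data_queue.put(x)
    data_queue.put(None)              # sentinel
    worker.join()
    for _ in range(3):
        print(result_queue.get())     # 2, 4, 6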
def processCSQueue(alive, data_queue, result_queue, cluster_field, cluster_args={}, delimiter=default_delimiter): """ Pulls from data queue, performs calculations, and feeds results queue Arguments: alive = a multiprocessing.Value boolean controlling whether processing continues; when False function returns data_queue = a multiprocessing.Queue holding data to process result_queue = a multiprocessing.Queue to hold processed results cluster_field = the sequence annotation field to contain the cluster identifiers cluster_args = a dictionary of optional arguments for the clustering function delimiter = a tuple of delimiters for (annotations, field/values, value lists) Returns: None """ try: # Iterator over data queue until sentinel object reached while alive.value: # Get data from queue if data_queue.empty(): continue else: data = data_queue.get() # Exit upon reaching sentinel if data is None: break # Define result object result = SeqResult(data.id, data.data) result.log['BARCODE'] = data.id result.log['SEQCOUNT'] = len(data) # Perform clustering cluster_dict = runUClust(data.data, **cluster_args) # Process failed result if cluster_dict is None: # Update log result.log['CLUSTERS'] = 0 for i, seq in enumerate(data.data, start=1): result.log['CLUST0-%i' % i] = str(seq.seq) # Feed results queue and continue result_queue.put(result) continue # Get number of clusters result.log['CLUSTERS'] = len(cluster_dict) # Update sequence annotations with cluster assignments results = list() seq_dict = {s.id: s for s in data.data} for clust, id_list in cluster_dict.items(): for i, seq_id in enumerate(id_list, start=1): # Add cluster annotation seq = seq_dict[seq_id] header = parseAnnotation(seq.description, delimiter=delimiter) header = mergeAnnotation(header, {cluster_field: clust}, delimiter=delimiter) seq.id = seq.name = flattenAnnotation(header, delimiter=delimiter) seq.description = '' # Update log and results result.log['CLUST%i-%i' % (clust, i)] = str(seq.seq) results.append(seq) # Check results result.results = results result.valid = (len(results) == len(seq_dict)) # Feed results to result queue result_queue.put(result) else: sys.stderr.write('PID %s> Error in sibling process detected. Cleaning up.\n' % os.getpid()) return None except: alive.value = False sys.stderr.write('Error processing sequence set with ID: %s.\n' % data.id) raise return None
def writeDb(db, fields, file_prefix, total_count, id_dict=None, no_parse=True, partial=False, out_args=default_out_args): """ Writes tab-delimited database file in output directory. Arguments: db : an iterator of IgRecord objects containing alignment data. fields : a list of ordered field names to write. file_prefix : directory and prefix for CLIP tab-delim file. total_count : number of records (for progress bar). id_dict : a dictionary of the truncated sequence ID mapped to the full sequence ID. no_parse : if True use the sequence ID as-is; if False parse the ID for pRESTO annotations with default delimiters. partial : if True put incomplete alignments in the pass file. out_args : common output argument dictionary from parseCommonArgs. Returns: None """ # Function to check for valid records strictly def _pass_strict(rec): valid = [rec.v_call and rec.v_call != 'None', rec.j_call and rec.j_call != 'None', rec.functional is not None, rec.seq_vdj, rec.junction] return all(valid) # Function to check for valid records loosely def _pass_gentle(rec): valid = [rec.v_call and rec.v_call != 'None', rec.d_call and rec.d_call != 'None', rec.j_call and rec.j_call != 'None'] return any(valid) # Set pass criteria _pass = _pass_gentle if partial else _pass_strict # Define output file names pass_file = '%s_db-pass.tab' % file_prefix fail_file = '%s_db-fail.tab' % file_prefix # Initialize handles, writers and counters pass_handle = None fail_handle = None pass_writer = None fail_writer = None start_time = time() rec_count = pass_count = fail_count = 0 # Validate and write output printProgress(0, total_count, 0.05, start_time) for i, record in enumerate(db, start=1): # Replace sequence description with full string, if required if id_dict is not None and record.id in id_dict: record.id = id_dict[record.id] # Parse sequence description into new columns if not no_parse: try: record.annotations = parseAnnotation(record.id, delimiter=out_args['delimiter']) record.id = record.annotations['ID'] del record.annotations['ID'] # TODO: This is not the best approach; should pass in output fields. # If first record, use parsed description to define extra columns if i == 1: fields.extend(list(record.annotations.keys())) except IndexError: # Could not parse pRESTO-style annotations so fall back to no parse no_parse = True sys.stderr.write('\nWARNING: Sequence annotation format not recognized. Sequence headers will not be parsed.\n') # Count pass or fail and write to appropriate file if _pass(record): # Open pass file if pass_writer is None: pass_handle = open(pass_file, 'wt') pass_writer = getDbWriter(pass_handle, add_fields=fields) # Write row to pass file pass_count += 1 pass_writer.writerow(record.toDict()) else: # Open failed file if out_args['failed'] and fail_writer is None: fail_handle = open(fail_file, 'wt') fail_writer = getDbWriter(fail_handle, add_fields=fields) # Write row to fail file if specified fail_count += 1 if fail_writer is not None: fail_writer.writerow(record.toDict()) # Print progress printProgress(i, total_count, 0.05, start_time) # Print console log log = OrderedDict() log['OUTPUT'] = pass_file log['PASS'] = pass_count log['FAIL'] = fail_count log['END'] = 'MakeDb' printLog(log) if pass_handle is not None: pass_handle.close() if fail_handle is not None: fail_handle.close()
def pairSeq(seq_file_1, seq_file_2, fields_1=None, fields_2=None, coord_type=default_coord_type, out_args=default_out_args): """ Synchronizes paired-end files and copies annotations between them Arguments: seq_file_1 = the file containing the grouped sequences and annotations seq_file_2 = the file to assign annotations to from seq_file_1 fields_1 = list of annotations in seq_file_1 records to copy to seq_file_2 records; if None do not copy any annotations fields_2 = list of annotations in seq_file_2 records to copy to seq_file_1 records; if None do not copy any annotations coord_type = the sequence header format out_args = common output argument dictionary from parseCommonArgs Returns: a list of tuples holding successfully paired filenames for (seq_file_1, seq_file_2) """ # Define private functions def _key_func(x): return getCoordKey(x, coord_type=coord_type, delimiter=out_args['delimiter']) log = OrderedDict() log['START'] = 'PairSeq' log['FILE1'] = os.path.basename(seq_file_1) log['FILE2'] = os.path.basename(seq_file_2) log['FIELDS_1'] = ','.join(fields_1) if fields_1 is not None else None log['FIELDS_2'] = ','.join(fields_2) if fields_2 is not None else None log['COORD_TYPE'] = coord_type printLog(log) # Define output type if out_args['out_type'] is None: out_type_1 = getFileType(seq_file_1) out_type_2 = getFileType(seq_file_2) else: out_type_1 = out_type_2 = out_args['out_type'] # Define output name if out_args['out_name'] is None: out_name_1 = out_name_2 = None else: out_name_1 = '%s-1' % out_args['out_name'] out_name_2 = '%s-2' % out_args['out_name'] # Open and count files start_time = time() printMessage("Indexing files", start_time=start_time) # Index file 1 seq_count_1 = countSeqFile(seq_file_1) seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func) # Define file 2 iterator seq_count_2 = countSeqFile(seq_file_2) seq_iter_2 = readSeqFile(seq_file_2, index=False) printMessage("Done", start_time=start_time, end=True) # Open output file handles pass_handle_1 = getOutputHandle(seq_file_1, 'pair-pass', out_args['out_dir'], out_name=out_name_1, out_type=out_type_1) pass_handle_2 = getOutputHandle(seq_file_2, 'pair-pass', out_args['out_dir'], out_name=out_name_2, out_type=out_type_2) if out_args['failed']: fail_handle_1 = getOutputHandle(seq_file_1, 'pair-fail', out_dir=out_args['out_dir'], out_name=out_name_1, out_type=out_type_1) fail_handle_2 = getOutputHandle(seq_file_2, 'pair-fail', out_dir=out_args['out_dir'], out_name=out_name_2, out_type=out_type_2) pass_keys = list() # Iterate over pairs and write to output files start_time = time() rec_count = pair_count = 0 for seq_2 in seq_iter_2: # Print progress for previous iteration printProgress(rec_count, seq_count_2, 0.05, start_time) rec_count += 1 # Check for file 2 mate pair in file 1 coord_2 = getCoordKey(seq_2.id, coord_type=coord_type, delimiter=out_args['delimiter']) seq_1 = seq_dict_1.get(coord_2, None) if seq_1 is not None: # Record paired keys pair_count += 1 if fields_1 is not None or fields_2 is not None: ann_1 = parseAnnotation(seq_1.description, delimiter=out_args['delimiter']) ann_2 = parseAnnotation(seq_2.description, delimiter=out_args['delimiter']) # Prepend annotations from seq_1 to seq_2 if fields_1 is not None: copy_ann = OrderedDict([(k, v) for k, v in ann_1.items() if k in fields_1]) merge_ann = mergeAnnotation(ann_2, copy_ann, prepend=True, delimiter=out_args['delimiter']) seq_2.id = flattenAnnotation(merge_ann, delimiter=out_args['delimiter']) seq_2.description = '' # Append annotations from seq_2 to seq_1 if fields_2 is not None:
copy_ann = OrderedDict([(k, v) for k, v in ann_2.items() if k in fields_2]) merge_ann = mergeAnnotation(ann_1, copy_ann, prepend=False, delimiter=out_args['delimiter']) seq_1.id = flattenAnnotation(merge_ann, delimiter=out_args['delimiter']) seq_1.description = '' # Write paired records SeqIO.write(seq_1, pass_handle_1, out_type_1) SeqIO.write(seq_2, pass_handle_2, out_type_2) # Write unpaired file 2 records and update the paired key list for finding unpaired file 1 records if out_args['failed']: if seq_1 is not None: pass_keys.append(coord_2) else: SeqIO.write(seq_2, fail_handle_2, out_type_2) # Print final progress printProgress(rec_count, seq_count_2, 0.05, start_time) # Find and write unpaired file 1 records if out_args['failed']: start_time = time() printMessage("Finding unpaired", start_time=start_time) # Find file 1 unpaired keys pass_keys = set(pass_keys) unpaired = set(seq_dict_1).difference(pass_keys) # Write unpaired file 1 records for k in unpaired: SeqIO.write(seq_dict_1[k], fail_handle_1, out_type_1) printMessage("Done", start_time=start_time, end=True) # Print log log = OrderedDict() log['OUTPUT1'] = os.path.basename(pass_handle_1.name) log['OUTPUT2'] = os.path.basename(pass_handle_2.name) log['SEQUENCES1'] = seq_count_1 log['SEQUENCES2'] = seq_count_2 log['PASS'] = pair_count log['END'] = 'PairSeq' printLog(log) # Close file handles pass_handle_1.close() pass_handle_2.close() return [(pass_handle_1.name, pass_handle_2.name)]
def tableHeaders(seq_file, fields, out_file=None, out_args=default_out_args): """ Builds a table of sequence header annotations Arguments: seq_file : the sequence file name. fields : the list of fields to output. out_file : output file name. Automatically generated from the input file if None. out_args : common output argument dictionary from parseCommonArgs. Returns: str: output table file name """ log = OrderedDict() log['START'] = 'ParseHeaders' log['COMMAND'] = 'table' log['FILE'] = os.path.basename(seq_file) printLog(log) # Open file handles seq_iter = readSeqFile(seq_file) if out_file is not None: out_handle = open(out_file, 'w') else: out_handle = getOutputHandle(seq_file, 'headers', out_dir=out_args['out_dir'], out_name=out_args['out_name'], out_type='tab') # Count records result_count = countSeqFile(seq_file) # Open csv writer and write header out_writer = csv.DictWriter(out_handle, extrasaction='ignore', restval='', delimiter='\t', fieldnames=fields) out_writer.writeheader() # Iterate over sequences start_time = time() seq_count = pass_count = fail_count = 0 for seq in seq_iter: # Print progress for previous iteration printProgress(seq_count, result_count, 0.05, start_time=start_time) # Get annotations seq_count += 1 ann = parseAnnotation(seq.description, fields, delimiter=out_args['delimiter']) # Write records if ann: pass_count += 1 out_writer.writerow(ann) else: fail_count += 1 # Print counts printProgress(seq_count, result_count, 0.05, start_time=start_time) log = OrderedDict() log['OUTPUT'] = os.path.basename(out_handle.name) log['SEQUENCES'] = seq_count log['PASS'] = pass_count log['FAIL'] = fail_count log['END'] = 'ParseHeaders' printLog(log) # Close file handles out_handle.close() return out_handle.name
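The csv.DictWriter options above do the real work: extrasaction='ignore' silently drops annotations that were not requested, and restval='' fills requested fields that a record lacks. A small illustration with made-up values:

import csv
import sys

rows = [{'ID': 'read1', 'BARCODE': 'ACGT', 'PRIMER': 'VP1'},
        {'ID': 'read2', 'BARCODE': 'TTGA'}]                    # PRIMER missing
writer = csv.DictWriter(sys.stdout, fieldnames=['ID', 'PRIMER'],
                        extrasaction='ignore', restval='', delimiter='\t')
writer.writeheader()
for row in rows:
    writer.writerow(row)   # BARCODE is dropped; the missing PRIMER becomes ''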
def _barcode(seq, field=barcode_field, delimiter=out_args['delimiter']): header = parseAnnotation(seq.description, delimiter=delimiter) return header[field]
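For reference, a toy version of the header format _barcode reads. This assumes the default pRESTO delimiters of ('|', '=', ',') for fields, values, and value lists; parse_header is an illustrative stand-in for parseAnnotation, not the real function.

def parse_header(description, delimiters=('|', '=', ',')):
    # Split 'ID|FIELD1=VALUE1|FIELD2=VALUE2' into an annotation dictionary
    fields, values, _ = delimiters
    parts = description.split(fields)
    ann = {'ID': parts[0]}
    for part in parts[1:]:
        key, _, value = part.partition(values)
        ann[key] = value
    return ann

print(parse_header('M00123:18:1101|BARCODE=ACGTACGT|CONSCOUNT=12'))
# {'ID': 'M00123:18:1101', 'BARCODE': 'ACGTACGT', 'CONSCOUNT': '12'}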
def assemblyWorker(data, assemble_func, assemble_args={}, rc='tail', fields_1=None, fields_2=None, delimiter=default_delimiter): """ Performs assembly of a sequence pair Arguments: data : a SeqData object with a list of exactly two SeqRecords. assemble_func : the function to use to assemble paired ends. assemble_args : a dictionary of arguments to pass to the assembly function. rc : defines which sequences ('head', 'tail', 'both', 'none') to reverse complement before assembly; if 'none' do not reverse complement either sequence. fields_1 : list of annotations in the head SeqRecord to copy to the assembled record; if None do not copy any annotations. fields_2 : list of annotations in the tail SeqRecord to copy to the assembled record; if None do not copy any annotations. delimiter : a tuple of delimiters for (fields, values, value lists). Returns: SeqResult: a SeqResult object """ # Define result object result = SeqResult(data.id, data.data) # Reverse complement sequences if required head_seq = data.data[0] if rc not in ('head', 'both') else reverseComplement(data.data[0]) tail_seq = data.data[1] if rc not in ('tail', 'both') else reverseComplement(data.data[1]) # Define stitched sequence annotation stitch_ann = OrderedDict([('ID', data.id)]) if fields_1 is not None: head_ann = parseAnnotation(head_seq.description, fields_1, delimiter=delimiter) stitch_ann = mergeAnnotation(stitch_ann, head_ann, delimiter=delimiter) result.log['FIELDS1'] = '|'.join(['%s=%s' % (k, v) for k, v in head_ann.items()]) if fields_2 is not None: tail_ann = parseAnnotation(tail_seq.description, fields_2, delimiter=delimiter) stitch_ann = mergeAnnotation(stitch_ann, tail_ann, delimiter=delimiter) result.log['FIELDS2'] = '|'.join(['%s=%s' % (k, v) for k, v in tail_ann.items()]) # Assemble sequences stitch = assemble_func(head_seq, tail_seq, **assemble_args) ab = stitch.head_pos xy = stitch.tail_pos result.valid = stitch.valid # Add reference to log if stitch.ref_seq is not None and stitch.ref_pos is not None: result.log['REFID'] = stitch.ref_seq.id result.log['REFSEQ'] = ' ' * stitch.ref_pos[0] + stitch.ref_seq.seq if ab is not None and xy is not None: result.log['SEQ1'] = ' ' * xy[0] + head_seq.seq result.log['SEQ2'] = ' ' * ab[0] + tail_seq.seq else: result.log['SEQ1'] = head_seq.seq result.log['SEQ2'] = ' ' * (len(head_seq) + (stitch.gap or 0)) + tail_seq.seq # Define stitching log if stitch.seq is not None: # Update stitch annotation stitch.seq.id = flattenAnnotation(stitch_ann, delimiter=delimiter) stitch.seq.name = stitch.seq.id stitch.seq.description = '' result.results = stitch.seq # Add assembly to log result.log['ASSEMBLY'] = stitch.seq.seq if 'phred_quality' in stitch.seq.letter_annotations: result.log['QUALITY'] = ''.join([chr(q + 33) for q in stitch.seq.letter_annotations['phred_quality']]) result.log['LENGTH'] = len(stitch) result.log['OVERLAP'] = stitch.overlap else: result.log['ASSEMBLY'] = None # Add mode-specific log results if stitch.gap is not None: result.log['GAP'] = stitch.gap if stitch.error is not None: result.log['ERROR'] = '%.4f' % stitch.error if stitch.pvalue is not None: result.log['PVALUE'] = '%.4e' % stitch.pvalue if stitch.evalue is not None: result.log['EVALUE1'] = '%.4e' % stitch.evalue[0] result.log['EVALUE2'] = '%.4e' % stitch.evalue[1] if stitch.ident is not None: result.log['IDENTITY'] = '%.4f' % stitch.ident return result
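The QUALITY log entry above re-encodes integer Phred scores as FASTQ-style ASCII using the standard +33 (Sanger) offset, so a score of 0 prints as '!' and 40 as 'I'. For example:

scores = [40, 38, 30, 2, 0]
print(''.join(chr(q + 33) for q in scores))   # 'IG?#!'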
def processQueue(alive, data_queue, result_queue, align_func, align_args={}, calc_div=False, delimiter=default_delimiter): """ Pulls from data queue, performs calculations, and feeds results queue Arguments: alive : a multiprocessing.Value boolean controlling whether processing continues; when False function returns data_queue : a multiprocessing.Queue holding data to process result_queue : a multiprocessing.Queue to hold processed results align_func : the function to use for alignment align_args : a dictionary of optional arguments for the alignment function calc_div : if True perform diversity calculation delimiter : a tuple of delimiters for (annotations, field/values, value lists) Returns: None """ try: # Iterator over data queue until sentinel object reached while alive.value: # Get data from queue if data_queue.empty(): continue else: data = data_queue.get() # Exit upon reaching sentinel if data is None: break # Define result object result = SeqResult(data.id, data.data) result.log['BARCODE'] = data.id result.log['SEQCOUNT'] = len(data) # Perform alignment seq_list = data.data align_list = align_func(seq_list, **align_args) # Process alignment if align_list is not None: # Calculate diversity if calc_div: diversity = calculateDiversity(align_list) result.log['DIVERSITY'] = diversity # Restore quality scores has_quality = hasattr(seq_list[0], 'letter_annotations') and \ 'phred_quality' in seq_list[0].letter_annotations if has_quality: qual_dict = {seq.id:seq.letter_annotations['phred_quality'] \ for seq in seq_list} for seq in align_list: qual = deque(qual_dict[seq.id]) qual_new = [0 if c == '-' else qual.popleft() for c in seq.seq] seq.letter_annotations['phred_quality'] = qual_new # Add alignment to log if 'field' in align_args: for i, seq in enumerate(align_list): ann = parseAnnotation(seq.description, delimiter=delimiter) primer = ann[align_args['field']] result.log['ALIGN%i:%s' % (i + 1, primer)] = seq.seq else: for i, seq in enumerate(align_list): result.log['ALIGN%i' % (i + 1)] = seq.seq # Add alignment to results result.results = align_list result.valid = True # Feed results to result queue result_queue.put(result) else: sys.stderr.write('PID %s> Error in sibling process detected. Cleaning up.\n' \ % os.getpid()) return None except: alive.value = False printError('Processing sequence set with ID: %s.' % data.id, exit=False) raise return None
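The quality-restoration step above deserves a closer look: after alignment inserts gap characters, the original Phred scores are re-assigned in order to the non-gap positions, with gaps given a quality of 0. A self-contained example of that logic, using made-up values:

from collections import deque

original_qual = [30, 31, 32, 33]   # scores for the ungapped read 'ACGT'
aligned_seq = 'AC-GT'              # the same read after a gap is inserted

qual = deque(original_qual)
qual_new = [0 if c == '-' else qual.popleft() for c in aligned_seq]
print(qual_new)   # [30, 31, 0, 32, 33]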
def pairSeq(seq_file_1, seq_file_2, fields_1=None, fields_2=None, action=None, coord_type=default_coord, out_args=default_out_args): """ Synchronizes paired-end files and copies annotations between them Arguments: seq_file_1 : the file containing the grouped sequences and annotations. seq_file_2 : the file to assign annotations to from seq_file_1. fields_1 : list of annotations in seq_file_1 records to copy to seq_file_2 records; if None do not copy any annotations. fields_2 : list of annotations in seq_file_2 records to copy to seq_file_1 records; if None do not copy any annotations. action : the collapse action to take on all copied annotations if they already exist in the target header. coord_type : the sequence header format. out_args : common output argument dictionary from parseCommonArgs. Returns: list: a list of tuples holding successfully paired filenames for (seq_file_1, seq_file_2). """ # Define private functions def _key_func(x): return getCoordKey(x, coord_type=coord_type, delimiter=out_args['delimiter']) log = OrderedDict() log['START'] = 'PairSeq' log['FILE1'] = os.path.basename(seq_file_1) log['FILE2'] = os.path.basename(seq_file_2) log['FIELDS_1'] = ','.join(fields_1) if fields_1 is not None else None log['FIELDS_2'] = ','.join(fields_2) if fields_2 is not None else None log['COORD_TYPE'] = coord_type printLog(log) # Define output type if out_args['out_type'] is None: out_type_1 = getFileType(seq_file_1) out_type_2 = getFileType(seq_file_2) else: out_type_1 = out_type_2 = out_args['out_type'] # Define output name if out_args['out_name'] is None: out_name_1 = out_name_2 = None else: out_name_1 = '%s-1' % out_args['out_name'] out_name_2 = '%s-2' % out_args['out_name'] # Open and count files start_time = time() printMessage("Indexing files", start_time=start_time) # Index file 1 seq_count_1 = countSeqFile(seq_file_1) seq_dict_1 = readSeqFile(seq_file_1, index=True, key_func=_key_func) # Define file 2 iterator seq_count_2 = countSeqFile(seq_file_2) seq_iter_2 = readSeqFile(seq_file_2, index=False) printMessage("Done", start_time=start_time, end=True) # Open output file handles pass_handle_1 = getOutputHandle(seq_file_1, 'pair-pass', out_args['out_dir'], out_name=out_name_1, out_type=out_type_1) pass_handle_2 = getOutputHandle(seq_file_2, 'pair-pass', out_args['out_dir'], out_name=out_name_2, out_type=out_type_2) if out_args['failed']: fail_handle_1 = getOutputHandle(seq_file_1, 'pair-fail', out_dir=out_args['out_dir'], out_name=out_name_1, out_type=out_type_1) fail_handle_2 = getOutputHandle(seq_file_2, 'pair-fail', out_dir=out_args['out_dir'], out_name=out_name_2, out_type=out_type_2) pass_keys = list() # Iterate over pairs and write to output files start_time = time() rec_count = pair_count = 0 for seq_2 in seq_iter_2: # Print progress for previous iteration printProgress(rec_count, seq_count_2, 0.05, start_time=start_time) rec_count += 1 # Check for file 2 mate pair in file 1 coord_2 = getCoordKey(seq_2.id, coord_type=coord_type, delimiter=out_args['delimiter']) seq_1 = seq_dict_1.get(coord_2, None) if seq_1 is not None: # Record paired keys pair_count += 1 if fields_1 is not None or fields_2 is not None: ann_1 = parseAnnotation(seq_1.description, delimiter=out_args['delimiter']) ann_2 = parseAnnotation(seq_2.description, delimiter=out_args['delimiter']) # Prepend annotations from seq_1 to seq_2 if fields_1 is not None: copy_ann = OrderedDict([(k, v) for k, v in ann_1.items() if k in fields_1]) merge_ann = mergeAnnotation(ann_2, copy_ann, prepend=True,
delimiter=out_args['delimiter']) # Collapse if necessary if action is not None: merge_ann = collapseAnnotation(merge_ann, action, fields=fields_1, delimiter=out_args['delimiter']) # Flatten seq_2.id = flattenAnnotation(merge_ann, delimiter=out_args['delimiter']) seq_2.description = '' # Append annotations from seq_2 to seq_1 if fields_2 is not None: copy_ann = OrderedDict([(k, v) for k, v in ann_2.items() if k in fields_2]) merge_ann = mergeAnnotation(ann_1, copy_ann, prepend=False, delimiter=out_args['delimiter']) # Collapse if necessary if action is not None: merge_ann = collapseAnnotation(merge_ann, action, fields=fields_2, delimiter=out_args['delimiter']) # Flatten seq_1.id = flattenAnnotation(merge_ann, delimiter=out_args['delimiter']) seq_1.description = '' # Write paired records SeqIO.write(seq_1, pass_handle_1, out_type_1) SeqIO.write(seq_2, pass_handle_2, out_type_2) # Write unpaired file 2 records and update the paired key list for finding unpaired file 1 records if out_args['failed']: if seq_1 is not None: pass_keys.append(coord_2) else: SeqIO.write(seq_2, fail_handle_2, out_type_2) # Print final progress printProgress(rec_count, seq_count_2, 0.05, start_time=start_time) # Find and write unpaired file 1 records if out_args['failed']: start_time = time() printMessage("Finding unpaired", start_time=start_time) # Find file 1 unpaired keys pass_keys = set(pass_keys) unpaired = set(seq_dict_1).difference(pass_keys) # Write unpaired file 1 records for k in unpaired: SeqIO.write(seq_dict_1[k], fail_handle_1, out_type_1) printMessage("Done", start_time=start_time, end=True) # Print log log = OrderedDict() log['OUTPUT1'] = os.path.basename(pass_handle_1.name) log['OUTPUT2'] = os.path.basename(pass_handle_2.name) log['SEQUENCES1'] = seq_count_1 log['SEQUENCES2'] = seq_count_2 log['PASS'] = pair_count log['END'] = 'PairSeq' printLog(log) # Close file handles pass_handle_1.close() pass_handle_2.close() return [(pass_handle_1.name, pass_handle_2.name)]
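Both pairSeq variants use the same strategy: index one file by a coordinate key, stream the other, and look mates up by key, recovering unpaired file 1 records at the end by set difference. A self-contained sketch of that strategy, where Read and coord_key are illustrative stand-ins for SeqRecord and getCoordKey:

from collections import namedtuple

Read = namedtuple('Read', ['id', 'seq'])   # minimal stand-in for SeqRecord

def coord_key(seq_id):
    # Illumina-style IDs share their coordinate block before the first space
    return seq_id.split(' ')[0]

def pair(records_1, records_2):
    index_1 = {coord_key(r.id): r for r in records_1}
    pairs, unpaired_2 = [], []
    paired_keys = set()
    for rec_2 in records_2:
        mate = index_1.get(coord_key(rec_2.id))
        if mate is not None:
            pairs.append((mate, rec_2))
            paired_keys.add(coord_key(rec_2.id))
        else:
            unpaired_2.append(rec_2)
    # As in pairSeq, unpaired file 1 records are found by set difference
    unpaired_1 = [index_1[k] for k in set(index_1) - paired_keys]
    return pairs, unpaired_1, unpaired_2

reads_1 = [Read('A:1 1:N:0', 'ACGT'), Read('A:2 1:N:0', 'GGTT')]
reads_2 = [Read('A:2 2:N:0', 'AACC')]
pairs, un_1, un_2 = pair(reads_1, reads_2)
print(len(pairs), len(un_1), len(un_2))   # 1 1 0

Indexing only one file keeps memory proportional to a single input while the other is streamed, which is why pairSeq indexes seq_file_1 and iterates over seq_file_2.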