def action(arguments):
    """Trim the alignment to the region bounded by the two primers."""
    # Resolve input/output formats, sniffing the handles when not given.
    in_fmt = (arguments.source_format
              or fileformat.from_handle(arguments.source_file))
    out_fmt = (arguments.output_format
               or fileformat.from_handle(arguments.output_file))

    with arguments.source_file:
        records = SeqIO.parse(
            arguments.source_file, in_fmt,
            alphabet=Alphabet.Gapped(Alphabet.single_letter_alphabet))

        # Find the span of each primer within the alignment.
        fwd, rev = locate_primers(
            records, arguments.forward_primer, arguments.reverse_primer,
            arguments.reverse_complement, arguments.max_hamming_distance)
        forward_start, forward_end = fwd
        reverse_start, reverse_end = rev

        # Compute the slice bounds, optionally keeping the primers themselves.
        if arguments.include_primers:
            start, end = forward_start, reverse_end + 1
        else:
            start, end = forward_end + 1, reverse_start

        # The primer search consumed the iterator; rewind and parse again.
        arguments.source_file.seek(0)
        records = SeqIO.parse(
            arguments.source_file, in_fmt,
            alphabet=Alphabet.Gapped(Alphabet.single_letter_alphabet))

        # Apply the selected prune action and write the result.
        trimmed = _ACTIONS[arguments.prune_action](records, start, end)
        with arguments.output_file:
            SeqIO.write(trimmed, arguments.output_file, out_fmt)
def action(arguments):
    """Trim the alignment as specified by the parsed arguments."""
    src_fmt = (arguments.source_format
               or fileformat.from_handle(arguments.source_file))
    dst_fmt = (arguments.output_format
               or fileformat.from_handle(arguments.output_file))

    with arguments.source_file:
        seqs = SeqIO.parse(
            arguments.source_file, src_fmt,
            alphabet=Alphabet.Gapped(Alphabet.single_letter_alphabet))

        # Locate both primers; each span is an (start, end) pair.
        fwd_span, rev_span = locate_primers(
            seqs, arguments.forward_primer, arguments.reverse_primer,
            arguments.reverse_complement, arguments.max_hamming_distance)

        # Slice bounds depend on whether the primers themselves are kept.
        if arguments.include_primers:
            begin = fwd_span[0]
            stop = rev_span[1] + 1
        else:
            begin = fwd_span[1] + 1
            stop = rev_span[0]

        # Re-read from the top: locating the primers exhausted the iterator.
        arguments.source_file.seek(0)
        seqs = SeqIO.parse(
            arguments.source_file, src_fmt,
            alphabet=Alphabet.Gapped(Alphabet.single_letter_alphabet))

        pruner = _ACTIONS[arguments.prune_action]
        with arguments.output_file:
            SeqIO.write(pruner(seqs, begin, stop),
                        arguments.output_file, dst_fmt)
def action(arguments):
    """Run this subcommand: map the protein alignment onto the nucleotide
    sequences and write the combined result to the output file."""
    # Exit quietly on SIGPIPE so piping into `head` works.
    common.exit_on_sigpipe()
    logging.basicConfig()

    proteins = SeqIO.parse(
        arguments.protein_align,
        fileformat.from_handle(arguments.protein_align))
    nucleotides = SeqIO.parse(
        arguments.nucl_align,
        fileformat.from_handle(arguments.nucl_align))

    table = TRANSLATION_TABLES[arguments.translation_table]
    mapper = AlignmentMapper(table, arguments.fail_action)

    out_fmt = fileformat.from_filename(arguments.out_file.name)
    SeqIO.write(mapper.map_all(proteins, nucleotides),
                arguments.out_file, out_fmt)
def action(arguments):
    """Run the alignment-mapping subcommand for the parsed arguments."""
    # Ignore SIGPIPE, for head support
    common.exit_on_sigpipe()
    logging.basicConfig()

    protein_records = SeqIO.parse(
        arguments.protein_align,
        fileformat.from_handle(arguments.protein_align))
    nucleotide_records = SeqIO.parse(
        arguments.nucl_align,
        fileformat.from_handle(arguments.nucl_align))

    instance = AlignmentMapper(
        TRANSLATION_TABLES[arguments.translation_table],
        arguments.fail_action)

    # Output format is inferred from the destination file's name.
    SeqIO.write(
        instance.map_all(protein_records, nucleotide_records),
        arguments.out_file,
        fileformat.from_filename(arguments.out_file.name))
def action(arguments):
    """Write one line per input sequence: its id, or its full description
    when --include-description was given."""
    # Exit quietly on SIGPIPE so piping into `head` works.
    common.exit_on_sigpipe()

    # Determine file format for input
    source_format = (arguments.input_format
                     or fileformat.from_handle(arguments.sequence_file))

    with arguments.sequence_file:
        sequences = SeqIO.parse(arguments.sequence_file, source_format)
        if arguments.include_description:
            ids = (sequence.description for sequence in sequences)
        else:
            ids = (sequence.id for sequence in sequences)

        # The generator above reads lazily from sequence_file, so the output
        # loop must run while the input handle is still open.
        with arguments.output_file:
            for i in ids:
                # Fix: `print >> handle, i` is Python 2 syntax and a
                # SyntaxError under Python 3; use the print function instead.
                print(i, file=arguments.output_file)
def action(arguments):
    """Print one identifier (or full description) per input sequence."""
    common.exit_on_sigpipe()

    # Determine file format for input and output
    fmt = (arguments.input_format
           or fileformat.from_handle(arguments.sequence_file))

    with arguments.sequence_file:
        records = SeqIO.parse(arguments.sequence_file, fmt)
        # Choose which record attribute to emit.
        attr = 'description' if arguments.include_description else 'id'
        labels = (getattr(record, attr) for record in records)

        with arguments.output_file:
            for label in labels:
                print(label, file=arguments.output_file)
def summarize_sequence_file(source_file, file_type=None):
    """
    Summarizes a sequence file, returning a tuple containing the name,
    whether the file is an alignment, minimum sequence length, maximum
    sequence length, average length, number of sequences.
    """
    aligned = True
    mean_len = None
    shortest = sys.maxsize
    longest = 0
    count = 0

    # Stream through the records, keeping running statistics only.
    with common.FileType('rt')(source_file) as fp:
        if not file_type:
            file_type = fileformat.from_handle(fp)
        for record in SeqIO.parse(fp, file_type):
            count += 1
            n = len(record)

            # One differing length disqualifies the file as an alignment.
            if longest != 0 and n != longest:
                aligned = False

            shortest = min(shortest, n)
            longest = max(longest, n)

            # Incremental running mean; same update formula as before so the
            # floating-point result is bit-identical.
            if count == 1:
                mean_len = float(n)
            else:
                mean_len = mean_len + ((n - mean_len) / count)

    # No records at all: report zeros across the board.
    if mean_len is None:
        shortest = longest = mean_len = 0

    # Fewer than two sequences never count as an alignment.
    if count <= 1:
        aligned = False

    return (source_file, str(aligned).upper(), shortest, longest,
            mean_len, count)
def summarize_sequence_file(source_file, file_type=None):
    """
    Summarizes a sequence file, returning a tuple containing the name,
    whether the file is an alignment, minimum sequence length, maximum
    sequence length, average length, number of sequences.
    """
    is_alignment = True
    avg_length = None
    # Fix: sys.maxint was removed in Python 3; sys.maxsize is the portable
    # "larger than any real length" sentinel.
    min_length = sys.maxsize
    max_length = 0
    sequence_count = 0

    # Fix: open in text mode ('rt', not 'rb') -- SeqIO parsing and format
    # sniffing here operate on str, not bytes.
    with common.FileType('rt')(source_file) as fp:
        if not file_type:
            file_type = fileformat.from_handle(fp)
        for record in SeqIO.parse(fp, file_type):
            sequence_count += 1
            sequence_length = len(record)
            if max_length != 0:
                # If even one sequence is not the same length as the others,
                # we don't consider this an alignment.
                if sequence_length != max_length:
                    is_alignment = False

            # Lengths
            if sequence_length > max_length:
                max_length = sequence_length
            if sequence_length < min_length:
                min_length = sequence_length

            # Running average, updated incrementally to avoid a second pass.
            if sequence_count == 1:
                avg_length = float(sequence_length)
            else:
                avg_length = avg_length + (
                    (sequence_length - avg_length) / sequence_count)

    # Handle an empty file:
    if avg_length is None:
        min_length = max_length = avg_length = 0

    # A single sequence (or none) is never reported as an alignment.
    if sequence_count <= 1:
        is_alignment = False

    return (source_file, str(is_alignment).upper(), min_length, max_length,
            avg_length, sequence_count)
def transform_file(source_file, destination_file, arguments):
    """Stream records from source_file through the transformations requested
    in ``arguments`` and write them to destination_file.

    ``arguments`` is presumably an argparse Namespace -- TODO confirm.
    Records are chained through generators, so the whole file is never held
    in memory at once.
    """
    # Get just the file name, useful for naming the temporary file.
    # Fall back to sniffing each handle when no explicit format was given.
    source_file_type = (arguments.input_format or from_handle(source_file))
    destination_file_type = (arguments.output_format or
                             from_handle(destination_file))

    # Get an iterator.
    # Map sort keys to the project's sorting helpers.
    sorters = {'length': transform.sort_length,
               'name': transform.sort_name,}
    # Direction flag handed to the sorters: 1 for 'asc', 0 for 'desc'.
    directions = {'asc': 1, 'desc': 0}
    if arguments.sort:
        # Sorted iterator; arguments.sort has the form "<key>-<direction>".
        key, direction = arguments.sort.split('-')
        records = sorters[key](source_file=source_file,
                               source_file_type=source_file_type,
                               direction=directions[direction])
    else:
        # Unsorted iterator.
        records = SeqIO.parse(source_file, source_file_type,
                              alphabet=ALPHABETS.get(arguments.alphabet))

    #########################################
    # Apply generator functions to iterator.#
    #########################################

    # Apply all the transform functions in transforms
    if arguments.transforms:
        # Special case handling for --cut and --relative-to
        if arguments.cut_relative:
            # Replace the plain cut/mask partials with their *_relative
            # variants, keeping each at its original pipeline position.
            for o, n in ((transform.multi_cut_sequences,
                          transform.cut_sequences_relative),
                         (transform.multi_mask_sequences,
                          transform.mask_sequences_relative)):
                # Add a function to trim any columns which are gaps in the
                # sequence ID
                try:
                    f = next(f for f in arguments.transforms
                             if f.func == o)
                except StopIteration:
                    # This transform was not requested; nothing to replace.
                    continue
                i = arguments.transforms.index(f)
                arguments.transforms.pop(i)
                arguments.transforms.insert(
                    i, functools.partial(n,
                                         record_id=arguments.cut_relative,
                                         **f.keywords))

        for function in arguments.transforms:
            records = function(records)

    # NOTE(review): deduplication triggers both when the option is truthy and
    # when it is exactly None -- presumably None selects a default
    # deduplication mode; verify against the argument parser.
    if (arguments.deduplicate_sequences or
            arguments.deduplicate_sequences is None):
        records = transform.deduplicate_sequences(
            records, arguments.deduplicate_sequences)

    # Apply all the partial functions
    if arguments.apply_function:
        for apply_function in arguments.apply_function:
            records = apply_function(records)

    # Only the fasta format is supported, as SeqIO.write does not have a 'wrap'
    # parameter.
    if (arguments.line_wrap is not None and
            destination_file_type == 'fasta'):
        logging.info("Attempting to write fasta with %d line breaks.",
                     arguments.line_wrap)
        with destination_file:
            writer = FastaIO.FastaWriter(
                destination_file, wrap=arguments.line_wrap)
            writer.write_file(records)
    else:
        # Mogrify requires writing all changes to a temporary file by default,
        # but convert uses a destination file instead if one was specified. Get
        # sequences from an iterator that has generator functions wrapping it.
        # After creation, it is then copied back over the original file if all
        # tasks finish up without an exception being thrown. This avoids
        # loading the entire sequence file up into memory.
        logging.info("Applying transformations, writing to %s",
                     destination_file)
        SeqIO.write(records, destination_file, destination_file_type)
def action(arguments):
    """
    Given parsed arguments, filter input files.

    Builds a chain of record filters from the command-line options, streams
    the input through it, writes the surviving records to the output file,
    and finally emits a per-filter summary report.
    """
    # --quality-window-mean-qual is only meaningful with a window size.
    if arguments.quality_window_mean_qual and not arguments.quality_window:
        raise ValueError("--quality-window-mean-qual specified without "
                         "--quality-window")

    # Optional Biopython extras required by the barcode/primer machinery.
    if trie is None or triefind is None:
        raise ValueError('Missing Bio.trie and/or Bio.triefind modules. Cannot continue')

    filters = []
    input_type = fileformat.from_handle(arguments.sequence_file)
    output_type = fileformat.from_handle(arguments.output_file)
    with arguments.sequence_file as fp:
        if arguments.input_qual:
            # Pair a FASTA file with its matching .qual file.
            sequences = QualityIO.PairedFastaQualIterator(fp,
                                                          arguments.input_qual)
        else:
            sequences = SeqIO.parse(fp, input_type)

        listener = RecordEventListener()
        if arguments.details_out:
            # Per-record report of each filtering decision.
            rh = RecordReportHandler(arguments.details_out, arguments.argv,
                                     arguments.details_comment)
            rh.register_with(listener)
        # Track read sequences
        sequences = listener.iterable_hook('read', sequences)

        # Add filters
        if arguments.min_mean_quality and input_type == 'fastq':
            qfilter = QualityScoreFilter(arguments.min_mean_quality)
            filters.append(qfilter)
        if arguments.max_length:
            max_length_filter = MaxLengthFilter(arguments.max_length)
            filters.append(max_length_filter)
        if arguments.min_length:
            min_length_filter = MinLengthFilter(arguments.min_length)
            filters.append(min_length_filter)
        if arguments.max_ambiguous is not None:
            max_ambig_filter = MaxAmbiguousFilter(arguments.max_ambiguous)
            filters.append(max_ambig_filter)
        if arguments.pct_ambiguous is not None:
            pct_ambig_filter = PctAmbiguousFilter(arguments.pct_ambiguous)
            filters.append(pct_ambig_filter)
        if arguments.ambiguous_action:
            ambiguous_filter = AmbiguousBaseFilter(
                arguments.ambiguous_action)
            filters.append(ambiguous_filter)
        if arguments.quality_window:
            # Fall back to the overall minimum when no window-specific
            # threshold was supplied.
            min_qual = arguments.quality_window_mean_qual or \
                arguments.min_mean_quality
            # Inserted at the front so it runs before the other filters.
            window_filter = WindowQualityScoreFilter(arguments.quality_window,
                                                     min_qual)
            filters.insert(0, window_filter)

        if arguments.barcode_file:
            with arguments.barcode_file:
                tr = parse_barcode_file(arguments.barcode_file,
                                        arguments.primer,
                                        arguments.barcode_header)
            f = PrimerBarcodeFilter(tr)
            filters.append(f)

            if arguments.map_out:
                barcode_writer = csv.writer(arguments.map_out,
                                            quoting=getattr(
                                                csv, arguments.quoting),
                                            lineterminator='\n')

                def barcode_handler(record, sample, barcode=None):
                    # Emit a record-id -> sample row for each matched barcode.
                    barcode_writer.writerow((record.id, sample))
                listener.register_handler('found_barcode', barcode_handler)

        # Wire every filter to the listener and chain it onto the stream.
        for f in filters:
            f.listener = listener
            sequences = f.filter_records(sequences)

        # Track sequences which passed all filters
        sequences = listener.iterable_hook('write', sequences)
        with arguments.output_file:
            SeqIO.write(sequences, arguments.output_file, output_type)

    rpt_rows = (f.report_dict() for f in filters)

    # Write report
    with arguments.report_out as fp:
        writer = csv.DictWriter(fp, BaseFilter.report_fields,
                                lineterminator='\n', delimiter='\t')
        writer.writeheader()
        writer.writerows(rpt_rows)
def action(arguments):
    """
    Given parsed arguments, filter input files.

    Assembles a filter chain from the options, streams input records
    through it, writes survivors to the output file, then writes a
    per-filter tab-separated summary report.
    """
    # --quality-window-mean-qual is only meaningful with a window size.
    if arguments.quality_window_mean_qual and not arguments.quality_window:
        raise ValueError("--quality-window-mean-qual specified without "
                         "--quality-window")

    # Optional Biopython extras required by the barcode/primer machinery.
    if trie is None or triefind is None:
        raise ValueError(
            'Missing Bio.trie and/or Bio.triefind modules. Cannot continue')

    filters = []
    input_type = fileformat.from_handle(arguments.sequence_file)
    output_type = fileformat.from_handle(arguments.output_file)
    with arguments.sequence_file as fp:
        if arguments.input_qual:
            # Pair a FASTA file with its matching .qual file.
            sequences = QualityIO.PairedFastaQualIterator(
                fp, arguments.input_qual)
        else:
            sequences = SeqIO.parse(fp, input_type)

        listener = RecordEventListener()
        if arguments.details_out:
            # Per-record report of each filtering decision.
            rh = RecordReportHandler(arguments.details_out, arguments.argv,
                                     arguments.details_comment)
            rh.register_with(listener)
        # Track read sequences
        sequences = listener.iterable_hook('read', sequences)

        # Add filters
        if arguments.min_mean_quality and input_type == 'fastq':
            qfilter = QualityScoreFilter(arguments.min_mean_quality)
            filters.append(qfilter)
        if arguments.max_length:
            max_length_filter = MaxLengthFilter(arguments.max_length)
            filters.append(max_length_filter)
        if arguments.min_length:
            min_length_filter = MinLengthFilter(arguments.min_length)
            filters.append(min_length_filter)
        if arguments.max_ambiguous is not None:
            max_ambig_filter = MaxAmbiguousFilter(arguments.max_ambiguous)
            filters.append(max_ambig_filter)
        if arguments.pct_ambiguous is not None:
            pct_ambig_filter = PctAmbiguousFilter(arguments.pct_ambiguous)
            filters.append(pct_ambig_filter)
        if arguments.ambiguous_action:
            ambiguous_filter = AmbiguousBaseFilter(arguments.ambiguous_action)
            filters.append(ambiguous_filter)
        if arguments.quality_window:
            # Fall back to the overall minimum when no window-specific
            # threshold was supplied.
            min_qual = (arguments.quality_window_mean_qual or
                        arguments.min_mean_quality)
            # Inserted at the front so it runs before the other filters.
            window_filter = WindowQualityScoreFilter(arguments.quality_window,
                                                     min_qual)
            filters.insert(0, window_filter)

        if arguments.barcode_file:
            with arguments.barcode_file:
                tr = parse_barcode_file(arguments.barcode_file,
                                        arguments.primer,
                                        arguments.barcode_header)
            f = PrimerBarcodeFilter(tr)
            filters.append(f)

            if arguments.map_out:
                barcode_writer = csv.writer(
                    arguments.map_out,
                    quoting=getattr(csv, arguments.quoting),
                    lineterminator='\n')

                def barcode_handler(record, sample, barcode=None):
                    # Emit a record-id -> sample row for each matched barcode.
                    barcode_writer.writerow((record.id, sample))
                listener.register_handler('found_barcode', barcode_handler)

        # Wire every filter to the listener and chain it onto the stream.
        for f in filters:
            f.listener = listener
            sequences = f.filter_records(sequences)

        # Track sequences which passed all filters
        sequences = listener.iterable_hook('write', sequences)
        with arguments.output_file:
            SeqIO.write(sequences, arguments.output_file, output_type)

    rpt_rows = (f.report_dict() for f in filters)

    # Write report
    with arguments.report_out as fp:
        writer = csv.DictWriter(
            fp, BaseFilter.report_fields, lineterminator='\n', delimiter='\t')
        writer.writeheader()
        writer.writerows(rpt_rows)