def write_seqs_to_files(seqs,
        max_num_seqs_per_file=float('inf'),
        format='fasta',
        compresslevel=None,
        prefix='',
        force=False):
    """
    Write the sequences in `seqs` to one or more numbered files.

    Output paths have the form ``<prefix>_<NNNN><ext>``, where ``NNNN`` is a
    1-based file index left-padded with zeros to four characters and
    ``ext`` is obtained from ``FILE_FORMATS.get_ext(format, compress)``.
    A new file is started whenever the running sequence index is a multiple
    of `max_num_seqs_per_file`; with the default (infinity) everything goes
    into a single file. No file is created if `seqs` is empty.

    `compresslevel` is passed through to `fileio.OpenFile`; any truthy value
    also selects the "compressed" extension.

    Raises `errors.PathExistsError` if an output path already exists and
    `force` is false.
    """
    compress = bool(compresslevel)
    ext = FILE_FORMATS.get_ext(format, compress)
    file_idx = 0
    file_stream = None
    # The try/finally guarantees the currently open output file is closed
    # even if formatting, writing, or the input iterator raises part-way.
    try:
        for seq_idx, seq in enumerate(seqs):
            if seq_idx % max_num_seqs_per_file == 0:
                # Roll over to the next output file.
                if file_stream:
                    file_stream.close()
                file_idx += 1
                path = '{0}_{1:0>4}{2}'.format(prefix, file_idx, ext)
                if os.path.exists(path) and (not force):
                    raise errors.PathExistsError(
                            'File {0} already exists'.format(path))
                file_stream = fileio.OpenFile(path, mode='w',
                        compresslevel=compresslevel)
            file_stream.write('{0}'.format(seq.format(format)))
    finally:
        if file_stream and (not file_stream.closed):
            file_stream.close()
def write_seqs_to_files(seqs,
        max_num_seqs_per_file=float('inf'),
        format='fasta',
        compresslevel=None,
        prefix='',
        force=False):
    """
    Split `seqs` across numbered output files.

    Each output file is named ``<prefix>_<NNNN><ext>``: ``NNNN`` is the
    1-based file counter zero-padded to four characters, and ``ext`` comes
    from ``FILE_FORMATS.get_ext(format, compress)``. A fresh file is opened
    every time the sequence index is a multiple of `max_num_seqs_per_file`
    (the default of infinity yields one file). An empty `seqs` creates no
    files.

    A truthy `compresslevel` selects the compressed extension and is
    forwarded to `fileio.OpenFile`.

    Raises `errors.PathExistsError` when a target path exists and `force`
    is false.
    """
    compress = bool(compresslevel)
    ext = FILE_FORMATS.get_ext(format, compress)
    file_idx = 0
    file_stream = None
    # Close whatever file is open on every exit path, including errors
    # raised while formatting or writing a sequence.
    try:
        for seq_idx, seq in enumerate(seqs):
            if seq_idx % max_num_seqs_per_file == 0:
                if file_stream:
                    file_stream.close()
                file_idx += 1
                path = '{0}_{1:0>4}{2}'.format(prefix, file_idx, ext)
                if os.path.exists(path) and (not force):
                    raise errors.PathExistsError(
                            'File {0} already exists'.format(path))
                file_stream = fileio.OpenFile(path, mode='w',
                        compresslevel=compresslevel)
            file_stream.write('{0}'.format(seq.format(format)))
    finally:
        if file_stream and (not file_stream.closed):
            file_stream.close()
def read_seq(file_obj, format=None, data_type='dna', ambiguities=True):
    """
    Returns a single SeqRecord from a file containing exactly one sequence
    record.

    If `format` is None, it is looked up from `file_obj` via
    `FILE_FORMATS.get_format_from_file_object`. `data_type` and
    `ambiguities` are passed to `get_state_alphabet` to build the alphabet
    handed to `SeqIO.read`.
    """
    if format is None:
        format = FILE_FORMATS.get_format_from_file_object(file_obj)
    _LOG.debug("reading sequence from {0!r}.".format(file_obj))
    return SeqIO.read(file_obj,
            format=format,
            alphabet=get_state_alphabet(data_type, ambiguities))
def get_seq_dict(file_obj, format=None, data_type='dna', ambiguities=True):
    """
    Returns a dict of SeqRecords from a sequence file.

    This loads all the sequences in the file into memory. This is efficient
    for small sequence files, but may cause memory issues with large files.

    If `format` is None, it is looked up from `file_obj` via
    `FILE_FORMATS.get_format_from_file_object`. The records are read with
    `get_seq_iter` and collected with `SeqIO.to_dict`.
    """
    if format is None:
        format = FILE_FORMATS.get_format_from_file_object(file_obj)
    seq_iter = get_seq_iter([file_obj],
            format=format,
            data_type=data_type,
            ambiguities=ambiguities)
    return SeqIO.to_dict(seq_iter)
def convert_format(in_file, out_file, in_format=None, out_format=None,
        data_type='dna', ambiguities=True):
    """
    Convert a sequence file from one format to another with `SeqIO.convert`.

    Any format given as None is looked up from the corresponding file
    argument via `FILE_FORMATS.get_format_from_file_object`. `data_type`
    and `ambiguities` are passed to `get_state_alphabet` to build the
    alphabet used for the conversion.

    Returns the value returned by `SeqIO.convert` (stored as `nseqs`).
    """
    if in_format is None:
        in_format = FILE_FORMATS.get_format_from_file_object(in_file)
    if out_format is None:
        out_format = FILE_FORMATS.get_format_from_file_object(out_file)
    _LOG.debug("converting {in_format}-formatted file {in_file!r} to "
               "{out_format}-formatted file {out_file!r}.".format(
                    in_file=in_file,
                    in_format=in_format,
                    out_file=out_file,
                    out_format=out_format))
    nseqs = SeqIO.convert(in_file=in_file,
            in_format=in_format,
            out_file=out_file,
            out_format=out_format,
            alphabet=get_state_alphabet(data_type, ambiguities))
    return nseqs
def get_seq_dict(file_obj, format=None, data_type='dna', ambiguities=True):
    """
    Returns a dict of SeqRecords from a sequence file.

    This loads all the sequences in the file into memory. This is efficient
    for small sequence files, but may cause memory issues with large files.

    When `format` is None it is determined from `file_obj` with
    `FILE_FORMATS.get_format_from_file_object`. Records come from
    `get_seq_iter` and are gathered into a dict by `SeqIO.to_dict`.
    """
    if format is None:
        format = FILE_FORMATS.get_format_from_file_object(file_obj)
    records = get_seq_iter([file_obj],
            format=format,
            data_type=data_type,
            ambiguities=ambiguities)
    return SeqIO.to_dict(records)
def convert_format(in_file, out_file, in_format=None, out_format=None,
        data_type='dna', ambiguities=True):
    """
    Convert `in_file` to `out_file` using `SeqIO.convert`.

    A format left as None is determined from the matching file argument
    with `FILE_FORMATS.get_format_from_file_object`. The alphabet passed to
    `SeqIO.convert` is built by `get_state_alphabet(data_type,
    ambiguities)`.

    Returns whatever `SeqIO.convert` returns (named `nseqs` here).
    """
    if in_format is None:
        in_format = FILE_FORMATS.get_format_from_file_object(in_file)
    if out_format is None:
        out_format = FILE_FORMATS.get_format_from_file_object(out_file)
    _LOG.debug("converting {in_format}-formatted file {in_file!r} to "
               "{out_format}-formatted file {out_file!r}.".format(
                    in_file=in_file,
                    in_format=in_format,
                    out_file=out_file,
                    out_format=out_format))
    nseqs = SeqIO.convert(
            in_file=in_file,
            in_format=in_format,
            out_file=out_file,
            out_format=out_format,
            alphabet=get_state_alphabet(data_type, ambiguities))
    return nseqs
def get_indexed_seq_iter(file_path, format=None, data_type='dna',
        key_function=None, ambiguities=True):
    """
    Returns an indexed SeqRecord iterator from a sequence file. Only
    supports sequential file formats (e.g., fasta and genbank).

    The iterator acts like a dict, but only parses a sequence from a file
    when needed. For large sequence files, this is a memory-efficient
    alternative to reading all the sequences into a dict.

    If `format` is None, it is looked up from `file_path` via
    `FILE_FORMATS.get_format_from_file_object`. `key_function` is passed
    straight through to `SeqIO.index`.
    """
    if format is None:
        format = FILE_FORMATS.get_format_from_file_object(file_path)
    _LOG.debug("parsing indexed SeqRecord iterator from {0!r}.".format(
            file_path))
    return SeqIO.index(file_path,
            format=format,
            alphabet=get_state_alphabet(data_type, ambiguities),
            key_function=key_function)
def __init__(self, file_obj, format=None, data_type='dna',
        ambiguities=True):
    """
    Set up a SeqRecord parser over `file_obj`.

    `file_obj` may be a path string or an already-open file-like object.
    A path string is opened here with `fileio.OpenFile` and `self._close`
    is set to True to record that this instance opened it; any other
    object is used as-is and `self._close` stays False.

    If `format` is None, it is looked up from `file_obj` via
    `FILE_FORMATS.get_format_from_file_object`.
    """
    # Class-wide counter gives each instance a fallback name.
    self.__class__.count += 1
    self.instance_name = '-'.join([self.__class__.__name__,
            str(self.count)])
    self.name = getattr(file_obj, 'name', self.instance_name)
    self._close = False
    self._file_obj = file_obj
    # Resolve the format before opening anything, so a failure here cannot
    # leave a freshly opened file handle behind.
    if format is None:
        format = FILE_FORMATS.get_format_from_file_object(file_obj)
    if isinstance(file_obj, str):
        self.name = file_obj
        self._file_obj = fileio.OpenFile(file_obj, 'r')
        self._close = True
    try:
        self._seqs = SeqIO.parse(self._file_obj,
                format=format,
                alphabet=get_state_alphabet(data_type, ambiguities))
    except Exception:
        # Only close handles we opened ourselves; then propagate the error.
        if self._close:
            self._file_obj.close()
        raise
def get_indexed_seq_iter(file_path, format=None, data_type='dna',
        key_function=None, ambiguities=True):
    """
    Returns an indexed SeqRecord iterator from a sequence file. Only
    supports sequential file formats (e.g., fasta and genbank).

    The iterator acts like a dict, but only parses a sequence from a file
    when needed. For large sequence files, this is a memory-efficient
    alternative to reading all the sequences into a dict.

    When `format` is None it is determined from `file_path` with
    `FILE_FORMATS.get_format_from_file_object`. `key_function` is forwarded
    unchanged to `SeqIO.index`.
    """
    if format is None:
        format = FILE_FORMATS.get_format_from_file_object(file_path)
    _LOG.debug(
            "parsing indexed SeqRecord iterator from {0!r}.".format(file_path))
    alphabet = get_state_alphabet(data_type, ambiguities)
    return SeqIO.index(file_path,
            format=format,
            alphabet=alphabet,
            key_function=key_function)
def __init__(self,
        file_obj,
        format=None,
        data_type='dna',
        ambiguities=True):
    """
    Initialise the instance with a SeqRecord parser over `file_obj`.

    `file_obj` is either a path string or an open file-like object. Path
    strings are opened with `fileio.OpenFile` and flagged via
    `self._close = True`; other objects are stored untouched with
    `self._close = False`.

    When `format` is None it is determined from `file_obj` with
    `FILE_FORMATS.get_format_from_file_object`.
    """
    # The class-level count supplies a default, unique-per-instance name.
    self.__class__.count += 1
    self.instance_name = '-'.join(
            [self.__class__.__name__, str(self.count)])
    self.name = getattr(file_obj, 'name', self.instance_name)
    self._close = False
    self._file_obj = file_obj
    # Format detection happens before the file is opened so that an error
    # in detection cannot leak an open handle.
    if format is None:
        format = FILE_FORMATS.get_format_from_file_object(file_obj)
    if isinstance(file_obj, str):
        self.name = file_obj
        self._file_obj = fileio.OpenFile(file_obj, 'r')
        self._close = True
    try:
        self._seqs = SeqIO.parse(self._file_obj,
                format=format,
                alphabet=get_state_alphabet(data_type, ambiguities))
    except Exception:
        # Release the handle only if it was opened here, then re-raise.
        if self._close:
            self._file_obj.close()
        raise
def main_cli():
    """
    Command-line entry point for vetting a sequence file.

    Parses the command line, configures logging, and then, depending on the
    options given:

    * writes the longest reading frame length of each sequence,
    * translates sequences before comparison,
    * writes duplicate sequence IDs,

    and always summarizes pairwise distances, writing mean and max
    distances per sequence and any potential reverse-complement errors to
    tab-delimited files in the output directory.

    Exits with status 1 (after logging an error and printing the help text
    to stderr) if the input format cannot be determined or if a
    translation-based option is combined with a non-nucleotide data type.
    """
    description = '{name} {version}\n\n{description}'.format(**_program_info)
    parser = argparse.ArgumentParser(
            description=description,
            formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('input_file', metavar='INPUT-SEQ-FILE',
            type=argparse_utils.arg_is_file,
            help=('Input sequence file to be vetted.'))

    comparison_args = parser.add_argument_group(
            'Comparison Options',
            'Options to control the number and nature of sequence comparisons')
    comparison_args.add_argument('-n', '--num-samples',
            type=int,
            default=0,
            help=('The number of randomly sampled sequences to which each '
                  'sequence will be compared. If less than 1 (the default is '
                  '0), all pairwise comparisons will be performed. For very '
                  'large numbers of sequences, performing all pairwise '
                  'comparisons will take a long time. This option will speed '
                  'things up as long as the number specified is less than '
                  'about half of the number of input sequences. If the '
                  'number you are considering is close to half of the number '
                  'of sequences, you should probably specify zero and do all '
                  'combinations. You should not specify a number greater than '
                  'half the number of sequences, because it will take longer '
                  'and be less thorough than the default.'))
    comparison_args.add_argument('--seed',
            action='store',
            type=int,
            help=('Random number seed to use for the analysis. This option '
                  'is only relevant if a number greater than 0 is specified '
                  'for the `-n/--num-samples` option.'))
    comparison_args.add_argument('--compare-translated',
            action='store_true',
            help=('Compare amino acid sequences encoded by the longest '
                  'reading frame found in each sequence. To use this option, '
                  '`data-type` must be dna or rna. See "Translation Options" '
                  'for controlling how the longest reading frame of each '
                  'sequence is determined and translated.'))
    comparison_args.add_argument('--check-ids',
            action='store_true',
            help=('Check sequence IDs for duplicates.'))
    comparison_args.add_argument('--summarize-reading-frame-lengths',
            action='store_true',
            help=('Report the length of the longest reading frame of '
                  'each sequence. See "Translation Options" for controlling '
                  'how reading frames are determined.'))
    comparison_args.add_argument('-g', '--count-gaps',
            action='store_true',
            help=('Count gaps when calculating pairwise sequence distances. '
                  'The default is to calculate (number of differences '
                  'ignoring gaps / number of aligned sites ignoring sites '
                  'with gaps) for each pairwise comparison. When this option '
                  'is used, the distance is (number of differences including '
                  'gap differences / total number of aligned sites).'))

    alignment_args = parser.add_argument_group(
            'Alignment Options',
            ('These options control if/how sequences are to be aligned prior '
             'to calculating distances.'))
    alignment_args.add_argument('-a', '--aligned',
            action='store_true',
            help=('Treat input sequences as aligned. I.e., do not perform '
                  'pairwise alignment before calculating distances between '
                  'sequences (except when calculating distances for reverse '
                  'and complemented sequences).'))
    alignment_args.add_argument('--aligner',
            type=argparse_utils.arg_is_executable,
            help=('Path to alignment program executable to use for pairwise '
                  'alignments of sequences. '
                  'The default is to look for muscle and then mafft in PATH, '
                  'and if neither are found use the (slow) built-in '
                  'function. Even if the `-a`/`--aligned` option is '
                  'specified, the aligner will still be used for pairwise '
                  'alignments when calculating distances of reverse and '
                  'complemented sequences.'))
    alignment_args.add_argument('--msa',
            action='store_true',
            help=('Perform a full multiple sequence alignment prior to '
                  'comparing sequences. The default is to align each '
                  'pair of sequences being compared. This option is '
                  'overruled by the `-a`/`--aligned` option. '
                  'If this option is used '
                  'the resulting alignment is written to file.'))
    alignment_args.add_argument('--msa-aligner',
            type=argparse_utils.arg_is_executable,
            help=('Path to alignment program executable to use for full '
                  'multiple sequence alignment. '
                  'The default is to look for mafft and then muscle in PATH, '
                  'and if neither are found the program will exit with an '
                  'error message. If you do not have mafft or muscle '
                  'you cannot use this option. '
                  'This option is only used if the `-a`/`--aligned` option '
                  'is not specified, and the `--msa` option is specified.'))

    translation_args = parser.add_argument_group(
            'Translation Options',
            ('These options control translation from nucleotide to amino acid '
             'sequences.'))
    translation_args.add_argument('--table',
            type=int,
            choices=(list(range(1, 7)) + list(range(9, 17)) +
                     list(range(21, 26))),
            default=1,
            help=('The translation table to use for any options associated '
                  'with translating nucleotide sequences to amino acids. '
                  'Option should be the integer that corresponds to the '
                  'desired translation table according to NCBI '
                  '(http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi). '
                  'The default is 1 (the "standard" code).'))
    translation_args.add_argument('--allow-partial',
            action='store_true',
            default=False,
            help=('Allow partial reading frames at the beginning (no start '
                  'codon) and end (no stop codon) of sequences.'))
    translation_args.add_argument('--read-after-stop',
            action='store_true',
            default=False,
            help=('A new reading frame begins immediately after a stop codon. '
                  'The default is to start reading frame at next start codon '
                  'after a stop codon. This option might be useful for exons.'))

    data_args = parser.add_argument_group(
            'Data Options',
            ('Options specifying the input data type and format'))
    data_args.add_argument('-d', '--data-type',
            type=str,
            choices=VALID_DATA_TYPES,
            default='dna',
            help=('The type of sequence data. The default is dna. Valid '
                  'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    data_args.add_argument('--format',
            dest='input_format',
            type=str,
            choices=FILE_FORMATS.supported_formats,
            help=('The format of the input sequence file. Valid options '
                  'include: {0}. By default, the format is guessed based on '
                  'the extension of the first input file. However, if '
                  'provided, this option will always take precedence over '
                  'the file extension.'.format(', '.join(
                        FILE_FORMATS.supported_formats))))

    output_args = parser.add_argument_group(
            'Output Options',
            'Options for controlling output of program')
    output_args.add_argument('-o', '--output-dir',
            type=argparse_utils.arg_is_dir,
            help=('The directory in which all output files will be written. '
                  'The default is to use the directory of the input file.'))

    messaging_args = parser.add_argument_group(
            'Messaging Options',
            ('These options control verbosity of messaging.'))
    messaging_args.add_argument('--log-frequency',
            type=argparse_utils.arg_is_nonnegative_int,
            default=1000,
            help=('The frequency at which to log progress. Default is to log '
                  'every 1000 sequence comparisons.'))
    messaging_args.add_argument('--quiet',
            action='store_true',
            help='Run without verbose messaging.')
    messaging_args.add_argument('--debug',
            action='store_true',
            help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## set up logging

    # The logging level environment variable is set before the logger is
    # created and before the remaining seqsift imports below.
    # NOTE(review): presumably those modules read the variable when their
    # loggers are created -- confirm before moving these imports.
    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if args.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if args.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name=__name__)

    ##########################################################################
    ## package imports

    from seqsift.utils import GLOBAL_RNG, dataio, functions, alphabets
    from seqsift.seqops import seqsum, seqmod, seqstats
    from seqsift.utils.fileio import OpenFile

    ##########################################################################
    ## handle args

    ## set seed if randomly sampling sequences
    if args.num_samples > 0:
        # Compare against None so an explicit `--seed 0` is honored.
        if args.seed is None:
            args.seed = random.randint(1, 999999999)
        GLOBAL_RNG.seed(args.seed)
        log.warning('Seed: {0}'.format(args.seed))

    ## get input file format
    if not args.input_format:
        args.input_format = FILE_FORMATS.get_format_from_file_object(
                args.input_file)
    if not args.input_format:
        log.error("Could not determine input format.\n"
                  "You must either provide the input format\n"
                  "using the '--format' option or have a recognizable\n"
                  "file extension on the input file name.\n"
                  "Here are the supported file extensions:\n{0}".format(
                        str(FILE_FORMATS)))
        parser.print_help(sys.stderr)
        sys.exit(1)

    aligner_tools = ['muscle', 'mafft']
    if args.aligner:
        aligner_tools = [args.aligner]
    full_aligner_tools = ['mafft', 'muscle']
    if args.msa_aligner:
        full_aligner_tools = [args.msa_aligner]

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.input_file)

    full_alignment_out_path = os.path.join(args.output_dir, 'seqvet-msa.txt')
    alphabet = alphabets.DnaAlphabet()
    if args.data_type in ['aa', 'protein']:
        alphabet = alphabets.ProteinAlphabet()

    if (args.summarize_reading_frame_lengths and
            (args.data_type not in ['dna', 'rna'])):
        log.error("`--summarize-reading-frame-lengths` is only compatible "
                  "with DNA or RNA.")
        parser.print_help(sys.stderr)
        sys.exit(1)

    if (args.compare_translated and (args.data_type not in ['dna', 'rna'])):
        log.error("`--compare-translated` is only compatible with DNA or RNA.")
        parser.print_help(sys.stderr)
        sys.exit(1)

    ##########################################################################
    ## heavy lifting

    seqs = dataio.get_seq_iter([args.input_file],
            format=args.input_format,
            data_type=args.data_type)

    if args.summarize_reading_frame_lengths:
        log.info('Summarizing longest reading frame lengths...')
        # Wrapped in a BufferedIter because `seqs` is used again below.
        if not isinstance(seqs, dataio.BufferedIter):
            seqs = dataio.BufferedIter(seqs)
        lengths = seqsum.summarize_longest_read_lengths(
                seqs,
                table=args.table,
                allow_partial=args.allow_partial,
                require_start_after_stop=(not args.read_after_stop))
        length_path = os.path.join(args.output_dir,
                'seqvet-reading-frame-lengths.txt')
        log.info('Writing longest reading frame lengths to file...')
        with OpenFile(length_path, 'w') as out:
            out.write('seq_id\tlrf\trev_comp_lrf\n')
            for (l, rc_l, seq_id) in lengths:
                out.write('{0}\t{1}\t{2}\n'.format(seq_id, l, rc_l))

    if args.compare_translated:
        log.info('Translating longest reading frames for distance '
                 'calculations...')
        seqs = seqmod.translate_longest_reading_frames(
                seqs,
                table=args.table,
                allow_partial=args.allow_partial,
                require_start_after_stop=(not args.read_after_stop))
        alphabet = alphabets.ProteinAlphabet()

    if args.check_ids:
        log.info('Checking sequence IDs...')
        if not isinstance(seqs, dataio.BufferedIter):
            seqs = dataio.BufferedIter(seqs)
        dups = seqstats.get_duplicate_ids(seqs)
        if len(dups) > 0:
            dup_path = functions.get_new_path(
                    os.path.join(args.output_dir, 'seqvet-duplicate-ids.txt'))
            log.warning('Duplicate IDs found! Writing them to '
                        '{0}'.format(dup_path))
            with OpenFile(dup_path, 'w') as out:
                for dup in dups:
                    out.write('{0}\n'.format(dup))
        else:
            log.info('No duplicate sequence IDs were found.')

    log.info('Calculating pairwise distances...')
    distances, rev_comp_errors = seqsum.summarize_distances(
            seqs,
            sample_size=args.num_samples,
            per_site=True,
            aligned=args.aligned,
            ignore_gaps=(not args.count_gaps),
            alphabet=alphabet,
            do_full_alignment=args.msa,
            full_alignment_out_path=full_alignment_out_path,
            aligner_tools=aligner_tools,
            full_aligner_tools=full_aligner_tools,
            log_frequency=args.log_frequency)
    log.info('Done!')

    log.info('Writing mean distances to file...')
    distances = sorted([(k, v) for k, v in iteritems(distances)],
            key=lambda x: x[1].mean,
            reverse=True)
    mean_path = functions.get_new_path(
            os.path.join(args.output_dir, 'seqvet-mean-distances.txt'))
    with OpenFile(mean_path, 'w') as out:
        out.write('seq_id\tmean_distance\n')
        for (seq_id, dist) in distances:
            out.write('{0}\t{1}\n'.format(seq_id, dist.mean))

    log.info('Writing max distances to file...')
    distances = sorted(distances,
            key=lambda x: x[1].maximum,
            reverse=True)
    max_path = functions.get_new_path(
            os.path.join(args.output_dir, 'seqvet-max-distances.txt'))
    with OpenFile(max_path, 'w') as out:
        out.write('seq_id\tmax_distance\n')
        for (seq_id, dist) in distances:
            out.write('{0}\t{1}\n'.format(seq_id, dist.maximum))

    if rev_comp_errors:
        rev_comp_errors = sorted(rev_comp_errors)
        # Report each unordered pair of sequences only once.
        rce_set = set()
        rce = []
        for (s1, s2, d, drc) in rev_comp_errors:
            pair = tuple(sorted([s1, s2]))
            if pair in rce_set:
                continue
            rce_set.add(pair)
            rce.append((pair[0], pair[1], d, drc))
        log.info('Writing potential reverse-complement errors to file...')
        path = functions.get_new_path(
                os.path.join(args.output_dir,
                        'seqvet-reverse-complement-warnings.txt'))
        with OpenFile(path, 'w') as out:
            out.write('seq1\tseq2\tdistance\trev_comp_distance\n')
            for (seq1, seq2, d, drc) in rce:
                out.write('{0}\t{1}\t{2}\t{3}\n'.format(seq1, seq2, d, drc))
def main_cli():
    """
    Command-line entry point for randomly sub-sampling sequences.

    Parses the command line, configures logging, seeds `GLOBAL_RNG`, draws
    `--num-samples` sequences from the input file(s) with
    `functions.sample_iter`, and writes them to standard output in the
    input format.

    Exits with status 1 (after logging an error and printing the help text
    to stderr) if the input format cannot be determined.
    """
    description = '{name} {version}\n\n{description}'.format(**_program_info)
    parser = argparse.ArgumentParser(
            description=description,
            formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('input_files', metavar='INPUT-SEQ-FILE',
            nargs='+',
            type=argparse_utils.arg_is_file,
            help=('Input sequence file(s) from which to randomly sub-sample '
                  'sequences (without replacement).'))
    parser.add_argument('-n', '--num-samples',
            type=int,
            required=True,
            help=('The number of sequences to randomly sample.'))
    parser.add_argument('--format',
            dest='input_format',
            type=str,
            choices=FILE_FORMATS.supported_formats,
            help=('The format of the input sequence file(s). Valid options '
                  'include: {0}. By default, the format is guessed based on '
                  'the extension of the first input file. However, if '
                  'provided, this option will always take precedence over '
                  'the file extension.'.format(', '.join(
                        FILE_FORMATS.supported_formats))))
    parser.add_argument('-d', '--data-type',
            type=str,
            choices=VALID_DATA_TYPES,
            default='dna',
            help=('The type of sequence data. The default is dna. Valid '
                  'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    parser.add_argument('--seed',
            action='store',
            type=int,
            help=('Random number seed.'))
    parser.add_argument('--quiet',
            action='store_true',
            help='Run without verbose messaging.')
    parser.add_argument('--debug',
            action='store_true',
            help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## set up logging

    # The logging level environment variable is set before the logger is
    # created and before the remaining seqsift imports below.
    # NOTE(review): presumably those modules read the variable when their
    # loggers are created -- confirm before moving these imports.
    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if args.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if args.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name=__name__)

    ##########################################################################
    ## package imports

    from seqsift.utils import dataio, GLOBAL_RNG, functions

    ##########################################################################
    ## handle args

    ## set seed if randomly sampling sequences
    # Compare against None so an explicit `--seed 0` is honored.
    if args.seed is None:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    log.warning('Seed: {0}'.format(args.seed))

    if not args.input_format:
        args.input_format = FILE_FORMATS.get_format_from_file_object(
                args.input_files[0])
    if not args.input_format:
        log.error("Could not determine input format.\n"
                  "You must either provide the input format\n"
                  "using the '--format' option or have a recognizable\n"
                  "file extension on the first input file.\n"
                  "Here are the supported file extensions:\n{0}".format(
                        str(FILE_FORMATS)))
        parser.print_help(sys.stderr)
        sys.exit(1)

    seqs = dataio.get_seq_iter(args.input_files,
            format=args.input_format,
            data_type=args.data_type)

    samples = functions.sample_iter(iterable=seqs,
            sample_size=args.num_samples)

    SeqIO.write(samples,
            handle=sys.stdout,
            format=args.input_format)
def main():
    """
    Command-line entry point for converting, filtering and reverse
    complementing a sequence file.

    Parses the command line with optparse, configures logging, reads the
    input file with `dataio.get_seq_iter`, applies whichever filter and
    reverse-complement options were requested (in the order they appear
    below), and writes the result to the output path, or to standard output
    when no output path is given.

    Exits with status 1 (after logging an error and printing the help text
    to stderr) on a wrong number of positional arguments, an undeterminable
    input/output format, or a reverse-complement option combined with a
    data type other than DNA or RNA.
    """
    description = '{name} {version}'.format(**_program_info)
    usage = ("\n %prog [options] <SEQ_INPUT_FILE> [<SEQ_OUTPUT_FILE>]")
    parser = OptionParser(usage=usage,
            description=description,
            version=_program_info['version'],
            add_help_option=True)

    format_opts = OptionGroup(parser,
            'Format Options',
            'These options designate file formats and data type.')
    format_opts.add_option('-f', '--from',
            dest='from_format',
            type='string',
            help=('The format of the input sequence file. Valid options '
                  'include: {0}. By default, the format is guessed based on '
                  'the extension of the input file. However, if provided, '
                  'this option will always take precedence over the file '
                  'extension.'.format(
                        ', '.join(FILE_FORMATS.supported_formats))))
    format_opts.add_option('-t', '--to',
            dest='to_format',
            type='string',
            help=('The desired format of the output sequence file. Valid '
                  'options include: {0}. By default, if an output file path '
                  'is provided, the format is guessed based on the extension '
                  'of this file. However, this option will always take '
                  'precedence over the file extension. Either this option or '
                  'an output file path with an extension is required; if '
                  'neither are provided the program will exit with an '
                  'error.'.format(
                        ', '.join(FILE_FORMATS.supported_formats))))
    format_opts.add_option('-d', '--data-type',
            dest='data_type',
            type='string',
            default='dna',
            help=('The type of sequence data. The default is dna. Valid '
                  'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    parser.add_option_group(format_opts)

    filter_opts = OptionGroup(parser,
            'Filter Options',
            'These options allow filtering of data by columns or sequences.')
    filter_opts.add_option('--remove-duplicates',
            dest='remove_duplicates',
            default=False,
            action='store_true',
            help=('Remove duplicate sequences (i.e., sequences with the same '
                  'ID and sequence). If a duplicate ID is found associated '
                  'with a different sequence, the program will exit with an '
                  'error.'))
    filter_opts.add_option('-x', '--ids-to-exclude',
            dest='ids_to_exclude',
            type='string',
            help=('Comma-delimited list of the ids of sequences to exclude.'))
    filter_opts.add_option('--remove-missing-columns',
            dest='remove_missing_columns',
            default=False,
            action='store_true',
            help=("Remove aligned columns with missing data. Characters to be "
                  "considered missing can be specified with the "
                  "--missing-characters option; the default is '?-'. "
                  "The proportion of rows that must contain these characters "
                  "for a column to be removed can be specified with the "
                  "--missing-column-proportion option; the default is 1.0. "
                  "Note, this option is only relevant to aligned sequences, "
                  "and will result in an error if the input sequences are not "
                  "aligned."))
    filter_opts.add_option('--missing-column-proportion',
            dest='missing_column_proportion',
            type='float',
            default=1.0,
            help=('The proportion of rows that must contain '
                  '--missing-characters for a column to be removed. '
                  'This option is only relevant in combination with the '
                  '--remove-missing-columns option.'))
    filter_opts.add_option('--remove-missing-sequences',
            dest='remove_missing_sequences',
            default=False,
            action='store_true',
            help=("Remove sequences with missing data. Characters to be "
                  "considered missing can be specified with the "
                  "--missing-characters option; the default is '?-'. "
                  "The proportion of the sites that must contain these "
                  "characters for a sequence to be removed can be specified "
                  "with the --missing-sequence-proportion option; the default "
                  "is 1.0."))
    filter_opts.add_option('--missing-sequence-proportion',
            dest='missing_sequence_proportion',
            type='float',
            default=1.0,
            help=('The proportion of sites that must contain '
                  '--missing-characters for a sequence to be removed. '
                  'This option is only relevant in combination with the '
                  '--remove-missing-sequences option.'))
    filter_opts.add_option('--missing-characters',
            dest='missing_characters',
            type='str',
            default='?-',
            help=("Characters to be considered missing and be used in "
                  "evaluating columns/sequences to remove with the "
                  "--remove-missing-columns and --remove-missing-sequences "
                  "options. The default is '?-'."))
    filter_opts.add_option('--remove-constant-columns',
            dest='remove_constant_columns',
            default=False,
            action='store_true',
            help=("Remove aligned columns with no variation."))
    parser.add_option_group(filter_opts)

    rev_comp_opts = OptionGroup(parser,
            'Reverse Complement Options',
            'These options are for reverse complementing sequences.')
    rev_comp_opts.add_option('--rev-comp',
            dest='rev_comp',
            default=False,
            action='store_true',
            help=("Reverse complement all sequences. This option overrides "
                  "all other reverse-complement options."))
    rev_comp_opts.add_option('--fix-rev-comp-by',
            dest='fix_rev_comp_by',
            type='choice',
            choices=['first', 'read'],
            help=("Try to correct reverse complement errors. "
                  "Options include 'first' and 'read'. If 'first' is "
                  "specified, sequences are returned in their orientation "
                  "that minimizes distance from the first sequence. "
                  "If 'read' is used, sequences are returned in their "
                  "orientation that has the longest read frame "
                  "(see 'Translation Options' for controlling translation "
                  "of reading frames)."))
    parser.add_option_group(rev_comp_opts)

    translation_opts = OptionGroup(parser,
            'Translation Options',
            ('These options control translation from nucleotide to amino acid '
             'sequences.'))
    # optparse compares the raw command-line string against `choices`, so
    # the valid table numbers must be given as strings; the selected value
    # is converted to an int where it is used below.
    table_choices = [str(i) for i in (list(range(1, 7)) +
            list(range(9, 17)) + list(range(21, 26)))]
    translation_opts.add_option('--table',
            type='choice',
            choices=table_choices,
            default='1',
            help=('The translation table to use for any options associated '
                  'with translating nucleotide sequences to amino acids. '
                  'Option should be the integer that corresponds to the '
                  'desired translation table according to NCBI '
                  '(http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi). '
                  'The default is 1 (the "standard" code).'))
    translation_opts.add_option('--allow-partial',
            default=False,
            action='store_true',
            help=('Allow partial reading frames at the beginning (no start '
                  'codon) and end (no stop codon) of sequences.'))
    translation_opts.add_option('--read-after-stop',
            default=False,
            action='store_true',
            help=('A new reading frame begins immediately after a stop codon. '
                  'The default is to start reading frame at next start codon '
                  'after a stop codon. This option might be useful for exons.'))
    parser.add_option_group(translation_opts)

    distance_opts = OptionGroup(parser,
            'Distance Options',
            ('These options control how distances between sequences are '
             'calculated.'))
    distance_opts.add_option('-g', '--count-gaps',
            default=False,
            action='store_true',
            help=('Count gaps when calculating pairwise sequence distances. '
                  'The default is to calculate (number of differences '
                  'ignoring gaps / number of aligned sites ignoring sites '
                  'with gaps) for each pairwise comparison. When this option '
                  'is used, the distance is (number of differences including '
                  'gap differences / total number of aligned sites).'))
    parser.add_option_group(distance_opts)

    messaging_opts = OptionGroup(parser,
            'Messaging Options',
            ('These options control verbosity of messaging.'))
    messaging_opts.add_option('--quiet',
            action='store_true',
            help='Run without verbose messaging.')
    messaging_opts.add_option('--debug',
            action='store_true',
            help='Run in debugging mode.')
    parser.add_option_group(messaging_opts)

    (options, args) = parser.parse_args()

    ##########################################################################
    ## set up logging

    # The logging level environment variable is set before the logger is
    # created and before the remaining seqsift imports below.
    # NOTE(review): presumably those modules read the variable when their
    # loggers are created -- confirm before moving these imports.
    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if options.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if options.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name=__name__)

    ##########################################################################
    ## package imports

    from seqsift.seqops import seqmod, seqfilter
    from seqsift.utils import dataio

    ##########################################################################
    ## handle args

    if len(args) < 1:
        log.error("Too few arguments. Expecting at least 1 argument:\n"
                  "the path to the input file.")
        parser.print_help(sys.stderr)
        sys.exit(1)
    if len(args) > 2:
        log.error("Too many arguments. Expecting at most 2 arguments:\n"
                  "The path to the input file (required), and the path to\n"
                  "output file (optional; defaults to standard output).")
        parser.print_help(sys.stderr)
        sys.exit(1)
    in_file_path = args[0]
    out_file_path = args[1] if len(args) == 2 else sys.stdout

    if options.from_format:
        in_format = options.from_format
    else:
        in_format = FILE_FORMATS.get_format_from_file_object(in_file_path)
    if not in_format:
        log.error("Could not determine format of input file.\n"
                  "You must either provide the format of the input file\n"
                  "using the '--from' option or have a recognized\n"
                  "file extension on the input file. Here are the supported\n"
                  "file extensions:\n{0}".format(str(FILE_FORMATS)))
        parser.print_help(sys.stderr)
        sys.exit(1)

    if options.to_format:
        out_format = options.to_format
    else:
        out_format = FILE_FORMATS.get_format_from_file_object(out_file_path)
    if not out_format:
        log.error("Could not determine format of output file.\n"
                  "You must either provide the format of the output file\n"
                  "using the '--to' option or have a recognized\n"
                  "file extension on the output file. Here are the supported\n"
                  "file extensions:\n{0}".format(str(FILE_FORMATS)))
        parser.print_help(sys.stderr)
        sys.exit(1)

    data_type = options.data_type

    # NOTE(review): an earlier version popped entries out of
    # `options.__dict__` and called `dataio.convert_format` when the dict
    # ended up empty. optparse always stores every option destination in
    # that dict, so the branch could never run; it has been dropped and the
    # general path below handles plain conversions, exactly as before.

    if ((options.rev_comp or options.fix_rev_comp_by) and
            (data_type.lower() not in ['dna', 'rna'])):
        log.error("You have selected an option for reverse complementing\n"
                  "sequences but the data type is not DNA or RNA.")
        parser.print_help(sys.stderr)
        sys.exit(1)

    seqs = dataio.get_seq_iter([in_file_path],
            format=in_format,
            data_type=data_type)

    if options.ids_to_exclude:
        to_exclude = [n.strip() for n in options.ids_to_exclude.split(',')]
        seqs = seqfilter.id_filter(seqs, to_exclude)
    if options.remove_duplicates:
        seqs = seqfilter.duplicate_id_filter(seqs)
    if options.remove_missing_sequences:
        seqs = seqfilter.row_filter(seqs,
                character_list=list(options.missing_characters),
                max_frequency=options.missing_sequence_proportion)
    if options.remove_missing_columns:
        seqs = seqfilter.column_filter(seqs,
                character_list=list(options.missing_characters),
                max_frequency=options.missing_column_proportion)
    if options.remove_constant_columns:
        seqs = seqfilter.constant_column_filter(seqs)

    # `--rev-comp` takes precedence over `--fix-rev-comp-by`.
    if options.rev_comp:
        log.info('Reverse complementing all sequences...')
        seqs = seqmod.reverse_complement(seqs)
    elif options.fix_rev_comp_by == 'first':
        log.info('Reverse complementing to match first sequence...')
        seqs = seqmod.reverse_complement_to_first_seq(seqs,
                per_site=True,
                aligned=False,
                ignore_gaps=(not options.count_gaps),
                alphabet=None,
                aligner_tools=['muscle', 'mafft'],
                log_frequency=100)
    elif options.fix_rev_comp_by == 'read':
        log.info('Reverse complementing to longest reading frame...')
        seqs = seqmod.reverse_complement_to_longest_reading_frame(seqs,
                gap_characters=['-'],
                table=int(options.table),
                allow_partial=options.allow_partial,
                require_start_after_stop=(not options.read_after_stop),
                log_frequency=100)

    SeqIO.write(seqs,
            handle=out_file_path,
            format=out_format)
def main_cli():
    """Command-line entry point: keep only selected alignment columns.

    Parses the command line, configures the logging level, reads the
    sequences from the input file(s), hands them to ``seqmod.dice`` together
    with the ``-k/--keep`` index pairs, and writes the result to standard
    output in the input format.

    Exits with status 1 (after logging an error and printing the help text
    to standard error) if the input format was not given and could not be
    determined from the first input file.
    """
    description = '{name} {version}\n\n{description}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'input_files',
        metavar='INPUT-SEQ-FILE',
        nargs='+',
        type=argparse_utils.arg_is_file,
        help=('Input sequence alignments(s).'))
    # Each use of -k appends one [begin, end] pair to `slices_to_keep`.
    parser.add_argument(
        '-k', '--keep',
        dest='slices_to_keep',
        action='append',
        nargs=2,
        metavar='COLUMN-INDEX',
        type=int,
        required=True,
        help=('Two integers specifying the beginning and end indices of '
              'columns to keep.'))
    parser.add_argument(
        '--format',
        dest='input_format',
        type=str,
        choices=FILE_FORMATS.supported_formats,
        help=('The format of the input sequence file(s). Valid options '
              'include: {0}. By default, the format is guessed based on '
              'the extension of the first input file. However, if '
              'provided, this option will always take precedence over '
              'the file extension.'.format(
                  ', '.join(FILE_FORMATS.supported_formats))))
    parser.add_argument(
        '-d', '--data-type',
        type=str,
        choices=VALID_DATA_TYPES,
        default='dna',
        help=('The type of sequence data. The default is dna. Valid '
              'options include: {0}.'.format(', '.join(
                  VALID_DATA_TYPES))))
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## set up logging

    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    # --debug is applied last, so it wins when both flags are given.
    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if args.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if args.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name=__name__)

    ##########################################################################
    ## package imports

    # Imported only after the logging level variable is set -- presumably so
    # that the package's own loggers pick it up (TODO confirm).
    from seqsift.utils import dataio
    from seqsift.seqops import seqmod

    ##########################################################################
    ## handle args

    if not args.input_format:
        args.input_format = FILE_FORMATS.get_format_from_file_object(
            args.input_files[0])
    if not args.input_format:
        log.error("Could not determine input format.\n"
                  "You must either provide the input format\n"
                  "using the '--format' option or have a recognizable\n"
                  "file extension on the first input file.\n"
                  "Here are the supported file extensions:\n{0}".format(
                      str(FILE_FORMATS)))
        # print_help() returns None and defaults to stdout; send the help
        # text to stderr explicitly so it accompanies the error message.
        parser.print_help(sys.stderr)
        sys.exit(1)

    seqs = dataio.get_seq_iter(args.input_files,
                               format=args.input_format,
                               data_type=args.data_type)
    new_seqs = seqmod.dice(seq_iter=seqs,
                           slices_to_keep=args.slices_to_keep)
    SeqIO.write(new_seqs, handle=sys.stdout, format=args.input_format)
def main_cli():
    """Command-line entry point: sample sequences from the input file(s).

    Parses the command line, configures the logging level, seeds
    ``GLOBAL_RNG`` (with ``--seed`` if given, otherwise with a freshly drawn
    integer that is logged), and writes the result of
    ``functions.sample_iter`` with ``sample_size = --num-samples`` to
    standard output in the input format.

    Exits with status 1 (after logging an error and printing the help text
    to standard error) if the input format was not given and could not be
    determined from the first input file.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        'input_files',
        metavar='INPUT-SEQ-FILE',
        nargs='+',
        type=argparse_utils.arg_is_file,
        help=('Input sequence file(s) to be output into files with '
              '`-n` sequences per file.'))
    parser.add_argument(
        '-n', '--num-samples',
        type=int,
        required=True,
        help=('The maximum number of sequences to put in each output '
              'file.'))
    parser.add_argument(
        '--format',
        dest='input_format',
        type=str,
        choices=FILE_FORMATS.supported_formats,
        help=('The format of the input sequence file(s). Valid options '
              'include: {0}. By default, the format is guessed based on '
              'the extension of the first input file. However, if '
              'provided, this option will always take precedence over '
              'the file extension.'.format(
                  ', '.join(FILE_FORMATS.supported_formats))))
    parser.add_argument(
        '-d', '--data-type',
        type=str,
        choices=VALID_DATA_TYPES,
        default='dna',
        help=('The type of sequence data. The default is dna. Valid '
              'options include: {0}.'.format(', '.join(
                  VALID_DATA_TYPES))))
    parser.add_argument(
        '--seed',
        action='store',
        type=int,
        help=('Random number seed.'))
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## set up logging

    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    # --debug is applied last, so it wins when both flags are given.
    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if args.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if args.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name=__name__)

    ##########################################################################
    ## package imports

    # Imported only after the logging level variable is set -- presumably so
    # that the package's own loggers pick it up (TODO confirm).
    from seqsift.utils import dataio, GLOBAL_RNG, functions

    ##########################################################################
    ## handle args

    ## set seed if randomly sampling sequences
    # Compare against None so that an explicit `--seed 0` is honoured rather
    # than silently replaced by a random seed.
    if args.seed is None:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    # Logged at WARNING level, so the seed is still reported under --quiet.
    log.warning('Seed: {0}'.format(args.seed))

    if not args.input_format:
        args.input_format = FILE_FORMATS.get_format_from_file_object(
            args.input_files[0])
    if not args.input_format:
        log.error("Could not determine input format.\n"
                  "You must either provide the input format\n"
                  "using the '--format' option or have a recognizable\n"
                  "file extension on the first input file.\n"
                  "Here are the supported file extensions:\n{0}".format(
                      str(FILE_FORMATS)))
        # print_help() returns None and defaults to stdout; send the help
        # text to stderr explicitly so it accompanies the error message.
        parser.print_help(sys.stderr)
        sys.exit(1)

    seqs = dataio.get_seq_iter(args.input_files,
                               format=args.input_format,
                               data_type=args.data_type)
    samples = functions.sample_iter(iterable=seqs,
                                    sample_size=args.num_samples)
    SeqIO.write(samples, handle=sys.stdout, format=args.input_format)
def main_cli():
    """Command-line entry point: split sequences across numbered files.

    Parses the command line, configures the logging level and writes the
    sequences from the input file(s) into output files named
    ``<prefix>_<NNNN><ext>`` holding at most ``-n`` sequences each.

    Formats reported as sequential by ``FILE_FORMATS.is_sequential`` are
    written through ``dataio.write_seqs_to_files``; all other formats are
    written one batch at a time with ``SeqIO.write``.

    Exits with status 1 if the input format cannot be determined, or if an
    output file already exists and ``--force`` was not given.
    """
    description = '{name} {version}\n\n{description}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'input_files',
        metavar='INPUT-SEQ-FILE',
        nargs='+',
        type=argparse_utils.arg_is_file,
        help=('Input sequence file(s) to be output into files with '
              '`-n` sequences per file.'))
    # NOTE(review): because the option is required, argparse never falls
    # back to `default`; confirm which of the two was intended.
    parser.add_argument(
        '-n', '--num-seqs-per-file',
        type=int,
        required=True,
        default=4000000,
        help=('The maximum number of sequences to put in each output '
              'file.'))
    parser.add_argument(
        '--format',
        dest='input_format',
        type=str,
        choices=FILE_FORMATS.supported_formats,
        help=('The format of the input sequence file(s). Valid options '
              'include: {0}. By default, the format is guessed based on '
              'the extension of the first input file. However, if '
              'provided, this option will always take precedence over '
              'the file extension.'.format(
                  ', '.join(FILE_FORMATS.supported_formats))))
    parser.add_argument(
        '-d', '--data-type',
        type=str,
        choices=VALID_DATA_TYPES,
        default='dna',
        help=('The type of sequence data. The default is dna. Valid '
              'options include: {0}.'.format(', '.join(
                  VALID_DATA_TYPES))))
    parser.add_argument(
        '-c', '--compress',
        action='store_true',
        help='Compress (gzip) output files.')
    parser.add_argument(
        '-o', '--output-dir',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the input file.'))
    parser.add_argument(
        '-p', '--prefix',
        action='store',
        type=str,
        help=('Prefix to use at beginning of output files. The default '
              'is to use the first input file name.'))
    # NOTE(review): this value is parsed but not referenced anywhere else in
    # this function.
    parser.add_argument(
        '--log-frequency',
        type=argparse_utils.arg_is_nonnegative_int,
        default=100000,
        help=('The frequency at which to log progress. Default is to log '
              'every 100000 sequences.'))
    parser.add_argument(
        '--force',
        action='store_true',
        help=('Overwrite files if they already exist.'))
    parser.add_argument(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    parser.add_argument(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## set up logging

    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    # --debug is applied last, so it wins when both flags are given.
    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if args.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if args.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name=__name__)

    ##########################################################################
    ## package imports

    # Imported only after the logging level variable is set -- presumably so
    # that the package's own loggers pick it up (TODO confirm).
    from seqsift.utils import dataio, errors
    from seqsift.utils.fileio import OpenFile

    ##########################################################################
    ## handle args

    if not args.input_format:
        args.input_format = FILE_FORMATS.get_format_from_file_object(
            args.input_files[0])
    if not args.input_format:
        log.error("Could not determine input format.\n"
                  "You must either provide the input format\n"
                  "using the '--format' option or have a recognizable\n"
                  "file extension on the first input file.\n"
                  "Here are the supported file extensions:\n{0}".format(
                      str(FILE_FORMATS)))
        # print_help() returns None and defaults to stdout; send the help
        # text to stderr explicitly so it accompanies the error message.
        parser.print_help(sys.stderr)
        sys.exit(1)

    # Default prefix: the first input path with its extension removed.
    if not args.prefix:
        args.prefix = os.path.splitext(args.input_files[0])[0]
    # An explicit output directory replaces the directory part of the prefix.
    if args.output_dir:
        args.prefix = os.path.join(args.output_dir,
                                   os.path.basename(args.prefix))
    out_ext = FILE_FORMATS.get_ext(args.input_format,
                                   compressed=args.compress)
    compresslevel = None
    if args.compress:
        compresslevel = 9

    # handle sequential formats on the fly
    if FILE_FORMATS.is_sequential(args.input_format):
        seq_iter = dataio.get_seq_iter(
            file_objs=args.input_files,
            format=args.input_format,
            data_type=args.data_type)
        try:
            dataio.write_seqs_to_files(
                seq_iter,
                max_num_seqs_per_file=args.num_seqs_per_file,
                format=args.input_format,
                compresslevel=compresslevel,
                prefix=args.prefix,
                force=args.force)
        except errors.PathExistsError as e:
            log.error('ERROR:\n'
                      'Output files already exist! You can specify a different\n'
                      'prefix or use the `--force` option to overwrite the\n'
                      'existing files. Here is the stack trace:\n\n{0}\n'.format(
                          e))
            sys.exit(1)
    # use SeqIO for non-sequential formats
    else:
        batch_iter = dataio.get_seq_batch_iter_from_files(
            file_objs=args.input_files,
            number_per_batch=args.num_seqs_per_file,
            format=args.input_format,
            data_type=args.data_type)
        for batch_idx, seq_iter in enumerate(batch_iter):
            # Output files are numbered from 1, zero-padded to four digits.
            out_path = '{0}_{1:0>4}{2}'.format(args.prefix,
                                               batch_idx + 1,
                                               out_ext)
            if os.path.exists(out_path) and (not args.force):
                log.error('ERROR:\n'
                          'Output files already exist! You can specify a '
                          'different\nprefix or use the `--force` option to '
                          'overwrite the\nexisting files.')
                sys.exit(1)
            out = OpenFile(out_path, mode='w', compresslevel=compresslevel)
            # Close the handle even if writing fails part-way through.
            try:
                SeqIO.write(seq_iter, handle=out, format=args.input_format)
            finally:
                out.close()
def main():
    """Command-line entry point: convert, filter and re-orient sequences.

    Parses ``[options] <SEQ_INPUT_FILE> [<SEQ_OUTPUT_FILE>]`` with optparse,
    configures the logging level, works out the input and output formats
    (explicit ``--from`` / ``--to`` options take precedence over file
    extensions), applies whichever filter and reverse-complement options
    were selected, in a fixed order, and writes the resulting sequences with
    ``SeqIO.write`` to the output path, or to standard output when no output
    path is given.

    Exits with status 1 (after logging an error and printing the help text
    to standard error) on a wrong number of positional arguments, an
    undeterminable input or output format, or a reverse-complement option
    combined with a data type other than DNA or RNA.
    """
    description = '{name} {version}'.format(**_program_info)
    usage = ("\n %prog [options] <SEQ_INPUT_FILE> [<SEQ_OUTPUT_FILE>]")
    parser = OptionParser(usage=usage,
                          description=description,
                          version=_program_info['version'],
                          add_help_option=True)

    format_opts = OptionGroup(
        parser, 'Format Options',
        'These options designate file formats and data type.')
    format_opts.add_option(
        '-f', '--from',
        dest='from_format',
        type='string',
        help=('The format of the input sequence file. Valid options '
              'include: {0}. By default, the format is guessed based on '
              'the extension of the input file. However, if provided, '
              'this option will always take precedence over the file '
              'extension.'.format(
                  ', '.join(FILE_FORMATS.supported_formats))))
    format_opts.add_option(
        '-t', '--to',
        dest='to_format',
        type='string',
        help=('The desired format of the output sequence file. Valid '
              'options include: {0}. By default, if an output file path '
              'is provided, the format is guessed based on the extension '
              'of this file. However, this option will always take '
              'precedence over the file extension. Either this option or '
              'an output file path with an extension is required; if '
              'neither are provided the program will exit with an '
              'error.'.format(', '.join(FILE_FORMATS.supported_formats))))
    format_opts.add_option(
        '-d', '--data-type',
        dest='data_type',
        type='string',
        default='dna',
        help=('The type of sequence data. The default is dna. Valid '
              'options include: {0}.'.format(', '.join(VALID_DATA_TYPES))))
    parser.add_option_group(format_opts)

    filter_opts = OptionGroup(
        parser, 'Filter Options',
        'These options allow filtering of data by columns or sequences.')
    filter_opts.add_option(
        '--remove-duplicates',
        dest='remove_duplicates',
        default=False,
        action='store_true',
        help=('Remove duplicate sequences (i.e., sequences with the same '
              'ID and sequence). If a duplicate ID is found associated '
              'with a different sequence, the program will exit with an '
              'error.'))
    filter_opts.add_option(
        '-x', '--ids-to-exclude',
        dest='ids_to_exclude',
        type='string',
        help=('Comma-delimited list of the ids of sequences to exclude.'))
    filter_opts.add_option(
        '--remove-missing-columns',
        dest='remove_missing_columns',
        default=False,
        action='store_true',
        help=("Remove aligned columns with missing data. Characters to be "
              "considered missing can be specified with the "
              "--missing-characters option; the default is '?-'. "
              "The proportion of rows that must contain these characters "
              "for a column to be removed can be specified with the "
              "--missing-column-proportion option; the default is 1.0. "
              "Note, this option is only relevant to aligned sequences, "
              "and will result in an error if the input sequences are not "
              "aligned."))
    filter_opts.add_option(
        '--missing-column-proportion',
        dest='missing_column_proportion',
        type='float',
        default=1.0,
        help=('The proportion of rows that must contain '
              '--missing-characters for a column to be removed. '
              'This option is only relevant in combination with the '
              '--remove-missing-columns option.'))
    filter_opts.add_option(
        '--remove-missing-sequences',
        dest='remove_missing_sequences',
        default=False,
        action='store_true',
        help=("Remove sequences with missing data. Characters to be "
              "considered missing can be specified with the "
              "--missing-characters option; the default is '?-'. "
              "The proportion of the sites that must contain these "
              "characters for a sequence to be removed can be specified "
              "with the --missing-sequence-proportion option; the default "
              "is 1.0."))
    filter_opts.add_option(
        '--missing-sequence-proportion',
        dest='missing_sequence_proportion',
        type='float',
        default=1.0,
        help=('The proportion of sites that must contain '
              '--missing-characters for a sequence to be removed. '
              'This option is only relevant in combination with the '
              '--remove-missing-sequences option.'))
    filter_opts.add_option(
        '--missing-characters',
        dest='missing_characters',
        type='str',
        default='?-',
        help=("Characters to be considered missing and be used in "
              "evaluating columns/sequences to remove with the "
              "--remove-missing-columns and --remove-missing-sequences "
              "options. The default is '?-'."))
    filter_opts.add_option(
        '--remove-constant-columns',
        dest='remove_constant_columns',
        default=False,
        action='store_true',
        help=("Remove aligned columns with no variation."))
    parser.add_option_group(filter_opts)

    rev_comp_opts = OptionGroup(
        parser, 'Reverse Complement Options',
        'These options are for reverse complementing sequences.')
    rev_comp_opts.add_option(
        '--rev-comp',
        dest='rev_comp',
        default=False,
        action='store_true',
        help=("Reverse complement all sequences. This option overrides "
              "all other reverse-complement options."))
    rev_comp_opts.add_option(
        '--fix-rev-comp-by',
        dest='fix_rev_comp_by',
        type='choice',
        choices=['first', 'read'],
        help=("Try to correct reverse complement errors. "
              "Options include 'first' and 'read'. If 'first' is "
              "specified, sequences are returned in their orientation "
              "that minimizes distance from the first sequence. "
              "If 'read' is used, sequences are returned in their "
              "orientation that has the longest read frame "
              "(see 'Translation Options' for controlling translation "
              "of reading frames)."))
    parser.add_option_group(rev_comp_opts)

    translation_opts = OptionGroup(
        parser, 'Translation Options',
        ('These options control translation from nucleotide to amino acid '
         'sequences.'))
    # optparse compares the raw command-line string against `choices`, so
    # the table numbers must be offered as strings; with integer choices
    # every user-supplied value would be rejected.  The selected value is
    # converted back to an int where it is used below.
    table_choices = [
        str(number) for number in
        list(range(1, 7)) + list(range(9, 17)) + list(range(21, 26))]
    translation_opts.add_option(
        '--table',
        type='choice',
        choices=table_choices,
        default='1',
        help=('The translation table to use for any options associated '
              'with translating nucleotide sequences to amino acids. '
              'Option should be the integer that corresponds to the '
              'desired translation table according to NCBI '
              '(http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi). '
              'The default is 1 (the "standard" code).'))
    translation_opts.add_option(
        '--allow-partial',
        default=False,
        action='store_true',
        help=('Allow partial reading frames at the beginning (no start '
              'codon) and end (no stop codon) of sequences.'))
    translation_opts.add_option(
        '--read-after-stop',
        default=False,
        action='store_true',
        help=('A new reading frame begins immediately after a stop codon. '
              'The default is to start reading frame at next start codon '
              'after a stop codon. This option might be useful for exons.'))
    parser.add_option_group(translation_opts)

    distance_opts = OptionGroup(
        parser, 'Distance Options',
        ('These options control how distances between sequences are '
         'calculated.'))
    distance_opts.add_option(
        '-g', '--count-gaps',
        default=False,
        action='store_true',
        help=('Count gaps when calculating pairwise sequence distances. '
              'The default is to calculate (number of differences '
              'ignoring gaps / number of aligned sites ignoring sites '
              'with gaps) for each pairwise comparison. When this option '
              'is used, the distance is (number of differences including '
              'gap differences / total number of aligned sites).'))
    parser.add_option_group(distance_opts)

    messaging_opts = OptionGroup(
        parser, 'Messaging Options',
        ('These options control verbosity of messaging.'))
    messaging_opts.add_option(
        '--quiet',
        action='store_true',
        help='Run without verbose messaging.')
    messaging_opts.add_option(
        '--debug',
        action='store_true',
        help='Run in debugging mode.')
    parser.add_option_group(messaging_opts)

    (options, args) = parser.parse_args()

    ##########################################################################
    ## set up logging

    from seqsift.utils.messaging import get_logger, LOGGING_LEVEL_ENV_VAR

    # --debug is applied last, so it wins when both flags are given.
    os.environ[LOGGING_LEVEL_ENV_VAR] = "INFO"
    if options.quiet:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "WARNING"
    if options.debug:
        os.environ[LOGGING_LEVEL_ENV_VAR] = "DEBUG"
    log = get_logger(name=__name__)

    ##########################################################################
    ## package imports

    # Imported only after the logging level variable is set -- presumably so
    # that the package's own loggers pick it up (TODO confirm).
    from seqsift.seqops import seqmod, seqfilter
    from seqsift.utils import dataio

    ##########################################################################
    ## handle args

    if len(args) == 1:
        in_file_path = args[0]
        out_file_path = sys.stdout
    elif len(args) == 2:
        in_file_path = args[0]
        out_file_path = args[1]
    elif len(args) > 2:
        log.error("Too many arguments. Expecting at most 2 arguments:\n"
                  "The path to the input file (required), and the path to\n"
                  "output file (optional; defaults to standard output).")
        # print_help() returns None and defaults to stdout; send the help
        # text to stderr explicitly so it accompanies the error message.
        parser.print_help(sys.stderr)
        sys.exit(1)
    elif len(args) < 1:
        log.error("Too few arguments. Expecting at least 1 argument:\n"
                  "the path to the input file.")
        parser.print_help(sys.stderr)
        sys.exit(1)

    # Consumed options are popped from the parsed-options dictionary.
    opt_dict = options.__dict__
    if options.from_format:
        in_format = opt_dict.pop('from_format')
    else:
        in_format = FILE_FORMATS.get_format_from_file_object(in_file_path)
    if not in_format:
        log.error("Could not determine format of input file.\n"
                  "You must either provide the format of the input file\n"
                  "using the '--from' option or have a recognized\n"
                  "file extension on the input file. Here are the supported\n"
                  "file extensions:\n{0}".format(str(FILE_FORMATS)))
        parser.print_help(sys.stderr)
        sys.exit(1)
    if options.to_format:
        out_format = opt_dict.pop('to_format')
    else:
        out_format = FILE_FORMATS.get_format_from_file_object(out_file_path)
    if not out_format:
        log.error("Could not determine format of output file.\n"
                  "You must either provide the format of the output file\n"
                  "using the '--to' option or have a recognized\n"
                  "file extension on the output file. Here are the supported\n"
                  "file extensions:\n{0}".format(str(FILE_FORMATS)))
        parser.print_help(sys.stderr)
        sys.exit(1)
    data_type = opt_dict.pop('data_type')

    # NOTE(review): every option registered above leaves an entry in
    # `opt_dict` even when unused, so this plain-conversion shortcut looks
    # unreachable as written -- confirm the intended condition.
    if len(opt_dict) == 0:
        dataio.convert_format(in_file=in_file_path,
                              out_file=out_file_path,
                              in_format=in_format,
                              out_format=out_format,
                              data_type=data_type)
        sys.exit(0)

    if ((options.rev_comp or options.fix_rev_comp_by) and
            (data_type.lower() not in ['dna', 'rna'])):
        log.error("You have selected an option for reverse complementing\n"
                  "sequences but the data type is not DNA or RNA.")
        parser.print_help(sys.stderr)
        sys.exit(1)

    seqs = dataio.get_seq_iter([in_file_path],
                               format=in_format,
                               data_type=data_type)

    # Each selected step wraps the result of the previous one.
    if options.ids_to_exclude:
        to_exclude = [n.strip() for n in options.ids_to_exclude.split(',')]
        seqs = seqfilter.id_filter(seqs, to_exclude)
    if options.remove_duplicates:
        seqs = seqfilter.duplicate_id_filter(seqs)
    if options.remove_missing_sequences:
        seqs = seqfilter.row_filter(
            seqs,
            character_list=list(options.missing_characters),
            max_frequency=options.missing_sequence_proportion)
    if options.remove_missing_columns:
        seqs = seqfilter.column_filter(
            seqs,
            character_list=list(options.missing_characters),
            max_frequency=options.missing_column_proportion)
    if options.remove_constant_columns:
        seqs = seqfilter.constant_column_filter(seqs)

    # --rev-comp takes precedence over --fix-rev-comp-by.
    if options.rev_comp:
        log.info('Reverse complementing all sequences...')
        seqs = seqmod.reverse_complement(seqs)
    elif options.fix_rev_comp_by == 'first':
        log.info('Reverse complementing to match first sequence...')
        seqs = seqmod.reverse_complement_to_first_seq(
            seqs,
            per_site=True,
            aligned=False,
            ignore_gaps=(not options.count_gaps),
            alphabet=None,
            aligner_tools=['muscle', 'mafft'],
            log_frequency=100)
    elif options.fix_rev_comp_by == 'read':
        log.info('Reverse complementing to longest reading frame...')
        seqs = seqmod.reverse_complement_to_longest_reading_frame(
            seqs,
            gap_characters=['-'],
            # Choice values are strings; hand the table number on as an int.
            table=int(options.table),
            allow_partial=options.allow_partial,
            require_start_after_stop=(not options.read_after_stop),
            log_frequency=100)

    SeqIO.write(seqs, handle=out_file_path, format=out_format)