def get_parser():
    """Build the command-line parser for abundance trimming.

    The parser takes a countgraph filename followed by one or more
    sequence filenames, then registers the trimming, output, version,
    force and quiet options.  ``add_threading_args`` and
    ``add_output_compression_type`` contribute their own options.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    usage_notes = """\
    Trimmed sequences will be placed in
    ``${input_sequence_filename}.abundfilt`` for each input sequence file. If
    the input sequences are from RNAseq or metagenome sequencing then
    :option:`--variable-coverage` should be used.

    Example::

        load-into-counting.py -k 20 -x 5e7 countgraph data/100k-filtered.fa
        filter-abund.py -C 2 countgraph data/100k-filtered.fa
    """
    parser = argparse.ArgumentParser(
        description="Trim sequences at a minimum k-mer abundance.",
        epilog=textwrap.dedent(usage_notes),
        formatter_class=ComboFormatter,
    )

    # Positionals: the countgraph comes first, then the sequence files.
    parser.add_argument(
        "input_graph",
        metavar="input_count_graph_filename",
        help="The input k-mer countgraph filename",
    )
    parser.add_argument(
        "input_filename",
        metavar="input_sequence_filename",
        help="Input FAST[AQ] sequence filename",
        nargs="+",
    )
    add_threading_args(parser)

    # Trimming controls.
    parser.add_argument(
        "--cutoff",
        "-C",
        dest="cutoff",
        default=DEFAULT_CUTOFF,
        type=check_argument_range(0, 256, "cutoff"),
        help="Trim at k-mers below this abundance.",
    )
    parser.add_argument(
        "--variable-coverage",
        "-V",
        action="store_true",
        dest="variable_coverage",
        default=False,
        help="Only trim low-abundance k-mers from sequences that have "
             "high coverage.",
    )
    parser.add_argument(
        "--normalize-to",
        "-Z",
        type=int,
        dest="normalize_to",
        help="Base the variable-coverage cutoff on this median k-mer "
             "abundance.",
        default=DEFAULT_NORMALIZE_LIMIT,
    )

    # Output and housekeeping options.
    parser.add_argument(
        "-o",
        "--output",
        dest="single_output_file",
        type=argparse.FileType("wb"),
        metavar="optional_output_filename",
        help="Output the trimmed sequences into a single file with the "
             "given filename instead of creating a new file for each "
             "input file.",
    )
    version_banner = "khmer {v}".format(v=__version__)
    parser.add_argument(
        "--version",
        action=_VersionStdErrAction,
        version=version_banner,
    )
    parser.add_argument(
        "-f",
        "--force",
        default=False,
        action="store_true",
        help="Overwrite output file if it exists",
    )
    parser.add_argument(
        "-q",
        "--quiet",
        dest="quiet",
        default=False,
        action="store_true",
    )
    add_output_compression_type(parser)
    return parser
def get_parser():
    """Build the command-line parser for single-file abundance trimming.

    Starts from the parser returned by ``build_counting_args`` and adds
    the trimming options, the countgraph save path, the output override,
    the single positional sequence file, and the force/quiet flags.
    ``add_threading_args`` and ``add_output_compression_type`` register
    their own options on the same parser.

    Returns:
        The configured parser object.
    """
    usage_notes = """\
    Trimmed sequences will be placed in
    ``${input_sequence_filename}.abundfilt``.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    parser = build_counting_args(
        descr="Trims sequences at a minimum k-mer abundance (in memory "
              "version).",
        epilog=textwrap.dedent(usage_notes),
        citations=["counting", "SeqAn"],
    )
    add_threading_args(parser)

    # Trimming controls.
    parser.add_argument(
        "--cutoff",
        "-C",
        default=DEFAULT_CUTOFF,
        type=check_argument_range(0, 256, "cutoff"),
        help="Trim at k-mers below this abundance.",
    )
    parser.add_argument(
        "--variable-coverage",
        "-V",
        action="store_true",
        dest="variable_coverage",
        default=False,
        help="Only trim low-abundance k-mers from sequences that have "
             "high coverage.",
    )
    parser.add_argument(
        "--normalize-to",
        "-Z",
        type=int,
        dest="normalize_to",
        help="Base the variable-coverage cutoff on this median k-mer "
             "abundance.",
        default=DEFAULT_NORMALIZE_LIMIT,
    )

    # Where results go.
    parser.add_argument(
        "--savegraph",
        metavar="filename",
        default="",
        help="If present, the name of the file to save the k-mer "
             "countgraph to",
    )
    parser.add_argument(
        "-o",
        "--outfile",
        metavar="optional_output_filename",
        default=None,
        help="Override default output filename and output trimmed "
             "sequences into a file with the given filename.",
    )

    # The one required positional input.
    parser.add_argument(
        "datafile",
        metavar="input_sequence_filename",
        help="FAST[AQ] sequence file to trim",
    )
    parser.add_argument(
        "-f",
        "--force",
        default=False,
        action="store_true",
        help="Overwrite output file if it exists",
    )
    parser.add_argument(
        "-q",
        "--quiet",
        dest="quiet",
        default=False,
        action="store_true",
    )
    add_output_compression_type(parser)
    return parser
def get_parser():
    """Build the command-line parser for single-file abundance trimming.

    The base parser comes from ``build_counting_args``; this function
    then registers (short option first) the cutoff, variable-coverage
    and normalize-to options, the countgraph save path, the output file
    override, the positional sequence file, and the force/quiet flags.
    ``add_threading_args`` and ``add_output_compression_type`` add their
    own options.

    Returns:
        The configured parser object.
    """
    usage_notes = """\
    Trimmed sequences will be placed in
    ``${input_sequence_filename}.abundfilt``.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    dedented_notes = textwrap.dedent(usage_notes)
    parser = build_counting_args(
        descr="Trims sequences at a minimum k-mer abundance (in memory "
              "version).",
        epilog=dedented_notes,
        citations=["counting", "SeqAn"],
    )
    add_threading_args(parser)

    # Trimming controls.
    parser.add_argument(
        "-C",
        "--cutoff",
        default=DEFAULT_CUTOFF,
        type=check_argument_range(0, 256, "cutoff"),
        help="Trim at k-mers below this abundance.",
    )
    parser.add_argument(
        "-V",
        "--variable-coverage",
        action="store_true",
        dest="variable_coverage",
        default=False,
        help="Only trim low-abundance k-mers from sequences that have "
             "high coverage.",
    )
    parser.add_argument(
        "-Z",
        "--normalize-to",
        type=int,
        dest="normalize_to",
        help="Base the variable-coverage cutoff on this median k-mer "
             "abundance.",
        default=DEFAULT_NORMALIZE_LIMIT,
    )

    # Where results go.
    parser.add_argument(
        "--savegraph",
        metavar="filename",
        default="",
        help="If present, the name of the file to save the k-mer "
             "countgraph to",
    )
    parser.add_argument(
        "-o",
        "--outfile",
        metavar="optional_output_filename",
        default=None,
        help="Override default output filename and output trimmed "
             "sequences into a file with the given filename.",
    )

    # The one required positional input.
    parser.add_argument(
        "datafile",
        metavar="input_sequence_filename",
        help="FAST[AQ] sequence file to trim",
    )
    parser.add_argument(
        "-f",
        "--force",
        default=False,
        action="store_true",
        help="Overwrite output file if it exists",
    )
    parser.add_argument(
        "-q",
        "--quiet",
        dest="quiet",
        default=False,
        action="store_true",
    )
    add_output_compression_type(parser)
    return parser
def get_parser():
    """Build the command-line parser for abundance trimming.

    A ``KhmerArgumentParser`` is created with the 'counting' citation,
    then given a countgraph positional, one or more sequence-file
    positionals, the trimming options, a single-output-file option and
    the force/quiet flags.  ``add_threading_args`` and
    ``add_output_compression_type`` register their own options.

    Returns:
        The configured ``KhmerArgumentParser``.
    """
    usage_notes = """\
    Trimmed sequences will be placed in
    ``${input_sequence_filename}.abundfilt`` for each input sequence file. If
    the input sequences are from RNAseq or metagenome sequencing then
    :option:`--variable-coverage` should be used.

    Example::

        load-into-counting.py -k 20 -x 5e7 countgraph data/100k-filtered.fa
        filter-abund.py -C 2 countgraph data/100k-filtered.fa
    """
    parser = KhmerArgumentParser(
        description="Trim sequences at a minimum k-mer abundance.",
        epilog=textwrap.dedent(usage_notes),
        citations=["counting"],
    )

    # Positionals: the countgraph comes first, then the sequence files.
    parser.add_argument(
        "input_graph",
        metavar="input_count_graph_filename",
        help="The input k-mer countgraph filename",
    )
    parser.add_argument(
        "input_filename",
        metavar="input_sequence_filename",
        help="Input FAST[AQ] sequence filename",
        nargs="+",
    )
    add_threading_args(parser)

    # Trimming controls.
    parser.add_argument(
        "--cutoff",
        "-C",
        dest="cutoff",
        default=DEFAULT_CUTOFF,
        type=check_argument_range(0, 256, "cutoff"),
        help="Trim at k-mers below this abundance.",
    )
    parser.add_argument(
        "--variable-coverage",
        "-V",
        action="store_true",
        dest="variable_coverage",
        default=False,
        help="Only trim low-abundance k-mers from sequences that have "
             "high coverage.",
    )
    parser.add_argument(
        "--normalize-to",
        "-Z",
        type=int,
        dest="normalize_to",
        help="Base the variable-coverage cutoff on this median k-mer "
             "abundance.",
        default=DEFAULT_NORMALIZE_LIMIT,
    )

    # Output and housekeeping options.
    parser.add_argument(
        "-o",
        "--output",
        dest="single_output_file",
        type=khFileType("wb"),
        metavar="optional_output_filename",
        help="Output the trimmed sequences into a single file with the "
             "given filename instead of creating a new file for each "
             "input file.",
    )
    parser.add_argument(
        "-f",
        "--force",
        default=False,
        action="store_true",
        help="Overwrite output file if it exists",
    )
    parser.add_argument(
        "-q",
        "--quiet",
        dest="quiet",
        default=False,
        action="store_true",
    )
    add_output_compression_type(parser)
    return parser
def get_parser():
    """Build the command-line parser for digital normalization.

    The base parser comes from ``build_counting_args`` (with the
    'diginorm' citation).  On top of it this function registers the
    quiet flag, the coverage cutoff, pairing controls, the countgraph
    save path, progress-report options, the force flag, the single
    output file option and one or more positional sequence files.
    ``add_loadgraph_args`` and ``add_output_compression_type`` then add
    their own options.

    Returns:
        The configured parser object.
    """
    # The epilog is user-facing help text; it is dedented before being
    # handed to the parser, so the indentation here is for readability.
    epilog = """\
    Discard sequences based on whether or not their median k-mer abundance lies
    above a specified cutoff. Kept sequences will be placed in <fileN>.keep.

    By default, paired end reads will be considered together; if
    either read should be kept, both will be kept. (This keeps both
    reads from a fragment, and helps with retention of repeats.)
    Unpaired reads are treated individually.

    If :option:`-p`/:option:`--paired` is set, then proper pairing is required
    and the script will exit on unpaired reads, although
    :option:`--unpaired-reads` can be used to supply a file of orphan
    reads to be read after the paired reads.

    :option:`--force_single` will ignore all pairing information and treat
    reads individually.

    With :option:`-s`/:option:`--savegraph`, the k-mer countgraph
    will be saved to the specified file after all sequences have been
    processed. :option:`-l`/:option:`--loadgraph` will load the
    specified k-mer countgraph before processing the specified
    files. Note that these graphs are in the same format as those
    produced by :program:`load-into-counting.py` and consumed by
    :program:`abundance-dist.py`.

    To append reads to an output file (rather than overwriting it), send output
    to STDOUT with `--output -` and use UNIX file redirection syntax (`>>`) to
    append to the file.

    Example::

        normalize-by-median.py -k 17 tests/test-data/test-abund-read-2.fa

    Example::

        normalize-by-median.py -p -k 17 \\
        tests/test-data/test-abund-read-paired.fa

    Example::

        normalize-by-median.py -p -k 17 -o - tests/test-data/paired.fq \\
        >> appended-output.fq

    Example::

        normalize-by-median.py -k 17 -f tests/test-data/test-error-reads.fq \\
        tests/test-data/test-fastq-reads.fq

    Example::

        normalize-by-median.py -k 17 -s test.ct \\
        tests/test-data/test-abund-read-2.fa \\
        tests/test-data/test-fastq-reads.fq"""

    parser = build_counting_args(
        descr="Do digital normalization (remove mostly redundant sequences)",
        epilog=textwrap.dedent(epilog),
        citations=['diginorm'])
    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')
    # The cutoff is validated by check_argument_range(0, 256, ...).
    parser.add_argument('-C', '--cutoff', help="when the median "
                        "k-mer coverage level is above this number the "
                        "read is not kept.",
                        type=check_argument_range(0, 256, "cutoff"),
                        default=DEFAULT_DESIRED_COVERAGE)
    # Pairing controls.
    parser.add_argument('-p', '--paired', action='store_true',
                        help='require that all sequences be properly paired')
    parser.add_argument('--force_single', dest='force_single',
                        action='store_true',
                        help='treat all sequences as single-ended/unpaired')
    parser.add_argument('-u', '--unpaired-reads',
                        metavar="unpaired_reads_filename",
                        help='include a file of unpaired reads to which '
                        '-p/--paired does not apply.')
    parser.add_argument('-s', '--savegraph', metavar="filename", default=None,
                        help='save the k-mer countgraph to disk after all '
                        'reads are loaded.')
    # Progress reporting: the report file is opened in text write mode.
    parser.add_argument('-R', '--report',
                        help='write progress report to report_filename',
                        metavar='report_filename',
                        type=argparse.FileType('w'))
    parser.add_argument('--report-frequency',
                        metavar='report_frequency', type=int, default=100000,
                        help='report progress every report_frequency reads')
    parser.add_argument('-f', '--force', dest='force',
                        help='continue past file reading errors',
                        action='store_true')
    parser.add_argument('-o', '--output', metavar="filename",
                        type=khFileType('wb'),
                        default=None, dest='single_output_file',
                        help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename.', nargs='+')
    add_loadgraph_args(parser)
    add_output_compression_type(parser)
    return parser
def get_parser():
    """Build the command-line parser for single-file abundance trimming.

    ``build_counting_args`` supplies the base parser; the trimming
    options, countgraph save path, output override, positional sequence
    file and force/quiet flags are registered here.  Threading and
    output-compression options are delegated to ``add_threading_args``
    and ``add_output_compression_type``.

    Returns:
        The configured parser object.
    """
    usage_notes = """\
    Trimmed sequences will be placed in
    ``${input_sequence_filename}.abundfilt``.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    dedented_notes = textwrap.dedent(usage_notes)
    argp = build_counting_args(
        descr='Trims sequences at a minimum k-mer abundance '
              '(in memory version).',
        epilog=dedented_notes,
        citations=['counting', 'SeqAn'])
    add_threading_args(argp)

    # Trimming controls.
    argp.add_argument('--cutoff', '-C', default=DEFAULT_CUTOFF,
                      type=check_argument_range(0, 256, 'cutoff'),
                      help='Trim at k-mers below this abundance.')
    argp.add_argument('--variable-coverage', '-V', action='store_true',
                      dest='variable_coverage', default=False,
                      help='Only trim low-abundance k-mers from '
                           'sequences that have high coverage.')
    argp.add_argument('--normalize-to', '-Z', type=int,
                      dest='normalize_to',
                      help='Base the variable-coverage cutoff on this '
                           'median k-mer abundance.',
                      default=DEFAULT_NORMALIZE_LIMIT)

    # Where results go.
    argp.add_argument('--savegraph', metavar='filename', default='',
                      help='If present, the name of the file to save '
                           'the k-mer countgraph to')
    argp.add_argument('-o', '--outfile',
                      metavar='optional_output_filename', default=None,
                      help='Override default output filename and output '
                           'trimmed sequences into a file with the given '
                           'filename.')

    # The one required positional input.
    argp.add_argument('datafile', metavar='input_sequence_filename',
                      help='FAST[AQ] sequence file to trim')
    argp.add_argument('-f', '--force', default=False, action='store_true',
                      help='Overwrite output file if it exists')
    argp.add_argument('-q', '--quiet', dest='quiet', default=False,
                      action='store_true')
    add_output_compression_type(argp)
    return argp