def get_parser():
    """Build the command-line parser for load-into-counting.

    Returns an argparse parser configured with countgraph sizing options
    (via build_counting_args), threading options, and this script's own
    input/output and bigcount flags.
    """
    epilog = """\
    Note: with :option:`-b`/:option:`--no-bigcount` the output will be the
    exact size of the k-mer countgraph and this script will use a constant
    amount of memory. In exchange k-mer counts will stop at 255. The memory
    usage of this script with :option:`-b` will be about 1.15x the product of
    the :option:`-x` and :option:`-N` numbers.

    Example::

        load-into-counting.py -k 20 -x 5e7 out data/100k-filtered.fa

    Multiple threads can be used to accelerate the process, if you have extra
    cores to spare.

    Example::

        load-into-counting.py -k 20 -x 5e7 -T 4 out data/100k-filtered.fa
    """
    parser = build_counting_args(
        "Build a k-mer countgraph from the given"
        " sequences.", epilog=textwrap.dedent(epilog))
    add_threading_args(parser)
    parser.add_argument('output_countgraph_filename', help="The name of the"
                        " file to write the k-mer countgraph to.")
    parser.add_argument('input_sequence_filename', nargs='+',
                        help="The names of one or more FAST[AQ] input "
                        "sequence files.")
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false', help="The default behaviour is "
                        "to count past 255 using bigcount. This flag turns "
                        "bigcount off, limiting counts to 255.")
    # Fix: the str('json')/str('tsv') wrappers were a Python 2 native-string
    # artifact and are redundant on Python 3; plain literals behave the same.
    parser.add_argument('--summary-info', '-s', type=str, default=None,
                        metavar="FORMAT", choices=['json', 'tsv'],
                        help="What format should the machine readable run "
                        "summary be in? (`json` or `tsv`, disabled by"
                        " default)")
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    # NOTE(review): -q/--quiet has no help text and is therefore undocumented
    # in --help output; confirm intended semantics before documenting it.
    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')
    return parser
def get_parser():
    """Construct the argument parser for load_into_counting."""
    epilog = """
    Note: with :option:`-b` the output will be the exact size of the k-mer
    counting table and this script will use a constant amount of memory. In
    exchange k-mer counts will stop at 255. The memory usage of this script
    with :option:`-b` will be about 1.15x the product of the :option:`-x`
    and :option:`-N` numbers.

    Example::

        load_into_counting.py -k 20 -x 5e7 out.kh data/100k-filtered.fa

    Multiple threads can be used to accelerate the process, if you have
    extra cores to spare.

    Example::

        load_into_counting.py -k 20 -x 5e7 -T 4 out.kh data/100k-filtered.fa
    """
    description = ("Build a k-mer counting table from the given"
                   " sequences.")
    parser = build_counting_args(description,
                                 epilog=textwrap.dedent(epilog))
    add_threading_args(parser)

    # Positional arguments: output table first, then one or more inputs.
    parser.add_argument(
        "output_countingtable_filename",
        help="The name of the"
             " file to write the k-mer counting table to.")
    parser.add_argument(
        "input_sequence_filename", nargs="+",
        help="The names of one or more FAST[AQ] input "
             "sequence files.")
    # bigcount defaults to on; -b switches it off.
    parser.add_argument(
        "-b", "--no-bigcount", dest="bigcount", action="store_false",
        default=True, help="Do not count k-mers past 255")
    return parser
def get_parser():
    """Construct the argument parser for load-into-counting."""
    epilog = """
    Note: with :option:`-b` the output will be the exact size of the k-mer
    counting table and this script will use a constant amount of memory. In
    exchange k-mer counts will stop at 255. The memory usage of this script
    with :option:`-b` will be about 1.15x the product of the :option:`-x`
    and :option:`-N` numbers.

    Example::

        load-into-counting.py -k 20 -x 5e7 out.kh data/100k-filtered.fa

    Multiple threads can be used to accelerate the process, if you have
    extra cores to spare.

    Example::

        load_into_counting.py -k 20 -x 5e7 -T 4 out.kh data/100k-filtered.fa
    """
    description = ("Build a k-mer counting table from the given"
                   " sequences.")
    parser = build_counting_args(description,
                                 epilog=textwrap.dedent(epilog))
    add_threading_args(parser)

    parser.add_argument(
        "output_countingtable_filename",
        help="The name of the"
             " file to write the k-mer counting table to.")
    parser.add_argument(
        "input_sequence_filename", nargs="+",
        help="The names of one or more FAST[AQ] input "
             "sequence files.")
    parser.add_argument(
        "-b", "--no-bigcount", dest="bigcount", action="store_false",
        default=True, help="Do not count k-mers past 255")
    parser.add_argument(
        "--report-total-kmers", "-t", action="store_true",
        help="Prints the total number of k-mers to stderr")
    return parser
def get_parser():
    """Construct the argument parser for find-knots."""
    epilog = """\
    Load an k-mer nodegraph/tagset pair created by
    :program:`load-graph.py`, and a set of pmap files created by
    :program:`partition-graph.py`. Go through each pmap file, select the
    largest partition in each, and do the same kind of traversal as in
    :program:`make-initial-stoptags.py` from each of the waypoints in that
    partition; this should identify all of the Highly Connected Kmers in
    that partition. These HCKs are output to ``<graphbase>.stoptags`` after
    each pmap file.

    Parameter choice is reasonably important. See the pipeline in
    :doc:`partitioning-big-data` for an example run.

    This script is not very scalable and may blow up memory and die
    horribly. You should be able to use the intermediate stoptags to
    restart the process, and if you eliminate the already-processed pmap
    files, you can continue where you left off.
    """
    parser = build_counting_args(
        descr="Find all highly connected k-mers.",
        epilog=textwrap.dedent(epilog),
        citations=['graph'])

    # Single positional: shared basename for input/output files.
    parser.add_argument('graphbase',
                        help='Basename for the input and output '
                             'files.')
    parser.add_argument('-f', '--force', action='store_true',
                        default=False, help='Continue past warnings')
    return parser
def get_parser():
    """Construct the argument parser for make-initial-stoptags."""
    epilog = """
    Loads a k-mer nodegraph/tagset pair created by load-graph.py, and does
    a small set of traversals from graph waypoints; on these traversals,
    looks for k-mers that are repeatedly traversed in high-density regions
    of the graph, i.e. are highly connected. Outputs those k-mers as an
    initial set of stoptags, which can be fed into partition-graph.py,
    find-knots.py, and filter-stoptags.py.

    The k-mer countgraph size options parameters are for a k-mer countgraph
    to keep track of repeatedly-traversed k-mers. The subset size option
    specifies the number of waypoints from which to traverse; for highly
    connected data sets, the default (1000) is probably ok.
    """
    parser = build_counting_args(
        descr="Find an initial set of highly connected k-mers.",
        epilog=textwrap.dedent(epilog))

    parser.add_argument('--subset-size', '-s', dest='subset_size',
                        type=float, default=DEFAULT_SUBSET_SIZE,
                        help='Set subset size (default 1e4 is prob ok)')
    parser.add_argument('--stoptags', '-S', metavar='filename',
                        default='',
                        help="Use stoptags in this file during partitioning")
    parser.add_argument('graphbase',
                        help='basename for input and output '
                             'filenames')
    parser.add_argument('-f', '--force', action='store_true',
                        default=False,
                        help='Overwrite output file if it exists')
    return parser
def test_check_tablespace(graph_type, buckets_per_byte):
    """Validate free-disk-space checking for graph files.

    Checks three cases: insufficient space without force (must SystemExit
    with an error), insufficient space with force (must only warn on
    stderr), and sufficient space (must stay silent).
    """
    oldstderr = sys.stderr
    sys.stderr = StringIO()
    # Fix: restore sys.stderr in a finally block; previously a failing
    # assertion left stderr redirected for the rest of the test session.
    try:
        outfile = utils.get_test_data('truncated.fq')

        parser = khmer_args.build_counting_args()
        args = parser.parse_args(['-M', '16G'])
        buckets_per_table = khmer_args.calculate_graphsize(args, graph_type)
        total_buckets = buckets_per_table * args.n_tables
        space_needed = total_buckets / buckets_per_byte

        # First, try with insufficient space
        with pytest.raises(SystemExit) as se:
            khmer.kfile.check_space_for_graph(outfile, space_needed,
                                              force=False,
                                              _testhook_free_space=10e9)
        assert 'ERROR: Not enough free space' in str(se)

        # Now, try with insufficient space, but in force mode
        khmer.kfile.check_space_for_graph(outfile, space_needed, force=True,
                                          _testhook_free_space=10e9)
        assert 'WARNING: Not enough free space' in sys.stderr.getvalue()

        # Finally, try with sufficient space
        sys.stderr = StringIO()
        khmer.kfile.check_space_for_graph(outfile, space_needed, force=False,
                                          _testhook_free_space=20e9)
        assert sys.stderr.getvalue() == ''
    finally:
        sys.stderr = oldstderr
def get_parser():
    """Construct the argument parser for make-initial-stoptags."""
    epilog = """
    Loads a k-mer presence table/tagset pair created by load-graph.py, and
    does a small set of traversals from graph waypoints; on these
    traversals, looks for k-mers that are repeatedly traversed in
    high-density regions of the graph, i.e. are highly connected. Outputs
    those k-mers as an initial set of stoptags, which can be fed into
    partition-graph.py, find-knots.py, and filter-stoptags.py.

    The k-mer counting table size options parameters are for a k-mer
    counting table to keep track of repeatedly-traversed k-mers. The subset
    size option specifies the number of waypoints from which to traverse;
    for highly connected data sets, the default (1000) is probably ok.
    """
    parser = build_counting_args(
        descr="Find an initial set of highly connected k-mers.",
        epilog=textwrap.dedent(epilog))

    parser.add_argument('--subset-size', '-s', dest='subset_size',
                        type=float, default=DEFAULT_SUBSET_SIZE,
                        help='Set subset size (default 1e4 is prob ok)')
    parser.add_argument('--stoptags', '-S', metavar='filename',
                        default='',
                        help="Use stoptags in this file during partitioning")
    parser.add_argument('graphbase',
                        help='basename for input and output '
                             'filenames')
    return parser
def get_parser():
    """Construct the argument parser for filter-abund-single."""
    epilog = """
    Trimmed sequences will be placed in
    ${input_sequence_filename}.abundfilt.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    description = ("Trims sequences at a minimum k-mer abundance "
                   "(in memory version).")
    parser = build_counting_args(descr=description,
                                 epilog=textwrap.dedent(epilog))
    add_threading_args(parser)

    parser.add_argument('--cutoff', '-C', type=int,
                        default=DEFAULT_CUTOFF,
                        help="Trim at k-mers below this abundance.")
    # Optional: persist the counting table built during trimming.
    parser.add_argument('--savetable', metavar="filename", default='',
                        help="If present, the name of the file to save the "
                             "k-mer counting table to")
    parser.add_argument('datafile', metavar='input_sequence_filename',
                        help="FAST[AQ] sequence file to trim")
    parser.add_argument('--report-total-kmers', '-t',
                        action='store_true',
                        help="Prints the total number of k-mers to stderr")
    return parser
def get_parser():
    """Build the command-line parser for trim-low-abund.

    Returns an argparse parser with countgraph sizing options, streaming
    trimming options, and expert flags.
    """
    epilog = """
    The output is one file for each input file, <input file>.abundtrim,
    placed in the current directory. This output contains the input
    sequences trimmed at low-abundance k-mers.

    The ``-V/--variable-coverage`` parameter will, if specified, prevent
    elimination of low-abundance reads by only trimming low-abundance
    k-mers from high-abundance reads; use this for non-genomic data sets
    that may have variable coverage.

    Note that the output reads will not necessarily be in the same order
    as the reads in the input files; if this is an important consideration,
    use ``load-into-counting.py`` and ``filter-abund.py``. However, read
    pairs will be kept together, in "broken-paired" format; you can use
    ``extract-paired-reads.py`` to extract read pairs and orphans.

    Example::

        trim-low-abund.py -x 5e7 -k 20 -C 2 data/100k-filtered.fa
    """
    parser = build_counting_args(
        descr='Trim low-abundance k-mers using a streaming algorithm.',
        epilog=textwrap.dedent(epilog))

    parser.add_argument('input_filenames', nargs='+')

    parser.add_argument('--cutoff', '-C', type=int,
                        help='remove k-mers below this abundance',
                        default=DEFAULT_CUTOFF)
    parser.add_argument('--normalize-to', '-Z', type=int,
                        help='base cutoff on this median k-mer abundance',
                        default=DEFAULT_NORMALIZE_LIMIT)
    parser.add_argument('-o', '--output', metavar="output_filename",
                        type=argparse.FileType('wb'),
                        help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')
    parser.add_argument('--variable-coverage', '-V', action='store_true',
                        default=False,
                        help='Only trim low-abundance k-mers from sequences '
                        'that have high coverage.')

    add_loadgraph_args(parser)
    # Bug fix: the adjacent string literals previously joined to
    # "...after allreads are loaded." — a space was missing.
    parser.add_argument('-s', '--savegraph', metavar="filename", default='',
                        help='save the k-mer countgraph to disk after all '
                        'reads are loaded.')

    # expert options
    parser.add_argument('--force', default=False, action='store_true')
    parser.add_argument('--ignore-pairs', default=False,
                        action='store_true')
    parser.add_argument('--tempdir', '-T', type=str, default='./')
    add_output_compression_type(parser)

    return parser
def get_parser():
    """Construct the argument parser for find-knots."""
    epilog = """\
    Load an k-mer nodegraph/tagset pair created by
    :program:`load-graph.py`, and a set of pmap files created by
    :program:`partition-graph.py`. Go through each pmap file, select the
    largest partition in each, and do the same kind of traversal as in
    :program:`make-initial-stoptags.py` from each of the waypoints in that
    partition; this should identify all of the Highly Connected Kmers in
    that partition. These HCKs are output to ``<graphbase>.stoptags`` after
    each pmap file.

    Parameter choice is reasonably important. See the pipeline in
    :doc:`partitioning-big-data` for an example run.

    This script is not very scalable and may blow up memory and die
    horribly. You should be able to use the intermediate stoptags to
    restart the process, and if you eliminate the already-processed pmap
    files, you can continue where you left off.
    """
    parser = build_counting_args(descr="Find all highly connected k-mers.",
                                 epilog=textwrap.dedent(epilog))

    parser.add_argument('graphbase',
                        help='Basename for the input and output '
                             'files.')
    parser.add_argument('-f', '--force', action='store_true',
                        default=False, help='Continue past warnings')
    return parser
def get_parser():
    """Construct the argument parser for make-initial-stoptags."""
    epilog = """
    Loads a k-mer presence table/tagset pair created by load-graph.py, and
    does a small set of traversals from graph waypoints; on these
    traversals, looks for k-mers that are repeatedly traversed in
    high-density regions of the graph, i.e. are highly connected. Outputs
    those k-mers as an initial set of stoptags, which can be fed into
    partition-graph.py, find-knots.py, and filter-stoptags.py.

    The k-mer counting table size options parameters are for a k-mer
    counting table to keep track of repeatedly-traversed k-mers. The subset
    size option specifies the number of waypoints from which to traverse;
    for highly connected data sets, the default (1000) is probably ok.
    """
    parser = build_counting_args(
        descr="Find an initial set of highly connected k-mers.",
        epilog=textwrap.dedent(epilog))

    parser.add_argument('--subset-size', '-s', dest='subset_size',
                        type=float, default=DEFAULT_SUBSET_SIZE,
                        help='Set subset size (default 1e4 is prob ok)')
    parser.add_argument('--stoptags', '-S', metavar='filename', default='',
                        help='Use stoptags in this file during partitioning')
    parser.add_argument('graphbase',
                        help='basename for input and output '
                             'filenames')
    return parser
def get_parser():
    """Construct the argument parser for abundance-dist-single."""
    epilog = '''
    Note that with :option:`-b` this script is constant memory; in exchange,
    k-mer counts will stop at 255. The memory usage of this script with
    :option:`-b` will be about 1.15x the product of the :option:`-x` and
    :option:`-N` numbers.

    To count k-mers in multiple files use :program:`load_into_counting.py`
    and :program:`abundance_dist.py`.
    '''
    description = ("Calculate the abundance distribution of k-mers from a "
                   "single sequence file.")
    parser = build_counting_args(descr=description,
                                 epilog=textwrap.dedent(epilog))
    add_threading_args(parser)

    parser.add_argument('input_sequence_filename',
                        help='The name of the input'
                             ' FAST[AQ] sequence file.')
    parser.add_argument('output_histogram_filename',
                        help='The name of the '
                             'output histogram file. The columns are: (1) '
                             'k-mer abundance, (2) k-mer count, (3) '
                             'cumulative count, (4) fraction of total '
                             'distinct k-mers.')
    parser.add_argument('-z', '--no-zero', dest='output_zero',
                        action='store_false', default=True,
                        help='Do not output 0-count bins')
    parser.add_argument('-b', '--no-bigcount', dest='bigcount',
                        action='store_false', default=True,
                        help='Do not count k-mers past 255')
    parser.add_argument('-s', '--squash', dest='squash_output',
                        action='store_true', default=False,
                        help='Overwrite output file if it exists')
    parser.add_argument('--savetable', metavar="filename", default='',
                        help="Save the k-mer counting table to the "
                             "specified filename.")
    return parser
def get_parser():
    """Construct the argument parser for filter-abund-single."""
    epilog = """
    Trimmed sequences will be placed in
    ${input_sequence_filename}.abundfilt.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    parser = build_counting_args(
        descr="Trims sequences at a minimum k-mer abundance "
              "(in memory version).",
        epilog=textwrap.dedent(epilog))
    add_threading_args(parser)

    parser.add_argument('--cutoff', '-C', type=int,
                        default=DEFAULT_CUTOFF,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--savetable', metavar="filename", default='',
                        help="If present, the name of the file to save the "
                             "k-mer counting table to")
    parser.add_argument('datafile', metavar='input_sequence_filename',
                        help="FAST[AQ] sequence file to trim")
    return parser
def get_parser():
    """Construct the argument parser for collect-reads."""
    epilog = """
    The memory usage of this script with :option:`-b` will be about 1.15x
    the product of the :option:`-x` and :option:`-N` numbers.

    Example::

        collect-reads.py -k 20 -x 5e7 out.ct data/100k-filtered.fa
    """
    parser = build_counting_args(
        "Collect reads until a given avg coverage.",
        epilog=textwrap.dedent(epilog))

    parser.add_argument('output_countgraph_filename',
                        help="The name of the"
                             " file to write the k-mer countgraph to.")
    parser.add_argument('input_sequence_filename', nargs='+',
                        help="The names of one or more FAST[AQ] input "
                             "sequence files.")
    parser.add_argument('--report-total-kmers', '-t',
                        action='store_true',
                        help="Prints the total number of k-mers to stderr")
    # Stop collecting once the running average coverage reaches this value.
    parser.add_argument('-C', '--coverage', type=int, default=50,
                        help='Collect reads until this coverage, then exit.')
    parser.add_argument('-o', '--output', type=argparse.FileType('w'),
                        help='Write collect reads into this file.')
    parser.add_argument('-b', '--no-bigcount', dest='bigcount',
                        action='store_false', default=True,
                        help='Do not count k-mers past 255')
    return parser
def test_check_tablespace_nodegraph(graph_type, exp_buckets):
    """Check that the computed bucket-count string matches expectations."""
    parser = khmer_args.build_counting_args()
    args = parser.parse_args(['-M', '3G'])

    per_table = khmer_args.calculate_graphsize(args, graph_type)
    n_buckets = per_table * args.n_tables
    # NOTE(review): the divisor is 1e9 but the label says "million" —
    # confirm against the exp_buckets fixture whether this is intentional.
    sizestr = '{:.1f} million buckets'.format(float(n_buckets) / 1e9)
    assert sizestr == exp_buckets
def get_parser():
    """Construct the argument parser for filter-abund-single."""
    epilog = """
    Trimmed sequences will be placed in
    ${input_sequence_filename}.abundfilt.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    parser = build_counting_args(
        descr="Trims sequences at a minimum k-mer abundance "
              "(in memory version).",
        epilog=textwrap.dedent(epilog))
    add_threading_args(parser)

    parser.add_argument('--cutoff', '-C', type=int,
                        default=DEFAULT_CUTOFF,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--savetable', metavar="filename", default='',
                        help="If present, the name of the file to save the "
                             "k-mer counting table to")
    parser.add_argument('datafile', metavar='input_sequence_filename',
                        help="FAST[AQ] sequence file to trim")
    parser.add_argument('--report-total-kmers', '-t',
                        action='store_true',
                        help="Prints the total number of k-mers to stderr")
    parser.add_argument('-f', '--force', action='store_true',
                        default=False,
                        help='Overwrite output file if it exists')
    return parser
def get_parser():
    """Construct the argument parser for collect-reads."""
    epilog = """
    The memory usage of this script with :option:`-b` will be about 1.15x
    the product of the :option:`-x` and :option:`-N` numbers.

    Example::

        collect-reads.py -k 20 -x 5e7 out.ct data/100k-filtered.fa
    """
    parser = build_counting_args(
        "Collect reads until a given avg coverage.",
        epilog=textwrap.dedent(epilog))

    parser.add_argument('output_countingtable_filename',
                        help="The name of the"
                             " file to write the k-mer counting table to.")
    parser.add_argument('input_sequence_filename', nargs='+',
                        help="The names of one or more FAST[AQ] input "
                             "sequence files.")
    parser.add_argument('--report-total-kmers', '-t',
                        action='store_true',
                        help="Prints the total number of k-mers to stderr")
    parser.add_argument('-C', '--coverage', type=int, default=50,
                        help="Collect reads until this coverage, then exit.")
    parser.add_argument('-o', '--output', type=argparse.FileType('w'),
                        help="Write collect reads into this file.")
    parser.add_argument('-b', '--no-bigcount', dest='bigcount',
                        action='store_false', default=True,
                        help="Do not count k-mers past 255")
    return parser
def get_parser():
    """Construct the argument parser for abundance-dist-single."""
    epilog = '''\
    Note that with :option:`-b`/:option:`--no-bigcount` this script is
    constant memory; in exchange, k-mer counts will stop at 255. The memory
    usage of this script with :option:`-b` will be about 1.15x the product
    of the :option:`-x` and :option:`-N` numbers.

    To count k-mers in multiple files use :program:`load_into_counting.py`
    and :program:`abundance_dist.py`.

    Example::

        abundance-dist-single.py -x 1e7 -N 2 -k 17 \\
                tests/test-data/test-abund-read-2.fa test-dist
    '''
    description = ("Calculate the abundance distribution of k-mers from a "
                   "single sequence file.")
    parser = build_counting_args(descr=description,
                                 epilog=textwrap.dedent(epilog))
    add_threading_args(parser)

    parser.add_argument('input_sequence_filename',
                        help='The name of the input'
                             ' FAST[AQ] sequence file.')
    parser.add_argument('output_histogram_filename',
                        help='The name of the '
                             'output histogram file. The columns are: (1) '
                             'k-mer abundance, (2) k-mer count, (3) '
                             'cumulative count, (4) fraction of total '
                             'distinct k-mers.')
    parser.add_argument('-z', '--no-zero', dest='output_zero',
                        action='store_false', default=True,
                        help='Do not output zero-count bins')
    parser.add_argument('-b', '--no-bigcount', dest='bigcount',
                        action='store_false', default=True,
                        help='Do not count k-mers past 255')
    parser.add_argument('-s', '--squash', dest='squash_output',
                        action='store_true', default=False,
                        help='Overwrite output file if it exists')
    parser.add_argument('--savegraph', metavar="filename", default='',
                        help="Save the k-mer countgraph to the specified "
                             "filename.")
    parser.add_argument('-f', '--force', action='store_true',
                        default=False,
                        help='Overwrite output file if it exists')
    return parser
def get_parser():
    """Construct the argument parser for load-into-counting."""
    epilog = """
    Note: with :option:`-b` the output will be the exact size of the k-mer
    counting table and this script will use a constant amount of memory. In
    exchange k-mer counts will stop at 255. The memory usage of this script
    with :option:`-b` will be about 1.15x the product of the :option:`-x`
    and :option:`-N` numbers.

    Example::

        load-into-counting.py -k 20 -x 5e7 out.ct data/100k-filtered.fa

    Multiple threads can be used to accelerate the process, if you have
    extra cores to spare.

    Example::

        load-into-counting.py -k 20 -x 5e7 -T 4 out.ct data/100k-filtered.fa
    """
    description = ("Build a k-mer counting table from the given"
                   " sequences.")
    parser = build_counting_args(description,
                                 epilog=textwrap.dedent(epilog))
    add_threading_args(parser)

    parser.add_argument('output_countingtable_filename',
                        help="The name of the"
                             " file to write the k-mer counting table to.")
    parser.add_argument('input_sequence_filename', nargs='+',
                        help="The names of one or more FAST[AQ] input "
                             "sequence files.")
    parser.add_argument('-b', '--no-bigcount', dest='bigcount',
                        action='store_false', default=True,
                        help='Do not count k-mers past 255')
    parser.add_argument('--summary-info', '-s', default=None,
                        metavar="FORMAT", choices=['json', 'tsv'],
                        help="What format should the machine readable run "
                             "summary be in? (json or tsv, disabled by "
                             "default)")
    parser.add_argument('--report-total-kmers', '-t',
                        action='store_true',
                        help="Prints the total number of k-mers to stderr")
    parser.add_argument('-f', '--force', action='store_true',
                        default=False,
                        help='Overwrite output file if it exists')
    return parser
def get_parser():
    """Construct the argument parser for abundance-dist-single."""
    epilog = """
    Note that with :option:`-b` this script is constant memory; in exchange,
    k-mer counts will stop at 255. The memory usage of this script with
    :option:`-b` will be about 1.15x the product of the :option:`-x` and
    :option:`-N` numbers.

    To count k-mers in multiple files use :program:`load_into_counting.py`
    and :program:`abundance_dist.py`.
    """
    parser = build_counting_args(
        descr="Calculate the abundance distribution of k-mers from a "
              "single sequence file.",
        epilog=textwrap.dedent(epilog))
    add_threading_args(parser)

    parser.add_argument('input_sequence_filename',
                        help='The name of the input'
                             ' FAST[AQ] sequence file.')
    parser.add_argument('output_histogram_filename',
                        help='The name of the '
                             'output histogram file. The columns are: (1) '
                             'k-mer abundance, (2) k-mer count, (3) '
                             'cumulative count, (4) fraction of total '
                             'distinct k-mers.')
    parser.add_argument('-z', '--no-zero', dest='output_zero',
                        action='store_false', default=True,
                        help='Do not output 0-count bins')
    parser.add_argument('-b', '--no-bigcount', dest='bigcount',
                        action='store_false', default=True,
                        help='Do not count k-mers past 255')
    parser.add_argument('-s', '--squash', dest='squash_output',
                        action='store_true', default=False,
                        help='Overwrite output file if it exists')
    parser.add_argument('--csv', action='store_true', default=False,
                        help='Use the CSV format for the histogram. '
                             'Includes column headers.')
    parser.add_argument('--savetable', metavar="filename", default='',
                        help='Save the k-mer counting table to the '
                             'specified filename.')
    parser.add_argument('--report-total-kmers', '-t',
                        action='store_true',
                        help='Prints the total number of k-mers to stderr')
    parser.add_argument('-f', '--force', action='store_true',
                        default=False,
                        help='Overwrite output file if it exists')
    return parser
def get_parser():
    """Build the command-line parser for load-into-counting.

    Returns an argparse parser with counting-table sizing options,
    threading options, and this script's input/output flags.
    """
    epilog = """
    Note: with :option:`-b` the output will be the exact size of the k-mer
    counting table and this script will use a constant amount of memory. In
    exchange k-mer counts will stop at 255. The memory usage of this script
    with :option:`-b` will be about 1.15x the product of the :option:`-x`
    and :option:`-N` numbers.

    Example::

        load-into-counting.py -k 20 -x 5e7 out.ct data/100k-filtered.fa

    Multiple threads can be used to accelerate the process, if you have
    extra cores to spare.

    Example::

        load-into-counting.py -k 20 -x 5e7 -T 4 out.ct data/100k-filtered.fa
    """
    parser = build_counting_args(
        "Build a k-mer counting table from the given"
        " sequences.", epilog=textwrap.dedent(epilog))
    add_threading_args(parser)
    parser.add_argument(
        "output_countingtable_filename", help="The name of the"
        " file to write the k-mer counting table to.")
    parser.add_argument(
        "input_sequence_filename", nargs="+",
        help="The names of one or more FAST[AQ] input "
        "sequence files.")
    parser.add_argument(
        "-b", "--no-bigcount", dest="bigcount", default=True,
        action="store_false",
        help="The default behaviour is "
        "to count past 255 using bigcount. This flag turns "
        "bigcount off, limiting counts to 255.")
    # Fix: str("json")/str("tsv") were Python 2 native-string artifacts;
    # plain literals are equivalent on Python 3.
    parser.add_argument(
        "--summary-info", "-s", type=str, default=None, metavar="FORMAT",
        choices=["json", "tsv"],
        help="What format should the machine readable run "
        "summary be in? (json or tsv, disabled by default)")
    parser.add_argument(
        "--report-total-kmers", "-t", action="store_true",
        help="Prints the total number of k-mers to stderr")
    parser.add_argument("-f", "--force", default=False, action="store_true",
                        help="Overwrite output file if it exists")
    return parser
def get_parser():
    """Build the command-line parser for correct-reads.

    Returns an argparse parser with countgraph sizing options, correction
    options, and expert flags.
    """
    epilog = """
    The output is one file for each input file, <input file>.corr, placed
    in the current directory. This output contains the input sequences,
    corrected at low-abundance k-mers.

    Note that the output reads will not necessarily be in the same order
    as the reads in the input files. However, read pairs will be kept
    together, in "broken-paired" format; you can use
    ``extract-paired-reads.py`` to extract read pairs and orphans.

    Example::

        correct-reads.py -x 5e7 -k 20 -C 2 data/100k-filtered.fa
    """
    parser = build_counting_args(
        descr='Correct reads using a semi-streaming algorithm.',
        epilog=textwrap.dedent(epilog))

    parser.add_argument('input_filenames', nargs='+')

    parser.add_argument('--cutoff', '-C', type=int,
                        help='k-mers below this abundance are not trusted',
                        default=DEFAULT_CUTOFF)
    parser.add_argument('--normalize-to', '-Z', type=int,
                        help='base cutoff on this median k-mer abundance',
                        default=DEFAULT_NORMALIZE_LIMIT)
    parser.add_argument('-o', '--out', metavar="filename",
                        type=argparse.FileType('w'),
                        default=None, help='only output a single file with '
                        'the specified filename; use a single dash "-" to '
                        'specify that output should go to STDOUT (the '
                        'terminal)')
    parser.add_argument('--variable-coverage', '-V', action='store_true',
                        default=False,
                        help='Only correct sequences that have high'
                        ' coverage.')

    add_loadgraph_args(parser)
    # Bug fix: the adjacent string literals previously joined to
    # "...after allreads are loaded." — a space was missing.
    parser.add_argument('-s', '--savegraph', metavar="filename", default='',
                        help='save the k-mer countgraph to disk after all '
                        'reads are loaded.')

    # expert options
    parser.add_argument('--force', default=False, action='store_true')
    parser.add_argument('--ignore-pairs', default=False,
                        action='store_true')
    parser.add_argument('--tempdir', '-T', type=str, default='./')
    parser.add_argument("--theta", dest="bits_theta", type=float,
                        default=1.0)

    return parser
def main():
    """Correct reads in a FAST[AQ] file against a precomputed countgraph.

    Loads the counting table named on the command line, aligns each read
    to the graph with ReadAligner, and writes the (possibly corrected)
    reads to the output file.
    """
    parser = khmer_args.build_counting_args(
        "Correct reads against an already-computed table",
        citations=['counting', 'SeqAn'])

    parser.add_argument("--trusted-cov", dest="trusted_cov", type=int,
                        default=DEFAULT_CUTOFF)
    parser.add_argument("--theta", dest="bits_theta", type=float,
                        default=1.0)
    parser.add_argument('-o', '--output', dest='output_file',
                        help="output file for histogram; defaults to "
                             "<first filename>.corr in cwd.",
                        type=khFileType('w'), default=None)
    parser.add_argument('counts_table')
    parser.add_argument('readfile')

    args = parser.parse_args()

    print('loading counts')
    ht = Countgraph.load(args.counts_table)

    aligner = khmer.ReadAligner(ht, args.trusted_cov, args.bits_theta)

    print("trusted:", args.trusted_cov)

    corrfp = args.output_file
    opened_here = False
    if not corrfp:
        outfile = os.path.basename(args.readfile) + '.corr'
        corrfp = open(outfile, 'w')
        opened_here = True

    # Bug fix: previously the output file was never closed, risking
    # unflushed buffered data; close it if we opened it ourselves.
    try:
        n_corrected = 0
        for n, read in enumerate(screed.open(args.readfile)):
            if n % 10000 == 0:
                # periodic progress report: reads seen / reads corrected
                print('...', n, n_corrected, file=sys.stderr)
            seq = read.sequence.replace('N', 'A')

            # build the alignment...
            score, graph_alignment, read_alignment, truncated = \
                aligner.align(seq)

            if not truncated:
                # strip gap characters to recover the graph-side sequence
                graph_seq = graph_alignment.replace("-", "")
                if graph_seq != seq:
                    n_corrected += 1
                    seq = graph_seq

            corrfp.write(output_single(read, seq))
    finally:
        if opened_here:
            corrfp.close()
def get_parser():
    """Construct the argument parser for filter-abund."""
    epilog = """
    Trimmed sequences will be placed in
    ${input_sequence_filename}.abundfilt for each input sequence file. If
    the input sequences are from RNAseq or metagenome sequencing then
    :option:`--variable-coverage` should be used.

    Example::

        load-into-counting.py -k 20 -x 5e7 table.kh data/100k-filtered.fa
        filter-abund.py -C 2 table.kh data/100k-filtered.fa
    """
    parser = build_counting_args(
        descr='Trim sequences at a minimum k-mer abundance.',
        epilog=textwrap.dedent(epilog))

    parser.add_argument('input_table',
                        metavar='input_presence_table_filename',
                        help='The input k-mer presence table filename')
    parser.add_argument('input_filename',
                        metavar='input_sequence_filename', nargs='+',
                        help='Input FAST[AQ] sequence filename')
    add_threading_args(parser)

    parser.add_argument('--cutoff', '-C', dest='cutoff', type=int,
                        default=DEFAULT_CUTOFF,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--variable-coverage', '-V',
                        dest='variable_coverage', action='store_true',
                        default=False,
                        help='Only trim low-abundance k-mers from sequences '
                             'that have high coverage.')
    parser.add_argument('--normalize-to', '-Z', dest='normalize_to',
                        type=int, default=DEFAULT_NORMALIZE_LIMIT,
                        help='Base the variable-coverage cutoff on this '
                             'median k-mer abundance.')
    parser.add_argument('-o', '--out', dest='single_output_filename',
                        default='', metavar="optional_output_filename",
                        help='Output the trimmed sequences into a single '
                             'file with the given filename instead of '
                             'creating a new file for each input file.')
    return parser
def get_parser():
    """Construct the argument parser for count-kmers."""
    parser = build_counting_args(
        descr="Output abundances of the k-mers in the sequence file.")
    add_threading_args(parser)

    parser.add_argument('input_sequence_filename',
                        help='The input'
                             ' FAST[AQ] sequence file.')
    # Default of None means counts go to stdout rather than a file.
    parser.add_argument('-o', '--out', dest='output_file',
                        metavar="output_file",
                        type=argparse.FileType('w'), default=None,
                        help='output counts to this file')
    return parser
def test_check_tablespace_force():
    """force=True should suppress the out-of-disk-space SystemExit."""
    # silence the expected disk-space warning during the check
    save_stderr, sys.stderr = sys.stderr, io.StringIO()

    parser = khmer_args.build_counting_args()
    args = parser.parse_args(['-M', '1e9'])

    try:
        # _testhook_free_space=0 simulates a full disk; with force=True the
        # check should warn rather than exit.
        khmer.kfile.check_space_for_hashtable(args, 'countgraph', True,
                                              _testhook_free_space=0)
        assert True, "this should pass"
    except SystemExit as e:
        print(str(e))
    finally:
        # always restore the real stderr
        sys.stderr = save_stderr
def test_check_tablespace():
    """Without force, a full disk must make the space check SystemExit."""
    outfile = utils.get_test_data('truncated.fq')
    # silence the expected disk-space warning during the check
    save_stderr, sys.stderr = sys.stderr, io.StringIO()

    parser = khmer_args.build_counting_args()
    args = parser.parse_args(['-M', '1e9'])

    try:
        tablesize = khmer_args.calculate_graphsize(args, 'countgraph')
        # _testhook_free_space=0 simulates a full disk; force=False should
        # abort via SystemExit before the assert is reached.
        khmer.kfile.check_space_for_graph(outfile, tablesize, False,
                                          _testhook_free_space=0)
        assert 0, "this should fail"
    except SystemExit as e:
        print(str(e))
    finally:
        # always restore the real stderr
        sys.stderr = save_stderr
def get_parser(): epilog = """\ Trimmed sequences will be placed in ``${input_sequence_filename}.abundfilt``. This script is constant memory. To trim reads based on k-mer abundance across multiple files, use :program:`load-into-counting.py` and :program:`filter-abund.py`. Example:: filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa """ parser = build_counting_args( descr="Trims sequences at a minimum k-mer abundance " "(in memory version).", epilog=textwrap.dedent(epilog), citations=['counting', 'SeqAn']) add_threading_args(parser) parser.add_argument('-C', '--cutoff', default=DEFAULT_CUTOFF, type=check_argument_range(0, 256, "cutoff"), help="Trim at k-mers below this abundance.") parser.add_argument('-V', '--variable-coverage', action='store_true', dest='variable_coverage', default=False, help='Only trim low-abundance k-mers from sequences ' 'that have high coverage.') parser.add_argument('-Z', '--normalize-to', type=int, dest='normalize_to', help='Base the variable-coverage cutoff on this median' ' k-mer abundance.', default=DEFAULT_NORMALIZE_LIMIT) parser.add_argument('--savegraph', metavar="filename", default='', help="If present, the name of the file to save the " "k-mer countgraph to") parser.add_argument('-o', '--outfile', metavar='optional_output_filename', default=None, help='Override default output filename ' 'and output trimmed sequences into a file with the ' 'given filename.') parser.add_argument('datafile', metavar='input_sequence_filename', help="FAST[AQ] sequence file to trim") parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') parser.add_argument('-q', '--quiet', dest='quiet', default=False, action='store_true') add_output_compression_type(parser) return parser
def get_parser(): epilog = """\ Trimmed sequences will be placed in ``${input_sequence_filename}.abundfilt``. This script is constant memory. To trim reads based on k-mer abundance across multiple files, use :program:`load-into-counting.py` and :program:`filter-abund.py`. Example:: filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa """ parser = build_counting_args( descr="Trims sequences at a minimum k-mer abundance " "(in memory version).", epilog=textwrap.dedent(epilog), citations=['counting', 'SeqAn']) add_threading_args(parser) parser.add_argument('--cutoff', '-C', default=DEFAULT_CUTOFF, type=check_argument_range(0, 256, "cutoff"), help="Trim at k-mers below this abundance.") parser.add_argument('--variable-coverage', '-V', action='store_true', dest='variable_coverage', default=False, help='Only trim low-abundance k-mers from sequences ' 'that have high coverage.') parser.add_argument('--normalize-to', '-Z', type=int, dest='normalize_to', help='Base the variable-coverage cutoff on this median' ' k-mer abundance.', default=DEFAULT_NORMALIZE_LIMIT) parser.add_argument('--savegraph', metavar="filename", default='', help="If present, the name of the file to save the " "k-mer countgraph to") parser.add_argument('-o', '--outfile', metavar='optional_output_filename', default=None, help='Override default output filename ' 'and output trimmed sequences into a file with the ' 'given filename.') parser.add_argument('datafile', metavar='input_sequence_filename', help="FAST[AQ] sequence file to trim") parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') parser.add_argument('-q', '--quiet', dest='quiet', default=False, action='store_true') add_output_compression_type(parser) return parser
def get_parser(): epilog = """\ Note: with :option:`-b`/:option:`--no-bigcount` the output will be the exact size of the k-mer countgraph and this script will use a constant amount of memory. In exchange k-mer counts will stop at 255. The memory usage of this script with :option:`-b` will be about 1.15x the product of the :option:`-x` and :option:`-N` numbers. Example:: load-into-counting.py -k 20 -x 5e7 out data/100k-filtered.fa Multiple threads can be used to accelerate the process, if you have extra cores to spare. Example:: load-into-counting.py -k 20 -x 5e7 -T 4 out data/100k-filtered.fa """ parser = build_counting_args("Build a k-mer countgraph from the given" " sequences.", epilog=textwrap.dedent(epilog), citations=['counting', 'SeqAn']) add_threading_args(parser) parser.add_argument('output_countgraph_filename', help="The name of the" " file to write the k-mer countgraph to.") parser.add_argument('input_sequence_filename', nargs='+', help="The names of one or more FAST[AQ] input " "sequence files.") parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True, action='store_false', help="The default behaviour is " "to count past 255 using bigcount. This flag turns " "bigcount off, limiting counts to 255.") parser.add_argument('-s', '--summary-info', type=str, default=None, metavar="FORMAT", choices=[str('json'), str('tsv')], help="What format should the machine readable run " "summary be in? (`json` or `tsv`, disabled by" " default)") parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') parser.add_argument('-q', '--quiet', dest='quiet', default=False, action='store_true') return parser
def main():
    """Align reads against a counting table; emit graph-side alignments.

    Reads FAST[AQ] files, aligns each sequence to the pre-built counting
    hash, prints verbose per-read diagnostics to stderr and the FASTA of the
    graph-side alignment to stdout.

    Fixed: body used Python 2 print statements (`print >> sys.stderr, ...`),
    a SyntaxError under Python 3. Ported to the print() function with
    identical output. The unused local ``K = ht.ksize()`` was dropped.
    """
    parser = build_counting_args()
    parser.add_argument("--trusted-cov", dest="trusted_cov", type=int,
                        default=2)
    parser.add_argument("--theta", type=float, default=1.0)
    parser.add_argument("input_table")
    parser.add_argument("input_filenames", nargs="+")
    add_loadhash_args(parser)

    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print('file with ht: %s' % counting_ht, file=sys.stderr)
    print('loading hashtable', file=sys.stderr)
    ht = khmer.load_counting_hash(counting_ht)

    # counting hash, trusted kmer coverage cutoff, bits theta (threshold
    # value for terminating unproductive alignments)
    aligner = khmer.new_readaligner(ht, args.trusted_cov, args.theta)

    # the alignment/dump loop
    for infile in infiles:
        print('aligning', infile, file=sys.stderr)
        for n, record in enumerate(screed.open(infile)):
            name = record['name']
            seq = record['sequence'].upper()
            print(name, file=sys.stderr)
            print(seq, file=sys.stderr)

            score, graph_alignment, read_alignment, truncated = aligner.align(
                seq)
            print(score, file=sys.stderr)
            print(graph_alignment, file=sys.stderr)
            print(read_alignment, file=sys.stderr)
            print(truncated, file=sys.stderr)
            # FASTA of the graph-side alignment goes to stdout
            print(">{0}\n{1}".format(name, graph_alignment))
def main():
    """Randomly subsample reads based on their median k-mer coverage.

    For each read, draws a random integer in [1, median count]; the read is
    discarded when the draw exceeds the coverage target, so high-coverage
    reads are dropped with increasing probability.
    """
    parser = build_counting_args()
    parser.add_argument('--coverage', '-C', dest='coverage',
                        default=DEFAULT_COVERAGE, type=int)
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print('file with ht: %s' % counting_ht)
    print('loading hashtable')
    ht = khmer.load_countgraph(counting_ht)
    K = ht.ksize()

    print("K:", K)

    # the filtering function.
    def process_fn(record):
        # returning (None, None) tells the processor to drop the read
        name = record['name']
        seq = record['sequence']

        med, avg, dev = ht.get_median_count(seq)

        if random.randint(1, med) > args.coverage:
            return None, None

        return name, seq

    # the filtering loop
    for infile in infiles:
        print('filtering', infile)
        outfile = os.path.basename(infile) + '.medfilt'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile)
def main():
    """Emit reads whose median k-mer abundance reaches --min-abundance.

    Consumes every read into a fresh countgraph and writes the reads whose
    median count has reached the threshold to --out as FASTA.

    Fixed: ``-o`` used ``argparse.FileType('wb')`` while the code writes
    ``str`` records — any real ``-o FILE`` raised TypeError on the first
    write (the ``sys.stdout`` default is a text stream and so worked).
    Opening in text mode ('w') matches what is written.
    """
    parser = build_counting_args()
    parser.add_argument('--min-abundance', default=50, type=int)
    parser.add_argument('input_files', nargs='+')
    # text mode: the writes below are formatted str, not bytes
    parser.add_argument('-o', '--out', type=argparse.FileType('w'),
                        default=sys.stdout)
    args = parser.parse_args()

    countgraph = khmer_args.create_countgraph(args, multiplier=1.1)

    count = 0
    for fn in args.input_files:
        short = os.path.basename(fn)
        for n, record in enumerate(screed.open(fn)):
            if n % 100000 == 0:
                print('Processed {n} reads...'.format(n=n), file=sys.stderr)
            countgraph.consume(record.sequence)
            if countgraph.median_at_least(record.sequence,
                                          args.min_abundance):
                args.out.write('>{fn}:{name}:{c}\n{seq}\n'.format(
                    fn=short, c=count, name=record.name,
                    seq=record.sequence))
                count += 1
def get_parser(): epilog = """\ Trimmed sequences will be placed in ``${input_sequence_filename}.abundfilt``. This script is constant memory. To trim reads based on k-mer abundance across multiple files, use :program:`load-into-counting.py` and :program:`filter-abund.py`. Example:: filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa """ parser = build_counting_args( descr="Trims sequences at a minimum k-mer abundance " "(in memory version).", epilog=textwrap.dedent(epilog)) add_threading_args(parser) parser.add_argument('--cutoff', '-C', default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance.") parser.add_argument('--savegraph', metavar="filename", default='', help="If present, the name of the file to save the " "k-mer countgraph to") parser.add_argument('datafile', metavar='input_sequence_filename', help="FAST[AQ] sequence file to trim") parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') add_output_compression_type(parser) return parser
def get_parser(): epilog = """ Note: with :option:`-b` the output will be the exact size of the k-mer counting table and this script will use a constant amount of memory. In exchange k-mer counts will stop at 255. The memory usage of this script with :option:`-b` will be about 1.15x the product of the :option:`-x` and :option:`-N` numbers. Example:: load-into-counting.py -k 20 -x 5e7 out.kh data/100k-filtered.fa Multiple threads can be used to accelerate the process, if you have extra cores to spare. Example:: load-into-counting.py -k 20 -x 5e7 -T 4 out.kh data/100k-filtered.fa """ parser = build_counting_args("Build a k-mer counting table from the given" " sequences.", epilog=textwrap.dedent(epilog)) add_threading_args(parser) parser.add_argument('output_countingtable_filename', help="The name of the" " file to write the k-mer counting table to.") parser.add_argument('input_sequence_filename', nargs='+', help="The names of one or more FAST[AQ] input " "sequence files.") parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True, action='store_false', help='Do not count k-mers past 255') parser.add_argument('--summary-info', '-s', default=None, metavar="FORMAT", choices=['json', 'tsv'], help="What format should the machine readable run " "summary be in? (json or tsv, disabled by default)") parser.add_argument('--report-total-kmers', '-t', action='store_true', help="Prints the total number of k-mers to stderr") parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') return parser
def main():
    """Randomly subsample reads based on their median k-mer coverage.

    Identical in behavior to the Python 3 variant of this driver elsewhere
    in the file, except it loads the table via ``khmer.load_counting_hash``.

    Fixed: body used Python 2 print statements — a SyntaxError under
    Python 3. Ported to the print() function with identical output.
    """
    parser = build_counting_args()
    parser.add_argument('--coverage', '-C', dest='coverage',
                        default=DEFAULT_COVERAGE, type=int)
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print('file with ht: %s' % counting_ht)
    print('loading hashtable')
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print("K:", K)

    # the filtering function.
    def process_fn(record):
        # returning (None, None) tells the processor to drop the read
        name = record['name']
        seq = record['sequence']

        med, avg, dev = ht.get_median_count(seq)

        if random.randint(1, med) > args.coverage:
            return None, None

        return name, seq

    # the filtering loop
    for infile in infiles:
        print('filtering', infile)
        outfile = os.path.basename(infile) + '.medfilt'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile)
def main():
    """Align reads against a counting table; emit graph-side alignments.

    Duplicate of the earlier aligner-dump driver; kept in behavioral
    lock-step with it.

    Fixed: body used Python 2 print statements (`print >>sys.stderr, ...`),
    a SyntaxError under Python 3. Ported to the print() function with
    identical output. The unused local ``K = ht.ksize()`` was dropped.
    """
    parser = build_counting_args()
    parser.add_argument("--trusted-cov", dest="trusted_cov", type=int,
                        default=2)
    parser.add_argument("--theta", type=float, default=1.0)
    parser.add_argument("input_table")
    parser.add_argument("input_filenames", nargs="+")
    add_loadhash_args(parser)

    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print('file with ht: %s' % counting_ht, file=sys.stderr)
    print('loading hashtable', file=sys.stderr)
    ht = khmer.load_counting_hash(counting_ht)

    # counting hash, trusted kmer coverage cutoff, bits theta (threshold
    # value for terminating unproductive alignments)
    aligner = khmer.new_readaligner(ht, args.trusted_cov, args.theta)

    # the alignment/dump loop
    for infile in infiles:
        print('aligning', infile, file=sys.stderr)
        for n, record in enumerate(screed.open(infile)):
            name = record['name']
            seq = record['sequence'].upper()
            print(name, file=sys.stderr)
            print(seq, file=sys.stderr)

            score, graph_alignment, read_alignment, truncated = aligner.align(
                seq)
            print(score, file=sys.stderr)
            print(graph_alignment, file=sys.stderr)
            print(read_alignment, file=sys.stderr)
            print(truncated, file=sys.stderr)
            # FASTA of the graph-side alignment goes to stdout
            print(">{0}\n{1}".format(name, graph_alignment))
def get_parser(): epilog = """\ Trimmed sequences will be placed in ``${input_sequence_filename}.abundfilt``. This script is constant memory. To trim reads based on k-mer abundance across multiple files, use :program:`load-into-counting.py` and :program:`filter-abund.py`. Example:: filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa """ parser = build_counting_args( descr="Trims sequences at a minimum k-mer abundance " "(in memory version).", epilog=textwrap.dedent(epilog)) add_threading_args(parser) parser.add_argument('--cutoff', '-C', default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance.") parser.add_argument('--savegraph', metavar="filename", default='', help="If present, the name of the file to save the " "k-mer countgraph to") parser.add_argument('-o', '--outfile', metavar='optional_output_filename', default=None, help='Override default output filename ' 'and output trimmed sequences into a file with the ' 'given filename.') parser.add_argument('datafile', metavar='input_sequence_filename', help="FAST[AQ] sequence file to trim") parser.add_argument('-f', '--force', default=False, action='store_true', help='Overwrite output file if it exists') parser.add_argument('-q', '--quiet', dest='quiet', default=False, action='store_true') add_output_compression_type(parser) return parser
def main():
    """Streaming assembly driver that logs a per-read action CSV.

    Reads are bucketed by median k-mer coverage: consumed below 20,
    labeled through high-degree nodes in [20, 30), and assembled into
    labeled paths exactly at 30. ORFs extracted from assembled contigs are
    de-duplicated by hash and printed as FASTA; every action is appended
    to the CSV named by -o.
    """
    p = build_counting_args(descr='Streaming assembly with tracking info')
    p.add_argument('fastq_files', nargs='+')
    p.add_argument('-o', type=argparse.FileType('w'),
                   default='assembly-stats.csv')
    args = p.parse_args()

    cg = create_countgraph(args)
    kept = 0
    hdn = khmer.HashSet(args.ksize)
    lh = khmer._GraphLabels(cg)
    next_label = 1
    next_orf = 1
    output = set()  # hashes of ORFs already emitted

    statswriter = csv.DictWriter(args.o, delimiter=',',
                                 fieldnames=['read_n', 'action', 'cov',
                                             'n_hdn', 'contig_n', 'orf_n',
                                             'new'])

    for filename in args.fastq_files:
        for n, record in enumerate(screed.open(filename)):
            if n and n % 10000 == 0:
                print('...', n, file=sys.stderr)

            if len(record.sequence) < args.ksize:
                continue

            cov, _, _ = cg.get_median_count(record.sequence)
            if cov < 20:
                # low coverage: just accumulate counts ('c' = consume)
                kept += 1
                cg.consume(record.sequence)
                statswriter.writerow({'read_n': n, 'action': 'c', 'cov': cov,
                                      'n_hdn': None, 'contig_n': None,
                                      'orf_n': None, 'new': None})
            elif cov < 30:
                #print('intermediate', next_label, file=sys.stderr)
                # mid coverage: trim, then label across high-degree nodes
                # ('l' = label)
                seq, pos = cg.trim_on_abundance(record.sequence, 3)
                if len(seq) < args.ksize:
                    continue

                cg.consume(seq)
                hdn = cg.find_high_degree_nodes(seq)
                lh.label_across_high_degree_nodes(seq, hdn, next_label)
                next_label += 1

                statswriter.writerow({'read_n': n, 'action': 'l', 'cov': cov,
                                      'n_hdn': len(hdn), 'contig_n': None,
                                      'orf_n': None, 'new': None})
            elif cov == 30:
                # coverage target hit exactly: assemble and emit ORFs
                # ('a' = assemble)
                contigs = lh.assemble_labeled_path(
                    record.sequence[:args.ksize])
                for contig_n, contig in enumerate(contigs):
                    statswriter.writerow({'read_n': n, 'action': 'a',
                                          'cov': cov, 'n_hdn': None,
                                          'contig_n': contig_n,
                                          'orf_n': None, 'new': None})
                    for t in translate(contig):
                        for orf_n, o in enumerate(extract_orfs(t)):
                            # de-dup ORFs by hash before printing FASTA
                            if hash(o) not in output:
                                new = True
                                output.add(hash(o))
                                print('>orf%d\n%s' % (next_orf, o))
                                next_orf += 1
                            else:
                                new = False
                            statswriter.writerow({'read_n': n, 'action': 'a',
                                                  'cov': cov, 'n_hdn': None,
                                                  'contig_n': contig_n,
                                                  'orf_n': orf_n, 'new': new})
def get_parser(): epilog = """\ The output is one file for each input file, ``<input file>.abundtrim``, placed in the current directory. This output contains the input sequences trimmed at low-abundance k-mers. The :option:`-V`/:option:`--variable-coverage` parameter will, if specified, prevent elimination of low-abundance reads by only trimming low-abundance k-mers from high-abundance reads; use this for non-genomic data sets that may have variable coverage. Note that the output reads will not necessarily be in the same order as the reads in the input files; if this is an important consideration, use :program:`load-into-counting.py` and :program:`filter-abund.py`. However, read pairs will be kept together, in "broken-paired" format; you can use :program:`extract-paired-reads.py` to extract read pairs and orphans. Example:: trim-low-abund.py -x 5e7 -k 20 -C 2 data/100k-filtered.fa """ parser = build_counting_args( descr='Trim low-abundance k-mers using a streaming algorithm.', epilog=textwrap.dedent(epilog)) parser.add_argument('input_filenames', nargs='+') parser.add_argument('--cutoff', '-C', type=int, help='remove k-mers below this abundance', default=DEFAULT_CUTOFF) parser.add_argument('--trim-at-coverage', '-Z', '--normalize-to', type=int, help='trim reads when entire read above this coverage', default=DEFAULT_TRIM_AT_COVERAGE) parser.add_argument('-o', '--output', metavar="output_filename", type=argparse.FileType('wb'), help='only output a single file with ' 'the specified filename; use a single dash "-" to ' 'specify that output should go to STDOUT (the ' 'terminal)') parser.add_argument('--variable-coverage', '-V', action='store_true', default=False, help='Only trim low-abundance k-mers from sequences ' 'that have high coverage.') add_loadgraph_args(parser) parser.add_argument('-s', '--savegraph', metavar="filename", default='', help='save the k-mer countgraph to disk after all' 'reads are loaded.') parser.add_argument('-q', '--quiet', dest='quiet', 
default=False, action='store_true') # expert options parser.add_argument('--force', default=False, action='store_true') parser.add_argument('--ignore-pairs', default=False, action='store_true') parser.add_argument('--tempdir', '-T', type=str, default='./', help="Set location of temporary directory for " "second pass") add_output_compression_type(parser) parser.add_argument('--diginorm', default=False, action='store_true', help="Eliminate high-coverage reads altogether " "(digital normalization).") parser.add_argument('--diginorm-coverage', type=int, default=DEFAULT_DIGINORM_COVERAGE, help="Coverage threshold for --diginorm") parser.add_argument('--single-pass', default=False, action='store_true', help="Do not do a second pass across the low coverage " "data") return parser
def main():
    """Align reads to a counting hash; keep the graph-side sequences.

    Each read is aligned against the counting hash; the (gap-merged)
    graph-side sequence is consumed back into the hash and written to
    ``<input>.keepalign``. Truncated alignments keep the original read.

    Fixes applied: Python 2 print statements ported to print() with
    identical output; ``!= None`` replaced with ``is not None``; a dead
    ``if False:`` branch and the unused ``mincount`` local (plus the
    commented-out coverage gate that used it) were removed.
    """
    parser = build_counting_args()
    parser.add_argument("-t", "--trusted-cutoff", dest="trusted_cutoff",
                        type=int, default=3)
    parser.add_argument("--bits-theta", help="Tuning parameter controlling "
                        "trade off of speed vs alignment sensitivity",
                        default=1.0, type=float, dest="bits_theta")
    parser.add_argument('-C', '--cutoff', type=int, dest='cutoff',
                        default=DEFAULT_MINIMUM_COVERAGE)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('--details-out', dest="details_out")
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        print('\nPARAMETERS:', file=sys.stderr)
        print(' - kmer size = %d \t\t(-k)' % args.ksize, file=sys.stderr)
        print(' - n hashes = %d \t\t(-N)' % args.n_hashes, file=sys.stderr)
        print(' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize,
              file=sys.stderr)
        print('', file=sys.stderr)
        print('Estimated memory usage is %.2g bytes '
              '(n_hashes x min_hashsize)' % (
                  args.n_hashes * args.min_hashsize), file=sys.stderr)
        print('-' * 8, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff

    filenames = args.input_filenames

    if args.loadhash:
        print('loading hashtable from', args.loadhash)
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print('making hashtable')
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    aligner = khmer.new_readaligner(ht, args.trusted_cutoff, args.bits_theta)

    if args.details_out is not None:
        details_out = open(args.details_out, "w")
    else:
        details_out = None

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepalign'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print('... kept', total - discarded, 'of', total, ', or',
                      int(100. - discarded / float(total) * 100.), '%')
                print('... in file', input_filename)

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.upper().replace('N', 'A')

            score, graph_alignment, read_alignment, truncated = aligner.align(
                record.sequence)

            keep = False
            if truncated:
                # incomplete alignment: keep the original read as-is
                keep = True
            else:
                # merge the alignment: take the graph base, falling back to
                # the read base wherever the graph side has a gap
                graph_seq = ""
                for i in range(len(graph_alignment)):
                    if graph_alignment[i] == "-":
                        graph_seq += read_alignment[i]
                    else:
                        graph_seq += graph_alignment[i]

                keep = True
                seq = graph_seq

            if details_out is not None:
                details_out.write(
                    "+{7}\t{0:0.2f}\t{3}\t{4}\nread: "
                    "{6}\ngraph_aln: {1}\nread_aln: {2}\nstored_seq:{5}\n"
                    .format(score, graph_alignment, read_alignment, truncated,
                            keep, seq, record.sequence, record.name))

            if keep:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, seq))
            else:
                discarded += 1

        print('DONE with', input_filename, '; kept', total - discarded, 'of',
              total, 'or', int(100. - discarded / float(total) * 100.), '%')
        print('output in', output_name)

        if args.savehash:
            print('Saving hashfile through', input_filename)
            print('...saving to', args.savehash)
            ht.save(args.savehash)

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print('fp rate estimated to be %1.3f' % fp_rate)

    if fp_rate > 0.20:
        print("**", file=sys.stderr)
        print("** ERROR: the counting hash is too small for", file=sys.stderr)
        print("** this data set.  Increase hashsize/num ht.", file=sys.stderr)
        print("**", file=sys.stderr)
        print("** Do not use these results!!", file=sys.stderr)
        sys.exit(-1)
def main():
    """Digital-normalization variant gated on median coverage AND spread.

    Reads (or pairs, with -p) are kept while their median k-mer count is
    below the cutoff and the coverage spread (stddev/avg) is under 100%;
    kept reads are consumed into the hash and written to
    ``<input>.keepmedpct``.

    Fixed: body used Python 2 print statements — a SyntaxError under
    Python 3. Ported to the print() function with identical output.
    """
    parser = build_counting_args()
    parser.add_argument('-C', '--cutoff', type=int, dest='cutoff',
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p', '--paired', action='store_true')
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('-R', '--report-to-file', dest='report_file',
                        type=argparse.FileType('w'))
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE and not args.loadhash:
            print("** WARNING: hashsize is default!  "
                  "You absodefly want to increase this!\n** "
                  "Please read the docs!", file=sys.stderr)

        print('\nPARAMETERS:', file=sys.stderr)
        print(' - kmer size = %d \t\t(-k)' % args.ksize, file=sys.stderr)
        print(' - n hashes = %d \t\t(-N)' % args.n_hashes, file=sys.stderr)
        print(' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize,
              file=sys.stderr)
        print(' - paired = %s \t\t(-p)' % args.paired, file=sys.stderr)
        print('', file=sys.stderr)
        print('Estimated memory usage is %.2g bytes '
              '(n_hashes x min_hashsize)' % (
                  args.n_hashes * args.min_hashsize), file=sys.stderr)
        print('-' * 8, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff
    report_fp = args.report_file
    filenames = args.input_filenames

    # In paired mode we read two records at a time
    batch_size = 1
    if args.paired:
        batch_size = 2

    if args.loadhash:
        print('loading hashtable from', args.loadhash)
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print('making hashtable')
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    total = 0
    discarded = 0

    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepmedpct'
        outfp = open(output_name, 'w')

        n = -1
        for n, batch in enumerate(batchwise(screed.open(input_filename),
                                            batch_size)):
            if n > 0 and n % 100000 == 0:
                print('... kept', total - discarded, 'of', total, ', or',
                      int(100. - discarded / float(total) * 100.), '%')
                print('... in file', input_filename)

                if report_fp:
                    print(total, total - discarded,
                          1. - (discarded / float(total)), file=report_fp)
                    report_fp.flush()

            total += batch_size

            # If in paired mode, check that the reads are properly interleaved
            if args.paired:
                if not validpair(batch[0], batch[1]):
                    print('Error: Improperly interleaved pairs %s %s' % (
                        batch[0].name, batch[1].name), file=sys.stderr)
                    sys.exit(-1)

            # Emit the batch of reads if any read passes the filter
            # and all reads are longer than K
            passed_filter = False
            passed_length = True

            for record in batch:
                if len(record.sequence) < K:
                    passed_length = False
                    continue

                seq = record.sequence.replace('N', 'A')
                med, avg, dev = ht.get_median_count(seq)

                # coverage spread as a percentage of the mean
                pct = 0.
                if avg:
                    pct = dev / avg * 100

                if med < DESIRED_COVERAGE and pct < 100:
                    ht.consume(seq)
                    passed_filter = True

            # Emit records if any passed
            if passed_length and passed_filter:
                for record in batch:
                    if hasattr(record, 'quality'):
                        outfp.write('@%s\n%s\n+\n%s\n' % (record.name,
                                                          record.sequence,
                                                          record.quality))
                    else:
                        outfp.write('>%s\n%s\n' % (record.name,
                                                   record.sequence))
            else:
                discarded += batch_size

        if -1 < n:
            print('DONE with', input_filename, '; kept',
                  total - discarded, 'of', total, 'or',
                  int(100. - discarded / float(total) * 100.), '%')
            print('output in', output_name)
        else:
            print('SKIPPED empty file', input_filename)

    if args.savehash:
        print('Saving hashfile through', input_filename)
        print('...saving to', args.savehash)
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print('fp rate estimated to be %1.3f' % fp_rate)

    if fp_rate > 0.20:
        print("**", file=sys.stderr)
        print("** ERROR: the counting hash is too small for", file=sys.stderr)
        print("** this data set.  Increase hashsize/num ht.", file=sys.stderr)
        print("**", file=sys.stderr)
        print("** Do not use these results!!", file=sys.stderr)
        sys.exit(-1)
def main():
    """Align reads to a counting hash; keep the graph-side sequences.

    Duplicate of the earlier keepalign driver; kept in behavioral
    lock-step with it.

    Fixes applied: Python 2 print statements ported to print() with
    identical output; ``!= None`` replaced with ``is not None``; a dead
    ``if False:`` branch and the unused ``mincount`` local (plus the
    commented-out coverage gate that used it) were removed.
    """
    parser = build_counting_args()
    parser.add_argument("-t", "--trusted-cutoff", dest="trusted_cutoff",
                        type=int, default=3)
    parser.add_argument("--bits-theta", help="Tuning parameter controlling "
                        "trade off of speed vs alignment sensitivity",
                        default=1.0, type=float, dest="bits_theta")
    parser.add_argument('-C', '--cutoff', type=int, dest='cutoff',
                        default=DEFAULT_MINIMUM_COVERAGE)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('--details-out', dest="details_out")
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        print('\nPARAMETERS:', file=sys.stderr)
        print(' - kmer size = %d \t\t(-k)' % args.ksize, file=sys.stderr)
        print(' - n hashes = %d \t\t(-N)' % args.n_hashes, file=sys.stderr)
        print(' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize,
              file=sys.stderr)
        print('', file=sys.stderr)
        print('Estimated memory usage is %.2g bytes '
              '(n_hashes x min_hashsize)' % (
                  args.n_hashes * args.min_hashsize), file=sys.stderr)
        print('-' * 8, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff

    filenames = args.input_filenames

    if args.loadhash:
        print('loading hashtable from', args.loadhash)
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print('making hashtable')
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    aligner = khmer.new_readaligner(ht, args.trusted_cutoff, args.bits_theta)

    if args.details_out is not None:
        details_out = open(args.details_out, "w")
    else:
        details_out = None

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepalign'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print('... kept', total - discarded, 'of', total, ', or',
                      int(100. - discarded / float(total) * 100.), '%')
                print('... in file', input_filename)

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.upper().replace('N', 'A')

            score, graph_alignment, read_alignment, truncated = aligner.align(
                record.sequence)

            keep = False
            if truncated:
                # incomplete alignment: keep the original read as-is
                keep = True
            else:
                # merge the alignment: take the graph base, falling back to
                # the read base wherever the graph side has a gap
                graph_seq = ""
                for i in range(len(graph_alignment)):
                    if graph_alignment[i] == "-":
                        graph_seq += read_alignment[i]
                    else:
                        graph_seq += graph_alignment[i]

                keep = True
                seq = graph_seq

            if details_out is not None:
                details_out.write(
                    "+{7}\t{0:0.2f}\t{3}\t{4}\nread: "
                    "{6}\ngraph_aln: {1}\nread_aln: {2}\nstored_seq:{5}\n"
                    .format(score, graph_alignment, read_alignment, truncated,
                            keep, seq, record.sequence, record.name))

            if keep:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, seq))
            else:
                discarded += 1

        print('DONE with', input_filename, '; kept', total - discarded, 'of',
              total, 'or', int(100. - discarded / float(total) * 100.), '%')
        print('output in', output_name)

        if args.savehash:
            print('Saving hashfile through', input_filename)
            print('...saving to', args.savehash)
            ht.save(args.savehash)

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print('fp rate estimated to be %1.3f' % fp_rate)

    if fp_rate > 0.20:
        print("**", file=sys.stderr)
        print("** ERROR: the counting hash is too small for", file=sys.stderr)
        print("** this data set.  Increase hashsize/num ht.", file=sys.stderr)
        print("**", file=sys.stderr)
        print("** Do not use these results!!", file=sys.stderr)
        sys.exit(-1)
def get_parser(): epilog = (""" Discard sequences based on whether or not their median k-mer abundance lies above a specified cutoff. Kept sequences will be placed in <fileN>.keep. Paired end reads will be considered together if :option:`-p` is set. If either read will be kept, then both will be kept. This should result in keeping (or discarding) each sequencing fragment. This helps with retention of repeats, especially. With :option:`-s`/:option:`--savetable`, the k-mer counting table will be saved to the specified file after all sequences have been processed. With :option:`-d`, the k-mer counting table will be saved every d files for multifile runs; if :option:`-s` is set, the specified name will be used, and if not, the name `backup.ct` will be used. :option:`-l`/:option:`--loadtable` will load the specified k-mer counting table before processing the specified files. Note that these tables are are in the same format as those produced by :program:`load-into-counting.py` and consumed by :program:`abundance-dist.py`. :option:`-f`/:option:`--fault-tolerant` will force the program to continue upon encountering a formatting error in a sequence file; the k-mer counting table up to that point will be dumped, and processing will continue on the next file. 
Example:: normalize-by-median.py -k 17 tests/test-data/test-abund-read-2.fa Example:: """ " normalize-by-median.py -p -k 17 tests/test-data/test-abund-read-paired.fa" # noqa """ Example:: """ " normalize-by-median.py -k 17 -f tests/test-data/test-error-reads.fq tests/test-data/test-fastq-reads.fq" # noqa """ Example:: """ " normalize-by-median.py -k 17 -d 2 -s test.ct tests/test-data/test-abund-read-2.fa tests/test-data/test-fastq-reads") # noqa parser = build_counting_args( descr="Do digital normalization (remove mostly redundant sequences)", epilog=textwrap.dedent(epilog)) parser.add_argument('-C', '--cutoff', type=int, default=DEFAULT_DESIRED_COVERAGE) parser.add_argument('-p', '--paired', action='store_true') parser.add_argument('-s', '--savetable', metavar="filename", default='') parser.add_argument('-R', '--report', metavar='filename', type=argparse.FileType('w')) parser.add_argument('-f', '--fault-tolerant', dest='force', help='continue on next file if read errors are \ encountered', action='store_true') parser.add_argument('--save-on-failure', dest='fail_save', action='store_false', default=True, help='Save k-mer counting table when an error occurs') parser.add_argument('-d', '--dump-frequency', dest='dump_frequency', type=int, help='dump k-mer counting table every d ' 'files', default=-1) parser.add_argument('-o', '--out', metavar="filename", dest='single_output_filename', default='', help='only output a single' ' file with the specified filename') parser.add_argument('input_filenames', metavar='input_sequence_filename', help='Input FAST[AQ] sequence filename.', nargs='+') parser.add_argument('--report-total-kmers', '-t', action='store_true', help="Prints the total number of k-mers" " post-normalization to stderr") parser.add_argument('--force', default=False, action='store_true', help='Overwrite output file if it exists') add_loadhash_args(parser) return parser
def get_parser():
    """Build the command-line parser for saturate-by-median.

    Same option surface as the normalize-by-median parser minus the dump
    and bigcount-related flags, plus --report-frequency.
    """
    notes = """
    Discard sequences based on whether or not their median k-mer abundance
    lies above a specified cutoff. Kept sequences will be placed in
    <fileN>.keep.

    Paired end reads will be considered together if :option:`-p` is set. If
    either read will be kept, then both will be kept. This should result in
    keeping (or discarding) each sequencing fragment. This helps with
    retention of repeats, especially.

    With :option:`-s`/:option:`--savetable`, the k-mer counting table will be
    saved to the specified file after all sequences have been processed. With
    :option:`-d`, the k-mer counting table will be saved every d files for
    multifile runs; if :option:`-s` is set, the specified name will be used,
    and if not, the name `backup.ct` will be used.
    :option:`-l`/:option:`--loadtable` will load the specified k-mer counting
    table before processing the specified files.  Note that these tables are
    are in the same format as those produced by
    :program:`load-into-counting.py` and consumed by
    :program:`abundance-dist.py`.

    :option:`-f`/:option:`--fault-tolerant` will force the program to
    continue upon encountering a formatting error in a sequence file; the
    k-mer counting table up to that point will be dumped, and processing will
    continue on the next file.

    Example::

        saturate-by-median.py -k 17 tests/test-data/test-abund-read-2.fa

    Example::

        saturate-by-median.py -p -k 17 tests/test-data/test-abund-read-paired.fa

    Example::

        saturate-by-median.py -k 17 -f tests/test-data/test-error-reads.fq tests/test-data/test-fastq-reads.fq

    Example::

        saturate-by-median.py -k 17 -d 2 -s test.ct tests/test-data/test-abund-read-2.fa tests/test-data/test-fastq-reads
    """
    # NOTE(review): descr still says "digital normalization" although this
    # parser belongs to saturate-by-median -- confirm the wording is intended.
    parser = build_counting_args(
        descr="Do digital normalization (remove mostly redundant sequences)",
        epilog=textwrap.dedent(notes))
    parser.add_argument('-C', '--cutoff', type=int,
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p', '--paired', action='store_true')
    parser.add_argument('-s', '--savetable', metavar="filename", default='')
    parser.add_argument('-R', '--report', metavar='filename',
                        type=argparse.FileType('w'))
    parser.add_argument('--report-frequency', metavar='report_frequency',
                        default=100000, type=int)
    parser.add_argument('-f', '--fault-tolerant', dest='force',
                        action='store_true',
                        help='continue on next file if read errors are '
                             'encountered')
    parser.add_argument('-o', '--out', metavar="filename",
                        dest='single_output_filename', default='',
                        help='only output a single file with '
                             'the specified filename')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        nargs='+',
                        help='Input FAST[AQ] sequence filename.')
    add_loadhash_args(parser)
    return parser
def main(): p = build_counting_args(descr='Streaming assembly with tracking info') p.add_argument('fastq_files', nargs='+') p.add_argument('--prefix', default='transcriptome') args = p.parse_args() cg = create_countgraph(args) asm = khmer.JunctionCountAssembler(cg) tr_fn = '{0}.transcripts.fa'.format(args.prefix) orf_fn = '{0}.orfs.fa'.format(args.prefix) stats_fn = '{0}.stats.fa'.format(args.prefix) with open(tr_fn, 'w') as tr_fp,\ open(orf_fn, 'w') as orf_fp,\ open(stats_fn, 'w') as stats_fp: kept = 0 next_contig = 1 next_orf = 1 output = set() statswriter = csv.DictWriter( stats_fp, delimiter=',', fieldnames=['read_n', 'action', 'cov', 'n_junctions', 'contig_n']) for filename in args.fastq_files: for n, record in enumerate(screed.open(filename)): if n and n % 10000 == 0: print('...', n, file=sys.stderr) if len(record.sequence) < args.ksize: continue cov, _, _ = cg.get_median_count(record.sequence) if cov < 20: kept += 1 cg.consume(record.sequence) statswriter.writerow({ 'read_n': n, 'action': 'c', 'cov': cov, 'n_junctions': None, 'contig_n': None }) elif cov < 30: seq, pos = cg.trim_on_abundance(record.sequence, 3) if len(seq) < args.ksize: continue n_junctions = asm.consume(seq) statswriter.writerow({ 'read_n': n, 'action': 't', 'cov': cov, 'n_junctions': n_junctions, 'contig_n': None }) elif cov == 30: contigs = asm.assemble(record.sequence[:args.ksize]) for contig_n, contig in enumerate(contigs): statswriter.writerow({ 'read_n': n, 'action': 'a', 'cov': cov, 'n_junctions': None, 'contig_n': (next_contig, contig_n) }) tr_fp.write('>contig%d\n%s\n' % (next_contig, contig)) next_contig += 1 for t in translate(contig): for orf_n, o in enumerate(extract_orfs(t)): if hash(o) not in output: new = True output.add(hash(o)) orf_fp.write('>orf%d\n%s\n' % (next_orf, o)) next_orf += 1 else: new = False else: statswriter.writerow({ 'read_n': n, 'action': 's', 'cov': cov, 'n_junctions': None, 'contig_n': None })
def get_parser():
    """Build the command-line parser for paired-aware digital normalization.

    Fixes vs. previous version: the -s/--savetable help string concatenated
    "...after all" + "reads..." without a space; the epilog documented a -d
    option this parser does not define (and one example used "-d 2"); the
    epilog typo "are are".
    """
    epilog = """
    Discard sequences based on whether or not their median k-mer abundance
    lies above a specified cutoff. Kept sequences will be placed in
    <fileN>.keep.

    By default, paired end reads will be considered together; if either read
    should be kept, both will be kept. (This keeps both reads from a
    fragment, and helps with retention of repeats.) Unpaired reads are
    treated individually.

    If :option:`-p`/`--paired` is set, then proper pairing is required and
    the script will exit on unpaired reads, although
    :option:`--unpaired-reads` can be used to supply a file of orphan reads
    to be read after the paired reads. :option:`--force-single` will ignore
    all pairing information and treat reads individually.

    With :option:`-s`/:option:`--savetable`, the k-mer counting table will be
    saved to the specified file after all sequences have been processed.
    :option:`-l`/:option:`--loadtable` will load the specified k-mer counting
    table before processing the specified files.  Note that these tables are
    in the same format as those produced by :program:`load-into-counting.py`
    and consumed by :program:`abundance-dist.py`.

    To append reads to an output file (rather than overwriting it), send
    output to STDOUT with `--out -` and use UNIX file redirection syntax
    (`>>`) to append to the file.

    Example::

        normalize-by-median.py -k 17 tests/test-data/test-abund-read-2.fa

    Example::

        normalize-by-median.py -p -k 17 tests/test-data/test-abund-read-paired.fa

    Example::

        normalize-by-median.py -p -k 17 -o - tests/test-data/paired.fq >> appended-output.fq

    Example::

        normalize-by-median.py -k 17 -f tests/test-data/test-error-reads.fq tests/test-data/test-fastq-reads.fq

    Example::

        normalize-by-median.py -k 17 -s test.ct tests/test-data/test-abund-read-2.fa tests/test-data/test-fastq-reads
    """
    parser = build_counting_args(
        descr="Do digital normalization (remove mostly redundant sequences)",
        epilog=textwrap.dedent(epilog))
    parser.add_argument('-C', '--cutoff', type=int,
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p', '--paired', action='store_true',
                        help='require that all sequences be properly paired')
    parser.add_argument('--force-single', dest='force_single',
                        action='store_true',
                        help='treat all sequences as single-ended/unpaired')
    parser.add_argument('-u', '--unpaired-reads',
                        metavar="unpaired_reads_filename",
                        help='include a file of unpaired reads to which '
                             '-p/--paired does not apply.')
    parser.add_argument('-s', '--savetable', metavar="filename", default='',
                        # fix: original lacked the separating space before
                        # "reads"
                        help='save the k-mer counting table to disk after '
                             'all reads are loaded.')
    parser.add_argument('-R', '--report', metavar='filename',
                        type=argparse.FileType('w'))
    parser.add_argument('-f', '--force', dest='force', action='store_true',
                        help='continue on next file if read errors are '
                             'encountered')
    parser.add_argument('-o', '--out', metavar="filename",
                        dest='single_output_file',
                        type=argparse.FileType('w'), default=None,
                        help='only output a single file with '
                             'the specified filename; use a single dash "-" '
                             'to specify that output should go to STDOUT '
                             '(the terminal)')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        nargs='+',
                        help='Input FAST[AQ] sequence filename.')
    add_loadhash_args(parser)
    return parser
def get_parser():
    """Build the command-line parser for diginorm (countgraph variant).

    This variant additionally accepts a second graph via -z/--loadgraph2
    and the shared compressed-output options.
    """
    notes = """\
    Discard sequences based on whether or not their median k-mer abundance
    lies above a specified cutoff. Kept sequences will be placed in
    <fileN>.keep.

    By default, paired end reads will be considered together; if either read
    should be kept, both will be kept. (This keeps both reads from a
    fragment, and helps with retention of repeats.) Unpaired reads are
    treated individually.

    If :option:`-p`/:option:`--paired` is set, then proper pairing is
    required and the script will exit on unpaired reads, although
    :option:`--unpaired-reads` can be used to supply a file of orphan reads
    to be read after the paired reads. :option:`--force_single` will ignore
    all pairing information and treat reads individually.

    With :option:`-s`/:option:`--savegraph`, the k-mer countgraph will be
    saved to the specified file after all sequences have been processed.
    :option:`-l`/:option:`--loadgraph` will load the specified k-mer
    countgraph before processing the specified files.  Note that these
    graphs are are in the same format as those produced by
    :program:`load-into-counting.py` and consumed by
    :program:`abundance-dist.py`.

    To append reads to an output file (rather than overwriting it), send
    output to STDOUT with `--output -` and use UNIX file redirection syntax
    (`>>`) to append to the file.

    Example::

        normalize-by-median.py -k 17 tests/test-data/test-abund-read-2.fa

    Example::

        normalize-by-median.py -p -k 17 \\
        tests/test-data/test-abund-read-paired.fa

    Example::

        normalize-by-median.py -p -k 17 -o - tests/test-data/paired.fq \\
        >> appended-output.fq

    Example::

        normalize-by-median.py -k 17 -f tests/test-data/test-error-reads.fq \\
        tests/test-data/test-fastq-reads.fq

    Example::

        normalize-by-median.py -k 17 -s test.ct \\
        tests/test-data/test-abund-read-2.fa \\
        tests/test-data/test-fastq-reads.fq"""
    parser = build_counting_args(
        descr="Do digital normalization (remove mostly redundant sequences)",
        epilog=textwrap.dedent(notes), citations=['diginorm'])
    # NOTE(review): no -C/--cutoff here, unlike the sibling parser below --
    # confirm this is intended.
    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')
    parser.add_argument('-p', '--paired', action='store_true',
                        help='require that all sequences be properly paired')
    parser.add_argument('--force_single', dest='force_single',
                        action='store_true',
                        help='treat all sequences as single-ended/unpaired')
    parser.add_argument('-u', '--unpaired-reads',
                        metavar="unpaired_reads_filename",
                        help='include a file of unpaired reads to which '
                             '-p/--paired does not apply.')
    parser.add_argument('-s', '--savegraph', metavar="filename", default=None,
                        help='save the k-mer countgraph to disk after all '
                             'reads are loaded.')
    parser.add_argument('-R', '--report',
                        help='write progress report to report_filename',
                        metavar='report_filename',
                        type=argparse.FileType('w'))
    parser.add_argument('--report-frequency', metavar='report_frequency',
                        type=int, default=100000,
                        help='report progress every report_frequency reads')
    parser.add_argument('-f', '--force', dest='force', action='store_true',
                        help='continue past file reading errors')
    parser.add_argument('-o', '--output', metavar="filename",
                        type=khFileType('wb'), default=None,
                        dest='single_output_file',
                        help='only output a single file with '
                             'the specified filename; use a single dash "-" '
                             'to specify that output should go to STDOUT '
                             '(the terminal)')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        nargs='+',
                        help='Input FAST[AQ] sequence filename.')
    add_loadgraph_args(parser)
    parser.add_argument('-z', '--loadgraph2', metavar="filename",
                        default=None, help='load a second k-mer graph')
    add_output_compression_type(parser)
    return parser
def get_parser():
    """Build the command-line parser for diginorm (countgraph + -C variant).

    Unlike the loadgraph2 sibling, this one exposes -C/--cutoff with a
    0-256 range check and no second-graph option.
    """
    notes = """\
    Discard sequences based on whether or not their median k-mer abundance
    lies above a specified cutoff. Kept sequences will be placed in
    <fileN>.keep.

    By default, paired end reads will be considered together; if either read
    should be kept, both will be kept. (This keeps both reads from a
    fragment, and helps with retention of repeats.) Unpaired reads are
    treated individually.

    If :option:`-p`/:option:`--paired` is set, then proper pairing is
    required and the script will exit on unpaired reads, although
    :option:`--unpaired-reads` can be used to supply a file of orphan reads
    to be read after the paired reads. :option:`--force_single` will ignore
    all pairing information and treat reads individually.

    With :option:`-s`/:option:`--savegraph`, the k-mer countgraph will be
    saved to the specified file after all sequences have been processed.
    :option:`-l`/:option:`--loadgraph` will load the specified k-mer
    countgraph before processing the specified files.  Note that these
    graphs are are in the same format as those produced by
    :program:`load-into-counting.py` and consumed by
    :program:`abundance-dist.py`.

    To append reads to an output file (rather than overwriting it), send
    output to STDOUT with `--output -` and use UNIX file redirection syntax
    (`>>`) to append to the file.

    Example::

        normalize-by-median.py -k 17 tests/test-data/test-abund-read-2.fa

    Example::

        normalize-by-median.py -p -k 17 \\
        tests/test-data/test-abund-read-paired.fa

    Example::

        normalize-by-median.py -p -k 17 -o - tests/test-data/paired.fq \\
        >> appended-output.fq

    Example::

        normalize-by-median.py -k 17 -f tests/test-data/test-error-reads.fq \\
        tests/test-data/test-fastq-reads.fq

    Example::

        normalize-by-median.py -k 17 -s test.ct \\
        tests/test-data/test-abund-read-2.fa \\
        tests/test-data/test-fastq-reads.fq"""
    parser = build_counting_args(
        descr="Do digital normalization (remove mostly redundant sequences)",
        epilog=textwrap.dedent(notes), citations=['diginorm'])
    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')
    parser.add_argument('-C', '--cutoff',
                        type=check_argument_range(0, 256, "cutoff"),
                        default=DEFAULT_DESIRED_COVERAGE,
                        help="when the median k-mer coverage level is above "
                             "this number the read is not kept.")
    parser.add_argument('-p', '--paired', action='store_true',
                        help='require that all sequences be properly paired')
    parser.add_argument('--force_single', dest='force_single',
                        action='store_true',
                        help='treat all sequences as single-ended/unpaired')
    parser.add_argument('-u', '--unpaired-reads',
                        metavar="unpaired_reads_filename",
                        help='include a file of unpaired reads to which '
                             '-p/--paired does not apply.')
    parser.add_argument('-s', '--savegraph', metavar="filename", default=None,
                        help='save the k-mer countgraph to disk after all '
                             'reads are loaded.')
    parser.add_argument('-R', '--report',
                        help='write progress report to report_filename',
                        metavar='report_filename',
                        type=argparse.FileType('w'))
    parser.add_argument('--report-frequency', metavar='report_frequency',
                        type=int, default=100000,
                        help='report progress every report_frequency reads')
    parser.add_argument('-f', '--force', dest='force', action='store_true',
                        help='continue past file reading errors')
    parser.add_argument('-o', '--output', metavar="filename",
                        type=khFileType('wb'), default=None,
                        dest='single_output_file',
                        help='only output a single file with '
                             'the specified filename; use a single dash "-" '
                             'to specify that output should go to STDOUT '
                             '(the terminal)')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        nargs='+',
                        help='Input FAST[AQ] sequence filename.')
    add_loadgraph_args(parser)
    add_output_compression_type(parser)
    return parser
def main():
    """Variable-coverage read filter based on graph alignment.

    For each input file, aligns every read against the countgraph; a read is
    kept if its alignment was truncated or if the minimum count along the
    corrected (graph-side) sequence is below --normalize-to.  Kept reads are
    consumed into the table and written to <basename>.keepvar.

    Fixes vs. previous version: --bits-theta help string concatenated
    "controlling"+"trade" without a space; the per-file output handle and
    the optional --details-out handle were never closed; removed a block of
    commented-out dead code.
    """
    parser = build_counting_args()
    parser.add_argument("-t", "--trusted-cutoff", dest="trusted_cutoff",
                        type=int, default=3)
    parser.add_argument("--bits-theta",
                        # fix: original lacked the space before "trade"
                        help="Tuning parameter controlling "
                             "trade off of speed vs alignment sensitivity",
                        default=1.0, type=float, dest="bits_theta")
    parser.add_argument('--normalize-to', '-Z', type=int,
                        dest='normalize_to',
                        help='base cutoff on abundance',
                        default=DEFAULT_NORMALIZE_LIMIT)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('--details-out', dest="details_out")
    parser.add_argument('input_filenames', nargs='+')
    args = parser.parse_args()

    if not args.quiet:
        print('\nPARAMETERS:', file=sys.stderr)
        print(' - kmer size = %d \t\t(-k)' % args.ksize, file=sys.stderr)
        print(' - n hashes = %d \t\t(-N)' % args.n_tables, file=sys.stderr)
        print(' - min hashsize = %-5.2g \t(-x)' % args.max_tablesize,
              file=sys.stderr)
        print('', file=sys.stderr)
        print('Estimated memory usage is %.2g bytes '
              '(n_hashes x min_hashsize)' %
              (args.n_tables * args.max_tablesize), file=sys.stderr)
        print('-' * 8, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables

    filenames = args.input_filenames

    if args.loadhash:
        print('loading hashtable from', args.loadhash)
        ht = khmer.load_countgraph(args.loadhash)
    else:
        print('making hashtable')
        ht = khmer.Countgraph(K, HT_SIZE, N_HT)

    aligner = khmer.ReadAligner(ht, args.trusted_cutoff, args.bits_theta)

    if args.details_out is not None:
        details_out = open(args.details_out, "w")
    else:
        details_out = None

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepvar'
        # fix: context manager guarantees the output file is closed/flushed
        with open(output_name, 'w') as outfp:
            for n, record in enumerate(screed.open(input_filename)):
                if n > 0 and n % 10000 == 0:
                    print('... kept', total - discarded, 'of', total, ', or',
                          int(100. - discarded / float(total) * 100.), '%')
                    print('... in file', input_filename)

                total += 1

                if len(record.sequence) < K:
                    continue

                # N bases confuse the aligner's consumed copy; map them to A
                seq = record.sequence.upper().replace('N', 'A')

                # build the alignment...
                score, graph_alignment, read_alignment, truncated = \
                    aligner.align(record.sequence)

                # next, decide whether or to keep it.
                keep = False
                if truncated:
                    keep = True  # keep all truncated alignments - why?
                else:
                    # build a better sequence -- this is the corrected one.
                    graph_seq = graph_alignment.replace("-", "")

                    # get the minimum count for this new sequence
                    mincount = ht.get_min_count(graph_seq)
                    if mincount < args.normalize_to:
                        keep = True

                if details_out is not None:
                    details_out.write(
                        "+{7}\t{0:0.2f}\t{3}\t{4}\nread: "
                        "{6}\ngraph_aln: {1}\nread_aln: {2}\nstored_seq:{5}\n"
                        "".format(score, graph_alignment, read_alignment,
                                  truncated, keep, seq, record.sequence,
                                  record.name))

                if keep:
                    ht.consume(seq)
                    outfp.write('>%s\n%s\n' % (record.name, record.sequence))
                else:
                    discarded += 1

        if total:
            print('DONE with', input_filename, '; kept', total - discarded,
                  'of', total, 'or',
                  int(100. - discarded / float(total) * 100.), '%')
            print('output in', output_name)

        if args.savehash:
            print('Saving hashfile through', input_filename)
            print('...saving to', args.savehash)
            ht.save(args.savehash)

    if details_out is not None:
        details_out.close()  # fix: handle was previously never closed

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht, args.force,
                                             max_false_pos=.2)
    print('fp rate estimated to be %1.3f' % fp_rate)
def get_parser():
    """Build the command-line parser for trim-low-abund.

    Fix vs. previous version: the -s/--savetable help string concatenated
    "...after all" + "reads..." without a separating space.
    """
    epilog = """
    The output is one file for each input file, <input file>.abundtrim,
    placed in the current directory.  This output contains the input
    sequences trimmed at low-abundance k-mers.

    The ``-V/--variable-coverage`` parameter will, if specified, prevent
    elimination of low-abundance reads by only trimming low-abundance k-mers
    from high-abundance reads; use this for non-genomic data sets that may
    have variable coverage.

    Note that the output reads will not necessarily be in the same order
    as the reads in the input files; if this is an important consideration,
    use ``load-into-counting.py`` and ``filter-abund.py``.  However, read
    pairs will be kept together, in "broken-paired" format; you can use
    ``extract-paired-reads.py`` to extract read pairs and orphans.

    Example::

        trim-low-abund.py -x 5e7 -k 20 -C 2 data/100k-filtered.fa
    """
    parser = build_counting_args(
        descr='Trim low-abundance k-mers using a streaming algorithm.',
        epilog=textwrap.dedent(epilog))
    parser.add_argument('input_filenames', nargs='+')
    parser.add_argument('--cutoff', '-C', type=int,
                        help='remove k-mers below this abundance',
                        default=DEFAULT_CUTOFF)
    parser.add_argument('--normalize-to', '-Z', type=int,
                        help='base cutoff on this median k-mer abundance',
                        default=DEFAULT_NORMALIZE_LIMIT)
    parser.add_argument('-o', '--out', metavar="filename",
                        type=argparse.FileType('w'), default=None,
                        help='only output a single file with '
                             'the specified filename; use a single dash "-" '
                             'to specify that output should go to STDOUT '
                             '(the terminal)')
    parser.add_argument('--variable-coverage', '-V', action='store_true',
                        default=False,
                        help='Only trim low-abundance k-mers from sequences '
                             'that have high coverage.')
    add_loadhash_args(parser)
    parser.add_argument('-s', '--savetable', metavar="filename", default='',
                        # fix: original lacked the separating space before
                        # "reads"
                        help='save the k-mer counting table to disk after '
                             'all reads are loaded.')

    # expert options
    parser.add_argument('--force', default=False, action='store_true')
    parser.add_argument('--ignore-pairs', default=False, action='store_true')
    parser.add_argument('--tempdir', '-T', type=str, default='./')

    return parser