Example #1
def get_parser():
    parser = build_hashbits_args(descr="Load sequences into the compressible "
                                 "graph format plus optional tagset.")
    add_threading_args(parser)
    parser.add_argument('--no-build-tagset',
                        '-n',
                        default=False,
                        action='store_true',
                        dest='no_build_tagset',
                        help='Do NOT construct tagset while loading sequences')
    parser.add_argument('output_filename',
                        metavar='output_presence_table_filename',
                        help='output'
                        ' k-mer presence table filename.')
    parser.add_argument('input_filenames',
                        metavar='input_sequence_filename',
                        nargs='+',
                        help='input FAST[AQ] sequence filename')
    parser.add_argument('--report-total-kmers',
                        '-t',
                        action='store_true',
                        help="Prints the total number of k-mers to stderr")
    parser.add_argument('--write-fp-rate',
                        '-w',
                        action='store_true',
                        help="Write false positive rate into .info file")
    parser.add_argument('-f',
                        '--force',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    return parser
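
All of these examples lean on khmer's argparse helpers (build_hashbits_args,
build_counting_args, build_nodegraph_args, add_threading_args). To experiment
with the parsers outside khmer, a minimal hypothetical stand-in might look like
this; the real helpers also wire up graph-sizing options:

import argparse

def build_hashbits_args(descr=None, epilog=None):
    # Hypothetical stand-in for khmer's helper; the real one also adds
    # table-sizing arguments (-N/--n_tables, -x/--max-tablesize).
    parser = argparse.ArgumentParser(
        description=descr, epilog=epilog,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-k', '--ksize', type=int, default=32,
                        help='k-mer size to use')
    return parser

def add_threading_args(parser):
    # Hypothetical stand-in: exposes a -T/--threads option as khmer does.
    parser.add_argument('-T', '--threads', type=int, default=1,
                        help='Number of simultaneous threads to execute')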
Example #2
def get_parser():
    epilog = """
    The resulting partition maps are saved as '${basename}.subset.#.pmap'
    files.
    """
    parser = argparse.ArgumentParser(
        description="Partition a sequence graph based upon waypoint "
        "connectivity", epilog=epilog,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('basename', help="basename of the input k-mer presence"
                        " table + tagset files")
    parser.add_argument('--stoptags', '-S', metavar='filename', default='',
                        help="Use stoptags in this file during partitioning")
    parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE,
                        type=float, help='Set subset size (usually 1e5-1e6 is '
                        'good)')
    parser.add_argument('--no-big-traverse', action='store_true',
                        default=False, help='Truncate graph joins at big '
                        'traversals')
    parser.add_argument('--version', action='version', version='%(prog)s ' +
                        khmer.__version__)
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    add_threading_args(parser)
    return parser
Example #3
def get_parser():
    epilog = """
    Trimmed sequences will be placed in ${input_sequence_filename}.abundfilt.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    parser = build_counting_args(
        descr="Trims sequences at a minimum k-mer abundance "
        "(in memory version).", epilog=textwrap.dedent(epilog))
    add_threading_args(parser)

    parser.add_argument('--cutoff', '-C', default=DEFAULT_CUTOFF, type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--savetable', metavar="filename", default='',
                        help="If present, the name of the file to save the "
                        "k-mer counting table to")
    parser.add_argument('datafile', metavar='input_sequence_filename',
                        help="FAST[AQ] sequence file to trim")
    parser.add_argument('--report-total-kmers', '-t', action='store_true',
                        help="Prints the total number of k-mers to stderr")
    return parser
Example #4
def get_parser():
    epilog = """
    Note: with :option:`-b` the output will be the exact size of the
    k-mer counting table and this script will use a constant amount of memory.
    In exchange k-mer counts will stop at 255. The memory usage of this script
    with :option:`-b` will be about 1.15x the product of the :option:`-x` and
    :option:`-N` numbers.

    Example::

        load-into-counting.py -k 20 -x 5e7 out.kh data/100k-filtered.fa

    Multiple threads can be used to accelerate the process, if you have extra
    cores to spare.

    Example::

        load-into-counting.py -k 20 -x 5e7 -T 4 out.kh data/100k-filtered.fa
    """

    parser = build_counting_args("Build a k-mer counting table from the given"
                                 " sequences.", epilog=textwrap.dedent(epilog))
    add_threading_args(parser)
    parser.add_argument('output_countingtable_filename', help="The name of the"
                        " file to write the k-mer counting table to.")
    parser.add_argument('input_sequence_filename', nargs='+',
                        help="The names of one or more FAST[AQ] input "
                        "sequence files.")
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('--report-total-kmers', '-t', action='store_true',
                        help="Prints the total number of k-mers to stderr")
    return parser
Example #5
def get_parser():
    epilog = """\
    The resulting partition maps are saved as ``${basename}.subset.#.pmap``
    files.
    """
    parser = argparse.ArgumentParser(
        description="Partition a sequence graph based upon waypoint "
        "connectivity", epilog=textwrap.dedent(epilog),
        formatter_class=ComboFormatter)

    parser.add_argument('basename', help="basename of the input k-mer "
                        "nodegraph + tagset files")
    parser.add_argument('--stoptags', '-S', metavar='filename', default='',
                        help="Use stoptags in this file during partitioning")
    parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE,
                        type=float, help='Set subset size (usually 1e5-1e6 is '
                        'good)')
    parser.add_argument('--no-big-traverse', action='store_true',
                        default=False, help='Truncate graph joins at big '
                        'traversals')
    parser.add_argument('--version', action=_VersionStdErrAction,
                        version='khmer {v}'.format(v=__version__))
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    add_threading_args(parser)
    return parser
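
Example #5 replaces the stock action='version' with a custom
_VersionStdErrAction. khmer's definition is not shown in these snippets; a
plausible sketch of such an action, which reports the version on stderr
instead of stdout before exiting, is:

import argparse
import sys

class _VersionStdErrAction(argparse.Action):
    # Plausible sketch only: behaves like action='version' but writes the
    # version string to stderr.
    def __init__(self, option_strings, dest, version=None, **kwargs):
        super(_VersionStdErrAction, self).__init__(
            option_strings, dest, nargs=0, **kwargs)
        self.version = version

    def __call__(self, parser, namespace, values, option_string=None):
        print(self.version, file=sys.stderr)
        parser.exit()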
Example #6
def get_parser():
    epilog = """
    Trimmed sequences will be placed in ${input_sequence_filename}.abundfilt.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    parser = build_counting_args(
        descr="Trims sequences at a minimum k-mer abundance "
        "(in memory version).", epilog=textwrap.dedent(epilog))
    add_threading_args(parser)

    parser.add_argument('--cutoff', '-C', default=DEFAULT_CUTOFF, type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--savetable', metavar="filename", default='',
                        help="If present, the name of the file to save the "
                        "k-mer counting table to")
    parser.add_argument('datafile', metavar='input_sequence_filename',
                        help="FAST[AQ] sequence file to trim")
    parser.add_argument('--report-total-kmers', '-t', action='store_true',
                        help="Prints the total number of k-mers to stderr")
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser
Example #7
def get_parser():
    epilog = '''
    Note that with :option:`-b` this script is constant memory; in exchange,
    k-mer counts will stop at 255. The memory usage of this script with
    :option:`-b` will be about 1.15x the product of the :option:`-x` and
    :option:`-N` numbers.

    To count k-mers in multiple files use :program:`load-into-counting.py` and
    :program:`abundance-dist.py`.
    '''
    parser = build_counting_args(
        descr="Calculate the abundance distribution of k-mers from a "
        "single sequence file.", epilog=textwrap.dedent(epilog))
    add_threading_args(parser)

    parser.add_argument('input_sequence_filename', help='The name of the input'
                        ' FAST[AQ] sequence file.')
    parser.add_argument('output_histogram_filename', help='The name of the '
                        'output histogram file. The columns are: (1) k-mer '
                        'abundance, (2) k-mer count, (3) cumulative count, '
                        '(4) fraction of total distinct k-mers.')
    parser.add_argument('-z', '--no-zero', dest='output_zero', default=True,
                        action='store_false',
                        help='Do not output 0-count bins')
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('-s', '--squash', dest='squash_output', default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('--savetable', default='', metavar="filename",
                        help="Save the k-mer counting table to the specified "
                        "filename.")
    return parser
Example #8
def get_parser():
    epilog = """
    Trimmed sequences will be placed in ${input_sequence_filename}.abundfilt.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    parser = build_counting_args(
        descr="Trims sequences at a minimum k-mer abundance " "(in memory version).", epilog=textwrap.dedent(epilog)
    )
    add_threading_args(parser)

    parser.add_argument("--cutoff", "-C", default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance.")
    parser.add_argument(
        "--savetable",
        metavar="filename",
        default="",
        help="If present, the name of the file to save the " "k-mer counting table to",
    )
    parser.add_argument("datafile", metavar="input_sequence_filename", help="FAST[AQ] sequence file to trim")

    return parser
Example #9
def get_parser():
    epilog = """
    Trimmed sequences will be placed in ${input_sequence_filename}.abundfilt
    for each input sequence file. If the input sequences are from RNAseq or
    metagenome sequencing then :option:`--variable-coverage` should be used.

    Example::

        load-into-counting.py -k 20 -x 5e7 table.ct data/100k-filtered.fa
        filter-abund.py -C 2 table.ct data/100k-filtered.fa
    """
    parser = argparse.ArgumentParser(
        description='Trim sequences at a minimum k-mer abundance.',
        epilog=textwrap.dedent(epilog),
        formatter_class=ComboFormatter)
    parser.add_argument('input_table',
                        metavar='input_counting_table_filename',
                        help='The input k-mer counting table filename')
    parser.add_argument('input_filename',
                        metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename',
                        nargs='+')
    add_threading_args(parser)
    parser.add_argument('--cutoff',
                        '-C',
                        dest='cutoff',
                        default=DEFAULT_CUTOFF,
                        type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--variable-coverage',
                        '-V',
                        action='store_true',
                        dest='variable_coverage',
                        default=False,
                        help='Only trim low-abundance k-mers from sequences '
                        'that have high coverage.')
    parser.add_argument('--normalize-to',
                        '-Z',
                        type=int,
                        dest='normalize_to',
                        help='Base the variable-coverage cutoff on this median'
                        ' k-mer abundance.',
                        default=DEFAULT_NORMALIZE_LIMIT)
    parser.add_argument('-o',
                        '--out',
                        dest='single_output_filename',
                        default='',
                        metavar="optional_output_filename",
                        help='Output the trimmed sequences into a single file '
                        'with the given filename instead of creating a new '
                        'file for each input file.')
    parser.add_argument('--version',
                        action='version',
                        version='khmer {v}'.format(v=__version__))
    parser.add_argument('-f',
                        '--force',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    return parser
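
Several examples pass formatter_class=ComboFormatter without defining it. A
plausible sketch, assuming khmer simply combines the two stock argparse
formatters so that defaults appear in help text while the reST epilog keeps
its hand-written layout:

import argparse

class ComboFormatter(argparse.ArgumentDefaultsHelpFormatter,
                     argparse.RawDescriptionHelpFormatter):
    # Plausible sketch: multiple inheritance merges both behaviours, so no
    # extra code is needed in the class body.
    pass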
Example #10
def get_parser():
    epilog = """\
    The resulting partition maps are saved as ``${basename}.subset.#.pmap``
    files.
    """
    parser = argparse.ArgumentParser(
        description="Partition a sequence graph based upon waypoint "
        "connectivity", epilog=textwrap.dedent(epilog),
        formatter_class=ComboFormatter)

    parser.add_argument('basename', help="basename of the input k-mer "
                        "nodegraph  + tagset files")
    parser.add_argument('--stoptags', '-S', metavar='filename', default='',
                        help="Use stoptags in this file during partitioning")
    parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE,
                        type=float, help='Set subset size (usually 1e5-1e6 is '
                        'good)')
    parser.add_argument('--no-big-traverse', action='store_true',
                        default=False, help='Truncate graph joins at big '
                        'traversals')
    parser.add_argument('--version', action=_VersionStdErrAction,
                        version='khmer {v}'.format(v=__version__))
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    add_threading_args(parser)
    return parser
Example #11
def get_parser():
    epilog = """
    The resulting partition maps are saved as '${basename}.subset.#.pmap'
    files.
    """
    parser = argparse.ArgumentParser(
        description="Partition a sequence graph based upon waypoint "
        "connectivity",
        epilog=epilog,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('basename',
                        help="basename of the input k-mer presence"
                        " table + tagset files")
    parser.add_argument('--stoptags',
                        '-S',
                        metavar='filename',
                        default='',
                        help="Use stoptags in this file during partitioning")
    parser.add_argument('--subset-size',
                        '-s',
                        default=DEFAULT_SUBSET_SIZE,
                        type=float,
                        help='Set subset size (usually 1e5-1e6 is '
                        'good)')
    parser.add_argument('--no-big-traverse',
                        action='store_true',
                        default=False,
                        help='Truncate graph joins at big '
                        'traversals')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + khmer.__version__)
    add_threading_args(parser)
    return parser
Example #12
def get_parser():
    epilog = """
    Load in a set of sequences, partition them, merge the partitions, and
    annotate the original sequences files with the partition information.

    This script combines the functionality of :program:`load-graph.py`,
    :program:`partition-graph.py`, :program:`merge-partitions.py`, and
    :program:`annotate-partitions.py` into one script. This is convenient
    but should probably not be used for large data sets, because
    :program:`do-partition.py` doesn't provide save/resume functionality.
    """
    parser = build_hashbits_args(
        descr='Load, partition, and annotate FAST[AQ] sequences',
        epilog=textwrap.dedent(epilog))
    add_threading_args(parser)
    parser.add_argument('--subset-size', '-s', default=DEFAULT_SUBSET_SIZE,
                        dest='subset_size', type=float,
                        help='Set subset size (usually 1e5-1e6 is good)')
    parser.add_argument('--no-big-traverse', dest='no_big_traverse',
                        action='store_true', default=False,
                        help='Truncate graph joins at big traversals')
    parser.add_argument('--keep-subsets', dest='remove_subsets',
                        default=True, action='store_false',
                        help='Keep individual subsets (default: False)')
    parser.add_argument('graphbase', help="base name for output files")
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        nargs='+', help='input FAST[AQ] sequence filenames')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser
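
The --keep-subsets flag in Example #12 shows the inverted store_false idiom:
the option name says "keep" while the stored attribute says "remove". A quick
usage sketch, assuming the khmer helpers are importable so get_parser() runs:

args = get_parser().parse_args(['mygraph', 'reads.fa'])
assert args.remove_subsets is True       # default: subsets are removed

args = get_parser().parse_args(['mygraph', 'reads.fa', '--keep-subsets'])
assert args.remove_subsets is False      # the flag flips the dest to False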
Example #13
def get_parser():
    epilog = """\
    Note: with :option:`-b`/:option:`--no-bigcount` the output will be the
    exact size of the k-mer countgraph and this script will use a constant
    amount of memory. In exchange k-mer counts will stop at 255. The memory
    usage of this script with :option:`-b` will be about 1.15x the product of
    the :option:`-x` and :option:`-N` numbers.

    Example::

        load-into-counting.py -k 20 -x 5e7 out data/100k-filtered.fa

    Multiple threads can be used to accelerate the process, if you have extra
    cores to spare.

    Example::

        load-into-counting.py -k 20 -x 5e7 -T 4 out data/100k-filtered.fa
    """

    parser = build_counting_args(
        "Build a k-mer countgraph from the given"
        " sequences.",
        epilog=textwrap.dedent(epilog))
    add_threading_args(parser)
    parser.add_argument('output_countgraph_filename',
                        help="The name of the"
                        " file to write the k-mer countgraph to.")
    parser.add_argument('input_sequence_filename',
                        nargs='+',
                        help="The names of one or more FAST[AQ] input "
                        "sequence files.")
    parser.add_argument('-b',
                        '--no-bigcount',
                        dest='bigcount',
                        default=True,
                        action='store_false',
                        help="The default behaviour is "
                        "to count past 255 using bigcount. This flag turns "
                        "bigcount off, limiting counts to 255.")
    parser.add_argument('--summary-info',
                        '-s',
                        type=str,
                        default=None,
                        metavar="FORMAT",
                        choices=[str('json'), str('tsv')],
                        help="What format should the machine readable run "
                        "summary be in? (`json` or `tsv`, disabled by"
                        " default)")
    parser.add_argument('-f',
                        '--force',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('-q',
                        '--quiet',
                        dest='quiet',
                        default=False,
                        action='store_true')
    return parser
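
The choices list on --summary-info means argparse itself rejects unknown
formats before the script runs. A usage sketch, again assuming the khmer
helpers are importable:

parser = get_parser()
args = parser.parse_args(['out.ct', 'reads.fa', '--summary-info', 'json'])
assert args.summary_info == 'json'
# parser.parse_args(['out.ct', 'reads.fa', '--summary-info', 'xml']) would
# exit with status 2 and an "invalid choice: 'xml'" message.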
Example #14
def get_parser():
    parser = build_nodegraph_args(descr="Load sequences into the compressible "
                                 "graph format plus optional tagset.")
    add_threading_args(parser)
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        nargs='+', help='input FAST[AQ] sequence filename')
    return parser
Example #15
def get_parser():
    epilog = '''\
    Note that with :option:`-b`/:option:`--no-bigcount` this script is constant
    memory; in exchange, k-mer counts will stop at 255. The memory usage of
    this script with :option:`-b` will be about 1.15x the product of the
    :option:`-x` and :option:`-N` numbers.

    To count k-mers in multiple files use :program:`load-into-counting.py` and
    :program:`abundance-dist.py`.

    Example::

        abundance-dist-single.py -x 1e7 -N 2 -k 17 \\
                tests/test-data/test-abund-read-2.fa test-dist
    '''
    parser = build_counting_args(
        descr="Calculate the abundance distribution of k-mers from a "
        "single sequence file.",
        epilog=textwrap.dedent(epilog))
    add_threading_args(parser)

    parser.add_argument('input_sequence_filename',
                        help='The name of the input'
                        ' FAST[AQ] sequence file.')
    parser.add_argument('output_histogram_filename',
                        help='The name of the '
                        'output histogram file. The columns are: (1) k-mer '
                        'abundance, (2) k-mer count, (3) cumulative count, '
                        '(4) fraction of total distinct k-mers.')
    parser.add_argument('-z',
                        '--no-zero',
                        dest='output_zero',
                        default=True,
                        action='store_false',
                        help='Do not output zero-count bins')
    parser.add_argument('-b',
                        '--no-bigcount',
                        dest='bigcount',
                        default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('-s',
                        '--squash',
                        dest='squash_output',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('--savegraph',
                        default='',
                        metavar="filename",
                        help="Save the k-mer countgraph to the specified "
                        "filename.")
    parser.add_argument('-f',
                        '--force',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    return parser
Example #16
def get_parser():
    epilog = """
    Note: with :option:`-b` the output will be the exact size of the
    k-mer counting table and this script will use a constant amount of memory.
    In exchange k-mer counts will stop at 255. The memory usage of this script
    with :option:`-b` will be about 1.15x the product of the :option:`-x` and
    :option:`-N` numbers.

    Example::

        load-into-counting.py -k 20 -x 5e7 out.kh data/100k-filtered.fa

    Multiple threads can be used to accelerate the process, if you have extra
    cores to spare.

    Example::

        load-into-counting.py -k 20 -x 5e7 -T 4 out.kh data/100k-filtered.fa
    """

    parser = build_counting_args(
        "Build a k-mer counting table from the given"
        " sequences.",
        epilog=textwrap.dedent(epilog))
    add_threading_args(parser)
    parser.add_argument('output_countingtable_filename',
                        help="The name of the"
                        " file to write the k-mer counting table to.")
    parser.add_argument('input_sequence_filename',
                        nargs='+',
                        help="The names of one or more FAST[AQ] input "
                        "sequence files.")
    parser.add_argument('-b',
                        '--no-bigcount',
                        dest='bigcount',
                        default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('--summary-info',
                        '-s',
                        default=None,
                        metavar="FORMAT",
                        choices=['json', 'tsv'],
                        help="What format should the machine readable run "
                        "summary be in? (json or tsv, disabled by default)")
    parser.add_argument('--report-total-kmers',
                        '-t',
                        action='store_true',
                        help="Prints the total number of k-mers to stderr")
    parser.add_argument('-f',
                        '--force',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    return parser
Example #17
def get_parser():
    epilog = """
    Note that with :option:`-b` this script is constant memory; in exchange,
    k-mer counts will stop at 255. The memory usage of this script with
    :option:`-b` will be about 1.15x the product of the :option:`-x` and
    :option:`-N` numbers.

    To count k-mers in multiple files use :program:`load-into-counting.py` and
    :program:`abundance-dist.py`.
    """
    parser = build_counting_args(
        descr="Calculate the abundance distribution of k-mers from a " "single sequence file.",
        epilog=textwrap.dedent(epilog),
    )
    add_threading_args(parser)

    parser.add_argument("input_sequence_filename", help="The name of the input" " FAST[AQ] sequence file.")
    parser.add_argument(
        "output_histogram_filename",
        help="The name of the "
        "output histogram file. The columns are: (1) k-mer "
        "abundance, (2) k-mer count, (3) cumulative count, "
        "(4) fraction of total distinct k-mers.",
    )
    parser.add_argument(
        "-z", "--no-zero", dest="output_zero", default=True, action="store_false", help="Do not output 0-count bins"
    )
    parser.add_argument(
        "-b", "--no-bigcount", dest="bigcount", default=True, action="store_false", help="Do not count k-mers past 255"
    )
    parser.add_argument(
        "-s",
        "--squash",
        dest="squash_output",
        default=False,
        action="store_true",
        help="Overwrite output file if it exists",
    )
    parser.add_argument(
        "--csv",
        default=False,
        action="store_true",
        help="Use the CSV format for the histogram. " "Includes column headers.",
    )
    parser.add_argument(
        "--savetable",
        default="",
        metavar="filename",
        help="Save the k-mer counting table to the specified " "filename.",
    )
    parser.add_argument(
        "--report-total-kmers", "-t", action="store_true", help="Prints the total number of k-mers to stderr"
    )
    parser.add_argument("-f", "--force", default=False, action="store_true", help="Overwrite output file if it exists")
    return parser
Example #18
def get_parser():
    epilog = """
    Note: with :option:`-b` the output will be the exact size of the
    k-mer counting table and this script will use a constant amount of memory.
    In exchange k-mer counts will stop at 255. The memory usage of this script
    with :option:`-b` will be about 1.15x the product of the :option:`-x` and
    :option:`-N` numbers.

    Example::

        load-into-counting.py -k 20 -x 5e7 out.ct data/100k-filtered.fa

    Multiple threads can be used to accelerate the process, if you have extra
    cores to spare.

    Example::

        load-into-counting.py -k 20 -x 5e7 -T 4 out.ct data/100k-filtered.fa
    """

    parser = build_counting_args(
        "Build a k-mer counting table from the given" " sequences.", epilog=textwrap.dedent(epilog)
    )
    add_threading_args(parser)
    parser.add_argument(
        "output_countingtable_filename", help="The name of the" " file to write the k-mer counting table to."
    )
    parser.add_argument(
        "input_sequence_filename", nargs="+", help="The names of one or more FAST[AQ] input " "sequence files."
    )
    parser.add_argument(
        "-b",
        "--no-bigcount",
        dest="bigcount",
        default=True,
        action="store_false",
        help="The default behaviour is "
        "to count past 255 using bigcount. This flag turns "
        "bigcount off, limiting counts to 255.",
    )
    parser.add_argument(
        "--summary-info",
        "-s",
        type=str,
        default=None,
        metavar="FORMAT",
        choices=[str("json"), str("tsv")],
        help="What format should the machine readable run " "summary be in? (json or tsv, disabled by default)",
    )
    parser.add_argument(
        "--report-total-kmers", "-t", action="store_true", help="Prints the total number of k-mers to stderr"
    )
    parser.add_argument("-f", "--force", default=False, action="store_true", help="Overwrite output file if it exists")
    return parser
Example #19
def build_parser(parser):
    add_threading_args(parser)
    parser.add_argument('--no-build-tagset', '-n', default=False,
                        action='store_true', dest='no_build_tagset',
                        help='Do NOT construct tagset while loading sequences')
    parser.add_argument('output_filename',
                        metavar='output_nodegraph_filename', help='output'
                        ' k-mer nodegraph filename.')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        nargs='+', help='input FAST[AQ] sequence filename')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser
Example #20
def get_parser():
    parser = build_counting_args(
        descr="Output abundances of the k-mers in the sequence file.")
    add_threading_args(parser)

    parser.add_argument('input_sequence_filename', help='The input'
                        ' FAST[AQ] sequence file.')

    parser.add_argument('-o', '--out', metavar="output_file",
                        dest='output_file',
                        type=argparse.FileType('w'),
                        default=None, help='output counts to this file')

    return parser
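
Note the type=argparse.FileType('w') on -o/--out: argparse opens the named
file during parse_args(), so the script receives a ready-to-use handle. A
small sketch of what that means for the caller (assuming the khmer helpers
are importable):

args = get_parser().parse_args(['reads.fa', '-o', 'counts.out'])
# args.output_file is already an open, writable file object (or None when
# -o was not given), so the script can write to it directly.
if args.output_file is not None:
    args.output_file.write('abundance output goes here\n')
    args.output_file.close()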
Example #21
File: do-partition.py Project: Xyroe/khmer
def get_parser():
    epilog = """\
    Load in a set of sequences, partition them, merge the partitions, and
    annotate the original sequences files with the partition information.

    This script combines the functionality of
    :program:`load-graph.py`, :program:`partition-graph.py`,
    :program:`merge-partitions.py`, and :program:`annotate-partitions.py` into
    one script. This is convenient but should probably not be used for large
    data sets, because :program:`do-partition.py` doesn't provide save/resume
    functionality.

    Example::

        do-partition.py -k 20 example tests/test-data/random-20-a.fa
    """
    parser = build_nodegraph_args(
        descr="Load, partition, and annotate FAST[AQ] sequences", epilog=textwrap.dedent(epilog)
    )
    add_threading_args(parser)
    parser.add_argument(
        "--subset-size",
        "-s",
        default=DEFAULT_SUBSET_SIZE,
        dest="subset_size",
        type=float,
        help="Set subset size (usually 1e5-1e6 is good)",
    )
    parser.add_argument(
        "--no-big-traverse",
        dest="no_big_traverse",
        action="store_true",
        default=False,
        help="Truncate graph joins at big traversals",
    )
    parser.add_argument(
        "--keep-subsets",
        dest="remove_subsets",
        default=True,
        action="store_false",
        help="Keep individual subsets (default: False)",
    )
    parser.add_argument("graphbase", help="base name for output files")
    parser.add_argument(
        "input_filenames", metavar="input_sequence_filename", nargs="+", help="input FAST[AQ] sequence filenames"
    )
    parser.add_argument("-f", "--force", default=False, action="store_true", help="Overwrite output file if it exists")
    return parser
Example #22
def get_parser():
    epilog = """\
    Trimmed sequences will be placed in
    ``${input_sequence_filename}.abundfilt``.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    parser = build_counting_args(
        descr="Trims sequences at a minimum k-mer abundance "
        "(in memory version).", epilog=textwrap.dedent(epilog),
        citations=['counting', 'SeqAn'])
    add_threading_args(parser)

    parser.add_argument('--cutoff', '-C', default=DEFAULT_CUTOFF,
                        type=check_argument_range(0, 256, "cutoff"),
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--variable-coverage', '-V', action='store_true',
                        dest='variable_coverage', default=False,
                        help='Only trim low-abundance k-mers from sequences '
                        'that have high coverage.')
    parser.add_argument('--normalize-to', '-Z', type=int, dest='normalize_to',
                        help='Base the variable-coverage cutoff on this median'
                        ' k-mer abundance.',
                        default=DEFAULT_NORMALIZE_LIMIT)
    parser.add_argument('--savegraph', metavar="filename", default='',
                        help="If present, the name of the file to save the "
                        "k-mer countgraph to")
    parser.add_argument('-o', '--outfile', metavar='optional_output_filename',
                        default=None, help='Override default output filename '
                        'and output trimmed sequences into a file with the '
                        'given filename.')
    parser.add_argument('datafile', metavar='input_sequence_filename',
                        help="FAST[AQ] sequence file to trim")
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')
    add_output_compression_type(parser)
    return parser
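
This and several following examples validate --cutoff with
check_argument_range(0, 256, "cutoff") instead of a bare type=int. khmer's
implementation is not reproduced here; a hypothetical sketch of such an
argparse type factory:

import argparse

def check_argument_range(low, high, name):
    # Hypothetical sketch: returns an argparse ``type`` callable that
    # accepts integers in [low, high) and rejects everything else.
    def _check(value):
        ivalue = int(value)
        if not low <= ivalue < high:
            raise argparse.ArgumentTypeError(
                '{} must be in the range [{}, {})'.format(name, low, high))
        return ivalue
    return _check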
Example #23
def get_parser():
    epilog = """\
    Trimmed sequences will be placed in
    ``${input_sequence_filename}.abundfilt``.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    parser = build_counting_args(
        descr="Trims sequences at a minimum k-mer abundance "
        "(in memory version).", epilog=textwrap.dedent(epilog),
        citations=['counting', 'SeqAn'])
    add_threading_args(parser)

    parser.add_argument('-C', '--cutoff', default=DEFAULT_CUTOFF,
                        type=check_argument_range(0, 256, "cutoff"),
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('-V', '--variable-coverage', action='store_true',
                        dest='variable_coverage', default=False,
                        help='Only trim low-abundance k-mers from sequences '
                        'that have high coverage.')
    parser.add_argument('-Z', '--normalize-to', type=int, dest='normalize_to',
                        help='Base the variable-coverage cutoff on this median'
                        ' k-mer abundance.',
                        default=DEFAULT_NORMALIZE_LIMIT)
    parser.add_argument('--savegraph', metavar="filename", default='',
                        help="If present, the name of the file to save the "
                        "k-mer countgraph to")
    parser.add_argument('-o', '--outfile', metavar='optional_output_filename',
                        default=None, help='Override default output filename '
                        'and output trimmed sequences into a file with the '
                        'given filename.')
    parser.add_argument('datafile', metavar='input_sequence_filename',
                        help="FAST[AQ] sequence file to trim")
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')
    add_output_compression_type(parser)
    return parser
Example #24
def get_parser():
    epilog = """\
    Trimmed sequences will be placed in
    ``${input_sequence_filename}.abundfilt`` for each input sequence file. If
    the input sequences are from RNAseq or metagenome sequencing then
    :option:`--variable-coverage` should be used.

    Example::

        load-into-counting.py -k 20 -x 5e7 countgraph data/100k-filtered.fa
        filter-abund.py -C 2 countgraph data/100k-filtered.fa
    """
    parser = KhmerArgumentParser(
        description='Trim sequences at a minimum k-mer abundance.',
        epilog=textwrap.dedent(epilog),
        citations=['counting'])
    parser.add_argument('input_graph', metavar='input_count_graph_filename',
                        help='The input k-mer countgraph filename')
    parser.add_argument('input_filename', metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename', nargs='+')
    add_threading_args(parser)
    parser.add_argument('--cutoff', '-C', dest='cutoff',
                        default=DEFAULT_CUTOFF,
                        type=check_argument_range(0, 256, 'cutoff'),
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--variable-coverage', '-V', action='store_true',
                        dest='variable_coverage', default=False,
                        help='Only trim low-abundance k-mers from sequences '
                        'that have high coverage.')
    parser.add_argument('--normalize-to', '-Z', type=int, dest='normalize_to',
                        help='Base the variable-coverage cutoff on this median'
                        ' k-mer abundance.',
                        default=DEFAULT_NORMALIZE_LIMIT)
    parser.add_argument('-o', '--output', dest='single_output_file',
                        type=khFileType('wb'),
                        metavar="optional_output_filename",
                        help='Output the trimmed sequences into a single file '
                        'with the given filename instead of creating a new '
                        'file for each input file.')
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')
    add_output_compression_type(parser)
    return parser
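
The -o/--output option above uses khFileType('wb') rather than plain
argparse.FileType. The real khmer class is not shown in these snippets; a
hypothetical sketch of a FileType variant that keeps '-' usable in binary
mode might be:

import argparse
import sys

class khFileType(argparse.FileType):
    # Hypothetical sketch: plain FileType maps '-' to the text-mode
    # stdin/stdout streams, which breaks 'rb'/'wb'; use the binary buffers.
    def __call__(self, string):
        if string == '-':
            return sys.stdout.buffer if 'w' in self._mode else sys.stdin.buffer
        return super(khFileType, self).__call__(string)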
Example #25
def build_parser(parser):
    add_threading_args(parser)
    parser.add_argument(
        "--no-build-tagset",
        "-n",
        default=False,
        action="store_true",
        dest="no_build_tagset",
        help="Do NOT construct tagset while loading sequences",
    )
    parser.add_argument(
        "output_filename", metavar="output_nodegraph_filename", help="output" " k-mer nodegraph filename."
    )
    parser.add_argument(
        "input_filenames", metavar="input_sequence_filename", nargs="+", help="input FAST[AQ] sequence filename"
    )
    parser.add_argument("-f", "--force", default=False, action="store_true", help="Overwrite output file if it exists")
    return parser
Example #26
def get_parser():
    epilog = """\
    Note: with :option:`-b`/:option:`--no-bigcount` the output will be the
    exact size of the k-mer countgraph and this script will use a constant
    amount of memory. In exchange k-mer counts will stop at 255. The memory
    usage of this script with :option:`-b` will be about 1.15x the product of
    the :option:`-x` and :option:`-N` numbers.

    Example::

        load-into-counting.py -k 20 -x 5e7 out data/100k-filtered.fa

    Multiple threads can be used to accelerate the process, if you have extra
    cores to spare.

    Example::

        load-into-counting.py -k 20 -x 5e7 -T 4 out data/100k-filtered.fa
    """

    parser = build_counting_args("Build a k-mer countgraph from the given"
                                 " sequences.", epilog=textwrap.dedent(epilog),
                                 citations=['counting', 'SeqAn'])
    add_threading_args(parser)
    parser.add_argument('output_countgraph_filename', help="The name of the"
                        " file to write the k-mer countgraph to.")
    parser.add_argument('input_sequence_filename', nargs='+',
                        help="The names of one or more FAST[AQ] input "
                        "sequence files.")
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false', help="The default behaviour is "
                        "to count past 255 using bigcount. This flag turns "
                        "bigcount off, limiting counts to 255.")
    parser.add_argument('-s', '--summary-info', type=str, default=None,
                        metavar="FORMAT", choices=[str('json'), str('tsv')],
                        help="What format should the machine readable run "
                        "summary be in? (`json` or `tsv`, disabled by"
                        " default)")
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')
    return parser
Example #27
def get_parser():
    parser = build_hashbits_args(descr="Load sequences into the compressible "
                                 "graph format plus optional tagset.")
    add_threading_args(parser)
    parser.add_argument('--no-build-tagset', '-n', default=False,
                        action='store_true', dest='no_build_tagset',
                        help='Do NOT construct tagset while loading sequences')
    parser.add_argument('output_filename',
                        metavar='output_presence_table_filename', help='output'
                        ' k-mer presence table filename.')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        nargs='+', help='input FAST[AQ] sequence filename')
    parser.add_argument('--report-total-kmers', '-t', action='store_true',
                        help="Prints the total number of k-mers to stderr")
    parser.add_argument('--write-fp-rate', '-w', action='store_true',
                        help="Write false positive rate into .info file")
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser
Example #28
def get_parser():
    epilog = """
    Trimmed sequences will be placed in ${input_sequence_filename}.abundfilt
    for each input sequence file. If the input sequences are from RNAseq or
    metagenome sequencing then :option:`--variable-coverage` should be used.

    Example::

        load-into-counting.py -k 20 -x 5e7 table.ct data/100k-filtered.fa
        filter-abund.py -C 2 table.ct data/100k-filtered.fa
    """
    parser = argparse.ArgumentParser(
        description='Trim sequences at a minimum k-mer abundance.',
        epilog=textwrap.dedent(epilog),
        formatter_class=ComboFormatter)
    parser.add_argument('input_table', metavar='input_counting_table_filename',
                        help='The input k-mer counting table filename')
    parser.add_argument('input_filename', metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename', nargs='+')
    add_threading_args(parser)
    parser.add_argument('--cutoff', '-C', dest='cutoff',
                        default=DEFAULT_CUTOFF, type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--variable-coverage', '-V', action='store_true',
                        dest='variable_coverage', default=False,
                        help='Only trim low-abundance k-mers from sequences '
                        'that have high coverage.')
    parser.add_argument('--normalize-to', '-Z', type=int, dest='normalize_to',
                        help='Base the variable-coverage cutoff on this median'
                        ' k-mer abundance.',
                        default=DEFAULT_NORMALIZE_LIMIT)
    parser.add_argument('-o', '--out', dest='single_output_filename',
                        default='', metavar="optional_output_filename",
                        help='Output the trimmed sequences into a single file '
                        'with the given filename instead of creating a new '
                        'file for each input file.')
    parser.add_argument('--version', action='version',
                        version='khmer {v}'.format(v=__version__))
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser
Example #29
def get_parser():
    epilog = """
    Note: with :option:`-b` the output will be the exact size of the
    k-mer counting table and this script will use a constant amount of memory.
    In exchange k-mer counts will stop at 255. The memory usage of this script
    with :option:`-b` will be about 1.15x the product of the :option:`-x` and
    :option:`-N` numbers.

    Example::

        load-into-counting.py -k 20 -x 5e7 out.ct data/100k-filtered.fa

    Multiple threads can be used to accelerate the process, if you have extra
    cores to spare.

    Example::

        load-into-counting.py -k 20 -x 5e7 -T 4 out.ct data/100k-filtered.fa
    """

    parser = build_counting_args("Build a k-mer counting table from the given"
                                 " sequences.", epilog=textwrap.dedent(epilog))
    add_threading_args(parser)
    parser.add_argument('output_countingtable_filename', help="The name of the"
                        " file to write the k-mer counting table to.")
    parser.add_argument('input_sequence_filename', nargs='+',
                        help="The names of one or more FAST[AQ] input "
                        "sequence files.")
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')
    parser.add_argument('--summary-info', '-s', default=None, metavar="FORMAT",
                        choices=['json', 'tsv'],
                        help="What format should the machine readable run "
                        "summary be in? (json or tsv, disabled by default)")
    parser.add_argument('--report-total-kmers', '-t', action='store_true',
                        help="Prints the total number of k-mers to stderr")
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    return parser
Example #30
def get_parser():
    epilog = """\
    Trimmed sequences will be placed in
    ``${input_sequence_filename}.abundfilt``.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    parser = build_counting_args(
        descr="Trims sequences at a minimum k-mer abundance "
        "(in memory version).",
        epilog=textwrap.dedent(epilog))
    add_threading_args(parser)

    parser.add_argument('--cutoff',
                        '-C',
                        default=DEFAULT_CUTOFF,
                        type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--savegraph',
                        metavar="filename",
                        default='',
                        help="If present, the name of the file to save the "
                        "k-mer countgraph to")
    parser.add_argument('datafile',
                        metavar='input_sequence_filename',
                        help="FAST[AQ] sequence file to trim")
    parser.add_argument('-f',
                        '--force',
                        default=False,
                        action='store_true',
                        help='Overwrite output file if it exists')
    add_output_compression_type(parser)
    return parser
Example #31
File: load-graph.py Project: b-wyss/khmer
def get_parser():
    parser = build_hashbits_args(descr="Load sequences into the compressible graph format plus optional tagset.")
    add_threading_args(parser)
    parser.add_argument(
        "--no-build-tagset",
        "-n",
        default=False,
        action="store_true",
        dest="no_build_tagset",
        help="Do NOT construct tagset while loading sequences",
    )
    parser.add_argument(
        "output_filename", metavar="output_presence_table_filename", help="output" " k-mer presence table filename."
    )
    parser.add_argument(
        "input_filenames", metavar="input_sequence_filename", nargs="+", help="input FAST[AQ] sequence filename"
    )
    parser.add_argument(
        "--report-total-kmers", "-t", action="store_true", help="Prints the total number of k-mers to stderr"
    )
    parser.add_argument("--write-fp-rate", "-w", action="store_true", help="Write false positive rate into .info file")
    return parser
Example #32
def get_parser():
    epilog = """\
    Trimmed sequences will be placed in
    ``${input_sequence_filename}.abundfilt``.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    parser = build_counting_args(
        descr="Trims sequences at a minimum k-mer abundance "
        "(in memory version).", epilog=textwrap.dedent(epilog))
    add_threading_args(parser)

    parser.add_argument('--cutoff', '-C', default=DEFAULT_CUTOFF, type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--savegraph', metavar="filename", default='',
                        help="If present, the name of the file to save the "
                        "k-mer countgraph to")
    parser.add_argument('-o', '--outfile', metavar='optional_output_filename',
                        default=None, help='Override default output filename '
                        'and output trimmed sequences into a file with the '
                        'given filename.')
    parser.add_argument('datafile', metavar='input_sequence_filename',
                        help="FAST[AQ] sequence file to trim")
    parser.add_argument('-f', '--force', default=False, action='store_true',
                        help='Overwrite output file if it exists')
    parser.add_argument('-q', '--quiet', dest='quiet', default=False,
                        action='store_true')
    add_output_compression_type(parser)
    return parser
Example #33
def main():
    parser = build_nodegraph_args("find unique k-mers in query compared to refs")
    add_threading_args(parser)
    parser.add_argument('query',
                        help=('fasta readfile to query against '
                              'hashtable, use "-" if from stdin'))
    parser.add_argument('--x2',
                        default='1e8',
                        help='max_table size for readfile2')
    parser.add_argument('--N2',
                        default='4',
                        help='# of table (N) for readfile2')
    parser.add_argument('--bfout', help='output bloom filter of ref')

    group = parser.add_mutually_exclusive_group()
    group.add_argument('--shared',
                       dest='output',
                       action='store_const',
                       const='shared',
                       help='output shared kmers')
    group.add_argument('--uniq',
                       dest='output',
                       action='store_const',
                       const='uniq',
                       help='output uniq kmers in query')

    group2 = parser.add_mutually_exclusive_group(required=True)
    group2.add_argument(
        '--ref',
        nargs='+',
        help='fasta sequence file to be loaded in bloom filter')
    group2.add_argument('--load', help='load existing bloom filter')

    parser.set_defaults(output='uniq')
    args = parser.parse_args()
    #print(args, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables
    HT_SIZE2 = int(float(args.x2))
    N_HT2 = int(args.N2)

    # positional
    query = args.query
    output = args.output

    start_time = time.time()
    # load from existing bloom filter
    if args.load:
        ht = khmer.Nodetable.load(args.load)
        end_time = time.time()
        secs = end_time - start_time
        mes = 'load bloom filter ({}) took {:.2f} hours..'
        print(mes.format(os.path.basename(args.load), secs / 3600.0),
              file=sys.stderr)

    # create a hashbits data structure
    else:
        refs = args.ref
        print('{} refs to be loaded'.format(len(refs)), file=sys.stderr)
        if query == '-' and refs == ['-']:
            print('*** query and ref cannot both be "-" (read from stdin)',
                  file=sys.stderr)
            sys.exit(1)

        ht = khmer.Nodetable(K, HT_SIZE, N_HT)
        end_time = time.time()
        secs = end_time - start_time
        mes = 'initialization of bloom filter took {:.2f} hours..'
        print(mes.format(secs / 3600.0), file=sys.stderr)
        for index, filename in enumerate(refs):
            if index != 0 and index % 100 == 0:
                end_time = time.time()
                secs = end_time - start_time
                mes = '{} refs have been loaded within {:.2f} hours..'
                print(mes.format(index, secs / 3600.0), file=sys.stderr)

            try:
                rparser = khmer.ReadParser(filename)
            except OSError as e:
                mes = (
                    '*** Skipping due to OSError (machine or system problem):'
                    ' {}\n'
                    '*** Detailed error message:\n'
                    '*** {}')
                print(mes.format(os.path.basename(filename), str(e)),
                      file=sys.stderr)
                continue

            threads = []
            for _ in range(args.threads):
                cur_thrd = \
                    threading.Thread(
                        target=ht.consume_seqfile_with_reads_parser,
                        args=(rparser, )
                    )
                threads.append(cur_thrd)
                cur_thrd.start()

            for thread in threads:
                thread.join()

        if args.bfout:
            ht.save(args.bfout)

    # Change the 0.01 threshold below only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    mes = 'fp rate estimated to be {:1.3f}'
    print(mes.format(fp_rate), file=sys.stderr)

    if fp_rate > 0.01:
        mes = ('**\n'
               '** ERROR: the counting hash is too small for\n'
               '** refs.  Increase hashsize/num ht.\n'
               '**\n'
               '** Do not use these results!!')
        print(mes, file=sys.stderr)
        sys.exit(-1)

    n_unique1 = ht.n_unique_kmers()

    # create a hashbits data structure
    ht2 = khmer.Nodetable(K, HT_SIZE2, N_HT2)

    n_unique2 = 0
    n_shared = 0

    if output == 'uniq':
        for n, record in enumerate(khmer.ReadParser(query)):
            #for n, record in enumerate(screed.open(query)):
            _l = record.name.split(None, 1)
            if len(_l) == 2:
                name, desc = _l
            else:
                name = _l[0]
                desc = ''
            sequence = record.sequence.replace('N', 'A')
            seq_len = len(sequence)
            if seq_len < K:
                continue
            for i in range(0, seq_len + 1 - K):
                kmer = sequence[i:i + K]

                if (not ht2.get(kmer)):
                    n_unique2 += 1
                    if ht.get(kmer):
                        n_shared += 1
                    else:
                        mes = '>{}__{}  {}||length_{};k_{}\n{}'
                        print(mes.format(name, i, desc, seq_len, K, kmer))
                ht2.count(kmer)

    elif output == 'shared':
        for n, record in enumerate(khmer.ReadParser(query)):
            #for n, record in enumerate(screed.open(query)):
            _l = record.name.split(None, 1)
            if len(_l) == 2:
                name, desc = _l
            else:
                name = _l[0]
                desc = ''
            sequence = record.sequence.replace('N', 'A')
            seq_len = len(sequence)
            if seq_len < K:
                continue
            for i in range(0, seq_len + 1 - K):
                kmer = sequence[i:i + K]

                if (not ht2.get(kmer)):
                    n_unique2 += 1
                    if ht.get(kmer):
                        n_shared += 1
                        mes = '>{}__{}  {}||length_{};k_{}\n{}'
                        print(mes.format(name, i, desc, seq_len, K, kmer))

                ht2.count(kmer)

    mes = ('Unique kmer in {} (query):\t{}\n'
           'Shared kmer:\t{}\n'
           'Unique kmer in {}:\t{}\n')

    print(mes.format(os.path.basename(query), n_unique2, n_shared, 'refs',
                     n_unique1),
          file=sys.stderr)
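
The loading loop above fans several threads out over a single ReadParser.
Stripped of the khmer specifics, the pattern is just the usual
start-then-join fan-out (a generic sketch):

import threading

def consume_in_parallel(consume, rparser, n_threads):
    # Generic sketch of the fan-out used above: every thread shares one
    # reader and calls the same consuming method until input is drained.
    threads = [threading.Thread(target=consume, args=(rparser,))
               for _ in range(n_threads)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()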
Example #34
def get_parser():
    epilog = """\
    Trimmed sequences will be placed in
    ``${input_sequence_filename}.abundfilt``.

    This script is constant memory.

    To trim reads based on k-mer abundance across multiple files, use
    :program:`load-into-counting.py` and :program:`filter-abund.py`.

    Example::

        filter-abund-single.py -k 20 -x 5e7 -C 2 data/100k-filtered.fa
    """
    parser = build_counting_args(
        descr="Trims sequences at a minimum k-mer abundance " "(in memory version).",
        epilog=textwrap.dedent(epilog),
        citations=["counting", "SeqAn"],
    )
    add_threading_args(parser)

    parser.add_argument(
        "--cutoff",
        "-C",
        default=DEFAULT_CUTOFF,
        type=check_argument_range(0, 256, "cutoff"),
        help="Trim at k-mers below this abundance.",
    )
    parser.add_argument(
        "--variable-coverage",
        "-V",
        action="store_true",
        dest="variable_coverage",
        default=False,
        help="Only trim low-abundance k-mers from sequences " "that have high coverage.",
    )
    parser.add_argument(
        "--normalize-to",
        "-Z",
        type=int,
        dest="normalize_to",
        help="Base the variable-coverage cutoff on this median" " k-mer abundance.",
        default=DEFAULT_NORMALIZE_LIMIT,
    )
    parser.add_argument(
        "--savegraph",
        metavar="filename",
        default="",
        help="If present, the name of the file to save the " "k-mer countgraph to",
    )
    parser.add_argument(
        "-o",
        "--outfile",
        metavar="optional_output_filename",
        default=None,
        help="Override default output filename " "and output trimmed sequences into a file with the " "given filename.",
    )
    parser.add_argument("datafile", metavar="input_sequence_filename", help="FAST[AQ] sequence file to trim")
    parser.add_argument("-f", "--force", default=False, action="store_true", help="Overwrite output file if it exists")
    parser.add_argument("-q", "--quiet", dest="quiet", default=False, action="store_true")
    add_output_compression_type(parser)
    return parser