Example #1
def main():
    info('estimate_optimal_hash.py', ['counting'])
    args = sanitize_help(get_parser()).parse_args()
    N = args.N
    if args.M:
        M = args.M
        result = optimal_size(N, M=M)
        print("number of estimated distinct k-mers:  ", N, file=sys.stderr)
        print("size of memory available to use:      ", M, file=sys.stderr)
        print("optimal number of hash tables:        ", result.num_htables,
              file=sys.stderr)
        print("optimal size of hash tables:          ", result.htable_size,
              file=sys.stderr)
        print("estimated false positive rate:        ", result.fp_rate,
              file=sys.stderr)
        print("estimated usage of memory:            ", result.mem_use,
              file=sys.stderr)

    elif args.f:
        f = args.f
        result = optimal_size(N, f=f)
        print("number of estimated distinct k-mers:  ", N, file=sys.stderr)
        print("desired maximum false positive rate:  ", f, file=sys.stderr)
        print("optimal number of hash tables:        ", result.num_htables,
              file=sys.stderr)
        print("optimal size of hash tables:          ", result.htable_size,
              file=sys.stderr)
        print("estimated false positive rate:        ", result.fp_rate,
              file=sys.stderr)
        print("estimated usage of memory:            ", result.mem_use,
              file=sys.stderr)
        
    else:
        get_parser().error('No action requested, add -M (size of memory available to use) or -f (desired maximum false positive rate)')
Example #2
def main():
    info('estimate_optimal_hash.py', ['counting'])
    args = sanitize_help(get_parser()).parse_args()
    N = args.N
    if args.M:
        M = args.M
        result = optimal_size(N, M=M)
        print("number of estimated distinct k-mers:  ", N, file=sys.stderr)
        print("size of memory available to use:      ", M, file=sys.stderr)
        print("optimal number of hash tables:        ",
              result.num_htables,
              file=sys.stderr)
        print("optimal size of hash tables:          ",
              result.htable_size,
              file=sys.stderr)
        print("estimated false positive rate:        ",
              result.fp_rate,
              file=sys.stderr)
        print("estimated usage of memory:            ",
              result.mem_use,
              file=sys.stderr)

    elif args.f:
        f = args.f
        result = optimal_size(N, f=f)
        print("number of estimated distinct k-mers:  ", N, file=sys.stderr)
        print("desired maximum false positive rate:  ", f, file=sys.stderr)
        print("optimal number of hash tables:        ",
              result.num_htables,
              file=sys.stderr)
        print("optimal size of hash tables:          ",
              result.htable_size,
              file=sys.stderr)
        print("estimated false positive rate:        ",
              result.fp_rate,
              file=sys.stderr)
        print("estimated usage of memory:            ",
              result.mem_use,
              file=sys.stderr)

    else:
        get_parser().error(
            'No action requested, add -M (size of memory available to use) or -f (desired maximum false positive rate)'
        )
Example #3
def test_optimal_size_function():
    res = optimal_size(99, mem_cap=1024)
    assert res.num_htables == 7, res[0]
    assert res.htable_size == 146, res[1]
    assert res.mem_use == 1022, res[2]
    assert abs(.008 - res.fp_rate) < .001, res[3]

    res = optimal_size(99, fp_rate=0.00701925498897)
    assert res.num_htables == 7, res[0]
    assert res.htable_size == 145, res[1]
    assert res.mem_use == 1015, res[2]
    assert abs(.008 - res.fp_rate) < .002, res[3]

    try:
        optimal_size(99, mem_cap=1024, fp_rate=0.00701925498897)
        assert 0, "this should fail"
    except TypeError as err:
        print(str(err))
        assert "num_kmers and either mem_cap or fp_rate" in str(err)

    try:
        optimal_size(99)
        assert 0, "this should fail"
    except TypeError as err:
        print(str(err))
        assert "num_kmers and either mem_cap or fp_rate" in str(err)
Example #4
def main():
    parser = argparse.ArgumentParser(
        description=
        "This script creates training/reference sketches for each FASTA/Q file"
        " listed in the input file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-p',
                        '--prime',
                        help='Prime (for modding hashes)',
                        default=9999999999971)
    parser.add_argument('-t',
                        '--threads',
                        type=int,
                        help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-n',
                        '--num_hashes',
                        type=int,
                        help="Number of hashes to use.",
                        default=500)
    parser.add_argument('-k',
                        '--k_size',
                        type=int,
                        help="K-mer size",
                        default=21)
    parser.add_argument(
        '-i',
        '--intersect_nodegraph',
        action="store_true",
        help=
        "Optional flag to export Nodegraph file (bloom filter) containing all k-mers in the"
        " training database. Saved in same location as out_file. This is to be used with QueryDNADatabase.py"
    )
    parser.add_argument(
        'in_file',
        help=
        "Input file: file containing (absolute) file names of training genomes."
    )
    parser.add_argument(
        'out_file',
        help='Output training database/reference file (in HDF5 format)')
    args = parser.parse_args()
    num_threads = args.threads
    prime = args.prime  # taking hashes mod this prime
    ksize = args.k_size
    if ksize > 31:
        raise Exception(
            "Unfortunately, ksize must be size 32 or smaller (due to khmer contraints). Please reduce the ksize or use MakeStreamingDNADatabase.py instead."
        )
    max_h = args.num_hashes
    input_file_names = os.path.abspath(args.in_file)
    if not os.path.exists(input_file_names):
        raise Exception("Input file %s does not exist." % input_file_names)
    out_file = os.path.abspath(args.out_file)
    if args.intersect_nodegraph is True:
        intersect_nodegraph_file = os.path.splitext(
            out_file)[0] + ".intersect.Nodegraph"
    else:
        intersect_nodegraph_file = None

    file_names = list()
    fid = open(input_file_names, 'r')
    for line in fid.readlines():
        line = line.strip()
        if not os.path.exists(line):
            raise Exception("Training genome %s does not exist." % line)
        file_names.append(line)
    fid.close()

    # Open the pool and make the sketches
    pool = Pool(processes=num_threads)
    genome_sketches = pool.map(
        make_minhash_star,
        zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))

    # Export all the sketches
    MH.export_multiple_to_single_hdf5(genome_sketches, out_file)

    # If requested, save all the k-mers into a big Nodegraph. Unfortunately, this
    # requires a second pass through the data, since we don't know a priori how
    # big a table we need to make.
    if intersect_nodegraph_file is not None:
        total_num_kmers = 0
        for sketch in genome_sketches:
            total_num_kmers += sketch._true_num_kmers
        res = optimal_size(total_num_kmers, fp_rate=0.001)
        intersect_nodegraph = khmer.Nodegraph(ksize, res.htable_size,
                                              res.num_htables)
        for file_name in file_names:
            intersect_nodegraph.consume_seqfile(file_name)
        intersect_nodegraph.save(intersect_nodegraph_file)
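
A note on the pool.map call above: pool.map passes a single argument to its worker, which is why the file names are zipped together with repeat()-ed parameters into tuples. make_minhash_star (whose body is not shown in this example) is the usual "star" wrapper for that pattern; the sketch below shows the shape of such a wrapper, with make_minhash standing in as a placeholder for the project's real per-genome sketching function.

from itertools import repeat
from multiprocessing import Pool

def make_minhash(file_name, max_h, prime, ksize):
    # Placeholder for the real per-genome sketching function (not shown in the example).
    ...

def make_minhash_star(arg):
    # pool.map delivers one object per task, so unpack the tuple produced by
    # zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)).
    return make_minhash(*arg)

# Usage mirrors the example above:
# pool = Pool(processes=num_threads)
# genome_sketches = pool.map(
#     make_minhash_star,
#     zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))
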
Example #5
def main():
    parser = argparse.ArgumentParser(
        description=
        "This script creates training/reference sketches for each FASTA/Q file"
        " listed in the input file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-p',
                        '--prime',
                        help='Prime (for modding hashes)',
                        default=9999999999971)
    parser.add_argument('-t',
                        '--threads',
                        type=int,
                        help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-n',
                        '--num_hashes',
                        type=int,
                        help="Number of hashes to use.",
                        default=500)
    parser.add_argument('-k',
                        '--k_size',
                        type=int,
                        help="K-mer size",
                        default=21)
    parser.add_argument(
        '-i',
        '--intersect_nodegraph',
        action="store_true",
        help=
        "Optional flag to export Nodegraph file (bloom filter) containing all k-mers in the"
        " training database. Saved in same location as out_file. This is to be used with QueryDNADatabase.py"
    )
    # Additional arguments for the temporary directory and the streaming/unzip/continue workflow
    parser.add_argument(
        '-d',
        '--temp_dir',
        type=str,
        help="temporary storage directory (define for continue flag)",
        default="./temp")
    parser.add_argument(
        '-s',
        '--data_stream',
        action="store_true",
        help="Optional flag to define whether the input_files are urls to stream data instead of"
             " absolute paths to files.",
        default=False)
    parser.add_argument(
        '-z',
        '--unzip_data',
        action="store_true",
        help="Optional flag to define whether the input_files are gzipped. If True, will unzip in "
             "chunks and delete unzipped fastas after use.",
        default=False)
    parser.add_argument(
        '-c',
        '--continue',
        action="store_true",
        help="Optional flag to define whether to continue sketching files defined in input file. "
             "Functionally, checks against the existing sketches in the temporary directory.",
        default=False)
    parser.add_argument(
        'in_file',
        help=
        "Input file: file containing (absolute) file names of training genomes."
    )
    parser.add_argument(
        'out_file',
        help='Output training database/reference file (in HDF5 format)')
    args = parser.parse_args()
    num_threads = args.threads
    prime = args.prime  # taking hashes mod this prime
    ksize = args.k_size
    if ksize > 31:
        raise Exception(
            "Unfortunately, ksize must be size 32 or smaller (due to khmer contraints). Please reduce the ksize or use MakeStreamingDNADatabase.py instead."
        )
    max_h = args.num_hashes
    input_file_names = os.path.abspath(args.in_file)
    if not os.path.exists(input_file_names):
        raise Exception("Input file %s does not exist." % input_file_names)
    out_file = os.path.abspath(args.out_file)
    if args.intersect_nodegraph is True:
        intersect_nodegraph_file = os.path.splitext(
            out_file)[0] + ".intersect.Nodegraph"
    else:
        intersect_nodegraph_file = None
    # Create the temporary directory if it doesn't exist
    if not os.path.isdir(args.temp_dir):
        os.mkdir(args.temp_dir)

    if args.unzip_data is True and args.data_stream is True:
        raise InputError(
            "unzip_data and data_stream flags cannot both be specified.")

    if args.unzip_data is True or args.data_stream is True:
        with open(input_file_names, 'r') as fid:
            lines = fid.readlines()
        lines = [l.strip() for l in lines]
        # just do everything in one chunk
        chunks = [lines]
        # chunk_size = 75
        # with open(input_file_names, 'r') as fid:
        #     lines = fid.readlines()
        # chunks = []
        # for i in range(int(math.ceil(len(lines) / chunk_size))):
        #     if (i+1)*chunk_size > len(lines)-1:
        #         chunks[i*chunk_size:len(lines)]
        #     else:
        #         chunks[i*chunk_size:(i+1)*chunk_size]

    genome_sketches = []

    temp_path = args.temp_dir
    if args.unzip_data:
        print("Beginning unzipping data")
        print(chunks)
        if not os.path.isdir(os.path.join(temp_path, "fastas")):
            os.mkdir(os.path.join(temp_path, "fastas"))
        for idx, chunk in enumerate(chunks):
            print("Beginning download of chunk %i of %i" % (idx, len(chunks)))
            file_names = []
            for line in chunk:
                f = unzip_file(line, os.path.join(temp_path, "fastas"))
                file_names.append(f)
                # if not check_if_pickled(line):
                #     f = unzip_file(line, os.path.join(temp_path, "fastas"))
                #     file_names.append(f)

            if len(file_names) > 0:
                print("starting sketches")
                pool = Pool(processes=num_threads)
                curr_genome_sketches = pool.map(
                    make_minhash_star,
                    zip(file_names, repeat(max_h), repeat(prime),
                        repeat(ksize)))
                genome_sketches += curr_genome_sketches

                print("removing fasta files")
                for file_name in file_names:
                    os.remove(file_name)
            else:
                print("pickled files found, continuing...")

    # New: stream each input URL to a local file, sketch it, then delete it
    elif args.data_stream:

        for idx, chunk in enumerate(chunks):
            print("Beginning download of chunk %i of %i" % (idx, len(chunks)))
            file_names = []
            for line in chunk:
                file = stream_file(line.strip())
                file_names.append(file)
            print("starting sketches")

            pool = Pool(processes=num_threads)
            curr_genome_sketches = pool.map(
                make_minhash_star,
                zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))
            genome_sketches += curr_genome_sketches

            print("removing fasta files")
            for file_name in file_names:
                os.remove(file_name)

    else:
        file_names = list()
        fid = open(input_file_names, 'r')
        for line in fid.readlines():
            line = line.strip()
            if not os.path.exists(line):
                raise Exception("Training genome %s does not exist." % line)
            file_names.append(line)
        fid.close()

        # Open the pool and make the sketches
        pool = Pool(processes=num_threads)
        genome_sketches = pool.map(
            make_minhash_star,
            zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))
    print("Beginning export to one HDF5 file")
    # Export all the sketches
    MH.export_multiple_to_single_hdf5(genome_sketches, out_file)

    # If requested, save all the k-mers into a big Nodegraph. Unfortunately, this
    # requires a second pass through the data, since we don't know a priori how
    # big a table we need to make.
    if intersect_nodegraph_file is not None:
        total_num_kmers = 0
        for sketch in genome_sketches:
            total_num_kmers += sketch._true_num_kmers
        res = optimal_size(total_num_kmers, fp_rate=0.001)
        intersect_nodegraph = khmer.Nodegraph(ksize, res.htable_size,
                                              res.num_htables)
        for file_name in file_names:
            intersect_nodegraph.consume_seqfile(file_name)
        intersect_nodegraph.save(intersect_nodegraph_file)
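
unzip_file and stream_file above are project helpers whose definitions are not included in this example. The sketch below shows, using only the standard library, what such helpers plausibly do (decompress a gzipped FASTA into the temporary directory; download a URL to a local file); the signatures are inferred from the calls above and the bodies are assumptions.

import gzip
import os
import shutil
import urllib.request

def unzip_file(gz_path, dest_dir):
    # Stream-decompress <name>.fasta.gz into dest_dir/<name>.fasta and return the new path.
    base = os.path.basename(gz_path)
    if base.endswith(".gz"):
        base = base[:-3]
    out_path = os.path.join(dest_dir, base)
    with gzip.open(gz_path, "rb") as src, open(out_path, "wb") as dst:
        shutil.copyfileobj(src, dst)  # copies in chunks; never loads the whole file into memory
    return out_path

def stream_file(url, dest_dir="."):
    # Download a remote FASTA/Q file into dest_dir and return its local path.
    out_path = os.path.join(dest_dir, os.path.basename(url))
    urllib.request.urlretrieve(url, out_path)
    return out_path
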
Example #6
def main():
    parser = argparse.ArgumentParser(
        description=
        "This script creates a CSV file of similarity indicies between the"
        " input file and each of the sketches in the training/reference file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-t',
                        '--threads',
                        type=int,
                        help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-f',
                        '--force',
                        action="store_true",
                        help="Force creation of new NodeGraph.")
    parser.add_argument('-fp',
                        '--fp_rate',
                        type=restricted_float,
                        help="False positive rate.",
                        default=0.0001)
    parser.add_argument(
        '-ct',
        '--containment_threshold',
        type=restricted_float,
        help="Only return results with containment index above this value",
        default=0.02)
    parser.add_argument(
        '-c',
        '--confidence',
        type=restricted_float,
        help=
        "Desired probability that all results were returned with containment index above threshold [-ct]",
        default=0.95)
    parser.add_argument(
        '-ng',
        '--node_graph',
        help="NodeGraph/bloom filter location. Used if it exists; if not, one "
        "will be created and put in the same directory as the specified "
        "output CSV file.",
        default=None)
    parser.add_argument(
        '-b',
        '--base_name',
        action="store_true",
        help=
        "Flag to indicate that only the base names (not the full path) should be saved in the output CSV file"
    )
    parser.add_argument(
        '-i',
        '--intersect_nodegraph',
        action="store_true",
        help=
        "Option to only insert query k-mers in bloom filter if they appear anywhere in the training"
        " database. Note that the Jaccard estimates will now be "
        "J(query intersect union_i training_i, training_i) instead of J(query, training_i), "
        "but will use significantly less space.")
    parser.add_argument('in_file',
                        help="Input file: FASTQ/A file (can be gzipped).")
    parser.add_argument(
        'training_data',
        help=
        "Training/reference data (HDF5 file created by MakeTrainingDatabase.py)"
    )
    parser.add_argument('out_csv', help='Output CSV file')

    # Parse and check args
    args = parser.parse_args()
    base_name = args.base_name
    training_data = os.path.abspath(args.training_data)
    if not os.path.exists(training_data):
        raise Exception("Training/reference file %s does not exist." %
                        training_data)
    # Let's get the k-mer sizes in the training database
    ksizes = set()
    # Import all the training data
    sketches = MH.import_multiple_from_single_hdf5(training_data)
    # Check for issues with the sketches (could also check that all the k-mers make sense, i.e. no empty strings or non-ACTG characters)
    if sketches[0]._kmers is None:
        raise Exception(
            "For some reason, the k-mers were not saved when the database was created. Try running MakeDNADatabase.py again."
        )
    num_hashes = len(sketches[0]._kmers)
    for i in range(len(sketches)):
        sketch = sketches[i]
        if sketch._kmers is None:
            raise Exception(
                "For some reason, the k-mers were not saved when the database was created. Try running MakeDNADatabase.py again."
            )
        if len(sketch._kmers) != num_hashes:
            raise Exception("Unequal number of hashes for sketch of %s" %
                            sketch.input_file_name)
        ksizes.add(sketch.ksize)
        if len(ksizes) > 1:
            raise Exception(
                "Training/reference data uses different k-mer sizes. Culprit was %s."
                % (sketch.input_file_name))
    # Get the appropriate k-mer size
    ksize = ksizes.pop()
    # Get number of threads to use
    num_threads = args.threads
    # Check and parse the query file
    query_file = os.path.abspath(args.in_file)
    if not os.path.exists(query_file):
        raise Exception("Query file %s does not exist." % query_file)
    # Node graph is stored in the output folder with name <InputFASTQ/A>.NodeGraph.K<k_size>
    if args.node_graph is None:  # If no node graph is specified, create one
        node_graph_out = os.path.join(
            os.path.dirname(os.path.abspath(args.out_csv)),
            os.path.basename(query_file) + ".NodeGraph.K" + str(ksize))
        if not os.path.exists(
                node_graph_out
        ):  # Don't complain if the default location works
            print("Node graph not provided (via -ng). Creating one at: %s" %
                  node_graph_out)
    elif os.path.exists(
            args.node_graph):  # If one is specified and it exists, use it
        node_graph_out = args.node_graph
    else:  # Otherwise, the specified one doesn't exist
        raise Exception("Provided NodeGraph %s does not exist." %
                        args.node_graph)
    # import and check the intersect nodegraph
    if args.intersect_nodegraph is True:
        intersect_nodegraph_file = os.path.splitext(
            training_data)[0] + ".intersect.Nodegraph"
    else:
        intersect_nodegraph_file = None
    intersect_nodegraph = None
    if intersect_nodegraph_file is not None:
        if not os.path.exists(intersect_nodegraph_file):
            raise Exception(
                "Intersection nodegraph does not exist. Please re-run MakeDNADatabase.py with the -i flag."
            )
        try:
            intersect_nodegraph = khmer.load_nodegraph(
                intersect_nodegraph_file)
            if intersect_nodegraph.ksize() != ksize:
                raise Exception(
                    "Given intersect nodegraph %s has K-mer size %d while the database K-mer size is %d"
                    % (intersect_nodegraph_file, intersect_nodegraph.ksize(),
                       ksize))
        except:
            raise Exception("Could not load given intersect nodegraph %s" %
                            intersect_nodegraph_file)
    results_file = os.path.abspath(args.out_csv)
    force = args.force
    fprate = args.fp_rate
    coverage_threshold = args.containment_threshold  # desired coverage cutoff
    confidence = args.confidence  # desired confidence that you got all the organisms with coverage >= desired coverage

    # Get names of training files for use as rows in returned tabular data
    training_file_names = []
    for i in range(len(sketches)):
        training_file_names.append(sketches[i].input_file_name)

    # Only form the Nodegraph if we need to
    global sample_kmers
    if not os.path.exists(node_graph_out) or force is True:
        hll = khmer.HLLCounter(0.01, ksize)
        hll.consume_seqfile(query_file)
        full_kmer_count_estimate = hll.estimate_cardinality()
        res = optimal_size(full_kmer_count_estimate, fp_rate=fprate)
        if intersect_nodegraph is None:  # If no intersect list was given, just populate the bloom filter
            sample_kmers = khmer.Nodegraph(ksize, res.htable_size,
                                           res.num_htables)
            #sample_kmers.consume_seqfile(query_file)
            rparser = khmer.ReadParser(query_file)
            threads = []
            for _ in range(num_threads):
                cur_thrd = threading.Thread(
                    target=sample_kmers.consume_seqfile_with_reads_parser,
                    args=(rparser, ))
                threads.append(cur_thrd)
                cur_thrd.start()
            for thread in threads:
                thread.join()
        else:  # Otherwise, only put a k-mer in the bloom filter if it's in the intersect list
            # (WARNING: this will cause the Jaccard index to be calculated in terms of
            #  J(query ∩ hash_list, training) instead of J(query, training).)
            # (TODO: fix this after khmer is updated)
            # intersect_nodegraph_kmer_count = intersect_nodegraph.n_unique_kmers()  # Doesn't work due to a khmer bug
            intersect_nodegraph_kmer_count = intersect_nodegraph.n_occupied(
            )  # Not technically correct, but I need to wait until khmer is updated
            if intersect_nodegraph_kmer_count < full_kmer_count_estimate:  # At max, we have as many k-mers as in the union of the training database (But makes this always return 0)
                res = optimal_size(intersect_nodegraph_kmer_count,
                                   fp_rate=fprate)
                sample_kmers = khmer.Nodegraph(ksize, res.htable_size,
                                               res.num_htables)
            else:
                sample_kmers = khmer.Nodegraph(ksize, res.htable_size,
                                               res.num_htables)
            for record in screed.open(query_file):
                seq = record.sequence
                for i in range(len(seq) - ksize + 1):
                    kmer = seq[i:i + ksize]
                    if intersect_nodegraph.get(kmer) > 0:
                        sample_kmers.add(kmer)
        # Save the sample_kmers
        sample_kmers.save(node_graph_out)
        true_fprate = khmer.calc_expected_collisions(sample_kmers,
                                                     max_false_pos=0.99)
    else:
        sample_kmers = khmer.load_nodegraph(node_graph_out)
        node_ksize = sample_kmers.ksize()
        if node_ksize != ksize:
            raise Exception(
                "Node graph %s has wrong k-mer size of %d (input was %d). Try --force or change -k."
                % (node_graph_out, node_ksize, ksize))
        true_fprate = khmer.calc_expected_collisions(sample_kmers,
                                                     max_false_pos=0.99)

    #num_sample_kmers = sample_kmers.n_unique_kmers()  # For some reason this only works when creating a new node graph, use the following instead
    num_sample_kmers = sample_kmers.n_occupied()

    # Compute all the indices for all the training data
    pool = Pool(processes=num_threads)
    res = pool.map(
        unwrap_compute_indicies,
        zip(sketches, repeat(num_sample_kmers), repeat(true_fprate)))

    # Gather up the results in a nice form
    intersection_cardinalities = np.zeros(len(sketches))
    containment_indexes = np.zeros(len(sketches))
    jaccard_indexes = np.zeros(len(sketches))
    for i in range(len(res)):
        (intersection_cardinality, containment_index, jaccard_index) = res[i]
        intersection_cardinalities[i] = intersection_cardinality
        containment_indexes[i] = containment_index
        jaccard_indexes[i] = jaccard_index

    d = {
        'intersection': intersection_cardinalities,
        'containment index': containment_indexes,
        'jaccard index': jaccard_indexes
    }
    # Use only the basenames to label the rows (if requested)
    if base_name is True:
        df = pd.DataFrame(d, map(os.path.basename, training_file_names))
    else:
        df = pd.DataFrame(d, training_file_names)

    # Only get the rows above a certain threshold
    if coverage_threshold <= 0:
        est_threshold = 0
    else:
        est_threshold = threshold_calc(num_hashes, coverage_threshold, fprate,
                                       confidence)
    filtered_results = df[df['containment index'] > est_threshold].sort_values(
        'containment index', ascending=False)
    # Export the results
    filtered_results.to_csv(results_file, index=True, encoding='utf-8')
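
The per-sketch work hidden behind unwrap_compute_indicies is not shown in this example. A plausible reconstruction is sketched below, assuming each training sketch's stored k-mers are probed against the sample Bloom filter, the raw containment is corrected for the filter's false positive rate, and the Jaccard index is then derived from the containment estimate; the exact correction CMash applies may differ, so treat this purely as an illustration.

def compute_indices(sketch, sample_kmers, num_sample_kmers, true_fprate):
    # Hypothetical per-sketch computation: returns (intersection cardinality,
    # containment index, Jaccard index) for one training sketch versus the
    # sample Bloom filter (passed explicitly here rather than via a global).
    kmers = sketch._kmers
    num_hashes = len(kmers)
    # Count sketch k-mers that appear to be present in the sample.
    hits = sum(1 for kmer in kmers if sample_kmers.get(kmer) > 0)
    # Correct the raw containment estimate for Bloom filter false positives.
    containment = max((hits / float(num_hashes)) - true_fprate, 0) / (1 - true_fprate)
    # Scale by the sketch's true k-mer count to estimate |A ∩ B|.
    intersection = containment * sketch._true_num_kmers
    # Jaccard = |A ∩ B| / |A ∪ B| with |A ∪ B| = |A| + |B| - |A ∩ B|.
    jaccard = intersection / (sketch._true_num_kmers + num_sample_kmers - intersection)
    return intersection, containment, jaccard
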
Example #7
def main():
    parser = argparse.ArgumentParser(
        description=
        "This script will create node graph for a given k-mer size and query file (can be used as input to QueryDNADatabase.py)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-fp',
                        '--fp_rate',
                        type=restricted_float,
                        help="False positive rate.",
                        default=0.0001)
    parser.add_argument(
        '-i',
        '--intersect_nodegraph',
        help=
        "Location of Node Graph. Will only insert query k-mers in bloom filter if they appear anywhere in the training"
        " database. Note that the Jaccard estimates will now be "
        "J(query intersect union_i training_i, training_i) instead of J(query, training_i), "
        "but will use significantly less space (unfortunately will also disable threading)."
    )
    parser.add_argument('-k',
                        '--k_size',
                        type=int,
                        help="K-mer size",
                        default=21)
    parser.add_argument('-t',
                        '--threads',
                        type=int,
                        help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('in_file',
                        help="Input file: FASTQ/A file (can be gzipped).")
    parser.add_argument('out_dir', help='Output directory')

    # Parse and check args
    args = parser.parse_args()
    query_file = os.path.abspath(args.in_file)
    ksize = args.k_size
    num_threads = args.threads
    node_graph_out = os.path.join(
        os.path.abspath(args.out_dir),
        os.path.basename(query_file) + ".NodeGraph.K" + str(ksize))
    if args.intersect_nodegraph is not None:
        intersect_nodegraph_file = args.intersect_nodegraph
    else:
        intersect_nodegraph_file = None
    intersect_nodegraph = None
    if intersect_nodegraph_file is not None:
        if not os.path.exists(intersect_nodegraph_file):
            raise Exception(
                "Intersection nodegraph does not exist. Please re-run MakeDNADatabase.py with the -i flag."
            )
        try:
            intersect_nodegraph = khmer.load_nodegraph(
                intersect_nodegraph_file)
            if intersect_nodegraph.ksize() != ksize:
                raise Exception(
                    "Given intersect nodegraph %s has K-mer size %d while the database K-mer size is %d"
                    % (intersect_nodegraph_file, intersect_nodegraph.ksize(),
                       ksize))
        except:
            raise Exception("Could not load given intersect nodegraph %s" %
                            intersect_nodegraph_file)
    fprate = args.fp_rate
    hll = khmer.HLLCounter(0.01, ksize)
    hll.consume_seqfile(query_file)
    full_kmer_count_estimate = hll.estimate_cardinality()
    res = optimal_size(full_kmer_count_estimate, fp_rate=fprate)
    if intersect_nodegraph is None:  # If no intersect list was given, just populate the bloom filter
        sample_kmers = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
        #sample_kmers.consume_seqfile(query_file)
        rparser = khmer.ReadParser(query_file)
        threads = []
        for _ in range(num_threads):
            cur_thrd = threading.Thread(
                target=sample_kmers.consume_seqfile_with_reads_parser,
                args=(rparser, ))
            threads.append(cur_thrd)
            cur_thrd.start()
        for thread in threads:
            thread.join()
    else:  # Otherwise, only put a k-mer in the bloom filter if it's in the intersect list
        # (WARNING: this will cause the Jaccard index to be calculated in terms of
        #  J(query ∩ hash_list, training) instead of J(query, training).)
        # (TODO: fix this after khmer is updated)
        # intersect_nodegraph_kmer_count = intersect_nodegraph.n_unique_kmers()  # Doesn't work due to a khmer bug
        intersect_nodegraph_kmer_count = intersect_nodegraph.n_occupied()  # Not technically correct; a workaround until khmer is updated
        if intersect_nodegraph_kmer_count < full_kmer_count_estimate:  # At max, we have as many k-mers as in the union of the training database (But makes this always return 0)
            res = optimal_size(intersect_nodegraph_kmer_count, fp_rate=fprate)
            sample_kmers = khmer.Nodegraph(ksize, res.htable_size,
                                           res.num_htables)
        else:
            sample_kmers = khmer.Nodegraph(ksize, res.htable_size,
                                           res.num_htables)
        for record in screed.open(query_file):
            seq = record.sequence
            for i in range(len(seq) - ksize + 1):
                kmer = seq[i:i + ksize]
                if intersect_nodegraph.get(kmer) > 0:
                    sample_kmers.add(kmer)
    # Save the sample_kmers
    sample_kmers.save(node_graph_out)