def main():
    info('estimate_optimal_hash.py', ['counting'])
    args = sanitize_help(get_parser()).parse_args()
    N = args.N
    if args.M:
        M = args.M
        result = optimal_size(N, M=M)
        print("number of estimated distinct k-mers: ", N, file=sys.stderr)
        print("size of memory available to use: ", M, file=sys.stderr)
        print("optimal number of hash tables: ", result.num_htables, file=sys.stderr)
        print("optimal size of hash tables: ", result.htable_size, file=sys.stderr)
        print("estimated false positive rate: ", result.fp_rate, file=sys.stderr)
        print("estimated usage of memory: ", result.mem_use, file=sys.stderr)
    elif args.f:
        f = args.f
        result = optimal_size(N, f=f)
        print("number of estimated distinct k-mers: ", N, file=sys.stderr)
        print("desired maximum false positive rate: ", f, file=sys.stderr)
        print("optimal number of hash tables: ", result.num_htables, file=sys.stderr)
        print("optimal size of hash tables: ", result.htable_size, file=sys.stderr)
        print("estimated false positive rate: ", result.fp_rate, file=sys.stderr)
        print("estimated usage of memory: ", result.mem_use, file=sys.stderr)
    else:
        get_parser().error('No action requested, add -M (size of memory available '
                           'to use) or -f (desired maximum false positive rate)')
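
# For context: a minimal sketch of the Bloom filter sizing math that optimal_size
# appears to implement for the memory-capped path. The function name and the even
# byte split across tables are assumptions, not the library's actual code; the
# output matches the constants asserted in the tests below.
import math
from collections import namedtuple

OptimalSize = namedtuple('OptimalSize', ['num_htables', 'htable_size', 'mem_use', 'fp_rate'])

def optimal_size_from_mem(num_kmers, mem_cap):
    # Optimal table count for N k-mers in M bytes: Z = ln(2) * (M / N), at least 1.
    num_htables = max(1, int(math.log(2) * mem_cap / num_kmers))
    htable_size = int(mem_cap / num_htables)  # split the byte budget evenly
    mem_use = num_htables * htable_size
    # Expected false positive rate of Z tables of size H, each holding N k-mers.
    fp_rate = (1 - math.exp(-num_kmers / float(htable_size))) ** num_htables
    return OptimalSize(num_htables, htable_size, mem_use, fp_rate)

print(optimal_size_from_mem(99, 1024))  # ~ (7, 146, 1022, 0.0070), as in the tests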
def test_optimal_size_function():
    res = optimal_size(99, mem_cap=1024)
    assert res.num_htables == 7, res[0]
    assert res.htable_size == 146, res[1]
    assert res.mem_use == 1022, res[2]
    assert abs(.008 - res.fp_rate) < .001, res[3]

    res = optimal_size(99, fp_rate=0.00701925498897)
    assert res.num_htables == 7, res[0]
    assert res.htable_size == 145, res[1]
    assert res.mem_use == 1015, res[2]
    assert abs(.008 - res.fp_rate) < .002, res[3]

    try:
        optimal_size(99, mem_cap=1024, fp_rate=0.00701925498897)
        assert 0, "this should fail"
    except TypeError as err:
        print(str(err))
        assert "num_kmers and either mem_cap or fp_rate" in str(err)

    try:
        optimal_size(99)
        assert 0, "this should fail"
    except TypeError as err:
        print(str(err))
        assert "num_kmers and either mem_cap or fp_rate" in str(err)
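
# Sanity check on the memory accounting in the assertions above:
# mem_use is exactly num_htables * htable_size in both calling conventions.
assert 7 * 146 == 1022  # optimal_size(99, mem_cap=1024)
assert 7 * 145 == 1015  # optimal_size(99, fp_rate=0.00701925498897)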
def main():
    parser = argparse.ArgumentParser(
        description="This script creates training/reference sketches for each FASTA/Q file"
                    " listed in the input file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-p', '--prime', help='Prime (for modding hashes)',
                        default=9999999999971)
    parser.add_argument('-t', '--threads', type=int,
                        help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-n', '--num_hashes', type=int,
                        help="Number of hashes to use.", default=500)
    parser.add_argument('-k', '--k_size', type=int, help="K-mer size", default=21)
    parser.add_argument('-i', '--intersect_nodegraph', action="store_true",
                        help="Optional flag to export a Nodegraph file (Bloom filter) containing all"
                             " k-mers in the training database. Saved in the same location as out_file."
                             " This is to be used with QueryDNADatabase.py.")
    parser.add_argument('in_file',
                        help="Input file: file containing (absolute) file names of training genomes.")
    parser.add_argument('out_file',
                        help='Output training database/reference file (in HDF5 format)')
    args = parser.parse_args()
    num_threads = args.threads
    prime = args.prime  # taking hashes mod this prime
    ksize = args.k_size
    if ksize > 31:
        raise Exception("Unfortunately, ksize must be 31 or smaller (due to khmer constraints)."
                        " Please reduce the ksize or use MakeStreamingDNADatabase.py instead.")
    max_h = args.num_hashes
    input_file_names = os.path.abspath(args.in_file)
    if not os.path.exists(input_file_names):
        raise Exception("Input file %s does not exist." % input_file_names)
    out_file = os.path.abspath(args.out_file)
    if args.intersect_nodegraph is True:
        intersect_nodegraph_file = os.path.splitext(out_file)[0] + ".intersect.Nodegraph"
    else:
        intersect_nodegraph_file = None

    file_names = list()
    fid = open(input_file_names, 'r')
    for line in fid.readlines():
        line = line.strip()
        if not os.path.exists(line):
            raise Exception("Training genome %s does not exist." % line)
        file_names.append(line)
    fid.close()

    # Open the pool and make the sketches
    pool = Pool(processes=num_threads)
    genome_sketches = pool.map(
        make_minhash_star,
        zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))

    # Export all the sketches
    MH.export_multiple_to_single_hdf5(genome_sketches, out_file)

    # If requested, save all the k-mers into a big Nodegraph. Unfortunately, this
    # requires a second pass through the data, since we don't know a priori how
    # big a table we need to make.
    if intersect_nodegraph_file is not None:
        total_num_kmers = 0
        for sketch in genome_sketches:
            total_num_kmers += sketch._true_num_kmers
        res = optimal_size(total_num_kmers, fp_rate=0.001)
        intersect_nodegraph = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
        for file_name in file_names:
            intersect_nodegraph.consume_seqfile(file_name)
        intersect_nodegraph.save(intersect_nodegraph_file)
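
# Pool.map only passes a single argument to its worker, which is why the
# parameters above are zipped with itertools.repeat and unpacked by
# make_minhash_star. A self-contained illustration of that pattern, with a
# stand-in worker in place of the real sketching function (all names here are
# illustrative only):
from itertools import repeat
from multiprocessing import Pool

def sketch_one(file_name, max_h, prime, ksize):  # stand-in for make_minhash
    return (file_name, max_h, prime, ksize)

def sketch_one_star(arg):  # same role as make_minhash_star: unpack the zipped tuple
    return sketch_one(*arg)

if __name__ == '__main__':
    file_names = ['genome_a.fna', 'genome_b.fna']
    with Pool(processes=2) as pool:
        results = pool.map(sketch_one_star,
                           zip(file_names, repeat(500), repeat(9999999999971), repeat(21)))
    print(results)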
def main():
    parser = argparse.ArgumentParser(
        description="This script creates training/reference sketches for each FASTA/Q file"
                    " listed in the input file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-p', '--prime', help='Prime (for modding hashes)',
                        default=9999999999971)
    parser.add_argument('-t', '--threads', type=int,
                        help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-n', '--num_hashes', type=int,
                        help="Number of hashes to use.", default=500)
    parser.add_argument('-k', '--k_size', type=int, help="K-mer size", default=21)
    parser.add_argument('-i', '--intersect_nodegraph', action="store_true",
                        help="Optional flag to export a Nodegraph file (Bloom filter) containing all"
                             " k-mers in the training database. Saved in the same location as out_file."
                             " This is to be used with QueryDNADatabase.py.")
    parser.add_argument('-d', '--temp_dir', type=str,
                        help="Temporary storage directory (define for the continue flag)",
                        default="./temp")
    parser.add_argument('-s', '--data_stream', action="store_true",
                        help="Optional flag to define whether the input files are URLs to stream data"
                             " from, instead of absolute paths to files.",
                        default=False)
    parser.add_argument('-z', '--unzip_data', action="store_true",
                        help="Optional flag to define whether the input files are gzipped. If set, will"
                             " unzip in chunks and delete the unzipped FASTAs after use.",
                        default=False)
    parser.add_argument('-c', '--continue', action="store_true",
                        help="Optional flag to define whether to continue sketching files defined in the"
                             " input file. Functionally, checks against the existing sketches in the"
                             " temporary directory.",
                        default=False)
    parser.add_argument('in_file',
                        help="Input file: file containing (absolute) file names of training genomes.")
    parser.add_argument('out_file',
                        help='Output training database/reference file (in HDF5 format)')
    args = parser.parse_args()
    num_threads = args.threads
    prime = args.prime  # taking hashes mod this prime
    ksize = args.k_size
    if ksize > 31:
        raise Exception("Unfortunately, ksize must be 31 or smaller (due to khmer constraints)."
                        " Please reduce the ksize or use MakeStreamingDNADatabase.py instead.")
    max_h = args.num_hashes
    input_file_names = os.path.abspath(args.in_file)
    if not os.path.exists(input_file_names):
        raise Exception("Input file %s does not exist." % input_file_names)
    out_file = os.path.abspath(args.out_file)
    if args.intersect_nodegraph is True:
        intersect_nodegraph_file = os.path.splitext(out_file)[0] + ".intersect.Nodegraph"
    else:
        intersect_nodegraph_file = None

    # Create the temporary directory if it doesn't exist
    if not os.path.isdir(args.temp_dir):
        os.mkdir(args.temp_dir)

    if args.unzip_data is True and args.data_stream is True:
        raise ValueError("unzip_data and data_stream flags cannot both be specified.")
    if args.unzip_data is True or args.data_stream is True:
        with open(input_file_names, 'r') as fid:
            lines = fid.readlines()
        lines = [line.strip() for line in lines]
        # just do everything in one chunk
        chunks = [lines]
        # chunk_size = 75
        # with open(input_file_names, 'r') as fid:
        #     lines = fid.readlines()
        # chunks = []
        # for i in range(int(math.ceil(len(lines) / chunk_size))):
        #     if (i+1)*chunk_size > len(lines)-1:
        #         chunks[i*chunk_size:len(lines)]
        #     else:
        #         chunks[i*chunk_size:(i+1)*chunk_size]

    genome_sketches = []
    temp_path = args.temp_dir
    if args.unzip_data:
        print("Beginning unzipping data")
        print(chunks)
        if not os.path.isdir(os.path.join(temp_path, "fastas")):
            os.mkdir(os.path.join(temp_path, "fastas"))
        for idx, chunk in enumerate(chunks):
            print("Beginning download of chunk %i of %i" % (idx, len(chunks)))
            file_names = []
            for line in chunk:
                f = unzip_file(line, os.path.join(temp_path, "fastas"))
                file_names.append(f)
                # if not check_if_pickled(line):
                #     f = unzip_file(line, os.path.join(temp_path, "fastas"))
                #     file_names.append(f)
            if len(file_names) > 0:
                print("starting sketches")
                pool = Pool(processes=num_threads)
                curr_genome_sketches = pool.map(
                    make_minhash_star,
                    zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))
                genome_sketches += curr_genome_sketches
                print("removing fasta files")
                for file_name in file_names:
                    os.remove(file_name)
            else:
                print("pickled files found, continuing...")
    elif args.data_stream:
        for idx, chunk in enumerate(chunks):
            print("Beginning download of chunk %i of %i" % (idx, len(chunks)))
            file_names = []
            for line in chunk:
                fname = stream_file(line.strip())
                file_names.append(fname)
            print("starting sketches")
            pool = Pool(processes=num_threads)
            curr_genome_sketches = pool.map(
                make_minhash_star,
                zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))
            genome_sketches += curr_genome_sketches
            print("removing fasta files")
            for file_name in file_names:
                os.remove(file_name)
    else:
        file_names = list()
        fid = open(input_file_names, 'r')
        for line in fid.readlines():
            line = line.strip()
            if not os.path.exists(line):
                raise Exception("Training genome %s does not exist." % line)
            file_names.append(line)
        fid.close()
        # Open the pool and make the sketches
        pool = Pool(processes=num_threads)
        genome_sketches = pool.map(
            make_minhash_star,
            zip(file_names, repeat(max_h), repeat(prime), repeat(ksize)))

    print("Beginning export to one HDF5 file")
    # Export all the sketches
    MH.export_multiple_to_single_hdf5(genome_sketches, out_file)

    # If requested, save all the k-mers into a big Nodegraph. Unfortunately, this
    # requires a second pass through the data, since we don't know a priori how
    # big a table we need to make.
    if intersect_nodegraph_file is not None:
        total_num_kmers = 0
        for sketch in genome_sketches:
            total_num_kmers += sketch._true_num_kmers
        res = optimal_size(total_num_kmers, fp_rate=0.001)
        intersect_nodegraph = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
        for file_name in file_names:
            intersect_nodegraph.consume_seqfile(file_name)
        intersect_nodegraph.save(intersect_nodegraph_file)
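
# The helpers stream_file and unzip_file are called above but not defined in this
# section. Minimal standard-library sketches of what they plausibly do (names,
# signatures, and destination handling here are assumptions):
import gzip
import os
import shutil
import urllib.request

def stream_file(url, dest_dir="./temp"):
    """Download a FASTA/Q file from a URL into dest_dir; return the local path."""
    local_path = os.path.join(dest_dir, os.path.basename(url))
    with urllib.request.urlopen(url) as response, open(local_path, 'wb') as out:
        shutil.copyfileobj(response, out)
    return local_path

def unzip_file(gz_path, dest_dir):
    """Decompress a gzipped FASTA into dest_dir; return the unzipped path."""
    base = os.path.splitext(os.path.basename(gz_path))[0]
    out_path = os.path.join(dest_dir, base)
    with gzip.open(gz_path, 'rb') as fin, open(out_path, 'wb') as fout:
        shutil.copyfileobj(fin, fout)
    return out_path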
def main():
    parser = argparse.ArgumentParser(
        description="This script creates a CSV file of similarity indices between the"
                    " input file and each of the sketches in the training/reference file.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-t', '--threads', type=int,
                        help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('-f', '--force', action="store_true",
                        help="Force creation of new NodeGraph.")
    parser.add_argument('-fp', '--fp_rate', type=restricted_float,
                        help="False positive rate.", default=0.0001)
    parser.add_argument('-ct', '--containment_threshold', type=restricted_float,
                        help="Only return results with containment index above this value",
                        default=0.02)
    parser.add_argument('-c', '--confidence', type=restricted_float,
                        help="Desired probability that all results were returned with containment"
                             " index above threshold [-ct]",
                        default=0.95)
    parser.add_argument('-ng', '--node_graph',
                        help="NodeGraph/bloom filter location. Used if it exists; if not, one"
                             " will be created and put in the same directory as the specified"
                             " output CSV file.",
                        default=None)
    parser.add_argument('-b', '--base_name', action="store_true",
                        help="Flag to indicate that only the base names (not the full path) should"
                             " be saved in the output CSV file")
    parser.add_argument('-i', '--intersect_nodegraph', action="store_true",
                        help="Option to only insert query k-mers into the bloom filter if they appear"
                             " anywhere in the training database. Note that the Jaccard estimates will"
                             " then be J(query intersect union_i training_i, training_i) instead of"
                             " J(query, training_i), but will use significantly less space.")
    parser.add_argument('in_file', help="Input file: FASTQ/A file (can be gzipped).")
    parser.add_argument('training_data',
                        help="Training/reference data (HDF5 file created by MakeTrainingDatabase.py)")
    parser.add_argument('out_csv', help='Output CSV file')

    # Parse and check args
    args = parser.parse_args()
    base_name = args.base_name
    training_data = os.path.abspath(args.training_data)
    if not os.path.exists(training_data):
        raise Exception("Training/reference file %s does not exist." % training_data)
    # Let's get the k-mer sizes in the training database
    ksizes = set()
    # Import all the training data
    sketches = MH.import_multiple_from_single_hdf5(training_data)
    # Check for issues with the sketches (could also check that all the k-mers make
    # sense, i.e. no '' or non-ACTG characters)
    if sketches[0]._kmers is None:
        raise Exception("For some reason, the k-mers were not saved when the database was created."
                        " Try running MakeDNADatabase.py again.")
    num_hashes = len(sketches[0]._kmers)
    for i in range(len(sketches)):
        sketch = sketches[i]
        if sketch._kmers is None:
            raise Exception("For some reason, the k-mers were not saved when the database was"
                            " created. Try running MakeDNADatabase.py again.")
        if len(sketch._kmers) != num_hashes:
            raise Exception("Unequal number of hashes for sketch of %s" % sketch.input_file_name)
        ksizes.add(sketch.ksize)
        if len(ksizes) > 1:
            raise Exception("Training/reference data uses different k-mer sizes. Culprit was %s."
                            % (sketch.input_file_name))
    # Get the appropriate k-mer size
    ksize = ksizes.pop()
    # Get number of threads to use
    num_threads = args.threads
    # Check and parse the query file
    query_file = os.path.abspath(args.in_file)
    if not os.path.exists(query_file):
        raise Exception("Query file %s does not exist." % query_file)

    # The node graph is stored in the output folder with name <InputFASTQ/A>.NodeGraph.K<k_size>
    if args.node_graph is None:  # If no node graph is specified, create one
        node_graph_out = os.path.join(
            os.path.dirname(os.path.abspath(args.out_csv)),
            os.path.basename(query_file) + ".NodeGraph.K" + str(ksize))
        if not os.path.exists(node_graph_out):  # Don't complain if the default location works
            print("Node graph not provided (via -ng). Creating one at: %s" % node_graph_out)
    elif os.path.exists(args.node_graph):  # If one is specified and it exists, use it
        node_graph_out = args.node_graph
    else:  # Otherwise, the specified one doesn't exist
        raise Exception("Provided NodeGraph %s does not exist." % args.node_graph)

    # Import and check the intersect nodegraph
    if args.intersect_nodegraph is True:
        intersect_nodegraph_file = os.path.splitext(training_data)[0] + ".intersect.Nodegraph"
    else:
        intersect_nodegraph_file = None
    intersect_nodegraph = None
    if intersect_nodegraph_file is not None:
        if not os.path.exists(intersect_nodegraph_file):
            raise Exception("Intersection nodegraph does not exist."
                            " Please re-run MakeDNADatabase.py with the -i flag.")
        try:
            intersect_nodegraph = khmer.load_nodegraph(intersect_nodegraph_file)
            if intersect_nodegraph.ksize() != ksize:
                raise Exception("Given intersect nodegraph %s has k-mer size %d while the database"
                                " k-mer size is %d"
                                % (intersect_nodegraph_file, intersect_nodegraph.ksize(), ksize))
        except Exception:
            raise Exception("Could not load given intersect nodegraph %s"
                            % intersect_nodegraph_file)

    results_file = os.path.abspath(args.out_csv)
    force = args.force
    fprate = args.fp_rate
    coverage_threshold = args.containment_threshold  # desired coverage cutoff
    confidence = args.confidence  # desired confidence that all organisms with coverage >= the cutoff were returned

    # Get names of training files for use as rows in the returned tabular data
    training_file_names = []
    for i in range(len(sketches)):
        training_file_names.append(sketches[i].input_file_name)

    # Only form the Nodegraph if we need to
    global sample_kmers
    if not os.path.exists(node_graph_out) or force is True:
        hll = khmer.HLLCounter(0.01, ksize)
        hll.consume_seqfile(query_file)
        full_kmer_count_estimate = hll.estimate_cardinality()
        res = optimal_size(full_kmer_count_estimate, fp_rate=fprate)
        if intersect_nodegraph is None:  # If no intersect list was given, just populate the bloom filter
            sample_kmers = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
            # sample_kmers.consume_seqfile(query_file)
            rparser = khmer.ReadParser(query_file)
            threads = []
            for _ in range(num_threads):
                cur_thrd = threading.Thread(
                    target=sample_kmers.consume_seqfile_with_reads_parser,
                    args=(rparser,))
                threads.append(cur_thrd)
                cur_thrd.start()
            for thread in threads:
                thread.join()
        else:
            # Otherwise, only put a k-mer in the bloom filter if it's in the intersect list.
            # (WARNING: this will cause the Jaccard index to be calculated in terms of
            # J(query intersect hash_list, training) instead of J(query, training).)
            # (TODO: fix this after khmer is updated)
            # intersect_nodegraph_kmer_count = intersect_nodegraph.n_unique_kmers()  # Doesn't work due to a khmer bug
            intersect_nodegraph_kmer_count = intersect_nodegraph.n_occupied()  # Not technically correct, but needed until khmer is updated
            if intersect_nodegraph_kmer_count < full_kmer_count_estimate:
                # At most, we have as many k-mers as in the union of the training database
                # (but this makes it always return 0)
                res = optimal_size(intersect_nodegraph_kmer_count, fp_rate=fprate)
                sample_kmers = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
            else:
                sample_kmers = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
            for record in screed.open(query_file):
                seq = record.sequence
                for i in range(len(seq) - ksize + 1):
                    kmer = seq[i:i + ksize]
                    if intersect_nodegraph.get(kmer) > 0:
                        sample_kmers.add(kmer)
        # Save the sample_kmers
        sample_kmers.save(node_graph_out)
        true_fprate = khmer.calc_expected_collisions(sample_kmers, max_false_pos=0.99)
    else:
        sample_kmers = khmer.load_nodegraph(node_graph_out)
        node_ksize = sample_kmers.ksize()
        if node_ksize != ksize:
            raise Exception("Node graph %s has wrong k-mer size of %d (input was %d)."
                            " Try --force or change -k."
                            % (node_graph_out, node_ksize, ksize))
        true_fprate = khmer.calc_expected_collisions(sample_kmers, max_false_pos=0.99)

    # num_sample_kmers = sample_kmers.n_unique_kmers()  # For some reason this only works
    # when creating a new node graph, so use the following instead
    num_sample_kmers = sample_kmers.n_occupied()

    # Compute all the indices for all the training data
    pool = Pool(processes=num_threads)
    res = pool.map(
        unwrap_compute_indicies,
        zip(sketches, repeat(num_sample_kmers), repeat(true_fprate)))

    # Gather up the results in a nice form
    intersection_cardinalities = np.zeros(len(sketches))
    containment_indexes = np.zeros(len(sketches))
    jaccard_indexes = np.zeros(len(sketches))
    for i in range(len(res)):
        (intersection_cardinality, containment_index, jaccard_index) = res[i]
        intersection_cardinalities[i] = intersection_cardinality
        containment_indexes[i] = containment_index
        jaccard_indexes[i] = jaccard_index
    d = {'intersection': intersection_cardinalities,
         'containment index': containment_indexes,
         'jaccard index': jaccard_indexes}
    # Use only the basenames to label the rows (if requested)
    if base_name is True:
        df = pd.DataFrame(d, list(map(os.path.basename, training_file_names)))
    else:
        df = pd.DataFrame(d, training_file_names)

    # Only get the rows above a certain threshold
    if coverage_threshold <= 0:
        est_threshold = 0
    else:
        est_threshold = threshold_calc(num_hashes, coverage_threshold, fprate, confidence)
    filtered_results = df[df['containment index'] > est_threshold].sort_values(
        'containment index', ascending=False)
    # Export the results
    filtered_results.to_csv(results_file, index=True, encoding='utf-8')
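
# Downstream, the CSV written above can be inspected with pandas; 'results.csv'
# is a placeholder for whatever path was passed as out_csv. The column names
# match the dictionary keys used when building the DataFrame.
import pandas as pd

results = pd.read_csv('results.csv', index_col=0)
top_hits = results.sort_values('containment index', ascending=False).head(10)
print(top_hits[['containment index', 'jaccard index']])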
def main():
    parser = argparse.ArgumentParser(
        description="This script will create a node graph for a given k-mer size and query file"
                    " (can be used as input to QueryDNADatabase.py).",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-fp', '--fp_rate', type=restricted_float,
                        help="False positive rate.", default=0.0001)
    parser.add_argument('-i', '--intersect_nodegraph',
                        help="Location of the Node Graph. Will only insert query k-mers into the bloom"
                             " filter if they appear anywhere in the training database. Note that the"
                             " Jaccard estimates will then be J(query intersect union_i training_i,"
                             " training_i) instead of J(query, training_i), but will use significantly"
                             " less space (unfortunately, this also disables threading).")
    parser.add_argument('-k', '--k_size', type=int, help="K-mer size", default=21)
    parser.add_argument('-t', '--threads', type=int,
                        help="Number of threads to use",
                        default=multiprocessing.cpu_count())
    parser.add_argument('in_file', help="Input file: FASTQ/A file (can be gzipped).")
    parser.add_argument('out_dir', help='Output directory')

    # Parse and check args
    args = parser.parse_args()
    query_file = os.path.abspath(args.in_file)
    ksize = args.k_size
    num_threads = args.threads
    node_graph_out = os.path.join(
        os.path.abspath(args.out_dir),
        os.path.basename(query_file) + ".NodeGraph.K" + str(ksize))
    if args.intersect_nodegraph is not None:
        intersect_nodegraph_file = args.intersect_nodegraph
    else:
        intersect_nodegraph_file = None
    intersect_nodegraph = None
    if intersect_nodegraph_file is not None:
        if not os.path.exists(intersect_nodegraph_file):
            raise Exception("Intersection nodegraph does not exist."
                            " Please re-run MakeDNADatabase.py with the -i flag.")
        try:
            intersect_nodegraph = khmer.load_nodegraph(intersect_nodegraph_file)
            if intersect_nodegraph.ksize() != ksize:
                raise Exception("Given intersect nodegraph %s has k-mer size %d while the database"
                                " k-mer size is %d"
                                % (intersect_nodegraph_file, intersect_nodegraph.ksize(), ksize))
        except Exception:
            raise Exception("Could not load given intersect nodegraph %s"
                            % intersect_nodegraph_file)

    fprate = args.fp_rate
    hll = khmer.HLLCounter(0.01, ksize)
    hll.consume_seqfile(query_file)
    full_kmer_count_estimate = hll.estimate_cardinality()
    res = optimal_size(full_kmer_count_estimate, fp_rate=fprate)
    if intersect_nodegraph is None:  # If no intersect list was given, just populate the bloom filter
        sample_kmers = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
        # sample_kmers.consume_seqfile(query_file)
        rparser = khmer.ReadParser(query_file)
        threads = []
        for _ in range(num_threads):
            cur_thrd = threading.Thread(
                target=sample_kmers.consume_seqfile_with_reads_parser,
                args=(rparser,))
            threads.append(cur_thrd)
            cur_thrd.start()
        for thread in threads:
            thread.join()
    else:
        # Otherwise, only put a k-mer in the bloom filter if it's in the intersect list.
        # (WARNING: this will cause the Jaccard index to be calculated in terms of
        # J(query intersect hash_list, training) instead of J(query, training).)
        # (TODO: fix this after khmer is updated)
        # intersect_nodegraph_kmer_count = intersect_nodegraph.n_unique_kmers()  # Doesn't work due to a khmer bug
        intersect_nodegraph_kmer_count = intersect_nodegraph.n_occupied()  # Not technically correct, but needed until khmer is updated
        if intersect_nodegraph_kmer_count < full_kmer_count_estimate:
            # At most, we have as many k-mers as in the union of the training database
            # (but this makes it always return 0)
            res = optimal_size(intersect_nodegraph_kmer_count, fp_rate=fprate)
            sample_kmers = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
        else:
            sample_kmers = khmer.Nodegraph(ksize, res.htable_size, res.num_htables)
        for record in screed.open(query_file):
            seq = record.sequence
            for i in range(len(seq) - ksize + 1):
                kmer = seq[i:i + ksize]
                if intersect_nodegraph.get(kmer) > 0:
                    sample_kmers.add(kmer)
    # Save the sample_kmers
    sample_kmers.save(node_graph_out)
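
# The saved node graph can be reloaded and inspected with the same khmer calls
# used above; the file name below is a placeholder following the
# <query>.NodeGraph.K<k_size> naming convention.
import khmer

ng = khmer.load_nodegraph('reads.fastq.NodeGraph.K21')
print("k-mer size:", ng.ksize())
print("occupied bins (approximate k-mer count):", ng.n_occupied())
print("membership of an example 21-mer:", ng.get('A' * 21))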