def test_default(self):
    """
    Test that the nodes/names files are read properly
    """
    tax = Tax(ncbi_nodes=data_dir + "mini_nodes.dmp",
              ncbi_names=data_dir + "mini_names.dmp")
    self.assertEqual(len(tax.nodes), 36)
    self.assertEqual(tax.nodes["2"][0], "131567")
    self.assertEqual(tax.nodes["131567"][0], "1")
    self.assertEqual(tax.nodes["1"][0], "0")
    self.assertEqual(tax.nodes["2"][2], "Bacteria")
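# The assertions above pin down the in-memory layout this test expects:
# Tax.nodes maps a taxid string to [parent_taxid, rank, name]. A minimal,
# self-contained sketch of that assumed layout (toy values mirroring the
# mini taxonomy used by the test, not the real mini_nodes.dmp content):
mini_nodes = {
    "1":      ["0",      "no rank",      "root"],
    "131567": ["1",      "no rank",      "cellular organisms"],
    "2":      ["131567", "superkingdom", "Bacteria"],
}
assert mini_nodes["2"][0] == "131567"        # index 0: parent taxid
assert mini_nodes["2"][1] == "superkingdom"  # index 1: rank (assumed)
assert mini_nodes["2"][2] == "Bacteria"      # index 2: scientific name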
def classify(cfg):
    print_log("Classifying reads (ganon-classify)", cfg.quiet)
    run_ganon_classify = " ".join([
        cfg.path_exec['classify'],
        "--single-reads " + ",".join(cfg.single_reads) if cfg.single_reads else "",
        "--paired-reads " + ",".join(cfg.paired_reads) if cfg.paired_reads else "",
        "--ibf " + ",".join([db_prefix + ".ibf" for db_prefix in cfg.db_prefix]),
        "--map " + ",".join([db_prefix + ".map" for db_prefix in cfg.db_prefix]),
        "--tax " + ",".join([db_prefix + ".tax" for db_prefix in cfg.db_prefix]),
        "--hierarchy-labels " + ",".join(cfg.hierarchy_labels) if cfg.hierarchy_labels else "",
        "--max-error " + ",".join([str(me) for me in cfg.max_error]) if cfg.max_error else "",
        "--min-kmers " + ",".join([str(mk) for mk in cfg.min_kmers]) if cfg.min_kmers else "",
        "--max-error-unique " + ",".join([str(meu) for meu in cfg.max_error_unique]) if cfg.max_error_unique else "",
        "--strata-filter " + ",".join([str(sf) for sf in cfg.strata_filter]) if cfg.strata_filter else "",
        "--offset " + str(cfg.offset) if cfg.offset else "",
        "--output-prefix " + cfg.output_prefix if cfg.output_prefix else "",
        "--output-all" if cfg.output_all else "",
        "--output-unclassified" if cfg.output_unclassified else "",
        "--output-single" if cfg.output_single else "",
        "--threads " + str(cfg.threads) if cfg.threads else "",
        "--n-reads " + str(cfg.n_reads) if cfg.n_reads is not None else "",
        "--n-batches " + str(cfg.n_batches) if cfg.n_batches is not None else "",
        "--verbose" if cfg.verbose else "",
        "--quiet" if cfg.quiet else ""])

    stdout, stderr = run(run_ganon_classify)
    if not cfg.output_prefix:
        print(stdout)
    print_log(stderr, cfg.quiet)

    if cfg.output_prefix:
        tx = time.time()
        print_log("Generating report", cfg.quiet)
        tax = Tax([db_prefix + ".tax" for db_prefix in cfg.db_prefix])
        classified_reads, unclassified_reads, reports = parse_rep(cfg.output_prefix + ".rep")
        print_final_report(reports, tax, classified_reads, unclassified_reads,
                           cfg.output_prefix + ".tre", cfg.ranks, 0, 0, [])
        print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)

    return True
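# The command lines above are assembled with conditional expressions inside
# " ".join(...): each list entry evaluates either to "--flag value" or to "".
# This works because the ternary binds looser than +, so the whole
# concatenation is the true branch. A self-contained sketch of the pattern
# (SimpleNamespace and the binary name are illustrative, not ganon's real
# Config class):
from types import SimpleNamespace

cfg = SimpleNamespace(threads=4, offset=None, verbose=False)
cmd = " ".join([
    "some-binary",
    "--threads " + str(cfg.threads) if cfg.threads else "",
    "--offset " + str(cfg.offset) if cfg.offset else "",
    "--verbose" if cfg.verbose else "",
])
print(cmd)  # 'some-binary --threads 4  ' (extra spaces are harmless to a shell)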
def update(cfg):
    tx = time.time()

    # validate input files
    input_files = validate_input_files(cfg.input_files, cfg.input_directory,
                                       cfg.input_extension, cfg.quiet)
    if len(input_files) == 0:
        print_log("ERROR: No valid input files found", cfg.quiet)
        return False

    # Set db prefixes
    db_prefix = {prefix: cfg.db_prefix + "." + prefix
                 for prefix in ["ibf", "map", "tax", "gnn"]}

    # Set temporary working folder (current or new output)
    tmp_output_folder = cfg.output_db_prefix + "_tmp/" if cfg.output_db_prefix else cfg.db_prefix + "_tmp/"
    if not set_tmp_folder(tmp_output_folder):
        return False

    # Load .gnn file
    gnn = Gnn(file=db_prefix["gnn"])

    # If specialization was set on database
    if gnn.specialization:
        # if not provided by user, use definition from the database
        if not cfg.specialization:
            cfg.specialization = gnn.specialization
            print_log("Using --specialization " + cfg.specialization, cfg.quiet)
    else:
        if cfg.specialization:
            # If user defined specialization on update but database has none
            print_log("ERROR: not possible to update a database with --specialization if it was built without it", cfg.quiet)
            return False

    # load bins
    bins = Bins(taxsbp_ret=gnn.bins,
                use_specialization=True if cfg.specialization else False)

    # load seqinfo (file or seqids)
    seqinfo = load_seqinfo(cfg, input_files)

    # check sequences compared to bins
    added_seqids, removed_seqids, kept_seqids = check_updated_seqids(
        set(seqinfo.get_seqids()), set(bins.get_seqids()))
    # Ignore removed sequences if not doing complete update
    if cfg.update_complete:
        print_log("Update: adding " + str(len(added_seqids)) +
                  " sequences, removing " + str(len(removed_seqids)) +
                  " sequences, keeping " + str(len(kept_seqids)) + " sequences",
                  cfg.quiet)
    else:
        removed_seqids = []
        print_log("Update: adding " + str(len(added_seqids)) +
                  " sequences, ignoring " + str(len(kept_seqids)) +
                  " repeated sequences", cfg.quiet)
    print_log("", cfg.quiet)

    if not added_seqids and not removed_seqids:
        print_log("Nothing to update", cfg.quiet)
        rm_tmp_folder(tmp_output_folder)
        return False

    if cfg.update_complete:
        # Remove already included seqids to just retrieve information for added sequences
        seqinfo.remove_seqids(kept_seqids | removed_seqids)
    else:
        # Remove seqids already present in the current version (repeated entries)
        seqinfo.remove_seqids(kept_seqids)

    # retrieve sequence information (after removing invalid seqids)
    if not cfg.seq_info_file:
        retrieve_seqinfo(seqinfo, tmp_output_folder, input_files, cfg)

    # Check for valid specialization
    if cfg.specialization:
        replaced_spec = seqinfo.validate_specialization()
        if replaced_spec:
            print_log(str(replaced_spec) + " invalid specialization entries (sequence accession used instead)\n", cfg.quiet)

    if not cfg.seq_info_file and cfg.write_seq_info_file:
        seqinfo.write(cfg.output_db_prefix + ".seqinfo.txt")

    # save set of current binids
    previous_binids = set(bins.get_binids())
    # remove seqids from bins if performing update complete
    if cfg.update_complete and removed_seqids:
        bins.remove_seqids(removed_seqids)
    # save set of kept binids after removal
    kept_binids = set(bins.get_binids())

    # Set up taxonomy files
    ncbi_nodes_file, ncbi_merged_file, ncbi_names_file = set_taxdump_files(
        cfg.taxdump_file, tmp_output_folder, cfg.quiet)

    tx = time.time()
    print_log("Running taxonomic clustering (TaxSBP)", cfg.quiet)
    updated_bins = run_taxsbp(seqinfo, gnn.bin_length, gnn.fragment_length,
                              gnn.overlap_length, gnn.rank, cfg.specialization,
                              ncbi_nodes_file, ncbi_merged_file, cfg.verbose,
                              bins=bins)
    # bin statistics
    taxsbp_binids = set(updated_bins.get_binids())
    removed_binids = previous_binids.difference(kept_binids | taxsbp_binids)
    new_binids = taxsbp_binids.difference(previous_binids)
    updated_binids = taxsbp_binids.intersection(previous_binids)
    print_log(" - " + str(len(new_binids)) + " bins added, " +
              str(len(updated_binids)) + " bins updated, " +
              str(len(removed_binids)) + " bins removed", cfg.quiet)
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)

    tx = time.time()
    print_log("Updating database files", cfg.quiet)
    # load new taxonomy
    print_log(" - " + (cfg.output_db_prefix + ".tax" if cfg.output_db_prefix else db_prefix["tax"]), cfg.quiet)
    tax = Tax(ncbi_nodes=ncbi_nodes_file, ncbi_names=ncbi_names_file)
    # Update and write .tax file
    # filter only used taxids
    tax.filter(updated_bins.get_taxids())
    # add specialization nodes
    if cfg.specialization:
        tax.add_nodes(updated_bins.get_specialization_taxid(), cfg.specialization)
    # Load old .tax file into new taxonomy
    tax.merge(Tax([db_prefix["tax"]]))
    # Write .tax file
    tax.write(cfg.output_db_prefix + ".tax" if cfg.output_db_prefix else db_prefix["tax"])
    # TODO: remove entries from .tax for sequences removed from the db

    # merge updated and old bins together
    bins.merge(updated_bins)

    # Write .gnn file
    print_log(" - " + (cfg.output_db_prefix + ".gnn" if cfg.output_db_prefix else db_prefix["gnn"]), cfg.quiet)
    gnn.bins = bins.get_list()  # save updated bins
    gnn.number_of_bins = bins.get_number_of_bins()  # add new bins count
    # set new specialization to gnn
    gnn.specialization = cfg.specialization
    gnn.write(cfg.output_db_prefix + ".gnn" if cfg.output_db_prefix else db_prefix["gnn"])

    # Recreate .map file based on the new bins
    print_log(" - " + (cfg.output_db_prefix + ".map" if cfg.output_db_prefix else db_prefix["map"]), cfg.quiet)
    bins.write_map_file(cfg.output_db_prefix + ".map" if cfg.output_db_prefix else db_prefix["map"],
                        use_specialization=True if cfg.specialization else False)
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)

    tx = time.time()
    print_log("Updating index (ganon-build)", cfg.quiet)

    # Write aux. file for ganon
    # This file has to contain all new sequences
    # in case of update_complete
    acc_bin_file = tmp_output_folder + "acc_bin.txt"
    if cfg.update_complete:
        # all sequences from the bins with added/removed sequences should be written
        bins.write_acc_bin_file(acc_bin_file, new_binids | updated_binids)
        # If all sequences of a bin were removed and no new sequence added,
        # insert a dummy entry for ganon-build to clear the bin
        if removed_binids:
            with open(acc_bin_file, "a") as abf:
                for b in removed_binids:
                    print(0, 0, 0, b, sep="\t", file=abf)
    else:
        # Only new sequences (updated_bins) either on old or new binids
        updated_bins.write_acc_bin_file(acc_bin_file)

    # Update with same values used for build
    kmer_size = gnn.kmer_size
    window_size = gnn.window_size
    hash_functions = gnn.hash_functions

    # Free memory for build
    del seqinfo
    del bins
    del updated_bins
    del tax
    del gnn

    # Temporary output filter
    tmp_db_prefix_ibf = tmp_output_folder + "ganon.ibf"
    run_ganon_build_cmd = " ".join([
        cfg.path_exec['build'],
        "--update-filter-file " + db_prefix["ibf"],
        "--kmer-size " + str(kmer_size),
        "--window-size " + str(window_size) if window_size else "",
        "--count-hashes " if window_size else "",
        "--hash-functions " + str(hash_functions),
        "--seqid-bin-file " + acc_bin_file,
        "--output-filter-file " + tmp_db_prefix_ibf,
        "--threads " + str(cfg.threads),
        "--verbose" if cfg.verbose else "",
        "--quiet" if cfg.quiet else "",
        "--n-refs " + str(cfg.n_refs) if cfg.n_refs is not None else "",
        "--n-batches " + str(cfg.n_batches) if cfg.n_batches is not None else "",
        "--reference-files " + ",".join([file for file in input_files]) if input_files and not cfg.input_directory else "",
        "--directory-reference-files " + cfg.input_directory if cfg.input_directory else "",
        "--extension " + cfg.input_extension if cfg.input_extension else "",
        "--update-complete" if cfg.update_complete else ""])
    stdout, stderr = run(run_ganon_build_cmd, print_stderr=True)

    # move IBF to final location
    shutil.move(tmp_db_prefix_ibf,
                cfg.output_db_prefix + ".ibf" if cfg.output_db_prefix else db_prefix["ibf"])

    # Delete temp files
    rm_tmp_folder(tmp_output_folder)

    return True
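# Hedged illustration of the bin bookkeeping inside update(), using plain
# sets with toy bin ids (not real data): bins are "removed" when they
# survive neither the seqid removal nor the TaxSBP repacking.
previous_binids = {1, 2, 3, 4}   # bins before the update
kept_binids = {1, 2}             # bins still holding seqids after removal
taxsbp_binids = {2, 5}           # bins (re)written by TaxSBP

removed_binids = previous_binids - (kept_binids | taxsbp_binids)
new_binids = taxsbp_binids - previous_binids
updated_binids = taxsbp_binids & previous_binids
assert (removed_binids, new_binids, updated_binids) == ({3, 4}, {5}, {2})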
def build(cfg):
    # validate input files
    input_files = validate_input_files(cfg.input_files, cfg.input_directory,
                                       cfg.input_extension, cfg.quiet)
    if len(input_files) == 0:
        print_log("ERROR: No valid input files found", cfg.quiet)
        return False

    # Set db prefixes
    db_prefix = {prefix: cfg.db_prefix + "." + prefix
                 for prefix in ["ibf", "map", "tax", "gnn"]}

    # Set temporary working folder
    tmp_output_folder = cfg.db_prefix + "_tmp/"
    if not set_tmp_folder(tmp_output_folder):
        return False

    # Set up taxonomy
    ncbi_nodes_file, ncbi_merged_file, ncbi_names_file = set_taxdump_files(
        cfg.taxdump_file, tmp_output_folder, cfg.quiet)

    # Parse .tax
    tx = time.time()
    print_log("Parsing taxonomy", cfg.quiet)
    tax = Tax(ncbi_nodes=ncbi_nodes_file, ncbi_names=ncbi_names_file)
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)

    # load seqinfo (file or seqids)
    seqinfo = load_seqinfo(cfg, input_files)

    # Retrieve sequence information
    if not cfg.seq_info_file:
        retrieve_seqinfo(seqinfo, tmp_output_folder, input_files, cfg)

    # Check for valid specialization
    if cfg.specialization:
        replaced_spec = seqinfo.validate_specialization()
        if replaced_spec:
            print_log(str(replaced_spec) + " invalid specialization entries (sequence accession used instead)\n", cfg.quiet)

    # Write seq-info-file
    if not cfg.seq_info_file and cfg.write_seq_info_file:
        seqinfo.write(cfg.db_prefix + ".seqinfo.txt")

    # check sequences compared to bins
    added_seqids, _, _ = check_updated_seqids(set(seqinfo.get_seqids()), set())
    # Ignore removed sequences if not doing complete update
    print_log("Build: adding " + str(len(added_seqids)) + " sequences", cfg.quiet)
    print_log("", cfg.quiet)

    if not added_seqids:
        print_log("No valid seq. info to build", cfg.quiet)
        rm_tmp_folder(tmp_output_folder)
        return False

    # Set or calculate best --bin-length
    if cfg.bin_length:
        bin_length = cfg.bin_length
    else:
        tx = time.time()
        print_log("Simulating parameters", cfg.quiet)
        bin_length = estimate_bin_length(cfg, seqinfo, tax)
        print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)

    # Set fragment length
    if cfg.fragment_length == -1:
        # if == -1, set default
        fragment_length = bin_length - cfg.overlap_length
    elif cfg.fragment_length == 0:
        # if == 0, deactivate
        fragment_length = 0
    else:
        # user input
        fragment_length = cfg.fragment_length - cfg.overlap_length

    tx = time.time()
    print_log("Running taxonomic clustering (TaxSBP)", cfg.quiet)
    bins = run_taxsbp(seqinfo, bin_length, fragment_length, cfg.overlap_length,
                      cfg.rank, cfg.specialization, ncbi_nodes_file,
                      ncbi_merged_file, cfg.verbose)
    # bin statistics
    actual_number_of_bins = bins.get_number_of_bins()
    optimal_number_of_bins = optimal_bins(actual_number_of_bins)
    max_length_bin = bins.get_max_bin_length()
    max_kmer_count = estimate_elements(max_length_bin, cfg.kmer_size, cfg.window_size)
    #max_kmer_count = max_length_bin - cfg.kmer_size + 1
    print_log(" - " + str(actual_number_of_bins) + " bins created", cfg.quiet)
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)

    # Get optimal parameters from user input and taxsbp result
    if cfg.filter_size:
        optimal_params = derive_bf_params(
            max_kmer_count, 0,
            math.ceil(mb2bits(cfg.filter_size) / optimal_number_of_bins),
            cfg.hash_functions)
    else:
        optimal_params = derive_bf_params(max_kmer_count, cfg.max_fp, 0,
                                          cfg.hash_functions)

    # When fixed size is too small
    if optimal_params["hash_functions"] == 0:
        optimal_params["hash_functions"] = 3

    print_log("Optimal bins: " + str(optimal_number_of_bins), cfg.quiet)
    print_log("Max. false positive: " + str("{0:.5f}".format(optimal_params["false_positive"])), cfg.quiet)
    print_log("Hash functions: " + str(optimal_params["hash_functions"]), cfg.quiet)

    if cfg.window_size:
        # Check lower bound for minimizers with estimated bin-length
        min_mini = math.floor((max_length_bin - cfg.kmer_size + 1) /
                              (cfg.window_size - cfg.kmer_size + 1))
        min_size_mini = derive_bf_params(min_mini,
                                         optimal_params["false_positive"], 0,
                                         optimal_params["hash_functions"])
        print_log("Possible elements per bin: " + str(min_mini) + ".." + str(max_kmer_count), cfg.quiet)
        print_log("Possible filter sizes: " +
                  str("{0:.2f}".format(bits2mb(min_size_mini["size_bits"] * optimal_number_of_bins))) + "MB.." +
                  str("{0:.2f}".format(bits2mb(optimal_params["size_bits"] * optimal_number_of_bins))) + "MB",
                  cfg.quiet)
    else:
        print_log("Elements per bin: " + str(max_kmer_count), cfg.quiet)
        print_log("Filter size: " +
                  str("{0:.2f}".format(bits2mb(optimal_params["size_bits"] * optimal_number_of_bins))) + "MB",
                  cfg.quiet)
    print_log("")

    # Build database files (map, tax, gnn)
    tx = time.time()
    print_log("Building database files", cfg.quiet)

    # Write .map file
    print_log(" - " + db_prefix["map"], cfg.quiet)
    bins.write_map_file(db_prefix["map"],
                        use_specialization=True if cfg.specialization else False)

    # Write .tax file
    print_log(" - " + db_prefix["tax"], cfg.quiet)
    # filter only used taxids
    tax.filter(bins.get_taxids())
    # add specialization nodes
    if cfg.specialization:
        tax.add_nodes(bins.get_specialization_taxid(), cfg.specialization)
    tax.write(db_prefix["tax"])

    if cfg.specialization and cfg.rank != "leaves":
        print_log(" - --rank is set to leaves when using specialization values", cfg.quiet)
        cfg.rank = "leaves"

    # Write .gnn file
    print_log(" - " + db_prefix["gnn"], cfg.quiet)
    gnn = Gnn(kmer_size=cfg.kmer_size,
              window_size=cfg.window_size,
              hash_functions=optimal_params["hash_functions"],
              number_of_bins=actual_number_of_bins,
              rank=cfg.rank,
              specialization=cfg.specialization,
              bin_length=bin_length,
              fragment_length=fragment_length,
              overlap_length=cfg.overlap_length,
              bins=bins.get_list())
    gnn.write(db_prefix["gnn"])
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)

    print_log("Building index (ganon-build)", cfg.quiet)
    # Write aux. file for ganon
    acc_bin_file = tmp_output_folder + "acc_bin.txt"
    bins.write_acc_bin_file(acc_bin_file)

    # Free memory for build
    del seqinfo
    del bins
    del tax
    del gnn

    run_ganon_build_cmd = " ".join([
        cfg.path_exec['build'],
        "--seqid-bin-file " + acc_bin_file,
        "--bin-size-bits " + str(optimal_params["size_bits"]) if cfg.filter_size else "--false-positive " + str(cfg.max_fp),
        "--kmer-size " + str(cfg.kmer_size),
        "--window-size " + str(cfg.window_size) if cfg.window_size else "",
        "--count-hashes " if cfg.window_size else "",
        "--hash-functions " + str(optimal_params["hash_functions"]),
        "--threads " + str(cfg.threads),
        "--output-filter-file " + db_prefix["ibf"],
        "--verbose" if cfg.verbose else "",
        "--quiet" if cfg.quiet else "",
        "--n-refs " + str(cfg.n_refs) if cfg.n_refs is not None else "",
        "--n-batches " + str(cfg.n_batches) if cfg.n_batches is not None else "",
        "--reference-files " + ",".join([file for file in input_files]) if input_files and not cfg.input_directory else "",
        "--directory-reference-files " + cfg.input_directory if cfg.input_directory else "",
        "--extension " + cfg.input_extension if cfg.input_extension else ""])
    stdout, stderr = run(run_ganon_build_cmd, print_stderr=True)

    # Delete temp files
    rm_tmp_folder(tmp_output_folder)

    return True
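# derive_bf_params() is not shown in this section; below is a hedged sketch
# of the per-bin Bloom filter math it presumably wraps. The two closed forms
# are the standard Bloom filter relations between size (m bits), hash
# functions (k), and inserted elements (n), and they match the formulas
# spelled out inline in the older build() at the end of this section:
import math

def bin_size_bits(n_elements, max_fp, hash_functions):
    # bits per bin so that the false positive rate stays <= max_fp
    k = float(hash_functions)
    return math.ceil(-(1 / ((1 - max_fp ** (1 / k)) ** (1 / (k * n_elements)) - 1)))

def false_positive(size_bits, n_elements, hash_functions):
    # expected false positive rate of a single bin with size_bits bits
    k = hash_functions
    return (1 - (1 - 1 / float(size_bits)) ** (k * n_elements)) ** k

m = bin_size_bits(100000, 0.05, 3)   # toy values
assert false_positive(m, 100000, 3) <= 0.05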
def report(cfg):
    classified_reads, unclassified_reads, reports = parse_rep(cfg.rep_file)
    tax = Tax([db_prefix + ".tax" for db_prefix in cfg.db_prefix])
    print_final_report(reports, tax, classified_reads, unclassified_reads,
                       cfg.output_report, cfg.ranks, cfg.min_matches,
                       cfg.min_matches_perc, cfg.taxids)
    return True
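# A minimal, hypothetical invocation sketch for this report() variant. The
# attribute names are exactly the cfg fields the function reads; the values
# are placeholders, and parse_rep/Tax/print_final_report must come from
# ganon's own modules for the call to work:
from types import SimpleNamespace

cfg = SimpleNamespace(
    rep_file="results.rep",        # .rep output of ganon classify
    db_prefix=["db1", "db2"],      # ".tax" is appended to each prefix
    output_report="results.tre",
    ranks=[],
    min_matches=0,
    min_matches_perc=0,
    taxids=[],
)
# report(cfg)  # would read results.rep plus db1.tax and db2.tax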
def report(cfg):
    # validate input files
    rep_files = validate_input_files(cfg.rep_files, cfg.input_directory,
                                     cfg.input_extension, cfg.quiet)

    # Parse taxonomy or download new
    if cfg.db_prefix:
        dbp = []
        for prefix in cfg.db_prefix:
            if prefix.endswith(".tax"):
                dbp.append(prefix)
            else:
                dbp.append(prefix + ".tax")
        tax = Tax(dbp)
    else:
        tmp_output_folder = os.path.dirname(cfg.output_prefix)
        if not tmp_output_folder:
            tmp_output_folder = "."
        tmp_output_folder += "/ganon_report_tmp/"
        if not set_tmp_folder(tmp_output_folder):
            return False
        # Set up taxonomy
        ncbi_nodes_file, ncbi_merged_file, ncbi_names_file = set_taxdump_files(
            cfg.taxdump_file, tmp_output_folder, cfg.quiet)
        tx = time.time()
        print_log("Parsing taxonomy", cfg.quiet)
        tax = Tax(ncbi_nodes=ncbi_nodes_file, ncbi_names=ncbi_names_file)
        rm_tmp_folder(tmp_output_folder)
        print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)

    any_rep = False
    print_log("Generating report(s)", cfg.quiet)
    # Parse each report file
    for rep_file in rep_files:
        print_log("", cfg.quiet)
        reports, counts = parse_rep(rep_file)
        if not reports:
            print_log(" - nothing to report for " + rep_file, cfg.quiet)
            continue

        # If skipping/keeping hierarchies, remove all assignments from reports
        if cfg.skip_hierarchy or cfg.keep_hierarchy:
            reports = remove_hierarchy(reports, counts, cfg.skip_hierarchy,
                                       cfg.keep_hierarchy, cfg.quiet)

        # General output file
        if len(rep_files) == 1:
            output_file = cfg.output_prefix
        else:
            file_pre = os.path.splitext(os.path.basename(rep_file))[0]
            output_file = cfg.output_prefix + file_pre

        if cfg.split_hierarchy:
            for h in reports:
                if h not in cfg.skip_hierarchy:
                    output_file_h = output_file + "." + h + ".tre"
                    r = print_final_report({h: reports[h]}, counts, tax,
                                           output_file_h, cfg)
                    if not r:
                        print_log(" - nothing to report for hierarchy " + h +
                                  " in " + rep_file, cfg.quiet)
                        continue
                    else:
                        print_log(" - report saved to " + output_file_h, cfg.quiet)
                        any_rep = True
        else:
            output_file = output_file + ".tre"
            r = print_final_report(reports, counts, tax, output_file, cfg)
            if not r:
                print_log(" - nothing to report for " + rep_file, cfg.quiet)
                continue
            else:
                print_log(" - report saved to " + output_file, cfg.quiet)
                any_rep = True

    return True if any_rep else False
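# Hedged sketch of the output naming scheme implemented above: with a single
# .rep file the output prefix is used as-is; with several, each .rep basename
# is appended to the prefix; --split-hierarchy additionally inserts the
# hierarchy label. output_name() is a hypothetical helper, not ganon code:
import os

def output_name(output_prefix, rep_file, n_rep_files, hierarchy=None):
    base = output_prefix if n_rep_files == 1 else \
        output_prefix + os.path.splitext(os.path.basename(rep_file))[0]
    return base + ("." + hierarchy if hierarchy else "") + ".tre"

assert output_name("out", "runs/sample1.rep", 1) == "out.tre"
assert output_name("out_", "runs/sample1.rep", 2) == "out_sample1.tre"
assert output_name("out", "runs/sample1.rep", 1, "H1") == "out.H1.tre"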
def update(cfg):
    tx = time.time()

    # validate input files
    input_files = validate_input_files(cfg.input_files, cfg.input_directory,
                                       cfg.input_extension, cfg.quiet)
    if len(input_files) == 0:
        print_log("ERROR: No valid input files found")
        return False

    # Set db prefixes
    db_prefix = {prefix: cfg.db_prefix + "." + prefix
                 for prefix in ["ibf", "map", "tax", "gnn"]}

    # Set temporary working folder (current or new output)
    tmp_output_folder = cfg.output_db_prefix + "_tmp/" if cfg.output_db_prefix else cfg.db_prefix + "_tmp/"
    if not set_tmp_folder(tmp_output_folder):
        return False

    # Load .gnn file
    gnn = Gnn(file=db_prefix["gnn"])
    # Set assembly mode
    use_assembly = True if gnn.rank == "assembly" else False

    # load bins
    bins = Bins(taxsbp_ret=gnn.bins)

    # Load seqids and generate seqinfo
    if cfg.seq_info_file:
        seqinfo = load_seqids(seq_info_file=cfg.seq_info_file, quiet=cfg.quiet)
    else:
        seqinfo = load_seqids(files=input_files, quiet=cfg.quiet)

    # check sequences compared to bins
    added_seqids, removed_seqids, kept_seqids = check_updated_seqids(
        set(seqinfo.get_seqids()), set(bins.get_seqids()))
    # Ignore removed sequences if not doing complete update
    if cfg.update_complete:
        print_log("Update: adding " + str(len(added_seqids)) +
                  " sequences, removing " + str(len(removed_seqids)) +
                  " sequences, keeping " + str(len(kept_seqids)) + " sequences",
                  cfg.quiet)
    else:
        removed_seqids = []
        print_log("Update: adding " + str(len(added_seqids)) +
                  " sequences, ignoring " + str(len(kept_seqids)) +
                  " repeated sequences", cfg.quiet)
    print_log("", cfg.quiet)

    if not added_seqids and not removed_seqids:
        print_log("ERROR: Nothing to update")
        return False

    if cfg.update_complete:
        # Remove already included seqids to just retrieve information for added sequences
        seqinfo.remove_seqids(kept_seqids | removed_seqids)
    else:
        # Remove seqids already present in the current version (repeated entries)
        seqinfo.remove_seqids(kept_seqids)

    # load seqinfo file with data (after removing ids)
    if not cfg.seq_info_file:
        load_seqinfo(tmp_output_folder, seqinfo, cfg.path_exec,
                     cfg.seq_info_mode, use_assembly, cfg.quiet)

    if cfg.write_seq_info_file:
        seqinfo.write(cfg.output_db_prefix + ".seqinfo.txt"
                      if cfg.output_db_prefix else cfg.db_prefix + ".seqinfo.txt")

    # save set of current binids
    previous_binids = set(bins.get_binids())
    # remove seqids from bins if performing update complete
    if cfg.update_complete and removed_seqids:
        bins.remove_seqids(removed_seqids)
    # save set of kept binids after removal
    kept_binids = set(bins.get_binids())

    # Set up taxonomy files
    ncbi_nodes_file, ncbi_merged_file, ncbi_names_file = set_taxdump_files(
        cfg.taxdump_file, tmp_output_folder, cfg.quiet)

    tx = time.time()
    print_log("Running taxonomic clustering (TaxSBP)", cfg.quiet)
    taxsbp_params = {}
    taxsbp_params["update_table"] = bins.get_csv()
    taxsbp_params["nodes_file"] = ncbi_nodes_file
    taxsbp_params["bin_len"] = gnn.bin_length
    if use_assembly:
        taxsbp_params["bin_exclusive"] = "assembly"
    elif gnn.rank == "taxid":
        taxsbp_params["bin_exclusive"] = "leaves"
    else:
        taxsbp_params["bin_exclusive"] = gnn.rank
    if ncbi_merged_file:
        taxsbp_params["merged_file"] = ncbi_merged_file
    if gnn.fragment_length:
        taxsbp_params["fragment_len"] = gnn.fragment_length
        taxsbp_params["overlap_len"] = gnn.overlap_length
    if use_assembly:
        taxsbp_params["specialization"] = "assembly"
    taxsbp_params["input_table"] = seqinfo.get_csv()
    updated_bins = Bins(taxsbp_ret=taxsbp.taxsbp.pack(**taxsbp_params))

    # bin statistics
    taxsbp_binids = set(updated_bins.get_binids())
    removed_binids = previous_binids.difference(kept_binids | taxsbp_binids)
    new_binids = taxsbp_binids.difference(previous_binids)
    updated_binids = taxsbp_binids.intersection(previous_binids)
    print_log(" - " + str(len(new_binids)) + " bins added, " +
              str(len(updated_binids)) + " bins updated, " +
              str(len(removed_binids)) + " bins removed", cfg.quiet)
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)

    tx = time.time()
    print_log("Updating database files", cfg.quiet)
    # load new taxonomy
    print_log(" - " + (cfg.output_db_prefix + ".tax" if cfg.output_db_prefix else db_prefix["tax"]), cfg.quiet)
    tax = Tax(ncbi_nodes=ncbi_nodes_file, ncbi_names=ncbi_names_file)
    # Update and write .tax file
    tax.filter(updated_bins.get_taxids())  # filter only used taxids
    if use_assembly:
        tax.add_nodes(updated_bins.get_specialization_taxid(), "assembly")  # add assembly nodes
    # Load old .tax file into new taxonomy
    tax.merge(Tax([db_prefix["tax"]]))
    # Write .tax file
    tax.write(cfg.output_db_prefix + ".tax" if cfg.output_db_prefix else db_prefix["tax"])
    # TODO: remove entries from .tax for sequences removed from the db

    # merge updated and old bins together
    bins.merge(updated_bins)

    # Write .gnn file
    print_log(" - " + (cfg.output_db_prefix + ".gnn" if cfg.output_db_prefix else db_prefix["gnn"]), cfg.quiet)
    gnn.bins = bins.get_list()  # save updated bins
    gnn.number_of_bins = bins.get_number_of_bins()  # add new bins count
    gnn.write(cfg.output_db_prefix + ".gnn" if cfg.output_db_prefix else db_prefix["gnn"])

    # Recreate .map file based on the new bins
    print_log(" - " + (cfg.output_db_prefix + ".map" if cfg.output_db_prefix else db_prefix["map"]), cfg.quiet)
    bins.write_map_file(cfg.output_db_prefix + ".map" if cfg.output_db_prefix else db_prefix["map"],
                        use_assembly)
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)

    tx = time.time()
    print_log("Updating index (ganon-build)", cfg.quiet)

    # Write aux. file for ganon
    # This file has to contain all new sequences
    # in case of update_complete
    acc_bin_file = tmp_output_folder + "acc_bin.txt"
    if cfg.update_complete:
        # all sequences from the bins with added/removed sequences should be written
        bins.write_acc_bin_file(acc_bin_file, new_binids | updated_binids)
        # If all sequences of a bin were removed and no new sequence added,
        # insert a dummy entry for ganon-build to clear the bin
        if removed_binids:
            with open(acc_bin_file, "a") as abf:
                for b in removed_binids:
                    print(0, 0, 0, b, sep="\t", file=abf)
    else:
        # Only new sequences (updated_bins) either on old or new binids
        updated_bins.write_acc_bin_file(acc_bin_file)

    # Temporary output filter
    tmp_db_prefix_ibf = tmp_output_folder + "ganon.ibf"
    run_ganon_build_cmd = " ".join([
        cfg.path_exec['build'],
        "--update-filter-file " + db_prefix["ibf"],
        "--seqid-bin-file " + acc_bin_file,
        "--output-filter-file " + tmp_db_prefix_ibf,
        "--threads " + str(cfg.threads),
        "--verbose" if cfg.verbose else "",
        "--quiet" if cfg.quiet else "",
        "--n-refs " + str(cfg.n_refs) if cfg.n_refs is not None else "",
        "--n-batches " + str(cfg.n_batches) if cfg.n_batches is not None else "",
        "--reference-files " + ",".join([file for file in input_files]) if input_files else "",
        "--directory-reference-files " + cfg.input_directory if cfg.input_directory else "",
        "--extension " + cfg.input_extension if cfg.input_extension else "",
        "--update-complete" if cfg.update_complete else ""])
    stdout, stderr = run(run_ganon_build_cmd, print_stderr=True)

    # move IBF to final location
    shutil.move(tmp_db_prefix_ibf,
                cfg.output_db_prefix + ".ibf" if cfg.output_db_prefix else db_prefix["ibf"])

    # Delete temp files
    rm_tmp_folder(tmp_output_folder)

    return True
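# Hedged sketch of the dummy-entry trick above: under --update-complete, a
# bin whose sequences were all removed still has to be rewritten (cleared)
# by ganon-build, so a zero placeholder row is appended to the seqid-bin
# file. The columns appear to be seqid, start, end, binid; toy bin ids and
# an in-memory buffer are used here instead of the real acc_bin.txt:
import io

buf = io.StringIO()
for b in sorted({7, 9}):
    print(0, 0, 0, b, sep="\t", file=buf)
assert buf.getvalue() == "0\t0\t0\t7\n0\t0\t0\t9\n"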
def build(cfg):
    # validate input files
    input_files = validate_input_files(cfg.input_files, cfg.input_directory,
                                       cfg.input_extension, cfg.quiet)
    if len(input_files) == 0:
        print_log("ERROR: No valid input files found")
        return False

    # Set db prefixes
    db_prefix = {prefix: cfg.db_prefix + "." + prefix
                 for prefix in ["ibf", "map", "tax", "gnn"]}

    # Set temporary working folder
    tmp_output_folder = cfg.db_prefix + "_tmp/"
    if not set_tmp_folder(tmp_output_folder):
        return False

    # Set assembly mode
    use_assembly = True if cfg.rank == "assembly" else False

    # Set up taxonomy
    ncbi_nodes_file, ncbi_merged_file, ncbi_names_file = set_taxdump_files(
        cfg.taxdump_file, tmp_output_folder, cfg.quiet)

    tx = time.time()
    print_log("Parsing taxonomy", cfg.quiet)
    tax = Tax(ncbi_nodes=ncbi_nodes_file, ncbi_names=ncbi_names_file)
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)

    # Load seqids and generate seqinfo
    if cfg.seq_info_file:
        seqinfo = load_seqids(seq_info_file=cfg.seq_info_file, quiet=cfg.quiet)
    else:
        seqinfo = load_seqids(files=input_files, quiet=cfg.quiet)
        load_seqinfo(tmp_output_folder, seqinfo, cfg.path_exec,
                     cfg.seq_info_mode, use_assembly, cfg.quiet)
        if cfg.write_seq_info_file:
            seqinfo.write(cfg.db_prefix + ".seqinfo.txt")

    # check sequences compared to bins
    added_seqids, _, _ = check_updated_seqids(set(seqinfo.get_seqids()), set())
    # Ignore removed sequences if not doing complete update
    print_log("Build: adding " + str(len(added_seqids)) + " sequences", cfg.quiet)
    print_log("", cfg.quiet)

    # Set bin length
    if cfg.bin_length:
        # user defined
        bin_length = cfg.bin_length
    else:
        tx = time.time()
        print_log("Calculating best bin length", cfg.quiet)
        bin_length, approx_size, n_bins = estimate_bin_len_size(cfg, seqinfo, tax, use_assembly)
        if bin_length <= 0:
            bin_length = 1000000
            print_log("WARNING: could not estimate bin length, using default of " + str(bin_length) + "bp")
        else:
            print_log(" - bin length: " + str(bin_length) + "bp (approx: " +
                      str(n_bins) + " bins / " +
                      str("{0:.2f}".format(approx_size)) + "MB)", cfg.quiet)
        print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)

    # Set fragment length
    if cfg.fragment_length == -1:
        # if == -1, set default
        fragment_length = bin_length - cfg.overlap_length
    elif cfg.fragment_length == 0:
        # if == 0, deactivate
        fragment_length = 0
    else:
        # user input
        fragment_length = cfg.fragment_length - cfg.overlap_length

    tx = time.time()
    print_log("Running taxonomic clustering (TaxSBP)", cfg.quiet)
    taxsbp_params = {}
    taxsbp_params["nodes_file"] = ncbi_nodes_file
    taxsbp_params["bin_len"] = bin_length
    if use_assembly:
        taxsbp_params["bin_exclusive"] = "assembly"
    elif cfg.rank == "taxid":
        taxsbp_params["bin_exclusive"] = "leaves"
    else:
        taxsbp_params["bin_exclusive"] = cfg.rank
    if ncbi_merged_file:
        taxsbp_params["merged_file"] = ncbi_merged_file
    if fragment_length:
        taxsbp_params["fragment_len"] = fragment_length
        taxsbp_params["overlap_len"] = cfg.overlap_length
    if use_assembly:
        taxsbp_params["specialization"] = "assembly"
    taxsbp_params["input_table"] = seqinfo.get_csv()
    bins = Bins(taxsbp_ret=taxsbp.taxsbp.pack(**taxsbp_params))
    del taxsbp_params

    # bin statistics
    actual_number_of_bins = bins.get_number_of_bins()
    optimal_number_of_bins = optimal_bins(actual_number_of_bins)
    max_length_bin = bins.get_max_bin_length()
    # approximate the number of unique k-mers by just considering them all unique
    max_kmer_count = max_length_bin - cfg.kmer_size + 1
    print_log(" - " + str(actual_number_of_bins) + " bins created", cfg.quiet)
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)

    tx = time.time()
    print_log("Building database files", cfg.quiet)
    # Write .map file
    print_log(" - " + db_prefix["map"], cfg.quiet)
    bins.write_map_file(db_prefix["map"], use_assembly)
    # Write .tax file
    print_log(" - " + db_prefix["tax"], cfg.quiet)
    tax.filter(bins.get_taxids())  # filter only used taxids
    if use_assembly:
        tax.add_nodes(bins.get_specialization_taxid(), "assembly")  # add assembly nodes
    tax.write(db_prefix["tax"])
    # Write .gnn file
    print_log(" - " + db_prefix["gnn"], cfg.quiet)
    gnn = Gnn(kmer_size=cfg.kmer_size,
              hash_functions=cfg.hash_functions,
              number_of_bins=actual_number_of_bins,
              rank=cfg.rank,
              bin_length=bin_length,
              fragment_length=fragment_length,
              overlap_length=cfg.overlap_length,
              bins=bins.get_list())
    gnn.write(db_prefix["gnn"])
    print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)

    print_log("Building index (ganon-build)", cfg.quiet)
    # define bloom filter size based on given false positive rate
    MBinBits = 8388608  # bits in one megabyte
    print_log(" - max unique " + str(cfg.kmer_size) + "-mers: " + str(max_kmer_count), cfg.quiet)
    if not cfg.fixed_bloom_size:
        bin_size_bits = math.ceil(-(1 / (
            (1 - cfg.max_fp**(1 / float(cfg.hash_functions)))**
            (1 / float(cfg.hash_functions * max_kmer_count)) - 1)))
        print_log(" - IBF calculated size with fp<=" + str(cfg.max_fp) + ": " +
                  str("{0:.2f}".format((bin_size_bits * optimal_number_of_bins) / MBinBits)) + "MB (" +
                  str(bin_size_bits) + " bits/bin * " + str(optimal_number_of_bins) +
                  " optimal bins [" + str(actual_number_of_bins) + " real bins])", cfg.quiet)
    else:
        bin_size_bits = math.ceil((cfg.fixed_bloom_size * MBinBits) / optimal_number_of_bins)
        estimated_max_fp = (1 - ((1 - (1 / float(bin_size_bits)))**(
            cfg.hash_functions * max_kmer_count)))**cfg.hash_functions
        print_log(" - IBF calculated max. fp with size=" + str(cfg.fixed_bloom_size) + "MB: " +
                  str("{0:.2f}".format(estimated_max_fp) + " (" + str(optimal_number_of_bins) +
                      " optimal bins [" + str(actual_number_of_bins) + " real bins])"), cfg.quiet)

    # Write aux. file for ganon
    acc_bin_file = tmp_output_folder + "acc_bin.txt"
    bins.write_acc_bin_file(acc_bin_file)

    run_ganon_build_cmd = " ".join([
        cfg.path_exec['build'],
        "--seqid-bin-file " + acc_bin_file,
        "--filter-size-bits " + str(bin_size_bits * optimal_number_of_bins) if cfg.max_fp else "--filter-size " + str(cfg.fixed_bloom_size),
        "--kmer-size " + str(cfg.kmer_size),
        "--hash-functions " + str(cfg.hash_functions),
        "--threads " + str(cfg.threads),
        "--output-filter-file " + db_prefix["ibf"],
        "--verbose" if cfg.verbose else "",
        "--quiet" if cfg.quiet else "",
        "--n-refs " + str(cfg.n_refs) if cfg.n_refs is not None else "",
        "--n-batches " + str(cfg.n_batches) if cfg.n_batches is not None else "",
        "--reference-files " + ",".join([file for file in input_files]) if input_files else "",
        "--directory-reference-files " + cfg.input_directory if cfg.input_directory else "",
        "--extension " + cfg.input_extension if cfg.input_extension else ""])
    stdout, stderr = run(run_ganon_build_cmd, print_stderr=True)

    # Delete temp files
    rm_tmp_folder(tmp_output_folder)

    return True