@classmethod
def print_citation(cls):
    # only print the citation banner once per process
    if cls._citation_printed:
        return

    from sourmash.logging import notify
    notify("\n== This is sourmash version {version}. ==",
           version=sourmash.VERSION)
    notify("== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==\n")

    cls._citation_printed = True

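# Hedged usage sketch: print_citation() is written to hang off a class that
# carries the `_citation_printed` flag (in sourmash, the argument parser
# class); the class name below is illustrative.
#
#     SourmashParser.print_citation()   # prints version + citation banner
#     SourmashParser.print_citation()   # no-op; the flag is already set
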
def load_databases(filenames, scaled=None, verbose=True):
    "Load multiple LCA databases; return (dblist, ksize, scaled)"
    ksize_vals = set()
    scaled_vals = set()
    dblist = []

    # load all the databases
    for db_name in filenames:
        if verbose:
            notify(u'\r\033[K', end=u'')
            notify('... loading database {}'.format(db_name), end='\r')

        lca_db = LCA_Database.load(db_name)

        ksize_vals.add(lca_db.ksize)
        if len(ksize_vals) > 1:
            raise Exception('multiple ksizes, quitting')

        if scaled and scaled > lca_db.scaled:
            lca_db.downsample_scaled(scaled)
        scaled_vals.add(lca_db.scaled)

        dblist.append(lca_db)

    ksize = ksize_vals.pop()
    scaled = scaled_vals.pop()

    if verbose:
        notify(u'\r\033[K', end=u'')
        notify('loaded {} LCA databases. ksize={}, scaled={}',
               len(dblist), ksize, scaled)

    return dblist, ksize, scaled

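# A minimal usage sketch for load_databases(); the database file names below
# are hypothetical stand-ins for any saved LCA databases that share one ksize.
def _demo_load_databases():
    dblist, ksize, scaled = load_databases(['genbank-k31.lca.json',
                                            'custom-k31.lca.json'],
                                           scaled=10000)
    # databases sketched more densely than scaled=10000 were downsampled
    # to match; databases with a larger native scaled keep their own value
    notify('got {} databases at ksize={}, scaled={}',
           len(dblist), ksize, scaled)
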
def info(verbose=False):
    "Report sourmash version + version of installed dependencies."
    notify('sourmash version {}', sourmash.VERSION)
    notify('- loaded from path: {}', os.path.dirname(__file__))
    notify('')

    if verbose:
        import khmer
        notify('khmer version {}', khmer.__version__)
        notify('- loaded from path: {}', os.path.dirname(khmer.__file__))
        notify('')

        notify('screed version {}', screed.__version__)
        notify('- loaded from path: {}', os.path.dirname(screed.__file__))

def load_taxonomy_assignments(filename, delimiter=',', start_column=2,
                              use_headers=True, force=False):
    """
    Load a taxonomy assignment spreadsheet into a dictionary.

    The 'assignments' dictionary that's returned maps identifiers to
    lineage tuples.
    """
    mode = 'rt'
    if sys.version_info < (3, ):
        mode = 'rtU'

    # parse spreadsheet!
    fp = open(filename, mode)
    r = csv.reader(fp, delimiter=delimiter)
    row_headers = ['identifiers']
    row_headers += ['_skip_'] * (start_column - 2)
    row_headers += list(lca_utils.taxlist())

    # first check that headers are interpretable.
    if use_headers:
        notify('examining spreadsheet headers...')
        first_row = next(iter(r))

        n_disagree = 0
        for (column, value) in zip(row_headers, first_row):
            if column == '_skip_':
                continue

            if column.lower() != value.lower():
                notify("** assuming column '{}' is {} in spreadsheet",
                       value, column)
                n_disagree += 1
                if n_disagree > 2:
                    error('whoa, too many assumptions. are the headers right?')
                    error('expecting {}', ",".join(row_headers))
                    if not force:
                        sys.exit(-1)
                    notify('...continue, because --force was specified.')

    # convert into a lineage pair
    assignments = {}
    num_rows = 0
    n_species = 0
    n_strains = 0
    for row in r:
        if row and row[0].strip():        # want non-empty row
            num_rows += 1
            lineage = list(zip(row_headers, row))
            lineage = [ x for x in lineage if x[0] != '_skip_' ]

            ident = lineage[0][1]
            lineage = lineage[1:]

            # clean lineage of null names, replace with 'unassigned'
            lineage = [ (a, lca_utils.filter_null(b)) for (a, b) in lineage ]
            lineage = [ LineagePair(a, b) for (a, b) in lineage ]

            # remove end nulls
            while lineage and lineage[-1].name == 'unassigned':
                lineage = lineage[:-1]

            # store lineage tuple
            if lineage:
                # check duplicates
                if ident in assignments:
                    if assignments[ident] != tuple(lineage):
                        if not force:
                            raise Exception("multiple lineages for identifier {}".format(ident))
                else:
                    assignments[ident] = tuple(lineage)

                    if lineage[-1].rank == 'species':
                        n_species += 1
                    elif lineage[-1].rank == 'strain':
                        n_species += 1
                        n_strains += 1

    fp.close()

    # this is to guard against a bug that happened once and I can't find
    # any more, when building a large GTDB-based database :) --CTB
    if len(assignments) * 0.2 > n_species and len(assignments) > 50:
        if not force:
            error('')
            error("ERROR: fewer than 20% of lineages have species-level resolution!?")
            error("({} species assignments found, of {} assignments total)",
                  n_species, len(assignments))
            error("** If this is intentional, re-run the command with -f.")
            sys.exit(-1)

    return assignments, num_rows

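# A hedged usage sketch: 'taxonomy.csv' is a hypothetical spreadsheet whose
# first column holds identifiers and whose remaining columns follow
# lca_utils.taxlist() order (superkingdom, phylum, ..., strain).
def _demo_load_taxonomy():
    assignments, num_rows = load_taxonomy_assignments('taxonomy.csv')
    notify('{} assignments from {} rows', len(assignments), num_rows)
    for ident, lineage in list(assignments.items())[:3]:
        # each lineage is a tuple of LineagePair(rank, name) entries
        notify('{} -> {}', ident, ";".join(lp.name for lp in lineage))
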
def index(args):
    """
    main function for building an LCA database.
    """
    if args.start_column < 2:
        error('error, --start-column cannot be less than 2')
        sys.exit(-1)

    set_quiet(args.quiet, args.debug)

    args.scaled = int(args.scaled)

    if args.ksize is None:
        args.ksize = DEFAULT_LOAD_K

    moltype = sourmash_args.calculate_moltype(args, default='DNA')

    notify('Building LCA database with ksize={} scaled={} moltype={}.',
           args.ksize, args.scaled, moltype)

    # first, load taxonomy spreadsheet
    delimiter = ','
    if args.tabs:
        delimiter = '\t'
    assignments, num_rows = load_taxonomy_assignments(args.csv,
                                               delimiter=delimiter,
                                               start_column=args.start_column,
                                               use_headers=not args.no_headers,
                                               force=args.force)

    notify('{} distinct identities in spreadsheet out of {} rows.',
           len(assignments), num_rows)
    notify('{} distinct lineages in spreadsheet out of {} rows.',
           len(set(assignments.values())), num_rows)

    db = LCA_Database(args.ksize, args.scaled, moltype)

    # notify('finding signatures...')
    if args.traverse_directory:
        yield_all_files = False           # only pick up *.sig files?
        if args.force:
            yield_all_files = True
        inp_files = list(sourmash_args.traverse_find_sigs(args.signatures,
                                                          yield_all_files=yield_all_files))
    else:
        inp_files = list(args.signatures)

    # track duplicates
    md5_to_name = {}

    #
    # main loop, connecting lineage ID to signature.
    #

    n = 0
    total_n = len(inp_files)
    record_duplicates = set()
    record_no_lineage = set()
    record_remnants = set(assignments)
    record_used_lineages = set()
    record_used_idents = set()
    n_skipped = 0
    for filename in inp_files:
        n += 1
        for sig in load_signatures(filename, ksize=args.ksize,
                                   select_moltype=moltype):
            notify(u'\r\033[K', end=u'')
            notify('\r... loading signature {} ({} of {}); skipped {} so far',
                   sig.name()[:30], n, total_n, n_skipped, end='')
            debug(filename, sig.name())

            # block off duplicates.
            if sig.md5sum() in md5_to_name:
                debug('WARNING: in file {}, duplicate md5sum: {}; skipping',
                      filename, sig.md5sum())
                record_duplicates.add(filename)
                continue

            md5_to_name[sig.md5sum()] = sig.name()

            # parse identifier, potentially with splitting
            ident = sig.name()
            if args.split_identifiers:    # hack for NCBI-style names, etc.
                # split on space...
                ident = ident.split(' ')[0]
                # ...and on period.
                ident = ident.split('.')[0]

            lineage = assignments.get(ident)

            # punt if no lineage and --require-taxonomy
            if lineage is None and args.require_taxonomy:
                debug('(skipping, because --require-taxonomy was specified)')
                n_skipped += 1
                continue

            # add the signature into the database.
            db.insert(sig, ident=ident, lineage=lineage)

            if lineage:
                # remove from our list of remaining ident -> lineage
                record_remnants.remove(ident)

                # track ident as used
                record_used_idents.add(ident)
                record_used_lineages.add(lineage)

            # track lineage info - either no lineage, or this lineage used.
            else:
                debug('WARNING: no lineage assignment for {}.', ident)
                record_no_lineage.add(ident)

    # end main add signatures loop

    if n_skipped:
        notify('... loaded {} signatures; skipped {} because of --require-taxonomy.',
               total_n, n_skipped)
    else:
        notify('... loaded {} signatures.', total_n)

    # check -- did we find any signatures?
    if n == 0:
        error('ERROR: no signatures found. ??')
        if args.traverse_directory and not args.force:
            error('(note, with --traverse-directory, you may want to use -f)')
        sys.exit(1)

    # check -- did the signatures we found have any hashes?
    if not db.hashval_to_idx:
        error('ERROR: no hash values found - are there any signatures?')
        sys.exit(1)
    notify('loaded {} hashes at ksize={} scaled={}', len(db.hashval_to_idx),
           args.ksize, args.scaled)

    # summarize:
    notify('{} assigned lineages out of {} distinct lineages in spreadsheet.',
           len(record_used_lineages), len(set(assignments.values())))
    unused_lineages = set(assignments.values()) - record_used_lineages

    notify('{} identifiers used out of {} distinct identifiers in spreadsheet.',
           len(record_used_idents), len(set(assignments)))
    assert record_used_idents.issubset(set(assignments))
    unused_identifiers = set(assignments) - record_used_idents

    # now, save!
    db_outfile = args.lca_db_out
    if not (db_outfile.endswith('.lca.json') or \
                db_outfile.endswith('.lca.json.gz')):    # logic -> db.save
        db_outfile += '.lca.json'
    notify('saving to LCA DB: {}'.format(db_outfile))

    db.save(db_outfile)

    ## done!

    # output a record of stuff if requested/available:
    if record_duplicates or record_no_lineage or record_remnants or unused_lineages:
        if record_duplicates:
            notify('WARNING: {} duplicate signatures.', len(record_duplicates))
        if record_no_lineage:
            notify('WARNING: no lineage provided for {} signatures.',
                   len(record_no_lineage))
        if record_remnants:
            notify('WARNING: no signatures for {} spreadsheet rows.',
                   len(record_remnants))
        if unused_lineages:
            notify('WARNING: {} unused lineages.', len(unused_lineages))

        if unused_identifiers:
            notify('WARNING: {} unused identifiers.', len(unused_identifiers))

        if args.report:
            notify("generating a report and saving in '{}'", args.report)
            generate_report(record_duplicates, record_no_lineage,
                            record_remnants, unused_lineages,
                            unused_identifiers, args.report)
        else:
            notify('(You can use --report to generate a detailed report.)')

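# Hedged invocation sketch for index(): in sourmash this is reached via the
# `sourmash lca index` subcommand; the file names below are hypothetical.
#
#     sourmash lca index taxonomy.csv out-db.lca.json sigs/*.sig \
#         --ksize 31 --scaled 10000 --report index-report.csv
#
# A trailing '.lca.json' is appended to the output name when missing, so
# 'out-db' and 'out-db.lca.json' produce the same file.
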
def main():
    p = argparse.ArgumentParser()
    p.add_argument('hashfile')            # file that contains hashes
    p.add_argument('-o', '--output', default=None,
                   help='file to output signature to')
    p.add_argument('-k', '--ksize', default=None, type=int)
    p.add_argument('--scaled', default=None, type=int)
    p.add_argument('--num', default=None, type=int)
    p.add_argument('--name', default='', help='signature name')
    p.add_argument('--filename', default='',
                   help='filename to add to signature')
    args = p.parse_args()

    # check arguments.
    if args.scaled and args.num:
        error('cannot specify both --num and --scaled! exiting.')
        return -1

    if not args.ksize:
        error('must specify --ksize')
        return -1

    if not args.output:
        error('must specify --output')
        return -1

    # first, load in all the hashes
    hashes = set()
    for line in open(args.hashfile, 'rt'):
        hashval = int(line.strip())
        hashes.add(hashval)

    if not hashes:
        error("ERROR, no hashes loaded from {}!", args.hashfile)
        return -1

    notify('loaded {} distinct hashes from {}', len(hashes), args.hashfile)

    # now, create the MinHash object that we'll use.
    scaled = 0
    num = 0
    if args.scaled:
        scaled = args.scaled
    elif args.num:
        num = args.num
    else:
        notify('setting --num automatically from the number of hashes.')
        num = len(hashes)

    # construct empty MinHash object according to args
    minhash = MinHash(n=num, ksize=args.ksize, scaled=scaled)

    # add hashes into!
    minhash.add_many(hashes)

    if len(minhash) < len(hashes):
        notify("WARNING: loaded {} hashes, but only {} made it into MinHash.",
               len(hashes), len(minhash))
        if scaled:
            notify("This is probably because of the scaled argument.")
        elif args.num:
            notify("This is probably because your --num is set to {}",
                   args.num)

    if num > len(minhash):
        notify("WARNING: --num set to {}, but only {} hashes in signature.",
               num, len(minhash))

    sigobj = sourmash.SourmashSignature(minhash, name=args.name,
                                        filename=args.filename)

    with open(args.output, 'wt') as fp:
        sourmash.save_signatures([sigobj], fp)
    notify('wrote signature to {}', args.output)

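# Hedged usage sketch: this main() expects a text file with one integer hash
# value per line (script and file names below are hypothetical).
#
#     python hashvals_to_signature.py hashes.txt -k 31 --scaled 1000 \
#         --name "recovered hashes" -o recovered.sig
#
# With neither --num nor --scaled supplied, --num is set to the number of
# distinct hashes read, so none are dropped from the MinHash.
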
def info(verbose=False):
    "Report sourmash version + version of installed dependencies."
    notify('sourmash version {}', sourmash.VERSION)
    notify('- loaded from path: {}', os.path.dirname(__file__))
    notify('')

    if verbose:
        notify('khmer version: None (internal Nodegraph)')
        notify('')

        notify('screed version {}', screed.__version__)
        notify('- loaded from path: {}', os.path.dirname(screed.__file__))

def compare_all_seqs(
    seqlist1,
    seqlist2=None,
    n_jobs=4,
    ksizes=KSIZES,
    moltype="protein",
    n_background=100,
    paired_seqlists=True,
    intermediate_csv=False,
    intermediate_parquet=False,
    no_final_concatenation=False,
):
    """Compare k-mer content of sequences across k-mer sizes and alphabets

    Parameters
    ----------
    seqlist1 : list
        List of (id, seq) tuples
    seqlist2 : list, optional
        List of (id, seq) tuples. If None, then an all-by-all comparison of
        sequences in seqlist1 is performed, as if seqlist1 was provided as
        seqlist2.
    n_jobs : int
        Number of jobs for multiprocessing
    ksizes : iterable of int
        K-mer sizes to extract and compare the sequences on
    moltype : str
        One of "protein" or "dna" -- for knowing which alphabets to use
    n_background : int
        When paired_seqlists is True, how many random background sequences
        to choose from seqlist2
    paired_seqlists : bool
        If True, then seqlist1 and seqlist2 have sequences at the same index
        that need to be compared, i.e. index 0 across the two. Best used
        when seqlist1 and seqlist2 are lists of homologous protein sequences
        across two different species
    intermediate_csv : bool
        Write intermediate file of all comparisons at index i to a csv format
    intermediate_parquet : bool
        Write intermediate file of all comparisons at index i to an
        IO-efficient parquet format

    Returns
    -------
    kmer_comparisons : pandas.DataFrame
        A table of (seq1_id, seq2_id, ksize, alphabet encoding, jaccard
        similarity)

    Raises
    ------
    ValueError
        If paired_seqlists=True and seqlist1 and seqlist2 are of different
        lengths, as the comparison is done pairwise across both, as if the
        'zip' operator was used.
    """
    if seqlist2 is not None:
        if paired_seqlists and len(seqlist1) != len(seqlist2):
            raise ValueError(
                "When comparing pairs of sequences, can only "
                "compare two sequences of equal length"
            )
        elif not paired_seqlists:
            # Want seqlist1 to be shorter so that there are fewer, bigger
            # jobs, to minimize thread spawning costs
            if len(seqlist1) > len(seqlist2):
                # Swap the seqlist order so seqlist1 is the shorter one
                seqlist1, seqlist2 = seqlist2, seqlist1
    else:
        seqlist2 = seqlist1

    n = len(seqlist1)
    m = len(seqlist2)
    n_comparisons = n * m

    t0 = time.time()
    len_seqlist1 = len(seqlist1)

    notify(f"Number of comparisons: {n} * {m} = {n_comparisons:,}")

    # Initialize the function using functools.partial with the arguments that
    # are shared across all comparisons; the only parameter mapped from the
    # pool is the index into seqlist1.
    func = partial(
        get_comparison_at_index,
        seqlist1=seqlist1,
        seqlist2=seqlist2,
        n_background=n_background,
        ksizes=ksizes,
        moltype=moltype,
        paired_seqlists=paired_seqlists,
        intermediate_csv=intermediate_csv,
        intermediate_parquet=intermediate_parquet,
        no_final_concatenation=no_final_concatenation,
    )
    notify("Created similarity func")

    # Initialize multiprocessing.Pool
    pool = multiprocessing.Pool(processes=n_jobs)

    # Calculate chunk size; by default, the pool.imap chunk size is 1
    chunksize, extra = divmod(len_seqlist1, n_jobs)
    if extra:
        chunksize += 1
    notify("Calculated chunk size for multiprocessing")

    # This will not generate the results yet, since pool.imap returns a
    # generator
    result = pool.imap(func, range(len_seqlist1), chunksize=chunksize)
    notify("Initialized multiprocessing pool.imap")

    peptide_kmer_comparisons = pd.concat(
        itertools.chain(*result), ignore_index=True
    )

    notify(f"Total time: {time.time() - t0}")
    return peptide_kmer_comparisons

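# A minimal usage sketch for compare_all_seqs(), with made-up protein
# sequences; paired_seqlists=False requests the full cross-product rather
# than index-paired comparisons.
def _demo_compare_all_seqs():
    seqs = [('seq1', 'MPIGSKERPTFFEIFKTRCNKADLGPISLN'),
            ('seq2', 'MPIGSKERPTFFEIFKTRCNKADLGPISLH'),
            ('seq3', 'MKTAYIAKQRQISFVKSHFSRQLEERLGLI')]
    # all 3 x 3 comparisons, one row per (pair, ksize, alphabet) combination
    df = compare_all_seqs(seqs, n_jobs=1, moltype='protein',
                          paired_seqlists=False)
    print(df.head())
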
def get_comparison_at_index(
    index,
    seqlist1,
    seqlist2=None,
    ksizes=KSIZES,
    n_background=100,
    moltype="protein",
    verbose=False,
    paired_seqlists=True,
    intermediate_csv=False,
    intermediate_parquet=False,
    no_final_concatenation=False,
):
    """Return similarities of the sequence at `index` in seqlist1 vs seqlist2

    Parameters
    ----------
    index : int
        Index into seqlist1; the sequence at this position is compared
        against seqlist2 (or against the rest of seqlist1)
    seqlist1 : list
        List of (id, seq) tuples
    seqlist2 : list, optional (default None)
        List of (id, seq) tuples. If None, then an all-by-all comparison of
        sequences in seqlist1 is performed, as if seqlist1 was provided as
        seqlist2.
    ksizes : iterable of int
        K-mer sizes to extract and compare the sequences on
    n_background : int, optional (default 100)
        When paired_seqlists is True, how many random background sequences
        to choose from seqlist2
    moltype : str, optional (default "protein")
        One of "protein" or "dna" -- for knowing which alphabets to use
    verbose : bool, optional (default False)
    paired_seqlists : bool, optional (default True)
        If True, then seqlist1 and seqlist2 have sequences at the same index
        that need to be compared, i.e. index 0 across the two. Best used
        when seqlist1 and seqlist2 are lists of homologous protein sequences
        across two different species
    intermediate_csv : bool
        Write intermediate file of all comparisons at index i to a csv format
    intermediate_parquet : bool
        Write intermediate file of all comparisons at index i to an
        IO-efficient parquet format

    Returns
    -------
    comparison_df_list : list
        list of pandas.DataFrame tables for the combinations of seqlist1 at
        index, compared to seqlist2
    """
    startt = time.time()

    id1 = seqlist1[index][0]
    id1_sanitized = sanitize_id(id1)
    csv = id1_sanitized + ".csv"
    parquet = id1_sanitized + ".parquet"

    if os.path.exists(parquet):
        notify(f"Found {parquet} already exists for {id1}, skipping", end="\r")
        return []

    if os.path.exists(csv):
        notify(f"Found {csv} already exists for {id1}, skipping", end="\r")
        return []

    if seqlist2 is not None:
        if paired_seqlists:
            seq_iterator = get_paired_seq_iterator(
                index, n_background, seqlist1, seqlist2, verbose
            )
        else:
            seq_iterator = itertools.product([seqlist1[index]], seqlist2)
    else:
        seq_iterator = itertools.product([seqlist1[index]],
                                         seqlist1[index + 1:])

    func = partial(compare_args_unpack, ksizes=ksizes, moltype=moltype)
    comparison_df_list = list(map(func, seq_iterator))
    notify(
        "comparison for index {} (id: {}) done in {:.5f} seconds",
        index,
        id1,
        time.time() - startt,
        end="\n",
    )

    if intermediate_csv or intermediate_parquet:
        df = pd.concat(comparison_df_list)
        if intermediate_csv:
            df.to_csv(csv)
        if intermediate_parquet:
            df.to_parquet(parquet)
        del df

    if no_final_concatenation:
        del comparison_df_list
        return []
    else:
        return comparison_df_list

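# A hedged single-index sketch: compare seqlist1[0] against a second list and
# leave a per-id CSV cache behind (ids and sequences are made up; sanitize_id
# is assumed to leave plain ids unchanged).
def _demo_comparison_at_index():
    seqs1 = [('idA', 'MKTAYIAKQRQISFVKSHFSRQLEERLGLI')]
    seqs2 = [('idB', 'MKTAYIAKQRQISFVKSHFSRQLEERLGII'),
             ('idC', 'MPIGSKERPTFFEIFKTRCNKADLGPISLN')]
    dfs = get_comparison_at_index(0, seqs1, seqs2, paired_seqlists=False,
                                  intermediate_csv=True)
    # a second call short-circuits: 'idA.csv' now exists, so [] is returned
    assert get_comparison_at_index(0, seqs1, seqs2,
                                   paired_seqlists=False) == []
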
def abundhist(args):
    """
    output abundance histogram and/or raw abundances.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    outlist = []
    total_loaded = 0
    for filename in args.signatures:
        siglist = sourmash.load_file_as_signatures(filename, ksize=args.ksize,
                                                   select_moltype=moltype)
        siglist = list(siglist)

        total_loaded += len(siglist)

        # select!
        if args.md5 is not None:
            siglist = [ss for ss in siglist if args.md5 in ss.md5sum()]
        if args.name is not None:
            siglist = [ss for ss in siglist if args.name in ss.name()]

        # accumulate the selected signatures across all files
        outlist.extend(siglist)

    notify("loaded {} total that matched ksize & molecule type",
           total_loaded)
    if len(outlist) != total_loaded:
        notify("selected {} via name / md5 selectors".format(len(outlist)))
    notify('')

    counts_d = collections.defaultdict(int)
    for ss in outlist:
        for hashval, abund in ss.minhash.hashes.items():
            counts_d[hashval] += abund

    all_counts = list(counts_d.values())

    min_range = 1
    if args.min is not None:
        min_range = args.min
    max_range = max(all_counts)
    if args.max is not None:
        max_range = args.max

    n_bins = args.bins
    if max_range - min_range + 1 < n_bins:
        n_bins = max_range - min_range + 1

    # make hist
    counts, bin_edges = numpy.histogram(all_counts,
                                        range=(min_range, max_range),
                                        bins=n_bins)
    bin_edges = bin_edges.astype(int)

    # plot
    fig = tpl.figure()
    f = fig.barh(counts, [str(x) for x in bin_edges[1:]], force_ascii=True)
    fig.show()

    # output histogram in csv?
    if args.output:
        with FileOutput(args.output, 'wt') as fp:
            w = csv.writer(fp)
            w.writerow(['count', 'n_count'])
            for nc, c in zip(counts, bin_edges[1:]):
                w.writerow([c, nc])

    # output raw counts tagged with hashval?
    if args.abundances:
        with FileOutput(args.abundances, 'wt') as fp:
            w = csv.writer(fp)
            w.writerow(['hashval', 'count'])
            for hashval, count in counts_d.items():
                w.writerow([hashval, count])

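# A minimal driver sketch for abundhist(), building the argparse-style
# namespace by hand; 'reads.sig' is a hypothetical signature computed with
# abundance tracking, and the dna/protein/dayhoff/hp flags mirror what
# sourmash_args.calculate_moltype() expects to find on args.
def _demo_abundhist():
    import argparse
    args = argparse.Namespace(
        quiet=False, signatures=['reads.sig'], ksize=31,
        md5=None, name=None, min=None, max=None, bins=20,
        output='hist.csv', abundances=None,
        dna=True, protein=False, dayhoff=False, hp=False,
    )
    abundhist(args)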