# Classify the hashes in a query signature as known or unknown, given
# separate "known" and "unknown" signatures that partition the query.
import argparse
import csv

import sourmash


def main():
    p = argparse.ArgumentParser()
    p.add_argument('query_sig')
    p.add_argument('known_sig')
    p.add_argument('unknown_sig')
    p.add_argument("-k", "--ksize", type=int, default=31,
                   help="ksize for analysis")
    p.add_argument("--moltype", default="DNA",
                   help="molecule type for analysis")
    p.add_argument("--scaled", default=None,
                   help="sourmash scaled value for analysis")  # note: parsed but unused below
    p.add_argument("--report",
                   help="output signature breakdown information in CSV")
    args = p.parse_args()

    ksize = args.ksize
    moltype = args.moltype

    query_sig = sourmash.load_file_as_signatures(args.query_sig, ksize=ksize,
                                                 select_moltype=moltype)
    query_sig = list(query_sig)[0]

    known_sig = sourmash.load_file_as_signatures(args.known_sig)
    known_sig = list(known_sig)[0]

    unknown_sig = sourmash.load_file_as_signatures(args.unknown_sig)
    unknown_sig = list(unknown_sig)[0]

    query_mh = query_sig.minhash
    known_mh = known_sig.minhash
    unknown_mh = unknown_sig.minhash

    # all three signatures must be compatible --
    assert query_mh.ksize == known_mh.ksize
    assert query_mh.moltype == known_mh.moltype
    assert known_mh.scaled == unknown_mh.scaled

    # bring the query down to the known/unknown resolution before comparing.
    query_mh = query_mh.downsample(scaled=known_mh.scaled)

    # the known and unknown hashes should exactly partition the query.
    assert len(query_mh) == len(known_mh) + len(unknown_mh)

    p_known = len(known_mh) / len(query_mh) * 100
    print(f"{len(known_mh)} known hashes of {len(query_mh)} total "
          f"({p_known:.1f}% known, {100 - p_known:.1f}% unknown).")

    if args.report:
        print(f"reporting stats to '{args.report}'")
        with open(args.report, 'wt') as fp:
            w = csv.writer(fp)
            w.writerow(["total_hashes", "known_hashes", "unknown_hashes",
                        "scaled", "moltype", "ksize"])
            w.writerow([len(query_mh), len(known_mh), len(unknown_mh),
                        query_mh.scaled, query_mh.moltype, query_mh.ksize])

    return 0
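# Entry-point sketch so the snippet above runs as a standalone script; the
# script filename in the example invocation is hypothetical:
#
#   python classify_hashes.py query.sig known.sig unknown.sig -k 31 --report report.csv
if __name__ == '__main__':
    raise SystemExit(main())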
# Regression test for `mashpit sketch`: sketch a test dataset and compare
# the generated signatures against a saved expected set, keyed by md5sum.
import subprocess
import unittest

from sourmash import load_file_as_signatures


class TestSketch(unittest.TestCase):
    # enclosing class name assumed; the original excerpt showed only the method.
    def test_script(self):
        # fail immediately if the sketch command itself errors out.
        subprocess.run('mashpit sketch test', shell=True, check=True)

        sig_dict_expected = {}
        sig_expected = load_file_as_signatures('expected_test.sig')
        for sig in sig_expected:
            sig_dict_expected[str(sig)] = str(sig.md5sum())

        sig_dict_generated = {}
        sig_generated = load_file_as_signatures('test.sig')
        for sig in sig_generated:
            sig_dict_generated[str(sig)] = str(sig.md5sum())

        # dict equality is order-independent, so the dicts can be compared
        # directly.
        self.assertDictEqual(sig_dict_expected, sig_dict_generated)
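# Standard unittest entry point, assuming this test file is run directly:
if __name__ == '__main__':
    unittest.main()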
# Transfer k-mer abundances from a read signature onto the k-mers present
# in a genome assembly, producing a new signature for just those k-mers.
import argparse

import screed
import sourmash


def main():
    p = argparse.ArgumentParser()
    p.add_argument('contigs')      # this is an assembly
    p.add_argument('read_sig')     # this contains a sourmash sig with abunds
    p.add_argument('-o', '--output', required=True)
    args = p.parse_args()

    siglist = sourmash.load_file_as_signatures(args.read_sig)
    siglist = list(siglist)
    assert len(siglist) == 1
    sig = siglist[0]

    # build a fresh minhash with the same parameters, holding the k-mers
    # from the assembly.
    contigs_mh = sig.minhash.copy_and_clear()
    for record in screed.open(args.contigs):
        contigs_mh.add_sequence(record.sequence, force=True)

    # intersect the genome assembly with the read abundances, so now we get
    # the abundances of only the k-mers that are in the assembly.
    abunds = {}
    for hashval in contigs_mh.hashes:
        abunds[hashval] = sig.minhash.hashes.get(hashval, 0)

    output_mh = sig.minhash.copy_and_clear()
    output_mh.set_abundances(abunds)

    out_sig = sourmash.SourmashSignature(output_mh)
    with open(args.output, 'wt') as fp:
        print(f"Saving output to '{args.output}'")
        sourmash.save_signatures([out_sig], fp)
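# Entry-point sketch with a hypothetical invocation:
#
#   python intersect_abunds.py assembly.fa reads.sig -o assembly.abunds.sig
if __name__ == '__main__':
    main()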
# Test that the `smash_reads` snakemake target builds an abundance-tracking
# signature zip from a pre-trimmed read set. `utils` and `run_snakemake`
# come from the surrounding test suite.
import os
import shutil

import sourmash


def test_smash_sig():
    # run 'smash_reads'
    global _tempdir

    abundtrim_dir = os.path.join(_tempdir, "abundtrim")
    os.mkdir(abundtrim_dir)

    conf = utils.relative_file('tests/test-data/SRR5950647_subset.conf')
    src = utils.relative_file("tests/test-data/SRR5950647_subset.abundtrim.fq.gz")
    shutil.copy(src, abundtrim_dir)

    extra_args = ["smash_reads"]
    status = run_snakemake(conf, verbose=True, outdir=_tempdir,
                           extra_args=extra_args)
    assert status == 0

    output_sig = f"{_tempdir}/sigs/SRR5950647_subset.abundtrim.sig.zip"
    assert os.path.exists(output_sig)

    sigs = list(sourmash.load_file_as_signatures(output_sig))
    assert len(sigs) == 3
    for s in sigs:
        assert s.minhash.track_abundance
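# `test_smash_sig` also leans on a module-level `_tempdir` that is not shown
# in this excerpt. A minimal sketch of the setup it assumes -- the real
# suite likely manages this with a pytest fixture instead:
import tempfile

_tempdir = tempfile.mkdtemp()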
# Collect signatures from files (and/or a pathlist file) into a single
# zipfile, optionally selecting by ksize/moltype and downsampling to a new
# scaled value.
import argparse
import sys
import zipfile

import sourmash


def main():
    p = argparse.ArgumentParser()
    p.add_argument('zipfile')
    p.add_argument('signatures', nargs='*')
    p.add_argument('--sig-pathlist')
    p.add_argument('--compression', type=int, default=9)
    p.add_argument('--ksize', type=int)
    # can we accept multiple ksizes and write multiple sigfiles in one pass?
    p.add_argument('--scaled', type=int)
    p.add_argument('--alphabet')
    args = p.parse_args()

    siglist = []
    if args.sig_pathlist:
        siglist = [x.rstrip() for x in open(args.sig_pathlist)]
    all_sigs = siglist + args.signatures

    # is this still needed? feels like we accept aliases now...
    if args.alphabet == "nucleotide":
        args.alphabet = "DNA"

    n = 0
    all_md5 = set()
    sig_scaled = None
    downsample = False

    with zipfile.ZipFile(args.zipfile, 'w') as zf:
        for filename in all_sigs:
            if n % 10000 == 0:
                print(f"... processing {n}th signature; currently reading signatures from '{filename}'")

            for sig in sourmash.load_file_as_signatures(filename,
                                                        ksize=args.ksize,
                                                        select_moltype=args.alphabet):
                # zip needs a unique name for each signature; use the sig md5sum.
                md5 = sig.md5sum()

                # if this is a duplicate md5sum, add _{number} to make it unique.
                if md5 in all_md5:
                    sys.stderr.write(f"{str(sig)} has an md5sum identical to one already in the zipfile ({md5})\n")
                    suffix = 0
                    full_md5 = f"{md5}_{suffix}"
                    while full_md5 in all_md5:
                        suffix += 1
                        full_md5 = f"{md5}_{suffix}"
                    md5 = full_md5
                    sys.stderr.write(f"...adding unique md5 {md5} instead\n")
                all_md5.add(md5)
                md5_name = 'signatures/' + md5 + '.sig'

                # on the first signature, check that we can downsample.
                if args.scaled and sig_scaled is None:
                    sig_scaled = sig.minhash.scaled
                    if args.scaled < sig_scaled:
                        print(f"Can't downsample: desired scaled {args.scaled} is smaller than original scaled, {sig_scaled}. Exiting!")
                        sys.exit(-1)
                    else:
                        downsample = True

                # if we need to downsample, do it.
                if downsample:
                    sig.minhash = sig.minhash.downsample(scaled=args.scaled)

                sigstr = sourmash.save_signatures([sig],
                                                  compression=args.compression)
                zf.writestr(md5_name, sigstr)
                n += 1

    print(f"wrote {n} signatures to '{args.zipfile}'")
    return 0
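# Entry-point sketch with a hypothetical invocation; `--sig-pathlist` names
# a file containing one signature path per line:
#
#   python sigs_to_zipfile.py all-sigs.zip *.sig --ksize 31 --scaled 10000
if __name__ == '__main__':
    raise SystemExit(main())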
# Plot a terminal histogram of hash abundances across selected signatures,
# optionally writing the histogram and/or raw abundances out as CSV.
import collections
import csv

import numpy
import termplotlib as tpl

import sourmash
from sourmash import sourmash_args
from sourmash.logging import notify, set_quiet
from sourmash.sourmash_args import FileOutput


def abundhist(args):
    """
    output abundance histogram and/or raw abundances.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    outlist = []
    total_loaded = 0
    for filename in args.signatures:
        siglist = sourmash.load_file_as_signatures(filename,
                                                   ksize=args.ksize,
                                                   select_moltype=moltype)
        siglist = list(siglist)
        total_loaded += len(siglist)

        # select on md5 and/or name substring, if requested.
        if args.md5 is not None:
            siglist = [ss for ss in siglist if args.md5 in ss.md5sum()]
        if args.name is not None:
            siglist = [ss for ss in siglist if args.name in ss.name()]

        # accumulate selected signatures across all input files.
        outlist.extend(siglist)

    notify("loaded {} total that matched ksize & molecule type",
           total_loaded)
    if len(outlist) != total_loaded:
        notify("selected {} via name / md5 selectors".format(len(outlist)))
    notify('')

    # sum abundances per hashval across all selected signatures.
    counts_d = collections.defaultdict(int)
    for ss in outlist:
        for hashval, abund in ss.minhash.hashes.items():
            counts_d[hashval] += abund

    all_counts = list(counts_d.values())

    min_range = 1
    if args.min is not None:
        min_range = args.min
    max_range = max(all_counts)
    if args.max is not None:
        max_range = args.max

    n_bins = args.bins
    if max_range - min_range + 1 < n_bins:
        n_bins = max_range - min_range + 1

    # make hist
    counts, bin_edges = numpy.histogram(all_counts,
                                        range=(min_range, max_range),
                                        bins=n_bins)
    bin_edges = bin_edges.astype(int)

    # plot in the terminal
    fig = tpl.figure()
    fig.barh(counts, [str(x) for x in bin_edges[1:]], force_ascii=True)
    fig.show()

    # output histogram in csv?
    if args.output:
        with FileOutput(args.output, 'wt') as fp:
            w = csv.writer(fp)
            w.writerow(['count', 'n_count'])
            for nc, c in zip(counts, bin_edges[1:]):
                w.writerow([c, nc])

    # output raw counts tagged with hashval?
    if args.abundances:
        with FileOutput(args.abundances, 'wt') as fp:
            w = csv.writer(fp)
            w.writerow(['hashval', 'count'])
            for hashval, count in counts_d.items():
                w.writerow([hashval, count])
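# `abundhist` pulls many attributes off `args`. A minimal parser sketch,
# with flag names inferred from the function body rather than from any
# published CLI; the --dna/--protein/--dayhoff/--hp flags are assumptions
# about what `sourmash_args.calculate_moltype` expects to find on `args`:
import argparse


def make_abundhist_parser():
    p = argparse.ArgumentParser()
    p.add_argument('signatures', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true')
    p.add_argument('-k', '--ksize', type=int, default=None)
    p.add_argument('--md5', default=None, help='select signatures by md5 substring')
    p.add_argument('--name', default=None, help='select signatures by name substring')
    p.add_argument('--min', type=int, default=None, help='minimum abundance to plot')
    p.add_argument('--max', type=int, default=None, help='maximum abundance to plot')
    p.add_argument('--bins', type=int, default=10, help='number of histogram bins')
    p.add_argument('-o', '--output', help='CSV output for the histogram')
    p.add_argument('-A', '--abundances', help='CSV output for raw hashval/abundance pairs')
    # moltype flags consumed by sourmash_args.calculate_moltype:
    p.add_argument('--dna', action='store_true')
    p.add_argument('--protein', action='store_true')
    p.add_argument('--dayhoff', action='store_true')
    p.add_argument('--hp', action='store_true')
    return p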
# Query a mashpit database: sketch the sample, compute Jaccard similarity
# against every signature in the database, and report metadata for the top
# 50 matches.
import multiprocessing
import ntpath
import os
import sys
from multiprocessing import Process
from operator import itemgetter

import pandas as pd
from sourmash import load_file_as_signatures, load_one_signature


def query(args):
    sample_path = args.sample
    sample_name = ntpath.basename(sample_path)
    cwd = os.getcwd()
    db_path = os.path.join(cwd, args.database + '.db')
    database_sig_path = os.path.join(cwd, args.database + '.sig')
    target_sig_path = os.path.join(cwd, sample_path + '.sig')

    # check that the database and its signature file exist; exit nonzero on
    # error.
    if not os.path.exists(db_path):
        print("Database not found.")
        sys.exit(1)
    if not os.path.exists(database_sig_path):
        print("Database signature file not found.")
        sys.exit(1)

    conn = create_connection(db_path)
    c = conn.cursor()

    # sketch the query sample and load the signature.
    get_target_sig(sample_path)
    target_sig = load_one_signature(target_sig_path)

    # manager dict: a shared variable for multiprocessing, but slow to iterate.
    manager = multiprocessing.Manager()
    srr_similarity_manager_dict = manager.dict()

    # check whether the signature file has been split into shards (this
    # needs a more elegant approach).
    if os.path.exists(args.database + '_1.sig'):
        proc_list = []
        for i in range(1, args.number):
            proc = Process(target=calculate_similarity,
                           args=(i, srr_similarity_manager_dict, target_sig,
                                 args.database))
            proc.start()
            proc_list.append(proc)
        for proc in proc_list:
            proc.join()
    else:
        database_sig = load_file_as_signatures(database_sig_path)
        for sig in database_sig:
            similarity = target_sig.jaccard(sig)
            srr_similarity_manager_dict[str(sig)] = similarity

    srr_similarity_dict = {}
    srr_similarity_dict.update(srr_similarity_manager_dict)

    # get the top 50 results.
    res_srr_similarity_dict = dict(
        sorted(srr_similarity_dict.items(), key=itemgetter(1), reverse=True)[:50])

    c.execute('SELECT * FROM METADATA')
    names = [description[0] for description in c.description]

    # look up metadata for each match; a parameterized query avoids SQL
    # injection via the signature name.
    frames = []
    for srr in res_srr_similarity_dict:
        sql_query = pd.read_sql_query(
            "select * from METADATA where srr = ?", conn, params=(str(srr),))
        df_query = pd.DataFrame(sql_query, columns=names)
        df_query['similarity_score'] = res_srr_similarity_dict[srr]
        frames.append(df_query)
    output_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame([])

    # if it is a standard database, add the link to the SNP cluster to the
    # output.
    c.execute("SELECT value FROM DESC where name = 'Type';")
    db_type = c.fetchone()[0]
    if db_type == 'Standard':
        pds_list = output_df['PDS_acc'].to_list()
        cluster_link = []
        for pds in pds_list:
            cluster_link.append(
                'https://www.ncbi.nlm.nih.gov/pathogens/isolates/#' + pds)
        output_df['link'] = cluster_link

    print(output_df)
    output_df.to_csv(sample_name + '_output.csv', index=True)
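# `create_connection` and `get_target_sig` are mashpit helpers defined
# elsewhere. A minimal sketch of `create_connection`, assuming it is a thin
# wrapper around sqlite3 (the real helper may add error handling):
import sqlite3


def create_connection(db_path):
    """Open a SQLite connection to the mashpit database."""
    return sqlite3.connect(db_path)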
def calculate_similarity(i, similarity_dict, target_sig, database):
    # compute Jaccard similarity between the target and each signature in
    # the i-th database shard, storing the results in the shared dict.
    database_sig = load_file_as_signatures(database + '_' + str(i) + '.sig')
    for sig in database_sig:
        similarity = target_sig.jaccard(sig)
        similarity_dict[str(sig)] = similarity