def siglist():
    """Return every signature found in the ``*.sig`` files of the "demo" test data.

    Files are visited in sorted filename order and their signatures are
    concatenated into a single flat list.
    """
    demo_dir = utils.get_test_data("demo")
    pattern = os.path.join(demo_dir, "*.sig")
    return [
        sig
        for sig_path in sorted(glob.glob(pattern))
        for sig in sourmash.load_signatures(sig_path)
    ]
def subtract(args):
    """
    subtract one or more signatures from another

    Parses ``args`` as the ``sourmash signature subtract`` command line,
    loads the signature in ``signature_from``, removes every hash present in
    any signature loaded from ``subtraction_sigs``, and writes the remaining
    hashes as a new signature to ``--output`` (stdout by default).

    Calls ``sys.exit(1)`` if a signature tracks abundance and ``--flatten``
    was not given, and ``sys.exit(-1)`` if no subtraction signature loaded.
    """
    p = SourmashArgumentParser(prog='sourmash signature subtract')
    p.add_argument('signature_from')
    p.add_argument('subtraction_sigs', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    p.add_argument('--flatten', action='store_true',
                   help='remove abundance from signatures before subtracting')

    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    from_sigfile = args.signature_from
    from_sigobj = sourmash.load_one_signature(from_sigfile, ksize=args.ksize,
                                              select_moltype=moltype)

    from_mh = from_sigobj.minhash
    if from_mh.track_abundance and not args.flatten:
        error('Cannot use subtract on signatures with abundance tracking, sorry!')
        sys.exit(1)

    subtract_mins = set(from_mh.get_mins())

    notify('loaded signature from {}...', from_sigfile, end='\r')

    total_loaded = 0
    for sigfile in args.subtraction_sigs:
        for sigobj in sourmash.load_signatures(sigfile,
                                               ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            if sigobj.minhash.track_abundance and not args.flatten:
                error('Cannot use subtract on signatures with abundance tracking, sorry!')
                sys.exit(1)

            subtract_mins -= set(sigobj.minhash.get_mins())

            notify('loaded and subtracted signatures from {}...', sigfile,
                   end='\r')
            total_loaded += 1

    if not total_loaded:
        error("no signatures to subtract!?")
        sys.exit(-1)

    subtract_mh = from_sigobj.minhash.copy_and_clear()

    # --flatten promises abundance-free signatures; without this the output
    # would still be an abundance-tracking MinHash whenever the source
    # signature tracked abundance.  Same pattern as the sibling commands:
    # flatten the empty copy before adding hashes.
    if args.flatten and subtract_mh.track_abundance:
        _flatten(subtract_mh)

    subtract_mh.add_many(subtract_mins)

    subtract_sigobj = sourmash.SourmashSignature(subtract_mh)

    sourmash.save_signatures([subtract_sigobj], fp=args.output)

    notify('loaded and subtracted {} signatures', total_loaded)
def test_linear_index_save():
    """Saving a LinearIndex writes out exactly the signatures that were inserted."""
    path2 = utils.get_test_data('2.fa.sig')
    path47 = utils.get_test_data('47.fa.sig')
    path63 = utils.get_test_data('63.fa.sig')

    # only the first file is loaded with an explicit ksize
    ss2 = sourmash.load_one_signature(path2, ksize=31)
    ss47 = sourmash.load_one_signature(path47)
    ss63 = sourmash.load_one_signature(path63)

    index = LinearIndex()
    for ss in (ss2, ss47, ss63):
        index.insert(ss)

    with utils.TempDirectory() as tmpdir:
        saved_path = os.path.join(tmpdir, 'foo')
        index.save(saved_path)

        from sourmash import load_signatures
        reloaded = set(load_signatures(saved_path))

    expected = {ss2, ss47, ss63}

    # diagnostics shown by pytest on failure
    print(len(reloaded))
    print(len(expected))
    print(reloaded)
    print(expected)

    assert reloaded == expected, reloaded
def test_linear_index_moltype_select():
    """LinearIndex.select narrows by ksize and/or molecule type."""
    # the fixture holds two ksizes and two moltypes (DNA and protein)
    sigfile = utils.get_test_data('genome-s10+s11.sig')

    linear = LinearIndex()
    for ss in sourmash.load_signatures(sigfile):
        linear.insert(ss)

    # (selection criteria, expected number of matches), checked in order
    cases = [
        # most specific DNA / protein selections
        ({'ksize': 30, 'moltype': 'DNA'}, 1),
        ({'ksize': 30, 'moltype': 'protein'}, 1),
        # leaving off ksize selects all ksizes
        ({'moltype': 'DNA'}, 2),
        ({'moltype': 'protein'}, 2),
        # an impossible ksize selects nothing
        ({'ksize': 4}, 0),
    ]
    for criteria, n_expected in cases:
        selected = linear.select(**criteria)
        assert len(selected) == n_expected
def flatten(args):
    """
    flatten a signature, removing abundances.

    Parses ``args`` as the ``sourmash signature flatten`` command line, loads
    the signatures matching the ksize / molecule type, optionally keeps only
    those whose md5 or name contains ``--md5`` / ``--name``, replaces each
    MinHash with an abundance-free copy, and writes them to ``--output``
    (stdout by default).
    """
    p = SourmashArgumentParser(prog='sourmash signature flatten')
    p.add_argument('signatures', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    p.add_argument('--md5', default=None,
                   help='select signatures whose md5 contains this substring')
    p.add_argument('--name', default=None,
                   help='select signatures whose name contains this substring')

    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    outlist = []
    total_loaded = 0
    for filename in args.signatures:
        siglist = list(sourmash.load_signatures(filename, ksize=args.ksize,
                                                select_moltype=moltype,
                                                do_raise=True))
        # counted before the md5/name selection is applied
        total_loaded += len(siglist)

        # select!
        if args.md5 is not None:
            siglist = [ss for ss in siglist if args.md5 in ss.md5sum()]
        if args.name is not None:
            siglist = [ss for ss in siglist if args.name in ss.name()]

        for ss in siglist:
            # flatten an empty copy, then re-add the hashes without abundance
            flattened_mh = ss.minhash.copy_and_clear()
            _flatten(flattened_mh)
            flattened_mh.add_many(ss.minhash.get_mins())

            ss.minhash = flattened_mh

        outlist.extend(siglist)

    sourmash.save_signatures(outlist, fp=args.output)

    notify("loaded {} total that matched ksize & molecule type",
           total_loaded)
    notify("extracted {} signatures from {} file(s)", len(outlist),
           len(args.signatures))
def intersect(args):
    """
    intersect one or more signatures by taking the intersection of hashes.

    Abundances are dropped from the result unless ``args.abundances_from``
    names a signature file; in that case the result is further intersected
    with that signature and carries its abundance values.

    Calls ``sys.exit(-1)`` if no signatures were loaded, or if the
    ``abundances_from`` signature does not track abundance.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    template_sig = None
    shared = None
    n_loaded = 0
    for sigfile in args.signatures:
        loaded = sourmash.load_signatures(sigfile,
                                          ksize=args.ksize,
                                          select_moltype=moltype,
                                          do_raise=True)
        for sigobj in loaded:
            if template_sig is None:
                # the first signature seeds the running intersection
                template_sig = sigobj
                shared = set(sigobj.minhash.get_mins())
            else:
                shared.intersection_update(sigobj.minhash.get_mins())
            n_loaded += 1
        notify('loaded and intersected signatures from {}...', sigfile,
               end='\r')

    if n_loaded == 0:
        error("no signatures to merge!?")
        sys.exit(-1)

    if args.abundances_from:
        notify('loading signature from {}, keeping abundances',
               args.abundances_from)
        abund_sig = sourmash.load_one_signature(args.abundances_from,
                                                ksize=args.ksize,
                                                select_moltype=moltype)
        if not abund_sig.minhash.track_abundance:
            error("--track-abundance not set on loaded signature?! exiting.")
            sys.exit(-1)

        intersect_mh = abund_sig.minhash.copy_and_clear()
        abund_mins = abund_sig.minhash.get_mins(with_abundance=True)

        # one last intersection, then keep only the surviving abundances
        shared.intersection_update(abund_mins)
        intersect_mh.set_abundances({h: abund_mins[h] for h in shared})
    else:
        # no abundance source: forcibly turn off track_abundance
        intersect_mh = template_sig.minhash.copy_and_clear()
        intersect_mh.track_abundance = False
        intersect_mh.add_many(shared)

    intersect_sigobj = sourmash.SourmashSignature(intersect_mh)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures([intersect_sigobj], fp=fp)

    notify('loaded and intersected {} signatures', n_loaded)
def downsample(args):
    """
    downsample a scaled signature.

    Exactly one of ``args.num`` / ``args.scaled`` must be set.  Each loaded
    signature is downsampled to that value; a num MinHash is converted to a
    scaled one (or vice versa) when it holds enough hashes to do so.  The
    results are written to ``args.output``.

    Calls ``sys.exit(-1)`` if neither or both of num/scaled are given.
    Raises ValueError when a MinHash cannot be converted between num and
    scaled.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    if not args.num and not args.scaled:
        error('must specify either --num or --scaled value')
        sys.exit(-1)

    if args.num and args.scaled:
        error('cannot specify both --num and --scaled')
        sys.exit(-1)

    output_list = []
    total_loaded = 0
    for sigfile in args.signatures:
        siglist = sourmash.load_signatures(sigfile,
                                           ksize=args.ksize,
                                           select_moltype=moltype,
                                           do_raise=True)

        for sigobj in siglist:
            mh = sigobj.minhash

            notify('loading and downsampling signature from {}...', sigfile,
                   end='\r')
            total_loaded += 1
            if args.scaled:
                if mh.scaled:
                    mh_new = mh.downsample_scaled(args.scaled)
                else:
                    # try to turn a num into a scaled
                    # first check: can we?
                    max_hash = get_max_hash_for_scaled(args.scaled)
                    # default=None lets an empty MinHash reach the explicit
                    # error below instead of max()'s opaque "empty sequence"
                    largest = max(mh.get_mins(), default=None)
                    if largest is None or largest < max_hash:
                        raise ValueError("this num MinHash does not have enough hashes to convert it into a scaled MinHash.")

                    mh_new = copy.copy(mh)
                    _set_num_scaled(mh_new, 0, args.scaled)
            elif args.num:
                if mh.num:
                    mh_new = mh.downsample_n(args.num)
                else:
                    # try to turn a scaled into a num
                    # first check: can we?
                    if len(mh) < args.num:
                        # fill in the placeholder; it used to be emitted
                        # as a literal "{}"
                        raise ValueError("this scaled MinHash has only {} hashes".format(len(mh)))

                    mh_new = copy.copy(mh)
                    _set_num_scaled(mh_new, args.num, 0)

            sigobj.minhash = mh_new

            output_list.append(sigobj)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures(output_list, fp=fp)

    notify("loaded and downsampled {} signatures", total_loaded)
def extract(args):
    """
    extract signatures.

    Parses ``args`` as the ``sourmash signature extract`` command line,
    loads the signatures matching the ksize / molecule type, keeps those
    whose md5 and name contain the ``--md5`` / ``--name`` substrings (when
    given), and writes them to ``--output``.  Calls ``sys.exit(-1)`` if
    nothing matches.
    """
    parser = SourmashArgumentParser(prog='sourmash signature extract')
    parser.add_argument('signatures', nargs='+')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='suppress non-error output')
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
                        default=sys.stdout,
                        help='output signature to this file')
    parser.add_argument('--md5', default=None,
                        help='select signatures whose md5 contains this substring')
    parser.add_argument('--name', default=None,
                        help='select signatures whose name contains this substring')

    sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(parser)
    args = parser.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    def is_selected(ss):
        # md5 is tested first; name only for signatures that passed md5
        if args.md5 is not None and args.md5 not in ss.md5sum():
            return False
        if args.name is not None and args.name not in ss.name():
            return False
        return True

    outlist = []
    total_loaded = 0
    for filename in args.signatures:
        loaded = list(sourmash.load_signatures(filename, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True))
        total_loaded += len(loaded)
        outlist.extend(ss for ss in loaded if is_selected(ss))

    notify("loaded {} total that matched ksize & molecule type",
           total_loaded)

    if not outlist:
        error("no matching signatures!")
        sys.exit(-1)

    sourmash.save_signatures(outlist, fp=args.output)

    notify("extracted {} signatures from {} file(s)", len(outlist),
           len(args.signatures))
def test_sourmash_signature_api():
    """A signature saved with save_signatures loads back equal via both loaders."""
    minhash = sourmash.MinHash(n=1, ksize=20)
    original = sourmash.SourmashSignature(minhash)

    serialized = sourmash.save_signatures([original])

    via_load_one = sourmash.load_one_signature(serialized)
    all_loaded = list(sourmash.load_signatures(serialized))
    via_load_many = all_loaded[0]

    assert via_load_one == original
    assert via_load_many == original
def test_sig_downsample_1_scaled_downsample_multisig(c):
    """Downsampling a multi-signature file rescales every signature in it."""
    multisig_path = utils.get_test_data('47+63-multisig.sig')
    c.run_sourmash('sig', 'downsample', '--scaled', '10000', multisig_path)

    # the downsampled signatures are written to stdout
    downsampled = sourmash.load_signatures(c.last_result.out)
    assert all(sig.minhash.scaled == 10000 for sig in downsampled)
def merge(args):
    """
    merge one or more signatures.

    Every signature loaded from ``args.signatures`` is merged into a cleared
    copy of the first signature's MinHash, and the merged signature is
    written to ``args.output``.  With ``args.flatten`` abundance tracking is
    switched off on every MinHash involved; without it, each signature is
    checked for abundance compatibility with the first one.

    Calls ``sys.exit(-1)`` if no signatures were loaded.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    template_sig = None
    merged_mh = None
    n_total = 0

    # iterate over all the sigs from all the files.
    for sigfile in args.signatures:
        notify('loading signatures from {}...', sigfile, end='\r')
        n_in_file = 0
        loaded = sourmash.load_signatures(sigfile,
                                          ksize=args.ksize,
                                          select_moltype=moltype,
                                          do_raise=True)
        for sigobj in loaded:
            if template_sig is None:
                # first signature: it provides the MinHash parameters
                template_sig = sigobj
                merged_mh = template_sig.minhash.copy_and_clear()

                # forcibly remove abundance?
                if args.flatten:
                    merged_mh.track_abundance = False

            try:
                current_mh = sigobj.minhash
                if args.flatten:
                    current_mh.track_abundance = False
                else:
                    _check_abundance_compatibility(template_sig, sigobj)

                merged_mh.merge(current_mh)
            except BaseException:
                # report which signature failed, then let the error propagate
                error("ERROR when merging signature '{}' ({}) from file {}",
                      sigobj.name(), sigobj.md5sum()[:8], sigfile)
                raise

            n_in_file += 1
            n_total += 1

        if n_in_file:
            notify('loaded and merged {} signatures from {}...', n_in_file,
                   sigfile, end='\r')

    if not n_total:
        error("no signatures to merge!?")
        sys.exit(-1)

    merged_sigobj = sourmash.SourmashSignature(merged_mh)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures([merged_sigobj], fp=fp)

    notify('loaded and merged {} signatures', n_total)
def intersect(args):
    """
    intersect one or more signatures by taking the intersection of hashes.

    This function always removes abundances.

    Parses ``args`` as the ``sourmash signature intersect`` command line and
    writes one signature, holding the hashes common to all loaded
    signatures, to ``--output`` (stdout by default).  Calls ``sys.exit(-1)``
    if no signatures were loaded.
    """
    p = SourmashArgumentParser(prog='sourmash signature intersect')
    p.add_argument('signatures', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')

    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    first_sig = None
    mins = None
    total_loaded = 0

    for sigfile in args.signatures:
        for sigobj in sourmash.load_signatures(sigfile,
                                               ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            if first_sig is None:
                # the first signature seeds the running intersection;
                # intersecting it with itself would be a no-op
                first_sig = sigobj
                mins = set(sigobj.minhash.get_mins())
            else:
                mins.intersection_update(sigobj.minhash.get_mins())
            total_loaded += 1
        notify('loaded and intersected signatures from {}...', sigfile,
               end='\r')

    if total_loaded == 0:
        error("no signatures to merge!?")
        sys.exit(-1)

    # forcibly turn off track_abundance
    intersect_mh = first_sig.minhash.copy_and_clear()
    _flatten(intersect_mh)
    intersect_mh.add_many(mins)
    intersect_sigobj = sourmash.SourmashSignature(intersect_mh)

    sourmash.save_signatures([intersect_sigobj], fp=args.output)

    notify('loaded and intersected {} signatures', total_loaded)
def flatten(args):
    """
    flatten a signature, removing abundances.

    Parses ``args`` as the ``sourmash signature flatten`` command line.
    Signatures matching the ksize / molecule type are loaded, optionally
    narrowed to those whose md5 or name contains ``--md5`` / ``--name``,
    given an abundance-free MinHash, and written to ``--output`` (stdout by
    default).
    """
    p = SourmashArgumentParser(prog='sourmash signature flatten')
    p.add_argument('signatures', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    p.add_argument('--md5', default=None,
                   help='select signatures whose md5 contains this substring')
    p.add_argument('--name', default=None,
                   help='select signatures whose name contains this substring')

    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    outlist = []
    total_loaded = 0
    for filename in args.signatures:
        siglist = list(sourmash.load_signatures(filename, ksize=args.ksize,
                                                select_moltype=moltype,
                                                do_raise=True))
        # the total reflects ksize/moltype matches, before md5/name selection
        total_loaded += len(siglist)

        # select!
        if args.md5 is not None:
            siglist = [ss for ss in siglist if args.md5 in ss.md5sum()]
        if args.name is not None:
            siglist = [ss for ss in siglist if args.name in ss.name()]

        for ss in siglist:
            # build the replacement from an empty, flattened copy so that
            # only the hash values (not their abundances) carry over
            flattened_mh = ss.minhash.copy_and_clear()
            _flatten(flattened_mh)
            flattened_mh.add_many(ss.minhash.get_mins())

            ss.minhash = flattened_mh

        outlist.extend(siglist)

    sourmash.save_signatures(outlist, fp=args.output)

    notify("loaded {} total that matched ksize & molecule type",
           total_loaded)
    notify("extracted {} signatures from {} file(s)", len(outlist),
           len(args.signatures))
def test_sig_flatten_2_ksize(c):
    """Flattening with -k 31 emits only the one signature of that ksize."""
    mag_path = utils.get_test_data('lca/TARA_PSW_MAG_00136.sig')
    c.run_sourmash('sig', 'flatten', mag_path, '-k', '31')

    # the flattened signature is written to stdout
    flattened = list(sourmash.load_signatures(c.last_result.out))

    assert len(flattened) == 1
def test_sig_extract_7(c):
    """Extracting with -k 31 yields exactly one matching signature."""
    sig_path = utils.get_test_data('2.fa.sig')
    c.run_sourmash('sig', 'extract', sig_path, '-k', '31')

    # the extracted signature is written to stdout
    extracted = list(sourmash.load_signatures(c.last_result.out))

    assert len(extracted) == 1
def test_sig_extract_7_no_ksize(c):
    """Without -k, extract returns all three signatures in the file."""
    sig_path = utils.get_test_data('2.fa.sig')
    c.run_sourmash('sig', 'extract', sig_path)

    # the extracted signatures are written to stdout
    extracted = list(sourmash.load_signatures(c.last_result.out))

    assert len(extracted) == 3
def filter(args):
    """
    filter hashes by abundance in all of the signatures

    Loads the signatures matching ksize / molecule type (and, when given,
    the ``args.md5`` / ``args.name`` substrings).  In each one, only hashes
    whose abundance is at least ``args.min_abundance`` and, when
    ``args.max_abundance`` is not None, at most that value are kept.  The
    filtered signatures are written to ``args.output``.

    Signatures without abundance tracking cannot be filtered; they are
    reported and left out of the output.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    outlist = []
    total_loaded = 0
    for filename in args.signatures:
        siglist = list(sourmash.load_signatures(filename, ksize=args.ksize,
                                                select_moltype=moltype,
                                                do_raise=True))
        total_loaded += len(siglist)

        # select!
        if args.md5 is not None:
            siglist = [ss for ss in siglist if args.md5 in ss.md5sum()]
        if args.name is not None:
            siglist = [ss for ss in siglist if args.name in ss.name()]

        for ss in siglist:
            mh = ss.minhash
            if not mh.track_abundance:
                notify('ignoring signature {} - track_abundance not set.',
                       ss)
                # really ignore it: previously the whole selection was added
                # to the output, so "ignored" signatures were still written
                # and counted
                continue

            abunds = mh.get_mins(with_abundance=True)
            abunds2 = {
                k: v for k, v in abunds.items()
                if v >= args.min_abundance
                and (args.max_abundance is None or v <= args.max_abundance)
            }

            filtered_mh = mh.copy_and_clear()
            filtered_mh.set_abundances(abunds2)

            ss.minhash = filtered_mh
            outlist.append(ss)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures(outlist, fp=fp)

    notify("loaded {} total that matched ksize & molecule type",
           total_loaded)
    notify("extracted {} signatures from {} file(s)", len(outlist),
           len(args.signatures))
def test_sig_extract_6(c):
    """--name matches signatures across several input files."""
    path47 = utils.get_test_data('47.fa.sig')
    path63 = utils.get_test_data('63.fa.sig')
    c.run_sourmash('sig', 'extract', path47, path63, '--name', 'Shewanella')

    # the extracted signatures are written to stdout
    extracted = list(sourmash.load_signatures(c.last_result.out))

    assert len(extracted) == 2
def test_sig_rename_1_multisig(c):
    """Renaming applies the new name to every signature in every input file."""
    multisig_path = utils.get_test_data('47+63-multisig.sig')
    other_path = utils.get_test_data('2.fa.sig')
    c.run_sourmash('sig', 'rename', multisig_path, other_path, 'fiz bar')

    # the renamed signatures are written to stdout
    renamed = sourmash.load_signatures(c.last_result.out)

    # n stays 0 if nothing was emitted; otherwise it ends as the count
    n = 0
    for n, sig in enumerate(renamed, start=1):
        assert sig.name() == 'fiz bar'

    assert n == 9, n
def subtract(args):
    """
    subtract one or more signatures from another

    Loads the signature in ``args.signature_from``, removes every hash found
    in any signature loaded from ``args.subtraction_sigs``, and writes the
    remaining hashes as a new signature to ``args.output``.

    Calls ``sys.exit(1)`` if a signature tracks abundance and
    ``args.flatten`` is not set, and ``sys.exit(-1)`` if no subtraction
    signature loaded.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    from_sigfile = args.signature_from
    from_sigobj = sourmash.load_one_signature(from_sigfile, ksize=args.ksize,
                                              select_moltype=moltype)

    from_mh = from_sigobj.minhash
    if from_mh.track_abundance and not args.flatten:
        error('Cannot use subtract on signatures with abundance tracking, sorry!')
        sys.exit(1)

    subtract_mins = set(from_mh.get_mins())

    notify('loaded signature from {}...', from_sigfile, end='\r')

    total_loaded = 0
    for sigfile in args.subtraction_sigs:
        for sigobj in sourmash.load_signatures(sigfile,
                                               ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            if sigobj.minhash.track_abundance and not args.flatten:
                error('Cannot use subtract on signatures with abundance tracking, sorry!')
                sys.exit(1)

            subtract_mins -= set(sigobj.minhash.get_mins())

            notify('loaded and subtracted signatures from {}...', sigfile,
                   end='\r')
            total_loaded += 1

    if not total_loaded:
        error("no signatures to subtract!?")
        sys.exit(-1)

    subtract_mh = from_sigobj.minhash.copy_and_clear()

    # Honour --flatten in the output too: without this the result keeps
    # abundance tracking whenever the source signature had it, even though
    # only bare hash values are added below.
    if args.flatten:
        subtract_mh.track_abundance = False

    subtract_mh.add_many(subtract_mins)

    subtract_sigobj = sourmash.SourmashSignature(subtract_mh)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures([subtract_sigobj], fp=fp)

    notify('loaded and subtracted {} signatures', total_loaded)
def test_sig_cat_2_out(c):
    """'sig cat -o' writes every input signature, in order, to the output file."""
    path47 = utils.get_test_data('47.fa.sig')
    path47_abund = utils.get_test_data('track_abund/47.fa.sig')
    multisig_path = utils.get_test_data('47+63-multisig.sig')

    c.run_sourmash('sig', 'cat', path47, path47_abund, multisig_path,
                   '-o', 'out.sig')

    # the output file should hold the same signatures as the inputs
    catted = list(sourmash.load_signatures(c.output('out.sig')))
    print(len(catted))

    expected_repr = """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 57e2b22f), SourmashSignature('NC_009661.1 Shewanella baltica OS185 plasmid pS18501, complete sequence', bde81a41), SourmashSignature('NC_011663.1 Shewanella baltica OS223, complete genome', f033bbd8), SourmashSignature('NC_011664.1 Shewanella baltica OS223 plasmid pS22301, complete sequence', 87a9aec4), SourmashSignature('NC_011668.1 Shewanella baltica OS223 plasmid pS22302, complete sequence', 837bf2a7), SourmashSignature('NC_011665.1 Shewanella baltica OS223 plasmid pS22303, complete sequence', 485c3377)]"""

    assert repr(catted) == expected_repr
def test_sig_cat_1_no_unique(c):
    """Catting a file with itself, without --unique, emits the signature twice."""
    # cat 47 to 47... twice
    sig47 = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'cat', sig47, sig47)

    # stdout should be same signature
    out = c.last_result.out

    test_cat_sig = sourmash.load_one_signature(sig47)
    actual_cat_sigs = list(sourmash.load_signatures(out))

    for sig in actual_cat_sigs:
        assert sig == test_cat_sig

    # Count explicitly: reading the loop index after an enumerate() loop
    # raises NameError, not an assertion failure, when the output is empty.
    assert len(actual_cat_sigs) == 2

    assert 'encountered 1 MinHashes multiple times' in c.last_result.err
def test_sig_flatten_1(c):
    """Flattening the abundance-tracking 47 signature reproduces the flat 47 MinHash."""
    abund_path = utils.get_test_data('track_abund/47.fa.sig')
    flat_path = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'flatten', abund_path, '--name', 'Shewanella')

    # the flattened signature is written to stdout
    flattened = list(sourmash.load_signatures(c.last_result.out))
    assert len(flattened) == 1

    expected = sourmash.load_one_signature(flat_path)
    assert expected.minhash == flattened[0].minhash
def test_linear_index_multik_select():
    """select() on an index holding several ksizes narrows by ksize when asked."""
    # the fixture holds three ksizes: 21, 31 and 51
    sig_path = utils.get_test_data('2.fa.sig')

    linear = LinearIndex()
    for ss in sourmash.load_signatures(sig_path):
        linear.insert(ss)

    # the most specific selection matches a single signature
    by_ksize_and_moltype = linear.select(ksize=31, moltype='DNA')
    assert len(by_ksize_and_moltype) == 1

    # all of them are DNA
    by_moltype = linear.select(moltype='DNA')
    assert len(by_moltype) == 3
def test_sig_filter_1(c):
    """Filtering with default bounds leaves both abundance signatures unchanged."""
    path47 = utils.get_test_data('track_abund/47.fa.sig')
    path63 = utils.get_test_data('track_abund/63.fa.sig')
    c.run_sourmash('sig', 'filter', path47, path63)

    # the filtered signatures are written to stdout, in input order
    filtered = list(sourmash.load_signatures(c.last_result.out))
    assert len(filtered) == 2

    expected_mh47 = sourmash.load_one_signature(path47).minhash
    expected_mh63 = sourmash.load_one_signature(path63).minhash

    first, second = filtered
    assert first.minhash == expected_mh47
    assert second.minhash == expected_mh63
def test_sig_cat_1_unique(c):
    """Catting a file with itself under --unique emits the signature once."""
    # cat 47 to 47... twice... and get unique
    sig47 = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'cat', sig47, sig47, '--unique')

    # stdout should be same signature
    out = c.last_result.out
    err = c.last_result.err

    test_cat_sig = sourmash.load_one_signature(sig47)
    actual_cat_sigs = list(sourmash.load_signatures(out))

    for sig in actual_cat_sigs:
        assert sig == test_cat_sig

    # Count explicitly: reading the loop index after an enumerate() loop
    # raises NameError, not an assertion failure, when the output is empty.
    assert len(actual_cat_sigs) == 1

    assert 'encountered 1 MinHashes multiple times' in err
    assert '...and removed the duplicates, because --unique was specified.' in err
def rename(args):
    """
    rename one or more signatures.

    Parses ``args`` as the ``sourmash signature rename`` command line, sets
    the name of every signature loaded from ``sigfiles`` to ``name``, and
    writes them all to ``--output`` (stdout when not given).
    """
    p = SourmashArgumentParser(prog='sourmash signature rename')
    p.add_argument('sigfiles', nargs='+')
    p.add_argument('name')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-d', '--debug', action='store_true',
                   help='output debugging output')
    p.add_argument('-o', '--output', help='output to this file')

    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    # Forward --debug: passing args.quiet twice left the -d flag without any
    # effect.  NOTE(review): assumes set_quiet's second argument enables
    # debug output, as the debug() call below suggests - confirm.
    set_quiet(args.quiet, args.debug)
    moltype = sourmash_args.calculate_moltype(args)

    outlist = []
    for filename in args.sigfiles:
        debug('loading {}', filename)
        siglist = sourmash.load_signatures(filename, ksize=args.ksize,
                                           select_moltype=moltype)

        for sigobj in siglist:
            sigobj.d['name'] = args.name
            outlist.append(sigobj)

    if args.output:
        # the context manager closes the file even if saving raises
        with open(args.output, 'wt') as fp:
            sourmash.save_signatures(outlist, fp=fp)
    else:
        sourmash.save_signatures(outlist, fp=sys.stdout)

    notify("set name to '{}' on {} signatures", args.name, len(outlist))
def rename(args):
    """
    rename one or more signatures.

    Sets the name of every signature loaded from ``args.sigfiles`` to
    ``args.name`` and writes them all to ``args.output``.
    """
    # The second argument was args.quiet repeated, so a debug flag could
    # never take effect.  getattr keeps this working when the parser that
    # built ``args`` defines no ``debug`` attribute.
    # NOTE(review): assumes set_quiet's second argument enables debug
    # output, as the debug() call below suggests - confirm.
    set_quiet(args.quiet, getattr(args, 'debug', False))
    moltype = sourmash_args.calculate_moltype(args)

    outlist = []
    for filename in args.sigfiles:
        debug('loading {}', filename)
        siglist = sourmash.load_signatures(filename, ksize=args.ksize,
                                           select_moltype=moltype)

        for sigobj in siglist:
            sigobj._name = args.name
            outlist.append(sigobj)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures(outlist, fp=fp)

    notify("set name to '{}' on {} signatures", args.name, len(outlist))
def flatten(args):
    """
    flatten a signature, removing abundances.

    Loads the signatures matching ksize / molecule type from
    ``args.signatures``, keeps those whose md5 and name contain
    ``args.md5`` / ``args.name`` (when not None), replaces each MinHash with
    a copy that has abundance tracking switched off, and writes the
    signatures to ``args.output``.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    def is_selected(ss):
        # md5 is tested first; name only for signatures that passed md5
        if args.md5 is not None and args.md5 not in ss.md5sum():
            return False
        if args.name is not None and args.name not in ss.name():
            return False
        return True

    outlist = []
    total_loaded = 0
    for filename in args.signatures:
        loaded = list(sourmash.load_signatures(filename, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True))
        total_loaded += len(loaded)

        chosen = [ss for ss in loaded if is_selected(ss)]
        for ss in chosen:
            flat_mh = ss.minhash.copy_and_clear()
            flat_mh.track_abundance = False
            flat_mh.add_many(ss.minhash.get_mins())
            ss.minhash = flat_mh

        outlist += chosen

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures(outlist, fp=fp)

    notify("loaded {} total that matched ksize & molecule type",
           total_loaded)
    notify("extracted {} signatures from {} file(s)", len(outlist),
           len(args.signatures))
def process_sig(sigfile):
    """Collect per-file counters for a signature file and the file it was built from.

    The source path is ``sigfile`` without its first 5 and last 4
    characters; the identifier is the fourth "/"-separated component of
    ``sigfile``.  Returns a dict with "id", "bp", one "unique_<k>" entry per
    key of the analysis result, and one entry per signature keyed by its
    ksize holding the MinHash length.  Returns None (after printing the
    error) if analyzing the source file fails.
    """
    source_path = sigfile[5:-4]
    ident = sigfile.split("/")[3]

    try:
        bps, _seqs, unique = analyze_file(source_path)
    except Exception as exc:
        print(f"Error: {exc}")
        return None

    counters = {"id": ident, "bp": bps}
    counters.update({f"unique_{k}": len(unique[k]) for k in unique})

    for sig in sourmash.load_signatures(sigfile):
        mh = sig.minhash
        counters[mh.ksize] = len(mh)

    return counters
def test_sbt_zipstorage(tmpdir):
    """An SBT saved to zip storage and reloaded gives the same search results."""
    # create tree, save to a zip, then load and search.
    zip_path = str(tmpdir.join("tree.sbt.zip"))
    tree_path = str(tmpdir.join("tree"))

    factory = GraphFactory(31, 1e5, 4)
    tree = SBT(factory)

    for sig_name in utils.SIG_FILES:
        sig = next(load_signatures(utils.get_test_data(sig_name)))
        leaf = SigLeaf(os.path.basename(sig_name), sig)
        tree.add_node(leaf)
        # the last leaf added ends up as the query
        to_search = leaf

    print('*' * 60)
    print("{}:".format(to_search.metadata))
    matches_before = tree.find(search_minhashes, to_search.data, 0.1)
    old_result = {str(match) for match in matches_before}
    print(*old_result, sep='\n')

    with ZipStorage(zip_path) as storage:
        tree.save(tree_path, storage=storage)

    with ZipStorage(zip_path) as storage:
        tree = SBT.load(tree_path,
                        leaf_loader=SigLeaf.load,
                        storage=storage)

        print('*' * 60)
        print("{}:".format(to_search.metadata))
        matches_after = tree.find(search_minhashes, to_search.data, 0.1)
        new_result = {str(match) for match in matches_after}
        print(*new_result, sep='\n')

        assert old_result == new_result
def extract(args):
    """
    extract signatures.

    Loads the signatures matching ksize / molecule type from
    ``args.signatures``, keeps those whose md5 and name contain
    ``args.md5`` / ``args.name`` (when not None), and writes them to
    ``args.output``.  Calls ``sys.exit(-1)`` if nothing matches.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    md5_wanted = args.md5 is not None
    name_wanted = args.name is not None

    outlist = []
    total_loaded = 0
    for filename in args.signatures:
        candidates = list(sourmash.load_signatures(filename,
                                                   ksize=args.ksize,
                                                   select_moltype=moltype,
                                                   do_raise=True))
        total_loaded += len(candidates)

        # select! md5 first, then name among the survivors
        for ss in candidates:
            if md5_wanted and args.md5 not in ss.md5sum():
                continue
            if name_wanted and args.name not in ss.name():
                continue
            outlist.append(ss)

    notify("loaded {} total that matched ksize & molecule type",
           total_loaded)

    if not outlist:
        error("no matching signatures!")
        sys.exit(-1)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures(outlist, fp=fp)

    notify("extracted {} signatures from {} file(s)", len(outlist),
           len(args.signatures))
def downsample(args):
    """
    downsample a scaled signature.

    Parses ``args`` as the ``sourmash signature downsample`` command line.
    Exactly one of ``--num`` / ``--scaled`` must be given.  Each loaded
    signature is downsampled to that value; a num MinHash is converted to a
    scaled one (or vice versa) when it holds enough hashes to do so.  The
    results are written to ``--output`` (stdout by default).

    Calls ``sys.exit(-1)`` if neither or both of --num/--scaled are given.
    Raises ValueError when a MinHash cannot be converted between num and
    scaled.
    """
    p = SourmashArgumentParser(prog='sourmash signature downsample')
    p.add_argument('signatures', nargs="+")
    p.add_argument('--scaled', type=int, default=0,
                   help='scaled value to downsample to')
    p.add_argument('--num', type=int, default=0,
                   help='num value to downsample to')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')

    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    if not args.num and not args.scaled:
        error('must specify either --num or --scaled value')
        sys.exit(-1)

    if args.num and args.scaled:
        error('cannot specify both --num and --scaled')
        sys.exit(-1)

    output_list = []
    total_loaded = 0
    for sigfile in args.signatures:
        siglist = sourmash.load_signatures(sigfile,
                                           ksize=args.ksize,
                                           select_moltype=moltype,
                                           do_raise=True)

        for sigobj in siglist:
            mh = sigobj.minhash

            notify('loading and downsampling signature from {}...', sigfile,
                   end='\r')
            total_loaded += 1
            if args.scaled:
                if mh.scaled:
                    mh_new = mh.downsample_scaled(args.scaled)
                else:
                    # try to turn a num into a scaled
                    # first check: can we?
                    max_hash = get_max_hash_for_scaled(args.scaled)
                    # default=None lets an empty MinHash reach the explicit
                    # error below instead of max()'s opaque "empty sequence"
                    largest = max(mh.get_mins(), default=None)
                    if largest is None or largest < max_hash:
                        raise ValueError("this num MinHash does not have enough hashes to convert it into a scaled MinHash.")

                    mh_new = copy.copy(mh)
                    _set_num_scaled(mh_new, 0, args.scaled)
            elif args.num:
                if mh.num:
                    mh_new = mh.downsample_n(args.num)
                else:
                    # try to turn a scaled into a num
                    # first check: can we?
                    if len(mh) < args.num:
                        # fill in the placeholder; it used to be emitted
                        # as a literal "{}"
                        raise ValueError("this scaled MinHash has only {} hashes".format(len(mh)))

                    mh_new = copy.copy(mh)
                    _set_num_scaled(mh_new, args.num, 0)

            sigobj.minhash = mh_new

            output_list.append(sigobj)

    sourmash.save_signatures(output_list, fp=args.output)

    notify("loaded and downsampled {} signatures", total_loaded)
def merge(args):
    """
    merge one or more signatures.

    Parses ``args`` as the ``sourmash signature merge`` command line.  Every
    loaded signature is merged into a cleared copy of the first signature's
    MinHash and the result is written to ``--output`` (stdout by default).
    With ``--flatten`` the merged MinHash is flattened first; without it,
    each signature is checked for abundance compatibility with the first.

    Calls ``sys.exit(-1)`` if no signatures were loaded.  Any error raised
    while merging is reported and re-raised.
    """
    p = SourmashArgumentParser(prog='sourmash signature merge')
    p.add_argument('signatures', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    p.add_argument('--flatten', action='store_true',
                   help='Remove abundances from all signatures.')

    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    first_sig = None
    mh = None
    total_loaded = 0

    # iterate over all the sigs from all the files.
    for sigfile in args.signatures:
        notify('loading signatures from {}...', sigfile, end='\r')
        this_n = 0
        for sigobj in sourmash.load_signatures(sigfile,
                                               ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            # first signature? initialize a bunch of stuff
            if first_sig is None:
                first_sig = sigobj
                mh = first_sig.minhash.copy_and_clear()

                # forcibly remove abundance?
                if mh.track_abundance and args.flatten:
                    _flatten(mh)

            try:
                if not args.flatten:
                    _check_abundance_compatibility(first_sig, sigobj)

                mh.merge(sigobj.minhash)
            except:
                # bare except is deliberate: the handler only reports which
                # signature failed and then re-raises unchanged
                error("ERROR when merging signature '{}' ({}) from file {}",
                      sigobj.name(), sigobj.md5sum()[:8], sigfile)
                raise

            this_n += 1
            total_loaded += 1

        if this_n:
            notify('loaded and merged {} signatures from {}...', this_n,
                   sigfile, end='\r')

    if not total_loaded:
        error("no signatures to merge!?")
        sys.exit(-1)

    merged_sigobj = sourmash.SourmashSignature(mh)

    sourmash.save_signatures([merged_sigobj], fp=args.output)

    notify('loaded and merged {} signatures', total_loaded)
def describe(args):
    """
    provide basic info on signatures

    Parses ``args`` as the ``sourmash signature describe`` command line,
    loads every signature from the given files (a file that fails to load
    is reported and skipped), prints a summary of each signature, and, when
    ``--csv`` is given, also writes one CSV row per signature.
    """
    p = SourmashArgumentParser(prog='sourmash signature describe')
    p.add_argument('signatures', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('--csv', type=argparse.FileType('wt'),
                   help='output information to a CSV file')
    args = p.parse_args(args)
    set_quiet(args.quiet)

    siglist = []
    for sigfile in args.signatures:
        try:
            this_siglist = sourmash.load_signatures(sigfile, quiet=True,
                                                    do_raise=True)
            for k in this_siglist:
                siglist.append((k, sigfile))
        except Exception as exc:
            # best effort: report the unreadable file and keep going
            error('\nError while reading signatures from {}:'.format(sigfile))
            error(str(exc))
            error('(continuing)')

        notify('loaded {} signatures from {}...', len(siglist), sigfile,
               end='\r')

    notify('loaded {} signatures total.', len(siglist))

    # write CSV?
    w = None
    if args.csv:
        w = csv.DictWriter(args.csv,
                           ['signature_file', 'md5', 'ksize', 'moltype',
                            'num', 'scaled', 'n_hashes', 'seed',
                            'with_abundance', 'name', 'filename', 'license'],
                           extrasaction='ignore')
        w.writeheader()

    # extract info, write as appropriate.
    for (sig, signature_file) in siglist:
        mh = sig.minhash

        # An explicit record instead of locals(): the CSV row and the report
        # no longer depend on which local names happen to exist, and the
        # builtin ``license`` is no longer shadowed.
        row = {
            'signature_file': signature_file,
            'md5': sig.md5sum(),
            'ksize': mh.ksize,
            'moltype': 'protein' if mh.is_protein else 'DNA',
            'num': mh.num,
            'scaled': mh.scaled,
            'n_hashes': len(mh),
            'seed': mh.seed,
            'with_abundance': 1 if mh.track_abundance else 0,
            'name': sig.name(),
            'filename': sig.d.get('filename', ''),
            'license': sig.d['license'],
        }

        if w:
            w.writerow(row)

        print_results('''\
---
signature filename: {signature_file}
signature: {name}
source file: {filename}
md5: {md5}
k={ksize} molecule={moltype} num={num} scaled={scaled} seed={seed} track_abundance={with_abundance}
size: {n_hashes}
signature license: {license}
''', **row)