def run_mms_for(fastx: str, k: int, epsilon: float, args):
    """Build and check one single MM sketch.

    See run_sms_for documentation for a description of the input
    parameters and the general workflow.

    Output:
    A tsv row with the following columns:
    dataset name | epsilon | k | number of collisions | threshold |
    L1 sum of deltas | average delta | max delta | uncompressed size |
    compressed size | construction time | average query time

    Raises:
        ValueError: if epsilon is outside [0, 1].
    """
    if epsilon < 0 or epsilon > 1:
        raise ValueError("epsilon must be a number between 0 and 1")
    filename, _, _, _, kmcdb = kmc.getKMCPaths(k, fastx, args.c)
    pre_file = kmcdb + ".kmc_pre"
    suf_file = kmcdb + ".kmc_suf"
    # Run KMC only when its output databases are not already on disk.
    if not os.path.exists(pre_file) or not os.path.exists(suf_file):
        kmc.count(k, fastx, args.c, args.w, args.m, True)
    # Encode epsilon's decimal digits in the sketch name, e.g. 0.01 -> "01".
    sketch_name = "{}k{}e{}".format(filename, k, str(epsilon).split('.')[1])
    bin_name = sketch_name + ".mms"
    arch_name = sketch_name + ".gz"
    sketch_path = os.path.join(args.f, sketch_name)
    arch_path = os.path.join(args.f, arch_name)
    L1, dim, max_val, construction_time = run_fress_mms(
        kmcdb, sketch_path, epsilon, args.g)
    L1 = int(L1)
    dim = int(dim)
    max_val = int(max_val)
    construction_time = int(construction_time)
    ncolls, ntrue_colls, sod, avgd, maxd, avg_qtime = run_fress_mmschk(
        kmcdb, sketch_path, args.g)
    ncolls = int(ncolls)
    ntrue_colls = int(ntrue_colls)
    avgd = float(avgd)
    maxd = int(maxd)
    avg_qtime = int(avg_qtime)
    sys.stderr.write("number of cells = {}, max freq = {}\n".format(
        dim, max_val))
    # Theoretical uncompressed size: dim cells of ceil(log2(max_val))
    # bits each, expressed in bytes.
    theoretical_udim = round(dim * math.ceil(math.log(max_val, 2)) / 8)
    compress(args.f, [bin_name], arch_path)
    cdim = os.stat(arch_path).st_size
    os.remove(arch_path)
    return "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
        filename, epsilon, k, ntrue_colls, round(L1 * epsilon), sod, avgd,
        maxd, theoretical_udim, cdim, construction_time, avg_qtime)
def run_bbhash_for(fastx: str, k: int, _, args):
    """Build and check a BBHash MPHF over the k-mer set.

    See run_sms_for documentation about the input parameters (the third
    positional argument is accepted for signature compatibility and
    ignored).

    Output:
    A tsv row with the following columns:
    dataset name | k-value | mphf uncompressed size |
    total uncompressed size | total compressed size |
    construction time | average query time
    """
    filename, _, _, _, kmcdb = kmc.getKMCPaths(k, fastx, args.c)
    # Run KMC only when either of its two output files is missing.
    kmc_outputs = (kmcdb + ".kmc_pre", kmcdb + ".kmc_suf")
    if not all(os.path.exists(path) for path in kmc_outputs):
        kmc.count(k, fastx, args.c, args.w, args.m, True)
    sketch_name = "{}k{}".format(filename, k)
    mphf_name = sketch_name + ".bbh"
    payload_name = sketch_name + ".pld"
    arch_name = sketch_name + "_BBH.gz"
    sketch_path = os.path.join(args.f, sketch_name)
    mphf_path = os.path.join(args.f, mphf_name)
    arch_path = os.path.join(args.f, arch_name)
    raw_stats = run_fress_bbhash(kmcdb, sketch_path)
    max_val, L0, construction_time, avg_qtime = (int(v) for v in raw_stats)
    mphf_size = os.stat(mphf_path).st_size
    # L0 counters of ceil(log2(max_val)) bits each (in bytes),
    # plus the size of the MPHF itself.
    theoretical_udim = round(
        L0 * math.ceil(math.log(max_val, 2)) / 8) + mphf_size
    compress(args.f, [mphf_name, payload_name], arch_path)
    cdim = os.stat(arch_path).st_size
    os.remove(arch_path)
    return "{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
        filename, k, mphf_size, theoretical_udim, cdim,
        construction_time, avg_qtime)
def run_merged_sms_for(fastx: str, k: int, epsilon: float, args):
    """Build and check one single SM sketch with column merging.

    Similar to run_sms_for, but this function merges the first args.g
    columns of the histogram and assigns the weighted average of the
    removed elements to all k-mers involved in the operation.

    Output:
    A tsv row with the following columns:
    dataset name | epsilon | k-value | R | B | number of collisions |
    threshold | L1 sum of deltas | average delta | true average delta |
    max delta | uncompressed size | compressed size | merged columns |
    merged frequency | unmerged error

    Raises:
        ValueError: if epsilon is outside [0, 1].
    """
    import time
    if epsilon < 0 or epsilon > 1:
        raise ValueError("epsilon must be a number between 0 and 1")
    filename, _, _, _, kmcdb = kmc.getKMCPaths(k, fastx, args.c)
    pre_file = kmcdb + ".kmc_pre"
    suf_file = kmcdb + ".kmc_suf"
    # Run KMC only when its output databases are not already on disk.
    if not os.path.exists(pre_file) or not os.path.exists(suf_file):
        kmc.count(k, fastx, args.c, args.w, args.m, True)
    # Encode epsilon's decimal digits in the sketch name, e.g. 0.01 -> "01".
    sketch_name = "{}k{}e{}".format(filename, k, str(epsilon).split('.')[1])
    histo_name = sketch_name + ".shist.txt"
    cmb_name = sketch_name + ".cmb.txt"
    bin_name = sketch_name + ".bin"
    arch_name = sketch_name + ".gz"
    sketch_path = os.path.join(args.f, sketch_name)
    histo_path = os.path.join(args.f, histo_name)
    cmb_path = os.path.join(args.f, cmb_name)
    arch_path = os.path.join(args.f, arch_name)
    # Timestamp-based name to keep temporary histograms of concurrent
    # runs apart -- NOTE(review): not collision-proof, consider tempfile.
    tmp_name = str(time.time()) + ".hist.txt"
    run_fress_histogram(kmcdb, tmp_name)
    r, b, _, _, freq, unerr = opt_dim_main(
        tmp_name, epsilon, None, None, args.g)
    os.remove(tmp_name)
    L1, dim = run_fress_sense(kmcdb, sketch_path, epsilon, r, b)
    L1 = int(L1)
    dim = int(dim)
    ncolls, ntrue_colls, sod, avgd, maxd = run_fress_check(
        kmcdb, sketch_path, args.g, freq)
    ncolls = int(ncolls)
    ntrue_colls = int(ntrue_colls)
    sod = float(sod)
    avgd = float(avgd)
    maxd = float(maxd)
    # FIX: guard against ZeroDivisionError when no true collision occurred.
    tavg = sod / ntrue_colls if ntrue_colls else 0.0
    # Number of frequency combinations stored alongside the sketch
    # (one per line of the .cmb.txt file).
    with open(cmb_path, "r") as hc:
        ncombinations = sum(1 for _ in hc)
    theoretical_udim = round(
        dim * math.ceil(math.log(ncombinations, 2)) / 8
    ) + os.stat(histo_path).st_size + os.stat(cmb_path).st_size
    compress(args.f, [histo_name, cmb_name, bin_name], arch_path)
    cdim = os.stat(arch_path).st_size
    os.remove(arch_path)
    return "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
        filename, epsilon, k, r, b, ntrue_colls, round(L1 * epsilon), sod,
        avgd, tavg, maxd, theoretical_udim, cdim, args.g, freq, unerr)
def run_sms_for(fastx: str, k: int, epsilon: float, args):
    """Build and check one single SM sketch.

    Input:
    - one fasta/fastq to be sketched
    - the k-mer length
    - the approximation factor epsilon
    - an args namespace carrying (as used below) the kmc output
      directory (args.c), the sketch output directory (args.f), a
      temporary directory (args.w) and the maximum allowed memory
      (args.m)

    Computations:
    - count k-mers with kmc (only if the database is missing)
    - sketch the resulting kmc database with fress sense
    - run fress check to get (sum of errors, average error, max error)
    - compute the theoretical uncompressed size of the sketch
    - compute the compressed size of the sketch

    Output:
    A tsv row with the following columns:
    dataset name | epsilon | k | number of collisions | threshold |
    L1 sum of deltas | average delta | max delta | uncompressed size |
    compressed size | construction time | average query time

    ATTENTION: average delta is not (L1 sum of deltas / number of
    collisions) but an average computed over collisions counted as
    intersections of size different than one (instead of the wrong
    frequency).

    Raises:
        ValueError: if epsilon is outside [0, 1].
    """
    if epsilon < 0 or epsilon > 1:
        raise ValueError("epsilon must be a number between 0 and 1")
    filename, _, _, _, kmcdb = kmc.getKMCPaths(k, fastx, args.c)
    # Run KMC only when either of its two output files is missing.
    if not (os.path.exists(kmcdb + ".kmc_pre")
            and os.path.exists(kmcdb + ".kmc_suf")):
        kmc.count(k, fastx, args.c, args.w, args.m, True)
    # Encode epsilon's decimal digits in the sketch name, e.g. 0.01 -> "01".
    sketch_name = "{}k{}e{}".format(filename, k, str(epsilon).split('.')[1])
    histo_name = sketch_name + ".shist.txt"
    cmb_name = sketch_name + ".cmb.txt"
    bin_name = sketch_name + ".bin"
    arch_name = sketch_name + ".gz"
    sketch_path = os.path.join(args.f, sketch_name)
    histo_path = os.path.join(args.f, histo_name)
    cmb_path = os.path.join(args.f, cmb_name)
    arch_path = os.path.join(args.f, arch_name)
    L1, dim, construction_time = run_fress_sense(kmcdb, sketch_path, epsilon)
    L1, dim, construction_time = int(L1), int(dim), int(construction_time)
    ncolls, ntrue_colls, sod, avgd, maxd, avg_qtime = run_fress_check(
        kmcdb, sketch_path)
    ncolls = int(ncolls)
    ntrue_colls = int(ntrue_colls)
    avgd = float(avgd)
    maxd = int(maxd)
    avg_qtime = int(avg_qtime)
    # Number of frequency combinations stored alongside the sketch
    # (one per line of the .cmb.txt file).
    with open(cmb_path, "r") as combos:
        ncombinations = sum(1 for _ in combos)
    theoretical_udim = round(
        dim * math.ceil(math.log(ncombinations, 2)) / 8
    ) + os.stat(histo_path).st_size + os.stat(cmb_path).st_size
    compress(args.f, [histo_name, cmb_name, bin_name], arch_path)
    cdim = os.stat(arch_path).st_size
    os.remove(arch_path)
    return "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
        filename, epsilon, k, ntrue_colls, round(L1 * epsilon), sod, avgd,
        maxd, theoretical_udim, cdim, construction_time, avg_qtime)