def get_count_project(project, list_kmers):
    """Fill every k-mer's per-sample counts from the samples' Jellyfish files.

    The first sample taken from ``project.it_samples`` also initialises the
    count storage of each k-mer through ``kmer.init_count(project.num_samples)``;
    every following sample only adds its count at the next sample index.

    :param project: object exposing ``it_samples`` (an iterator over samples
        that have ``name`` and ``jf_file`` attributes) and ``num_samples``.
    :param list_kmers: re-iterable collection of k-mer objects providing
        ``init_count``, ``add_count`` and ``get_count_jf``; it is traversed
        once per sample.
    :raises StopIteration: if ``project.it_samples`` yields no sample at all.
    :return: None
    """
    import subprocess

    def _open_sample(sample):
        # Log the sample, warm the file cache and open its Jellyfish file.
        print(time.strftime('%X') + " " + sample.name)
        # Trick: reading the whole file once loads it in cache before the
        # lookups start. An argument list is used so the path is never
        # interpreted by a shell.
        try:
            subprocess.call(["wc", "-l", sample.jf_file])
        except OSError:
            # Cache warming is best effort only (e.g. `wc` is not installed).
            pass
        return jellyfish.QueryMerFile(sample.jf_file)

    it_sample = project.it_samples

    # First sample: allocate the per-sample slots, then store column 0.
    sample = next(it_sample)
    cpt_sample = 0
    jf = _open_sample(sample)
    for kmer in list_kmers:
        kmer.init_count(project.num_samples)
        kmer.add_count(kmer.get_count_jf(jf), cpt_sample)
    del jf

    # Remaining samples: one more column each.
    for sample in it_sample:
        cpt_sample += 1
        jf = _open_sample(sample)
        for kmer in list_kmers:
            kmer.add_count(kmer.get_count_jf(jf), cpt_sample)
        # Drop the query file before opening the next one.
        del jf
        gc.collect()

    print(time.strftime('%X') + ": Get count done!")
def export_sparse_features(sigmers, sample, indir, outfile):
    """Write one sparse feature line per sample to *outfile*.

    Each output line starts with the sample label, followed by
    ``index:count`` pairs for every k-mer of *sigmers* whose canonical form
    has a non-zero count in the sample's Jellyfish file
    (``indir + name + "_count.jf"``). ``index`` is the 1-based position of
    the k-mer in the iteration order of *sigmers*.

    :param sigmers: re-iterable collection of k-mer strings; traversed once
        per sample.
    :param sample: sequence of ``(name, label)`` pairs; its length is used
        for the progress message emitted every 50 samples.
    :param indir: prefix prepended verbatim to the sample name (no path
        separator is inserted).
    :param outfile: path of the text file that is created or overwritten.
    :return: None
    """
    # The context manager closes the output file even if a lookup fails.
    with open(outfile, 'w') as outfh:
        for i, (s, l) in enumerate(sample, 1):
            if i % 50 == 0:
                echo("\t\t ... Completed %f" % (float(i) / float(len(sample))))
            qf = jellyfish.QueryMerFile(indir + s + "_count.jf")
            outfh.write("%s " % (l))
            for j, mer in enumerate(sigmers, 1):
                jmer = jellyfish.MerDNA(mer)
                jmer.canonicalize()
                # Query once and reuse the value for the test and the output.
                count = qf[jmer]
                if count > 0:
                    outfh.write("%d:%d " % (j, count))
            outfh.write("\n")
def test_query(self):
    """Check that every (mer, count) pair of ``self.mf`` matches the query file."""
    qf = jellyfish.QueryMerFile(os.path.join(data, "sequence.jf"))
    # all() stops at the first count that differs from the queried value.
    counts_match = all(count == qf[mer] for mer, count in self.mf)
    self.assertTrue(counts_match)
def __init__(self, filename, cutoff=0.30, n_cutoff=500, canonical=True):
    """Open the Jellyfish file *filename* and remember the settings.

    :param filename: path passed to ``jellyfish.QueryMerFile``.
    :param cutoff: stored unchanged as ``self.cutoff``.
    :param n_cutoff: stored unchanged as ``self.n_cutoff``.
    :param canonical: stored unchanged as ``self.canonical``.
    """
    query_file = jellyfish.QueryMerFile(filename)
    self.jf = query_file
    # NOTE(review): k is read right after the file has been opened --
    # presumably opening the file sets the global k-mer length; keep this
    # order unless confirmed otherwise.
    self.k = jellyfish.MerDNA.k()
    self.filename = filename
    self.cutoff, self.n_cutoff = cutoff, n_cutoff
    self.canonical = canonical
def set_jf_file(self, path):
    """
    Open *path* both as the filtered query file and as the read file.

    :param path: path handed to ``jellyfish.QueryMerFile`` and
        ``jellyfish.ReadMerFile``
    :return: None
    """
    query_file = jellyfish.QueryMerFile(path)
    self.qf_filtered = query_file
    read_file = jellyfish.ReadMerFile(path)
    self.rf = read_file
def write_kmer(project, dict_seqs, prefix, args, path_dir):
    """Export per-sample k-mer counts of every sequence as text tables.

    Two kinds of files are written into *path_dir* (created when missing):

    * ``str(prefix) + "_count.tab"``: one tab-separated row per k-mer with
      the sequence name, the k-mer's ``seq``, ``entropy`` and ``switch``
      attributes and one count column per sample. The header row is written
      only once, built while processing the first sequence.
    * ``str(name) + "_count_ggplot.csv"`` for each sequence: one row per
      (sample, k-mer) with position, k-mer, sample name, sample group,
      normalised log count and raw count.

    :param project: object whose ``samples`` have ``name``, ``group``,
        ``jf_file`` and ``num_kmer`` attributes.
    :param dict_seqs: mapping of sequence name to a sliceable sequence whose
        slices expose ``seq``, ``entropy`` and ``switch``.
    :param prefix: prefix of the tab file name.
    :param args: object providing ``LKMER``, ``LOG_F`` and ``LOG_C``.
    :param path_dir: output directory.
    """
    if not os.path.exists(path_dir):
        os.makedirs(path_dir)

    header_written = False
    tab_path = os.path.join(path_dir, str(prefix) + "_count.tab")
    with open(tab_path, 'w') as w_tab:
        for name, seq in dict_seqs.items():
            # NOTE(review): the range stops before the window that starts at
            # len(seq) - args.LKMER, so that last window is never produced --
            # confirm whether this is intended.
            kmers = [
                seq[start:start + args.LKMER]
                for start in range(len(seq) - args.LKMER)
            ]
            header = "ID_ASSEMBLY\tKMER\tENTROPY\tSWITCH"
            rows = [
                str(name) + "\t" + kmer.seq + "\t" + str(kmer.entropy) +
                "\t" + str(kmer.switch)
                for kmer in kmers
            ]

            ggplot_path = os.path.join(path_dir,
                                       str(name) + "_count_ggplot.csv")
            with open(ggplot_path, 'w') as w_ggplot:
                w_ggplot.write("POS,KMER,SAMPLE,GROUP,LOG_COUNT,COUNT\n")
                for sample in project.samples:
                    header += "\t" + sample.name
                    jf = jellyfish.QueryMerFile(sample.jf_file)
                    for pos, kmer in enumerate(kmers, 1):
                        count = get_count(kmer, jf)
                        numerator = np.log10(count * args.LOG_F + args.LOG_C)
                        denominator = np.log10(sample.num_kmer + args.LOG_C)
                        log_count = numerator / denominator
                        fields = (str(pos), kmer.seq, sample.name,
                                  sample.group, str(log_count), str(count))
                        w_ggplot.write(",".join(fields) + "\n")
                        # Positions are 1-based, rows are 0-based.
                        rows[pos - 1] += "\t" + str(count)

            # Only the first sequence contributes the header row.
            if not header_written:
                w_tab.write(header + "\n")
                header_written = True
            for row in rows:
                w_tab.write(row + "\n")
def prepare_jellyfish(indir, label_file, read_info, k):
    """Open the Jellyfish file of every labelled sample, split by label.

    Samples whose label is the string ``"-1"`` go to the negative group, all
    others to the positive group. Each group is paired with a list holding
    the normalisation factor of every sample, in the same order.

    :param indir: directory containing ``name + "_count.jf"`` files.
    :param label_file: passed to ``parse_labels``, which yields
        ``(name, label)`` pairs.
    :param read_info: passed with *k* to ``load_read_info``, whose result is
        indexed by sample name.
    :param k: forwarded to ``load_read_info``.
    :return: tuple ``(positive, negative, positive_factor, negative_factor)``.
    """
    positive, negative = [], []
    positive_factor, negative_factor = [], []
    norm_factors = load_read_info(read_info, k)
    for sample_name, label in parse_labels(label_file):
        jf_path = os.path.join(indir, sample_name + "_count.jf")
        if l_is_negative(label) if False else label == "-1":
            databases, factors = negative, negative_factor
        else:
            databases, factors = positive, positive_factor
        databases.append(jellyfish.QueryMerFile(jf_path))
        factors.append(norm_factors[sample_name])
    return (positive, negative, positive_factor, negative_factor)
def __init__(self, name, path, json_dump=None):
    """
    Initialise a StrainObject, either from a count file or from a JSON dump.

    :param name: strain name (ignored when *json_dump* is given)
    :param path: path of the Jellyfish count file (ignored when *json_dump*
        is given)
    :param json_dump: optional dict of previously serialised attributes; when
        given, no file is opened and the attributes are copied from it
    :return: None
    """
    if json_dump is not None:
        # Restore the state from the serialised dictionary.
        self.name = json_dump["strain_name"]
        self.path = json_dump["path_count_file"]
        # NOTE(review): do_not_filter is read from "filtered_kmer_set" and
        # kmer_set from "kmer_archive" -- confirm these mappings are intended.
        self.do_not_filter = json_dump["filtered_kmer_set"]
        histogram = json_dump["kmer_count_histogram"]
        self.histo = {int(k): int(v) for k, v in histogram.items()}
        self.coverage = json_dump["coverage"]
        self.kmer_cutoff = json_dump["kmer_cutoff"]
        self.has_suitable_coverage = json_dump["has_suitable_coverage"]
        self.kmer_set = set(json_dump["kmer_archive"])
        self.kmer_archive = set(json_dump["kmer_archive"])
        self.unique_kmers = json_dump["unique_kmers"]
        self.distinct_kmers = json_dump["distinct_kmers"]
        self.max_count = json_dump["max_count"]
        self.warnings = json_dump["warnings"]
        # NOTE(review): this branch sets ard_result while the other branch
        # sets ard, and it leaves jellyfish_path, rapid_mode,
        # filtered_jf_file, total_kmers, qf, qf_filtered and rf undefined.
        self.ard_result = json_dump["ard_results"]
        return

    # Fresh object: the statements below keep their original order because
    # the helper methods are called after name/path/flags have been set.
    self.jellyfish_path = "jellyfish"
    self.name = name
    self.path = path
    self.rapid_mode = False
    self.do_not_filter = False
    self.histo = self.get_histo()
    self.coverage = self.get_estimate_coverage()
    self.__check_resources()
    self.kmer_cutoff = None
    self.has_suitable_coverage = False
    self.kmer_set = set()
    self.kmer_archive = set()
    # Per-object file name with an 8-letter random upper-case suffix.
    random_suffix = ''.join(
        random.choice(string.ascii_uppercase) for _ in range(8))
    self.filtered_jf_file = "/tmp/tmp_filtered_{0}_{1}.jf".format(
        self.name, random_suffix)
    self.ard = {}
    self.unique_kmers = None
    self.distinct_kmers = None
    self.total_kmers = None
    self.max_count = None
    self.qf = jellyfish.QueryMerFile(self.path)
    self.qf_filtered = None
    self.rf = None
    self.warnings = []
def __filter_jf_file(self):
    """
    Select or build the filtered count file, then open it for querying and
    reading and rebuild the k-mer set.

    When ``self.do_not_filter`` is set, the raw count file is used as is.
    Otherwise ``jellyfish merge -L int(self.kmer_cutoff) + 1`` is run on the
    raw file together with the packaged dummy file, writing to
    ``self.filtered_jf_file``.

    :raises subprocess.CalledProcessError: if the merge command fails
    :return: None
    """
    if self.do_not_filter:
        jf_path = self.path
        self.filtered_jf_file = jf_path
    else:
        dummy_jf_file = pkg_resources.resource_filename(
            'straintypemer', 'data/dummy_A.jf')
        merge_command = [
            "jellyfish", "merge",
            "-L", str(int(self.kmer_cutoff) + 1),
            "-o", self.filtered_jf_file,
            self.path, dummy_jf_file,
        ]
        subprocess.check_call(merge_command)
        jf_path = self.filtered_jf_file

    # Both cases end the same way: open the chosen file, then build the set.
    self.qf_filtered = jellyfish.QueryMerFile(jf_path)
    self.rf = jellyfish.ReadMerFile(jf_path)
    self.__create_set()
def kmercount(k, fname):
    """Return the smoothed k-mer count vector of a Jellyfish file.

    Entry ``i`` holds ``1 + 2 * count`` of the i-th k-mer produced by
    ``allkmers(k)``: counts are stored doubled so that the 0.5 pseudo count
    used for smoothing fits in an integer vector.

    :param k: k-mer length; the vector has ``4 ** k`` entries.
    :param fname: path of the Jellyfish file.
    :raises RuntimeError: propagated unchanged when the file cannot be opened.
    :return: 1-D ``numpy.uint16`` array.
    """
    qf = jellyfish.QueryMerFile(fname)
    # Start from the pseudo count (1 == 0.5 in doubled units).
    # NOTE(review): the vector is uint16, so values above 65535 do not fit --
    # confirm the expected count range.
    counts = np.ones(1 << (2 * k), dtype=np.uint16)
    for idx, letters in enumerate(allkmers(k)):
        counts[idx] += 2 * qf[jellyfish.MerDNA(''.join(letters))]
    return counts
def get_kmer_freq_v(jfdb='../data/GRCh38.p2.ch21/GRCh38.p2.ch21.5010000.jf',
                    k=5):
    """Return the relative frequency of every k-mer in a Jellyfish file.

    The k-mers are enumerated with ``itertools.product`` over A, C, G, T, and
    the counts are divided by their sum.

    :param jfdb: path of the Jellyfish file.
    :param k: k-mer length.
    :raises RuntimeError: propagated unchanged when the file cannot be opened.
    :return: ``numpy.float64`` array of shape ``(1, 4 ** k)``.
    """
    qf = jellyfish.QueryMerFile(jfdb)
    counts = [
        qf[jellyfish.MerDNA(''.join(letters))]
        for letters in itertools.product(('A', 'C', 'G', 'T'), repeat=k)
    ]
    # The query file is never closed explicitly here.
    freq_vector = np.array([counts], dtype=np.float64)
    freq_vector /= np.sum(freq_vector)
    return freq_vector
def test07(jfdb='../data/GRCh38.p2.ch21/GRCh38.p2.ch21.5010000.jf', k=5):
    """Print the relative frequency vector of all k-mers in a Jellyfish file.

    The k-mers are enumerated with ``itertools.product`` over A, C, G, T;
    their counts are collected into a ``(1, 4 ** k)`` float array, divided by
    their sum and printed.

    :param jfdb: path of the Jellyfish file.
    :param k: k-mer length.
    :raises RuntimeError: re-raised after printing a message when the file
        cannot be opened.
    :return: None
    """
    try:
        qf = jellyfish.QueryMerFile(jfdb)
    except RuntimeError:
        # print(...) with one argument behaves the same on Python 2 and 3.
        print('jellyfish runtime error')
        raise

    alph = ('A', 'C', 'G', 'T')
    freq_l = [
        qf[jellyfish.MerDNA(''.join(km))]
        for km in itertools.product(alph, repeat=k)
    ]
    a = np.array([freq_l], dtype=np.float64)
    a /= np.sum(a)
    print(a)
    return
def kmercount(k, pos, chr = 21, fname_head = '/data/yt/GRCh38.p2.ch21/GRCh38.p2'):
    """Return the smoothed k-mer count column vector of one Jellyfish file.

    The file name is built as ``{head}.ch{chr}.{pos}.fasta.{k}.jf``. Row ``i``
    holds ``1 + 2 * count`` of the i-th k-mer produced by ``allkmers(k)``:
    counts are stored doubled so that the 0.5 pseudo count used for smoothing
    fits in an integer vector.

    :param k: k-mer length; the result has ``4 ** k`` rows.
    :param pos: position component of the file name.
    :param chr: chromosome component of the file name.
    :param fname_head: leading part of the file name.
    :raises RuntimeError: propagated unchanged when the file cannot be opened.
    :return: ``numpy.uint16`` array of shape ``(4 ** k, 1)``.
    """
    fname = '{head}.ch{chr}.{pos}.fasta.{k}.jf'.format(
        head=fname_head, chr=chr, pos=pos, k=k)
    qf = jellyfish.QueryMerFile(fname)
    # Start from the pseudo count (1 == 0.5 in doubled units).
    # NOTE(review): the vector is uint16, so values above 65535 do not fit --
    # confirm the expected count range.
    counts = np.ones((1 << (2 * k), 1), dtype=np.uint16)
    for row, letters in enumerate(allkmers(k)):
        counts[row, 0] += 2 * qf[jellyfish.MerDNA(''.join(letters))]
    return counts
#! /usr/bin/env python
# Print "<mer> <count>" for every k-mer given after the Jellyfish file name:
#   argv[1]  -- path of the Jellyfish file to query
#   argv[2:] -- k-mer strings to look up
import jellyfish
import sys

qf = jellyfish.QueryMerFile(sys.argv[1])
for mer_text in sys.argv[2:]:
    mer_count = qf[jellyfish.MerDNA(mer_text)]
    print("%s %d" % (mer_text, mer_count))
# NOTE(review): this fragment starts in the middle of a script; the
# sys.exit() below presumably closes a preceding check that is not visible
# here -- confirm its placement before relying on this layout.
sys.exit()
# argv[2] is the k-mer size; it selects the input files and names the logs.
k = int(sys.argv[2])
# Both log files are created next to the inputs, using argv[1] as a prefix.
cosineFile = open("%scosinek%d.log" % (sys.argv[1], k), 'w')
jaccardFile = open("%sjaccardk%d.log" % (sys.argv[1], k), 'w')
#build our list of files / genomes to compare
# Only regular files in argv[1] whose name ends with 'k<k>.jf' are kept; the
# directory prefix is concatenated without inserting a path separator.
files = [
    sys.argv[1] + f for f in listdir(sys.argv[1])
    if isfile(join(sys.argv[1], f)) and f.endswith('k%d.jf' % (k))
]
#print files
# Visit every unordered pair of files exactly once.
for idx, jfi_1 in enumerate(files[:-1]):
    for jfi_2 in files[idx + 1:]:
        # Each file is opened twice: once for iteration, once for lookups.
        jfi1_RFile = jellyfish.ReadMerFile(jfi_1)
        jfi1_QFile = jellyfish.QueryMerFile(jfi_1)
        jfi2_RFile = jellyfish.ReadMerFile(jfi_2)
        jfi2_QFile = jellyfish.QueryMerFile(jfi_2)
        # t1 / t2 collect the paired counts of file 1 and file 2.
        t1 = []
        t2 = []
        # Number of k-mers of file 1 that have a zero count in file 2.
        notUnion = 0
        for mer, count1 in jfi1_RFile:
            #print count1
            count2 = jfi2_QFile[mer]
            if count2 == 0:
                notUnion += 1
            t1.append(int(count1))
            t2.append(int(count2))
        # Second pass: k-mers of file 2 with a zero count in file 1.
        # NOTE(review): the visible source ends inside this loop; whatever
        # follows is outside this view.
        for mer, count2 in jfi2_RFile:
            if jfi1_QFile[mer] == 0:
                t1.append(0)
def do(output_dir, ref_fpath, contigs_fpaths, logger):
    """Run the unique 101-mer analysis of assemblies against a reference.

    Steps, in order:

    1. Compile and import the Jellyfish Python binding and set the k-mer
       length to ``KMERS_LEN``. On any failure a warning is logged and the
       function returns.
    2. For every assembly accepted by ``check_jf_successful_check`` whose
       ``label + '.stat'`` file has at least four lines, fill the report
       from that file and drop the assembly from further processing.
    3. Count k-mers of the reference and of each remaining assembly with the
       Jellyfish binary.
    4. Report, per assembly, the percentage of reference k-mers it contains
       (``KMER_COMPLETENESS``).
    5. Map a sample of the k-mers found in all assemblies to the reference
       sequence they occur in, then classify every assembly sequence with at
       least ``MIN_MARKERS`` such markers as mapping to one or to several
       reference sequences, and report the length percentages.

    :param output_dir: directory for the ``.jf`` and ``.stat`` files.
    :param ref_fpath: path of the reference FASTA file.
    :param contigs_fpaths: list of assembly FASTA paths.
    :param logger: logger providing ``print_timestamp``, ``main_info``,
        ``info`` and ``warning``.
    :return: None
    """
    logger.print_timestamp()
    logger.main_info('Running analysis based on unique 101-mers...')
    addsitedir(jellyfish_python_dirpath)
    try:
        compile_jellyfish(logger)
        import jellyfish
        try:
            import imp
            imp.reload(jellyfish)
        except:
            # Fall back to the builtin reload when imp cannot be used.
            reload(jellyfish)
        jellyfish.MerDNA.k(KMERS_LEN)
    except:
        # Deliberately broad: any problem with the binding only disables
        # this analysis.
        logger.warning('Failed unique 101-mers analysis.')
        return

    # Reuse results of earlier runs where possible.
    checked_assemblies = []
    for contigs_fpath in contigs_fpaths:
        label = qutils.label_from_fpath_for_fname(contigs_fpath)
        if check_jf_successful_check(output_dir, contigs_fpath,
                                     contigs_fpaths, ref_fpath):
            jf_stats_fpath = join(output_dir, label + '.stat')
            # The context manager closes the stats file right after reading.
            with open(jf_stats_fpath) as stats_file:
                stats_content = stats_file.read().split('\n')
            if len(stats_content) < 4:
                continue
            logger.info(' Using existing results for ' + label + '... ')
            report = reporting.get(contigs_fpath)
            report.add_field(
                reporting.Fields.KMER_COMPLETENESS,
                '%.2f' % float(stats_content[0].strip().split(': ')[-1]))
            report.add_field(
                reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                '%.2f' % float(stats_content[1].strip().split(': ')[-1]))
            report.add_field(
                reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                '%.2f' % float(stats_content[2].strip().split(': ')[-1]))
            report.add_field(
                reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                '%.2f' % float(stats_content[3].strip().split(': ')[-1]))
            checked_assemblies.append(contigs_fpath)

    contigs_fpaths = [
        fpath for fpath in contigs_fpaths if fpath not in checked_assemblies
    ]
    if len(contigs_fpaths) == 0:
        logger.info('Done.')
        return

    logger.info('Running Jellyfish on reference...')
    jf_out_fpath = join(output_dir, basename(ref_fpath) + '.jf')
    qutils.call_subprocess([
        jellyfish_bin_fpath, 'count', '-m', '101', '-U', '1', '-s',
        str(getsize(ref_fpath)), '-o', jf_out_fpath, '-t',
        str(qconfig.max_threads), ref_fpath
    ])
    ref_kmers = jellyfish.ReadMerFile(jf_out_fpath)
    # NOTE(review): the count file is removed right after it is opened --
    # presumably the open handle stays usable after the removal; confirm on
    # the target platforms.
    os.remove(jf_out_fpath)

    logger.info('Running Jellyfish on assemblies...')
    contigs_kmers = []
    for contigs_fpath in contigs_fpaths:
        jf_out_fpath = join(output_dir, basename(contigs_fpath) + '.jf')
        qutils.call_subprocess([
            jellyfish_bin_fpath, 'count', '-m', '101', '-U', '1', '-s',
            str(getsize(contigs_fpath)), '-o', jf_out_fpath, '-t',
            str(qconfig.max_threads), contigs_fpath
        ])
        contigs_kmers.append(jellyfish.QueryMerFile(jf_out_fpath))
        os.remove(jf_out_fpath)

    logger.info('Analyzing completeness and accuracy of assemblies...')
    unique_kmers = 0
    matched_kmers = defaultdict(int)
    shared_kmers = set()
    kmer_i = 0
    for kmer, count in ref_kmers:
        unique_kmers += 1
        matches = 0
        # contigs_kmers holds one query file per entry of contigs_fpaths.
        for idx, contig_kmers in enumerate(contigs_kmers):
            if contig_kmers[kmer]:
                matched_kmers[idx] += 1
                matches += 1
        if matches == len(contigs_fpaths):
            # Keep every 100th k-mer that is present in all assemblies.
            if kmer_i % 100 == 0:
                shared_kmers.add(str(kmer))
            kmer_i += 1

    for idx, contigs_fpath in enumerate(contigs_fpaths):
        report = reporting.get(contigs_fpath)
        # NOTE(review): raises ZeroDivisionError when the reference yields
        # no k-mers -- confirm whether that can happen.
        completeness = matched_kmers[idx] * 100.0 / unique_kmers
        report.add_field(reporting.Fields.KMER_COMPLETENESS,
                         '%.2f' % completeness)

    # Remember, for every sampled shared k-mer, the reference sequence it
    # was last seen in.
    shared_kmers_by_chrom = dict()
    ref_contigs = dict((name, seq) for name, seq in read_fasta(ref_fpath))
    for name, seq in ref_contigs.items():
        seq_kmers = jellyfish.string_mers(seq)
        for kmer in seq_kmers:
            if str(kmer) in shared_kmers:
                shared_kmers_by_chrom[str(kmer)] = name

    for contigs_fpath in contigs_fpaths:
        report = reporting.get(contigs_fpath)
        len_map_to_one_chrom = 0
        len_map_to_multi_chrom = 0
        total_len = 0

        for name, seq in read_fasta(contigs_fpath):
            total_len += len(seq)
            seq_kmers = jellyfish.string_mers(seq)
            chrom_markers = []
            for kmer in seq_kmers:
                kmer_str = str(kmer)
                if kmer_str in shared_kmers_by_chrom:
                    chrom = shared_kmers_by_chrom[kmer_str]
                    chrom_markers.append(chrom)
            # Sequences with too few markers count as mapped to none.
            if len(chrom_markers) < MIN_MARKERS:
                continue
            if len(set(chrom_markers)) == 1:
                len_map_to_one_chrom += len(seq)
            else:
                len_map_to_multi_chrom += len(seq)

        len_map_to_none_chrom = (total_len - len_map_to_one_chrom -
                                 len_map_to_multi_chrom)
        # NOTE(review): raises ZeroDivisionError for an assembly without any
        # sequence data -- confirm whether that can happen.
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_ONE_CHROM,
                         '%.2f' % (len_map_to_one_chrom * 100.0 / total_len))
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_MULTI_CHROM,
                         '%.2f' % (len_map_to_multi_chrom * 100.0 / total_len))
        report.add_field(reporting.Fields.KMER_SCAFFOLDS_NONE_CHROM,
                         '%.2f' % (len_map_to_none_chrom * 100.0 / total_len))

        create_jf_stats_file(
            output_dir, contigs_fpath, contigs_fpaths, ref_fpath,
            report.get_field(reporting.Fields.KMER_COMPLETENESS),
            len_map_to_one_chrom, len_map_to_multi_chrom,
            len_map_to_none_chrom)

    logger.info('Done.')
def genquery(genomeFile, jellyFile, totedits, medindel, insprob, delprob, queryfreq, querycount, outputFile):
    """Write a random sequence of edits and k-mer count queries.

    Edits are written to *outputFile* as ``I pos seq`` (insertion),
    ``D start end`` (deletion) or ``S pos seq`` (SNP) lines. After every
    *queryfreq* edits, *querycount* query lines ``Q kmer flag count`` are
    written, where ``count`` is looked up in the Jellyfish file and ``flag``
    is ``I`` for a k-mer taken from the set of k-mers affected by edits, or
    ``N`` for a k-mer cut from a random genome position.

    :param genomeFile: open file whose first line is the genome; it is read
        and closed here.
    :param jellyFile: path of the Jellyfish file used for the counts.
    :param totedits: total number of edits to generate.
    :param medindel: indel size parameter forwarded to ``random_insertion``
        and ``random_deletion``.
    :param insprob: probability of an insertion.
    :param delprob: probability of a deletion.
    :param queryfreq: number of edits between two batches of queries.
    :param querycount: number of queries per batch.
    :param outputFile: open, writable file; it is closed before returning,
        also when an error interrupts the generation.
    :raises ValueError: if ``delprob + insprob`` exceeds 1.0.
    """
    if delprob + insprob > 1.0:
        # A bare string is not a valid exception object; raise a real one
        # carrying the same message.
        raise ValueError("Error, delprob = {} and insprob = {}. "
                         "The sum is {} > 1.0".format(
                             delprob, insprob, delprob + insprob))

    genome = genomeFile.readline()
    genomeFile.close()
    qf = jellyfish.QueryMerFile(jellyFile)

    # Drop the last character of the line that was read.
    numbases = len(genome) - 1
    genome = genome[0:numbases]

    snpProb = 1.0 - (insprob + delprob)
    SNPrange = int(snpProb * totedits)
    insrange = int(insprob * totedits)
    delrange = int(delprob * totedits)
    editTypes = (['S'] * SNPrange) +\
                (['D'] * delrange) +\
                (['I'] * insrange)
    random.shuffle(editTypes)

    qcount = 0
    effectedkmers = set()
    try:
        for val in editTypes:
            qcount += 1
            # NOTE(review): the slices below start at p - K + 1, which is
            # negative for p < K - 1 and then counts from the end of the
            # genome -- confirm how positions near the start should behave.
            if val == 'I':
                p, s, seq = random_insertion(numbases, medindel)
                numbases += s
                outputFile.write('I %d %s\n' % (p, seq))
                add_kmers_in_seq(effectedkmers, seq)
                add_kmers_in_seq(effectedkmers, genome[p - K + 1:p + K])
            elif val == 'D':
                # Deletions do not add anything to the affected k-mer set.
                p, s = random_deletion(numbases, medindel)
                numbases -= s
                outputFile.write('D %d %d\n' % (p, p + s - 1))
            else:
                p, seq = random_snp(numbases)
                outputFile.write('S %d %s\n' % (p, seq))
                add_kmers_in_seq(effectedkmers, genome[p - K + 1:p + K - 1])

            # if it's time to output some queries
            if qcount == queryfreq:
                qcount = 0
                for _ in range(querycount):
                    dart = random.random()
                    if dart <= EDIT_QUERY_PROB:
                        # random.sample needs a sequence on current Python
                        # versions, so the set is materialised first.
                        kmer = random.sample(tuple(effectedkmers), 1)[0]
                        editflag = 'I'
                    else:
                        p = random.randrange(K * 2, numbases - K * 2)
                        kmer = genome[p:p + K].upper()
                        editflag = 'N'
                    kcount = int(qf[jellyfish.MerDNA(kmer)])
                    outputFile.write('Q %s %s %d\n' % (kmer, editflag, kcount))
    finally:
        outputFile.close()
def import_counts(self):
    """Open the Jellyfish table at ``self.path`` and keep it as ``self.qf``.

    A message is printed before and after the table is opened.
    """
    print('importing jellyfish table', self.path)
    table = jellyfish.QueryMerFile(self.path)
    self.qf = table
    print('table loaded')