def calculate_mods_frequency(mods_files, prob_cf):
    """Aggregate per-read modification calls into per-site statistics.

    Each line of every input file is split on tabs and wrapped in a
    ``ModRecord``. Records for which ``is_record_callable(prob_cf)`` is
    true are accumulated into one ``SiteStats`` object per ``_site_key``.

    Args:
        mods_files: iterable of file paths. Paths ending in ``.gz`` are
            read through ``gzip`` in text mode, all others as plain text.
        prob_cf: cutoff forwarded unchanged to
            ``ModRecord.is_record_callable``.

    Returns:
        dict mapping site key -> ``SiteStats``. For every site,
        ``_prob_0`` / ``_prob_1`` hold the summed record probabilities,
        ``_coverage`` the number of callable records, and ``_met`` /
        ``_unmet`` the number of records whose ``_called_label`` is 1 /
        anything else.

    Side effects:
        Prints a one-line summary of how many calls were used.
    """
    sitekey2stats = dict()
    count, used = 0, 0
    for mods_file in mods_files:
        # Pick the opener once per file; 'rt' is plain text mode for both.
        opener = gzip.open if mods_file.endswith(".gz") else open
        # The context manager closes the handle even when a malformed
        # line makes ModRecord/SiteStats raise part-way through the file.
        with opener(mods_file, 'rt') as infile:
            for line in infile:
                words = line.strip().split("\t")
                mod_record = ModRecord(words)
                if mod_record.is_record_callable(prob_cf):
                    site_key = mod_record._site_key
                    # The dict itself tracks which sites were seen, so no
                    # separate set of keys is needed.
                    if site_key not in sitekey2stats:
                        sitekey2stats[site_key] = SiteStats(
                            mod_record._strand,
                            mod_record._pos_in_strand,
                            mod_record._kmer)
                    site_stats = sitekey2stats[site_key]
                    site_stats._prob_0 += mod_record._prob_0
                    site_stats._prob_1 += mod_record._prob_1
                    site_stats._coverage += 1
                    if mod_record._called_label == 1:
                        site_stats._met += 1
                    else:
                        site_stats._unmet += 1
                    used += 1
                count += 1
    # Guard the ratio: with no input lines at all the original expression
    # divided by zero and the function never returned.
    used_pct = used / float(count) * 100 if count else 0.0
    print("{:.2f}% ({} of {}) calls used..".format(used_pct, used, count))
    return sitekey2stats
def calculate_mods_frequency(mods_files, prob_cf):
    """Build per-site statistics from plain-text modification call files.

    Every line is stripped, split on tabs and turned into a ``ModRecord``.
    Only records accepted by ``is_record_callable(prob_cf)`` contribute.

    Args:
        mods_files: iterable of paths, each opened as a plain text file.
        prob_cf: cutoff passed straight to
            ``ModRecord.is_record_callable``.

    Returns:
        dict keyed by ``_site_key`` whose values are ``SiteStats`` objects
        carrying summed ``_prob_0`` / ``_prob_1``, the ``_coverage`` count
        and the ``_met`` / ``_unmet`` tallies.

    Side effects:
        Prints how many of the parsed lines were used.
    """
    stats_by_site = {}
    n_total = 0
    n_used = 0
    for path in mods_files:
        with open(path, 'r') as handle:
            for raw_line in handle:
                record = ModRecord(raw_line.strip().split("\t"))
                # Every parsed line counts towards the total, used or not.
                n_total += 1
                if not record.is_record_callable(prob_cf):
                    continue
                n_used += 1

                key = record._site_key
                site = stats_by_site.get(key)
                if site is None:
                    # First callable record for this site: create its entry.
                    site = SiteStats(record._strand,
                                     record._pos_in_strand,
                                     record._kmer)
                    stats_by_site[key] = site

                site._prob_0 += record._prob_0
                site._prob_1 += record._prob_1
                site._coverage += 1
                if record._called_label == 1:
                    site._met += 1
                else:
                    site._unmet += 1
    print("{} of {} samples used..".format(n_used, n_total))
    return stats_by_site
def sample_sites(filename, is_methylated):
    """Load every call in a file as a ``CallRecord`` and shuffle them.

    Each line is right-stripped, split on whitespace and parsed into a
    ``ModRecord``, which is then converted into a ``CallRecord``. Despite
    the name, no subsampling happens here: all records are kept and only
    their order is randomised.

    Args:
        filename: path of the text file to read.
        is_methylated: value stored on every ``CallRecord`` built from
            this file (third constructor argument).

    Returns:
        list of ``CallRecord`` objects in random order.

    Side effects:
        Prints the number of records read; consumes state from the
        module-level ``random`` generator via ``random.shuffle``.
    """
    all_crs = list()
    # Context manager so the handle is released even if a malformed line
    # makes ModRecord/CallRecord raise while reading.
    with open(filename) as rf:
        for line in rf:
            mt_record = ModRecord(line.rstrip().split())
            all_crs.append(
                CallRecord(mt_record._site_key,
                           mt_record._called_label,
                           is_methylated,
                           mt_record._prob_0,
                           mt_record._prob_1))
    print('there are {} basemod candidates totally'.format(len(all_crs)))
    random.shuffle(all_crs)
    return all_crs