def kmer_to_repbase_with_mongo(kmer_file): client = MongoClient() client = MongoClient('mongodb://localhost:27017/') db = client.Repbase index = db.MainIndex name_index = db.NameIndex name_hash = {} print "Iter over kmers" for d in sc_iter_simple_tab_file(kmer_file): (kmer, tf) = d kmer = kmer.lower() print kmer, tf data = index.find_one({'kmer':kmer}) if not data: rkmer = get_revcomp(kmer) data = index.find_one({'kmer':rkmer}) if data: matches = data["index"] for rid, tf in matches: if rid in name_hash: name = name_hash[rid] else: name = name_index.find_one({"kid":rid}) name = name["name"] name_hash[rid] = name print "\t", name, tf else: print "\t???"
def kmer_to_cegma_with_mongo(kmer_file, verbose=False): client = MongoClient() client = MongoClient('mongodb://localhost:27017/') db = client.Repbase index = db.CegmaMainIndex name_index = db.CegmaNameIndex repbase_index = db.MainIndex name_hash = {} print "Iter over kmers" match = { "repbase": 0, "cegma": 0, "repbase_cegma": 0, "other": 0, } match_distr = { "repbase": defaultdict(int), "cegma": defaultdict(int), "repbase_cegma": defaultdict(int), "other": defaultdict(int), } for d in sc_iter_simple_tab_file(kmer_file): (kmer, tf) = d repbase_hit = False cegma_hit = False # print tf, kmer, "\r", tf = int(tf) print match, tf, "\r", kmer = kmer.lower() # if verbose: # print tf, kmer, "\r", data = repbase_index.find_one({'kmer':kmer}) if not data: rkmer = get_revcomp(kmer) data = repbase_index.find_one({'kmer':rkmer}) if data: repbase_hit = True data = index.find_one({'kmer':kmer}) if not data: rkmer = get_revcomp(kmer) data = index.find_one({'kmer':rkmer}) if data: matches = data["index"] cegma_hit = True # print # print kmer, tf # for rid, tf in matches: # if rid in name_hash: # name = name_hash[rid] # else: # name = name_index.find_one({"kid":rid}) # name = name["name"].strip() # name_hash[rid] = name # print "\t", name, tf # else: # # print "\t???" # pass if repbase_hit and cegma_hit: match["repbase_cegma"] += 1 match_distr["repbase_cegma"][tf] += 1 continue elif repbase_hit: match["repbase"] += 1 match_distr["repbase"][tf] += 1 continue elif cegma_hit: match["cegma"] += 1 match_distr["cegma"][tf] += 1 continue else: match["other"] += 1 print print match_distr print match