def get_report_kplets(id2cdd, limit_to=500, load_locations=None):

    _db = DbClass()
    _db.cmd = """SET group_concat_max_len=1500000"""
    _db.execute()

    _db.cmd = """select ap.*, count(*) as cnt, sum(w.weight) as wgt, group_concat(awf.name) as an
                 from bacteria_3plets ap
                 inner join bacteria_3plets_win10 apw on ap.id = apw.kplet_id
                 inner join bacteria_win10_files awf on apw.file_id = awf.id
                 inner join sources s on awf.source_id=s.id
                 inner join weights w on w.genome_id=s.genome_id
                 group by ap.id
                 having count(distinct s.genome_id)>1
                 order by wgt desc
                 limit 0, %d""" % limit_to

    out_list = []
    for row in _db.retrieve():
        id = row[0]
        kplet_codes = [id2cdd[int(_id)] for _id in row[1:4]]
        if len(set(kplet_codes)) != 3:
            continue
        count = row[4]
        weight = row[5]
        files = row[6].split(',')
        tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files)
        out_list.append(tmp_kplet)

    _path = neighborhoods_path()
    if load_locations:
        for kplet in out_list:
            kplet.load_locations(_path)

    return out_list

def org2src_src2files_map(files):

    _db = DbClass()
    _db.cmd = """select awf.name, s.name, g.name
                 from bacteria_win10_files awf
                 inner join sources s on awf.source_id=s.id
                 inner join genomes g on s.genome_id = g.id
                 where awf.name in ('%s')""" % "','".join(files)

    _org2src = {}
    _src2files = {}

    for row in _db.retrieve():
        [_file, _src, _org] = row
        if _org in _org2src:
            _org2src[_org].update([_src])
        else:
            _org2src[_org] = set([_src])
        if _src in _src2files:
            _src2files[_src].update([_file])
        else:
            _src2files[_src] = set([_file])

    return _org2src, _src2files

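# Illustrative sketch (not used by the module): the same set-accumulation pattern
# as org2src_src2files_map, applied to hypothetical in-memory rows instead of a
# DbClass query result. All file/source/organism names below are made up.
def _demo_org2src_src2files():
    rows = [
        ("file_A.pty", "src_1", "Escherichia_coli"),
        ("file_B.pty", "src_1", "Escherichia_coli"),
        ("file_C.pty", "src_2", "Bacillus_subtilis"),
    ]
    _org2src = {}
    _src2files = {}
    for _file, _src, _org in rows:
        _org2src.setdefault(_org, set()).add(_src)
        _src2files.setdefault(_src, set()).add(_file)
    # _org2src -> {'Escherichia_coli': set(['src_1']), 'Bacillus_subtilis': set(['src_2'])}
    # _src2files -> {'src_1': set(['file_A.pty', 'file_B.pty']), 'src_2': set(['file_C.pty'])}
    return _org2src, _src2files
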
def get_report_kplets(id2cdd, limit_to=500): _db = DbClass() _db.cmd = """SET group_concat_max_len=1500000""" _db.execute() _db.cmd = """select cp.*, count(distinct cwf.id) as cnt, group_concat(cwf.name) as files from crispr_2plets cp inner join crispr_2plets_wgs cpw on cp.id = cpw.kplet_id inner join crispr_wgs_files cwf on cpw.file_id = cwf.id group by cp.id having count(distinct cwf.id)>1 order by cnt desc limit 0, %d""" % limit_to out_list = [] for row in _db.retrieve(): id = row[0] kplet_codes = ([id2cdd[int(_id)] for _id in row[1:3]]) if len(set(kplet_codes)) != 2: continue count = row[3] files = row[4].split(',') tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files) out_list.append(tmp_kplet) return out_list
def get_report_kplets(id2cdd, limit_to=500): _db = DbClass() _db.cmd = """SET group_concat_max_len=15000000""" _db.execute() _db.cmd = """select cp.*, count(distinct cwf.id) as cnt, group_concat(cwf.name) as files from crispr_5plets cp inner join crispr_5plets_wgs cpw on cp.id = cpw.kplet_id inner join crispr_wgs_files cwf on cpw.file_id = cwf.id group by cp.id having count(distinct cwf.id)>1 order by cnt desc limit 0, %d""" % limit_to out_list = [] for row in _db.retrieve(): id = row[0] kplet_codes = ([id2cdd[int(_id)] for _id in row[1:6]]) if len(set(kplet_codes)) != 5: continue count = row[6] files = row[7].split(',') tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files) out_list.append(tmp_kplet) return out_list
def get_report_kplets(limit_to=300, load_locations=None):

    _db = DbClass()
    _db.cmd = """select apc.*, s1.cnt, s1.wgt, s1.an
                 from (
                       select ap.id, count(*) as cnt, sum(w.weight) as wgt, group_concat(awf.name) as an
                       from archea_4plets ap
                       inner join archea_4plets_win10 apw on ap.id = apw.kplet_id
                       inner join archea_win10_files awf on apw.file_id = awf.id
                       inner join sources s on awf.source_id=s.id
                       inner join weights w on w.genome_id=s.genome_id
                       group by ap.id
                       having count(distinct s.genome_id)>1
                      ) s1
                 inner join archea_4plets_codes apc on s1.id=apc.id
                 order by s1.wgt desc
                 limit 0,%d""" % limit_to

    out_list = []
    for row in _db.retrieve():
        id = row[0]
        kplet_codes = row[1:5]
        if len(set(kplet_codes)) != 4:
            continue
        count = row[5]
        files = row[7].split(',')
        tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files)
        out_list.append(tmp_kplet)

    _path = neighborhoods_path()
    if load_locations:
        for kplet in out_list:
            kplet.load_locations(_path)

    return out_list

def get_report_kplets(limit_to=300, load_locations=None):

    _db = DbClass()
    _db.cmd = """select apc.*, s1.cnt, s1.wgt, s1.an
                 from (
                       select ap.id, count(*) as cnt, sum(w.weight) as wgt, group_concat(awf.name) as an
                       from archea_3plets ap
                       inner join archea_3plets_win10 apw on ap.id = apw.kplet_id
                       inner join archea_win10_files awf on apw.file_id = awf.id
                       inner join sources s on awf.source_id=s.id
                       inner join weights w on w.genome_id=s.genome_id
                       group by ap.id
                       having count(distinct s.genome_id)>1
                      ) s1
                 inner join archea_3plets_codes apc on s1.id=apc.id
                 order by s1.wgt desc
                 limit 0,%d""" % limit_to

    out_list = []
    for row in _db.retrieve():
        id = row[0]
        kplet_codes = row[1:4]
        if len(set(kplet_codes)) != 3:
            continue
        count = row[4]
        files = row[6].split(',')
        tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files)
        out_list.append(tmp_kplet)

    _path = neighborhoods_path()
    if load_locations:
        for kplet in out_list:
            kplet.load_locations(_path)

    return out_list

def get_code_kplet(kplet_id):

    _db = DbClass()
    _db.cmd = """select kplet_1, kplet_2, kplet_3, kplet_4, kplet_5
                 from bacteria_5plets_codes
                 where id = %s""" % kplet_id

    return _db.retrieve()[0]

def get_report_kplet(kplet_id, id2cdd, load_locations=None):

    _db = DbClass()
    _db.cmd = """SET group_concat_max_len=1500000;"""
    _db.execute()

    _db.cmd = """select ap.*, count(*) as cnt, sum(w.weight) as wgt, group_concat(awf.name) as an
                 from bacteria_5plets ap
                 inner join bacteria_5plets_win10 apw on ap.id = apw.kplet_id
                 inner join bacteria_win10_files awf on apw.file_id = awf.id
                 inner join sources s on awf.source_id=s.id
                 inner join weights w on w.genome_id=s.genome_id
                 where ap.id=%d
                 group by ap.id""" % kplet_id

    out_list = []
    for row in _db.retrieve():
        id = row[0]
        kplet_codes = [id2cdd[int(_id)] for _id in row[1:6]]
        if len(set(kplet_codes)) != 5:
            continue
        count = row[6]
        files = row[8].split(',')
        tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files)
        out_list.append(tmp_kplet)

    _path = neighborhoods_path()
    if load_locations:
        for kplet in out_list:
            kplet.load_locations(_path)

    return out_list[0]

def get_code_kplet(kplet_id): _sql_cmd = """SELECT cp1.code, cp2.code, cp3.code, cp4.code FROM PatternQuest.archea_4plets ap inner join cdd_profiles cp1 on cp1.id = ap.kplet_1 inner join cdd_profiles cp2 on cp2.id = ap.kplet_2 inner join cdd_profiles cp3 on cp3.id = ap.kplet_3 inner join cdd_profiles cp4 on cp4.id = ap.kplet_4 where ap.id = %d""" % kplet_id _db = DbClass() _db.cmd = _sql_cmd return _db.retrieve()
def get_code_kplet(kplet_id, id2cdd=None):

    _db = DbClass()

    if not id2cdd:
        _db.cmd = """select cp1.code, cp2.code
                     from bacteria_2plets bp
                     inner join cdd_profiles cp1 on cp1.id = bp.kplet_1
                     inner join cdd_profiles cp2 on cp2.id = bp.kplet_2
                     where bp.id = %d""" % kplet_id
        retval = _db.retrieve()[0]
    else:
        _db.cmd = """select kplet_1, kplet_2
                     from bacteria_2plets
                     where id = %d""" % kplet_id
        retval = _db.retrieve()[0]
        retval = set([id2cdd[id] for id in retval])

    return retval

def archea_kplet_ids2files(id_list):

    _db = DbClass()
    _db.cmd = """select distinct awf.name
                 from archea_5plets ap
                 inner join archea_5plets_win10 apw on ap.id = apw.kplet_id
                 inner join archea_win10_files awf on apw.file_id = awf.id
                 where ap.id in (%s)""" % ",".join(id_list)

    return [l[0] for l in _db.retrieve()]

def map_profile2id(profile_list):

    _db = DbClass()

    if profile_list == 'all':
        _db.cmd = """select code, id from cdd_profiles"""
    else:
        _db.cmd = """select code, id from cdd_profiles
                     where code in ('%s')""" % "','".join(profile_list)

    return {row[0]: int(row[1]) for row in _db.retrieve()}

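# Usage sketch (illustrative; assumes a configured DbClass connection, and the CDD
# codes below are made-up examples that would have to exist in cdd_profiles):
#
#     profile2id = map_profile2id(['COG0001', 'pfam00005'])   # subset lookup
#     all_profile2id = map_profile2id('all')                  # full code -> id map
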
def get_multiple_kplets():

    _db = DbClass()
    _db.cmd = """select ap.id, count(*) cnt, group_concat(convert(apw.file_id, char(15))) as file_ids
                 from archea_5plets ap
                 inner join archea_5plets_win10 apw on ap.id = apw.kplet_id
                 group by ap.id
                 having count(*)>1
                 order by cnt desc"""

    return _db.retrieve()

def get_multiple_kplets():

    _db = DbClass()
    _db.cmd = "SET group_concat_max_len = 100000;"
    _db.execute()

    _db.cmd = """select ap.id, count(*) cnt, group_concat(convert(apw.file_id, char(15))) as file_ids
                 from bacteria_3plets ap
                 inner join bacteria_3plets_win10 apw on ap.id = apw.kplet_id
                 group by ap.id
                 having count(*)>1
                 order by cnt desc"""

    return _db.retrieve()

def map_baited_file2id():

    _db = DbClass()
    _db.cmd = """select name, id from prok1402_baited_files"""

    file2id = {}
    id2file = {}
    for row in _db.retrieve():
        file2id[row[0]] = row[1]
        id2file[row[1]] = row[0]

    return file2id, id2file

def get_kplet_id(profiles):

    # `t` is expected to be the sibling module that exposes map_profile2id.
    profile2id = t.map_profile2id(profiles)

    _db = DbClass()
    _db.cmd = """select id from archea_4plets
                 where kplet_1=%d and kplet_2=%d and kplet_3=%d and kplet_4=%d""" \
              % (profile2id[profiles[0]],
                 profile2id[profiles[1]],
                 profile2id[profiles[2]],
                 profile2id[profiles[3]])

    rows = _db.retrieve()
    if rows:
        return rows[0][0]
    return None

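# Usage sketch (illustrative; the profile codes are hypothetical, and the query
# matches the stored kplet_1..kplet_4 column order exactly, not any permutation):
#
#     kplet_id = get_kplet_id(['COG0001', 'COG0002', 'COG0003', 'COG0004'])
#     if kplet_id is None:
#         pass  # no archea_4plets row stores this ordered combination
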
def file2src_src2org_map(files):

    _db = DbClass()
    _db.cmd = """select awf.name, s.name, g.name
                 from bacteria_win10_files awf
                 inner join sources s on awf.source_id=s.id
                 inner join genomes g on s.genome_id = g.id
                 where awf.name in ('%s')""" % "','".join(files)

    _src2org = {}
    _file2src = {}

    for row in _db.retrieve():
        [_file, _src, _org] = row
        _src2org[_src] = _org
        _file2src[_file] = _src

    return _file2src, _src2org

def get_archaea_kplets():

    _db = DbClass()
    _db.cmd = """select apc.*, s1.cnt, s1.wgt
                 from (
                       select ap.id, count(*) as cnt, sum(w.weight) as wgt
                       from archea_5plets ap
                       inner join archea_5plets_win10 apw on ap.id = apw.kplet_id
                       inner join archea_win10_files awf on apw.file_id = awf.id
                       inner join sources s on awf.source_id=s.id
                       inner join weights w on w.genome_id=s.genome_id
                       group by ap.id
                      ) s1
                 inner join archea_5plets_codes apc on s1.id=apc.id
                 order by s1.wgt desc"""

    return _db.retrieve()

def extract_baited_duplet_aggregates(duplet_type="adj"):

    # The adjacent and non-adjacent variants differ only in the duplet and link tables.
    if duplet_type == "adj":
        duplet_table = "prok1402_baited_adj_duplet"
        link_table = "prok1402_baited_adj_duplet_file"
    else:
        duplet_table = "prok1402_baited_duplet"
        link_table = "prok1402_baited_duplet_file"

    _db = DbClass()
    _db.cmd = """select d.id,
                        p1.code as profile_1,
                        p2.code as profile_2,
                        count(*) as loci,
                        sum(w.weight) as loci_weight,
                        group_concat(CONVERT(df.file_id, CHAR(20)) separator ',') as files
                 from %s d
                 inner join %s df on d.id = df.kplet_id
                 inner join cdd_profiles p1 on d.kplet_1 = p1.id
                 inner join cdd_profiles p2 on d.kplet_2 = p2.id
                 inner join prok1402_baited_files bf on bf.id = df.file_id
                 inner join sources s on bf.source_id = s.id
                 inner join genomes g on g.id = s.genome_id
                 inner join weights w on w.genome_id = g.id
                 group by d.id
                 order by loci desc;""" % (duplet_table, link_table)

    return _db.retrieve()

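# Usage sketch (illustrative; assumes a configured DbClass connection):
#
#     for row in extract_baited_duplet_aggregates(duplet_type="adj"):
#         duplet_id, profile_1, profile_2, loci, loci_weight, files = row
#         file_ids = files.split(',')  # group_concat'ed prok1402_baited_files ids
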
def map_file_name2id():

    _db = DbClass()
    _db.cmd = """select name, id from bacteria_win10_files"""

    return {row[0]: row[1] for row in _db.retrieve()}

def get_code_kplet(kplet_id): _sql_cmd = """select kplet_1, kplet_2, kplet_3, kplet_4, kplet_5 from archea_5plets_codes where id = %s""" % kplet_id _db = DbClass() _db.cmd = _sql_cmd return _db.retrieve()
import sys

from lib.db import DbClass

_db = DbClass()

# for table_name in ['crispr_4plets_wgs', 'crispr_3plets_wgs', 'crispr_2plets_wgs']:
for table_name in ['crispr_5plets_wgs']:

    print "Starting for table:", table_name
    print
    print "Loading the duplicates"

    _db.cmd = """SELECT count(*) as cnt, kplet_id, file_id
                 FROM PatternQuest.%s
                 group by kplet_id, file_id
                 having count(*)>1
                 order by cnt desc""" % table_name

    table_rows = [row for row in _db.retrieve()]
    print "Number of duplicate entries:", len(table_rows)

    fname = "duplicates_%s.txt" % table_name
    with open(fname, 'w') as f:
        for row in table_rows:
            f.write("%d\t%d\t%d\n" % row)

    print "Duplicates saved in file:", fname
    continue

    # Everything below is skipped because of the `continue` above and is kept only
    # as the unfinished removal step.
    sys.exit()

    print "Starting to remove duplicates"
    total_cnt = 0
    chunk_size = 10000
    chunks = [table_rows[i:i + chunk_size]
              for i in range(0, len(table_rows), chunk_size)]

def map_source2id():

    _db = DbClass()
    _db.cmd = """SELECT id, name FROM PatternQuest.sources;"""

    return {row[1]: row[0] for row in _db.retrieve()}

def map_file_id2name():

    _db = DbClass()
    _db.cmd = """select id, name from bacteria_win10_files"""

    return {str(l[0]): l[1] for l in _db.retrieve()}
