Пример #1
0
def get_report_kplets(id2cdd, limit_to=500, load_locations=None):

    _db = DbClass()
    _db.cmd = """SET group_concat_max_len=1500000"""
    _db.execute()

    _db.cmd = """select ap.* ,count(*) as cnt, sum(w.weight) as wgt, group_concat(awf.name) as an
                 from bacteria_3plets ap
                 inner join bacteria_3plets_win10 apw on ap.id = apw.kplet_id
                 inner join bacteria_win10_files awf on apw.file_id = awf.id
                 inner join sources s on awf.source_id=s.id
                 inner join weights w on w.genome_id=s.genome_id
                 group by ap.id
                 having count(distinct s.genome_id)>1
                 order by wgt desc
                 limit 0, %d""" % limit_to

    out_list = []

    for row in _db.retrieve():
        id = row[0]
        kplet_codes = ([id2cdd[int(_id)] for _id in row[1:4]])
        if len(set(kplet_codes)) != 3:
            continue
        count = row[4]
        weight = row[5]
        files = row[6].split(',')
        tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files)
        out_list.append(tmp_kplet)

    _path = neighborhoods_path()
    if load_locations:
        [kplet.load_locations(_path) for kplet in out_list]

    return out_list
Пример #2
0
def org2src_src2files_map(files):

    _db = DbClass()

    _db.cmd = """select awf.name, s.name, g.name
                  from bacteria_win10_files awf
                  inner join sources s on awf.source_id=s.id
                  inner join genomes g on s.genome_id = g.id
                  where awf.name in ('%s')""" % "','".join(files)

    _org2src = {}
    _src2files = {}

    for row in _db.retrieve():
        parts = row
        [_file, _src, _org] = parts
        if _org in _org2src:
            _org2src[_org].update([_src])
        else:
            _org2src[_org] = set([_src])

        if _src in _src2files:
            _src2files[_src].update([_file])
        else:
            _src2files[_src] = set([_file])

    return _org2src, _src2files
Пример #3
0
def get_report_kplets(id2cdd, limit_to=500):

    _db = DbClass()
    _db.cmd = """SET group_concat_max_len=1500000"""
    _db.execute()

    _db.cmd = """select cp.*, count(distinct cwf.id) as cnt, group_concat(cwf.name) as files
                from crispr_2plets cp
                inner join crispr_2plets_wgs cpw on cp.id = cpw.kplet_id
                inner join crispr_wgs_files cwf on cpw.file_id = cwf.id
                group by cp.id
                having count(distinct cwf.id)>1
                order by cnt desc
                limit 0, %d""" % limit_to

    out_list = []

    for row in _db.retrieve():
        id = row[0]
        kplet_codes = ([id2cdd[int(_id)] for _id in row[1:3]])
        if len(set(kplet_codes)) != 2:
            continue
        count = row[3]
        files = row[4].split(',')
        tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files)
        out_list.append(tmp_kplet)

    return out_list
Пример #4
0
def get_report_kplets(id2cdd, limit_to=500, load_locations=None):

    _db = DbClass()
    _db.cmd = """SET group_concat_max_len=1500000"""
    _db.execute()

    _db.cmd = """select ap.* ,count(*) as cnt, sum(w.weight) as wgt, group_concat(awf.name) as an
                 from bacteria_3plets ap
                 inner join bacteria_3plets_win10 apw on ap.id = apw.kplet_id
                 inner join bacteria_win10_files awf on apw.file_id = awf.id
                 inner join sources s on awf.source_id=s.id
                 inner join weights w on w.genome_id=s.genome_id
                 group by ap.id
                 having count(distinct s.genome_id)>1
                 order by wgt desc
                 limit 0, %d""" % limit_to

    out_list = []

    for row in _db.retrieve():
        id = row[0]
        kplet_codes = ([id2cdd[int(_id)] for _id in row[1:4]])
        if len(set(kplet_codes)) != 3:
            continue
        count = row[4]
        weight = row[5]
        files = row[6].split(',')
        tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files)
        out_list.append(tmp_kplet)

    _path = neighborhoods_path()
    if load_locations:
        [kplet.load_locations(_path) for kplet in out_list]

    return out_list
Пример #5
0
def get_report_kplets(id2cdd, limit_to=500):

    _db = DbClass()
    _db.cmd = """SET group_concat_max_len=15000000"""
    _db.execute()

    _db.cmd = """select cp.*, count(distinct cwf.id) as cnt, group_concat(cwf.name) as files
                from crispr_5plets cp
                inner join crispr_5plets_wgs cpw on cp.id = cpw.kplet_id
                inner join crispr_wgs_files cwf on cpw.file_id = cwf.id
                group by cp.id
                having count(distinct cwf.id)>1
                order by cnt desc
                limit 0, %d""" % limit_to

    out_list = []

    for row in _db.retrieve():
        id = row[0]
        kplet_codes = ([id2cdd[int(_id)] for _id in row[1:6]])
        if len(set(kplet_codes)) != 5:
            continue
        count = row[6]
        files = row[7].split(',')
        tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files)
        out_list.append(tmp_kplet)

    return out_list
Пример #6
0
def get_report_kplets(limit_to=300, load_locations=None):

    sql_cmd = """select apc.*, s1.cnt, s1.wgt, s1.an
                from (
                        select ap.id ,count(*) as cnt, sum(w.weight) as wgt, group_concat(awf.name) as an
                        from archea_4plets ap
                        inner join archea_4plets_win10 apw on ap.id = apw.kplet_id
                        inner join archea_win10_files awf on apw.file_id = awf.id
                        inner join sources s on awf.source_id=s.id
                        inner join weights w on w.genome_id=s.genome_id
                        group by ap.id
                        having count(distinct s.genome_id)>1 ) s1
                inner join archea_4plets_codes apc on s1.id=apc.id
                order by s1.wgt desc
                limit 0,%d""" % limit_to

    _db = DbClass()
    _db.cmd = sql_cmd
    out_list = []

    for row in _db.retrieve():
        id = row[0]
        kplet_codes = (row[1:5])
        if len(set(kplet_codes)) != 4:
            continue
        count = row[5]
        files = row[7].split(',')
        tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files)
        out_list.append(tmp_kplet)

    _path = neighborhoods_path()
    if load_locations:
        [kplet.load_locations(_path) for kplet in out_list]

    return out_list
Пример #7
0
def get_report_kplets(limit_to=300, load_locations=None):

    _db = DbClass()

    _db.cmd = """select apc.*, s1.cnt, s1.wgt, s1.an
                from (
                        select ap.id ,count(*) as cnt, sum(w.weight) as wgt, group_concat(awf.name) as an
                        from archea_3plets ap
                        inner join archea_3plets_win10 apw on ap.id = apw.kplet_id
                        inner join archea_win10_files awf on apw.file_id = awf.id
                        inner join sources s on awf.source_id=s.id
                        inner join weights w on w.genome_id=s.genome_id
                        group by ap.id
                        having count(distinct s.genome_id)>1 ) s1
                inner join archea_3plets_codes apc on s1.id=apc.id
                order by s1.wgt desc
                limit 0,%d""" % limit_to

    out_list = []

    for row in _db.retrieve():
        id = row[0]
        kplet_codes = (row[1:4])
        if len(set(kplet_codes)) != 3:
            continue
        count = row[4]
        files = row[6].split(',')
        tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files)
        out_list.append(tmp_kplet)

    _path = neighborhoods_path()
    if load_locations:
        [kplet.load_locations(_path) for kplet in out_list]

    return out_list
Пример #8
0
def get_code_kplet(kplet_id):

    _db = DbClass()

    _db.cmd = """select kplet_1, kplet_2, kplet_3, kplet_4, kplet_5 from bacteria_5plets_codes where id = %s""" % kplet_id

    return _db.retrieve()[0]
Пример #9
0
def get_report_kplet(kplet_id, id2cdd, load_locations=None):

    _db = DbClass()
    _db.cmd = """SET group_concat_max_len=1500000;"""
    _db.execute()
    _db.cmd = """select ap.* ,count(*) as cnt, sum(w.weight) as wgt, group_concat(awf.name) as an
                from bacteria_5plets ap
                inner join bacteria_5plets_win10 apw on ap.id = apw.kplet_id
                inner join bacteria_win10_files awf on apw.file_id = awf.id
                inner join sources s on awf.source_id=s.id
                inner join weights w on w.genome_id=s.genome_id
                where ap.id=%d
                group by ap.id""" % kplet_id

    out_list = []

    for row in _db.retrieve():
        id = row[0]
        kplet_codes = [id2cdd[int(_id)] for _id in row[1:6]]

        if len(set(kplet_codes)) != 5:
            continue
        count = row[6]
        files = row[8].split(',')
        tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files)
        out_list.append(tmp_kplet)

    _path = neighborhoods_path()
    if load_locations:
        [kplet.load_locations(_path) for kplet in out_list]

    return out_list[0]
Пример #10
0
def get_code_kplet(kplet_id):
    _sql_cmd = """SELECT cp1.code, cp2.code, cp3.code, cp4.code
                    FROM PatternQuest.archea_4plets ap
                    inner join cdd_profiles cp1 on cp1.id = ap.kplet_1
                    inner join cdd_profiles cp2 on cp2.id = ap.kplet_2
                    inner join cdd_profiles cp3 on cp3.id = ap.kplet_3
                    inner join cdd_profiles cp4 on cp4.id = ap.kplet_4
                    where ap.id = %d""" % kplet_id
    _db = DbClass()
    _db.cmd = _sql_cmd
    return _db.retrieve()
Пример #11
0
def get_code_kplet(kplet_id, id2cdd=None):

    _db = DbClass()

    if not id2cdd:
        _db.cmd = """select cp1.code, cp2.code
                from bacteria_2plets bp
                inner join cdd_profiles cp1 on cp1.id = bp.kplet_1
                inner join cdd_profiles cp2 on cp2.id = bp.kplet_2
                where bp.id = %d""" % kplet_id
        retval = _db.retrieve()[0]

    else:

        _db.cmd = """select kplet_1, kplet_2
                     from bacteria_2plets where id = %d""" % kplet_id

        retval = _db.retrieve()[0]
        retval = set([id2cdd[id] for id in retval])

    return retval
Пример #12
0
def archea_kplet_ids2files(id_list):

    _sql_cmd = """select distinct awf.name
                  from archea_5plets ap
                  inner join archea_5plets_win10 apw on ap.id = apw.kplet_id
                  inner join archea_win10_files awf on apw.file_id = awf.id
                  where ap.id in (%s)""" % " , ".join(id_list)

    _db = DbClass()
    _db.cmd = _sql_cmd

    return [l[0] for l in _db.retrieve()]
Пример #13
0
def map_profile2id(profile_list):

    _db = DbClass()
    if profile_list == 'all':
        sql_cmd = """select code, id from cdd_profiles"""
    else:
        sql_cmd = """select code, id from cdd_profiles where code in ('%s')"""
        sql_cmd = sql_cmd % "','".join(profile_list)

    _db.cmd = sql_cmd

    return {row[0]: int(row[1]) for row in _db.retrieve()}
Пример #14
0
def get_multiple_kplets():

    _sql_cmd = """select  ap.id, count(*) cnt, group_concat(convert(apw.file_id, char(15))) as file_ids
                  from archea_5plets ap
                  inner join archea_5plets_win10 apw on ap.id = apw.kplet_id
                  group by ap.id
                  having count(*)>1
                  order by cnt desc"""
    _db = DbClass()
    _db.cmd = _sql_cmd

    return _db.retrieve()
Пример #15
0
def get_multiple_kplets():

    _db = DbClass()
    _db.cmd = "SET group_concat_max_len = 100000;"
    _db.execute()
    _db.cmd = """ select  ap.id, count(*) cnt, group_concat(convert(apw.file_id, char(15))) as file_ids
                  from bacteria_3plets ap
                  inner join bacteria_3plets_win10 apw on ap.id = apw.kplet_id
                  group by ap.id
                  having count(*)>1
                  order by cnt desc"""

    return _db.retrieve()
Пример #16
0
def get_multiple_kplets():

    _db = DbClass()
    _db.cmd = "SET group_concat_max_len = 100000;"
    _db.execute()
    _db.cmd = """ select  ap.id, count(*) cnt, group_concat(convert(apw.file_id, char(15))) as file_ids
                  from bacteria_3plets ap
                  inner join bacteria_3plets_win10 apw on ap.id = apw.kplet_id
                  group by ap.id
                  having count(*)>1
                  order by cnt desc"""

    return _db.retrieve()
Пример #17
0
def map_baited_file2id():

    _db = DbClass()
    _db.cmd = """ select name, id from prok1402_baited_files"""
    rows = _db.retrieve()

    file2id = {}
    id2file = {}

    for row in rows:
        file2id[row[0]] = row[1]
        id2file[row[1]] = row[0]

    return file2id, id2file
Пример #18
0
def get_kplet_id(profiles):
    profile2id = t.map_profile2id(profiles)

    _db = DbClass()

    sql_cmd = """select id from archea_4plets where kplet_1=%d and kplet_2=%d and kplet_3=%d and kplet_4=%d"""
    sql_cmd = sql_cmd % (profile2id[profiles[0]], profile2id[profiles[1]], profile2id[profiles[2]], profile2id[profiles[3]])

    _db.cmd = sql_cmd

    rows = _db.retrieve()

    if rows:
        return rows[0][0]
    return None
Пример #19
0
def file2src_src2org_map(files):

    _db = DbClass()
    _db.cmd = """select awf.name, s.name, g.name
                  from bacteria_win10_files awf
                  inner join sources s on awf.source_id=s.id
                  inner join genomes g on s.genome_id = g.id
                  where awf.name in ('%s')""" % "','".join(files)

    _src2org = {}
    _file2src = {}
    for row in _db.retrieve():
        parts = row
        [_file, _src, _org] = parts
        _src2org[_src] = _org
        _file2src[_file] = _src

    return _file2src, _src2org
Пример #20
0
def get_archaea_kplets():

    sql_cmd = """select apc.*, s1.cnt, s1.wgt
                 from (
                        select ap.id ,count(*) as cnt, sum(w.weight) as wgt
                        from archea_5plets ap
                        inner join archea_5plets_win10 apw on ap.id = apw.kplet_id
                        inner join archea_win10_files awf on apw.file_id = awf.id
                        inner join sources s on awf.source_id=s.id
                        inner join weights w on w.genome_id=s.genome_id
                        group by ap.id ) s1
                 inner join archea_5plets_codes apc on s1.id=apc.id
                 order by s1.wgt desc"""

    _db = DbClass()
    _db.cmd = sql_cmd

    return _db.retrieve()
Пример #21
0
def extract_baited_duplet_aggregates(duplet_type="adj"):

    if duplet_type == "adj":
        sql_cmd = """select d.id, 
                        p1.code as profile_1, 
                        p2.code as profile_2, 
                        count(*) as loci, 
                        sum(w.weight) as loci_weight, 
                        group_concat(CONVERT(df.file_id, CHAR(20)) separator ',') as files  
                    from prok1402_baited_adj_duplet d 
                    inner join prok1402_baited_adj_duplet_file df on d.id = df.kplet_id 
                    inner join cdd_profiles p1 on d.kplet_1 = p1.id 
                    inner join cdd_profiles p2 on d.kplet_2 = p2.id 
                    inner join prok1402_baited_files bf on bf.id = df.file_id 
                    inner join sources s on bf.source_id = s.id 
                    inner join genomes g on g.id = s.genome_id 
                    inner join weights w on w.genome_id = g.id 
                    group by d.id 
                    order by loci desc;"""
    else:
        sql_cmd = """select d.id, 
                        p1.code as profile_1, 
                        p2.code as profile_2, 
                        count(*) as loci, 
                        sum(w.weight) as loci_weight, 
                        group_concat(CONVERT(df.file_id, CHAR(20)) separator ',') as files 
                    from prok1402_baited_duplet d 
                    inner join prok1402_baited_duplet_file df on d.id = df.kplet_id
                    inner join cdd_profiles p1 on d.kplet_1 = p1.id
                    inner join cdd_profiles p2 on d.kplet_2 = p2.id
                    inner join prok1402_baited_files bf on bf.id = df.file_id
                    inner join sources s on bf.source_id = s.id
                    inner join genomes g on g.id = s.genome_id
                    inner join weights w on w.genome_id = g.id
                    group by d.id
                    order by loci desc;"""

    _db = DbClass()
    _db.cmd = sql_cmd

    rows = _db.retrieve()

    return rows
Пример #22
0
def map_file_name2id():

    _db = DbClass()
    _db.cmd = """ select name, id from  bacteria_win10_files"""
    rows = _db.retrieve()
    return {row[0]: row[1] for row in rows}
Пример #23
0
def get_code_kplet(kplet_id):
    _sql_cmd = """select kplet_1, kplet_2, kplet_3, kplet_4, kplet_5 from archea_5plets_codes where id = %s""" % kplet_id
    _db = DbClass()
    _db.cmd = _sql_cmd
    return _db.retrieve()
Пример #24
0
from lib.db import DbClass

_db = DbClass()

# for table_name in ['crispr_4plets_wgs', 'crispr_3plets_wgs', 'crispr_2plets_wgs']:
for table_name in ['crispr_5plets_wgs']:
    print "Starting for table:", table_name
    print
    print "Loading the duplicates"
    _db.cmd = """SELECT count(*) as cnt, kplet_id, file_id
                FROM PatternQuest.%s
                group by kplet_id, file_id
                having count(*)>1
                order by cnt desc""" % table_name

    table_rows = [row for row in _db.retrieve()]

    print("Number of duplicate entries:", len(table_rows))
    fname = "duplicates_%s.txt" % table_name
    with open(fname, 'w') as f:
        [f.write("%d\t%d\t%d\n" % (row)) for row in table_rows]
    print "Duplicates saved in file:", fname
    continue
    sys.exit()
    print("Starting to remove duplicates")

    total_cnt = 0
    chunk_size = 10000
    chunks = [
        table_rows[i:i + chunk_size]
        for i in range(0, len(table_rows), chunk_size)
Пример #25
0
def map_source2id():

    _db = DbClass()
    _db.cmd = """ SELECT id, name FROM PatternQuest.sources; """
    rows = _db.retrieve()
    return {row[1]: row[0] for row in rows}
Пример #26
0
def map_file_id2name():

    _db = DbClass()
    _db.cmd = """select id, name from bacteria_win10_files"""
    return {str(l[0]): l[1] for l in _db.retrieve()}
from lib.db import DbClass

_db = DbClass()

# for table_name in ['crispr_4plets_wgs', 'crispr_3plets_wgs', 'crispr_2plets_wgs']:
for table_name in ['crispr_5plets_wgs']:
    print "Starting for table:", table_name
    print
    print "Loading the duplicates"
    _db.cmd = """SELECT count(*) as cnt, kplet_id, file_id
                FROM PatternQuest.%s
                group by kplet_id, file_id
                having count(*)>1
                order by cnt desc""" % table_name

    table_rows = [row for row in _db.retrieve()]

    print("Number of duplicate entries:", len(table_rows))
    fname = "duplicates_%s.txt" % table_name
    with open(fname, 'w') as f:
        [f.write("%d\t%d\t%d\n"%(row)) for row in table_rows]
    print "Duplicates saved in file:", fname
    continue
    sys.exit()
    print("Starting to remove duplicates")

    total_cnt = 0
    chunk_size = 10000
    chunks = [table_rows[i: i+chunk_size] for i in range(0, len(table_rows), chunk_size)]

    for row in table_rows: