Пример #1
0
def store_kplets_pile(kplets_pile, cdd2id, file2id):

    _sql_kplet = """insert ignore into bacteria_3plets (kplet_1, kplet_2, kplet_3) values \n"""

    _sql_kplet_file = """insert ignore into bacteria_3plets_win10 (kplet_id, file_id) values \n"""

    for (kplets, fname) in kplets_pile:

        for kplet in kplets:
            kplet = list(kplet)
            kplet.sort()
            kplet = tuple([int(cdd2id[k]) for k in kplet])

            _sql_kplet += """(%d, %d, %d),\n""" % kplet

            _sql_kplet_file += (
                """((select id from bacteria_3plets where """ +
                """kplet_1=%d and kplet_2=%d and kplet_3=%d),""" +
                """%d),\n""") % (kplet + (int(file2id[fname]), ))

    _sql_kplet = _sql_kplet[:-2]
    _sql_kplet += ';'

    _sql_kplet_file = _sql_kplet_file[:-2]
    _sql_kplet_file += ';'

    _db = DbClass()

    _db.cmd = _sql_kplet
    _db.execute()
    _db.commit()

    _db.cmd = _sql_kplet_file
    _db.execute()
    _db.commit()
Пример #2
0
def get_report_kplets(id2cdd, limit_to=500):

    _db = DbClass()
    _db.cmd = """SET group_concat_max_len=15000000"""
    _db.execute()

    _db.cmd = """select cp.*, count(distinct cwf.id) as cnt, group_concat(cwf.name) as files
                from crispr_5plets cp
                inner join crispr_5plets_wgs cpw on cp.id = cpw.kplet_id
                inner join crispr_wgs_files cwf on cpw.file_id = cwf.id
                group by cp.id
                having count(distinct cwf.id)>1
                order by cnt desc
                limit 0, %d""" % limit_to

    out_list = []

    for row in _db.retrieve():
        id = row[0]
        kplet_codes = ([id2cdd[int(_id)] for _id in row[1:6]])
        if len(set(kplet_codes)) != 5:
            continue
        count = row[6]
        files = row[7].split(',')
        tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files)
        out_list.append(tmp_kplet)

    return out_list
Пример #3
0
def get_report_kplets(id2cdd, limit_to=500, load_locations=None):

    _db = DbClass()
    _db.cmd = """SET group_concat_max_len=1500000"""
    _db.execute()

    _db.cmd = """select ap.* ,count(*) as cnt, sum(w.weight) as wgt, group_concat(awf.name) as an
                 from bacteria_3plets ap
                 inner join bacteria_3plets_win10 apw on ap.id = apw.kplet_id
                 inner join bacteria_win10_files awf on apw.file_id = awf.id
                 inner join sources s on awf.source_id=s.id
                 inner join weights w on w.genome_id=s.genome_id
                 group by ap.id
                 having count(distinct s.genome_id)>1
                 order by wgt desc
                 limit 0, %d""" % limit_to

    out_list = []

    for row in _db.retrieve():
        id = row[0]
        kplet_codes = ([id2cdd[int(_id)] for _id in row[1:4]])
        if len(set(kplet_codes)) != 3:
            continue
        count = row[4]
        weight = row[5]
        files = row[6].split(',')
        tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files)
        out_list.append(tmp_kplet)

    _path = neighborhoods_path()
    if load_locations:
        [kplet.load_locations(_path) for kplet in out_list]

    return out_list
Пример #4
0
def get_report_kplet(kplet_id, id2cdd, load_locations=None):

    _db = DbClass()
    _db.cmd = """SET group_concat_max_len=1500000;"""
    _db.execute()
    _db.cmd = """select ap.* ,count(*) as cnt, sum(w.weight) as wgt, group_concat(awf.name) as an
                from bacteria_5plets ap
                inner join bacteria_5plets_win10 apw on ap.id = apw.kplet_id
                inner join bacteria_win10_files awf on apw.file_id = awf.id
                inner join sources s on awf.source_id=s.id
                inner join weights w on w.genome_id=s.genome_id
                where ap.id=%d
                group by ap.id""" % kplet_id

    out_list = []

    for row in _db.retrieve():
        id = row[0]
        kplet_codes = [id2cdd[int(_id)] for _id in row[1:6]]

        if len(set(kplet_codes)) != 5:
            continue
        count = row[6]
        files = row[8].split(',')
        tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files)
        out_list.append(tmp_kplet)

    _path = neighborhoods_path()
    if load_locations:
        [kplet.load_locations(_path) for kplet in out_list]

    return out_list[0]
Пример #5
0
def get_report_kplets(id2cdd, limit_to=500, load_locations=None):

    _db = DbClass()
    _db.cmd = """SET group_concat_max_len=1500000"""
    _db.execute()

    _db.cmd = """select ap.* ,count(*) as cnt, sum(w.weight) as wgt, group_concat(awf.name) as an
                 from bacteria_3plets ap
                 inner join bacteria_3plets_win10 apw on ap.id = apw.kplet_id
                 inner join bacteria_win10_files awf on apw.file_id = awf.id
                 inner join sources s on awf.source_id=s.id
                 inner join weights w on w.genome_id=s.genome_id
                 group by ap.id
                 having count(distinct s.genome_id)>1
                 order by wgt desc
                 limit 0, %d""" % limit_to

    out_list = []

    for row in _db.retrieve():
        id = row[0]
        kplet_codes = ([id2cdd[int(_id)] for _id in row[1:4]])
        if len(set(kplet_codes)) != 3:
            continue
        count = row[4]
        weight = row[5]
        files = row[6].split(',')
        tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files)
        out_list.append(tmp_kplet)

    _path = neighborhoods_path()
    if load_locations:
        [kplet.load_locations(_path) for kplet in out_list]

    return out_list
Пример #6
0
def store_kplets_pile(kplets_pile, cdd2id, file2id):

    _sql_kplet = """insert ignore into bacteria_3plets (kplet_1, kplet_2, kplet_3) values \n"""

    _sql_kplet_file = """insert ignore into bacteria_3plets_win10 (kplet_id, file_id) values \n"""

    for (kplets, fname) in kplets_pile:

        for kplet in kplets:
            kplet = list(kplet)
            kplet.sort()
            kplet = tuple([int(cdd2id[k]) for k in kplet])

            _sql_kplet += """(%d, %d, %d),\n""" % kplet

            _sql_kplet_file += ("""((select id from bacteria_3plets where """ +
                                """kplet_1=%d and kplet_2=%d and kplet_3=%d),""" +
                                """%d),\n""") % (kplet + (int(file2id[fname]),))

    _sql_kplet = _sql_kplet[:-2]
    _sql_kplet += ';'

    _sql_kplet_file = _sql_kplet_file[:-2]
    _sql_kplet_file += ';'

    _db = DbClass()

    _db.cmd = _sql_kplet
    _db.execute()
    _db.commit()

    _db.cmd = _sql_kplet_file
    _db.execute()
    _db.commit()
Пример #7
0
def insert_source_duplets(kplets_pile, profile2id, source_id):

    sql_insert_kplet = """insert ignore into prok1402_all_adj_duplet (kplet_1, kplet_2, multidomain) values \n"""
    sql_insert_kplet_file = """insert into prok1402_all_adj_duplet_source (kplet_id, source_id) values \n"""

    for (duplet, fname, multidomain) in kplets_pile:

        terms = sorted(list(duplet))
        terms = [profile2id[_] for _ in terms]

        sql_insert_kplet += """(%s, %s, %d),\n""" % (terms[0], terms[1], multidomain)

        sql_insert_kplet_file += """((select id from prok1402_all_adj_duplet where """ + \
                                  """kplet_1=%s and kplet_2=%s and multidomain=%d), %d),\n""" % \
                                  (terms[0], terms[1], multidomain, source_id)

    sql_insert_kplet_file = sql_insert_kplet_file[:-2]
    sql_insert_kplet_file += ";"

    sql_insert_kplet = sql_insert_kplet[:-2]
    sql_insert_kplet += ";"

    _db = DbClass()

    _db.cmd = sql_insert_kplet
    _db.execute()

    _db.cmd = sql_insert_kplet_file
    _db.execute()

    _db.commit()
Пример #8
0
def insert_duplets(kplets_pile, profile2id, file2id):

    sql_insert_kplet = """insert ignore into prok1402_baited_duplet (kplet_1, kplet_2) values \n"""
    sql_insert_kplet_file = """insert ignore into prok1402_baited_duplet_file (kplet_id, file_id) values \n"""

    for (duplet, fname) in kplets_pile:

        terms = sorted(list(duplet))
        terms = [profile2id[_] for _ in terms]

        fname = os.path.basename(fname)
        file_id = file2id[fname]

        sql_insert_kplet += """(%s, %s),\n""" % (terms[0], terms[1])

        sql_insert_kplet_file += """((select id from prok1402_baited_duplet where """ + \
                                  """kplet_1=%s and kplet_2=%s), %d),\n""" % (terms[0], terms[1] ,file_id)

    sql_insert_kplet_file = sql_insert_kplet_file[:-2]
    sql_insert_kplet_file += ";"

    sql_insert_kplet = sql_insert_kplet[:-2]
    sql_insert_kplet += ";"

    _db = DbClass()

    _db.cmd = sql_insert_kplet
    _db.execute()

    _db.cmd = sql_insert_kplet_file
    _db.execute()

    _db.commit()
Пример #9
0
def get_report_kplets(id2cdd, limit_to=500):

    _db = DbClass()
    _db.cmd = """SET group_concat_max_len=1500000"""
    _db.execute()

    _db.cmd = """select cp.*, count(distinct cwf.id) as cnt, group_concat(cwf.name) as files
                from crispr_2plets cp
                inner join crispr_2plets_wgs cpw on cp.id = cpw.kplet_id
                inner join crispr_wgs_files cwf on cpw.file_id = cwf.id
                group by cp.id
                having count(distinct cwf.id)>1
                order by cnt desc
                limit 0, %d""" % limit_to

    out_list = []

    for row in _db.retrieve():
        id = row[0]
        kplet_codes = ([id2cdd[int(_id)] for _id in row[1:3]])
        if len(set(kplet_codes)) != 2:
            continue
        count = row[3]
        files = row[4].split(',')
        tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files)
        out_list.append(tmp_kplet)

    return out_list
Пример #10
0
def insert_kplet(profiles):
    profile2id = t.map_profile2id(profiles)

    _db = DbClass()

    sql_cmd = """insert into archea_4plets (kplet_1, kplet_2, kplet_3, kplet_4) values (%d, %d, %d, %d)"""
    sql_cmd = sql_cmd % (profile2id[profiles[0]], profile2id[profiles[1]], profile2id[profiles[2]], profile2id[profiles[3]])
    _db.cmd = sql_cmd
    _db.execute()
Пример #11
0
def get_multiple_kplets():

    _db = DbClass()
    _db.cmd = "SET group_concat_max_len = 100000;"
    _db.execute()
    _db.cmd = """ select  ap.id, count(*) cnt, group_concat(convert(apw.file_id, char(15))) as file_ids
                  from bacteria_3plets ap
                  inner join bacteria_3plets_win10 apw on ap.id = apw.kplet_id
                  group by ap.id
                  having count(*)>1
                  order by cnt desc"""

    return _db.retrieve()
Пример #12
0
def get_multiple_kplets():

    _db = DbClass()
    _db.cmd = "SET group_concat_max_len = 100000;"
    _db.execute()
    _db.cmd = """ select  ap.id, count(*) cnt, group_concat(convert(apw.file_id, char(15))) as file_ids
                  from bacteria_3plets ap
                  inner join bacteria_3plets_win10 apw on ap.id = apw.kplet_id
                  group by ap.id
                  having count(*)>1
                  order by cnt desc"""

    return _db.retrieve()
Пример #13
0
def store_kplets_pile(prefix, kplets_pile, profile2id, file2id):

    kplet_table_name = "%s_3plets" % prefix
    kplet_table_file_name = "%s_3plets_files" % prefix

    _sql_kplet = """insert ignore into %s (kplet_1, kplet_2, kplet_3) values \n""" % kplet_table_name
    _sql_kplet_file = """insert ignore into %s (kplet_id, file_id) values \n""" % kplet_table_file_name

    for (kplets, fname) in kplets_pile:

        for kplet in kplets:
            kplet = list(kplet)
            kplet.sort()

            kplet = tuple([int(profile2id[k]) for k in kplet])

            _sql_kplet += """(%d, %d, %d),\n""" % kplet

            _sql_kplet_file += (
                """((select id from %s where """ +
                """kplet_1=%d and kplet_2=%d and kplet_3=%d),""" +
                """%d),\n""") % ((kplet_table_name, ) + kplet +
                                 (int(file2id[fname]), ))

    _sql_kplet = _sql_kplet[:-2]
    _sql_kplet += ';'

    _sql_kplet_file = _sql_kplet_file[:-2]
    _sql_kplet_file += ';'

    _db = DbClass()

    _db.cmd = _sql_kplet
    _db.execute()
    _db.commit()

    _db.cmd = _sql_kplet_file
    _db.execute()
    _db.commit()
Пример #14
0
def get_report_kplets(limit_to=300, count_filter=1, load_locations=None):

    _db = DbClass()

    _db.cmd = """SET group_concat_max_len=15000"""
    _db.execute()

    _db.cmd = """select apc.*, s1.cnt, s1.wgt, s1.an
                from (
                        select ap.id ,count(*) as cnt, sum(w.weight) as wgt, group_concat(awf.name) as an
                        from archea_2plets ap
                        inner join archea_2plets_win10 apw on ap.id = apw.kplet_id
                        inner join archea_win10_files awf on apw.file_id = awf.id
                        inner join sources s on awf.source_id=s.id
                        inner join weights w on w.genome_id=s.genome_id
                        group by ap.id
                        having count(distinct s.genome_id) > %d ) s1
                inner join archea_2plets_codes apc on s1.id=apc.id
                order by s1.wgt desc
                limit 0,%d""" % (count_filter, limit_to)

    out_list = []

    for row in _db.retrieve():
        id = row[0]
        kplet_codes = (row[1:3])
        if len(set(kplet_codes)) != 2:
            continue
        count = row[3]
        files = row[5].split(',')
        tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files)
        out_list.append(tmp_kplet)


    if load_locations:
        _path = neighborhoods_path()
        [kplet.load_locations(_path) for kplet in out_list]

    return out_list
Пример #15
0
def store_kplets(kplets, fname):

    _sql_kplet = """insert ignore into bacteria_4plets (kplet_1, kplet_2, kplet_3, kplet_4) values \n"""

    _sql_kplet_file = """insert ignore into bacteria_4plets_win10 (kplet_id, file_id) values \n"""

    for kplet in kplets:
        kplet = list(kplet)
        kplet.sort()
        kplet = tuple(kplet)
        _sql_kplet += (
            """((select id from cdd_profiles where code='%s'),""" +
            """(select id from cdd_profiles where code='%s'),""" +
            """(select id from cdd_profiles where code='%s'),""" +
            """(select id from cdd_profiles where code='%s')),\n""") % kplet

        _sql_kplet_file += (
            """((select id from bacteria_4plets where """ +
            """kplet_1=(select id from cdd_profiles where code='%s') """ +
            """and kplet_2=(select id from cdd_profiles where code='%s') """ +
            """and kplet_3=(select id from cdd_profiles where code='%s') """ +
            """and kplet_4=(select id from cdd_profiles where code='%s')),""" +
            """(select id from bacteria_win10_files where name='%s')),\n"""
        ) % (kplet + (fname, ))

    _sql_kplet = _sql_kplet[:-2]
    _sql_kplet += ';'

    _sql_kplet_file = _sql_kplet_file[:-2]
    _sql_kplet_file += ';'

    _db = DbClass()

    _db.cmd = _sql_kplet
    _db.execute()

    _db.cmd = _sql_kplet_file
    _db.execute()
Пример #16
0
def get_report_kplets(limit_to=300, count_filter=1, load_locations=None):

    _db = DbClass()

    _db.cmd = """SET group_concat_max_len=15000"""
    _db.execute()

    _db.cmd = """select apc.*, s1.cnt, s1.wgt, s1.an
                from (
                        select ap.id ,count(*) as cnt, sum(w.weight) as wgt, group_concat(awf.name) as an
                        from archea_2plets ap
                        inner join archea_2plets_win10 apw on ap.id = apw.kplet_id
                        inner join archea_win10_files awf on apw.file_id = awf.id
                        inner join sources s on awf.source_id=s.id
                        inner join weights w on w.genome_id=s.genome_id
                        group by ap.id
                        having count(distinct s.genome_id) > %d ) s1
                inner join archea_2plets_codes apc on s1.id=apc.id
                order by s1.wgt desc
                limit 0,%d""" % (count_filter, limit_to)

    out_list = []

    for row in _db.retrieve():
        id = row[0]
        kplet_codes = (row[1:3])
        if len(set(kplet_codes)) != 2:
            continue
        count = row[3]
        files = row[5].split(',')
        tmp_kplet = Kplet(id=id, codes=kplet_codes, count=count, files=files)
        out_list.append(tmp_kplet)

    if load_locations:
        _path = neighborhoods_path()
        [kplet.load_locations(_path) for kplet in out_list]

    return out_list
Пример #17
0
def store_kplets_pile(prefix, kplets_pile, profile2id, file2id):

    kplet_table_name = "%s_4plets" % prefix
    kplet_table_file_name = "%s_4plets_files" % prefix

    _sql_kplet = """insert ignore into %s (kplet_1, kplet_2, kplet_3, kplet_4) values \n""" % kplet_table_name
    _sql_kplet_file = """insert ignore into %s (kplet_id, file_id) values \n""" % kplet_table_file_name

    for (kplets, fname) in kplets_pile:

        for kplet in kplets:
            kplet = list(kplet)
            kplet.sort()

            kplet = tuple([int(profile2id[k]) for k in kplet])

            _sql_kplet += """(%d, %d, %d, %d),\n""" % kplet

            _sql_kplet_file += ("""((select id from %s where """ +
                                """kplet_1=%d and kplet_2=%d and kplet_3=%d and kplet_4=%d),""" +
                                """%d),\n""") % ((kplet_table_name,) + kplet + (int(file2id[fname]),))

    _sql_kplet = _sql_kplet[:-2]
    _sql_kplet += ';'

    _sql_kplet_file = _sql_kplet_file[:-2]
    _sql_kplet_file += ';'

    _db = DbClass()

    _db.cmd = _sql_kplet
    _db.execute()
    _db.commit()

    _db.cmd = _sql_kplet_file
    _db.execute()
    _db.commit()
Пример #18
0
    total_cnt = 0
    chunk_size = 10000
    chunks = [
        table_rows[i:i + chunk_size]
        for i in range(0, len(table_rows), chunk_size)
    ]

    for row in table_rows:

        (_cnt, _kplet_id, _file_id) = row

        assert _cnt > 1

        _db.cmd = """delete from %s where kplet_id=%d and file_id=%d""" % (
            table_name, _kplet_id, _file_id)
        _db.execute()
        _db.commit()

        _db.cmd = """insert into %s(kplet_id, file_id) values (%d, %d)""" % (
            table_name, _kplet_id, _file_id)
        _db.execute()
        _db.commit()

        total_cnt += 1
        if total_cnt % 10000 == 0:
            print total_cnt

    print total_cnt
    print "Finished:", table_name
    print
    print
    print "Duplicates saved in file:", fname
    continue
    sys.exit()
    print("Starting to remove duplicates")

    total_cnt = 0
    chunk_size = 10000
    chunks = [table_rows[i: i+chunk_size] for i in range(0, len(table_rows), chunk_size)]

    for row in table_rows:

        (_cnt, _kplet_id, _file_id) = row

        assert _cnt > 1

        _db.cmd = """delete from %s where kplet_id=%d and file_id=%d""" % (table_name, _kplet_id, _file_id)
        _db.execute()
        _db.commit()

        _db.cmd = """insert into %s(kplet_id, file_id) values (%d, %d)""" % (table_name, _kplet_id, _file_id)
        _db.execute()
        _db.commit()

        total_cnt += 1
        if total_cnt % 10000 ==0:
            print total_cnt

    print total_cnt
    print "Finished:", table_name
    print
    print