Example #1
def merging_pipeline_for_order(order, load_from_db=False):
    limit_to = 1000000000
    print "starting for ", order
    if load_from_db:
        print "Loading kplets from DB"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
            kplets = d.get_report_kplets(profile_id2code, limit_to=limit_to)
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
            kplets = tr.get_report_kplets(profile_id2code, limit_to=limit_to)
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
            kplets = q.get_report_kplets(profile_id2code, limit_to=limit_to)
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'
            kplets = p.get_report_kplets(profile_id2code, limit_to=limit_to)
    else:
        print "Loading kplets from pickle file"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'
        kplet_file_full = os.path.join(data_path, kplet_file)
        print kplet_file_full
        kplets = t.load_compressed_pickle(kplet_file_full)

    # print "Starting for", kplet_file
    # print "Loading kplets"
    # kplets = t.load_compressed_pickle(os.path.join(data_path, kplet_file))
    print "No of kplets:", len(kplets)

    #print "Loading file2genes"

    #_file2genes = {}
    #for _f in os.listdir(neighborhood_files_path):
    #    _file2genes[_f] = dt.get_wgs_file(os.path.join(neighborhood_files_path, _f))

    # print 'Filtering'
    # kplets = filter_seed(kplets, _file2genes)
    # print "No of kplets:", len(kplets)
    # fname = os.path.join(data_path,  kplet_file.split('.')[0]+'_seed.p.bz2')
    # print 'Dumping', fname
    # t.dump_compressed_pickle(fname, kplets)

    print "Basic merging"
    merged_lists = merging.basic_merge_within_orders(kplets)
    fname = os.path.join(data_path, "basic_merged_" + kplet_file)
    print "Dumping basic merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)

    print "Iterative merging"
    merged_lists = merging.merge_kplets_within_orders_iterative(merged_lists)
    fname = os.path.join(data_path, "iterative_merged_" + kplet_file)
    print "Dumping Iterative merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)
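Note: the helpers t.dump_compressed_pickle and t.load_compressed_pickle used throughout these examples are not shown. A minimal sketch of what such helpers could look like, assuming they simply wrap pickle in bz2 compression (consistent with the bz2.BZ2File / pickle.load usage in Example #2); this is an illustration, not the project's actual implementation:

import bz2
import pickle


def dump_compressed_pickle(file_name, obj):
    # write an object as a bz2-compressed pickle (assumed behavior)
    with bz2.BZ2File(file_name, 'w') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)


def load_compressed_pickle(file_name):
    # read an object back from a bz2-compressed pickle (assumed behavior)
    with bz2.BZ2File(file_name, 'r') as f:
        return pickle.load(f)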
Example #2
def get_profiles_counts(data_path):
    # file_names = ['pentaplets', 'quadruplets', 'triplets', 'duplets']
    file_names = ['duplets', 'triplets', 'quadruplets', 'pentaplets']
    print 'Reading merged kplet files'

    for file_name in file_names:
        print 'Loading the file:', file_name
        dump_file = bz2.BZ2File(os.path.join(data_path, '%s_merged_across.p.bz2'%file_name))
        kplets_pool = pickle.load(dump_file)

        print 'Counting community'
        community_count_pool = []
        community_count_pool_with_flanks = []
        for kplets in kplets_pool:
            _src2org, _, _, community_count, community_count_with_flanks = merging.merge_into_file_summaries(kplets,
                                                                         neighborhood_files_path,
                                                                         file2src_src2org_map)
            if not _src2org:
                continue
            community_count_pool.append(community_count)
            community_count_pool_with_flanks.append(community_count_with_flanks)

        dump_file_name = os.path.join(data_path, '%s_community_count.p.bz2'%file_name)
        print 'Dumping into', dump_file_name
        t.dump_compressed_pickle(dump_file_name, community_count_pool)

        dump_file_name = os.path.join(data_path, '%s_community_count_with_flanks.p.bz2'%file_name)
        print 'Dumping into', dump_file_name
        t.dump_compressed_pickle(dump_file_name, community_count_pool_with_flanks)

        print
        print
Example #3
def generate_reports(merged_lists, reports_dir, neighborhood_files_path):

    if not os.path.exists(reports_dir):
        os.mkdir(reports_dir)

    summary_file = os.path.join(reports_dir, 'summary.xls')
    workbook = x.Workbook(summary_file)
    worksheet = workbook.add_worksheet()

    header_format = workbook.add_format()
    header_format.set_font_size(12)
    header_format.set_bold()
    header_format.set_align('center')
    worksheet.set_column(3,3,50)
    worksheet.write_row(0, 0, ["File name", "Weight", "Loci", "CRISPR/Cas systems"], header_format)

    file_summary_list = []
    filter_weak_hits = False

    for i, kplet_list in enumerate(merged_lists):

        ret = merging.kplet_list_to_file_summaries(kplet_list,
                                                   neighborhood_files_path,
                                                   filter_weak_hits)

        if not ret or not ret.file_summaries:
            continue

        file_summary_list.append(ret)

    file_summaries_list = sorted(file_summary_list, key=lambda x: x.weight, reverse=True)

    ind = 0
    for file_summaries_wrapper in file_summaries_list:
        ind += 1
        t.dump_compressed_pickle('file_summaries_wrapper.p', file_summaries_wrapper)

        xls_file_name = os.path.join(reports_dir, '%d.xls' % ind)
        args = GenericReportingInput()

        args.xls_file_name               = xls_file_name
        args.file_summaries              = file_summaries_wrapper.file_summaries
        args.organisms                   = file_summaries_wrapper.organisms
        args.weight                      = file_summaries_wrapper.weight
        args.profile_code2def            = profile_code2def
        args.local_af_kplet2count        = file_summaries_wrapper.kplet2count_af
        args.local_bf_kplet2count        = file_summaries_wrapper.kplet2count_bf
        args.local_profile2count_bf      = file_summaries_wrapper.profile2count_bf
        args.local_profile2count_af      = file_summaries_wrapper.profile2count_af
        args.cas_type2count              = file_summaries_wrapper.cas_type2count

        r.write_to_xls_generic_kplets(args)

        cas_type_summary = ""

        for (cas_type, count) in sorted(file_summaries_wrapper.cas_type2count.items(), key = itemgetter(1), reverse=True):
            cas_type_summary += "%s : %d ; "%(cas_type, count)

        worksheet.write_row(ind+1, 0, ['%d.xls' % ind,
                                       file_summaries_wrapper.weight,
                                       len(file_summaries_wrapper.file_summaries),
                                       cas_type_summary])

    workbook.close()
Example #4
def generate_data():

    print "Loading loci"
    loci = [Locus(os.path.join(files_path, f)) for f in os.listdir(files_path)]
    loci = [locus for locus in loci if len(locus.genes) > 2]

    fname = os.path.join(pickle_path, 'loci.p.bz2')
    t.dump_compressed_pickle(fname, loci)
    # loci = t.load_compressed_pickle(fname)
    out_file = os.path.join(pickle_path, 'jw_scores.npz')
    jackard_weighted_scores(loci, out_file)
Example #6
def generate_pickles(save_path, limit_to):
    if not os.path.exists(save_path):
        os.mkdir(save_path)

    duplets = d.get_report_kplets(limit_to=limit_to, load_locations=True)

    print 'Dumping to files'
    dump_file = os.path.join(save_path, 'duplets_raw.p.bz2')
    t.dump_compressed_pickle(dump_file, duplets)

    print 'Done for limit_to:', limit_to
    print
    print
Example #8
def generate_pickles(save_path, limit_to):

    if not os.path.exists(save_path):
        os.mkdir(save_path)

    print "Loading from DB"
    print "pentaplets"
    pentaplets  =  p.get_report_kplets(profile_id2code, limit_to=limit_to)
    print "quadruplets"
    quadruplets =  q.get_report_kplets(profile_id2code, limit_to=limit_to)
    print "triplets"
    triplets    = tr.get_report_kplets(profile_id2code, limit_to=limit_to)
    print "duplets"
    duplets     =  d.get_report_kplets(profile_id2code, limit_to=limit_to)

    print "Dumping to files"

    dump_file = os.path.join(save_path, 'duplets.p.bz2')
    print dump_file
    t.dump_compressed_pickle(dump_file, duplets)

    dump_file = os.path.join(save_path, 'triplets.p.bz2')
    print dump_file
    t.dump_compressed_pickle(dump_file, triplets)

    dump_file = os.path.join(save_path, 'quadruplets.p.bz2')
    print dump_file
    t.dump_compressed_pickle(dump_file, quadruplets)

    dump_file = os.path.join(save_path, 'pentaplets.p.bz2')
    print dump_file
    t.dump_compressed_pickle(dump_file, pentaplets)
Example #9
def generate_pickles(save_path, limit_to):

    if not os.path.exists(save_path):
        os.mkdir(save_path)

    print "Loading from DB"
    print "pentaplets"
    pentaplets = p.get_report_kplets(profile_id2code, limit_to=limit_to)
    print "quadruplets"
    quadruplets = q.get_report_kplets(profile_id2code, limit_to=limit_to)
    print "triplets"
    triplets = tr.get_report_kplets(profile_id2code, limit_to=limit_to)
    print "duplets"
    duplets = d.get_report_kplets(profile_id2code, limit_to=limit_to)

    print "Dumping to files"

    dump_file = os.path.join(save_path, 'duplets.p.bz2')
    print dump_file
    t.dump_compressed_pickle(dump_file, duplets)

    dump_file = os.path.join(save_path, 'triplets.p.bz2')
    print dump_file
    t.dump_compressed_pickle(dump_file, triplets)

    dump_file = os.path.join(save_path, 'quadruplets.p.bz2')
    print dump_file
    t.dump_compressed_pickle(dump_file, quadruplets)

    dump_file = os.path.join(save_path, 'pentaplets.p.bz2')
    print dump_file
    t.dump_compressed_pickle(dump_file, pentaplets)
Example #10
def generate_pickle_order(prefix, order, save_path, limit_to):

    print "Loading from DB"
    if order == 2:
        print 'duplets'
        data_file = 'duplets.p.bz2'
        kplets = d.get_report_kplets(prefix,
                                     profile_id2code,
                                     limit_to=limit_to)
    elif order == 3:
        print 'triplets'
        data_file = 'triplets.p.bz2'
        kplets = tr.get_report_kplets(prefix,
                                      profile_id2code,
                                      limit_to=limit_to)
    elif order == 4:
        print 'quadruplets'
        data_file = 'quadruplets.p.bz2'
        kplets = q.get_report_kplets(prefix,
                                     profile_id2code,
                                     limit_to=limit_to)
    elif order == 5:
        print 'pentaplets'
        data_file = 'pentaplets.p.bz2'
        kplets = p.get_report_kplets(prefix,
                                     profile_id2code,
                                     limit_to=limit_to)

    # # block for workaround of too big a pentaplet
    # print 'Loading file2genes'
    # neighborhood_files_path = os.path.join(gv.project_data_path,'CRISPR/datasets/crispr/wgs')
    # _file2genes = {}
    # for _f in os.listdir(neighborhood_files_path):
    #     _file2genes[_f] = dt.get_wgs_file(os.path.join(neighborhood_files_path, _f))
    #
    # kplets = filter_seed(kplets, _file2genes)

    # dump_file = os.path.join(save_path, data_file)
    # print "Dumtiping to file", dump_file
    # t.dump_compressed_pickle(kplets, dump_file)
    # print "Finished"
    # sys.exit()

    dump_file = os.path.join(save_path, data_file)
    print "Dumping to file", dump_file
    t.dump_compressed_pickle(dump_file, kplets)
    print "Finished"
Example #11
def cas4_extract_dendrogram():

    work_dir = os.path.join(gv.project_data_path, 'cas4')

    print "Loading loci"

    def_file = os.path.join(gv.project_data_path, 'cas4/profiles/defenseProfiles.tab')
    profile2gene = {}

    for l in open(def_file):
        terms = l.split('\t')
        profile = terms[0]
        gene_names = terms[3].split(',')
        if len(gene_names) > 1:
            profile2gene[profile] = gene_names[1]
        else:
            profile2gene[profile] = gene_names[0]

    cdd_profile2gene = t.map_cdd_profile2gene_name()
    cdd_profile2gene.update(profile2gene)

    files_path = os.path.join(work_dir, 'files')

    loci = [Locus(os.path.join(files_path, f), file_format='generic', profile2gene=cdd_profile2gene) for f in
            os.listdir(files_path)]

    tic = time.time()
    print "Generating score matrix"
    M = scores.generate_jackard_score_matrix(loci)

    tac = time.time() - tic
    print "Elapsed time:", float(tac) / 60 / 60, float(tac) / 60, float(tac)

    tic = time.time()
    jw_file = os.path.join(work_dir, 'pickle/jw_scores.p.bz2')
    print "Dumping JW scores to:", jw_file
    t.dump_compressed_pickle(jw_file, M)
    tac = time.time() - tic
    print "Elapsed time:", float(tac) / 60 / 60, float(tac) / 60, float(tac)

    # print "Loading JW scores from:", prok1603_jw_file
    # M = t.load_compressed_pickle(prok1603_jw_file)

    tree_file = os.path.join(work_dir, 'jw_upgma.tre')
    print "Generating tree:", tree_file
    dnd.convert_score_to_newick(M, [os.path.basename(l.file_name) for l in loci], tree_file)
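The call to dnd.convert_score_to_newick above is not shown in these examples. A rough sketch of the general idea only, assuming the Jaccard scores are similarities in [0, 1] that are converted to distances and clustered by UPGMA with SciPy (the project's own implementation may differ):

import numpy as np
from scipy.cluster.hierarchy import linkage, to_tree
from scipy.spatial.distance import squareform


def score_matrix_to_newick(M, labels, out_file):
    # assumed convention: similarity in [0, 1] -> distance = 1 - similarity
    D = 1.0 - np.asarray(M, dtype=float)
    np.fill_diagonal(D, 0.0)
    Z = linkage(squareform(D, checks=False), method='average')  # UPGMA
    root = to_tree(Z)

    def newick(node, parent_height):
        # branch length = parent height - node height
        if node.is_leaf():
            return "%s:%.6f" % (labels[node.id], parent_height - node.dist)
        left = newick(node.get_left(), node.dist)
        right = newick(node.get_right(), node.dist)
        return "(%s,%s):%.6f" % (left, right, parent_height - node.dist)

    with open(out_file, 'w') as f:
        f.write(newick(root, root.dist) + ";")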
Example #12
def generate_pickle_order(order, save_path, limit_to):

    print "Loading from DB"
    if order == 2:
        print 'duplets'
        data_file = 'duplets.p.bz2'
        kplets = d.get_report_kplets(profile_id2code, limit_to=limit_to)
    elif order == 3:
        print 'triplets'
        data_file = 'triplets.p.bz2'
        kplets = tr.get_report_kplets(profile_id2code, limit_to=limit_to)
    elif order == 4:
        print 'quadruplets'
        data_file = 'quadruplets.p.bz2'
        kplets = q.get_report_kplets(profile_id2code, limit_to=limit_to)
    elif order == 5:
        print 'pentaplets'
        data_file = 'pentaplets.p.bz2'
        kplets = p.get_report_kplets(profile_id2code, limit_to=limit_to)

        # # block for workaround of too big a pentaplet
        # print 'Loading file2genes'
        # neighborhood_files_path = os.path.join(gv.project_data_path,'CRISPR/datasets/crispr/wgs')
        # _file2genes = {}
        # for _f in os.listdir(neighborhood_files_path):
        #     _file2genes[_f] = dt.get_wgs_file(os.path.join(neighborhood_files_path, _f))
        #
        # kplets = filter_seed(kplets, _file2genes)

        # dump_file = os.path.join(save_path, data_file)
        # print "Dumtiping to file", dump_file
        # t.dump_compressed_pickle(kplets, dump_file)
        # print "Finished"
        # sys.exit()

    dump_file = os.path.join(save_path, data_file)
    print "Dumping to file", dump_file
    t.dump_compressed_pickle(dump_file, kplets)
    print "Finished"
Example #13
def get_profiles_counts(data_path):
    # file_names = ['pentaplets', 'quadruplets', 'triplets', 'duplets']
    file_names = ['duplets', 'triplets', 'quadruplets', 'pentaplets']
    print 'Reading merged kplet files'

    for file_name in file_names:
        print 'Loading the file:', file_name
        dump_file = bz2.BZ2File(
            os.path.join(data_path, '%s_merged_across.p.bz2' % file_name))
        kplets_pool = pickle.load(dump_file)

        print 'Counting community'
        community_count_pool = []
        community_count_pool_with_flanks = []
        for kplets in kplets_pool:
            _src2org, _, _, community_count, community_count_with_flanks = merging.merge_into_file_summaries(
                kplets, neighborhood_files_path, file2src_src2org_map)
            if not _src2org:
                continue
            community_count_pool.append(community_count)
            community_count_pool_with_flanks.append(
                community_count_with_flanks)

        dump_file_name = os.path.join(data_path,
                                      '%s_community_count.p.bz2' % file_name)
        print 'Dumping into', dump_file_name
        t.dump_compressed_pickle(dump_file_name, community_count_pool)

        dump_file_name = os.path.join(
            data_path, '%s_community_count_with_flanks.p.bz2' % file_name)
        print 'Dumping into', dump_file_name
        t.dump_compressed_pickle(dump_file_name,
                                 community_count_pool_with_flanks)

        print
        print
Example #14
def graph_from_prok1603():

    genome2weight = parser.map_org2weight()

    pty_path = "/panfs/pan1.be-md.ncbi.nlm.nih.gov/patternquest/data/Prok1603/pty"
    work_dir = os.path.join(data_path, 'prok1603/graph/')

    crisprs = parser.get_crispr_annotations()

    pair2weight = defaultdict(float)
    pair2count = defaultdict(int)
    profile2weight = defaultdict(float)

    cnt = 1

    weighted_sum = 0
    total_weight = 0

    for dir in os.listdir(pty_path):

        if dir not in genome2weight:
            continue

        print cnt, dir
        _weight = genome2weight[dir]

        source_files = [
            f for f in os.listdir(os.path.join(pty_path, dir))
            if f.endswith(".pty2")
        ]

        for source_file in source_files:

            source = os.path.splitext(source_file)[0]

            genes = t.parse_pty_file(os.path.join(pty_path, dir, source_file))

            if not genes:
                continue

            genes += crisprs[source]
            genes = sorted(genes)

            add_to_graph(pair2weight, pair2count, profile2weight, genes,
                         _weight)

            profiles_num = 0

            for gene in genes:
                _p = gene.profiles
                if _p:
                    profiles_num += len(_p)
                else:
                    profiles_num += 1

            if profiles_num > 0:
                weighted_sum += 2 * _weight / profiles_num
                total_weight += _weight

        cnt += 1
        # if cnt == 10:
        #     break

    print "Weighted average of 2/N", weighted_sum / total_weight
    # Result of the above is: 0.0451314274706

    G = nx.Graph()

    for k, v in pair2weight.items():
        p1, p2 = k.split("-")
        G.add_edge(p1, p2, weight=v)

        _count = pair2count[k]
        G.add_edge(p1, p2, count=_count)

    graph_file = os.path.join(work_dir, "adj_graph.p")
    print "Writing to file:", graph_file
    nx.write_gpickle(G, graph_file)

    graph_file = os.path.join(work_dir, "adj_graph.p.bz2")
    print "Writing to file:", graph_file
    t.dump_compressed_pickle(graph_file, G)
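A side note on the pair of add_edge calls above: in a networkx Graph, re-adding an existing edge does not create a parallel edge, it only updates that edge's attribute dictionary, so the single edge ends up carrying both weight and count. The same effect can be achieved in one call; a small self-contained illustration with made-up profile names:

import networkx as nx

pair2weight = {"profA-profB": 0.7}
pair2count = {"profA-profB": 3}

G = nx.Graph()
for pair, weight in pair2weight.items():
    p1, p2 = pair.split("-")
    # one call attaches both attributes to the same edge
    G.add_edge(p1, p2, weight=weight, count=pair2count[pair])

print(list(G.edges(data=True)))
# -> [('profA', 'profB', {'weight': 0.7, 'count': 3})]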
Example #15
def merging_pipeline_for_order(order, data_path, load_from_db=False):
    limit_to = 1000000000
    print "starting for ", order
    if load_from_db:
        print "Loading kplets from DB"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
            kplets = d.get_report_kplets(profile_id2code, limit_to=limit_to)
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
            kplets = tr.get_report_kplets(profile_id2code, limit_to=limit_to)
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
            kplets = q.get_report_kplets(profile_id2code, limit_to=limit_to)
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'
            kplets = p.get_report_kplets(profile_id2code, limit_to=limit_to)
    else:
        print "Loading kplets from pickle file"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'

        kplet_file_full = os.path.join(data_path, kplet_file)
        print "Loading :",kplet_file_full
        kplets = t.load_compressed_pickle(kplet_file_full)

    print "No of kplets:", len(kplets)

    loci_threshold = 0.7

    print "Starting to merge with loci_threshold:", loci_threshold
    #print "Loading file2genes"
    tic = time.time()
    print "Basic merging"
    merged_lists = merging.basic_merge_within_orders(kplets, loci_threshold)
    print "Basic merging done. Merged lists:", len(merged_lists)
    fname = os.path.join(data_path, "basic_merged_"+"%f_"%loci_threshold+kplet_file)
    print "Dumping basic merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)

    print "Iterative merging"
    merged_lists = merging.merge_kplets_within_orders_iterative(merged_lists, loci_threshold)
    print "Iterative merging done. Merged lists:", len(merged_lists)
    fname = os.path.join(data_path, "iterative_merged_"+"%f_"%loci_threshold+kplet_file)
    print "Dumping Iterative merging: ",fname
    t.dump_compressed_pickle(fname, merged_lists)
    print "Completed in:", time.time()-tic, "(s)"

    loci_threshold = 0.8

    print "Starting to merge with loci_threshold:", loci_threshold
    #print "Loading file2genes"
    tic = time.time()
    print "Basic merging"
    merged_lists = merging.basic_merge_within_orders(kplets, loci_threshold)
    print "Basic merging done. Merged lists:", len(merged_lists)
    fname = os.path.join(data_path, "basic_merged_"+"%f_"%loci_threshold+kplet_file)
    print "Dumping basic merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)

    print "Iterative merging"
    merged_lists = merging.merge_kplets_within_orders_iterative(merged_lists, loci_threshold)
    print "Iterative merging done. Merged lists:", len(merged_lists)
    fname = os.path.join(data_path, "iterative_merged_"+"%f_"%loci_threshold+kplet_file)
    print "Dumping Iterative merging: ",fname
    t.dump_compressed_pickle(fname, merged_lists)
    print "Completed in:", time.time()-tic, "(s)"

    loci_threshold = 0.9

    print "Starting to merge with loci_threshold:", loci_threshold
    #print "Loading file2genes"
    tic = time.time()
    print "Basic merging"
    merged_lists = merging.basic_merge_within_orders(kplets, loci_threshold)
    print "Basic merging done. Merged lists:", len(merged_lists)
    fname = os.path.join(data_path, "basic_merged_"+"%f_"%loci_threshold+kplet_file)
    print "Dumping basic merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)

    print "Iterative merging"
    merged_lists = merging.merge_kplets_within_orders_iterative(merged_lists, loci_threshold)
    print "Iterative merging done. Merged lists:", len(merged_lists)
    fname = os.path.join(data_path, "iterative_merged_"+"%f_"%loci_threshold+kplet_file)
    print "Dumping Iterative merging: ",fname
    t.dump_compressed_pickle(fname, merged_lists)
    print "Completed in:", time.time()-tic, "(s)"
Example #16
from lib.db import map_id2cdd_clusters
from lib.db.bacteria import pentaplets as p


if __name__=='__main__':

    work_path = os.path.join(gv.project_data_path, 'Bacteria/cases')
    pty_path = gv.pty_data_path

    kplet_id = 306123

    id2cdd = map_id2cdd_clusters()
    kplet = p.get_report_kplet(kplet_id, id2cdd, load_locations=True)

    target_profiles = set(t.bacteria_target_profiles())
    dump_file = os.path.join(work_path, 'kplet.p.bz2')
    t.dump_compressed_pickle(dump_file, kplet)

    kplet = t.load_compressed_pickle(dump_file)
    kplet_codes = kplet.codes.difference(target_profiles)

    org2src, src2blocks = sig.search_kplet_in_genomes(kplet_codes, target_profiles, max_dist=4)

    # dump_file = os.path.join(work_path, 'org2src_global.p.bz2')
    # t.dump_compressed_pickle(dump_file, org2src)
    # dump_file = os.path.join(work_path, 'src2blocks_global.p.bz2')
    # t.dump_compressed_pickle(dump_file, src2blocks)
    #
    # dump_file = os.path.join(work_path, 'org2src_global.p.bz2')
    # org2src = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(work_path, 'src2blocks_global.p.bz2')
    # src2blocks = t.load_compressed_pickle(dump_file)
Example #17
def generate_reports(merged_lists, reports_dir, neighborhood_files_path):

    if not os.path.exists(reports_dir):
        os.mkdir(reports_dir)

    summary_file = os.path.join(reports_dir, 'summary.xls')
    workbook = x.Workbook(summary_file)
    worksheet = workbook.add_worksheet()

    header_format = workbook.add_format()
    header_format.set_font_size(12)
    header_format.set_bold()
    header_format.set_align('center')
    worksheet.set_column(3, 3, 50)
    worksheet.write_row(0, 0,
                        ["File name", "Weight", "Loci", "CRISPR/Cas systems"],
                        header_format)

    file_summary_list = []
    filter_weak_hits = False

    for i, kplet_list in enumerate(merged_lists):

        ret = merging.kplet_list_to_file_summaries(kplet_list,
                                                   neighborhood_files_path,
                                                   filter_weak_hits)

        if not ret or not ret.file_summaries:
            continue

        file_summary_list.append(ret)

    file_summaries_list = sorted(file_summary_list,
                                 key=lambda x: x.weight,
                                 reverse=True)

    ind = 0
    for file_summaries_wrapper in file_summaries_list:
        ind += 1
        t.dump_compressed_pickle('file_summaries_wrapper.p',
                                 file_summaries_wrapper)

        xls_file_name = os.path.join(reports_dir, '%d.xls' % ind)
        args = GenericReportingInput()

        args.xls_file_name = xls_file_name
        args.file_summaries = file_summaries_wrapper.file_summaries
        args.organisms = file_summaries_wrapper.organisms
        args.weight = file_summaries_wrapper.weight
        args.profile_code2def = profile_code2def
        args.local_af_kplet2count = file_summaries_wrapper.kplet2count_af
        args.local_bf_kplet2count = file_summaries_wrapper.kplet2count_bf
        args.local_profile2count_bf = file_summaries_wrapper.profile2count_bf
        args.local_profile2count_af = file_summaries_wrapper.profile2count_af
        args.cas_type2count = file_summaries_wrapper.cas_type2count

        r.write_to_xls_generic_kplets(args)

        cas_type_summary = ""

        for (cas_type,
             count) in sorted(file_summaries_wrapper.cas_type2count.items(),
                              key=itemgetter(1),
                              reverse=True):
            cas_type_summary += "%s : %d ; " % (cas_type, count)

        worksheet.write_row(ind + 1, 0, [
            '%d.xls' % ind, file_summaries_wrapper.weight,
            len(file_summaries_wrapper.file_summaries), cas_type_summary
        ])

    workbook.close()
Example #18
def merging_pipeline_for_order(order, data_path, load_from_db=False):
    limit_to = 1000000000
    print "starting for ", order
    if load_from_db:
        print "Loading kplets from DB"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
            kplets = d.get_report_kplets(profile_id2code, limit_to=limit_to)
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
            kplets = tr.get_report_kplets(profile_id2code, limit_to=limit_to)
            kplets = q.get_report_kplets(profile_id2code, limit_to=limit_to)
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'
            kplets = p.get_report_kplets(profile_id2code, limit_to=limit_to)
    else:
        print "Loading kplets from pickle file"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'

        kplet_file_full = os.path.join(data_path, kplet_file)
        print "Loading :", kplet_file_full
        kplets = t.load_compressed_pickle(kplet_file_full)

    print "No of kplets:", len(kplets)

    loci_threshold = 0.7

    print "Starting to merge with loci_threshold:", loci_threshold
    #print "Loading file2genes"
    tic = time.time()
    print "Basic merging"
    merged_lists = merging.basic_merge_within_orders(kplets, loci_threshold)
    print "Basic merging done. Merged lists:", len(merged_lists)
    fname = os.path.join(data_path,
                         "basic_merged_" + "%f_" % loci_threshold + kplet_file)
    print "Dumping basic merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)

    print "Iterative merging"
    merged_lists = merging.merge_kplets_within_orders_iterative(
        merged_lists, loci_threshold)
    print "Iterative merging done. Merged lists:", len(merged_lists)
    fname = os.path.join(
        data_path, "iterative_merged_" + "%f_" % loci_threshold + kplet_file)
    print "Dumping Iterative merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)
    print "Completed in:", time.time() - tic, "(s)"

    loci_threshold = 0.8

    print "Starting to merge with loci_threshold:", loci_threshold
    #print "Loading file2genes"
    tic = time.time()
    print "Basic merging"
    merged_lists = merging.basic_merge_within_orders(kplets, loci_threshold)
    print "Basic merging done. Merged lists:", len(merged_lists)
    fname = os.path.join(data_path,
                         "basic_merged_" + "%f_" % loci_threshold + kplet_file)
    print "Dumping basic merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)

    print "Iterative merging"
    merged_lists = merging.merge_kplets_within_orders_iterative(
        merged_lists, loci_threshold)
    print "Iterative merging done. Merged lists:", len(merged_lists)
    fname = os.path.join(
        data_path, "iterative_merged_" + "%f_" % loci_threshold + kplet_file)
    print "Dumping Iterative merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)
    print "Completed in:", time.time() - tic, "(s)"

    loci_threshold = 0.9

    print "Starting to merge with loci_threshold:", loci_threshold
    #print "Loading file2genes"
    tic = time.time()
    print "Basic merging"
    merged_lists = merging.basic_merge_within_orders(kplets, loci_threshold)
    print "Basic merging done. Merged lists:", len(merged_lists)
    fname = os.path.join(data_path,
                         "basic_merged_" + "%f_" % loci_threshold + kplet_file)
    print "Dumping basic merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)

    print "Iterative merging"
    merged_lists = merging.merge_kplets_within_orders_iterative(
        merged_lists, loci_threshold)
    print "Iterative merging done. Merged lists:", len(merged_lists)
    fname = os.path.join(
        data_path, "iterative_merged_" + "%f_" % loci_threshold + kplet_file)
    print "Dumping Iterative merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)
    print "Completed in:", time.time() - tic, "(s)"
Example #19
def generate_cas4_gi_summary_file(singles,
                                  cluster_packs,
                                  loci,
                                  reports_dir,
                                  cluster2summary_file_name):

        cluster2summary = {int(l.split('\t')[0]):l.split('\t')[1].strip() for l in open(cluster2summary_file_name)}
        summary_file = open(os.path.join(reports_dir, 'cas4_gi_summary.tab'), 'w')

        cas_filter = set(["cas3", "cas5", "cas8c", "cas7", "cas4", "cas1", "cas2"])

        gi2crispr_type = {}

        ind = 1
        for outer_i in range(len(cluster_packs)):
            (cluster, type2count, entropy, gene2count) = cluster_packs[outer_i]

            sorted_gene2count = sorted(gene2count.items(), key=lambda x: x[1], reverse=True)
            top_genes = set(k for k,v in sorted_gene2count[:5])

            cl_loci = [loci[_i] for _i in cluster]

            cl_summary = cluster2summary[ind]

            for locus in cl_loci:

                cas4_genes = [g for g in locus.genes if g.is_seed]

                cl_genes = set(gene_name for gene in locus.genes for gene_name in gene.gene_name.split(','))

                # cas_genes = ",".join(cas_filter.intersection(cl_genes))
                cas_genes_summary = []

                buffer = []
                for gene in locus.genes:
                    for gene_name in set(gene.gene_name.split(",")):

                        if gene_name in top_genes:

                            buffer.append(gene_name)
                            cas_genes_summary += buffer
                            buffer = []
                            continue

                        buffer.append(gene_name if gene_name else "?")

                cas_genes_summary = "+".join(cas_genes_summary)

                for gene in cas4_genes:
                    summary_file.write("%s\t%s\t%s\t%s\t%s\n" % (gene.gid,
                                                             locus.organism,
                                                             cl_summary,
                                                             cas_genes_summary,
                                                             "%d.xlsx" % ind))

                    gi2crispr_type[gene.gid] = locus.crispr_type

            ind += 1

        for single_ind in singles:

            locus = loci[single_ind]

            cas4_genes = [g for g in locus.genes if g.is_seed]
            cas_genes_summary = []

            buffer = []
            for gene in locus.genes:
                for gene_name in gene.gene_name.split(","):

                    if gene_name in cas_filter:
                        buffer.append(gene_name)
                        cas_genes_summary += buffer
                        buffer = []
                        continue

                    buffer.append(gene_name if gene_name else "?")

            cas_genes_summary = "+".join(cas_genes_summary)

            for gene in cas4_genes:
                summary_file.write("%s\t%s\t%s\t%s\n" % (gene.gid, locus.organism, "singleton", cas_genes_summary))
                gi2crispr_type[gene.gid] = locus.crispr_type

        summary_file.close()

        fname = os.path.join(gv.project_data_path, 'cas4/pickle/cas4_gi2crispr_type.p.bz2')

        t.dump_compressed_pickle(fname, gi2crispr_type)

        return gi2crispr_type
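To make the buffer/flush logic above concrete: gene names are accumulated in a buffer and only flushed into the summary when a recognized gene is reached (a top gene in the cluster loop, a cas gene in the singleton loop), so any trailing unrecognized genes are dropped. A tiny standalone illustration with hypothetical gene names:

gene_names = ["cas1", "xyz", "", "cas2", "abc"]
cas_filter = set(["cas1", "cas2"])

summary, buf = [], []
for name in gene_names:
    if name in cas_filter:
        buf.append(name)
        summary += buf
        buf = []
        continue
    buf.append(name if name else "?")

print("+".join(summary))
# -> "cas1+xyz+?+cas2"; the trailing "abc" is never flushed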
Example #20
from lib.db import map_id2cdd_clusters
from lib.db.bacteria import pentaplets as p

if __name__ == '__main__':

    work_path = os.path.join(gv.project_data_path, 'Bacteria/cases')
    pty_path = gv.pty_data_path

    kplet_id = 306123

    id2cdd = map_id2cdd_clusters()
    kplet = p.get_report_kplet(kplet_id, id2cdd, load_locations=True)

    target_profiles = set(t.bacteria_target_profiles())
    dump_file = os.path.join(work_path, 'kplet.p.bz2')
    t.dump_compressed_pickle(dump_file, kplet)

    kplet = t.load_compressed_pickle(dump_file)
    kplet_codes = kplet.codes.difference(target_profiles)

    org2src, src2blocks = sig.search_kplet_in_genomes(kplet_codes,
                                                      target_profiles,
                                                      max_dist=4)

    # dump_file = os.path.join(work_path, 'org2src_global.p.bz2')
    # t.dump_compressed_pickle(dump_file, org2src)
    # dump_file = os.path.join(work_path, 'src2blocks_global.p.bz2')
    # t.dump_compressed_pickle(dump_file, src2blocks)
    #
    # dump_file = os.path.join(work_path, 'org2src_global.p.bz2')
    # org2src = t.load_compressed_pickle(dump_file)
Example #21
def generate_pickles(save_path, limit_to):

    if not os.path.exists(save_path):
        os.mkdir(save_path)

    # print "Loading kplets from DB"
    # pentaplets  =  p.get_report_kplets(limit_to=limit_to, load_locations=True)
    # quadruplets =  q.get_report_kplets(limit_to=limit_to, load_locations=True)
    # triplets    = tr.get_report_kplets(limit_to=limit_to, load_locations=True)
    # duplets     =  d.get_report_kplets(limit_to=limit_to, load_locations=True)

    # print "Dumping raw kplet data to files"
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'duplets_raw.p.bz2'), 'w')
    # pickle.dump(duplets, dump_file)
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'triplets_raw.p.bz2'), 'w')
    # pickle.dump(triplets, dump_file)
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'quadruplets_raw.p.bz2'), 'w')
    # pickle.dump(quadruplets, dump_file)
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'pentaplets_raw.p.bz2'), 'w')
    # pickle.dump(pentaplets, dump_file)

    print "Loading raw kplets from pickles"
    dump_file = os.path.join(save_path, 'duplets_raw.p.bz2')
    duplets = t.load_compressed_pickle(dump_file)
    dump_file = os.path.join(save_path, 'triplets_raw.p.bz2')
    triplets= t.load_compressed_pickle(dump_file)
    dump_file = os.path.join(save_path, 'quadruplets_raw.p.bz2')
    quadruplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'pentaplets_raw.p.bz2')
    # pentaplets = t.load_compressed_pickle(dump_file)

    print "Basic within merging"
    # pentaplets = merging.basic_merge_within_orders(pentaplets)
    quadruplets= merging.basic_merge_within_orders(quadruplets)
    triplets = merging.basic_merge_within_orders(triplets)
    duplets = merging.basic_merge_within_orders(duplets)

    print "Dumping basic merges"
    # dump_file = os.path.join(save_path, 'pentaplets_basic_merged.p.bz2')
    # t.dump_compressed_pickle(dump_file, pentaplets)
    dump_file = os.path.join(save_path, 'quadruplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, quadruplets)
    dump_file = os.path.join(save_path, 'triplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, triplets)
    dump_file = os.path.join(save_path, 'duplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, duplets)

    # print "Loading basic merges"
    # dump_file = os.path.join(save_path, 'pentaplets_basic_merged.p.bz2')
    # pentaplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'quadruplets_basic_merged.p.bz2')
    # quadruplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'triplets_basic_merged.p.bz2')
    # triplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'duplets_basic_merged.p.bz2')
    # duplets = t.load_compressed_pickle(dump_file)
    sys.exit()

    print 'Starting iterative within mergings'
    pentaplets  = merging.merge_kplets_within_orders_iterative_2(pentaplets)
    quadruplets = merging.merge_kplets_within_orders_iterative_2(quadruplets)
    triplets    = merging.merge_kplets_within_orders_iterative_2(triplets)
    duplets     = merging.merge_kplets_within_orders_iterative_2(duplets)

    print "Dumping iterative merges"
    dump_file = os.path.join(save_path, 'pentaplets_iterative_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, pentaplets)
    dump_file = os.path.join(save_path, 'quadruplets_iterative_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, quadruplets)
    dump_file = os.path.join(save_path, 'triplets_iterative_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, triplets)
    dump_file = os.path.join(save_path, 'duplets_iterative_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, duplets)
    
    sys.exit()
    print 'Dumping merged kplet lists to files'
    dump_file = bz2.BZ2File(os.path.join(save_path, 'pentaplets_merged_within.p.bz2'), 'w')
    pickle.dump(pentaplets, dump_file)
    dump_file = bz2.BZ2File(os.path.join(save_path, 'quadruplets_merged_within.p.bz2'), 'w')
    pickle.dump(quadruplets, dump_file)
    dump_file = bz2.BZ2File(os.path.join(save_path, 'triplets_merged_within.p.bz2'), 'w')
    pickle.dump(triplets, dump_file)
    dump_file = bz2.BZ2File(os.path.join(save_path, 'duplets_merged_within.p.bz2'), 'w')
    pickle.dump(duplets, dump_file)

    print 'Starting across mergings'
    triplets, duplets = merging.merge_kplets_across_orders(triplets, duplets)
    quadruplets, triplets = merging.merge_kplets_across_orders(quadruplets, triplets)
    pentaplets, quadruplets = merging.merge_kplets_across_orders(pentaplets, quadruplets)

    print 'Dumping to files'
    dump_file = bz2.BZ2File(os.path.join(save_path, 'pentaplets_merged_across.p.bz2'), 'w')
    pickle.dump(pentaplets, dump_file)
    dump_file = bz2.BZ2File(os.path.join(save_path, 'quadruplets_merged_across.p.bz2'), 'w')
    pickle.dump(quadruplets, dump_file)
    dump_file = bz2.BZ2File(os.path.join(save_path, 'triplets_merged_across.p.bz2'), 'w')
    pickle.dump(triplets, dump_file)
    dump_file = bz2.BZ2File(os.path.join(save_path, 'duplets_merged_across.p.bz2'), 'w')
    pickle.dump(duplets, dump_file)

    print 'Done for limit_to:', limit_to
    print
    print
Example #22
def prok1603_extract_dendrogram():

    work_dir = os.path.join(gv.project_data_path, 'UvrD/')

    files_path = os.path.join(work_dir, 'prok1603/merged_files/')

    print "Loading loci"

    def_file = os.path.join(gv.project_data_path, 'cas4/profiles/defenseProfiles.tab')
    profile2gene = {}

    for l in open(def_file):
        terms = l.split('\t')
        profile = terms[0]
        gene_names = terms[3].split(',')
        if len(gene_names) > 1:
            profile2gene[profile] = gene_names[1]
        else:
            profile2gene[profile] = gene_names[0]

    cdd_profile2gene = t.map_cdd_profile2gene_name()
    cdd_profile2gene.update(profile2gene)

    loci = [BasicLocus(os.path.join(files_path, f), profile2gene=cdd_profile2gene) for f in
            os.listdir(files_path)]

    prok1603_loci_file = os.path.join(work_dir, 'prok1603/prok1603_loci.p.bz2')
    # loci = t.load_compressed_pickle(prok1603_loci_file)
    print "Loci:", len(loci)
    print "Dumping loci to:", prok1603_loci_file

    t.dump_compressed_pickle(prok1603_loci_file, loci)
    sys.exit()

    tic = time.time()
    print "Generating score matrix"
    M = scores.generate_jackard_score_matrix(loci)

    tac = time.time() - tic
    print "Elapsed time:", float(tac)/60/60, float(tac)/60, float(tac)

    tic = time.time()
    prok1603_jw_file = os.path.join(work_dir, 'prok1603_jw_scores.p')
    print "Dumping JW scores to:", prok1603_jw_file
    with open(prok1603_jw_file, 'wb') as outf:
        cPickle.dump(M, outf, protocol=cPickle.HIGHEST_PROTOCOL)
    tac = time.time() - tic
    print "Elapsed time:", float(tac) / 60 / 60, float(tac) / 60, float(tac)

    tic = time.time()
    prok1603_jw_file = os.path.join(work_dir, 'prok1603_jw_scores.p.bz2')
    print "Dumping JW scores to:", prok1603_jw_file
    t.dump_compressed_pickle(prok1603_jw_file, M)
    tac = time.time() - tic
    print "Elapsed time:", float(tac) / 60 / 60, float(tac) / 60, float(tac)

    # print "Loading JW scores from:", prok1603_jw_file
    # M = t.load_compressed_pickle(prok1603_jw_file)

    tic = time.time()
    prok1603_jw_file = os.path.join(work_dir, 'prok1603/prok1603_jw_scores.npz')
    # prok1603_jw_file = os.path.join('/Users/hudaiber/Projects/NewSystems/data/UvrD/prok1603/prok1603_jw_scores.npz')
    print "Dumping JW scores to:", prok1603_jw_file
    np.savez_compressed(prok1603_jw_file, M)
    tac = time.time() - tic
    print "Elapsed time:", float(tac) / 60 / 60, float(tac) / 60, float(tac)

    # print "Loading JW scores from:", prok1603_jw_file
    # M = t.load_compressed_pickle(prok1603_jw_file)

    prok1603_tree_file = os.path.join(work_dir, 'prok1603/prok1603_upgma.tre')
    print "Generating tree:", prok1603_tree_file
    dnd.convert_score_to_newick(M, [os.path.basename(l.file_name) for l in loci], prok1603_tree_file)
Example #23
def generate_pickles(save_path, limit_to):

    if not os.path.exists(save_path):
        os.mkdir(save_path)

    pentaplets  =  p.get_report_kplets(profile_id2code, limit_to=limit_to, load_locations=True)
    quadruplets =  q.get_report_kplets(profile_id2code, limit_to=limit_to, load_locations=True)
    triplets    = tr.get_report_kplets(profile_id2code, limit_to=limit_to, load_locations=True)
    duplets     =  d.get_report_kplets(profile_id2code, limit_to=limit_to, load_locations=True)

    print "Dumping raw kplet data to files"
    dump_file = os.path.join(save_path, 'duplets_raw.p.bz2')
    t.dump_compressed_pickle(dump_file, duplets)
    dump_file = os.path.join(save_path, 'triplets_raw.p.bz2')
    t.dump_compressed_pickle(dump_file, triplets)
    dump_file = os.path.join(save_path, 'quadruplets_raw.p.bz2')
    t.dump_compressed_pickle(dump_file, quadruplets)
    dump_file = os.path.join(save_path, 'pentaplets_raw.p.bz2')
    t.dump_compressed_pickle(dump_file, pentaplets)

    # print "Loading raw kplets from pickles"
    # dump_file = os.path.join(save_path, 'duplets_raw.p.bz2')
    # duplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'triplets_raw.p.bz2')
    # triplets= t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'quadruplets_raw.p.bz2')
    # quadruplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'pentaplets_raw.p.bz2')
    # pentaplets = t.load_compressed_pickle(dump_file)

    print "Basic within merging"
    pentaplets = merging.basic_merge_within_orders(pentaplets)
    quadruplets= merging.basic_merge_within_orders(quadruplets)
    triplets = merging.basic_merge_within_orders(triplets)
    duplets = merging.basic_merge_within_orders(duplets)

    print "Dumping basic merges"
    dump_file = os.path.join(save_path, 'pentaplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, pentaplets)
    dump_file = os.path.join(save_path, 'quadruplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, quadruplets)
    dump_file = os.path.join(save_path, 'triplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, triplets)
    dump_file = os.path.join(save_path, 'duplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, duplets)
Example #24
def generate_pickles(save_path, limit_to):

    if not os.path.exists(save_path):
        os.mkdir(save_path)

    pentaplets = p.get_report_kplets(profile_id2code,
                                     limit_to=limit_to,
                                     load_locations=True)
    quadruplets = q.get_report_kplets(profile_id2code,
                                      limit_to=limit_to,
                                      load_locations=True)
    triplets = tr.get_report_kplets(profile_id2code,
                                    limit_to=limit_to,
                                    load_locations=True)
    duplets = d.get_report_kplets(profile_id2code,
                                  limit_to=limit_to,
                                  load_locations=True)

    print "Dumping raw kplet data to files"
    dump_file = os.path.join(save_path, 'duplets_raw.p.bz2')
    t.dump_compressed_pickle(dump_file, duplets)
    dump_file = os.path.join(save_path, 'triplets_raw.p.bz2')
    t.dump_compressed_pickle(dump_file, triplets)
    dump_file = os.path.join(save_path, 'quadruplets_raw.p.bz2')
    t.dump_compressed_pickle(dump_file, quadruplets)
    dump_file = os.path.join(save_path, 'pentaplets_raw.p.bz2')
    t.dump_compressed_pickle(dump_file, pentaplets)

    # print "Loading raw kplets from pickles"
    # dump_file = os.path.join(save_path, 'duplets_raw.p.bz2')
    # duplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'triplets_raw.p.bz2')
    # triplets= t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'quadruplets_raw.p.bz2')
    # quadruplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'pentaplets_raw.p.bz2')
    # pentaplets = t.load_compressed_pickle(dump_file)

    print "Basic within merging"
    pentaplets = merging.basic_merge_within_orders(pentaplets)
    quadruplets = merging.basic_merge_within_orders(quadruplets)
    triplets = merging.basic_merge_within_orders(triplets)
    duplets = merging.basic_merge_within_orders(duplets)

    print "Dumping basic merges"
    dump_file = os.path.join(save_path, 'pentaplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, pentaplets)
    dump_file = os.path.join(save_path, 'quadruplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, quadruplets)
    dump_file = os.path.join(save_path, 'triplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, triplets)
    dump_file = os.path.join(save_path, 'duplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, duplets)
Example #25
def generate_cas4_gi_summary_file(singles, cluster_packs, loci, reports_dir,
                                  cluster2summary_file_name):

    cluster2summary = {
        int(l.split('\t')[0]): l.split('\t')[1].strip()
        for l in open(cluster2summary_file_name)
    }
    summary_file = open(os.path.join(reports_dir, 'cas4_gi_summary.tab'), 'w')

    cas_filter = set(["cas3", "cas5", "cas8c", "cas7", "cas4", "cas1", "cas2"])

    gi2crispr_type = {}

    ind = 1
    for outer_i in range(len(cluster_packs)):
        (cluster, type2count, entropy, gene2count) = cluster_packs[outer_i]

        sorted_gene2count = sorted(gene2count.items(),
                                   key=lambda x: x[1],
                                   reverse=True)
        top_genes = set(k for k, v in sorted_gene2count[:5])

        cl_loci = [loci[_i] for _i in cluster]

        cl_summary = cluster2summary[ind]

        for locus in cl_loci:

            cas4_genes = [g for g in locus.genes if g.is_seed]

            cl_genes = set(gene_name for gene in locus.genes
                           for gene_name in gene.gene_name.split(','))

            # cas_genes = ",".join(cas_filter.intersection(cl_genes))
            cas_genes_summary = []

            buffer = []
            for gene in locus.genes:
                for gene_name in set(gene.gene_name.split(",")):

                    if gene_name in top_genes:

                        buffer.append(gene_name)
                        cas_genes_summary += buffer
                        buffer = []
                        continue

                    buffer.append(gene_name if gene_name else "?")

            cas_genes_summary = "+".join(cas_genes_summary)

            for gene in cas4_genes:
                summary_file.write("%s\t%s\t%s\t%s\t%s\n" %
                                   (gene.gid, locus.organism, cl_summary,
                                    cas_genes_summary, "%d.xlsx" % ind))

                gi2crispr_type[gene.gid] = locus.crispr_type

        ind += 1

    for single_ind in singles:

        locus = loci[single_ind]

        cas4_genes = [g for g in locus.genes if g.is_seed]
        cas_genes_summary = []

        buffer = []
        for gene in locus.genes:
            for gene_name in gene.gene_name.split(","):

                if gene_name in cas_filter:
                    buffer.append(gene_name)
                    cas_genes_summary += buffer
                    buffer = []
                    continue

                buffer.append(gene_name if gene_name else "?")

        cas_genes_summary = "+".join(cas_genes_summary)

        for gene in cas4_genes:
            summary_file.write(
                "%s\t%s\t%s\t%s\n" %
                (gene.gid, locus.organism, "singleton", cas_genes_summary))
            gi2crispr_type[gene.gid] = locus.crispr_type

    summary_file.close()

    fname = os.path.join(gv.project_data_path,
                         'cas4/pickle/cas4_gi2crispr_type.p.bz2')

    t.dump_compressed_pickle(fname, gi2crispr_type)

    return gi2crispr_type
Example #26
def generate_pickles(save_path, limit_to):

    if not os.path.exists(save_path):
        os.mkdir(save_path)

    # print "Loading kplets from DB"
    # pentaplets  =  p.get_report_kplets(limit_to=limit_to, load_locations=True)
    # quadruplets =  q.get_report_kplets(limit_to=limit_to, load_locations=True)
    # triplets    = tr.get_report_kplets(limit_to=limit_to, load_locations=True)
    # duplets     =  d.get_report_kplets(limit_to=limit_to, load_locations=True)

    # print "Dumping raw kplet data to files"
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'duplets_raw.p.bz2'), 'w')
    # pickle.dump(duplets, dump_file)
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'triplets_raw.p.bz2'), 'w')
    # pickle.dump(triplets, dump_file)
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'quadruplets_raw.p.bz2'), 'w')
    # pickle.dump(quadruplets, dump_file)
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'pentaplets_raw.p.bz2'), 'w')
    # pickle.dump(pentaplets, dump_file)

    print "Loading raw kplets from pickles"
    dump_file = os.path.join(save_path, 'duplets_raw.p.bz2')
    duplets = t.load_compressed_pickle(dump_file)
    dump_file = os.path.join(save_path, 'triplets_raw.p.bz2')
    triplets = t.load_compressed_pickle(dump_file)
    dump_file = os.path.join(save_path, 'quadruplets_raw.p.bz2')
    quadruplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'pentaplets_raw.p.bz2')
    # pentaplets = t.load_compressed_pickle(dump_file)

    print "Basic within merging"
    # pentaplets = merging.basic_merge_within_orders(pentaplets)
    quadruplets = merging.basic_merge_within_orders(quadruplets)
    triplets = merging.basic_merge_within_orders(triplets)
    duplets = merging.basic_merge_within_orders(duplets)

    print "Dumping basic merges"
    # dump_file = os.path.join(save_path, 'pentaplets_basic_merged.p.bz2')
    # t.dump_compressed_pickle(dump_file, pentaplets)
    dump_file = os.path.join(save_path, 'quadruplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, quadruplets)
    dump_file = os.path.join(save_path, 'triplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, triplets)
    dump_file = os.path.join(save_path, 'duplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, duplets)

    # print "Loading basic merges"
    # dump_file = os.path.join(save_path, 'pentaplets_basic_merged.p.bz2')
    # pentaplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'quadruplets_basic_merged.p.bz2')
    # quadruplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'triplets_basic_merged.p.bz2')
    # triplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'duplets_basic_merged.p.bz2')
    # duplets = t.load_compressed_pickle(dump_file)
    sys.exit()

    print 'Starting iterative within mergings'
    pentaplets = merging.merge_kplets_within_orders_iterative_2(pentaplets)
    quadruplets = merging.merge_kplets_within_orders_iterative_2(quadruplets)
    triplets = merging.merge_kplets_within_orders_iterative_2(triplets)
    duplets = merging.merge_kplets_within_orders_iterative_2(duplets)

    print "Dumping iterative merges"
    dump_file = os.path.join(save_path, 'pentaplets_iterative_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, pentaplets)
    dump_file = os.path.join(save_path, 'quadruplets_iterative_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, quadruplets)
    dump_file = os.path.join(save_path, 'triplets_iterative_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, triplets)
    dump_file = os.path.join(save_path, 'duplets_iterative_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, duplets)

    sys.exit()
    print 'Dumping merged kplet lists to files'
    dump_file = bz2.BZ2File(
        os.path.join(save_path, 'pentaplets_merged_within.p.bz2'), 'w')
    pickle.dump(pentaplets, dump_file)
    dump_file = bz2.BZ2File(
        os.path.join(save_path, 'quadruplets_merged_within.p.bz2'), 'w')
    pickle.dump(quadruplets, dump_file)
    dump_file = bz2.BZ2File(
        os.path.join(save_path, 'triplets_merged_within.p.bz2'), 'w')
    pickle.dump(triplets, dump_file)
    dump_file = bz2.BZ2File(
        os.path.join(save_path, 'duplets_merged_within.p.bz2'), 'w')
    pickle.dump(duplets, dump_file)

    print 'Starting across mergings'
    triplets, duplets = merging.merge_kplets_across_orders(triplets, duplets)
    quadruplets, triplets = merging.merge_kplets_across_orders(
        quadruplets, triplets)
    pentaplets, quadruplets = merging.merge_kplets_across_orders(
        pentaplets, quadruplets)

    print 'Dumping to files'
    dump_file = bz2.BZ2File(
        os.path.join(save_path, 'pentaplets_merged_across.p.bz2'), 'w')
    pickle.dump(pentaplets, dump_file)
    dump_file = bz2.BZ2File(
        os.path.join(save_path, 'quadruplets_merged_across.p.bz2'), 'w')
    pickle.dump(quadruplets, dump_file)
    dump_file = bz2.BZ2File(
        os.path.join(save_path, 'triplets_merged_across.p.bz2'), 'w')
    pickle.dump(triplets, dump_file)
    dump_file = bz2.BZ2File(
        os.path.join(save_path, 'duplets_merged_across.p.bz2'), 'w')
    pickle.dump(duplets, dump_file)

    print 'Done for limit_to:', limit_to
    print
    print
Example #27
def merging_pipeline_for_order(order, load_from_db=False):
    limit_to = 1000000000
    print "starting for ", order
    if load_from_db:
        print "Loading kplets from DB"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
            kplets = d.get_report_kplets(profile_id2code, limit_to=limit_to)
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
            kplets = tr.get_report_kplets(profile_id2code, limit_to=limit_to)
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
            kplets = q.get_report_kplets(profile_id2code, limit_to=limit_to)
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'
            kplets = p.get_report_kplets(profile_id2code, limit_to=limit_to)
    else:
        print "Loading kplets from pickle file"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'
        kplet_file_full = os.path.join(data_path, kplet_file)
        print kplet_file_full
        kplets = t.load_compressed_pickle(kplet_file_full)

    # print "Starting for", kplet_file
    # print "Loading kplets"
    # kplets = t.load_compressed_pickle(os.path.join(data_path, kplet_file))
    print "No of kplets:", len(kplets)
    
    #print "Loading file2genes"

    #_file2genes = {}
    #for _f in os.listdir(neighborhood_files_path):
    #    _file2genes[_f] = dt.get_wgs_file(os.path.join(neighborhood_files_path, _f))

    # print 'Filtering'
    # kplets = filter_seed(kplets, _file2genes)
    # print "No of kplets:", len(kplets)
    # fname = os.path.join(data_path,  kplet_file.split('.')[0]+'_seed.p.bz2')
    # print 'Dumping', fname
    # t.dump_compressed_pickle(fname, kplets)

    print "Basic merging"
    merged_lists = merging.basic_merge_within_orders(kplets)
    fname = os.path.join(data_path, "basic_merged_"+kplet_file)
    print "Dumping basic merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)

    print "Iterative merging"
    merged_lists = merging.merge_kplets_within_orders_iterative(merged_lists)
    fname = os.path.join(data_path, "iterative_merged_"+kplet_file)
    print "Dumping Iterative merging: ",fname
    t.dump_compressed_pickle(fname, merged_lists)