def generate_plots_from_pickle(limit_to, report_dir, target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path): data_path = os.path.join(gv.project_data_path, 'Archea/pickle/') fname = os.path.join(data_path, str(limit_to), 'pentaplets_merged_within.p.bz2') pentaplets = t.load_compressed_pickle(fname) # fname = os.path.join(data_path, str(limit_to), 'quadruplets_merged_within.p.bz2') # quadruplets = t.load_compressed_pickle(fname) # fname = os.path.join(data_path, str(limit_to), 'triplets_merged_within.p.bz2') # triplets = t.load_compressed_pickle(fname) # fname = os.path.join(data_path, str(limit_to), 'duplets_merged_within.p.bz2') # duplets = t.load_compressed_pickle(fname) print 'Generationg reports for within orders merged lists' report_files_dir = os.path.join(gv.project_data_path, 'Archea/reports/merged_within_orders/', report_dir) if not os.path.exists(report_files_dir): os.mkdir(report_files_dir) for i, kplet_pool in zip([5, 4, 3, 2], [pentaplets, pentaplets, pentaplets, pentaplets]): print 'Starting for', i j = 0 profile2counts_pool = dist.get_flank_distributions(kplet_pool, neighborhood_files_path, target_profiles) for k in range(len(kplet_pool)): kplet_sublist = kplet_pool[k] cur_reports_folder = os.path.join(report_files_dir, str(i)) if not os.path.exists(cur_reports_folder): os.mkdir(cur_reports_folder) src2org, file_summaries, community, community_count, community_count_with_flanks = \ merging.merge_into_file_summaries(kplet_sublist, neighborhood_files_path, file2src_src2org_map, 'archaea') if not src2org: continue xls_file_name = os.path.join(cur_reports_folder, "%d_%d.xls" % (j+1, i)) class2counts, class2profiles = merging.arcog_profile_count_into_class_count(community_count) class2counts_flank, class2profiles_flank = merging.arcog_profile_count_into_class_count(community_count_with_flanks) profile2counts = profile2counts_pool[k] j += 1 params = dict() params['xls_file_name'] = xls_file_name params['src2org'] = src2org 
params['file_summaries'] = file_summaries params['community'] = community params['target_profiles'] = target_profiles params['profile2def'] = profile2def params['gid2arcog_cdd'] = gid2arcog_cdd params['class2counts'] = class2counts params['class2profiles'] = class2profiles params['class2counts_flank'] = class2counts_flank params['profile2counts'] = profile2counts r.write_to_xls(params)
def generate_plots_from_pickle(limit_to, report_dir, target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path): data_path = os.path.join(gv.project_data_path, 'Bacteria/pickle/') fname = os.path.join(data_path, str(limit_to), 'pentaplets_merged_within.p.bz2') pentaplets = t.load_compressed_pickle(fname) fname = os.path.join(data_path, str(limit_to), 'quadruplets_merged_within.p.bz2') quadruplets = t.load_compressed_pickle(fname) fname = os.path.join(data_path, str(limit_to), 'triplets_merged_within.p.bz2') triplets = t.load_compressed_pickle(fname) fname = os.path.join(data_path, str(limit_to), 'duplets_merged_within.p.bz2') duplets = t.load_compressed_pickle(fname) print 'Generationg reports for across orders merged lists' flank_report_fnames = ['pentaplets_flank_counts.xls', 'quadruplets_flank_counts.xls', \ 'triplets_flank_counts.xls', 'duplets_flank_counts.xls'] kplets = [pentaplets, quadruplets, triplets, duplets] titles = ['Pentaplets', 'Quadruplets', 'Triplets', 'Duplets'] report_files_dir = os.path.join(gv.project_data_path, 'Bacteria/reports/merged_within_orders/', report_dir) for i in range(len(flank_report_fnames)): _fname = flank_report_fnames[i] _kplet_list = kplets[i] _title = titles[i] flank_counts, cog2gids, gid2weight = dist.get_flank_distributions( _kplet_list, neighborhood_files_path, target_profiles) universal_flank_counts = t.merge_dict_set_list(cog2gids, gid2weight) params = dict() params['xls_file_name'] = os.path.join(report_files_dir, _fname) params['profile2def'] = profile2def params['flank_counts'] = universal_flank_counts params['title'] = _title params['target_profiles'] = target_profiles r.write_flanking_count_xls(params) sys.exit() if not os.path.exists(report_files_dir): os.mkdir(report_files_dir) for i, kplet_pool in zip([5, 4, 3, 2], [pentaplets, quadruplets, triplets, duplets]): print i j = 0 # flank_counts_pool = dist.get_flank_distributions(kplet_pool, neighborhood_files_path, target_profiles) for k in 
range(len(kplet_pool)): kplet_sublist = kplet_pool[k] cur_reports_folder = os.path.join(report_files_dir, str(i)) if not os.path.exists(cur_reports_folder): os.mkdir(cur_reports_folder) src2org, file_summaries, community, community_count, community_count_with_flanks\ = merging.merge_into_file_summaries(kplet_sublist, neighborhood_files_path, file2src_src2org_map) if not src2org: continue xls_file_name = os.path.join(cur_reports_folder, "%d_%d.xls" % (j + 1, i)) class2counts, class2profiles = merging.cdd_profile_count_into_class_count( community_count) class2counts_flank, class2profiles_flank = merging.cdd_profile_count_into_class_count( community_count_with_flanks) # profile2counts = flank_counts_pool[k] j += 1 params = dict() params['xls_file_name'] = xls_file_name params['src2org'] = src2org params['file_summaries'] = file_summaries params['community'] = community params['target_profiles'] = target_profiles params['profile2def'] = profile2def params['gid2arcog_cdd'] = gid2arcog_cdd params['class2counts'] = class2counts params['class2profiles'] = class2profiles params['class2counts_flank'] = class2counts_flank # params['profile2counts'] = profile2counts r.write_to_xls(params)
# Fragment of a larger routine (its header is not visible here).
# Loads 'pentaplets_merged_across.p.bz2' and, for each kplet sublist, merges
# it into file summaries, skips sublists whose src2org is empty, derives the
# arCOG class counts with and without flanks, prints the target .xls path and
# starts filling the report params dict.
# NOTE(review): the statements are collapsed onto one physical line and the
# fragment stops after params['src2org']; restore the original layout and the
# rest of the loop body before relying on this code.
fname = os.path.join(data_path, str(limit_to), 'pentaplets_merged_across.p.bz2') pentaplets = t.load_compressed_pickle(fname) report_dir = 'all' report_files_dir = os.path.join(gv.project_data_path, 'Archea/reports/merged_across_orders/', report_dir) j = 0 for kplet_sublist in pentaplets: cur_reports_folder = os.path.join(report_files_dir, str(5)) src2org, file_summaries, community, community_count, community_count_with_flanks = \ merging.merge_into_file_summaries(kplet_sublist, neighborhood_files_path, file2src_src2org_map, 'archaea') if not src2org: continue community_classes = merging.arcog_profile_count_into_class_count( community_count) community_flank_classes = merging.arcog_profile_count_into_class_count( community_count_with_flanks) xls_file_name = os.path.join(cur_reports_folder, "%d_%d.xls" % (j + 1, 5)) print xls_file_name j += 1 params = dict() params['xls_file_name'] = xls_file_name params['src2org'] = src2org
# Fragment of a larger routine (its header and enclosing loops are not
# visible here).
# First part: when _kplet shares exactly 4 codes with pivot, collects the
# organisms of _kplet's files and prints file/organism overlap counts against
# the pivot together with _kplet.codes.
# Second part: computes flank distributions for the pentaplets and, for each
# sublist, merges it into file summaries, skips sublists whose src2org is
# empty, derives arCOG class counts with and without flanks, prints the
# target .xls path and starts filling the report params dict.
# NOTE(review): the statements are collapsed onto one physical line and the
# fragment stops after params['src2org']; the original nesting has to be
# restored from the full file.
if len(pivot.codes.intersection(_kplet.codes)) == 4: _orgs = set([file2organism[f] for f in _kplet.files]) print i+1, len(set(pivot.files).intersection(set(_kplet.files))), len(_kplet.files), len(pivot_orgs), len(_orgs), len(pivot_orgs.intersection(_orgs)), _kplet.codes profile2counts_pool = dist.get_flank_distributions(pentaplets, neighborhood_files_path, target_profiles) j = 0 for k in range(len(pentaplets)): kplet_sublist = pentaplets[k] cur_reports_folder = os.path.join(report_files_dir, str(5)) src2org, file_summaries, community, community_count, community_count_with_flanks = \ merging.merge_into_file_summaries(kplet_sublist, neighborhood_files_path, file2src_src2org_map, 'archaea') if not src2org: continue class2counts, class2profiles = merging.arcog_profile_count_into_class_count(community_count) class2counts_flank, class2profiles_flank = merging.arcog_profile_count_into_class_count(community_count_with_flanks) profile2counts = profile2counts_pool[k] xls_file_name = os.path.join(cur_reports_folder, "%d_%d.xls" % (j+1, 5)) print xls_file_name j += 1 params = dict() params['xls_file_name'] = xls_file_name params['src2org'] = src2org
profiles = {} for kplet in merged_kplets[0]: for code in kplet.codes: if code in target_profiles: if code in profiles: profiles[code] += 1 else: profiles[code] = 1 fname = '/Users/hudaiber/Projects/NewSystems/data/Bacteria/pickle/100000/merged_kplets.p.bz2' dest_dir = '/Users/hudaiber/Projects/NewSystems/data/Bacteria/reports/tmp/' for cnt, kplet_sublist in enumerate(merged_kplets): print cnt + 1 xls_file_name = os.path.join(dest_dir, "%d.xls" % (cnt + 1)) src2org, file_summaries, community, community_count = merging.merge_into_file_summaries( kplet_sublist, neighborhood_files_path, file2src_src2org_map) params = {} params['xls_file_name'] = xls_file_name params['src2org'] = src2org params['file_summaries'] = file_summaries params['community'] = community params['target_profiles'] = target_profiles params['profile2def'] = profile2def params['gid2arcog_cdd'] = gid2arcog_cdd r.write_to_xls(params) if cnt == 200: break
profiles = {} for kplet in merged_kplets[0]: for code in kplet.codes: if code in target_profiles: if code in profiles: profiles[code] += 1 else: profiles[code] = 1 fname = '/Users/hudaiber/Projects/NewSystems/data/Bacteria/pickle/100000/merged_kplets.p.bz2' dest_dir = '/Users/hudaiber/Projects/NewSystems/data/Bacteria/reports/tmp/' for cnt, kplet_sublist in enumerate(merged_kplets): print cnt+1 xls_file_name = os.path.join(dest_dir, "%d.xls" % (cnt+1)) src2org, file_summaries, community, community_count = merging.merge_into_file_summaries(kplet_sublist, neighborhood_files_path, file2src_src2org_map) params = {} params['xls_file_name'] = xls_file_name params['src2org'] = src2org params['file_summaries'] = file_summaries params['community'] = community params['target_profiles'] = target_profiles params['profile2def'] = profile2def params['gid2arcog_cdd'] = gid2arcog_cdd r.write_to_xls(params) if cnt == 200: break
def generate_plots_from_pickle(limit_to, report_dir, target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path): data_path = os.path.join(gv.project_data_path, 'Bacteria/pickle/') fname = os.path.join(data_path, str(limit_to), 'pentaplets_merged_within.p.bz2') pentaplets = t.load_compressed_pickle(fname) fname = os.path.join(data_path, str(limit_to), 'quadruplets_merged_within.p.bz2') quadruplets = t.load_compressed_pickle(fname) fname = os.path.join(data_path, str(limit_to), 'triplets_merged_within.p.bz2') triplets = t.load_compressed_pickle(fname) fname = os.path.join(data_path, str(limit_to), 'duplets_merged_within.p.bz2') duplets = t.load_compressed_pickle(fname) print 'Generationg reports for across orders merged lists' flank_report_fnames = ['pentaplets_flank_counts.xls', 'quadruplets_flank_counts.xls', \ 'triplets_flank_counts.xls', 'duplets_flank_counts.xls'] kplets = [pentaplets, quadruplets, triplets, duplets] titles = ['Pentaplets', 'Quadruplets', 'Triplets', 'Duplets'] report_files_dir = os.path.join(gv.project_data_path, 'Bacteria/reports/merged_within_orders/', report_dir) for i in range(len(flank_report_fnames)): _fname = flank_report_fnames[i] _kplet_list = kplets[i] _title = titles[i] flank_counts, cog2gids, gid2weight = dist.get_flank_distributions(_kplet_list, neighborhood_files_path, target_profiles) universal_flank_counts = t.merge_dict_set_list(cog2gids, gid2weight) params = dict() params['xls_file_name'] = os.path.join(report_files_dir, _fname) params['profile2def'] = profile2def params['flank_counts'] = universal_flank_counts params['title'] = _title params['target_profiles'] = target_profiles r.write_flanking_count_xls(params) sys.exit() if not os.path.exists(report_files_dir): os.mkdir(report_files_dir) for i, kplet_pool in zip([5, 4, 3, 2], [pentaplets, quadruplets, triplets, duplets]): print i j = 0 # flank_counts_pool = dist.get_flank_distributions(kplet_pool, neighborhood_files_path, target_profiles) for k in 
range(len(kplet_pool)): kplet_sublist = kplet_pool[k] cur_reports_folder = os.path.join(report_files_dir, str(i)) if not os.path.exists(cur_reports_folder): os.mkdir(cur_reports_folder) src2org, file_summaries, community, community_count, community_count_with_flanks\ = merging.merge_into_file_summaries(kplet_sublist, neighborhood_files_path, file2src_src2org_map) if not src2org: continue xls_file_name = os.path.join(cur_reports_folder, "%d_%d.xls" % (j+1, i)) class2counts, class2profiles = merging.cdd_profile_count_into_class_count(community_count) class2counts_flank, class2profiles_flank = merging.cdd_profile_count_into_class_count(community_count_with_flanks) # profile2counts = flank_counts_pool[k] j += 1 params = dict() params['xls_file_name'] = xls_file_name params['src2org'] = src2org params['file_summaries'] = file_summaries params['community'] = community params['target_profiles'] = target_profiles params['profile2def'] = profile2def params['gid2arcog_cdd'] = gid2arcog_cdd params['class2counts'] = class2counts params['class2profiles'] = class2profiles params['class2counts_flank'] = class2counts_flank # params['profile2counts'] = profile2counts r.write_to_xls(params)
if not os.path.exists(reports_dir):
    os.mkdir(reports_dir)

# Report at most the first 1000 kplets, skipping those without any target
# profile. File names carry a running number plus one letter per code:
# 's' when the code is a target profile, 'n' otherwise.
ind = 0
for kplet in kplet_list[:1000]:
    codes = list(kplet.codes)
    suffix = ['s' if code in target_profiles else 'n' for code in codes]
    if 's' not in suffix:
        continue

    ind += 1
    xls_file_name = os.path.join(reports_dir, '%d_%s.xls' % (ind, "".join(suffix)))

    src2org, file_summaries, community, community_count, community_count_with_flanks, weight = \
        merging.merge_into_file_summaries([kplet],
                                          neighborhood_files_path,
                                          file2src_src2org_map)

    params = {
        'xls_file_name': xls_file_name,
        'src2org': src2org,
        'file_summaries': file_summaries,
        'target_profiles': target_profiles,
        'profile2def': profile2def,
        'gid2arcog_cdd': gid2arcog_cdd,
    }
    r.write_to_xls_raw_kplet(params)
# profile_id2code = map_id2cdd() # pickle.dump(profile_id2code, open('profile_id2code.p','w')) profile_id2code = cPickle.load(open('/Users/hudaiber/Projects/NewSystems/code/Bacteria/profile_id2code.p')) # fname = '/Users/hudaiber/Projects/NewSystems/data/Bacteria/pickle/100000/pentaplets_merged_across.p.bz2' fname = '/Users/hudaiber/Projects/NewSystems/data/Bacteria/pickle/100000/merged_kplets.p.bz2' f = bz2.BZ2File(fname, 'rb') merged_kplets = cPickle.load(f) dest_dir = '/Users/hudaiber/Projects/NewSystems/data/Bacteria/reports/tmp/' print 'starting reports' for cnt, kplet_sublist in enumerate(merged_kplets): print cnt+1 xls_file_name = os.path.join(dest_dir, "%d.xls" % (cnt+1)) src2org, file_summaries, community = merging.merge_into_file_summaries(kplet_sublist, neighborhood_files_path, file2src_src2org_map, 'bacteria') params = {} params['xls_file_name'] = xls_file_name params['src2org'] = src2org params['file_summaries'] = file_summaries params['community'] = community params['target_profiles'] = target_profiles params['profile2def'] = profile2def params['gid2arcog_cdd'] = gid2arcog_cdd r.write_to_xls(params) break if cnt == 200: break