def prok1603_architecture_frequencies():
    """Print a per-profile frequency table for the prok1603 loci.

    Reads the locus weights and the defense profile table from the project
    data directory, loads the pickled loci, groups the loci by the profiles
    they contain and prints one tab-separated row per profile, ordered by
    decreasing number of loci:

        profile, gene name, number of loci, summed locus weight, definition

    Gene name and definition are left empty when the profile is unknown to
    both the CDD maps and the defense profile table.
    """
    work_dir = os.path.join(gv.project_data_path, 'UvrD/')

    # Weights file: first whitespace-separated column is the locus file name,
    # second column is its weight.
    map_file = os.path.join(work_dir, 'prok1603/prok1603_weights.txt')
    locus2weight = {}
    with open(map_file) as map_handle:
        for line in map_handle:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines instead of raising IndexError
            locus2weight[fields[0]] = float(fields[1])

    def_file = os.path.join(gv.project_data_path,
                            'cas4/profiles/defenseProfiles.tab')
    profile2gene = {}
    profile2def = {}
    with open(def_file) as def_handle:
        for line in def_handle:
            terms = line.strip().split('\t')
            profile = terms[0]
            gene_names = terms[3].split(',')
            # When several gene names are listed, the second one is used.
            if len(gene_names) > 1:
                profile2gene[profile] = gene_names[1]
            else:
                profile2gene[profile] = gene_names[0]
            profile2def[profile] = terms[4]

    # Entries from the defense profile table override the generic CDD maps.
    cdd_profile2gene = t.map_cdd_profile2gene_name()
    cdd_profile2gene.update(profile2gene)
    cdd_profile2def = t.map_cdd_profile2def()
    cdd_profile2def.update(profile2def)

    # NOTE(review): the original literal was
    # 'prok1603The CRISPR/prok1603_loci.p.bz2', which looks like a corrupted
    # directory name. The weights file above lives in 'prok1603/', so the
    # same directory is assumed here -- confirm against the data layout.
    prok1603_loci_file = os.path.join(work_dir,
                                      'prok1603/prok1603_loci.p.bz2')
    loci = t.load_compressed_pickle(prok1603_loci_file)

    profile2loci = {}
    for locus in loci:
        for _profile in locus.profiles:
            profile2loci.setdefault(_profile, []).append(locus)

    ranked = sorted(profile2loci.items(),
                    key=lambda item: len(item[1]),
                    reverse=True)
    # A separate loop name keeps the loaded `loci` list from being rebound.
    for profile, profile_loci in ranked:
        _weight = sum(locus2weight[locus.base_file_name]
                      for locus in profile_loci)
        print("%s\t%s\t%d\t%f\t%s" % (profile,
                                      cdd_profile2gene.get(profile, ""),
                                      len(profile_loci),
                                      _weight,
                                      cdd_profile2def.get(profile, "")))
# print 'Dumping to files' # dump_file = bz2.BZ2File(os.path.join(save_path, 'pentaplets_merged_across.p.bz2'), 'w') # pickle.dump(pentaplets, dump_file) # dump_file = bz2.BZ2File(os.path.join(save_path, 'quadruplets_merged_across.p.bz2'), 'w') # pickle.dump(quadruplets, dump_file) # dump_file = bz2.BZ2File(os.path.join(save_path, 'triplets_merged_across.p.bz2'), 'w') # pickle.dump(triplets, dump_file) # dump_file = bz2.BZ2File(os.path.join(save_path, 'duplets_merged_across.p.bz2'), 'w') # pickle.dump(duplets, dump_file) if __name__ == '__main__': print 'Pre-Loading dictionaries' target_profiles = t.bacteria_target_profiles() profile2def = t.map_cdd_profile2def() gid2arcog_cdd = t.map_gid2arcog_cdd() neighborhood_files_path = neighborhoods_path() profile_id2code = map_id2cdd() # for limit_to, report_dir in zip([300, 500, 1000, 100000],['top_300', 'top_500', 'top_1000', 'top_100000']): # # print "Limit_to:", limit_to # print # generate_plots(limit_to, report_dir, target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path, profile_id2code) # print 'Done' # print "------------------------" data_path = os.path.join(gv.project_data_path, 'Bacteria/pickle/') print 'Generating pickles'
def calculate_profile_based_crispricity(cas1402_loci, cas1402_gis, cas1402_organisms, prok1402_path_file):
    """Compute count-based crispricity per profile, write a table and a plot.

    For every profile id found in ``gene.cogid`` of the genes in
    ``cas1402_loci``, the number of occurrences inside the loci and the
    number of occurrences recorded in the global map are summed over
    ``cas1402_organisms``. Crispricity is ``in_crispr / (in_crispr + outside)``.

    Args:
        cas1402_loci: iterable of loci; each locus is an iterable of genes
            exposing ``cogid`` (whitespace-separated profile ids) and
            ``organism``.
        cas1402_gis: passed through to ``load_maps_simple``.
        cas1402_organisms: organisms for which counters are created.
        prok1402_path_file: passed through to ``load_maps_simple``.

    Side effects:
        Writes ``cas1402/crispricity_count.tab`` under
        ``gv.project_data_path`` and saves a scatter plot to
        ``first_count.png`` in the current working directory.
    """
    print("Loding global maps")
    global_profile2orgs2gis = load_maps_simple(prok1402_path_file, cas1402_gis)

    # The genome weights are intentionally not loaded: this is the
    # count-based variant and nothing below uses them.
    print("Loading CDD definitions")
    profile2def = t.map_cdd_profile2def()

    print("Counting in CRISPR loci")
    profile2orgs2obj = {}
    for locus in cas1402_loci:
        for gene in locus:
            for _cogid in gene.cogid.split():
                if _cogid not in profile2orgs2obj:
                    # First sighting of this profile: create one counter per
                    # organism, pre-filled with the occurrences recorded in
                    # the global map.
                    if _cogid in global_profile2orgs2gis:
                        orgs2gis = global_profile2orgs2gis[_cogid]
                    else:
                        orgs2gis = {}
                    counters = {}
                    for _org in cas1402_organisms:
                        _orgObj = ProfileInOrganismCount(_org, _cogid)
                        _orgObj.outside = len(orgs2gis[_org]) if _org in orgs2gis else 0
                        counters[_org] = _orgObj
                    profile2orgs2obj[_cogid] = counters
                profile2orgs2obj[_cogid][gene.organism].in_crispr += 1

    out_file = os.path.join(gv.project_data_path, 'cas1402/crispricity_count.tab')

    in_crispr_all = []
    crispricity_all = []

    print("Writing to file: %s" % out_file)
    with open(out_file, 'w') as outf:
        outf.write("Profile\tOccurrence in CRISPR loci\tCrispricity\tDefinition\n")
        for profile, counters in profile2orgs2obj.items():
            in_crispr = 0
            everywhere = 0
            for _org in counters.values():
                in_crispr += _org.in_crispr
                everywhere += _org.in_crispr + _org.outside

            if everywhere == 0:
                # Nothing counted: avoid a zero division and log10(0) below.
                continue

            # float() forces true division; with integer counters the
            # Python 2 "/" operator would floor the ratio to 0 or 1.
            crispricity = float(in_crispr) / everywhere

            in_crispr_all.append(in_crispr)
            crispricity_all.append(crispricity)
            outf.write("%s\t%f\t%f\t%s\n" % (profile, in_crispr, crispricity, profile2def[profile]))

    in_crispr_all = np.log10(np.asarray(in_crispr_all))
    crispricity_all = np.asarray(crispricity_all)

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(in_crispr_all, crispricity_all, s=1)
    plt.xlabel("Effective orcurrence in CRISPR loci (log10)")
    plt.ylabel("X-axis / Effective occurrences")
    plt.savefig('first_count.png')
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
def calculate_profile_based_baiticity(bacteria_loci, loci_gis, loci_organisms, arcog_path_file, bait_profiles, filter_threshold, save_path):
    """Compute count- and weight-based baiticity per profile.

    For every non-bait profile id found in ``gene.cogid`` of the genes in
    ``bacteria_loci`` (each ``gene.gid`` is counted once), occurrences inside
    the loci and occurrences recorded in the global map are summed over
    ``loci_organisms``, both as plain counts and multiplied by the genome
    weight. Baiticity is ``in_locus / (in_locus + outside)``.

    Args:
        bacteria_loci: iterable of loci; each locus is an iterable of genes
            exposing ``gid``, ``cogid`` and ``organism``.
        loci_gis: passed through to ``load_maps_simple``.
        loci_organisms: organisms for which counters are created.
        arcog_path_file: passed through to ``load_maps_simple``.
        bait_profiles: profile ids that are ignored while counting.
        filter_threshold: profiles whose weighted total occurrence is below
            this value go to ``rare_profiles.tab`` instead of the main table.
        save_path: directory receiving ``baiticity.tab``,
            ``rare_profiles.tab`` and ``baiticity.png``.
    """
    # Hard-coded exclusions carried over unchanged; the reason for them is
    # not documented in this block.
    skipped_profiles = ('arCOG14077',)
    skipped_organisms = ('Nitrosoarchaeum_koreensis_MY1_MY1',
                         'Nitrosoarchaeum_limnia_SFB1')

    print("Loding global maps")
    global_profile2orgs2gis = load_maps_simple(arcog_path_file, loci_gis)
    print("Loading weights")
    gnm2weight = t.map_genome2weight()
    print("Loading CDD definitions")
    profile2def = t.map_cdd_profile2def()
    profile2def.update(t.map_profile2def())

    print("Counting in loci")
    profile2orgs2obj = {}
    gi_checklist = set()
    for locus in bacteria_loci:
        for gene in locus:
            if gene.gid in gi_checklist:
                continue  # gene already counted in another locus
            for _cogid in gene.cogid.split():
                if _cogid in bait_profiles:
                    continue
                if _cogid not in profile2orgs2obj:
                    # First sighting: one counter per organism, pre-filled
                    # with the occurrences recorded in the global map.
                    if _cogid in global_profile2orgs2gis:
                        orgs2gis = global_profile2orgs2gis[_cogid]
                    else:
                        orgs2gis = {}
                    counters = {}
                    for _org in loci_organisms:
                        _orgObj = ProfileInOrganismCount(_org, _cogid)
                        _orgObj.outside = len(orgs2gis[_org]) if _org in orgs2gis else 0
                        counters[_org] = _orgObj
                    profile2orgs2obj[_cogid] = counters
                profile2orgs2obj[_cogid][gene.organism].in_locus += 1
            gi_checklist.add(gene.gid)

    # A leftover debugging dump for 'arCOG08578' followed by sys.exit() used
    # to sit here. It terminated the process before any output was written,
    # so it has been removed.

    out_file = os.path.join(save_path, 'baiticity.tab')
    rare_file = os.path.join(save_path, 'rare_profiles.tab')

    in_loci_weight = []
    baiticity_weight = []

    print("Writing to file: %s" % out_file)
    # Both outputs are managed by the same `with`, so they are closed even
    # when a lookup below raises.
    with open(rare_file, 'w') as rare_profiles_file, open(out_file, 'w') as outf:
        rare_profiles_file.write(
            "Profile\tOccurence everywhere\tOccurrence in loci\tBaiticity\tDefinition\n")
        outf.write(
            "Profile\tOccurrence in loci(count)\tBaiticity(count)\tOccurrence in loci(weight)\tBaiticity(weight)\tDefinition\n")

        for profile, counters in profile2orgs2obj.items():
            if profile in skipped_profiles:
                continue

            in_locus_count = 0
            everywhere_count = 0
            in_locus_weight = 0.0
            everywhere_weight = 0.0

            for org, _org in counters.items():
                if org in skipped_organisms:
                    continue
                total = _org.in_locus + _org.outside
                in_locus_count += _org.in_locus
                everywhere_count += total
                in_locus_weight += _org.in_locus * gnm2weight[org]
                everywhere_weight += total * gnm2weight[org]

            if everywhere_count == 0 or everywhere_weight == 0:
                # Every occurrence was in a skipped organism (or carried zero
                # weight); the ratios are undefined, so drop the profile
                # instead of raising ZeroDivisionError.
                continue

            _baiticity_count = 1.0 * in_locus_count / everywhere_count
            _baiticity_weight = in_locus_weight / everywhere_weight

            if everywhere_weight < filter_threshold:
                rare_profiles_file.write(
                    "%s\t%f\t%f\t%f\t%s\n" % (profile, everywhere_count, in_locus_count,
                                              _baiticity_count, profile2def[profile]))
                continue

            in_loci_weight.append(in_locus_weight)
            baiticity_weight.append(_baiticity_weight)
            outf.write(
                "%s\t%f\t%f\t%f\t%f\t%s\n" % (profile, in_locus_count, _baiticity_count,
                                              in_locus_weight, _baiticity_weight,
                                              profile2def[profile]))

    in_loci_weight = np.log10(np.asarray(in_loci_weight))
    baiticity_weight = np.asarray(baiticity_weight)

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(in_loci_weight, baiticity_weight, s=1)
    plt.xlabel("Effective orcurrence in loci (log10)")
    plt.ylabel("Baiticity")
    image_file = os.path.join(save_path, 'baiticity.png')
    plt.savefig(image_file)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
for j, kplet_sublist in enumerate(kplet_pool): cur_reports_folder = os.path.join(report_files_dir, str(i)) if not os.path.exists(cur_reports_folder): os.mkdir(cur_reports_folder) xls_file_name = os.path.join(cur_reports_folder, "%d_%d.xls" % (j+1, i)) r.write_to_xls(xls_file_name,kplet_sublist,target_profiles,profile2def,gid2arcog_cdd,neighborhood_files_path,file2src_src2org_map) if __name__ == '__main__': import cPickle import bz2 print 'Pre-Loading dictionaries' target_profiles = t.bacteria_target_profiles() profile2def = t.map_cdd_profile2def() gid2arcog_cdd = t.map_gid2arcog_cdd() neighborhood_files_path = neighborhoods_path() # profile_id2code = map_id2cdd() # pickle.dump(profile_id2code, open('profile_id2code.p','w')) profile_id2code = cPickle.load(open('/Users/hudaiber/Projects/NewSystems/code/Bacteria/profile_id2code.p')) fname = '/Users/hudaiber/Projects/NewSystems/data/Bacteria/pickle/100000/pentaplets_merged_across.p.bz2' f = bz2.BZ2File(fname, 'rb') buffer = "" while 1: data = f.read() if data == "": break buffer += data
def calculate_profile_based_baiticity(bacteria_loci, loci_gis, loci_organisms, prok1402_path_file, bait_profiles, filter_threshold, save_path):
    """Compute count- and weight-based baiticity per profile.

    Each gene (identified by ``gene.gid``) is counted once. For every
    profile id in ``gene.cogid`` that is not in ``bait_profiles``, the
    occurrences inside the loci and the occurrences recorded in the global
    map are accumulated per organism, then summed as raw counts and as
    genome-weighted values. Baiticity is ``in_locus / (in_locus + outside)``.

    Args:
        bacteria_loci: iterable of loci; each locus is an iterable of genes
            exposing ``gid``, ``cogid`` and ``organism``.
        loci_gis: passed through to ``load_maps_simple``.
        loci_organisms: organisms for which counters are created.
        prok1402_path_file: passed through to ``load_maps_simple``.
        bait_profiles: profile ids that are ignored while counting.
        filter_threshold: profiles whose weighted total occurrence is below
            this value are written to ``rare_profiles.tab`` only.
        save_path: directory receiving ``baiticity.tab``,
            ``rare_profiles.tab`` and ``baiticity.png``.
    """
    print("Loding global maps")
    global_profile2orgs2gis = load_maps_simple(prok1402_path_file, loci_gis)
    print("Loading weights")
    gnm2weight = t.map_genome2weight()
    print("Loading CDD definitions")
    profile2def = t.map_cdd_profile2def()

    print("Counting in loci")
    profile2orgs2obj = {}
    gi_checklist = set()
    for locus in bacteria_loci:
        for gene in locus:
            if gene.gid in gi_checklist:
                continue  # gene already counted in another locus
            for _cogid in gene.cogid.split():
                if _cogid in bait_profiles:
                    continue
                if _cogid not in profile2orgs2obj:
                    # First sighting: one counter per organism, pre-filled
                    # with the occurrences recorded in the global map.
                    known = _cogid in global_profile2orgs2gis
                    per_org = {}
                    for _org in loci_organisms:
                        _orgObj = ProfileInOrganismCount(_org, _cogid)
                        if known and _org in global_profile2orgs2gis[_cogid]:
                            _orgObj.outside = len(global_profile2orgs2gis[_cogid][_org])
                        else:
                            _orgObj.outside = 0
                        per_org[_org] = _orgObj
                    profile2orgs2obj[_cogid] = per_org
                profile2orgs2obj[_cogid][gene.organism].in_locus += 1
            gi_checklist.add(gene.gid)

    out_file = os.path.join(save_path, 'baiticity.tab')
    rare_file = os.path.join(save_path, 'rare_profiles.tab')

    in_loci_weight = []
    baiticity_weight = []

    print("Writing to file: %s" % out_file)
    # Context managers guarantee both files are closed if a lookup fails
    # part-way through (the rare-profiles file used to be closed only on
    # the success path).
    with open(rare_file, 'w') as rare_profiles_file, open(out_file, 'w') as outf:
        rare_profiles_file.write("Profile\tOccurence everywhere\tOccurrence in loci\tBaiticity\tDefinition\n")
        outf.write("Profile\tOccurrence in loci(count)\tBaiticity(count)\tOccurrence in loci(weight)\tBaiticity(weight)\tDefinition\n")

        for profile in profile2orgs2obj:
            in_locus_count = 0
            everywhere_count = 0
            in_locus_weight = 0.0
            everywhere_weight = 0.0

            for org in profile2orgs2obj[profile]:
                _org = profile2orgs2obj[profile][org]
                in_locus_count += _org.in_locus
                everywhere_count += (_org.in_locus + _org.outside)
                in_locus_weight += _org.in_locus * gnm2weight[org]
                everywhere_weight += (_org.in_locus + _org.outside) * gnm2weight[org]

            if everywhere_count == 0 or everywhere_weight == 0:
                # Ratios are undefined; skip rather than divide by zero.
                continue

            _baiticity_count = 1.0 * in_locus_count / everywhere_count
            _baiticity_weight = in_locus_weight / everywhere_weight

            if everywhere_weight < filter_threshold:
                rare_profiles_file.write("%s\t%f\t%f\t%f\t%s\n" % (profile, everywhere_count, in_locus_count, _baiticity_count, profile2def[profile]))
                continue

            in_loci_weight.append(in_locus_weight)
            baiticity_weight.append(_baiticity_weight)
            outf.write("%s\t%f\t%f\t%f\t%f\t%s\n" % (profile, in_locus_count, _baiticity_count, in_locus_weight, _baiticity_weight, profile2def[profile]))

    in_loci_weight = np.log10(np.asarray(in_loci_weight))
    baiticity_weight = np.asarray(baiticity_weight)

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(in_loci_weight, baiticity_weight, s=1)
    plt.xlabel("Effective orcurrence in loci (log10)")
    plt.ylabel("Baiticity")
    image_file = os.path.join(save_path, 'baiticity.png')
    plt.savefig(image_file)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
def calculate_profile_based_crispricity(cas1402_loci, cas1402_gis, cas1402_organisms, prok1402_path_file):
    """Compute count-based crispricity per profile, write a table and a plot.

    Profile ids are taken from ``gene.cogid`` of every gene in
    ``cas1402_loci``. For each profile, occurrences inside the loci and
    occurrences recorded in the global map are summed over
    ``cas1402_organisms``; crispricity is the in-loci share of the total.

    Args:
        cas1402_loci: iterable of loci; each locus is an iterable of genes
            exposing ``cogid`` (whitespace-separated profile ids) and
            ``organism``.
        cas1402_gis: passed through to ``load_maps_simple``.
        cas1402_organisms: organisms for which counters are created.
        prok1402_path_file: passed through to ``load_maps_simple``.

    Side effects:
        Writes ``cas1402/crispricity_count.tab`` under
        ``gv.project_data_path`` and saves a scatter plot to
        ``first_count.png`` in the current working directory.
    """
    print("Loding global maps")
    global_profile2orgs2gis = load_maps_simple(prok1402_path_file, cas1402_gis)

    # Genome weights are not loaded here: only the count-based ratio is
    # computed, so the weight map would be unused.
    print("Loading CDD definitions")
    profile2def = t.map_cdd_profile2def()

    print("Counting in CRISPR loci")
    profile2orgs2obj = {}
    for locus in cas1402_loci:
        for gene in locus:
            for _cogid in gene.cogid.split():
                if _cogid not in profile2orgs2obj:
                    # First sighting of this profile: build the per-organism
                    # counters, seeded from the global map.
                    known = _cogid in global_profile2orgs2gis
                    per_org = {}
                    for _org in cas1402_organisms:
                        _orgObj = ProfileInOrganismCount(_org, _cogid)
                        if known and _org in global_profile2orgs2gis[_cogid]:
                            _orgObj.outside = len(global_profile2orgs2gis[_cogid][_org])
                        else:
                            _orgObj.outside = 0
                        per_org[_org] = _orgObj
                    profile2orgs2obj[_cogid] = per_org
                profile2orgs2obj[_cogid][gene.organism].in_crispr += 1

    out_file = os.path.join(gv.project_data_path, 'cas1402/crispricity_count.tab')

    in_crispr_all = []
    crispricity_all = []

    print("Writing to file: %s" % out_file)
    with open(out_file, 'w') as outf:
        outf.write("Profile\tOccurrence in CRISPR loci\tCrispricity\tDefinition\n")
        for profile in profile2orgs2obj:
            in_crispr = 0
            everywhere = 0
            for org in profile2orgs2obj[profile]:
                _org = profile2orgs2obj[profile][org]
                in_crispr += _org.in_crispr
                everywhere += (_org.in_crispr + _org.outside)

            if everywhere == 0:
                # Nothing counted: avoid a zero division and log10(0) below.
                continue

            # Force true division: with integer counters the Python 2 "/"
            # operator floors the ratio, which made every value 0 or 1.
            crispricity = 1.0 * in_crispr / everywhere

            in_crispr_all.append(in_crispr)
            crispricity_all.append(crispricity)
            outf.write("%s\t%f\t%f\t%s\n" % (profile, in_crispr, crispricity, profile2def[profile]))

    in_crispr_all = np.log10(np.asarray(in_crispr_all))
    crispricity_all = np.asarray(crispricity_all)

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(in_crispr_all, crispricity_all, s=1)
    plt.xlabel("Effective orcurrence in CRISPR loci (log10)")
    plt.ylabel("X-axis / Effective occurrences")
    plt.savefig('first_count.png')
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)