示例#1
0
def prok1603_architecture_frequencies():
    """Print how often each profile occurs in the prok1603 loci.

    Reads the locus weights and the defense profile table from the project
    data directory, loads the pickled prok1603 loci, groups the loci by the
    profiles they contain and prints one tab-separated row per profile:
    profile, gene name, number of loci, summed locus weight, definition.
    Rows are ordered by decreasing number of loci.  Gene name and definition
    are printed as empty strings when the profile is unknown.

    Raises:
        KeyError: if a locus has no entry in the weights file.
        IndexError: if a non-blank line of either input file has too few
            columns.
    """
    work_dir = os.path.join(gv.project_data_path, 'UvrD/')

    map_file = os.path.join(work_dir, 'prok1603/prok1603_weights.txt')

    # Whitespace-separated "<locus file name> <weight>" pairs, one per line.
    locus2weight = {}
    with open(map_file) as inf:
        for l in inf:
            terms = l.split()
            if not terms:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            locus2weight[terms[0]] = float(terms[1])

    def_file = os.path.join(gv.project_data_path, 'cas4/profiles/defenseProfiles.tab')
    profile2gene = {}
    profile2def = {}

    with open(def_file) as inf:
        for l in inf:
            if not l.strip():
                continue
            terms = l.strip().split('\t')
            profile = terms[0]
            gene_names = terms[3].split(',')
            # When several gene names are listed, the second one is used.
            profile2gene[profile] = gene_names[1] if len(gene_names) > 1 else gene_names[0]
            profile2def[profile] = terms[4]

    # Entries from the defense profile table override the CDD-wide maps.
    cdd_profile2gene = t.map_cdd_profile2gene_name()
    cdd_profile2gene.update(profile2gene)

    cdd_profile2def = t.map_cdd_profile2def()
    cdd_profile2def.update(profile2def)

    # The loci pickle sits in the same prok1603/ directory as the weights file.
    prok1603_loci_file = os.path.join(work_dir, 'prok1603/prok1603_loci.p.bz2')
    loci = t.load_compressed_pickle(prok1603_loci_file)

    profile2loci = {}
    for locus in loci:
        for _profile in locus.profiles:
            profile2loci.setdefault(_profile, []).append(locus)

    by_locus_count = sorted(profile2loci.items(), key=lambda x: len(x[1]), reverse=True)

    for profile, profile_loci in by_locus_count:
        _weight = sum(locus2weight[locus.base_file_name] for locus in profile_loci)
        print("%s\t%s\t%d\t%f\t%s" % (profile,
                                      cdd_profile2gene[profile] if profile in cdd_profile2gene else "",
                                      len(profile_loci),
                                      _weight,
                                      cdd_profile2def[profile] if profile in cdd_profile2def else ""))
示例#2
0
    # print 'Dumping to files'
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'pentaplets_merged_across.p.bz2'), 'w')
    # pickle.dump(pentaplets, dump_file)
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'quadruplets_merged_across.p.bz2'), 'w')
    # pickle.dump(quadruplets, dump_file)
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'triplets_merged_across.p.bz2'), 'w')
    # pickle.dump(triplets, dump_file)
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'duplets_merged_across.p.bz2'), 'w')
    # pickle.dump(duplets, dump_file)


# Script entry point: pre-loads the lookup tables and resolves the directory
# that holds the bacterial pickles.
if __name__ == '__main__':

    print 'Pre-Loading dictionaries'
    # Lookup tables come from the helper module `t` and from helpers defined
    # elsewhere in this file; their exact contents are not visible here.
    target_profiles = t.bacteria_target_profiles()
    profile2def = t.map_cdd_profile2def()
    gid2arcog_cdd = t.map_gid2arcog_cdd()
    neighborhood_files_path = neighborhoods_path()
    profile_id2code = map_id2cdd()

    # Disabled: report generation for several `limit_to` cut-offs, one report
    # directory per cut-off.
    # for limit_to, report_dir in zip([300, 500, 1000, 100000],['top_300', 'top_500', 'top_1000', 'top_100000']):
    #
    #     print "Limit_to:", limit_to
    #     print
    #     generate_plots(limit_to, report_dir, target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path, profile_id2code)
    #     print 'Done'
    #     print "------------------------"

    # Pickle directory under the project data root.
    data_path = os.path.join(gv.project_data_path, 'Bacteria/pickle/')

    print 'Generating pickles'
示例#3
0
def calculate_profile_based_crispricity(cas1402_loci, cas1402_gis,
                                        cas1402_organisms, prok1402_path_file):
    """Compute the count-based crispricity of every profile found in the loci.

    For each profile id (whitespace-separated tokens of ``gene.cogid``) the
    function counts, per organism, how many genes inside the given loci carry
    it (``in_crispr``) and takes the number of occurrences outside the loci
    from the map returned by ``load_maps_simple`` (``outside``).  Crispricity
    is ``in_crispr / (in_crispr + outside)`` summed over all organisms.

    Side effects: writes ``cas1402/crispricity_count.tab`` under
    ``gv.project_data_path`` and saves a scatter plot to ``first_count.png``
    in the current working directory.

    Args:
        cas1402_loci: iterable of loci; each locus is an iterable of genes
            exposing ``cogid`` and ``organism``.
        cas1402_gis: passed through to ``load_maps_simple``.
        cas1402_organisms: iterable of organism names; every
            ``gene.organism`` must be one of them.
        prok1402_path_file: passed through to ``load_maps_simple``.

    Raises:
        KeyError: if a gene's organism is not in ``cas1402_organisms`` or a
            profile has no entry in the CDD definitions map.
    """
    print("Loding global maps")
    global_profile2orgs2gis = load_maps_simple(prok1402_path_file, cas1402_gis)
    print("Loading weights")
    # Not used by the count-based score below; the call is kept so the load
    # (and its log line) still happens as before.
    gnm2weight = t.map_genome2weight()
    print("Loading CDD definitions")
    profile2def = t.map_cdd_profile2def()

    print("Counting in CRISPR loci")

    profile2orgs2obj = {}

    for locus in cas1402_loci:
        for gene in locus:
            for _cogid in gene.cogid.split():

                if _cogid not in profile2orgs2obj:
                    # First sighting of this profile: create one counter per
                    # organism, seeded with the occurrences outside the loci.
                    if _cogid in global_profile2orgs2gis:
                        org2gis = global_profile2orgs2gis[_cogid]
                    else:
                        org2gis = {}

                    org2obj = {}
                    for _org in cas1402_organisms:
                        _orgObj = ProfileInOrganismCount(_org, _cogid)
                        _orgObj.outside = len(org2gis[_org]) if _org in org2gis else 0
                        org2obj[_org] = _orgObj
                    profile2orgs2obj[_cogid] = org2obj

                profile2orgs2obj[_cogid][gene.organism].in_crispr += 1

    out_file = os.path.join(gv.project_data_path,
                            'cas1402/crispricity_count.tab')

    in_crispr_all = []
    crispricity_all = []

    print("Writing to file: %s" % out_file)
    with open(out_file, 'w') as outf:

        outf.write(
            "Profile\tOccurrence in CRISPR loci\tCrispricity\tDefinition\n")

        for profile in profile2orgs2obj:
            in_crispr = 0
            everywhere = 0

            for _org in profile2orgs2obj[profile].values():
                in_crispr += _org.in_crispr
                everywhere += _org.in_crispr + _org.outside

            # Both operands are integer counts; force true division so the
            # ratio is not truncated to 0 or 1 under Python 2.
            crispricity = 1.0 * in_crispr / everywhere

            in_crispr_all.append(in_crispr)
            crispricity_all.append(crispricity)
            outf.write("%s\t%f\t%f\t%s\n" %
                       (profile, in_crispr, crispricity, profile2def[profile]))

    # x axis: log10 of the in-loci count, y axis: crispricity.
    in_crispr_all = np.log10(np.asarray(in_crispr_all))
    crispricity_all = np.asarray(crispricity_all)

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(in_crispr_all, crispricity_all, s=1)

    plt.xlabel("Effective orcurrence in CRISPR loci (log10)")
    plt.ylabel("X-axis / Effective occurrences")

    plt.savefig('first_count.png')
示例#4
0
def calculate_profile_based_baiticity(bacteria_loci, loci_gis, loci_organisms,
                                      arcog_path_file, bait_profiles,
                                      filter_threshold, save_path):
    """Compute count- and weight-based baiticity for every non-bait profile.

    Each gene (identified by ``gene.gid``) is counted once, even if it shows
    up in several loci.  For every profile id in ``gene.cogid`` that is not
    in ``bait_profiles`` the function tracks, per organism, the occurrences
    inside the loci (``in_locus``) and the occurrences outside, taken from
    the map returned by ``load_maps_simple`` (``outside``).  Baiticity is
    ``in_locus / (in_locus + outside)`` summed over organisms, once on raw
    counts and once with each organism scaled by its genome weight.

    Side effects (all under ``save_path``):
        * ``baiticity.tab``     - profiles whose weighted total occurrence
          reaches ``filter_threshold``;
        * ``rare_profiles.tab`` - profiles below the threshold;
        * ``baiticity.png``     - scatter plot of the reported profiles.

    Profile ``arCOG14077`` and two Nitrosoarchaeum organisms are hard-coded
    exclusions from the summary.

    Args:
        bacteria_loci: iterable of loci; each locus is an iterable of genes
            exposing ``gid``, ``cogid`` and ``organism``.
        loci_gis: passed through to ``load_maps_simple``.
        loci_organisms: iterable of organism names; every ``gene.organism``
            must be one of them.
        arcog_path_file: passed through to ``load_maps_simple``.
        bait_profiles: container of profile ids to ignore.
        filter_threshold: minimal weighted total occurrence for a profile to
            be reported in ``baiticity.tab``.
        save_path: output directory.

    Raises:
        KeyError: if a gene's organism is not in ``loci_organisms``, an
            organism has no genome weight, or a profile has no definition.
    """
    print("Loding global maps")
    global_profile2orgs2gis = load_maps_simple(arcog_path_file, loci_gis)
    print("Loading weights")
    gnm2weight = t.map_genome2weight()
    print("Loading CDD definitions")
    profile2def = t.map_cdd_profile2def()

    profile2def.update(t.map_profile2def())

    print("Counting in loci")

    profile2orgs2obj = {}

    # gids already counted, so a gene shared by several loci counts once.
    gi_checklist = set()

    for locus in bacteria_loci:
        for gene in locus:

            if gene.gid in gi_checklist:
                continue

            for _cogid in gene.cogid.split():

                if _cogid in bait_profiles:
                    continue

                if _cogid not in profile2orgs2obj:
                    # First sighting of this profile: create one counter per
                    # organism, seeded with the occurrences outside the loci.
                    if _cogid in global_profile2orgs2gis:
                        org2gis = global_profile2orgs2gis[_cogid]
                    else:
                        org2gis = {}

                    org2obj = {}
                    for _org in loci_organisms:
                        _orgObj = ProfileInOrganismCount(_org, _cogid)
                        _orgObj.outside = len(org2gis[_org]) if _org in org2gis else 0
                        org2obj[_org] = _orgObj
                    profile2orgs2obj[_cogid] = org2obj

                profile2orgs2obj[_cogid][gene.organism].in_locus += 1

            gi_checklist.add(gene.gid)

    # Hard-coded exclusions from the summary.
    skipped_profiles = ('arCOG14077',)
    skipped_organisms = ('Nitrosoarchaeum_koreensis_MY1_MY1',
                         'Nitrosoarchaeum_limnia_SFB1')

    out_file = os.path.join(save_path, 'baiticity.tab')
    rare_file = os.path.join(save_path, 'rare_profiles.tab')

    in_loci_weight = []
    baiticity_weight = []

    print("Writing to file: %s" % out_file)
    with open(out_file, 'w') as outf, open(rare_file, 'w') as rare_profiles_file:

        rare_profiles_file.write(
            "Profile\tOccurence everywhere\tOccurrence in loci\tBaiticity\tDefinition\n"
        )
        outf.write(
            "Profile\tOccurrence in loci(count)\tBaiticity(count)\tOccurrence in loci(weight)\tBaiticity(weight)\tDefinition\n"
        )

        for profile in profile2orgs2obj:

            if profile in skipped_profiles:
                continue

            in_locus_count = 0
            everywhere_count = 0
            in_locus_weight = 0
            everywhere_weight = 0

            for org in profile2orgs2obj[profile]:

                if org in skipped_organisms:
                    continue

                _org = profile2orgs2obj[profile][org]
                _total = _org.in_locus + _org.outside

                in_locus_count += _org.in_locus
                everywhere_count += _total

                in_locus_weight += _org.in_locus * gnm2weight[org]
                everywhere_weight += _total * gnm2weight[org]

            if not everywhere_count:
                # The profile occurs only in the excluded organisms; there is
                # nothing to report and the ratio would divide by zero.
                continue

            _baiticity_count = 1.0 * in_locus_count / everywhere_count

            if everywhere_weight < filter_threshold:

                rare_profiles_file.write(
                    "%s\t%f\t%f\t%f\t%s\n" %
                    (profile, everywhere_count, in_locus_count,
                     _baiticity_count, profile2def[profile]))
                continue

            # Computed only for reported profiles; a zero total weight yields
            # 0.0 instead of a ZeroDivisionError.
            if everywhere_weight:
                _baiticity_weight = 1.0 * in_locus_weight / everywhere_weight
            else:
                _baiticity_weight = 0.0

            in_loci_weight.append(in_locus_weight)
            baiticity_weight.append(_baiticity_weight)

            outf.write(
                "%s\t%f\t%f\t%f\t%f\t%s\n" %
                (profile, in_locus_count, _baiticity_count, in_locus_weight,
                 _baiticity_weight, profile2def[profile]))

    # x axis: log10 of the weighted in-loci occurrence, y axis: baiticity.
    in_loci_weight = np.log10(np.asarray(in_loci_weight))
    baiticity_weight = np.asarray(baiticity_weight)

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(in_loci_weight, baiticity_weight, s=1)

    plt.xlabel("Effective orcurrence in loci (log10)")
    plt.ylabel("Baiticity")

    image_file = os.path.join(save_path, 'baiticity.png')
    plt.savefig(image_file)
        for j, kplet_sublist in enumerate(kplet_pool):
            cur_reports_folder = os.path.join(report_files_dir, str(i))
            if not os.path.exists(cur_reports_folder):
                os.mkdir(cur_reports_folder)
            xls_file_name = os.path.join(cur_reports_folder,  "%d_%d.xls" % (j+1, i))
            r.write_to_xls(xls_file_name,kplet_sublist,target_profiles,profile2def,gid2arcog_cdd,neighborhood_files_path,file2src_src2org_map)


# Script entry point: pre-loads the lookup tables and reads the merged
# pentaplets archive into memory.
if __name__ == '__main__':

    import cPickle
    import bz2

    print 'Pre-Loading dictionaries'
    # Lookup tables come from the helper module `t` and from helpers defined
    # elsewhere in this file; their exact contents are not visible here.
    target_profiles = t.bacteria_target_profiles()
    profile2def = t.map_cdd_profile2def()
    gid2arcog_cdd = t.map_gid2arcog_cdd()
    neighborhood_files_path = neighborhoods_path()
    # The id -> code map is read from a pickle instead of being rebuilt with
    # map_id2cdd(); the disabled lines below show how that pickle was made.
    # profile_id2code = map_id2cdd()
    # pickle.dump(profile_id2code, open('profile_id2code.p','w'))
    # NOTE(review): absolute, user-specific path; the file object passed to
    # cPickle.load is never closed explicitly.
    profile_id2code = cPickle.load(open('/Users/hudaiber/Projects/NewSystems/code/Bacteria/profile_id2code.p'))

    # NOTE(review): absolute, user-specific path.
    fname = '/Users/hudaiber/Projects/NewSystems/data/Bacteria/pickle/100000/pentaplets_merged_across.p.bz2'
    f = bz2.BZ2File(fname, 'rb')

    # Accumulate the decompressed content of the archive in one string.
    # NOTE(review): `buffer` shadows the Python 2 builtin of the same name.
    # NOTE(review): read() is called without a size, so the first call
    # presumably returns everything and the second returns "" - confirm.
    buffer = ""
    while 1:
        data = f.read()
        if data == "":
            break
        buffer += data
示例#6
0
def calculate_profile_based_baiticity(bacteria_loci, loci_gis,
                                      loci_organisms,
                                      prok1402_path_file,
                                      bait_profiles,
                                      filter_threshold,
                                      save_path):
    """Compute count- and weight-based baiticity for every non-bait profile.

    Each gene (identified by ``gene.gid``) is counted once, even if it shows
    up in several loci.  For every profile id in ``gene.cogid`` that is not
    in ``bait_profiles`` the function tracks, per organism, the occurrences
    inside the loci (``in_locus``) and the occurrences outside, taken from
    the map returned by ``load_maps_simple`` (``outside``).  Baiticity is
    ``in_locus / (in_locus + outside)`` summed over organisms, once on raw
    counts and once with each organism scaled by its genome weight.

    Side effects (all under ``save_path``):
        * ``baiticity.tab``     - profiles whose weighted total occurrence
          reaches ``filter_threshold``;
        * ``rare_profiles.tab`` - profiles below the threshold;
        * ``baiticity.png``     - scatter plot of the reported profiles.

    Args:
        bacteria_loci: iterable of loci; each locus is an iterable of genes
            exposing ``gid``, ``cogid`` and ``organism``.
        loci_gis: passed through to ``load_maps_simple``.
        loci_organisms: iterable of organism names; every ``gene.organism``
            must be one of them.
        prok1402_path_file: passed through to ``load_maps_simple``.
        bait_profiles: container of profile ids to ignore.
        filter_threshold: minimal weighted total occurrence for a profile to
            be reported in ``baiticity.tab``.
        save_path: output directory.

    Raises:
        KeyError: if a gene's organism is not in ``loci_organisms``, an
            organism has no genome weight, or a profile has no entry in the
            CDD definitions map.
    """
    print("Loding global maps")
    global_profile2orgs2gis = load_maps_simple(prok1402_path_file, loci_gis)
    print("Loading weights")
    gnm2weight = t.map_genome2weight()
    print("Loading CDD definitions")
    profile2def = t.map_cdd_profile2def()

    print("Counting in loci")

    profile2orgs2obj = {}

    # gids already counted, so a gene shared by several loci counts once.
    gi_checklist = set()

    for locus in bacteria_loci:
        for gene in locus:

            if gene.gid in gi_checklist:
                continue

            for _cogid in gene.cogid.split():

                if _cogid in bait_profiles:
                    continue

                if _cogid not in profile2orgs2obj:
                    # First sighting of this profile: create one counter per
                    # organism, seeded with the occurrences outside the loci.
                    if _cogid in global_profile2orgs2gis:
                        org2gis = global_profile2orgs2gis[_cogid]
                    else:
                        org2gis = {}

                    org2obj = {}
                    for _org in loci_organisms:
                        _orgObj = ProfileInOrganismCount(_org, _cogid)
                        _orgObj.outside = len(org2gis[_org]) if _org in org2gis else 0
                        org2obj[_org] = _orgObj
                    profile2orgs2obj[_cogid] = org2obj

                profile2orgs2obj[_cogid][gene.organism].in_locus += 1

            gi_checklist.add(gene.gid)

    out_file = os.path.join(save_path, 'baiticity.tab')
    rare_file = os.path.join(save_path, 'rare_profiles.tab')

    in_loci_weight = []
    baiticity_weight = []

    print("Writing to file: %s" % out_file)
    # Both outputs are managed by `with`, so they are closed even if a lookup
    # below raises.
    with open(out_file, 'w') as outf, open(rare_file, 'w') as rare_profiles_file:

        rare_profiles_file.write("Profile\tOccurence everywhere\tOccurrence in loci\tBaiticity\tDefinition\n")
        outf.write("Profile\tOccurrence in loci(count)\tBaiticity(count)\tOccurrence in loci(weight)\tBaiticity(weight)\tDefinition\n")

        for profile in profile2orgs2obj:

            in_locus_count = 0
            everywhere_count = 0
            in_locus_weight = 0
            everywhere_weight = 0

            for org in profile2orgs2obj[profile]:
                _org = profile2orgs2obj[profile][org]
                _total = _org.in_locus + _org.outside

                in_locus_count += _org.in_locus
                everywhere_count += _total

                in_locus_weight += _org.in_locus * gnm2weight[org]
                everywhere_weight += _total * gnm2weight[org]

            _baiticity_count = 1.0 * in_locus_count / everywhere_count

            if everywhere_weight < filter_threshold:

                rare_profiles_file.write("%s\t%f\t%f\t%f\t%s\n" % (profile,
                                                                    everywhere_count,
                                                                    in_locus_count,
                                                                    _baiticity_count,
                                                                    profile2def[profile]))
                continue

            # Computed only for reported profiles; a zero total weight yields
            # 0.0 instead of a ZeroDivisionError.
            if everywhere_weight:
                _baiticity_weight = 1.0 * in_locus_weight / everywhere_weight
            else:
                _baiticity_weight = 0.0

            in_loci_weight.append(in_locus_weight)
            baiticity_weight.append(_baiticity_weight)

            outf.write("%s\t%f\t%f\t%f\t%f\t%s\n" % (profile,
                                                     in_locus_count,
                                                     _baiticity_count,
                                                     in_locus_weight,
                                                     _baiticity_weight,
                                                     profile2def[profile]))

    # x axis: log10 of the weighted in-loci occurrence, y axis: baiticity.
    in_loci_weight = np.log10(np.asarray(in_loci_weight))
    baiticity_weight = np.asarray(baiticity_weight)

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(in_loci_weight, baiticity_weight, s=1)

    plt.xlabel("Effective orcurrence in loci (log10)")
    plt.ylabel("Baiticity")

    image_file = os.path.join(save_path, 'baiticity.png')
    plt.savefig(image_file)
示例#7
0
def calculate_profile_based_crispricity(cas1402_loci, cas1402_gis, cas1402_organisms, prok1402_path_file):
    """Compute the count-based crispricity of every profile found in the loci.

    For each profile id (whitespace-separated tokens of ``gene.cogid``) the
    function counts, per organism, how many genes inside the given loci carry
    it (``in_crispr``) and takes the number of occurrences outside the loci
    from the map returned by ``load_maps_simple`` (``outside``).  Crispricity
    is ``in_crispr / (in_crispr + outside)`` summed over all organisms.

    Side effects: writes ``cas1402/crispricity_count.tab`` under
    ``gv.project_data_path`` and saves a scatter plot to ``first_count.png``
    in the current working directory.

    Args:
        cas1402_loci: iterable of loci; each locus is an iterable of genes
            exposing ``cogid`` and ``organism``.
        cas1402_gis: passed through to ``load_maps_simple``.
        cas1402_organisms: iterable of organism names; every
            ``gene.organism`` must be one of them.
        prok1402_path_file: passed through to ``load_maps_simple``.

    Raises:
        KeyError: if a gene's organism is not in ``cas1402_organisms`` or a
            profile has no entry in the CDD definitions map.
    """
    print("Loding global maps")
    global_profile2orgs2gis = load_maps_simple(prok1402_path_file, cas1402_gis)
    print("Loading weights")
    # Not used by the count-based score below; the call is kept so the load
    # (and its log line) still happens as before.
    gnm2weight = t.map_genome2weight()
    print("Loading CDD definitions")
    profile2def = t.map_cdd_profile2def()

    print("Counting in CRISPR loci")

    profile2orgs2obj = {}

    for locus in cas1402_loci:
        for gene in locus:
            for _cogid in gene.cogid.split():

                if _cogid not in profile2orgs2obj:
                    # First sighting of this profile: create one counter per
                    # organism, seeded with the occurrences outside the loci.
                    if _cogid in global_profile2orgs2gis:
                        org2gis = global_profile2orgs2gis[_cogid]
                    else:
                        org2gis = {}

                    org2obj = {}
                    for _org in cas1402_organisms:
                        _orgObj = ProfileInOrganismCount(_org, _cogid)
                        _orgObj.outside = len(org2gis[_org]) if _org in org2gis else 0
                        org2obj[_org] = _orgObj
                    profile2orgs2obj[_cogid] = org2obj

                profile2orgs2obj[_cogid][gene.organism].in_crispr += 1

    out_file = os.path.join(gv.project_data_path, 'cas1402/crispricity_count.tab')

    in_crispr_all = []
    crispricity_all = []

    print("Writing to file: %s" % out_file)
    with open(out_file, 'w') as outf:

        outf.write("Profile\tOccurrence in CRISPR loci\tCrispricity\tDefinition\n")

        for profile in profile2orgs2obj:
            in_crispr = 0
            everywhere = 0

            for _org in profile2orgs2obj[profile].values():
                in_crispr += _org.in_crispr
                everywhere += _org.in_crispr + _org.outside

            # Both operands are integer counts; force true division so the
            # ratio is not truncated to 0 or 1 under Python 2.
            crispricity = 1.0 * in_crispr / everywhere

            in_crispr_all.append(in_crispr)
            crispricity_all.append(crispricity)
            outf.write("%s\t%f\t%f\t%s\n" % (profile, in_crispr, crispricity, profile2def[profile]))

    # x axis: log10 of the in-loci count, y axis: crispricity.
    in_crispr_all = np.log10(np.asarray(in_crispr_all))
    crispricity_all = np.asarray(crispricity_all)

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(in_crispr_all, crispricity_all, s=1)

    plt.xlabel("Effective orcurrence in CRISPR loci (log10)")
    plt.ylabel("X-axis / Effective occurrences")

    plt.savefig('first_count.png')