Exemplo n.º 1
0
def line_maker(cds):
    """Format one CDS as a tab-separated line with offset-corrected coordinates.

    The columns are: genome name, start, end and the ``:``-joined group labels
    of every COG the CDS belongs to. Start and end are shifted by the value
    stored in ``gi1_mins`` for the CDS's genome.

    Args:
        cds: object exposing ``genome``, ``start``, ``end`` and ``cogs``.

    Returns:
        The tab-separated line, without a trailing newline.
    """
    # Look the genome up once; the original queried it a second time for the
    # first output column.
    genome_name = Genome.find_one(cds.genome).name
    offset = gi1_mins[genome_name]

    # Fetch each COG once instead of once for the filter and once for the value.
    group_lists = []
    for cog in cds.cogs:
        cog_group = COG.find_one(cog).group
        if cog_group:
            group_lists.append(cog_group)
    # Each group is a list of labels; flatten them into one list before joining.
    group = ":".join(sum(group_lists, []))

    return "\t".join([
        genome_name,
        str(cds.start - offset),
        str(cds.end - offset),
        group,
    ])
Exemplo n.º 2
0
def make_line(feature, offset, thresh=10**-3):
    """Format one feature as a tab-separated table row.

    Columns: name, product, start, end, strand, Pfam accessions, Pfam names,
    COG name, COG groups, feature type.

    Args:
        feature: object exposing ``pretty_id``, ``more``, ``start``, ``end``,
            ``strand``, ``pfams``, ``id`` and ``type``.
        offset: value subtracted from ``start``/``end`` when positive. For a
            zero or negative offset the coordinates are emitted unshifted.
        thresh: only Pfam hits whose ``'i-Evalue'`` is below this value are
            reported.

    Returns:
        The tab-separated row, without a trailing newline.
    """
    id_parts = feature.pretty_id.split(":")
    name = id_parts[0] + "_" + id_parts[-1]
    # A missing 'product' would be None and make the final join raise.
    product = feature.more.get('product') or ""

    # The original assigned `strand` only when offset > 0, and its else branch
    # read `start` before assignment, so offset <= 0 raised UnboundLocalError.
    strand = feature.strand
    if offset > 0:
        start = feature.start - offset
        end = feature.end - offset
    else:
        start = feature.start
        end = feature.end

    if feature.pfams:
        # Filter the hits once and reuse the accessions for both columns.
        accessions = [
            p.get('query_accession').split(".")[0]
            for p in feature.pfams
            if p.get('i-Evalue') < thresh
        ]
        pfams = ";".join(accessions)
        pfam_names = ";".join([pfam2name[acc] for acc in accessions])
    else:
        pfams = ""
        pfam_names = ""

    cog = COG.find_one({'_feature_list': feature.id})
    data = [
        name,
        product,
        str(start),
        str(end),
        str(strand),
        pfams,
        pfam_names,
        cog.name if cog else "",
        ";".join(cog.group) if cog and cog.group else "",
        feature.type,
    ]
    return "\t".join(data)
Exemplo n.º 3
0
 def FromDict(cls, name, dictionary):
     """Create, name and save a new instance built from ``dictionary``.

     Each ``(key, value)`` pair of ``dictionary`` is turned into a COG via
     ``COG.FromList(key, value)``; the resulting list is stored on the new
     object.

     Args:
         name: name stored on the new object.
         dictionary: mapping whose items are forwarded to ``COG.FromList``.

     Returns:
         The saved instance.
     """
     obj = cls()
     obj.cogs = [COG.FromList(k, v) for k, v in tqdm(dictionary.items())]
     # The original had a bare `obj.name` expression here, which discarded the
     # `name` argument instead of storing it.
     obj.name = name
     obj.save()
     return obj
Exemplo n.º 4
0
    def precluster(self, file, simi = 95):
        """Group genomes into clusters and collect the COG ids seen in each.

        If genomes already carry a ``species_cluster`` index, those stored
        assignments are reused. Otherwise clusters are the connected
        components of a graph built from the pairwise table in ``file``.

        Args:
            file: path to a whitespace-separated table with at least five
                columns per line. The first two are genome identifiers with a
                4-character suffix that is stripped (presumably a file
                extension -- TODO confirm); the next three are numeric.
            simi: an edge is added only when the third column exceeds this
                value and the fourth column exceeds 3/4 of the fifth.

        Returns:
            ``self.cluster2cog``: one list of COG ids per cluster, indexed by
            cluster number.

        Side effects:
            Writes the cluster index to ``species_cluster`` on every genome
            and saves it.
        """
        # Compare against None rather than relying on truthiness: cluster
        # index 0 is a valid assignment but is falsy.
        # NOTE(review): assumes an unset species_cluster reads as None -- confirm.
        if Genome.find_one().species_cluster is not None:
            genomes = list(Genome.find())
            n_clusters = max(g.species_cluster for g in genomes) + 1
            genome_clusters = [[] for _ in range(n_clusters)]
            for g in genomes:
                genome_clusters[g.species_cluster].append(g)
        else:
            with open(file) as handle:
                # Skip blank lines, which would otherwise raise IndexError.
                rows = [line.split() for line in handle if line.strip()]
            hanis = {
                (r[0][:-4], r[1][:-4]): (float(r[2]), float(r[3]), float(r[4]))
                for r in rows
            }

            species_graph = igraph.Graph()
            vertexDeict = {
                v: i
                for i, v in enumerate({x for pair in hanis for x in pair})
            }
            species_graph.add_vertices(len(vertexDeict))
            species_graph.add_edges([
                (vertexDeict[a], vertexDeict[b])
                for (a, b), v in hanis.items()
                if v[0] > simi and a != b and v[1] > (v[2] * 3 / 4)
            ])
            vertex2genome = {
                v: Genome.find_one(ObjectId(k))
                for k, v in vertexDeict.items()
            }
            genome_clusters = [
                [vertex2genome[cc] for cc in c]
                for c in species_graph.components(mode=igraph.WEAK)
            ]

        def cog_ids_per_genome(genome_list):
            # One set of COG ids per genome in the cluster.
            return [
                {c.id for c in COG.find({'_genomes': g.id})}
                for g in tqdm(genome_list)
            ]

        # set().union(*sets) also handles a cluster with no genomes, where
        # set.union(*[]) would raise TypeError.
        self.cluster2cog = [
            list(set().union(*cog_ids_per_genome(v)))
            for v in tqdm(genome_clusters)
        ]
        for i, ll in enumerate(genome_clusters):
            for g in ll:
                g.species_cluster = i
                g.save()
        return self.cluster2cog
Exemplo n.º 5
0
    def duck_silix_loader(cls, file):
        """Build COGs from a two-column cluster membership file.

        Each line of ``file`` holds a cluster name followed by a CDS id,
        separated by whitespace. The CDS ids of a cluster are resolved to CDS
        documents, and each cluster is handed to ``COG.FromList``.

        Returns:
            The list of objects returned by ``COG.FromList``, one per cluster.
        """
        pairs = []
        with open(file) as handle:
            for line in handle:
                fields = line.split()
                pairs.append((fields[0], fields[1]))

        # Group CDS ids by cluster name, keeping first-seen cluster order.
        members = {}
        for cluster_name, cds_id in tqdm(pairs):
            members.setdefault(cluster_name, []).append(cds_id)

        # Swap the raw ids for the matching CDS documents.
        resolved = {}
        for cluster_name, cds_ids in tqdm(members.items()):
            query = {'_id': {'$in': [ObjectId(cds_id) for cds_id in cds_ids]}}
            resolved[cluster_name] = list(CDS.find(query))

        return [
            COG.FromList(cluster_name, cds_list)
            for cluster_name, cds_list in tqdm(resolved.items())
        ]
Exemplo n.º 6
0
 def gff_line(self):
     """Return this feature as one tab-separated, newline-terminated line.

     The nine columns are: contig id, the literal source ``prokka``, the
     literal type ``CDS``, start, end, then ``.`` for score, strand and phase,
     and finally the ``;``-joined attributes. The attributes are the feature
     ID, every ``key=value`` pair from ``self.more`` and, when the feature has
     a COG, its name.
     """
     contig = Genomic.find_one(self.genomic).pretty_id
     begin, stop = str(self.start), str(self.end)

     attributes = ["ID=" + self.pretty_id]
     for key, value in self.more.items():
         attributes.append(key + "=" + value)
     if self.cog:
         # Only the COG name is exported; a COG function attribute is not.
         attributes.append("COG_name=" + COG.find_one(self.cog).name)

     columns = [
         contig,
         "prokka",
         "CDS",
         begin,
         stop,
         ".",  # score
         ".",  # strand
         ".",  # phase
         ";".join(attributes),
     ]
     return "\t".join(columns) + "\n"
Exemplo n.º 7
0
def load_genomes_to_db():
    """Populate the database from genome files, cluster files and Pfam hits.

    Steps, in order:
      1. Register every file in ``genome_files`` as a Genome and parse its
         features with ``Feature.parse_gff``.
      2. Read clusters from ``cluster_file``, keep only members present in
         the per-genome "compliant" FASTA files, match them to stored CDS by
         protein sequence and build COGs via ``COGing.FromDict``.
      3. Attach group names to COGs from the files in ``group_folder``.
      4. Run ``hmmsearch`` against Pfam-A for every genome and store the
         parsed hits on each CDS.
      5. Write one more FASTA file per genome (``.fna``).

    Relies on names defined outside this function: ``genome_files``,
    ``genome_path``, ``compliant_path``, ``cluster_file``, ``group_folder``,
    ``home``, ``temp_hmm``, ``parse_pfam_file``, ``pjoin``.

    A nested helper ``make_tree_fig`` is defined at the end but is neither
    called nor returned here.
    """
    # Short aliases and NCBI accessions for file stems that do not contain "CP".
    im_dict = {
        "IMCC19121": "IMA7",
        "IMCC26103": "IMA4",
        "IMCC25003": "IMA1",
        "IMCC26077": "IMC1"
    }
    im_ncbi = {
        "IMCC19121": "NZ_CP015605",
        "IMCC26103": "NZ_CP015604",
        "IMCC25003": "NZ_CP015603",
        "IMCC26077": "NZ_CP015606"
    }
    # Short name: "CP" + last two characters of the file stem when the name
    # contains "CP", otherwise the alias from im_dict.
    trans_name = lambda name: "CP" + name.split(".")[0][
        -2:] if "CP" in name else im_dict[name.split(".")[0]]
    # NCBI id: "NZ_" + file stem when the name contains "CP", otherwise the
    # accession from im_ncbi.
    get_ncbi = lambda name: "NZ_" + name.split(".")[
        0] if "CP" in name else im_ncbi[name.split(".")[0]]

    ## Load genomes
    for g in genome_files:
        print("processing {genome}".format(genome=g))
        Genome.FromRawFile(file_name=genome_path + g,
                           name=trans_name(g),
                           whatever_id={
                               'ncbi': get_ncbi(g),
                               'short': trans_name(g),
                               'full': g
                           },
                           gtdb_taxon_string=None)
        # The same path is handed to the GFF parser as to FromRawFile.
        Feature.parse_gff(genome_path + g)

    ## Load COGs
    # Sequence id -> sequence for every record of every genome's
    # "<name>.fasta" file under compliant_path.
    compliants = {
        s.id: s.seq
        for g in Genome.find()
        for s in SeqIO.parse(compliant_path + g.name + ".fasta", "fasta")
    }
    # Each line is "<cluster name>: <member> <member> ..."; l[:-1] drops the
    # last character of the line (presumably the newline).
    with open(cluster_file) as handle:
        clusters_lines = {
            l.split(": ")[0]: l[:-1].split(": ")[1].split()
            for l in handle
        }
    # Keep only members that have a compliant sequence, then drop clusters
    # left empty.
    clust_sets = {
        k: [vv for vv in v if vv in compliants]
        for k, v in clusters_lines.items()
    }
    clust_sets = {k: v for k, v in clust_sets.items() if len(v) > 0}
    # Protein string without its last character -> CDS id. CDS with identical
    # proteins share a key, so only the last one seen is kept.
    cds_dict = {str(c.protein[:-1]): c.id for c in CDS.find()}

    # For each cluster member, the cds_dict keys that contain the member's
    # sequence (first and last character stripped) as a substring.
    # NOTE(review): this scans every CDS key for every member.
    soft_keys = {
        b: [k for k in cds_dict.keys() if str(compliants[b])[1:-1] in k]
        for b in tqdm(sum(list(clust_sets.values()), []))
    }
    soft_keys = {k: v for k, v in soft_keys.items() if len(v) > 0}

    # Keep only candidate CDS whose genome `_name` equals the part of the
    # member id before the first "|".
    cleaner = {
        k: [
            cds_dict[vv] for vv in v if Genome.find_one(
                CDS.find_one(cds_dict[vv]).genome)._name == k.split("|")[0]
        ]
        for k, v in soft_keys.items()
    }
    # Replace each member id with its matched CDS ids, flattened per cluster.
    clust_sets = {
        k: sum([cleaner.get(vv) for vv in v if cleaner.get(vv)], [])
        for k, v in clust_sets.items()
    }
    # Resolve the CDS ids to CDS objects.
    clust_sets = {
        k: [CDS.find_one(vv) for vv in v]
        for k, v in clust_sets.items()
    }
    # NOTE(review): `weis` is not used again inside this function.
    weis = COGing.FromDict(clust_sets)

    # Group name (file name minus its last 7 characters) -> ids of the COGs
    # named on each line of that file.
    # NOTE(review): a COG name with no match would make find_one(...).id fail
    # if find_one returns None -- confirm.
    COG_groups = {}
    for p in tqdm(os.listdir(group_folder)):
        with open(pjoin(group_folder, p)) as handle:
            COG_groups[p[:-7]] = [
                COG.find_one({
                    'name': h.split(": ")[0]
                }).id for h in handle.readlines()
            ]

    # Append every group name to its COGs; a string-valued `group` is
    # deleted first so that the attribute ends up as a list.
    for k, v in COG_groups.items():
        for vv in v:
            cog = COG.find_one(vv)
            if type(cog.group) == str:
                del cog.group
            if not cog.group:
                cog.group = []
            cog.group += [k]
            cog.save()

    ## Run hmmsearch against Pfam-A for every genome
    for g in tqdm(Genome.find()):
        pfam_db = pjoin(home, "data/Pfam-A.hmm")
        hmm_file = pjoin(temp_hmm, g.name + ".hmm")
        prot_file = pjoin(temp_hmm, g.name + ".faa")

        g.write_fasta(prot_file)
        # NOTE(review): the command is a shell string built from paths, and
        # the exit status of os.system is ignored.
        exe_string = "hmmsearch --cpu {cpus} --domtblout {out} {db} {seqs} > /dev/null".format(
            cpus=3, out=hmm_file, db=pfam_db, seqs=prot_file)
        os.system(exe_string)

        output = parse_pfam_file(hmm_file)

        # Keys of `output` are passed to CDS.find_one; values are stored as
        # the CDS's pfams.
        for cds, hits in tqdm(output.items()):
            cds_obj = CDS.find_one(cds)
            cds_obj.pfams = hits
            cds_obj.save()


    ## all_v_all blast
    # Only the per-genome FASTA files are written here; no BLAST run is
    # visible in this function.

    for g in tqdm(Genome.find()):
        prot_file = pjoin(temp_hmm, g.name + ".fna")
        g.write_fasta(prot_file, pretty=True)

    def make_tree_fig(tree_file, out_name, tax_level=None):
        """Render the tree on the first line of ``tree_file`` to ``out_name``.

        Returns None without rendering when the file is empty. When
        ``tax_level`` is truthy, leaves are renamed and background-coloured
        using the outer names ``taxas`` and ``cols``, which are not defined in
        this function. Note that ``tax_level=0`` is falsy and therefore skips
        the colouring branch.
        """
        with open(tree_file) as handle:
            lines = handle.readlines()

            if len(lines) > 0:
                # Drop the last character of the first line and rewrite
                # ";IM" -> "-IM" and ";CP" -> "-" before parsing.
                ete_tree = Tree(lines[0][:-1].replace(";IM", "-IM").replace(
                    ";CP", "-"))
            else:
                return None
        if tax_level:
            taxa = {}
            #            taxa = {xx : [x for x in xx.name.replace(" ","-").split("_")[1:] if len(x) > 0 and not x[0].isdigit() ] for xx in ete_tree.get_leaves()}
            for xx in ete_tree.get_leaves():
                # NOTE(review): `id` shadows the builtin of the same name.
                id = xx.name
                taxon = taxas.get(id)
                if taxon:
                    # The inner `if taxon else []` is always true here
                    # because of the enclosing `if taxon:`.
                    xx.name = ";".join([id] + taxon if taxon else [])
                # Leaf -> taxon entry at index tax_level, or None when the
                # taxonomy is missing or too short.
                taxa[xx] = taxon[tax_level] if taxon and len(
                    taxon) > tax_level else None

            for leaf in taxa:
                leaf.set_style(NodeStyle())
                # Colour by taxon when a colour is known, otherwise fall
                # back to the 'acI' colour for leaves whose name contains "acI".
                if taxa.get(leaf) and cols.get(taxa[leaf]):
                    leaf.img_style["bgcolor"] = cols[taxa[leaf]]
                elif "acI" in leaf.name:
                    leaf.img_style["bgcolor"] = cols['acI']
        else:
            taxa = None
        styl = TreeStyle()
        styl.mode = 'c'
        #        styl.arc_start = -180
        #        styl.arc_span = 180 #
        print(out_name)
        # Width grows with the number of leaves (5 units per leaf).
        ete_tree.render(out_name,
                        w=len(ete_tree.get_leaves()) * 5,
                        tree_style=styl)
Exemplo n.º 8
0
    .format(file=seq_file))

# Write one tab-separated table per entry of gi1_features: a header row
# followed by one make_line() row per feature, shifted by mins[k].
for k, v in gi1_features.items():
    file_path = pjoin(gi_path, "210_GI1", k + "_GI1.csv")
    with open(file_path, "w") as handle:
        handle.write(
            "name\tproduct\tstart\tend\tstrand\tpfams\tpfamNames\tcog\tgroup\ttype\n"
        )
        # Build every row before writing, so a failing make_line() leaves
        # only the header in the file.
        handle.write("".join(
            [make_line(feature, mins[k]) + "\n" for feature in v]))

# Ids of every feature in gi1_features.
all_gi_cdss = [
    feature.id
    for features in gi1_features.values()
    for feature in features
]

# For each CDS that has COGs: the members of those COGs that are themselves
# in all_gi_cdss.
links = {
    cds: sum(
        [
            [
                CDS.find_one(member_id)
                for member_id in COG.find_one(cog_id)._feature_list
                if member_id in all_gi_cdss
            ]
            for cog_id in cds.cogs
        ],
        [],
    )
    for island in gi1_features.values()
    for cds in island
    if cds.cogs
}

# Smallest feature start per entry of gi1_features.
gi1_mins = {
    name: min(feature.start for feature in features)
    for name, features in gi1_features.items()
}


def line_maker(cds):
    nome = Genome.find_one(cds.genome).name
    offset = gi1_mins[nome]
    group = ":".join(
        sum([
            COG.find_one(cog).group
            for cog in cds.cogs if COG.find_one(cog).group
        ], []))
Exemplo n.º 9
0
 def cogs(self):
     """Return, as a list, the COG documents whose ``_id`` is in ``self._cogs``."""
     query = {'_id': {'$in': self._cogs}}
     return list(COG.find(query))
Exemplo n.º 10
0
 def __getitem__(self, key):
     """Return the COG stored at integer position ``key`` of ``self._cogs``.

     For any key whose type is not exactly ``int`` a message is printed and
     ``None`` is returned.
     """
     if type(key) is not int:
         print("Not implemented yet for non numeric")
         return None
     return COG.find_one(ObjectId(self._cogs[key]))
Exemplo n.º 11
0
 def completness(self, marker_cogs):
     """Return the fraction of single-copy marker COGs that contain this object.

     A marker COG counts as single-copy when it has as many genomes as
     features. The result is the share of those COGs whose ``genomes``
     include ``self.id``.

     Args:
         marker_cogs: iterable of COG ids, each convertible with ``ObjectId``.

     Returns:
         A float in [0, 1]. Returns 0.0 when no single-copy marker COG is
         left, instead of raising ZeroDivisionError.
     """
     cogs = [COG.find_one(ObjectId(c)) for c in marker_cogs]
     sc_cogs = [c for c in cogs if len(c.genomes) == len(c._feature_list)]
     if not sc_cogs:
         # Nothing to measure against; avoid dividing by zero.
         return 0.0
     hits = sum(self.id in c.genomes for c in sc_cogs)
     return hits / len(sc_cogs)
Exemplo n.º 12
0
    }
    for i in range(2, 7)
}).to_csv("test_data/cluster_metadata.csv")

# Presence/absence matrix: one row per COG in all_cores, one column per
# cluster (labelled by its full tax string), 1 when the COG is in that
# cluster's gene collection.
core_mat = {
    cog: {
        clust.get_tax_string(full=True): int(cog in genes)
        for clust, genes in cumul_core.items()
    }
    for cog in tqdm(all_cores)
}
pandas.DataFrame.from_dict(core_mat,
                           orient="index").to_csv("test_data/cog_md.csv")

# Every COG that appears in any value of tax2cores.
all_core_cogs = [
    COG.find_one(ObjectId(c)) for c in set.union(*tax2cores.values())
]

# COG id -> first entry returned by get_function().
cog2function = {cog.id: cog.get_function()[0] for cog in tqdm(all_core_cogs)}

# Invert to function -> COG ids in a single pass. The original rescanned
# cog2function once per distinct function and took its key order from set
# iteration.
function2cog = {}
for _cog_id, _function in cog2function.items():
    function2cog.setdefault(_function, []).append(_cog_id)
# 'hypothetical protein' is deliberately left with no COGs.
function2cog['hypothetical protein'] = []

for taxon, core in tqdm(hierarch_core.items()):
    data = {}
    cluster_set = tax2cluster[taxon]
    genome_set = [
        g for g in sum([genome_clusters[c] for c in tax2cluster[taxon]], [])
    ]