Пример #1
0
    def annotate_gene(self, reaction, gene_name):
        """Attach ``reaction`` to every stored protein matching ``gene_name``.

        Proteins of ``self.organism`` are looked up by ``gene`` first and, if
        that finds nothing, by a case-insensitive ``alias`` match. Every match
        receives the reaction and gene properties and is saved. A name that
        matches nothing is logged and appended to ``self.unmapped_genes``.
        """
        try:
            matches = Protein.objects(organism=self.organism, gene=gene_name)
            if len(matches) == 0:
                # Nothing under that gene name: retry against the aliases.
                matches = Protein.objects(organism=self.organism,
                                          alias__iexact=gene_name)

            if len(matches) == 0:
                _log.warn("%s not found" % gene_name)
                self.unmapped_genes.append(gene_name)
                return

            for protein in matches:
                if not hasattr(protein, "reactions"):
                    protein.reactions = []
                self.add_reaction_info_to_gene(reaction, protein)
                self.add_properties_to_gene(protein, gene_name)
                protein.save()

        except DoesNotExist:
            self.unmapped_genes.append(gene_name)
            _log.warn("%s not found" % gene_name)
        except MultipleObjectsReturned:
            self.unmapped_genes.append(gene_name)
            _log.warn(gene_name)
Пример #2
0
def update_proteins(annotation_dir,
                    proteome,
                    seq_col_name,
                    tax_id,
                    identity=0.9,
                    cpus=multiprocessing.cpu_count(),
                    db_init=None):
    """Annotate the proteins of ``seq_col_name`` from blastp hits.

    Runs ``blastp`` of ``proteome`` against ``species_fasta`` (unless the
    output table ``out`` already exists), then, for every query whose first
    HSP has ``ident_pct > identity``:

    * fills in an empty protein description from the FASTA description of
      the hit, and
    * applies the ``Mapping`` rows of the hit's UniProt id (formatted as
      ``db||value``) through ``SearchLoader.update_protein_with_dbxref``.

    NOTE(review): ``out``, ``species_fasta``, ``species_tax`` and
    ``tax_data`` are only assigned in the commented-out block below, so as
    written the first ``os.path.exists(out)`` raises ``NameError``.
    ``annotation_dir``, ``tax_id`` and ``db_init`` are referenced only by
    that commented-out block.

    NOTE(review): ``cpus`` defaults to ``multiprocessing.cpu_count()``
    evaluated once, at definition time.
    """

    # Disabled setup code; it is the only place where out, species_fasta,
    # species_tax and tax_data are defined.
    # if db_init:
    #     from SNDG.Sequence.ProteinAnnotator import PABase
    #     PABase.sqldb.initialize(db_init)
    # mkdir(annotation_dir)
    # out = annotation_dir + "/species_blast.tbl"
    #
    # tax = Tax.select().where(Tax.ncbi_taxon_id == tax_id).get()
    # species_tax = None
    # for tax in Tax.parents(tax):
    #     if tax.node_rank == "genus":
    #         species_tax = tax
    #         break
    # tax_data = "/data/xomeq/tax/"
    # species_fasta = tax_data + str(int(species_tax.ncbi_taxon_id)) + ".fasta"

    # Reuse an existing blast table; otherwise build it.
    if not os.path.exists(out):

        # Download the reference proteome only when its FASTA is missing.
        if not os.path.exists(species_fasta):
            Uniprot.download_proteome_from_tax(str(species_tax.ncbi_taxon_id),
                                               tax_data)

        cmd = "blastp -query %s  -db %s -evalue 0.00001 -outfmt 6  -max_hsps 1 -qcov_hsp_perc 0.9 -num_threads %i -out %s"
        execute(cmd % (proteome, species_fasta, cpus, out))
    # Map the second "|"-separated field of each FASTA id to its description
    # without the first word.
    species_desc = {
        x.id.split("|")[1]: " ".join(x.description.split()[1:])
        for x in bpio.parse(species_fasta, "fasta")
    }

    # The protein count is used only as the progress-bar total.
    total = Protein.objects(organism=seq_col_name).count()
    with tqdm(bpsio.parse(out, "blast-tab"), total=total) as pbar:
        for query in pbar:
            pbar.set_description(query.id)
            # Only the first HSP of the first hit is considered.
            # NOTE(review): ident_pct is presumably a 0-100 percentage while
            # the default identity is 0.9 -- confirm the intended scale.
            if query[0][0].ident_pct > identity:

                # Take the second "|" field of the hit id when present.
                unip = query[0].id.split(
                    "|")[1] if "|" in query[0].id else query[0].id
                dbxrefs = [
                    x.db + "||" + x.value
                    for x in Mapping.select().where(Mapping.uniprot == unip)
                ]
                p = Protein.objects(gene=query.id,
                                    organism=seq_col_name).no_cache().get()

                # Never overwrite an existing description.
                if not p.description and unip in species_desc:
                    p.description = species_desc[unip].split(
                        "OS=")[0] + " | homology with: " + unip
                    p.save()

                if dbxrefs:
                    # The helper re-fetches the protein by gene and organism.
                    p = SearchLoader.update_protein_with_dbxref(
                        query.id, dbxrefs, seq_col_name)
                    p.save()
Пример #3
0
    def process(
            self,
            data_csv="/data/projects/23staphylo/raw/metadata/saureus_resist_snps.csv",
            genome="SaureusN315",
            user="******"):
        """Reload the "Aanensen2016" features of ``genome`` from ``data_csv``.

        Existing features of type ``"Aanensen2016"`` are pulled from every
        protein of ``genome`` first. Each row of the frame returned by
        ``self.create_df(data_csv)`` is then handed to ``self._process_prot``
        for every protein whose gene matches the row's ``"Core gene"``
        (case-insensitive). Genes with no matching protein are reported on
        stdout unless their name contains ``"RNA"``.

        ``user`` is accepted for interface compatibility and is not used here.

        Raises:
            AssertionError: if a row has an empty ``"Core gene"`` value.
        """
        # Remove the features of earlier runs so they are not duplicated.
        Protein.objects(organism=genome).update(
            __raw__={"$pull": {
                "features": {
                    "type": "Aanensen2016"
                }
            }})

        df = self.create_df(data_csv)
        for i, r in df.iterrows():
            gene = r["Core gene"].strip()
            assert gene

            has_prot = False
            for prot in Protein.objects(organism=genome, gene__iexact=gene):
                has_prot = True
                self._process_prot(prot, r, i)

            if not has_prot and "RNA" not in r["Core gene"]:
                # print(...) with a single argument behaves the same on
                # Python 2 and Python 3.
                if r["Core gene"]:
                    print("Core gene not found %s" % gene)
                else:
                    print("%s not found" % gene)
Пример #4
0
 def _process_proteins(self, genome):
     """Rebuild the keyword list of every protein of ``genome`` with reactions.

     The keywords from ``self._protein_keywords`` and
     ``self._add_drugability_props_to_protein`` are merged with the protein's
     current keywords, stripped, lower-cased and de-duplicated, and the
     protein is saved. ``self.update_ont_org_idx()`` runs once at the end.
     """
     with_reactions = dict(organism=genome.name, reactions__0__exists=True)
     total = Protein.objects(**with_reactions).count()
     cursor = Protein.objects(**with_reactions).no_cache().timeout(False)

     for protein in tqdm(cursor, total=total):  # @UndefinedVariable
         keywords1 = self._protein_keywords(protein)
         keywords2 = self._add_drugability_props_to_protein(protein)
         merged = keywords1 + keywords2 + protein.keywords
         protein.keywords = list(
             {kw.strip().lower() for kw in merged if kw.strip()})
         protein.save()

     self.update_ont_org_idx()
Пример #5
0
def common_annotations(collection_name, tmp_dir, cpu=1, remove_tmp=False):
    """Run ``_common_annotations`` for the steps not yet done on a collection.

    ``process_pdb`` is True when no protein of ``collection_name`` carries a
    ``polypeptide_structural_motif`` feature, and ``process_hmm`` is True when
    none carries a ``polypeptide_domain`` feature. Both flags are passed
    positionally to ``_common_annotations`` after the other arguments.
    """

    def has_feature(so_name):
        # True when at least one protein of the collection has the feature.
        return bool(
            Protein.objects(
                __raw__={
                    "organism": collection_name,
                    "features.type": SO_TERMS[so_name]
                }).count())

    # Both flags mean "nothing annotated yet". The original left the ``not``
    # off process_pdb, which inverted it relative to process_hmm.
    process_pdb = not has_feature("polypeptide_structural_motif")
    process_hmm = not has_feature("polypeptide_domain")

    _common_annotations(collection_name, tmp_dir, cpu, remove_tmp, process_pdb,
                        process_hmm)
Пример #6
0
    def update_protein_with_dbxref(protein_gene, annotations, organism):
        """Merge ``db||value`` cross-references into a protein document.

        The protein is fetched by ``protein_gene`` and ``organism``. For each
        annotation the part after ``"||"`` is added to:

        * ``ontologies`` and ``keywords`` when the annotation starts with
          ``ec`` or ``go`` (case-insensitive);
        * ``keywords`` and ``alias`` when it starts with one of the database
          prefixes listed below;
        * ``keywords``, ``alias`` and ``gene`` when it starts with one of the
          gene-name prefixes.

        The lists are de-duplicated (``ontologies`` and ``keywords`` are also
        lower-cased) and ``protein_gene`` is kept as the first ``gene`` entry.
        The modified, unsaved protein is returned.
        """
        alias_prefixes = ("EcoGene", "Ensembl_PRO", "Ensembl_TRS",
                          "WormBase_PRO", "WormBase_TRS", "UniGene", "GeneDB",
                          "EuPathDB")
        gene_prefixes = ("Gene_Name", "Gene_OrderedLocusName", "Gene_ORFName")

        p = Protein.objects(gene=protein_gene,
                            organism=organism).no_cache().get()

        for ann in annotations:
            word = ann.split("||")[1]
            lowered = ann.lower()
            if lowered.startswith(("ec", "go")):
                p.ontologies.append(word)
                p.keywords.append(word)
            if ann.startswith(alias_prefixes):
                p.keywords.append(word)
                p.alias.append(word)
            if ann.startswith(gene_prefixes):
                p.keywords.append(word)
                p.alias.append(word)
                p.gene.append(word)

        p.ontologies = list({term.lower() for term in p.ontologies})
        p.keywords = list({kw.lower() for kw in p.keywords})
        p.alias = list(set(p.alias))
        other_genes = [g for g in set(p.gene) if g != protein_gene]
        p.gene = [protein_gene] + other_genes
        return p
Пример #7
0
    def clean_structures(self, organism):
        """Delete redundant structure models of the proteins of ``organism``.

        Two kinds of models are deleted:

        * a model for which more than 80% of its query range (taken from its
          first template) overlaps the feature location of an experimental
          structure of the same protein;
        * the first model of every pair whose first templates share the same
          query start and end.

        Raises:
            IndexError: if an experimental structure has no feature whose
                identifier starts with its name, or a model has no templates.
        """
        proteins = list(
            Protein.objects(organism=organism).no_cache().timeout(False))
        for p in proteins:
            experimentals = p.cristals()
            models = p.models()
            if experimentals and models:
                for model in models:
                    model_query = model.templates[0].aln_query
                    model_range = set(
                        range(model_query.start, model_query.end))
                    if not model_range:
                        # An empty range cannot be covered; this also avoids
                        # dividing by zero below.
                        continue
                    for exp in experimentals:
                        hit = [
                            f for f in p.features
                            if f.identifier.startswith(exp.name)
                        ][0]
                        exp_range = set(
                            range(hit.location.start, hit.location.end))
                        covered = (len(model_range & exp_range) * 1.0 /
                                   len(model_range))
                        if covered > 0.8:
                            model.delete()
                            # One covering crystal is enough; do not delete
                            # the same model again for the remaining ones.
                            break

            if len(models) > 1:
                # Pairs of model indexes. The original passed 2 to range()
                # instead of combinations(), which raised TypeError.
                for i, j in combinations(range(len(models)), 2):
                    first = models[i].templates[0].aln_query
                    second = models[j].templates[0].aln_query
                    if first.start == second.start and first.end == second.end:
                        models[i].delete()
Пример #8
0
def common_annotations(collection_name,
                       tmp_dir,
                       cpu=1,
                       remove_tmp=False,
                       pfam_db="/data/databases/xfam/Pfam-A.hmm",
                       pdbs_path="/data/databases/pdb/pdb_seqres.txt"):
    """Run ``_common_annotations`` for the steps not yet done on a collection.

    ``process_pdb`` is True when no protein of ``collection_name`` has a
    ``polypeptide_structural_motif`` feature; ``process_hmm`` is True when none
    has a ``polypeptide_domain`` feature. Both flags, followed by ``pfam_db``
    and ``pdbs_path``, are forwarded positionally to ``_common_annotations``.
    """

    def count_with_feature(so_name):
        # Number of proteins of the collection carrying that feature type.
        return Protein.objects(
            __raw__={
                "organism": collection_name,
                "features.type": SO_TERMS[so_name]
            }).count()

    process_pdb = not count_with_feature("polypeptide_structural_motif")
    process_hmm = not count_with_feature("polypeptide_domain")

    _common_annotations(collection_name, tmp_dir, cpu, remove_tmp, process_pdb,
                        process_hmm, pfam_db, pdbs_path)
Пример #9
0
    def build_index(self, query=None, error_output="/tmp/index_struct.log"):
        """Rebuild the druggability search data of the collection's proteins.

        Args:
            query: optional raw filter restricting the proteins to process.
                The collection name is always added as ``organism``. The
                dict passed in is copied, not modified. ``None`` or an empty
                dict selects every protein of the collection.
            error_output: file to which one line per failed protein is
                appended (name, organism, id, error).

        Proteins with at least one structure are handed to
        ``self.annotate_protein_with_structures``. The others lose their
        druggability/structure keywords, gain ``"non_druggable"`` and get
        every ``StructuromeIndexer.search_params`` entry reset to ``None``.
        Errors on a single protein are logged and do not stop the run.
        """
        self.update_collection_params()
        if not query:
            proteins = Protein.objects(
                organism=self.collection.name).no_cache().timeout(False)
            prot_count = Protein.objects(organism=self.collection.name).count()
        else:
            # Copy first: the caller's dict must not gain an "organism" key.
            query = dict(query)
            query["organism"] = self.collection.name
            proteins = Protein.objects(__raw__=query).no_cache().timeout(False)
            prot_count = Protein.objects(__raw__=query).count()

        structure_keywords = [
            "poorly_druggable", "druggable", "highly_druggable",
            "has_structure"
        ]
        with tqdm(proteins, total=prot_count) as pbar:
            for protein in pbar:

                pbar.set_description(protein.name)
                try:
                    if not protein.search:
                        protein.search = ProteinDruggabilitySearch()

                    self.initDrugabilitySearch(protein.search)

                    cristals, models = self.get_protein_structures(protein)

                    if len(cristals + models) > 0:
                        self.annotate_protein_with_structures(
                            cristals, models, protein)

                    else:
                        protein.keywords = [
                            x for x in protein.keywords
                            if x not in structure_keywords
                        ]
                        protein.keywords.append("non_druggable")
                        for x in StructuromeIndexer.search_params:
                            protein.search[x[0]] = None
                    protein.save()
                except Exception as ex:
                    # Best effort: record the failure and keep indexing.
                    error_line = protein.name + "," + protein.organism + "," + str(
                        protein.id) + "," + str(ex)
                    _log.warn(error_line)
                    with open(error_output, "a") as h:
                        # One record per line; the original wrote the records
                        # back to back with no separator.
                        h.write(error_line + "\n")
Пример #10
0
def create_proteome(tmp_dir, collection_name):
    """Return the path of the collection's protein FASTA, writing it if needed.

    The file is ``tmp_dir + "/proteins.fasta"``. When it already exists and is
    not empty it is returned as is. Otherwise one record per protein of
    ``collection_name`` is written, using the first gene name as id and an
    empty description.
    """
    fasta_path = tmp_dir + "/proteins.fasta"

    if os.path.exists(fasta_path) and os.path.getsize(fasta_path):
        return fasta_path

    with open(fasta_path, "w") as handle:
        for protein in Protein.objects(organism=collection_name).no_cache():
            record = SeqRecord(id=protein.gene[0],
                               description="",
                               seq=Seq(protein.seq))
            bpio.write(record, handle, "fasta")
    return fasta_path
Пример #11
0
def load_pdb_domains(organism,
                     blast_file,
                     feature_type="SO:0001079",
                     min_identity=0.9,
                     min_query_coverage=0.9,
                     min_hit_coverage=0.9):
    """Add structure features from a domain-vs-PDB BLAST XML to proteins.

    Query ids are expected to end in ``_<pfam>_<start>_<end>``; what precedes
    that suffix is the gene used to look the protein up in ``organism``. For
    each hit (first HSP only) a feature of ``feature_type`` is appended when:

    * the protein has a domain with the same Pfam accession (version
      ignored) whose start and end are within 10 positions of the query's;
    * the protein has no feature of ``feature_type`` with an equivalent
      identifier yet;
    * ``identity(hsp) >= min_identity`` and the share of the domain covered
      by the query range is at least ``min_query_coverage``.

    The new feature's location is shifted by the domain start so it is
    expressed in protein coordinates. ``min_hit_coverage`` is accepted but
    not used. The number of added features is logged.
    """

    def ident_key(fident):
        # Compare identifiers by their first and last two "_" separated parts.
        parts = fident.split("_")
        return "_".join(parts[0:1] + parts[-2:])

    queries = list(bpsio.parse(blast_file, 'blast-xml'))
    features_added = 0
    with tqdm(queries) as pbar:
        # Iterate the progress bar itself so it advances; the original looped
        # over ``queries`` and the bar stayed at zero.
        for query in pbar:
            pbar.set_description(query.id)
            pfam, dnstart, dnend = query.id.split("_")[-3:]
            dnstart, dnend = int(dnstart), int(dnend)
            gene = "_".join(query.id.split("_")[:-3])
            query_positions = set(range(dnstart, dnend))

            proteins = Protein.objects(organism=organism,
                                       gene=gene).no_cache().timeout(False)
            for protein in proteins:
                # Reset per protein so only modified proteins are saved.
                change = False
                for hit in query:
                    hsp = hit[0]
                    candidates = [
                        x for x in protein.domains()
                        if (abs(x.location.start - dnstart) < 10) and (
                            abs(x.location.end - dnend) < 10)
                        and x.identifier.split(".")[0] == pfam.split(".")[0]
                    ]
                    if not candidates:
                        continue
                    dn = candidates[0]

                    already_loaded = any(
                        x.type == feature_type
                        and ident_key(x.identifier) == ident_key(hit.id)
                        for x in protein.features)
                    if already_loaded:
                        continue

                    dn_positions = set(
                        range(dn.location.start, dn.location.end))
                    if not dn_positions:
                        # Zero-length domain: nothing to cover, and the
                        # ratio below would divide by zero.
                        continue
                    dncover = 1.0 * len(dn_positions & query_positions) / (
                        dn.location.end - dn.location.start)
                    if (min_identity <= identity(hsp)) and (
                            dncover >= min_query_coverage):
                        hsp_feature = BioDocFactory.feature_from_hsp(
                            hsp, feature_type)
                        # HSP coordinates are relative to the domain.
                        hsp_feature.location.start += dnstart
                        hsp_feature.location.end += dnstart
                        features_added = features_added + 1
                        change = True
                        protein.features.append(hsp_feature)
                if change:
                    protein.save()

    _log.info("Features added: " + str(features_added))
Пример #12
0
 def properties_from_feature(self,
                             organism,
                             feature_type,
                             value_fn,
                             property_name=None,
                             url_fn=None):
     """Derive a ``BioProperty`` from the first feature of a given type.

     For every protein of ``organism`` having a ``feature_type`` feature, a
     property is built from the first such feature (``value_fn(feature)``,
     plus ``url_fn(feature)`` as url when given). It replaces any existing
     property with the same ``_type`` and, if ``property_name`` is set, the
     same name. The property is named ``property_name``, or ``feature_type``
     when no name is given.

     When ``property_name`` is ``"human_offtarget"`` the matching search
     field is unset on all proteins first, set to ``1 - identity`` on the
     processed ones, and finally set to 1 wherever it is still missing.
     """
     is_offtarget = property_name == "human_offtarget"
     effective_name = property_name if property_name else feature_type

     proteins = Protein.objects(__raw__={
         "organism": organism,
         "features.type": feature_type
     }).no_cache().timeout(False)

     if is_offtarget:
         # Clear the values of previous runs.
         res = self.db.proteins.update(
             {
                 "organism": organism,
                 "search.human_offtarget": {
                     "$exists": True
                 }
             }, {"$unset": {
                 "search.human_offtarget": ""
             }},
             multi=True)
         _log.info(res)

     for p in proteins:
         typed_features = [x for x in p.features if x.type == feature_type]
         if not typed_features:
             continue
         f = typed_features[0]

         bp = BioProperty(
             _type=feature_type,
             property=effective_name,
             value=value_fn(f),
         )
         if url_fn:
             bp.url = url_fn(f)

         # Keep every property except the one being replaced.
         kept = [
             x for x in p.properties
             if x._type != feature_type or (
                 property_name and x.property != property_name)
         ]
         kept.append(bp)
         p.properties = kept

         if is_offtarget:
             p.search.human_offtarget = 1 - f.aln.identity
         p.save()

     if is_offtarget:
         # Proteins left without a value get the default of 1.
         res = self.db.proteins.update(
             {
                 "organism": organism,
                 "search.human_offtarget": {
                     "$exists": False
                 }
             }, {"$set": {
                 "search.human_offtarget": 1
             }},
             multi=True)
         _log.info(res)
Пример #13
0
    def update_genome_props(self):
        """Reset the resistance search parameters of ``self.organism``.

        One boolean-like parameter is defined for generic ``resistance`` and
        one per entry of ``Saureus.drugs``. The ``variant-db`` parameters
        uploaded by ``self.user`` are pulled from the collection. Then, for
        each parameter, the matching search field is set to ``False`` on
        every protein and the parameter is added to the collection if it is
        missing. The search field name is prefixed with ``"<user>."`` unless
        the user is ``"demo"``. The collection is saved at the end.
        """
        user2 = "" if self.user == "demo" else self.user + "."

        def resistance_param(name, description):
            # (name, description, target, type, options, default value,
            #  default operation, default group operation)
            return (name, description, "variant-db",
                    SeqColDruggabilityParamTypes.value, ["true", "false"],
                    "true", "equal", "avg")

        search_params = [
            resistance_param("resistance", "Associated with resistance")
        ]
        search_params.extend(
            resistance_param(drug.lower(),
                             "Associated with " + drug + " resistance")
            for drug in Saureus.drugs)

        SeqCollection.objects(name=self.organism).update(
            __raw__={
                "$pull": {
                    "druggabilityParams": {
                        "target": "variant-db",
                        "uploader": self.user
                    }
                }
            })
        collection = SeqCollection.objects(name=self.organism).get()

        for param in search_params:
            (name, description, target, _type, options, defaultValue,
             defaultOperation, defaultGroupOperation) = param

            Protein.objects(organism=self.organism).update(
                __raw__={"$set": {
                    "search." + user2 + name: False
                }})

            if collection.has_druggability_param(name):
                continue

            dp = SeqColDruggabilityParam(name=name,
                                         description=description,
                                         target=target,
                                         type=_type,
                                         uploader=self.user)
            dp.options = options
            dp.defaultValue = defaultValue
            dp.defaultOperation = defaultOperation
            dp.defaultGroupOperation = defaultGroupOperation
            collection.druggabilityParams.append(dp)

        collection.save()
Пример #14
0
    def load_priam_hits(self, seq_collection_name, path_genomeEnzymes):
        """Add the EC numbers of a tab-separated enzymes file to proteins.

        Each non-blank line supplies an EC number in its first column and a
        protein id as the first space-separated word of its second column.
        ``"ec:<number>"`` is appended to the ``ontologies`` and ``keywords``
        of every protein of ``seq_collection_name`` having that id as alias.

        Raises:
            IndexError: if a non-blank line has fewer than two columns.
        """
        # The context manager closes the file, which the original left open.
        with open(path_genomeEnzymes) as handle:
            for line in handle:
                # Drop the line terminator so it cannot end up in prot_id
                # when the id is the last thing on the line.
                line = line.rstrip("\r\n")
                if not line.strip():
                    continue
                arr_line = line.split("\t")
                ec = "ec:" + arr_line[0]
                prot_id = arr_line[1].split(" ")[0]

                for prot in Protein.objects(organism=seq_collection_name,
                                            alias=prot_id):
                    prot.ontologies.append(ec)
                    prot.keywords.append(ec)
                    prot.save()
Пример #15
0
 def load_from_emapper(self, organism, emapperv2_file):
     """Add the EC and GO terms of an EMapper result file to proteins.

     For every ``locus_tag`` in the parsed data, the protein with that gene
     in ``organism`` gets ``"ec:<number>"`` for each comma-separated entry
     of the record's ``"EC"`` field and the lower-cased entries of its
     ``"GOs"`` field appended to ``ontologies``, and is saved. Empty entries
     are skipped.

     Raises whatever ``Protein.objects(...).get()`` raises when the lookup
     does not return exactly one protein.
     """
     from SNDG.Annotation.EMapper import EMapper
     em = EMapper()
     em.read_file(emapperv2_file)
     for locus_tag, record in em.data.items():
         prot = Protein.objects(organism=organism, gene=locus_tag).get()
         # "".split(",") yields [""], so an empty field would otherwise add
         # a bare "ec:" / "" term.
         for ec in record["EC"].split(","):
             ec = ec.strip()
             if ec:
                 prot.ontologies.append("ec:" + ec)
         for go in record["GOs"].split(","):
             go = go.strip()
             if go:
                 prot.ontologies.append(go.lower())
         prot.save()
Пример #16
0
def load_hmm(organism,
             hmm_file,
             transform_query_regexp=None,
             transform_hit_regexp=None):
    """Load the domain hits of an HMMER3 text report as protein features.

    Args:
        organism: organism whose proteins are updated.
        hmm_file: path of the ``hmmer3-text`` report; must exist.
        transform_query_regexp: optional pattern whose first group extracts
            the protein alias from the query id (case-insensitive).
        transform_hit_regexp: optional pattern whose first group extracts
            the domain name from the hit id (case-insensitive).

    For every HSP, each protein of ``organism`` whose alias equals the query
    name gets a ``polypeptide_domain`` feature built from the alignment. An
    existing domain with the same (transformed) hit name, start and end is
    removed first, so reloading does not duplicate it.

    Raises:
        AssertionError: if ``hmm_file`` does not exist.
        AttributeError: if a supplied pattern does not match an id.
    """
    assert os.path.exists(hmm_file)
    for query in tqdm(bpsio.parse(hmm_file, 'hmmer3-text')):
        for hit in query:
            for hsp in hit:
                gene = query.id
                if transform_query_regexp:
                    gene = re.search(transform_query_regexp, query.id,
                                     re.IGNORECASE).group(1)

                hit_name = hit.id
                # Guard on the hit pattern itself. The original tested the
                # query pattern here, so a hit pattern given alone was
                # ignored and a query pattern given alone passed None to
                # re.search.
                if transform_hit_regexp:
                    hit_name = re.search(transform_hit_regexp, hit_name,
                                         re.IGNORECASE).group(1)

                proteins = Protein.objects(
                    organism=organism, alias=gene).no_cache().timeout(False)
                for protein in proteins:
                    # Replace, rather than duplicate, a domain loaded earlier.
                    dn = [
                        d for d in protein.domains()
                        if (d.identifier == hit_name) and (
                            d.location.start == hsp.query_start) and (
                                d.location.end == hsp.query_end)
                    ]
                    if dn:
                        protein.features.remove(dn[0])

                    hsp_feature = Feature(
                        _id=ObjectId(),
                        location=Location(start=hsp.query_start,
                                          end=hsp.query_end),
                        aln=SimpleAlignment(
                            evalue=hsp.evalue,
                            aln_query=AlnLine(name=hsp.query_id,
                                              seq=str(hsp.aln[0].seq),
                                              start=hsp.query_start,
                                              end=hsp.query_end),
                            aln_hit=AlnLine(name=hsp.hit.id,
                                            seq=str(hsp.aln[1].seq),
                                            start=hsp.hit_start,
                                            end=hsp.hit_end),
                            aln_cd=hsp.aln_annotation["CS"]
                            if "CS" in hsp.aln_annotation else "",
                            aln_pp=hsp.aln_annotation["PP"]
                            if "PP" in hsp.aln_annotation else ""),
                        identifier=hsp.hit.id,
                        type=SO_TERMS["polypeptide_domain"])

                    protein.features.append(hsp_feature)
                    protein.save()
Пример #17
0
def load_blast_features(organism,
                        blast_file,
                        feature_type,
                        min_identity=0,
                        min_query_coverage=0,
                        min_hit_coverage=0):
    """Add features built from a BLAST XML report to the matching proteins.

    The query id is used as the gene to look proteins up in ``organism``.
    For each hit only the first HSP is considered. It is added as a feature
    of ``feature_type`` when identity, query coverage and hit coverage all
    reach their minimum. When only the query coverage is too low, it is
    added once for every protein domain that the HSP's query range covers
    by at least ``min_query_coverage``. The number of added features is
    logged.

    NOTE(review): in the fallback branch the same HSP is appended once per
    covered domain, so a protein can receive identical features. Confirm
    whether a single feature per HSP was intended.
    """
    queries = list(bpsio.parse(blast_file, 'blast-xml'))

    features_added = 0
    for query in tqdm(queries):

        gene = query.id

        proteins = Protein.objects(organism=organism,
                                   gene=gene).no_cache().timeout(False)
        for protein in proteins:
            # Reset per protein so only modified proteins are saved; the
            # original set the flag once per query.
            change = False
            for hit in query:
                hsp = hit[0]
                if ((identity(hsp) >= min_identity)
                        and (coverage(query, hsp) >= min_query_coverage)
                        and (hit_coverage(hit, hsp) >= min_hit_coverage)):

                    hsp_feature = BioDocFactory.feature_from_hsp(
                        hsp, feature_type)
                    features_added = features_added + 1
                    change = True
                    protein.features.append(hsp_feature)
                elif (identity(hsp) >= min_identity) and (hit_coverage(
                        hit, hsp) >= min_hit_coverage):
                    # Query coverage too low overall: accept the HSP if it
                    # still covers enough of a known domain.
                    hsp_positions = set(range(hsp.query_start, hsp.query_end))
                    for dn in protein.domains():
                        posSet = set(range(dn.location.start, dn.location.end))
                        dncover = 1.0 * len(posSet & hsp_positions) / (
                            dn.location.end - dn.location.start)
                        if dncover >= min_query_coverage:
                            hsp_feature = BioDocFactory.feature_from_hsp(
                                hsp, feature_type)
                            features_added = features_added + 1
                            change = True
                            protein.features.append(hsp_feature)
            if change:
                protein.save()

    _log.info("Features added: " + str(features_added))
def main(argv=None):  # IGNORE:C0111
    """Command-line entry point: annotate the structure models of a genome.

    Parses the command line, connects to the genome and structure Mongo
    databases, and for every model yielded by ``StructureAnotator.iterator``
    looks up the protein whose alias is the query name of the model's first
    template, annotates the model with that protein's domains and saves it.
    Models whose protein is not found are logged and skipped.

    ``argv`` is accepted for compatibility and is not used: arguments are
    always read from ``sys.argv`` by ``ArgumentParser.parse_args()``.
    """

    program_version = "v%s" % __version__
    program_build_date = str(__updated__)
    program_version_message = '%%(prog)s %s (%s)' % (program_version, program_build_date)
    program_shortdesc = __import__('__main__').__doc__.split("\n")[1]
    program_license = '''%s

  Created by user_name on %s.
  Copyright 2015 BIA. All rights reserved.

  Licensed under the Apache License 2.0
  http://www.apache.org/licenses/LICENSE-2.0

  Distributed on an "AS IS" basis without warranties
  or conditions of any kind, either express or implied.

USAGE
''' % (program_shortdesc, str(__date__))

    parser = ArgumentParser(description=program_license, formatter_class=RawDescriptionHelpFormatter)
    parser.add_argument("-v", "--verbose", dest="verbose", action="count",
                        help="set verbosity level [default: %(default)s]")

    parser.add_argument("-n", "--name", required=True)
    parser.add_argument("-dir", "--structs_dir", required=True)
    parser.add_argument("-db_structure", "--db_structure",help="Mongo structure db", default='pdb')
    parser.add_argument("-db_genome", "--db_genome",help="Mongo proteins db", default='xomeq')
    parser.add_argument("-host", "--db_host", default='127.0.0.1')
    parser.add_argument('-V', '--version', action='version', version=program_version_message)

    args = parser.parse_args()

    BioMongoDB(args.db_genome)
    db = pymongo.MongoClient(args.db_host)[args.db_structure]

    sa = StructureAnotator(args.structs_dir + "/")
    total = sa.total(db, args.name, {})

    with tqdm(sa.iterator(db, args.name, {}), total=total) as pbar:
        for model in pbar:
            pbar.set_description(model.name)

            template = model.templates[0]

            try:
                protein = Protein.objects(organism=args.name, alias=template.aln_query.name).get()
            except DoesNotExist:
                _log.warn(template.aln_query.name + " does not exists")
                # Skip this model. Falling through would raise NameError on
                # the first iteration, or annotate the model with the
                # domains of the previous iteration's protein.
                continue
            sa.annotate_model(model, protein.domains())
            model.save()
Пример #19
0
    def load_from_interpro(self, organism, interprot_gff):
        """Load the features of an InterProScan GFF file into proteins.

        Reading stops at the first line starting with ``">"``; lines
        starting with ``"##"``, blank lines and ``polypeptide`` rows are
        skipped. Every other row becomes a ``Feature`` (location from the
        start/end columns, identifier from the ``Name`` attribute, type from
        the source column) appended to the protein of ``organism`` whose
        gene is the first column. Lower-cased ``Ontology_term`` and
        ``Dbxref`` entries are appended to the protein's ontologies and a
        ``signature_desc`` attribute is stored as the feature description.
        The protein is saved after every row.

        Raises:
            KeyError: if a feature row has no ``Name`` attribute.
            ValueError: if a row has fewer than eight columns.
        """
        # The context manager closes the file, which the original left open.
        with open(interprot_gff) as handle:
            for l in tqdm(handle):
                if l.startswith(">"):
                    break
                if l.startswith("##"):
                    continue
                # Drop the line terminator so it cannot end up in the value
                # of the last attribute.
                l = l.rstrip("\r\n")
                if not l.strip():
                    continue
                l = l.replace("EC=", "EC ")
                columns = l.split("\t")
                (locus_tag, source, feature_kind, start, end, score, strand,
                 frame) = columns[:8]
                raw_attributes = " ".join(columns[8:])

                if feature_kind == "polypeptide":
                    continue

                start, end = int(start), int(end)

                if "signature_desc=" in raw_attributes:
                    # Escape "=" and ";" inside the free-text description so
                    # they do not break the attribute parsing below.
                    repl = raw_attributes.split("signature_desc=")[1].split(
                        ";Name=")[0]
                    raw_attributes = raw_attributes.replace(
                        repl,
                        repl.replace("=", "%3D").replace(";", "%3B"))

                attributes = {}
                for token in raw_attributes.split(";"):
                    # Split on the first "=" only, and ignore tokens with no
                    # "=" (for example after a trailing ";").
                    key, sep, value = token.partition("=")
                    if sep:
                        attributes[key] = value

                feature = Feature(_id=ObjectId(),
                                  location=Location(start=start, end=end),
                                  identifier=attributes["Name"],
                                  type=source)
                prot = Protein.objects(organism=organism, gene=locus_tag).get()

                if "signature_desc" in attributes:
                    feature.qualifiers = {
                        "description": attributes["signature_desc"]
                    }
                for key in ("Ontology_term", "Dbxref"):
                    if key in attributes:
                        for ont in attributes[key].split(","):
                            ont = ont.replace('"', "").strip()
                            prot.ontologies.append(ont.lower())

                prot.features.append(feature)
                prot.save()
Пример #20
0
    def props_from_dbxref(self, name):
        """Turn the dbxrefs of each protein of ``name`` into a links property.

        Proteins without dbxrefs are left untouched. The others receive a
        ``BioProperty`` (``_type="dbxref"``, ``property="links"``) holding
        the dbxref list, and the part after the first ``":"`` of every
        dbxref starting with ``uniprot`` (case-insensitive) is merged into
        their de-duplicated aliases. Each modified protein is saved.
        """
        for p in Protein.objects(organism=name).no_cache().timeout(False):
            if not p.dbxrefs:
                continue

            links = BioProperty(_type="dbxref",
                                property="links",
                                value=p.dbxrefs)
            uniprot_ids = [
                ref.split(":")[1] for ref in p.dbxrefs
                if ref.lower().startswith("uniprot")
            ]
            p.alias = list(set(p.alias + uniprot_ids))
            p.properties.append(links)
            p.save()
Пример #21
0
    def organism_iterator(self, organism, seq_map=None):
        """Yield one ``SeqRecord`` per stored contig of ``organism``.

        The sequence comes from ``seq_map[contig name].seq`` when ``seq_map``
        is given, otherwise from the stored contig. A ``SeqFeature`` is added
        only for contig features whose identifier matches the gene of a
        stored protein. Its qualifiers carry the locus tag, the protein
        description (also as ``Note``), the gene names, and the upper-cased
        ``ec:`` / ``go:`` ontologies when the protein has any.
        """
        for dbcontig in Contig.objects(organism=organism).no_cache():
            seq = str(seq_map[dbcontig.name].seq) if seq_map else dbcontig.seq
            contig = SeqRecord(id=dbcontig.name, seq=Seq(seq))

            for dbfeature in dbcontig.features:
                qualifiers = {"locus_tag": [dbfeature.locus_tag]}
                hits = list(
                    Protein.objects(organism=organism,
                                    gene=dbfeature.identifier))
                if not hits:
                    # Features without a stored protein are not exported.
                    continue

                prot = hits[0]
                qualifiers["description"] = [prot.description]
                qualifiers["gene_symbol"] = prot.gene
                qualifiers["Note"] = [prot.description]

                ecs = [
                    term.upper() for term in prot.ontologies
                    if term.startswith("ec:")
                ]
                gos = [
                    term.upper() for term in prot.ontologies
                    if term.startswith("go:")
                ]
                if ecs:
                    qualifiers["EC"] = ecs
                if gos:
                    qualifiers["GO"] = gos

                location = FeatureLocation(start=dbfeature.location.start,
                                           end=dbfeature.location.end,
                                           strand=dbfeature.location.strand)
                contig.features.append(
                    SeqFeature(id=dbfeature.identifier,
                               type=dbfeature.type,
                               qualifiers=qualifiers,
                               location=location))
            yield contig
Пример #22
0
def correct_chokes(self, name):
    """Recompute the chokepoint flags of every protein of organism ``name``.

    A metabolite is kept as a chokepoint metabolite when all the reactions
    that consume it (or, separately, all that produce it) share a single
    reaction name.  Every protein having at least one reaction is then
    updated and saved:

    * ``search.chokepoint`` is set to whether any of its reactions consumes
      or produces such a metabolite;
    * ``search.chokepoint_type`` becomes ``"double"``, ``"production"`` or
      ``"consuming"`` (and is deleted for non-chokepoint proteins);
    * the ``"chokepoint"`` property lists the related metabolites (and is
      removed for non-chokepoint proteins).
    """
    metabolites_in = defaultdict(list)
    metabolites_out = defaultdict(list)
    for p in self.db.proteins.find({
            "organism": name,
            "reactions.0": {
                "$exists": True
            }
    }):
        for r in p["reactions"]:
            for m in r["products"]:
                metabolites_out[m["name"]].append(r["name"])
            for m in r["substrates"]:
                metabolites_in[m["name"]].append(r["name"])

    # Keep only metabolites handled by a single distinct reaction.  New
    # dicts are built instead of deleting keys while iterating, which raises
    # RuntimeError on Python 3 (items() is a live view there).
    metabolites_in = {
        m: rs for m, rs in metabolites_in.items() if len(set(rs)) <= 1
    }
    metabolites_out = {
        m: rs for m, rs in metabolites_out.items() if len(set(rs)) <= 1
    }

    # Sets give constant-time membership tests in the per-protein loop.
    choke_reactions_in = set()
    for rs in metabolites_in.values():
        choke_reactions_in.update(rs)
    choke_reactions_out = set()
    for rs in metabolites_out.values():
        choke_reactions_out.update(rs)

    # reaction name -> chokepoint metabolites (consumed first, then produced)
    reaction_metabolites = defaultdict(list)
    for m, rs in metabolites_in.items():
        for r in rs:
            reaction_metabolites[r].append(m)
    for m, rs in metabolites_out.items():
        for r in rs:
            reaction_metabolites[r].append(m)

    for p in Protein.objects(
            organism=name,
            reactions__0__exists=True).no_cache().timeout(False):
        cout = any(r.name in choke_reactions_out for r in p.reactions)
        cin = any(r.name in choke_reactions_in for r in p.reactions)
        p.search.chokepoint = cout or cin
        if p.search.chokepoint:
            p.search.chokepoint_type = "double" if (cout and cin) else (
                "production" if cout else "consuming")

            metabolites = []
            for x in p.reactions:
                if x.name in reaction_metabolites:
                    metabolites += reaction_metabolites[x.name]

            prop = [x for x in p.properties if x.property == "chokepoint"]
            if prop:
                # Reuse the existing property; only its metabolites change.
                prop[0].metabolites = metabolites
            else:
                p.properties.append(
                    BioProperty(_type="pathways",
                                property="chokepoint",
                                metabolites=metabolites,
                                type=p.search.chokepoint_type))
        else:
            del p.search.chokepoint_type
            p.properties = [
                x for x in p.properties if x.property != "chokepoint"
            ]
        p.save()
Пример #23
0
def index_seq_collection(db,
                         genome,
                         ec=True,
                         go=True,
                         keywords=True,
                         organism_idx=True,
                         pathways=True,
                         structure=False,
                         go_obo="/data/databases/go/go.obo"):
    """Build the search indexes of the sequence collection named ``genome``.

    Each flag enables one independent indexing step:

    :param db: database handle passed to the indexers and used for raw
        queries on ``db.proteins``.
    :param genome: name of the ``SeqCollection`` (also the ``organism``
        value of its proteins).
    :param ec: build the EC index with ``EC2Mongo``.
    :param go: build the GO index with ``GO2Mongo`` using ``go_obo``.
    :param keywords: extend ``keywords`` of every protein with words from
        its name, description, genes, ontology terms and feature
        identifiers/types.
    :param organism_idx: store a ``SeqColOntologyIndex`` for every ontology
        term of the proteins that is neither an "ec" nor a "go" ontology.
    :param pathways: build the BioCyc index.
    :param structure: build the structure index with ``StructuromeIndexer``.
    :param go_obo: path of the GO ``.obo`` file.
    """
    collection = SeqCollection.objects(name=genome).get()

    # term -> Ontology document.  Defined up front so the organism index can
    # run even when the keyword step is disabled (it used to raise NameError).
    cache = {}

    def cached_ontology(term):
        """Return the Ontology document of ``term`` (memoised) or None."""
        if term not in cache:
            ont = Ontology.objects(term=str(term))
            if len(ont):
                cache[term] = ont.first()
        return cache.get(term)

    if ec:
        ec2mongo = EC2Mongo(db)
        _log.debug("Building EC index...")
        ec2mongo.pre_build_index(collection)
        collection.ec_index = True
        _log.debug("EC index finished")
        collection.save()

    if go:
        go2mongo = GO2Mongo(go_obo, db)
        go2mongo.init()
        _log.debug("Building GO index...")
        go2mongo.pre_build_index(collection)
        collection.go_index = True
        collection.save()
        _log.debug("GO index finished")

    if structure:
        si = StructuromeIndexer(collection)
        si.build_index()

    if pathways:
        biocyc = BioCyc(db)
        biocyc.user = BioMongoDB.demo
        _log.debug("Building Biocyc index...")
        biocyc.pre_build_index(collection)
        _log.debug("Biocyc index finished")

    if keywords:

        _log.debug("indexing by keyword...")
        ki = KeywordIndexer()
        total_p = db.proteins.count({"organism": genome})
        with tqdm(Protein.objects(organism=genome).no_cache().timeout(False),
                  total=total_p) as pbar:

            for prot in pbar:
                pbar.set_description(prot.name)
                # Basic keywords: words of the name, description and genes.
                texts = [prot.name, prot.description] + prot.gene
                current_keywords = list({
                    word.lower().strip()
                    for text in texts
                    for word in ki.extract_keywords(text)
                })

                prot.keywords = current_keywords + prot.keywords
                # Ontology keywords: ontology terms plus the identifier and
                # type of every feature that has an identifier.
                terms = prot.ontologies
                terms = terms + [
                    x.identifier.strip().lower()
                    for x in prot.features if x.identifier
                ]
                terms = terms + [
                    x.type.strip().lower()
                    for x in prot.features if x.identifier
                ]
                terms = list(set([x.lower() for x in terms]))

                for term in terms:
                    ontology_doc = cached_ontology(term)
                    if ontology_doc is not None:
                        prot.keywords = prot.keywords + ontology_doc.keywords
                    # SO:0001060 missense_variant

                prot.keywords = list(set(prot.keywords + terms))
                prot.save()
        _log.debug("Keyword index finished")

    if organism_idx:
        _log.debug("indexing ontology by organism")
        # With the keyword step enabled only the terms it resolved are
        # indexed (unchanged behaviour); without it the terms are resolved
        # here on demand.
        lookup = cache.get if keywords else cached_ontology
        prots = list(
            db.proteins.find({
                "organism": genome,
                "ontologies.0": {
                    "$exists": True
                }
            }))
        for prot in tqdm(prots):
            for term in prot["ontologies"]:
                ontology_doc = lookup(term)
                if (ontology_doc is not None
                        and ontology_doc.ontology not in ["ec", "go"]):
                    seq_col_ont_idx = SeqColOntologyIndex(
                        seq_collection_id=collection.id,
                        term=term,
                        seq_collection_name=genome,
                        name=ontology_doc.name,
                        ontology=ontology_doc.ontology,
                        keywords=ontology_doc.keywords)
                    seq_col_ont_idx.save()
        SeqColOntologyIndex.objects(count=0).delete()
        _log.debug("Organism index finished")
    collection.save()
    # The genome name was concatenated after the literal "%s" before.
    _log.info("indexing %s finished" % genome)
Пример #24
0
    def load_in_sndg(self, organism="H37Rv"):
        """Store the variants of ``self._df`` as protein features.

        Steps, all restricted to proteins of ``organism``:

        1. remove every existing feature of type ``"tbdream"``;
        2. for ``"resistance"`` and for each drug in ``TBDream.drugs``, reset
           ``search.<name>`` to ``False`` and register the druggability
           parameter in the sequence collection when it is missing;
        3. for each group of rows sharing the ``rv`` column, look the protein
           up by gene (case-insensitive) and append one ``"tbdream"`` feature
           per row, flagging ``search.resistance`` and ``search[<drug>]``.

        Rows whose position cannot be determined are skipped with a warning.
        """
        from SNDG.BioMongo.Model.Protein import Protein
        from SNDG.BioMongo.Model.Feature import Feature, Location
        from SNDG.BioMongo.Model.SeqCollection import SeqCollection
        from SNDG.BioMongo.Model.SeqColDruggabilityParam import SeqColDruggabilityParamTypes, SeqColDruggabilityParam

        from bson.objectid import ObjectId

        # (name, description, target, type, options, default value,
        #  default operation, default group operation)
        search_params = [("resistance", "Associated with resistance", "variant-db",
                          SeqColDruggabilityParamTypes.value, ["true", "false"], "true", "equal", "avg")

                         ]
        search_params = search_params + [
            (x, "Associated with " + x + " resistance", "variant-db",
             SeqColDruggabilityParamTypes.value, ["true", "false"], "true", "equal", "avg")
            for x in TBDream.drugs
        ]

        Protein.objects(organism=organism).update(__raw__={"$pull": {"features": {"type": "tbdream"}}})
        collection = SeqCollection.objects(name=organism).get()
        for name, description, target, _type, options, defaultValue, defaultOperation, defaultGroupOperation in search_params:
            Protein.objects(organism=organism).update(__raw__={"$set": {"search." + name: False}})
            if not collection.has_druggability_param(name):
                dp = SeqColDruggabilityParam(name=name, description=description, target=target,
                                             type=_type, uploader="demo")
                dp.options = options
                dp.defaultValue = defaultValue
                dp.defaultOperation = defaultOperation
                dp.defaultGroupOperation = defaultGroupOperation
                collection.druggabilityParams.append(dp)
        collection.save()

        for rv, rows in self._df.groupby("rv"):
            prot = list(Protein.objects(organism=organism, gene__iexact=rv))
            if prot:
                prot = prot[0]
                for _, r in rows.iterrows():
                    mut = None
                    if r.change:
                        change = str(r.change[0]) + "/" + str(r.change[1])
                        mut = SeqUtils.seq1(r.change[1])
                    else:
                        change = r.AminoAcid
                    if math.isnan(r.codon):
                        # No codon: fall back to the AminoAcid column.
                        try:
                            pos = int(r.AminoAcid)
                        except Exception:
                            # Best effort: skip the row, but let
                            # KeyboardInterrupt/SystemExit propagate.
                            _log.warn("couldnt find the variant position")
                            continue
                    else:
                        pos = int(r.codon)

                    # RTotalIsolates is parsed as "<resistant>/<total>";
                    # anything else leaves both values unset.
                    try:
                        res, t = r.RTotalIsolates.strip().split("/")
                        r_div_total_coef = int(res) * 1.0 / int(t)
                        r_div_total = r.RTotalIsolates.strip()

                    except Exception:
                        r_div_total = None
                        r_div_total_coef = None

                    quals = {
                        "drug": r.Drug,
                        "change": change,
                        "gene": r.GeneID,
                        "pattern": r.ResistancePattern,
                        "additional": r.AdditionalMutations,
                        "r_div_total": r_div_total,
                        "r_div_total_coef": r_div_total_coef,
                        "mic": r.MIC}
                    if mut:
                        quals["mut"] = mut
                    fvariant = Feature(_id=ObjectId(), location=Location(start=pos, end=pos), type="tbdream",
                                       identifier="TBDream id " + r.ID,
                                       qualifiers=quals)
                    prot.features.append(fvariant)
                    prot.search.resistance = True
                    prot.search[r.Drug] = True
                prot.save()
Пример #25
0
            # File name without directories, then without the ".pdb" suffix.
            model_filename = model_file.split("/")[-1]
            model_name = model_filename.split(".pdb")[0]

            hunt_pockets(model_file)
            build_assessments(os.path.dirname(model_file),
                              os.path.basename(model_filename))

            # The model name is split on "_": the last four fields name the
            # template, everything before them names the sequence.
            seq_name = "_".join(model_name.split("_")[:-4])
            org_model_name = model_name
            template_name = "_".join(model_name.split("_")[-4:])

            # Skip models that are already stored for this organism.
            if ModeledStructure.objects(organism=organism,
                                        name=model_name).count():
                continue

            prot = list(Protein.objects(organism=organism, gene=seq_name))
            if len(prot) == 0:
                _log.warn("Not found: " + seq_name)
                continue

            # First element of the hit whose id equals the template name in
            # the sequence's "profile_search.xml" (read as "blast-xml").
            # Raises IndexError when no hit matches.
            aln = [
                hit[0] for hit in list(
                    bpsio.read(
                        basepath + "/" + seq_name +
                        "/profile_search.xml", "blast-xml"))
                if hit.id == template_name
            ][0]

            # Assessments are loaded from the "<model_file>.json" file.
            with open(model_file + ".json") as h:
                assessments = json.load(h)
            pockets = []
Пример #26
0
#             pockets_json = model_file + ".pocket.json"
#             if os.path.exists(pockets_json):
#                 rss = StructureAnotator.pocket_residue_set(pockets_json, model.get_atoms())
#                 strdoc.pockets = rss
#             strdoc.save()
#         except Exception as ex:
#             _log.error(ex)
#
# print ("OK!")

# Annotate every stored model of the organism with the domains of the
# protein whose alias equals the query name of the model's first template.
# The structure file is the first "*.pdb" found under
# <wd>/<query name>/<query name>/<model name>/.
sa = StructureAnotator(basepath,
                       struct_path=lambda wd, modeldoc: glob("/".join([
                           wd, modeldoc.templates[0].aln_query.name, modeldoc.
                           templates[0].aln_query.name, modeldoc.name, "*.pdb"
                       ]))[0])
total = sa.total(db, organism, {})

with tqdm(sa.iterator(db, organism, {}), total=total) as pbar:
    for model in pbar:
        pbar.set_description(model.name)

        template = model.templates[0]
        try:
            protein = Protein.objects(organism=organism,
                                      alias=template.aln_query.name).get()
        except Exception:
            # Missing or ambiguous protein: report it and go on.  Catching
            # Exception (not a bare except) keeps Ctrl-C working.
            print(template.aln_query.name + " does not exists")
            continue
        sa.annotate_model(model, protein.domains())
        model.save()
Пример #27
0
def annotate_variants_with_prots(organism_name, dbs, drugs, force=False):
    """Copy protein annotations to variants and flag reported resistance.

    Iterates the proteins of ``organism_name`` that have at least one feature
    with a ``strain`` qualifier and, for every ``Variant`` whose gene is one
    of the protein genes:

    * when the variant has no ``search`` document (or ``force`` is true),
      copies the protein ``search`` (without ``structures``) and its
      ontologies plus the pathways of its reactions;
    * resets ``search[<drug>]`` for every drug and ``search["resistance"]``;
    * for each database in ``dbs`` and each sample allele, sets
      ``search["resistance"]`` when the protein has a feature of that type
      at the allele position with qualifiers, and sets ``search[<drug>]``
      when such a feature also describes the same amino-acid change.

    :param organism_name: organism of the proteins and variants.
    :param dbs: feature types to look for (variant database names).
    :param drugs: list of strings, example TBDream.drugs or Saureus.drugs
    :param force: overwrite the search/ontologies of variants that already
        have a ``search`` document.
    """
    query = {
        "organism": organism_name,
        "features.qualifiers.strain": {
            "$exists": 1
        }
    }
    for idx, p in enumerate(Protein.objects(__raw__=query).no_cache()):
        # Function-call form works on both Python 2 and Python 3.
        print(idx)
        pvariants = list(
            Variant.objects(organism=organism_name, gene__in=p.gene))

        for vd in pvariants:

            if (vd.search is None) or force:

                psearch = p.search
                del psearch.structures
                vd.search = psearch
                vd.ontologies = p.ontologies

                for r in p.reactions:
                    for pw in r.pathways:
                        vd.ontologies.append(pw)
                vd.ontologies = list(set(vd.ontologies))

            for drug in drugs:
                vd.search[drug] = False
            vd.search["resistance"] = False
            for db in dbs:

                for sample_allele in vd.sample_alleles:
                    aa_pos = sample_allele.aa_pos
                    # Features of this database at the allele position.
                    at_position = [
                        f for f in p.features
                        if (f.type == db) and (f.location.start == aa_pos)
                    ]
                    if not at_position:
                        continue
                    if at_position[0]._data["qualifiers"]:
                        vd.search["resistance"] = True

                    # Among those, the ones with a "mut" qualifier that
                    # report the same alternative residue or, for
                    # frameshift alleles, a "frameshift" change.
                    same_change = [
                        f for f in at_position
                        if ("mut" in f._data["qualifiers"]) and (
                            (f._data["qualifiers"]["mut"] ==
                             sample_allele.aa_alt) or (
                                 (f._data["qualifiers"]["change"].lower()
                                  == "frameshift" and sample_allele.
                                  variant_type == "frameshift_variant")))
                    ]
                    if same_change:
                        sample_allele.feature = same_change[0]
                        if sample_allele.feature._data["qualifiers"]:
                            vd.search[sample_allele.feature._data[
                                "qualifiers"]["drug"]] = True  #
            vd.save()
Пример #28
0
def annotate_variants(organism_name, strain_name, database, parse_change):
    """Flag proteins whose strain variants are reported in ``database``.

    Two boolean search fields are maintained on the proteins of
    ``organism_name``: ``<strain>_<database>`` (same position and same
    change) and ``<strain>_<database>_pos`` (same position).  Both are reset
    to ``False`` first and registered as druggability parameters of the
    sequence collection when missing.

    organism_name: name of the SeqCollection / organism of the proteins
    strain_name: value of the "strain" qualifier of the strain variants
    database: feature type of the reference variants to compare against
    parse_change: function that transforms  dbvar.qualifiers["change"] into aa_ref, aa_alt
    """

    collection = SeqCollection.objects(name=organism_name).get()

    # Parameter 1: exact variant (position and change) reported in database.
    prop = strain_name + "_" + database
    Protein.objects(organism=organism_name).update(
        __raw__={"$set": {
            "search." + prop: False
        }})
    if not collection.has_druggability_param(prop):
        dp = SeqColDruggabilityParam(name=prop,
                                     description="Variant in strain " +
                                     strain_name + " is reported in " +
                                     database,
                                     target="variant-strain",
                                     type=SeqColDruggabilityParamTypes.value,
                                     uploader="demo")
        dp.options = ["true", "false"]
        dp.defaultValue = "true"
        dp.defaultOperation = "equal"
        dp.defaultGroupOperation = "avg"
        collection.druggabilityParams.append(dp)
    # Parameter 2: only the position of the variant is reported in database.
    prop = strain_name + "_" + database + "_pos"
    Protein.objects(organism=organism_name).update(
        __raw__={"$set": {
            "search." + prop: False
        }})
    if not collection.has_druggability_param(prop):
        dp = SeqColDruggabilityParam(
            name=prop,
            description="The position of the variant the strain " +
            strain_name + " is reported in " + database,
            target="variant-strain",
            type=SeqColDruggabilityParamTypes.value,
            uploader="demo")
        dp.options = ["true", "false"]
        dp.defaultValue = "true"
        dp.defaultOperation = "equal"
        dp.defaultGroupOperation = "avg"
        collection.druggabilityParams.append(dp)
    collection.save()

    # Only proteins having some feature with this strain qualifier.
    for p in Protein.objects(__raw__={
            "organism": organism_name,
            "features.qualifiers.strain": strain_name
    }).no_cache():
        # Reference variants: features whose type is the database name.
        dbvars = [f for f in p.features if f.type == database]
        dirty = False
        if dbvars:
            strainvars = [
                f for f in p.features if (f.type == "strain_variant") and (
                    f._data["qualifiers"]["strain"] == strain_name)
            ]

            for dbvar in dbvars:
                # The protein is saved whenever it has reference variants,
                # even if no strain variant matches.
                dirty = True
                for strainvar in strainvars:
                    # NOTE(review): ref_pos is reset for every
                    # (dbvar, strainvar) pair, so a match recorded for an
                    # earlier dbvar is overwritten with False by a later
                    # dbvar at another position - confirm this is intended.
                    strainvar._data["qualifiers"]["ref_pos"] = False
                    if dbvar.location.start == strainvar.location.start:
                        p.search[strain_name + "_" + database + "_pos"] = True
                        strainvar._data["qualifiers"]["ref_pos"] = dbvar._id

                        # Compare reference/alternative residues; the strain
                        # change is expected as "<ref>/<alt>".  Parsing
                        # failures are logged and ignored.
                        try:
                            dref, dalt = parse_change(
                                dbvar._data["qualifiers"]["change"])
                            sref, salt = strainvar._data["qualifiers"][
                                "change"].strip().split("/")
                            sref = sref.strip()
                            salt = salt.strip()
                            if (dref == sref) and (dalt == salt):
                                p.search[strain_name + "_" + database] = True
                                strainvar._data["qualifiers"][
                                    "ref"] = dbvar._id

                        except Exception as ex:
                            _log.warn(ex)
                        # Two frameshifts at the same position also count as
                        # the same variant ("ref" is not set in this case).
                        if (("frameshift"
                             in dbvar._data["qualifiers"]["change"].lower())
                                and
                            ("frameshift" in strainvar._data["qualifiers"]
                             ["change"].lower())):
                            p.search[strain_name + "_" + database] = True
            if dirty:
                p.save()
Пример #29
0
 def protein_fasta(outfile_path, organism):
     """Write the proteins of ``organism`` to ``outfile_path`` as FASTA.

     Each record uses the first gene name of the protein as its id and an
     empty description.
     """
     with open(outfile_path, "w") as fasta_handle:
         proteins = Protein.objects(organism=organism).no_cache()
         for protein in proteins:
             record = SeqRecord(id=protein.gene[0],
                                description="",
                                seq=Seq(protein.seq))
             bpio.write(record, fasta_handle, "fasta")
Пример #30
0
    def load_metadata(self, organism_name, datafile, uploader=demo):
        """Load per-gene properties from a tabular file into the proteins.

        ``datafile`` is read with ``pandas.read_table`` (lines starting with
        "#" are comments).  The column ``BioMongoDB.GENE_FIELD_IMPORT``
        identifies the protein by alias; every other column is a property.
        Previous values of those properties uploaded by ``uploader`` are
        removed, the new values are pushed to ``properties`` and mirrored in
        ``search.<property>``, and a druggability parameter per property is
        created or updated in the sequence collection.  Rows with an empty
        or unknown gene are recorded in the errors of the stored upload.
        """
        import pandas as pd
        from tqdm import tqdm

        seqCollection = list(SeqCollection.objects(name=organism_name))[0]
        errors = []

        upload = DataUpload(uploader=uploader, errors=errors)

        df = pd.read_table(datafile, comment="#", index_col=False)

        property_names = [
            column for column in df.columns
            if column != BioMongoDB.GENE_FIELD_IMPORT
        ]

        # Drop whatever this uploader stored before for the same properties.
        organism_proteins = Protein.objects(organism=organism_name)
        for property_name in property_names:
            cleanup = {
                "$pull": {
                    "properties": {
                        "property": property_name,
                        "_type": uploader
                    }
                },
                "$unset": {
                    "search." + property_name: ""
                }
            }
            organism_proteins.update(__raw__=cleanup)

        upload.properties = property_names

        # Numeric columns stay as they are; the rest become categories.
        numeric_columns = []
        for column, dtype in dict(df.dtypes).items():
            if dtype in [np.float64, np.int64]:
                numeric_columns.append(column)
            else:
                df[column] = df[column].astype('category')

        assert BioMongoDB.GENE_FIELD_IMPORT in df.columns

        for linenum, fields in tqdm(df.iterrows()):

            gene = fields[BioMongoDB.GENE_FIELD_IMPORT]

            if not gene:
                errors.append(str(linenum) + " gene field is empty")
                continue

            found = Protein.objects(organism=organism_name,
                                    alias=gene).count()
            if not found:
                text = (str(linenum) + " " + gene + " does not exists in " +
                        organism_name)
                print(text)
                errors.append(text)
                continue

            gene_proteins = Protein.objects(organism=organism_name,
                                            alias=gene)

            for property_name in property_names:
                value = fields[property_name]
                new_property = {
                    "_type": uploader,
                    "value": value,
                    "property": property_name
                }
                gene_proteins.update(
                    __raw__={
                        "$push": {
                            "properties": new_property
                        },
                        "$set": {
                            "search." + property_name: value
                        }
                    })

        for property_name in property_names:
            if property_name in numeric_columns:
                dpType = "number"
                options = []
            else:
                dpType = "value"
                options = list(set(df[property_name]))

            existing = seqCollection.druggabilityParam(property_name,
                                                       uploader)
            if existing:
                current = existing[0]
                current.options = options
                current.type = dpType
            else:
                seqCollection.druggabilityParams.append(
                    SeqColDruggabilityParam(type=dpType,
                                            name=property_name,
                                            options=options,
                                            uploader=uploader,
                                            target="protein"))

        seqCollection.uploads.append(upload)
        seqCollection.save()