def get_ontology(self, data_type: DataType, provider=None):
        """Build an ``Ontology`` from the term pairs stored in Neo4j.

        The pairs query template (``self.get_ontology_pairs_query``) is
        formatted with "GO" or "DO" for those data types. For expression
        data it is formatted with the value that
        ``EXPRESSION_PRVD_SUBTYPE_MAP`` holds for ``provider``; when the
        provider is not in that map no query is run and no pairs are loaded.

        Every returned pair contributes both of its terms plus a parent link
        from term1 to term2. The link relation is "subClassOf" when the
        pair's ``rel_type`` is "IS_A" and "BFO:0000050" otherwise.

        For expression data from "MGI" or "FB", extra artificial nodes are
        attached below a fixed root term after the pairs have been loaded.

        :param data_type: kind of ontology to build
        :param provider: data provider; only consulted for expression data
        :return: the populated ontology
        """
        ontology = Ontology()

        # Decide which label, if any, is substituted into the query template.
        run_query = True
        label = None
        if data_type == DataType.GO:
            label = "GO"
        elif data_type == DataType.DO:
            label = "DO"
        elif data_type == DataType.EXPR and provider in EXPRESSION_PRVD_SUBTYPE_MAP:
            label = EXPRESSION_PRVD_SUBTYPE_MAP[provider]
        else:
            run_query = False

        terms_pairs = []
        if run_query:
            terms_pairs = Neo4jHelper.run_single_parameter_query(
                self.get_ontology_pairs_query.format(label, label), None)

        for pair in terms_pairs:
            # Register both ends of the relationship before linking them.
            for prefix in ("term1", "term2"):
                self.add_neo_term_to_ontobio_ontology_if_not_exists(
                    pair[prefix + ".primaryKey"],
                    pair[prefix + ".name"],
                    pair[prefix + ".type"],
                    pair[prefix + ".isObsolete"],
                    ontology)
            if pair["rel_type"] == "IS_A":
                relation = "subClassOf"
            else:
                relation = "BFO:0000050"
            ontology.add_parent(pair["term1.primaryKey"],
                                pair["term2.primaryKey"],
                                relation=relation)

        if data_type == DataType.EXPR:
            if provider == "MGI":
                for node_id, node_name in (("EMAPA_ARTIFICIAL_NODE:99999", "embryo"),
                                           ("EMAPA_ARTIFICIAL_NODE:99998", "head")):
                    self.add_neo_term_to_ontobio_ontology_if_not_exists(
                        node_id, node_name, "anatomical_structure", False, ontology)
                    ontology.add_parent(node_id, "EMAPA:0", relation="subClassOf")
                GeneDescriptionsETL.add_neo_term_to_ontobio_ontology_if_not_exists(
                    "EMAPA_ARTIFICIAL_NODE:99997", "gland", "anatomical_structure",
                    False, ontology)
                ontology.add_parent("EMAPA_ARTIFICIAL_NODE:99997", "EMAPA:0",
                                    relation="subClassOf")
            elif provider == "FB":
                GeneDescriptionsETL.add_neo_term_to_ontobio_ontology_if_not_exists(
                    "FBbt_ARTIFICIAL_NODE:99999", "organism", "", False, ontology)
                ontology.add_parent("FBbt_ARTIFICIAL_NODE:99999", "FBbt:10000000",
                                    relation="subClassOf")

        return ontology
Exemplo n.º 2
0
    def delete_empty_nodes(self):
        """Delete DOTerm nodes whose only property is ``primaryKey``.

        The Cypher statement matches DOTerm nodes that have exactly one
        property key, named 'primaryKey', and detach-deletes them.
        """
        self.logger.debug("delete empty nodes")

        cypher = """
                MATCH (dd:DOTerm)
                WHERE keys(dd)[0] = 'primaryKey'
                      AND size(keys(dd)) = 1
                DETACH DELETE (dd)"""
        Neo4jHelper.run_single_query(cypher)
    def get_generators(self, data_provider, gd_data_manager, gd_config,
                       json_desc_writer):
        """Generate gene descriptions for every gene of ``data_provider``.

        For "HUMAN" the genes come from ``self.get_all_genes_human_query``
        run with "RGD", and the ``Gene`` id is prefixed with "RGD:". The
        ``GeneDescription`` always keeps the unprefixed primary key. Any
        other provider uses ``self.get_all_genes_query`` with no prefix.

        Each gene goes through the gene ontology, expression and disease
        modules. The orthology module is applied only when the gene id has
        an entry in the best-orthologs mapping. Every description is handed
        to ``json_desc_writer``; only non-empty ones are collected.

        :param data_provider: provider whose genes are processed
        :param gd_data_manager: data manager passed to the description modules
        :param gd_config: configuration passed to the description modules
        :param json_desc_writer: writer that receives every gene description
        :return: generator yielding one batch: a one-element list holding the
            list of ``{"genePrimaryKey", "geneDescription"}`` dicts
        """
        is_human = data_provider == "HUMAN"
        if is_human:
            return_set = Neo4jHelper.run_single_parameter_query(
                self.get_all_genes_human_query, "RGD")
            gene_prefix = "RGD:"
        else:
            return_set = Neo4jHelper.run_single_parameter_query(
                self.get_all_genes_query, data_provider)
            gene_prefix = ""

        descriptions = []
        best_orthologs = self.get_best_orthologs_from_db(
            data_provider=data_provider)

        for record in return_set:
            primary_key = record["g.primaryKey"]
            gene = Gene(id=gene_prefix + primary_key,
                        name=record["g.symbol"],
                        dead=False,
                        pseudo=False)
            gene_desc = GeneDescription(gene_id=primary_key,
                                        gene_name=gene.name,
                                        add_gene_name=False,
                                        config=gd_config)

            set_gene_ontology_module(dm=gd_data_manager,
                                     conf_parser=gd_config,
                                     gene_desc=gene_desc,
                                     gene=gene)
            set_expression_module(df=gd_data_manager,
                                  conf_parser=gd_config,
                                  gene_desc=gene_desc,
                                  gene=gene)
            set_disease_module(df=gd_data_manager,
                               conf_parser=gd_config,
                               gene_desc=gene_desc,
                               gene=gene,
                               human=is_human)

            if gene.id in best_orthologs:
                # Entry layout: [selected orthologs, excluded-orthologs flag].
                ortholog_entry = best_orthologs[gene.id]
                gene_desc.stats.set_best_orthologs = ortholog_entry[0]
                set_alliance_human_orthology_module(
                    orthologs=ortholog_entry[0],
                    excluded_orthologs=ortholog_entry[1],
                    gene_desc=gene_desc,
                    config=gd_config)

            if gene_desc.description:
                descriptions.append({
                    "genePrimaryKey": gene_desc.gene_id,
                    "geneDescription": gene_desc.description
                })
            json_desc_writer.add_gene_desc(gene_desc)

        yield [descriptions]
 def get_best_orthologs_from_db(data_provider):
     """Select, for each gene, the orthologs with the most supporting rows.

     Runs ``GeneDescriptionsETL.get_filtered_human_orthologs_query`` for
     ``data_provider`` and counts how many result rows exist for every
     (geneId, orthoId) combination. For each gene, the orthologs whose count
     equals the highest count for that gene are kept.

     :param data_provider: provider passed as the query parameter
     :return: dict mapping gene id to a two-element list
         ``[selected, excluded]``. ``selected`` is a list of
         ``[ortholog_id, ortholog_symbol, ortholog_name]`` lists and
         ``excluded`` is True when at least one ortholog of the gene was
         left out of ``selected``.
     """
     orthologs_set = Neo4jHelper.run_single_parameter_query(
         GeneDescriptionsETL.get_filtered_human_orthologs_query,
         data_provider)

     # gene id -> ortholog id -> number of rows seen for that pair
     genes_orthologs_algos = defaultdict(lambda: defaultdict(int))
     # ortholog id -> (symbol, name), taken from the first row mentioning it
     orthologs_info = {}
     for ortholog_algo in orthologs_set:
         ortholog_id = ortholog_algo["orthoId"]
         genes_orthologs_algos[ortholog_algo["geneId"]][ortholog_id] += 1
         if ortholog_id not in orthologs_info:
             orthologs_info[ortholog_id] = (
                 ortholog_algo["orthoSymbol"], ortholog_algo["orthoName"])

     best_orthologs = {}
     for gene_id, ortholog_counts in genes_orthologs_algos.items():
         # Compute the top count once per gene instead of once per ortholog.
         top_count = max(ortholog_counts.values())
         selected = [
             [ortholog_id,
              orthologs_info[ortholog_id][0],
              orthologs_info[ortholog_id][1]]
             for ortholog_id, count in ortholog_counts.items()
             if count == top_count
         ]
         some_excluded = len(selected) != len(ortholog_counts)
         best_orthologs[gene_id] = [selected, some_excluded]
     return best_orthologs
 def _get_primary_gene_ids_to_ensembl_ids():
     """Map lower-cased ``c.localId`` values to gene primary keys.

     Rows come from
     ``ExpressionAtlasETL.get_all_gene_primary_to_ensmbl_ids_query``. When
     several rows share the same lower-cased local id, the last row wins.

     :return: dict of lower-cased ``c.localId`` -> ``g.primaryKey``
     """
     rows = Neo4jHelper.run_single_query(
         ExpressionAtlasETL.get_all_gene_primary_to_ensmbl_ids_query)
     local_id_to_primary = {}
     for row in rows:
         local_id = row["c.localId"].lower()
         local_id_to_primary[local_id] = row["g.primaryKey"]
     return local_id_to_primary
    def get_generators(self, expression_atlas_gene_pages, data_provider,
                       batch_size):
        """Yield batches of Expression Atlas cross-reference dicts.

        Genes are fetched with
        ``ExpressionAtlasETL.get_genes_with_expression_atlas_links_query``,
        parameterised with the keys of ``expression_atlas_gene_pages``. One
        cross-reference dict is built per returned gene through
        ``ETLHelper.get_xref_dict`` and tagged with ``genePrimaryKey``.

        A batch is emitted as soon as it holds ``batch_size`` entries.
        Earlier code compared with ``>`` and so emitted ``batch_size + 1``
        entries per batch. Any remaining entries go out in a final, smaller
        batch.

        :param expression_atlas_gene_pages: mapping keyed by lower-cased gene
            primary key; the value is passed to ``get_xref_dict`` as the page
        :param data_provider: provider name used as prefix of the xref id
        :param batch_size: maximum number of cross references per batch
        :return: generator of one-element lists, each wrapping a batch list
        """
        return_set = Neo4jHelper.run_single_parameter_query(
            ExpressionAtlasETL.get_genes_with_expression_atlas_links_query,
            list(expression_atlas_gene_pages.keys()))

        cross_reference_list = []
        for record in return_set:
            gene_primary_key = record["g.primaryKey"]
            mod_local_id = record["g.modLocalId"]
            cross_reference = ETLHelper.get_xref_dict(
                gene_primary_key.split(":")[1], "ExpressionAtlas_gene",
                "gene/expression-atlas", "gene/expressionAtlas",
                mod_local_id,
                # Page lookup uses the lower-cased primary key.
                expression_atlas_gene_pages[gene_primary_key.lower()],
                data_provider + ":" + mod_local_id + "gene/expression-atlas")
            cross_reference["genePrimaryKey"] = gene_primary_key
            cross_reference_list.append(cross_reference)

            if len(cross_reference_list) >= batch_size:
                yield [cross_reference_list]
                cross_reference_list = []

        if cross_reference_list:
            yield [cross_reference_list]
 def _get_mod_gene_symbol_to_primary_ids(data_provider):
     """Map lower-cased gene symbols to gene primary keys for a provider.

     Rows come from
     ``ExpressionAtlasETL.get_mod_gene_symbol_to_primary_ids_query`` run
     with ``data_provider``. When several rows share the same lower-cased
     symbol, the last row wins.

     :param data_provider: provider passed as the query parameter
     :return: dict of lower-cased ``g.symbol`` -> ``g.primaryKey``
     """
     rows = Neo4jHelper.run_single_parameter_query(
         ExpressionAtlasETL.get_mod_gene_symbol_to_primary_ids_query,
         data_provider)
     symbol_to_primary = {}
     for row in rows:
         symbol = row["g.symbol"].lower()
         symbol_to_primary[symbol] = row["g.primaryKey"]
     return symbol_to_primary
Exemplo n.º 8
0
    def add_other(self):
        """Merge the catch-all "other" ontology nodes into Neo4j.

        The statement MERGEs three nodes and sets their properties only when
        they are created: an UBERON "other" location, an UBERON
        "post embryonic, pre-adult" stage and a GO "other locations" term.
        """
        self.logger.debug("made it to the addOther statement")

        cypher = """

            MERGE(other:UBERONTerm:Ontology {primaryKey:'UBERON:AnatomyOtherLocation'})
                ON CREATE SET other.name = 'other'
            MERGE(otherstage:UBERONTerm:Ontology {primaryKey:'UBERON:PostEmbryonicPreAdult'})
                ON CREATE SET otherstage.name = 'post embryonic, pre-adult'
            MERGE(othergo:GOTerm:Ontology {primaryKey:'GO:otherLocations'})
                ON CREATE SET othergo.name = 'other locations'
                ON CREATE SET othergo.definition = 'temporary node to group expression entities up to ribbon terms'
                ON CREATE SET othergo.type = 'other'
                ON CREATE SET othergo.subset = 'goslim_agr' """
        Neo4jHelper.run_single_query(cypher)
    def get_disease_annotations_from_db(data_provider, gd_data_manager, logger):
        """Collect disease annotations for a provider into an association set.

        Three sources are combined, in this order:

        1. gene-level annotations from ``get_gene_disease_annot_query``;
        2. feature (allele) annotations from
           ``get_feature_disease_annot_query``, kept only when the
           (alleleId, TermId) pair is linked to exactly one distinct gene;
        3. disease-via-orthology records from
           ``get_disease_via_orthology_query``, added with aspect "D" and
           evidence code "DVO".

        :param data_provider: provider passed as the parameter of each query
        :param gd_data_manager: supplies ``do_ontology`` for the result
        :param logger: logger forwarded to ``add_annotations``
        :return: association set created by ``AssociationSetFactory`` from
            the collected annotations and the DO ontology
        """
        annotations = []
        gene_annot_set = Neo4jHelper.run_single_parameter_query(
            GeneDescriptionsETL.get_gene_disease_annot_query,
            data_provider)
        GeneDescriptionsETL.add_annotations(annotations,
                                            gene_annot_set,
                                            data_provider,
                                            DataType.DO,
                                            logger)

        feature_annot_set = Neo4jHelper.run_single_parameter_query(
            GeneDescriptionsETL.get_feature_disease_annot_query,
            data_provider)
        # (alleleId, TermId) -> one record per distinct gene seen for the pair
        allele_do_annot = defaultdict(list)
        for feature_annot in feature_annot_set:
            same_pair_annots = allele_do_annot[(feature_annot["alleleId"],
                                                feature_annot["TermId"])]
            # Compare by the named column instead of by position, so the
            # check no longer depends on the column order of the query.
            if all(feature_annot["geneId"] != annot["geneId"]
                   for annot in same_pair_annots):
                same_pair_annots.append(feature_annot)
        # keep only disease annotations through simple entities
        # (e.g., alleles related to one gene only)
        feature_annot_set = [feature_annots[0]
                             for feature_annots in allele_do_annot.values()
                             if len(feature_annots) == 1]
        GeneDescriptionsETL.add_annotations(annotations,
                                            feature_annot_set,
                                            data_provider,
                                            DataType.DO,
                                            logger)

        disease_via_orth_records = Neo4jHelper.run_single_parameter_query(
            GeneDescriptionsETL.get_disease_via_orthology_query, data_provider)
        for orth_annot in disease_via_orth_records:
            annotations.append(GeneDescriptionsETL.create_annotation_record(
                gene_id=orth_annot["geneId"],
                gene_symbol=orth_annot["geneSymbol"],
                term_id=orth_annot["TermId"],
                aspect="D",
                ecode="DVO",
                prvdr=data_provider,
                qualifier=""))
        return AssociationSetFactory().create_from_assocs(assocs=list(annotations),
                                                          ontology=gd_data_manager.do_ontology)
    def populate_genes():
        """Return the set of primary keys of all Gene nodes in Neo4j.

        :return: set of ``g.primaryKey`` values
        """
        rows = Neo4jHelper().run_single_query(
            "MATCH (g:Gene) RETURN g.primaryKey")
        return {row['g.primaryKey'] for row in rows}
 def get_expression_annotations_from_db(data_provider, gd_data_manager,
                                        logger):
     """Collect expression annotations for a provider into an association set.

     Rows from ``GeneDescriptionsETL.get_expression_annotations_query`` are
     converted by ``GeneDescriptionsETL.add_annotations``, which is also
     given the expression ontology of ``gd_data_manager``.

     :param data_provider: provider passed as the query parameter
     :param gd_data_manager: supplies ``expression_ontology``
     :param logger: logger forwarded to ``add_annotations``
     :return: association set created by ``AssociationSetFactory`` from the
         collected annotations and the expression ontology
     """
     collected = []
     rows = Neo4jHelper.run_single_parameter_query(
         GeneDescriptionsETL.get_expression_annotations_query,
         data_provider)
     expression_ontology = gd_data_manager.expression_ontology
     GeneDescriptionsETL.add_annotations(
         collected, rows, data_provider, DataType.EXPR, logger,
         expression_ontology)
     factory = AssociationSetFactory()
     return factory.create_from_assocs(assocs=list(collected),
                                       ontology=expression_ontology)
Exemplo n.º 12
0
    def get_generators(self, sub_type, batch_size, species_encoded):
        """Yield GEO cross references for genes listed in an XML id file.

        The XML file at ``sub_type.get_filepath()`` is parsed with xmltodict
        and every id found under an ``IdList`` element is turned into an
        ``NCBI_Gene:<id>`` identifier. Genes carrying one of those
        cross references are fetched with
        ``self.gene_crossref_query_template`` and one GEO xref dict is built
        per returned row.

        xmltodict yields a list for repeated child elements but a plain
        string for a single one, and ``None`` for an empty element. Both
        cases are normalised here, so a one-id file is not split into
        characters and an empty ``IdList`` does not raise.

        :param sub_type: object providing ``get_filepath()`` for the XML file
        :param batch_size: unused; all xrefs are emitted in one batch
        :param species_encoded: unused
        :return: generator yielding a single one-element list that wraps the
            list of xref dicts
        """
        entrez_ids = []

        geo_data_file_contents = Path(sub_type.get_filepath()).read_text()
        # The JSON round trip turns xmltodict's mapping types into plain dicts.
        geo_data = json.loads(
            json.dumps(xmltodict.parse(geo_data_file_contents)))
        for efetch_value in geo_data.values():
            # IdList is a value returned from efetch XML spec,
            # within IdList, there is another map with "Id"
            # as the key and the entrez local ids a list value.
            for sub_map_key, sub_map_value in efetch_value.items():
                if sub_map_key != 'IdList' or not sub_map_value:
                    # Not the id list, or an empty <IdList/> (parsed as None).
                    continue
                for id_list in sub_map_value.values():
                    if not id_list:
                        continue
                    if isinstance(id_list, str):
                        # A single <Id> element is parsed as a bare string.
                        id_list = [id_list]
                    for entrez_id in id_list:
                        self.logger.debug("here is the entrez id: %s",
                                          entrez_id)
                        entrez_ids.append("NCBI_Gene:" + entrez_id)

        geo_data_list = []
        return_set = Neo4jHelper.run_single_parameter_query(
            self.gene_crossref_query_template, entrez_ids)

        for record in return_set:
            gene_primary_key = record["g.primaryKey"]
            mod_local_id = record["g.modLocalId"]
            global_cross_ref_id = record["cr.globalCrossRefId"]
            # Part after the first ':' of the global cross reference id.
            local_cross_ref_id = global_cross_ref_id.split(":")[1]
            geo_url = ("https://www.ncbi.nlm.nih.gov/sites/entrez?"
                       "Db=geoprofiles"
                       "&DbFrom=gene"
                       "&Cmd=Link"
                       "&LinkName=gene_geoprofiles"
                       "&LinkReadableName=GEO%20Profiles"
                       "&IdsFromResult=" + local_cross_ref_id)
            geo_xref = ETLHelper.get_xref_dict(local_cross_ref_id,
                                               "NCBI_Gene",
                                               "gene/other_expression",
                                               "gene/other_expression",
                                               "GEO",
                                               geo_url,
                                               global_cross_ref_id + "gene/other_expression")

            geo_xref["genePrimaryKey"] = gene_primary_key
            geo_xref["modLocalId"] = mod_local_id

            geo_data_list.append(geo_xref)

        yield [geo_data_list]
Exemplo n.º 13
0
    def get_generators(self, sub_type, batch_size, species_encoded):
        """Yield GEO cross references for genes listed in an XML id file.

        The XML file at ``sub_type.get_filepath()`` is parsed with xmltodict
        and every id found under an ``IdList`` element is turned into an
        ``NCBI_Gene:<id>`` identifier. Genes carrying one of those
        cross references are fetched with
        ``self.gene_crossref_query_template``. One GEO xref dict is built per
        returned row, with its URL obtained from
        ``self.etlh.rdh2.return_url_from_key_value``.

        xmltodict yields a list for repeated child elements but a plain
        string for a single one, and ``None`` for an empty element. Both
        cases are normalised here, so a one-id file is not split into
        characters and an empty ``IdList`` does not raise.

        :param sub_type: object providing ``get_filepath()`` for the XML file
        :param batch_size: unused; all xrefs are emitted in one batch
        :param species_encoded: unused
        :return: generator yielding a single one-element list that wraps the
            list of xref dicts
        """
        entrez_ids = []

        geo_data_file_contents = Path(sub_type.get_filepath()).read_text()
        # The JSON round trip turns xmltodict's mapping types into plain dicts.
        geo_data = json.loads(
            json.dumps(xmltodict.parse(geo_data_file_contents)))
        for efetch_value in geo_data.values():
            # IdList is a value returned from efetch XML spec,
            # within IdList, there is another map with "Id"
            # as the key and the entrez local ids a list value.
            for sub_map_key, sub_map_value in efetch_value.items():
                if sub_map_key != 'IdList' or not sub_map_value:
                    # Not the id list, or an empty <IdList/> (parsed as None).
                    continue
                for id_list in sub_map_value.values():
                    if not id_list:
                        continue
                    if isinstance(id_list, str):
                        # A single <Id> element is parsed as a bare string.
                        id_list = [id_list]
                    for entrez_id in id_list:
                        self.logger.debug("here is the entrez id: %s",
                                          entrez_id)
                        entrez_ids.append("NCBI_Gene:" + entrez_id)

        geo_data_list = []
        return_set = Neo4jHelper.run_single_parameter_query(
            self.gene_crossref_query_template, entrez_ids)

        for record in return_set:
            gene_primary_key = record["g.primaryKey"]
            mod_local_id = record["g.modLocalId"]
            global_cross_ref_id = record["cr.globalCrossRefId"]
            # Part after the first ':' of the global cross reference id.
            local_cross_ref_id = global_cross_ref_id.split(":")[1]
            url = self.etlh.rdh2.return_url_from_key_value(
                'GEO', local_cross_ref_id, 'entrezgene')
            geo_xref = ETLHelper.get_xref_dict(
                local_cross_ref_id, "NCBI_Gene",
                "gene/other_expression", "gene/other_expression", "GEO", url,
                global_cross_ref_id + "gene/other_expression")

            geo_xref["genePrimaryKey"] = gene_primary_key
            geo_xref["modLocalId"] = mod_local_id

            geo_data_list.append(geo_xref)

        yield [geo_data_list]
 def query_crossreferences(crossref_prefix):
     """Fetch gene / cross-reference id pairs for a cross-reference prefix.

     :param crossref_prefix: value matched against ``cr.prefix``
     :return: result of the query, exposing ``g.primaryKey`` and
         ``cr.globalCrossRefId`` for each match
     """
     cypher = """MATCH (g:Gene)-[C:CROSS_REFERENCE]-(cr:CrossReference)
                WHERE cr.prefix = {parameter}
                RETURN g.primaryKey, cr.globalCrossRefId"""
     helper = Neo4jHelper()
     return helper.run_single_parameter_query(cypher, crossref_prefix)