def get_ontology(self, data_type: DataType, provider=None):
    """Build an ontology object from the term pairs stored in Neo4j.

    The ontology-pairs query is formatted for GO, for DO, or (for expression
    data) for the subtype mapped to ``provider`` in
    EXPRESSION_PRVD_SUBTYPE_MAP. When none of these apply no query is run
    and no queried terms are added.

    :param data_type: DataType member selecting which ontology to load
    :param provider: data provider name, only consulted for DataType.EXPR
    :return: the populated Ontology instance
    """
    ontology = Ontology()

    # Decide which labels (if any) are substituted into the query template.
    query_labels = None
    if data_type == DataType.GO:
        query_labels = ("GO", "GO")
    elif data_type == DataType.DO:
        query_labels = ("DO", "DO")
    elif data_type == DataType.EXPR and provider in EXPRESSION_PRVD_SUBTYPE_MAP:
        subtype = EXPRESSION_PRVD_SUBTYPE_MAP[provider]
        query_labels = (subtype, subtype)

    if query_labels is None:
        terms_pairs = []
    else:
        terms_pairs = Neo4jHelper.run_single_parameter_query(
            self.get_ontology_pairs_query.format(*query_labels), None)

    for pair in terms_pairs:
        child_id = pair["term1.primaryKey"]
        parent_id = pair["term2.primaryKey"]
        self.add_neo_term_to_ontobio_ontology_if_not_exists(
            child_id, pair["term1.name"], pair["term1.type"],
            pair["term1.isObsolete"], ontology)
        self.add_neo_term_to_ontobio_ontology_if_not_exists(
            parent_id, pair["term2.name"], pair["term2.type"],
            pair["term2.isObsolete"], ontology)
        # IS_A edges become subClassOf; every other edge type is recorded
        # with the relation id BFO:0000050.
        if pair["rel_type"] == "IS_A":
            relation = "subClassOf"
        else:
            relation = "BFO:0000050"
        ontology.add_parent(child_id, parent_id, relation=relation)

    # Provider-specific artificial nodes attached under a fixed parent term.
    # NOTE(review): the helper is called both through ``self`` and through
    # ``GeneDescriptionsETL``; the call targets are kept exactly as they were.
    if data_type == DataType.EXPR:
        if provider == "MGI":
            self.add_neo_term_to_ontobio_ontology_if_not_exists(
                "EMAPA_ARTIFICIAL_NODE:99999", "embryo", "anatomical_structure",
                False, ontology)
            ontology.add_parent("EMAPA_ARTIFICIAL_NODE:99999", "EMAPA:0",
                                relation="subClassOf")
            self.add_neo_term_to_ontobio_ontology_if_not_exists(
                "EMAPA_ARTIFICIAL_NODE:99998", "head", "anatomical_structure",
                False, ontology)
            ontology.add_parent("EMAPA_ARTIFICIAL_NODE:99998", "EMAPA:0",
                                relation="subClassOf")
            GeneDescriptionsETL.add_neo_term_to_ontobio_ontology_if_not_exists(
                "EMAPA_ARTIFICIAL_NODE:99997", "gland", "anatomical_structure",
                False, ontology)
            ontology.add_parent("EMAPA_ARTIFICIAL_NODE:99997", "EMAPA:0",
                                relation="subClassOf")
        elif provider == "FB":
            GeneDescriptionsETL.add_neo_term_to_ontobio_ontology_if_not_exists(
                "FBbt_ARTIFICIAL_NODE:99999", "organism", "", False, ontology)
            ontology.add_parent("FBbt_ARTIFICIAL_NODE:99999", "FBbt:10000000",
                                relation="subClassOf")

    return ontology
def delete_empty_nodes(self):
    """Detach-delete DOTerm nodes whose only property key is ``primaryKey``."""
    self.logger.debug("delete empty nodes")

    # Adjacent literals are joined at compile time into one query string.
    empty_do_terms_cypher = (
        " MATCH (dd:DOTerm)"
        " WHERE keys(dd)[0] = 'primaryKey'"
        " AND size(keys(dd)) = 1"
        " DETACH DELETE (dd)")

    Neo4jHelper.run_single_query(empty_do_terms_cypher)
def get_generators(self, data_provider, gd_data_manager, gd_config, json_desc_writer):
    """Generate gene descriptions for every gene of a data provider.

    For each gene returned by the gene query, the GO, expression, disease
    and (when best orthologs are known) human-orthology modules are applied
    to a fresh GeneDescription. Every description object is handed to
    ``json_desc_writer``; only those with a non-empty description text are
    collected for the yielded batch.

    :param data_provider: provider name; "HUMAN" is queried as "RGD" and its
        gene ids are prefixed with "RGD:"
    :param gd_data_manager: data manager passed to the description modules
    :param gd_config: configuration passed to the description modules
    :param json_desc_writer: writer receiving every GeneDescription
    :return: generator yielding a single one-element list that wraps the list
        of ``{"genePrimaryKey", "geneDescription"}`` dicts
    """
    is_human = data_provider == "HUMAN"
    if is_human:
        gene_records = Neo4jHelper.run_single_parameter_query(
            self.get_all_genes_human_query, "RGD")
        id_prefix = "RGD:"
    else:
        gene_records = Neo4jHelper.run_single_parameter_query(
            self.get_all_genes_query, data_provider)
        id_prefix = ""

    collected = []
    best_orthologs = self.get_best_orthologs_from_db(data_provider=data_provider)

    for record in gene_records:
        primary_key = record["g.primaryKey"]
        gene = Gene(id=id_prefix + primary_key,
                    name=record["g.symbol"],
                    dead=False,
                    pseudo=False)
        gene_desc = GeneDescription(gene_id=primary_key,
                                    gene_name=gene.name,
                                    add_gene_name=False,
                                    config=gd_config)

        set_gene_ontology_module(dm=gd_data_manager, conf_parser=gd_config,
                                 gene_desc=gene_desc, gene=gene)
        set_expression_module(df=gd_data_manager, conf_parser=gd_config,
                              gene_desc=gene_desc, gene=gene)
        set_disease_module(df=gd_data_manager, conf_parser=gd_config,
                           gene_desc=gene_desc, gene=gene, human=is_human)

        if gene.id in best_orthologs:
            # Entry layout: index 0 = best orthologs, index 1 = flag telling
            # whether some orthologs were excluded.
            gene_desc.stats.set_best_orthologs = best_orthologs[gene.id][0]
            set_alliance_human_orthology_module(
                orthologs=best_orthologs[gene.id][0],
                excluded_orthologs=best_orthologs[gene.id][1],
                gene_desc=gene_desc,
                config=gd_config)

        if gene_desc.description:
            collected.append({
                "genePrimaryKey": gene_desc.gene_id,
                "geneDescription": gene_desc.description
            })
        json_desc_writer.add_gene_desc(gene_desc)

    yield [collected]
def get_best_orthologs_from_db(data_provider):
    """Select, per gene, the orthologs returned by the most query rows.

    Each row of the filtered-human-orthologs query counts once for its
    (gene, ortholog) pair. For every gene the orthologs whose count equals
    the gene's maximum count are kept.

    :param data_provider: provider name passed as the query parameter
    :return: dict mapping gene id to a two-element list
        ``[best, excluded]`` where ``best`` is a list of
        ``[ortholog_id, ortholog_symbol, ortholog_name]`` lists and
        ``excluded`` is True when at least one ortholog of the gene was
        left out of ``best``
    """
    orthologs_set = Neo4jHelper.run_single_parameter_query(
        GeneDescriptionsETL.get_filtered_human_orthologs_query, data_provider)

    # gene id -> ortholog id -> number of rows seen for that pair
    genes_orthologs_algos = defaultdict(lambda: defaultdict(int))
    # ortholog id -> (symbol, name), taken from the first row mentioning it
    orthologs_info = {}
    for ortholog_algo in orthologs_set:
        ortholog_id = ortholog_algo["orthoId"]
        genes_orthologs_algos[ortholog_algo["geneId"]][ortholog_id] += 1
        if ortholog_id not in orthologs_info:
            orthologs_info[ortholog_id] = (ortholog_algo["orthoSymbol"],
                                           ortholog_algo["orthoName"])

    best_orthologs = {}
    for gene_id, ortholog_counts in genes_orthologs_algos.items():
        # Computed once per gene instead of once per candidate ortholog.
        top_count = max(ortholog_counts.values())
        best = [
            [ortholog_id,
             orthologs_info[ortholog_id][0],
             orthologs_info[ortholog_id][1]]
            for ortholog_id, count in ortholog_counts.items()
            if count == top_count
        ]
        best_orthologs[gene_id] = [best, len(best) != len(ortholog_counts)]
    return best_orthologs
def _get_primary_gene_ids_to_ensembl_ids():
    """Return a dict from lower-cased ``c.localId`` to ``g.primaryKey``.

    The rows come from ExpressionAtlasETL's gene-primary-to-Ensembl query;
    when several rows share a lower-cased local id the last one wins.
    """
    records = Neo4jHelper.run_single_query(
        ExpressionAtlasETL.get_all_gene_primary_to_ensmbl_ids_query)

    primary_by_local_id = {}
    for record in records:
        local_id = record["c.localId"].lower()
        primary_by_local_id[local_id] = record["g.primaryKey"]
    return primary_by_local_id
def get_generators(self, expression_atlas_gene_pages, data_provider, batch_size):
    """Yield Expression Atlas cross-reference dicts in batches.

    The query is run with the keys of ``expression_atlas_gene_pages`` as its
    parameter; one cross-reference dict is built for every returned gene.

    :param expression_atlas_gene_pages: dict whose keys are passed to the
        query and which is looked up with the lower-cased gene primary key
        to obtain the page value placed in the cross reference
    :param data_provider: provider name used as prefix of the cross
        reference's last ``get_xref_dict`` argument
    :param batch_size: number of cross references per yielded batch (the
        previous ``counter > batch_size`` test produced ``batch_size + 1``)
    :return: generator yielding one-element lists, each wrapping a list of
        cross-reference dicts; a final partial batch is yielded if non-empty
    """
    return_set = Neo4jHelper.run_single_parameter_query(
        ExpressionAtlasETL.get_genes_with_expression_atlas_links_query,
        list(expression_atlas_gene_pages.keys()))

    cross_reference_list = []
    for record in return_set:
        primary_key = record["g.primaryKey"]
        mod_local_id = record["g.modLocalId"]
        cross_reference = ETLHelper.get_xref_dict(
            primary_key.split(":")[1],
            "ExpressionAtlas_gene",
            "gene/expression-atlas",
            "gene/expressionAtlas",
            mod_local_id,
            expression_atlas_gene_pages[primary_key.lower()],
            data_provider + ":" + mod_local_id + "gene/expression-atlas")
        cross_reference["genePrimaryKey"] = primary_key
        cross_reference_list.append(cross_reference)

        # Flush as soon as the batch holds batch_size items.
        if len(cross_reference_list) >= batch_size:
            yield [cross_reference_list]
            cross_reference_list = []

    if cross_reference_list:
        yield [cross_reference_list]
def _get_mod_gene_symbol_to_primary_ids(data_provider):
    """Return a dict from lower-cased ``g.symbol`` to ``g.primaryKey``.

    :param data_provider: provider name passed as the query parameter
    :return: dict keyed by lower-cased symbol; when several rows share a
        lower-cased symbol the last one wins
    """
    records = Neo4jHelper.run_single_parameter_query(
        ExpressionAtlasETL.get_mod_gene_symbol_to_primary_ids_query,
        data_provider)

    primary_by_symbol = {}
    for record in records:
        symbol = record["g.symbol"].lower()
        primary_by_symbol[symbol] = record["g.primaryKey"]
    return primary_by_symbol
def add_other(self):
    """Merge the three placeholder ontology nodes used as 'other' buckets.

    The query merges two UBERONTerm nodes (``UBERON:AnatomyOtherLocation``
    and ``UBERON:PostEmbryonicPreAdult``) and one GOTerm node
    (``GO:otherLocations``); their properties are only set on creation.
    """
    self.logger.debug("made it to the addOther statement")

    # Adjacent literals are joined at compile time into one query string.
    placeholder_nodes_cypher = (
        " MERGE(other:UBERONTerm:Ontology {primaryKey:'UBERON:AnatomyOtherLocation'})"
        " ON CREATE SET other.name = 'other'"
        " MERGE(otherstage:UBERONTerm:Ontology {primaryKey:'UBERON:PostEmbryonicPreAdult'})"
        " ON CREATE SET otherstage.name = 'post embryonic, pre-adult'"
        " MERGE(othergo:GOTerm:Ontology {primaryKey:'GO:otherLocations'})"
        " ON CREATE SET othergo.name = 'other locations'"
        " ON CREATE SET othergo.definition = 'temporary node to group expression entities up to ribbon terms'"
        " ON CREATE SET othergo.type = 'other'"
        " ON CREATE SET othergo.subset = 'goslim_agr' ")

    Neo4jHelper.run_single_query(placeholder_nodes_cypher)
def get_disease_annotations_from_db(data_provider, gd_data_manager, logger):
    """Collect disease (DO) annotations and wrap them in an association set.

    Three sources are combined, in this order: gene-level annotations,
    allele-level annotations restricted to (allele, term) pairs linked to a
    single gene, and annotations inferred via orthology (evidence "DVO").

    :param data_provider: provider name passed to every query
    :param gd_data_manager: object exposing ``do_ontology``
    :param logger: logger forwarded to ``add_annotations``
    :return: result of ``AssociationSetFactory().create_from_assocs``
    """
    annotations = []

    # 1. Annotations attached directly to genes.
    gene_records = Neo4jHelper.run_single_parameter_query(
        GeneDescriptionsETL.get_gene_disease_annot_query, data_provider)
    GeneDescriptionsETL.add_annotations(
        annotations, gene_records, data_provider, DataType.DO, logger)

    # 2. Annotations attached to alleles, grouped by (allele, term) with at
    #    most one record kept per distinct gene.
    allele_records = Neo4jHelper.run_single_parameter_query(
        GeneDescriptionsETL.get_feature_disease_annot_query, data_provider)
    records_by_allele_term = defaultdict(list)
    for record in allele_records:
        group = records_by_allele_term[(record["alleleId"], record["TermId"])]
        # NOTE(review): position 0 of a stored record is compared with
        # "geneId" - presumably the query returns geneId first; confirm.
        if all(record["geneId"] != kept[0] for kept in group):
            group.append(record)

    # keep only disease annotations through simple entities
    # (e.g., alleles related to one gene only)
    single_gene_records = [
        group[0] for group in records_by_allele_term.values()
        if len(group) == 1
    ]
    GeneDescriptionsETL.add_annotations(
        annotations, single_gene_records, data_provider, DataType.DO, logger)

    # 3. Annotations inferred through orthology.
    orthology_records = Neo4jHelper.run_single_parameter_query(
        GeneDescriptionsETL.get_disease_via_orthology_query, data_provider)
    annotations.extend(
        GeneDescriptionsETL.create_annotation_record(
            gene_id=orth_record["geneId"],
            gene_symbol=orth_record["geneSymbol"],
            term_id=orth_record["TermId"],
            aspect="D",
            ecode="DVO",
            prvdr=data_provider,
            qualifier="")
        for orth_record in orthology_records)

    factory = AssociationSetFactory()
    return factory.create_from_assocs(assocs=list(annotations),
                                      ontology=gd_data_manager.do_ontology)
def populate_genes():
    """Return the set of ``primaryKey`` values of all Gene nodes."""
    records = Neo4jHelper().run_single_query("MATCH (g:Gene) RETURN g.primaryKey")
    return {record['g.primaryKey'] for record in records}
def get_expression_annotations_from_db(data_provider, gd_data_manager, logger):
    """Collect expression annotations and wrap them in an association set.

    :param data_provider: provider name passed as the query parameter
    :param gd_data_manager: object exposing ``expression_ontology``
    :param logger: logger forwarded to ``add_annotations``
    :return: result of ``AssociationSetFactory().create_from_assocs`` built
        on the expression ontology
    """
    collected = []
    expression_records = Neo4jHelper.run_single_parameter_query(
        GeneDescriptionsETL.get_expression_annotations_query, data_provider)

    GeneDescriptionsETL.add_annotations(
        collected,
        expression_records,
        data_provider,
        DataType.EXPR,
        logger,
        gd_data_manager.expression_ontology)

    factory = AssociationSetFactory()
    return factory.create_from_assocs(
        assocs=list(collected),
        ontology=gd_data_manager.expression_ontology)
def get_generators(self, sub_type, batch_size, species_encoded):
    """Yield GEO cross references for the genes listed in an XML id file.

    The file at ``sub_type.get_filepath()`` is parsed with xmltodict; every
    id found under an ``IdList`` element is turned into ``NCBI_Gene:<id>``
    and the gene cross-reference query is run with the resulting list.

    :param sub_type: object whose ``get_filepath()`` locates the XML file
    :param batch_size: not used by this method
    :param species_encoded: not used by this method
    :return: generator yielding a single one-element list that wraps the
        list of GEO cross-reference dicts
    """
    geo_data_file_contents = Path(sub_type.get_filepath()).read_text()
    # The JSON round trip leaves only plain dicts, lists and strings.
    geo_data = json.loads(
        json.dumps(xmltodict.parse(geo_data_file_contents)))

    entrez_ids = []
    for efetch_value in geo_data.values():
        # IdList is a value returned from efetch XML spec; within IdList
        # there is another map with "Id" as the key and the entrez local
        # ids as its value.
        for sub_map_key, sub_map_value in efetch_value.items():
            # An empty <IdList/> is parsed as None: nothing to collect.
            if sub_map_key != 'IdList' or not sub_map_value:
                continue
            for id_list in sub_map_value.values():
                # A single <Id> is parsed as a plain string, not a list;
                # wrap it so its characters are not iterated one by one.
                if isinstance(id_list, str):
                    id_list = [id_list]
                for entrez_id in id_list or ():
                    self.logger.debug("here is the entrez id: %s", entrez_id)
                    entrez_ids.append("NCBI_Gene:" + entrez_id)

    geo_data_list = []
    return_set = Neo4jHelper.run_single_parameter_query(
        self.gene_crossref_query_template, entrez_ids)

    for record in return_set:
        gene_primary_key = record["g.primaryKey"]
        mod_local_id = record["g.modLocalId"]
        global_cross_ref_id = record["cr.globalCrossRefId"]
        entrez_local_id = global_cross_ref_id.split(":")[1]

        geo_profiles_url = (
            "https://www.ncbi.nlm.nih.gov/sites/entrez?"
            "Db=geoprofiles"
            "&DbFrom=gene"
            "&Cmd=Link"
            "&LinkName=gene_geoprofiles"
            "&LinkReadableName=GEO%20Profiles"
            "&IdsFromResult=" + entrez_local_id)

        geo_xref = ETLHelper.get_xref_dict(
            entrez_local_id,
            "NCBI_Gene",
            "gene/other_expression",
            "gene/other_expression",
            "GEO",
            geo_profiles_url,
            global_cross_ref_id + "gene/other_expression")
        geo_xref["genePrimaryKey"] = gene_primary_key
        geo_xref["modLocalId"] = mod_local_id
        geo_data_list.append(geo_xref)

    yield [geo_data_list]
def get_generators(self, sub_type, batch_size, species_encoded):
    """Yield GEO cross references for the genes listed in an XML id file.

    The file at ``sub_type.get_filepath()`` is parsed with xmltodict; every
    id found under an ``IdList`` element is turned into ``NCBI_Gene:<id>``
    and the gene cross-reference query is run with the resulting list. The
    URL of each cross reference is obtained from
    ``self.etlh.rdh2.return_url_from_key_value``.

    :param sub_type: object whose ``get_filepath()`` locates the XML file
    :param batch_size: not used by this method
    :param species_encoded: not used by this method
    :return: generator yielding a single one-element list that wraps the
        list of GEO cross-reference dicts
    """
    geo_data_file_contents = Path(sub_type.get_filepath()).read_text()
    # The JSON round trip leaves only plain dicts, lists and strings.
    geo_data = json.loads(
        json.dumps(xmltodict.parse(geo_data_file_contents)))

    entrez_ids = []
    for efetch_value in geo_data.values():
        # IdList is a value returned from efetch XML spec; within IdList
        # there is another map with "Id" as the key and the entrez local
        # ids as its value.
        for sub_map_key, sub_map_value in efetch_value.items():
            # An empty <IdList/> is parsed as None: nothing to collect.
            if sub_map_key != 'IdList' or not sub_map_value:
                continue
            for id_list in sub_map_value.values():
                # A single <Id> is parsed as a plain string, not a list;
                # wrap it so its characters are not iterated one by one.
                if isinstance(id_list, str):
                    id_list = [id_list]
                for entrez_id in id_list or ():
                    self.logger.debug("here is the entrez id: %s", entrez_id)
                    entrez_ids.append("NCBI_Gene:" + entrez_id)

    geo_data_list = []
    return_set = Neo4jHelper.run_single_parameter_query(
        self.gene_crossref_query_template, entrez_ids)

    for record in return_set:
        gene_primary_key = record["g.primaryKey"]
        mod_local_id = record["g.modLocalId"]
        global_cross_ref_id = record["cr.globalCrossRefId"]
        entrez_local_id = global_cross_ref_id.split(":")[1]

        url = self.etlh.rdh2.return_url_from_key_value(
            'GEO', entrez_local_id, 'entrezgene')

        geo_xref = ETLHelper.get_xref_dict(
            entrez_local_id,
            "NCBI_Gene",
            "gene/other_expression",
            "gene/other_expression",
            "GEO",
            url,
            global_cross_ref_id + "gene/other_expression")
        geo_xref["genePrimaryKey"] = gene_primary_key
        geo_xref["modLocalId"] = mod_local_id
        geo_data_list.append(geo_xref)

    yield [geo_data_list]
def query_crossreferences(crossref_prefix):
    """Fetch gene / cross-reference id pairs for one cross-reference prefix.

    :param crossref_prefix: value matched against ``cr.prefix``
    :return: whatever ``run_single_parameter_query`` returns for rows of
        ``g.primaryKey`` and ``cr.globalCrossRefId``
    """
    # Adjacent literals are joined at compile time into one query string.
    crossref_cypher = (
        "MATCH (g:Gene)-[C:CROSS_REFERENCE]-(cr:CrossReference)"
        " WHERE cr.prefix = {parameter}"
        " RETURN g.primaryKey, cr.globalCrossRefId")
    helper = Neo4jHelper()
    return helper.run_single_parameter_query(crossref_cypher, crossref_prefix)