def get_canonical_curies_dict(curie: Union[str, List[str]], log: ARAXResponse) -> Dict[str, Dict[str, str]]:
    """Look up canonical info for the given curie(s) using the NodeSynonymizer.

    :param curie: A single curie or a list of curies to canonicalize.
    :param log: ARAXResponse object used for logging.
    :return: The dict returned by NodeSynonymizer.get_canonical_curies() (input curie ->
             canonical info dict, with None values for unrecognized curies), or {} on failure.
    """
    curies = convert_string_or_list_to_list(curie)
    try:
        synonymizer = NodeSynonymizer()
        log.debug(f"Sending NodeSynonymizer.get_canonical_curies() a list of {len(curies)} curies")
        canonical_curies_dict = synonymizer.get_canonical_curies(curies)
        log.debug(f"Got response back from NodeSynonymizer")
    except Exception as e:
        # Bind the exception directly rather than re-deriving it via sys.exc_info()
        log.error(f"Encountered a problem using NodeSynonymizer: {traceback.format_exc()}",
                  error_code=type(e).__name__)
        return {}
    else:
        if canonical_curies_dict is not None:
            # Curies for which the synonymizer returned no canonical info (falsy value)
            unrecognized_curies = {input_curie for input_curie in canonical_curies_dict
                                   if not canonical_curies_dict.get(input_curie)}
            if unrecognized_curies:
                log.warning(f"NodeSynonymizer did not return canonical info for: {unrecognized_curies}")
            return canonical_curies_dict
        else:
            log.error(f"NodeSynonymizer returned None", error_code="NodeNormalizationIssue")
            return {}
def get_canonical_curies_list(curie: Union[str, List[str]], log: ARAXResponse) -> List[str]:
    """Return the canonical (preferred) curies for the given curie(s).

    Input curies the NodeSynonymizer doesn't recognize are passed through unchanged.
    :param curie: A single curie or a list of curies.
    :param log: ARAXResponse object used for logging.
    :return: A list of canonical curies, or [] if the synonymizer failed or returned None.
    """
    curies = convert_to_list(curie)
    try:
        synonymizer = NodeSynonymizer()
        log.debug(f"Sending NodeSynonymizer.get_canonical_curies() a list of {len(curies)} curies")
        canonical_curies_dict = synonymizer.get_canonical_curies(curies)
        log.debug(f"Got response back from NodeSynonymizer")
    except Exception as e:
        # Bind the exception directly rather than re-deriving it via sys.exc_info()
        log.error(f"Encountered a problem using NodeSynonymizer: {traceback.format_exc()}",
                  error_code=type(e).__name__)
        return []
    else:
        if canonical_curies_dict is not None:
            recognized_input_curies = {input_curie for input_curie in canonical_curies_dict
                                       if canonical_curies_dict.get(input_curie)}
            unrecognized_curies = set(curies).difference(recognized_input_curies)
            if unrecognized_curies:
                log.warning(f"NodeSynonymizer did not return canonical info for: {unrecognized_curies}")
            canonical_curies = {canonical_curies_dict[recognized_curie].get('preferred_curie')
                                for recognized_curie in recognized_input_curies}
            # Include any original curies we weren't able to find a canonical version for
            canonical_curies.update(unrecognized_curies)
            if not canonical_curies:
                log.error(f"Final list of canonical curies is empty. This shouldn't happen!",
                          error_code="CanonicalCurieIssue")
            return list(canonical_curies)
        else:
            log.error(f"NodeSynonymizer returned None", error_code="NodeNormalizationIssue")
            return []
def get_preferred_categories(curie: Union[str, List[str]], log: ARAXResponse) -> Optional[List[str]]:
    """Look up the preferred biolink categories for the given curie(s).

    :param curie: A single curie or a list of curies.
    :param log: ARAXResponse object used for logging.
    :return: A list of preferred categories (defaulting to ["biolink:NamedThing"] when
             none are found), or [] if the synonymizer returned None.
    """
    curies = convert_to_list(curie)
    synonymizer = NodeSynonymizer()
    log.debug(f"Sending NodeSynonymizer.get_canonical_curies() a list of {len(curies)} curies")
    canonical_curies_dict = synonymizer.get_canonical_curies(curies)
    log.debug(f"Got response back from NodeSynonymizer")
    if canonical_curies_dict is None:
        log.error(f"NodeSynonymizer returned None", error_code="NodeNormalizationIssue")
        return []
    # Split the inputs into recognized vs. unrecognized curies
    recognized_input_curies = {c for c in canonical_curies_dict if canonical_curies_dict.get(c)}
    unrecognized_curies = set(curies).difference(recognized_input_curies)
    if unrecognized_curies:
        log.warning(f"NodeSynonymizer did not recognize: {unrecognized_curies}")
    preferred_categories = {canonical_curies_dict[c].get('preferred_category')
                            for c in recognized_input_curies}
    if not preferred_categories:
        log.warning(f"Unable to find any preferred categories; will default to biolink:NamedThing")
        return ["biolink:NamedThing"]
    return list(preferred_categories)
def estimate_percent_nodes_covered_by_backup_method(kg: str):
    """Estimate what percent of nodes in the given KG the 'backup' NGD method can map to PMIDs.

    Samples several small random batches of nodes from the KG, tries to grab PMIDs for each
    synonymizer-recognized node via the eUtils-based backup method, and prints per-batch
    percentages plus a final averaged estimate.
    :param kg: The KG to sample nodes from (passed through to _run_cypher_query).
    """
    print(f"Estimating the percent of {kg} nodes mappable by the 'backup' NGD method (uses eUtils)")
    backup_ngd = NormGoogleDistance()
    synonymizer = NodeSynonymizer()
    percentages_mapped = []
    num_batches = 10
    batch_size = 10
    for number in range(num_batches):
        print(f" Batch {number + 1}")
        # Get random selection of nodes from the KG
        query = f"match (a) return a.id, a.name, rand() as r order by r limit {batch_size}"
        results = _run_cypher_query(query, kg)
        canonical_curie_info = synonymizer.get_canonical_curies([result['a.id'] for result in results])
        recognized_curies = {input_curie for input_curie in canonical_curie_info
                             if canonical_curie_info.get(input_curie)}
        if not recognized_curies:
            # Guard against ZeroDivisionError below when no sampled nodes were recognized
            print(f" No sampled nodes were recognized by the synonymizer; skipping batch.")
            continue
        # Use the back-up NGD method to try to grab PMIDs for each
        num_with_pmids = 0
        for curie in recognized_curies:
            # Try to map this to a MESH term using the backup method (the chokepoint)
            node_id = canonical_curie_info[curie].get('preferred_curie')
            node_name = canonical_curie_info[curie].get('preferred_name')
            try:
                pmids = backup_ngd.get_pmids_for_all([node_id], [node_name])
            except Exception:
                print(f"ERROR using back-up method: {traceback.format_exc()}")
            else:
                # Count this node as covered only if at least one non-empty PMID list came back
                if len(pmids) and ([pmid_list for pmid_list in pmids if pmid_list]):
                    num_with_pmids += 1
                    print(f" Found {len(pmids[0])} PMIDs for {node_id}, {node_name}.")
                else:
                    print(f" Not found. ({node_id}, {node_name})")
        percentage_with_pmids = (num_with_pmids / len(recognized_curies)) * 100
        print(f" {percentage_with_pmids}% of nodes were mapped to PMIDs using backup method.")
        percentages_mapped.append(percentage_with_pmids)
        print(f" Percentages for all batches: {percentages_mapped}.")
    if percentages_mapped:
        average = sum(percentages_mapped) / len(percentages_mapped)
        print(f"Final estimate of backup method's coverage of {kg} nodes: {round(average)}%")
    else:
        # No batch yielded recognized nodes; avoid dividing by zero
        print(f"No batches produced results; unable to estimate coverage of {kg} nodes.")
def estimate_percent_nodes_covered_by_ultrafast_ngd(kg: str):
    """Estimate what percent of nodes in the given KG the local ('ultrafast') NGD system covers.

    Samples large random batches of node IDs from the KG, canonicalizes them, and checks
    whether each canonical curie is present in the local curie->PMIDs sqlite database.
    Prints an overall coverage estimate plus a per-node-type breakdown.
    :param kg: The KG to sample nodes from (passed through to _get_random_node_ids).
    """
    print(f"Estimating the percent of {kg} nodes covered by the local NGD system..")
    curie_to_pmid_db = SqliteDict(f"./curie_to_pmids.sqlite")
    percentages_mapped = []
    num_batches = 20
    batch_size = 4000
    all_nodes_mapped_by_type = dict()
    # Hoisted out of the loop: the synonymizer is not batch-specific
    synonymizer = NodeSynonymizer()
    for number in range(num_batches):
        # Get random selection of node IDs from the KG
        random_node_ids = _get_random_node_ids(batch_size, kg)
        if not random_node_ids:
            # Guard against ZeroDivisionError below if the KG returned no nodes
            continue
        # Use synonymizer to get their canonicalized info
        canonical_curie_info = synonymizer.get_canonical_curies(list(random_node_ids))
        recognized_curies = {input_curie for input_curie in canonical_curie_info
                             if canonical_curie_info.get(input_curie)}
        # See if those canonical curies are in our local database
        num_mapped_to_pmids = 0
        for input_curie in recognized_curies:
            canonical_curie = canonical_curie_info[input_curie].get('preferred_curie')
            preferred_type = canonical_curie_info[input_curie].get('preferred_type')
            if preferred_type not in all_nodes_mapped_by_type:
                all_nodes_mapped_by_type[preferred_type] = {'covered': 0, 'not_covered': 0}
            if canonical_curie and canonical_curie in curie_to_pmid_db:
                num_mapped_to_pmids += 1
                all_nodes_mapped_by_type[preferred_type]['covered'] += 1
            else:
                all_nodes_mapped_by_type[preferred_type]['not_covered'] += 1
        # Coverage is measured against ALL sampled nodes, not just recognized ones
        percentage_mapped = (num_mapped_to_pmids / len(random_node_ids)) * 100
        percentages_mapped.append(percentage_mapped)
    average = sum(percentages_mapped) / len(percentages_mapped)
    print(f"Estimated coverage of {kg} nodes: {round(average)}%.")
    # Report coverage broken down by preferred node type, best-covered first
    node_type_percentages_dict = dict()
    for node_type, coverage_info in all_nodes_mapped_by_type.items():
        num_covered = coverage_info['covered']
        num_total = coverage_info['covered'] + coverage_info['not_covered']
        node_type_percentages_dict[node_type] = round((num_covered / num_total) * 100)
    for node_type, percentage in sorted(node_type_percentages_dict.items(),
                                        key=lambda item: item[1], reverse=True):
        print(f" {node_type}: {percentage}%")
def _canonicalize_nodes(kg2pre_nodes: List[Dict[str, any]]) -> Tuple[Dict[str, Dict[str, any]], Dict[str, str]]:
    """Collapse KG2pre nodes into canonicalized nodes keyed by their preferred curie.

    Nodes sharing the same canonical curie are merged (publications, names, descriptions,
    equivalent curies combined); nodes the synonymizer doesn't recognize keep their own ID.
    Also pickles the equivalent-curies mapping to disk for a downstream script.
    :param kg2pre_nodes: List of KG2pre node dicts (each with at least an 'id').
    :return: Tuple of (canonical curie -> canonicalized node dict,
             original KG2pre node ID -> canonical curie).
    """
    logging.info(f"Canonicalizing nodes..")
    synonymizer = NodeSynonymizer()
    node_ids = [node.get('id') for node in kg2pre_nodes if node.get('id')]
    logging.info(f"  Sending NodeSynonymizer.get_canonical_curies() {len(node_ids)} curies..")
    canonicalized_info = synonymizer.get_canonical_curies(curies=node_ids, return_all_categories=True)
    # The set of distinct preferred curies across all recognized input nodes
    all_canonical_curies = {canonical_info['preferred_curie'] for canonical_info in canonicalized_info.values() if canonical_info}
    logging.info(f"  Sending NodeSynonymizer.get_equivalent_nodes() {len(all_canonical_curies)} curies..")
    equivalent_curies_info = synonymizer.get_equivalent_nodes(all_canonical_curies)
    recognized_curies = {curie for curie in equivalent_curies_info if equivalent_curies_info.get(curie)}
    equivalent_curies_dict = {curie: list(equivalent_curies_info.get(curie)) for curie in recognized_curies}
    with open(f"{KG2C_DIR}/equivalent_curies.pickle", "wb") as equiv_curies_dump:  # Save these for use by downstream script
        pickle.dump(equivalent_curies_dict, equiv_curies_dump, protocol=pickle.HIGHEST_PROTOCOL)
    logging.info(f"  Creating canonicalized nodes..")
    curie_map = dict()
    canonicalized_nodes = dict()
    for kg2pre_node in kg2pre_nodes:
        # Grab relevant info for this node and its canonical version; fall back to the
        # node's own ID when the synonymizer didn't recognize it
        canonical_info = canonicalized_info.get(kg2pre_node['id'])
        canonicalized_curie = canonical_info.get('preferred_curie', kg2pre_node['id']) if canonical_info else kg2pre_node['id']
        publications = kg2pre_node['publications'] if kg2pre_node.get('publications') else []
        descriptions_list = [kg2pre_node['description']] if kg2pre_node.get('description') else []
        if canonicalized_curie in canonicalized_nodes:
            # Merge this node into its corresponding canonical node
            existing_canonical_node = canonicalized_nodes[canonicalized_curie]
            existing_canonical_node['publications'] = _merge_two_lists(existing_canonical_node['publications'], publications)
            existing_canonical_node['all_names'] = _merge_two_lists(existing_canonical_node['all_names'], [kg2pre_node['name']])
            existing_canonical_node['descriptions_list'] = _merge_two_lists(existing_canonical_node['descriptions_list'], descriptions_list)
            # Make sure any nodes subject to #1074-like problems still appear in equivalent curies
            existing_canonical_node['equivalent_curies'] = _merge_two_lists(existing_canonical_node['equivalent_curies'], [kg2pre_node['id']])
            # Add the IRI for the 'preferred' curie, if we've found that node
            if kg2pre_node['id'] == canonicalized_curie:
                existing_canonical_node['iri'] = kg2pre_node.get('iri')
        else:
            # Initiate the canonical node for this synonym group
            name = canonical_info['preferred_name'] if canonical_info else kg2pre_node['name']
            category = canonical_info['preferred_category'] if canonical_info else kg2pre_node['category']
            all_categories = list(canonical_info['all_categories']) if canonical_info else [kg2pre_node['category']]
            iri = kg2pre_node['iri'] if kg2pre_node['id'] == canonicalized_curie else None
            all_names = [kg2pre_node['name']]
            canonicalized_node = _create_node(preferred_curie=canonicalized_curie,
                                              name=name,
                                              category=category,
                                              all_categories=all_categories,
                                              publications=publications,
                                              equivalent_curies=equivalent_curies_dict.get(canonicalized_curie, [canonicalized_curie]),
                                              iri=iri,
                                              description=None,
                                              descriptions_list=descriptions_list,
                                              all_names=all_names)
            canonicalized_nodes[canonicalized_node['id']] = canonicalized_node
        curie_map[kg2pre_node['id']] = canonicalized_curie  # Record this mapping for easy lookup later
    logging.info(f"Number of KG2pre nodes was reduced to {len(canonicalized_nodes)} "
                 f"({round((len(canonicalized_nodes) / len(kg2pre_nodes)) * 100)}%)")
    return canonicalized_nodes, curie_map
def _get_canonical_curies_map(self, curies):
    """Map each input curie to its canonical (preferred) curie via the NodeSynonymizer.

    Curies the synonymizer doesn't recognize map to themselves.
    :param curies: A list of curies to canonicalize.
    :return: Dict of input curie -> canonical curie, or {} if the synonymizer errored.
    """
    self.response.debug(f"Canonicalizing curies of relevant nodes using NodeSynonymizer")
    synonymizer = NodeSynonymizer()
    try:
        canonicalized_node_info = synonymizer.get_canonical_curies(curies)
    except Exception as e:
        # Bind the exception directly rather than re-deriving it via sys.exc_info()
        self.response.error(f"Encountered a problem using NodeSynonymizer: {traceback.format_exc()}",
                            error_code=type(e).__name__)
        return {}
    else:
        canonical_curies_map = dict()
        for input_curie, node_info in canonicalized_node_info.items():
            if node_info:
                canonical_curies_map[input_curie] = node_info.get('preferred_curie', input_curie)
            else:
                # Unrecognized curie: fall back to the input curie itself
                canonical_curies_map[input_curie] = input_curie
        return canonical_curies_map
def get_entity_by_string(search_string):  # noqa: E501
    """Obtain the CURIE and type of some entity by name

     # noqa: E501

    :param search_string: Some string to search by (name, abbreviation, CURIE, etc.)
    :type search_string: str

    :rtype: List[object]
    """
    synonymizer = NodeSynonymizer()
    result = synonymizer.get_canonical_curies(curies=search_string, names=search_string)
    response = {}
    # Use .get() so a missing key (not just a None value) can't raise a KeyError
    entity_info = result.get(search_string) if result else None
    if entity_info is not None:
        response = {
            'curie': entity_info['preferred_curie'],
            'name': entity_info['preferred_name'],
            'type': entity_info['preferred_type']
        }
    return response
class CHPQuerier:
    """Answers one-hop gene<->drug query-graph edges using the CHP (Connections Hypothesis
    Provider) client, reporting survival-probability-based 'paired_with' edges.
    """

    def __init__(self, response_object: ARAXResponse):
        # ARAXResponse used for logging and for reading query parameters
        self.response = response_object
        self.synonymizer = NodeSynonymizer()
        self.kp_name = "CHP"
        # Instantiate a client
        self.client = get_client()

    def answer_one_hop_query(self, query_graph: QueryGraph) -> QGOrganizedKnowledgeGraph:
        """
        This function answers a one-hop (single-edge) query using CHP client.
        :param query_graph: A TRAPI query graph.
        :return: An (almost) TRAPI knowledge graph containing all of the nodes and edges returned as
                 results for the query. (Organized by QG IDs.)
        """
        # Set up the required parameters
        log = self.response
        self.CHP_survival_threshold = float(self.response.data['parameters']['CHP_survival_threshold'])
        # The curie vocabularies CHP can answer about; CHP uses the 'CHEMBL:' prefix,
        # so drug curies are normalized here to 'CHEMBL.COMPOUND:' for internal use
        allowable_curies = self.client.curies()
        self.allowable_gene_curies = list(allowable_curies['biolink:Gene'].keys())
        self.allowable_drug_curies = [curie_id.replace('CHEMBL:', 'CHEMBL.COMPOUND:')
                                      for curie_id in list(allowable_curies['biolink:Drug'].keys())]
        final_kg = QGOrganizedKnowledgeGraph()
        final_kg = self._answer_query_using_CHP_client(query_graph, log)
        return final_kg

    def _answer_query_using_CHP_client(self, query_graph: QueryGraph, log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
        """Do the real work of answering the single query-graph edge via the CHP client.

        Validates that the edge joins a drug/chemical_substance qnode and a gene qnode
        (each pinned to curies or specified by category), then issues CHP survival
        queries for each drug/gene pairing and collects the results into final_kg.
        :param query_graph: The (single-edge) TRAPI query graph.
        :param log: ARAXResponse object used for logging.
        :return: QGOrganizedKnowledgeGraph of answers (empty on any validation error).
        """
        qedge_key = next(qedge_key for qedge_key in query_graph.edges)
        log.debug(f"Processing query results for edge {qedge_key} by using CHP client")
        final_kg = QGOrganizedKnowledgeGraph()
        gene_label_list = ['gene']
        drug_label_list = ['drug', 'chemicalsubstance']
        # use for checking the requirement
        source_pass_nodes = None
        source_category = None
        target_pass_nodes = None
        target_category = None
        qedge = query_graph.edges[qedge_key]
        source_qnode_key = qedge.subject
        target_qnode_key = qedge.object
        source_qnode = query_graph.nodes[source_qnode_key]
        target_qnode = query_graph.nodes[target_qnode_key]
        # check if both ends of edge have no curie
        if (source_qnode.id is None) and (target_qnode.id is None):
            log.error(f"Both ends of edge {qedge_key} are None", error_code="BadEdge")
            return final_kg
        # check if the query nodes are drug or disease
        if source_qnode.id is not None:
            if type(source_qnode.id) is str:
                source_pass_nodes = [source_qnode.id]
            else:
                source_pass_nodes = source_qnode.id
            # Filter the pinned curies down to those CHP can answer about
            has_error, pass_nodes, not_pass_nodes = self._check_id(source_qnode.id, log)
            if has_error:
                return final_kg
            else:
                if len(not_pass_nodes) == 0 and len(pass_nodes) != 0:
                    source_pass_nodes = pass_nodes
                elif len(not_pass_nodes) != 0 and len(pass_nodes) != 0:
                    # Some (but not all) curies were unusable; warn and continue with the rest
                    source_pass_nodes = pass_nodes
                    if len(not_pass_nodes) == 1:
                        log.warning(f"The curie id of {not_pass_nodes[0]} is not allowable based on CHP client")
                    else:
                        log.warning(f"The curie ids of these nodes {not_pass_nodes} are not allowable based on CHP client")
                else:
                    if type(source_qnode.id) is str:
                        log.error(f"The curie id of {source_qnode.id} is not allowable based on CHP client", error_code="NotAllowable")
                        return final_kg
                    else:
                        log.error(f"The curie ids of {source_qnode.id} are not allowable based on CHP client", error_code="NotAllowable")
                        return final_kg
        else:
            # Unpinned source qnode: fall back to its declared category
            category = source_qnode.category[0].replace('biolink:', '').replace('_', '').lower()
            source_category = category
            if (category in drug_label_list) or (category in gene_label_list):
                source_category = category
            else:
                log.error(f"The category of query node {source_qnode_key} is unsatisfiable. It has to be drug/chemical_substance or gene", error_code="CategoryError")
                return final_kg
        if target_qnode.id is not None:
            if type(target_qnode.id) is str:
                target_pass_nodes = [target_qnode.id]
            else:
                target_pass_nodes = target_qnode.id
            has_error, pass_nodes, not_pass_nodes = self._check_id(target_qnode.id, log)
            if has_error:
                return final_kg
            else:
                if len(not_pass_nodes) == 0 and len(pass_nodes) != 0:
                    target_pass_nodes = pass_nodes
                elif len(not_pass_nodes) != 0 and len(pass_nodes) != 0:
                    target_pass_nodes = pass_nodes
                    if len(not_pass_nodes) == 1:
                        log.warning(f"The curie id of {not_pass_nodes[0]} is not allowable based on CHP client")
                    else:
                        log.warning(f"The curie ids of these nodes {not_pass_nodes} are not allowable based on CHP client")
                else:
                    # NOTE(review): uses "CategoryError" here while the analogous source branch
                    # uses "NotAllowable" — presumably unintentional; confirm before changing
                    if type(target_qnode.id) is str:
                        log.error(f"The curie id of {target_qnode.id} is not allowable based on CHP client", error_code="CategoryError")
                        return final_kg
                    else:
                        log.error(f"The curie ids of {target_qnode.id} are not allowable based on CHP client", error_code="CategoryError")
                        return final_kg
        else:
            # Unpinned target qnode: fall back to its declared category
            category = target_qnode.category[0].replace('biolink:', '').replace('_', '').lower()
            target_category = category
            if (category in drug_label_list) or (category in gene_label_list):
                target_category = category
            else:
                log.error(f"The category of query node {target_qnode_key} is unsatisfiable. It has to be drug/chemical_substance or gene", error_code="CategoryError")
                return final_kg
        if (source_pass_nodes is None) and (target_pass_nodes is None):
            return final_kg
        elif (source_pass_nodes is not None) and (target_pass_nodes is not None):
            # Case 1: both ends pinned — query each drug/gene pairing individually
            source_dict = dict()
            target_dict = dict()
            if source_pass_nodes[0] in self.allowable_drug_curies:
                source_category_temp = 'drug'
            else:
                source_category_temp = 'gene'
            if target_pass_nodes[0] in self.allowable_drug_curies:
                target_category_temp = 'drug'
            else:
                target_category_temp = 'gene'
            if source_category_temp == target_category_temp:
                log.error(f"The query nodes in both ends of edge are the same type which is {source_category_temp}", error_code="CategoryError")
                return final_kg
            else:
                for (source_curie, target_curie) in itertools.product(source_pass_nodes, target_pass_nodes):
                    if source_category_temp == 'drug':
                        # CHP expects the 'CHEMBL:' prefix for drugs
                        source_curie_temp = source_curie.replace('CHEMBL.COMPOUND:', 'CHEMBL:')
                        # Let's build a simple single query
                        q = build_query(genes=[target_curie],
                                        therapeutic=source_curie_temp,
                                        disease='MONDO:0007254',
                                        outcome=('EFO:0000714', '>=', self.CHP_survival_threshold))
                        response = self.client.query(q)
                        max_probability = self.client.get_outcome_prob(response)
                        swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(target_curie, source_curie, "paired_with", max_probability)
                    else:
                        target_curie_temp = target_curie.replace('CHEMBL.COMPOUND:', 'CHEMBL:')
                        # Let's build a simple single query
                        q = build_query(genes=[source_curie],
                                        therapeutic=target_curie_temp,
                                        disease='MONDO:0007254',
                                        outcome=('EFO:0000714', '>=', self.CHP_survival_threshold))
                        response = self.client.query(q)
                        max_probability = self.client.get_outcome_prob(response)
                        swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(source_curie, target_curie, "paired_with", max_probability)
                    source_dict[source_curie] = source_qnode_key
                    target_dict[target_curie] = target_qnode_key
                    # Finally add the current edge to our answer knowledge graph
                    final_kg.add_edge(swagger_edge_key, swagger_edge, qedge_key)
                # Add the nodes to our answer knowledge graph
                if len(source_dict) != 0:
                    for source_curie in source_dict:
                        swagger_node_key, swagger_node = self._convert_to_swagger_node(source_curie)
                        final_kg.add_node(swagger_node_key, swagger_node, source_dict[source_curie])
                if len(target_dict) != 0:
                    for target_curie in target_dict:
                        swagger_node_key, swagger_node = self._convert_to_swagger_node(target_curie)
                        final_kg.add_node(swagger_node_key, swagger_node, target_dict[target_curie])
                return final_kg
        elif source_pass_nodes is not None:
            # Case 2: only the source is pinned — expand the unpinned target over all
            # CHP-allowable curies whose categories match the target qnode's category
            source_dict = dict()
            target_dict = dict()
            if source_pass_nodes[0] in self.allowable_drug_curies:
                source_category_temp = 'drug'
            else:
                source_category_temp = 'gene'
            if target_category in drug_label_list:
                target_category_temp = 'drug'
            else:
                target_category_temp = 'gene'
            if source_category_temp == target_category_temp:
                log.error(f"The query nodes in both ends of edge are the same type which is {source_category_temp}", error_code="CategoryError")
                return final_kg
            else:
                if source_category_temp == 'drug':
                    for source_curie in source_pass_nodes:
                        # All allowable genes whose synonymizer categories include the target category
                        genes = [curie for curie in self.allowable_gene_curies
                                 if self.synonymizer.get_canonical_curies(curie)[curie] is not None
                                 and target_category in [category.replace('biolink:', '').replace('_', '').lower()
                                                         for category in list(self.synonymizer.get_canonical_curies(curie, return_all_categories=True)[curie]['all_categories'].keys())]]
                        therapeutic = source_curie.replace('CHEMBL.COMPOUND:', 'CHEMBL:')
                        disease = 'MONDO:0007254'
                        outcome = ('EFO:0000714', '>=', self.CHP_survival_threshold)
                        queries = []
                        for gene in genes:
                            queries.append(build_query(genes=[gene],
                                                       therapeutic=therapeutic,
                                                       disease=disease,
                                                       outcome=outcome,))
                        # use the query_all endpoint to run the batch of queries
                        res = self.client.query_all(queries)
                        for result, gene in zip(res["message"], genes):
                            prob = self.client.get_outcome_prob(result)
                            swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(gene, source_curie, "paired_with", prob)
                            source_dict[source_curie] = source_qnode_key
                            target_dict[gene] = target_qnode_key
                            # Finally add the current edge to our answer knowledge graph
                            final_kg.add_edge(swagger_edge_key, swagger_edge, qedge_key)
                else:
                    for source_curie in source_pass_nodes:
                        genes = [source_curie]
                        # All allowable drugs whose synonymizer categories include the target category
                        therapeutic = [curie.replace('CHEMBL.COMPOUND:', 'CHEMBL:') for curie in self.allowable_drug_curies
                                       if self.synonymizer.get_canonical_curies(curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:'))[curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:')] is not None
                                       and target_category in [category.replace('biolink:', '').replace('_', '').lower()
                                                               for category in list(self.synonymizer.get_canonical_curies(curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:'), return_all_categories=True)[curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:')]['all_categories'].keys())]]
                        disease = 'MONDO:0007254'
                        outcome = ('EFO:0000714', '>=', self.CHP_survival_threshold)
                        queries = []
                        for drug in therapeutic:
                            queries.append(build_query(genes=genes,
                                                       therapeutic=drug,
                                                       disease=disease,
                                                       outcome=outcome,))
                        # use the query_all endpoint to run the batch of queries
                        res = self.client.query_all(queries)
                        for result, drug in zip(res["message"], therapeutic):
                            drug = drug.replace('CHEMBL:', 'CHEMBL.COMPOUND:')
                            prob = self.client.get_outcome_prob(result)
                            swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(source_curie, drug, "paired_with", prob)
                            source_dict[source_curie] = source_qnode_key
                            target_dict[drug] = target_qnode_key
                            # Finally add the current edge to our answer knowledge graph
                            final_kg.add_edge(swagger_edge_key, swagger_edge, qedge_key)
                # Add the nodes to our answer knowledge graph
                if len(source_dict) != 0:
                    for source_curie in source_dict:
                        swagger_node_key, swagger_node = self._convert_to_swagger_node(source_curie)
                        final_kg.add_node(swagger_node_key, swagger_node, source_dict[source_curie])
                if len(target_dict) != 0:
                    for target_curie in target_dict:
                        swagger_node_key, swagger_node = self._convert_to_swagger_node(target_curie)
                        final_kg.add_node(swagger_node_key, swagger_node, target_dict[target_curie])
                return final_kg
        else:
            # Case 3: only the target is pinned — mirror of case 2 with roles swapped
            source_dict = dict()
            target_dict = dict()
            if target_pass_nodes[0] in self.allowable_drug_curies:
                target_category_temp = 'drug'
            else:
                target_category_temp = 'gene'
            if source_category in drug_label_list:
                source_category_temp = 'drug'
            else:
                source_category_temp = 'gene'
            if source_category_temp == target_category_temp:
                log.error(f"The query nodes in both ends of edge are the same type which is {source_category_temp}", error_code="CategoryError")
                return final_kg
            else:
                if target_category_temp == 'drug':
                    for target_curie in target_pass_nodes:
                        # All allowable genes whose synonymizer categories include the source category
                        genes = [curie for curie in self.allowable_gene_curies
                                 if self.synonymizer.get_canonical_curies(curie)[curie] is not None
                                 and source_category in [category.replace('biolink:', '').replace('_', '').lower()
                                                         for category in list(self.synonymizer.get_canonical_curies(curie, return_all_categories=True)[curie]['all_categories'].keys())]]
                        therapeutic = target_curie.replace('CHEMBL.COMPOUND:', 'CHEMBL:')
                        disease = 'MONDO:0007254'
                        outcome = ('EFO:0000714', '>=', self.CHP_survival_threshold)
                        queries = []
                        for gene in genes:
                            queries.append(build_query(genes=[gene],
                                                       therapeutic=therapeutic,
                                                       disease=disease,
                                                       outcome=outcome,))
                        # use the query_all endpoint to run the batch of queries
                        res = self.client.query_all(queries)
                        for result, gene in zip(res["message"], genes):
                            prob = self.client.get_outcome_prob(result)
                            swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(gene, target_curie, "paired_with", prob)
                            source_dict[gene] = source_qnode_key
                            target_dict[target_curie] = target_qnode_key
                            # Finally add the current edge to our answer knowledge graph
                            final_kg.add_edge(swagger_edge_key, swagger_edge, qedge_key)
                else:
                    for target_curie in target_pass_nodes:
                        genes = [target_curie]
                        # All allowable drugs whose synonymizer categories include the source category
                        therapeutic = [curie.replace('CHEMBL.COMPOUND:', 'CHEMBL:') for curie in self.allowable_drug_curies
                                       if self.synonymizer.get_canonical_curies(curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:'))[curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:')] is not None
                                       and source_category in [category.replace('biolink:', '').replace('_', '').lower()
                                                               for category in list(self.synonymizer.get_canonical_curies(curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:'), return_all_categories=True)[curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:')]['all_categories'].keys())]]
                        disease = 'MONDO:0007254'
                        outcome = ('EFO:0000714', '>=', self.CHP_survival_threshold)
                        queries = []
                        for drug in therapeutic:
                            queries.append(build_query(genes=genes,
                                                       therapeutic=drug,
                                                       disease=disease,
                                                       outcome=outcome,))
                        # use the query_all endpoint to run the batch of queries
                        res = self.client.query_all(queries)
                        for result, drug in zip(res["message"], therapeutic):
                            drug = drug.replace('CHEMBL:', 'CHEMBL.COMPOUND:')
                            prob = self.client.get_outcome_prob(result)
                            swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(target_curie, drug, "paired_with", prob)
                            source_dict[drug] = source_qnode_key
                            target_dict[target_curie] = target_qnode_key
                            # Finally add the current edge to our answer knowledge graph
                            final_kg.add_edge(swagger_edge_key, swagger_edge, qedge_key)
                # Add the nodes to our answer knowledge graph
                if len(source_dict) != 0:
                    for source_curie in source_dict:
                        swagger_node_key, swagger_node = self._convert_to_swagger_node(source_curie)
                        final_kg.add_node(swagger_node_key, swagger_node, source_dict[source_curie])
                if len(target_dict) != 0:
                    for target_curie in target_dict:
                        swagger_node_key, swagger_node = self._convert_to_swagger_node(target_curie)
                        final_kg.add_node(swagger_node_key, swagger_node, target_dict[target_curie])
                return final_kg

    def _check_id(self, qnode_id, log):
        """Partition qnode curie(s) into CHP-allowable and non-allowable ones.

        :param qnode_id: A single curie (str) or list of curies from a query node.
        :param log: ARAXResponse object used for logging.
        :return: [has_error, pass_nodes, not_pass_nodes]; has_error is True only when
                 the input mixes gene and drug curies.
        """
        if type(qnode_id) is str:
            if qnode_id in self.allowable_gene_curies or qnode_id in self.allowable_drug_curies:
                return [False, [qnode_id], []]
            else:
                return [False, [], [qnode_id]]
        else:
            pass_nodes_gene_temp = list()
            pass_nodes_drug_temp = list()
            not_pass_nodes = list()
            for curie in qnode_id:
                if curie in self.allowable_gene_curies:
                    pass_nodes_gene_temp += [curie]
                elif curie in self.allowable_drug_curies:
                    pass_nodes_drug_temp += [curie]
                else:
                    not_pass_nodes += [curie]
            if len(pass_nodes_gene_temp) != 0 and len(pass_nodes_drug_temp) != 0:
                # A single qnode may not contain both gene and drug curies
                log.error(f"The curie ids of {qnode_id} contain both gene and drug", error_code="MixedTypes")
                return [True, [], []]
            else:
                pass_nodes = pass_nodes_gene_temp + pass_nodes_drug_temp
                return [False, pass_nodes, not_pass_nodes]

    def _convert_to_swagger_edge(self, subject: str, object: str, name: str, value: float) -> Tuple[str, Edge]:
        """Build a TRAPI-style Edge (plus its key) carrying the CHP probability as an attribute.

        :param subject: Subject node curie.
        :param object: Object node curie.
        :param name: Predicate name (without the 'biolink:' prefix).
        :param value: The survival probability returned by CHP.
        :return: Tuple of (edge key, Edge object).
        """
        swagger_edge = Edge()
        swagger_edge.predicate = f"biolink:{name}"
        swagger_edge.subject = subject
        swagger_edge.object = object
        swagger_edge_key = f"CHP:{subject}-{name}-{object}"
        swagger_edge.relation = None
        type = "EDAM:data_0951"
        url = "https://github.com/di2ag/chp_client"
        swagger_edge.attributes = [Attribute(type=type, name=name, value=str(value), url=url),
                                   Attribute(name="provided_by", value=self.kp_name, type=eu.get_attribute_type("provided_by")),
                                   Attribute(name="is_defined_by", value="ARAX", type=eu.get_attribute_type("is_defined_by"))]
        return swagger_edge_key, swagger_edge

    def _convert_to_swagger_node(self, node_key: str) -> Tuple[str, Node]:
        """Build a TRAPI-style Node (plus its key) from the synonymizer's canonical info.

        :param node_key: The node's curie.
        :return: Tuple of (node key, Node object).
        """
        swagger_node = Node()
        swagger_node_key = node_key
        swagger_node.name = self.synonymizer.get_canonical_curies(node_key)[node_key]['preferred_name']
        swagger_node.description = None
        swagger_node.category = self.synonymizer.get_canonical_curies(node_key)[node_key]['preferred_category']
        return swagger_node_key, swagger_node
def main():
    """Refresh the DTD (drug-treats-disease) probability database and model-graph database.

    Re-canonicalizes the curies in an existing DTD probability sqlite DB and in the DTD
    model's embedding/mapping files using a NodeSynonymizer, writing refreshed sqlite
    databases into the output folder.
    """
    parser = argparse.ArgumentParser(description="Refresh DTD model and database", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # NOTE(review): '--synoymizer_folder' looks like a typo for 'synonymizer', but renaming
    # it would break existing callers of this CLI — leaving as is
    parser.add_argument('--synoymizer_folder', type=str, help="Full path of folder containing NodeSynonymizer", default='~/RTX/code/ARAX/NodeSynonymizer/')
    parser.add_argument('--DTD_prob_db_file', type=str, help="Full path of DTD probability database file", default='~/work/RTX/code/ARAX/KnowledgeSources/Prediction/DTD_probability_database_v1.0_KG2.3.4.db')
    parser.add_argument('--emb_file', type=str, help="Full path of DTD model embedding file", default='~/work/RTX/code/ARAX/KnowledgeSources/Prediction/rel_max_v1.0_KG2.3.4.emb.gz')
    parser.add_argument('--map_file', type=str, help="Full path of DTD model mapping file", default='~/work/RTX/code/ARAX/KnowledgeSources/Prediction/map_v1.0_KG2.3.4.txt')
    parser.add_argument('--output_folder', type=str, help="Full path of output folder", default='~/work/RTX/code/ARAX/KnowledgeSources/Prediction/')
    args = parser.parse_args()

    # Import the NodeSynonymizer from the user-supplied folder (not installed as a package)
    if os.path.isdir(args.synoymizer_folder):
        sys.path.append(args.synoymizer_folder)
        from node_synonymizer import NodeSynonymizer
        synonymizer = NodeSynonymizer()
    else:
        print(f"Error: Not found this folder: {args.synoymizer_folder}")
        # NOTE(review): exits with status 0 on an error path — consider sys.exit(1)
        exit(0)

    # Step 1: refresh the DTD probability database (disease, drug, probability rows)
    if os.path.isfile(args.DTD_prob_db_file):
        print(f'Start to refresh DTD_probability_database.db', flush=True)
        con = sqlite3.connect(args.DTD_prob_db_file)
        DTD_prob_table = pd.read_sql_query("SELECT * from DTD_PROBABILITY", con)
        con.close()
        # Re-canonicalize the disease and drug curies; rows that fail become NaN and are dropped
        DTD_prob_table = DTD_prob_table.apply(lambda row: [refresh_disease(row[0], synonymizer), refresh_drug(row[1], synonymizer), row[2]], axis=1, result_type='expand')
        DTD_prob_table = DTD_prob_table.dropna().reset_index(drop=True)
        con = sqlite3.connect(os.path.join(args.output_folder, 'DTD_probability_database_refreshed.db'))
        con.execute(f"CREATE TABLE DTD_PROBABILITY( disease VARCHAR(255), drug VARCHAR(255), probability FLOAT )")
        insert_command = "INSERT INTO DTD_PROBABILITY VALUES (?, ?, ?)"
        databasefile = list(DTD_prob_table.to_records(index=False))

        print(f"INFO: Populating table", flush=True)
        # NOTE(review): insert_command is re-assigned the identical string here — redundant
        insert_command = "INSERT INTO DTD_PROBABILITY VALUES (?, ?, ?)"
        # Insert in batches of 5000 rows, committing and reporting progress after each batch
        batch = list(range(0, len(databasefile), 5000))
        batch.append(len(databasefile))
        count = 0
        for i in range(len(batch)):
            if ((i + 1) < len(batch)):
                start = batch[i]
                end = batch[i + 1]
                rows = databasefile[start:end]
                con.executemany(insert_command, rows)
                con.commit()
                count = count + len(rows)
                percentage = round((count * 100.0 / len(databasefile)), 2)
                print(str(percentage) + "%..", end='', flush=True)

        print(f"INFO: Populating tables is completed", flush=True)
        print(f"INFO: Creating INDEXes on DTD_PROBABILITY", flush=True)
        con.execute(f"CREATE INDEX idx_DTD_PROBABILITY_disease ON DTD_PROBABILITY(disease)")
        con.execute(f"CREATE INDEX idx_DTD_PROBABILITY_drug ON DTD_PROBABILITY(drug)")
        con.commit()
        con.close()
        print(f"INFO: Creating INDEXes is completed", flush=True)
    else:
        print(f"Error: Not found this file: {args.DTD_prob_db_file}")
        # NOTE(review): exits with status 0 on an error path — consider sys.exit(1)
        exit(0)

    # Step 2: refresh the model graph database from the embedding + mapping files
    if os.path.isfile(args.emb_file) and os.path.isfile(args.map_file):
        rel_max = pd.read_csv(args.emb_file, sep=' ', skiprows=1, header=None)
        mapfile = pd.read_csv(args.map_file, sep='\t', header=0)
        # Join embeddings to curies via the integer node id, keeping curie + embedding columns
        merged_table = mapfile.merge(rel_max, left_on='id', right_on=0)
        merged_table = merged_table.loc[:, ['curie'] + list(merged_table.columns)[3:]]
        # Re-canonicalize each curie; unrecognized ones become None and are dropped below
        new_curie_ids = [synonymizer.get_canonical_curies(curie)[curie]['preferred_curie'] if synonymizer.get_canonical_curies(curie)[curie] is not None else None for curie in list(merged_table.curie)]
        graph = pd.concat([pd.DataFrame(new_curie_ids), merged_table.iloc[:, 1:]], axis=1)
        graph = graph.dropna().reset_index(drop=True)
        con = sqlite3.connect(os.path.join(args.output_folder, 'GRAPH_refreshed.sqlite'))
        con.execute(f"DROP TABLE IF EXISTs GRAPH")
        # Build the CREATE TABLE statement dynamically: one INT column per embedding dimension
        insert_command1 = f"CREATE TABLE GRAPH(curie VARCHAR(255)"
        for num in range(1, graph.shape[1]):
            insert_command1 = insert_command1 + f", col{num} INT"
        insert_command1 = insert_command1 + ")"
        con.execute(insert_command1)
        con.commit()
        count = 0
        print(f"Insert data into database", flush=True)
        # Insert rows one at a time, committing and reporting progress every 5000 rows
        for row in range(graph.shape[0]):
            count = count + 1
            insert_command1 = f"INSERT INTO GRAPH"
            insert_command2 = f" values ("
            for _ in range(graph.shape[1]):
                insert_command2 = insert_command2 + f"?,"
            insert_command = insert_command1 + insert_command2 + ")"
            insert_command = insert_command.replace(',)', ')')
            line = tuple(graph.loc[row, :])
            con.execute(insert_command, line)
            if count % 5000 == 0:
                con.commit()
                percentage = int(count * 100.0 / graph.shape[0])
                print(str(percentage) + "%..", end='', flush=True)
        con.commit()
        percentage = int(count * 100.0 / graph.shape[0])
        print(str(percentage) + "%..", end='', flush=True)
        con.execute(f"CREATE INDEX idx_GRAPH_curie ON GRAPH(curie)")
        con.commit()
        con.close()
        print(f"INFO: Database created successfully", flush=True)
def _canonicalize_nodes(
        nodes: List[Dict[str, any]]
) -> Tuple[Dict[str, Dict[str, any]], Dict[str, str]]:
    """Collapse raw KG2 nodes onto their canonical equivalents.

    Nodes that map to the same canonical curie are merged (publications,
    names, and descriptions are combined via _merge_two_lists).

    :param nodes: list of node dicts (each with at least an 'id' key)
    :return: a tuple of (canonicalized nodes keyed by canonical curie,
             mapping from each input node id to its canonical curie)
    """
    synonymizer = NodeSynonymizer()
    node_ids = [entry.get('id') for entry in nodes if entry.get('id')]
    print(f" Sending NodeSynonymizer.get_canonical_curies() {len(node_ids)} curies..")
    canonicalized_info = synonymizer.get_canonical_curies(curies=node_ids,
                                                          return_all_types=True)
    all_canonical_curies = {info['preferred_curie']
                            for info in canonicalized_info.values() if info}
    print(f" Sending NodeSynonymizer.get_equivalent_nodes() {len(all_canonical_curies)} curies..")
    equivalent_curies_info = synonymizer.get_equivalent_nodes(all_canonical_curies)
    # Keep only the curies the synonymizer actually returned equivalents for
    equivalent_curies_dict = {curie: list(equivalents)
                              for curie, equivalents in equivalent_curies_info.items()
                              if equivalents}
    print(f" Creating canonicalized nodes..")
    curie_map = {}
    canonicalized_nodes = {}
    for node in nodes:
        canonical_info = canonicalized_info.get(node['id'])
        if canonical_info:
            canonicalized_curie = canonical_info.get('preferred_curie', node['id'])
        else:
            canonicalized_curie = node['id']
        publications = node['publications'] if node.get('publications') else []
        description_in_list = [node['description']] if node.get('description') else []
        if canonicalized_curie not in canonicalized_nodes:
            # First time we've seen this canonical curie: build a fresh node for it
            canonicalized_node = _create_node(
                node_id=canonicalized_curie,
                name=canonical_info['preferred_name'] if canonical_info else node['name'],
                preferred_type=canonical_info['preferred_type'] if canonical_info else node['category_label'],
                types=list(canonical_info['all_types']) if canonical_info else [node['category_label']],
                publications=publications,
                equivalent_curies=equivalent_curies_dict.get(canonicalized_curie, []),
                iri=node['iri'] if node['id'] == canonicalized_curie else None,
                description=description_in_list,
                all_names=[node['name']])
            canonicalized_nodes[canonicalized_node['id']] = canonicalized_node
        else:
            # Merge this node's info into the canonical node we already created
            merged = canonicalized_nodes[canonicalized_curie]
            merged['publications'] = _merge_two_lists(merged['publications'], publications)
            merged['all_names'] = _merge_two_lists(merged['all_names'], [node['name']])
            merged['description'] = _merge_two_lists(merged['description'], description_in_list)
            # Add the IRI for the 'preferred' curie, if we've found that node
            if node['id'] == canonicalized_curie:
                merged['iri'] = node.get('iri')
        curie_map[node['id']] = canonicalized_curie  # Record this mapping for easy lookup later
    return canonicalized_nodes, curie_map
def query_size_of_adjacent_nodes(self, node_curie, source_type, adjacent_type, kp="infores:rtx-kg2", rel_type=None):
    """
    Query adjacent nodes of a given source node based on adjacent node type.
    :param node_curie: (required) the curie id of query node. It accepts both single curie id or curie id list eg. "UniProtKB:P14136" or ['UniProtKB:P02675', 'UniProtKB:P01903', 'UniProtKB:P09601', 'UniProtKB:Q02878']
    :param source_type: (required) the type of source node, eg. "gene"
    :param adjacent_type: (required) the type of adjacent node, eg. "biological_process"
    :param kp: (optional) the knowledge provider to use, eg. "infores:rtx-kg2"(default)
    :param rel_type: (optional) edge type to consider, eg. "involved_in"
    :return a tuple with a dict containing the number of adjacent nodes for the query node and a list of removed nodes
    """
    res = None
    # Normalize both category labels: strip the 'biolink:' prefix, snake_case them, then
    # re-format to the biolink convention expected downstream.
    source_type = ComputeFTEST.convert_string_to_snake_case(source_type.replace('biolink:',''))
    source_type = ComputeFTEST.convert_string_biolinkformat(source_type)
    adjacent_type = ComputeFTEST.convert_string_to_snake_case(adjacent_type.replace('biolink:',''))
    adjacent_type = ComputeFTEST.convert_string_biolinkformat(adjacent_type)
    if rel_type is None:
        # Fast path: no edge type requested, so look the counts up in the precomputed
        # kg2c sqlite 'neighbors' table instead of running an ARAX expand query.
        nodesynonymizer = NodeSynonymizer()
        normalized_nodes = nodesynonymizer.get_canonical_curies(node_curie)
        failure_nodes = list()
        # Map each input curie to its canonical (preferred) curie; inputs the
        # synonymizer didn't recognize are recorded as failures.
        mapping = {node:normalized_nodes[node]['preferred_curie'] for node in normalized_nodes if normalized_nodes[node] is not None}
        failure_nodes += list(normalized_nodes.keys() - mapping.keys())
        query_nodes = list(set(mapping.values()))
        # Escape single quotes for embedding in the SQL IN (...) list below
        query_nodes = [curie_id.replace("'", "''") if "'" in curie_id else curie_id for curie_id in query_nodes]
        # special_curie_ids = [curie_id for curie_id in query_nodes if "'" in curie_id]
        # Get connected to kg2c sqlite
        connection = sqlite3.connect(self.sqlite_file_path)
        cursor = connection.cursor()
        # Extract the neighbor count data
        node_keys_str = "','".join(query_nodes)  # SQL wants ('node1', 'node2') format for string lists
        sql_query = f"SELECT N.id, N.neighbor_counts " \
                    f"FROM neighbors AS N " \
                    f"WHERE N.id IN ('{node_keys_str}')"
        cursor.execute(sql_query)
        rows = cursor.fetchall()
        # NOTE(review): fetchall() returns tuples, and "'" in a tuple tests element
        # membership, so this comprehension is effectively a no-op — the intended
        # un-escaping of quoted curie ids never happens. TODO confirm and fix upstream.
        rows = [curie_id.replace("\'","'").replace("''", "'") if "'" in curie_id else curie_id for curie_id in rows]
        connection.close()
        # Load the counts into a dictionary
        # NOTE(review): eval() on database contents — acceptable only because this
        # sqlite file is a trusted build artifact; never point this at untrusted data.
        neighbor_counts_dict = {row[0]:eval(row[1]) for row in rows}
        # Per input curie, pull the count of neighbors of the requested adjacent type;
        # curies with no such entry become failures.
        res_dict = {node:neighbor_counts_dict[mapping[node]].get(adjacent_type) for node in mapping if mapping[node] in neighbor_counts_dict and neighbor_counts_dict[mapping[node]].get(adjacent_type) is not None}
        failure_nodes += list(mapping.keys() - res_dict.keys())
        if len(failure_nodes) != 0:
            return (res_dict, failure_nodes)
        else:
            return (res_dict, [])
    else:
        # Slow path: a specific edge type was requested, so run an actual ARAX
        # expand query and count matching edges per input curie.
        if kp == 'ARAX/KG1':
            self.response.warning(f"Since the edge type '{rel_type}' is from KG1, we still use the DSL expand(kg=ARAX/KG1) to query neighbor count. However, the total node count is based on KG2c from 'nodesynonymizer.get_total_entity_count'. So the FET result might not be accurate.")
        # construct the instance of ARAXQuery class
        araxq = ARAXQuery()
        # check if node_curie is a str or a list
        if type(node_curie) is str:
            query_node_curie = node_curie
        elif type(node_curie) is list:
            # Render the python list as a DSL-style "[a,b,c]" string
            node_id_list_str = "["
            for index in range(len(node_curie)):
                node = node_curie[index]
                if index + 1 == len(node_curie):
                    node_id_list_str = node_id_list_str + str(node) + "]"
                else:
                    node_id_list_str = node_id_list_str + str(node) + ","
            query_node_curie = node_id_list_str
        else:
            self.response.error("The 'node_curie' argument of 'query_size_of_adjacent_nodes' method within FET only accepts str or list")
            return res
        # call the method of ARAXQuery class to query adjacent node
        query = {"operations": {"actions": [
            "create_message",
            f"add_qnode(ids={query_node_curie}, categories={source_type}, key=FET_n00)",
            f"add_qnode(categories={adjacent_type}, key=FET_n01)",
            f"add_qedge(subject=FET_n00, object=FET_n01, key=FET_e00, predicates={rel_type})",
            f"expand(edge_key=FET_e00,kp={kp})",
            #"resultify()",
            "return(message=true, store=false)"
        ]}}
        try:
            result = araxq.query(query)
            if result.status != 'OK':
                self.response.error(f"Fail to query adjacent nodes from infores:rtx-kg2 for {node_curie}")
                return res
            else:
                res_dict = dict()
                message = araxq.response.envelope.message
                if type(node_curie) is str:
                    # Count distinct edges touching the query curie on either end
                    tmplist = set([edge_key for edge_key in message.knowledge_graph.edges if message.knowledge_graph.edges[edge_key].subject == node_curie or message.knowledge_graph.edges[edge_key].object == node_curie])  ## edge has no direction
                    if len(tmplist) == 0:
                        self.response.warning(f"Fail to query adjacent nodes from {kp} for {node_curie} in FET probably because expander ignores node type. For more details, please see issue897.")
                        return (res_dict,[node_curie])
                    res_dict[node_curie] = len(tmplist)
                    return (res_dict,[])
                else:
                    check_empty = False
                    failure_nodes = list()
                    for node in node_curie:
                        tmplist = set([edge_key for edge_key in message.knowledge_graph.edges if message.knowledge_graph.edges[edge_key].subject == node or message.knowledge_graph.edges[edge_key].object == node])  ## edge has no direction
                        if len(tmplist) == 0:
                            self.response.warning(f"Fail to query adjacent nodes from {kp} for {node} in FET probably because expander ignores node type. For more details, please see issue897.")
                            failure_nodes.append(node)
                            check_empty = True
                            continue
                        res_dict[node] = len(tmplist)
                    if check_empty is True:
                        return (res_dict,failure_nodes)
                    else:
                        return (res_dict,[])
        except:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Something went wrong with querying adjacent nodes from {kp} for {node_curie}")
            return res
class NGDDatabaseBuilder:
    """Builds the curie->PMIDs sqlite database used for Normalized Google Distance.

    Two-phase build: (1) scrape a local PubMed XML download into a pickledb of
    concept name -> PMIDs, then (2) combine that with publication info from the
    KG2 neo4j instance and save canonical-curie -> PMIDs rows into sqlite.
    """

    def __init__(self, pubmed_directory_path, is_test, live="Production"):
        # is_test limits the build to a single pubmed file / 100 neo4j rows
        self.RTXConfig = RTXConfiguration()
        self.RTXConfig.live = live
        # NOTE(review): relies on module-level `pathlist` / `RTXindex` defined
        # elsewhere in this file — verify they exist at import time.
        ngd_filepath = os.path.sep.join([*pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'KnowledgeSources', 'NormalizedGoogleDistance'])
        self.pubmed_directory_path = pubmed_directory_path
        # Intermediate pickledb lives in the working directory
        self.conceptname_to_pmids_db_path = "conceptname_to_pmids.db"
        # Final sqlite path: NGD dir + file name taken from the configured remote path
        self.curie_to_pmids_db_path = f"{ngd_filepath}{os.path.sep}{self.RTXConfig.curie_to_pmids_path.split('/')[-1]}"
        self.status = 'OK'
        self.synonymizer = NodeSynonymizer()
        self.is_test = is_test

    def build_conceptname_to_pmids_db(self):
        # This function extracts curie -> PMIDs mappings from a Pubmed XML download (saves data in a pickledb)
        print(f"Starting to build {self.conceptname_to_pmids_db_path} from pubmed files..")
        start = time.time()
        pubmed_directory = os.fsencode(self.pubmed_directory_path)
        all_file_names = [os.fsdecode(file) for file in os.listdir(pubmed_directory)]
        pubmed_file_names = [file_name for file_name in all_file_names if file_name.startswith('pubmed') and file_name.endswith('.xml.gz')]
        if not pubmed_file_names:
            print(f"ERROR: Couldn't find any PubMed XML files to scrape. Provide the path to the directory "
                  f"containing your PubMed download as a command line argument.")
            self.status = 'ERROR'
        else:
            conceptname_to_pmids_map = dict()
            # Go through each downloaded pubmed file and build our dictionary of mappings
            pubmed_file_names_to_process = pubmed_file_names if not self.is_test else pubmed_file_names[:1]
            for file_name in pubmed_file_names_to_process:
                print(f" Starting to process file '{file_name}'.. ({pubmed_file_names_to_process.index(file_name) + 1}"
                      f" of {len(pubmed_file_names_to_process)})")
                file_start_time = time.time()
                with gzip.open(f"{self.pubmed_directory_path}/{file_name}") as pubmed_file:
                    file_contents_tree = etree.parse(pubmed_file)
                pubmed_articles = file_contents_tree.xpath("//PubmedArticle")
                for article in pubmed_articles:
                    # Link each concept name to the PMID of this article
                    current_pmid = article.xpath(".//MedlineCitation/PMID/text()")[0]
                    # Gather every kind of concept name attached to the article
                    descriptor_names = article.xpath(".//MedlineCitation/MeshHeadingList/MeshHeading/DescriptorName/text()")
                    qualifier_names = article.xpath(".//MedlineCitation/MeshHeadingList/MeshHeading/QualifierName/text()")
                    chemical_names = article.xpath(".//MedlineCitation/ChemicalList/Chemical/NameOfSubstance/text()")
                    gene_symbols = article.xpath(".//MedlineCitation/GeneSymbolList/GeneSymbol/text()")
                    keywords = article.xpath(".//MedlineCitation/KeywordList/Keyword/text()")
                    all_concept_names = descriptor_names + qualifier_names + chemical_names + gene_symbols + keywords
                    unique_concept_names = {concept_name for concept_name in all_concept_names if concept_name}
                    for concept_name in unique_concept_names:
                        self._add_pmids_mapping(concept_name, current_pmid, conceptname_to_pmids_map)
                self._destroy_etree(file_contents_tree)  # Hack around lxml memory leak
                print(f" took {round((time.time() - file_start_time) / 60, 2)} minutes")
            # Save the data to the PickleDB after we're done
            print(" Loading data into PickleDB..")
            conceptname_to_pmids_db = pickledb.load(self.conceptname_to_pmids_db_path, False)
            for concept_name, pmid_list in conceptname_to_pmids_map.items():
                # De-duplicate and convert local ids to PMID curies before storing
                conceptname_to_pmids_db.set(concept_name, list({self._create_pmid_curie_from_local_id(pmid) for pmid in pmid_list}))
            print(" Saving PickleDB file..")
            conceptname_to_pmids_db.dump()
            print(f"Done! Building {self.conceptname_to_pmids_db_path} took {round(((time.time() - start) / 60) / 60, 3)} hours")

    def build_curie_to_pmids_db(self):
        # This function creates a final sqlite database of curie->PMIDs mappings using data scraped from Pubmed AND KG2
        print(f"Starting to build {self.curie_to_pmids_db_path.split(os.path.sep)[-1]}..")
        start = time.time()
        curie_to_pmids_map = dict()
        self._add_pmids_from_pubmed_scrape(curie_to_pmids_map)
        if self.status != 'OK':
            # Pubmed scrape data was missing/invalid; abort the build
            return
        self._add_pmids_from_kg2_edges(curie_to_pmids_map)
        self._add_pmids_from_kg2_nodes(curie_to_pmids_map)
        print(f" In the end, found PMID lists for {len(curie_to_pmids_map)} (canonical) curies")
        self._save_data_in_sqlite_db(curie_to_pmids_map)
        print(f"Done! Building {self.curie_to_pmids_db_path.split(os.path.sep)[-1]} took {round((time.time() - start) / 60)} minutes.")

    # Helper methods

    def _add_pmids_from_kg2_edges(self, curie_to_pmids_map):
        # Harvest publications listed on KG2 edges; each PMID is credited to both endpoint nodes
        print(f" Getting PMIDs from edges in KG2 neo4j..")
        edge_query = f"match (n)-[e]->(m) where e.publications is not null and e.publications <> '[]' " \
                     f"return distinct n.id, m.id, e.publications{' limit 100' if self.is_test else ''}"
        edge_results = self._run_cypher_query(edge_query, 'KG2')
        print(f" Processing results..")
        node_ids = {result['n.id'] for result in edge_results}.union(result['m.id'] for result in edge_results)
        canonicalized_curies_dict = self._get_canonicalized_curies_dict(list(node_ids))
        for result in edge_results:
            canonicalized_node_ids = {canonicalized_curies_dict[result['n.id']], canonicalized_curies_dict[result['m.id']]}
            pmids = self._extract_and_format_pmids(result['e.publications'])
            if pmids:  # Sometimes publications list includes only non-PMID identifiers (like ISBN)
                for canonical_curie in canonicalized_node_ids:
                    self._add_pmids_mapping(canonical_curie, pmids, curie_to_pmids_map)

    def _add_pmids_from_kg2_nodes(self, curie_to_pmids_map):
        # Harvest publications listed directly on KG2 nodes
        print(f" Getting PMIDs from nodes in KG2 neo4j..")
        node_query = f"match (n) where n.publications is not null and n.publications <> '[]' " \
                     f"return distinct n.id, n.publications{' limit 100' if self.is_test else ''}"
        node_results = self._run_cypher_query(node_query, 'KG2')
        print(f" Processing results..")
        node_ids = {result['n.id'] for result in node_results}
        canonicalized_curies_dict = self._get_canonicalized_curies_dict(list(node_ids))
        for result in node_results:
            canonical_curie = canonicalized_curies_dict[result['n.id']]
            pmids = self._extract_and_format_pmids(result['n.publications'])
            if pmids:  # Sometimes publications list includes only non-PMID identifiers (like ISBN)
                self._add_pmids_mapping(canonical_curie, pmids, curie_to_pmids_map)

    def _add_pmids_from_pubmed_scrape(self, curie_to_pmids_map):
        # Load the data from the first half of the build process (scraping pubmed)
        print(f" Loading pickle DB containing pubmed scrapings ({self.conceptname_to_pmids_db_path})..")
        conceptname_to_pmids_db = pickledb.load(self.conceptname_to_pmids_db_path, False)
        if not conceptname_to_pmids_db.getall():
            print(f"ERROR: {self.conceptname_to_pmids_db_path} must exist to do a partial build. Use --full or locate "
                  f"that file.")
            self.status = 'ERROR'
            return
        # Get canonical curies for all of the concept names in our big pubmed pickleDB using the NodeSynonymizer
        concept_names = list(conceptname_to_pmids_db.getall())
        print(f" Sending NodeSynonymizer.get_canonical_curies() a list of {len(concept_names)} concept names..")
        canonical_curies_dict = self.synonymizer.get_canonical_curies(names=concept_names)
        print(f" Got results back from NodeSynonymizer. (Returned dict contains {len(canonical_curies_dict)} keys.)")
        # Map all of the concept names scraped from pubmed to curies
        if canonical_curies_dict:
            recognized_concepts = {concept for concept in canonical_curies_dict if canonical_curies_dict.get(concept)}
            print(f" NodeSynonymizer recognized {round((len(recognized_concepts) / len(concept_names)) * 100)}% of "
                  f"concept names scraped from pubmed.")
            # Store which concept names the NodeSynonymizer didn't know about, for learning purposes
            unrecognized_concepts = set(canonical_curies_dict).difference(recognized_concepts)
            with open('unrecognized_pubmed_concept_names.txt', 'w+') as unrecognized_concepts_file:
                unrecognized_concepts_file.write(f"{unrecognized_concepts}")
            print(f" Unrecognized concept names were written to 'unrecognized_pubmed_concept_names.txt'.")
            # Map the canonical curie for each recognized concept to the concept's PMID list
            print(f" Mapping canonical curies to PMIDs..")
            for concept_name in recognized_concepts:
                canonical_curie = canonical_curies_dict[concept_name].get('preferred_curie')
                pmids_for_this_concept = conceptname_to_pmids_db.get(concept_name)
                self._add_pmids_mapping(canonical_curie, pmids_for_this_concept, curie_to_pmids_map)
            print(f" Mapped {len(curie_to_pmids_map)} canonical curies to PMIDs based on pubmed scrapings.")
        else:
            print(f"ERROR: NodeSynonymizer didn't return anything!")
            self.status = 'ERROR'

    def _save_data_in_sqlite_db(self, curie_to_pmids_map):
        # Persist the final curie -> PMIDs map as (curie TEXT, pmids TEXT-of-JSON) rows
        print(" Loading data into sqlite database..")
        # Remove any preexisting version of this database
        if os.path.exists(self.curie_to_pmids_db_path):
            os.remove(self.curie_to_pmids_db_path)
        connection = sqlite3.connect(self.curie_to_pmids_db_path)
        cursor = connection.cursor()
        cursor.execute("CREATE TABLE curie_to_pmids (curie TEXT, pmids TEXT)")
        cursor.execute("CREATE UNIQUE INDEX unique_curie ON curie_to_pmids (curie)")
        print(f" Gathering row data..")
        # PMIDs are stored as JSON lists of ints; filter(None, ...) drops unparseable ids
        rows = [[curie, json.dumps(list(filter(None, {self._get_local_id_as_int(pmid) for pmid in pmids})))]
                for curie, pmids in curie_to_pmids_map.items()]
        rows_in_chunks = self._divide_list_into_chunks(rows, 5000)
        print(f" Inserting row data into database..")
        for chunk in rows_in_chunks:
            cursor.executemany(f"INSERT INTO curie_to_pmids (curie, pmids) VALUES (?, ?)", chunk)
            connection.commit()
        # Log how many rows we've added in the end (for debugging purposes)
        cursor.execute(f"SELECT COUNT(*) FROM curie_to_pmids")
        count = cursor.fetchone()[0]
        print(f" Done saving data in sqlite; database contains {count} rows.")
        cursor.close()

    def _get_canonicalized_curies_dict(self, curies: List[str]) -> Dict[str, str]:
        # Map each input curie to its canonical curie, falling back to the input
        # curie itself when the synonymizer doesn't recognize it.
        print(f" Sending a batch of {len(curies)} curies to NodeSynonymizer.get_canonical_curies()")
        canonicalized_nodes_info = self.synonymizer.get_canonical_curies(curies)
        canonicalized_curies_dict = dict()
        for input_curie, preferred_info_dict in canonicalized_nodes_info.items():
            if preferred_info_dict:
                canonicalized_curies_dict[input_curie] = preferred_info_dict.get('preferred_curie', input_curie)
            else:
                canonicalized_curies_dict[input_curie] = input_curie
        print(f" Got results back from synonymizer")
        return canonicalized_curies_dict

    def _extract_and_format_pmids(self, publications: List[str]) -> List[str]:
        # Keep only PMID-ish identifiers out of a mixed publications list
        pmids = {publication_id for publication_id in publications if publication_id.upper().startswith('PMID')}
        # Make sure all PMIDs are given in same format (e.g., PMID:18299583 rather than PMID18299583)
        formatted_pmids = [self._create_pmid_curie_from_local_id(pmid.replace('PMID', '').replace(':', '')) for pmid in pmids]
        return formatted_pmids

    @staticmethod
    def _add_pmids_mapping(key: str, value_to_append: Union[str, List[str]], mappings_dict: Dict[str, List[str]]):
        # Append one PMID or a list of PMIDs to the key's list, creating it if needed
        if key not in mappings_dict:
            mappings_dict[key] = []
        if isinstance(value_to_append, list):
            mappings_dict[key] += value_to_append
        else:
            mappings_dict[key].append(value_to_append)

    @staticmethod
    def _create_pmid_curie_from_local_id(pmid):
        # e.g. "1234" -> "PMID:1234"
        return f"PMID:{pmid}"

    @staticmethod
    def _get_local_id_as_int(curie):
        # Converts "PMID:1234" to 1234
        curie_pieces = curie.split(":")
        local_id_str = curie_pieces[-1]
        # Remove any strange characters (like in "PMID:_19960544")
        stripped_id_str = "".join([character for character in local_id_str if character.isdigit()])
        return int(stripped_id_str) if stripped_id_str else None

    @staticmethod
    def _destroy_etree(file_contents_tree):
        # Thank you to https://stackoverflow.com/a/49139904 for this method; important to prevent memory blow-up
        root = file_contents_tree.getroot()
        element_tracker = {root: [0, None]}
        for element in root.iterdescendants():
            parent = element.getparent()
            element_tracker[element] = [element_tracker[parent][0] + 1, parent]
        # Detach children deepest-first so each remove() is on a still-attached parent
        element_tracker = sorted([(depth, parent, child) for child, (depth, parent) in element_tracker.items()],
                                 key=lambda x: x[0], reverse=True)
        for _, parent, child in element_tracker:
            if parent is None:
                break
            parent.remove(child)
        del file_contents_tree

    @staticmethod
    def _run_cypher_query(cypher_query: str, kg='KG2') -> List[Dict[str, any]]:
        # Run a cypher query against the configured neo4j; returns [] on any failure
        rtxc = RTXConfiguration()
        if kg == 'KG2':
            rtxc.live = "KG2"
        try:
            driver = GraphDatabase.driver(rtxc.neo4j_bolt, auth=(rtxc.neo4j_username, rtxc.neo4j_password))
            with driver.session() as session:
                query_results = session.run(cypher_query).data()
            driver.close()
        except Exception:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            print(f"Encountered an error interacting with {kg} neo4j. {tb}")
            return []
        else:
            return query_results

    @staticmethod
    def _divide_list_into_chunks(input_list: List[any], chunk_size: int) -> List[List[any]]:
        # Split input_list into consecutive chunks of at most chunk_size items
        num_chunks = len(input_list) // chunk_size if len(input_list) % chunk_size == 0 else (len(input_list) // chunk_size) + 1
        start_index = 0
        stop_index = chunk_size
        all_chunks = []
        for num in range(num_chunks):
            chunk = input_list[start_index:stop_index] if stop_index <= len(input_list) else input_list[start_index:]
            all_chunks.append(chunk)
            start_index += chunk_size
            stop_index += chunk_size
        return all_chunks
def assess(self, message):
    """Assess the incoming QueryGraph for basic information and geometry.

    Computes per-node/per-edge stats, identifies start/end nodes, walks the
    graph into a linear node/edge order, and renders a text template of the
    graph geometry (stored on self) for later template matching.

    :param message: the TRAPI-style message whose query_graph is assessed
    :return: ARAXResponse with status/errors; also populated on self.response
    """

    #### Define a default response
    response = ARAXResponse()
    self.response = response
    self.message = message
    response.debug(f"Assessing the QueryGraph for basic information")

    #### Get shorter handles
    query_graph = message.query_graph
    nodes = query_graph.nodes
    edges = query_graph.edges

    #### Store number of nodes and edges
    self.n_nodes = len(nodes)
    self.n_edges = len(edges)
    response.debug(f"Found {self.n_nodes} nodes and {self.n_edges} edges")

    #### Handle impossible cases
    if self.n_nodes == 0:
        response.error("QueryGraph has 0 nodes. At least 1 node is required", error_code="QueryGraphZeroNodes")
        return response
    if self.n_nodes == 1 and self.n_edges > 0:
        response.error("QueryGraph may not have edges if there is only one node", error_code="QueryGraphTooManyEdges")
        return response
    #if self.n_nodes == 2 and self.n_edges > 1:
    #    response.error("QueryGraph may not have more than 1 edge if there are only 2 nodes", error_code="QueryGraphTooManyEdges")
    #    return response

    #### Loop through nodes computing some stats
    node_info = {}
    self.node_category_map = {}
    for key, qnode in nodes.items():
        node_info[key] = {'key': key, 'node_object': qnode, 'has_id': False, 'category': qnode.category, 'has_category': False, 'is_set': False, 'n_edges': 0, 'n_links': 0, 'is_connected': False, 'edges': [], 'edge_dict': {}}
        if qnode.id is not None:
            node_info[key]['has_id'] = True

            #### If the user did not specify a category, but there is a curie, try to figure out the category
            if node_info[key]['category'] is None:
                synonymizer = NodeSynonymizer()
                curie = qnode.id
                curies_list = qnode.id
                # Only the first curie of a list is used for category inference
                if isinstance(qnode.id, list):
                    curie = qnode.id[0]
                else:
                    curies_list = [qnode.id]
                canonical_curies = synonymizer.get_canonical_curies(curies=curies_list, return_all_categories=True)
                if curie in canonical_curies and 'preferred_type' in canonical_curies[curie]:
                    node_info[key]['has_category'] = True
                    node_info[key]['category'] = canonical_curies[curie]['preferred_type']

        if qnode.category is not None:
            node_info[key]['has_category'] = True

        #if qnode.is_set is not None: node_info[key]['is_set'] = True

        if key is None:
            response.error("QueryGraph has a node with null key. This is not permitted", error_code="QueryGraphNodeWithNoId")
            return response

        #### Remap the node categorys from unsupported to supported
        if qnode.category is not None:
            qnode.category = self.remap_node_category(qnode.category)

        #### Store lookup of categorys
        # NOTE(review): warning_counter is reset each iteration, so it is always 0
        # here — the "warn only once" guard never suppresses anything.
        warning_counter = 0
        if qnode.category is None or (isinstance(qnode.category, list) and len(qnode.category) == 0):
            if warning_counter == 0:
                #response.debug("QueryGraph has nodes with no category. This may cause problems with results inference later")
                pass
            warning_counter += 1
            self.node_category_map['unknown'] = key
        else:
            category = qnode.category
            if isinstance(qnode.category, list):
                category = qnode.category[0]  # FIXME this is a hack prior to proper list handling
            self.node_category_map[category] = key

    #### Loop through edges computing some stats
    edge_info = {}
    self.edge_predicate_map = {}
    unique_links = {}

    #### Ignore special informationational edges for now.
    virtual_edge_predicates = {'has_normalized_google_distance_with': 1, 'has_fisher_exact_test_p-value_with': 1, 'has_jaccard_index_with': 1, 'probably_treats': 1, 'has_paired_concept_frequency_with': 1, 'has_observed_expected_ratio_with': 1, 'has_chi_square_with': 1}

    for key, qedge in edges.items():
        predicate = qedge.predicate
        if isinstance(predicate, list):
            if len(predicate) == 0:
                predicate = None
            else:
                predicate = predicate[0]  # FIXME Hack before dealing with predicates as lists!
        if predicate is not None and predicate in virtual_edge_predicates:
            continue
        edge_info[key] = {'key': key, 'has_predicate': False, 'subject': qedge.subject, 'object': qedge.object, 'predicate': None}
        if predicate is not None:
            edge_info[key]['has_predicate'] = True
            edge_info[key]['predicate'] = predicate
        if key is None:
            response.error("QueryGraph has a edge with null key. This is not permitted", error_code="QueryGraphEdgeWithNoKey")
            return response

        #### Create a unique node link string
        # n_links counts distinct neighbor pairs; n_edges counts every edge
        link_string = ','.join(sorted([qedge.subject, qedge.object]))
        if link_string not in unique_links:
            node_info[qedge.subject]['n_links'] += 1
            node_info[qedge.object]['n_links'] += 1
            unique_links[link_string] = 1
            #print(link_string)

        node_info[qedge.subject]['n_edges'] += 1
        node_info[qedge.object]['n_edges'] += 1
        node_info[qedge.subject]['is_connected'] = True
        node_info[qedge.object]['is_connected'] = True
        #node_info[qedge.subject]['edges'].append(edge_info[key])
        #node_info[qedge.object]['edges'].append(edge_info[key])
        node_info[qedge.subject]['edges'].append(edge_info[key])
        node_info[qedge.object]['edges'].append(edge_info[key])
        node_info[qedge.subject]['edge_dict'][key] = edge_info[key]
        node_info[qedge.object]['edge_dict'][key] = edge_info[key]

        #### Store lookup of predicates
        warning_counter = 0
        edge_predicate = 'any'
        if predicate is None:
            if warning_counter == 0:
                response.debug("QueryGraph has edges with no predicate. This may cause problems with results inference later")
            warning_counter += 1
        else:
            edge_predicate = predicate

        #### It's not clear yet whether we need to store the whole sentence or just the predicate
        #predicate_encoding = f"{node_info[qedge.subject]['predicate']}---{edge_predicate}---{node_info[qedge.object]['predicate']}"
        predicate_encoding = edge_predicate
        self.edge_predicate_map[predicate_encoding] = key

    #### Loop through the nodes again, trying to identify the start_node and the end_node
    singletons = []
    for node_id, node_data in node_info.items():
        if node_data['n_links'] < 2:
            singletons.append(node_data)
        elif node_data['n_links'] > 2:
            self.is_bifurcated_graph = True
            response.warning("QueryGraph appears to have a fork in it. This might cause trouble")

    #### If this doesn't produce any singletons, then try curie based selection
    if len(singletons) == 0:
        for node_id, node_data in node_info.items():
            if node_data['has_id']:
                singletons.append(node_data)

    #### If this doesn't produce any singletons, then we don't know how to continue
    if len(singletons) == 0:
        response.error("Unable to understand the query graph", error_code="QueryGraphCircular")
        return response

    #### Try to identify the start_node and the end_node
    # Prefer the singleton that has a curie (id) as the start of the walk
    start_node = singletons[0]
    if len(nodes) == 1:
        # Just a single node, fine
        pass
    elif len(singletons) < 2:
        response.warning("QueryGraph appears to be circular or has a strange geometry. This might cause trouble")
    elif len(singletons) > 2:
        response.warning("QueryGraph appears to have a fork in it. This might cause trouble")
    else:
        if singletons[0]['has_id'] is True and singletons[1]['has_id'] is False:
            start_node = singletons[0]
        elif singletons[0]['has_id'] is False and singletons[1]['has_id'] is True:
            start_node = singletons[1]
        else:
            start_node = singletons[0]
    #### Hmm, that's not very robust against odd graphs. This needs work. FIXME

    self.node_info = node_info
    self.edge_info = edge_info
    self.start_node = start_node

    #### Walk the graph from start_node, producing a linear node/edge ordering
    current_node = start_node
    node_order = [start_node]
    edge_order = []
    edges = current_node['edges']
    debug = False
    while 1:
        if debug:
            tmp = {'astate': '1', 'current_node': current_node, 'node_order': node_order, 'edge_order': edge_order, 'edges': edges}
            print(json.dumps(ast.literal_eval(repr(tmp)), sort_keys=True, indent=2))
            print('==================================================================================')
            tmp = input()

        if len(edges) == 0:
            break

        #if len(edges) > 1:
        if current_node['n_links'] > 1:
            response.error(f"Help, two edges at A583. Don't know what to do: {current_node['n_links']}", error_code="InteralErrorA583")
            return response

        edge_order.append(edges[0])
        previous_node = current_node
        # Follow the edge to whichever endpoint is not the current node
        if edges[0]['subject'] == current_node['key']:
            current_node = node_info[edges[0]['object']]
        elif edges[0]['object'] == current_node['key']:
            current_node = node_info[edges[0]['subject']]
        else:
            response.error("Help, edge error A584. Don't know what to do", error_code="InteralErrorA584")
            return response
        node_order.append(current_node)

        #tmp = { 'astate': '2', 'current_node': current_node, 'node_order': node_order, 'edge_order': edge_order, 'edges': edges }
        #print(json.dumps(ast.literal_eval(repr(tmp)),sort_keys=True,indent=2))
        #print('==================================================================================')
        #tmp = input()

        # Drop the edge we just traversed so we keep moving forward
        edges = current_node['edges']
        new_edges = []
        for edge in edges:
            key = edge['key']
            if key not in previous_node['edge_dict']:
                new_edges.append(edge)
        edges = new_edges
        if len(edges) == 0:
            break

        #tmp = { 'astate': '3', 'current_node': current_node, 'node_order': node_order, 'edge_order': edge_order, 'edges': edges }
        #print(json.dumps(ast.literal_eval(repr(tmp)),sort_keys=True,indent=2))
        #print('==================================================================================')
        #tmp = input()

    self.node_order = node_order
    self.edge_order = edge_order

    # Create a text rendering of the QueryGraph geometry for matching against a template
    self.query_graph_templates = {'simple': '', 'detailed': {'n_nodes': len(node_order), 'components': []}}
    node_index = 0
    edge_index = 0
    #print(json.dumps(ast.literal_eval(repr(node_order)),sort_keys=True,indent=2))
    for node in node_order:
        component_id = f"n{node_index:02}"
        content = ''
        component = {'component_type': 'node', 'component_id': component_id, 'has_id': node['has_id'], 'has_category': node['has_category'], 'category_value': None}
        self.query_graph_templates['detailed']['components'].append(component)
        if node['has_id']:
            content = 'id'
        elif node['has_category'] and node['node_object'].category is not None:
            content = f"category={node['node_object'].category}"
            component['category_value'] = node['node_object'].category
        elif node['has_category']:
            content = 'category'
        template_part = f"{component_id}({content})"
        self.query_graph_templates['simple'] += template_part

        # Since queries with intermediate nodes that are not is_set=true tend to blow up, for now, make them is_set=true unless explicitly set to false
        if node_index > 0 and node_index < (self.n_nodes - 1):
            # NOTE(review): node is a dict that always contains 'is_set' (set above),
            # so the "'is_set' not in node" arm looks unreachable — verify intent.
            if 'is_set' not in node or node['is_set'] is None:
                node['node_object'].is_set = True
                response.warning(f"Setting unspecified is_set to true for {node['key']} because this will probably lead to a happier result")
            elif node['is_set'] is True:
                response.debug(f"Value for is_set is already true for {node['key']} so that's good")
            elif node['is_set'] is False:
                #response.info(f"Value for is_set is set to false for intermediate node {node['key']}. This could lead to weird results. Consider setting it to true")
                response.info(f"Value for is_set is false for intermediate node {node['key']}. Setting to true because this will probably lead to a happier result")
                node['node_object'].is_set = True
            #else:
            #    response.error(f"Unrecognized value is_set='{node['is_set']}' for {node['key']}. This should be true or false")

        node_index += 1
        if node_index < self.n_nodes:
            #print(json.dumps(ast.literal_eval(repr(node)),sort_keys=True,indent=2))

            #### Extract the has_predicate and predicate_value from the edges of the node
            #### This could fail if there are two edges coming out of the node FIXME
            has_predicate = False
            predicate_value = None
            if 'edges' in node:
                for related_edge in node['edges']:
                    if related_edge['subject'] == node['key']:
                        has_predicate = related_edge['has_predicate']
                        if has_predicate is True and 'predicate' in related_edge:
                            predicate_value = related_edge['predicate']
            component_id = f"e{edge_index:02}"
            template_part = f"-{component_id}()-"
            self.query_graph_templates['simple'] += template_part
            component = {'component_type': 'edge', 'component_id': component_id, 'has_id': False, 'has_predicate': has_predicate, 'predicate_value': predicate_value}
            self.query_graph_templates['detailed']['components'].append(component)
            edge_index += 1

    response.debug(f"The QueryGraph reference template is: {self.query_graph_templates['simple']}")

    #tmp = { 'node_info': node_info, 'edge_info': edge_info, 'start_node': start_node, 'n_nodes': self.n_nodes, 'n_edges': self.n_edges,
    #        'is_bifurcated_graph': self.is_bifurcated_graph, 'node_order': node_order, 'edge_order': edge_order }
    #print(json.dumps(ast.literal_eval(repr(tmp)),sort_keys=True,indent=2))
    #sys.exit(0)

    #### Return the response
    return response
def add_qnode(self, response, input_parameters, describe=False):
    """
    Adds a new QNode object to the QueryGraph inside the Message object

    :param response: ARAXResponse to log to and whose envelope.message is modified in place
    :param input_parameters: dict of user-supplied parameters (key, id, name, category, is_set, option_group_id)
    :param describe: if True, return the autogenerated-documentation dict instead of executing
    :return: ARAXResponse object with execution information
    :rtype: ARAXResponse
    """

    # #### Command definition for autogenerated documentation
    command_definition = {
        'dsl_command': 'add_qnode()',
        'description': """The `add_qnode` method adds an additional QNode to the QueryGraph in the Message object.""",
        'parameters': {
            'key': {
                'is_required': False,
                'examples': ['n00', 'n01'],
                'default': '',
                'type': 'string',
                'description': """Any string that is unique among all QNode key fields, with recommended format n00, n01, n02, etc. If no value is provided, autoincrementing values beginning for n00 are used.""",
            },
            'id': {
                'is_required': False,
                'examples': ['DOID:9281', '[UniProtKB:P12345,UniProtKB:Q54321]'],
                'type': 'string',
                'description': 'Any compact URI (CURIE) (e.g. DOID:9281) (May also be a list like [UniProtKB:P12345,UniProtKB:Q54321])',
            },
            'name': {
                'is_required': False,
                'examples': ['hypertension', 'insulin'],
                'type': 'string',
                'description': 'Any name of a bioentity that will be resolved into a CURIE if possible or result in an error if not (e.g. hypertension, insulin)',
            },
            'category': {
                'is_required': False,
                'examples': ['protein', 'chemical_substance', 'disease'],
                'type': 'ARAXnode',
                'description': 'Any valid Translator bioentity category (e.g. protein, chemical_substance, disease)',
            },
            'is_set': {
                'is_required': False,
                'enum': ["true", "false", "True", "False", "t", "f", "T", "F"],
                'examples': ['true', 'false'],
                'type': 'boolean',
                'description': 'If set to true, this QNode represents a set of nodes that are all in common between the two other linked QNodes (assumed to be false if not specified or value is not recognized as true/t case insensitive)'
            },
            'option_group_id': {
                'is_required': False,
                'examples': ['1', 'a', 'b2', 'option'],
                'type': 'string',
                'description': 'A group identifier indicating a group of nodes and edges should either all be included or all excluded. An optional match for all elements in this group. If not included Node will be treated as required.'
            },
        }
    }

    # Documentation-only mode: return the command definition without touching the message
    if describe:
        return command_definition

    #### Extract the message to work on
    message = response.envelope.message

    #### Basic checks on arguments
    if not isinstance(input_parameters, dict):
        response.error("Provided parameters is not a dict", error_code="ParametersNotDict")
        return response

    #### Define a complete set of allowed parameters and their defaults
    parameters = {
        'key': None,
        'id': None,
        'name': None,
        'category': None,
        'is_set': None,
        'option_group_id': None,
    }

    #### Loop through the input_parameters and override the defaults and make sure they are allowed
    # NOTE: all unknown parameters are reported (not just the first) before bailing out below
    for key, value in input_parameters.items():
        if key not in parameters:
            response.error(f"Supplied parameter {key} is not permitted", error_code="UnknownParameter")
        else:
            parameters[key] = value

    #### Check for option_group_id and is_set:
    # When an option_group_id is given without id/name, is_set must be true; default it if unspecified
    if parameters['option_group_id'] is not None and parameters['id'] is None and parameters['name'] is None:
        if parameters['is_set'] is None:
            parameters['is_set'] = 'true'
            response.warning(f"An 'option_group_id' was set to {parameters['option_group_id']}, but 'is_set' was not an included parameter. It must be true when an 'option_group_id' is given, so automatically setting to true. Avoid this warning by explictly setting to true.")
        elif not (parameters['is_set'].lower() == 'true' or parameters['is_set'].lower() == 't'):
            response.error(f"When an 'option_group_id' is given 'is_set' must be set to true. However, supplied input for parameter 'is_set' was {parameters['is_set']}.", error_code="InputMismatch")

    #### Return if any of the parameters generated an error (showing not just the first one)
    if response.status != 'OK':
        return response

    #### Now apply the filters. Order of operations is probably quite important
    #### Scalar value filters probably come first like minimum_confidence, then complex logic filters
    #### based on edge or node properties, and then finally maximum_results
    response.info(f"Adding a QueryNode to Message with input parameters {parameters}")

    #### Make sure there's a query_graph already here
    if message.query_graph is None:
        message.query_graph = QueryGraph()
        message.query_graph.nodes = {}
        message.query_graph.edges = {}
    if message.query_graph.nodes is None:
        message.query_graph.nodes = {}

    #### Set up the NodeSynonymizer to find curies and names
    synonymizer = NodeSynonymizer()

    # Create the QNode and set the key
    qnode = QNode()
    if parameters['key'] is not None:
        key = parameters['key']
    else:
        key = self.__get_next_free_node_key()
    if parameters['option_group_id'] is not None:
        qnode.option_group_id = parameters['option_group_id']

    # Set the is_set parameter to what the user selected ('true'/'t', case-insensitive, means True)
    if parameters['is_set'] is not None:
        qnode.is_set = (parameters['is_set'].lower() == 'true' or parameters['is_set'].lower() == 't')

    #### If the id is specified, try to find that
    if parameters['id'] is not None:

        # If the id is a scalar then treat it here as a list of one
        if isinstance(parameters['id'], str):
            id_list = [parameters['id']]
            is_id_a_list = False
            # A scalar id combined with an explicit is_set=true is contradictory
            if parameters['is_set'] is not None and qnode.is_set is True:
                response.error(f"Specified id '{parameters['id']}' is a scalar, but is_set=true, which doesn't make sense", error_code="IdScalarButIsSetTrue")
                return response

        # Or else set it up as a list
        elif isinstance(parameters['id'], list):
            id_list = parameters['id']
            is_id_a_list = True
            qnode.id = []
            # A list of ids forces is_set=true (warn rather than error if user said otherwise)
            if parameters['is_set'] is None:
                response.warning(f"Specified id '{parameters['id']}' is a list, but is_set was not set to true. It must be true in this context, so automatically setting to true. Avoid this warning by explictly setting to true.")
                qnode.is_set = True
            else:
                if qnode.is_set == False:
                    response.warning(f"Specified id '{parameters['id']}' is a list, but is_set=false, which doesn't make sense, so automatically setting to true. Avoid this warning by explictly setting to true.")
                    qnode.is_set = True

        # Or if it's neither a list or a string, then error out. This cannot be handled at present
        else:
            response.error(f"Specified id '{parameters['id']}' is neither a string nor a list. This cannot to handled", error_code="IdNotListOrScalar")
            return response

        # Loop over the available ids and create the list
        for id in id_list:
            response.debug(f"Looking up id {id} in NodeSynonymizer")
            synonymizer_results = synonymizer.get_canonical_curies(curies=[id])

            # If nothing was found, we won't bail out, but rather just issue a warning that this id is suspect
            if synonymizer_results[id] is None:
                response.warning(f"A node with id {id} is not in our knowledge graph KG2, but will continue with it")
                if is_id_a_list:
                    qnode.id.append(id)
                else:
                    qnode.id = id

            # And if it is found, keep the same id but report the preferred id
            else:
                response.info(f"id {id} is found. Adding it to the qnode")
                if is_id_a_list:
                    qnode.id.append(id)
                else:
                    qnode.id = id

        # An optional category constraint may accompany the id(s); only the first of a list is used
        if 'category' in parameters and parameters['category'] is not None:
            if isinstance(parameters['category'], str):
                qnode.category = parameters['category']
            else:
                qnode.category = parameters['category'][0]
        message.query_graph.nodes[key] = qnode
        return response

    #### If the name is specified, try to find that
    if parameters['name'] is not None:
        name = parameters['name']
        response.debug(f"Looking up id for name '{name}' in NodeSynonymizer")
        synonymizer_results = synonymizer.get_canonical_curies(curies=[name], names=[name])

        # Unlike the id branch above, an unresolvable name is a hard error
        if synonymizer_results[name] is None:
            response.error(f"A node with name '{name}' is not in our knowledge graph", error_code="UnresolvableNodeName")
            return response

        qnode.id = synonymizer_results[name]['preferred_curie']
        response.info(f"Creating QueryNode with id '{qnode.id}' for name '{name}'")
        if parameters['category'] is not None:
            qnode.category = parameters['category']
        message.query_graph.nodes[key] = qnode
        return response

    #### If the category is specified, just add that category. There should be checking that it is legal. FIXME
    if parameters['category'] is not None:
        qnode.category = parameters['category']
        if parameters['is_set'] is not None:
            # NOTE(review): this branch only accepts 'true' (not 't'), unlike the earlier is_set parsing — confirm intended
            qnode.is_set = (parameters['is_set'].lower() == 'true')
        message.query_graph.nodes[key] = qnode
        return response

    #### If we get here, it means that all three main parameters are null. Just a generic node with no category or anything. This is okay.
    message.query_graph.nodes[key] = qnode
    return response
else: message = response.envelope.message target_curie_list += [ node_key for node_key, _ in message.knowledge_graph.nodes.items() ] if database == 'DTD': if len(check_wrong_queries) != 0: print( f'Something wrong occurred in these DSL queries {check_wrong_queries}' ) exit() else: target_curie_list = list(set(target_curie_list)) target_curie_list = [ synonymizer.get_canonical_curies(curie)[curie]['preferred_curie'] for curie in target_curie_list if synonymizer.get_canonical_curies(curie)[curie] is not None ] # print(target_curie_list) if os.path.isfile(DTD_prob_db_file): ## pull all data from `DTD_probability_database.db` database con = sqlite3.connect(DTD_prob_db_file) table = pd.read_sql_query("SELECT * from DTD_PROBABILITY", con) con.close() drug_list = [ synonymizer.get_canonical_curies( curie)[curie]['preferred_curie'] for curie in target_curie_list if synonymizer.get_canonical_curies(curie)[curie] is not None and synonymizer.get_canonical_curies(
def _canonicalize_nodes(neo4j_nodes: List[Dict[str, any]]) -> Tuple[Dict[str, Dict[str, any]], Dict[str, str]]:
    """
    Collapse a list of neo4j node dicts into their canonical (synonym-merged) form.

    Nodes that the NodeSynonymizer maps to the same preferred curie are merged into a single
    canonical node (publications, names, and descriptions combined via _merge_two_lists).

    :param neo4j_nodes: node dicts as pulled from neo4j (each must have at least an 'id')
    :return: (canonical curie -> canonicalized node dict, original curie -> canonical curie map)
    """
    synonymizer = NodeSynonymizer()
    node_ids = [node.get('id') for node in neo4j_nodes if node.get('id')]
    print(f" Sending NodeSynonymizer.get_canonical_curies() {len(node_ids)} curies..")
    canonicalized_info = synonymizer.get_canonical_curies(curies=node_ids, return_all_categories=True)
    all_canonical_curies = {canonical_info['preferred_curie'] for canonical_info in canonicalized_info.values() if canonical_info}
    print(f" Sending NodeSynonymizer.get_equivalent_nodes() {len(all_canonical_curies)} curies..")
    equivalent_curies_info = synonymizer.get_equivalent_nodes(all_canonical_curies)
    # Keep only curies the synonymizer actually returned equivalents for (truthy values)
    recognized_curies = {curie for curie in equivalent_curies_info if equivalent_curies_info.get(curie)}
    equivalent_curies_dict = {curie: list(equivalent_curies_info.get(curie)) for curie in recognized_curies}
    print(f" Creating canonicalized nodes..")
    curie_map = dict()
    canonicalized_nodes = dict()
    for neo4j_node in neo4j_nodes:
        # Grab relevant info for this node and its canonical version; fall back to the node's
        # own id/fields whenever the synonymizer had no info for it
        canonical_info = canonicalized_info.get(neo4j_node['id'])
        canonicalized_curie = canonical_info.get('preferred_curie', neo4j_node['id']) if canonical_info else neo4j_node['id']
        publications = neo4j_node['publications'] if neo4j_node.get('publications') else []
        descriptions_list = [neo4j_node['description']] if neo4j_node.get('description') else []
        if canonicalized_curie in canonicalized_nodes:
            # Merge this node into its corresponding canonical node
            existing_canonical_node = canonicalized_nodes[canonicalized_curie]
            existing_canonical_node['publications'] = _merge_two_lists(existing_canonical_node['publications'], publications)
            existing_canonical_node['all_names'] = _merge_two_lists(existing_canonical_node['all_names'], [neo4j_node['name']])
            existing_canonical_node['descriptions_list'] = _merge_two_lists(existing_canonical_node['descriptions_list'], descriptions_list)
            # Make sure any nodes subject to #1074-like problems still appear in equivalent curies
            existing_canonical_node['equivalent_curies'] = _merge_two_lists(existing_canonical_node['equivalent_curies'], [neo4j_node['id']])
            # Add the IRI and description for the 'preferred' curie, if we've found that node
            if neo4j_node['id'] == canonicalized_curie:
                existing_canonical_node['iri'] = neo4j_node.get('iri')
                existing_canonical_node['description'] = neo4j_node.get('description')
        else:
            # Initiate the canonical node for this synonym group
            name = canonical_info['preferred_name'] if canonical_info else neo4j_node['name']
            category = canonical_info['preferred_category'] if canonical_info else neo4j_node['category']
            if not category.startswith("biolink:"):
                print(f" WARNING: Preferred category for {canonicalized_curie} doesn't start with 'biolink:': {category}")
            all_categories = list(canonical_info['all_categories']) if canonical_info else [neo4j_node['category']]
            expanded_categories = list(canonical_info['expanded_categories']) if canonical_info else [neo4j_node['category']]
            # iri/description are only trusted when this node IS the preferred node of its group
            iri = neo4j_node['iri'] if neo4j_node['id'] == canonicalized_curie else None
            description = neo4j_node.get('description') if neo4j_node['id'] == canonicalized_curie else None
            all_names = [neo4j_node['name']]
            # Check for bug where not all categories in synonymizer were of "biolink:PascalCase" format
            if not all(category.startswith("biolink:") for category in all_categories):
                print(f" WARNING: all_categories for {canonicalized_curie} contain non 'biolink:PascalCase' "
                      f"items: {all_categories}")
            if not all(category.startswith("biolink:") for category in expanded_categories):
                print(f" WARNING: expanded_categories for {canonicalized_curie} contain non 'biolink:PascalCase' "
                      f"items: {expanded_categories}")
            canonicalized_node = _create_node(preferred_curie=canonicalized_curie,
                                              name=name,
                                              category=category,
                                              all_categories=all_categories,
                                              expanded_categories=expanded_categories,
                                              publications=publications,
                                              equivalent_curies=equivalent_curies_dict.get(canonicalized_curie, [canonicalized_curie]),
                                              iri=iri,
                                              description=description,
                                              descriptions_list=descriptions_list,
                                              all_names=all_names)
            canonicalized_nodes[canonicalized_node['id']] = canonicalized_node
        curie_map[neo4j_node['id']] = canonicalized_curie  # Record this mapping for easy lookup later
    return canonicalized_nodes, curie_map
def report_on_curies_missed_by_local_ngd(kg: str):
    """
    Sample random nodes from the KG, report how many the local NGD pmid database misses,
    and check whether the eUtils-based back-up method can find PMIDs for those misses.

    Findings are printed to stdout and accumulated in 'misses_found_by_eutils.json'.

    :param kg: name of the knowledge graph to run the sampling cypher query against
    """
    backup_ngd = NormGoogleDistance()
    synonymizer = NodeSynonymizer()
    curie_to_pmid_db = SqliteDict("./curie_to_pmids.sqlite")
    batch_size = 50
    # Get random selection of nodes from the KG
    query = f"match (a) return a.id, a.name, rand() as r order by r limit {batch_size}"
    results = _run_cypher_query(query, kg)
    canonical_curie_info = synonymizer.get_canonical_curies([result['a.id'] for result in results])
    # Only curies the synonymizer actually recognized (truthy canonical info)
    recognized_curies = {input_curie for input_curie in canonical_curie_info
                         if canonical_curie_info.get(input_curie)}

    # Figure out which of these local ngd misses
    misses = set()
    for curie in recognized_curies:
        canonical_curie = canonical_curie_info[curie].get('preferred_curie')
        if canonical_curie not in curie_to_pmid_db:
            misses.add(curie)
    # Guard: avoid ZeroDivisionError when the synonymizer recognized nothing
    percent_missed = round((len(misses) / len(recognized_curies)) * 100) if recognized_curies else 0
    print(f"Local ngd missed {len(misses)} of {len(recognized_curies)} curies ({percent_missed}%)")

    # Try eUtils for each of the curies local ngd missed; accumulate into any prior results file
    num_eutils_found = 0
    try:
        with open('misses_found_by_eutils.json', 'r') as file_to_add_to:
            found_dict = json.load(file_to_add_to)
    except Exception:
        # Best-effort: missing/corrupt results file just means we start fresh
        found_dict = dict()
    for missed_curie in misses:
        # Try eUtils for this node
        node_id = canonical_curie_info[missed_curie].get('preferred_curie')
        node_name = canonical_curie_info[missed_curie].get('preferred_name')
        node_type = canonical_curie_info[missed_curie].get('preferred_type')
        try:
            pmids = backup_ngd.get_pmids_for_all([node_id], [node_name])
        except Exception:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            print(f"ERROR using back-up method: {tb}")
        else:
            # Count a hit only if at least one non-empty pmid list came back
            if len(pmids) and ([pmid_list for pmid_list in pmids if pmid_list]):
                num_eutils_found += 1
                print(f" Found {len(pmids[0])} PMIDs for {node_id}, {node_name}.")
                found_dict[node_id] = {'name': node_name, 'type': node_type}
            else:
                print(f" Not found. ({node_id}, {node_name})")

    # Report some findings
    # Guard: avoid ZeroDivisionError when local ngd missed nothing
    percent_found_by_eutils = round((num_eutils_found / len(misses)) * 100) if misses else 0
    print(f"Eutils found {num_eutils_found} out of {len(misses)} curies that local ngd missed ({percent_found_by_eutils}%)")
    found_types = [node_info['type'] for node_id, node_info in found_dict.items()]
    counter = collections.Counter(found_types)
    print(counter)

    # Save the data to a JSON file for access later
    with open('misses_found_by_eutils.json', 'w+') as output_file:
        json.dump(found_dict, output_file)
class PredictDrugTreatsDisease:
    # Overlays drug->disease "probability_treats" predictions (from a pre-trained ML model or
    # a precomputed probability database) onto edges of the knowledge graph.

    #### Constructor
    def __init__(self, response, message, parameters):
        # response: ARAXResponse used for logging/errors
        # message: the Message whose knowledge_graph/query_graph will be decorated
        # parameters: DSL parameters (may include virtual_relation_label, subject/object qnode keys)
        self.response = response
        self.message = message
        self.parameters = parameters
        self.global_iter = 0  # counter used to generate unique virtual edge ids
        ## check if the new model files exists in /predictor/retrain_data. If not, scp it from arax.ncats.io
        pathlist = os.path.realpath(__file__).split(os.path.sep)
        RTXindex = pathlist.index("RTX")
        filepath = os.path.sep.join([*pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'ARAXQuery', 'Overlay', 'predictor','retrain_data'])

        ## check if there is LogModel.pkl
        pkl_file = f"{filepath}/LogModel.pkl"
        if os.path.exists(pkl_file):
            pass
        else:
            # NOTE: fetches the model file over scp on first use
            os.system("scp [email protected]:/data/orangeboard/databases/KG2.3.4/LogModel.pkl " + pkl_file)

        ## check if there is GRAPH.sqlite
        db_file = f"{filepath}/GRAPH.sqlite"
        if os.path.exists(db_file):
            pass
        else:
            os.system("scp [email protected]:/data/orangeboard/databases/KG2.3.4/GRAPH.sqlite " + db_file)

        ## check if there is DTD_probability_database.db
        DTD_prob_db_file = f"{filepath}/DTD_probability_database_v1.0.db"
        if os.path.exists(DTD_prob_db_file):
            pass
        else:
            os.system("scp [email protected]:/data/orangeboard/databases/KG2.3.4/DTD_probability_database_v1.0.db " + DTD_prob_db_file)

        # use NodeSynonymizer to replace map.txt (legacy map-file loading removed)

        # When True, probabilities are looked up in the precomputed DTD database;
        # when False, they are computed on demand with the pickled model + graph db
        self.use_prob_db = True
        if self.use_prob_db is True:
            try:
                self.pred = predictor(DTD_prob_file=DTD_prob_db_file, use_prob_db=True)
            except:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Internal Error encountered connecting to the local DTD prediction database.")
        else:
            try:
                self.pred = predictor(model_file=pkl_file, use_prob_db=False)
            except:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Internal Error encountered connecting to the local LogModel.pkl file.")
            try:
                self.pred.import_file(None, graph_database=db_file)
            except:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Internal Error encountered connecting to the local graph database file.")

        self.synonymizer = NodeSynonymizer()

    def convert_to_trained_curies(self, input_curie):
        """
        Takes an input curie from the KG, uses the synonymizer, and then returns something that the map.csv can handle
        """
        normalizer_result = self.synonymizer.get_canonical_curies(input_curie)
        # May be None when the synonymizer does not recognize the curie; callers must check
        curies_in_model = normalizer_result[input_curie]
        # FIXME: when the model was originally trained, chemicals were ChEMBL:123,
        # not CHEMBL.COMPOUND:CHEMBL123 — revisit on re-training (legacy mapping code removed)
        return curies_in_model

    def predict_drug_treats_disease(self):
        """
        Iterate over all the edges in the knowledge graph, add the drug-disease treatment probability for appropriate edges on the edge_attributes
        :return: response
        """
        parameters = self.parameters
        self.response.debug(f"Computing drug disease treatment probability based on a machine learning model")
        self.response.info(f"Computing drug disease treatment probability based on a machine learning model: See [this publication](https://doi.org/10.1101/765305) for more details about how this is accomplished.")

        attribute_name = "probability_treats"
        attribute_type = "EDAM:data_0951"
        value = 0  # this will be the default value. If the model returns 0, or the default is there, don't include that edge
        url = "https://doi.org/10.1101/765305"

        # if you want to add virtual edges, identify the source/targets, decorate the edges, add them to the KG, and then add one to the QG corresponding to them
        if 'virtual_relation_label' in parameters:
            source_curies_to_decorate = set()
            target_curies_to_decorate = set()
            curie_to_name = dict()
            # identify the nodes that we should be adding virtual edges for
            for node_key, node in self.message.knowledge_graph.nodes.items():
                if hasattr(node, 'qnode_keys'):
                    if parameters['subject_qnode_key'] in node.qnode_keys:
                        if "drug" in node.category or "chemical_substance" in node.category or "biolink:Drug" in node.category or "biolink:ChemicalSubstance" in node.category:  # this is now NOT checked by ARAX_overlay
                            source_curies_to_decorate.add(node_key)
                            curie_to_name[node_key] = node.name
                    if parameters['object_qnode_key'] in node.qnode_keys:
                        if "disease" in node.category or "phenotypic_feature" in node.category or "biolink:Disease" in node.category or "biolink:PhenotypicFeature" in node.category:  # this is now NOT checked by ARAX_overlay
                            target_curies_to_decorate.add(node_key)
                            curie_to_name[node_key] = node.name
            added_flag = False  # check to see if any edges where added
            # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
            for (source_curie, target_curie) in itertools.product(source_curies_to_decorate, target_curies_to_decorate):
                self.response.debug(f"Predicting probability that {curie_to_name[source_curie]} treats {curie_to_name[target_curie]}")
                # create the edge attribute if it can be
                # loop over all equivalent curies and take the highest probability
                max_probability = 0
                # Skip this pair unless both endpoints resolve to model-known curies of the right type
                converted_source_curie = self.convert_to_trained_curies(source_curie)
                if converted_source_curie is None:
                    continue
                else:
                    preferred_type = converted_source_curie['preferred_type']
                    if preferred_type == "drug" or preferred_type == "chemical_substance" or preferred_type == "biolink:Drug" or preferred_type == "biolink:ChemicalSubstance":
                        converted_source_curie = converted_source_curie['preferred_curie']
                    else:
                        continue
                converted_target_curie = self.convert_to_trained_curies(target_curie)
                if converted_target_curie is None:
                    continue
                else:
                    preferred_type = converted_target_curie['preferred_type']
                    if preferred_type == "disease" or preferred_type == "phenotypic_feature" or preferred_type == "biolink:Disease" or preferred_type == "biolink:PhenotypicFeature":
                        converted_target_curie = converted_target_curie['preferred_curie']
                    else:
                        continue
                if self.use_prob_db is True:
                    probability = self.pred.get_prob_from_DTD_db(converted_source_curie, converted_target_curie)
                    if probability is not None:
                        if np.isfinite(probability):
                            max_probability = probability
                else:
                    probability = self.pred.prob_single(converted_source_curie, converted_target_curie)
                    if probability is not None:
                        probability = probability[0]
                        if np.isfinite(probability):
                            max_probability = probability
                value = max_probability

                edge_attribute = EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)  # populate the edge attribute
                # Only materialize a virtual edge for non-zero probabilities
                if edge_attribute and value != 0:
                    added_flag = True
                    # make the edge, add the attribute

                    # edge properties
                    now = datetime.now()
                    edge_type = "biolink:probably_treats"
                    qedge_keys = [parameters['virtual_relation_label']]
                    relation = parameters['virtual_relation_label']
                    is_defined_by = "ARAX"
                    defined_datetime = now.strftime("%Y-%m-%d %H:%M:%S")
                    provided_by = "ARAX"
                    confidence = None
                    weight = None  # TODO: could make the actual value of the attribute
                    subject_key = source_curie
                    object_key = target_curie

                    # now actually add the virtual edges in
                    id = f"{relation}_{self.global_iter}"
                    self.global_iter += 1
                    edge_attribute_list = [
                        edge_attribute,
                        EdgeAttribute(name="is_defined_by", value=is_defined_by, type="ARAX_TYPE_PLACEHOLDER"),
                        EdgeAttribute(name="defined_datetime", value=defined_datetime, type="metatype:Datetime"),
                        EdgeAttribute(name="provided_by", value=provided_by, type="biolink:provided_by"),
                    ]
                    edge = Edge(predicate=edge_type, subject=subject_key, object=object_key, relation=relation, attributes=edge_attribute_list)
                    edge.qedge_keys = qedge_keys
                    self.message.knowledge_graph.edges[id] = edge

            # Now add a q_edge the query_graph since I've added an extra edge to the KG
            if added_flag:
                edge_type = "biolink:probably_treats"
                relation = parameters['virtual_relation_label']
                subject_qnode_key = parameters['subject_qnode_key']
                object_qnode_key = parameters['object_qnode_key']
                option_group_id = ou.determine_virtual_qedge_option_group(subject_qnode_key, object_qnode_key, self.message.query_graph, self.response)
                q_edge = QEdge(predicate=edge_type, relation=relation, subject=subject_qnode_key, object=object_qnode_key, option_group_id=option_group_id)
                self.message.query_graph.edges[relation] = q_edge
            return self.response

        else:  # you want to add it for each edge in the KG
            # iterate over KG edges, add the information
            try:
                # map curies to types
                curie_to_type = dict()
                curie_to_name = dict()
                for node_key, node in self.message.knowledge_graph.nodes.items():
                    curie_to_type[node_key] = node.category
                    curie_to_name[node_key] = node.name
                # then iterate over the edges and decorate if appropriate
                for edge_key, edge in self.message.knowledge_graph.edges.items():
                    # Make sure the edge_attributes are not None
                    if not edge.attributes:
                        edge.attributes = []  # should be an array, but why not a list?
                    # now go and actually get the probability
                    source_curie = edge.subject
                    target_curie = edge.object
                    source_types = curie_to_type[source_curie]
                    target_types = curie_to_type[target_curie]
                    # Case 1: edge runs drug/chemical -> disease/phenotype
                    if (("drug" in source_types) or ("chemical_substance" in source_types) or ("biolink:Drug" in source_types) or ("biolink:ChemicalSubstance" in source_types)) and (("disease" in target_types) or ("phenotypic_feature" in target_types) or ("biolink:Disease" in target_types) or ("biolink:PhenotypicFeature" in target_types)):
                        # loop over all pairs of equivalent curies and take the highest probability
                        self.response.debug(f"Predicting treatment probability between {curie_to_name[source_curie]} and {curie_to_name[target_curie]}")
                        max_probability = 0
                        converted_source_curie = self.convert_to_trained_curies(source_curie)
                        if converted_source_curie is None:
                            continue
                        else:
                            preferred_type = converted_source_curie['preferred_type']
                            if preferred_type == "drug" or preferred_type == "chemical_substance" or preferred_type == "biolink:Drug" or preferred_type == "biolink:ChemicalSubstance":
                                converted_source_curie = converted_source_curie['preferred_curie']
                            else:
                                continue
                        converted_target_curie = self.convert_to_trained_curies(target_curie)
                        if converted_target_curie is None:
                            continue
                        else:
                            preferred_type = converted_target_curie['preferred_type']
                            if preferred_type == "disease" or preferred_type == "phenotypic_feature" or preferred_type == "biolink:Disease" or preferred_type == "biolink:PhenotypicFeature":
                                converted_target_curie = converted_target_curie['preferred_curie']
                            else:
                                continue
                        if self.use_prob_db is True:
                            probability = self.pred.get_prob_from_DTD_db(converted_source_curie, converted_target_curie)
                            if probability is not None:
                                if np.isfinite(probability):
                                    max_probability = probability
                        else:
                            probability = self.pred.prob_single(converted_source_curie, converted_target_curie)
                            if probability is not None:
                                probability = probability[0]
                                if np.isfinite(probability):
                                    max_probability = probability
                        value = max_probability
                    # Case 2: edge runs disease/phenotype -> drug/chemical; predict in the drug->disease direction
                    elif (("drug" in target_types) or ("chemical_substance" in target_types) or ("biolink:Drug" in target_types) or ("biolink:ChemicalSubstance" in target_types)) and (("disease" in source_types) or ("phenotypic_feature" in source_types) or ("biolink:Disease" in source_types) or ("biolink:PhenotypicFeature" in source_types)):
                        self.response.debug(f"Predicting treatment probability between {curie_to_name[source_curie]} and {curie_to_name[target_curie]}")
                        max_probability = 0
                        converted_source_curie = self.convert_to_trained_curies(source_curie)
                        if converted_source_curie is None:
                            continue
                        else:
                            preferred_type = converted_source_curie['preferred_type']
                            if preferred_type == "disease" or preferred_type == "phenotypic_feature" or preferred_type == "biolink:Disease" or preferred_type == "biolink:PhenotypicFeature":
                                converted_source_curie = converted_source_curie['preferred_curie']
                            else:
                                continue
                        converted_target_curie = self.convert_to_trained_curies(target_curie)
                        if converted_target_curie is None:
                            continue
                        else:
                            preferred_type = converted_target_curie['preferred_type']
                            if preferred_type == "drug" or preferred_type == "chemical_substance" or preferred_type == "biolink:Drug" or preferred_type == "biolink:ChemicalSubstance":
                                converted_target_curie = converted_target_curie['preferred_curie']
                            else:
                                continue
                        if self.use_prob_db is True:
                            # note the swapped argument order: the model expects (drug, disease)
                            probability = self.pred.get_prob_from_DTD_db(converted_target_curie, converted_source_curie)
                            if probability is not None:
                                if np.isfinite(probability):
                                    max_probability = probability
                        else:
                            probability = self.pred.prob_single(converted_target_curie, converted_source_curie)
                            if probability is not None:
                                probability = probability[0]
                                if np.isfinite(probability):
                                    max_probability = probability
                        value = max_probability
                    else:
                        continue
                    if value != 0:
                        edge_attribute = EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)  # populate the attribute
                        edge.attributes.append(edge_attribute)  # append it to the list of attributes
            except:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Something went wrong adding the drug disease treatment probability")
            else:
                self.response.info(f"Drug disease treatment probability successfully added to edges")
            return self.response
def _canonicalize_nodes(nodes: List[Dict[str, any]]) -> Tuple[List[Dict[str, any]], Dict[str, str]]:
    """Collapse a list of KG2 nodes onto their canonical curies.

    Queries the NodeSynonymizer for the canonical (preferred) curie of every input
    node, merges nodes that share a canonical curie, appends a KG2C build-info node,
    decorates each canonical node with its equivalent curies, and converts list
    fields into the format expected by neo4j.

    Parameters:
        nodes: Raw KG2 node dicts; each is expected to carry at least 'id', 'name',
               'category_label', and 'publications' keys (assumed from usage below —
               TODO confirm against the KG2 TSV/JSON schema).

    Returns:
        A tuple of (list of canonicalized node dicts,
                    map of original node id -> canonical curie).
    """
    synonymizer = NodeSynonymizer()
    # Nodes lacking an 'id' are silently excluded from the synonymizer query.
    node_ids = [node.get('id') for node in nodes if node.get('id')]
    print(f"  Sending NodeSynonymizer.get_canonical_curies() a list of {len(node_ids)} curies..")
    canonicalized_info = synonymizer.get_canonical_curies(curies=node_ids, return_all_types=True)
    print(f"  Creating canonicalized nodes..")
    curie_map = dict()
    canonicalized_nodes = dict()
    for node in nodes:
        # canonical_info is None/falsy when the synonymizer didn't recognize this curie;
        # in that case the node keeps its original id as its "canonical" curie.
        canonical_info = canonicalized_info.get(node['id'])
        canonicalized_curie = canonical_info.get('preferred_curie', node['id']) if canonical_info else node['id']
        node['publications'] = _literal_eval_list(node['publications'])  # Only need to do this until kg2.2+ is rolled out
        if canonicalized_curie in canonicalized_nodes:
            # Another input node already mapped to this canonical curie: merge publications only;
            # the first-seen node's name/types win.
            existing_canonical_node = canonicalized_nodes[canonicalized_curie]
            existing_canonical_node['publications'] = _merge_two_lists(existing_canonical_node['publications'],
                                                                       node['publications'])
        else:
            if canonical_info:
                canonicalized_node = {
                    'id': canonicalized_curie,
                    'name': canonical_info.get('preferred_name', node['name']),
                    # NOTE(review): list(...) will raise TypeError if 'all_types' is absent/None —
                    # presumably get_canonical_curies(return_all_types=True) always supplies it; verify.
                    'types': list(canonical_info.get('all_types')),
                    'preferred_type': canonical_info.get('preferred_type', node['category_label']),
                    'publications': node['publications']
                }
            else:
                # Unrecognized curie: fall back entirely to the original node's info.
                canonicalized_node = {
                    'id': canonicalized_curie,
                    'name': node['name'],
                    'types': [node['category_label']],
                    'preferred_type': node['category_label'],
                    'publications': node['publications']
                }
            canonicalized_nodes[canonicalized_node['id']] = canonicalized_node
        curie_map[node['id']] = canonicalized_curie  # Record this mapping for easy lookup later
    # Create a node containing information about this KG2C build
    new_build_node = {'id': 'RTX:KG2C',
                      'name': f"KG2C:Build created on {datetime.now().strftime('%Y-%m-%d %H:%M')}",
                      'types': ['data_file'],
                      'preferred_type': 'data_file',
                      'publications': []}
    canonicalized_nodes[new_build_node['id']] = new_build_node
    # Decorate nodes with equivalent curies
    print(f"  Sending NodeSynonymizer.get_equivalent_nodes() a list of {len(canonicalized_nodes)} curies..")
    equivalent_curies_dict = synonymizer.get_equivalent_nodes(list(canonicalized_nodes.keys()))
    for curie, canonical_node in canonicalized_nodes.items():
        # Nodes the synonymizer doesn't know (including the build node) get an empty list.
        equivalent_curies = []
        equivalent_curies_dict_for_curie = equivalent_curies_dict.get(curie)
        if equivalent_curies_dict_for_curie is not None:
            for equivalent_curie in equivalent_curies_dict_for_curie:
                equivalent_curies.append(equivalent_curie)
        canonical_node['equivalent_curies'] = equivalent_curies
    # Convert array fields into the format neo4j wants and do final processing
    for canonicalized_node in canonicalized_nodes.values():
        canonicalized_node['types'] = _convert_list_to_neo4j_format(canonicalized_node['types'])
        canonicalized_node['publications'] = _convert_list_to_neo4j_format(canonicalized_node['publications'])
        canonicalized_node['equivalent_curies'] = _convert_list_to_neo4j_format(canonicalized_node['equivalent_curies'])
        canonicalized_node['preferred_type_for_conversion'] = canonicalized_node['preferred_type']
    return list(canonicalized_nodes.values()), curie_map
# Build a preferred-curie -> synonyms mapping for curies of the requested type(s),
# read from a node-description TSV. ('parser' is defined earlier in this file.)
args = parser.parse_args()
# NOTE(review): eval() on a raw CLI argument executes arbitrary code — presumably
# args.CurieType is a trusted literal like "['drug']"; ast.literal_eval would be safer. Verify.
curie_type = eval(args.CurieType)
# The description file is headerless TSV: curie, name, full name, type.
NodeNamesDescriptions = pd.read_csv(args.NodeDescriptionFile,
                                    sep='\t',
                                    header=None,
                                    names=['curie', 'name', 'full_name', 'type'])
# Keep only rows whose type is in the requested set.
NodeNamesDescriptions = NodeNamesDescriptions.loc[
    NodeNamesDescriptions.type.isin(curie_type), :].reset_index(drop=True)
preferred_synonyms = dict()
synonymizer = NodeSynonymizer()
for curie in NodeNamesDescriptions['curie']:
    # One synonymizer call per curie; the returned dict is keyed by the input curie.
    preferred_curie = synonymizer.get_canonical_curies(curies=curie)[curie]
    if preferred_curie is None:
        print(f"{curie} doesn't have preferred curies", flush=True)
    else:
        if preferred_curie['preferred_curie'] not in preferred_synonyms:
            # First time we see this canonical curie: record its name/type and start its synonym list.
            preferred_synonyms[preferred_curie['preferred_curie']] = dict()
            preferred_synonyms[preferred_curie['preferred_curie']]['preferred_name'] = preferred_curie['preferred_name']
            preferred_synonyms[preferred_curie['preferred_curie']]['preferred_type'] = preferred_curie['preferred_category']
            preferred_synonyms[preferred_curie['preferred_curie']]['synonyms'] = [curie]
        else:
            synonyms = set(preferred_synonyms[preferred_curie['preferred_curie']]['synonyms'])
            # NOTE(review): 'synonyms' is updated here but never written back to
            # preferred_synonyms within this visible span — the assignment presumably
            # follows in code beyond this chunk; confirm it isn't lost.
            synonyms.update(set([curie]))
class NGDDatabaseBuilder:
    """Builds the artifacts needed for Normalized Google Distance (NGD) scoring.

    The build has two halves:
      1. build_conceptname_to_pmids_db: scrape PubMed XML files into a pickleDB
         mapping concept names -> PMIDs.
      2. build_curie_to_pmids_db: combine the PubMed scrapings with KG2 neo4j
         publication data into a sqlite DB mapping canonical curies -> PMIDs.

    self.status is set to 'ERROR' (instead of raising) when a step fails.
    """

    def __init__(self, is_test):
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s %(levelname)s: %(message)s',
                            handlers=[logging.FileHandler("ngdbuild.log"),
                                      logging.StreamHandler()])
        self.pubmed_directory_path = f"{NGD_DIR}/pubmed_xml_files"
        self.conceptname_to_pmids_db_name = "conceptname_to_pmids.db"
        self.conceptname_to_pmids_db_path = f"{NGD_DIR}/{self.conceptname_to_pmids_db_name}"
        self.curie_to_pmids_db_name = "curie_to_pmids.sqlite"
        self.curie_to_pmids_db_path = f"{NGD_DIR}/{self.curie_to_pmids_db_name}"
        self.status = 'OK'
        self.synonymizer = NodeSynonymizer()
        self.is_test = is_test  # When True, only a tiny slice of the data is processed

    def build_ngd_database(self, do_full_build: bool):
        """Run the full (scrape + assemble) or partial (assemble only) build."""
        if do_full_build:
            self.build_conceptname_to_pmids_db()
        else:
            # A partial build reuses an existing pickleDB from a previous full build
            conceptname_to_pmids_db = pathlib.Path(self.conceptname_to_pmids_db_path)
            if not conceptname_to_pmids_db.exists():
                # FIX: the original adjacent f-strings read "...into the right" "place..."
                # which rendered as "rightplace"; a space has been added.
                logging.error(f"You did not specify to do a full build, but the artifact necessary for a partial "
                              f"build ({self.conceptname_to_pmids_db_name}) does not yet exist. Either use --full "
                              f"to do a full build or put your {self.conceptname_to_pmids_db_name} into the right "
                              f"place ({self.conceptname_to_pmids_db_path}).")
                self.status = "ERROR"
        if self.status == 'OK':
            self.build_curie_to_pmids_db()

    def build_conceptname_to_pmids_db(self):
        # This function extracts curie -> PMIDs mappings from the latest Pubmed XML files (saves data in a pickle DB)
        logging.info(f"Starting to build {self.conceptname_to_pmids_db_name} from pubmed files..")
        start = time.time()
        logging.info(f" Deleting any pre-existing Pubmed files..")
        subprocess.call(["rm", "-rf", self.pubmed_directory_path])
        logging.info(f" Downloading latest Pubmed XML files (baseline and update files)..")
        subprocess.check_call(["wget", "-r", "ftp://ftp.ncbi.nlm.nih.gov/pubmed", "-P",
                               self.pubmed_directory_path])
        for sub_dir_name in ["baseline", "updatefiles"]:
            xml_file_sub_dir = f"{self.pubmed_directory_path}/ftp.ncbi.nlm.nih.gov/pubmed/{sub_dir_name}"
            all_file_names = [os.fsdecode(file) for file in os.listdir(xml_file_sub_dir)]
            pubmed_file_names = [file_name for file_name in all_file_names
                                 if file_name.lower().startswith('pubmed')
                                 and file_name.lower().endswith('.xml.gz')]
            # Make sure the files seem to have been downloaded ok
            if not pubmed_file_names:
                if sub_dir_name == "baseline":
                    logging.error("Couldn't find any PubMed baseline XML files to scrape. Something must've gone wrong "
                                  "downloading them.")
                    self.status = 'ERROR'
                    return
                else:
                    logging.warning(f"No Pubmed 'update' files detected. This might be ok (it's possible none exist), "
                                    f"but it's a little weird.")
            logging.info(f" Starting to process {sub_dir_name} PubMed files..")
            conceptname_to_pmids_map = dict()
            # Go through each downloaded pubmed file and build our dictionary of mappings
            pubmed_file_names_to_process = pubmed_file_names if not self.is_test else pubmed_file_names[:1]
            # FIX: use enumerate instead of list.index() in the loop (the latter was an
            # O(n^2) scan and mis-numbers duplicate file names).
            for file_num, file_name in enumerate(pubmed_file_names_to_process, start=1):
                logging.info(f" Starting to process file '{file_name}'.. ({file_num}"
                             f" of {len(pubmed_file_names_to_process)})")
                file_start_time = time.time()
                with gzip.open(f"{xml_file_sub_dir}/{file_name}") as pubmed_file:
                    file_contents_tree = etree.parse(pubmed_file)
                pubmed_articles = file_contents_tree.xpath("//PubmedArticle")
                for article in pubmed_articles:
                    # Link each concept name to the PMID of this article
                    current_pmid = article.xpath(".//MedlineCitation/PMID/text()")[0]
                    descriptor_names = article.xpath(".//MedlineCitation/MeshHeadingList/MeshHeading/DescriptorName/text()")
                    qualifier_names = article.xpath(".//MedlineCitation/MeshHeadingList/MeshHeading/QualifierName/text()")
                    chemical_names = article.xpath(".//MedlineCitation/ChemicalList/Chemical/NameOfSubstance/text()")
                    gene_symbols = article.xpath(".//MedlineCitation/GeneSymbolList/GeneSymbol/text()")
                    keywords = article.xpath(".//MedlineCitation/KeywordList/Keyword/text()")
                    all_concept_names = descriptor_names + qualifier_names + chemical_names + gene_symbols + keywords
                    unique_concept_names = {concept_name for concept_name in all_concept_names if concept_name}
                    for concept_name in unique_concept_names:
                        self._add_pmids_mapping(concept_name, current_pmid, conceptname_to_pmids_map)
                self._destroy_etree(file_contents_tree)  # Hack around lxml memory leak
                logging.info(f" took {round((time.time() - file_start_time) / 60, 2)} minutes")
            # Save the data to the PickleDB after we're done
            # NOTE(review): pickledb.load() re-reads any existing file here, so the
            # 'updatefiles' pass accumulates onto the 'baseline' data — verify that a
            # stale file from an earlier run can't leak old keys into this build.
            logging.info(" Loading data into PickleDB..")
            conceptname_to_pmids_db = pickledb.load(self.conceptname_to_pmids_db_path, False)
            for concept_name, pmid_list in conceptname_to_pmids_map.items():
                conceptname_to_pmids_db.set(concept_name,
                                            list({self._create_pmid_curie_from_local_id(pmid)
                                                  for pmid in pmid_list}))
            logging.info(" Saving PickleDB file..")
            conceptname_to_pmids_db.dump()
        logging.info(f"Done! Building {self.conceptname_to_pmids_db_name} took "
                     f"{round(((time.time() - start) / 60) / 60, 3)} hours")

    def build_curie_to_pmids_db(self):
        # This function creates a final sqlite database of curie->PMIDs mappings using data scraped from Pubmed AND KG2
        logging.info(f"Starting to build {self.curie_to_pmids_db_name}..")
        start = time.time()
        curie_to_pmids_map = dict()
        self._add_pmids_from_pubmed_scrape(curie_to_pmids_map)
        if self.status != 'OK':
            return
        self._add_pmids_from_kg2_edges(curie_to_pmids_map)
        self._add_pmids_from_kg2_nodes(curie_to_pmids_map)
        logging.info(f" In the end, found PMID lists for {len(curie_to_pmids_map)} (canonical) curies")
        self._save_data_in_sqlite_db(curie_to_pmids_map)
        logging.info(f"Done! Building {self.curie_to_pmids_db_name} took "
                     f"{round((time.time() - start) / 60)} minutes.")

    # Helper methods

    def _add_pmids_from_kg2_edges(self, curie_to_pmids_map):
        """Credit each edge's publications to the canonical curies of both endpoint nodes."""
        logging.info(f" Getting PMIDs from edges in KG2 neo4j..")
        edge_query = f"match (n)-[e]->(m) where e.publications is not null " \
                     f"return distinct n.id, m.id, e.publications{' limit 100' if self.is_test else ''}"
        edge_results = self._run_cypher_query(edge_query)
        logging.info(f" Processing results..")
        node_ids = {result['n.id'] for result in edge_results}.union(result['m.id'] for result in edge_results)
        canonicalized_curies_dict = self._get_canonicalized_curies_dict(list(node_ids))
        for result in edge_results:
            # A set, so a self-referencing edge credits its node only once
            canonicalized_node_ids = {canonicalized_curies_dict[result['n.id']],
                                      canonicalized_curies_dict[result['m.id']]}
            pmids = self._extract_and_format_pmids(result['e.publications'])
            if pmids:  # Sometimes publications list includes only non-PMID identifiers (like ISBN)
                for canonical_curie in canonicalized_node_ids:
                    self._add_pmids_mapping(canonical_curie, pmids, curie_to_pmids_map)

    def _add_pmids_from_kg2_nodes(self, curie_to_pmids_map):
        """Credit each node's own publications to its canonical curie."""
        logging.info(f" Getting PMIDs from nodes in KG2 neo4j..")
        node_query = f"match (n) where n.publications is not null " \
                     f"return distinct n.id, n.publications{' limit 100' if self.is_test else ''}"
        node_results = self._run_cypher_query(node_query)
        logging.info(f" Processing results..")
        node_ids = {result['n.id'] for result in node_results}
        canonicalized_curies_dict = self._get_canonicalized_curies_dict(list(node_ids))
        for result in node_results:
            canonical_curie = canonicalized_curies_dict[result['n.id']]
            pmids = self._extract_and_format_pmids(result['n.publications'])
            if pmids:  # Sometimes publications list includes only non-PMID identifiers (like ISBN)
                self._add_pmids_mapping(canonical_curie, pmids, curie_to_pmids_map)

    def _add_pmids_from_pubmed_scrape(self, curie_to_pmids_map):
        """Map the concept names scraped from PubMed to canonical curies and merge in their PMIDs."""
        # Load the data from the first half of the build process (scraping pubmed)
        logging.info(f" Loading pickle DB containing pubmed scrapings ({self.conceptname_to_pmids_db_name})..")
        conceptname_to_pmids_db = pickledb.load(self.conceptname_to_pmids_db_path, False)
        if not conceptname_to_pmids_db.getall():
            logging.error(f"{self.conceptname_to_pmids_db_name} must exist in order to do a partial build. Use "
                          f"--full to do a full build or put your {self.conceptname_to_pmids_db_name} into the right"
                          f" place ({self.conceptname_to_pmids_db_path}).")
            self.status = 'ERROR'
            return
        # Get canonical curies for all of the concept names in our big pubmed pickleDB using the NodeSynonymizer
        concept_names = list(conceptname_to_pmids_db.getall())
        logging.info(f" Sending NodeSynonymizer.get_canonical_curies() a list of {len(concept_names)} concept names..")
        canonical_curies_dict = self.synonymizer.get_canonical_curies(names=concept_names)
        logging.info(f" Got results back from NodeSynonymizer. "
                     f"(Returned dict contains {len(canonical_curies_dict)} keys.)")
        # Map all of the concept names scraped from pubmed to curies
        if canonical_curies_dict:
            recognized_concepts = {concept for concept in canonical_curies_dict
                                   if canonical_curies_dict.get(concept)}
            logging.info(f" NodeSynonymizer recognized {round((len(recognized_concepts) / len(concept_names)) * 100)}%"
                         f" of concept names scraped from pubmed.")
            # Store which concept names the NodeSynonymizer didn't know about, for learning purposes
            unrecognized_concepts = set(canonical_curies_dict).difference(recognized_concepts)
            with open(f"{NGD_DIR}/unrecognized_pubmed_concept_names.txt", "w+") as unrecognized_concepts_file:
                unrecognized_concepts_file.write(f"{unrecognized_concepts}")
            logging.info(f" Unrecognized concept names were written to unrecognized_pubmed_concept_names.txt.")
            # Map the canonical curie for each recognized concept to the concept's PMID list
            logging.info(f" Mapping canonical curies to PMIDs..")
            for concept_name in recognized_concepts:
                canonical_curie = canonical_curies_dict[concept_name].get('preferred_curie')
                pmids_for_this_concept = conceptname_to_pmids_db.get(concept_name)
                self._add_pmids_mapping(canonical_curie, pmids_for_this_concept, curie_to_pmids_map)
            logging.info(f" Mapped {len(curie_to_pmids_map)} canonical curies to PMIDs based on pubmed scrapings.")
        else:
            logging.error(f"NodeSynonymizer didn't return anything!")
            self.status = 'ERROR'

    def _save_data_in_sqlite_db(self, curie_to_pmids_map):
        """Write the curie -> PMIDs map into a fresh sqlite DB (PMIDs stored as a JSON int array)."""
        logging.info(" Loading data into sqlite database..")
        # Remove any preexisting version of this database
        if os.path.exists(self.curie_to_pmids_db_path):
            os.remove(self.curie_to_pmids_db_path)
        connection = sqlite3.connect(self.curie_to_pmids_db_path)
        cursor = connection.cursor()
        cursor.execute("CREATE TABLE curie_to_pmids (curie TEXT, pmids TEXT)")
        cursor.execute("CREATE UNIQUE INDEX unique_curie ON curie_to_pmids (curie)")
        logging.info(f" Gathering row data..")
        # filter(None, ...) drops PMIDs whose local id had no digits (None from the helper)
        rows = [[curie, json.dumps(list(filter(None, {self._get_local_id_as_int(pmid) for pmid in pmids})))]
                for curie, pmids in curie_to_pmids_map.items()]
        rows_in_chunks = self._divide_list_into_chunks(rows, 5000)
        logging.info(f" Inserting row data into database..")
        for chunk in rows_in_chunks:
            cursor.executemany(f"INSERT INTO curie_to_pmids (curie, pmids) VALUES (?, ?)", chunk)
            connection.commit()
        # Log how many rows we've added in the end (for debugging purposes)
        cursor.execute(f"SELECT COUNT(*) FROM curie_to_pmids")
        count = cursor.fetchone()[0]
        logging.info(f" Done saving data in sqlite; database contains {count} rows.")
        cursor.close()
        connection.close()  # FIX: the connection was previously never closed (resource leak)

    def _get_canonicalized_curies_dict(self, curies: List[str]) -> Dict[str, str]:
        """Return a map of each input curie to its canonical curie (itself when unrecognized)."""
        logging.info(f" Sending a batch of {len(curies)} curies to NodeSynonymizer.get_canonical_curies()")
        canonicalized_nodes_info = self.synonymizer.get_canonical_curies(curies)
        canonicalized_curies_dict = dict()
        for input_curie, preferred_info_dict in canonicalized_nodes_info.items():
            if preferred_info_dict:
                canonicalized_curies_dict[input_curie] = preferred_info_dict.get('preferred_curie', input_curie)
            else:
                canonicalized_curies_dict[input_curie] = input_curie
        logging.info(f" Got results back from synonymizer")
        return canonicalized_curies_dict

    def _extract_and_format_pmids(self, publications: List[str]) -> List[str]:
        """Keep only PMID identifiers and normalize them to 'PMID:<digits>' form."""
        pmids = {publication_id for publication_id in publications
                 if publication_id.upper().startswith('PMID')}
        # Make sure all PMIDs are given in same format (e.g., PMID:18299583 rather than PMID18299583)
        formatted_pmids = [self._create_pmid_curie_from_local_id(pmid.replace('PMID', '').replace(':', ''))
                           for pmid in pmids]
        return formatted_pmids

    @staticmethod
    def _add_pmids_mapping(key: str, value_to_append: Union[str, List[str]],
                           mappings_dict: Dict[str, List[str]]):
        """Append a PMID (or list of PMIDs) to mappings_dict[key], creating the list if needed."""
        if key not in mappings_dict:
            mappings_dict[key] = []
        if isinstance(value_to_append, list):
            mappings_dict[key] += value_to_append
        else:
            mappings_dict[key].append(value_to_append)

    @staticmethod
    def _create_pmid_curie_from_local_id(pmid):
        return f"PMID:{pmid}"

    @staticmethod
    def _get_local_id_as_int(curie):
        # Converts "PMID:1234" to 1234; returns None if the local id contains no digits
        curie_pieces = curie.split(":")
        local_id_str = curie_pieces[-1]
        # Remove any strange characters (like in "PMID:_19960544")
        stripped_id_str = "".join([character for character in local_id_str if character.isdigit()])
        return int(stripped_id_str) if stripped_id_str else None

    @staticmethod
    def _destroy_etree(file_contents_tree):
        # Thank you to https://stackoverflow.com/a/49139904 for this method; important to prevent memory blow-up
        root = file_contents_tree.getroot()
        element_tracker = {root: [0, None]}
        for element in root.iterdescendants():
            parent = element.getparent()
            element_tracker[element] = [element_tracker[parent][0] + 1, parent]
        # Detach children before parents (deepest first) so lxml can free them
        element_tracker = sorted([(depth, parent, child)
                                  for child, (depth, parent) in element_tracker.items()],
                                 key=lambda x: x[0], reverse=True)
        for _, parent, child in element_tracker:
            if parent is None:
                break
            parent.remove(child)
        del file_contents_tree

    @staticmethod
    def _run_cypher_query(cypher_query: str) -> List[Dict[str, any]]:
        """Run a cypher query against the KG2 neo4j; returns [] (after logging) on any failure."""
        rtxc = RTXConfiguration()
        rtxc.live = "KG2"
        try:
            driver = GraphDatabase.driver(rtxc.neo4j_bolt,
                                          auth=(rtxc.neo4j_username, rtxc.neo4j_password))
            with driver.session() as session:
                query_results = session.run(cypher_query).data()
            driver.close()
        except Exception:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            logging.error(f"Encountered an error interacting with KG2 neo4j. {tb}")
            return []
        else:
            return query_results

    @staticmethod
    def _divide_list_into_chunks(input_list: List[any], chunk_size: int) -> List[List[any]]:
        """Split input_list into consecutive chunks of at most chunk_size items."""
        num_chunks = len(input_list) // chunk_size if len(input_list) % chunk_size == 0 \
            else (len(input_list) // chunk_size) + 1
        start_index = 0
        stop_index = chunk_size
        all_chunks = []
        for num in range(num_chunks):
            chunk = input_list[start_index:stop_index] if stop_index <= len(input_list) \
                else input_list[start_index:]
            all_chunks.append(chunk)
            start_index += chunk_size
            stop_index += chunk_size
        return all_chunks