def _get_node_synonyms(knowledge_graph):
    """Return a map of each node key in the knowledge graph to its set of equivalent curies.

    Looks up all node keys in one batched NodeSynonymizer.get_equivalent_nodes() call.

    :param knowledge_graph: Object with a dict-like `.nodes` attribute keyed by curie.
    :return: Dict of node key -> set of equivalent curies (per the NodeSynonymizer).
    """
    synonymizer = NodeSynonymizer()
    # set(mapping) iterates keys directly; clearer than a comprehension over .keys()
    node_keys = set(knowledge_graph.nodes)
    equivalent_curie_info = synonymizer.get_equivalent_nodes(node_keys)
    return {node_key: set(equivalent_curies_dict)
            for node_key, equivalent_curies_dict in equivalent_curie_info.items()}
def _get_node_synonyms(knowledge_graph):
    """Map every node id in the knowledge graph to its set of equivalent curies.

    Queries the NodeSynonymizer (against KG2) once for all node ids in the graph.
    """
    synonymizer = NodeSynonymizer()
    ids_to_look_up = {node.id for node in knowledge_graph.nodes}
    synonym_info = synonymizer.get_equivalent_nodes(ids_to_look_up, kg_name='KG2')
    synonyms_by_node_id = dict()
    for node_id, equivalent_curies_dict in synonym_info.items():
        synonyms_by_node_id[node_id] = set(equivalent_curies_dict)
    return synonyms_by_node_id
def _canonicalize_nodes(kg2pre_nodes: List[Dict[str, any]]) -> Tuple[Dict[str, Dict[str, any]], Dict[str, str]]:
    """Collapse KG2pre nodes into canonicalized nodes keyed by preferred curie.

    Uses the NodeSynonymizer to find each node's canonical curie, merges all nodes that
    share a canonical curie into a single record (union of publications, names,
    descriptions, and equivalent curies), and pickles the equivalent-curies map for use
    by a downstream script.

    :param kg2pre_nodes: KG2pre node dicts; each must have an 'id', and 'name',
                         'category', 'iri', 'publications', 'description' are also read.
    :return: Tuple of (canonicalized nodes keyed by canonical curie,
                       map of original node id -> canonical curie).
    """
    logging.info(f"Canonicalizing nodes..")
    synonymizer = NodeSynonymizer()
    node_ids = [node.get('id') for node in kg2pre_nodes if node.get('id')]
    logging.info(f" Sending NodeSynonymizer.get_canonical_curies() {len(node_ids)} curies..")
    canonicalized_info = synonymizer.get_canonical_curies(curies=node_ids, return_all_categories=True)
    all_canonical_curies = {canonical_info['preferred_curie'] for canonical_info in canonicalized_info.values() if canonical_info}
    logging.info(f" Sending NodeSynonymizer.get_equivalent_nodes() {len(all_canonical_curies)} curies..")
    equivalent_curies_info = synonymizer.get_equivalent_nodes(all_canonical_curies)
    # Only keep curies the synonymizer actually returned equivalents for
    recognized_curies = {curie for curie in equivalent_curies_info if equivalent_curies_info.get(curie)}
    equivalent_curies_dict = {curie: list(equivalent_curies_info.get(curie)) for curie in recognized_curies}
    with open(f"{KG2C_DIR}/equivalent_curies.pickle", "wb") as equiv_curies_dump:  # Save these for use by downstream script
        pickle.dump(equivalent_curies_dict, equiv_curies_dump, protocol=pickle.HIGHEST_PROTOCOL)
    logging.info(f" Creating canonicalized nodes..")
    curie_map = dict()
    canonicalized_nodes = dict()
    for kg2pre_node in kg2pre_nodes:
        # Grab relevant info for this node and its canonical version
        canonical_info = canonicalized_info.get(kg2pre_node['id'])
        # Fall back to the node's own id when the synonymizer doesn't recognize it
        canonicalized_curie = canonical_info.get('preferred_curie', kg2pre_node['id']) if canonical_info else kg2pre_node['id']
        publications = kg2pre_node['publications'] if kg2pre_node.get('publications') else []
        descriptions_list = [kg2pre_node['description']] if kg2pre_node.get('description') else []
        if canonicalized_curie in canonicalized_nodes:
            # Merge this node into its corresponding canonical node
            existing_canonical_node = canonicalized_nodes[canonicalized_curie]
            existing_canonical_node['publications'] = _merge_two_lists(existing_canonical_node['publications'], publications)
            existing_canonical_node['all_names'] = _merge_two_lists(existing_canonical_node['all_names'], [kg2pre_node['name']])
            existing_canonical_node['descriptions_list'] = _merge_two_lists(existing_canonical_node['descriptions_list'], descriptions_list)
            # Make sure any nodes subject to #1074-like problems still appear in equivalent curies
            existing_canonical_node['equivalent_curies'] = _merge_two_lists(existing_canonical_node['equivalent_curies'], [kg2pre_node['id']])
            # Add the IRI for the 'preferred' curie, if we've found that node
            if kg2pre_node['id'] == canonicalized_curie:
                existing_canonical_node['iri'] = kg2pre_node.get('iri')
        else:
            # Initiate the canonical node for this synonym group
            name = canonical_info['preferred_name'] if canonical_info else kg2pre_node['name']
            category = canonical_info['preferred_category'] if canonical_info else kg2pre_node['category']
            all_categories = list(canonical_info['all_categories']) if canonical_info else [kg2pre_node['category']]
            # IRI is only kept when this node IS the preferred one for its synonym group
            iri = kg2pre_node['iri'] if kg2pre_node['id'] == canonicalized_curie else None
            all_names = [kg2pre_node['name']]
            canonicalized_node = _create_node(preferred_curie=canonicalized_curie, name=name, category=category,
                                              all_categories=all_categories, publications=publications,
                                              equivalent_curies=equivalent_curies_dict.get(canonicalized_curie, [canonicalized_curie]),
                                              iri=iri, description=None, descriptions_list=descriptions_list,
                                              all_names=all_names)
            canonicalized_nodes[canonicalized_node['id']] = canonicalized_node
        curie_map[kg2pre_node['id']] = canonicalized_curie  # Record this mapping for easy lookup later
    logging.info(f"Number of KG2pre nodes was reduced to {len(canonicalized_nodes)} "
                 f"({round((len(canonicalized_nodes) / len(kg2pre_nodes)) * 100)}%)")
    return canonicalized_nodes, curie_map
def get_curie_synonyms(curie: Union[str, List[str]], log: ARAXResponse) -> List[str]:
    """Return a sorted list of all curies equivalent to the given curie(s).

    Queries the NodeSynonymizer (KG2) once for all inputs. The inputs themselves are
    always included in the result, even when no synonyms are known for them. On
    synonymizer failure an error is logged and an empty list is returned.
    """
    curies = convert_string_or_list_to_list(curie)
    try:
        synonymizer = NodeSynonymizer()
        log.debug(f"Sending NodeSynonymizer.get_equivalent_nodes() a list of {len(curies)} curies")
        equivalent_curies_dict = synonymizer.get_equivalent_nodes(curies, kg_name="KG2")
        log.debug(f"Got response back from NodeSynonymizer")
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        log.error(f"Encountered a problem using NodeSynonymizer: {tb}", error_code=error_type.__name__)
        return []
    else:
        # Guard clause: a None response means normalization itself failed
        if equivalent_curies_dict is None:
            log.error(f"NodeSynonymizer returned None", error_code="NodeNormalizationIssue")
            return []
        curies_missing_info = {curie for curie in equivalent_curies_dict
                               if not equivalent_curies_dict.get(curie)}
        if curies_missing_info:
            log.warning(f"NodeSynonymizer did not find any equivalent curies for: {curies_missing_info}")
        equivalent_curies = {synonym for curie_dict in equivalent_curies_dict.values() if curie_dict
                             for synonym in curie_dict}
        # Make sure even curies without synonyms are included
        all_curies = equivalent_curies.union(set(curies))
        return sorted(all_curies)
def get_curie_synonyms_dict(curie: Union[str, List[str]],
                            log: Optional[ARAXResponse] = None) -> Dict[str, List[str]]:
    """Map each input curie to the list of curies the NodeSynonymizer considers equivalent.

    Curies the synonymizer doesn't recognize map to a single-item list containing
    themselves, so every input curie appears as a key in the result.

    :param curie: A single curie (str) or a list of curies.
    :param log: Optional ARAXResponse to log to; a fresh one is created when omitted.
    :return: Dict of input curie -> list of equivalent curies; empty dict on failure.
    """
    # Bug fix: the old signature used `log=ARAXResponse()`, a mutable default that was
    # instantiated ONCE at import time and then shared (and mutated) by every call that
    # omitted `log`. Create a fresh response per call instead (backward-compatible).
    if log is None:
        log = ARAXResponse()
    curies = convert_to_list(curie)
    try:
        synonymizer = NodeSynonymizer()
        log.debug(f"Sending NodeSynonymizer.get_equivalent_nodes() a list of {len(curies)} curies")
        equivalent_curies_dict = synonymizer.get_equivalent_nodes(curies)
        log.debug(f"Got response back from NodeSynonymizer")
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        log.error(f"Encountered a problem using NodeSynonymizer: {tb}", error_code=error_type.__name__)
        return dict()
    else:
        if equivalent_curies_dict is not None:
            curies_missing_info = {curie for curie in equivalent_curies_dict
                                   if not equivalent_curies_dict.get(curie)}
            if curies_missing_info:
                log.warning(f"NodeSynonymizer did not find any equivalent curies for: {curies_missing_info}")
            final_curie_dict = dict()
            for input_curie in curies:
                curie_dict = equivalent_curies_dict.get(input_curie)
                # Even unrecognized curies appear in the result, mapped to themselves
                final_curie_dict[input_curie] = list(curie_dict) if curie_dict else [input_curie]
            return final_curie_dict
        else:
            log.error(f"NodeSynonymizer returned None", error_code="NodeNormalizationIssue")
            return dict()
class PredictDrugTreatsDisease:
    """Overlay that decorates KG edges with a drug-treats-disease probability.

    Uses a pre-trained logistic-regression model (LogModel.pkl) plus a graph database
    (GRAPH.sqlite); both files are fetched via scp from arax.rtx.ai on first use if not
    already present locally. Probabilities are attached to edges as EdgeAttributes named
    "probability_treats". See https://doi.org/10.1101/765305 for the underlying method.
    """

    #### Constructor
    def __init__(self, response, message, parameters):
        self.response = response
        self.message = message
        self.parameters = parameters
        self.global_iter = 0  # counter used to generate unique virtual-edge ids
        ## check if the new model files exists in /predictor/retrain_data. If not, scp it from arax.rtx.ai
        pathlist = os.path.realpath(__file__).split(os.path.sep)
        RTXindex = pathlist.index("RTX")
        filepath = os.path.sep.join([*pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'ARAXQuery', 'Overlay', 'predictor', 'retrain_data'])

        ## check if there is LogModel.pkl
        pkl_file = f"{filepath}/LogModel.pkl"
        if os.path.exists(pkl_file):
            pass
        else:
            os.system("scp [email protected]:/home/ubuntu/drug_repurposing_model_retrain/LogModel.pkl " + pkl_file)

        ## check if there is GRAPH.sqlite
        db_file = f"{filepath}/GRAPH.sqlite"
        if os.path.exists(db_file):
            pass
        else:
            os.system("scp [email protected]:/home/ubuntu/drug_repurposing_model_retrain/GRAPH.sqlite " + db_file)

        # use NodeSynonymizer to replace map.txt
        # check if there is map.txt
        # map_file = f"{filepath}/map.txt"
        # if os.path.exists(map_file):
        #     pass
        # else:
        #     os.system("scp [email protected]:/home/ubuntu/drug_repurposing_model_retrain/map.txt " + map_file)

        self.pred = predictor(model_file=pkl_file)
        self.pred.import_file(None, graph_database=db_file)
        # with open(map_file, 'r') as infile:
        #     map_file_content = infile.readlines()
        #     map_file_content.pop(0)  ## remove title
        #     self.known_curies = set(line.strip().split('\t')[0] for line in map_file_content)

        self.synonymizer = NodeSynonymizer()

    def convert_to_trained_curies(self, input_curie):
        """
        Takes an input curie from the KG, uses the synonymizer, and then returns something that the map.csv can handle
        """
        normalizer_result = self.synonymizer.get_equivalent_nodes(input_curie, kg_name='KG2')
        curies_in_model = normalizer_result[input_curie]
        # curies_in_model = [curie for curie in curies_in_model if curie in self.known_curies]
        # equivalent_curies = []  # start with empty equivalent_curies
        # try:
        #     equivalent_curies = [x['identifier'] for x in normalizer_result[input_curie]['equivalent_identifiers']]
        # except:
        #     self.response.warning(f"NodeSynonmizer could not find curies for {input_curie}, skipping this one.")
        # for curie in equivalent_curies:
        #     curie_prefix = curie.split(':')[0]
        #     # FIXME: fix this when re-training the ML model, as when this was originally trained, it was ChEMBL:123, not CHEMBL.COMPOUND:CHEMBL123
        #     if curie_prefix == "CHEMBL.COMPOUND":
        #         chembl_fix = 'ChEMBL:' + curie[22:]
        #         if chembl_fix in self.known_curies:
        #             curies_in_model.add(chembl_fix)
        #     elif curie in self.known_curies:
        #         curies_in_model.add(curie)
        return curies_in_model

    def predict_drug_treats_disease(self):
        """
        Iterate over all the edges in the knowledge graph, add the drug-disease treatment probability for appropriate edges
        on the edge_attributes
        :return: response
        """
        parameters = self.parameters
        self.response.debug(f"Computing drug disease treatment probability based on a machine learning model")
        self.response.info(f"Computing drug disease treatment probability based on a machine learning model: See [this publication](https://doi.org/10.1101/765305) for more details about how this is accomplished.")
        attribute_name = "probability_treats"
        attribute_type = "EDAM:data_0951"
        value = 0  # this will be the default value. If the model returns 0, or the default is there, don't include that edge
        url = "https://doi.org/10.1101/765305"

        # if you want to add virtual edges, identify the source/targets, decorate the edges, add them to the KG, and then add one to the QG corresponding to them
        if 'virtual_relation_label' in parameters:
            source_curies_to_decorate = set()
            target_curies_to_decorate = set()
            # identify the nodes that we should be adding virtual edges for
            for node in self.message.knowledge_graph.nodes:
                if hasattr(node, 'qnode_ids'):
                    if parameters['source_qnode_id'] in node.qnode_ids:
                        if "drug" in node.type or "chemical_substance" in node.type:  # this is now NOT checked by ARAX_overlay
                            source_curies_to_decorate.add(node.id)
                    if parameters['target_qnode_id'] in node.qnode_ids:
                        if "disease" in node.type or "phenotypic_feature" in node.type:  # this is now NOT checked by ARAX_overlay
                            target_curies_to_decorate.add(node.id)
            added_flag = False  # check to see if any edges where added
            # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
            for (source_curie, target_curie) in itertools.product(source_curies_to_decorate, target_curies_to_decorate):
                # create the edge attribute if it can be
                # loop over all equivalent curies and take the highest probability
                max_probability = 0
                converted_source_curie = self.convert_to_trained_curies(source_curie)
                converted_target_curie = self.convert_to_trained_curies(target_curie)
                if converted_source_curie is None or converted_target_curie is None:
                    continue
                res = list(itertools.product(converted_source_curie, converted_target_curie))
                if len(res) != 0:
                    all_probabilities = self.pred.prob_all(res)
                    if isinstance(all_probabilities, list):
                        # NOTE(review): `value` here shadows the outer default `value` on purpose?
                        # The max over finite probabilities becomes the attribute value below.
                        max_probability = max([value for value in all_probabilities if np.isfinite(value)])
                value = max_probability
                #probability = self.pred.prob_single('ChEMBL:' + source_curie[22:], target_curie)  # FIXME: when this was trained, it was ChEMBL:123, not CHEMBL.COMPOUND:CHEMBL123
                #if probability and np.isfinite(probability):  # finite, that's ok, otherwise, stay with default
                #    value = probability[0]
                edge_attribute = EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)  # populate the edge attribute
                if edge_attribute and value != 0:
                    added_flag = True
                    # make the edge, add the attribute
                    # edge properties
                    now = datetime.now()
                    edge_type = "probably_treats"
                    qedge_ids = [parameters['virtual_relation_label']]
                    relation = parameters['virtual_relation_label']
                    is_defined_by = "ARAX"
                    defined_datetime = now.strftime("%Y-%m-%d %H:%M:%S")
                    provided_by = "ARAX"
                    confidence = None
                    weight = None  # TODO: could make the actual value of the attribute
                    source_id = source_curie
                    target_id = target_curie
                    # now actually add the virtual edges in
                    id = f"{relation}_{self.global_iter}"
                    self.global_iter += 1
                    edge = Edge(id=id, type=edge_type, relation=relation, source_id=source_id,
                                target_id=target_id, is_defined_by=is_defined_by, defined_datetime=defined_datetime,
                                provided_by=provided_by, confidence=confidence, weight=weight,
                                edge_attributes=[edge_attribute], qedge_ids=qedge_ids)
                    self.message.knowledge_graph.edges.append(edge)

            # Now add a q_edge the query_graph since I've added an extra edge to the KG
            if added_flag:
                edge_type = "probably_treats"
                relation = parameters['virtual_relation_label']
                qedge_id = parameters['virtual_relation_label']
                q_edge = QEdge(id=relation, type=edge_type, relation=relation,
                               source_id=parameters['source_qnode_id'],
                               target_id=parameters['target_qnode_id'])  # TODO: ok to make the id and type the same thing?
                self.message.query_graph.edges.append(q_edge)

            return self.response
        else:  # you want to add it for each edge in the KG
            # iterate over KG edges, add the information
            try:
                # map curies to types
                curie_to_type = dict()
                for node in self.message.knowledge_graph.nodes:
                    curie_to_type[node.id] = node.type
                # then iterate over the edges and decorate if appropriate
                for edge in self.message.knowledge_graph.edges:
                    # Make sure the edge_attributes are not None
                    if not edge.edge_attributes:
                        edge.edge_attributes = []  # should be an array, but why not a list?
                    # now go and actually get the NGD
                    source_curie = edge.source_id
                    target_curie = edge.target_id
                    source_types = curie_to_type[source_curie]
                    target_types = curie_to_type[target_curie]
                    # Only decorate drug/chemical -> disease/phenotype edges (either direction)
                    if (("drug" in source_types) or ("chemical_substance" in source_types)) and (("disease" in target_types) or ("phenotypic_feature" in target_types)):
                        temp_value = 0
                        # loop over all pairs of equivalent curies and take the highest probability
                        max_probability = 0
                        converted_source_curie = self.convert_to_trained_curies(source_curie)
                        converted_target_curie = self.convert_to_trained_curies(target_curie)
                        if converted_source_curie is None or converted_target_curie is None:
                            continue
                        res = list(itertools.product(converted_source_curie, converted_target_curie))
                        if len(res) != 0:
                            all_probabilities = self.pred.prob_all(res)
                            if isinstance(all_probabilities, list):
                                max_probability = max([value for value in all_probabilities if np.isfinite(value)])
                        value = max_probability
                        #probability = self.pred.prob_single('ChEMBL:' + source_curie[22:], target_curie)  # FIXME: when this was trained, it was ChEMBL:123, not CHEMBL.COMPOUND:CHEMBL123
                        #if probability and np.isfinite(probability):  # finite, that's ok, otherwise, stay with default
                        #    value = probability[0]
                    elif (("drug" in target_types) or ("chemical_substance" in target_types)) and (("disease" in source_types) or ("phenotypic_feature" in source_types)):
                        #probability = self.pred.prob_single('ChEMBL:' + target_curie[22:], source_curie)  # FIXME: when this was trained, it was ChEMBL:123, not CHEMBL.COMPOUND:CHEMBL123
                        #if probability and np.isfinite(probability):  # finite, that's ok, otherwise, stay with default
                        #    value = probability[0]
                        max_probability = 0
                        converted_source_curie = self.convert_to_trained_curies(source_curie)
                        converted_target_curie = self.convert_to_trained_curies(target_curie)
                        if converted_source_curie is None or converted_target_curie is None:
                            continue
                        # Note: pair order is (target, source) here since the drug is the edge's target
                        res = list(itertools.product(converted_target_curie, converted_source_curie))
                        if len(res) != 0:
                            all_probabilities = self.pred.prob_all(res)
                            if isinstance(all_probabilities, list):
                                max_probability = max([value for value in all_probabilities if np.isfinite(value)])
                        value = max_probability
                    else:
                        continue
                    if value != 0:
                        edge_attribute = EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)  # populate the attribute
                        edge.edge_attributes.append(edge_attribute)  # append it to the list of attributes
            except:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Something went wrong adding the drug disease treatment probability")
            else:
                self.response.info(f"Drug disease treatment probability successfully added to edges")

            return self.response
def _canonicalize_nodes(nodes: List[Dict[str, any]]) -> Tuple[List[Dict[str, any]], Dict[str, str]]:
    """Collapse nodes into canonicalized nodes formatted for loading into neo4j.

    Merges nodes sharing a canonical curie, adds an 'RTX:KG2C' build-info node, decorates
    each canonical node with its equivalent curies, then converts list fields into the
    string format neo4j expects.

    NOTE(review): this function mutates the input node dicts in place (the
    'publications' field is literal-eval'ed on each input node).

    :param nodes: Node dicts; each must have an 'id', and 'name'/'category_label'/
                  'publications' are also read.
    :return: Tuple of (list of canonicalized node dicts,
                       map of original node id -> canonical curie).
    """
    synonymizer = NodeSynonymizer()
    node_ids = [node.get('id') for node in nodes if node.get('id')]
    print(f" Sending NodeSynonymizer.get_canonical_curies() a list of {len(node_ids)} curies..")
    canonicalized_info = synonymizer.get_canonical_curies(curies=node_ids, return_all_types=True)
    print(f" Creating canonicalized nodes..")
    curie_map = dict()
    canonicalized_nodes = dict()
    for node in nodes:
        canonical_info = canonicalized_info.get(node['id'])
        # Fall back to the node's own id when the synonymizer doesn't recognize it
        canonicalized_curie = canonical_info.get('preferred_curie', node['id']) if canonical_info else node['id']
        node['publications'] = _literal_eval_list(node['publications'])  # Only need to do this until kg2.2+ is rolled out
        if canonicalized_curie in canonicalized_nodes:
            # Merge this node's publications into the existing canonical node
            existing_canonical_node = canonicalized_nodes[canonicalized_curie]
            existing_canonical_node['publications'] = _merge_two_lists(existing_canonical_node['publications'], node['publications'])
        else:
            if canonical_info:
                canonicalized_node = {
                    'id': canonicalized_curie,
                    'name': canonical_info.get('preferred_name', node['name']),
                    'types': list(canonical_info.get('all_types')),
                    'preferred_type': canonical_info.get('preferred_type', node['category_label']),
                    'publications': node['publications']
                }
            else:
                # No synonymizer info; build the canonical node from the input node alone
                canonicalized_node = {
                    'id': canonicalized_curie,
                    'name': node['name'],
                    'types': [node['category_label']],
                    'preferred_type': node['category_label'],
                    'publications': node['publications']
                }
            canonicalized_nodes[canonicalized_node['id']] = canonicalized_node
        curie_map[node['id']] = canonicalized_curie  # Record this mapping for easy lookup later

    # Create a node containing information about this KG2C build
    new_build_node = {
        'id': 'RTX:KG2C',
        'name': f"KG2C:Build created on {datetime.now().strftime('%Y-%m-%d %H:%M')}",
        'types': ['data_file'],
        'preferred_type': 'data_file',
        'publications': []
    }
    canonicalized_nodes[new_build_node['id']] = new_build_node

    # Decorate nodes with equivalent curies
    print(f" Sending NodeSynonymizer.get_equivalent_nodes() a list of {len(canonicalized_nodes)} curies..")
    equivalent_curies_dict = synonymizer.get_equivalent_nodes(list(canonicalized_nodes.keys()))
    for curie, canonical_node in canonicalized_nodes.items():
        equivalent_curies = []
        equivalent_curies_dict_for_curie = equivalent_curies_dict.get(curie)
        if equivalent_curies_dict_for_curie is not None:
            for equivalent_curie in equivalent_curies_dict_for_curie:
                equivalent_curies.append(equivalent_curie)
        canonical_node['equivalent_curies'] = equivalent_curies

    # Convert array fields into the format neo4j wants and do final processing
    for canonicalized_node in canonicalized_nodes.values():
        canonicalized_node['types'] = _convert_list_to_neo4j_format(canonicalized_node['types'])
        canonicalized_node['publications'] = _convert_list_to_neo4j_format(canonicalized_node['publications'])
        canonicalized_node['equivalent_curies'] = _convert_list_to_neo4j_format(canonicalized_node['equivalent_curies'])
        canonicalized_node['preferred_type_for_conversion'] = canonicalized_node['preferred_type']
    return list(canonicalized_nodes.values()), curie_map
def _canonicalize_nodes(neo4j_nodes: List[Dict[str, any]]) -> Tuple[Dict[str, Dict[str, any]], Dict[str, str]]:
    """Collapse neo4j nodes into canonicalized nodes keyed by their preferred curie.

    Uses the NodeSynonymizer to find each node's canonical curie, merges nodes sharing a
    canonical curie (union of publications, names, descriptions, and equivalent curies),
    and warns about any categories not in 'biolink:PascalCase' format.

    :param neo4j_nodes: Node dicts; each must have an 'id', and 'name'/'category'/'iri'/
                        'publications'/'description' are also read.
    :return: Tuple of (canonicalized nodes keyed by canonical curie,
                       map of original node id -> canonical curie).
    """
    synonymizer = NodeSynonymizer()
    node_ids = [node.get('id') for node in neo4j_nodes if node.get('id')]
    print(f" Sending NodeSynonymizer.get_canonical_curies() {len(node_ids)} curies..")
    canonicalized_info = synonymizer.get_canonical_curies(curies=node_ids, return_all_categories=True)
    all_canonical_curies = {canonical_info['preferred_curie'] for canonical_info in canonicalized_info.values() if canonical_info}
    print(f" Sending NodeSynonymizer.get_equivalent_nodes() {len(all_canonical_curies)} curies..")
    equivalent_curies_info = synonymizer.get_equivalent_nodes(all_canonical_curies)
    # Only keep curies the synonymizer actually returned equivalents for
    recognized_curies = {curie for curie in equivalent_curies_info if equivalent_curies_info.get(curie)}
    equivalent_curies_dict = {curie: list(equivalent_curies_info.get(curie)) for curie in recognized_curies}
    print(f" Creating canonicalized nodes..")
    curie_map = dict()
    canonicalized_nodes = dict()
    for neo4j_node in neo4j_nodes:
        # Grab relevant info for this node and its canonical version
        canonical_info = canonicalized_info.get(neo4j_node['id'])
        # Fall back to the node's own id when the synonymizer doesn't recognize it
        canonicalized_curie = canonical_info.get('preferred_curie', neo4j_node['id']) if canonical_info else neo4j_node['id']
        publications = neo4j_node['publications'] if neo4j_node.get('publications') else []
        descriptions_list = [neo4j_node['description']] if neo4j_node.get('description') else []
        if canonicalized_curie in canonicalized_nodes:
            # Merge this node into its corresponding canonical node
            existing_canonical_node = canonicalized_nodes[canonicalized_curie]
            existing_canonical_node['publications'] = _merge_two_lists(existing_canonical_node['publications'], publications)
            existing_canonical_node['all_names'] = _merge_two_lists(existing_canonical_node['all_names'], [neo4j_node['name']])
            existing_canonical_node['descriptions_list'] = _merge_two_lists(existing_canonical_node['descriptions_list'], descriptions_list)
            # Make sure any nodes subject to #1074-like problems still appear in equivalent curies
            existing_canonical_node['equivalent_curies'] = _merge_two_lists(existing_canonical_node['equivalent_curies'], [neo4j_node['id']])
            # Add the IRI and description for the 'preferred' curie, if we've found that node
            if neo4j_node['id'] == canonicalized_curie:
                existing_canonical_node['iri'] = neo4j_node.get('iri')
                existing_canonical_node['description'] = neo4j_node.get('description')
        else:
            # Initiate the canonical node for this synonym group
            name = canonical_info['preferred_name'] if canonical_info else neo4j_node['name']
            category = canonical_info['preferred_category'] if canonical_info else neo4j_node['category']
            if not category.startswith("biolink:"):
                print(f" WARNING: Preferred category for {canonicalized_curie} doesn't start with 'biolink:': {category}")
            all_categories = list(canonical_info['all_categories']) if canonical_info else [neo4j_node['category']]
            expanded_categories = list(canonical_info['expanded_categories']) if canonical_info else [neo4j_node['category']]
            # IRI/description only kept when this node IS the preferred one for its group
            iri = neo4j_node['iri'] if neo4j_node['id'] == canonicalized_curie else None
            description = neo4j_node.get('description') if neo4j_node['id'] == canonicalized_curie else None
            all_names = [neo4j_node['name']]
            # Check for bug where not all categories in synonymizer were of "biolink:PascalCase" format
            if not all(category.startswith("biolink:") for category in all_categories):
                print(f" WARNING: all_categories for {canonicalized_curie} contain non 'biolink:PascalCase' "
                      f"items: {all_categories}")
            if not all(category.startswith("biolink:") for category in expanded_categories):
                print(f" WARNING: expanded_categories for {canonicalized_curie} contain non 'biolink:PascalCase' "
                      f"items: {expanded_categories}")
            canonicalized_node = _create_node(preferred_curie=canonicalized_curie, name=name, category=category,
                                              all_categories=all_categories, expanded_categories=expanded_categories,
                                              publications=publications,
                                              equivalent_curies=equivalent_curies_dict.get(canonicalized_curie, [canonicalized_curie]),
                                              iri=iri, description=description, descriptions_list=descriptions_list,
                                              all_names=all_names)
            canonicalized_nodes[canonicalized_node['id']] = canonicalized_node
        curie_map[neo4j_node['id']] = canonicalized_curie  # Record this mapping for easy lookup later
    return canonicalized_nodes, curie_map
def _canonicalize_nodes(nodes: List[Dict[str, any]]) -> Tuple[Dict[str, Dict[str, any]], Dict[str, str]]:
    """Collapse nodes into canonicalized nodes keyed by their preferred curie.

    Uses the NodeSynonymizer to find each node's canonical curie and merges all nodes
    sharing a canonical curie (union of publications, names, and descriptions).

    :param nodes: Node dicts; each must have an 'id', and 'name'/'category_label'/'iri'/
                  'publications'/'description' are also read.
    :return: Tuple of (canonicalized nodes keyed by canonical curie,
                       map of original node id -> canonical curie).
    """
    synonymizer = NodeSynonymizer()
    node_ids = [node.get('id') for node in nodes if node.get('id')]
    print(f" Sending NodeSynonymizer.get_canonical_curies() {len(node_ids)} curies..")
    canonicalized_info = synonymizer.get_canonical_curies(curies=node_ids, return_all_types=True)
    all_canonical_curies = {canonical_info['preferred_curie'] for canonical_info in canonicalized_info.values() if canonical_info}
    print(f" Sending NodeSynonymizer.get_equivalent_nodes() {len(all_canonical_curies)} curies..")
    equivalent_curies_info = synonymizer.get_equivalent_nodes(all_canonical_curies)
    # Only keep curies the synonymizer actually returned equivalents for
    recognized_curies = {curie for curie in equivalent_curies_info if equivalent_curies_info.get(curie)}
    equivalent_curies_dict = {curie: list(equivalent_curies_info.get(curie)) for curie in recognized_curies}
    print(f" Creating canonicalized nodes..")
    curie_map = dict()
    canonicalized_nodes = dict()
    for node in nodes:
        canonical_info = canonicalized_info.get(node['id'])
        # Fall back to the node's own id when the synonymizer doesn't recognize it
        canonicalized_curie = canonical_info.get('preferred_curie', node['id']) if canonical_info else node['id']
        publications = node['publications'] if node.get('publications') else []
        description_in_list = [node['description']] if node.get('description') else []
        if canonicalized_curie in canonicalized_nodes:
            # Merge this node into its corresponding canonical node
            existing_canonical_node = canonicalized_nodes[canonicalized_curie]
            existing_canonical_node['publications'] = _merge_two_lists(existing_canonical_node['publications'], publications)
            existing_canonical_node['all_names'] = _merge_two_lists(existing_canonical_node['all_names'], [node['name']])
            existing_canonical_node['description'] = _merge_two_lists(existing_canonical_node['description'], description_in_list)
            # Add the IRI for the 'preferred' curie, if we've found that node
            if node['id'] == canonicalized_curie:
                existing_canonical_node['iri'] = node.get('iri')
        else:
            # Initiate the canonical node for this synonym group
            name = canonical_info['preferred_name'] if canonical_info else node['name']
            preferred_type = canonical_info['preferred_type'] if canonical_info else node['category_label']
            types = list(canonical_info['all_types']) if canonical_info else [node['category_label']]
            # IRI only kept when this node IS the preferred one for its synonym group
            iri = node['iri'] if node['id'] == canonicalized_curie else None
            all_names = [node['name']]
            canonicalized_node = _create_node(node_id=canonicalized_curie, name=name, preferred_type=preferred_type,
                                              types=types, publications=publications,
                                              equivalent_curies=equivalent_curies_dict.get(canonicalized_curie, []),
                                              iri=iri, description=description_in_list, all_names=all_names)
            canonicalized_nodes[canonicalized_node['id']] = canonicalized_node
        curie_map[node['id']] = canonicalized_curie  # Record this mapping for easy lookup later
    return canonicalized_nodes, curie_map