示例#1
0
 def _get_node_synonyms(knowledge_graph):
     """Map each node key in the knowledge graph to its set of equivalent curies.

     Queries the NodeSynonymizer for all node keys at once and returns a dict
     of node_key -> set of equivalent curies.
     """
     synonymizer = NodeSynonymizer()
     # set(mapping) iterates the keys directly; an identity comprehension is redundant
     node_keys = set(knowledge_graph.nodes.keys())
     equivalent_curie_info = synonymizer.get_equivalent_nodes(node_keys)
     return {
         node_key: set(equivalent_curies_dict)
         for node_key, equivalent_curies_dict in
         equivalent_curie_info.items()
     }
 def _get_node_synonyms(knowledge_graph):
     """Return a mapping from each KG node id to its set of equivalent curies.

     Looks all node ids up in the NodeSynonymizer (against KG2) in one batch.
     """
     synonymizer = NodeSynonymizer()
     ids_to_look_up = {node.id for node in knowledge_graph.nodes}
     synonym_info = synonymizer.get_equivalent_nodes(ids_to_look_up,
                                                     kg_name='KG2')
     synonyms_by_id = dict()
     for node_id, curies in synonym_info.items():
         synonyms_by_id[node_id] = set(curies)
     return synonyms_by_id
示例#3
0
def _canonicalize_nodes(kg2pre_nodes: List[Dict[str, any]]) -> Tuple[Dict[str, Dict[str, any]], Dict[str, str]]:
    """Collapse KG2pre nodes into canonical nodes keyed by preferred curie.

    Looks up every node's canonical curie with the NodeSynonymizer, merges
    nodes that share the same canonical curie (publications, names,
    descriptions, equivalent curies), and pickles the canonical-curie ->
    equivalent-curies map for use by a downstream script.

    Returns a tuple of (canonicalized_nodes keyed by canonical curie,
    curie_map of original node id -> canonical curie).
    """
    logging.info(f"Canonicalizing nodes..")
    synonymizer = NodeSynonymizer()
    node_ids = [node.get('id') for node in kg2pre_nodes if node.get('id')]
    logging.info(f"  Sending NodeSynonymizer.get_canonical_curies() {len(node_ids)} curies..")
    canonicalized_info = synonymizer.get_canonical_curies(curies=node_ids, return_all_categories=True)
    all_canonical_curies = {canonical_info['preferred_curie'] for canonical_info in canonicalized_info.values() if canonical_info}
    logging.info(f"  Sending NodeSynonymizer.get_equivalent_nodes() {len(all_canonical_curies)} curies..")
    equivalent_curies_info = synonymizer.get_equivalent_nodes(all_canonical_curies)
    # Only keep curies the synonymizer actually returned synonyms for
    recognized_curies = {curie for curie in equivalent_curies_info if equivalent_curies_info.get(curie)}
    equivalent_curies_dict = {curie: list(equivalent_curies_info.get(curie)) for curie in recognized_curies}
    with open(f"{KG2C_DIR}/equivalent_curies.pickle", "wb") as equiv_curies_dump:  # Save these for use by downstream script
        pickle.dump(equivalent_curies_dict, equiv_curies_dump, protocol=pickle.HIGHEST_PROTOCOL)
    logging.info(f"  Creating canonicalized nodes..")
    curie_map = dict()
    canonicalized_nodes = dict()
    for kg2pre_node in kg2pre_nodes:
        # Grab relevant info for this node and its canonical version
        canonical_info = canonicalized_info.get(kg2pre_node['id'])
        canonicalized_curie = canonical_info.get('preferred_curie', kg2pre_node['id']) if canonical_info else kg2pre_node['id']
        publications = kg2pre_node['publications'] if kg2pre_node.get('publications') else []
        descriptions_list = [kg2pre_node['description']] if kg2pre_node.get('description') else []
        if canonicalized_curie in canonicalized_nodes:
            # Merge this node into its corresponding canonical node
            existing_canonical_node = canonicalized_nodes[canonicalized_curie]
            existing_canonical_node['publications'] = _merge_two_lists(existing_canonical_node['publications'], publications)
            existing_canonical_node['all_names'] = _merge_two_lists(existing_canonical_node['all_names'], [kg2pre_node['name']])
            existing_canonical_node['descriptions_list'] = _merge_two_lists(existing_canonical_node['descriptions_list'], descriptions_list)
            # Make sure any nodes subject to #1074-like problems still appear in equivalent curies
            existing_canonical_node['equivalent_curies'] = _merge_two_lists(existing_canonical_node['equivalent_curies'], [kg2pre_node['id']])
            # Add the IRI for the 'preferred' curie, if we've found that node
            if kg2pre_node['id'] == canonicalized_curie:
                existing_canonical_node['iri'] = kg2pre_node.get('iri')
        else:
            # Initiate the canonical node for this synonym group
            name = canonical_info['preferred_name'] if canonical_info else kg2pre_node['name']
            category = canonical_info['preferred_category'] if canonical_info else kg2pre_node['category']
            all_categories = list(canonical_info['all_categories']) if canonical_info else [kg2pre_node['category']]
            # NOTE(review): direct ['iri'] access here vs. .get('iri') in the merge branch above —
            # this raises KeyError for a preferred node lacking 'iri'; confirm whether intended
            iri = kg2pre_node['iri'] if kg2pre_node['id'] == canonicalized_curie else None
            all_names = [kg2pre_node['name']]
            canonicalized_node = _create_node(preferred_curie=canonicalized_curie,
                                              name=name,
                                              category=category,
                                              all_categories=all_categories,
                                              publications=publications,
                                              equivalent_curies=equivalent_curies_dict.get(canonicalized_curie, [canonicalized_curie]),
                                              iri=iri,
                                              description=None,
                                              descriptions_list=descriptions_list,
                                              all_names=all_names)
            canonicalized_nodes[canonicalized_node['id']] = canonicalized_node
        curie_map[kg2pre_node['id']] = canonicalized_curie  # Record this mapping for easy lookup later
    logging.info(f"Number of KG2pre nodes was reduced to {len(canonicalized_nodes)} "
                 f"({round((len(canonicalized_nodes) / len(kg2pre_nodes)) * 100)}%)")
    return canonicalized_nodes, curie_map
示例#4
0
def get_curie_synonyms(curie: Union[str, List[str]],
                       log: ARAXResponse) -> List[str]:
    """Return a sorted list of all curies equivalent to the given curie(s).

    The input curie(s) themselves are always included in the result, even
    when the NodeSynonymizer has no synonyms for them. Returns [] (after
    logging an error) if the synonymizer raises or returns None.
    """
    curies = convert_string_or_list_to_list(curie)
    try:
        synonymizer = NodeSynonymizer()
        log.debug(
            f"Sending NodeSynonymizer.get_equivalent_nodes() a list of {len(curies)} curies"
        )
        equivalent_curies_dict = synonymizer.get_equivalent_nodes(
            curies, kg_name="KG2")
        log.debug("Got response back from NodeSynonymizer")
    except Exception as e:
        # Bind the exception directly rather than re-deriving it via
        # sys.exc_info(); traceback.format_exc() still captures the full trace.
        tb = traceback.format_exc()
        log.error(f"Encountered a problem using NodeSynonymizer: {tb}",
                  error_code=type(e).__name__)
        return []
    else:
        if equivalent_curies_dict is not None:
            # Warn about any curies the synonymizer had no synonyms for
            curies_missing_info = {
                curie
                for curie in equivalent_curies_dict
                if not equivalent_curies_dict.get(curie)
            }
            if curies_missing_info:
                log.warning(
                    f"NodeSynonymizer did not find any equivalent curies for: {curies_missing_info}"
                )
            equivalent_curies = {
                curie
                for curie_dict in equivalent_curies_dict.values() if curie_dict
                for curie in curie_dict
            }
            # Make sure even curies without synonyms are included
            all_curies = equivalent_curies.union(curies)
            return sorted(all_curies)
        else:
            log.error("NodeSynonymizer returned None",
                      error_code="NodeNormalizationIssue")
            return []
示例#5
0
def get_curie_synonyms_dict(
    curie: Union[str, List[str]], log: Optional[ARAXResponse] = None
) -> Dict[str, List[str]]:
    """Map each input curie to a list of its equivalent curies.

    Curies the NodeSynonymizer doesn't recognize map to a list containing
    just themselves. Returns {} (after logging an error) if the synonymizer
    raises or returns None.
    """
    # Don't default to ARAXResponse() in the signature: a default argument is
    # evaluated once at import time, so every call without an explicit log
    # would share (and mutate) the same response object.
    if log is None:
        log = ARAXResponse()
    curies = convert_to_list(curie)
    try:
        synonymizer = NodeSynonymizer()
        log.debug(
            f"Sending NodeSynonymizer.get_equivalent_nodes() a list of {len(curies)} curies"
        )
        equivalent_curies_dict = synonymizer.get_equivalent_nodes(curies)
        log.debug("Got response back from NodeSynonymizer")
    except Exception as e:
        # Bind the exception directly rather than re-deriving it via sys.exc_info()
        tb = traceback.format_exc()
        log.error(f"Encountered a problem using NodeSynonymizer: {tb}",
                  error_code=type(e).__name__)
        return dict()
    else:
        if equivalent_curies_dict is not None:
            # Warn about any curies the synonymizer had no synonyms for
            curies_missing_info = {
                curie
                for curie in equivalent_curies_dict
                if not equivalent_curies_dict.get(curie)
            }
            if curies_missing_info:
                log.warning(
                    f"NodeSynonymizer did not find any equivalent curies for: {curies_missing_info}"
                )
            final_curie_dict = dict()
            for input_curie in curies:
                curie_dict = equivalent_curies_dict.get(input_curie)
                # Fall back to the input curie itself when no synonyms were found
                final_curie_dict[input_curie] = list(
                    curie_dict) if curie_dict else [input_curie]
            return final_curie_dict
        else:
            log.error("NodeSynonymizer returned None",
                      error_code="NodeNormalizationIssue")
            return dict()
示例#6
0
class PredictDrugTreatsDisease:
    """Overlay that decorates drug-disease edges with a machine-learned
    'probability_treats' attribute.

    Loads a pre-trained model (LogModel.pkl) and its graph database
    (GRAPH.sqlite), fetching them via scp from a hard-coded host if they are
    not already present locally, and uses the NodeSynonymizer to map KG
    curies onto the curies the model was trained on.
    """

    #### Constructor
    def __init__(self, response, message, parameters):
        self.response = response
        self.message = message
        self.parameters = parameters
        # Counter used to give each added virtual edge a unique id
        self.global_iter = 0
        ## check if the new model files exists in /predictor/retrain_data. If not, scp it from arax.rtx.ai
        pathlist = os.path.realpath(__file__).split(os.path.sep)
        RTXindex = pathlist.index("RTX")
        filepath = os.path.sep.join([*pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'ARAXQuery', 'Overlay', 'predictor','retrain_data'])

        ## check if there is LogModel.pkl
        pkl_file = f"{filepath}/LogModel.pkl"
        if os.path.exists(pkl_file):
            pass
        else:
            # NOTE(review): shells out to scp against a hard-coded host; fails
            # silently (no exit-code check) if ssh access is unavailable
            os.system("scp [email protected]:/home/ubuntu/drug_repurposing_model_retrain/LogModel.pkl " + pkl_file)

        ## check if there is GRAPH.sqlite
        db_file = f"{filepath}/GRAPH.sqlite"
        if os.path.exists(db_file):
            pass
        else:
            # NOTE(review): same silent scp fallback as for LogModel.pkl above
            os.system("scp [email protected]:/home/ubuntu/drug_repurposing_model_retrain/GRAPH.sqlite " + db_file)

        # use NodeSynonymizer to replace map.txt
        # check if there is map.txt
        # map_file = f"{filepath}/map.txt"
        # if os.path.exists(map_file):
        #     pass
        # else:
        #     os.system("scp [email protected]:/home/ubuntu/drug_repurposing_model_retrain/map.txt " + map_file)

        self.pred = predictor(model_file=pkl_file)
        self.pred.import_file(None, graph_database=db_file)
        # with open(map_file, 'r') as infile:
        #     map_file_content = infile.readlines()
        #     map_file_content.pop(0) ## remove title
        #     self.known_curies = set(line.strip().split('\t')[0] for line in map_file_content)

        self.synonymizer = NodeSynonymizer()

    def convert_to_trained_curies(self, input_curie):
        """
        Takes an input curie from the KG, uses the synonymizer, and then returns something that the map.csv can handle
        """
        normalizer_result = self.synonymizer.get_equivalent_nodes(input_curie, kg_name='KG2')
        # May be None when the synonymizer has no entry for this curie; callers
        # check for None before using the result
        curies_in_model = normalizer_result[input_curie]
        # curies_in_model = [curie for curie in curies_in_model if curie in self.known_curies]
        # equivalent_curies = []  # start with empty equivalent_curies
        # try:
        #     equivalent_curies = [x['identifier'] for x in normalizer_result[input_curie]['equivalent_identifiers']]
        # except:
        #     self.response.warning(f"NodeSynonmizer could not find curies for {input_curie}, skipping this one.")
        # for curie in equivalent_curies:
        #     curie_prefix = curie.split(':')[0]
        #     # FIXME: fix this when re-training the ML model, as when this was originally trained, it was ChEMBL:123, not CHEMBL.COMPOUND:CHEMBL123
        #     if curie_prefix == "CHEMBL.COMPOUND":
        #         chembl_fix = 'ChEMBL:' + curie[22:]
        #         if chembl_fix in self.known_curies:
        #             curies_in_model.add(chembl_fix)
        #     elif curie in self.known_curies:
        #         curies_in_model.add(curie)
        return curies_in_model

    def predict_drug_treats_disease(self):
        """
        Iterate over all the edges in the knowledge graph, add the drug-disease treatment probability for appropriate edges
        on the edge_attributes
        :return: response
        """
        parameters = self.parameters
        self.response.debug(f"Computing drug disease treatment probability based on a machine learning model")
        self.response.info(f"Computing drug disease treatment probability based on a machine learning model: See [this publication](https://doi.org/10.1101/765305) for more details about how this is accomplished.")

        attribute_name = "probability_treats"
        attribute_type = "EDAM:data_0951"
        value = 0  # this will be the default value. If the model returns 0, or the default is there, don't include that edge
        url = "https://doi.org/10.1101/765305"

        # if you want to add virtual edges, identify the source/targets, decorate the edges, add them to the KG, and then add one to the QG corresponding to them
        if 'virtual_relation_label' in parameters:
            source_curies_to_decorate = set()
            target_curies_to_decorate = set()
            # identify the nodes that we should be adding virtual edges for
            for node in self.message.knowledge_graph.nodes:
                if hasattr(node, 'qnode_ids'):
                    if parameters['source_qnode_id'] in node.qnode_ids:
                        if "drug" in node.type or "chemical_substance" in node.type:  # this is now NOT checked by ARAX_overlay
                            source_curies_to_decorate.add(node.id)
                    if parameters['target_qnode_id'] in node.qnode_ids:
                        if "disease" in node.type or "phenotypic_feature" in node.type:  # this is now NOT checked by ARAX_overlay
                            target_curies_to_decorate.add(node.id)

            added_flag = False  # check to see if any edges where added
            # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute

            for (source_curie, target_curie) in itertools.product(source_curies_to_decorate, target_curies_to_decorate):
                # create the edge attribute if it can be
                # loop over all equivalent curies and take the highest probability

                max_probability = 0
                converted_source_curie = self.convert_to_trained_curies(source_curie)
                converted_target_curie = self.convert_to_trained_curies(target_curie)
                if converted_source_curie is None or converted_target_curie is None:
                    continue
                res = list(itertools.product(converted_source_curie, converted_target_curie))
                if len(res) != 0:
                    all_probabilities = self.pred.prob_all(res)
                    if isinstance(all_probabilities, list):
                        max_probability = max([value for value in all_probabilities if np.isfinite(value)])

                value = max_probability

                #probability = self.pred.prob_single('ChEMBL:' + source_curie[22:], target_curie)  # FIXME: when this was trained, it was ChEMBL:123, not CHEMBL.COMPOUND:CHEMBL123
                #if probability and np.isfinite(probability):  # finite, that's ok, otherwise, stay with default
                #    value = probability[0]
                edge_attribute = EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)  # populate the edge attribute
                if edge_attribute and value != 0:
                    added_flag = True
                    # make the edge, add the attribute

                    # edge properties
                    now = datetime.now()
                    edge_type = "probably_treats"
                    qedge_ids = [parameters['virtual_relation_label']]
                    relation = parameters['virtual_relation_label']
                    is_defined_by = "ARAX"
                    defined_datetime = now.strftime("%Y-%m-%d %H:%M:%S")
                    provided_by = "ARAX"
                    confidence = None
                    weight = None  # TODO: could make the actual value of the attribute
                    source_id = source_curie
                    target_id = target_curie

                    # now actually add the virtual edges in
                    # NOTE(review): 'id' shadows the builtin of the same name (local scope only)
                    id = f"{relation}_{self.global_iter}"
                    self.global_iter += 1
                    edge = Edge(id=id, type=edge_type, relation=relation, source_id=source_id,
                                target_id=target_id,
                                is_defined_by=is_defined_by, defined_datetime=defined_datetime,
                                provided_by=provided_by,
                                confidence=confidence, weight=weight, edge_attributes=[edge_attribute], qedge_ids=qedge_ids)
                    self.message.knowledge_graph.edges.append(edge)

            # Now add a q_edge the query_graph since I've added an extra edge to the KG
            if added_flag:
                edge_type = "probably_treats"
                relation = parameters['virtual_relation_label']
                qedge_id = parameters['virtual_relation_label']
                q_edge = QEdge(id=relation, type=edge_type, relation=relation,
                               source_id=parameters['source_qnode_id'], target_id=parameters['target_qnode_id'])  # TODO: ok to make the id and type the same thing?
                self.message.query_graph.edges.append(q_edge)
            return self.response

        else:  # you want to add it for each edge in the KG
            # iterate over KG edges, add the information
            try:
                # map curies to types
                curie_to_type = dict()
                for node in self.message.knowledge_graph.nodes:
                    curie_to_type[node.id] = node.type
                # then iterate over the edges and decorate if appropriate
                for edge in self.message.knowledge_graph.edges:
                    # Make sure the edge_attributes are not None
                    if not edge.edge_attributes:
                        edge.edge_attributes = []  # should be an array, but why not a list?
                    # now go and actually get the NGD
                    source_curie = edge.source_id
                    target_curie = edge.target_id
                    source_types = curie_to_type[source_curie]
                    target_types = curie_to_type[target_curie]
                    if (("drug" in source_types) or ("chemical_substance" in source_types)) and (("disease" in target_types) or ("phenotypic_feature" in target_types)):
                        temp_value = 0
                        # loop over all pairs of equivalent curies and take the highest probability

                        max_probability = 0
                        converted_source_curie = self.convert_to_trained_curies(source_curie)
                        converted_target_curie = self.convert_to_trained_curies(target_curie)
                        if converted_source_curie is None or converted_target_curie is None:
                            continue
                        res = list(itertools.product(converted_source_curie, converted_target_curie))
                        if len(res) != 0:
                            all_probabilities = self.pred.prob_all(res)
                            if isinstance(all_probabilities, list):
                                max_probability = max([value for value in all_probabilities if np.isfinite(value)])

                        value = max_probability

                        #probability = self.pred.prob_single('ChEMBL:' + source_curie[22:], target_curie)  # FIXME: when this was trained, it was ChEMBL:123, not CHEMBL.COMPOUND:CHEMBL123
                        #if probability and np.isfinite(probability):  # finite, that's ok, otherwise, stay with default
                        #    value = probability[0]
                    elif (("drug" in target_types) or ("chemical_substance" in target_types)) and (("disease" in source_types) or ("phenotypic_feature" in source_types)):
                        #probability = self.pred.prob_single('ChEMBL:' + target_curie[22:], source_curie)  # FIXME: when this was trained, it was ChEMBL:123, not CHEMBL.COMPOUND:CHEMBL123
                        #if probability and np.isfinite(probability):  # finite, that's ok, otherwise, stay with default
                        #    value = probability[0]

                        # Same as the branch above but with (target, source) order
                        # swapped, since here the drug is the edge's target
                        max_probability = 0
                        converted_source_curie = self.convert_to_trained_curies(source_curie)
                        converted_target_curie = self.convert_to_trained_curies(target_curie)
                        if converted_source_curie is None or converted_target_curie is None:
                            continue
                        res = list(itertools.product(converted_target_curie, converted_source_curie))
                        if len(res) != 0:
                            all_probabilities = self.pred.prob_all(res)
                            if isinstance(all_probabilities, list):
                                max_probability = max([value for value in all_probabilities if np.isfinite(value)])

                        value = max_probability

                    else:
                        continue
                    if value != 0:
                        edge_attribute = EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)  # populate the attribute
                        edge.edge_attributes.append(edge_attribute)  # append it to the list of attributes
            except:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Something went wrong adding the drug disease treatment probability")
            else:
                self.response.info(f"Drug disease treatment probability successfully added to edges")

            return self.response
示例#7
0
def _canonicalize_nodes(
    nodes: List[Dict[str,
                     any]]) -> Tuple[List[Dict[str, any]], Dict[str, str]]:
    """Canonicalize nodes for a KG2C build (list-returning variant).

    Merges nodes that share a canonical curie, appends a KG2C build-info
    node, decorates each canonical node with its equivalent curies, and
    converts array fields into the format neo4j expects.

    Returns (list of canonicalized nodes, curie_map of original node id ->
    canonical curie).

    NOTE(review): this def is immediately followed by another
    _canonicalize_nodes definition, which will shadow this one if both are
    loaded into the same module.
    """
    synonymizer = NodeSynonymizer()
    node_ids = [node.get('id') for node in nodes if node.get('id')]
    print(
        f"  Sending NodeSynonymizer.get_canonical_curies() a list of {len(node_ids)} curies.."
    )
    canonicalized_info = synonymizer.get_canonical_curies(
        curies=node_ids, return_all_types=True)
    print(f"  Creating canonicalized nodes..")
    curie_map = dict()
    canonicalized_nodes = dict()
    for node in nodes:
        canonical_info = canonicalized_info.get(node['id'])
        canonicalized_curie = canonical_info.get(
            'preferred_curie', node['id']) if canonical_info else node['id']
        node['publications'] = _literal_eval_list(
            node['publications']
        )  # Only need to do this until kg2.2+ is rolled out
        if canonicalized_curie in canonicalized_nodes:
            existing_canonical_node = canonicalized_nodes[canonicalized_curie]
            existing_canonical_node['publications'] = _merge_two_lists(
                existing_canonical_node['publications'], node['publications'])
        else:
            if canonical_info:
                canonicalized_node = {
                    'id':
                    canonicalized_curie,
                    'name':
                    canonical_info.get('preferred_name', node['name']),
                    # NOTE(review): list(canonical_info.get('all_types')) raises
                    # TypeError if 'all_types' is absent (get() returns None) — confirm
                    # the synonymizer always populates it
                    'types':
                    list(canonical_info.get('all_types')),
                    'preferred_type':
                    canonical_info.get('preferred_type',
                                       node['category_label']),
                    'publications':
                    node['publications']
                }
            else:
                canonicalized_node = {
                    'id': canonicalized_curie,
                    'name': node['name'],
                    'types': [node['category_label']],
                    'preferred_type': node['category_label'],
                    'publications': node['publications']
                }
            canonicalized_nodes[canonicalized_node['id']] = canonicalized_node
        curie_map[node[
            'id']] = canonicalized_curie  # Record this mapping for easy lookup later

    # Create a node containing information about this KG2C build
    new_build_node = {
        'id': 'RTX:KG2C',
        'name':
        f"KG2C:Build created on {datetime.now().strftime('%Y-%m-%d %H:%M')}",
        'types': ['data_file'],
        'preferred_type': 'data_file',
        'publications': []
    }
    canonicalized_nodes[new_build_node['id']] = new_build_node

    # Decorate nodes with equivalent curies
    print(
        f"  Sending NodeSynonymizer.get_equivalent_nodes() a list of {len(canonicalized_nodes)} curies.."
    )
    equivalent_curies_dict = synonymizer.get_equivalent_nodes(
        list(canonicalized_nodes.keys()))
    for curie, canonical_node in canonicalized_nodes.items():
        equivalent_curies = []
        equivalent_curies_dict_for_curie = equivalent_curies_dict.get(curie)
        if equivalent_curies_dict_for_curie is not None:
            for equivalent_curie in equivalent_curies_dict_for_curie:
                equivalent_curies.append(equivalent_curie)
        canonical_node['equivalent_curies'] = equivalent_curies

    # Convert array fields into the format neo4j wants and do final processing
    for canonicalized_node in canonicalized_nodes.values():
        canonicalized_node['types'] = _convert_list_to_neo4j_format(
            canonicalized_node['types'])
        canonicalized_node['publications'] = _convert_list_to_neo4j_format(
            canonicalized_node['publications'])
        canonicalized_node[
            'equivalent_curies'] = _convert_list_to_neo4j_format(
                canonicalized_node['equivalent_curies'])
        canonicalized_node[
            'preferred_type_for_conversion'] = canonicalized_node[
                'preferred_type']
    return list(canonicalized_nodes.values()), curie_map
def _canonicalize_nodes(
    neo4j_nodes: List[Dict[str, any]]
) -> Tuple[Dict[str, Dict[str, any]], Dict[str, str]]:
    """Collapse neo4j nodes into canonical nodes keyed by preferred curie.

    Looks up every node's canonical curie and equivalent curies via the
    NodeSynonymizer, merges nodes that share the same canonical curie, and
    warns about categories not in 'biolink:PascalCase' format.

    Returns (canonicalized_nodes keyed by canonical curie, curie_map of
    original node id -> canonical curie).
    """
    synonymizer = NodeSynonymizer()
    node_ids = [node.get('id') for node in neo4j_nodes if node.get('id')]
    print(
        f"  Sending NodeSynonymizer.get_canonical_curies() {len(node_ids)} curies.."
    )
    canonicalized_info = synonymizer.get_canonical_curies(
        curies=node_ids, return_all_categories=True)
    all_canonical_curies = {
        canonical_info['preferred_curie']
        for canonical_info in canonicalized_info.values() if canonical_info
    }
    print(
        f"  Sending NodeSynonymizer.get_equivalent_nodes() {len(all_canonical_curies)} curies.."
    )
    equivalent_curies_info = synonymizer.get_equivalent_nodes(
        all_canonical_curies)
    # Only keep curies the synonymizer actually returned synonyms for
    recognized_curies = {
        curie
        for curie in equivalent_curies_info
        if equivalent_curies_info.get(curie)
    }
    equivalent_curies_dict = {
        curie: list(equivalent_curies_info.get(curie))
        for curie in recognized_curies
    }
    print(f"  Creating canonicalized nodes..")
    curie_map = dict()
    canonicalized_nodes = dict()
    for neo4j_node in neo4j_nodes:
        # Grab relevant info for this node and its canonical version
        canonical_info = canonicalized_info.get(neo4j_node['id'])
        canonicalized_curie = canonical_info.get(
            'preferred_curie',
            neo4j_node['id']) if canonical_info else neo4j_node['id']
        publications = neo4j_node['publications'] if neo4j_node.get(
            'publications') else []
        descriptions_list = [neo4j_node['description']
                             ] if neo4j_node.get('description') else []
        if canonicalized_curie in canonicalized_nodes:
            # Merge this node into its corresponding canonical node
            existing_canonical_node = canonicalized_nodes[canonicalized_curie]
            existing_canonical_node['publications'] = _merge_two_lists(
                existing_canonical_node['publications'], publications)
            existing_canonical_node['all_names'] = _merge_two_lists(
                existing_canonical_node['all_names'], [neo4j_node['name']])
            existing_canonical_node['descriptions_list'] = _merge_two_lists(
                existing_canonical_node['descriptions_list'],
                descriptions_list)
            # Make sure any nodes subject to #1074-like problems still appear in equivalent curies
            existing_canonical_node['equivalent_curies'] = _merge_two_lists(
                existing_canonical_node['equivalent_curies'],
                [neo4j_node['id']])
            # Add the IRI and description for the 'preferred' curie, if we've found that node
            if neo4j_node['id'] == canonicalized_curie:
                existing_canonical_node['iri'] = neo4j_node.get('iri')
                existing_canonical_node['description'] = neo4j_node.get(
                    'description')
        else:
            # Initiate the canonical node for this synonym group
            name = canonical_info[
                'preferred_name'] if canonical_info else neo4j_node['name']
            category = canonical_info[
                'preferred_category'] if canonical_info else neo4j_node[
                    'category']
            if not category.startswith("biolink:"):
                print(
                    f"  WARNING: Preferred category for {canonicalized_curie} doesn't start with 'biolink:': {category}"
                )
            all_categories = list(
                canonical_info['all_categories']) if canonical_info else [
                    neo4j_node['category']
                ]
            expanded_categories = list(
                canonical_info['expanded_categories']) if canonical_info else [
                    neo4j_node['category']
                ]
            # NOTE(review): direct ['iri'] access here vs. .get('iri') in the merge
            # branch above — raises KeyError for a preferred node lacking 'iri'
            iri = neo4j_node['iri'] if neo4j_node[
                'id'] == canonicalized_curie else None
            description = neo4j_node.get(
                'description'
            ) if neo4j_node['id'] == canonicalized_curie else None
            all_names = [neo4j_node['name']]

            # Check for bug where not all categories in synonymizer were of "biolink:PascalCase" format
            if not all(
                    category.startswith("biolink:")
                    for category in all_categories):
                print(
                    f" WARNING: all_categories for {canonicalized_curie} contain non 'biolink:PascalCase' "
                    f"items: {all_categories}")
            if not all(
                    category.startswith("biolink:")
                    for category in expanded_categories):
                print(
                    f" WARNING: expanded_categories for {canonicalized_curie} contain non 'biolink:PascalCase' "
                    f"items: {expanded_categories}")

            canonicalized_node = _create_node(
                preferred_curie=canonicalized_curie,
                name=name,
                category=category,
                all_categories=all_categories,
                expanded_categories=expanded_categories,
                publications=publications,
                equivalent_curies=equivalent_curies_dict.get(
                    canonicalized_curie, [canonicalized_curie]),
                iri=iri,
                description=description,
                descriptions_list=descriptions_list,
                all_names=all_names)
            canonicalized_nodes[canonicalized_node['id']] = canonicalized_node
        curie_map[neo4j_node[
            'id']] = canonicalized_curie  # Record this mapping for easy lookup later
    return canonicalized_nodes, curie_map
示例#9
0
def _canonicalize_nodes(
    nodes: List[Dict[str, any]]
) -> Tuple[Dict[str, Dict[str, any]], Dict[str, str]]:
    """Collapse nodes into canonical nodes keyed by preferred curie.

    Looks up each node's canonical curie and equivalent curies via the
    NodeSynonymizer and merges nodes that share a canonical curie.

    Returns (canonicalized_nodes keyed by canonical curie, curie_map of
    original node id -> canonical curie). In this variant the 'description'
    field of a canonical node holds a *list* of descriptions.
    """
    synonymizer = NodeSynonymizer()
    node_ids = [node.get('id') for node in nodes if node.get('id')]
    print(
        f"  Sending NodeSynonymizer.get_canonical_curies() {len(node_ids)} curies.."
    )
    canonicalized_info = synonymizer.get_canonical_curies(
        curies=node_ids, return_all_types=True)
    all_canonical_curies = {
        canonical_info['preferred_curie']
        for canonical_info in canonicalized_info.values() if canonical_info
    }
    print(
        f"  Sending NodeSynonymizer.get_equivalent_nodes() {len(all_canonical_curies)} curies.."
    )
    equivalent_curies_info = synonymizer.get_equivalent_nodes(
        all_canonical_curies)
    # Only keep curies the synonymizer actually returned synonyms for
    recognized_curies = {
        curie
        for curie in equivalent_curies_info
        if equivalent_curies_info.get(curie)
    }
    equivalent_curies_dict = {
        curie: list(equivalent_curies_info.get(curie))
        for curie in recognized_curies
    }
    print(f"  Creating canonicalized nodes..")
    curie_map = dict()
    canonicalized_nodes = dict()
    for node in nodes:
        canonical_info = canonicalized_info.get(node['id'])
        canonicalized_curie = canonical_info.get(
            'preferred_curie', node['id']) if canonical_info else node['id']
        publications = node['publications'] if node.get('publications') else []
        description_in_list = [node['description']
                               ] if node.get('description') else []
        if canonicalized_curie in canonicalized_nodes:
            # Merge this node into its corresponding canonical node
            existing_canonical_node = canonicalized_nodes[canonicalized_curie]
            existing_canonical_node['publications'] = _merge_two_lists(
                existing_canonical_node['publications'], publications)
            existing_canonical_node['all_names'] = _merge_two_lists(
                existing_canonical_node['all_names'], [node['name']])
            existing_canonical_node['description'] = _merge_two_lists(
                existing_canonical_node['description'], description_in_list)
            # Add the IRI for the 'preferred' curie, if we've found that node
            if node['id'] == canonicalized_curie:
                existing_canonical_node['iri'] = node.get('iri')
        else:
            # Initiate the canonical node for this synonym group
            name = canonical_info[
                'preferred_name'] if canonical_info else node['name']
            preferred_type = canonical_info[
                'preferred_type'] if canonical_info else node['category_label']
            types = list(canonical_info['all_types']) if canonical_info else [
                node['category_label']
            ]
            # NOTE(review): direct ['iri'] access here vs. .get('iri') in the merge
            # branch above — raises KeyError for a preferred node lacking 'iri'
            iri = node['iri'] if node['id'] == canonicalized_curie else None
            all_names = [node['name']]
            canonicalized_node = _create_node(
                node_id=canonicalized_curie,
                name=name,
                preferred_type=preferred_type,
                types=types,
                publications=publications,
                equivalent_curies=equivalent_curies_dict.get(
                    canonicalized_curie, []),
                iri=iri,
                description=description_in_list,
                all_names=all_names)

            canonicalized_nodes[canonicalized_node['id']] = canonicalized_node
        curie_map[node[
            'id']] = canonicalized_curie  # Record this mapping for easy lookup later
    return canonicalized_nodes, curie_map