Example #1
def get_canonical_curies_dict(curie: Union[str, List[str]],
                              log: ARAXResponse) -> Dict[str, Dict[str, str]]:
    curies = convert_string_or_list_to_list(curie)
    try:
        synonymizer = NodeSynonymizer()
        log.debug(
            f"Sending NodeSynonymizer.get_canonical_curies() a list of {len(curies)} curies"
        )
        canonical_curies_dict = synonymizer.get_canonical_curies(curies)
        log.debug(f"Got response back from NodeSynonymizer")
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        log.error(f"Encountered a problem using NodeSynonymizer: {tb}",
                  error_code=error_type.__name__)
        return {}
    else:
        if canonical_curies_dict is not None:
            unrecognized_curies = {
                input_curie
                for input_curie in canonical_curies_dict
                if not canonical_curies_dict.get(input_curie)
            }
            if unrecognized_curies:
                log.warning(
                    f"NodeSynonymizer did not return canonical info for: {unrecognized_curies}"
                )
            return canonical_curies_dict
        else:
            log.error(f"NodeSynonymizer returned None",
                      error_code="NodeNormalizationIssue")
            return {}
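A note on the result shape: every example in this collection indexes the NodeSynonymizer result the same way; a minimal sketch of the shape they assume (illustrative values, not actual output):

canonical_curies_dict = {
    "DOID:14330": {                       # recognized input curie
        "preferred_curie": "MONDO:0005180",
        "preferred_name": "Parkinson disease",
        "preferred_category": "biolink:Disease",
    },
    "FAKE:123": None,                     # unrecognized input curie maps to None
}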
Example #2
def get_canonical_curies_list(curie: Union[str, List[str]], log: ARAXResponse) -> List[str]:
    curies = convert_to_list(curie)
    try:
        synonymizer = NodeSynonymizer()
        log.debug(f"Sending NodeSynonymizer.get_canonical_curies() a list of {len(curies)} curies")
        canonical_curies_dict = synonymizer.get_canonical_curies(curies)
        log.debug(f"Got response back from NodeSynonymizer")
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        log.error(f"Encountered a problem using NodeSynonymizer: {tb}", error_code=error_type.__name__)
        return []
    else:
        if canonical_curies_dict is not None:
            recognized_input_curies = {input_curie for input_curie in canonical_curies_dict if canonical_curies_dict.get(input_curie)}
            unrecognized_curies = set(curies).difference(recognized_input_curies)
            if unrecognized_curies:
                log.warning(f"NodeSynonymizer did not return canonical info for: {unrecognized_curies}")
            canonical_curies = {canonical_curies_dict[recognized_curie].get('preferred_curie') for recognized_curie in recognized_input_curies}
            # Include any original curies we weren't able to find a canonical version for
            canonical_curies.update(unrecognized_curies)
            if not canonical_curies:
                log.error(f"Final list of canonical curies is empty. This shouldn't happen!", error_code="CanonicalCurieIssue")
            return list(canonical_curies)
        else:
            log.error(f"NodeSynonymizer returned None", error_code="NodeNormalizationIssue")
            return []
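Unlike Example #1, this variant flattens the result to a list of preferred curies, passing unrecognized inputs through unchanged. A hypothetical call sketch, assuming an ARAXResponse instance is available for logging:

log = ARAXResponse()
preferred_curies = get_canonical_curies_list(["DOID:14330", "FAKE:123"], log)
# -> e.g. ["MONDO:0005180", "FAKE:123"]; unrecognized curies are kept as-is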
Example #3
def get_preferred_categories(curie: Union[str, List[str]],
                             log: ARAXResponse) -> Optional[List[str]]:
    curies = convert_to_list(curie)
    synonymizer = NodeSynonymizer()
    log.debug(
        f"Sending NodeSynonymizer.get_canonical_curies() a list of {len(curies)} curies"
    )
    canonical_curies_dict = synonymizer.get_canonical_curies(curies)
    log.debug(f"Got response back from NodeSynonymizer")
    if canonical_curies_dict is not None:
        recognized_input_curies = {
            input_curie
            for input_curie in canonical_curies_dict
            if canonical_curies_dict.get(input_curie)
        }
        unrecognized_curies = set(curies).difference(recognized_input_curies)
        if unrecognized_curies:
            log.warning(
                f"NodeSynonymizer did not recognize: {unrecognized_curies}")
        preferred_categories = {
            canonical_curies_dict[recognized_curie].get('preferred_category')
            for recognized_curie in recognized_input_curies
        }
        preferred_categories.discard(None)  # Don't let entries lacking a preferred category yield a None category
        if preferred_categories:
            return list(preferred_categories)
        else:
            log.warning(
                "Unable to find any preferred categories; will default to biolink:NamedThing"
            )
            return ["biolink:NamedThing"]
    else:
        log.error(f"NodeSynonymizer returned None",
                  error_code="NodeNormalizationIssue")
        return []
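A hypothetical call sketch for the category lookup above (return value illustrative):

log = ARAXResponse()
categories = get_preferred_categories("DOID:14330", log)
# -> e.g. ["biolink:Disease"]; falls back to ["biolink:NamedThing"] when no category is found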
Example #4
def estimate_percent_nodes_covered_by_backup_method(kg: str):
    print(
        f"Estimating the percent of {kg} nodes mappable by the 'backup' NGD method (uses eUtils)"
    )
    backup_ngd = NormGoogleDistance()
    synonymizer = NodeSynonymizer()
    percentages_mapped = []
    num_batches = 10
    batch_size = 10
    for number in range(num_batches):
        print(f"  Batch {number + 1}")
        # Get random selection of nodes from the KG
        query = f"match (a) return a.id, a.name, rand() as r order by r limit {batch_size}"
        results = _run_cypher_query(query, kg)
        canonical_curie_info = synonymizer.get_canonical_curies(
            [result['a.id'] for result in results])
        recognized_curies = {
            input_curie
            for input_curie in canonical_curie_info
            if canonical_curie_info.get(input_curie)
        }

        # Use the back-up NGD method to try to grab PMIDs for each
        num_with_pmids = 0
        for curie in recognized_curies:
            # Try to map this to a MESH term using the backup method (the chokepoint)
            node_id = canonical_curie_info[curie].get('preferred_curie')
            node_name = canonical_curie_info[curie].get('preferred_name')
            node_type = canonical_curie_info[curie].get('preferred_type')
            try:
                pmids = backup_ngd.get_pmids_for_all([node_id], [node_name])
            except Exception:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                print(f"ERROR using back-up method: {tb}")
            else:
                if pmids and any(pmid_list for pmid_list in pmids):
                    num_with_pmids += 1
                    print(f"    Found {len(pmids[0])} PMIDs for {node_id}, {node_name}.")
                else:
                    print(f"    Not found. ({node_id}, {node_name})")
        if recognized_curies:
            percentage_with_pmids = (num_with_pmids / len(recognized_curies)) * 100
            print(f"    {percentage_with_pmids}% of nodes were mapped to PMIDs using backup method.")
            percentages_mapped.append(percentage_with_pmids)
        else:
            print("    The synonymizer recognized no curies in this batch; skipping it.")

    print(f"  Percentages for all batches: {percentages_mapped}.")
    average = sum(percentages_mapped) / len(percentages_mapped)
    print(
        f"Final estimate of backup method's coverage of {kg} nodes: {round(average)}%"
    )
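The _run_cypher_query helper used above is not shown in this excerpt; a hypothetical sketch of what it might look like, assuming a Neo4j instance for the given KG and the official neo4j driver (the URI and credentials are placeholders):

from neo4j import GraphDatabase

def _run_cypher_query(cypher_query, kg):  # hypothetical sketch, not the repo's implementation
    # The kg argument would select the endpoint for that KG; elided in this sketch
    driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "password"))
    with driver.session() as session:
        # Each record becomes a dict keyed by the returned columns, e.g. 'a.id' and 'a.name'
        return [dict(record) for record in session.run(cypher_query)]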
Example #5
def estimate_percent_nodes_covered_by_ultrafast_ngd(kg: str):
    print(
        f"Estimating the percent of {kg} nodes covered by the local NGD system.."
    )
    curie_to_pmid_db = SqliteDict("./curie_to_pmids.sqlite")
    percentages_mapped = []
    num_batches = 20
    batch_size = 4000
    all_nodes_mapped_by_type = dict()
    for number in range(num_batches):
        # Get random selection of node IDs from the KG
        random_node_ids = _get_random_node_ids(batch_size, kg)

        # Use synonymizer to get their canonicalized info
        synonymizer = NodeSynonymizer()
        canonical_curie_info = synonymizer.get_canonical_curies(
            list(random_node_ids))
        recognized_curies = {
            input_curie
            for input_curie in canonical_curie_info
            if canonical_curie_info.get(input_curie)
        }

        # See if those canonical curies are in our local database
        num_mapped_to_pmids = 0
        for input_curie in recognized_curies:
            canonical_curie = canonical_curie_info[input_curie].get(
                'preferred_curie')
            preferred_type = canonical_curie_info[input_curie].get(
                'preferred_type')
            if preferred_type not in all_nodes_mapped_by_type:
                all_nodes_mapped_by_type[preferred_type] = {
                    'covered': 0,
                    'not_covered': 0
                }
            if canonical_curie and canonical_curie in curie_to_pmid_db:
                num_mapped_to_pmids += 1
                all_nodes_mapped_by_type[preferred_type]['covered'] += 1
            else:
                all_nodes_mapped_by_type[preferred_type]['not_covered'] += 1
        percentage_mapped = (num_mapped_to_pmids / len(random_node_ids)) * 100
        percentages_mapped.append(percentage_mapped)

    average = sum(percentages_mapped) / len(percentages_mapped)
    print(f"Estimated coverage of {kg} nodes: {round(average)}%.")
    node_type_percentages_dict = dict()
    for node_type, coverage_info in all_nodes_mapped_by_type.items():
        num_covered = coverage_info['covered']
        num_total = coverage_info['covered'] + coverage_info['not_covered']
        percentage = round((num_covered / num_total) * 100)
        node_type_percentages_dict[node_type] = percentage
    for node_type, percentage in sorted(node_type_percentages_dict.items(),
                                        key=lambda item: item[1],
                                        reverse=True):
        print(f"  {node_type}: {percentage}%")
Example #6
def _canonicalize_nodes(kg2pre_nodes: List[Dict[str, Any]]) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, str]]:
    logging.info(f"Canonicalizing nodes..")
    synonymizer = NodeSynonymizer()
    node_ids = [node.get('id') for node in kg2pre_nodes if node.get('id')]
    logging.info(f"  Sending NodeSynonymizer.get_canonical_curies() {len(node_ids)} curies..")
    canonicalized_info = synonymizer.get_canonical_curies(curies=node_ids, return_all_categories=True)
    all_canonical_curies = {canonical_info['preferred_curie'] for canonical_info in canonicalized_info.values() if canonical_info}
    logging.info(f"  Sending NodeSynonymizer.get_equivalent_nodes() {len(all_canonical_curies)} curies..")
    equivalent_curies_info = synonymizer.get_equivalent_nodes(all_canonical_curies)
    recognized_curies = {curie for curie in equivalent_curies_info if equivalent_curies_info.get(curie)}
    equivalent_curies_dict = {curie: list(equivalent_curies_info.get(curie)) for curie in recognized_curies}
    with open(f"{KG2C_DIR}/equivalent_curies.pickle", "wb") as equiv_curies_dump:  # Save these for use by downstream script
        pickle.dump(equivalent_curies_dict, equiv_curies_dump, protocol=pickle.HIGHEST_PROTOCOL)
    logging.info(f"  Creating canonicalized nodes..")
    curie_map = dict()
    canonicalized_nodes = dict()
    for kg2pre_node in kg2pre_nodes:
        # Grab relevant info for this node and its canonical version
        canonical_info = canonicalized_info.get(kg2pre_node['id'])
        canonicalized_curie = canonical_info.get('preferred_curie', kg2pre_node['id']) if canonical_info else kg2pre_node['id']
        publications = kg2pre_node['publications'] if kg2pre_node.get('publications') else []
        descriptions_list = [kg2pre_node['description']] if kg2pre_node.get('description') else []
        if canonicalized_curie in canonicalized_nodes:
            # Merge this node into its corresponding canonical node
            existing_canonical_node = canonicalized_nodes[canonicalized_curie]
            existing_canonical_node['publications'] = _merge_two_lists(existing_canonical_node['publications'], publications)
            existing_canonical_node['all_names'] = _merge_two_lists(existing_canonical_node['all_names'], [kg2pre_node['name']])
            existing_canonical_node['descriptions_list'] = _merge_two_lists(existing_canonical_node['descriptions_list'], descriptions_list)
            # Make sure any nodes subject to #1074-like problems still appear in equivalent curies
            existing_canonical_node['equivalent_curies'] = _merge_two_lists(existing_canonical_node['equivalent_curies'], [kg2pre_node['id']])
            # Add the IRI for the 'preferred' curie, if we've found that node
            if kg2pre_node['id'] == canonicalized_curie:
                existing_canonical_node['iri'] = kg2pre_node.get('iri')
        else:
            # Initiate the canonical node for this synonym group
            name = canonical_info['preferred_name'] if canonical_info else kg2pre_node['name']
            category = canonical_info['preferred_category'] if canonical_info else kg2pre_node['category']
            all_categories = list(canonical_info['all_categories']) if canonical_info else [kg2pre_node['category']]
            iri = kg2pre_node['iri'] if kg2pre_node['id'] == canonicalized_curie else None
            all_names = [kg2pre_node['name']]
            canonicalized_node = _create_node(preferred_curie=canonicalized_curie,
                                              name=name,
                                              category=category,
                                              all_categories=all_categories,
                                              publications=publications,
                                              equivalent_curies=equivalent_curies_dict.get(canonicalized_curie, [canonicalized_curie]),
                                              iri=iri,
                                              description=None,
                                              descriptions_list=descriptions_list,
                                              all_names=all_names)
            canonicalized_nodes[canonicalized_node['id']] = canonicalized_node
        curie_map[kg2pre_node['id']] = canonicalized_curie  # Record this mapping for easy lookup later
    logging.info(f"Number of KG2pre nodes was reduced to {len(canonicalized_nodes)} "
                 f"({round((len(canonicalized_nodes) / len(kg2pre_nodes)) * 100)}%)")
    return canonicalized_nodes, curie_map
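The equivalent-curies pickle written above is meant for a downstream script; a minimal sketch of reloading it (KG2C_DIR as in the snippet):

import pickle

with open(f"{KG2C_DIR}/equivalent_curies.pickle", "rb") as equiv_curies_file:
    equivalent_curies_dict = pickle.load(equiv_curies_file)
# Maps each canonical curie to the list of curies in its synonym group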
Example #7
    def _get_canonical_curies_map(self, curies):
        self.response.debug("Canonicalizing curies of relevant nodes using NodeSynonymizer")
        synonymizer = NodeSynonymizer()
        try:
            canonicalized_node_info = synonymizer.get_canonical_curies(curies)
        except Exception:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(f"Encountered a problem using NodeSynonymizer: {tb}", error_code=error_type.__name__)
            return {}
        else:
            canonical_curies_map = dict()
            for input_curie, node_info in canonicalized_node_info.items():
                if node_info:
                    canonical_curies_map[input_curie] = node_info.get('preferred_curie', input_curie)
                else:
                    canonical_curies_map[input_curie] = input_curie
            return canonical_curies_map
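Because unrecognized curies map to themselves, callers can use the returned map for lookups without guarding against missing keys. A hypothetical usage sketch (querier stands in for whatever object defines the method):

canonical_map = querier._get_canonical_curies_map(["DOID:14330", "FAKE:123"])
rewritten = [canonical_map[curie] for curie in ["DOID:14330", "FAKE:123"]]
# -> e.g. ["MONDO:0005180", "FAKE:123"]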
Example #8
def get_entity_by_string(search_string):  # noqa: E501
    """Obtain the CURIE and type of some entity by name

     # noqa: E501

    :param search_string: Some string to search by (name, abbreviation, CURIE, etc.)
    :type search_string: str

    :rtype: List[object]
    """
    synonymizer = NodeSynonymizer()
    result = synonymizer.get_canonical_curies(curies=search_string,
                                              names=search_string)
    response = {}
    if result.get(search_string) is not None:
        response = {
            'curie': result[search_string]['preferred_curie'],
            'name': result[search_string]['preferred_name'],
            'type': result[search_string]['preferred_type']
        }
    return response
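This is a generated API controller (note the swagger-style noqa markers); the lookup passes the search string as both a curie and a name. An illustrative call (values hypothetical):

get_entity_by_string("ibuprofen")
# -> e.g. {'curie': 'CHEMBL.COMPOUND:CHEMBL521', 'name': 'ibuprofen', 'type': 'chemical_substance'}
# An empty dict is returned when the lookup finds nothing.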
Example #9
class CHPQuerier:
    def __init__(self, response_object: ARAXResponse):
        self.response = response_object
        self.synonymizer = NodeSynonymizer()
        self.kp_name = "CHP"
        # Instantiate a client
        self.client = get_client()

    def answer_one_hop_query(
            self, query_graph: QueryGraph) -> QGOrganizedKnowledgeGraph:
        """
        This function answers a one-hop (single-edge) query using CHP client.
        :param query_graph: A TRAPI query graph.
        :return: An (almost) TRAPI knowledge graph containing all of the nodes and edges returned as
                results for the query. (Organized by QG IDs.)
        """
        # Set up the required parameters
        log = self.response
        self.CHP_survival_threshold = float(
            self.response.data['parameters']['CHP_survival_threshold'])
        allowable_curies = self.client.curies()
        self.allowable_gene_curies = list(
            allowable_curies['biolink:Gene'].keys())
        self.allowable_drug_curies = [
            curie_id.replace('CHEMBL:', 'CHEMBL.COMPOUND:')
            for curie_id in list(allowable_curies['biolink:Drug'].keys())
        ]
        final_kg = self._answer_query_using_CHP_client(query_graph, log)

        return final_kg

    def _answer_query_using_CHP_client(
            self, query_graph: QueryGraph,
            log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
        qedge_key = next(qedge_key for qedge_key in query_graph.edges)
        log.debug(
            f"Processing query results for edge {qedge_key} by using CHP client"
        )
        final_kg = QGOrganizedKnowledgeGraph()
        gene_label_list = ['gene']
        drug_label_list = ['drug', 'chemicalsubstance']
        # used for the requirement checks below
        source_pass_nodes = None
        source_category = None
        target_pass_nodes = None
        target_category = None

        qedge = query_graph.edges[qedge_key]
        source_qnode_key = qedge.subject
        target_qnode_key = qedge.object
        source_qnode = query_graph.nodes[source_qnode_key]
        target_qnode = query_graph.nodes[target_qnode_key]

        # check if both ends of edge have no curie
        if (source_qnode.id is None) and (target_qnode.id is None):
            log.error(f"Both ends of edge {qedge_key} are None",
                      error_code="BadEdge")
            return final_kg

        # check whether the query nodes are drugs or genes
        if source_qnode.id is not None:

            if type(source_qnode.id) is str:
                source_pass_nodes = [source_qnode.id]
            else:
                source_pass_nodes = source_qnode.id
            has_error, pass_nodes, not_pass_nodes = self._check_id(
                source_qnode.id, log)
            if has_error:
                return final_kg
            else:
                if len(not_pass_nodes) == 0 and len(pass_nodes) != 0:
                    source_pass_nodes = pass_nodes
                elif len(not_pass_nodes) != 0 and len(pass_nodes) != 0:
                    source_pass_nodes = pass_nodes
                    if len(not_pass_nodes) == 1:
                        log.warning(
                            f"The curie {not_pass_nodes[0]} is not allowed by the CHP client"
                        )
                    else:
                        log.warning(
                            f"The curies {not_pass_nodes} are not allowed by the CHP client"
                        )
                else:
                    if type(source_qnode.id) is str:
                        log.error(
                            f"The curie {source_qnode.id} is not allowed by the CHP client",
                            error_code="NotAllowable")
                        return final_kg
                    else:
                        log.error(
                            f"The curies {source_qnode.id} are not allowed by the CHP client",
                            error_code="NotAllowable")
                        return final_kg
        else:
            category = source_qnode.category[0].replace(
                'biolink:', '').replace('_', '').lower()
            source_category = category
            if (category in drug_label_list) or (category in gene_label_list):
                source_category = category
            else:
                log.error(
                    f"The category of query node {source_qnode_key} is not supported; it must be drug/chemical_substance or gene",
                    error_code="CategoryError")
                return final_kg

        if target_qnode.id is not None:

            if type(target_qnode.id) is str:
                target_pass_nodes = [target_qnode.id]
            else:
                target_pass_nodes = target_qnode.id
            has_error, pass_nodes, not_pass_nodes = self._check_id(
                target_qnode.id, log)
            if has_error:
                return final_kg
            else:
                if len(not_pass_nodes) == 0 and len(pass_nodes) != 0:
                    target_pass_nodes = pass_nodes
                elif len(not_pass_nodes) != 0 and len(pass_nodes) != 0:
                    target_pass_nodes = pass_nodes
                    if len(not_pass_nodes) == 1:
                        log.warning(
                            f"The curie {not_pass_nodes[0]} is not allowed by the CHP client"
                        )
                    else:
                        log.warning(
                            f"The curies {not_pass_nodes} are not allowed by the CHP client"
                        )
                else:
                    if type(target_qnode.id) is str:
                        log.error(
                            f"The curie {target_qnode.id} is not allowed by the CHP client",
                            error_code="NotAllowable")
                        return final_kg
                    else:
                        log.error(
                            f"The curies {target_qnode.id} are not allowed by the CHP client",
                            error_code="NotAllowable")
                        return final_kg
        else:
            category = target_qnode.category[0].replace(
                'biolink:', '').replace('_', '').lower()
            target_category = category
            if (category in drug_label_list) or (category in gene_label_list):
                target_category = category
            else:
                log.error(
                    f"The category of query node {target_qnode_key} is not supported; it must be drug/chemical_substance or gene",
                    error_code="CategoryError")
                return final_kg

        if (source_pass_nodes is None) and (target_pass_nodes is None):
            return final_kg

        elif (source_pass_nodes is not None) and (target_pass_nodes is not None):
            source_dict = dict()
            target_dict = dict()
            if source_pass_nodes[0] in self.allowable_drug_curies:
                source_category_temp = 'drug'
            else:
                source_category_temp = 'gene'
            if target_pass_nodes[0] in self.allowable_drug_curies:
                target_category_temp = 'drug'
            else:
                target_category_temp = 'gene'
            if source_category_temp == target_category_temp:
                log.error(
                    f"The query nodes at both ends of the edge are the same type ({source_category_temp})",
                    error_code="CategoryError")
                return final_kg
            else:
                for (source_curie, target_curie) in itertools.product(
                        source_pass_nodes, target_pass_nodes):

                    if source_category_temp == 'drug':
                        source_curie_temp = source_curie.replace(
                            'CHEMBL.COMPOUND:', 'CHEMBL:')
                        # Let's build a simple single query
                        q = build_query(genes=[target_curie],
                                        therapeutic=source_curie_temp,
                                        disease='MONDO:0007254',
                                        outcome=('EFO:0000714', '>=',
                                                 self.CHP_survival_threshold))

                        response = self.client.query(q)
                        max_probability = self.client.get_outcome_prob(
                            response)
                        swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                            target_curie, source_curie, "paired_with",
                            max_probability)
                    else:
                        target_curie_temp = target_curie.replace(
                            'CHEMBL.COMPOUND:', 'CHEMBL:')
                        # Let's build a simple single query
                        q = build_query(genes=[source_curie],
                                        therapeutic=target_curie_temp,
                                        disease='MONDO:0007254',
                                        outcome=('EFO:0000714', '>=',
                                                 self.CHP_survival_threshold))

                        response = self.client.query(q)
                        max_probability = self.client.get_outcome_prob(
                            response)
                        swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                            source_curie, target_curie, "paired_with",
                            max_probability)

                    source_dict[source_curie] = source_qnode_key
                    target_dict[target_curie] = target_qnode_key

                    # Finally add the current edge to our answer knowledge graph
                    final_kg.add_edge(swagger_edge_key, swagger_edge,
                                      qedge_key)

                # Add the nodes to our answer knowledge graph
                if len(source_dict) != 0:
                    for source_curie in source_dict:
                        swagger_node_key, swagger_node = self._convert_to_swagger_node(
                            source_curie)
                        final_kg.add_node(swagger_node_key, swagger_node,
                                          source_dict[source_curie])
                if len(target_dict) != 0:
                    for target_curie in target_dict:
                        swagger_node_key, swagger_node = self._convert_to_swagger_node(
                            target_curie)
                        final_kg.add_node(swagger_node_key, swagger_node,
                                          target_dict[target_curie])

                return final_kg

        elif source_pass_nodes is not None:
            source_dict = dict()
            target_dict = dict()

            if source_pass_nodes[0] in self.allowable_drug_curies:
                source_category_temp = 'drug'
            else:
                source_category_temp = 'gene'
            if target_category in drug_label_list:
                target_category_temp = 'drug'
            else:
                target_category_temp = 'gene'
            if source_category_temp == target_category_temp:
                log.error(
                    f"The query nodes at both ends of the edge are the same type ({source_category_temp})",
                    error_code="CategoryError")
                return final_kg
            else:
                if source_category_temp == 'drug':
                    for source_curie in source_pass_nodes:

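                        # Keep only genes the synonymizer recognizes whose categories include the target category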
                        genes = [
                            curie for curie in self.allowable_gene_curies
                            if self.synonymizer.get_canonical_curies(curie)
                            [curie] is not None and target_category in [
                                category.replace('biolink:', '').replace(
                                    '_', '').lower() for category in list(
                                        self.synonymizer.get_canonical_curies(
                                            curie, return_all_categories=True)
                                        [curie]['all_categories'].keys())
                            ]
                        ]
                        therapeutic = source_curie.replace(
                            'CHEMBL.COMPOUND:', 'CHEMBL:')
                        disease = 'MONDO:0007254'
                        outcome = ('EFO:0000714', '>=',
                                   self.CHP_survival_threshold)

                        queries = []
                        for gene in genes:
                            queries.append(
                                build_query(
                                    genes=[gene],
                                    therapeutic=therapeutic,
                                    disease=disease,
                                    outcome=outcome,
                                ))

                        # use the query_all endpoint to run the batch of queries
                        res = self.client.query_all(queries)

                        for result, gene in zip(res["message"], genes):
                            prob = self.client.get_outcome_prob(result)
                            swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                                gene, source_curie, "paired_with", prob)

                            source_dict[source_curie] = source_qnode_key
                            target_dict[gene] = target_qnode_key

                            # Finally add the current edge to our answer knowledge graph
                            final_kg.add_edge(swagger_edge_key, swagger_edge,
                                              qedge_key)
                else:
                    for source_curie in source_pass_nodes:

                        genes = [source_curie]
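                        # Keep only drugs the synonymizer recognizes whose categories include the target category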
                        therapeutic = [
                            curie.replace('CHEMBL.COMPOUND:', 'CHEMBL:')
                            for curie in self.allowable_drug_curies
                            if self.synonymizer.get_canonical_curies(
                                curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:'))
                            [curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:')]
                            is not None and target_category in [
                                category.replace('biolink:', '').replace(
                                    '_', '').lower()
                                for category in list(
                                    self.synonymizer.get_canonical_curies(
                                        curie.replace('CHEMBL:',
                                                      'CHEMBL.COMPOUND:'),
                                        return_all_categories=True)[
                                            curie.replace(
                                                'CHEMBL:', 'CHEMBL.COMPOUND:')]
                                    ['all_categories'].keys())
                            ]
                        ]
                        disease = 'MONDO:0007254'
                        outcome = ('EFO:0000714', '>=',
                                   self.CHP_survival_threshold)

                        queries = []
                        for drug in therapeutic:
                            queries.append(
                                build_query(
                                    genes=genes,
                                    therapeutic=drug,
                                    disease=disease,
                                    outcome=outcome,
                                ))

                        # use the query_all endpoint to run the batch of queries
                        res = self.client.query_all(queries)

                        for result, drug in zip(res["message"], therapeutic):
                            drug = drug.replace('CHEMBL:', 'CHEMBL.COMPOUND:')
                            prob = self.client.get_outcome_prob(result)
                            swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                                source_curie, drug, "paired_with", prob)

                            source_dict[source_curie] = source_qnode_key
                            target_dict[drug] = target_qnode_key

                            # Finally add the current edge to our answer knowledge graph
                            final_kg.add_edge(swagger_edge_key, swagger_edge,
                                              qedge_key)

                # Add the nodes to our answer knowledge graph
                if len(source_dict) != 0:
                    for source_curie in source_dict:
                        swagger_node_key, swagger_node = self._convert_to_swagger_node(
                            source_curie)
                        final_kg.add_node(swagger_node_key, swagger_node,
                                          source_dict[source_curie])
                if len(target_dict) != 0:
                    for target_curie in target_dict:
                        swagger_node_key, swagger_node = self._convert_to_swagger_node(
                            target_curie)
                        final_kg.add_node(swagger_node_key, swagger_node,
                                          target_dict[target_curie])

                return final_kg
        else:
            source_dict = dict()
            target_dict = dict()

            if target_pass_nodes[0] in self.allowable_drug_curies:
                target_category_temp = 'drug'
            else:
                target_category_temp = 'gene'
            if source_category in drug_label_list:
                source_category_temp = 'drug'
            else:
                source_category_temp = 'gene'
            if source_category_temp == target_category_temp:
                log.error(
                    f"The query nodes at both ends of the edge are the same type ({source_category_temp})",
                    error_code="CategoryError")
                return final_kg
            else:
                if target_category_temp == 'drug':
                    for target_curie in target_pass_nodes:

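                        # Keep only genes the synonymizer recognizes whose categories include the source category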
                        genes = [
                            curie for curie in self.allowable_gene_curies
                            if self.synonymizer.get_canonical_curies(curie)
                            [curie] is not None and source_category in [
                                category.replace('biolink:', '').replace(
                                    '_', '').lower() for category in list(
                                        self.synonymizer.get_canonical_curies(
                                            curie, return_all_categories=True)
                                        [curie]['all_categories'].keys())
                            ]
                        ]
                        therapeutic = target_curie.replace(
                            'CHEMBL.COMPOUND:', 'CHEMBL:')
                        disease = 'MONDO:0007254'
                        outcome = ('EFO:0000714', '>=',
                                   self.CHP_survival_threshold)

                        queries = []
                        for gene in genes:
                            queries.append(
                                build_query(
                                    genes=[gene],
                                    therapeutic=therapeutic,
                                    disease=disease,
                                    outcome=outcome,
                                ))

                        # use the query_all endpoint to run the batch of queries
                        res = self.client.query_all(queries)

                        for result, gene in zip(res["message"], genes):
                            prob = self.client.get_outcome_prob(result)
                            swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                                gene, target_curie, "paired_with", prob)

                            source_dict[gene] = source_qnode_key
                            target_dict[target_curie] = target_qnode_key

                            # Finally add the current edge to our answer knowledge graph
                            final_kg.add_edge(swagger_edge_key, swagger_edge,
                                              qedge_key)

                else:
                    for target_curie in target_pass_nodes:

                        genes = [target_curie]
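                        # Keep only drugs the synonymizer recognizes whose categories include the source category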
                        therapeutic = [
                            curie.replace('CHEMBL.COMPOUND:', 'CHEMBL:')
                            for curie in self.allowable_drug_curies
                            if self.synonymizer.get_canonical_curies(
                                curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:'))
                            [curie.replace('CHEMBL:', 'CHEMBL.COMPOUND:')]
                            is not None and source_category in [
                                category.replace('biolink:', '').replace(
                                    '_', '').lower()
                                for category in list(
                                    self.synonymizer.get_canonical_curies(
                                        curie.replace('CHEMBL:',
                                                      'CHEMBL.COMPOUND:'),
                                        return_all_categories=True)[
                                            curie.replace(
                                                'CHEMBL:', 'CHEMBL.COMPOUND:')]
                                    ['all_categories'].keys())
                            ]
                        ]
                        disease = 'MONDO:0007254'
                        outcome = ('EFO:0000714', '>=',
                                   self.CHP_survival_threshold)

                        queries = []
                        for drug in therapeutic:
                            queries.append(
                                build_query(
                                    genes=genes,
                                    therapeutic=drug,
                                    disease=disease,
                                    outcome=outcome,
                                ))

                        # use the query_all endpoint to run the batch of queries
                        res = self.client.query_all(queries)

                        for result, drug in zip(res["message"], therapeutic):
                            drug = drug.replace('CHEMBL:', 'CHEMBL.COMPOUND:')
                            prob = self.client.get_outcome_prob(result)
                            swagger_edge_key, swagger_edge = self._convert_to_swagger_edge(
                                target_curie, drug, "paired_with", prob)

                            source_dict[drug] = source_qnode_key
                            target_dict[target_curie] = target_qnode_key

                            # Finally add the current edge to our answer knowledge graph
                            final_kg.add_edge(swagger_edge_key, swagger_edge,
                                              qedge_key)

                # Add the nodes to our answer knowledge graph
                if len(source_dict) != 0:
                    for source_curie in source_dict:
                        swagger_node_key, swagger_node = self._convert_to_swagger_node(
                            source_curie)
                        final_kg.add_node(swagger_node_key, swagger_node,
                                          source_dict[source_curie])
                if len(target_dict) != 0:
                    for target_curie in target_dict:
                        swagger_node_key, swagger_node = self._convert_to_swagger_node(
                            target_curie)
                        final_kg.add_node(swagger_node_key, swagger_node,
                                          target_dict[target_curie])

                return final_kg

    def _check_id(self, qnode_id, log):

        if type(qnode_id) is str:
            if qnode_id in self.allowable_gene_curies or qnode_id in self.allowable_drug_curies:
                return [False, [qnode_id], []]
            else:
                return [False, [], [qnode_id]]
        else:
            pass_nodes_gene_temp = list()
            pass_nodes_drug_temp = list()
            not_pass_nodes = list()
            for curie in qnode_id:
                if curie in self.allowable_gene_curies:
                    pass_nodes_gene_temp += [curie]
                elif curie in self.allowable_drug_curies:
                    pass_nodes_drug_temp += [curie]
                else:
                    not_pass_nodes += [curie]

            if len(pass_nodes_gene_temp) != 0 and len(
                    pass_nodes_drug_temp) != 0:
                log.error(
                    f"The curie ids {qnode_id} mix genes and drugs",
                    error_code="MixedTypes")
                return [True, [], []]
            else:
                pass_nodes = pass_nodes_gene_temp + pass_nodes_drug_temp
                return [False, pass_nodes, not_pass_nodes]

    def _convert_to_swagger_edge(self, subject: str, object: str, name: str,
                                 value: float) -> Tuple[str, Edge]:
        swagger_edge = Edge()
        swagger_edge.predicate = f"biolink:{name}"
        swagger_edge.subject = subject
        swagger_edge.object = object
        swagger_edge_key = f"CHP:{subject}-{name}-{object}"
        swagger_edge.relation = None

        type = "EDAM:data_0951"
        url = "https://github.com/di2ag/chp_client"

        swagger_edge.attributes = [
            Attribute(type=type, name=name, value=str(value), url=url),
            Attribute(name="provided_by",
                      value=self.kp_name,
                      type=eu.get_attribute_type("provided_by")),
            Attribute(name="is_defined_by",
                      value="ARAX",
                      type=eu.get_attribute_type("is_defined_by"))
        ]
        return swagger_edge_key, swagger_edge

    def _convert_to_swagger_node(self, node_key: str) -> Tuple[str, Node]:
        swagger_node = Node()
        swagger_node_key = node_key
        # Look up the canonical info once rather than once per attribute
        canonical_info = self.synonymizer.get_canonical_curies(node_key)[node_key]
        swagger_node.name = canonical_info['preferred_name']
        swagger_node.description = None
        swagger_node.category = canonical_info['preferred_category']

        return swagger_node_key, swagger_node
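For reference, _check_id returns a three-element list that the callers above unpack; a sketch of that contract (values illustrative, querier hypothetical):

has_error, pass_nodes, not_pass_nodes = querier._check_id(['CHEMBL.COMPOUND:CHEMBL521', 'FAKE:1'], log)
# has_error is True only when the given ids mix genes and drugs;
# otherwise pass_nodes holds the CHP-allowable curies and not_pass_nodes the rest.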
Example #10
def main():

    parser = argparse.ArgumentParser(
        description="Refresh DTD model and database",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--synonymizer_folder',
                        type=str,
                        help="Full path of folder containing NodeSynonymizer",
                        default='~/RTX/code/ARAX/NodeSynonymizer/')
    parser.add_argument(
        '--DTD_prob_db_file',
        type=str,
        help="Full path of DTD probability database file",
        default=
        '~/work/RTX/code/ARAX/KnowledgeSources/Prediction/DTD_probability_database_v1.0_KG2.3.4.db'
    )
    parser.add_argument(
        '--emb_file',
        type=str,
        help="Full path of DTD model embedding file",
        default=
        '~/work/RTX/code/ARAX/KnowledgeSources/Prediction/rel_max_v1.0_KG2.3.4.emb.gz'
    )
    parser.add_argument(
        '--map_file',
        type=str,
        help="Full path of DTD model mapping file",
        default=
        '~/work/RTX/code/ARAX/KnowledgeSources/Prediction/map_v1.0_KG2.3.4.txt'
    )
    parser.add_argument(
        '--output_folder',
        type=str,
        help="Full path of output folder",
        default='~/work/RTX/code/ARAX/KnowledgeSources/Prediction/')
    args = parser.parse_args()

    if os.path.isdir(args.synonymizer_folder):
        sys.path.append(args.synonymizer_folder)
        from node_synonymizer import NodeSynonymizer
        synonymizer = NodeSynonymizer()
    else:
        print(f"Error: folder not found: {args.synonymizer_folder}")
        exit(1)

    if os.path.isfile(args.DTD_prob_db_file):
        print('Starting to refresh DTD_probability_database.db', flush=True)
        con = sqlite3.connect(args.DTD_prob_db_file)
        DTD_prob_table = pd.read_sql_query("SELECT * from DTD_PROBABILITY",
                                           con)
        con.close()
        DTD_prob_table = DTD_prob_table.apply(lambda row: [
            refresh_disease(row[0], synonymizer),
            refresh_drug(row[1], synonymizer), row[2]
        ],
                                              axis=1,
                                              result_type='expand')
        DTD_prob_table = DTD_prob_table.dropna().reset_index(drop=True)
        con = sqlite3.connect(
            os.path.join(args.output_folder,
                         'DTD_probability_database_refreshed.db'))
        con.execute(
            "CREATE TABLE DTD_PROBABILITY( disease VARCHAR(255), drug VARCHAR(255), probability FLOAT )"
        )
        insert_command = "INSERT INTO DTD_PROBABILITY VALUES (?, ?, ?)"
        databasefile = list(DTD_prob_table.to_records(index=False))

        print("INFO: Populating table", flush=True)
        batch = list(range(0, len(databasefile), 5000))
        batch.append(len(databasefile))
        count = 0
        for i in range(len(batch)):
            if ((i + 1) < len(batch)):
                start = batch[i]
                end = batch[i + 1]
                rows = databasefile[start:end]
                con.executemany(insert_command, rows)
                con.commit()
                count = count + len(rows)
                percentage = round((count * 100.0 / len(databasefile)), 2)
                print(str(percentage) + "%..", end='', flush=True)

        print(f"INFO: Populating tables is completed", flush=True)

        print(f"INFO: Creating INDEXes on DTD_PROBABILITY", flush=True)
        con.execute(
            f"CREATE INDEX idx_DTD_PROBABILITY_disease ON DTD_PROBABILITY(disease)"
        )
        con.execute(
            f"CREATE INDEX idx_DTD_PROBABILITY_drug ON DTD_PROBABILITY(drug)")
        con.commit()
        con.close()
        print(f"INFO: Creating INDEXes is completed", flush=True)
    else:
        print(f"Error: file not found: {args.DTD_prob_db_file}")
        exit(1)

    if os.path.isfile(args.emb_file) and os.path.isfile(args.map_file):
        rel_max = pd.read_csv(args.emb_file, sep=' ', skiprows=1, header=None)
        mapfile = pd.read_csv(args.map_file, sep='\t', header=0)
        merged_table = mapfile.merge(rel_max, left_on='id', right_on=0)
        merged_table = merged_table.loc[:, ['curie'] +
                                        list(merged_table.columns)[3:]]
        new_curie_ids = []
        for curie in list(merged_table.curie):
            # One synonymizer lookup per curie; unrecognized curies become None and are dropped below
            canonical_info = synonymizer.get_canonical_curies(curie)[curie]
            new_curie_ids.append(canonical_info['preferred_curie'] if canonical_info is not None else None)
        graph = pd.concat(
            [pd.DataFrame(new_curie_ids), merged_table.iloc[:, 1:]], axis=1)
        graph = graph.dropna().reset_index(drop=True)

        con = sqlite3.connect(
            os.path.join(args.output_folder, 'GRAPH_refreshed.sqlite'))
        con.execute(f"DROP TABLE IF EXISTs GRAPH")
        insert_command1 = f"CREATE TABLE GRAPH(curie VARCHAR(255)"
        for num in range(1, graph.shape[1]):
            insert_command1 = insert_command1 + f", col{num} INT"
        insert_command1 = insert_command1 + ")"
        con.execute(insert_command1)
        con.commit()

        count = 0

        print(f"Insert data into database", flush=True)
        for row in range(graph.shape[0]):
            count = count + 1
            insert_command1 = f"INSERT INTO GRAPH"
            insert_command2 = f" values ("

            for _ in range(graph.shape[1]):
                insert_command2 = insert_command2 + f"?,"

            insert_command = insert_command1 + insert_command2 + ")"
            insert_command = insert_command.replace(',)', ')')
            line = tuple(graph.loc[row, :])
            con.execute(insert_command, line)
            if count % 5000 == 0:
                con.commit()
                percentage = int(count * 100.0 / graph.shape[0])
                print(str(percentage) + "%..", end='', flush=True)

        con.commit()
        percentage = int(count * 100.0 / graph.shape[0])
        print(str(percentage) + "%..", end='', flush=True)

        con.execute(f"CREATE INDEX idx_GRAPH_curie ON GRAPH(curie)")
        con.commit()
        con.close()
        print(f"INFO: Database created successfully", flush=True)
Example #11
def _canonicalize_nodes(
    nodes: List[Dict[str, Any]]
) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, str]]:
    synonymizer = NodeSynonymizer()
    node_ids = [node.get('id') for node in nodes if node.get('id')]
    print(
        f"  Sending NodeSynonymizer.get_canonical_curies() {len(node_ids)} curies.."
    )
    canonicalized_info = synonymizer.get_canonical_curies(
        curies=node_ids, return_all_types=True)
    all_canonical_curies = {
        canonical_info['preferred_curie']
        for canonical_info in canonicalized_info.values() if canonical_info
    }
    print(
        f"  Sending NodeSynonymizer.get_equivalent_nodes() {len(all_canonical_curies)} curies.."
    )
    equivalent_curies_info = synonymizer.get_equivalent_nodes(
        all_canonical_curies)
    recognized_curies = {
        curie
        for curie in equivalent_curies_info
        if equivalent_curies_info.get(curie)
    }
    equivalent_curies_dict = {
        curie: list(equivalent_curies_info.get(curie))
        for curie in recognized_curies
    }
    print(f"  Creating canonicalized nodes..")
    curie_map = dict()
    canonicalized_nodes = dict()
    for node in nodes:
        canonical_info = canonicalized_info.get(node['id'])
        canonicalized_curie = canonical_info.get('preferred_curie', node['id']) if canonical_info else node['id']
        publications = node['publications'] if node.get('publications') else []
        description_in_list = [node['description']] if node.get('description') else []
        if canonicalized_curie in canonicalized_nodes:
            existing_canonical_node = canonicalized_nodes[canonicalized_curie]
            existing_canonical_node['publications'] = _merge_two_lists(
                existing_canonical_node['publications'], publications)
            existing_canonical_node['all_names'] = _merge_two_lists(
                existing_canonical_node['all_names'], [node['name']])
            existing_canonical_node['description'] = _merge_two_lists(
                existing_canonical_node['description'], description_in_list)
            # Add the IRI for the 'preferred' curie, if we've found that node
            if node['id'] == canonicalized_curie:
                existing_canonical_node['iri'] = node.get('iri')
        else:
            name = canonical_info['preferred_name'] if canonical_info else node['name']
            preferred_type = canonical_info['preferred_type'] if canonical_info else node['category_label']
            types = list(canonical_info['all_types']) if canonical_info else [node['category_label']]
            iri = node['iri'] if node['id'] == canonicalized_curie else None
            all_names = [node['name']]
            canonicalized_node = _create_node(
                node_id=canonicalized_curie,
                name=name,
                preferred_type=preferred_type,
                types=types,
                publications=publications,
                equivalent_curies=equivalent_curies_dict.get(
                    canonicalized_curie, []),
                iri=iri,
                description=description_in_list,
                all_names=all_names)

            canonicalized_nodes[canonicalized_node['id']] = canonicalized_node
        curie_map[node['id']] = canonicalized_curie  # Record this mapping for easy lookup later
    return canonicalized_nodes, curie_map
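This is an earlier variant of the _canonicalize_nodes function in Example #6, written against an older synonymizer API (preferred_type/all_types rather than preferred_category/all_categories). Both variants assume the same get_equivalent_nodes result shape (illustrative values):

equivalent_curies_info = {
    "MONDO:0005180": {"MONDO:0005180", "DOID:14330"},  # recognized curie -> its synonym group
    "FAKE:123": None,                                  # unrecognized curie -> None
}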
Example #12
    def query_size_of_adjacent_nodes(self, node_curie, source_type, adjacent_type, kp="infores:rtx-kg2", rel_type=None):
        """
        Query adjacent nodes of a given source node based on adjacent node type.
        :param node_curie: (required) the curie id of query node. It accepts both single curie id or curie id list eg. "UniProtKB:P14136" or ['UniProtKB:P02675', 'UniProtKB:P01903', 'UniProtKB:P09601', 'UniProtKB:Q02878']
        :param source_type: (required) the type of source node, eg. "gene"
        :param adjacent_type: (required) the type of adjacent node, eg. "biological_process"
        :param kp: (optional) the knowledge provider to use, eg. "infores:rtx-kg2"(default)
        :param rel_type: (optional) edge type to consider, eg. "involved_in"
        :return a tuple with a dict containing the number of adjacent nodes for the query node and a list of removed nodes
        """

        res = None
        source_type = ComputeFTEST.convert_string_to_snake_case(source_type.replace('biolink:',''))
        source_type = ComputeFTEST.convert_string_biolinkformat(source_type)
        adjacent_type = ComputeFTEST.convert_string_to_snake_case(adjacent_type.replace('biolink:',''))
        adjacent_type = ComputeFTEST.convert_string_biolinkformat(adjacent_type)

        if rel_type is None:
            nodesynonymizer = NodeSynonymizer()
            normalized_nodes = nodesynonymizer.get_canonical_curies(node_curie)
            failure_nodes = list()
            mapping = {node:normalized_nodes[node]['preferred_curie'] for node in normalized_nodes if normalized_nodes[node] is not None}
            failure_nodes += list(normalized_nodes.keys() - mapping.keys())
            query_nodes = list(set(mapping.values()))
            query_nodes = [curie_id.replace("'", "''") if "'" in curie_id else curie_id for curie_id in query_nodes]
            # special_curie_ids = [curie_id for curie_id in query_nodes if "'" in curie_id]

            # Get connected to kg2c sqlite
            connection = sqlite3.connect(self.sqlite_file_path)
            cursor = connection.cursor()

            # Extract the neighbor count data
            node_keys_str = "','".join(query_nodes)  # SQL wants ('node1', 'node2') format for string lists
            sql_query = f"SELECT N.id, N.neighbor_counts " \
                        f"FROM neighbors AS N " \
                        f"WHERE N.id IN ('{node_keys_str}')"
            cursor.execute(sql_query)
            rows = cursor.fetchall()
            # The ids in the returned rows are already unescaped (the '' escaping only applies to the SQL literal)
            connection.close()

            # Load the counts into a dictionary
            neighbor_counts_dict = {row[0]: eval(row[1]) for row in rows}  # The counts column stores a dict literal; ast.literal_eval would be a safer parser

            res_dict = {node:neighbor_counts_dict[mapping[node]].get(adjacent_type) for node in mapping if mapping[node] in neighbor_counts_dict and neighbor_counts_dict[mapping[node]].get(adjacent_type) is not None}
            failure_nodes += list(mapping.keys() - res_dict.keys())

            # failure_nodes is empty when every input node was resolved and counted
            return (res_dict, failure_nodes)

        else:
            if kp == 'ARAX/KG1':
                self.response.warning(f"Since the edge type '{rel_type}' is from KG1, we still use the DSL expand(kg=ARAX/KG1) to query neighbor count. However, the total node count is based on KG2c from 'nodesynonymizer.get_total_entity_count'. So the FET result might not be accurate.")

            # construct the instance of ARAXQuery class
            araxq = ARAXQuery()

            # check if node_curie is a str or a list
            if type(node_curie) is str:
                query_node_curie = node_curie
            elif type(node_curie) is list:
                # Render the list as "[id1,id2,...]" for the add_qnode DSL command below
                query_node_curie = "[" + ",".join(str(node) for node in node_curie) + "]"
            else:
                self.response.error("The 'node_curie' argument of 'query_size_of_adjacent_nodes' method within FET only accepts str or list")
                return res

            # call the method of ARAXQuery class to query adjacent node
            query = {"operations": {"actions": [
                "create_message",
                f"add_qnode(ids={query_node_curie}, categories={source_type}, key=FET_n00)",
                f"add_qnode(categories={adjacent_type}, key=FET_n01)",
                f"add_qedge(subject=FET_n00, object=FET_n01, key=FET_e00, predicates={rel_type})",
                f"expand(edge_key=FET_e00,kp={kp})",
                #"resultify()",
                "return(message=true, store=false)"
            ]}}

            try:
                result = araxq.query(query)
                if result.status != 'OK':
                    self.response.error(f"Fail to query adjacent nodes from infores:rtx-kg2 for {node_curie}")
                    return res
                else:
                    res_dict = dict()
                    message = araxq.response.envelope.message
                    if type(node_curie) is str:
                        tmplist = {edge_key for edge_key in message.knowledge_graph.edges
                                   if message.knowledge_graph.edges[edge_key].subject == node_curie
                                   or message.knowledge_graph.edges[edge_key].object == node_curie}  ## edge has no direction
                        if len(tmplist) == 0:
                            self.response.warning(f"Failed to query adjacent nodes from {kp} for {node_curie} in FET, probably because the expander ignores node type. For more details, please see issue897.")
                            return (res_dict, [node_curie])
                        res_dict[node_curie] = len(tmplist)
                        return (res_dict, [])
                    else:
                        failure_nodes = list()
                        for node in node_curie:
                            tmplist = {edge_key for edge_key in message.knowledge_graph.edges
                                       if message.knowledge_graph.edges[edge_key].subject == node
                                       or message.knowledge_graph.edges[edge_key].object == node}  ## edge has no direction
                            if len(tmplist) == 0:
                                self.response.warning(f"Failed to query adjacent nodes from {kp} for {node} in FET, probably because the expander ignores node type. For more details, please see issue897.")
                                failure_nodes.append(node)
                                continue
                            res_dict[node] = len(tmplist)

                        return (res_dict, failure_nodes)
            except Exception:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Something went wrong with querying adjacent nodes from {kp} for {node_curie}")
                return res
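Aside: the manual quote-escaping above can be avoided entirely with a parameterized IN clause. A minimal self-contained sketch of that alternative (the table mirrors the kg2c 'neighbors' table, but the ids and data here are invented):

import sqlite3

curies = ["CHEBI:15365", "FAKE:id'with'quotes"]  # made-up ids, one containing quotes
connection = sqlite3.connect(":memory:")  # in-memory database so the sketch runs anywhere
cursor = connection.cursor()
cursor.execute("CREATE TABLE neighbors (id TEXT, neighbor_counts TEXT)")
cursor.execute("INSERT INTO neighbors VALUES (?, ?)", ("CHEBI:15365", "{'biolink:Protein': 42}"))
placeholders = ",".join("?" for _ in curies)  # one '?' per curie; sqlite3 substitutes values safely
cursor.execute(f"SELECT id, neighbor_counts FROM neighbors WHERE id IN ({placeholders})", curies)
print(cursor.fetchall())  # [('CHEBI:15365', "{'biolink:Protein': 42}")]
connection.close()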
Example #13
class NGDDatabaseBuilder:
    def __init__(self, pubmed_directory_path, is_test, live="Production"):
        self.RTXConfig = RTXConfiguration()
        self.RTXConfig.live = live
        ngd_filepath = os.path.sep.join([
            *pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'KnowledgeSources',
            'NormalizedGoogleDistance'
        ])
        self.pubmed_directory_path = pubmed_directory_path
        self.conceptname_to_pmids_db_path = "conceptname_to_pmids.db"
        self.curie_to_pmids_db_path = f"{ngd_filepath}{os.path.sep}{self.RTXConfig.curie_to_pmids_path.split('/')[-1]}"
        self.status = 'OK'
        self.synonymizer = NodeSynonymizer()
        self.is_test = is_test

    def build_conceptname_to_pmids_db(self):
        # This function extracts curie -> PMIDs mappings from a Pubmed XML download (saves data in a pickledb)
        print(
            f"Starting to build {self.conceptname_to_pmids_db_path} from pubmed files.."
        )
        start = time.time()
        pubmed_directory = os.fsencode(self.pubmed_directory_path)
        all_file_names = [
            os.fsdecode(file) for file in os.listdir(pubmed_directory)
        ]
        pubmed_file_names = [
            file_name for file_name in all_file_names
            if file_name.startswith('pubmed') and file_name.endswith('.xml.gz')
        ]
        if not pubmed_file_names:
            print(
                f"ERROR: Couldn't find any PubMed XML files to scrape. Provide the path to the directory "
                f"containing your PubMed download as a command line argument.")
            self.status = 'ERROR'
        else:
            conceptname_to_pmids_map = dict()
            # Go through each downloaded pubmed file and build our dictionary of mappings
            pubmed_file_names_to_process = pubmed_file_names if not self.is_test else pubmed_file_names[:1]
            for file_name in pubmed_file_names_to_process:
                print(
                    f"  Starting to process file '{file_name}'.. ({pubmed_file_names_to_process.index(file_name) + 1}"
                    f" of {len(pubmed_file_names_to_process)})")
                file_start_time = time.time()
                with gzip.open(f"{self.pubmed_directory_path}/{file_name}"
                               ) as pubmed_file:
                    file_contents_tree = etree.parse(pubmed_file)
                pubmed_articles = file_contents_tree.xpath("//PubmedArticle")

                for article in pubmed_articles:
                    # Link each concept name to the PMID of this article
                    current_pmid = article.xpath(
                        ".//MedlineCitation/PMID/text()")[0]
                    descriptor_names = article.xpath(
                        ".//MedlineCitation/MeshHeadingList/MeshHeading/DescriptorName/text()"
                    )
                    qualifier_names = article.xpath(
                        ".//MedlineCitation/MeshHeadingList/MeshHeading/QualifierName/text()"
                    )
                    chemical_names = article.xpath(
                        ".//MedlineCitation/ChemicalList/Chemical/NameOfSubstance/text()"
                    )
                    gene_symbols = article.xpath(
                        ".//MedlineCitation/GeneSymbolList/GeneSymbol/text()")
                    keywords = article.xpath(
                        ".//MedlineCitation/KeywordList/Keyword/text()")
                    all_concept_names = descriptor_names + qualifier_names + chemical_names + gene_symbols + keywords
                    unique_concept_names = {
                        concept_name
                        for concept_name in all_concept_names if concept_name
                    }
                    for concept_name in unique_concept_names:
                        self._add_pmids_mapping(concept_name, current_pmid,
                                                conceptname_to_pmids_map)

                self._destroy_etree(
                    file_contents_tree)  # Hack around lxml memory leak
                print(
                    f"    took {round((time.time() - file_start_time) / 60, 2)} minutes"
                )

            # Save the data to the PickleDB after we're done
            print("  Loading data into PickleDB..")
            conceptname_to_pmids_db = pickledb.load(
                self.conceptname_to_pmids_db_path, False)
            for concept_name, pmid_list in conceptname_to_pmids_map.items():
                conceptname_to_pmids_db.set(
                    concept_name,
                    list({
                        self._create_pmid_curie_from_local_id(pmid)
                        for pmid in pmid_list
                    }))
            print("  Saving PickleDB file..")
            conceptname_to_pmids_db.dump()
            print(
                f"Done! Building {self.conceptname_to_pmids_db_path} took {round(((time.time() - start) / 60) / 60, 3)} hours"
            )

    def build_curie_to_pmids_db(self):
        # This function creates a final sqlite database of curie->PMIDs mappings using data scraped from Pubmed AND KG2
        print(
            f"Starting to build {self.curie_to_pmids_db_path.split(os.path.sep)[-1]}.."
        )
        start = time.time()
        curie_to_pmids_map = dict()
        self._add_pmids_from_pubmed_scrape(curie_to_pmids_map)
        if self.status != 'OK':
            return
        self._add_pmids_from_kg2_edges(curie_to_pmids_map)
        self._add_pmids_from_kg2_nodes(curie_to_pmids_map)
        print(
            f"  In the end, found PMID lists for {len(curie_to_pmids_map)} (canonical) curies"
        )
        self._save_data_in_sqlite_db(curie_to_pmids_map)
        print(
            f"Done! Building {self.curie_to_pmids_db_path.split(os.path.sep)[-1]} took {round((time.time() - start) / 60)} minutes."
        )

    # Helper methods

    def _add_pmids_from_kg2_edges(self, curie_to_pmids_map):
        print(f"  Getting PMIDs from edges in KG2 neo4j..")
        edge_query = f"match (n)-[e]->(m) where e.publications is not null and e.publications <> '[]' " \
                     f"return distinct n.id, m.id, e.publications{' limit 100' if self.is_test else ''}"
        edge_results = self._run_cypher_query(edge_query, 'KG2')
        print(f"  Processing results..")
        node_ids = {result['n.id'] for result in edge_results}.union(result['m.id'] for result in edge_results)
        canonicalized_curies_dict = self._get_canonicalized_curies_dict(
            list(node_ids))
        for result in edge_results:
            canonicalized_node_ids = {
                canonicalized_curies_dict[result['n.id']],
                canonicalized_curies_dict[result['m.id']]
            }
            pmids = self._extract_and_format_pmids(result['e.publications'])
            if pmids:  # Sometimes publications list includes only non-PMID identifiers (like ISBN)
                for canonical_curie in canonicalized_node_ids:
                    self._add_pmids_mapping(canonical_curie, pmids,
                                            curie_to_pmids_map)

    def _add_pmids_from_kg2_nodes(self, curie_to_pmids_map):
        print(f"  Getting PMIDs from nodes in KG2 neo4j..")
        node_query = f"match (n) where n.publications is not null and n.publications <> '[]' " \
                     f"return distinct n.id, n.publications{' limit 100' if self.is_test else ''}"
        node_results = self._run_cypher_query(node_query, 'KG2')
        print(f"  Processing results..")
        node_ids = {result['n.id'] for result in node_results}
        canonicalized_curies_dict = self._get_canonicalized_curies_dict(
            list(node_ids))
        for result in node_results:
            canonical_curie = canonicalized_curies_dict[result['n.id']]
            pmids = self._extract_and_format_pmids(result['n.publications'])
            if pmids:  # Sometimes publications list includes only non-PMID identifiers (like ISBN)
                self._add_pmids_mapping(canonical_curie, pmids,
                                        curie_to_pmids_map)

    def _add_pmids_from_pubmed_scrape(self, curie_to_pmids_map):
        # Load the data from the first half of the build process (scraping pubmed)
        print(
            f"  Loading pickle DB containing pubmed scrapings ({self.conceptname_to_pmids_db_path}).."
        )
        conceptname_to_pmids_db = pickledb.load(
            self.conceptname_to_pmids_db_path, False)
        if not conceptname_to_pmids_db.getall():
            print(
                f"ERROR: {self.conceptname_to_pmids_db_path} must exist to do a partial build. Use --full or locate "
                f"that file.")
            self.status = 'ERROR'
            return

        # Get canonical curies for all of the concept names in our big pubmed pickleDB using the NodeSynonymizer
        concept_names = list(conceptname_to_pmids_db.getall())
        print(
            f"  Sending NodeSynonymizer.get_canonical_curies() a list of {len(concept_names)} concept names.."
        )
        canonical_curies_dict = self.synonymizer.get_canonical_curies(
            names=concept_names)
        print(
            f"  Got results back from NodeSynonymizer. (Returned dict contains {len(canonical_curies_dict)} keys.)"
        )

        # Map all of the concept names scraped from pubmed to curies
        if canonical_curies_dict:
            recognized_concepts = {
                concept
                for concept in canonical_curies_dict
                if canonical_curies_dict.get(concept)
            }
            print(
                f"  NodeSynonymizer recognized {round((len(recognized_concepts) / len(concept_names)) * 100)}% of "
                f"concept names scraped from pubmed.")
            # Store which concept names the NodeSynonymizer didn't know about, for learning purposes
            unrecognized_concepts = set(canonical_curies_dict).difference(
                recognized_concepts)
            with open('unrecognized_pubmed_concept_names.txt',
                      'w+') as unrecognized_concepts_file:
                unrecognized_concepts_file.write(f"{unrecognized_concepts}")
            print(
                f"  Unrecognized concept names were written to 'unrecognized_pubmed_concept_names.txt'."
            )

            # Map the canonical curie for each recognized concept to the concept's PMID list
            print(f"  Mapping canonical curies to PMIDs..")
            for concept_name in recognized_concepts:
                canonical_curie = canonical_curies_dict[concept_name].get(
                    'preferred_curie')
                pmids_for_this_concept = conceptname_to_pmids_db.get(
                    concept_name)
                self._add_pmids_mapping(canonical_curie,
                                        pmids_for_this_concept,
                                        curie_to_pmids_map)
            print(
                f"  Mapped {len(curie_to_pmids_map)} canonical curies to PMIDs based on pubmed scrapings."
            )
        else:
            print(f"ERROR: NodeSynonymizer didn't return anything!")
            self.status = 'ERROR'

    def _save_data_in_sqlite_db(self, curie_to_pmids_map):
        print("  Loading data into sqlite database..")
        # Remove any preexisting version of this database
        if os.path.exists(self.curie_to_pmids_db_path):
            os.remove(self.curie_to_pmids_db_path)
        connection = sqlite3.connect(self.curie_to_pmids_db_path)
        cursor = connection.cursor()
        cursor.execute("CREATE TABLE curie_to_pmids (curie TEXT, pmids TEXT)")
        cursor.execute(
            "CREATE UNIQUE INDEX unique_curie ON curie_to_pmids (curie)")
        print(f"  Gathering row data..")
        rows = [[curie, json.dumps(list(filter(None, {self._get_local_id_as_int(pmid) for pmid in pmids})))]
                for curie, pmids in curie_to_pmids_map.items()]
        rows_in_chunks = self._divide_list_into_chunks(rows, 5000)
        print(f"  Inserting row data into database..")
        for chunk in rows_in_chunks:
            cursor.executemany(
                f"INSERT INTO curie_to_pmids (curie, pmids) VALUES (?, ?)",
                chunk)
            connection.commit()
        # Log how many rows we've added in the end (for debugging purposes)
        cursor.execute(f"SELECT COUNT(*) FROM curie_to_pmids")
        count = cursor.fetchone()[0]
        print(f"  Done saving data in sqlite; database contains {count} rows.")
        cursor.close()

    def _get_canonicalized_curies_dict(self,
                                       curies: List[str]) -> Dict[str, str]:
        print(
            f"  Sending a batch of {len(curies)} curies to NodeSynonymizer.get_canonical_curies()"
        )
        canonicalized_nodes_info = self.synonymizer.get_canonical_curies(
            curies)
        canonicalized_curies_dict = dict()
        for input_curie, preferred_info_dict in canonicalized_nodes_info.items():
            if preferred_info_dict:
                canonicalized_curies_dict[input_curie] = preferred_info_dict.get('preferred_curie', input_curie)
            else:
                canonicalized_curies_dict[input_curie] = input_curie
        print(f"  Got results back from synonymizer")
        return canonicalized_curies_dict

    def _extract_and_format_pmids(self, publications: List[str]) -> List[str]:
        pmids = {
            publication_id
            for publication_id in publications
            if publication_id.upper().startswith('PMID')
        }
        # Make sure all PMIDs are given in same format (e.g., PMID:18299583 rather than PMID18299583)
        formatted_pmids = [
            self._create_pmid_curie_from_local_id(
                pmid.replace('PMID', '').replace(':', '')) for pmid in pmids
        ]
        return formatted_pmids

    @staticmethod
    def _add_pmids_mapping(key: str, value_to_append: Union[str, List[str]],
                           mappings_dict: Dict[str, List[str]]):
        if key not in mappings_dict:
            mappings_dict[key] = []
        if isinstance(value_to_append, list):
            mappings_dict[key] += value_to_append
        else:
            mappings_dict[key].append(value_to_append)

    @staticmethod
    def _create_pmid_curie_from_local_id(pmid):
        return f"PMID:{pmid}"

    @staticmethod
    def _get_local_id_as_int(curie):
        # Converts "PMID:1234" to 1234
        curie_pieces = curie.split(":")
        local_id_str = curie_pieces[-1]
        # Remove any strange characters (like in "PMID:_19960544")
        stripped_id_str = "".join(
            [character for character in local_id_str if character.isdigit()])
        return int(stripped_id_str) if stripped_id_str else None

    @staticmethod
    def _destroy_etree(file_contents_tree):
        # Thank you to https://stackoverflow.com/a/49139904 for this method; important to prevent memory blow-up
        root = file_contents_tree.getroot()
        element_tracker = {root: [0, None]}
        for element in root.iterdescendants():
            parent = element.getparent()
            element_tracker[element] = [element_tracker[parent][0] + 1, parent]
        element_tracker = sorted(
            [(depth, parent, child)
             for child, (depth, parent) in element_tracker.items()],
            key=lambda x: x[0],
            reverse=True)
        for _, parent, child in element_tracker:
            if parent is None:
                break
            parent.remove(child)
        del file_contents_tree

    @staticmethod
    def _run_cypher_query(cypher_query: str, kg='KG2') -> List[Dict[str, any]]:
        rtxc = RTXConfiguration()
        if kg == 'KG2':
            rtxc.live = "KG2"
        try:
            driver = GraphDatabase.driver(rtxc.neo4j_bolt,
                                          auth=(rtxc.neo4j_username,
                                                rtxc.neo4j_password))
            with driver.session() as session:
                query_results = session.run(cypher_query).data()
            driver.close()
        except Exception:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            print(f"Encountered an error interacting with {kg} neo4j. {tb}")
            return []
        else:
            return query_results

    @staticmethod
    def _divide_list_into_chunks(input_list: List[any],
                                 chunk_size: int) -> List[List[any]]:
        num_chunks = len(input_list) // chunk_size if len(input_list) % chunk_size == 0 else (len(input_list) // chunk_size) + 1
        start_index = 0
        stop_index = chunk_size
        all_chunks = []
        for num in range(num_chunks):
            chunk = input_list[start_index:stop_index] if stop_index <= len(input_list) else input_list[start_index:]
            all_chunks.append(chunk)
            start_index += chunk_size
            stop_index += chunk_size
        return all_chunks
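For reference, a minimal standalone sketch of the chunked-executemany pattern used by _save_data_in_sqlite_db() above (data and chunk size invented; an in-memory database keeps it self-contained):

import json
import sqlite3

def divide_list_into_chunks(input_list, chunk_size):
    # Same behavior as _divide_list_into_chunks above, expressed with slicing
    return [input_list[i:i + chunk_size] for i in range(0, len(input_list), chunk_size)]

curie_to_pmids_map = {"CHEBI:15365": [18299583, 19960544], "MONDO:0004975": [12345]}
rows = [(curie, json.dumps(pmids)) for curie, pmids in curie_to_pmids_map.items()]

connection = sqlite3.connect(":memory:")
cursor = connection.cursor()
cursor.execute("CREATE TABLE curie_to_pmids (curie TEXT, pmids TEXT)")
for chunk in divide_list_into_chunks(rows, 2):
    cursor.executemany("INSERT INTO curie_to_pmids (curie, pmids) VALUES (?, ?)", chunk)
    connection.commit()
cursor.execute("SELECT COUNT(*) FROM curie_to_pmids")
print(cursor.fetchone()[0])  # 2
connection.close()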
Example #14
    def assess(self, message):

        #### Define a default response
        response = ARAXResponse()
        self.response = response
        self.message = message
        response.debug(f"Assessing the QueryGraph for basic information")

        #### Get shorter handles
        query_graph = message.query_graph
        nodes = query_graph.nodes
        edges = query_graph.edges

        #### Store number of nodes and edges
        self.n_nodes = len(nodes)
        self.n_edges = len(edges)
        response.debug(f"Found {self.n_nodes} nodes and {self.n_edges} edges")

        #### Handle impossible cases
        if self.n_nodes == 0:
            response.error(
                "QueryGraph has 0 nodes. At least 1 node is required",
                error_code="QueryGraphZeroNodes")
            return response
        if self.n_nodes == 1 and self.n_edges > 0:
            response.error(
                "QueryGraph may not have edges if there is only one node",
                error_code="QueryGraphTooManyEdges")
            return response
        #if self.n_nodes == 2 and self.n_edges > 1:
        #    response.error("QueryGraph may not have more than 1 edge if there are only 2 nodes", error_code="QueryGraphTooManyEdges")
        #    return response

        #### Loop through nodes computing some stats
        node_info = {}
        self.node_category_map = {}
        for key, qnode in nodes.items():
            node_info[key] = {
                'key': key,
                'node_object': qnode,
                'has_id': False,
                'category': qnode.category,
                'has_category': False,
                'is_set': None,
                'n_edges': 0,
                'n_links': 0,
                'is_connected': False,
                'edges': [],
                'edge_dict': {}
            }
            if qnode.id is not None:
                node_info[key]['has_id'] = True

                #### If the user did not specify a category, but there is a curie, try to figure out the category
                if node_info[key]['category'] is None:
                    synonymizer = NodeSynonymizer()
                    curie = qnode.id
                    curies_list = qnode.id
                    if isinstance(qnode.id, list):
                        curie = qnode.id[0]
                    else:
                        curies_list = [qnode.id]

                    canonical_curies = synonymizer.get_canonical_curies(
                        curies=curies_list, return_all_categories=True)
                    if canonical_curies.get(curie) and 'preferred_type' in canonical_curies[curie]:
                        node_info[key]['has_category'] = True
                        node_info[key]['category'] = canonical_curies[curie]['preferred_type']

            if qnode.category is not None:
                node_info[key]['has_category'] = True

            if qnode.is_set is not None:
                node_info[key]['is_set'] = qnode.is_set
            if key is None:
                response.error(
                    "QueryGraph has a node with null key. This is not permitted",
                    error_code="QueryGraphNodeWithNoId")
                return response

            #### Remap the node categories from unsupported to supported
            if qnode.category is not None:
                qnode.category = self.remap_node_category(qnode.category)

            #### Store lookup of categories
            warning_counter = 0
            if qnode.category is None or (isinstance(qnode.category, list)
                                          and len(qnode.category) == 0):
                if warning_counter == 0:
                    #response.debug("QueryGraph has nodes with no category. This may cause problems with results inference later")
                    pass
                warning_counter += 1
                self.node_category_map['unknown'] = key
            else:
                category = qnode.category
                if isinstance(qnode.category, list):
                    category = qnode.category[
                        0]  # FIXME this is a hack prior to proper list handling
                self.node_category_map[category] = key

        #### Loop through edges computing some stats
        edge_info = {}
        self.edge_predicate_map = {}
        unique_links = {}

        #### Ignore special informational edges for now.
        virtual_edge_predicates = {
            'has_normalized_google_distance_with': 1,
            'has_fisher_exact_test_p-value_with': 1,
            'has_jaccard_index_with': 1,
            'probably_treats': 1,
            'has_paired_concept_frequency_with': 1,
            'has_observed_expected_ratio_with': 1,
            'has_chi_square_with': 1
        }
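        # this dict is used only for membership tests below; the values are ignored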

        for key, qedge in edges.items():

            predicate = qedge.predicate
            if isinstance(predicate, list):
                if len(predicate) == 0:
                    predicate = None
                else:
                    predicate = predicate[
                        0]  # FIXME Hack before dealing with predicates as lists!

            if predicate is not None and predicate in virtual_edge_predicates:
                continue

            edge_info[key] = {
                'key': key,
                'has_predicate': False,
                'subject': qedge.subject,
                'object': qedge.object,
                'predicate': None
            }
            if predicate is not None:
                edge_info[key]['has_predicate'] = True
                edge_info[key]['predicate'] = predicate

            if key is None:
                response.error(
                    "QueryGraph has a edge with null key. This is not permitted",
                    error_code="QueryGraphEdgeWithNoKey")
                return response

            #### Create a unique node link string
            link_string = ','.join(sorted([qedge.subject, qedge.object]))
            if link_string not in unique_links:
                node_info[qedge.subject]['n_links'] += 1
                node_info[qedge.object]['n_links'] += 1
                unique_links[link_string] = 1
                #print(link_string)

            node_info[qedge.subject]['n_edges'] += 1
            node_info[qedge.object]['n_edges'] += 1
            node_info[qedge.subject]['is_connected'] = True
            node_info[qedge.object]['is_connected'] = True
            #node_info[qedge.subject]['edges'].append(edge_info[key])
            #node_info[qedge.object]['edges'].append(edge_info[key])
            node_info[qedge.subject]['edges'].append(edge_info[key])
            node_info[qedge.object]['edges'].append(edge_info[key])
            node_info[qedge.subject]['edge_dict'][key] = edge_info[key]
            node_info[qedge.object]['edge_dict'][key] = edge_info[key]

            #### Store lookup of predicates
            warning_counter = 0
            edge_predicate = 'any'
            if predicate is None:
                if warning_counter == 0:
                    response.debug(
                        "QueryGraph has edges with no predicate. This may cause problems with results inference later"
                    )
                warning_counter += 1
            else:
                edge_predicate = predicate

            #### It's not clear yet whether we need to store the whole sentence or just the predicate
            #predicate_encoding = f"{node_info[qedge.subject]['predicate']}---{edge_predicate}---{node_info[qedge.object]['predicate']}"
            predicate_encoding = edge_predicate
            self.edge_predicate_map[predicate_encoding] = key

        #### Loop through the nodes again, trying to identify the start_node and the end_node
        singletons = []
        for node_id, node_data in node_info.items():
            if node_data['n_links'] < 2:
                singletons.append(node_data)
            elif node_data['n_links'] > 2:
                self.is_bifurcated_graph = True
                response.warning(
                    "QueryGraph appears to have a fork in it. This might cause trouble"
                )

        #### If this doesn't produce any singletons, then try curie based selection
        if len(singletons) == 0:
            for node_id, node_data in node_info.items():
                if node_data['has_id']:
                    singletons.append(node_data)

        #### If this doesn't produce any singletons, then we don't know how to continue
        if len(singletons) == 0:
            response.error("Unable to understand the query graph",
                           error_code="QueryGraphCircular")
            return response

        #### Try to identify the start_node and the end_node
        start_node = singletons[0]
        if len(nodes) == 1:
            # Just a single node, fine
            pass
        elif len(singletons) < 2:
            response.warning(
                "QueryGraph appears to be circular or has a strange geometry. This might cause trouble"
            )
        elif len(singletons) > 2:
            response.warning(
                "QueryGraph appears to have a fork in it. This might cause trouble"
            )
        else:
            if singletons[0]['has_id'] is True and singletons[1][
                    'has_id'] is False:
                start_node = singletons[0]
            elif singletons[0]['has_id'] is False and singletons[1][
                    'has_id'] is True:
                start_node = singletons[1]
            else:
                start_node = singletons[0]
        #### Hmm, that's not very robust against odd graphs. This needs work. FIXME

        self.node_info = node_info
        self.edge_info = edge_info
        self.start_node = start_node

        current_node = start_node
        node_order = [start_node]
        edge_order = []
        edges = current_node['edges']
        debug = False

        while 1:
            if debug:
                tmp = {
                    'astate': '1',
                    'current_node': current_node,
                    'node_order': node_order,
                    'edge_order': edge_order,
                    'edges': edges
                }
                print(
                    json.dumps(ast.literal_eval(repr(tmp)),
                               sort_keys=True,
                               indent=2))
                print(
                    '=================================================================================='
                )
                tmp = input()

            if len(edges) == 0:
                break
            #if len(edges) > 1:
            if current_node['n_links'] > 1:
                response.error(
                    f"Help, two edges at A583. Don't know what to do: {current_node['n_links']}",
                    error_code="InteralErrorA583")
                return response
            edge_order.append(edges[0])
            previous_node = current_node
            if edges[0]['subject'] == current_node['key']:
                current_node = node_info[edges[0]['object']]
            elif edges[0]['object'] == current_node['key']:
                current_node = node_info[edges[0]['subject']]
            else:
                response.error("Help, edge error A584. Don't know what to do",
                               error_code="InteralErrorA584")
                return response
            node_order.append(current_node)

            #tmp = { 'astate': '2', 'current_node': current_node, 'node_order': node_order, 'edge_order': edge_order, 'edges': edges }
            #print(json.dumps(ast.literal_eval(repr(tmp)),sort_keys=True,indent=2))
            #print('==================================================================================')
            #tmp = input()

            edges = current_node['edges']
            new_edges = []
            for edge in edges:
                key = edge['key']
                if key not in previous_node['edge_dict']:
                    new_edges.append(edge)
            edges = new_edges
            if len(edges) == 0:
                break
            #tmp = { 'astate': '3', 'current_node': current_node, 'node_order': node_order, 'edge_order': edge_order, 'edges': edges }
            #print(json.dumps(ast.literal_eval(repr(tmp)),sort_keys=True,indent=2))
            #print('==================================================================================')
            #tmp = input()

        self.node_order = node_order
        self.edge_order = edge_order

        # Create a text rendering of the QueryGraph geometry for matching against a template
        self.query_graph_templates = {
            'simple': '',
            'detailed': {
                'n_nodes': len(node_order),
                'components': []
            }
        }
        node_index = 0
        edge_index = 0
        #print(json.dumps(ast.literal_eval(repr(node_order)),sort_keys=True,indent=2))
        for node in node_order:
            component_id = f"n{node_index:02}"
            content = ''
            component = {
                'component_type': 'node',
                'component_id': component_id,
                'has_id': node['has_id'],
                'has_category': node['has_category'],
                'category_value': None
            }
            self.query_graph_templates['detailed']['components'].append(
                component)
            if node['has_id']:
                content = 'id'
            elif node['has_category'] and node[
                    'node_object'].category is not None:
                content = f"category={node['node_object'].category}"
                component['category_value'] = node['node_object'].category
            elif node['has_category']:
                content = 'category'
            template_part = f"{component_id}({content})"
            self.query_graph_templates['simple'] += template_part

            # Since queries with intermediate nodes that are not is_set=true tend to blow up, for now, make them is_set=true unless explicitly set to false
            if node_index > 0 and node_index < (self.n_nodes - 1):
                if 'is_set' not in node or node['is_set'] is None:
                    node['node_object'].is_set = True
                    response.warning(
                        f"Setting unspecified is_set to true for {node['key']} because this will probably lead to a happier result"
                    )
                elif node['is_set'] is True:
                    response.debug(
                        f"Value for is_set is already true for {node['key']} so that's good"
                    )
                elif node['is_set'] is False:
                    #response.info(f"Value for is_set is set to false for intermediate node {node['key']}. This could lead to weird results. Consider setting it to true")
                    response.info(
                        f"Value for is_set is false for intermediate node {node['key']}. Setting to true because this will probably lead to a happier result"
                    )
                    node['node_object'].is_set = True
                #else:
                #    response.error(f"Unrecognized value is_set='{node['is_set']}' for {node['key']}. This should be true or false")

            node_index += 1
            if node_index < self.n_nodes:
                #print(json.dumps(ast.literal_eval(repr(node)),sort_keys=True,indent=2))

                #### Extract the has_predicate and predicate_value from the edges of the node
                #### This could fail if there are two edges coming out of the node FIXME
                has_predicate = False
                predicate_value = None
                if 'edges' in node:
                    for related_edge in node['edges']:
                        if related_edge['subject'] == node['key']:
                            has_predicate = related_edge['has_predicate']
                            if has_predicate is True and 'predicate' in related_edge:
                                predicate_value = related_edge['predicate']

                component_id = f"e{edge_index:02}"
                template_part = f"-{component_id}()-"
                self.query_graph_templates['simple'] += template_part
                component = {
                    'component_type': 'edge',
                    'component_id': component_id,
                    'has_id': False,
                    'has_predicate': has_predicate,
                    'predicate_value': predicate_value
                }
                self.query_graph_templates['detailed']['components'].append(
                    component)
                edge_index += 1

        response.debug(
            f"The QueryGraph reference template is: {self.query_graph_templates['simple']}"
        )

        #tmp = { 'node_info': node_info, 'edge_info': edge_info, 'start_node': start_node, 'n_nodes': self.n_nodes, 'n_edges': self.n_edges,
        #    'is_bifurcated_graph': self.is_bifurcated_graph, 'node_order': node_order, 'edge_order': edge_order }
        #print(json.dumps(ast.literal_eval(repr(tmp)),sort_keys=True,indent=2))
        #sys.exit(0)

        #### Return the response
        return response
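For orientation, the 'simple' template assembled above is just the concatenation of node and edge components; a toy reconstruction (component contents invented for illustration):

# Toy reconstruction of the 'simple' QueryGraph template format (values are illustrative)
components = ["n00(id)", "-e00()-", "n01(category=biolink:Protein)"]
simple_template = "".join(components)
print(simple_template)  # n00(id)-e00()-n01(category=biolink:Protein)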
Example #15
    def add_qnode(self, response, input_parameters, describe=False):
        """
        Adds a new QNode object to the QueryGraph inside the Message object
        :return: ARAXResponse object with execution information
        :rtype: ARAXResponse
        """

        # #### Command definition for autogenerated documentation
        command_definition = {
            'dsl_command': 'add_qnode()',
            'description':
            """The `add_qnode` method adds an additional QNode to the QueryGraph in the Message object.""",
            'parameters': {
                'key': {
                    'is_required':
                    False,
                    'examples': ['n00', 'n01'],
                    'default':
                    '',
                    'type':
                    'string',
                    'description':
                    """Any string that is unique among all QNode key fields, with recommended format n00, n01, n02, etc.
                        If no value is provided, autoincrementing values beginning with n00 are used.""",
                },
                'id': {
                    'is_required':
                    False,
                    'examples':
                    ['DOID:9281', '[UniProtKB:P12345,UniProtKB:Q54321]'],
                    'type':
                    'string',
                    'description':
                    'Any compact URI (CURIE) (e.g. DOID:9281) (May also be a list like [UniProtKB:P12345,UniProtKB:Q54321])',
                },
                'name': {
                    'is_required':
                    False,
                    'examples': ['hypertension', 'insulin'],
                    'type':
                    'string',
                    'description':
                    'Any name of a bioentity that will be resolved into a CURIE if possible or result in an error if not (e.g. hypertension, insulin)',
                },
                'category': {
                    'is_required':
                    False,
                    'examples': ['protein', 'chemical_substance', 'disease'],
                    'type':
                    'ARAXnode',
                    'description':
                    'Any valid Translator bioentity category (e.g. protein, chemical_substance, disease)',
                },
                'is_set': {
                    'is_required':
                    False,
                    'enum':
                    ["true", "false", "True", "False", "t", "f", "T", "F"],
                    'examples': ['true', 'false'],
                    'type':
                    'boolean',
                    'description':
                    'If set to true, this QNode represents a set of nodes that are all in common between the two other linked QNodes (assumed to be false if not specified or value is not recognized as true/t case insensitive)'
                },
                'option_group_id': {
                    'is_required':
                    False,
                    'examples': ['1', 'a', 'b2', 'option'],
                    'type':
                    'string',
                    'description':
                    'A group identifier indicating a group of nodes and edges should either all be included or all excluded. An optional match for all elements in this group. If not included Node will be treated as required.'
                },
            }
        }

        if describe:
            return command_definition

        #### Extract the message to work on
        message = response.envelope.message

        #### Basic checks on arguments
        if not isinstance(input_parameters, dict):
            response.error("Provided parameters is not a dict",
                           error_code="ParametersNotDict")
            return response

        #### Define a complete set of allowed parameters and their defaults
        parameters = {
            'key': None,
            'id': None,
            'name': None,
            'category': None,
            'is_set': None,
            'option_group_id': None,
        }

        #### Loop through the input_parameters and override the defaults and make sure they are allowed
        for key, value in input_parameters.items():
            if key not in parameters:
                response.error(f"Supplied parameter {key} is not permitted",
                               error_code="UnknownParameter")
            else:
                parameters[key] = value

        #### Check for option_group_id and is_set:
        if parameters['option_group_id'] is not None and parameters[
                'id'] is None and parameters['name'] is None:
            if parameters['is_set'] is None:
                parameters['is_set'] = 'true'
                response.warning(
                    f"An 'option_group_id' was set to {parameters['option_group_id']}, but 'is_set' was not an included parameter. It must be true when an 'option_group_id' is given, so automatically setting to true. Avoid this warning by explictly setting to true."
                )
            elif not (parameters['is_set'].lower() == 'true'
                      or parameters['is_set'].lower() == 't'):
                response.error(
                    f"When an 'option_group_id' is given 'is_set' must be set to true. However, supplied input for parameter 'is_set' was {parameters['is_set']}.",
                    error_code="InputMismatch")

        #### Return if any of the parameters generated an error (showing not just the first one)
        if response.status != 'OK':
            return response

        #### Now apply the filters. Order of operations is probably quite important
        #### Scalar value filters probably come first like minimum_confidence, then complex logic filters
        #### based on edge or node properties, and then finally maximum_results
        response.info(
            f"Adding a QueryNode to Message with input parameters {parameters}"
        )

        #### Make sure there's a query_graph already here
        if message.query_graph is None:
            message.query_graph = QueryGraph()
            message.query_graph.nodes = {}
            message.query_graph.edges = {}
        if message.query_graph.nodes is None:
            message.query_graph.nodes = {}

        #### Set up the NodeSynonymizer to find curies and names
        synonymizer = NodeSynonymizer()

        # Create the QNode and set the key
        qnode = QNode()
        if parameters['key'] is not None:
            key = parameters['key']
        else:
            key = self.__get_next_free_node_key()

        if parameters['option_group_id'] is not None:
            qnode.option_group_id = parameters['option_group_id']

        # Set the is_set parameter to what the user selected
        if parameters['is_set'] is not None:
            qnode.is_set = (parameters['is_set'].lower() == 'true'
                            or parameters['is_set'].lower() == 't')

        #### If the id is specified, try to find that
        if parameters['id'] is not None:

            # If the id is a scalar then treat it here as a list of one
            if isinstance(parameters['id'], str):
                id_list = [parameters['id']]
                is_id_a_list = False
                if parameters['is_set'] is not None and qnode.is_set is True:
                    response.error(
                        f"Specified id '{parameters['id']}' is a scalar, but is_set=true, which doesn't make sense",
                        error_code="IdScalarButIsSetTrue")
                    return response

            # Or else set it up as a list
            elif isinstance(parameters['id'], list):
                id_list = parameters['id']
                is_id_a_list = True
                qnode.id = []
                if parameters['is_set'] is None:
                    response.warning(
                        f"Specified id '{parameters['id']}' is a list, but is_set was not set to true. It must be true in this context, so automatically setting to true. Avoid this warning by explictly setting to true."
                    )
                    qnode.is_set = True
                else:
                    if qnode.is_set is False:
                        response.warning(
                            f"Specified id '{parameters['id']}' is a list, but is_set=false, which doesn't make sense, so automatically setting to true. Avoid this warning by explicitly setting it to true."
                        )
                        qnode.is_set = True

            # Or if it's neither a list or a string, then error out. This cannot be handled at present
            else:
                response.error(
                    f"Specified id '{parameters['id']}' is neither a string nor a list. This cannot to handled",
                    error_code="IdNotListOrScalar")
                return response

            # Loop over the available ids and create the list
            for id in id_list:
                response.debug(f"Looking up id {id} in NodeSynonymizer")
                synonymizer_results = synonymizer.get_canonical_curies(
                    curies=[id])

                # If nothing was found, we won't bail out, but rather just issue a warning that this id is suspect
                if synonymizer_results[id] is None:
                    response.warning(
                        f"A node with id {id} is not in our knowledge graph KG2, but will continue with it"
                    )
                    if is_id_a_list:
                        qnode.id.append(id)
                    else:
                        qnode.id = id

                # And if it is found, keep the same id but report the preferred id
                else:

                    response.info(f"id {id} is found. Adding it to the qnode")
                    if is_id_a_list:
                        qnode.id.append(id)
                    else:
                        qnode.id = id

                if 'category' in parameters and parameters[
                        'category'] is not None:
                    if isinstance(parameters['category'], str):
                        qnode.category = parameters['category']
                    else:
                        qnode.category = parameters['category'][0]

            message.query_graph.nodes[key] = qnode
            return response

        #### If the name is specified, try to find that
        if parameters['name'] is not None:
            name = parameters['name']
            response.debug(
                f"Looking up id for name '{name}' in NodeSynonymizer")
            synonymizer_results = synonymizer.get_canonical_curies(
                curies=[name], names=[name])

            if synonymizer_results[name] is None:
                response.error(
                    f"A node with name '{name}' is not in our knowledge graph",
                    error_code="UnresolvableNodeName")
                return response

            qnode.id = synonymizer_results[name]['preferred_curie']
            response.info(
                f"Creating QueryNode with id '{qnode.id}' for name '{name}'")
            if parameters['category'] is not None:
                qnode.category = parameters['category']
            message.query_graph.nodes[key] = qnode
            return response

        #### If the category is specified, just add that category. There should be checking that it is legal. FIXME
        if parameters['category'] is not None:
            qnode.category = parameters['category']
            if parameters['is_set'] is not None:
                qnode.is_set = (parameters['is_set'].lower() == 'true')
            message.query_graph.nodes[key] = qnode
            return response

        #### If we get here, it means that all three main parameters are null. Just a generic node with no category or anything. This is okay.
        message.query_graph.nodes[key] = qnode
        return response
Example #16
    else:
        message = response.envelope.message
        target_curie_list += [
            node_key for node_key, _ in message.knowledge_graph.nodes.items()
        ]

if database == 'DTD':
    if len(check_wrong_queries) != 0:
        print(
            f'Something went wrong in these DSL queries: {check_wrong_queries}'
        )
        exit()
    else:
        target_curie_list = list(set(target_curie_list))
        target_curie_list = [
            synonymizer.get_canonical_curies(curie)[curie]['preferred_curie']
            for curie in target_curie_list
            if synonymizer.get_canonical_curies(curie)[curie] is not None
        ]
        # print(target_curie_list)
        if os.path.isfile(DTD_prob_db_file):
            ## pull all data from `DTD_probability_database.db` database
            con = sqlite3.connect(DTD_prob_db_file)
            table = pd.read_sql_query("SELECT * from DTD_PROBABILITY", con)
            con.close()
            drug_list = [
                synonymizer.get_canonical_curies(
                    curie)[curie]['preferred_curie']
                for curie in target_curie_list
                if synonymizer.get_canonical_curies(curie)[curie] is not None
                and synonymizer.get_canonical_curies(
Example #17
def _canonicalize_nodes(
    neo4j_nodes: List[Dict[str, any]]
) -> Tuple[Dict[str, Dict[str, any]], Dict[str, str]]:
    synonymizer = NodeSynonymizer()
    node_ids = [node.get('id') for node in neo4j_nodes if node.get('id')]
    print(
        f"  Sending NodeSynonymizer.get_canonical_curies() {len(node_ids)} curies.."
    )
    canonicalized_info = synonymizer.get_canonical_curies(
        curies=node_ids, return_all_categories=True)
    all_canonical_curies = {
        canonical_info['preferred_curie']
        for canonical_info in canonicalized_info.values() if canonical_info
    }
    print(
        f"  Sending NodeSynonymizer.get_equivalent_nodes() {len(all_canonical_curies)} curies.."
    )
    equivalent_curies_info = synonymizer.get_equivalent_nodes(
        all_canonical_curies)
    recognized_curies = {
        curie
        for curie in equivalent_curies_info
        if equivalent_curies_info.get(curie)
    }
    equivalent_curies_dict = {
        curie: list(equivalent_curies_info.get(curie))
        for curie in recognized_curies
    }
    print(f"  Creating canonicalized nodes..")
    curie_map = dict()
    canonicalized_nodes = dict()
    for neo4j_node in neo4j_nodes:
        # Grab relevant info for this node and its canonical version
        canonical_info = canonicalized_info.get(neo4j_node['id'])
        canonicalized_curie = canonical_info.get(
            'preferred_curie',
            neo4j_node['id']) if canonical_info else neo4j_node['id']
        publications = neo4j_node['publications'] if neo4j_node.get(
            'publications') else []
        descriptions_list = [neo4j_node['description']
                             ] if neo4j_node.get('description') else []
        if canonicalized_curie in canonicalized_nodes:
            # Merge this node into its corresponding canonical node
            existing_canonical_node = canonicalized_nodes[canonicalized_curie]
            existing_canonical_node['publications'] = _merge_two_lists(
                existing_canonical_node['publications'], publications)
            existing_canonical_node['all_names'] = _merge_two_lists(
                existing_canonical_node['all_names'], [neo4j_node['name']])
            existing_canonical_node['descriptions_list'] = _merge_two_lists(
                existing_canonical_node['descriptions_list'],
                descriptions_list)
            # Make sure any nodes subject to #1074-like problems still appear in equivalent curies
            existing_canonical_node['equivalent_curies'] = _merge_two_lists(
                existing_canonical_node['equivalent_curies'],
                [neo4j_node['id']])
            # Add the IRI and description for the 'preferred' curie, if we've found that node
            if neo4j_node['id'] == canonicalized_curie:
                existing_canonical_node['iri'] = neo4j_node.get('iri')
                existing_canonical_node['description'] = neo4j_node.get(
                    'description')
        else:
            # Initiate the canonical node for this synonym group
            name = canonical_info[
                'preferred_name'] if canonical_info else neo4j_node['name']
            category = canonical_info[
                'preferred_category'] if canonical_info else neo4j_node[
                    'category']
            if not category.startswith("biolink:"):
                print(
                    f"  WARNING: Preferred category for {canonicalized_curie} doesn't start with 'biolink:': {category}"
                )
            all_categories = list(
                canonical_info['all_categories']) if canonical_info else [
                    neo4j_node['category']
                ]
            expanded_categories = list(
                canonical_info['expanded_categories']) if canonical_info else [
                    neo4j_node['category']
                ]
            iri = neo4j_node['iri'] if neo4j_node[
                'id'] == canonicalized_curie else None
            description = neo4j_node.get(
                'description'
            ) if neo4j_node['id'] == canonicalized_curie else None
            all_names = [neo4j_node['name']]

            # Check for bug where not all categories in synonymizer were of "biolink:PascalCase" format
            if not all(
                    category.startswith("biolink:")
                    for category in all_categories):
                print(
                    f" WARNING: all_categories for {canonicalized_curie} contain non 'biolink:PascalCase' "
                    f"items: {all_categories}")
            if not all(
                    category.startswith("biolink:")
                    for category in expanded_categories):
                print(
                    f" WARNING: expanded_categories for {canonicalized_curie} contain non 'biolink:PascalCase' "
                    f"items: {expanded_categories}")

            canonicalized_node = _create_node(
                preferred_curie=canonicalized_curie,
                name=name,
                category=category,
                all_categories=all_categories,
                expanded_categories=expanded_categories,
                publications=publications,
                equivalent_curies=equivalent_curies_dict.get(
                    canonicalized_curie, [canonicalized_curie]),
                iri=iri,
                description=description,
                descriptions_list=descriptions_list,
                all_names=all_names)
            canonicalized_nodes[canonicalized_node['id']] = canonicalized_node
        curie_map[neo4j_node['id']] = canonicalized_curie  # Record this mapping for easy lookup later
    return canonicalized_nodes, curie_map
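Note: _merge_two_lists() is called throughout this example but isn't shown in this excerpt. A minimal sketch of what it presumably does (a duplicate-free union of two lists; the real helper may differ, e.g., in ordering guarantees):

def _merge_two_lists(list_a: list, list_b: list) -> list:
    # Combine both lists and drop duplicates; set-based, so input order is not preserved.
    return list(set(list_a).union(set(list_b)))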
Example #18
def report_on_curies_missed_by_local_ngd(kg: str):
    backup_ngd = NormGoogleDistance()
    synonymizer = NodeSynonymizer()
    curie_to_pmid_db = SqliteDict("./curie_to_pmids.sqlite")
    batch_size = 50

    # Get random selection of nodes from the KG
    query = f"match (a) return a.id, a.name, rand() as r order by r limit {batch_size}"
    results = _run_cypher_query(query, kg)
    canonical_curie_info = synonymizer.get_canonical_curies(
        [result['a.id'] for result in results])
    recognized_curies = {
        input_curie
        for input_curie in canonical_curie_info
        if canonical_curie_info.get(input_curie)
    }

    # Figure out which of these local ngd misses
    misses = set()
    for curie in recognized_curies:
        canonical_curie = canonical_curie_info[curie].get('preferred_curie')
        if canonical_curie not in curie_to_pmid_db:
            misses.add(curie)
    percent_missed = round((len(misses) / len(recognized_curies)) * 100) if recognized_curies else 0
    print(f"Local ngd missed {len(misses)} of {len(recognized_curies)} curies ({percent_missed}%)")

    # Try eUtils for each of the curies local ngd missed
    num_eutils_found = 0
    try:
        with open('misses_found_by_eutils.json', 'r') as file_to_add_to:
            found_dict = json.load(file_to_add_to)
    except Exception:
        found_dict = dict()
    for missed_curie in misses:
        # Try eUtils for this node
        node_id = canonical_curie_info[missed_curie].get('preferred_curie')
        node_name = canonical_curie_info[missed_curie].get('preferred_name')
        node_type = canonical_curie_info[missed_curie].get('preferred_type')
        try:
            pmids = backup_ngd.get_pmids_for_all([node_id], [node_name])
        except Exception:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            print(f"ERROR using back-up method: {tb}")
        else:
            if pmids and any(pmid_list for pmid_list in pmids):
                num_eutils_found += 1
                print(f"    Found {len(pmids[0])} PMIDs for {node_id}, {node_name}.")
                found_dict[node_id] = {'name': node_name, 'type': node_type}
            else:
                print(f"    Not found. ({node_id}, {node_name})")

    # Report some findings
    percent_found_by_eutils = round((num_eutils_found / len(misses)) * 100) if misses else 0
    print(f"Eutils found {num_eutils_found} out of {len(misses)} curies that local ngd missed ({percent_found_by_eutils}%)")
    found_types = [
        node_info['type'] for node_id, node_info in found_dict.items()
    ]
    counter = collections.Counter(found_types)
    print(counter)

    # Save the data to a JSON file for access later
    with open('misses_found_by_eutils.json', 'w+') as output_file:
        json.dump(found_dict, output_file)
Example #19
class PredictDrugTreatsDisease:

    #### Constructor
    def __init__(self, response, message, parameters):
        self.response = response
        self.message = message
        self.parameters = parameters
        self.global_iter = 0
        ## check if the new model files exist in /predictor/retrain_data. If not, scp them from arax.ncats.io
        pathlist = os.path.realpath(__file__).split(os.path.sep)
        RTXindex = pathlist.index("RTX")
        filepath = os.path.sep.join([*pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'ARAXQuery', 'Overlay', 'predictor','retrain_data'])

        ## check if there is LogModel.pkl
        pkl_file = f"{filepath}/LogModel.pkl"
        if not os.path.exists(pkl_file):
            os.system("scp [email protected]:/data/orangeboard/databases/KG2.3.4/LogModel.pkl " + pkl_file)

        ## check if there is GRAPH.sqlite
        db_file = f"{filepath}/GRAPH.sqlite"
        if not os.path.exists(db_file):
            os.system("scp [email protected]:/data/orangeboard/databases/KG2.3.4/GRAPH.sqlite " + db_file)

        ## check if there is DTD_probability_database.db
        DTD_prob_db_file = f"{filepath}/DTD_probability_database_v1.0.db"
        if not os.path.exists(DTD_prob_db_file):
            os.system("scp [email protected]:/data/orangeboard/databases/KG2.3.4/DTD_probability_database_v1.0.db " + DTD_prob_db_file)

        # use NodeSynonymizer to replace map.txt
        # check if there is map.txt
        # map_file = f"{filepath}/map.txt"
        # if os.path.exists(map_file):
        #     pass
        # else:
        #     os.system("scp [email protected]:/home/ubuntu/drug_repurposing_model_retrain/map.txt " + map_file)

        self.use_prob_db = True
        if self.use_prob_db is True:
            try:
                self.pred = predictor(DTD_prob_file=DTD_prob_db_file, use_prob_db=True)
            except Exception:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Internal Error encountered connecting to the local DTD prediction database.")
        else:
            try:
                self.pred = predictor(model_file=pkl_file, use_prob_db=False)
            except Exception:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Internal Error encountered connecting to the local LogModel.pkl file.")
            try:
                self.pred.import_file(None, graph_database=db_file)
            except Exception:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Internal Error encountered connecting to the local graph database file.")
        # with open(map_file, 'r') as infile:
        #     map_file_content = infile.readlines()
        #     map_file_content.pop(0) ## remove title
        #     self.known_curies = set(line.strip().split('\t')[0] for line in map_file_content)

        self.synonymizer = NodeSynonymizer()

    def convert_to_trained_curies(self, input_curie):
        """
        Takes an input curie from the KG, runs it through the synonymizer, and returns its canonical
        curie info dict (or None if the synonymizer doesn't recognize the curie)
        """
        normalizer_result = self.synonymizer.get_canonical_curies(input_curie)
        curies_in_model = normalizer_result.get(input_curie)  # None if the synonymizer didn't recognize the curie
        # curies_in_model = [curie for curie in curies_in_model if curie in self.known_curies]
        # equivalent_curies = []  # start with empty equivalent_curies
        # try:
        #     equivalent_curies = [x['identifier'] for x in normalizer_result[input_curie]['equivalent_identifiers']]
        # except:
        #     self.response.warning(f"NodeSynonmizer could not find curies for {input_curie}, skipping this one.")
        # for curie in equivalent_curies:
        #     curie_prefix = curie.split(':')[0]
        #     # FIXME: fix this when re-training the ML model, as when this was originally trained, it was ChEMBL:123, not CHEMBL.COMPOUND:CHEMBL123
        #     if curie_prefix == "CHEMBL.COMPOUND":
        #         chembl_fix = 'ChEMBL:' + curie[22:]
        #         if chembl_fix in self.known_curies:
        #             curies_in_model.add(chembl_fix)
        #     elif curie in self.known_curies:
        #         curies_in_model.add(curie)
        return curies_in_model

    def predict_drug_treats_disease(self):
        """
        Iterate over all the edges in the knowledge graph, add the drug-disease treatment probability for appropriate edges
        on the edge_attributes
        :return: response
        """
        parameters = self.parameters
        self.response.debug(f"Computing drug disease treatment probability based on a machine learning model")
        self.response.info(f"Computing drug disease treatment probability based on a machine learning model: See [this publication](https://doi.org/10.1101/765305) for more details about how this is accomplished.")

        attribute_name = "probability_treats"
        attribute_type = "EDAM:data_0951"
        value = 0  # default value; if the model returns 0 (or the default is left in place), that edge is not decorated
        url = "https://doi.org/10.1101/765305"

        # if you want to add virtual edges, identify the source/targets, decorate the edges, add them to the KG, and then add one to the QG corresponding to them
        if 'virtual_relation_label' in parameters:
            source_curies_to_decorate = set()
            target_curies_to_decorate = set()
            curie_to_name = dict()
            # identify the nodes that we should be adding virtual edges for
            for node_key, node in self.message.knowledge_graph.nodes.items():
                if hasattr(node, 'qnode_keys'):
                    if parameters['subject_qnode_key'] in node.qnode_keys:
                        if "drug" in node.category or "chemical_substance" in node.category or "biolink:Drug" in node.category or "biolink:ChemicalSubstance" in node.category:  # this is now NOT checked by ARAX_overlay
                            source_curies_to_decorate.add(node_key)
                            curie_to_name[node_key] = node.name
                    if parameters['object_qnode_key'] in node.qnode_keys:
                        if "disease" in node.category or "phenotypic_feature" in node.category or "biolink:Disease" in node.category or "biolink:PhenotypicFeature" in node.category:  # this is now NOT checked by ARAX_overlay
                            target_curies_to_decorate.add(node_key)
                            curie_to_name[node_key] = node.name

            added_flag = False  # check to see if any edges were added
            # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute

            for (source_curie, target_curie) in itertools.product(source_curies_to_decorate, target_curies_to_decorate):
                self.response.debug(f"Predicting probability that {curie_to_name[source_curie]} treats {curie_to_name[target_curie]}")
                # create the edge attribute if possible
                # loop over all equivalent curies and take the highest probability

                max_probability = 0
                converted_source_curie = self.convert_to_trained_curies(source_curie)
                if converted_source_curie is None:
                    continue
                else:
                    preferred_type = converted_source_curie['preferred_type']
                    if preferred_type == "drug" or preferred_type == "chemical_substance" or preferred_type == "biolink:Drug" or preferred_type == "biolink:ChemicalSubstance":
                        converted_source_curie = converted_source_curie['preferred_curie']
                    else:
                        continue
                converted_target_curie = self.convert_to_trained_curies(target_curie)
                if converted_target_curie is None:
                    continue
                else:
                    preferred_type = converted_target_curie['preferred_type']
                    if preferred_type == "disease" or preferred_type == "phenotypic_feature" or preferred_type == "biolink:Disease" or preferred_type == "biolink:PhenotypicFeature":
                        converted_target_curie = converted_target_curie['preferred_curie']
                    else:
                        continue
                if self.use_prob_db is True:
                    probability = self.pred.get_prob_from_DTD_db(converted_source_curie, converted_target_curie)
                    if probability is not None:
                        if np.isfinite(probability):
                            max_probability = probability
                else:
                    probability = self.pred.prob_single(converted_source_curie, converted_target_curie)
                    if probability is not None:
                        probability = probability[0]
                        if np.isfinite(probability):
                            max_probability = probability
                # if len(res) != 0:
                #     all_probabilities = self.pred.prob_all(res)
                #     if isinstance(all_probabilities, list):
                #         max_probability = max([value for value in all_probabilities if np.isfinite(value)])

                value = max_probability

                #probability = self.pred.prob_single('ChEMBL:' + source_curie[22:], target_curie)  # FIXME: when this was trained, it was ChEMBL:123, not CHEMBL.COMPOUND:CHEMBL123
                #if probability and np.isfinite(probability):  # finite, that's ok, otherwise, stay with default
                #    value = probability[0]
                edge_attribute = EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)  # populate the edge attribute
                if edge_attribute and value != 0:
                    added_flag = True
                    # make the edge, add the attribute

                    # edge properties
                    now = datetime.now()
                    edge_type = "biolink:probably_treats"
                    qedge_keys = [parameters['virtual_relation_label']]
                    relation = parameters['virtual_relation_label']
                    is_defined_by = "ARAX"
                    defined_datetime = now.strftime("%Y-%m-%d %H:%M:%S")
                    provided_by = "ARAX"
                    confidence = None
                    weight = None  # TODO: could make the actual value of the attribute
                    subject_key = source_curie
                    object_key = target_curie

                    # now actually add the virtual edges in
                    id = f"{relation}_{self.global_iter}"
                    self.global_iter += 1
                    edge_attribute_list = [
                        edge_attribute,
                        EdgeAttribute(name="is_defined_by", value=is_defined_by, type="ARAX_TYPE_PLACEHOLDER"),
                        EdgeAttribute(name="defined_datetime", value=defined_datetime, type="metatype:Datetime"),
                        EdgeAttribute(name="provided_by", value=provided_by, type="biolink:provided_by"),
                        #EdgeAttribute(name="confidence", value=confidence, type="biolink:ConfidenceLevel"),
                        #EdgeAttribute(name="weight", value=weight, type="metatype:Float")
                    ]
                    edge = Edge(predicate=edge_type, subject=subject_key, object=object_key, relation=relation,
                                attributes=edge_attribute_list)
                    edge.qedge_keys = qedge_keys
                    self.message.knowledge_graph.edges[id] = edge

            # Now add a q_edge to the query_graph since I've added an extra edge to the KG
            if added_flag:
                edge_type = "biolink:probably_treats"
                relation = parameters['virtual_relation_label']
                subject_qnode_key = parameters['subject_qnode_key']
                object_qnode_key = parameters['object_qnode_key']
                option_group_id = ou.determine_virtual_qedge_option_group(subject_qnode_key, object_qnode_key, self.message.query_graph, self.response)
                q_edge = QEdge(predicate=edge_type, relation=relation, subject=subject_qnode_key, object=object_qnode_key, option_group_id=option_group_id)
                self.message.query_graph.edges[relation] = q_edge
            return self.response

        else:  # you want to add it for each edge in the KG
            # iterate over KG edges, add the information
            try:
                # map curies to types
                curie_to_type = dict()
                curie_to_name = dict()
                for node_key, node in self.message.knowledge_graph.nodes.items():
                    curie_to_type[node_key] = node.category
                    curie_to_name[node_key] = node.name
                # then iterate over the edges and decorate if appropriate
                for edge_key, edge in self.message.knowledge_graph.edges.items():
                    # Make sure the edge_attributes are not None
                    if not edge.attributes:
                        edge.attributes = []  # make sure attributes is a list so we can append to it
                    # now go and actually get the probability
                    source_curie = edge.subject
                    target_curie = edge.object
                    source_types = curie_to_type[source_curie]
                    target_types = curie_to_type[target_curie]
                    if (("drug" in source_types) or ("chemical_substance" in source_types) or ("biolink:Drug" in source_types) or ("biolink:ChemicalSubstance" in source_types)) and (("disease" in target_types) or ("phenotypic_feature" in target_types) or ("biolink:Disease" in target_types) or ("biolink:PhenotypicFeature" in target_types)):
                        # loop over all pairs of equivalent curies and take the highest probability
                        self.response.debug(f"Predicting treatment probability between {curie_to_name[source_curie]} and {curie_to_name[target_curie]}")
                        max_probability = 0
                        converted_source_curie = self.convert_to_trained_curies(source_curie)
                        if converted_source_curie is None:
                            continue
                        else:
                            preferred_type = converted_source_curie['preferred_type']
                            if preferred_type == "drug" or preferred_type == "chemical_substance" or preferred_type == "biolink:Drug" or preferred_type == "biolink:ChemicalSubstance":
                                converted_source_curie = converted_source_curie['preferred_curie']
                            else:
                                continue
                        converted_target_curie = self.convert_to_trained_curies(target_curie)
                        if converted_target_curie is None:
                            continue
                        else:
                            preferred_type = converted_target_curie['preferred_type']
                            if preferred_type == "disease" or preferred_type == "phenotypic_feature" or preferred_type == "biolink:Disease" or preferred_type == "biolink:PhenotypicFeature":
                                converted_target_curie = converted_target_curie['preferred_curie']
                            else:
                                continue
                        if self.use_prob_db is True:
                            probability = self.pred.get_prob_from_DTD_db(converted_source_curie, converted_target_curie)
                            if probability is not None:
                                if np.isfinite(probability):
                                    max_probability = probability
                        else:
                            probability = self.pred.prob_single(converted_source_curie, converted_target_curie)
                            if probability is not None:
                                probability = probability[0]
                                if np.isfinite(probability):
                                    max_probability = probability
                        # res = list(itertools.product(converted_source_curie, converted_target_curie))
                        # if len(res) != 0:
                        #     all_probabilities = self.pred.prob_all(res)
                        #     if isinstance(all_probabilities, list):
                        #         max_probability = max([value for value in all_probabilities if np.isfinite(value)])

                        value = max_probability

                        #probability = self.pred.prob_single('ChEMBL:' + source_curie[22:], target_curie)  # FIXME: when this was trained, it was ChEMBL:123, not CHEMBL.COMPOUND:CHEMBL123
                        #if probability and np.isfinite(probability):  # finite, that's ok, otherwise, stay with default
                        #    value = probability[0]
                    elif (("drug" in target_types) or ("chemical_substance" in target_types) or ("biolink:Drug" in target_types) or ("biolink:ChemicalSubstance" in target_types)) and (("disease" in source_types) or ("phenotypic_feature" in source_types) or ("biolink:Disease" in source_types) or ("biolink:PhenotypicFeature" in source_types)):
                        #probability = self.pred.prob_single('ChEMBL:' + target_curie[22:], source_curie)  # FIXME: when this was trained, it was ChEMBL:123, not CHEMBL.COMPOUND:CHEMBL123
                        #if probability and np.isfinite(probability):  # finite, that's ok, otherwise, stay with default
                        #    value = probability[0]
                        self.response.debug(f"Predicting treatment probability between {curie_to_name[source_curie]} and {curie_to_name[target_curie]}")
                        max_probability = 0
                        converted_source_curie = self.convert_to_trained_curies(source_curie)
                        if converted_source_curie is None:
                            continue
                        else:
                            preferred_type = converted_source_curie['preferred_type']
                            if preferred_type == "disease" or preferred_type == "phenotypic_feature" or preferred_type == "biolink:Disease" or preferred_type == "biolink:PhenotypicFeature":
                                converted_source_curie = converted_source_curie['preferred_curie']
                            else:
                                continue
                        converted_target_curie = self.convert_to_trained_curies(target_curie)
                        if converted_target_curie is None:
                            continue
                        else:
                            preferred_type = converted_target_curie['preferred_type']
                            if preferred_type == "drug" or preferred_type == "chemical_substance" or preferred_type == "biolink:Drug" or preferred_type == "biolink:ChemicalSubstance":
                                converted_target_curie = converted_target_curie['preferred_curie']
                            else:
                                continue

                        if self.use_prob_db is True:
                            probability = self.pred.get_prob_from_DTD_db(converted_target_curie, converted_source_curie)
                            if probability is not None:
                                if np.isfinite(probability):
                                    max_probability = probability
                        else:
                            probability = self.pred.prob_single(converted_target_curie, converted_source_curie)
                            if probability is not None:
                                probability = probability[0]
                                if np.isfinite(probability):
                                    max_probability = probability
                        # res = list(itertools.product(converted_target_curie, converted_source_curie))
                        # if len(res) != 0:
                        #     all_probabilities = self.pred.prob_all(res)
                        #     if isinstance(all_probabilities, list):
                        #         max_probability = max([value for value in all_probabilities if np.isfinite(value)])

                        value = max_probability

                    else:
                        continue
                    if value != 0:
                        edge_attribute = EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)  # populate the attribute
                        edge.attributes.append(edge_attribute)  # append it to the list of attributes
            except Exception:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Something went wrong adding the drug disease treatment probability")
            else:
                self.response.info(f"Drug disease treatment probability successfully added to edges")

            return self.response
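An observation on the class above: the four-way category checks (drug/chemical vs. disease/phenotype) are repeated in several places. A possible simplification, not part of the original code, would be module-level category sets plus a small membership helper:

DRUG_CATEGORIES = {"drug", "chemical_substance", "biolink:Drug", "biolink:ChemicalSubstance"}
DISEASE_CATEGORIES = {"disease", "phenotypic_feature", "biolink:Disease", "biolink:PhenotypicFeature"}

def has_any_category(node_categories, wanted_categories) -> bool:
    # True if any of the node's categories is in the wanted set.
    return any(category in wanted_categories for category in node_categories)

Each long if-condition would then reduce to something like has_any_category(source_types, DRUG_CATEGORIES) and has_any_category(target_types, DISEASE_CATEGORIES).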
Example #20
def _canonicalize_nodes(nodes: List[Dict[str, any]]) -> Tuple[List[Dict[str, any]], Dict[str, str]]:
    synonymizer = NodeSynonymizer()
    node_ids = [node.get('id') for node in nodes if node.get('id')]
    print(
        f"  Sending NodeSynonymizer.get_canonical_curies() a list of {len(node_ids)} curies.."
    )
    canonicalized_info = synonymizer.get_canonical_curies(
        curies=node_ids, return_all_types=True)
    print(f"  Creating canonicalized nodes..")
    curie_map = dict()
    canonicalized_nodes = dict()
    for node in nodes:
        canonical_info = canonicalized_info.get(node['id'])
        canonicalized_curie = canonical_info.get('preferred_curie', node['id']) if canonical_info else node['id']
        node['publications'] = _literal_eval_list(node['publications'])  # Only need to do this until kg2.2+ is rolled out
        if canonicalized_curie in canonicalized_nodes:
            existing_canonical_node = canonicalized_nodes[canonicalized_curie]
            existing_canonical_node['publications'] = _merge_two_lists(
                existing_canonical_node['publications'], node['publications'])
        else:
            if canonical_info:
                canonicalized_node = {
                    'id': canonicalized_curie,
                    'name': canonical_info.get('preferred_name', node['name']),
                    'types': list(canonical_info.get('all_types')),
                    'preferred_type': canonical_info.get('preferred_type', node['category_label']),
                    'publications': node['publications']
                }
            else:
                canonicalized_node = {
                    'id': canonicalized_curie,
                    'name': node['name'],
                    'types': [node['category_label']],
                    'preferred_type': node['category_label'],
                    'publications': node['publications']
                }
            canonicalized_nodes[canonicalized_node['id']] = canonicalized_node
        curie_map[node['id']] = canonicalized_curie  # Record this mapping for easy lookup later

    # Create a node containing information about this KG2C build
    new_build_node = {
        'id': 'RTX:KG2C',
        'name': f"KG2C:Build created on {datetime.now().strftime('%Y-%m-%d %H:%M')}",
        'types': ['data_file'],
        'preferred_type': 'data_file',
        'publications': []
    }
    canonicalized_nodes[new_build_node['id']] = new_build_node

    # Decorate nodes with equivalent curies
    print(
        f"  Sending NodeSynonymizer.get_equivalent_nodes() a list of {len(canonicalized_nodes)} curies.."
    )
    equivalent_curies_dict = synonymizer.get_equivalent_nodes(list(canonicalized_nodes.keys()))
    for curie, canonical_node in canonicalized_nodes.items():
        equivalent_curies_for_curie = equivalent_curies_dict.get(curie)
        canonical_node['equivalent_curies'] = list(equivalent_curies_for_curie) if equivalent_curies_for_curie is not None else []

    # Convert array fields into the format neo4j wants and do final processing
    for canonicalized_node in canonicalized_nodes.values():
        canonicalized_node['types'] = _convert_list_to_neo4j_format(canonicalized_node['types'])
        canonicalized_node['publications'] = _convert_list_to_neo4j_format(canonicalized_node['publications'])
        canonicalized_node['equivalent_curies'] = _convert_list_to_neo4j_format(canonicalized_node['equivalent_curies'])
        canonicalized_node['preferred_type_for_conversion'] = canonicalized_node['preferred_type']
    return list(canonicalized_nodes.values()), curie_map
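Note: _literal_eval_list() (used near the top of this example) isn't shown in this excerpt. Given its name and the fact that it is applied to a stringified publications field, a plausible sketch is:

import ast

def _literal_eval_list(list_string: str) -> list:
    # Parse a stringified Python list (e.g., "['PMID:123']") back into a real list;
    # fall back to an empty list for missing or malformed input.
    try:
        return ast.literal_eval(list_string) if list_string else []
    except (ValueError, SyntaxError):
        return []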
Example #21
args = parser.parse_args()

curie_type = ast.literal_eval(args.CurieType)  # safer than eval() for parsing a literal list of types (requires `import ast`)
NodeNamesDescriptions = pd.read_csv(
    args.NodeDescriptionFile,
    sep='\t',
    header=None,
    names=['curie', 'name', 'full_name', 'type'])
NodeNamesDescriptions = NodeNamesDescriptions.loc[NodeNamesDescriptions.type.isin(curie_type), :].reset_index(drop=True)

preferred_synonyms = dict()
synonymizer = NodeSynonymizer()

for curie in NodeNamesDescriptions['curie']:
    preferred_curie = synonymizer.get_canonical_curies(curies=curie)[curie]
    if preferred_curie is None:
        print(f"{curie} doesn't have a preferred curie", flush=True)
    else:
        preferred_id = preferred_curie['preferred_curie']
        if preferred_id not in preferred_synonyms:
            preferred_synonyms[preferred_id] = dict()
            preferred_synonyms[preferred_id]['preferred_name'] = preferred_curie['preferred_name']
            preferred_synonyms[preferred_id]['preferred_type'] = preferred_curie['preferred_category']
            preferred_synonyms[preferred_id]['synonyms'] = [curie]
        else:
            synonyms = set(preferred_synonyms[preferred_id]['synonyms'])
            synonyms.add(curie)
            preferred_synonyms[preferred_id]['synonyms'] = list(synonyms)  # write the merged set back
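A possible optimization for the loop above (not in the original script): get_canonical_curies() accepts a list, so the per-curie calls could be replaced with one batched call, mirroring the other examples in this document:

# Hypothetical batched variant; one synonymizer round-trip instead of one per curie.
all_curies = list(NodeNamesDescriptions['curie'])
batched_info = synonymizer.get_canonical_curies(curies=all_curies)
for curie, preferred_curie in batched_info.items():
    ...  # same per-curie handling as above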
Example #22
class NGDDatabaseBuilder:
    def __init__(self, is_test):
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s %(levelname)s: %(message)s',
                            handlers=[
                                logging.FileHandler("ngdbuild.log"),
                                logging.StreamHandler()
                            ])
        self.pubmed_directory_path = f"{NGD_DIR}/pubmed_xml_files"
        self.conceptname_to_pmids_db_name = "conceptname_to_pmids.db"
        self.conceptname_to_pmids_db_path = f"{NGD_DIR}/{self.conceptname_to_pmids_db_name}"
        self.curie_to_pmids_db_name = "curie_to_pmids.sqlite"
        self.curie_to_pmids_db_path = f"{NGD_DIR}/{self.curie_to_pmids_db_name}"
        self.status = 'OK'
        self.synonymizer = NodeSynonymizer()
        self.is_test = is_test

    def build_ngd_database(self, do_full_build: bool):
        if do_full_build:
            self.build_conceptname_to_pmids_db()
        else:
            conceptname_to_pmids_db = pathlib.Path(
                self.conceptname_to_pmids_db_path)
            if not conceptname_to_pmids_db.exists():
                logging.error(
                    f"You did not specify to do a full build, but the artifact necessary for a partial "
                    f"build ({self.conceptname_to_pmids_db_name}) does not yet exist. Either use --full "
                    f"to do a full build or put your {self.conceptname_to_pmids_db_name} into the right "
                    f"place ({self.conceptname_to_pmids_db_path}).")
                self.status = "ERROR"
        if self.status == 'OK':
            self.build_curie_to_pmids_db()

    def build_conceptname_to_pmids_db(self):
        # This function extracts curie -> PMIDs mappings from the latest Pubmed XML files (saves data in a pickle DB)
        logging.info(
            f"Starting to build {self.conceptname_to_pmids_db_name} from pubmed files.."
        )
        start = time.time()
        logging.info(f" Deleting any pre-existing Pubmed files..")
        subprocess.call(["rm", "-rf", self.pubmed_directory_path])
        logging.info(
            f" Downloading latest Pubmed XML files (baseline and update files).."
        )
        subprocess.check_call([
            "wget", "-r", "ftp://ftp.ncbi.nlm.nih.gov/pubmed", "-P",
            self.pubmed_directory_path
        ])
        for sub_dir_name in ["baseline", "updatefiles"]:
            xml_file_sub_dir = f"{self.pubmed_directory_path}/ftp.ncbi.nlm.nih.gov/pubmed/{sub_dir_name}"
            all_file_names = [
                os.fsdecode(file) for file in os.listdir(xml_file_sub_dir)
            ]
            pubmed_file_names = [
                file_name for file_name in all_file_names
                if file_name.lower().startswith('pubmed')
                and file_name.lower().endswith('.xml.gz')
            ]

            # Make sure the files seem to have been downloaded ok
            if not pubmed_file_names:
                if sub_dir_name == "baseline":
                    logging.error(
                        "Couldn't find any PubMed baseline XML files to scrape. Something must've gone wrong "
                        "downloading them.")
                    self.status = 'ERROR'
                    return
                else:
                    logging.warning(
                        f"No Pubmed 'update' files detected. This might be ok (it's possible none exist), "
                        f"but it's a little weird.")

            logging.info(f" Starting to process {sub_dir_name} PubMed files..")
            conceptname_to_pmids_map = dict()
            # Go through each downloaded pubmed file and build our dictionary of mappings
            pubmed_file_names_to_process = pubmed_file_names if not self.is_test else pubmed_file_names[:1]
            for file_name in pubmed_file_names_to_process:
                logging.info(
                    f"  Starting to process file '{file_name}'.. ({pubmed_file_names_to_process.index(file_name) + 1}"
                    f" of {len(pubmed_file_names_to_process)})")
                file_start_time = time.time()
                with gzip.open(
                        f"{xml_file_sub_dir}/{file_name}") as pubmed_file:
                    file_contents_tree = etree.parse(pubmed_file)
                pubmed_articles = file_contents_tree.xpath("//PubmedArticle")

                for article in pubmed_articles:
                    # Link each concept name to the PMID of this article
                    current_pmid = article.xpath(
                        ".//MedlineCitation/PMID/text()")[0]
                    descriptor_names = article.xpath(
                        ".//MedlineCitation/MeshHeadingList/MeshHeading/DescriptorName/text()"
                    )
                    qualifier_names = article.xpath(
                        ".//MedlineCitation/MeshHeadingList/MeshHeading/QualifierName/text()"
                    )
                    chemical_names = article.xpath(
                        ".//MedlineCitation/ChemicalList/Chemical/NameOfSubstance/text()"
                    )
                    gene_symbols = article.xpath(
                        ".//MedlineCitation/GeneSymbolList/GeneSymbol/text()")
                    keywords = article.xpath(
                        ".//MedlineCitation/KeywordList/Keyword/text()")
                    all_concept_names = descriptor_names + qualifier_names + chemical_names + gene_symbols + keywords
                    unique_concept_names = {
                        concept_name
                        for concept_name in all_concept_names if concept_name
                    }
                    for concept_name in unique_concept_names:
                        self._add_pmids_mapping(concept_name, current_pmid,
                                                conceptname_to_pmids_map)

                self._destroy_etree(
                    file_contents_tree)  # Hack around lxml memory leak
                logging.info(
                    f"    took {round((time.time() - file_start_time) / 60, 2)} minutes"
                )

        # Save the data to the PickleDB after we're done
        logging.info("  Loading data into PickleDB..")
        conceptname_to_pmids_db = pickledb.load(
            self.conceptname_to_pmids_db_path, False)
        for concept_name, pmid_list in conceptname_to_pmids_map.items():
            conceptname_to_pmids_db.set(
                concept_name,
                list({
                    self._create_pmid_curie_from_local_id(pmid)
                    for pmid in pmid_list
                }))
        logging.info("  Saving PickleDB file..")
        conceptname_to_pmids_db.dump()
        logging.info(
            f"Done! Building {self.conceptname_to_pmids_db_name} took {round(((time.time() - start) / 60) / 60, 3)} hours"
        )

    def build_curie_to_pmids_db(self):
        # This function creates a final sqlite database of curie->PMIDs mappings using data scraped from Pubmed AND KG2
        logging.info(f"Starting to build {self.curie_to_pmids_db_name}..")
        start = time.time()
        curie_to_pmids_map = dict()
        self._add_pmids_from_pubmed_scrape(curie_to_pmids_map)
        if self.status != 'OK':
            return
        self._add_pmids_from_kg2_edges(curie_to_pmids_map)
        self._add_pmids_from_kg2_nodes(curie_to_pmids_map)
        logging.info(
            f"  In the end, found PMID lists for {len(curie_to_pmids_map)} (canonical) curies"
        )
        self._save_data_in_sqlite_db(curie_to_pmids_map)
        logging.info(
            f"Done! Building {self.curie_to_pmids_db_name} took {round((time.time() - start) / 60)} minutes."
        )

    # Helper methods

    def _add_pmids_from_kg2_edges(self, curie_to_pmids_map):
        logging.info(f"  Getting PMIDs from edges in KG2 neo4j..")
        edge_query = f"match (n)-[e]->(m) where e.publications is not null " \
                     f"return distinct n.id, m.id, e.publications{' limit 100' if self.is_test else ''}"
        edge_results = self._run_cypher_query(edge_query)
        logging.info(f"  Processing results..")
        node_ids = {result['n.id'] for result in edge_results}.union(
            result['m.id'] for result in edge_results)
        canonicalized_curies_dict = self._get_canonicalized_curies_dict(
            list(node_ids))
        for result in edge_results:
            canonicalized_node_ids = {
                canonicalized_curies_dict[result['n.id']],
                canonicalized_curies_dict[result['m.id']]
            }
            pmids = self._extract_and_format_pmids(result['e.publications'])
            if pmids:  # Sometimes publications list includes only non-PMID identifiers (like ISBN)
                for canonical_curie in canonicalized_node_ids:
                    self._add_pmids_mapping(canonical_curie, pmids,
                                            curie_to_pmids_map)

    def _add_pmids_from_kg2_nodes(self, curie_to_pmids_map):
        logging.info(f"  Getting PMIDs from nodes in KG2 neo4j..")
        node_query = f"match (n) where n.publications is not null " \
                     f"return distinct n.id, n.publications{' limit 100' if self.is_test else ''}"
        node_results = self._run_cypher_query(node_query)
        logging.info(f"  Processing results..")
        node_ids = {result['n.id'] for result in node_results}
        canonicalized_curies_dict = self._get_canonicalized_curies_dict(
            list(node_ids))
        for result in node_results:
            canonical_curie = canonicalized_curies_dict[result['n.id']]
            pmids = self._extract_and_format_pmids(result['n.publications'])
            if pmids:  # Sometimes publications list includes only non-PMID identifiers (like ISBN)
                self._add_pmids_mapping(canonical_curie, pmids,
                                        curie_to_pmids_map)

    def _add_pmids_from_pubmed_scrape(self, curie_to_pmids_map):
        # Load the data from the first half of the build process (scraping pubmed)
        logging.info(
            f"  Loading pickle DB containing pubmed scrapings ({self.conceptname_to_pmids_db_name}).."
        )
        conceptname_to_pmids_db = pickledb.load(
            self.conceptname_to_pmids_db_path, False)
        if not conceptname_to_pmids_db.getall():
            logging.error(
                f"{self.conceptname_to_pmids_db_name} must exist in order to do a partial build. Use "
                f"--full to do a full build or put your {self.conceptname_to_pmids_db_name} into the right"
                f" place ({self.conceptname_to_pmids_db_path}).")
            self.status = 'ERROR'
            return

        # Get canonical curies for all of the concept names in our big pubmed pickleDB using the NodeSynonymizer
        concept_names = list(conceptname_to_pmids_db.getall())
        logging.info(
            f"  Sending NodeSynonymizer.get_canonical_curies() a list of {len(concept_names)} concept names.."
        )
        canonical_curies_dict = self.synonymizer.get_canonical_curies(
            names=concept_names)
        logging.info(
            f"  Got results back from NodeSynonymizer. (Returned dict contains {len(canonical_curies_dict)} keys.)"
        )

        # Map all of the concept names scraped from pubmed to curies
        if canonical_curies_dict:
            recognized_concepts = {
                concept
                for concept in canonical_curies_dict
                if canonical_curies_dict.get(concept)
            }
            logging.info(
                f"  NodeSynonymizer recognized {round((len(recognized_concepts) / len(concept_names)) * 100)}%"
                f" of concept names scraped from pubmed.")
            # Store which concept names the NodeSynonymizer didn't know about, for learning purposes
            unrecognized_concepts = set(canonical_curies_dict).difference(
                recognized_concepts)
            with open(f"{NGD_DIR}/unrecognized_pubmed_concept_names.txt",
                      "w+") as unrecognized_concepts_file:
                unrecognized_concepts_file.write(f"{unrecognized_concepts}")
            logging.info(
                f"  Unrecognized concept names were written to unrecognized_pubmed_concept_names.txt."
            )

            # Map the canonical curie for each recognized concept to the concept's PMID list
            logging.info(f"  Mapping canonical curies to PMIDs..")
            for concept_name in recognized_concepts:
                canonical_curie = canonical_curies_dict[concept_name].get(
                    'preferred_curie')
                pmids_for_this_concept = conceptname_to_pmids_db.get(
                    concept_name)
                self._add_pmids_mapping(canonical_curie,
                                        pmids_for_this_concept,
                                        curie_to_pmids_map)
            logging.info(
                f"  Mapped {len(curie_to_pmids_map)} canonical curies to PMIDs based on pubmed scrapings."
            )
        else:
            logging.error(f"NodeSynonymizer didn't return anything!")
            self.status = 'ERROR'

    def _save_data_in_sqlite_db(self, curie_to_pmids_map):
        logging.info("  Loading data into sqlite database..")
        # Remove any preexisting version of this database
        if os.path.exists(self.curie_to_pmids_db_path):
            os.remove(self.curie_to_pmids_db_path)
        connection = sqlite3.connect(self.curie_to_pmids_db_path)
        cursor = connection.cursor()
        cursor.execute("CREATE TABLE curie_to_pmids (curie TEXT, pmids TEXT)")
        cursor.execute(
            "CREATE UNIQUE INDEX unique_curie ON curie_to_pmids (curie)")
        logging.info(f"  Gathering row data..")
        rows = [
            [curie, json.dumps(list(filter(None, {self._get_local_id_as_int(pmid) for pmid in pmids})))]
            for curie, pmids in curie_to_pmids_map.items()
        ]
        rows_in_chunks = self._divide_list_into_chunks(rows, 5000)
        logging.info(f"  Inserting row data into database..")
        for chunk in rows_in_chunks:
            cursor.executemany(
                f"INSERT INTO curie_to_pmids (curie, pmids) VALUES (?, ?)",
                chunk)
            connection.commit()
        # Log how many rows we've added in the end (for debugging purposes)
        cursor.execute(f"SELECT COUNT(*) FROM curie_to_pmids")
        count = cursor.fetchone()[0]
        logging.info(
            f"  Done saving data in sqlite; database contains {count} rows.")
        cursor.close()

    def _get_canonicalized_curies_dict(self,
                                       curies: List[str]) -> Dict[str, str]:
        logging.info(
            f"  Sending a batch of {len(curies)} curies to NodeSynonymizer.get_canonical_curies()"
        )
        canonicalized_nodes_info = self.synonymizer.get_canonical_curies(
            curies)
        canonicalized_curies_dict = dict()
        for input_curie, preferred_info_dict in canonicalized_nodes_info.items():
            if preferred_info_dict:
                canonicalized_curies_dict[input_curie] = preferred_info_dict.get('preferred_curie', input_curie)
            else:
                canonicalized_curies_dict[input_curie] = input_curie
        logging.info(f"  Got results back from synonymizer")
        return canonicalized_curies_dict

    def _extract_and_format_pmids(self, publications: List[str]) -> List[str]:
        pmids = {
            publication_id
            for publication_id in publications
            if publication_id.upper().startswith('PMID')
        }
        # Make sure all PMIDs are given in same format (e.g., PMID:18299583 rather than PMID18299583)
        formatted_pmids = [
            self._create_pmid_curie_from_local_id(
                pmid.replace('PMID', '').replace(':', '')) for pmid in pmids
        ]
        return formatted_pmids

    @staticmethod
    def _add_pmids_mapping(key: str, value_to_append: Union[str, List[str]],
                           mappings_dict: Dict[str, List[str]]):
        if key not in mappings_dict:
            mappings_dict[key] = []
        if isinstance(value_to_append, list):
            mappings_dict[key] += value_to_append
        else:
            mappings_dict[key].append(value_to_append)

    @staticmethod
    def _create_pmid_curie_from_local_id(pmid):
        return f"PMID:{pmid}"

    @staticmethod
    def _get_local_id_as_int(curie):
        # Converts "PMID:1234" to 1234
        curie_pieces = curie.split(":")
        local_id_str = curie_pieces[-1]
        # Remove any strange characters (like in "PMID:_19960544")
        stripped_id_str = "".join(
            [character for character in local_id_str if character.isdigit()])
        return int(stripped_id_str) if stripped_id_str else None

    @staticmethod
    def _destroy_etree(file_contents_tree):
        # Thank you to https://stackoverflow.com/a/49139904 for this method; important to prevent memory blow-up
        root = file_contents_tree.getroot()
        element_tracker = {root: [0, None]}
        for element in root.iterdescendants():
            parent = element.getparent()
            element_tracker[element] = [element_tracker[parent][0] + 1, parent]
        element_tracker = sorted(
            [(depth, parent, child)
             for child, (depth, parent) in element_tracker.items()],
            key=lambda x: x[0],
            reverse=True)
        for _, parent, child in element_tracker:
            if parent is None:
                break
            parent.remove(child)
        del file_contents_tree

    @staticmethod
    def _run_cypher_query(cypher_query: str) -> List[Dict[str, any]]:
        rtxc = RTXConfiguration()
        rtxc.live = "KG2"
        try:
            driver = GraphDatabase.driver(rtxc.neo4j_bolt,
                                          auth=(rtxc.neo4j_username,
                                                rtxc.neo4j_password))
            with driver.session() as session:
                query_results = session.run(cypher_query).data()
            driver.close()
        except Exception:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            logging.error(
                f"Encountered an error interacting with KG2 neo4j. {tb}")
            return []
        else:
            return query_results

    @staticmethod
    def _divide_list_into_chunks(input_list: List[any],
                                 chunk_size: int) -> List[List[any]]:
        num_chunks = len(input_list) // chunk_size if len(input_list) % chunk_size == 0 else (len(input_list) // chunk_size) + 1
        start_index = 0
        stop_index = chunk_size
        all_chunks = []
        for num in range(num_chunks):
            chunk = input_list[start_index:stop_index] if stop_index <= len(input_list) else input_list[start_index:]
            all_chunks.append(chunk)
            start_index += chunk_size
            stop_index += chunk_size
        return all_chunks
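For comparison, _divide_list_into_chunks() can be expressed more compactly with a slice-based comprehension that yields the same chunks for any chunk_size > 0:

def divide_list_into_chunks(input_list: list, chunk_size: int) -> list:
    # Slicing past the end of a list is safe in Python, so no bounds bookkeeping is needed.
    return [input_list[i:i + chunk_size] for i in range(0, len(input_list), chunk_size)]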