Exemplo n.º 1
0
def remove_overly_general_nodes(canonicalized_nodes_dict: Dict[str, Dict[str, any]],
                                canonicalized_edges_dict: Dict[str, Dict[str, any]],
                                biolink_version: str,
                                is_test: bool) -> Tuple[Dict[str, Dict[str, any]], Dict[str, Dict[str, any]]]:
    """Drop nodes considered too generic to be useful and prune the edges they orphan.

    Both input dicts are mutated in place and also returned for convenience.
    """
    logging.info(f"Removing overly general nodes from the graph..")
    bh = BiolinkHelper(biolink_version)
    # A node counts as "overly general" when any of its equivalent identifiers is a
    # Biolink category or appears on this hand-curated blocklist of umbrella concepts.
    blocklist = {"MESH:D010361", "SO:0001217", "MONDO:0000001", "FMA:67257", "MESH:D002477",
                 "MESH:D005796", "UMLS:C1257890", "UMLS:C0237401", "PR:000029067", "UMLS:C1457887",
                 "biolink:Cohort", "UMLS:C1550655", "CHEBI:25212", "GO:0008150", "UMLS:C0029235",
                 "LOINC:LP7790-1"}
    overly_general_curies = blocklist | set(bh.get_descendants("biolink:NamedThing"))
    # TODO: Later use some better heuristics to identify such nodes?

    node_ids_to_remove = set()
    for node_id, node in canonicalized_nodes_dict.items():
        if not overly_general_curies.isdisjoint(node["equivalent_curies"]):
            node_ids_to_remove.add(node_id)
    logging.info(f" Identified {len(node_ids_to_remove)} nodes to remove: {node_ids_to_remove}")
    for doomed_node_id in node_ids_to_remove:
        canonicalized_nodes_dict.pop(doomed_node_id, None)

    # Delete any now orphaned edges
    if not is_test:
        orphaned_edge_ids = set()
        for edge_id, edge in canonicalized_edges_dict.items():
            if (edge["subject"] not in canonicalized_nodes_dict
                    or edge["object"] not in canonicalized_nodes_dict):
                orphaned_edge_ids.add(edge_id)
        logging.info(f"  Deleting {len(orphaned_edge_ids)} edges that were orphaned by the above steps..")
        for doomed_edge_id in orphaned_edge_ids:
            canonicalized_edges_dict.pop(doomed_edge_id, None)

    logging.info(f"Done removing overly general nodes: resulting KG2c now has {len(canonicalized_nodes_dict)} nodes "
                 f"and {len(canonicalized_edges_dict)} edges")
    return canonicalized_nodes_dict, canonicalized_edges_dict
Exemplo n.º 2
0
def record_meta_kg_info(is_test: bool):
    """Build and persist the KG2c meta KG plus related lookup artifacts.

    Loads the 'lite' KG2c JSON, attaches expanded (ancestor-inclusive) category labels
    to each node, then delegates to helpers that write the meta KG JSON, the sqlite
    neighbor/category count tables, and the FDA-approved-drugs pickle.
    """
    suffix = '_test' if is_test else ''
    kg2c_lite_file_name = f"kg2c_lite{suffix}.json"
    meta_kg_file_name = f"kg2c_meta_kg{suffix}.json"
    sqlite_file_name = f"kg2c{suffix}.sqlite"
    fda_approved_file_name = f"fda_approved_drugs{suffix}.pickle"
    # Initiate a BiolinkHelper for the proper Biolink model version
    with open("kg2c_config.json") as config_file:
        bh = BiolinkHelper(json.load(config_file)["biolink_version"])

    start = time.time()
    # Load the 'lite' KG2c file into node/edge dictionaries keyed by id
    with open(f"{KG2C_DIR}/{kg2c_lite_file_name}", "r") as input_kg_file:
        logging.info(f"Loading {kg2c_lite_file_name} into memory..")
        kg2c_dict = json.load(input_kg_file)
        nodes_by_id = {node["id"]: node for node in kg2c_dict["nodes"]}
        edges_by_id = {edge["id"]: edge for edge in kg2c_dict["edges"]}
        del kg2c_dict  # free the raw parse; only the keyed dicts are needed below

    # Add the 'expanded' node labels (including category ancestors) into the node dictionary
    expanded_labels_property_name = "expanded_labels"
    for node in nodes_by_id.values():
        node[expanded_labels_property_name] = bh.get_ancestors(node["all_categories"],
                                                               include_mixins=True)

    build_meta_kg(nodes_by_id, edges_by_id, meta_kg_file_name, bh, is_test)
    add_neighbor_counts_to_sqlite(nodes_by_id, edges_by_id, sqlite_file_name,
                                  expanded_labels_property_name, is_test)
    add_category_counts_to_sqlite(nodes_by_id, sqlite_file_name,
                                  expanded_labels_property_name)
    generate_fda_approved_drugs_pickle(edges_by_id, fda_approved_file_name)

    logging.info(
        f"Recording meta KG info took {round((time.time() - start) / 60, 1)} minutes."
    )
Exemplo n.º 3
0
def create_kg2c_tsv_files(canonicalized_nodes_dict: Dict[str, Dict[str, any]],
                          canonicalized_edges_dict: Dict[str, Dict[str, any]],
                          biolink_version: str, is_test: bool):
    """Finalize node/edge dicts for Neo4j and dump them as import-ready TSV files.

    Mutates the input dicts in place (array fields become string-encoded).
    """
    bh = BiolinkHelper(biolink_version)
    # Convert array fields into the format neo4j wants and do some final processing
    array_node_columns = _get_array_properties("node") | {"node_labels"}
    array_edge_columns = _get_array_properties("edge")
    node_labels_property = _get_node_labels_property()
    for node in canonicalized_nodes_dict.values():
        # Expand labels to include all Biolink category ancestors (mixins too)
        node['node_labels'] = bh.get_ancestors(node[node_labels_property], include_mixins=True)
        for column in array_node_columns:
            node[column] = _convert_list_to_string_encoded_format(node[column])
    for edge in canonicalized_edges_dict.values():
        if not is_test:  # Make sure we don't have any orphan edges
            assert edge['subject'] in canonicalized_nodes_dict
            assert edge['object'] in canonicalized_nodes_dict
        for column in array_edge_columns:
            edge[column] = _convert_list_to_string_encoded_format(edge[column])
        # Duplicate the triple columns under the names the TSV conversion expects
        edge['predicate_for_conversion'] = edge['predicate']
        edge['subject_for_conversion'] = edge['subject']
        edge['object_for_conversion'] = edge['object']

    # Finally dump all our nodes/edges into TSVs (formatted for neo4j)
    logging.info(f" Creating TSVs for Neo4j..")
    _write_list_to_neo4j_ready_tsv(list(canonicalized_nodes_dict.values()), "nodes_c", is_test)
    _write_list_to_neo4j_ready_tsv(list(canonicalized_edges_dict.values()), "edges_c", is_test)
Exemplo n.º 4
0
 def __init__(self, log: ARAXResponse = None):
     """Initialize KP metadata/timeout caches and the Biolink helper.

     :param log: ARAXResponse to log to; a fresh one is created when omitted.
         (Previously the default was `ARAXResponse()` in the signature, which is
         evaluated once at import time, so every caller that omitted `log` shared
         a single mutable response object — the classic mutable-default bug.)
     """
     self.meta_map_path = f"{os.path.dirname(os.path.abspath(__file__))}/meta_map_v2.pickle"
     self.timeout_record_path = f"{os.path.dirname(os.path.abspath(__file__))}/kp_timeout_record.pickle"
     self.log = log if log is not None else ARAXResponse()
     self.all_kps = eu.get_all_kps()
     self.timeout_record = self._load_timeout_record()
     self.meta_map = self._load_meta_map()
     self.biolink_helper = BiolinkHelper()
Exemplo n.º 5
0
def build_meta_kg(nodes_by_id: Dict[str, Dict[str, any]],
                  edges_by_id: Dict[str, Dict[str, any]],
                  meta_kg_file_name: str,
                  biolink_helper: BiolinkHelper, is_test: bool):
    """Derive a meta KG (category-level nodes/edges) from KG2c and save it as JSON."""
    logging.info(f"Building meta KG..")
    logging.info(" Gathering all meta triples..")
    # Every (subject category, predicate, object category) combination seen in the KG
    meta_triples = set()
    for edge in edges_by_id.values():
        subject_node_id = edge["subject"]
        object_node_id = edge["object"]
        # Test builds may contain orphan edges; skip those (full builds assume none)
        if is_test and (subject_node_id not in nodes_by_id
                        or object_node_id not in nodes_by_id):
            continue
        subject_categories = biolink_helper.add_conflations(
            nodes_by_id[subject_node_id]["all_categories"])
        object_categories = biolink_helper.add_conflations(
            nodes_by_id[object_node_id]["all_categories"])
        predicate = edge["predicate"]
        meta_triples.update((subj_cat, predicate, obj_cat)
                            for subj_cat in subject_categories
                            for obj_cat in object_categories)
    kg2_infores_curie = "infores:rtx-kg2"
    standard_attributes = [{"attribute_type_id": "biolink:knowledge_source",
                            "attribute_source": kg2_infores_curie},
                           {"attribute_type_id": "biolink:aggregator_knowledge_source",
                            "attribute_source": kg2_infores_curie}]
    meta_edges = [{"subject": subj_cat,
                   "predicate": predicate,
                   "object": obj_cat,
                   "attributes": standard_attributes}
                  for subj_cat, predicate, obj_cat in meta_triples]
    logging.info(f" Created {len(meta_edges)} meta edges")

    logging.info(" Gathering all meta nodes..")
    with open(f"{KG2C_DIR}/equivalent_curies.pickle", "rb") as equiv_curies_file:
        equivalent_curies_dict = pickle.load(equiv_curies_file)
    meta_nodes = defaultdict(lambda: defaultdict(lambda: set()))
    for node_id, node in nodes_by_id.items():
        # Collect the CURIE prefixes of this node and all its equivalent identifiers
        prefixes = {curie.split(":")[0]
                    for curie in equivalent_curies_dict.get(node_id, [node_id])}
        for category in biolink_helper.add_conflations(node["category"]):
            meta_nodes[category]["id_prefixes"].update(prefixes)
    logging.info(f" Created {len(meta_nodes)} meta nodes")

    logging.info(" Saving meta KG to JSON file..")
    meta_kg = {"nodes": meta_nodes, "edges": meta_edges}
    with open(f"{KG2C_DIR}/{meta_kg_file_name}", "w+") as meta_kg_file:
        json.dump(meta_kg, meta_kg_file, default=serialize_with_sets, indent=2)
Exemplo n.º 6
0
 def _get_supported_prefixes(self, categories: List[str],
                             kp: str) -> Set[str]:
     """Return the uppercased CURIE prefixes the given KP supports for these
     categories (descendant categories included, mixins excluded)."""
     bh = BiolinkHelper()
     expanded_categories = bh.get_descendants(eu.convert_to_list(categories),
                                              include_mixins=False)
     kp_prefix_map = self.meta_map[kp]["prefixes"]
     supported_prefixes = set()
     for category in expanded_categories:
         for prefix in kp_prefix_map.get(category, set()):
             supported_prefixes.add(prefix.upper())
     return supported_prefixes
Exemplo n.º 7
0
 def __init__(self, response_object: ARAXResponse):
     """Set up KG2-query helpers and size limits.

     :param response_object: ARAX response used for logging progress/errors.
     """
     self.response = response_object
     self.biolink_helper = BiolinkHelper()
     self.kg2_infores_curie = "infores:rtx-kg2"
     # Hard caps that keep KG2 answers a manageable size
     self.max_allowed_edges = 1000000
     self.max_edges_per_input_curie = 1000
     # How many input curies to send per query
     self.curie_batch_size = 100
Exemplo n.º 8
0
def check_for_canonical_predicates(
        kg: QGOrganizedKnowledgeGraph, kp_name: str,
        log: ARAXResponse) -> QGOrganizedKnowledgeGraph:
    """Flip any KG edges whose predicate is not the Biolink canonical form.

    Edges are corrected in place; a single warning summarizes everything found.
    """
    biolink_helper = BiolinkHelper()
    non_canonical_predicates_used = set()
    for edges in kg.edges_by_qg_id.values():
        for edge in edges.values():
            canonical_predicate = biolink_helper.get_canonical_predicates(
                edge.predicate)[0]
            if canonical_predicate == edge.predicate:
                continue
            non_canonical_predicates_used.add(edge.predicate)
            _ = flip_edge(edge, canonical_predicate)
    if non_canonical_predicates_used:
        log.warning(
            f"{kp_name}: Found edges in {kp_name}'s answer that use non-canonical "
            f"predicates: {non_canonical_predicates_used}. I corrected these.")
    return kg
Exemplo n.º 9
0
 def __init__(self, response, message, params):
     """Store ARAX context and connect to the local COHD database.

     :param response: ARAXResponse used for logging/errors.
     :param message: TRAPI message whose knowledge graph will be decorated.
     :param params: operation parameters dict.
     """
     self.response = response
     self.message = message
     self.parameters = params
     self.who_knows_about_what = {
         'COHD': [
             'small_molecule', 'phenotypic_feature', 'disease', 'drug',
             'biolink:SmallMolecule', 'biolink:PhenotypicFeature',
             'biolink:Disease', 'biolink:Drug'
         ]
     }  # FIXME: replace this with information about the KP's, KS's, and their API's
     self.node_curie_to_type = dict()
     self.biolink_helper = BiolinkHelper()
     self.global_iter = 0
     try:
         self.cohdIndex = COHDIndex()
     except Exception:  # was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit
         tb = traceback.format_exc()
         error_type, error, _ = sys.exc_info()
         self.response.error(tb, error_code=error_type.__name__)
         self.response.error(
             f"Internal Error encountered connecting to the local COHD database."
         )
Exemplo n.º 10
0
    def __init__(self):
        """Set up the HTTP request cache, Biolink helper, and category-conflation tables."""
        self.location = os.path.dirname(os.path.abspath(__file__))
        # Cache web requests next to this module (os.path.join instead of string concat)
        requests_cache.install_cache(os.path.join(self.location, 'category_manager.cache'))
        self.bh = BiolinkHelper()

        # Lazily-filled per-category caches
        self.categories = {'ancestors': {}, 'relevant_categories': {}}

        # Categories that may be treated as interchangeable when classifying nodes
        self.approved_conflations = {
            'biolink:Gene': ['biolink:Protein'],
            'biolink:Protein': ['biolink:Gene'],
            # Decided 2021-07-28 mini-hackathon that we may be best off NOT doing conflation here. Just use
            # ChemicalEntity to refer to everything
            # #'biolink:Drug': [ 'biolink:ChemicalEntity', 'biolink:MolecularEntity', 'biolink:SmallMolecule' ],
            #'biolink:ChemicalEntity': [ 'biolink:Drug', 'biolink:MolecularEntity', 'biolink:SmallMolecule' ],
            #'biolink:SmallMolecule': [ 'biolink:Drug', 'biolink:MolecularEntity', 'biolink:ChemicalEntity' ],
            #'biolink:MolecularEntity': [ 'biolink:Drug', 'biolink:SmallMolecule', 'biolink:ChemicalEntity' ],
            'biolink:Disease': ['biolink:PhenotypicFeature'],
            'biolink:PhenotypicFeature': ['biolink:Disease'],
            'biolink:DiseaseOrPhenotypicFeature':
            ['biolink:Disease', 'biolink:PhenotypicFeature'],
        }
Exemplo n.º 11
0
class OverlayClinicalInfo:

    #### Constructor
    def __init__(self, response, message, params):
        """Store ARAX context and connect to the local COHD database.

        :param response: ARAXResponse used for logging/errors.
        :param message: TRAPI message whose knowledge graph will be decorated.
        :param params: operation parameters dict.
        """
        self.response = response
        self.message = message
        self.parameters = params
        self.who_knows_about_what = {
            'COHD': [
                'small_molecule', 'phenotypic_feature', 'disease', 'drug',
                'biolink:SmallMolecule', 'biolink:PhenotypicFeature',
                'biolink:Disease', 'biolink:Drug'
            ]
        }  # FIXME: replace this with information about the KP's, KS's, and their API's
        self.node_curie_to_type = dict()
        self.biolink_helper = BiolinkHelper()
        self.global_iter = 0
        try:
            self.cohdIndex = COHDIndex()
        except Exception:  # was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(
                f"Internal Error encountered connecting to the local COHD database."
            )

    def decorate(self):
        """
        Main decorator: looks at parameters and figures out which subroutine to farm out to
        :return: response object
        """
        # First, make a dictionary between node curie and type to make sure we're only looking at edges we can handle
        self.response.info(
            "Converting CURIE identifiers to human readable names")
        try:
            for key, node in self.message.knowledge_graph.nodes.items():
                self.node_curie_to_type[
                    key] = node.categories  # WARNING: this is a list
        except Exception:  # was a bare `except:`; narrowed so Ctrl-C/SystemExit aren't swallowed
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Something went wrong when converting names")
            return self.response

        # Dispatch to the requested COHD computation(s); most are not implemented yet
        parameters = self.parameters
        if 'paired_concept_frequency' in parameters:
            if parameters['paired_concept_frequency'] == 'true':
                self.paired_concept_frequency()
                # TODO: should I return the response and merge, or is it passed by reference and just return at the end?
        if 'associated_concept_freq' in parameters:
            if parameters['associated_concept_freq'] == 'true':
                #self.associated_concept_freq()  # TODO: make this function, and all the other COHD functions too
                pass
        if 'chi_square' in parameters:
            if parameters['chi_square'] == 'true':
                self.chi_square(
                )  # TODO: make this function, and all the other COHD functions too
                pass
        if 'observed_expected_ratio' in parameters:
            if parameters['observed_expected_ratio'] == 'true':
                self.observed_expected_ratio(
                )  # TODO: make this function, and all the other COHD functions too
                pass
        if 'relative_frequency' in parameters:
            if parameters['relative_frequency'] == 'true':
                #self.associated_concept_freq()  # TODO: make this function, and all the other COHD functions too
                pass

        return self.response

    def in_common(self, list1, list2):
        """
        Helper function that returns true iff list1 and list2 have any elements in common
        :param list1: a list of strings (intended to be biolink node types)
        :param list2: another list of strings (intended to be biolink node types)
        :return: True/False if they share an element in common
        """
        # isdisjoint short-circuits and avoids materializing the full intersection
        return not set(list1).isdisjoint(list2)

    def make_edge_attribute_from_curies(self,
                                        subject_curie,
                                        object_curie,
                                        subject_name="",
                                        object_name="",
                                        default=0.,
                                        name=""):
        """
        Generic function to make an edge attribute
        :subject_curie: CURIE of the subject node for the edge under consideration
        :object_curie: CURIE of the object node for the edge under consideration
        :subject_name: text name of the subject node (in case the KP doesn't understand the CURIE)
        :object_name: text name of the object node (in case the KP doesn't understand the CURIE)
        :default: default value of the edge attribute
        :name: name of the KP functionality you want to apply
        :return: an EdgeAttribute, or None if no KP knows about both node types
        """
        # Hoisted out of the try block: previously this was assigned inside the
        # try, so an exception raised before the assignment caused a NameError
        # in the error message below.
        KP_to_use = None
        try:
            # edge attribute constants ("type" renamed so it no longer shadows the builtin)
            attribute_type = "EDAM:data_0951"
            url = "http://cohd.smart-api.info/"
            value = default

            node_curie_to_type = self.node_curie_to_type
            subject_type = node_curie_to_type[subject_curie]
            object_type = node_curie_to_type[object_curie]
            # figure out which knowledge provider to use  # TODO: should handle this in a more structured fashion, does there exist a standardized KP API format?
            for KP in self.who_knows_about_what:
                # see which KP's can label both subjects of information
                if self.in_common(
                        self.biolink_helper.get_descendants(
                            subject_type, include_mixins=False),
                        self.who_knows_about_what[KP]) and self.in_common(
                            self.biolink_helper.get_descendants(
                                object_type, include_mixins=False),
                            self.who_knows_about_what[KP]):
                    KP_to_use = KP

            if KP_to_use == 'COHD':
                self.response.debug(
                    f"Querying Columbia Open Health data for info about {subject_name} and {object_name}"
                )
                # Convert CURIEs to OMOP identifiers (mapping is precomputed by the
                # caller in add_virtual_edge via a single batch COHD lookup)
                subject_OMOPs = self.mapping_curie_to_omop_ids.get(subject_curie, [])
                object_OMOPs = self.mapping_curie_to_omop_ids.get(object_curie, [])

                # All OMOP id pairs, encoded the way COHDIndex's batch APIs expect
                omop_pairs = [
                    f"{omop1}_{omop2}"
                    for (omop1, omop2) in itertools.product(
                        subject_OMOPs, object_OMOPs)
                ]

                # Decide how to handle the response from the KP
                if name == 'paired_concept_frequency':
                    # take the largest frequency  # TODO check with COHD people to see if this is kosher
                    frequency = default
                    if omop_pairs:
                        res = self.cohdIndex.get_paired_concept_freq(
                            concept_id_pair=omop_pairs,
                            dataset_id=3)  # use the hierarchical dataset
                        if res:
                            # results are sorted by decreasing frequency, so [0] is the max
                            frequency = res[0]['concept_frequency']
                    value = frequency

                elif name == 'observed_expected_ratio':
                    # take the largest obs/exp ratio  # TODO: check with COHD people to see if this is kosher
                    # FIXME: the ln_ratio can be negative, so I should probably account for this, but the object model doesn't like -np.inf
                    value = float(
                        "-inf"
                    )  # FIXME: unclear in object model if attribute type dictates value type, or if value always needs to be a string
                    if omop_pairs:
                        res = self.cohdIndex.get_obs_exp_ratio(
                            concept_id_pair=omop_pairs,
                            domain="",
                            dataset_id=3)  # use the hierarchical dataset
                        if res:
                            # results are sorted by decreasing ln_ratio, so [0] is the max
                            value = res[0]['ln_ratio']

                elif name == 'chi_square':
                    # looking at p-values, so lower is better
                    value = float("inf")
                    if omop_pairs:
                        res = self.cohdIndex.get_chi_square(
                            concept_id_pair=omop_pairs,
                            domain="",
                            dataset_id=3)  # use the hierarchical dataset
                        if res:
                            # [0] holds the minimum p-value
                            value = res[0]['p-value']

                # create the edge attribute
                edge_attribute = EdgeAttribute(
                    attribute_type_id=attribute_type,
                    original_attribute_name=name,
                    value=str(value),
                    value_url=url
                )  # populate the edge attribute # FIXME: unclear in object model if attribute type dictates value type, or if value always needs to be a string
                return edge_attribute
            else:
                return None
        except Exception:  # was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit propagate
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(
                f"Something went wrong when adding the edge attribute from {KP_to_use}."
            )

    def add_virtual_edge(self, name="", default=0.):
        """
        Generic function to add a virtual edge to the KG an QG
        :name: name of the functionality of the KP to use
        """
        parameters = self.parameters
        subject_curies_to_decorate = set()
        object_curies_to_decorate = set()
        curies_to_names = dict(
        )  # FIXME: Super hacky way to get around the fact that COHD can't map CHEMBL drugs
        # identify the nodes that we should be adding virtual edges for
        for key, node in self.message.knowledge_graph.nodes.items():
            if hasattr(node, 'qnode_keys'):
                if parameters['subject_qnode_key'] in node.qnode_keys:
                    subject_curies_to_decorate.add(key)
                    curies_to_names[
                        key] = node.name  # FIXME: Super hacky way to get around the fact that COHD can't map CHEMBL drugs
                if parameters['object_qnode_key'] in node.qnode_keys:
                    object_curies_to_decorate.add(key)
                    curies_to_names[
                        key] = node.name  # FIXME: Super hacky way to get around the fact that COHD can't map CHEMBL drugs
        added_flag = False  # check to see if any edges where added
        # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute

        ## call COHD api one time to save time
        curies_to_decorate = set()
        curies_to_decorate.update(subject_curies_to_decorate)
        curies_to_decorate.update(object_curies_to_decorate)
        self.mapping_curie_to_omop_ids = self.cohdIndex.get_concept_ids(
            curies_to_decorate)
        for (subject_curie,
             object_curie) in itertools.product(subject_curies_to_decorate,
                                                object_curies_to_decorate):
            # create the edge attribute if it can be
            edge_attribute = self.make_edge_attribute_from_curies(
                subject_curie,
                object_curie,
                subject_name=curies_to_names[subject_curie],
                object_name=curies_to_names[object_curie],
                default=default,
                name=name)
            if edge_attribute:
                added_flag = True
                # make the edge, add the attribute

                # edge properties
                now = datetime.now()
                edge_type = f"biolink:has_real_world_evidence_of_association_with"
                qedge_keys = [parameters['virtual_relation_label']]
                relation = parameters['virtual_relation_label']
                is_defined_by = "ARAX"
                defined_datetime = now.strftime("%Y-%m-%d %H:%M:%S")
                provided_by = "infores:arax"
                confidence = None
                weight = None  # TODO: could make the actual value of the attribute
                subject_key = subject_curie
                object_key = object_curie

                # now actually add the virtual edges in
                id = f"{relation}_{self.global_iter}"
                # ensure the id is unique
                # might need to change after expand is implemented for TRAPI 1.0
                while id in self.message.knowledge_graph.edges:
                    id = f"{relation}_{self.global_iter}.{random.randint(10**(9-1), (10**9)-1)}"
                self.global_iter += 1
                edge_attribute_list = [
                    edge_attribute,
                    EdgeAttribute(
                        original_attribute_name="virtual_relation_label",
                        value=relation,
                        attribute_type_id="biolink:Unknown"),
                    #EdgeAttribute(original_attribute_name="is_defined_by", value=is_defined_by, attribute_type_id="biolink:Unknown"),
                    EdgeAttribute(original_attribute_name="defined_datetime",
                                  value=defined_datetime,
                                  attribute_type_id="metatype:Datetime"),
                    EdgeAttribute(
                        original_attribute_name="provided_by",
                        value=provided_by,
                        attribute_type_id="biolink:aggregator_knowledge_source",
                        attribute_source=provided_by,
                        value_type_id="biolink:InformationResource"),
                    EdgeAttribute(
                        original_attribute_name=None,
                        value=True,
                        attribute_type_id="biolink:computed_value",
                        attribute_source="infores:arax-reasoner-ara",
                        value_type_id="metatype:Boolean",
                        value_url=None,
                        description=
                        "This edge is a container for a computed value between two nodes that is not directly attachable to other edges."
                    )
                    #EdgeAttribute(name="confidence", value=confidence, type="biolink:ConfidenceLevel"),
                    #EdgeAttribute(name="weight", value=weight, type="metatype:Float"),
                    #EdgeAttribute(name="qedge_ids", value=qedge_ids)
                ]
                # edge = Edge(id=id, type=edge_type, relation=relation, subject_key=subject_key,
                #             object_key=object_key,
                #             is_defined_by=is_defined_by, defined_datetime=defined_datetime,
                #             provided_by=provided_by,
                #             confidence=confidence, weight=weight, attributes=[edge_attribute], qedge_ids=qedge_ids)
                edge = Edge(predicate=edge_type,
                            subject=subject_key,
                            object=object_key,
                            attributes=edge_attribute_list)
                edge.qedge_keys = qedge_keys
                self.message.knowledge_graph.edges[id] = edge
                if self.message.results is not None and len(
                        self.message.results) > 0:
                    ou.update_results_with_overlay_edge(
                        subject_knode_key=subject_key,
                        object_knode_key=object_key,
                        kedge_key=id,
                        message=self.message,
                        log=self.response)

        # Now add a q_edge the query_graph since I've added an extra edge to the KG
        if added_flag:
            edge_type = f"biolink:has_real_world_evidence_of_association_with"
            relation = parameters['virtual_relation_label']
            qedge_keys = [parameters['virtual_relation_label']]
            subject_qnode_key = parameters['subject_qnode_key']
            object_qnode_key = parameters['object_qnode_key']
            option_group_id = ou.determine_virtual_qedge_option_group(
                subject_qnode_key, object_qnode_key, self.message.query_graph,
                self.response)
            # q_edge = QEdge(id=relation, type=edge_type, relation=relation,
            #                subject_key=subject_qnode_key, object_key=object_qnode_key,
            #                option_group_id=option_group_id)  # TODO: ok to make the id and type the same thing?
            q_edge = QEdge(predicates=edge_type,
                           subject=subject_qnode_key,
                           object=object_qnode_key,
                           option_group_id=option_group_id)
            q_edge.relation = relation
            self.message.query_graph.edges[relation] = q_edge

    def add_all_edges(self, name="", default=0.):
        """Attach a COHD-derived attribute to every edge already in the KG.

        :param name: which COHD statistic to compute (forwarded to
            make_edge_attribute_from_curies)
        :param default: value to use when COHD has no data for a pair
        """
        kg = self.message.knowledge_graph
        # Node key -> display name; the key set doubles as the lookup batch
        curies_to_names = {key: node.name for key, node in kg.nodes.items()}
        # One bulk COHD concept-id lookup up front instead of per-edge calls
        self.mapping_curie_to_omop_ids = self.cohdIndex.get_concept_ids(
            set(curies_to_names))
        for edge in kg.edges.values():
            if not edge.attributes:  # make sure there is a list to append to
                edge.attributes = []
            new_attribute = self.make_edge_attribute_from_curies(
                edge.subject,
                edge.object,
                subject_name=curies_to_names[edge.subject],
                object_name=curies_to_names[edge.object],
                default=default,
                name=name
            )  # FIXME: Super hacky way to get around the fact that COHD can't map CHEMBL drugs
            if new_attribute:  # only attach when COHD actually produced something
                edge.attributes.append(new_attribute)

    def paired_concept_frequency(self, default=0):
        """
        Calculate paired concept frequency.
        Retrieves observed clinical frequencies of a pair of concepts.
        :param default: value to use when COHD has no frequency data for a pair
        :return: None (edges/attributes are added to self.message in place)
        """
        parameters = self.parameters
        self.response.debug("Computing paired concept frequencies.")
        self.response.info(
            "Overlaying paired concept frequencies utilizing Columbia Open Health Data. This calls an external knowledge provider and may take a while"
        )

        # Now add the edges or virtual edges
        try:
            if 'virtual_relation_label' in parameters:
                if 'subject_qnode_key' in parameters and 'object_qnode_key' in parameters:
                    self.add_virtual_edge(name="paired_concept_frequency",
                                          default=default)
                else:
                    # No explicit qnode pair given: overlay a virtual edge for
                    # every distinct qnode pair connected by an edge in the QG
                    seen_node_pairs = set()
                    qgraph_edges = copy.deepcopy(
                        list(self.response.envelope.message.query_graph.edges.
                             values()))
                    for query_edge in qgraph_edges:
                        subject_qnode_key = query_edge.subject
                        object_qnode_key = query_edge.object
                        # Canonically order the pair so (a, b) and (b, a) dedupe
                        if subject_qnode_key < object_qnode_key:
                            qnode_key_pair = (subject_qnode_key,
                                              object_qnode_key)
                        else:
                            qnode_key_pair = (object_qnode_key,
                                              subject_qnode_key)
                        if qnode_key_pair in seen_node_pairs:
                            continue  # already overlaid an edge for this pair
                        seen_node_pairs.add(qnode_key_pair)
                        # Temporarily inject the pair so add_virtual_edge sees it
                        parameters['subject_qnode_key'] = subject_qnode_key
                        parameters['object_qnode_key'] = object_qnode_key
                        self.add_virtual_edge(name="paired_concept_frequency",
                                              default=default)
                        parameters.pop('subject_qnode_key')
                        parameters.pop('object_qnode_key')
            else:  # otherwise, just add to existing edges in the KG
                self.add_all_edges(name="paired_concept_frequency",
                                   default=default)

        # BUGFIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; catch Exception so those still propagate
        except Exception:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(
                "Something went wrong when overlaying clinical info")

    def observed_expected_ratio(self, default=0):
        """
        Returns the natural logarithm of the ratio between the observed count and expected count.
        Expected count is calculated from the single concept frequencies and assuming independence between the concepts.
        Results are returned as maximum over all ln_ratios matching to OMOP concept id.
        :param default: value to use when COHD has no data for a pair
        :return: None (edges/attributes are added to self.message in place)
        """
        parameters = self.parameters
        self.response.debug("Computing observed expected ratios.")
        self.response.info(
            "Overlaying observed expected ratios utilizing Columbia Open Health Data. This calls an external knowledge provider and may take a while"
        )

        # Now add the edges or virtual edges
        try:
            if 'virtual_relation_label' in parameters:
                if 'subject_qnode_key' in parameters and 'object_qnode_key' in parameters:
                    self.add_virtual_edge(name="observed_expected_ratio",
                                          default=default)
                else:
                    # No explicit qnode pair given: overlay a virtual edge for
                    # every distinct qnode pair connected by an edge in the QG
                    seen_node_pairs = set()
                    qgraph_edges = copy.deepcopy(
                        list(self.response.envelope.message.query_graph.edges.
                             values()))
                    for query_edge in qgraph_edges:
                        subject_qnode_key = query_edge.subject
                        object_qnode_key = query_edge.object
                        # Canonically order the pair so (a, b) and (b, a) dedupe
                        if subject_qnode_key < object_qnode_key:
                            qnode_key_pair = (subject_qnode_key,
                                              object_qnode_key)
                        else:
                            qnode_key_pair = (object_qnode_key,
                                              subject_qnode_key)
                        if qnode_key_pair in seen_node_pairs:
                            continue  # already overlaid an edge for this pair
                        seen_node_pairs.add(qnode_key_pair)
                        # Temporarily inject the pair so add_virtual_edge sees it
                        parameters['subject_qnode_key'] = subject_qnode_key
                        parameters['object_qnode_key'] = object_qnode_key
                        self.add_virtual_edge(name="observed_expected_ratio",
                                              default=default)
                        parameters.pop('subject_qnode_key')
                        parameters.pop('object_qnode_key')
            else:  # otherwise, just add to existing edges in the KG
                self.add_all_edges(name="observed_expected_ratio",
                                   default=default)

        # BUGFIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; catch Exception so those still propagate
        except Exception:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(
                "Something went wrong when overlaying clinical info")

    def chi_square(self, default=float("inf")):
        """
        Returns the chi-square statistic and p-value between pairs of concepts. Results are returned in descending order of the chi-square statistic. Note that due to large sample sizes, the chi-square can become very large.
        The expected frequencies for the chi-square analysis are calculated based on the single concept frequencies and assuming independence between concepts. P-value is calculated with 1 DOF.
        :param default: value to use when COHD has no data for a pair
            (infinity: "no evidence", i.e. worst possible p-value)
        :return: None (edges/attributes are added to self.message in place)
        """
        parameters = self.parameters
        self.response.debug("Computing Chi square p-values.")
        self.response.info(
            "Overlaying Chi square p-values utilizing Columbia Open Health Data. This calls an external knowledge provider and may take a while"
        )

        # Now add the edges or virtual edges
        try:
            if 'virtual_relation_label' in parameters:
                if 'subject_qnode_key' in parameters and 'object_qnode_key' in parameters:
                    self.add_virtual_edge(name="chi_square", default=default)
                else:
                    # No explicit qnode pair given: overlay a virtual edge for
                    # every distinct qnode pair connected by an edge in the QG
                    seen_node_pairs = set()
                    qgraph_edges = copy.deepcopy(
                        list(self.response.envelope.message.query_graph.edges.
                             values()))
                    for query_edge in qgraph_edges:
                        subject_qnode_key = query_edge.subject
                        object_qnode_key = query_edge.object
                        # Canonically order the pair so (a, b) and (b, a) dedupe
                        if subject_qnode_key < object_qnode_key:
                            qnode_key_pair = (subject_qnode_key,
                                              object_qnode_key)
                        else:
                            qnode_key_pair = (object_qnode_key,
                                              subject_qnode_key)
                        if qnode_key_pair in seen_node_pairs:
                            continue  # already overlaid an edge for this pair
                        seen_node_pairs.add(qnode_key_pair)
                        # Temporarily inject the pair so add_virtual_edge sees it
                        parameters['subject_qnode_key'] = subject_qnode_key
                        parameters['object_qnode_key'] = object_qnode_key
                        self.add_virtual_edge(name="chi_square",
                                              default=default)
                        parameters.pop('subject_qnode_key')
                        parameters.pop('object_qnode_key')
            else:  # otherwise, just add to existing edges in the KG
                self.add_all_edges(name="chi_square", default=default)

        # BUGFIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; catch Exception so those still propagate
        except Exception:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(
                "Something went wrong when overlaying clinical info")
# Exemplo n.º 12 (scraper artifact: "Example #12" separator and vote count from
# the original code-listing site; kept as a comment so the file stays valid Python)
class KPSelector:
    def __init__(self, log: Optional[ARAXResponse] = None):
        """Set up the selector: load the cached KP meta map and timeout record.

        :param log: response object to log into; a fresh ARAXResponse is
            created per instance when omitted.

        BUGFIX: the previous signature was `log: ARAXResponse = ARAXResponse()`.
        Default values are evaluated once at function definition, so every
        KPSelector created without a log shared one mutable ARAXResponse.
        """
        self.log = log if log is not None else ARAXResponse()
        this_dir = os.path.dirname(os.path.abspath(__file__))
        self.meta_map_path = f"{this_dir}/meta_map_v2.pickle"
        self.timeout_record_path = f"{this_dir}/kp_timeout_record.pickle"
        self.all_kps = eu.get_all_kps()
        # Timeout record must load before the meta map: _load_meta_map may
        # trigger a refresh that consults self.timeout_record
        self.timeout_record = self._load_timeout_record()
        self.meta_map = self._load_meta_map()
        self.biolink_helper = BiolinkHelper()

    def get_kps_for_single_hop_qg(self, qg: QueryGraph) -> Optional[Set[str]]:
        """
        This function returns the names of the KPs that say they can answer the given one-hop query graph (based on
        the categories/predicates the QG uses).
        :param qg: a one-hop query graph
        :return: set of accepting KP names, or None if the QG is not one-hop
        """
        # BUGFIX: confirm the QG is one hop BEFORE grabbing its edge; the old
        # code called next() first, which raised StopIteration on an empty QG
        # (and the old check only rejected len > 1, not len == 0)
        if len(qg.edges) != 1:
            self.log.error(
                f"Query graph can only have one edge, but instead has {len(qg.edges)}.",
                error_code="UnexpectedQG")
            return None
        qedge_key = next(iter(qg.edges))
        qedge = qg.edges[qedge_key]
        self.log.debug(f"Selecting KPs to use for qedge {qedge_key}")
        # Expand the QG's categories/predicates to include Biolink descendants
        sub_categories = set(
            self.biolink_helper.get_descendants(
                qg.nodes[qedge.subject].categories))
        obj_categories = set(
            self.biolink_helper.get_descendants(
                qg.nodes[qedge.object].categories))
        predicates = set(self.biolink_helper.get_descendants(qedge.predicates))

        symmetrical_predicates = set(
            filter(self.biolink_helper.is_symmetric, predicates))

        # use metamap to check kp for predicate triple
        accepting_kps = set()
        for kp in self.meta_map:
            if self._triple_is_in_meta_map(kp, sub_categories, predicates,
                                           obj_categories):
                accepting_kps.add(kp)
            # account for symmetrical predicates by checking if kp accepts with swapped sub and obj categories
            elif self._triple_is_in_meta_map(kp, obj_categories,
                                             symmetrical_predicates,
                                             sub_categories):
                accepting_kps.add(kp)
            else:
                self.log.update_query_plan(
                    qedge_key, kp, "Skipped",
                    "MetaKG indicates this qedge is unsupported")
        # Flag KPs we know about but have no meta info for
        kps_missing_meta_info = self.all_kps.difference(set(self.meta_map))
        for missing_kp in kps_missing_meta_info:
            self.log.update_query_plan(qedge_key, missing_kp, "Skipped",
                                       "No MetaKG info available")

        return accepting_kps

    def kp_accepts_single_hop_qg(self, qg: QueryGraph,
                                 kp: str) -> Optional[bool]:
        """
        This function determines whether a KP can answer a given one-hop query based on the categories/predicates
        used in the query graph.
        :param qg: a one-hop query graph
        :param kp: name of the KP to check
        :return: True/False, or None if the QG is not exactly one hop
        """
        self.log.debug(
            f"Verifying that {kp} can answer this kind of one-hop query")
        # BUGFIX: require exactly one edge; the old `> 1` check let an EMPTY
        # query graph through, which then crashed on list(...)[0] (IndexError)
        if len(qg.edges) != 1:
            self.log.error(
                f"Query graph can only have one edge, but instead has {len(qg.edges)}.",
                error_code="UnexpectedQG")
            return None

        qedge = list(qg.edges.values())[0]
        # Expand the QG's categories/predicates to include Biolink descendants
        sub_categories = set(
            self.biolink_helper.get_descendants(
                qg.nodes[qedge.subject].categories))
        obj_categories = set(
            self.biolink_helper.get_descendants(
                qg.nodes[qedge.object].categories))
        predicates = set(self.biolink_helper.get_descendants(qedge.predicates))
        kp_accepts = self._triple_is_in_meta_map(kp, sub_categories,
                                                 predicates, obj_categories)

        # account for symmetrical predicates by checking if kp accepts with swapped sub and obj categories
        symmetrical_predicates = set(
            filter(self.biolink_helper.is_symmetric, predicates))
        kp_accepts = kp_accepts or self._triple_is_in_meta_map(
            kp, obj_categories, symmetrical_predicates, sub_categories)

        return kp_accepts

    def get_desirable_equivalent_curies(self, curies: List[str],
                                        categories: Optional[List[str]],
                                        kp: str) -> List[str]:
        """
        For each input curie, this function returns an equivalent curie(s) that uses a prefix the KP supports.
        """
        self.log.debug(
            f"{kp}: Converting curies in the QG to kinds that {kp} can answer")
        # Without meta info / prefix info for this KP, no conversion is possible
        if not self.meta_map.get(kp):
            self.log.warning(
                f"{kp}: Somehow missing meta info for {kp}. Cannot do curie prefix conversion; will send "
                f"curies as they are.")
            return curies
        if not self.meta_map[kp].get("prefixes"):
            self.log.warning(
                f"{kp}: No supported prefix info is available for {kp}. Will send curies as they are."
            )
            return curies

        supported_prefixes = self._get_supported_prefixes(
            eu.convert_to_list(categories), kp)
        self.log.debug(
            f"{kp}: Prefixes {kp} supports for categories {categories} (and descendants) are: "
            f"{supported_prefixes}")
        converted_curies = set()
        unsupported_curies = set()
        synonyms_dict = eu.get_curie_synonyms_dict(curies)
        # Swap each input curie for synonym(s) whose prefix the KP supports
        for input_curie, equivalent_curies in synonyms_dict.items():
            input_curie_prefix = self._get_uppercase_prefix(input_curie)
            supported_by_prefix = defaultdict(set)
            for equivalent_curie in equivalent_curies:
                curie_prefix = self._get_uppercase_prefix(equivalent_curie)
                if curie_prefix in supported_prefixes:
                    supported_by_prefix[curie_prefix].add(equivalent_curie)
            if not supported_by_prefix:
                unsupported_curies.add(input_curie)
                continue
            # Prefer synonyms that share the input curie's own prefix;
            # otherwise settle for any supported prefix group
            if input_curie_prefix in supported_by_prefix:
                curies_to_send = supported_by_prefix[input_curie_prefix]
            else:
                curies_to_send = next(iter(supported_by_prefix.values()))
            converted_curies |= curies_to_send
        if unsupported_curies:
            self.log.warning(
                f"{kp}: Could not find curies with prefixes {kp} prefers for these curies: "
                f"{unsupported_curies}; will not send these to KP")
        return list(converted_curies)

    # returns True if at least one possible triple exists in the KP's meta map
    def _triple_is_in_meta_map(self, kp: str, subject_categories: Set[str],
                               predicates: Set[str],
                               object_categories: Set[str]) -> bool:
        kp_meta_map = self.meta_map.get(kp)
        if not kp_meta_map:
            if kp not in self.all_kps:
                self.log.error(
                    f"{kp} does not seem to be a valid KP for ARAX. Valid KPs are: {self.all_kps}",
                    error_code="InvalidKP")
            else:
                self.log.warning(f"Somehow missing meta info for {kp}.")
            return False
        else:
            predicates_map = kp_meta_map["predicates"]
            # handle potential emptiness of sub, obj, predicate lists
            if not subject_categories:  # any subject
                subject_categories = set(predicates_map.keys())
            if not object_categories:  # any object
                object_set = set()
                _ = [
                    object_set.add(obj)
                    for obj_dict in predicates_map.values()
                    for obj in obj_dict.keys()
                ]
                object_categories = object_set
            any_predicate = False if predicates or kp == "NGD" else True

            # handle combinations of subject and objects using cross product
            qg_sub_obj_dict = defaultdict(lambda: set())
            for sub, obj in list(product(subject_categories,
                                         object_categories)):
                qg_sub_obj_dict[sub].add(obj)

            # check for subjects
            kp_allowed_subs = set(predicates_map.keys())
            accepted_subs = kp_allowed_subs.intersection(
                set(qg_sub_obj_dict.keys()))

            # check for objects
            for sub in accepted_subs:
                kp_allowed_objs = set(predicates_map[sub].keys())
                accepted_objs = kp_allowed_objs.intersection(
                    qg_sub_obj_dict[sub])
                if len(accepted_objs) > 0:
                    # check predicates
                    for obj in accepted_objs:
                        if any_predicate or predicates.intersection(
                                predicates_map[sub][obj]):
                            return True
            return False

    def _load_meta_map(self):
        # This function loads the meta map and updates it as needed
        meta_map_file = pathlib.Path(self.meta_map_path)
        one_day_ago = datetime.now() - timedelta(hours=24)
        if not meta_map_file.exists():
            self.log.debug(f"Creating local copy of meta map for all KPs")
            meta_map = self._refresh_meta_map()
        elif datetime.fromtimestamp(
                meta_map_file.stat().st_mtime) < one_day_ago:
            self.log.debug(f"Doing a refresh of local meta map for all KPs")
            meta_map = self._refresh_meta_map()
        else:
            self.log.debug(
                f"Loading meta map (already exists and isn't due for a refresh)"
            )
            with open(self.meta_map_path, "rb") as map_file:
                meta_map = pickle.load(map_file)
            # Check for any missing KPs
            missing_kps = self.all_kps.difference(set(meta_map))
            if missing_kps:
                self.log.debug(f"Missing meta info for {missing_kps}")
                meta_map = self._refresh_meta_map(missing_kps, meta_map)

        # Make sure the map doesn't contain any 'stale' KPs
        stale_kps = set(meta_map).difference(self.all_kps)
        if stale_kps:
            for stale_kp in stale_kps:
                self.log.debug(
                    f"Detected a stale KP in meta map ({stale_kp}) - deleting it"
                )
                del meta_map[stale_kp]
            with open(self.meta_map_path, "wb") as map_file:
                pickle.dump(meta_map, map_file)  # Save these changes

        return meta_map

    def _refresh_meta_map(self,
                          kps: Optional[Set[str]] = None,
                          meta_map: Optional[Dict[str, dict]] = None):
        """Fetch fresh /meta_knowledge_graph info for the given KPs and pickle it.

        :param kps: KPs to refresh; defaults to all KPs ARAX knows about.
        :param meta_map: existing meta map to update; when None, any pickled
            map already on disk is loaded first, so a failed fetch keeps a
            KP's previously-known info.
        :return: the updated meta map. Side effects: pickles the meta map to
            self.meta_map_path and the timeout record to
            self.timeout_record_path.
        """
        # Create an up to date version of the meta map
        kps_to_update = kps if kps else self.all_kps

        if not meta_map:
            # Load whatever pre-existing meta-map we might already have (could use this info in case an API fails)
            meta_map_file = pathlib.Path(self.meta_map_path)
            if meta_map_file.exists():
                with open(self.meta_map_path, "rb") as existing_meta_map_file:
                    meta_map = pickle.load(existing_meta_map_file)
            else:
                meta_map = dict()

        # Then (try to) get updated meta info from each KP
        # Skip KPs that timed out within the last 10 minutes (per self.timeout_record)
        ten_minutes_ago = datetime.now() - timedelta(minutes=10)
        non_functioning_kps = [
            kp for kp in kps_to_update if self.timeout_record.get(kp)
            and self.timeout_record[kp] > ten_minutes_ago
        ]
        if non_functioning_kps:
            self.log.debug(
                f"Not trying to grab meta info for {non_functioning_kps} because they timed out "
                f"within the last 10 minutes")
        functioning_kps_to_update = set(kps_to_update).difference(
            set(non_functioning_kps))
        for kp in functioning_kps_to_update:
            kp_endpoint = eu.get_kp_endpoint_url(kp)
            if kp_endpoint:
                try:
                    self.log.debug(f"Getting meta info from {kp}")
                    # Bypass the requests cache so we get a truly fresh response
                    with requests_cache.disabled():
                        kp_response = requests.get(
                            f"{kp_endpoint}/meta_knowledge_graph", timeout=10)
                except requests.exceptions.Timeout:
                    self.log.warning(
                        f"Timed out when trying to hit {kp}'s /meta_knowledge_graph endpoint "
                        f"(waited 10 seconds)")
                    # Record the timeout so this KP is skipped for the next 10 minutes
                    self.timeout_record[kp] = datetime.now()
                except Exception:
                    self.log.warning(
                        f"Ran into a problem getting {kp}'s meta info")
                else:
                    if kp_response.status_code == 200:
                        kp_meta_kg = kp_response.json()
                        # Store both the predicate triples and the curie prefixes the KP supports
                        meta_map[kp] = {
                            "predicates":
                            self._convert_to_meta_map(kp_meta_kg),
                            "prefixes": {
                                category: meta_node["id_prefixes"]
                                for category, meta_node in
                                kp_meta_kg["nodes"].items()
                            }
                        }
                    else:
                        self.log.warning(
                            f"Unable to access {kp}'s /meta_knowledge_graph endpoint (returned status of "
                            f"{kp_response.status_code})")
            elif kp == "infores:arax-drug-treats-disease":
                # In-house KP with no /meta_knowledge_graph endpoint: use a hard-coded map
                meta_map[kp] = {
                    "predicates": self._get_dtd_meta_map(),
                    "prefixes": dict()
                }
            elif kp == "infores:arax-normalized-google-distance":
                # This is just a placeholder; not really used for KP selection
                predicates = {
                    "biolink:NamedThing": {
                        "biolink:NamedThing":
                        {"biolink:has_normalized_google_distance_with"}
                    }
                }
                meta_map[kp] = {"predicates": predicates, "prefixes": dict()}

        # Save our big combined metamap to a local json file
        with open(self.meta_map_path, "wb") as map_file:
            pickle.dump(meta_map, map_file)
        with open(self.timeout_record_path, "wb") as timeout_file:
            pickle.dump(self.timeout_record, timeout_file)

        return meta_map

    @staticmethod
    def _convert_to_meta_map(kp_meta_kg: dict) -> dict:
        kp_meta_map = dict()
        for meta_edge in kp_meta_kg["edges"]:
            subject_category = meta_edge["subject"]
            object_category = meta_edge["object"]
            predicate = meta_edge["predicate"]
            if subject_category not in kp_meta_map:
                kp_meta_map[subject_category] = dict()
            if object_category not in kp_meta_map[subject_category]:
                kp_meta_map[subject_category][object_category] = set()
            kp_meta_map[subject_category][object_category].add(predicate)
        return kp_meta_map

    @staticmethod
    def _get_dtd_meta_map():
        dtd_predicates = {"biolink:treats", "biolink:treated_by"}
        drug_ish_dict = {
            "biolink:Drug": dtd_predicates,
            "biolink:SmallMolecule": dtd_predicates
        }
        disease_ish_dict = {
            "biolink:Disease": dtd_predicates,
            "biolink:PhenotypicFeature": dtd_predicates,
            "biolink:DiseaseOrPhenotypicFeature": dtd_predicates
        }
        dtd_meta_map = {
            "biolink:Drug": disease_ish_dict,
            "biolink:SmallMolecule": disease_ish_dict,
            "biolink:Disease": drug_ish_dict,
            "biolink:PhenotypicFeature": drug_ish_dict,
            "biolink:DiseaseOrPhenotypicFeature": drug_ish_dict
        }
        return dtd_meta_map

    def _load_timeout_record(self) -> Dict[str, datetime]:
        self.log.debug(f"Loading record of KP timeouts")
        timeout_record_file = pathlib.Path(self.timeout_record_path)
        if not timeout_record_file.exists():
            return dict()
        else:
            with open(self.timeout_record_path, "rb") as timeout_file:
                return pickle.load(timeout_file)

    def make_qg_use_supported_prefixes(
            self, qg: QueryGraph, kp_name: str,
            log: ARAXResponse) -> Optional[QueryGraph]:
        """Convert each pinned qnode's curies into identifiers the given KP supports.

        For RTX-KG2 the curies are simply canonicalized; for any other KP, curies
        whose prefixes the KP doesn't support (per our meta map) are converted to
        equivalent curies with supported prefixes.

        Returns the (mutated) query graph, or None if some pinned qnode has no
        equivalent curies with supported prefixes — meaning the KP can't answer.
        """
        # Fix: consistently log via the passed-in `log` (the original mixed
        # self.log and the `log` parameter for messages in the same flow).
        for qnode_key, qnode in qg.nodes.items():
            if not qnode.ids:
                continue  # Only pinned qnodes need prefix handling
            if kp_name == "infores:rtx-kg2":
                # Just convert them into canonical curies
                qnode.ids = eu.get_canonical_curies_list(qnode.ids, log)
                continue
            # Otherwise figure out which kind of curies KPs want
            categories = eu.convert_to_list(qnode.categories)
            supported_prefixes = self._get_supported_prefixes(categories, kp_name)
            used_prefixes = {self._get_uppercase_prefix(curie) for curie in qnode.ids}
            # Only convert curie(s) if any use an unsupported prefix
            if used_prefixes.issubset(supported_prefixes):
                log.debug(
                    f"{kp_name}: All {qnode_key} curies use prefix(es) {kp_name} supports; no "
                    f"conversion necessary")
                continue
            log.debug(
                f"{kp_name}: One or more {qnode_key} curies use a prefix {kp_name} doesn't "
                f"support; will convert these")
            converted_curies = self.get_desirable_equivalent_curies(
                qnode.ids, qnode.categories, kp_name)
            if converted_curies:
                log.debug(
                    f"{kp_name}: Converted {qnode_key}'s {len(qnode.ids)} curies to a list of "
                    f"{len(converted_curies)} curies tailored for {kp_name}"
                )
                qnode.ids = converted_curies
            else:
                log.info(
                    f"{kp_name} cannot answer the query because no equivalent curies were found "
                    f"with prefixes it supports for qnode {qnode_key}. Original curies were: "
                    f"{qnode.ids}")
                return None
        return qg

    @staticmethod
    def _get_uppercase_prefix(curie: str) -> str:
        return curie.split(":")[0].upper()

    def _get_supported_prefixes(self, categories: List[str],
                                kp: str) -> Set[str]:
        """Return the uppercased curie prefixes the given KP supports for these
        categories (category descendants included, mixins excluded), according
        to the KP's entry in our meta map.
        """
        helper = BiolinkHelper()
        expanded_categories = helper.get_descendants(
            eu.convert_to_list(categories), include_mixins=False)
        # Hoist the per-KP prefix lookup table out of the loop
        prefixes_by_category = self.meta_map[kp]["prefixes"]
        supported_prefixes: Set[str] = set()
        for category in expanded_categories:
            for prefix in prefixes_by_category.get(category, set()):
                supported_prefixes.add(prefix.upper())
        return supported_prefixes