def __init__(self, response, message, params):
     """
     Store the ARAX response/message/parameters and open the COHD index.
     :param response: response object that log messages are written to
     :param message: message whose knowledge graph will be decorated
     :param params: dictionary of overlay parameters
     """
     self.response, self.message, self.parameters = response, message, params
     # FIXME: replace this with information about the KP's, KS's, and their API's
     cohd_node_types = ['chemical_substance', 'phenotypic_feature', 'disease']
     self.who_knows_about_what = {'COHD': cohd_node_types}
     self.node_curie_to_type = {}  # filled in later: node CURIE -> node type
     self.global_iter = 0
     # Kept last: constructing the index is the only step here that calls out of this function
     self.cohdIndex = COHDIndex()
Пример #2
0
 def __init__(self, response, message, params):
     """
     Store the ARAX response/message/parameters and try to open the COHD index.
     A failure to build the COHD index is reported on the response object instead of being raised.
     :param response: response object that log and error messages are written to
     :param message: message whose knowledge graph will be decorated
     :param params: dictionary of overlay parameters
     """
     self.response = response
     self.message = message
     self.parameters = params
     self.who_knows_about_what = {'COHD': ['chemical_substance', 'phenotypic_feature', 'disease', 'drug',
                                             'biolink:ChemicalSubstance', 'biolink:PhenotypicFeature', 'biolink:Disease', 'biolink:Drug']}  # FIXME: replace this with information about the KP's, KS's, and their API's
     self.node_curie_to_type = dict()
     self.global_iter = 0
     try:
         self.cohdIndex = COHDIndex()
     except Exception as exc:
         # Only ordinary errors are reported; KeyboardInterrupt/SystemExit must still propagate.
         # NOTE: self.cohdIndex is left unset on this path.
         self.response.error(traceback.format_exc(), error_code=type(exc).__name__)
         self.response.error("Internal Error encountered connecting to the local COHD database.")
class OverlayClinicalInfo:
    """
    Decorates the knowledge graph of a message with clinical statistics from COHD.

    A statistic (paired concept frequency, observed/expected ratio or chi-square p-value)
    is attached either to every existing knowledge graph edge, or, when the parameters
    contain 'virtual_relation_label', to newly created virtual edges between the nodes
    bound to the 'source_qnode_id' and 'target_qnode_id' query nodes.
    """

    # Overlays that decorate() can run. Each entry is both the parameter key and the
    # name of the method implementing it; the order is the order they are run in.
    # TODO: implement the 'associated_concept_freq' and 'relative_frequency' overlays (currently ignored)
    _IMPLEMENTED_OVERLAYS = ('paired_concept_frequency', 'chi_square',
                             'observed_expected_ratio')

    # statistic name -> (COHDIndex method name,
    #                    key read from the first row of the result,
    #                    extra keyword arguments for the method,
    #                    value reported when there is no data; None means "use the caller's default")
    # FIXME: the ln_ratio can be negative, so this should probably be accounted for, but the object model doesn't like -np.inf
    _COHD_QUERIES = {
        'paired_concept_frequency':
        ('get_paired_concept_freq', 'concept_frequency', {}, None),
        'observed_expected_ratio':
        ('get_obs_exp_ratio', 'ln_ratio', {'domain': ""}, float("-inf")),
        'chi_square':
        ('get_chi_square', 'p-value', {'domain': ""}, float("inf")),
    }

    #### Constructor
    def __init__(self, response, message, params):
        """
        :param response: response object that log and error messages are written to
        :param message: message whose knowledge graph (and query graph) is decorated
        :param params: dictionary of overlay parameters
        """
        self.response = response
        self.message = message
        self.parameters = params
        self.who_knows_about_what = {
            'COHD': ['chemical_substance', 'phenotypic_feature', 'disease']
        }  # FIXME: replace this with information about the KP's, KS's, and their API's
        self.node_curie_to_type = dict()
        self.global_iter = 0  # counter used to build unique virtual edge ids
        self.cohdIndex = COHDIndex()

    def _log_failure(self, exc, message):
        """
        Helper: write the traceback of the exception being handled, followed by a readable
        summary, to the response. Must be called from inside an ``except`` block.
        :param exc: the exception that was caught
        :param message: human readable description of what failed
        """
        self.response.error(traceback.format_exc(),
                            error_code=type(exc).__name__)
        self.response.error(message)

    def decorate(self):
        """
        Main decorator: looks at parameters and figures out which subroutine to farm out to
        :return: response object
        """
        # First, make a dictionary between node curie and type to make sure we're only looking at edges we can handle
        self.response.info(
            "Converting CURIE identifiers to human readable names")
        try:
            for node in self.message.knowledge_graph.nodes:
                self.node_curie_to_type[
                    node.id] = node.type  # WARNING: this is a list
        except Exception as exc:
            self._log_failure(exc,
                              "Something went wrong when converting names")
            return self.response

        parameters = self.parameters
        for overlay in self._IMPLEMENTED_OVERLAYS:
            # an overlay only runs when its flag is the string 'true'
            if parameters.get(overlay) == 'true':
                # The overlay methods log to self.response themselves, so nothing is merged here
                getattr(self, overlay)()

        return self.response

    def in_common(self, list1, list2):
        """
        Helper function that returns true iff list1 and list2 have any elements in common
        :param list1: a list of strings (intended to be biolink node types)
        :param list2: another list of strings (intended to be biolink node types)
        :return: True/False if they share an element in common
        """
        return not set(list1).isdisjoint(list2)

    def _query_cohd(self, name, source_omops, target_omops, default):
        """
        Helper: look up one COHD statistic for every source/target OMOP id pair.
        :param name: statistic to compute; names missing from _COHD_QUERIES yield ``default``
        :param source_omops: OMOP concept ids of the source node
        :param target_omops: OMOP concept ids of the target node
        :param default: value reported when the statistic defines no "no data" value of its own
        :return: the statistic taken from the first row returned by COHDIndex, or the "no data" value
        """
        if name not in self._COHD_QUERIES:
            return default
        method_name, result_key, extra_kwargs, no_data_value = self._COHD_QUERIES[
            name]
        value = default if no_data_value is None else no_data_value
        omop_pairs = [
            f"{omop1}_{omop2}"
            for omop1, omop2 in itertools.product(source_omops, target_omops)
        ]
        if not omop_pairs:
            return value
        res = getattr(self.cohdIndex, method_name)(
            concept_id_pair=omop_pairs, dataset_id=3,
            **extra_kwargs)  # use the hierarchical dataset
        if res:
            # NOTE(review): assumes COHDIndex returns the rows best-first (largest frequency / ln_ratio,
            # smallest p-value), so only the first row is read -- TODO check with COHD people to see if this is kosher
            value = res[0][result_key]
        return value

    def make_edge_attribute_from_curies(self,
                                        source_curie,
                                        target_curie,
                                        source_name="",
                                        target_name="",
                                        default=0.,
                                        name=""):
        """
        Generic function to make an edge attribute
        :source_curie: CURIE of the source node for the edge under consideration
        :target_curie: CURIE of the target node for the edge under consideration
        :source_name: text name of the source node (currently unused; kept for callers)
        :target_name: text name of the target node (currently unused; kept for callers)
        :default: default value of the edge attribute
        :name: name of the KP functionality you want to apply
        :return: an EdgeAttribute, or None when no knowledge provider covers both nodes
                 or when an error occurred (the error is logged on the response)
        """
        # Bound before the try so the error message below can always be formatted,
        # even when the failure happens before a knowledge provider was chosen.
        KP_to_use = None
        try:
            source_type = self.node_curie_to_type[source_curie]
            target_type = self.node_curie_to_type[target_curie]
            # figure out which knowledge provider to use  # TODO: should handle this in a more structured fashion, does there exist a standardized KP API format?
            for KP, known_types in self.who_knows_about_what.items():
                # see which KP's can label both sources of information (the last match wins)
                if self.in_common(source_type,
                                  known_types) and self.in_common(
                                      target_type, known_types):
                    KP_to_use = KP
            if KP_to_use != 'COHD':
                return None

            # convert CURIE to OMOP identifiers
            source_OMOPs = list(self.cohdIndex.get_concept_ids(source_curie))
            target_OMOPs = list(self.cohdIndex.get_concept_ids(target_curie))
            value = self._query_cohd(name, source_OMOPs, target_OMOPs,
                                     default)

            # FIXME: unclear in object model if attribute type dictates value type, or if value always needs to be a string
            return EdgeAttribute(type="EDAM:data_0951",
                                 name=name,
                                 value=str(value),
                                 url="http://cohd.smart-api.info/")
        except Exception as exc:
            self._log_failure(
                exc,
                f"Something went wrong when adding the edge attribute from {KP_to_use}."
            )
            return None

    def add_virtual_edge(self, name="", default=0.):
        """
        Generic function to add a virtual edge to the KG an QG
        Reads 'source_qnode_id', 'target_qnode_id' and 'virtual_relation_label' from the parameters.
        :name: name of the functionality of the KP to use
        :default: default value of the edge attribute
        """
        parameters = self.parameters
        source_curies_to_decorate = set()
        target_curies_to_decorate = set()
        curies_to_names = dict(
        )  # FIXME: Super hacky way to get around the fact that COHD can't map CHEMBL drugs
        # identify the nodes that we should be adding virtual edges for
        for node in self.message.knowledge_graph.nodes:
            qnode_ids = getattr(node, 'qnode_ids', None)
            if not qnode_ids:
                # node is not bound to any query node (attribute missing, None or empty)
                continue
            if parameters['source_qnode_id'] in qnode_ids:
                source_curies_to_decorate.add(node.id)
                curies_to_names[node.id] = node.name
            if parameters['target_qnode_id'] in qnode_ids:
                target_curies_to_decorate.add(node.id)
                curies_to_names[node.id] = node.name

        added_flag = False  # check to see if any edges where added
        # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
        for source_curie, target_curie in itertools.product(
                source_curies_to_decorate, target_curies_to_decorate):
            # create the edge attribute if it can be
            edge_attribute = self.make_edge_attribute_from_curies(
                source_curie,
                target_curie,
                source_name=curies_to_names[source_curie],
                target_name=curies_to_names[target_curie],
                default=default,
                name=name)
            if not edge_attribute:
                continue
            added_flag = True

            # make the edge, add the attribute
            relation = parameters['virtual_relation_label']
            edge_id = f"{relation}_{self.global_iter}"
            self.global_iter += 1
            edge = Edge(
                id=edge_id,
                type=f"has_{name}_with",
                relation=relation,
                source_id=source_curie,
                target_id=target_curie,
                is_defined_by="ARAX",
                defined_datetime=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                provided_by="ARAX",
                confidence=None,
                weight=None,  # TODO: could make the actual value of the attribute
                edge_attributes=[edge_attribute],
                qedge_ids=[relation])
            self.message.knowledge_graph.edges.append(edge)

        # Now add a q_edge the query_graph since I've added an extra edge to the KG
        if added_flag:
            relation = parameters['virtual_relation_label']
            q_edge = QEdge(
                id=relation,
                type=f"has_{name}_with",
                relation=relation,
                source_id=parameters['source_qnode_id'],
                target_id=parameters['target_qnode_id']
            )  # TODO: ok to make the id and type the same thing?
            self.message.query_graph.edges.append(q_edge)

    def add_all_edges(self, name="", default=0.):
        """
        Attach the requested attribute to every existing knowledge graph edge it can be computed for
        :name: name of the functionality of the KP to use
        :default: default value of the edge attribute
        """
        curies_to_names = {
            node.id: node.name
            for node in self.message.knowledge_graph.nodes
        }
        for edge in self.message.knowledge_graph.edges:
            if not edge.edge_attributes:  # populate if not already there
                edge.edge_attributes = []
            source_curie = edge.source_id
            target_curie = edge.target_id
            edge_attribute = self.make_edge_attribute_from_curies(
                source_curie,
                target_curie,
                source_name=curies_to_names[source_curie],
                target_name=curies_to_names[target_curie],
                default=default,
                name=name
            )  # FIXME: Super hacky way to get around the fact that COHD can't map CHEMBL drugs
            if edge_attribute:  # make sure an edge attribute was actually created
                edge.edge_attributes.append(edge_attribute)

    def _overlay(self, name, default, label):
        """
        Helper shared by the public overlay methods: log what is about to happen, then add
        the statistic to virtual edges (when 'virtual_relation_label' is a parameter) or to
        all existing edges. Errors are logged on the response rather than raised.
        :param name: statistic name handed to add_virtual_edge / add_all_edges
        :param default: default value of the edge attribute
        :param label: human readable name of the statistic used in the log messages
        """
        self.response.debug(f"Computing {label}.")
        self.response.info(
            f"Overlaying {label} utilizing Columbia Open Health Data. This calls an external knowledge provider and may take a while"
        )

        # Now add the edges or virtual edges
        try:
            if 'virtual_relation_label' in self.parameters:
                self.add_virtual_edge(name=name, default=default)
            else:  # otherwise, just add to existing edges in the KG
                self.add_all_edges(name=name, default=default)
        except Exception as exc:
            self._log_failure(
                exc, "Something went wrong when overlaying clinical info")

    def paired_concept_frequency(self, default=0):
        """
        calculate paired concept frequency.
        Retrieves observed clinical frequencies of a pair of concepts.
        :param default: value reported when COHD has no frequency for the pair
        """
        self._overlay("paired_concept_frequency", default,
                      "paired concept frequencies")

    def observed_expected_ratio(self, default=0):
        """
        Returns the natural logarithm of the ratio between the observed count and expected count.
        Expected count is calculated from the single concept frequencies and assuming independence between the concepts.
        Results are returned as maximum over all ln_ratios matching to OMOP concept id.
        :param default: passed through to the edge helpers; note that when COHD has no data
                        for a pair the reported value is -inf, not this default
        """
        self._overlay("observed_expected_ratio", default,
                      "observed expected ratios")

    def chi_square(self, default=float("inf")):
        """
        Returns the chi-square statistic and p-value between pairs of concepts. Results are returned in descending order of the chi-square statistic. Note that due to large sample sizes, the chi-square can become very large.
        The expected frequencies for the chi-square analysis are calculated based on the single concept frequencies and assuming independence between concepts. P-value is calculated with 1 DOF.
        :param default: passed through to the edge helpers; when COHD has no data for a
                        pair the reported p-value is inf
        """
        self._overlay("chi_square", default, "Chi square p-values")
Пример #4
0
class OverlayClinicalInfo:

    #### Constructor
    def __init__(self, response, message, params):
        """
        Store the ARAX response/message/parameters and try to open the COHD index.
        A failure to build the COHD index is reported on the response object instead of being raised.
        :param response: response object that log and error messages are written to
        :param message: message whose knowledge graph will be decorated
        :param params: dictionary of overlay parameters
        """
        self.response = response
        self.message = message
        self.parameters = params
        self.who_knows_about_what = {
            'COHD': [
                'small_molecule', 'phenotypic_feature', 'disease', 'drug',
                'biolink:SmallMolecule', 'biolink:PhenotypicFeature',
                'biolink:Disease', 'biolink:Drug'
            ]
        }  # FIXME: replace this with information about the KP's, KS's, and their API's
        self.node_curie_to_type = dict()
        self.biolink_helper = BiolinkHelper()
        self.global_iter = 0
        try:
            self.cohdIndex = COHDIndex()
        except Exception as exc:
            # Only ordinary errors are reported; KeyboardInterrupt/SystemExit must still propagate.
            # NOTE: self.cohdIndex is left unset on this path.
            self.response.error(traceback.format_exc(),
                                error_code=type(exc).__name__)
            self.response.error(
                "Internal Error encountered connecting to the local COHD database."
            )

    def decorate(self):
        """
        Main decorator: looks at parameters and figures out which subroutine to farm out to
        :param parameters:
        :return: response object
        """
        # First, make a dictionary between node curie and type to make sure we're only looking at edges we can handle
        self.response.info(
            "Converting CURIE identifiers to human readable names")
        try:
            for key, node in self.message.knowledge_graph.nodes.items():
                self.node_curie_to_type[
                    key] = node.categories  # WARNING: this is a list
        except:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(f"Something went wrong when converting names")
            return self.response

        parameters = self.parameters
        if 'paired_concept_frequency' in parameters:
            if parameters['paired_concept_frequency'] == 'true':
                self.paired_concept_frequency()
                # TODO: should I return the response and merge, or is it passed by reference and just return at the end?
        if 'associated_concept_freq' in parameters:
            if parameters['associated_concept_freq'] == 'true':
                #self.associated_concept_freq()  # TODO: make this function, and all the other COHD functions too
                pass
        if 'chi_square' in parameters:
            if parameters['chi_square'] == 'true':
                self.chi_square(
                )  # TODO: make this function, and all the other COHD functions too
                pass
        if 'observed_expected_ratio' in parameters:
            if parameters['observed_expected_ratio'] == 'true':
                self.observed_expected_ratio(
                )  # TODO: make this function, and all the other COHD functions too
                pass
        if 'relative_frequency' in parameters:
            if parameters['relative_frequency'] == 'true':
                #self.associated_concept_freq()  # TODO: make this function, and all the other COHD functions too
                pass

        return self.response

    def in_common(self, list1, list2):
        """
        Helper function: do the two collections overlap?
        :param list1: a list of strings (intended to be biolink node types)
        :param list2: another list of strings (intended to be biolink node types)
        :return: True if at least one element occurs in both lists, False otherwise
        """
        shared = set(list1) & set(list2)
        return bool(shared)

    def make_edge_attribute_from_curies(self,
                                        subject_curie,
                                        object_curie,
                                        subject_name="",
                                        object_name="",
                                        default=0.,
                                        name=""):
        """
        Generic function to make an edge attribute
        :subject_curie: CURIE of the subject node for the edge under consideration
        :object_curie: CURIE of the object node for the edge under consideration
        :subject_name: text name of the subject node (in case the KP doesn't understand the CURIE)
        :object: text name of the object node (in case the KP doesn't understand the CURIE)
        :default: default value of the edge attribute
        :name: name of the KP functionality you want to apply
        """
        try:
            # edge attributes
            name = name
            type = "EDAM:data_0951"
            url = "http://cohd.smart-api.info/"
            value = default

            node_curie_to_type = self.node_curie_to_type
            subject_type = node_curie_to_type[subject_curie]
            object_type = node_curie_to_type[object_curie]
            # figure out which knowledge provider to use  # TODO: should handle this in a more structured fashion, does there exist a standardized KP API format?
            KP_to_use = None
            for KP in self.who_knows_about_what:
                # see which KP's can label both subjects of information
                if self.in_common(
                        self.biolink_helper.get_descendants(
                            subject_type, include_mixins=False),
                        self.who_knows_about_what[KP]) and self.in_common(
                            self.biolink_helper.get_descendants(
                                object_type, include_mixins=False),
                            self.who_knows_about_what[KP]):
                    KP_to_use = KP

            if KP_to_use == 'COHD':
                self.response.debug(
                    f"Querying Columbia Open Health data for info about {subject_name} and {object_name}"
                )
                # convert CURIE to OMOP identifiers
                # subject_OMOPs = [str(x['omop_standard_concept_id']) for x in COHD.get_xref_to_OMOP(subject_curie, 1)]
                res = self.mapping_curie_to_omop_ids.get(subject_curie, [])
                if len(res) != 0:
                    subject_OMOPs = res
                else:
                    subject_OMOPs = []
                # object_OMOPs = [str(x['omop_standard_concept_id']) for x in COHD.get_xref_to_OMOP(object_curie, 1)]
                res = self.mapping_curie_to_omop_ids.get(object_curie, [])
                if len(res) != 0:
                    object_OMOPs = res
                else:
                    object_OMOPs = []
                # for domain in ["Condition", "Drug", "Procedure"]:
                #     subject_OMOPs.update([str(x['concept_id']) for x in COHD.find_concept_ids(subject_name, domain=domain, dataset_id=3)])
                #     object_OMOPs.update([str(x['concept_id']) for x in COHD.find_concept_ids(object_name, domain=domain, dataset_id=3)])
                #################################################
                # FIXME: this was the old way
                # FIXME: Super hacky way to get around the fact that COHD can't map CHEMBL drugs
                # if subject_curie.split('.')[0] == 'CHEMBL':
                #     subject_OMOPs = [str(x['concept_id']) for x in
                #                     COHD.find_concept_ids(subject_name, domain="Drug", dataset_id=3)]
                # if object_curie.split('.')[0] == 'CHEMBL':
                #     object_OMOPs = [str(x['concept_id']) for x in
                #                     COHD.find_concept_ids(object_name, domain="Drug", dataset_id=3)]

                # uniquify everything
                # subject_OMOPs = list(set(subject_OMOPs))
                # object_OMOPs = list(set(object_OMOPs))

                # Decide how to handle the response from the KP
                if name == 'paired_concept_frequency':
                    # sum up all frequencies  #TODO check with COHD people to see if this is kosher
                    frequency = default
                    # for (omop1, omop2) in itertools.product(subject_OMOPs, object_OMOPs):
                    #     freq_data_list = self.cohdIndex.get_paired_concept_freq(omop1, omop2, 3) # use the hierarchical dataset
                    #     if len(freq_data_list) != 0:
                    #         freq_data = freq_data_list[0]
                    #         temp_value = freq_data['concept_frequency']
                    #         if temp_value > frequency:
                    #             frequency = temp_value
                    omop_pairs = [
                        f"{omop1}_{omop2}"
                        for (omop1, omop2) in itertools.product(
                            subject_OMOPs, object_OMOPs)
                    ]
                    if len(omop_pairs) != 0:
                        res = self.cohdIndex.get_paired_concept_freq(
                            concept_id_pair=omop_pairs,
                            dataset_id=3)  # use the hierarchical dataset
                        if len(res) != 0:
                            maximum_concept_frequency = res[0][
                                'concept_frequency']  # the result returned from get_paired_concept_freq was sorted by decreasing order
                            frequency = maximum_concept_frequency
                    # decorate the edges
                    value = frequency

                elif name == 'observed_expected_ratio':
                    # should probably take the largest obs/exp ratio  # TODO: check with COHD people to see if this is kosher
                    # FIXME: the ln_ratio can be negative, so I should probably account for this, but the object model doesn't like -np.inf
                    value = float(
                        "-inf"
                    )  # FIXME: unclear in object model if attribute type dictates value type, or if value always needs to be a string

                    ###############################
                    # The following code was an experiment to see if it would speed things up, leaving it out for now since it's difficult to quantify if it does speed things up given the cacheing
                    #if len(subject_OMOPs) < len(object_OMOPs):
                    #    for omop1 in subject_OMOPs:
                    #        omop_to_ln_ratio = dict()
                    #        response = COHD.get_obs_exp_ratio(omop1, domain="", dataset_id=3)  # use the hierarchical dataset
                    #        if response:
                    #            for res in response:
                    #                omop_to_ln_ratio[str(res['concept_id_2'])] = res['ln_ratio']
                    #        for omop2 in object_OMOPs:
                    #            if omop2 in omop_to_ln_ratio:
                    #                temp_value = omop_to_ln_ratio[omop2]
                    #                if temp_value > value:
                    #                    value = temp_value
                    #else:
                    #    for omop1 in object_OMOPs:
                    #        omop_to_ln_ratio = dict()
                    #        response = COHD.get_obs_exp_ratio(omop1, domain="", dataset_id=3)  # use the hierarchical dataset
                    #        if response:
                    #            for res in response:
                    #                omop_to_ln_ratio[str(res['concept_id_2'])] = res['ln_ratio']
                    #        for omop2 in subject_OMOPs:
                    #            if omop2 in omop_to_ln_ratio:
                    #                temp_value = omop_to_ln_ratio[omop2]
                    #                if temp_value > value:
                    #                    value = temp_value
                    ###################################

                    # for (omop1, omop2) in itertools.product(subject_OMOPs, object_OMOPs):
                    #     #print(f"{omop1},{omop2}")
                    #     response = self.cohdIndex.get_obs_exp_ratio(omop1, concept_id_2=omop2, domain="", dataset_id=3)  # use the hierarchical dataset
                    #     # response is a list, since this function is overloaded and can omit concept_id_2, take the first element
                    #     if response and 'ln_ratio' in response[0]:
                    #         temp_val = response[0]['ln_ratio']
                    #         if temp_val > value:
                    #             value = temp_val
                    omop_pairs = [
                        f"{omop1}_{omop2}"
                        for (omop1, omop2) in itertools.product(
                            subject_OMOPs, object_OMOPs)
                    ]
                    if len(omop_pairs) != 0:
                        res = self.cohdIndex.get_obs_exp_ratio(
                            concept_id_pair=omop_pairs,
                            domain="",
                            dataset_id=3)  # use the hierarchical dataset
                        if len(res) != 0:
                            maximum_ln_ratio = res[0][
                                'ln_ratio']  # the result returned from get_paired_concept_freq was sorted by decreasing order
                            value = maximum_ln_ratio

                elif name == 'chi_square':
                    value = float("inf")
                    # for (omop1, omop2) in itertools.product(subject_OMOPs, object_OMOPs):
                    #     response = self.cohdIndex.get_chi_square(omop1, concept_id_2=omop2, domain="", dataset_id=3)  # use the hierarchical dataset
                    #     # response is a list, since this function is overloaded and can omit concept_id_2, take the first element
                    #     if response and 'p-value' in response[0]:
                    #         temp_val = response[0]['p-value']
                    #         if temp_val < value:  # looking at p=values, so lower is better
                    #             value = temp_val
                    omop_pairs = [
                        f"{omop1}_{omop2}"
                        for (omop1, omop2) in itertools.product(
                            subject_OMOPs, object_OMOPs)
                    ]
                    if len(omop_pairs) != 0:
                        res = self.cohdIndex.get_chi_square(
                            concept_id_pair=omop_pairs,
                            domain="",
                            dataset_id=3)  # use the hierarchical dataset
                        if len(res) != 0:
                            minimum_pvalue = res[0][
                                'p-value']  # the result returned from get_paired_concept_freq was sorted by decreasing order
                            value = minimum_pvalue

                # create the edge attribute
                edge_attribute = EdgeAttribute(
                    attribute_type_id=type,
                    original_attribute_name=name,
                    value=str(value),
                    value_url=url
                )  # populate the edge attribute # FIXME: unclear in object model if attribute type dictates value type, or if value always needs to be a string
                return edge_attribute
            else:
                return None
        except:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(tb, error_code=error_type.__name__)
            self.response.error(
                f"Something went wrong when adding the edge attribute from {KP_to_use}."
            )

    def add_virtual_edge(self, name="", default=0.):
        """
        Generic function to add virtual edges to the KG and the matching edge to the QG.

        Every knowledge-graph node bound to ``parameters['subject_qnode_key']`` is paired
        with every node bound to ``parameters['object_qnode_key']``.  For each pair for
        which ``make_edge_attribute_from_curies`` yields an attribute, a new edge keyed
        ``<virtual_relation_label>_<global_iter>`` is stored in the knowledge graph and,
        when the message has results, ``ou.update_results_with_overlay_edge`` is called
        for it.  If at least one edge was created, a query-graph edge keyed by the
        virtual relation label is stored as well.

        Nodes without a ``qnode_keys`` attribute, or whose ``qnode_keys`` is ``None``,
        are ignored.

        :param name: name of the functionality of the KP to use (forwarded to
            ``make_edge_attribute_from_curies``)
        :param default: default value forwarded to ``make_edge_attribute_from_curies``
        :return: None
        """
        parameters = self.parameters
        edge_type = "biolink:has_real_world_evidence_of_association_with"

        subject_curies_to_decorate = set()
        object_curies_to_decorate = set()
        # FIXME: Super hacky way to get around the fact that COHD can't map CHEMBL drugs
        curies_to_names = dict()

        # identify the nodes that we should be adding virtual edges for
        for key, node in self.message.knowledge_graph.nodes.items():
            node_qnode_keys = getattr(node, 'qnode_keys', None)
            if node_qnode_keys is None:
                # not bound to any query node, so there is nothing to decorate
                continue
            if parameters['subject_qnode_key'] in node_qnode_keys:
                subject_curies_to_decorate.add(key)
                curies_to_names[key] = node.name
            if parameters['object_qnode_key'] in node_qnode_keys:
                object_curies_to_decorate.add(key)
                curies_to_names[key] = node.name

        # call the COHD index one time for all curies to save time
        self.mapping_curie_to_omop_ids = self.cohdIndex.get_concept_ids(
            subject_curies_to_decorate | object_curies_to_decorate)

        added_flag = False  # becomes True as soon as one virtual edge is added
        # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute
        for subject_curie, object_curie in itertools.product(
                subject_curies_to_decorate, object_curies_to_decorate):
            # create the edge attribute if it can be
            edge_attribute = self.make_edge_attribute_from_curies(
                subject_curie,
                object_curie,
                subject_name=curies_to_names[subject_curie],
                object_name=curies_to_names[object_curie],
                default=default,
                name=name)
            if not edge_attribute:
                continue
            added_flag = True

            # edge properties
            relation = parameters['virtual_relation_label']
            defined_datetime = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            provided_by = "infores:arax"

            # ensure the edge key is unique: on a collision, append a random 9-digit suffix
            # might need to change after expand is implemented for TRAPI 1.0
            edge_key = f"{relation}_{self.global_iter}"
            while edge_key in self.message.knowledge_graph.edges:
                edge_key = f"{relation}_{self.global_iter}.{random.randint(10**(9-1), (10**9)-1)}"
            self.global_iter += 1

            edge_attribute_list = [
                edge_attribute,
                EdgeAttribute(
                    original_attribute_name="virtual_relation_label",
                    value=relation,
                    attribute_type_id="biolink:Unknown"),
                EdgeAttribute(
                    original_attribute_name="defined_datetime",
                    value=defined_datetime,
                    attribute_type_id="metatype:Datetime"),
                EdgeAttribute(
                    original_attribute_name="provided_by",
                    value=provided_by,
                    attribute_type_id="biolink:aggregator_knowledge_source",
                    attribute_source=provided_by,
                    value_type_id="biolink:InformationResource"),
                EdgeAttribute(
                    original_attribute_name=None,
                    value=True,
                    attribute_type_id="biolink:computed_value",
                    attribute_source="infores:arax-reasoner-ara",
                    value_type_id="metatype:Boolean",
                    value_url=None,
                    description=
                    "This edge is a container for a computed value between two nodes that is not directly attachable to other edges."
                ),
            ]

            # now actually add the virtual edge in
            edge = Edge(predicate=edge_type,
                        subject=subject_curie,
                        object=object_curie,
                        attributes=edge_attribute_list)
            edge.qedge_keys = [relation]
            self.message.knowledge_graph.edges[edge_key] = edge
            if self.message.results is not None and len(
                    self.message.results) > 0:
                ou.update_results_with_overlay_edge(
                    subject_knode_key=subject_curie,
                    object_knode_key=object_curie,
                    kedge_key=edge_key,
                    message=self.message,
                    log=self.response)

        # Now add a q_edge to the query_graph since an extra edge was added to the KG
        if added_flag:
            relation = parameters['virtual_relation_label']
            subject_qnode_key = parameters['subject_qnode_key']
            object_qnode_key = parameters['object_qnode_key']
            option_group_id = ou.determine_virtual_qedge_option_group(
                subject_qnode_key, object_qnode_key, self.message.query_graph,
                self.response)
            q_edge = QEdge(predicates=edge_type,
                           subject=subject_qnode_key,
                           object=object_qnode_key,
                           option_group_id=option_group_id)
            q_edge.relation = relation
            self.message.query_graph.edges[relation] = q_edge

    def add_all_edges(self, name="", default=0.):
        """
        Decorate every existing knowledge-graph edge with an attribute from the KP.

        The curie -> OMOP id mapping for all knowledge-graph nodes is fetched in one call
        and stored on ``self.mapping_curie_to_omop_ids``.  Each edge then receives the
        attribute produced by ``make_edge_attribute_from_curies`` for its subject and
        object, when one is produced.

        :param name: name of the functionality of the KP to use
        :param default: default value forwarded to ``make_edge_attribute_from_curies``
        """
        # FIXME: node names are passed along as a super hacky way to get around the fact
        # that COHD can't map CHEMBL drugs
        names_by_curie = {
            curie: knode.name
            for curie, knode in self.message.knowledge_graph.nodes.items()
        }
        self.mapping_curie_to_omop_ids = self.cohdIndex.get_concept_ids(
            set(names_by_curie))

        for kedge in self.message.knowledge_graph.edges.values():
            # give the edge an attribute list if it has none (or an empty/None one)
            if not kedge.attributes:
                kedge.attributes = []
            source_curie, target_curie = kedge.subject, kedge.object
            new_attribute = self.make_edge_attribute_from_curies(
                source_curie,
                target_curie,
                subject_name=names_by_curie[source_curie],
                object_name=names_by_curie[target_curie],
                default=default,
                name=name)
            # only attach when an attribute was actually created
            if new_attribute:
                kedge.attributes.append(new_attribute)

    def paired_concept_frequency(self, default=0):
        """
        Overlay paired concept frequencies (observed clinical frequencies of a pair of
        concepts) using Columbia Open Health Data.

        What gets decorated depends on ``self.parameters``:

        * no ``virtual_relation_label``: existing KG edges are decorated via
          ``add_all_edges``;
        * ``virtual_relation_label`` plus both ``subject_qnode_key`` and
          ``object_qnode_key``: one call to ``add_virtual_edge`` for that pair;
        * ``virtual_relation_label`` only: ``add_virtual_edge`` is called once per
          distinct (unordered) qnode pair connected by a query-graph edge.  The qnode
          keys are placed in ``self.parameters`` for the duration of each call and
          removed again afterwards, even when the call fails.

        Failures are reported through ``self.response.error`` instead of being raised.

        :param default: default value forwarded to the edge-adding helpers
        :return: None
        """
        parameters = self.parameters
        self.response.debug("Computing paired concept frequencies.")
        self.response.info(
            "Overlaying paired concept frequencies utilizing Columbia Open Health Data. This calls an external knowledge provider and may take a while"
        )

        # Now add the edges or virtual edges
        try:
            if 'virtual_relation_label' not in parameters:
                # just add to existing edges in the KG
                self.add_all_edges(name="paired_concept_frequency",
                                   default=default)
            elif 'subject_qnode_key' in parameters and 'object_qnode_key' in parameters:
                self.add_virtual_edge(name="paired_concept_frequency",
                                      default=default)
            else:
                seen_node_pairs = set()
                # iterate over a copy: add_virtual_edge stores new edges in the query graph
                qgraph_edges = copy.deepcopy(
                    list(self.response.envelope.message.query_graph.edges.
                         values()))
                for query_edge in qgraph_edges:
                    current_subject_qnode_key = query_edge.subject
                    current_object_qnode_key = query_edge.object
                    # order the pair so that A-B and B-A count as the same pair
                    if current_subject_qnode_key < current_object_qnode_key:
                        qnode_key_pair = (current_subject_qnode_key,
                                          current_object_qnode_key)
                    else:
                        qnode_key_pair = (current_object_qnode_key,
                                          current_subject_qnode_key)
                    # FW: check if we have already added an edge for this pair
                    if qnode_key_pair in seen_node_pairs:
                        continue
                    seen_node_pairs.add(qnode_key_pair)
                    parameters['subject_qnode_key'] = current_subject_qnode_key
                    parameters['object_qnode_key'] = current_object_qnode_key
                    try:
                        self.add_virtual_edge(name="paired_concept_frequency",
                                              default=default)
                    finally:
                        # never leave the temporary keys behind, even on failure
                        parameters.pop('subject_qnode_key', None)
                        parameters.pop('object_qnode_key', None)
        except Exception as error:
            tb = traceback.format_exc()
            self.response.error(tb, error_code=type(error).__name__)
            self.response.error(
                "Something went wrong when overlaying clinical info")

    def observed_expected_ratio(self, default=0):
        """
        Overlay the natural logarithm of the ratio between the observed count and expected count.
        Expected count is calculated from the single concept frequencies and assuming independence between the concepts.
        Results are returned as maximum over all ln_ratios matching to OMOP concept id.

        What gets decorated depends on ``self.parameters``:

        * no ``virtual_relation_label``: existing KG edges are decorated via
          ``add_all_edges``;
        * ``virtual_relation_label`` plus both ``subject_qnode_key`` and
          ``object_qnode_key``: one call to ``add_virtual_edge`` for that pair;
        * ``virtual_relation_label`` only: ``add_virtual_edge`` is called once per
          distinct (unordered) qnode pair connected by a query-graph edge.  The qnode
          keys are placed in ``self.parameters`` for the duration of each call and
          removed again afterwards, even when the call fails.

        Failures are reported through ``self.response.error`` instead of being raised.

        :param default: default value forwarded to the edge-adding helpers
        :return: None
        """
        parameters = self.parameters
        self.response.debug("Computing observed expected ratios.")
        self.response.info(
            "Overlaying observed expected ratios utilizing Columbia Open Health Data. This calls an external knowledge provider and may take a while"
        )

        # Now add the edges or virtual edges
        try:
            if 'virtual_relation_label' not in parameters:
                # just add to existing edges in the KG
                self.add_all_edges(name="observed_expected_ratio",
                                   default=default)
            elif 'subject_qnode_key' in parameters and 'object_qnode_key' in parameters:
                self.add_virtual_edge(name="observed_expected_ratio",
                                      default=default)
            else:
                seen_node_pairs = set()
                # iterate over a copy: add_virtual_edge stores new edges in the query graph
                qgraph_edges = copy.deepcopy(
                    list(self.response.envelope.message.query_graph.edges.
                         values()))
                for query_edge in qgraph_edges:
                    current_subject_qnode_key = query_edge.subject
                    current_object_qnode_key = query_edge.object
                    # order the pair so that A-B and B-A count as the same pair
                    if current_subject_qnode_key < current_object_qnode_key:
                        qnode_key_pair = (current_subject_qnode_key,
                                          current_object_qnode_key)
                    else:
                        qnode_key_pair = (current_object_qnode_key,
                                          current_subject_qnode_key)
                    # FW: check if we have already added an edge for this pair
                    if qnode_key_pair in seen_node_pairs:
                        continue
                    seen_node_pairs.add(qnode_key_pair)
                    parameters['subject_qnode_key'] = current_subject_qnode_key
                    parameters['object_qnode_key'] = current_object_qnode_key
                    try:
                        self.add_virtual_edge(name="observed_expected_ratio",
                                              default=default)
                    finally:
                        # never leave the temporary keys behind, even on failure
                        parameters.pop('subject_qnode_key', None)
                        parameters.pop('object_qnode_key', None)
        except Exception as error:
            tb = traceback.format_exc()
            self.response.error(tb, error_code=type(error).__name__)
            self.response.error(
                "Something went wrong when overlaying clinical info")

    def chi_square(self, default=float("inf")):
        """
        Overlay the chi-square p-value between pairs of concepts using Columbia Open Health Data.
        Note that due to large sample sizes, the chi-square can become very large.
        The expected frequencies for the chi-square analysis are calculated based on the single concept frequencies and assuming independence between concepts. P-value is calculated with 1 DOF.

        What gets decorated depends on ``self.parameters``:

        * no ``virtual_relation_label``: existing KG edges are decorated via
          ``add_all_edges``;
        * ``virtual_relation_label`` plus both ``subject_qnode_key`` and
          ``object_qnode_key``: one call to ``add_virtual_edge`` for that pair;
        * ``virtual_relation_label`` only: ``add_virtual_edge`` is called once per
          distinct (unordered) qnode pair connected by a query-graph edge.  The qnode
          keys are placed in ``self.parameters`` for the duration of each call and
          removed again afterwards, even when the call fails.

        Failures are reported through ``self.response.error`` instead of being raised.

        :param default: default value forwarded to the edge-adding helpers
        :return: None
        """
        parameters = self.parameters
        self.response.debug("Computing Chi square p-values.")
        self.response.info(
            "Overlaying Chi square p-values utilizing Columbia Open Health Data. This calls an external knowledge provider and may take a while"
        )

        # Now add the edges or virtual edges
        try:
            if 'virtual_relation_label' not in parameters:
                # just add to existing edges in the KG
                self.add_all_edges(name="chi_square", default=default)
            elif 'subject_qnode_key' in parameters and 'object_qnode_key' in parameters:
                self.add_virtual_edge(name="chi_square", default=default)
            else:
                seen_node_pairs = set()
                # iterate over a copy: add_virtual_edge stores new edges in the query graph
                qgraph_edges = copy.deepcopy(
                    list(self.response.envelope.message.query_graph.edges.
                         values()))
                for query_edge in qgraph_edges:
                    current_subject_qnode_key = query_edge.subject
                    current_object_qnode_key = query_edge.object
                    # order the pair so that A-B and B-A count as the same pair
                    if current_subject_qnode_key < current_object_qnode_key:
                        qnode_key_pair = (current_subject_qnode_key,
                                          current_object_qnode_key)
                    else:
                        qnode_key_pair = (current_object_qnode_key,
                                          current_subject_qnode_key)
                    # FW: check if we have already added an edge for this pair
                    if qnode_key_pair in seen_node_pairs:
                        continue
                    seen_node_pairs.add(qnode_key_pair)
                    parameters['subject_qnode_key'] = current_subject_qnode_key
                    parameters['object_qnode_key'] = current_object_qnode_key
                    try:
                        self.add_virtual_edge(name="chi_square",
                                              default=default)
                    finally:
                        # never leave the temporary keys behind, even on failure
                        parameters.pop('subject_qnode_key', None)
                        parameters.pop('object_qnode_key', None)
        except Exception as error:
            tb = traceback.format_exc()
            self.response.error(tb, error_code=type(error).__name__)
            self.response.error(
                "Something went wrong when overlaying clinical info")
Пример #5
0
# NOTE(review): `pathlist` and `RTXindex` are defined earlier in the file (not visible
# here); presumably the path components of this file and the index of the repository
# root within them -- confirm against the definitions.

# Make the ARAXQuery directory importable before importing the query/response classes.
sys.path.append(
    os.path.sep.join([*pathlist[:(RTXindex + 1)], 'code', 'ARAX',
                      'ARAXQuery']))
from ARAX_query import ARAXQuery
from ARAX_response import ARAXResponse
# Make the NodeSynonymizer directory importable.
sys.path.append(
    os.path.sep.join(
        [*pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'NodeSynonymizer/']))
from node_synonymizer import NodeSynonymizer  ##Note: For different version of kg2, use corresponding nodesynonymizer
# Make the local COHD scripts directory importable.
sys.path.append(
    os.path.sep.join([
        *pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'KnowledgeSources',
        'COHD_local', 'scripts'
    ]))
from COHDIndex import COHDIndex
# Module-level COHDIndex instance, constructed when this module is imported.
cohdindex = COHDIndex()

# Get the file paths for the databases
# Directory path built under KnowledgeSources/Prediction.
dtd_filepath = os.path.sep.join([
    *pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'KnowledgeSources',
    'Prediction'
])
# Directory path built under KnowledgeSources/COHD_local/data.
cohd_filepath = os.path.sep.join([
    *pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'KnowledgeSources',
    'COHD_local', 'data'
])

# Import RTX config
# The 'code' directory must be on sys.path for the RTXConfiguration import below.
sys.path.append(os.path.sep.join([*pathlist[:(RTXindex + 1)], 'code']))
from RTXConfiguration import RTXConfiguration