Example #1
    def __init__(self, pubmed_directory_path, is_test):
        self.pubmed_directory_path = pubmed_directory_path
        self.conceptname_to_pmids_db_path = "conceptname_to_pmids.db"
        self.curie_to_pmids_db_path = "curie_to_pmids.sqlite"
        self.status = 'OK'
        self.synonymizer = NodeSynonymizer()
        self.is_test = is_test
Example #2
def get_canonical_curies_list(curie: Union[str, List[str]], log: ARAXResponse) -> List[str]:
    curies = convert_to_list(curie)
    try:
        synonymizer = NodeSynonymizer()
        log.debug(f"Sending NodeSynonymizer.get_canonical_curies() a list of {len(curies)} curies")
        canonical_curies_dict = synonymizer.get_canonical_curies(curies)
        log.debug(f"Got response back from NodeSynonymizer")
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        log.error(f"Encountered a problem using NodeSynonymizer: {tb}", error_code=error_type.__name__)
        return []
    else:
        if canonical_curies_dict is not None:
            recognized_input_curies = {input_curie for input_curie in canonical_curies_dict if canonical_curies_dict.get(input_curie)}
            unrecognized_curies = set(curies).difference(recognized_input_curies)
            if unrecognized_curies:
                log.warning(f"NodeSynonymizer did not return canonical info for: {unrecognized_curies}")
            canonical_curies = {canonical_curies_dict[recognized_curie].get('preferred_curie') for recognized_curie in recognized_input_curies}
            # Include any original curies we weren't able to find a canonical version for
            canonical_curies.update(unrecognized_curies)
            if not canonical_curies:
                log.error(f"Final list of canonical curies is empty. This shouldn't happen!", error_code="CanonicalCurieIssue")
            return list(canonical_curies)
        else:
            log.error(f"NodeSynonymizer returned None", error_code="NodeNormalizationIssue")
            return []
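A minimal usage sketch for the helper above. The import path and curie values are assumptions for illustration; the function itself is defined in Example #2.

from ARAX_response import ARAXResponse  # assumed import path within the RTX/ARAX code tree

log = ARAXResponse()
# Curies below are illustrative only
canonical = get_canonical_curies_list(["CHEMBL.COMPOUND:CHEMBL112", "MONDO:0005148"], log)
print(canonical)  # de-duplicated list of preferred curies (unrecognized inputs are passed through)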
Example #3
def estimate_percent_nodes_with_mesh_mapping_via_synonymizer(kg: str):
    print(
        f"Estimating the percent of {kg} nodes mappable to a MESH curie via NodeSynonymizer"
    )
    percentages_with_mesh = []
    num_batches = 20
    batch_size = 4000
    for number in range(num_batches):
        print(f"  Batch {number + 1}")
        # Get random selection of node IDs from the KG
        random_node_ids = _get_random_node_ids(batch_size, kg)

        # Use synonymizer to get their equivalent curies and check for a MESH term
        print(f"    Getting equivalent curies for those random node IDs..")
        synonymizer = NodeSynonymizer()
        curie_synonym_info = synonymizer.get_equivalent_curies(
            list(random_node_ids), kg_name='KG2')
        num_curies_with_mesh_term = 0
        for input_curie, synonym_curies in curie_synonym_info.items():
            if synonym_curies:
                if any(curie for curie in synonym_curies
                       if curie.startswith('MESH')):
                    num_curies_with_mesh_term += 1
        percentage_with_mesh = (num_curies_with_mesh_term /
                                len(random_node_ids)) * 100
        print(
            f"    {percentage_with_mesh}% of nodes had a synonym MESH term in this batch."
        )
        percentages_with_mesh.append(percentage_with_mesh)

    print(f"  Percentages for all batches: {percentages_with_mesh}.")
    average = sum(percentages_with_mesh) / len(percentages_with_mesh)
    print(
        f"Final estimate of {kg} nodes mappable to a MESH term via NodeSynonymizer: {round(average)}%"
    )
Example #4
def get_preferred_categories(curie: Union[str, List[str]],
                             log: ARAXResponse) -> Optional[List[str]]:
    curies = convert_to_list(curie)
    synonymizer = NodeSynonymizer()
    log.debug(
        f"Sending NodeSynonymizer.get_canonical_curies() a list of {len(curies)} curies"
    )
    canonical_curies_dict = synonymizer.get_canonical_curies(curies)
    log.debug(f"Got response back from NodeSynonymizer")
    if canonical_curies_dict is not None:
        recognized_input_curies = {
            input_curie
            for input_curie in canonical_curies_dict
            if canonical_curies_dict.get(input_curie)
        }
        unrecognized_curies = set(curies).difference(recognized_input_curies)
        if unrecognized_curies:
            log.warning(
                f"NodeSynonymizer did not recognize: {unrecognized_curies}")
        preferred_categories = {
            canonical_curies_dict[recognized_curie].get('preferred_category')
            for recognized_curie in recognized_input_curies
        }
        if preferred_categories:
            return list(preferred_categories)
        else:
            log.warning(
                f"Unable to find any preferred categories; will default to biolink:NamedThing"
            )
            return ["biolink:NamedThing"]
    else:
        log.error(f"NodeSynonymizer returned None",
                  error_code="NodeNormalizationIssue")
        return []
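A brief usage sketch for get_preferred_categories(), assuming an ARAXResponse log object as in the sketch after Example #2; the curie is illustrative.

log = ARAXResponse()
categories = get_preferred_categories("CHEMBL.COMPOUND:CHEMBL112", log)
# Falls back to ["biolink:NamedThing"] when no preferred category can be found
print(categories)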
Example #5
def get_canonical_curies_dict(curie: Union[str, List[str]],
                              log: ARAXResponse) -> Dict[str, Dict[str, str]]:
    curies = convert_string_or_list_to_list(curie)
    try:
        synonymizer = NodeSynonymizer()
        log.debug(
            f"Sending NodeSynonymizer.get_canonical_curies() a list of {len(curies)} curies"
        )
        canonical_curies_dict = synonymizer.get_canonical_curies(curies)
        log.debug(f"Got response back from NodeSynonymizer")
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        log.error(f"Encountered a problem using NodeSynonymizer: {tb}",
                  error_code=error_type.__name__)
        return {}
    else:
        if canonical_curies_dict is not None:
            unrecognized_curies = {
                input_curie
                for input_curie in canonical_curies_dict
                if not canonical_curies_dict.get(input_curie)
            }
            if unrecognized_curies:
                log.warning(
                    f"NodeSynonymizer did not return canonical info for: {unrecognized_curies}"
                )
            return canonical_curies_dict
        else:
            log.error(f"NodeSynonymizer returned None",
                      error_code="NodeNormalizationIssue")
            return {}
Example #6
    def _get_node_synonyms(knowledge_graph):
        synonymizer = NodeSynonymizer()
        node_keys = set(knowledge_graph.nodes.keys())
        equivalent_curie_info = synonymizer.get_equivalent_nodes(node_keys)
        return {
            node_key: set(equivalent_curies_dict)
            for node_key, equivalent_curies_dict in
            equivalent_curie_info.items()
        }
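The comprehension in Example #6 assumes get_equivalent_nodes() returns a dict mapping each node key to an iterable of equivalent curies. A minimal mock of that assumed shape (values are made up):

# Assumed (unverified) result shape; curies are illustrative
equivalent_curie_info = {
    "CHEMBL.COMPOUND:CHEMBL112": ["CHEMBL.COMPOUND:CHEMBL112", "DRUGBANK:DB00316", "MESH:D000082"],
    "MONDO:0005148": ["MONDO:0005148", "DOID:9352"],
}
node_key_to_synonyms = {node_key: set(equivalent_curies)
                        for node_key, equivalent_curies in equivalent_curie_info.items()}
print(node_key_to_synonyms)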
Example #7
def estimate_percent_nodes_covered_by_backup_method(kg: str):
    print(
        f"Estimating the percent of {kg} nodes mappable by the 'backup' NGD method (uses eUtils)"
    )
    backup_ngd = NormGoogleDistance()
    synonymizer = NodeSynonymizer()
    percentages_mapped = []
    num_batches = 10
    batch_size = 10
    for number in range(num_batches):
        print(f"  Batch {number + 1}")
        # Get random selection of nodes from the KG
        query = f"match (a) return a.id, a.name, rand() as r order by r limit {batch_size}"
        results = _run_cypher_query(query, kg)
        canonical_curie_info = synonymizer.get_canonical_curies(
            [result['a.id'] for result in results])
        recognized_curies = {
            input_curie
            for input_curie in canonical_curie_info
            if canonical_curie_info.get(input_curie)
        }

        # Use the back-up NGD method to try to grab PMIDs for each
        num_with_pmids = 0
        for curie in recognized_curies:
            # Try to map this to a MESH term using the backup method (the chokepoint)
            node_id = canonical_curie_info[curie].get('preferred_curie')
            node_name = canonical_curie_info[curie].get('preferred_name')
            node_type = canonical_curie_info[curie].get('preferred_type')
            try:
                pmids = backup_ngd.get_pmids_for_all([node_id], [node_name])
            except Exception:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                print(f"ERROR using back-up method: {tb}")
            else:
                if len(pmids) and ([
                        pmid_list for pmid_list in pmids if pmid_list
                ]):
                    num_with_pmids += 1
                    print(
                        f"    Found {len(pmids[0])} PMIDs for {node_id}, {node_name}."
                    )
                else:
                    print(f"    Not found. ({node_id}, {node_name})")
        percentage_with_pmids = (num_with_pmids / len(recognized_curies)) * 100
        print(
            f"    {percentage_with_pmids}% of nodes were mapped to PMIDs using backup method."
        )
        percentages_mapped.append(percentage_with_pmids)

    print(f"  Percentages for all batches: {percentages_mapped}.")
    average = sum(percentages_mapped) / len(percentages_mapped)
    print(
        f"Final estimate of backup method's coverage of {kg} nodes: {round(average)}%"
    )
Example #8
    def _get_node_synonyms(knowledge_graph):
        synonymizer = NodeSynonymizer()
        node_ids = {node.id for node in knowledge_graph.nodes}
        equivalent_curie_info = synonymizer.get_equivalent_nodes(node_ids, kg_name='KG2')
        return {
            node_id: set(equivalent_curies_dict)
            for node_id, equivalent_curies_dict in
            equivalent_curie_info.items()
        }
Example #9
def _canonicalize_nodes(kg2pre_nodes: List[Dict[str, Any]]) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, str]]:
    logging.info(f"Canonicalizing nodes..")
    synonymizer = NodeSynonymizer()
    node_ids = [node.get('id') for node in kg2pre_nodes if node.get('id')]
    logging.info(f"  Sending NodeSynonymizer.get_canonical_curies() {len(node_ids)} curies..")
    canonicalized_info = synonymizer.get_canonical_curies(curies=node_ids, return_all_categories=True)
    all_canonical_curies = {canonical_info['preferred_curie'] for canonical_info in canonicalized_info.values() if canonical_info}
    logging.info(f"  Sending NodeSynonymizer.get_equivalent_nodes() {len(all_canonical_curies)} curies..")
    equivalent_curies_info = synonymizer.get_equivalent_nodes(all_canonical_curies)
    recognized_curies = {curie for curie in equivalent_curies_info if equivalent_curies_info.get(curie)}
    equivalent_curies_dict = {curie: list(equivalent_curies_info.get(curie)) for curie in recognized_curies}
    with open(f"{KG2C_DIR}/equivalent_curies.pickle", "wb") as equiv_curies_dump:  # Save these for use by downstream script
        pickle.dump(equivalent_curies_dict, equiv_curies_dump, protocol=pickle.HIGHEST_PROTOCOL)
    logging.info(f"  Creating canonicalized nodes..")
    curie_map = dict()
    canonicalized_nodes = dict()
    for kg2pre_node in kg2pre_nodes:
        # Grab relevant info for this node and its canonical version
        canonical_info = canonicalized_info.get(kg2pre_node['id'])
        canonicalized_curie = canonical_info.get('preferred_curie', kg2pre_node['id']) if canonical_info else kg2pre_node['id']
        publications = kg2pre_node['publications'] if kg2pre_node.get('publications') else []
        descriptions_list = [kg2pre_node['description']] if kg2pre_node.get('description') else []
        if canonicalized_curie in canonicalized_nodes:
            # Merge this node into its corresponding canonical node
            existing_canonical_node = canonicalized_nodes[canonicalized_curie]
            existing_canonical_node['publications'] = _merge_two_lists(existing_canonical_node['publications'], publications)
            existing_canonical_node['all_names'] = _merge_two_lists(existing_canonical_node['all_names'], [kg2pre_node['name']])
            existing_canonical_node['descriptions_list'] = _merge_two_lists(existing_canonical_node['descriptions_list'], descriptions_list)
            # Make sure any nodes subject to #1074-like problems still appear in equivalent curies
            existing_canonical_node['equivalent_curies'] = _merge_two_lists(existing_canonical_node['equivalent_curies'], [kg2pre_node['id']])
            # Add the IRI for the 'preferred' curie, if we've found that node
            if kg2pre_node['id'] == canonicalized_curie:
                existing_canonical_node['iri'] = kg2pre_node.get('iri')
        else:
            # Initiate the canonical node for this synonym group
            name = canonical_info['preferred_name'] if canonical_info else kg2pre_node['name']
            category = canonical_info['preferred_category'] if canonical_info else kg2pre_node['category']
            all_categories = list(canonical_info['all_categories']) if canonical_info else [kg2pre_node['category']]
            iri = kg2pre_node['iri'] if kg2pre_node['id'] == canonicalized_curie else None
            all_names = [kg2pre_node['name']]
            canonicalized_node = _create_node(preferred_curie=canonicalized_curie,
                                              name=name,
                                              category=category,
                                              all_categories=all_categories,
                                              publications=publications,
                                              equivalent_curies=equivalent_curies_dict.get(canonicalized_curie, [canonicalized_curie]),
                                              iri=iri,
                                              description=None,
                                              descriptions_list=descriptions_list,
                                              all_names=all_names)
            canonicalized_nodes[canonicalized_node['id']] = canonicalized_node
        curie_map[kg2pre_node['id']] = canonicalized_curie  # Record this mapping for easy lookup later
    logging.info(f"Number of KG2pre nodes was reduced to {len(canonicalized_nodes)} "
                 f"({round((len(canonicalized_nodes) / len(kg2pre_nodes)) * 100)}%)")
    return canonicalized_nodes, curie_map
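Example #9 relies on helper functions (_merge_two_lists, _create_node) that are not shown here. A minimal sketch of what _merge_two_lists presumably does, assuming it merges two lists while dropping duplicates and preserving first-seen order:

def _merge_two_lists(list_a: list, list_b: list) -> list:
    # Assumed behavior: union of both lists, de-duplicated, original order preserved
    return list(dict.fromkeys(list(list_a) + list(list_b)))

# e.g. _merge_two_lists(["PMID:1"], ["PMID:2", "PMID:1"]) -> ["PMID:1", "PMID:2"]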
Example #10
def estimate_percent_nodes_covered_by_ultrafast_ngd(kg: str):
    print(
        f"Estimating the percent of {kg} nodes covered by the local NGD system.."
    )
    curie_to_pmid_db = SqliteDict(f"./curie_to_pmids.sqlite")
    percentages_mapped = []
    num_batches = 20
    batch_size = 4000
    all_nodes_mapped_by_type = dict()
    for number in range(num_batches):
        # Get random selection of node IDs from the KG
        random_node_ids = _get_random_node_ids(batch_size, kg)

        # Use synonymizer to get their canonicalized info
        synonymizer = NodeSynonymizer()
        canonical_curie_info = synonymizer.get_canonical_curies(
            list(random_node_ids))
        recognized_curies = {
            input_curie
            for input_curie in canonical_curie_info
            if canonical_curie_info.get(input_curie)
        }

        # See if those canonical curies are in our local database
        num_mapped_to_pmids = 0
        for input_curie in recognized_curies:
            canonical_curie = canonical_curie_info[input_curie].get(
                'preferred_curie')
            preferred_type = canonical_curie_info[input_curie].get(
                'preferred_type')
            if preferred_type not in all_nodes_mapped_by_type:
                all_nodes_mapped_by_type[preferred_type] = {
                    'covered': 0,
                    'not_covered': 0
                }
            if canonical_curie and canonical_curie in curie_to_pmid_db:
                num_mapped_to_pmids += 1
                all_nodes_mapped_by_type[preferred_type]['covered'] += 1
            else:
                all_nodes_mapped_by_type[preferred_type]['not_covered'] += 1
        percentage_mapped = (num_mapped_to_pmids / len(random_node_ids)) * 100
        percentages_mapped.append(percentage_mapped)

    average = sum(percentages_mapped) / len(percentages_mapped)
    print(f"Estimated coverage of {kg} nodes: {round(average)}%.")
    node_type_percentages_dict = dict()
    for node_type, coverage_info in all_nodes_mapped_by_type.items():
        num_covered = coverage_info['covered']
        num_total = coverage_info['covered'] + coverage_info['not_covered']
        percentage = round((num_covered / num_total) * 100)
        node_type_percentages_dict[node_type] = percentage
    for node_type, percentage in sorted(node_type_percentages_dict.items(),
                                        key=lambda item: item[1],
                                        reverse=True):
        print(f"  {node_type}: {percentage}%")
Example #11
    def __init__(self, kg="KG1"):
        """Initialize the class instance.

        Args:
            kg (str, optional): the name of knowledge provider e.g. "KG1" or "KG2". Defaults to "KG1".

        """
        kg = kg.upper()
        self.kg = kg
        self.get_synonyms_done = False
        self.synonymizer = NodeSynonymizer()

        ## set up the path of KGmetadata
        pre_path = os.path.sep.join(
            [*pathlist[:(RTXindex + 1)], 'data', 'KGmetadata'])

        if kg == "KG1":
            fpath = pre_path + "/NodeNamesDescriptions_KG1.tsv"
        elif kg == "KG2":
            fpath = pre_path + "/NodeNamesDescriptions_KG2.tsv"
        else:
            raise ValueError("The parameter 'kg' only accepts 'KG1' or 'KG2'")

        ## read KGmetadata
        try:
            self.kpdata = pd.read_csv(fpath,
                                      sep="\t",
                                      header=None,
                                      names=['curie', 'name', 'type'])
        except FileNotFoundError:
            raise FileNotFoundError(
                "Please go to $RTX/data/KGmetadata and run 'python3 KGNodeIndex.py -b' first"
            )

        self.kpdata_dict = dict()
        for row_index in range(self.kpdata.shape[0]):
            curie = self.kpdata.loc[row_index, 'curie']
            name = self.kpdata.loc[row_index, 'name']
            node_type = self.kpdata.loc[row_index, 'type']
            if curie not in self.kpdata_dict:
                self.kpdata_dict[curie] = {'name': {name}, 'type': {node_type}}
            else:
                self.kpdata_dict[curie]['name'].update([name])
                self.kpdata_dict[curie]['type'].update([node_type])
Example #12
    def __init__(self, pubmed_directory_path, is_test, live="Production"):
        self.RTXConfig = RTXConfiguration()
        self.RTXConfig.live = live
        ngd_filepath = os.path.sep.join([
            *pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'KnowledgeSources',
            'NormalizedGoogleDistance'
        ])
        self.pubmed_directory_path = pubmed_directory_path
        self.conceptname_to_pmids_db_path = "conceptname_to_pmids.db"
        self.curie_to_pmids_db_path = f"{ngd_filepath}{os.path.sep}{self.RTXConfig.curie_to_pmids_path.split('/')[-1]}"
        self.status = 'OK'
        self.synonymizer = NodeSynonymizer()
        self.is_test = is_test
Example #13
def get_entity(q):  # noqa: E501
    """Obtain CURIE and synonym information about a search term

     # noqa: E501

    :param q: A string to search by (name, abbreviation, CURIE, etc.). The parameter may be repeated for multiple search strings.
    :type q: List[str]

    :rtype: object
    """
    synonymizer = NodeSynonymizer()
    response = synonymizer.get_normalizer_results(q)

    return response
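A hedged sketch of calling get_normalizer_results() directly. The import path is an assumption, and the result structure (a dict keyed by the query term, with a 'nodes' list) is inferred from how Example #14 consumes it.

from node_synonymizer import NodeSynonymizer  # assumed import path

synonymizer = NodeSynonymizer()
results = synonymizer.get_normalizer_results("ibuprofen")  # illustrative search term
for term, info in (results or {}).items():
    equivalent_nodes = info.get("nodes", []) if info else []
    print(f"{term}: {len(equivalent_nodes)} equivalent nodes")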
Example #14
def get_curie_names(curie: Union[str, List[str]],
                    log: ARAXResponse) -> Dict[str, str]:
    curies = convert_to_list(curie)
    synonymizer = NodeSynonymizer()
    log.debug(
        f"Looking up names for {len(curies)} input curies using NodeSynonymizer"
    )
    synonymizer_info = synonymizer.get_normalizer_results(curies)
    curie_to_name_map = dict()
    if synonymizer_info:
        recognized_input_curies = {
            input_curie
            for input_curie in synonymizer_info
            if synonymizer_info.get(input_curie)
        }
        unrecognized_curies = set(curies).difference(recognized_input_curies)
        if unrecognized_curies:
            log.warning(
                f"NodeSynonymizer did not recognize: {unrecognized_curies}")
        input_curies_without_matching_node = set()
        for input_curie in recognized_input_curies:
            equivalent_nodes = synonymizer_info[input_curie]["nodes"]
            # Find the 'node' in the synonymizer corresponding to this curie
            input_curie_nodes = [
                node for node in equivalent_nodes
                if node["identifier"] == input_curie
            ]
            if not input_curie_nodes:
                # Try looking for slight variation (KG2 vs. SRI discrepancy): "KEGG:C02700" vs. "KEGG.COMPOUND:C02700"
                input_curie_stripped = input_curie.replace(".COMPOUND", "")
                input_curie_nodes = [
                    node for node in equivalent_nodes
                    if node["identifier"] == input_curie_stripped
                ]
            # Record the name for this input curie
            if input_curie_nodes:
                curie_to_name_map[input_curie] = input_curie_nodes[0].get(
                    "label")
            else:
                input_curies_without_matching_node.add(input_curie)
        if input_curies_without_matching_node:
            log.warning(
                f"No matching nodes found in NodeSynonymizer for these input curies: "
                f"{input_curies_without_matching_node}. Cannot determine their specific names."
            )
    else:
        log.error(f"NodeSynonymizer returned None",
                  error_code="NodeNormalizationIssue")
    return curie_to_name_map
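An illustrative call of get_curie_names(); the curies are hypothetical and the log object follows the earlier sketches.

log = ARAXResponse()
curie_names = get_curie_names(["CHEMBL.COMPOUND:CHEMBL112", "MONDO:0005148"], log)
for curie, name in curie_names.items():
    print(f"{curie}: {name}")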
Example #15
def post_entity(body):  # noqa: E501
    """Obtain CURIE and synonym information about search terms

     # noqa: E501

    :param body: List of terms to get information about
    :type body: 

    :rtype: EntityQuery
    """

    synonymizer = NodeSynonymizer()
    response = synonymizer.get_normalizer_results(body)

    return response
Example #16
    def __init__(self, is_test):
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s %(levelname)s: %(message)s',
                            handlers=[
                                logging.FileHandler("ngdbuild.log"),
                                logging.StreamHandler()
                            ])
        self.pubmed_directory_path = f"{NGD_DIR}/pubmed_xml_files"
        self.conceptname_to_pmids_db_name = "conceptname_to_pmids.db"
        self.conceptname_to_pmids_db_path = f"{NGD_DIR}/{self.conceptname_to_pmids_db_name}"
        self.curie_to_pmids_db_name = "curie_to_pmids.sqlite"
        self.curie_to_pmids_db_path = f"{NGD_DIR}/{self.curie_to_pmids_db_name}"
        self.status = 'OK'
        self.synonymizer = NodeSynonymizer()
        self.is_test = is_test
Example #17
    def _get_canonical_curies_map(self, curies):
        self.response.debug(f"Canonicalizing curies of relevant nodes using NodeSynonymizer")
        synonymizer = NodeSynonymizer()
        try:
            canonicalized_node_info = synonymizer.get_canonical_curies(curies)
        except Exception:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            self.response.error(f"Encountered a problem using NodeSynonymizer: {tb}", error_code=error_type.__name__)
            return {}
        else:
            canonical_curies_map = dict()
            for input_curie, node_info in canonicalized_node_info.items():
                if node_info:
                    canonical_curies_map[input_curie] = node_info.get('preferred_curie', input_curie)
                else:
                    canonical_curies_map[input_curie] = input_curie
            return canonical_curies_map
Example #18
def get_entity_by_string(search_string):  # noqa: E501
    """Obtain the CURIE and type of some entity by name

     # noqa: E501

    :param search_string: Some string to search by (name, abbreviation, CURIE, etc.)
    :type search_string: str

    :rtype: List[object]
    """
    synonymizer = NodeSynonymizer()
    result = synonymizer.get_canonical_curies(curies=search_string,
                                              names=search_string)
    response = {}
    if result[search_string] is not None:
        response = {
            'curie': result[search_string]['preferred_curie'],
            'name': result[search_string]['preferred_name'],
            'type': result[search_string]['preferred_type']
        }
    return response
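An illustrative call of get_entity_by_string(); the search string is hypothetical, and the response keys mirror the dict built in Example #18.

info = get_entity_by_string("acetaminophen")
if info:
    print(info['curie'], info['name'], info['type'])
else:
    print("No canonical node found for that search string")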
Example #19
    def __init__(self, response, message, parameters):
        self.response = response
        self.message = message
        self.parameters = parameters
        self.global_iter = 0
        ## check if the new model files exist in /predictor/retrain_data. If not, scp them from arax.rtx.ai
        pathlist = os.path.realpath(__file__).split(os.path.sep)
        RTXindex = pathlist.index("RTX")
        filepath = os.path.sep.join([*pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'ARAXQuery', 'Overlay', 'predictor','retrain_data'])

        ## check if there is LogModel.pkl
        pkl_file = f"{filepath}/LogModel.pkl"
        if os.path.exists(pkl_file):
            pass
        else:
            os.system("scp [email protected]:/home/ubuntu/drug_repurposing_model_retrain/LogModel.pkl " + pkl_file)

        ## check if there is GRAPH.sqlite
        db_file = f"{filepath}/GRAPH.sqlite"
        if os.path.exists(db_file):
            pass
        else:
            os.system("scp [email protected]:/home/ubuntu/drug_repurposing_model_retrain/GRAPH.sqlite " + db_file)

        # use NodeSynonymizer to replace map.txt
        # check if there is map.txt
        # map_file = f"{filepath}/map.txt"
        # if os.path.exists(map_file):
        #     pass
        # else:
        #     os.system("scp [email protected]:/home/ubuntu/drug_repurposing_model_retrain/map.txt " + map_file)

        self.pred = predictor(model_file=pkl_file)
        self.pred.import_file(None, graph_database=db_file)
        # with open(map_file, 'r') as infile:
        #     map_file_content = infile.readlines()
        #     map_file_content.pop(0) ## remove title
        #     self.known_curies = set(line.strip().split('\t')[0] for line in map_file_content)

        self.synonymizer = NodeSynonymizer()
Example #20
def get_curie_synonyms(curie: Union[str, List[str]],
                       log: ARAXResponse) -> List[str]:
    curies = convert_string_or_list_to_list(curie)
    try:
        synonymizer = NodeSynonymizer()
        log.debug(
            f"Sending NodeSynonymizer.get_equivalent_nodes() a list of {len(curies)} curies"
        )
        equivalent_curies_dict = synonymizer.get_equivalent_nodes(
            curies, kg_name="KG2")
        log.debug(f"Got response back from NodeSynonymizer")
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        log.error(f"Encountered a problem using NodeSynonymizer: {tb}",
                  error_code=error_type.__name__)
        return []
    else:
        if equivalent_curies_dict is not None:
            curies_missing_info = {
                curie
                for curie in equivalent_curies_dict
                if not equivalent_curies_dict.get(curie)
            }
            if curies_missing_info:
                log.warning(
                    f"NodeSynonymizer did not find any equivalent curies for: {curies_missing_info}"
                )
            equivalent_curies = {
                curie
                for curie_dict in equivalent_curies_dict.values() if curie_dict
                for curie in curie_dict
            }
            all_curies = equivalent_curies.union(set(
                curies))  # Make sure even curies without synonyms are included
            return sorted(list(all_curies))
        else:
            log.error(f"NodeSynonymizer returned None",
                      error_code="NodeNormalizationIssue")
            return []
Example #21
def get_curie_synonyms_dict(
    curie: Union[str, List[str]], log: Optional[ARAXResponse] = ARAXResponse()
) -> Dict[str, List[str]]:
    curies = convert_to_list(curie)
    try:
        synonymizer = NodeSynonymizer()
        log.debug(
            f"Sending NodeSynonymizer.get_equivalent_nodes() a list of {len(curies)} curies"
        )
        equivalent_curies_dict = synonymizer.get_equivalent_nodes(curies)
        log.debug(f"Got response back from NodeSynonymizer")
    except Exception:
        tb = traceback.format_exc()
        error_type, error, _ = sys.exc_info()
        log.error(f"Encountered a problem using NodeSynonymizer: {tb}",
                  error_code=error_type.__name__)
        return dict()
    else:
        if equivalent_curies_dict is not None:
            curies_missing_info = {
                curie
                for curie in equivalent_curies_dict
                if not equivalent_curies_dict.get(curie)
            }
            if curies_missing_info:
                log.warning(
                    f"NodeSynonymizer did not find any equivalent curies for: {curies_missing_info}"
                )
            final_curie_dict = dict()
            for input_curie in curies:
                curie_dict = equivalent_curies_dict.get(input_curie)
                final_curie_dict[input_curie] = list(
                    curie_dict) if curie_dict else [input_curie]
            return final_curie_dict
        else:
            log.error(f"NodeSynonymizer returned None",
                      error_code="NodeNormalizationIssue")
            return dict()
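A short usage sketch of get_curie_synonyms_dict(), relying on the fallback behavior above where an unrecognized curie maps to a one-element list containing itself; the curies are illustrative.

synonyms_by_curie = get_curie_synonyms_dict(["DOID:14330", "FAKE:0000001"])
for input_curie, synonyms in synonyms_by_curie.items():
    print(f"{input_curie} -> {len(synonyms)} equivalent curie(s)")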
Example #22
    def size_of_given_type_in_KP(self, node_type, use_cypher_command=False, kg='KG1'):
        """
        find all nodes of a certain type in KP
        :param node_type: the query node type
        :param use_cypher_command: Boolean (True or False). If True, use a cypher command to query all nodes; otherwise use NodeSynonymizer
        :param kg: currently only 'KG1' or 'KG2' is allowed. Will be extended to BTE later
        """
        # TODO: extend this to KG2, BTE, and other KP's we know of

        size_of_total = None

        if kg == 'KG1' or kg == 'KG2':
            pass
        else:
            self.response.error(f"Only KG1 or KG2 is allowable to calculate the Fisher's exact test temporally")
            return size_of_total

        if kg == 'KG1':
            if use_cypher_command:
                rtxConfig = RTXConfiguration()
                # Connection information for the neo4j server, populated with orangeboard
                driver = GraphDatabase.driver(rtxConfig.neo4j_bolt, auth=basic_auth(rtxConfig.neo4j_username, rtxConfig.neo4j_password))
                session = driver.session()

                query = "MATCH (n:%s) return count(distinct n)" % (node_type)
                res = session.run(query)
                size_of_total = res.single()["count(distinct n)"]
                return size_of_total
            else:
                nodesynonymizer = NodeSynonymizer()
                size_of_total = nodesynonymizer.get_total_entity_count(node_type, kg_name=kg)
                return size_of_total
        else:
            if use_cypher_command:
                self.response.warning(f"KG2 is only allowable to use NodeSynonymizer to query the total number of node with query type. It was set to use kgNodeIndex")
                nodesynonymizer = NodeSynonymizer()
                size_of_total = nodesynonymizer.get_total_entity_count(node_type, kg_name=kg)
                return size_of_total

            else:
                nodesynonymizer = NodeSynonymizer()
                size_of_total = nodesynonymizer.get_total_entity_count(node_type, kg_name=kg)
                return size_of_total
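A brief sketch of the get_total_entity_count() call used in Example #22; the import path and node type are illustrative assumptions.

from node_synonymizer import NodeSynonymizer  # assumed import path, as in the sketch after Example #13

synonymizer = NodeSynonymizer()
node_count = synonymizer.get_total_entity_count('protein', kg_name='KG1')
print(f"KG1 contains {node_count} 'protein' nodes")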
Example #23
    default=
    '~/RTX/code/ARAX/KnowledgeSources/COHD_local/data/preferred_synonyms_kg2_5_0.pkl'
)
args = parser.parse_args()

curie_type = eval(args.CurieType)
NodeNamesDescriptions = pd.read_csv(
    args.NodeDescriptionFile,
    sep='\t',
    header=None,
    names=['curie', 'name', 'full_name', 'type'])
NodeNamesDescriptions = NodeNamesDescriptions.loc[
    NodeNamesDescriptions.type.isin(curie_type), :].reset_index(drop=True)

preferred_synonyms = dict()
synonymizer = NodeSynonymizer()

for curie in NodeNamesDescriptions['curie']:
    preferred_curie = synonymizer.get_canonical_curies(curies=curie)[curie]
    if preferred_curie is None:
        print(f"{curie} doesn't have preferred curies", flush=True)
    else:
        if preferred_curie['preferred_curie'] not in preferred_synonyms:
            preferred_synonyms[preferred_curie['preferred_curie']] = dict()
            preferred_synonyms[preferred_curie['preferred_curie']]['preferred_name'] = preferred_curie['preferred_name']
            preferred_synonyms[preferred_curie['preferred_curie']]['preferred_type'] = preferred_curie['preferred_category']
            preferred_synonyms[preferred_curie['preferred_curie']]['synonyms'] = [curie]
        else:
Example #24
    def assess(self, message):

        #### Define a default response
        response = ARAXResponse()
        self.response = response
        self.message = message
        response.debug(f"Assessing the QueryGraph for basic information")

        #### Get shorter handles
        query_graph = message.query_graph
        nodes = query_graph.nodes
        edges = query_graph.edges

        #### Store number of nodes and edges
        self.n_nodes = len(nodes)
        self.n_edges = len(edges)
        response.debug(f"Found {self.n_nodes} nodes and {self.n_edges} edges")

        #### Handle impossible cases
        if self.n_nodes == 0:
            response.error(
                "QueryGraph has 0 nodes. At least 1 node is required",
                error_code="QueryGraphZeroNodes")
            return response
        if self.n_nodes == 1 and self.n_edges > 0:
            response.error(
                "QueryGraph may not have edges if there is only one node",
                error_code="QueryGraphTooManyEdges")
            return response
        #if self.n_nodes == 2 and self.n_edges > 1:
        #    response.error("QueryGraph may not have more than 1 edge if there are only 2 nodes", error_code="QueryGraphTooManyEdges")
        #    return response

        #### Loop through nodes computing some stats
        node_info = {}
        self.node_category_map = {}
        for key, qnode in nodes.items():
            node_info[key] = {
                'key': key,
                'node_object': qnode,
                'has_id': False,
                'category': qnode.category,
                'has_category': False,
                'is_set': False,
                'n_edges': 0,
                'n_links': 0,
                'is_connected': False,
                'edges': [],
                'edge_dict': {}
            }
            if qnode.id is not None:
                node_info[key]['has_id'] = True

                #### If the user did not specify a category, but there is a curie, try to figure out the category
                if node_info[key]['category'] is None:
                    synonymizer = NodeSynonymizer()
                    curie = qnode.id
                    curies_list = qnode.id
                    if isinstance(qnode.id, list):
                        curie = qnode.id[0]
                    else:
                        curies_list = [qnode.id]

                    canonical_curies = synonymizer.get_canonical_curies(
                        curies=curies_list, return_all_categories=True)
                    if curie in canonical_curies and 'preferred_type' in canonical_curies[
                            curie]:
                        node_info[key]['has_category'] = True
                        node_info[key]['category'] = canonical_curies[curie][
                            'preferred_type']

            if qnode.category is not None:
                node_info[key]['has_category'] = True

            #if qnode.is_set is not None: node_info[key]['is_set'] = True
            if key is None:
                response.error(
                    "QueryGraph has a node with null key. This is not permitted",
                    error_code="QueryGraphNodeWithNoId")
                return response

            #### Remap the node categories from unsupported to supported
            if qnode.category is not None:
                qnode.category = self.remap_node_category(qnode.category)

            #### Store lookup of categories
            warning_counter = 0
            if qnode.category is None or (isinstance(qnode.category, list)
                                          and len(qnode.category) == 0):
                if warning_counter == 0:
                    #response.debug("QueryGraph has nodes with no category. This may cause problems with results inference later")
                    pass
                warning_counter += 1
                self.node_category_map['unknown'] = key
            else:
                category = qnode.category
                if isinstance(qnode.category, list):
                    category = qnode.category[
                        0]  # FIXME this is a hack prior to proper list handling
                self.node_category_map[category] = key

        #### Loop through edges computing some stats
        edge_info = {}
        self.edge_predicate_map = {}
        unique_links = {}

        #### Ignore special informational edges for now.
        virtual_edge_predicates = {
            'has_normalized_google_distance_with': 1,
            'has_fisher_exact_test_p-value_with': 1,
            'has_jaccard_index_with': 1,
            'probably_treats': 1,
            'has_paired_concept_frequency_with': 1,
            'has_observed_expected_ratio_with': 1,
            'has_chi_square_with': 1
        }

        for key, qedge in edges.items():

            predicate = qedge.predicate
            if isinstance(predicate, list):
                if len(predicate) == 0:
                    predicate = None
                else:
                    predicate = predicate[
                        0]  # FIXME Hack before dealing with predicates as lists!

            if predicate is not None and predicate in virtual_edge_predicates:
                continue

            edge_info[key] = {
                'key': key,
                'has_predicate': False,
                'subject': qedge.subject,
                'object': qedge.object,
                'predicate': None
            }
            if predicate is not None:
                edge_info[key]['has_predicate'] = True
                edge_info[key]['predicate'] = predicate

            if key is None:
                response.error(
                    "QueryGraph has a edge with null key. This is not permitted",
                    error_code="QueryGraphEdgeWithNoKey")
                return response

            #### Create a unique node link string
            link_string = ','.join(sorted([qedge.subject, qedge.object]))
            if link_string not in unique_links:
                node_info[qedge.subject]['n_links'] += 1
                node_info[qedge.object]['n_links'] += 1
                unique_links[link_string] = 1
                #print(link_string)

            node_info[qedge.subject]['n_edges'] += 1
            node_info[qedge.object]['n_edges'] += 1
            node_info[qedge.subject]['is_connected'] = True
            node_info[qedge.object]['is_connected'] = True
            #node_info[qedge.subject]['edges'].append(edge_info[key])
            #node_info[qedge.object]['edges'].append(edge_info[key])
            node_info[qedge.subject]['edges'].append(edge_info[key])
            node_info[qedge.object]['edges'].append(edge_info[key])
            node_info[qedge.subject]['edge_dict'][key] = edge_info[key]
            node_info[qedge.object]['edge_dict'][key] = edge_info[key]

            #### Store lookup of predicates
            warning_counter = 0
            edge_predicate = 'any'
            if predicate is None:
                if warning_counter == 0:
                    response.debug(
                        "QueryGraph has edges with no predicate. This may cause problems with results inference later"
                    )
                warning_counter += 1
            else:
                edge_predicate = predicate

            #### It's not clear yet whether we need to store the whole sentence or just the predicate
            #predicate_encoding = f"{node_info[qedge.subject]['predicate']}---{edge_predicate}---{node_info[qedge.object]['predicate']}"
            predicate_encoding = edge_predicate
            self.edge_predicate_map[predicate_encoding] = key

        #### Loop through the nodes again, trying to identify the start_node and the end_node
        singletons = []
        for node_id, node_data in node_info.items():
            if node_data['n_links'] < 2:
                singletons.append(node_data)
            elif node_data['n_links'] > 2:
                self.is_bifurcated_graph = True
                response.warning(
                    "QueryGraph appears to have a fork in it. This might cause trouble"
                )

        #### If this doesn't produce any singletons, then try curie based selection
        if len(singletons) == 0:
            for node_id, node_data in node_info.items():
                if node_data['has_id']:
                    singletons.append(node_data)

        #### If this doesn't produce any singletons, then we don't know how to continue
        if len(singletons) == 0:
            response.error("Unable to understand the query graph",
                           error_code="QueryGraphCircular")
            return response

        #### Try to identify the start_node and the end_node
        start_node = singletons[0]
        if len(nodes) == 1:
            # Just a single node, fine
            pass
        elif len(singletons) < 2:
            response.warning(
                "QueryGraph appears to be circular or has a strange geometry. This might cause trouble"
            )
        elif len(singletons) > 2:
            response.warning(
                "QueryGraph appears to have a fork in it. This might cause trouble"
            )
        else:
            if singletons[0]['has_id'] is True and singletons[1][
                    'has_id'] is False:
                start_node = singletons[0]
            elif singletons[0]['has_id'] is False and singletons[1][
                    'has_id'] is True:
                start_node = singletons[1]
            else:
                start_node = singletons[0]
        #### Hmm, that's not very robust against odd graphs. This needs work. FIXME

        self.node_info = node_info
        self.edge_info = edge_info
        self.start_node = start_node

        current_node = start_node
        node_order = [start_node]
        edge_order = []
        edges = current_node['edges']
        debug = False

        while 1:
            if debug:
                tmp = {
                    'astate': '1',
                    'current_node': current_node,
                    'node_order': node_order,
                    'edge_order': edge_order,
                    'edges': edges
                }
                print(
                    json.dumps(ast.literal_eval(repr(tmp)),
                               sort_keys=True,
                               indent=2))
                print(
                    '=================================================================================='
                )
                tmp = input()

            if len(edges) == 0:
                break
            #if len(edges) > 1:
            if current_node['n_links'] > 1:
                response.error(
                    f"Help, two edges at A583. Don't know what to do: {current_node['n_links']}",
                    error_code="InteralErrorA583")
                return response
            edge_order.append(edges[0])
            previous_node = current_node
            if edges[0]['subject'] == current_node['key']:
                current_node = node_info[edges[0]['object']]
            elif edges[0]['object'] == current_node['key']:
                current_node = node_info[edges[0]['subject']]
            else:
                response.error("Help, edge error A584. Don't know what to do",
                               error_code="InteralErrorA584")
                return response
            node_order.append(current_node)

            #tmp = { 'astate': '2', 'current_node': current_node, 'node_order': node_order, 'edge_order': edge_order, 'edges': edges }
            #print(json.dumps(ast.literal_eval(repr(tmp)),sort_keys=True,indent=2))
            #print('==================================================================================')
            #tmp = input()

            edges = current_node['edges']
            new_edges = []
            for edge in edges:
                key = edge['key']
                if key not in previous_node['edge_dict']:
                    new_edges.append(edge)
            edges = new_edges
            if len(edges) == 0:
                break
            #tmp = { 'astate': '3', 'current_node': current_node, 'node_order': node_order, 'edge_order': edge_order, 'edges': edges }
            #print(json.dumps(ast.literal_eval(repr(tmp)),sort_keys=True,indent=2))
            #print('==================================================================================')
            #tmp = input()

        self.node_order = node_order
        self.edge_order = edge_order

        # Create a text rendering of the QueryGraph geometry for matching against a template
        self.query_graph_templates = {
            'simple': '',
            'detailed': {
                'n_nodes': len(node_order),
                'components': []
            }
        }
        node_index = 0
        edge_index = 0
        #print(json.dumps(ast.literal_eval(repr(node_order)),sort_keys=True,indent=2))
        for node in node_order:
            component_id = f"n{node_index:02}"
            content = ''
            component = {
                'component_type': 'node',
                'component_id': component_id,
                'has_id': node['has_id'],
                'has_category': node['has_category'],
                'category_value': None
            }
            self.query_graph_templates['detailed']['components'].append(
                component)
            if node['has_id']:
                content = 'id'
            elif node['has_category'] and node[
                    'node_object'].category is not None:
                content = f"category={node['node_object'].category}"
                component['category_value'] = node['node_object'].category
            elif node['has_category']:
                content = 'category'
            template_part = f"{component_id}({content})"
            self.query_graph_templates['simple'] += template_part

            # Since queries with intermediate nodes that are not is_set=true tend to blow up, for now, make them is_set=true unless explicitly set to false
            if node_index > 0 and node_index < (self.n_nodes - 1):
                if 'is_set' not in node or node['is_set'] is None:
                    node['node_object'].is_set = True
                    response.warning(
                        f"Setting unspecified is_set to true for {node['key']} because this will probably lead to a happier result"
                    )
                elif node['is_set'] is True:
                    response.debug(
                        f"Value for is_set is already true for {node['key']} so that's good"
                    )
                elif node['is_set'] is False:
                    #response.info(f"Value for is_set is set to false for intermediate node {node['key']}. This could lead to weird results. Consider setting it to true")
                    response.info(
                        f"Value for is_set is false for intermediate node {node['key']}. Setting to true because this will probably lead to a happier result"
                    )
                    node['node_object'].is_set = True
                #else:
                #    response.error(f"Unrecognized value is_set='{node['is_set']}' for {node['key']}. This should be true or false")

            node_index += 1
            if node_index < self.n_nodes:
                #print(json.dumps(ast.literal_eval(repr(node)),sort_keys=True,indent=2))

                #### Extract the has_predicate and predicate_value from the edges of the node
                #### This could fail if there are two edges coming out of the node FIXME
                has_predicate = False
                predicate_value = None
                if 'edges' in node:
                    for related_edge in node['edges']:
                        if related_edge['subject'] == node['key']:
                            has_predicate = related_edge['has_predicate']
                            if has_predicate is True and 'predicate' in related_edge:
                                predicate_value = related_edge['predicate']

                component_id = f"e{edge_index:02}"
                template_part = f"-{component_id}()-"
                self.query_graph_templates['simple'] += template_part
                component = {
                    'component_type': 'edge',
                    'component_id': component_id,
                    'has_id': False,
                    'has_predicate': has_predicate,
                    'predicate_value': predicate_value
                }
                self.query_graph_templates['detailed']['components'].append(
                    component)
                edge_index += 1

        response.debug(
            f"The QueryGraph reference template is: {self.query_graph_templates['simple']}"
        )

        #tmp = { 'node_info': node_info, 'edge_info': edge_info, 'start_node': start_node, 'n_nodes': self.n_nodes, 'n_edges': self.n_edges,
        #    'is_bifurcated_graph': self.is_bifurcated_graph, 'node_order': node_order, 'edge_order': edge_order }
        #print(json.dumps(ast.literal_eval(repr(tmp)),sort_keys=True,indent=2))
        #sys.exit(0)

        #### Return the response
        return response
class PredictDrugTreatsDisease:

    #### Constructor
    def __init__(self, response, message, parameters):
        self.response = response
        self.message = message
        self.parameters = parameters
        self.global_iter = 0
        ## check if the new model files exist in /predictor/retrain_data. If not, scp them from arax.ncats.io
        pathlist = os.path.realpath(__file__).split(os.path.sep)
        RTXindex = pathlist.index("RTX")
        filepath = os.path.sep.join([*pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'ARAXQuery', 'Overlay', 'predictor','retrain_data'])

        ## check if there is LogModel.pkl
        pkl_file = f"{filepath}/LogModel.pkl"
        if os.path.exists(pkl_file):
            pass
        else:
            os.system("scp [email protected]:/data/orangeboard/databases/KG2.3.4/LogModel.pkl " + pkl_file)

        ## check if there is GRAPH.sqlite
        db_file = f"{filepath}/GRAPH.sqlite"
        if os.path.exists(db_file):
            pass
        else:
            os.system("scp [email protected]:/data/orangeboard/databases/KG2.3.4/GRAPH.sqlite " + db_file)

        ## check if there is DTD_probability_database.db
        DTD_prob_db_file = f"{filepath}/DTD_probability_database_v1.0.db"
        if os.path.exists(DTD_prob_db_file):
            pass
        else:
            os.system("scp [email protected]:/data/orangeboard/databases/KG2.3.4/DTD_probability_database_v1.0.db " + DTD_prob_db_file)

        # use NodeSynonymizer to replace map.txt
        # check if there is map.txt
        # map_file = f"{filepath}/map.txt"
        # if os.path.exists(map_file):
        #     pass
        # else:
        #     os.system("scp [email protected]:/home/ubuntu/drug_repurposing_model_retrain/map.txt " + map_file)

        self.use_prob_db = True
        if self.use_prob_db is True:
            try:
                self.pred = predictor(DTD_prob_file=DTD_prob_db_file, use_prob_db=True)
            except:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Internal Error encountered connecting to the local DTD prediction database.")
        else:
            try:
                self.pred = predictor(model_file=pkl_file, use_prob_db=False)
            except:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Internal Error encountered connecting to the local LogModel.pkl file.")
            try:
                self.pred.import_file(None, graph_database=db_file)
            except:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Internal Error encountered connecting to the local graph database file.")
        # with open(map_file, 'r') as infile:
        #     map_file_content = infile.readlines()
        #     map_file_content.pop(0) ## remove title
        #     self.known_curies = set(line.strip().split('\t')[0] for line in map_file_content)

        self.synonymizer = NodeSynonymizer()

    def convert_to_trained_curies(self, input_curie):
        """
        Takes an input curie from the KG, looks it up with the NodeSynonymizer, and returns its canonical curie info
        (a dict containing 'preferred_curie' and 'preferred_type'), or None if the curie is unrecognized
        """
        normalizer_result = self.synonymizer.get_canonical_curies(input_curie)
        curies_in_model = normalizer_result[input_curie]
        # curies_in_model = [curie for curie in curies_in_model if curie in self.known_curies]
        # equivalent_curies = []  # start with empty equivalent_curies
        # try:
        #     equivalent_curies = [x['identifier'] for x in normalizer_result[input_curie]['equivalent_identifiers']]
        # except:
        #     self.response.warning(f"NodeSynonmizer could not find curies for {input_curie}, skipping this one.")
        # for curie in equivalent_curies:
        #     curie_prefix = curie.split(':')[0]
        #     # FIXME: fix this when re-training the ML model, as when this was originally trained, it was ChEMBL:123, not CHEMBL.COMPOUND:CHEMBL123
        #     if curie_prefix == "CHEMBL.COMPOUND":
        #         chembl_fix = 'ChEMBL:' + curie[22:]
        #         if chembl_fix in self.known_curies:
        #             curies_in_model.add(chembl_fix)
        #     elif curie in self.known_curies:
        #         curies_in_model.add(curie)
        return curies_in_model
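
    # A minimal usage sketch of convert_to_trained_curies() (the input curie is hypothetical and
    # assumes the NodeSynonymizer databases are available). get_canonical_curies() returns a dict
    # keyed by the input curie whose value is either None or a dict containing 'preferred_curie'
    # and 'preferred_type', which is exactly what the callers below check:
    #
    #     dtd = PredictDrugTreatsDisease(response, message, parameters)
    #     info = dtd.convert_to_trained_curies("CHEMBL.COMPOUND:CHEMBL112")  # hypothetical curie
    #     if info is not None:
    #         print(info['preferred_curie'], info['preferred_type'])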

    def predict_drug_treats_disease(self):
        """
        Iterate over all the edges in the knowledge graph and add the drug-disease treatment probability to the
        edge_attributes of appropriate edges
        :return: response
        """
        parameters = self.parameters
        self.response.debug(f"Computing drug-disease treatment probability based on a machine learning model")
        self.response.info(f"Computing drug-disease treatment probability based on a machine learning model: See [this publication](https://doi.org/10.1101/765305) for more details about how this is accomplished.")

        attribute_name = "probability_treats"
        attribute_type = "EDAM:data_0951"
        value = 0  # this will be the default value. If the model returns 0, or the default is there, don't include that edge
        url = "https://doi.org/10.1101/765305"

        # if you want to add virtual edges, identify the source/targets, decorate the edges, add them to the KG, and then add one to the QG corresponding to them
        if 'virtual_relation_label' in parameters:
            source_curies_to_decorate = set()
            target_curies_to_decorate = set()
            curie_to_name = dict()
            # identify the nodes that we should be adding virtual edges for
            for node_key, node in self.message.knowledge_graph.nodes.items():
                if hasattr(node, 'qnode_keys'):
                    if parameters['subject_qnode_key'] in node.qnode_keys:
                        if "drug" in node.category or "chemical_substance" in node.category or "biolink:Drug" in node.category or "biolink:ChemicalSubstance" in node.category:  # this is now NOT checked by ARAX_overlay
                            source_curies_to_decorate.add(node_key)
                            curie_to_name[node_key] = node.name
                    if parameters['object_qnode_key'] in node.qnode_keys:
                        if "disease" in node.category or "phenotypic_feature" in node.category or "biolink:Disease" in node.category or "biolink:PhenotypicFeature" in node.category:  # this is now NOT checked by ARAX_overlay
                            target_curies_to_decorate.add(node_key)
                            curie_to_name[node_key] = node.name

            added_flag = False  # check to see if any edges were added
            # iterate over all pairs of these nodes, add the virtual edge, decorate with the correct attribute

            for (source_curie, target_curie) in itertools.product(source_curies_to_decorate, target_curies_to_decorate):
                self.response.debug(f"Predicting probability that {curie_to_name[source_curie]} treats {curie_to_name[target_curie]}")
                # create the edge attribute if possible
                # loop over all equivalent curies and take the highest probability

                max_probability = 0
                converted_source_curie = self.convert_to_trained_curies(source_curie)
                if converted_source_curie is None:
                    continue
                else:
                    preferred_type = converted_source_curie['preferred_type']
                    if preferred_type == "drug" or preferred_type == "chemical_substance" or preferred_type == "biolink:Drug" or preferred_type == "biolink:ChemicalSubstance":
                        converted_source_curie = converted_source_curie['preferred_curie']
                    else:
                        continue
                converted_target_curie = self.convert_to_trained_curies(target_curie)
                if converted_target_curie is None:
                    continue
                else:
                    preferred_type = converted_target_curie['preferred_type']
                    if preferred_type == "disease" or preferred_type == "phenotypic_feature" or preferred_type == "biolink:Disease" or preferred_type == "biolink:PhenotypicFeature":
                        converted_target_curie = converted_target_curie['preferred_curie']
                    else:
                        continue
                if self.use_prob_db is True:
                    probability = self.pred.get_prob_from_DTD_db(converted_source_curie, converted_target_curie)
                    if probability is not None:
                        if np.isfinite(probability):
                            max_probability = probability
                else:
                    probability = self.pred.prob_single(converted_source_curie, converted_target_curie)
                    if probability is not None:
                        probability = probability[0]
                        if np.isfinite(probability):
                            max_probability = probability
                # if len(res) != 0:
                #     all_probabilities = self.pred.prob_all(res)
                #     if isinstance(all_probabilities, list):
                #         max_probability = max([value for value in all_probabilities if np.isfinite(value)])

                value = max_probability

                #probability = self.pred.prob_single('ChEMBL:' + source_curie[22:], target_curie)  # FIXME: when this was trained, it was ChEMBL:123, not CHEMBL.COMPOUND:CHEMBL123
                #if probability and np.isfinite(probability):  # finite, that's ok, otherwise, stay with default
                #    value = probability[0]
                edge_attribute = EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)  # populate the edge attribute
                if edge_attribute and value != 0:
                    added_flag = True
                    # make the edge, add the attribute

                    # edge properties
                    now = datetime.now()
                    edge_type = "biolink:probably_treats"
                    qedge_keys = [parameters['virtual_relation_label']]
                    relation = parameters['virtual_relation_label']
                    is_defined_by = "ARAX"
                    defined_datetime = now.strftime("%Y-%m-%d %H:%M:%S")
                    provided_by = "ARAX"
                    confidence = None
                    weight = None  # TODO: could use the attribute's actual value as the weight
                    subject_key = source_curie
                    object_key = target_curie

                    # now actually add the virtual edges in
                    id = f"{relation}_{self.global_iter}"
                    self.global_iter += 1
                    edge_attribute_list = [
                        edge_attribute,
                        EdgeAttribute(name="is_defined_by", value=is_defined_by, type="ARAX_TYPE_PLACEHOLDER"),
                        EdgeAttribute(name="defined_datetime", value=defined_datetime, type="metatype:Datetime"),
                        EdgeAttribute(name="provided_by", value=provided_by, type="biolink:provided_by"),
                        #EdgeAttribute(name="confidence", value=confidence, type="biolink:ConfidenceLevel"),
                        #EdgeAttribute(name="weight", value=weight, type="metatype:Float")
                    ]
                    edge = Edge(predicate=edge_type, subject=subject_key, object=object_key, relation=relation,
                                attributes=edge_attribute_list)
                    edge.qedge_keys = qedge_keys
                    self.message.knowledge_graph.edges[id] = edge

            # Now add a q_edge to the query_graph since we've added an extra edge to the KG
            if added_flag:
                edge_type = "biolink:probably_treats"
                relation = parameters['virtual_relation_label']
                subject_qnode_key = parameters['subject_qnode_key']
                object_qnode_key = parameters['object_qnode_key']
                option_group_id = ou.determine_virtual_qedge_option_group(subject_qnode_key, object_qnode_key, self.message.query_graph, self.response)
                q_edge = QEdge(predicate=edge_type, relation=relation, subject=subject_qnode_key, object=object_qnode_key, option_group_id=option_group_id)
                self.message.query_graph.edges[relation] = q_edge
            return self.response

        else:  # you want to add it for each edge in the KG
            # iterate over KG edges, add the information
            try:
                # map curies to types
                curie_to_type = dict()
                curie_to_name = dict()
                for node_key, node in self.message.knowledge_graph.nodes.items():
                    curie_to_type[node_key] = node.category
                    curie_to_name[node_key] = node.name
                # then iterate over the edges and decorate if appropriate
                for edge_key, edge in self.message.knowledge_graph.edges.items():
                    # Make sure the edge_attributes are not None
                    if not edge.attributes:
                        edge.attributes = []  # make sure this is a list we can append to
                    # now go and actually get the probability
                    source_curie = edge.subject
                    target_curie = edge.object
                    source_types = curie_to_type[source_curie]
                    target_types = curie_to_type[target_curie]
                    if (("drug" in source_types) or ("chemical_substance" in source_types) or ("biolink:Drug" in source_types) or ("biolink:ChemicalSubstance" in source_types)) and (("disease" in target_types) or ("phenotypic_feature" in target_types) or ("biolink:Disease" in target_types) or ("biolink:PhenotypicFeature" in target_types)):
                        # loop over all pairs of equivalent curies and take the highest probability
                        self.response.debug(f"Predicting treatment probability between {curie_to_name[source_curie]} and {curie_to_name[target_curie]}")
                        max_probability = 0
                        converted_source_curie = self.convert_to_trained_curies(source_curie)
                        if converted_source_curie is None:
                            continue
                        else:
                            preferred_type = converted_source_curie['preferred_type']
                            if preferred_type == "drug" or preferred_type == "chemical_substance" or preferred_type == "biolink:Drug" or preferred_type == "biolink:ChemicalSubstance":
                                converted_source_curie = converted_source_curie['preferred_curie']
                            else:
                                continue
                        converted_target_curie = self.convert_to_trained_curies(target_curie)
                        if converted_target_curie is None:
                            continue
                        else:
                            preferred_type = converted_target_curie['preferred_type']
                            if preferred_type == "disease" or preferred_type == "phenotypic_feature" or preferred_type == "biolink:Disease" or preferred_type == "biolink:PhenotypicFeature":
                                converted_target_curie = converted_target_curie['preferred_curie']
                            else:
                                continue
                        if self.use_prob_db is True:
                            probability = self.pred.get_prob_from_DTD_db(converted_source_curie, converted_target_curie)
                            if probability is not None:
                                if np.isfinite(probability):
                                    max_probability = probability
                        else:
                            probability = self.pred.prob_single(converted_source_curie, converted_target_curie)
                            if probability is not None:
                                probability = probability[0]
                                if np.isfinite(probability):
                                    max_probability = probability
                        # res = list(itertools.product(converted_source_curie, converted_target_curie))
                        # if len(res) != 0:
                        #     all_probabilities = self.pred.prob_all(res)
                        #     if isinstance(all_probabilities, list):
                        #         max_probability = max([value for value in all_probabilities if np.isfinite(value)])

                        value = max_probability

                        #probability = self.pred.prob_single('ChEMBL:' + source_curie[22:], target_curie)  # FIXME: when this was trained, it was ChEMBL:123, not CHEMBL.COMPOUND:CHEMBL123
                        #if probability and np.isfinite(probability):  # finite, that's ok, otherwise, stay with default
                        #    value = probability[0]
                    elif (("drug" in target_types) or ("chemical_substance" in target_types) or ("biolink:Drug" in target_types) or ("biolink:ChemicalSubstance" in target_types)) and (("disease" in source_types) or ("phenotypic_feature" in source_types) or ("biolink:Disease" in source_types) or ("biolink:PhenotypicFeature" in source_types)):
                        #probability = self.pred.prob_single('ChEMBL:' + target_curie[22:], source_curie)  # FIXME: when this was trained, it was ChEMBL:123, not CHEMBL.COMPOUND:CHEMBL123
                        #if probability and np.isfinite(probability):  # finite, that's ok, otherwise, stay with default
                        #    value = probability[0]
                        self.response.debug(f"Predicting treatment probability between {curie_to_name[source_curie]} and {curie_to_name[target_curie]}")
                        max_probability = 0
                        converted_source_curie = self.convert_to_trained_curies(source_curie)
                        if converted_source_curie is None:
                            continue
                        else:
                            preferred_type = converted_source_curie['preferred_type']
                            if preferred_type == "disease" or preferred_type == "phenotypic_feature" or preferred_type == "biolink:Disease" or preferred_type == "biolink:PhenotypicFeature":
                                converted_source_curie = converted_source_curie['preferred_curie']
                            else:
                                continue
                        converted_target_curie = self.convert_to_trained_curies(target_curie)
                        if converted_target_curie is None:
                            continue
                        else:
                            preferred_type = converted_target_curie['preferred_type']
                            if preferred_type == "drug" or preferred_type == "chemical_substance" or preferred_type == "biolink:Drug" or preferred_type == "biolink:ChemicalSubstance":
                                converted_target_curie = converted_target_curie['preferred_curie']
                            else:
                                continue

                        if self.use_prob_db is True:
                            probability = self.pred.get_prob_from_DTD_db(converted_target_curie, converted_source_curie)
                            if probability is not None:
                                if np.isfinite(probability):
                                    max_probability = probability
                        else:
                            probability = self.pred.prob_single(converted_target_curie, converted_source_curie)
                            if probability is not None:
                                probability = probability[0]
                                if np.isfinite(probability):
                                    max_probability = probability
                        # res = list(itertools.product(converted_target_curie, converted_source_curie))
                        # if len(res) != 0:
                        #     all_probabilities = self.pred.prob_all(res)
                        #     if isinstance(all_probabilities, list):
                        #         max_probability = max([value for value in all_probabilities if np.isfinite(value)])

                        value = max_probability

                    else:
                        continue
                    if value != 0:
                        edge_attribute = EdgeAttribute(type=attribute_type, name=attribute_name, value=str(value), url=url)  # populate the attribute
                        edge.attributes.append(edge_attribute)  # append it to the list of attributes
            except:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Something went wrong adding the drug-disease treatment probability")
            else:
                self.response.info(f"Drug-disease treatment probability successfully added to edges")

            return self.response
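
# A hedged usage note: within the ARAX DSL this overlay is typically invoked with something like
# the following (the qnode keys n00/n01 and the relation label P1 are illustrative):
#
#     overlay(action=predict_drug_treats_disease, virtual_relation_label=P1,
#             subject_qnode_key=n00, object_qnode_key=n01)
#
# With virtual_relation_label it adds biolink:probably_treats virtual edges carrying the
# 'probability_treats' attribute; without it, existing drug-disease edges are decorated in place.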
Example #27
    def add_qnode(self, response, input_parameters, describe=False):
        """
        Adds a new QNode object to the QueryGraph inside the Message object
        :return: ARAXResponse object with execution information
        :rtype: ARAXResponse
        """

        # #### Command definition for autogenerated documentation
        command_definition = {
            'dsl_command': 'add_qnode()',
            'description':
            """The `add_qnode` method adds an additional QNode to the QueryGraph in the Message object.""",
            'parameters': {
                'key': {
                    'is_required':
                    False,
                    'examples': ['n00', 'n01'],
                    'default':
                    '',
                    'type':
                    'string',
                    'description':
                    """Any string that is unique among all QNode key fields, with recommended format n00, n01, n02, etc.
                        If no value is provided, autoincrementing values beginning at n00 are used.""",
                },
                'id': {
                    'is_required':
                    False,
                    'examples':
                    ['DOID:9281', '[UniProtKB:P12345,UniProtKB:Q54321]'],
                    'type':
                    'string',
                    'description':
                    'Any compact URI (CURIE) (e.g. DOID:9281) (May also be a list like [UniProtKB:P12345,UniProtKB:Q54321])',
                },
                'name': {
                    'is_required':
                    False,
                    'examples': ['hypertension', 'insulin'],
                    'type':
                    'string',
                    'description':
                    'Any name of a bioentity that will be resolved into a CURIE if possible or result in an error if not (e.g. hypertension, insulin)',
                },
                'category': {
                    'is_required':
                    False,
                    'examples': ['protein', 'chemical_substance', 'disease'],
                    'type':
                    'ARAXnode',
                    'description':
                    'Any valid Translator bioentity category (e.g. protein, chemical_substance, disease)',
                },
                'is_set': {
                    'is_required':
                    False,
                    'enum':
                    ["true", "false", "True", "False", "t", "f", "T", "F"],
                    'examples': ['true', 'false'],
                    'type':
                    'boolean',
                    'description':
                    'If set to true, this QNode represents a set of nodes that are all in common between the two other linked QNodes (assumed to be false if not specified or value is not recognized as true/t case insensitive)'
                },
                'option_group_id': {
                    'is_required':
                    False,
                    'examples': ['1', 'a', 'b2', 'option'],
                    'type':
                    'string',
                    'description':
                    'A group identifier indicating that a group of nodes and edges should either all be included or all excluded. All elements in this group are treated as an optional match. If not included, the node will be treated as required.'
                },
            }
        }

        if describe:
            return command_definition

        #### Extract the message to work on
        message = response.envelope.message

        #### Basic checks on arguments
        if not isinstance(input_parameters, dict):
            response.error("Provided parameters is not a dict",
                           error_code="ParametersNotDict")
            return response

        #### Define a complete set of allowed parameters and their defaults
        parameters = {
            'key': None,
            'id': None,
            'name': None,
            'category': None,
            'is_set': None,
            'option_group_id': None,
        }

        #### Loop through the input_parameters and override the defaults and make sure they are allowed
        for key, value in input_parameters.items():
            if key not in parameters:
                response.error(f"Supplied parameter {key} is not permitted",
                               error_code="UnknownParameter")
            else:
                parameters[key] = value

        #### Check for option_group_id and is_set:
        if parameters['option_group_id'] is not None and parameters[
                'id'] is None and parameters['name'] is None:
            if parameters['is_set'] is None:
                parameters['is_set'] = 'true'
                response.warning(
                    f"An 'option_group_id' was set to {parameters['option_group_id']}, but 'is_set' was not an included parameter. It must be true when an 'option_group_id' is given, so automatically setting to true. Avoid this warning by explicitly setting to true."
                )
            elif not (parameters['is_set'].lower() == 'true'
                      or parameters['is_set'].lower() == 't'):
                response.error(
                    f"When an 'option_group_id' is given 'is_set' must be set to true. However, supplied input for parameter 'is_set' was {parameters['is_set']}.",
                    error_code="InputMismatch")

        #### Return if any of the parameters generated an error (showing not just the first one)
        if response.status != 'OK':
            return response

        #### Now apply the filters. Order of operations is probably quite important
        #### Scalar value filters probably come first like minimum_confidence, then complex logic filters
        #### based on edge or node properties, and then finally maximum_results
        response.info(
            f"Adding a QueryNode to Message with input parameters {parameters}"
        )

        #### Make sure there's a query_graph already here
        if message.query_graph is None:
            message.query_graph = QueryGraph()
            message.query_graph.nodes = {}
            message.query_graph.edges = {}
        if message.query_graph.nodes is None:
            message.query_graph.nodes = {}

        #### Set up the NodeSynonymizer to find curies and names
        synonymizer = NodeSynonymizer()

        # Create the QNode and set the key
        qnode = QNode()
        if parameters['key'] is not None:
            key = parameters['key']
        else:
            key = self.__get_next_free_node_key()

        if parameters['option_group_id'] is not None:
            qnode.option_group_id = parameters['option_group_id']

        # Set the is_set parameter to what the user selected
        if parameters['is_set'] is not None:
            qnode.is_set = (parameters['is_set'].lower() == 'true'
                            or parameters['is_set'].lower() == 't')

        #### If the id is specified, try to find that
        if parameters['id'] is not None:

            # If the id is a scalar then treat it here as a list of one
            if isinstance(parameters['id'], str):
                id_list = [parameters['id']]
                is_id_a_list = False
                if parameters['is_set'] is not None and qnode.is_set is True:
                    response.error(
                        f"Specified id '{parameters['id']}' is a scalar, but is_set=true, which doesn't make sense",
                        error_code="IdScalarButIsSetTrue")
                    return response

            # Or else set it up as a list
            elif isinstance(parameters['id'], list):
                id_list = parameters['id']
                is_id_a_list = True
                qnode.id = []
                if parameters['is_set'] is None:
                    response.warning(
                        f"Specified id '{parameters['id']}' is a list, but is_set was not set to true. It must be true in this context, so automatically setting to true. Avoid this warning by explicitly setting to true."
                    )
                    qnode.is_set = True
                else:
                    if qnode.is_set == False:
                        response.warning(
                            f"Specified id '{parameters['id']}' is a list, but is_set=false, which doesn't make sense, so automatically setting to true. Avoid this warning by explicitly setting to true."
                        )
                        qnode.is_set = True

            # Or if it's neither a list or a string, then error out. This cannot be handled at present
            else:
                response.error(
                    f"Specified id '{parameters['id']}' is neither a string nor a list. This cannot be handled",
                    error_code="IdNotListOrScalar")
                return response

            # Loop over the available ids and create the list
            for id in id_list:
                response.debug(f"Looking up id {id} in NodeSynonymizer")
                synonymizer_results = synonymizer.get_canonical_curies(
                    curies=[id])

                # If nothing was found, we won't bail out, but rather just issue a warning that this id is suspect
                if synonymizer_results[id] is None:
                    response.warning(
                        f"A node with id {id} is not in our knowledge graph KG2, but we will continue with it"
                    )
                    if is_id_a_list:
                        qnode.id.append(id)
                    else:
                        qnode.id = id

                # And if it is found, keep the same id but report the preferred id
                else:

                    response.info(f"id {id} was found. Adding it to the qnode")
                    if is_id_a_list:
                        qnode.id.append(id)
                    else:
                        qnode.id = id

                if 'category' in parameters and parameters[
                        'category'] is not None:
                    if isinstance(parameters['category'], str):
                        qnode.category = parameters['category']
                    else:
                        qnode.category = parameters['category'][0]

            message.query_graph.nodes[key] = qnode
            return response

        #### If the name is specified, try to find that
        if parameters['name'] is not None:
            name = parameters['name']
            response.debug(
                f"Looking up id for name '{name}' in NodeSynonymizer")
            synonymizer_results = synonymizer.get_canonical_curies(
                curies=[name], names=[name])

            if synonymizer_results[name] is None:
                response.error(
                    f"A node with name '{name}' is not in our knowledge graph",
                    error_code="UnresolvableNodeName")
                return response

            qnode.id = synonymizer_results[name]['preferred_curie']
            response.info(
                f"Creating QueryNode with id '{qnode.id}' for name '{name}'")
            if parameters['category'] is not None:
                qnode.category = parameters['category']
            message.query_graph.nodes[key] = qnode
            return response

        #### If the category is specified, just add that category. There should be a check that it is a legal category. FIXME
        if parameters['category'] is not None:
            qnode.category = parameters['category']
            if parameters['is_set'] is not None:
                qnode.is_set = (parameters['is_set'].lower() == 'true')
            message.query_graph.nodes[key] = qnode
            return response

        #### If we get here, it means that all three main parameters are null. Just a generic node with no category or anything. This is okay.
        message.query_graph.nodes[key] = qnode
        return response
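
    # Hedged usage sketches of add_qnode() in the ARAX DSL (keys and identifiers are taken from the
    # 'examples' fields above and are illustrative only):
    #
    #     add_qnode(name=hypertension, key=n00)
    #     add_qnode(id=DOID:9281, key=n00)
    #     add_qnode(category=protein, is_set=true, key=n01)
    #
    # A 'name' is resolved to a canonical curie via the NodeSynonymizer and produces an
    # UnresolvableNodeName error if unknown, whereas an unknown 'id' only triggers a warning.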
    def __init__(self, response, message, parameters):
        self.response = response
        self.message = message
        self.parameters = parameters
        self.global_iter = 0
        ## check if the new model files exist locally. If not, scp them from the locations specified in RTXConfig
        pathlist = os.path.realpath(__file__).split(os.path.sep)
        RTXindex = pathlist.index("RTX")
        filepath = os.path.sep.join([*pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'KnowledgeSources', 'Prediction'])
        self.drug_label_list = ['chemicalsubstance','drug']
        self.disease_label_list = ['disease','phenotypicfeature','diseaseorphenotypicfeature']

        ## check if there is LogModel.pkl
        log_model_name = RTXConfig.log_model_path.split("/")[-1]
        pkl_file = f"{filepath}{os.path.sep}{log_model_name}"
        if os.path.exists(pkl_file):
            pass
        else:
            #os.system("scp [email protected]:/data/orangeboard/databases/KG2.3.4/LogModel.pkl " + pkl_file)
            os.system(f"scp {RTXConfig.log_model_username}@{RTXConfig.log_model_host}:{RTXConfig.log_model_path} {pkl_file}")


        ## check if there is GRAPH.sqlite
        graph_database_name = RTXConfig.graph_database_path.split("/")[-1]
        db_file = f"{filepath}{os.path.sep}{graph_database_name}"
        if os.path.exists(db_file):
            pass
        else:
            #os.system("scp [email protected]:/data/orangeboard/databases/KG2.3.4/GRAPH.sqlite " + db_file)
            os.system(f"scp {RTXConfig.graph_database_username}@{RTXConfig.graph_database_host}:{RTXConfig.graph_database_path} {db_file}")

        ## check if there is DTD_probability_database.db
        DTD_prob_db_file = f"{filepath}{os.path.sep}{RTXConfig.dtd_prob_path.split('/')[-1]}"
        if os.path.exists(DTD_prob_db_file):
            pass
        else:
            #os.system("scp [email protected]:/data/orangeboard/databases/KG2.3.4/DTD_probability_database_v1.0.db " + DTD_prob_db_file)
            os.system(f"scp {RTXConfig.dtd_prob_username}@{RTXConfig.dtd_prob_host}:{RTXConfig.dtd_prob_path} {DTD_prob_db_file}")

        # use NodeSynonymizer to replace map.txt
        # check if there is map.txt
        # map_file = f"{filepath}/map.txt"
        # if os.path.exists(map_file):
        #     pass
        # else:
        #     os.system("scp [email protected]:/home/ubuntu/drug_repurposing_model_retrain/map.txt " + map_file)

        self.use_prob_db = True
        if self.use_prob_db is True:
            try:
                self.pred = predictor(DTD_prob_file=DTD_prob_db_file, use_prob_db=True)
            except:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Internal Error encountered connecting to the local DTD prediction database.")
        else:
            try:
                self.pred = predictor(model_file=pkl_file, use_prob_db=False)
            except:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Internal Error encountered connecting to the local LogModel.pkl file.")
            try:
                self.pred.import_file(None, graph_database=db_file)
            except:
                tb = traceback.format_exc()
                error_type, error, _ = sys.exc_info()
                self.response.error(tb, error_code=error_type.__name__)
                self.response.error(f"Internal Error encountered connecting to the local graph database file.")
        # with open(map_file, 'r') as infile:
        #     map_file_content = infile.readlines()
        #     map_file_content.pop(0) ## remove title
        #     self.known_curies = set(line.strip().split('\t')[0] for line in map_file_content)

        self.synonymizer = NodeSynonymizer()
def main():

    parser = argparse.ArgumentParser(
        description="Refresh DTD model and database",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--synoymizer_folder',
                        type=str,
                        help="Full path of folder containing NodeSynonymizer",
                        default='~/RTX/code/ARAX/NodeSynonymizer/')
    parser.add_argument(
        '--DTD_prob_db_file',
        type=str,
        help="Full path of DTD probability database file",
        default=
        '~/work/RTX/code/ARAX/KnowledgeSources/Prediction/DTD_probability_database_v1.0_KG2.3.4.db'
    )
    parser.add_argument(
        '--emb_file',
        type=str,
        help="Full path of DTD model embedding file",
        default=
        '~/work/RTX/code/ARAX/KnowledgeSources/Prediction/rel_max_v1.0_KG2.3.4.emb.gz'
    )
    parser.add_argument(
        '--map_file',
        type=str,
        help="Full path of DTD model mapping file",
        default=
        '~/work/RTX/code/ARAX/KnowledgeSources/Prediction/map_v1.0_KG2.3.4.txt'
    )
    parser.add_argument(
        '--output_folder',
        type=str,
        help="Full path of output folder",
        default='~/work/RTX/code/ARAX/KnowledgeSources/Prediction/')
    args = parser.parse_args()

    if os.path.isdir(args.synoymizer_folder):
        sys.path.append(args.synoymizer_folder)
        from node_synonymizer import NodeSynonymizer
        synonymizer = NodeSynonymizer()
    else:
        print(f"Error: Folder not found: {args.synoymizer_folder}")
        exit(1)

    if os.path.isfile(args.DTD_prob_db_file):
        print(f'Starting to refresh DTD_probability_database.db', flush=True)
        con = sqlite3.connect(args.DTD_prob_db_file)
        DTD_prob_table = pd.read_sql_query("SELECT * from DTD_PROBABILITY",
                                           con)
        con.close()
        DTD_prob_table = DTD_prob_table.apply(lambda row: [
            refresh_disease(row[0], synonymizer),
            refresh_drug(row[1], synonymizer), row[2]
        ],
                                              axis=1,
                                              result_type='expand')
        DTD_prob_table = DTD_prob_table.dropna().reset_index(drop=True)
        con = sqlite3.connect(
            os.path.join(args.output_folder,
                         'DTD_probability_database_refreshed.db'))
        con.execute(
            f"CREATE TABLE DTD_PROBABILITY( disease VARCHAR(255), drug VARCHAR(255), probability FLOAT )"
        )
        insert_command = "INSERT INTO DTD_PROBABILITY VALUES (?, ?, ?)"
        databasefile = list(DTD_prob_table.to_records(index=False))

        print(f"INFO: Populating table", flush=True)
        batch = list(range(0, len(databasefile), 5000))
        batch.append(len(databasefile))
        count = 0
        for i in range(len(batch)):
            if ((i + 1) < len(batch)):
                start = batch[i]
                end = batch[i + 1]
                rows = databasefile[start:end]
                con.executemany(insert_command, rows)
                con.commit()
                count = count + len(rows)
                percentage = round((count * 100.0 / len(databasefile)), 2)
                print(str(percentage) + "%..", end='', flush=True)

        print(f"INFO: Populating tables is completed", flush=True)

        print(f"INFO: Creating INDEXes on DTD_PROBABILITY", flush=True)
        con.execute(
            f"CREATE INDEX idx_DTD_PROBABILITY_disease ON DTD_PROBABILITY(disease)"
        )
        con.execute(
            f"CREATE INDEX idx_DTD_PROBABILITY_drug ON DTD_PROBABILITY(drug)")
        con.commit()
        con.close()
        print(f"INFO: Creating INDEXes is completed", flush=True)
    else:
        print(f"Error: File not found: {args.DTD_prob_db_file}")
        exit(1)

    if os.path.isfile(args.emb_file) and os.path.isfile(args.map_file):
        rel_max = pd.read_csv(args.emb_file, sep=' ', skiprows=1, header=None)
        mapfile = pd.read_csv(args.map_file, sep='\t', header=0)
        merged_table = mapfile.merge(rel_max, left_on='id', right_on=0)
        merged_table = merged_table.loc[:, ['curie'] +
                                        list(merged_table.columns)[3:]]
        new_curie_ids = [
            synonymizer.get_canonical_curies(curie)[curie]['preferred_curie']
            if synonymizer.get_canonical_curies(curie)[curie] is not None else
            None for curie in list(merged_table.curie)
        ]
        graph = pd.concat(
            [pd.DataFrame(new_curie_ids), merged_table.iloc[:, 1:]], axis=1)
        graph = graph.dropna().reset_index(drop=True)

        con = sqlite3.connect(
            os.path.join(args.output_folder, 'GRAPH_refreshed.sqlite'))
        con.execute(f"DROP TABLE IF EXISTS GRAPH")
        insert_command1 = f"CREATE TABLE GRAPH(curie VARCHAR(255)"
        for num in range(1, graph.shape[1]):
            insert_command1 = insert_command1 + f", col{num} INT"
        insert_command1 = insert_command1 + ")"
        con.execute(insert_command1)
        con.commit()

        count = 0

        print(f"Inserting data into the database", flush=True)
        for row in range(graph.shape[0]):
            count = count + 1
            insert_command1 = f"INSERT INTO GRAPH"
            insert_command2 = f" values ("

            for _ in range(graph.shape[1]):
                insert_command2 = insert_command2 + f"?,"

            insert_command = insert_command1 + insert_command2 + ")"
            insert_command = insert_command.replace(',)', ')')
            line = tuple(graph.loc[row, :])
            con.execute(insert_command, line)
            if count % 5000 == 0:
                con.commit()
                percentage = int(count * 100.0 / graph.shape[0])
                print(str(percentage) + "%..", end='', flush=True)

        con.commit()
        percentage = int(count * 100.0 / graph.shape[0])
        print(str(percentage) + "%..", end='', flush=True)

        con.execute(f"CREATE INDEX idx_GRAPH_curie ON GRAPH(curie)")
        con.commit()
        con.close()
        print(f"INFO: Database created successfully", flush=True)
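
# The refresh_disease() and refresh_drug() helpers used above are not shown in this excerpt.
# A minimal sketch, assuming they simply re-canonicalize each stored curie with the NodeSynonymizer
# and return None (so the row is removed by dropna()) when a curie can no longer be resolved:

def refresh_disease(curie, synonymizer):
    # Hypothetical helper: map an old disease curie to its current canonical curie, or None.
    info = synonymizer.get_canonical_curies(curie).get(curie)
    return info['preferred_curie'] if info else None

def refresh_drug(curie, synonymizer):
    # Hypothetical helper: map an old drug curie to its current canonical curie, or None.
    info = synonymizer.get_canonical_curies(curie).get(curie)
    return info['preferred_curie'] if info else None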
Example #30
class NGDDatabaseBuilder:
    def __init__(self, pubmed_directory_path, is_test, live="Production"):
        self.RTXConfig = RTXConfiguration()
        self.RTXConfig.live = live
        ngd_filepath = os.path.sep.join([
            *pathlist[:(RTXindex + 1)], 'code', 'ARAX', 'KnowledgeSources',
            'NormalizedGoogleDistance'
        ])
        self.pubmed_directory_path = pubmed_directory_path
        self.conceptname_to_pmids_db_path = "conceptname_to_pmids.db"
        self.curie_to_pmids_db_path = f"{ngd_filepath}{os.path.sep}{self.RTXConfig.curie_to_pmids_path.split('/')[-1]}"
        self.status = 'OK'
        self.synonymizer = NodeSynonymizer()
        self.is_test = is_test

    def build_conceptname_to_pmids_db(self):
        # This function extracts curie -> PMIDs mappings from a Pubmed XML download (saves data in a pickledb)
        print(
            f"Starting to build {self.conceptname_to_pmids_db_path} from pubmed files.."
        )
        start = time.time()
        pubmed_directory = os.fsencode(self.pubmed_directory_path)
        all_file_names = [
            os.fsdecode(file) for file in os.listdir(pubmed_directory)
        ]
        pubmed_file_names = [
            file_name for file_name in all_file_names
            if file_name.startswith('pubmed') and file_name.endswith('.xml.gz')
        ]
        if not pubmed_file_names:
            print(
                f"ERROR: Couldn't find any PubMed XML files to scrape. Provide the path to the directory "
                f"containing your PubMed download as a command line argument.")
            self.status = 'ERROR'
        else:
            conceptname_to_pmids_map = dict()
            # Go through each downloaded pubmed file and build our dictionary of mappings
            pubmed_file_names_to_process = pubmed_file_names if not self.is_test else pubmed_file_names[:1]
            for file_name in pubmed_file_names_to_process:
                print(
                    f"  Starting to process file '{file_name}'.. ({pubmed_file_names_to_process.index(file_name) + 1}"
                    f" of {len(pubmed_file_names_to_process)})")
                file_start_time = time.time()
                with gzip.open(f"{self.pubmed_directory_path}/{file_name}"
                               ) as pubmed_file:
                    file_contents_tree = etree.parse(pubmed_file)
                pubmed_articles = file_contents_tree.xpath("//PubmedArticle")

                for article in pubmed_articles:
                    # Link each concept name to the PMID of this article
                    current_pmid = article.xpath(
                        ".//MedlineCitation/PMID/text()")[0]
                    descriptor_names = article.xpath(
                        ".//MedlineCitation/MeshHeadingList/MeshHeading/DescriptorName/text()"
                    )
                    qualifier_names = article.xpath(
                        ".//MedlineCitation/MeshHeadingList/MeshHeading/QualifierName/text()"
                    )
                    chemical_names = article.xpath(
                        ".//MedlineCitation/ChemicalList/Chemical/NameOfSubstance/text()"
                    )
                    gene_symbols = article.xpath(
                        ".//MedlineCitation/GeneSymbolList/GeneSymbol/text()")
                    keywords = article.xpath(
                        ".//MedlineCitation/KeywordList/Keyword/text()")
                    all_concept_names = descriptor_names + qualifier_names + chemical_names + gene_symbols + keywords
                    unique_concept_names = {
                        concept_name
                        for concept_name in all_concept_names if concept_name
                    }
                    for concept_name in unique_concept_names:
                        self._add_pmids_mapping(concept_name, current_pmid,
                                                conceptname_to_pmids_map)

                self._destroy_etree(
                    file_contents_tree)  # Hack around lxml memory leak
                print(
                    f"    took {round((time.time() - file_start_time) / 60, 2)} minutes"
                )

            # Save the data to the PickleDB after we're done
            print("  Loading data into PickleDB..")
            conceptname_to_pmids_db = pickledb.load(
                self.conceptname_to_pmids_db_path, False)
            for concept_name, pmid_list in conceptname_to_pmids_map.items():
                conceptname_to_pmids_db.set(
                    concept_name,
                    list({
                        self._create_pmid_curie_from_local_id(pmid)
                        for pmid in pmid_list
                    }))
            print("  Saving PickleDB file..")
            conceptname_to_pmids_db.dump()
            print(
                f"Done! Building {self.conceptname_to_pmids_db_path} took {round(((time.time() - start) / 60) / 60, 3)} hours"
            )

    def build_curie_to_pmids_db(self):
        # This function creates a final sqlite database of curie->PMIDs mappings using data scraped from Pubmed AND KG2
        print(
            f"Starting to build {self.curie_to_pmids_db_path.split(os.path.sep)[-1]}.."
        )
        start = time.time()
        curie_to_pmids_map = dict()
        self._add_pmids_from_pubmed_scrape(curie_to_pmids_map)
        if self.status != 'OK':
            return
        self._add_pmids_from_kg2_edges(curie_to_pmids_map)
        self._add_pmids_from_kg2_nodes(curie_to_pmids_map)
        print(
            f"  In the end, found PMID lists for {len(curie_to_pmids_map)} (canonical) curies"
        )
        self._save_data_in_sqlite_db(curie_to_pmids_map)
        print(
            f"Done! Building {self.curie_to_pmids_db_path.split(os.path.sep)[-1]} took {round((time.time() - start) / 60)} minutes."
        )
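
    # A hedged example of reading the finished mapping back out of sqlite (the table schema is the
    # one created in _save_data_in_sqlite_db() below; the curie shown is hypothetical):
    #
    #     import json, sqlite3
    #     con = sqlite3.connect(self.curie_to_pmids_db_path)
    #     row = con.execute("SELECT pmids FROM curie_to_pmids WHERE curie = ?",
    #                       ("CHEMBL.COMPOUND:CHEMBL112",)).fetchone()
    #     pmids = json.loads(row[0]) if row else []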

    # Helper methods

    def _add_pmids_from_kg2_edges(self, curie_to_pmids_map):
        print(f"  Getting PMIDs from edges in KG2 neo4j..")
        edge_query = f"match (n)-[e]->(m) where e.publications is not null and e.publications <> '[]' " \
                     f"return distinct n.id, m.id, e.publications{' limit 100' if self.is_test else ''}"
        edge_results = self._run_cypher_query(edge_query, 'KG2')
        print(f"  Processing results..")
        node_ids = {result['n.id']
                    for result in edge_results
                    }.union(result['m.id'] for result in edge_results)
        canonicalized_curies_dict = self._get_canonicalized_curies_dict(
            list(node_ids))
        for result in edge_results:
            canonicalized_node_ids = {
                canonicalized_curies_dict[result['n.id']],
                canonicalized_curies_dict[result['m.id']]
            }
            pmids = self._extract_and_format_pmids(result['e.publications'])
            if pmids:  # Sometimes publications list includes only non-PMID identifiers (like ISBN)
                for canonical_curie in canonicalized_node_ids:
                    self._add_pmids_mapping(canonical_curie, pmids,
                                            curie_to_pmids_map)

    def _add_pmids_from_kg2_nodes(self, curie_to_pmids_map):
        print(f"  Getting PMIDs from nodes in KG2 neo4j..")
        node_query = f"match (n) where n.publications is not null and n.publications <> '[]' " \
                     f"return distinct n.id, n.publications{' limit 100' if self.is_test else ''}"
        node_results = self._run_cypher_query(node_query, 'KG2')
        print(f"  Processing results..")
        node_ids = {result['n.id'] for result in node_results}
        canonicalized_curies_dict = self._get_canonicalized_curies_dict(
            list(node_ids))
        for result in node_results:
            canonical_curie = canonicalized_curies_dict[result['n.id']]
            pmids = self._extract_and_format_pmids(result['n.publications'])
            if pmids:  # Sometimes publications list includes only non-PMID identifiers (like ISBN)
                self._add_pmids_mapping(canonical_curie, pmids,
                                        curie_to_pmids_map)

    def _add_pmids_from_pubmed_scrape(self, curie_to_pmids_map):
        # Load the data from the first half of the build process (scraping pubmed)
        print(
            f"  Loading pickle DB containing pubmed scrapings ({self.conceptname_to_pmids_db_path}).."
        )
        conceptname_to_pmids_db = pickledb.load(
            self.conceptname_to_pmids_db_path, False)
        if not conceptname_to_pmids_db.getall():
            print(
                f"ERROR: {self.conceptname_to_pmids_db_path} must exist to do a partial build. Use --full or locate "
                f"that file.")
            self.status = 'ERROR'
            return

        # Get canonical curies for all of the concept names in our big pubmed pickleDB using the NodeSynonymizer
        concept_names = list(conceptname_to_pmids_db.getall())
        print(
            f"  Sending NodeSynonymizer.get_canonical_curies() a list of {len(concept_names)} concept names.."
        )
        canonical_curies_dict = self.synonymizer.get_canonical_curies(
            names=concept_names)
        print(
            f"  Got results back from NodeSynonymizer. (Returned dict contains {len(canonical_curies_dict)} keys.)"
        )

        # Map all of the concept names scraped from pubmed to curies
        if canonical_curies_dict:
            recognized_concepts = {
                concept
                for concept in canonical_curies_dict
                if canonical_curies_dict.get(concept)
            }
            print(
                f"  NodeSynonymizer recognized {round((len(recognized_concepts) / len(concept_names)) * 100)}% of "
                f"concept names scraped from pubmed.")
            # Store which concept names the NodeSynonymizer didn't know about, for learning purposes
            unrecognized_concepts = set(canonical_curies_dict).difference(
                recognized_concepts)
            with open('unrecognized_pubmed_concept_names.txt',
                      'w+') as unrecognized_concepts_file:
                unrecognized_concepts_file.write(f"{unrecognized_concepts}")
            print(
                f"  Unrecognized concept names were written to 'unrecognized_pubmed_concept_names.txt'."
            )

            # Map the canonical curie for each recognized concept to the concept's PMID list
            print(f"  Mapping canonical curies to PMIDs..")
            for concept_name in recognized_concepts:
                canonical_curie = canonical_curies_dict[concept_name].get(
                    'preferred_curie')
                pmids_for_this_concept = conceptname_to_pmids_db.get(
                    concept_name)
                self._add_pmids_mapping(canonical_curie,
                                        pmids_for_this_concept,
                                        curie_to_pmids_map)
            print(
                f"  Mapped {len(curie_to_pmids_map)} canonical curies to PMIDs based on pubmed scrapings."
            )
        else:
            print(f"ERROR: NodeSynonymizer didn't return anything!")
            self.status = 'ERROR'

    def _save_data_in_sqlite_db(self, curie_to_pmids_map):
        print("  Loading data into sqlite database..")
        # Remove any preexisting version of this database
        if os.path.exists(self.curie_to_pmids_db_path):
            os.remove(self.curie_to_pmids_db_path)
        connection = sqlite3.connect(self.curie_to_pmids_db_path)
        cursor = connection.cursor()
        cursor.execute("CREATE TABLE curie_to_pmids (curie TEXT, pmids TEXT)")
        cursor.execute(
            "CREATE UNIQUE INDEX unique_curie ON curie_to_pmids (curie)")
        print(f"  Gathering row data..")
        rows = [[
            curie,
            json.dumps(
                list(
                    filter(None,
                           {self._get_local_id_as_int(pmid)
                            for pmid in pmids})))
        ] for curie, pmids in curie_to_pmids_map.items()]
        rows_in_chunks = self._divide_list_into_chunks(rows, 5000)
        print(f"  Inserting row data into database..")
        for chunk in rows_in_chunks:
            cursor.executemany(
                f"INSERT INTO curie_to_pmids (curie, pmids) VALUES (?, ?)",
                chunk)
            connection.commit()
        # Log how many rows we've added in the end (for debugging purposes)
        cursor.execute(f"SELECT COUNT(*) FROM curie_to_pmids")
        count = cursor.fetchone()[0]
        print(f"  Done saving data in sqlite; database contains {count} rows.")
        cursor.close()

    def _get_canonicalized_curies_dict(self,
                                       curies: List[str]) -> Dict[str, str]:
        print(
            f"  Sending a batch of {len(curies)} curies to NodeSynonymizer.get_canonical_curies()"
        )
        canonicalized_nodes_info = self.synonymizer.get_canonical_curies(
            curies)
        canonicalized_curies_dict = dict()
        for input_curie, preferred_info_dict in canonicalized_nodes_info.items(
        ):
            if preferred_info_dict:
                canonicalized_curies_dict[
                    input_curie] = preferred_info_dict.get(
                        'preferred_curie', input_curie)
            else:
                canonicalized_curies_dict[input_curie] = input_curie
        print(f"  Got results back from synonymizer")
        return canonicalized_curies_dict

    def _extract_and_format_pmids(self, publications: List[str]) -> List[str]:
        pmids = {
            publication_id
            for publication_id in publications
            if publication_id.upper().startswith('PMID')
        }
        # Make sure all PMIDs are given in same format (e.g., PMID:18299583 rather than PMID18299583)
        formatted_pmids = [
            self._create_pmid_curie_from_local_id(
                pmid.replace('PMID', '').replace(':', '')) for pmid in pmids
        ]
        return formatted_pmids

    @staticmethod
    def _add_pmids_mapping(key: str, value_to_append: Union[str, List[str]],
                           mappings_dict: Dict[str, List[str]]):
        if key not in mappings_dict:
            mappings_dict[key] = []
        if isinstance(value_to_append, list):
            mappings_dict[key] += value_to_append
        else:
            mappings_dict[key].append(value_to_append)

    @staticmethod
    def _create_pmid_curie_from_local_id(pmid):
        return f"PMID:{pmid}"

    @staticmethod
    def _get_local_id_as_int(curie):
        # Converts "PMID:1234" to 1234
        curie_pieces = curie.split(":")
        local_id_str = curie_pieces[-1]
        # Remove any strange characters (like in "PMID:_19960544")
        stripped_id_str = "".join(
            [character for character in local_id_str if character.isdigit()])
        return int(stripped_id_str) if stripped_id_str else None

    @staticmethod
    def _destroy_etree(file_contents_tree):
        # Thank you to https://stackoverflow.com/a/49139904 for this method; important to prevent memory blow-up
        root = file_contents_tree.getroot()
        element_tracker = {root: [0, None]}
        for element in root.iterdescendants():
            parent = element.getparent()
            element_tracker[element] = [element_tracker[parent][0] + 1, parent]
        element_tracker = sorted(
            [(depth, parent, child)
             for child, (depth, parent) in element_tracker.items()],
            key=lambda x: x[0],
            reverse=True)
        for _, parent, child in element_tracker:
            if parent is None:
                break
            parent.remove(child)
        del file_contents_tree

    @staticmethod
    def _run_cypher_query(cypher_query: str, kg='KG2') -> List[Dict[str, any]]:
        rtxc = RTXConfiguration()
        if kg == 'KG2':
            rtxc.live = "KG2"
        try:
            driver = GraphDatabase.driver(rtxc.neo4j_bolt,
                                          auth=(rtxc.neo4j_username,
                                                rtxc.neo4j_password))
            with driver.session() as session:
                query_results = session.run(cypher_query).data()
            driver.close()
        except Exception:
            tb = traceback.format_exc()
            error_type, error, _ = sys.exc_info()
            print(f"Encountered an error interacting with {kg} neo4j. {tb}")
            return []
        else:
            return query_results

    @staticmethod
    def _divide_list_into_chunks(input_list: List[any],
                                 chunk_size: int) -> List[List[any]]:
        num_chunks = len(input_list) // chunk_size if len(
            input_list) % chunk_size == 0 else (len(input_list) //
                                                chunk_size) + 1
        start_index = 0
        stop_index = chunk_size
        all_chunks = []
        for num in range(num_chunks):
            chunk = input_list[start_index:stop_index] if stop_index <= len(
                input_list) else input_list[start_index:]
            all_chunks.append(chunk)
            start_index += chunk_size
            stop_index += chunk_size
        return all_chunks
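
# A hedged end-to-end usage sketch for NGDDatabaseBuilder (the PubMed download path is
# hypothetical; a real build scrapes the PubMed XML files first, then assembles the sqlite mapping):
#
#     builder = NGDDatabaseBuilder("/path/to/pubmed_xml_download", is_test=True)
#     builder.build_conceptname_to_pmids_db()
#     if builder.status == 'OK':
#         builder.build_curie_to_pmids_db()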