def get_path_to_root(entity_id):
    """Return the names of every ancestor of *entity_id* up to the ontology root.

    :param entity_id: ontology identifier such as ``"CHEBI:17234"`` or
        ``"HP:0000118"``; the prefix selects which DiShIn database is loaded
    :return: list of ancestor concept names (strings) as reported by DiShIn
    """
    # Load the DiShIn semantic base that matches the identifier's ontology
    # prefix (the prefixes are mutually exclusive, so at most one matches).
    databases = (
        ('CHEBI', 'bin/DiShIn/chebi.db'),
        ('HP', 'bin/DiShIn/hp.db'),
        ('GO', 'bin/DiShIn/go.db'),
        ('DOID', 'bin/DiShIn/doid.db'),
    )
    for prefix, db_path in databases:
        if entity_id.startswith(prefix):
            ssm.semantic_base(db_path)
            break
    # DiShIn stores identifiers with '_' instead of ':' (e.g. CHEBI_17234).
    e1 = ssm.get_id(entity_id.replace(':', '_'))
    # The common ancestors of a node with itself are its path to the root.
    ancestors = ssm.common_ancestors(e1, e1)
    return [ssm.get_name(x) for x in ancestors]
if len(documents_entity_list[d][e]) > 0: average_correct_match_score.append( documents_entity_list[d][e][0]["score"]) #print(documents_entity_list[d][e][0]["score"]) print( "average_correct_match_score", sum(average_correct_match_score) / len(average_correct_match_score)) print("perfect match is solution", perfect_matches_correct) print("solution label is not a perfect match", perfect_matches_incorrect) # print("entities with incorrect perfect matches", entities_with_incorrect_perfect_matches) # print("average number of candidates", sum(ncandidates) / len(ncandidates)) # print("max number of candidates", max(ncandidates)) return documents_entity_list print("load semantic base") ssm.semantic_base("DiShIn/hp.db") max_dist = int(sys.argv[1]) min_match_score = float(sys.argv[2]) corpus_dir = sys.argv[3] #documents_entity_list = get_hpo_documents(corpus=("HPOtest/documents/", "HPOtest/annotations/"), min_match_score=min_match_score) print("get hpo documents") documents_entity_list = get_hpo_documents(corpus=corpus_dir, min_match_score=min_match_score) print("generate candidates") for d in documents_entity_list: candidates_filename = "candidates/{}/{}".format(corpus_dir, d) write_candidates(documents_entity_list[d], candidates_filename, max_dist, "hpo")
#print("max number of candidates", max(ncandidates)) return documents_entity_list # print(""perfect matches:", perfect_matches, # "partial matches:", partial_matches, # "label not found", label_not_found) def write_candidates_file(entities, d, max_dist, corpus="ChebiPatents"): candidates_filename = "candidates/{}/{}".format(corpus, d) writen = write_candidates(entities, candidates_filename, max_dist, "chebi") output.put(writen) # first argument: max distance between linked concepts # second argument: min similarity between entity text and candidate match ssm.semantic_base("DiShIn/chebi.db", to_memory=False) max_dist = int(sys.argv[1]) min_match_score = float(sys.argv[2]) #corpus_dir = "ChebiPatents" #corpus_dir = "ChebiTest" corpus_dir = sys.argv[3] start_time = time.time() documents_entity_list = get_chebi_patents(corpus_dir, min_match_score=min_match_score, mapto="chebi") print("parsing and get entities time:", time.time() - start_time) #documents_entity_list = get_chebi_patents(corpus_dir, mapto="dbpedia") entities_writen = 0 output = mp.Queue() """ processes = [mp.Process(target=write_candidates_file, args=(documents_entity_list[d], d, max_dist)) for d in documents_entity_list] print(processes) # Run processes
def get_common_ancestors(id1, id2):
    """Return the names of the common ancestors of two ontology concepts.

    :param id1: first ontology identifier (e.g. ``"CHEBI:17234"``)
    :param id2: second ontology identifier
    :return: list of common-ancestor concept names (strings)

    NOTE(review): if ``id1`` and ``id2`` come from different ontologies, the
    second ``ssm.semantic_base`` call replaces the base the first id was
    resolved against — confirm callers only pass same-ontology pairs.
    """
    databases = (
        ('CHEBI', 'bin/DiShIn/chebi.db'),
        ('HP', 'bin/DiShIn/hp.db'),
        ('GO', 'bin/DiShIn/go.db'),
        ('DOID', 'bin/DiShIn/doid.db'),
    )

    def _resolve(entity_id):
        # Load the semantic base matching the ontology prefix, then look up
        # the DiShIn-internal id (DiShIn uses '_' where ontologies use ':').
        for prefix, db_path in databases:
            if entity_id.startswith(prefix):
                ssm.semantic_base(db_path)
                break
        return ssm.get_id(entity_id.replace(':', '_'))

    e1 = _resolve(id1)
    e2 = _resolve(id2)
    ancestors = ssm.common_ancestors(e1, e2)
    return [ssm.get_name(x) for x in ancestors]
import logging
import os
import pickle
import sys
import atexit
from itertools import combinations

import obonet
import networkx
from fuzzywuzzy import process
from fuzzywuzzy import fuzz

from DiShIn import ssm

# Load the ChEBI semantic base once, at import time.
ssm.semantic_base("src/DiShIn/chebi.db")

# Module-level caches and counters (assigned below / elsewhere in this
# module): chebi_cache, paths_cache, chemical_entity, role,
# subatomic_particle, application, multiple_match_count, no_match_count.
# The original declared them with `global` statements here, but `global`
# at module scope is a no-op, so those statements were dropped.

chebi_cache_file = "temp/chebi_cache.pickle"  # store string -> chebi ID
if os.path.isfile(chebi_cache_file):
    logging.info("loading chebi...")
    # Context manager closes the handle promptly (the original passed an
    # unclosed open() straight into pickle.load).
    with open(chebi_cache_file, "rb") as cache_fh:
        chebi_cache = pickle.load(cache_fh)
    loadedchebi = True