def create_umls_ss_db(umls_kb, char_ngram_len=3, n_max_tokens=5):
    logging.info('Loading scispacy ...')
    import spacy
    sci_nlp = spacy.load('en_core_sci_md', disable=['tagger', 'parser', 'ner'])

    simstring_db = DictDatabase(CharacterNgramFeatureExtractor(char_ngram_len))

    # preprocessing aliases and labels
    logging.info('Preprocessing aliases ...')
    alias_mapping = defaultdict(set)
    aliases = []
    for cui in umls_kb.get_all_cuis():
        cui_aliases = set([a.lower() for a in umls_kb.get_aliases(cui, include_name=True)])
        for alias in cui_aliases:
            alias_chars = set(alias)
            if len(alias_chars.intersection(fb_punctuation)) > 0:
                continue
            elif alias in en_stopwords:
                continue
            elif alias.isnumeric():
                continue
            alias_doc = sci_nlp(alias)  # use same tokenizer as when splitting medmentions
            if len(alias_doc) > n_max_tokens:  # gets too big without restrictions
                continue
            alias_mapping[alias].add(cui)
            aliases.append(alias)

    logging.info('Adding to DB ...')
    for alias_idx, alias in enumerate(aliases):
        simstring_db.add(alias)
        if alias_idx % 1000000 == 0:
            logging.info('At %d/%d ...' % (alias_idx, len(aliases)))

    # setting paths
    db_path = '%s.aliases.%dgram.%dtoks.db' % (umls_kb.umls_version, char_ngram_len, n_max_tokens)
    map_path = '%s.aliases.%dtoks.map' % (umls_kb.umls_version, n_max_tokens)

    logging.info('Storing DB ...')
    with open(db_path, 'wb') as f:
        pickle.dump(simstring_db, f)

    logging.info('Storing Alias Mapping ...')
    with open(map_path, 'wb') as f:
        alias_mapping = dict(alias_mapping)
        pickle.dump(alias_mapping, f)
def __init__(self, word2vec):
    self.w2v = word2vec
    self.embedding_dim = self.w2v.vector_size
    self.vocab = set(self.w2v.vocab.keys())
    self.db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for vocab_word in self.vocab:
        self.db.add(vocab_word)
def load_disambiguation():
    db = DictDatabase(WordNgramFeatureExtractor(2))
    with open(LOCATION_WIKIPEDIA_DISAMBIGUATION) as disambig_file:
        for line in disambig_file:
            r = line.replace("_(disambiguation)", "").replace("_", " ").lower()
            db.add(r.strip())
    return Searcher(db, JaccardMeasure())
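A minimal usage sketch for the searcher returned by load_disambiguation; the query string and the 0.7 threshold are illustrative assumptions, not taken from the original snippet.

# Hypothetical caller: look up disambiguation page titles similar to a query title.
searcher = load_disambiguation()
matches = searcher.search("mercury", 0.7)  # threshold chosen for illustration
print(matches)  # list of matching disambiguation titles (plain strings)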
def setUp(self) -> None:
    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    db.add("foo")
    db.add("bar")
    db.add("fooo")
    db.add("food")
    db.add("fool")
    db.add("follow")
    self.searcher = Searcher(db, JaccardMeasure())
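A hypothetical test method to pair with this setUp; the 0.5 threshold is an assumption, and only assertions that hold regardless of how the remaining near-matches score are made.

def test_search_returns_close_matches(self):
    # an exact string always matches itself; "bar" shares no character bigrams with "foo"
    results = self.searcher.search("foo", 0.5)
    self.assertIn("foo", results)
    self.assertNotIn("bar", results)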
class MagnitudeOOV():

    def __init__(self, word2vec):
        self.w2v = word2vec
        self.embedding_dim = self.w2v.vector_size
        self.vocab = set(self.w2v.vocab.keys())
        self.db = DictDatabase(CharacterNgramFeatureExtractor(2))
        for vocab_word in self.vocab:
            self.db.add(vocab_word)

    def generate_pseudorandom_vector(self, word):
        """Calculate the pseudorandom vector (PRVG) from character n-grams."""
        vectors = []
        ngram_list = character_ngram(word)
        for ngram in ngram_list:
            np.random.seed(seed(ngram))
            vectors.append(np.random.uniform(-1, 1, self.embedding_dim))
        return np.mean(vectors, axis=0)

    def similar_words_top_k(self, query, measure=CosineMeasure(),
                            initial_threshold=0.99, dec_step=0.01, k=3):
        """Search for up to k similar words by approximate string matching."""
        searcher = Searcher(self.db, measure)
        t = initial_threshold
        similar_words = []
        while True:
            similar_words = searcher.search(query, t)
            if len(similar_words) >= k or t <= 0.1:
                break
            t -= dec_step
        if len(similar_words) > k:
            np.random.seed(42)
            return np.random.choice(similar_words, k, replace=False).tolist()
        else:
            return similar_words

    def generate_similar_words_vector(self, word):
        """Calculate the MATCH vector as the mean of similar-word embeddings."""
        vectors = np.mean([self.w2v[w] for w in self.similar_words_top_k(word)], axis=0)
        return vectors

    def out_of_vocab_vector(self, word):
        vector = (self.generate_pseudorandom_vector(word) * 0.3
                  + self.generate_similar_words_vector(word) * 0.7)
        final_vector = vector / np.linalg.norm(vector)
        return final_vector

    def query(self, word):
        normalized_word = neologdn.normalize(word)
        if word in self.vocab:
            return self.w2v[word]
        elif normalized_word in self.vocab:
            return self.w2v[normalized_word]
        else:
            return self.out_of_vocab_vector(normalized_word)
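A sketch of how MagnitudeOOV might be driven; the model path is a placeholder, and a gensim 3.x-style KeyedVectors object (with .vector_size, .vocab and item access, as the class above expects) is assumed.

import gensim

# Hypothetical driver for the class above.
w2v = gensim.models.KeyedVectors.load_word2vec_format("model.bin", binary=True)  # placeholder path
oov = MagnitudeOOV(w2v)
vec = oov.query("misspeled")  # falls back to out_of_vocab_vector for unseen words
print(vec.shape)              # (w2v.vector_size,)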
def output_similar_strings_of_each_line(path):
    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    with open(path, 'r') as lines:
        for line in lines:
            strings = line.rstrip('\r\n')
            db.add(strings)

    searcher = Searcher(db, CosineMeasure())
    with open(path, 'r') as lines:
        for line in lines:
            strings = line.rstrip('\r\n')
            result = [str(round(x[0], 5)) + ' ' + x[1]
                      for x in searcher.ranked_search(strings, 0.8)]
            print("\t".join([strings, ",".join(result)]))
def get_umls_data():
    umls_data = pd.read_csv(umls_df_data_path)
    print(f"Got UMLS data at length {len(umls_data)}")

    acronyms_umls_df = pd.read_csv(acronyms_dir + os.sep + 'acronyms_terms.csv')
    umls_data = pd.concat([umls_data, acronyms_umls_df])
    cuiless_umls_df = pd.read_csv(cuiless_dir + os.sep + 'cuiless_terms.csv')
    umls_data = pd.concat([umls_data, cuiless_umls_df])
    umls_data.reset_index(inplace=True)

    heb_umls_list = list(umls_data['HEB'].values)
    eng_umls_list = list(umls_data[STRING_COLUMN].values)

    heb_db = DictDatabase(CharacterNgramFeatureExtractor(2))
    eng_db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for heb_w in heb_umls_list:
        heb_db.add(heb_w)
    for eng_w in eng_umls_list:
        lower_eng_w = eng_w.lower()
        eng_db.add(lower_eng_w)

    return heb_db, eng_db, umls_data
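A sketch of what a caller might do with the two databases returned above; the query string and the 0.85 threshold are illustrative, and the simstring Searcher/CosineMeasure imports used elsewhere in these snippets are assumed.

# Hypothetical caller: one searcher per language for approximate UMLS lookups.
heb_db, eng_db, umls_data = get_umls_data()
heb_searcher = Searcher(heb_db, CosineMeasure())
eng_searcher = Searcher(eng_db, CosineMeasure())
print(eng_searcher.search("diabetes mellitus", 0.85))  # candidate English UMLS strings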
def simstring_database(umls, nchar_val):
    db = DictDatabase(CharacterNgramFeatureExtractor(nchar_val))
    term_to_cui = dict()
    for value in umls:
        try:
            data = value.split('\t')
            cui = data[0]
            term = data[1].lower()
            db.add(term)
            term_to_cui[term] = cui
        except Exception:
            continue

    with open('db.pickle', 'wb') as f:
        pickle.dump(db, f)
    with open('term_to_cui.pickle', 'wb') as f:
        pickle.dump(term_to_cui, f)
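A sketch of the read side, assuming the two pickle files written above already exist; the query term and the 0.8 threshold are illustrative.

# Hypothetical lookup step: load the pickled database and mapping, then resolve
# an input term to candidate CUIs via approximate matching.
with open('db.pickle', 'rb') as f:
    db = pickle.load(f)
with open('term_to_cui.pickle', 'rb') as f:
    term_to_cui = pickle.load(f)

searcher = Searcher(db, CosineMeasure())
for term in searcher.search('myocardial infarction', 0.8):
    print(term, term_to_cui[term])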
class TestDict(TestCase):
    strings = ['a', 'ab', 'abc', 'abcd', 'abcde']

    def setUp(self):
        self.db = DictDatabase(CharacterNgramFeatureExtractor(2))
        for string in self.strings:
            self.db.add(string)

    def test_strings(self):
        self.assertEqual(self.db.strings, self.strings)

    def test_min_feature_size(self):
        self.assertEqual(self.db.min_feature_size(),
                         min(map(lambda x: len(x) + 1, self.strings)))

    def test_max_feature_size(self):
        self.assertEqual(self.db.max_feature_size(),
                         max(map(lambda x: len(x) + 1, self.strings)))

    def test_lookup_strings_by_feature_set_size_and_feature(self):
        self.assertEqual(
            self.db.lookup_strings_by_feature_set_size_and_feature(4, 'ab_1'),
            set(['abc']))
        self.assertEqual(
            self.db.lookup_strings_by_feature_set_size_and_feature(3, 'ab_1'),
            set(['ab']))
        self.assertEqual(
            self.db.lookup_strings_by_feature_set_size_and_feature(2, 'ab_1'),
            set([]))
def output_similar_strings_of_each_line(path, measure):
    strings = []
    with open(path, "r") as lines:
        for line in lines:
            strings.append(line.rstrip("\r\n"))

    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for string in strings:
        db.add(string)

    db.save("companies.db")
    dbl = DictDatabase.load("companies.db")
    searcher = Searcher(dbl, measure)

    profiler.start()
    for string in strings:
        result = searcher.search(string, 0.8)
        # result = [str(np.round(x[0], 5)) + ' ' + x[1] for x in searcher.ranked_search(string, 0.8)]
        # print("\t".join([string, ",".join(result)]))
    profiler.stop()

    profiler.print()
    profiler.open_in_browser()
def construct_ontology(ontology_data):
    '''
    Create an n-char simstring database and term-to-code mapping
    to enable rapid ontology querying
    '''
    database = DictDatabase(CharacterNgramFeatureExtractor(2))

    term_to_cui = {}
    for entry in ontology_data:
        entry_values = entry.split('\t')
        if len(entry_values) == 2:
            term = clean_selected_term(entry_values[1])
            term_to_cui[term] = entry_values[0].strip()

    for term in term_to_cui.keys():
        term = clean_selected_term(term)
        database.add(term)

    return database, term_to_cui
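A sketch of how the returned database and mapping might be queried together; it assumes ontology_data and clean_selected_term from the snippet above are available, and the raw mention and 0.8 cut-off are illustrative.

# Hypothetical query: find ontology terms close to a raw mention and map them to codes.
database, term_to_cui = construct_ontology(ontology_data)
searcher = Searcher(database, CosineMeasure())
for score, term in searcher.ranked_search(clean_selected_term('heart attack'), 0.8):
    print(score, term, term_to_cui[term])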
def __init__(self, from_dir: str = None, json_path: str = "mapper.json",
             umls_words: Iterable[str] = None):
    # self.db = DictDatabase(WordNgramFeatureExtractor(2))
    self.db = DictDatabase(CharacterNgramFeatureExtractor(2))

    if from_dir:
        json_path = os.path.join(from_dir, json_path)
        if os.path.exists(json_path):
            print(f"initialize {self.__class__.__name__}... Load json")
            self.umls_dict, self.umls_reverse_dict = self.load_from_json(json_path)
            self.add_words_to_db(self.umls_dict.keys())
        else:
            print(f"initialize {self.__class__.__name__}... Load dir")
            self.umls_dict, self.umls_reverse_dict = self.load_umls_dict(from_dir)
            self.add_words_to_db(self.umls_dict.keys())
            self.save_as_json(path=json_path)
    else:
        self.add_words_to_db(umls_words)
class TestDict(TestCase):
    strings = ['a', 'ab', 'abc', 'abcd', 'abcde']

    def setUp(self):
        self.db = DictDatabase(CharacterNgramFeatureExtractor(2))
        for string in self.strings:
            self.db.add(string)

    def test_strings(self):
        self.assertEqual(self.db.strings, self.strings)

    def test_min_feature_size(self):
        self.assertEqual(self.db.min_feature_size(),
                         min(map(lambda x: len(x) + 1, self.strings)))

    def test_max_feature_size(self):
        self.assertEqual(self.db.max_feature_size(),
                         max(map(lambda x: len(x) + 1, self.strings)))

    def test_lookup_strings_by_feature_set_size_and_feature(self):
        self.assertEqual(self.db.lookup_strings_by_feature_set_size_and_feature(4, 'ab'),
                         set(['abc']))
        self.assertEqual(self.db.lookup_strings_by_feature_set_size_and_feature(3, 'ab'),
                         set(['ab']))
        self.assertEqual(self.db.lookup_strings_by_feature_set_size_and_feature(2, 'ab'),
                         set([]))
def __init__(
        self,
        base_ges_data='ges_utils/data/ges-health-problems.json',
        no_ges_str='UNK',
        alpha=0.2,
        n_chars=4,
        n_words=[2],
        special_words=['vih']):
    self.alpha = alpha
    with open(base_ges_data, 'r', encoding='utf-8') as f:
        self.__ges_dict = json.load(f)

    # feature extractor
    extractor = GESSyntacticFeatureExtractor(
        n_chars=n_chars,
        n_words=n_words,
        special_words=special_words)
    self.__db = DictDatabase(extractor)

    # cache
    self.__cache = {}

    self.__problems_from_disease = defaultdict(set)
    self.__ids_from_disease = defaultdict(set)
    self.__problems = {}
    self.__ids = {}
    self.__problems[-1] = no_ges_str
    self.__ids[no_ges_str] = -1

    # For now the ids are simply the order of the problems in the json.
    # TODO: decide whether the ids should come from some standard source.
    for i, problem in enumerate(self.__ges_dict):
        problem_id = i + 1
        self.__problems[problem_id] = problem
        self.__ids[problem] = problem_id

        # also register the problem itself as if it were a disease
        self.__problems_from_disease[problem].add(problem)
        self.__ids_from_disease[problem].add(problem_id)

        # add to the DB
        self.__db.add(problem)

        for disease in self.__ges_dict[problem]:
            self.__problems_from_disease[disease].add(problem)
            self.__ids_from_disease[disease].add(problem_id)

            # add to the DB
            self.__db.add(disease)

    # TODO: add extra data for matching diseases and problems
    self.__searcher = Searcher(self.__db, CosineMeasure())
def setUp(self):
    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for string in self.strings:
        db.add(string)
    self.searcher = Searcher(db, CosineMeasure())
#############################
# Perform database cleaning #
#############################

# Read in branded foods CSV and clean it
df = pd.read_csv('branded_food.csv')
all_ingredients_final = get_cleaned_ingredients_list(df)

# Get a count of all the ingredients, used by the Peter Norvig implementation
ingredients_count = Counter(all_ingredients_final)

##############################################
# Peter Norvig SimString Implementation Code #
##############################################

# Populate database with all ingredients
db = DictDatabase(CharacterNgramFeatureExtractor(2))
for ingredient in all_ingredients_final:
    db.add(ingredient)

# Create searcher object to be used by the candidates function
searcher = Searcher(db, CosineMeasure())


# Functions
def probability(word, N=sum(ingredients_count.values())):
    """
    Returns the probability of the word appearing in the text.
    Usually, correctly spelled words have a higher count, and therefore
    a higher probability, than their misspellings.
    """
    return ingredients_count[word] / N
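The comment above refers to a candidates function that the snippet ends before defining; the following is only a guess at the usual Norvig-style pairing of simstring candidate generation with the frequency-based probability, not the original implementation.

def candidates(word, threshold=0.8):
    """Possible corrections: close matches from the simstring DB, else the word itself."""
    matches = searcher.search(word, threshold)
    return matches if matches else [word]


def correction(word):
    """Most probable ingredient spelling for the given word."""
    return max(candidates(word), key=probability)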
class UMLSMapper:
    # https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.T.concept_names_and_sources_file_mr/

    def __init__(self, from_dir: str = None, json_path: str = "mapper.json",
                 umls_words: Iterable[str] = None):
        # self.db = DictDatabase(WordNgramFeatureExtractor(2))
        self.db = DictDatabase(CharacterNgramFeatureExtractor(2))

        if from_dir:
            json_path = os.path.join(from_dir, json_path)
            if os.path.exists(json_path):
                print(f"initialize {self.__class__.__name__}... Load json")
                self.umls_dict, self.umls_reverse_dict = self.load_from_json(json_path)
                self.add_words_to_db(self.umls_dict.keys())
            else:
                print(f"initialize {self.__class__.__name__}... Load dir")
                self.umls_dict, self.umls_reverse_dict = self.load_umls_dict(from_dir)
                self.add_words_to_db(self.umls_dict.keys())
                self.save_as_json(path=json_path)
        else:
            self.add_words_to_db(umls_words)

        # if from_dir:
        #     print(f"initialize {self.__class__.__name__}... Load dir")
        #     self.umls_dict, self.umls_reverse_dict = self.load_umls_dict(from_dir)
        #     self.add_words_to_db(self.umls_dict.keys())
        # elif json_path:
        #     print(f"initialize {self.__class__.__name__}... Load json")
        #     self.umls_dict, self.umls_reverse_dict = self.load_from_json(json_path)
        #     self.add_words_to_db(self.umls_dict.keys())
        # else:
        #     self.add_words_to_db(umls_words)

    def load_umls_dict(self, directory):
        path = os.path.join(directory, "GER_MRCONSO.RRF")
        df = pd.read_csv(path, delimiter="|", header=None)
        df.columns = [
            "CUI", "LAT", "TS", "LUI", "STT", "SUI", "ISPREF", "AUI", "SAUI",
            "SCUI", "SDUI", "SAB", "TTY", "CODE", "STR", "SRL", "SUPPRESS",
            "CVF", "NONE"
        ]
        df = df.drop(columns=['NONE'])
        dic = {row["STR"]: row["CUI"] for i, row in df.iterrows()}
        rev_dic = defaultdict(list)
        for key, value in dic.items():
            rev_dic[value].append(key)
        return dic, rev_dic

    def save_as_json(self, path: str):
        # copy the instance dict so popping "db" does not remove it from the object
        data = dict(self.__dict__)
        data.pop("db")
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=0)

    def load_from_json(self, path: str):
        with open(path, 'r', encoding='utf-8') as file:
            data = json.loads(file.read())
        return data["umls_dict"], data["umls_reverse_dict"]

    def add_words_to_db(self, words: Iterable[str]):
        for token in set(words):
            self.db.add(token)

    def search_term_sims(self, term: str) -> List[str]:
        searcher = Searcher(self.db, CosineMeasure())
        return searcher.search(term, 0.8)

    def search_all_term_sims(self, terms: List[str]) -> Dict[str, List[str]]:
        dic = {}
        for term in set(terms):
            related_terms = self.search_term_sims(term)
            if len(related_terms) > 0:
                dic[term] = related_terms
        return dic
        # return {term: self.search_term_sims(term) for term in set(terms)}

    def standardize_words(self, tokens: List[str]):
        concept_dict = self.search_all_term_sims(tokens)
        standardized_tokens = []
        for token in tokens:
            mapping = concept_dict.get(token)
            if mapping:
                standardized_tokens.append(mapping[0])
            else:
                standardized_tokens.append(token)
        return standardized_tokens

    def standardize_documents(self, documents: List[List[str]]):
        concept_dict = self.search_all_term_sims(list(chain(*documents)))
        standardized_documents = []
        for document in tqdm(documents):
            standardized_tokens = []
            for token in document:
                mapping = concept_dict.get(token)
                if mapping:
                    standardized_tokens.append(mapping[0])
                else:
                    standardized_tokens.append(token)
            standardized_documents.append(standardized_tokens)
        return standardized_documents

    def get_umls_vectors_only(self, vectors: gensim.models.KeyedVectors):
        medical_concepts = [
            word for word in vectors.index2word
            if word in self.umls_dict.values()
        ]
        concept_vecs = {
            concept: vectors.get_vector(concept)
            for concept in medical_concepts
        }
        return concept_vecs

    def un_umls(self, concept, single_return=True):
        res = self.umls_reverse_dict.get(concept)
        if res is None:
            return concept
        if single_return:
            return res[0]
        else:
            return res

    def replace_umls(self, tokens: List[str]) -> List[str]:
        return [self.un_umls(token) for token in tokens if self.un_umls(token)]

    def umls_code(self, token, delete_non_umls):
        umls_code = self.umls_dict.get(token)
        if umls_code is None:
            if delete_non_umls:
                return None
            else:
                return token
        else:
            return umls_code

    def replace_with_umls(self, tokens: List[str], delete_non_umls=False) -> List[str]:
        return [
            self.umls_code(token, delete_non_umls) for token in tokens
            if self.umls_code(token, delete_non_umls)
        ]

    def replace_documents_token_based(self, documents: List[str], delete_non_umls=False,
                                      tokenize: bool = True) -> Union[List[List[str]], List[str]]:
        tokenized_documents = [sentence.split() for sentence in documents]
        if tokenize:
            return [[
                self.umls_code(token, delete_non_umls) for token in tokens
                if self.umls_code(token, delete_non_umls)
            ] for tokens in tokenized_documents]
        else:
            return [
                ' '.join([
                    self.umls_code(token, delete_non_umls) for token in tokens
                    if self.umls_code(token, delete_non_umls)
                ]) for tokens in tokenized_documents
            ]

    def spacy_tokenize(self, documents: List[str], nlp=None) -> List[List[str]]:
        if nlp is None:
            nlp = spacy.load('de_core_news_sm')
        doc_pipe = list(nlp.pipe(documents, disable=["tagger", "parser", "ner"]))
        tokenized_docs = [[token.text for token in doc]
                          for doc in tqdm(doc_pipe, desc="Tokenize", total=len(documents))]
        return tokenized_docs

    def replace_documents_with_spacy_multiterm(self, documents: List[str],
                                               tokenize: bool = True) -> List[List[str]]:
        nlp = spacy.load('de_core_news_sm')
        matcher = PhraseMatcher(nlp.vocab)
        terms = self.umls_dict.keys()
        doc_pipe = list(nlp.pipe(documents, disable=["tagger", "parser", "ner"]))
        # Only run nlp.make_doc to speed things up
        patterns = [nlp.make_doc(term) for term in terms]
        matcher.add("TerminologyList", None, *patterns)

        replaced_docs = []
        for doc in tqdm(doc_pipe, desc="Replace with concepts", total=len(documents)):
            text_doc = doc.text
            matches = matcher(doc)
            concepts = []
            for match_id, start, end in matches:
                span = doc[start:end]
                concepts.append(span.text)
            concepts.sort(key=lambda s: len(s), reverse=True)
            for concept in concepts:
                text_doc = text_doc.replace(concept, self.umls_dict[concept])
            replaced_docs.append(text_doc)
            # tokens = [token for token in text_doc.split()]
            # replaced_docs.append(tokens)

        if tokenize:
            replaced_docs = self.spacy_tokenize(replaced_docs, nlp)
            # doc_pipe = list(nlp.pipe(replaced_docs, disable=["tagger", "parser", "ner"]))
            # replaced_docs = [[token.text for token in doc] for doc in tqdm(doc_pipe, desc="Tokenize", total=len(documents))]

        return replaced_docs
def setUp(self):
    self.db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for string in self.strings:
        self.db.add(string)
        total_count = df[df['name'].isin(passed_list)]['count'].sum()
        combined_passed_name = ':'.join(passed_list)

        # extract the most frequent word
        keywords = [surface for surface in analyzer.analyze(combined_passed_name)]
        frequent_word = {'key': 'No Key', 'count': 0}
        for key in keywords:
            if len(key) < 3:
                continue
            count = combined_passed_name.count(key)
            if count > frequent_word['count']:
                frequent_word['key'] = key
                frequent_word['count'] = count

        # print("Count: {}, Names: {}".format(total_count, combined_passed_name))
        writer.writerow([frequent_word['key'], total_count, combined_passed_name])
        keys = np.delete(keys, np.where(np.isin(keys, passed_list) == True))
        # print("Grouping keys... size: {}, keys: {}. Unpassed keys... size: {}".format(len(passed_list), passed_list, len(keys)))

    file.close()
    print("End")


if __name__ == "__main__":
    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    main()
def make_change_image_dict(drink_names):
    import re
    import json
    import difflib
    from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
    from simstring.measure.cosine import CosineMeasure
    from simstring.database.dict import DictDatabase
    from simstring.searcher import Searcher

    ff = open('jsons/theCocktailDB_allData_20181010.json', 'r', encoding="utf-8_sig")
    json_data2 = json.load(ff)
    ff.close()

    # lists of strings whose similarity is compared against each other
    STR_db = [re.sub(r'[!-/:-@[-`{-~]', " ", d["en"]) for d in drink_names]
    TCD_db = {re.sub(r'[!-/:-@[-`{-~]', " ", d["drinks"][0]["strDrink"]): d["drinks"][0]["strDrinkThumb"]
              for d in json_data2}
    TCD_name_db = list(TCD_db.keys())

    count = 0
    length = len(STR_db)
    result_dict = {}
    change_image_dict = {}

    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for str1 in STR_db:
        db.add(str1)

    for str2 in TCD_name_db:
        result_dict[str2] = {}
        searcher = Searcher(db, CosineMeasure())
        i = 1.0  # similarity score; results are returned in the 0.0-1.0 range
        flag = False
        for str1 in STR_db:
            s = difflib.SequenceMatcher(None, str2, str1).ratio()
            if s > 0.75:
                flag = True
                if str1 in result_dict[str2]:
                    d = result_dict[str2][str1]
                    # update the running average
                    d = [(d[0] * d[1] + s) / (d[1] + 1), d[1] + 1]
                    result_dict[str2][str1] = d
                else:
                    result_dict[str2].setdefault(str1, [s, 1])
        temp = []
        while i >= 0.65:
            result = searcher.search(str2, i)
            if len(result):
                flag = True
                for str1 in result:
                    if str1 in temp:
                        continue
                    temp += [str1]
                    if str1 in result_dict[str2]:
                        d = result_dict[str2][str1]
                        # update the running average
                        d = [(d[0] * d[1] + i) / (d[1] + 1), d[1] + 1]
                        result_dict[str2][str1] = d
                    else:
                        result_dict[str2].setdefault(str1, [i, 1])
            i -= 0.001
        if flag:
            count += 1

    with open("./search_log.txt", "w+", encoding="utf-8_sig") as f:
        real_count = 0
        for str2 in TCD_name_db:
            print("\n", file=f)
            print("\n")
            print(">> " + str2, file=f)
            print(">> " + str2)
            M = 0.0
            name = ""
            for key, value_list in result_dict[str2].items():
                if M < value_list[0]:
                    name = key
                    M = value_list[0]
            print(" " + name + ": " + str(M), file=f)
            if M != 0:
                if M >= 0.76:
                    print(" " + name + ": " + str(M))
                    print("ok", file=f)
                    print("ok")
                    change_image_dict[name] = TCD_db[str2]
                    real_count += 1
                else:
                    print(" " + name + ": " + str(M))
                    print("out", file=f)
                    print("out")
        print("\nmatch is {count}/{length} but real_match is {real_count}/{length}".format(
            count=count, real_count=real_count, length=length), file=f)
        print("\nmatch is {count}/{length} but real_match is {real_count}/{length}".format(
            count=count, real_count=real_count, length=length))

    return change_image_dict
class GESSimpleMatcher:
    '''
    Class for simple matching of GES health problems. It only considers
    string similarity, nothing very sophisticated.

    Based on code by Fabián Villena (https://fabianvillena.cl).

    It currently uses a feature extractor that combines characters and words
    and includes a few GES-text-specific details.

    TODO:
    - try slightly more sophisticated matching techniques
    - complete the documentation
    '''

    def __init__(
            self,
            base_ges_data='ges_utils/data/ges-health-problems.json',
            no_ges_str='UNK',
            alpha=0.2,
            n_chars=4,
            n_words=[2],
            special_words=['vih']):
        self.alpha = alpha
        with open(base_ges_data, 'r', encoding='utf-8') as f:
            self.__ges_dict = json.load(f)

        # feature extractor
        extractor = GESSyntacticFeatureExtractor(
            n_chars=n_chars,
            n_words=n_words,
            special_words=special_words)
        self.__db = DictDatabase(extractor)

        # cache
        self.__cache = {}

        self.__problems_from_disease = defaultdict(set)
        self.__ids_from_disease = defaultdict(set)
        self.__problems = {}
        self.__ids = {}
        self.__problems[-1] = no_ges_str
        self.__ids[no_ges_str] = -1

        # For now the ids are simply the order of the problems in the json.
        # TODO: decide whether the ids should come from some standard source.
        for i, problem in enumerate(self.__ges_dict):
            problem_id = i + 1
            self.__problems[problem_id] = problem
            self.__ids[problem] = problem_id

            # also register the problem itself as if it were a disease
            self.__problems_from_disease[problem].add(problem)
            self.__ids_from_disease[problem].add(problem_id)

            # add to the DB
            self.__db.add(problem)

            for disease in self.__ges_dict[problem]:
                self.__problems_from_disease[disease].add(problem)
                self.__ids_from_disease[disease].add(problem_id)

                # add to the DB
                self.__db.add(disease)

        # TODO: add extra data for matching diseases and problems
        self.__searcher = Searcher(self.__db, CosineMeasure())

    def get_ranking_ges_diseases(self, raw_string):
        ranking = self.__searcher.ranked_search(raw_string, alpha=self.alpha)
        return ranking

    def get_ges_problem(self, raw_string):
        problem_id = self.get_ges_id(raw_string)
        problem = self.__problems[problem_id]
        return problem

    def get_ges_id(self, raw_string):
        # if already computed, return the cached value
        if raw_string in self.__cache:
            return self.__cache[raw_string]
        # otherwise compute it
        ranking = self.get_ranking_ges_diseases(raw_string)
        if ranking:
            # ipdb.set_trace()
            (v, disease) = ranking[0]
            problem_ids = self.__ids_from_disease[disease]
            problem_id = list(problem_ids)[0]
            self.__cache[raw_string] = problem_id
            return problem_id
        else:
            self.__cache[raw_string] = -1
            return -1

    def get_possible_ges_ids(self, raw_string):
        to_search = raw_string
        problem_ids = []
        # search for the candidate diseases
        candidate_diseases = self.__searcher.search(to_search, alpha=self.alpha)
        for disease in candidate_diseases:
            problem_ids.extend(self.__ids_from_disease[disease])
        problem_ids_counter = Counter(problem_ids)
        ordered_ids = [i for i, _ in problem_ids_counter.most_common()]
        return ordered_ids

    def get_ges_id_prev(self, raw_string):
        # if already computed, return the cached value
        if raw_string in self.__cache:
            return self.__cache[raw_string]
        ids_list = self.get_possible_ges_ids(raw_string)
        if not ids_list:
            self.__cache[raw_string] = -1
            return -1
        else:
            self.__cache[raw_string] = ids_list[0]
            return ids_list[0]

    def problem_from_id(self, id_problem):
        return self.__problems[id_problem]

    def id_from_problem(self, problem):
        return self.__ids[problem]

    def clean_cache(self):
        self.__cache = {}
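A brief usage sketch for the class above; the example mention is illustrative, and the default ges-health-problems.json file must exist for the constructor to run.

# Hypothetical usage: map a free-text diagnosis to a GES problem (returns -1 / UNK when no match).
matcher = GESSimpleMatcher()
problem_id = matcher.get_ges_id('diabetes mellitus tipo 2')
print(problem_id, matcher.problem_from_id(problem_id))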