Example #1
def output_similar_strings_of_each_line(path, measure):
    strings = []
    with open(path, "r") as lines:
        for line in lines:
            strings.append(line.rstrip("\r\n"))

    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for string in strings:
        db.add(string)

    db.save("companies.db")

    dbl = DictDatabase.load("companies.db")

    searcher = Searcher(dbl, measure)
    profiler.start()  # `profiler` is assumed to be a module-level profiler object (e.g. a pyinstrument Profiler)

    for string in strings:
        result = searcher.search(string, 0.8)
        # result = [str(np.round(x[0], 5)) + ' ' + x[1] for x in searcher.ranked_search(string, 0.8)]
        # print("\t".join([string, ",".join(result)]))

    profiler.stop()

    profiler.print()
    profiler.open_in_browser()
Example #2
class TestDict(TestCase):
    strings = ['a', 'ab', 'abc', 'abcd', 'abcde']

    def setUp(self):
        self.db = DictDatabase(CharacterNgramFeatureExtractor(2))
        for string in self.strings:
            self.db.add(string)

    def test_strings(self):
        self.assertEqual(self.db.strings, self.strings)

    def test_min_feature_size(self):
        self.assertEqual(self.db.min_feature_size(),
                         min(map(lambda x: len(x) + 1, self.strings)))

    def test_max_feature_size(self):
        self.assertEqual(self.db.max_feature_size(),
                         max(map(lambda x: len(x) + 1, self.strings)))

    def test_lookup_strings_by_feature_set_size_and_feature(self):
        self.assertEqual(
            self.db.lookup_strings_by_feature_set_size_and_feature(4, 'ab_1'),
            set(['abc']))
        self.assertEqual(
            self.db.lookup_strings_by_feature_set_size_and_feature(3, 'ab_1'),
            set(['ab']))
        self.assertEqual(
            self.db.lookup_strings_by_feature_set_size_and_feature(2, 'ab_1'),
            set([]))
Example #3
def get_umls_data():
    umls_data = pd.read_csv(umls_df_data_path)
    print(f"Got UMLS data at length {len(umls_data)}")

    acronyms_umls_df = pd.read_csv(acronyms_dir + os.sep + 'acronyms_terms.csv')
    umls_data = pd.concat([umls_data, acronyms_umls_df])

    cuiless_umls_df = pd.read_csv(cuiless_dir + os.sep + 'cuiless_terms.csv')
    umls_data = pd.concat([umls_data, cuiless_umls_df])

    umls_data.reset_index(inplace=True)

    heb_umls_list = list(umls_data['HEB'].values)
    eng_umls_list = list(umls_data[STRING_COLUMN].values)

    heb_db = DictDatabase(CharacterNgramFeatureExtractor(2))
    eng_db = DictDatabase(CharacterNgramFeatureExtractor(2))

    for heb_w in heb_umls_list:
        heb_db.add(heb_w)

    for eng_w in eng_umls_list:
        lower_eng_w = eng_w.lower()
        eng_db.add(lower_eng_w)

    return heb_db, eng_db, umls_data
Example #4
def create_umls_ss_db(umls_kb, char_ngram_len=3, n_max_tokens=5):

    logging.info('Loading scispacy ...')
    import spacy
    sci_nlp = spacy.load('en_core_sci_md',
                         disable=['tagger', 'parser', 'ner'])

    simstring_db = DictDatabase(CharacterNgramFeatureExtractor(char_ngram_len))

    # preprocessing aliases and labels
    logging.info('Preprocessing aliases ...')
    alias_mapping = defaultdict(set)

    aliases = []
    for cui in umls_kb.get_all_cuis():

        cui_aliases = set(
            [a.lower() for a in umls_kb.get_aliases(cui, include_name=True)])

        for alias in cui_aliases:

            alias_chars = set(alias)
            if len(alias_chars.intersection(fb_punctuation)) > 0:
                continue

            elif alias in en_stopwords:
                continue

            elif alias.isnumeric():
                continue

            # Use the same tokenizer as when splitting MedMentions.
            alias_doc = sci_nlp(alias)
            if len(alias_doc) > n_max_tokens:  # gets too big without restrictions
                continue

            alias_mapping[alias].add(cui)
            aliases.append(alias)

    logging.info('Adding to DB ...')
    for alias_idx, alias in enumerate(aliases):
        simstring_db.add(alias)
        if alias_idx % 1000000 == 0:
            logging.info('At %d/%d ...' % (alias_idx, len(aliases)))

    # setting paths
    db_path = '%s.aliases.%dgram.%dtoks.db' % (umls_kb.umls_version,
                                               char_ngram_len, n_max_tokens)
    map_path = '%s.aliases.%dtoks.map' % (umls_kb.umls_version, n_max_tokens)

    logging.info('Storing DB ...')
    with open(db_path, 'wb') as f:
        pickle.dump(simstring_db, f)

    logging.info('Storing Alias Mapping ...')
    with open(map_path, 'wb') as f:
        alias_mapping = dict(alias_mapping)
        pickle.dump(alias_mapping, f)
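A minimal sketch of reading the two pickles written above back in and querying them; the file names, the query string and the 0.7 threshold are illustrative assumptions, not part of the original code.

import pickle
from simstring.measure.cosine import CosineMeasure
from simstring.searcher import Searcher

# Placeholder paths standing in for whatever create_umls_ss_db() produced for your UMLS version.
with open('umls.aliases.3gram.5toks.db', 'rb') as f:
    simstring_db = pickle.load(f)
with open('umls.aliases.5toks.map', 'rb') as f:
    alias_mapping = pickle.load(f)

searcher = Searcher(simstring_db, CosineMeasure())
for alias in searcher.search('heart attack', 0.7):
    print(alias, alias_mapping[alias])  # the alias and the set of CUIs it maps to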
Example #5
def load_disambiguation():
    db = DictDatabase(WordNgramFeatureExtractor(2))

    with open(LOCATION_WIKIPEDIA_DISAMBIGUATION) as disambig_file:
        for line in disambig_file:
            # strip the "_(disambiguation)" suffix, replace underscores with spaces, and lowercase
            r = line.replace("_(disambiguation)", "").replace("_", " ").lower()
            db.add(r.strip())

    return Searcher(db, JaccardMeasure())
Example #6
class MagnitudeOOV():
    def __init__(self, word2vec):
        self.w2v = word2vec
        self.embedding_dim = self.w2v.vector_size
        self.vocab = set(self.w2v.vocab.keys())

        self.db = DictDatabase(CharacterNgramFeatureExtractor(2))
        for vocab_word in self.vocab:
            self.db.add(vocab_word)

    def generate_pseudorandom_vector(self, word):
        """calculate PRVG form CGRAM"""
        vectors = []
        ngram_list = character_ngram(word)
        for ngram in ngram_list:
            np.random.seed(seed(ngram))  # `seed` is assumed to map an n-gram to a deterministic integer seed
            vectors.append(np.random.uniform(-1, 1, self.embedding_dim))
        return np.mean(vectors, axis=0)

    def similar_words_top_k(self, query, measure=CosineMeasure(), initial_threshold=0.99, dec_step=0.01, k=3):
        """search similar words by using edit distance"""
        searcher = Searcher(self.db, measure)
        t = initial_threshold
        similar_words = []
        while True:
            similar_words = searcher.search(query, t)

            if len(similar_words) >= k or t <= 0.1:
                break
            t -= dec_step

        if len(similar_words) > k:
            np.random.seed(42)  # fix the seed so the sampled subset is reproducible
            return np.random.choice(similar_words, k, replace=False).tolist()
        else:
            return similar_words

    def generate_similar_words_vector(self, word):
        """calculate MATCH from similar words"""
        vectors = np.mean([self.w2v[w] for w in self.similar_words_top_k(word)], axis=0)
        return vectors

    def out_of_vocab_vector(self, word):
        vector = self.generate_pseudorandom_vector(word) * 0.3 + self.generate_similar_words_vector(word) * 0.7
        final_vector = vector / np.linalg.norm(vector)
        return final_vector

    def query(self, word):
        normalized_word = neologdn.normalize(word)

        if word in self.vocab:
            return self.w2v[word]
        elif normalized_word in self.vocab:
            return self.w2v[normalized_word]
        else:
            return self.out_of_vocab_vector(normalized_word)
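A minimal usage sketch of the class above, assuming an older-gensim-style KeyedVectors object that exposes .vector_size, .vocab and item lookup (which is what the constructor expects); the vector file path and query word are illustrative.

from gensim.models import KeyedVectors

w2v = KeyedVectors.load_word2vec_format('vectors.bin', binary=True)  # placeholder path
oov = MagnitudeOOV(w2v)
vector = oov.query('some-unseen-word')  # in-vocabulary words return their w2v vector; OOV words get the blended PRVG/MATCH vector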
Example #7
def output_similar_strings_of_each_line(path):
    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    with open(path, 'r') as lines:
        for line in lines:
            strings = line.rstrip('\r\n')
            db.add(strings)

    searcher = Searcher(db, CosineMeasure())
    with open(path, 'r') as lines:
        for line in lines:
            strings = line.rstrip('\r\n')
            result = [str(round(x[0], 5)) + ' ' + x[1] for x in searcher.ranked_search(strings, 0.8)]
            print("\t".join([strings, ",".join(result)]))
Example #8
def simstring_database(umls, nchar_val):
    db = DictDatabase(CharacterNgramFeatureExtractor(nchar_val))
    term_to_cui = dict()

    for value in umls:
        try:
            data = value.split('\t')
            cui = data[0]
            term = data[1].lower()
            db.add(term)
            term_to_cui[term] = cui
        except Exception:
            # skip malformed lines that do not split into a CUI and a term
            continue

    with open('db.pickle', 'wb') as f:
        pickle.dump(db, f)
    with open('term_to_cui.pickle', 'wb') as f:
        pickle.dump(term_to_cui, f)
Example #9
def output_similar_strings_of_each_line(path):
    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    with open(path, 'r') as lines:
        for line in lines:
            strings = line.rstrip('\r\n')
            db.add(strings)

    searcher = Searcher(db, CosineMeasure())
    with open(path, 'r') as lines:
        for line in lines:
            strings = line.rstrip('\r\n')
            result = [
                str(round(x[0], 5)) + ' ' + x[1]
                for x in searcher.ranked_search(strings, 0.8)
            ]
            print("\t".join([strings, ",".join(result)]))
Example #10
 def setUp(self) -> None:
     db = DictDatabase(CharacterNgramFeatureExtractor(2))
     db.add("foo")
     db.add("bar")
     db.add("fooo")
     db.add("food")
     db.add("fool")
     db.add("follow")
     self.searcher = Searcher(db, JaccardMeasure())
Example #11
def construct_ontology(ontology_data):
    '''
    Create an n-char simstring database and
    term-to-code mapping to enable rapid ontology
    querying
    '''
    database = DictDatabase(CharacterNgramFeatureExtractor(2))

    term_to_cui = {}
    for entry in ontology_data:
        entry_values = entry.split('\t')
        if len(entry_values) == 2:
            term = clean_selected_term(entry_values[1])
            term_to_cui[term] = entry_values[0].strip()

    for term in term_to_cui.keys():
        term = clean_selected_term(term)
        database.add(term)

    return database, term_to_cui
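A minimal sketch of how the database and term-to-CUI map returned above might be queried together; the CosineMeasure, the 0.8 threshold and the lookup_codes helper are illustrative assumptions, not part of the original source.

from simstring.measure.cosine import CosineMeasure
from simstring.searcher import Searcher

def lookup_codes(database, term_to_cui, query, threshold=0.8):
    # ranked_search yields (similarity, matched_term) pairs at or above the threshold
    searcher = Searcher(database, CosineMeasure())
    return [(term, term_to_cui[term], score)
            for score, term in searcher.ranked_search(query, threshold)]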
Example #12
class TestDict(TestCase):
    strings = ['a', 'ab', 'abc', 'abcd', 'abcde']

    def setUp(self):
        self.db = DictDatabase(CharacterNgramFeatureExtractor(2))
        for string in self.strings:
            self.db.add(string)

    def test_strings(self):
        self.assertEqual(self.db.strings, self.strings)

    def test_min_feature_size(self):
        self.assertEqual(self.db.min_feature_size(), min(map(lambda x: len(x) + 1, self.strings)))

    def test_max_feature_size(self):
        self.assertEqual(self.db.max_feature_size(), max(map(lambda x: len(x) + 1, self.strings)))

    def test_lookup_strings_by_feature_set_size_and_feature(self):
        self.assertEqual(self.db.lookup_strings_by_feature_set_size_and_feature(4, 'ab'), set(['abc']))
        self.assertEqual(self.db.lookup_strings_by_feature_set_size_and_feature(3, 'ab'), set(['ab']))
        self.assertEqual(self.db.lookup_strings_by_feature_set_size_and_feature(2, 'ab'), set([]))
Example #13
def make_change_image_dict(drink_names):
    import re
    import json
    import difflib
    from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
    from simstring.measure.cosine import CosineMeasure
    from simstring.database.dict import DictDatabase
    from simstring.searcher import Searcher

    with open('jsons/theCocktailDB_allData_20181010.json', 'r', encoding="utf-8_sig") as ff:
        json_data2 = json.load(ff)

    # Lists of strings to compare against each other for similarity
    STR_db = [re.sub(r'[!-/:-@[-`{-~]', " ", d["en"]) for d in drink_names]
    TCD_db = {re.sub(r'[!-/:-@[-`{-~]', " ", d["drinks"][0]["strDrink"]): d["drinks"][0]["strDrinkThumb"] for d in json_data2}
    TCD_name_db = list(TCD_db.keys())
    count = 0
    length = len(STR_db)
    result_dict = {}
    change_image_dict = {}

    
    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for str1 in STR_db:
        db.add(str1)
    
    for str2 in TCD_name_db:
        result_dict[str2] = {}
        searcher = Searcher(db, CosineMeasure())
        i = 1.0
        # Compute similarities; scores come back in the range 0.0-1.0
        flag = False
        for str1 in STR_db:
            s = difflib.SequenceMatcher(None, str2, str1).ratio()
            if s > 0.75:
                flag = True
                if str1 in result_dict[str2]:
                    d = result_dict[str2][str1]
                    # update the running average
                    d = [(d[0]*d[1]+s)/(d[1]+1), d[1]+1]
                    result_dict[str2][str1] = d
                else:
                    result_dict[str2].setdefault(str1, [s, 1])

        temp = []
        # Lower the simstring threshold step by step down to 0.65, collecting matches
        while i >= 0.65:
            result = searcher.search(str2, i)
            if len(result):
                flag = True
                for str1 in result:
                    if str1 in temp:
                        continue
                    temp += [str1]
                    if str1 in result_dict[str2]:
                        d = result_dict[str2][str1]
                        # update the running average
                        d = [(d[0]*d[1]+i)/(d[1]+1), d[1]+1]
                        result_dict[str2][str1] = d
                    else:
                        result_dict[str2].setdefault(str1, [i, 1])
            i -= 0.001
        if flag:
            count += 1

    with open("./search_log.txt", "w+", encoding="utf-8_sig") as f:
        real_count = 0
        for str2 in TCD_name_db:
            print("\n", file=f)
            print("\n")
            print(">> "+str2, file=f)
            print(">> "+str2)
            M = 0.0
            name = ""
            for key, value_list in result_dict[str2].items():
                if (M < value_list[0]):
                    name = key
                    M = value_list[0]
            print("  "+name+": "+str(M), file=f)
            if (M != 0):
                if (M >= 0.76):
                    print("  "+name+": "+str(M))
                    print("ok", file=f)
                    print("ok")
                    change_image_dict[name] = TCD_db[str2]
                    real_count += 1
                else:
                    print("  "+name+": "+str(M))
                    print("out", file=f)
                    print("out")
            

        print("\nmatch is {count}/{length} but real_match is {real_count}/{length}".format(count=count, real_count=real_count, length=length), file=f)
        print("\nmatch is {count}/{length} but real_match is {real_count}/{length}".format(count=count, real_count=real_count, length=length))

    return change_image_dict
Example #14
 def setUp(self):
     db = DictDatabase(CharacterNgramFeatureExtractor(2))
     for string in self.strings:
         db.add(string)
     self.searcher = Searcher(db, CosineMeasure())
Example #15
 def setUp(self):
     db = DictDatabase(CharacterNgramFeatureExtractor(2))
     for string in self.strings:
         db.add(string)
     self.searcher = Searcher(db, CosineMeasure())
Example #16
class UMLSMapper:
    # https://www.ncbi.nlm.nih.gov/books/NBK9685/table/ch03.T.concept_names_and_sources_file_mr/
    def __init__(self,
                 from_dir: str = None,
                 json_path: str = "mapper.json",
                 umls_words: Iterable[str] = None):
        # self.db = DictDatabase(WordNgramFeatureExtractor(2))

        self.db = DictDatabase(CharacterNgramFeatureExtractor(2))

        if from_dir:
            json_path = os.path.join(from_dir, json_path)
            if os.path.exists(json_path):
                print(f"initialize {self.__class__.__name__}... Load json")
                self.umls_dict, self.umls_reverse_dict = self.load_from_json(
                    json_path)
                self.add_words_to_db(self.umls_dict.keys())
            else:
                print(f"initialize {self.__class__.__name__}... Load dir")
                self.umls_dict, self.umls_reverse_dict = self.load_umls_dict(
                    from_dir)
                self.add_words_to_db(self.umls_dict.keys())
                self.save_as_json(path=json_path)
        else:
            self.add_words_to_db(umls_words)

        # if from_dir:
        #     print(f"initialize {self.__class__.__name__}... Load dir")
        #     self.umls_dict, self.umls_reverse_dict = self.load_umls_dict(from_dir)
        #     self.add_words_to_db(self.umls_dict.keys())
        # elif json_path:
        #     print(f"initialize {self.__class__.__name__}... Load json")
        #     self.umls_dict, self.umls_reverse_dict = self.load_from_json(json_path)
        #     self.add_words_to_db(self.umls_dict.keys())
        # else:
        #     self.add_words_to_db(umls_words)

    def load_umls_dict(self, directory):
        path = os.path.join(directory, "GER_MRCONSO.RRF")
        df = pd.read_csv(path, delimiter="|", header=None)
        df.columns = [
            "CUI", "LAT", "TS", "LUI", "STT", "SUI", "ISPREF", "AUI", "SAUI",
            "SCUI", "SDUI", "SAB", "TTY", "CODE", "STR", "SRL", "SUPPRESS",
            "CVF", "NONE"
        ]
        df = df.drop(columns=['NONE'])
        dic = {row["STR"]: row["CUI"] for i, row in df.iterrows()}
        rev_dic = defaultdict(list)
        for key, value in dic.items():
            rev_dic[value].append(key)
        return dic, rev_dic

    def save_as_json(self, path: str):
        # copy the instance dict so that popping "db" does not delete the attribute from the object
        data = dict(self.__dict__)
        data.pop("db")
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=0)

    def load_from_json(self, path: str):
        with open(path, 'r', encoding='utf-8') as file:
            data = json.loads(file.read())
        return data["umls_dict"], data["umls_reverse_dict"]

    def add_words_to_db(self, words: Iterable[str]):
        for token in set(words):
            self.db.add(token)

    def search_term_sims(self, term: str) -> List[str]:
        searcher = Searcher(self.db, CosineMeasure())
        return searcher.search(term, 0.8)

    def search_all_term_sims(self, terms: List[str]) -> Dict[str, List[str]]:
        dic = {}
        for term in set(terms):
            related_terms = self.search_term_sims(term)
            if len(related_terms) > 0:
                dic[term] = related_terms
        return dic
        # return {term: self.search_term_sims(term) for term in set(terms)}

    def standardize_words(self, tokens: List[str]):
        concept_dict = self.search_all_term_sims(tokens)
        standardized_tokens = []
        for token in tokens:
            mapping = concept_dict.get(token)
            if mapping:
                standardized_tokens.append(mapping[0])
            else:
                standardized_tokens.append(token)
        return standardized_tokens

    def standardize_documents(self, documents: List[List[str]]):
        concept_dict = self.search_all_term_sims(list(chain(*documents)))
        standardized_documents = []
        for document in tqdm(documents):
            standardized_tokens = []
            for token in document:
                mapping = concept_dict.get(token)
                if mapping:
                    standardized_tokens.append(mapping[0])
                else:
                    standardized_tokens.append(token)
            standardized_documents.append(standardized_tokens)
        return standardized_documents

    def get_umls_vectors_only(self, vectors: gensim.models.KeyedVectors):
        medical_concepts = [
            word for word in vectors.index2word
            if word in self.umls_dict.values()
        ]
        concept_vecs = {
            concept: vectors.get_vector(concept)
            for concept in medical_concepts
        }
        return concept_vecs

    def un_umls(self, concept, single_return=True):
        res = self.umls_reverse_dict.get(concept)
        if res is None:
            return concept

        if single_return:
            return res[0]
        else:
            return res

    def replace_umls(self, tokens: List[str]) -> List[str]:
        return [self.un_umls(token) for token in tokens if self.un_umls(token)]

    def umls_code(self, token, delete_non_umls):
        umls_code = self.umls_dict.get(token)
        if umls_code is None:
            if delete_non_umls:
                return None
            else:
                return token
        else:
            return umls_code

    def replace_with_umls(self,
                          tokens: List[str],
                          delete_non_umls=False) -> List[str]:
        return [
            self.umls_code(token, delete_non_umls) for token in tokens
            if self.umls_code(token, delete_non_umls)
        ]

    def replace_documents_token_based(self, documents: List[str], delete_non_umls=False, tokenize: bool = True) \
            -> Union[List[List[str]], List[str]]:
        tokenized_documents = [sentence.split() for sentence in documents]
        if tokenize:
            return [[
                self.umls_code(token, delete_non_umls) for token in tokens
                if self.umls_code(token, delete_non_umls)
            ] for tokens in tokenized_documents]
        else:
            return [
                ' '.join([
                    self.umls_code(token, delete_non_umls) for token in tokens
                    if self.umls_code(token, delete_non_umls)
                ]) for tokens in tokenized_documents
            ]

    def spacy_tokenize(self,
                       documents: List[str],
                       nlp=None) -> List[List[str]]:
        if nlp is None:
            nlp = spacy.load('de_core_news_sm')
        doc_pipe = list(
            nlp.pipe(documents, disable=["tagger", "parser", "ner"]))
        tokenized_docs = [[
            token.text for token in doc
        ] for doc in tqdm(doc_pipe, desc="Tokenize", total=len(documents))]
        return tokenized_docs

    def replace_documents_with_spacy_multiterm(
            self,
            documents: List[str],
            tokenize: bool = True) -> List[List[str]]:
        nlp = spacy.load('de_core_news_sm')
        matcher = PhraseMatcher(nlp.vocab)
        terms = self.umls_dict.keys()
        doc_pipe = list(
            nlp.pipe(documents, disable=["tagger", "parser", "ner"]))
        # Only run nlp.make_doc to speed things up
        patterns = [nlp.make_doc(term) for term in terms]
        matcher.add("TerminologyList", None, *patterns)
        replaced_docs = []

        for doc in tqdm(doc_pipe,
                        desc="Replace with concepts",
                        total=len(documents)):
            text_doc = doc.text
            matches = matcher(doc)
            concepts = []
            for match_id, start, end in matches:
                span = doc[start:end]
                concepts.append(span.text)

            concepts.sort(key=lambda s: len(s), reverse=True)
            for concept in concepts:
                text_doc = text_doc.replace(concept, self.umls_dict[concept])

            replaced_docs.append(text_doc)

            # tokens = [token for token in text_doc.split()]
            # replaced_docs.append(tokens)
        if tokenize:
            replaced_docs = self.spacy_tokenize(replaced_docs, nlp)
        # doc_pipe = list(nlp.pipe(replaced_docs, disable=["tagger", "parser", "ner"]))
        # replaced_docs = [[token.text for token in doc] for doc in tqdm(doc_pipe, desc="Tokenize", total=len(documents))]

        return replaced_docs
Example #17
class GESSimpleMatcher:
    '''
    Class for simple matching of GES health problems. It only considers string similarity,
    nothing very sophisticated. Based on code by Fabián Villena (https://fabianvillena.cl).
    It currently uses a feature extractor that combines characters and words, with a few
    tweaks specific to GES texts.
    TODO:
        - try somewhat more sophisticated matching techniques
        - complete the documentation
    '''
    def __init__(
            self, 
            base_ges_data='ges_utils/data/ges-health-problems.json', 
            no_ges_str='UNK',
            alpha=0.2,
            n_chars=4, 
            n_words=[2], 
            special_words=['vih']
        ):

        self.alpha = alpha

        with open(base_ges_data,'r',encoding='utf-8') as f:
            self.__ges_dict = json.load(f)
        
        # feature extractor
        extractor = GESSyntacticFeatureExtractor(
                        n_chars=n_chars, 
                        n_words=n_words, 
                        special_words=special_words
                    )
        self.__db = DictDatabase(extractor)
        
        # Cache
        self.__cache = {}
        
        self.__problems_from_disease = defaultdict(set)
        self.__ids_from_disease = defaultdict(set)
        self.__problems = {}
        self.__ids = {}
        
        self.__problems[-1] = no_ges_str
        self.__ids[no_ges_str] = -1
        
        # For now the ids are simply the order of the problems in the json
        # TODO: decide whether the ids should come from some standard source
        for i, problem in enumerate(self.__ges_dict):
            
            problem_id = i+1
            
            self.__problems[problem_id] = problem
            self.__ids[problem] = problem_id
            
            # also register the problem itself as if it were a disease
            self.__problems_from_disease[problem].add(problem)
            self.__ids_from_disease[problem].add(problem_id)
            
            # add to the DB
            self.__db.add(problem)
            
            for disease in self.__ges_dict[problem]:
                
                self.__problems_from_disease[disease].add(problem)
                self.__ids_from_disease[disease].add(problem_id)
                
                # add to the DB
                self.__db.add(disease)
        
        # TODO: add additional data for matching diseases and problems

        self.__searcher = Searcher(self.__db, CosineMeasure())

    def get_ranking_ges_diseases(self, raw_string):
        ranking = self.__searcher.ranked_search(raw_string, alpha=self.alpha)
        return ranking

    def get_ges_problem(self, raw_string):
        problem_id = self.get_ges_id(raw_string)
        problem = self.__problems[problem_id]
        return problem        

    def get_ges_id(self, raw_string):
        # if we have already computed it, return the cached value
        if raw_string in self.__cache:
            return self.__cache[raw_string]
        
        # otherwise, compute it
        ranking = self.get_ranking_ges_diseases(raw_string)

        if ranking:
            # ipdb.set_trace()
            (v, disease) = ranking[0]
            problem_ids = self.__ids_from_disease[disease]
            problem_id = list(problem_ids)[0]
            self.__cache[raw_string] = problem_id
            return problem_id

        else:
            self.__cache[raw_string] = -1
            return -1


    def get_possible_ges_ids(self, raw_string):
        
        to_search = raw_string
        
        problem_ids = []
        
        # look up the candidate diseases
        candidate_diseases = self.__searcher.search(to_search, alpha=self.alpha) 
        
        for disease in candidate_diseases:
            problem_ids.extend(self.__ids_from_disease[disease])
          
        problem_ids_counter = Counter(problem_ids)
        ordered_ids = [i for i,_ in problem_ids_counter.most_common()]
        
        return ordered_ids

    def get_ges_id_prev(self, raw_string):
        
        # if we have already computed it, return the cached value
        if raw_string in self.__cache:
            return self.__cache[raw_string]
        
        ids_list = self.get_possible_ges_ids(raw_string)
        if not ids_list:
            self.__cache[raw_string] = -1
            return -1
        else:
            self.__cache[raw_string] = ids_list[0]
            return ids_list[0]

    def problem_from_id(self, id_problem):
        return self.__problems[id_problem]

    def id_from_problem(self, problem):
        return self.__ids[problem]

    def clean_cache(self):
        self.__cache = {}
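A minimal usage sketch of the matcher defined above; it assumes the bundled ges-health-problems.json file is available at the default path and that GESSyntacticFeatureExtractor is importable, and the query string is illustrative only.

matcher = GESSimpleMatcher()
print(matcher.get_ges_problem('infarto agudo del miocardio'))       # best-matching GES problem, or 'UNK'
print(matcher.get_possible_ges_ids('infarto agudo del miocardio'))  # candidate problem ids, most frequent first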
Example #18
#############################

# Read in branded foods CSV and clean it
df = pd.read_csv('branded_food.csv')
all_ingredients_final = get_cleaned_ingredients_list(df)
# Get a count for all the ingredients to be used by Peter Norvig Implementation
ingredients_count = Counter(all_ingredients_final)

##############################################
# Peter Norvig SimString Implementation Code #
##############################################

# Populate database with all ingredients
db = DictDatabase(CharacterNgramFeatureExtractor(2))
for ingredient in all_ingredients_final:
    db.add(ingredient)
# Create searcher object to be used by candidates function
searcher = Searcher(db, CosineMeasure())

# Functions


def probability(word, N=sum(ingredients_count.values())):
    """
    Return the probability of the word appearing in the text.
    Correctly spelled words will usually have a higher count, and therefore a higher probability, than their misspellings.
    """
    return ingredients_count[word] / N

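# Illustrative only (counts are made up): with ingredients_count = Counter({'salt': 50, 'sal': 2})
# and N = 52, probability('salt') == 50/52 while probability('sal') == 2/52, so the correctly
# spelled ingredient outranks its misspelling when scoring candidate corrections.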

def candidates(word, searcher):