Пример #1
0
class TestRankedSearchJaccard(TestCase):
    """Ranked-search checks on a character-bigram index scored with JaccardMeasure."""

    # Strings indexed before each test; "bar" and "follow" are never part of
    # the expected results below.
    _WORDS = ("foo", "bar", "fooo", "food", "fool", "follow")

    def setUp(self) -> None:
        index = DictDatabase(CharacterNgramFeatureExtractor(2))
        for word in self._WORDS:
            index.add(word)
        self.searcher = Searcher(index, JaccardMeasure())

    def test_ranked_search_example1(self):
        # Threshold 0.5: only the two closest strings are expected.
        results = self.searcher.ranked_search("fo", 0.5)
        expected = OrderedDict([("foo", 0.75), ("fooo", 0.6)])
        self.assertEqual(results, expected)

    def test_ranked_search_example2(self):
        # Threshold 0.3: "food" and "fool" are expected as well.
        results = self.searcher.ranked_search("fo", 0.3)
        one_third = 0.3333333333333333
        expected = OrderedDict([
            ("foo", 0.75),
            ("fooo", 0.6),
            ("food", one_third),
            ("fool", one_third),
        ])
        self.assertEqual(results, expected)
Пример #2
0
class TestRankedSearchCosine(TestCase):
    """Ranked-search checks on a character-bigram index scored with CosineMeasure."""

    # Strings indexed before each test; "bar" and "follow" are never part of
    # the expected results below.
    _WORDS = ("foo", "bar", "fooo", "food", "fool", "follow")

    def setUp(self) -> None:
        index = DictDatabase(CharacterNgramFeatureExtractor(2))
        for word in self._WORDS:
            index.add(word)
        self.searcher = Searcher(index, CosineMeasure())

    def test_ranked_search_example1(self):
        # Threshold 0.5: four strings are expected.
        results = self.searcher.ranked_search("fo", 0.5)
        expected = OrderedDict([
            ("foo", 0.8660254037844387),
            ("fooo", 0.7745966692414834),
            ("food", 0.5163977794943222),
            ("fool", 0.5163977794943222),
        ])
        self.assertEqual(results, expected)

    def test_ranked_search_example2(self):
        # Threshold 0.6: "food" and "fool" are no longer expected.
        results = self.searcher.ranked_search("fo", 0.6)
        expected = OrderedDict([
            ("foo", 0.8660254037844387),
            ("fooo", 0.7745966692414834),
        ])
        self.assertEqual(results, expected)
Пример #3
0
class SimString_UMLS(object):
    """Approximate alias matcher built on a pickled database and a cosine Searcher.

    Attributes:
        db: database object unpickled from ``db_path``.
        umls_db: caller-supplied object; only its ``get_sts(cui)`` method is used.
        cui_mapping: mapping unpickled from ``cui_mapping_path``; indexed by
            alias and iterated to obtain CUIs.
        searcher: ``Searcher(db, CosineMeasure())`` created by :meth:`load`.
        alpha: similarity threshold forwarded to ``ranked_search``.
    """

    def __init__(self, umls_db, db_path, cui_mapping_path, alpha=0.5):
        self.db = None
        self.umls_db = umls_db
        self.cui_mapping = None
        self.searcher = None
        self.alpha = alpha

        self.load(db_path, cui_mapping_path)

    def load(self, db_path, cui_mapping_path):
        """Unpickle the database and the alias->CUI mapping, then build the searcher.

        SECURITY: ``pickle.load`` can execute arbitrary code contained in the
        file. Only point ``db_path`` / ``cui_mapping_path`` at trusted files.
        """
        logging.info('Loading DB ...')
        with open(db_path, 'rb') as db_f:
            self.db = pickle.load(db_f)

        logging.info('Loading Mapping ...')
        with open(cui_mapping_path, 'rb') as mapping_f:
            self.cui_mapping = pickle.load(mapping_f)

        logging.info('Creating Searcher ...')
        # A new searcher object means a new cache key in _cached_ranked_search,
        # so results computed against a previously loaded DB are never reused.
        self.searcher = Searcher(self.db, CosineMeasure())

    @staticmethod
    @lru_cache(262144)
    def _cached_ranked_search(searcher, text, alpha):
        """Run ``ranked_search`` and memoize on ``(searcher, text, alpha)``.

        Keying on the searcher and the threshold (instead of on the matcher
        instance) keeps the cache correct when ``alpha`` is changed or the
        database is reloaded.
        NOTE(review): assumes the searcher object is hashable -- confirm.
        """
        ranked = searcher.ranked_search(text, alpha=alpha)
        # Flip (sim, alias) -> (alias, sim) to be consistent with other matchers.
        return [(alias, sim) for sim, alias in ranked]

    def match(self, text):
        """Return ``[(alias, similarity), ...]`` for ``text`` at the current ``alpha``.

        The returned list is shared with the cache; callers must not mutate it.
        """
        return self._cached_ranked_search(self.searcher, text, self.alpha)

    def match_cuis(self, text):
        """Return ``[(cui, similarity), ...]`` keeping the first hit of each CUI.

        Aliases are visited in the order :meth:`match` yields them, so each CUI
        gets the similarity of the first alias that maps to it.
        """
        cui_results = []
        included_cuis = set()
        for alias, sim in self.match(text):
            for cui in self.cui_mapping[alias]:
                if cui in included_cuis:
                    continue
                included_cuis.add(cui)
                cui_results.append((cui, sim))

        return cui_results

    def match_sts(self, text):
        """Return ``[(st, similarity), ...]`` with the best similarity per ``st``.

        Each ``st`` comes from ``umls_db.get_sts(cui)`` (presumably semantic
        types -- confirm). The result is sorted by similarity and then by
        ``st``, both descending.
        """
        best_by_st = {}
        for cui, sim in self.match_cuis(text):
            for st in self.umls_db.get_sts(cui):
                best_by_st[st] = max(sim, best_by_st.get(st, sim))

        return sorted(best_by_st.items(), key=lambda x: (x[1], x[0]), reverse=True)
Пример #4
0
def output_similar_strings_of_each_line(path):
    """Print each line of ``path`` followed by the indexed lines similar to it.

    The file is read twice: the first pass indexes every line (line endings
    stripped) with character bigrams, the second pass queries the index with
    each line using the cosine measure and a threshold of 0.8.

    One output line is printed per input line: the query, a tab, then a
    comma-separated list of hits. Each hit is rendered as its first element
    rounded to 5 decimals, a space, and its second element.
    """
    def stripped_lines():
        # Opens the file afresh on every call so each pass starts at the top.
        with open(path, 'r') as handle:
            for raw in handle:
                yield raw.rstrip('\r\n')

    database = DictDatabase(CharacterNgramFeatureExtractor(2))
    for entry in stripped_lines():
        database.add(entry)

    searcher = Searcher(database, CosineMeasure())
    for query in stripped_lines():
        hits = searcher.ranked_search(query, 0.8)
        formatted = [str(round(hit[0], 5)) + ' ' + hit[1] for hit in hits]
        print(query + "\t" + ",".join(formatted))
Пример #5
0
class TestSearcher(TestCase):
    """Checks for ``search`` and ``ranked_search`` on a small prefix corpus."""

    strings = ["a", "ab", "abc", "abcd", "abcde"]

    def setUp(self):
        # Character-bigram index over ``strings``, scored with the cosine measure.
        index = DictDatabase(CharacterNgramFeatureExtractor(2))
        for entry in self.strings:
            index.add(entry)
        self.searcher = Searcher(index, CosineMeasure())

    def test_search1(self):
        found = self.searcher.search("a", 1.0)
        self.assertEqual(found, ["a"])

    def test_search2(self):
        search = self.searcher.search
        self.assertEqual(search("ab", 0.5), ["ab", "abc", "abcd"])
        self.assertEqual(search("ab", 1.0), ["ab"])
        self.assertEqual(search("ab", 0.9), ["ab"])

    def test_search3(self):
        search = self.searcher.search
        self.assertEqual(search("abc", 1.0), ["abc"])
        self.assertEqual(search("abc", 0.9), ["abc"])

    def test_search4(self):
        search = self.searcher.search
        self.assertEqual(search("abcd", 1.0), ["abcd"])
        self.assertEqual(search("abcd", 0.9), ["abcd"])

    def test_ranked_search(self):
        exact = self.searcher.ranked_search("abcd", 1.0)
        self.assertEqual(exact, OrderedDict([("abcd", 1.0)]))

        loose = self.searcher.ranked_search("ab", 0.41)
        expected = OrderedDict([
            ("ab", 1.0),
            ("abc", 0.5773502691896258),
            ("abcd", 0.5163977794943222),
            ("abcde", 0.47140452079103173),
        ])
        self.assertEqual(loose, expected)
Пример #6
0
def output_similar_strings_of_each_line(path):
    """Print, for every line in ``path``, the indexed lines that resemble it.

    Pass one adds each line (without its trailing line ending) to a
    character-bigram database. Pass two re-reads the file and runs a ranked
    cosine search with threshold 0.8 for each line.

    Output format per line: ``<query>\\t<hit>,<hit>,...`` where each hit is the
    first element of the search result rounded to 5 decimals, a space, and the
    second element.
    """
    index = DictDatabase(CharacterNgramFeatureExtractor(2))
    with open(path, 'r') as source:
        for raw_line in source:
            index.add(raw_line.rstrip('\r\n'))

    finder = Searcher(index, CosineMeasure())
    with open(path, 'r') as source:
        for raw_line in source:
            query = raw_line.rstrip('\r\n')
            rendered = []
            for hit in finder.ranked_search(query, 0.8):
                rendered.append(str(round(hit[0], 5)) + ' ' + hit[1])
            print(query + "\t" + ",".join(rendered))
Пример #7
0
class GESSimpleMatcher:
    """Simple matcher for GES health problems.

    Only string similarity is considered, nothing very sophisticated. Based on
    code by Fabián Villena (https://fabianvillena.cl). It currently uses a
    feature extractor that combines characters and words and has some handling
    specific to GES texts.

    TODO:
        - try somewhat more sophisticated matching techniques
        - complete the documentation
    """

    def __init__(
            self,
            base_ges_data='ges_utils/data/ges-health-problems.json',
            no_ges_str='UNK',
            alpha=0.2,
            n_chars=4,
            n_words=None,
            special_words=None
        ):
        """Load the problem/disease JSON and index every name for searching.

        Args:
            base_ges_data: path to a UTF-8 JSON file mapping each problem name
                to an iterable of disease names.
            no_ges_str: label associated with id ``-1`` (no match).
            alpha: similarity threshold passed to the searcher.
            n_chars: forwarded to ``GESSyntacticFeatureExtractor``.
            n_words: forwarded to the extractor; ``None`` means ``[2]``.
            special_words: forwarded to the extractor; ``None`` means ``['vih']``.
        """
        # Fresh lists per instance: list literals used as defaults would be
        # shared by every instance created without these arguments.
        if n_words is None:
            n_words = [2]
        if special_words is None:
            special_words = ['vih']

        self.alpha = alpha

        with open(base_ges_data, 'r', encoding='utf-8') as f:
            self.__ges_dict = json.load(f)

        # Feature extractor
        extractor = GESSyntacticFeatureExtractor(
                        n_chars=n_chars,
                        n_words=n_words,
                        special_words=special_words
                    )
        self.__db = DictDatabase(extractor)

        # One cache per lookup strategy, so get_ges_id and get_ges_id_prev
        # never return each other's answers.
        self.__cache = {}
        self.__prev_cache = {}

        self.__problems_from_disease = defaultdict(set)
        self.__ids_from_disease = defaultdict(set)
        self.__problems = {}
        self.__ids = {}

        self.__problems[-1] = no_ges_str
        self.__ids[no_ges_str] = -1

        # For now the ids are the (1-based) order of the problems in the JSON.
        # TODO: decide whether the ids should come from some standard source.
        for problem_id, (problem, diseases) in enumerate(self.__ges_dict.items(), start=1):

            self.__problems[problem_id] = problem
            self.__ids[problem] = problem_id

            # Register the problem as if it were a disease too.
            self.__problems_from_disease[problem].add(problem)
            self.__ids_from_disease[problem].add(problem_id)

            # Add it to the database.
            self.__db.add(problem)

            for disease in diseases:

                self.__problems_from_disease[disease].add(problem)
                self.__ids_from_disease[disease].add(problem_id)

                # Add it to the database.
                self.__db.add(disease)

        # TODO: add extra data for matching diseases and problems.

        self.__searcher = Searcher(self.__db, CosineMeasure())

    def get_ranking_ges_diseases(self, raw_string):
        """Return the searcher's ranked results for ``raw_string`` at ``self.alpha``."""
        return self.__searcher.ranked_search(raw_string, alpha=self.alpha)

    def get_ges_problem(self, raw_string):
        """Return the problem name for the id chosen by :meth:`get_ges_id`."""
        return self.__problems[self.get_ges_id(raw_string)]

    def get_ges_id(self, raw_string):
        """Return the problem id of the top-ranked match, or ``-1`` if none.

        Results are memoized per ``raw_string`` until :meth:`clean_cache`.
        """
        # If we already computed it, return the stored value.
        try:
            return self.__cache[raw_string]
        except KeyError:
            pass

        # Otherwise compute it.
        ranking = self.get_ranking_ges_diseases(raw_string)

        if ranking:
            _, disease = ranking[0]
            # A disease can belong to several problems; the pick among them is
            # arbitrary (first element in set iteration order).
            problem_id = next(iter(self.__ids_from_disease[disease]))
        else:
            problem_id = -1

        self.__cache[raw_string] = problem_id
        return problem_id

    def get_possible_ges_ids(self, raw_string):
        """Return candidate problem ids, most frequently matched first.

        Every disease returned by the (unranked) search contributes the ids of
        its problems; ids are ordered by how many candidates voted for them.
        """
        problem_ids = []

        # Look up the candidate diseases.
        candidate_diseases = self.__searcher.search(raw_string, alpha=self.alpha)

        for disease in candidate_diseases:
            problem_ids.extend(self.__ids_from_disease[disease])

        return [problem_id for problem_id, _ in Counter(problem_ids).most_common()]

    def get_ges_id_prev(self, raw_string):
        """Return the most-voted id from :meth:`get_possible_ges_ids`, or ``-1``.

        Results are memoized per ``raw_string`` in a cache that is separate
        from the one used by :meth:`get_ges_id`.
        """
        # If we already computed it, return the stored value. The same key is
        # used for lookup and storage so the cache actually hits.
        try:
            return self.__prev_cache[raw_string]
        except KeyError:
            pass

        ids_list = self.get_possible_ges_ids(raw_string)
        problem_id = ids_list[0] if ids_list else -1
        self.__prev_cache[raw_string] = problem_id
        return problem_id

    def problem_from_id(self, id_problem):
        """Return the problem name registered for ``id_problem`` (KeyError if unknown)."""
        return self.__problems[id_problem]

    def id_from_problem(self, problem):
        """Return the id registered for ``problem`` (KeyError if unknown)."""
        return self.__ids[problem]

    def clean_cache(self):
        """Forget every memoized lookup result."""
        self.__cache = {}
        self.__prev_cache = {}