示例#1
0
def match_indication(indication, hpo_dict, fuzzy=False, min_similarity=0.85,
                     min_fuzzy_len=10):
    """
    Look up one indication in the HPO dictionary.

    The indication is cleaned with ``clean_pheno`` and compared against every
    key of ``hpo_dict``. With ``fuzzy=False`` a key matches only when it equals
    the cleaned query; with ``fuzzy=True`` a key matches when its
    token-sort ratio (rescaled by dividing by 100) is at least
    ``min_similarity``. The values stored under all matching keys are
    flattened and de-duplicated.

    Non-informative placeholder indications yield an empty list without any
    lookup. ``min_fuzzy_len`` is accepted but not used by this function.

    Returns a list of unique terms (order is that of set iteration).
    """

    # Placeholder values that carry no clinical information
    uninformative = (
        'NO_CLINICAL_INFORMATION_PROVIDED',
        'UNKNOWN',
        'NOT_INDICATED',
        'NOT_SPECIFIED',
        'NOT_PROVIDED',
        'NONE_PROVIDED',
    )
    if indication in uninformative:
        return []

    query = clean_pheno(indication)

    # Select the key predicate once, then make a single pass over the dict
    if fuzzy:
        def is_hit(key):
            return fuzz.token_sort_ratio(query, key) / 100 >= min_similarity
    else:
        def is_hit(key):
            return query == key

    unique_terms = set()
    for key, terms in hpo_dict.items():
        if is_hit(key):
            for term in terms:
                unique_terms.add(term)

    return list(unique_terms)
示例#2
0
    def fast_score(self, target_place: Loc, result_place: Loc) -> float:
        """Return a rough, cheap dissimilarity between target and result.

        Both places are reduced to their five-part title and compared with
        the token-sort ratio; the ratio is subtracted from 100, so 0 is the
        best possible score and 100 the worst.
        """
        similarity = fuzz.token_sort_ratio(
            result_place.get_five_part_title(),
            target_place.get_five_part_title(),
        )
        return 100 - similarity
示例#3
0
def fuzzy_string_match(str_ref, str_hyp):
    """Compute a fuzzy similarity between two strings, rescaled to [0.0, 1.0].

    Args:
      str_ref: reference string
      str_hyp: hypothesis string
    Returns:
      the token-sort ratio of the two strings divided by 100; larger values
      mean the strings are more alike
    """
    percent_similarity = fuzz.token_sort_ratio(str_ref, str_hyp)
    return percent_similarity / 100.0
    def query_tex_string(self, tex_string_1, threshold=65):
        """Recommend formula concepts whose TeX strings resemble the query.

        A concept from ``self.formula_concepts`` is recommended as soon as one
        of its ``'TeXStrings'`` has a token-sort ratio of at least
        ``threshold`` against ``tex_string_1``. Each recommendation is a dict
        ``{'name': <concept key>}``; at most the first ten are returned, in
        the iteration order of ``self.formula_concepts``.
        """
        def is_similar(candidate):
            return fuzz.token_sort_ratio(tex_string_1, candidate) >= threshold

        # any() stops at the first sufficiently similar TeX string per concept
        recommendations = [
            {'name': concept}
            for concept in self.formula_concepts
            if any(is_similar(tex)
                   for tex in self.formula_concepts[concept]['TeXStrings'])
        ]
        return recommendations[:10]
        """formula_concept_names = list(formula_concepts.keys())
示例#5
0
def partial_match(x_fact, y_fact, x_index, y_index):
    """Decide whether two facts are likely the same, by fuzzy text comparison.

    The first element of each fact is compared with the token-sort ratio.
    Returns ``((x_fact, y_fact), (x_index, y_index), ratio)`` when the ratio
    is strictly greater than 80. Returns ``None`` when the ratio is 80 or
    lower, or when both indices are equal (a fact is never compared with
    itself).
    """
    if x_index != y_index:
        ratio = fuzz.token_sort_ratio(x_fact[0], y_fact[0])
        if ratio > 80:
            # similar enough to be treated as a match
            return (x_fact, y_fact), (x_index, y_index), ratio

    # same fact, or too dissimilar
    return None
示例#6
0
def partial_string_based(str1, str2):
    """Score how closely two strings match using rapidfuzz's token-sort ratio.

    The raw ratio is logged at debug level and then divided by 100.

    Args:
        str1: A string value to check.
        str2: A string value to check.

    Returns:
        float: The token-sort ratio of the two strings divided by 100.0.

    """
    from rapidfuzz import fuzz
    raw_ratio = fuzz.token_sort_ratio(str1, str2)
    logger.debug("--\t\tpartial_string_based '%s' '%s'\tresult: '%s'", str1, str2, raw_ratio)
    scaled = raw_ratio / 100.0
    return scaled
def text_in_note(note, query_string):
    """Fuzzy-search the text of a note for a query string.

    The text fragments returned by ``get_note_text`` are lower-cased and
    joined with single spaces, then compared to the lower-cased query with
    the token-sort ratio using a score cutoff of 97. The result is the
    truthiness of the returned ratio.

    :param dict note: an ArchivesSpace note.
    :param str query_string: a string to match against.

    :returns: True if a match is found for `query_string`, False if no match is
            found.
    :rtype: bool
    """
    # int: minimum confidence ratio to match against
    min_ratio = 97
    haystack = " ".join(fragment.lower() for fragment in get_note_text(note))
    needle = query_string.lower()
    return bool(fuzz.token_sort_ratio(haystack, needle, score_cutoff=min_ratio))
示例#8
0
def featurize(df):
    """Add string-similarity feature columns for the name pair in each row.

    The first two columns are treated as the two strings to compare and are
    named ``a`` and ``b`` (a third column, when there are exactly three, is
    named ``target``). When the frame has exactly two or three columns the
    renaming and all new columns are applied to the caller's frame; otherwise
    a renamed copy is used.

    Columns added, in order: ``TM_A``/``TM_B`` (transliterated, lower-cased,
    letters only), ``partial``, ``tkn_sort``, ``tkn_set``, ``sum_ipa``,
    ``levenshtein`` (min-max scaled), ``metaphone``, ``nysiis``,
    ``mtch_rtng_cdx``, ``pshp_soundex_first`` (1 when both encodings are
    equal, else 0), followed by one column per entry of ``algos`` named from
    ``algo_names``.

    Returns the featurized frame.
    """
    column_count = len(df.columns)
    if column_count == 3:
        df.columns = ['a', 'b', 'target']
    elif column_count == 2:
        df.columns = ['a', 'b']
    else:
        df = df.rename(columns={df.columns[0]: 'a', df.columns[1]: 'b'})

    def letters_only(text):
        # transliterate, lower-case, then strip everything but ASCII letters
        return re.sub('[^a-zA-Z]+', '', unidecode.unidecode(text).lower())

    def pairwise(measure):
        # apply a two-argument measure to the cleaned pair of every row
        return df.apply(lambda row: measure(row.TM_A, row.TM_B), axis=1)

    def same_encoding(encode):
        # 1 when both cleaned strings encode identically, otherwise 0
        return pairwise(
            lambda left, right: 1 if encode(left) == encode(right) else 0)

    df['TM_A'] = df.apply(lambda row: letters_only(row['a']), axis=1)
    df['TM_B'] = df.apply(lambda row: letters_only(row['b']), axis=1)

    df['partial'] = pairwise(fuzz.partial_ratio)
    df['tkn_sort'] = pairwise(fuzz.token_sort_ratio)
    df['tkn_set'] = pairwise(fuzz.token_set_ratio)

    df['sum_ipa'] = pairwise(sum_ipa)

    # Jellyfish Levenshtein distance, rescaled with a MinMaxScaler
    raw_distance = pairwise(jellyfish.levenshtein_distance)
    df['levenshtein'] = MinMaxScaler().fit_transform(
        raw_distance.values.reshape(-1, 1))

    # Jellyfish phonetic encodings
    df['metaphone'] = same_encoding(jellyfish.metaphone)
    df['nysiis'] = same_encoding(jellyfish.nysiis)
    df['mtch_rtng_cdx'] = same_encoding(jellyfish.match_rating_codex)

    df['pshp_soundex_first'] = same_encoding(pshp_soundex_first.encode)

    for position, algorithm in enumerate(algos):
        df[algo_names[position]] = df.apply(
            lambda row: algorithm.sim(row.TM_A, row.TM_B), axis=1)

    return df
示例#9
0
def contains_match(content, search_string):
    """Tell whether two texts are fuzzily equal, ignoring case.

    Returns True when the token-sort ratio of the lower-cased inputs is
    strictly greater than CONFIDENCE_RATIO, otherwise False.
    """
    similarity = fuzz.token_sort_ratio(content.lower(), search_string.lower())
    if similarity > CONFIDENCE_RATIO:
        return True
    return False
示例#10
0
def test_token_ratio(s1, s2):
    """
    Check that token_ratio equals the larger of token_sort_ratio and
    token_set_ratio for the given pair of strings.
    """
    sort_score = fuzz.token_sort_ratio(s1, s2)
    set_score = fuzz.token_set_ratio(s1, s2)
    assert fuzz.token_ratio(s1, s2) == max(sort_score, set_score)
示例#11
0
 def testTokenSortRatio(self):
     """The token-sort ratio of the fixtures s1 and s1a must be exactly 100."""
     score = fuzz.token_sort_ratio(self.s1, self.s1a)
     self.assertEqual(score, 100)
示例#12
0
    def check_formulae(self,
                       formula_string,
                       annotations,
                       threshold_string=65,
                       threshold_identifers=1):
        """Find repository formulae that resemble the given formula.

        Every formula returned by ``self.get_formulae_from_repo()`` is scored
        in two independent ways:

        * string score: token-sort ratio between ``formula_string`` and the
          stored ``'formula'`` TeX string; kept when it is at least
          ``threshold_string``;
        * identifier score: number of distinct identifiers extracted from
          ``formula_string``/``annotations`` that also appear among the stored
          identifier ``'strings'`` and ``'names'``; kept when it is at least
          ``threshold_identifers``.

        The identifiers produced by ``CustomMathEnvParser`` are only logged.

        Returns a pair ``(string_matches, identifier_matches)``; each is a
        list of ``{'name': ..., 'qid': ...}`` dicts ordered from highest to
        lowest score.
        """
        def shared_identifier_count(left, right):
            return len(set(left).intersection(right))

        def best_first(scored):
            # Stable ascending sort followed by a reversal: entries with equal
            # scores come out in reverse insertion order.
            ordered = sorted(scored, key=itemgetter(1))
            ordered.reverse()
            return [entry for entry, _ in ordered]

        string_hits = []
        identifier_hits = []
        formula_dict = self.get_formulae_from_repo()
        identifiers = self.extract_identifiers_from_formula(
            annotations, formula_string)

        parser = CustomMathEnvParser(formula_string)
        identifiers_from_wikidata_formula, _ = parser.get_split_math_env()
        static_wikidata_handler_logger.info(
            'identifiers_from_wikidata_formula: {}'.format(
                identifiers_from_wikidata_formula))

        for formula_name, formula in formula_dict.items():
            qid = formula['qid']
            tex_string = formula['formula']

            string_score = fuzz.token_sort_ratio(formula_string, tex_string)
            if string_score >= threshold_string:
                string_hits.append(
                    ({'name': formula_name, 'qid': qid}, string_score))

            known_names = formula['identifiers']['names']
            known_symbols = formula['identifiers']['strings']

            identifier_score = shared_identifier_count(
                identifiers, known_symbols + known_names)
            if identifier_score >= threshold_identifers:
                identifier_hits.append(
                    ({'name': formula_name, 'qid': qid}, identifier_score))

        return best_first(string_hits), best_first(identifier_hits)
示例#13
0
    def _find_device(self, device, allowed_types, room=""):
        """Pick the FHEM device that best matches a spoken device name.

        Strategy:

        1. Query devices of ``allowed_types`` (restricted to the normalized
           ``room`` when one is given). If exactly one device comes back it is
           returned immediately with ``best_score`` 999.
        2. Otherwise query again without the room filter and score every
           candidate with the token-sort ratio of ``device`` against its
           normalized alias (only when an alias attribute exists and differs
           from the name) and against its normalized name extended by its
           room names. A room bonus is added when both a requested room and a
           device room are known. Only scores strictly above 50 qualify.

        Candidates that raise ``KeyError`` while being scored are skipped.

        Returns a dict with keys ``id``, ``dev_name``, ``state`` and
        ``best_score``, or ``None`` when nothing qualifies.
        """
        LOG.debug("device: {} allowed_types: {} room: {}".format(
            device, allowed_types, room))
        filter_dict = {'genericDeviceType': allowed_types}

        # first attempt: look for a fit inside the requested room
        if room:
            room = self._normalize(self._clean_common_words(room))
            filter_dict['room'] = room
        device_candidates = self.fhem.get(room=self.allowed_devices_room,
                                          filters=filter_dict)

        if len(device_candidates) == 1:
            # TODO can we do anything if len(...) > 1 ?
            # exactly one device of the allowed type: treat it as a perfect match
            LOG.debug("perfect match")
            only = device_candidates[0]
            return {
                "id": only['Name'],
                "dev_name": self._get_aliasname(only),
                "state": only['Readings']['state'],
                "best_score": 999
            }

        # second attempt: same query without the room restriction
        if 'room' in filter_dict:
            LOG.debug("try again without filter on room")
            del filter_dict['room']
        device_candidates = self.fhem.get(room=self.allowed_devices_room,
                                          filters=filter_dict)

        if not device_candidates:
            return None

        # a candidate must score above 50% to be accepted
        best_score = 50
        best_device = None

        for dc in device_candidates:
            base_name = self._normalize(dc['Name'])
            name_tokens = base_name.split(" ")

            # extend the name with any room names it does not already contain
            dev_room = self._get_normalized_room_list(dc)
            extra_rooms = [self._normalize(r) for r in dev_room
                           if r not in name_tokens]
            norm_name = " ".join([base_name] + extra_rooms)

            alias = self._get_aliasname(dc)
            norm_alias = self._normalize(alias)

            try:
                # the alias is scored first, then the (room-extended) name
                labels = []
                if (norm_name != norm_alias) and ('alias' in dc['Attributes']):
                    labels.append(norm_alias)
                labels.append(norm_name)

                for label in labels:
                    score = fuzz.token_sort_ratio(device, label)
                    # add bonus if room name match
                    if room and dev_room:
                        score += self._get_bonus_for_room(room, dev_room[0])
                    if score > best_score:
                        best_score = score
                        best_device = {
                            "id": dc['Name'],
                            "dev_name": alias,
                            "state": dc['Readings']['state'],
                            "best_score": best_score
                        }
            except KeyError:
                pass

        LOG.debug("best device = %s" % best_device)
        return best_device
示例#14
0
			def get_ratio(row):
				"""Token-sort ratio between ``clean_text`` and the row's 'wordmark'."""
				return fuzz.token_sort_ratio(clean_text, row['wordmark'])
示例#15
0
 def _query_matches(self, query: str, place: Place) -> bool:
     '''Test if query matches place.

     A place matches when the partial ratio between the query and the
     lower-cased place name exceeds 80, or when the token-sort ratio between
     the query and any of the place's types exceeds 80.
     '''
     if fuzz.partial_ratio(query, place.name.lower()) > 80:
         return True
     return any(
         fuzz.token_sort_ratio(query, type_) > 80 for type_ in place.types)
示例#16
0
 def __score_result(tweet, search_criteria):
     """Return the token-sort ratio between a tweet's text and the search content."""
     return fuzz.token_sort_ratio(tweet.text, search_criteria.content)
示例#17
0
def fuzzy_string_match(str_ref, str_hyp):
    """Compute a fuzzy similarity of two strings, rescaled to [0.0, 1.0].

    The token-sort ratio of the reference and hypothesis strings is divided
    by 100; a larger result means the strings are more alike.
    """
    percent_similarity = fuzz.token_sort_ratio(str_ref, str_hyp)
    return percent_similarity / 100.0
def prefit_compute(queried_movie_type, standard_query_data):
    """Prepare the standardized feature table used to compare a queried title
    with the stored catalogue of the same kind.

    Steps:

    1. Load the catalogue for ``queried_movie_type`` from ``dict_data`` and
       drop the row whose ``imdbID`` equals the query's first field.
    2. Append the query as the last row.
    3. Fill missing values (``'Unknown'`` for text columns, the median, zero
       or the mode for numeric columns, depending on the kind).
    4. Add one ``*_fuzz`` column per text attribute holding its token-sort
       ratio against the query row, zero out weak ratios, and (for series)
       rescale count columns to 0-100 and add the season-count distance.
    5. Standardize all numeric feature columns with ``StandardScaler``.

    Args:
        queried_movie_type: Catalogue key (looked up lower-cased in
            ``dict_data``). ``"tv-show"`` and ``"korean drama"`` are handled
            as series; every other value is handled as a movie.
        standard_query_data: Field values of the queried title, aligned with
            the catalogue columns. For movies, element 8 is overwritten in
            place with the integer before the first en dash when it can be
            parsed (presumably the release year -- confirm against caller).

    Returns:
        list: ``[train_data_2d, query_data_1d, train_titles_1d,
        distance_template, labels]`` where ``train_data_2d`` holds the values
        of every row except the query, ``query_data_1d`` the values of the
        query row, ``train_titles_1d`` the ``title`` column of the catalogue
        rows, ``distance_template`` the hard-coded column-index template for
        the kind, and ``labels`` the column index of the final table.
    """
    standard_train_data = dict_data[queried_movie_type.lower()]
    if queried_movie_type.lower() in ["tv-show", "korean drama"]:
        queried_movie_type = "series"
    else:
        queried_movie_type = 'movie'
    is_series = queried_movie_type == 'series'

    train_data = standard_train_data.copy()

    if not is_series:
        # Best effort: keep the original value when it is not an
        # en-dash-separated string starting with an integer.
        try:
            standard_query_data[8] = int(standard_query_data[8].split('–')[0])
        except Exception:
            pass
    train_data = train_data[train_data['imdbID'] != standard_query_data[0]]

    # DataFrame.append was removed in pandas 2.0; this reproduces what it did
    # for a Series (one-row frame with inferred dtypes, then concatenation).
    query_row = pd.Series(index=train_data.columns, data=standard_query_data)
    train_data_with_query = pd.concat(
        [train_data, query_row.to_frame().T.infer_objects()],
        ignore_index=True)

    if not is_series:
        train_data_with_query = train_data_with_query.drop(columns='BoxOffice')

    # NOTE: fillna results are assigned back to the frame instead of using
    # inplace=True on a column selection, which may act on a temporary copy.
    column_unknown = ['directors', 'writers', 'actors', 'production',
                      'country', 'language', 'Plot', 'Rated', 'Type', 'Genre']
    for c in column_unknown:
        try:
            train_data_with_query[c] = \
                train_data_with_query[c].fillna('Unknown')
        except Exception:
            # best effort: a column that cannot be filled is left unchanged
            pass

    if is_series:
        column_median = ['imdbRating', 'duration', 'totalSeasons']
        column_zero = ['noOfNominations', 'imdbVotes', 'noOfAwards']
        column_mode = ['year']
    else:
        column_median = ['imdbRating', 'rottenTomatoRating',
                         'metacriticRating', 'duration', 'year']
        column_zero = ['oscarNominations', 'noOfAwards', 'noOfNominations',
                       'imdbVotes']
        column_mode = []

    # Median and mode columns are required: errors propagate to the caller.
    for c in column_median:
        train_data_with_query[c] = train_data_with_query[c].fillna(
            train_data_with_query[c].median())
    for c in column_zero:
        try:
            train_data_with_query[c] = train_data_with_query[c].fillna(0)
        except Exception:
            # best effort: a column that cannot be filled is left unchanged
            pass
    for c in column_mode:
        train_data_with_query[c] = train_data_with_query[c].fillna(
            train_data_with_query[c].mode()[0])

    def extract_nom_val(x, nom):
        # Number preceding ":<nom>" in the nominations text; 0 when the
        # award is not listed or the cell holds no text.
        try:
            res = re.findall(fr'(\d+?):{nom}', x, re.IGNORECASE)
        except TypeError:
            return 0
        return int(res[0]) if res else 0

    if is_series:
        # Fixed order: the distance template addresses columns by position,
        # so the order in which these columns are created must not vary
        # between runs.
        list_special_awards = ['Golden Globe', 'Primetime Emmy']
        for spec_nom in list_special_awards:
            train_data_with_query[spec_nom] = \
                train_data_with_query['specialNominations'].apply(
                    lambda x: extract_nom_val(x, spec_nom))
            column_zero.append(spec_nom)
            train_data_with_query[spec_nom] = \
                train_data_with_query[spec_nom].fillna(0)

        # keep only the integer before the first en dash
        train_data_with_query['year'] = train_data_with_query['year'].apply(
            lambda x: int(x.split('–')[0]))
        column_zero.append('year')

    # NOTE(review): positional column 3 of the query row is coerced to a
    # float here; which column that is depends on the catalogue layout --
    # confirm.
    train_data_with_query.iloc[-1, 3] = np.float64(
        train_data_with_query.iloc[-1, 3])

    train_data_with_query['imdbRating'] *= 10

    # Snapshot of the query row after cleaning; looked up by column label.
    standard_query_data = train_data_with_query.iloc[-1, :]

    # (source column, derived similarity column); the creation order fixes
    # the column positions used by the distance template.
    fuzz_columns = [
        ('actors', 'actor_fuzz'),
        ('language', 'language_fuzz'),
        ('Rated', 'rated_fuzz'),
        ('Type', 'type_fuzz'),
        ('Plot', 'plot_fuzz'),
        ('Genre', 'genre_fuzz'),
        ('country', 'country_fuzz'),
        ('title', 'title_fuzz'),
    ]
    for source_col, fuzz_col in fuzz_columns:
        query_value = standard_query_data[source_col]
        train_data_with_query[fuzz_col] = \
            train_data_with_query[source_col].apply(
                lambda x, q=query_value: fuzz.token_sort_ratio(q, x))
        column_median.append(fuzz_col)

    # Similarities below the cutoff are treated as "no similarity".
    if is_series:
        fuzz_cutoffs = {'plot_fuzz': 70, 'genre_fuzz': 70, 'rated_fuzz': 100,
                        'actor_fuzz': 80, 'title_fuzz': 80}
    else:
        fuzz_cutoffs = {'title_fuzz': 80}
    for col, cutoff in fuzz_cutoffs.items():
        train_data_with_query[col] = train_data_with_query[col].apply(
            lambda x, low=cutoff: 0 if (x < low) else x)

    if is_series:
        # Rescale each count column to a percentage of its own maximum. The
        # maximum is computed once per column, and noOfNominations is scaled
        # from its own values (it was previously derived from noOfAwards).
        for col in ('noOfAwards', 'noOfNominations', 'imdbVotes'):
            col_max = train_data_with_query[col].max()
            train_data_with_query[col] = \
                (train_data_with_query[col] / col_max) * 100

        query_seasons = standard_query_data['totalSeasons']
        train_data_with_query['total_season_dist'] = \
            train_data_with_query['totalSeasons'].apply(
                lambda x: abs(x - query_seasons))
        column_median.append('total_season_dist')

    column_genres = ['Action',
           'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime',
           'Documentary', 'Drama', 'Family', 'Fantasy', 'FilmNoir', 'GameShow',
           'History', 'Horror', 'Music', 'Musical', 'Mystery', 'News', 'RealityTV',
           'Romance', 'SciFi', 'Short', 'Sport', 'TalkShow', 'Thriller', 'War',
           'Western']
    for col in column_genres:
        train_data_with_query[col] = train_data_with_query[col].fillna(0)

    # Standardize every numeric feature column and write it back in place.
    scaler = StandardScaler()
    to_be_standardized_features = column_median + column_zero
    to_be_standardized_data = train_data_with_query[to_be_standardized_features]
    train_data_with_query_standardized = scaler.fit_transform(
        to_be_standardized_data)
    for id_, c in enumerate(to_be_standardized_features):
        train_data_with_query[c] = train_data_with_query_standardized[:, id_]

    # Each entry: [id, positional column indices, kind code, distance code,
    # sign]. NOTE(review): the meaning of the codes is defined by the
    # consumer of this template -- confirm there. The indices are positions
    # in the final table and depend on the column order built above.
    if queried_movie_type == 'movie':
        distance_template = [
            [1, [12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39], 'c','d1',-1],
            [2, [50,52,53,54,55,3,4,5,6,7,8,10,11,51], 'e','d1',1]
        ]
    else:
        # the 'c' entry is disabled for series:
        #         [1, [*list(range(10,38))], 'c','d2',-1],
        distance_template = [
            [2, [54,55,52,50,8,9,4,3], 'e','d1',1]
        ]

    train_data_2d = train_data_with_query.iloc[:-1, :].values
    labels = train_data_with_query.columns
    train_data_movie_titles_1d = train_data_with_query.iloc[:-1, :]['title'].values
    query_data_1d = train_data_with_query.iloc[-1, :].values
    return [train_data_2d, query_data_1d, train_data_movie_titles_1d, distance_template, labels]