Пример #1
0
 def morphologique(a: str, b: str) -> bool:
     """Tell whether two words share the same phonetic code.

     :param a: first word to compare
     :param b: second word to compare
     :return: True when ``soundex`` produces equal codes for ``a`` and ``b``.
     """
     code_a, code_b = soundex(a), soundex(b)
     return code_a == code_b
Пример #2
0
    def find_match_levenshtein_soundex(self, token, canonical):
        """Shortlist dictionary words by edit distance, then filter by Soundex.

        A word from ``self.dicts`` is shortlisted when its Levenshtein
        distance to ``token`` does not exceed the best distance seen so far
        (the bound starts at 2 and tightens as closer words are found).
        The shortlist is then reduced to the words whose Soundex code equals
        the Soundex code of ``token``.

        :param token: the (byte-string) word to look up
        :param canonical: the expected correct word
        :return: ``(match_soundex, is_match)`` -- the filtered shortlist and
            whether ``canonical`` appears in it.
        """
        threshold = 2
        shortlist = []
        for entry in self.dicts:
            distance = jellyfish.levenshtein_distance(
                token, entry.decode("utf-8").lower())
            if distance > threshold:
                continue
            # Tighten the bound; earlier, looser entries stay shortlisted.
            threshold = distance
            shortlist.append(entry.lower())

        token_code = jellyfish.soundex(token.decode("utf-8"))
        match_soundex = [
            entry for entry in shortlist
            if jellyfish.soundex(entry.decode("utf-8")) == token_code
        ]

        is_match = any(entry == canonical for entry in match_soundex)
        return match_soundex, is_match
Пример #3
0
def match_to_lib(sound, sound_lib):
    """Return the entry of ``sound_lib`` that best matches ``sound``.

    A textual close match (``get_close_matches``) is preferred.  When there
    is none, both the input and the library are converted with ``soundex``
    and the close-match search is repeated on the encoded forms.

    Returns the matching library entry, or an empty list when neither
    search finds anything.
    """
    direct = get_close_matches(sound, sound_lib, 1)
    if direct:
        return direct[0]

    # No textual match: compare what the words "sound like" instead.
    encoded_lib = [soundex(entry) for entry in sound_lib]
    phonetic = get_close_matches(soundex(sound), encoded_lib, 1)
    if not phonetic:
        return []

    # Map the encoded hit back to the original library entry.
    return sound_lib[encoded_lib.index(phonetic[0])]
Пример #4
0
def update_entry(entry: dict) -> dict:
    """Split ``entry["PaxName"]`` into name parts and add derived fields.

    The whitespace-separated parts are stored as last, first and second
    name (missing parts become "").  For each part a ``*_sx`` key holds the
    ``soundex`` of its ``transliterate``-d, apostrophe-free, upper-cased
    form, and the final ``*_en`` key holds the ``translit(..., "ru",
    reversed=True)`` form cleaned the same way.

    The entry is modified in place and returned.
    """
    parts = entry["PaxName"].split()
    padded = parts + ["", "", ""]

    entry["PassengerLastName"] = padded[0]
    entry["PassengerFirstName"] = padded[1]
    entry["PassengerSecondName"] = padded[2]

    # (source key, transliterated key, soundex key)
    fields = (
        ("PassengerFirstName", "PassengerFirstName_en",
         "PassengerFirstName_sx"),
        ("PassengerSecondName", "PassengerSecondName_en",
         "PassengerSecondName_sx"),
        ("PassengerLastName", "PassengerLastName_en",
         "PassengerLastName_sx"),
    )

    # Intermediate transliteration, used only as soundex input.
    for source, en_key, _ in fields:
        cleaned = transliterate(entry[source]).replace("'", "")
        entry[en_key] = cleaned.upper()

    for _, en_key, sx_key in fields:
        entry[sx_key] = soundex(entry[en_key])

    # Final transliteration overwrites the intermediate *_en values.
    for source, en_key, _ in fields:
        latin = translit(entry[source], "ru", reversed=True)
        entry[en_key] = latin.replace("'", "").upper()

    return entry
Пример #5
0
def apply_soundex(misspell, dictionary):
    """Suggest dictionary words for misspelled words via Soundex distance.

    For every word in ``misspell``:

    * words already in ``dictionary`` and words containing '/' are copied
      to the result unchanged (no prediction is attempted for them);
    * otherwise the result holds a list with the (up to) five dictionary
      words whose Soundex code has the smallest Levenshtein distance to the
      Soundex code of the misspelled word.  Ties keep dictionary order.

    Progress is printed on a single, carriage-return-terminated line.

    :param misspell: sized iterable of words to process
    :param dictionary: collection of known-good words
    :return: list with one item (a word or a list of suggestions) per input
    """
    total = len(misspell)
    result = []

    for count, mis_word in enumerate(misspell, start=1):
        if mis_word in dictionary or '/' in mis_word:
            result.append(mis_word)
        else:
            # The code of the misspelled word does not depend on the
            # dictionary word, so compute it once per word.
            soundex_mis = jf.soundex(mis_word)
            scored = [
                (dict_word,
                 jf.levenshtein_distance(soundex_mis, jf.soundex(dict_word)))
                for dict_word in dictionary
            ]
            scored.sort(key=operator.itemgetter(1))
            result.append([dict_word for dict_word, _ in scored[:5]])

        print("Processing: {} / {}".format(count, total), end='\r')

    return result
Пример #6
0
def token_similarity(a, b):
    """Score how well two tokens (objects with a ``word`` attribute) match.

    Returns 1.0 for a case/whitespace-insensitive exact match, -1.0 when
    exactly one token is whitespace or exactly one is punctuation, 0.9 for
    two punctuation tokens or two alphabetic words that agree under any of
    the phonetic encodings, and the Jaro-Winkler value otherwise.
    """
    left, right = a.word, b.word

    # Case-insensitive match; any whitespace matches any whitespace.
    if left.lower().strip() == right.lower().strip():
        return 1.

    # A word may never be mapped onto whitespace.
    if bool(isspace(left)) != bool(isspace(right)):
        return -1.

    # Punctuation only pairs with punctuation.
    left_punc, right_punc = ispunc(left), ispunc(right)
    if left_punc and right_punc:
        return 0.9
    if left_punc or right_punc:
        return -1.

    # Approximate phonetic match.
    if left.isalpha() and right.isalpha():
        encoders = (jf.metaphone, jf.soundex, jf.nysiis,
                    jf.match_rating_codex)
        for encode in encoders:
            if encode(left) == encode(right):
                return 0.9

    # Fall back to the Jaro-Winkler score.
    return jf.jaro_winkler(left, right)
Пример #7
0
def remove_similar_sounds(l,r):
    """Cancel out tokens of ``l`` and ``r`` that share a Soundex code.

    Every token of ``r`` whose code is still pending on the left side
    removes one occurrence of that code from the left; the remaining codes
    on each side are expanded back to their tokens.

    Drops into ``pdb`` when the inputs, or the two results, differ in
    length.

    Returns a tuple ``(sorted left leftovers, sorted right leftovers)``.
    """
    if len(l) != len(r):
        pdb.set_trace()

    pending_left = []
    left_tokens = defaultdict(list)
    for tok in l:
        code = jellyfish.soundex(tok)
        pending_left.append(code)
        left_tokens[code].append(tok)

    pending_right = []
    right_tokens = defaultdict(list)
    for tok in r:
        code = jellyfish.soundex(tok)
        if code in pending_left:
            # Matched: consume one left occurrence of this code.
            pending_left.remove(code)
            continue
        pending_right.append(code)
        right_tokens[code].append(tok)

    new_l = [tok for code in pending_left for tok in left_tokens[code]]
    new_r = [tok for code in pending_right for tok in right_tokens[code]]

    if len(new_l) != len(new_r):
        pdb.set_trace()
    return (sorted(new_l), sorted(new_r))
 def match_strings(self, string1, string2):
     """Return True if any word of ``string1`` has the same Soundex code
     as any word of ``string2``; False otherwise."""
     left_words = string1.split()
     right_words = string2.split()
     return any(
         jellyfish.soundex(left) == jellyfish.soundex(right)
         for left in left_words
         for right in right_words
     )
Пример #9
0
def augment_data(df: pd.DataFrame) -> pd.DataFrame:
    """Augment dataframe of FEBRL person data with blocking keys and cleanup for
    comparison step.

    Missing ``surname`` / ``first_name`` values are replaced by "", Soundex
    and NYSIIS encodings of both names are added, ``ssid_last3`` receives
    the zero-padded last three characters of ``soc_sec_id`` (None for falsy
    values), ``soc_sec_id`` is cast to ``str`` and ``dob`` is derived from
    ``date_of_birth`` via ``dob_to_date``.

    Args:
        df: pandas dataframe containing FEBRL-generated person data. It is
            modified in place.

    Returns:
        Augmented dataframe (the same object that was passed in).
    """
    df["surname"] = df["surname"].fillna("")
    df["first_name"] = df["first_name"].fillna("")

    # Soundex phonetic encodings.
    df["soundex_surname"] = df["surname"].apply(jellyfish.soundex)
    df["soundex_firstname"] = df["first_name"].apply(jellyfish.soundex)

    # NYSIIS phonetic encodings.
    df["nysiis_surname"] = df["surname"].apply(jellyfish.nysiis)
    df["nysiis_firstname"] = df["first_name"].apply(jellyfish.nysiis)

    # Last 3 of SSID (computed before the column is cast to str so that
    # falsy raw values still map to None).
    df["ssid_last3"] = df["soc_sec_id"].apply(
        lambda x: str(x)[-3:].zfill(3) if x else None)
    df["soc_sec_id"] = df["soc_sec_id"].astype(str)

    # DOB to date object.
    df["dob"] = df["date_of_birth"].apply(dob_to_date)

    # The signature and docstring promise the dataframe back.
    return df
Пример #10
0
def Soundex(word, dict):
    """Collect the entries of ``dict`` that share the Soundex code of ``word``.

    Both ``word`` and the entries are UTF-8 decoded before encoding.  The
    returned entries are stripped and lower-cased.
    """
    target_code = jellyfish.soundex(word.decode('utf-8'))
    stripped_tokens = (token.strip() for token in dict)
    return [
        token.lower()
        for token in stripped_tokens
        if jellyfish.soundex(token.decode('utf-8')) == target_code
    ]
Пример #11
0
def get_block_key(name1, name2, input_type='REFERENCE'):
    """
       Generate a blocking key from name1 and name2.

       for input_type of reference: first name, last_name
       for input_type of document: first_name_last_name, first_name_last_name

       REFERENCE keys have the form ``f3f_l2f_f3l_l2l_soundex1_soundex2``
       where f3/l2 are the first three / last two characters of the first
       word of each (lower-cased, apostrophe-free, re-encoded) name.
       DOCUMENT keys are the two sorted REFERENCE keys of the names'
       first/last parts joined by '_'.

       :raises ValueError: if input_type is neither 'REFERENCE' nor
           'DOCUMENT' (previously this surfaced as an UnboundLocalError).
    """

    if input_type == 'REFERENCE':
        name1 = name1.split()[0].lower().replace("'", "").decode("ISO-8859-1").encode('utf8', 'ignore')
        name2 = name2.split()[0].lower().replace("'", "").decode("ISO-8859-1").encode('utf8', 'ignore')

        soundex_pair = jellyfish.soundex(name1) + '_' + jellyfish.soundex(name2)

        # TODO: For now we're not considering the gender! Please check it later!

        return '_'.join([name1[:3], name1[-2:],
                         name2[:3], name2[-2:],
                         soundex_pair])

    if input_type == 'DOCUMENT':
        name1 = name1.strip().replace(' ', '_')
        name2 = name2.strip().replace(' ', '_')
        # Each person is keyed on the first and last part of the full name.
        blocks = sorted([get_block_key(name1.split('_')[0], name1.split('_')[-1]),
                         get_block_key(name2.split('_')[0], name2.split('_')[-1])])

        return '_'.join(blocks).decode('utf-8', 'ignore')

    raise ValueError(
        "unsupported input_type {!r}; expected 'REFERENCE' or 'DOCUMENT'".format(input_type))
Пример #12
0
 def get_soundex(self, word1, word2, old_word):
     """Pick the first of ``word2`` / ``old_word`` that sounds like ``word1``.

     Returns ``word2`` if its Soundex code equals that of ``word1``, else
     ``old_word`` under the same condition, else ``word1`` itself.
     """
     for candidate in (word2, old_word):
         if jellyfish.soundex(word1) == jellyfish.soundex(candidate):
             return candidate
     return word1
Пример #13
0
def check_street(name):
    """Classify ``name`` as ``"street_name"`` or ``"other"``.

    The name is stripped, split on single spaces and lower-cased.

    * Any part that parses as an integer makes the result ``"other"``.
    * A last part equal to ``"null"`` makes the result ``"other"``.
    * Otherwise the name is a street name when one of its parts is a known
      street word, or sounds like one (same ``soundex`` code).

    The original phonetic test applied ``soundex`` to the Soundex codes and
    looked the result up among the words, so it could not match; it now
    compares the code of each word against the codes of the street words.
    """
    words = [part.lower() for part in name.strip().split(" ")]

    for word in words:
        try:
            int(word)
        except ValueError:
            continue
        return "other"

    if words[-1] == "null":
        return "other"

    street_words = (
        'boulevard', 'parkway', 'east', 'west', 'street', 'avenue', 'lane',
        'place', 'road', 'broadway', 'beach', 'drive', 'trail', 'circle',
        'promenade', 'transit', 'park', 'highway', 'expressway',
        'overpass', 'tunnel', 'slip', 'bridge', 'exit', 'loop', 'court',
        'ramp', 'alley', 'entrance', 'heights', 'oval',
    )

    # Cheap exact test first; phonetic codes are only needed as a fallback.
    if any(word in street_words for word in words):
        return "street_name"

    street_codes = {soundex(street_word) for street_word in street_words}
    # Empty parts (from repeated spaces) carry no phonetic information.
    if any(soundex(word) in street_codes for word in words if word):
        return "street_name"

    return "other"
Пример #14
0
def shortword(sen):
    """Expand short/misspelled words of a sentence and log the result.

    "'re" is first rewritten to " are".  Each whitespace-separated word is
    then handled by the first rule that applies:

    1. If it appears in a value list of ``dictionary.dic``, the
       corresponding key is emitted (once per key that lists the word).
    2. If it contains a digit, it is emitted unchanged.
    3. If the spell checker ``d`` accepts it, it is emitted unchanged.
    4. Otherwise the first suggestion of ``d`` that is a valid word, is
       longer than the word, starts with the same Soundex letter and whose
       numeric Soundex part differs by at most 5 is emitted.  If no
       suggestion qualifies, nothing is emitted for the word.

    The resulting string (words separated and terminated by a space) is
    appended to ``../data_sets/shortword.txt`` and returned.
    """
    if "'re" in sen:
        sen = sen.replace("'re", " are")

    number = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    s = ''

    for string in sen.split():
        # Rule 1: known abbreviations / variants.
        expanded = False
        for name, variants in dictionary.dic.iteritems():
            if any(string == variant for variant in variants):
                s += name + ' '
                expanded = True
        if expanded:
            continue

        # Rule 2: anything containing a digit is kept verbatim.
        if any(num in string for num in number):
            s += string + ' '
            continue

        # Rule 3: correctly spelled words are kept verbatim.
        if d.check(string):
            s += string + ' '
            continue

        # Rule 4: phonetic comparison against the checker's suggestions.
        try:
            a = jellyfish.soundex(unicode(string))
            x = int(a[1:])
            for line in d.suggest(string):
                if not d.check(line):
                    continue
                b = jellyfish.soundex(unicode(line))
                y = int(b[1:])
                if a[0] == b[0] and abs(x - y) <= 5 and len(line) > len(string):
                    s += line + ' '
                    break
        except UnicodeDecodeError:
            # Best effort: words that cannot be decoded are skipped.
            pass

    # Context manager so the log file is always flushed and closed.
    with open("../data_sets/shortword.txt", 'a') as f:
        f.write(str(s) + "\n")
    return s
Пример #15
0
 def match_strings(self, string1, string2):
     """Check whether the two strings contain a pair of like-sounding words.

     Returns True as soon as a word of ``string1`` and a word of
     ``string2`` yield the same Soundex code, False if no pair does.
     """
     candidates = string2.split()
     for entity in string1.split():
         if any(jellyfish.soundex(entity) == jellyfish.soundex(word)
                for word in candidates):
             return True
     return False
def compare_pred(x, y):
    """Score a prediction: 1.0 for equality, 0.5 for a near miss, else 0.0.

    A near miss has an edit distance of at most 1 and the same Soundex
    code as the reference.
    """
    if x == y:
        return 1.
    within_one_edit = editdistance.eval(x, y) <= 1
    if within_one_edit and jellyfish.soundex(x) == jellyfish.soundex(y):
        return 0.5
    return 0.
Пример #17
0
def similarity_factor(s1, s2):
    """Combine several string metrics into one similarity value for s1 and s2.

    The difflib ratio, unigram similarity and fuzzy partial ratio are
    averaged together with the metaphone Jaccard similarity; the Soundex
    Jaccard similarity joins the average only when it exceeds the mean of
    the first three metrics.
    """
    def jaccard_pct(encode):
        # Jaccard similarity (in percent) of the lower-cased phonetic codes.
        code1 = encode(unicode(s1)).lower()
        code2 = encode(unicode(s2)).lower()
        return (1-distance.jaccard(code1, code2))*100

    diffl = difflib.SequenceMatcher(None, s1, s2).ratio()*100
    ng = ngram.NGram.compare(s1, s2, N=1)*100
    fpr = fuzz.partial_ratio(s1, s2)
    jac_metaphone = jaccard_pct(jellyfish.metaphone)
    jac_soundex = jaccard_pct(jellyfish.soundex)

    if mean([diffl, ng, fpr]) < jac_soundex:
        return mean([diffl, ng, fpr, jac_soundex, jac_metaphone])
    return mean([diffl, ng, fpr, jac_metaphone])
Пример #18
0
 def soundexSimilarity(self, s1, s2):
     """Return the fraction of positions at which two Soundex codes agree.

     The codes of ``s1`` and ``s2`` are compared position by position and
     the number of equal positions is divided by the longer code length.

     Returns 0 when either value cannot be encoded, and also when both
     codes are empty (which previously raised ``ZeroDivisionError``).
     """
     try:
         sdx1 = jf.soundex(s1)
         sdx2 = jf.soundex(s2)
     except Exception:
         # Best effort: un-encodable input counts as "not similar".
         return 0

     longest = max(len(sdx1), len(sdx2))
     if not longest:
         return 0
     return sum(1 for a, b in zip(sdx1, sdx2) if a == b) / longest
Пример #19
0
def compare_for_seniority_finding(s1, s2):
    """Decide whether s1 is similar enough to s2.

    s1 - main string, s2 - string from list for comparison.

    Returns True only if the fuzzy partial ratio is at least 50, the
    Soundex Jaccard similarity is above 70, and both the metaphone and the
    match-rating-codex Jaccard similarities are above 65 (all in percent).
    """
    def jaccard_pct(encode):
        # Jaccard similarity (in percent) of the lower-cased phonetic codes.
        code1 = encode(unicode(s1)).lower()
        code2 = encode(unicode(s2)).lower()
        return (1-distance.jaccard(code1, code2))*100

    fpr = fuzz.partial_ratio(s1, s2)
    jac_metaphone = jaccard_pct(jellyfish.metaphone)
    jac_soundex = jaccard_pct(jellyfish.soundex)
    jac_mrc = jaccard_pct(jellyfish.match_rating_codex)
    return fpr >= 50 and jac_soundex > 70 and jac_metaphone > 65 and jac_mrc > 65
Пример #20
0
def update_entry(entry: dict) -> dict:
    """Reformat the birth date and add Soundex codes of the name fields.

    ``PassengerBirthDate`` is rebuilt from fixed character positions as
    ``[6:10]-[0:2]-[3:5]``; each of the three passenger name fields gets a
    matching ``*_sx`` key holding its ``soundex`` value.

    The entry is modified in place and returned.
    """
    raw_date = entry["PassengerBirthDate"]
    entry["PassengerBirthDate"] = "-".join(
        (raw_date[6:10], raw_date[0:2], raw_date[3:5]))

    name_to_code_key = (
        ("PassengerFirstName", "PassengerFirstName_sx"),
        ("PassengerSecondName", "PassengerSecondName_sx"),
        ("PassengerLastName", "PassengerLastName_sx"),
    )
    for name_key, code_key in name_to_code_key:
        entry[code_key] = soundex(entry[name_key])

    return entry
Пример #21
0
def update_entry(entry: dict) -> dict:
    """Split ``entry["name"]`` into passenger name fields plus Soundex codes.

    The whitespace-separated parts become last, first and second name, in
    that order; missing parts are stored as "".  Each name field gets a
    matching ``*_sx`` key holding its ``soundex`` value.

    The entry is modified in place and returned.
    """
    padded = entry["name"].split() + ["", "", ""]

    entry["PassengerLastName"] = padded[0]
    entry["PassengerFirstName"] = padded[1]
    entry["PassengerSecondName"] = padded[2]

    name_to_code_key = (
        ("PassengerFirstName", "PassengerFirstName_sx"),
        ("PassengerSecondName", "PassengerSecondName_sx"),
        ("PassengerLastName", "PassengerLastName_sx"),
    )
    for name_key, code_key in name_to_code_key:
        entry[code_key] = soundex(entry[name_key])

    return entry
Пример #22
0
def get_best_matched(query_word='', jacard_list=None):
    """Accept or reject the top entry of ``jacard_list``.

    The first entry is read as ``(word, index_value)``.  The word (with
    every "$" removed) is returned when the value is at least 0.40, or
    when it lies strictly between 0.30 and 0.40 and the Soundex codes of
    ``query_word`` and the word agree after their first character.
    Otherwise None is returned.
    """
    top = jacard_list[0]
    word, index_value = top[0], top[1]

    if index_value >= 0.40:
        return word.replace("$", '')
    if not 0.30 < index_value < 0.40:
        return None

    # Borderline score: require the same phonetic tail.
    same_tail = (jellyfish.soundex(query_word)[1:]
                 == jellyfish.soundex(word)[1:])
    return word.replace("$", '') if same_tail else None
Пример #23
0
def similarity_factor(s1, s2):
    """Return a float describing how similar the strings s1 and s2 are.

    The value is the mean of the difflib ratio, the unigram similarity,
    the fuzzy partial ratio and the metaphone Jaccard similarity.  The
    Soundex Jaccard similarity is included in the mean only if it is
    greater than the mean of the first three metrics.
    """
    diffl = difflib.SequenceMatcher(None, s1, s2).ratio() * 100
    ng = ngram.NGram.compare(s1, s2, N=1) * 100
    fpr = fuzz.partial_ratio(s1, s2)

    metaphone1 = jellyfish.metaphone(unicode(s1)).lower()
    metaphone2 = jellyfish.metaphone(unicode(s2)).lower()
    jac_metaphone = (1 - distance.jaccard(metaphone1, metaphone2)) * 100

    soundex1 = jellyfish.soundex(unicode(s1)).lower()
    soundex2 = jellyfish.soundex(unicode(s2)).lower()
    jac_soundex = (1 - distance.jaccard(soundex1, soundex2)) * 100

    if mean([diffl, ng, fpr]) < jac_soundex:
        return mean([diffl, ng, fpr, jac_soundex, jac_metaphone])
    return mean([diffl, ng, fpr, jac_metaphone])
Пример #24
0
def soundex():
    """Write the best Soundex-based corrections for every misspelled word.

    For each line of ``wiki_misspell`` the Soundex code of the stripped
    word is aligned against the Soundex code of every stripped entry of
    ``my_dict``.  The alignment score is a dynamic-programming table with
    zero-initialised borders, a cost of +1 for a gap or mismatch and a
    reward of -1 for a match; lower is better.  All entries that reach the
    lowest score are written, space separated, as one line of
    ``soundex_result.txt``; the score, word and entries are also printed.

    Entries are stripped before encoding (the original discarded the
    result of ``entry.strip()`` and encoded the raw entry).
    """
    # Context manager so the result file is closed even if encoding fails.
    with open('soundex_result.txt', 'w') as fw5:
        for line in wiki_misspell:
            string = line.strip()
            string_s = jellyfish.soundex(string)
            cols = len(string_s) + 1

            dis = 100000
            bests = ""
            for entry in my_dict:
                word = entry.strip()
                entry_s = jellyfish.soundex(word)
                rows = len(entry_s) + 1

                # Row 0 and column 0 stay at zero.
                distance_m = [[0] * cols for _ in range(rows)]
                for i in range(1, rows):
                    for j in range(1, cols):
                        step = -1 if entry_s[i - 1] == string_s[j - 1] else 1
                        distance_m[i][j] = min(
                            distance_m[i - 1][j - 1] + step,
                            distance_m[i - 1][j] + 1,
                            distance_m[i][j - 1] + 1,
                        )

                tem_dis = distance_m[rows - 1][cols - 1]

                if tem_dis < dis:
                    # Strictly better: restart the list of best entries.
                    dis = tem_dis
                    bests = word
                elif tem_dis == dis:
                    bests += ' ' + word

            print(dis, string, bests)
            fw5.write(bests + '\n')
def find_correct_words(word):
    """Return the words of ``dict.txt`` that share the Soundex code of ``word``.

    ``dict.txt`` is read from the current directory, one word per line.
    Any error (e.g. the file is missing) is printed and the matches found
    so far -- possibly none -- are returned.
    """
    correct_words = []
    dic_path = "dict.txt"
    try:
        with open(dic_path) as dict_file:
            # The code of the query does not change per dictionary line.
            word_soundex = jellyfish.soundex(word)
            for dic_word_line in dict_file:
                dict_word = dic_word_line.strip()
                if jellyfish.soundex(dict_word) == word_soundex:
                    correct_words.append(dict_word)
    except Exception as e:
        # Deliberate best effort: report the problem, keep partial results.
        print(e)
    return correct_words
Пример #26
0
def compare_for_seniority_finding(s1, s2):
    """Tell whether s1 is similar to s2 under several string metrics.

    s1 - main string, s2 - string from list for comparison.

    All of the following must hold: fuzzy partial ratio >= 50, Soundex
    Jaccard similarity > 70, metaphone Jaccard similarity > 65 and
    match-rating-codex Jaccard similarity > 65 (similarities in percent).
    """
    fpr = fuzz.partial_ratio(s1, s2)

    scores = {}
    encoders = (
        ("metaphone", jellyfish.metaphone),
        ("soundex", jellyfish.soundex),
        ("mrc", jellyfish.match_rating_codex),
    )
    for label, encode in encoders:
        code1 = encode(unicode(s1)).lower()
        code2 = encode(unicode(s2)).lower()
        scores[label] = (1 - distance.jaccard(code1, code2)) * 100

    return (fpr >= 50 and scores["soundex"] > 70
            and scores["metaphone"] > 65 and scores["mrc"] > 65)
Пример #27
0
def extract_block_key(person, gender_names):
    """Build the blocking features (and the block key) of a person record.

    :param person: mapping with the keys ``id``, ``first_name``,
        ``last_name``, ``role``, ``register_type``, ``register_id`` and
        ``gender``
    :param gender_names: mapping with ``'male'`` and ``'female'``
        collections of first names, used to infer a missing gender
    :return: dict with the copied fields plus ``gender``, ``f3f``/``l2f``
        (first three / last two characters of the first word of the first
        name), ``f3l``/``l2l`` (same for the last name), ``soundex`` and
        ``block_key`` (``gender_f3f_f3l_l2f_l2l_soundex``).  When either
        name is missing or blank, all derived fields are "".

    The original tested the undefined name ``p`` and therefore raised
    ``NameError`` on every call; it now tests ``person``.  Names that
    consist only of whitespace are treated as missing instead of raising
    ``IndexError``.
    """
    known_genders = ("male", "female")

    feature_set = {'id': person['id'],
                   'first_name': person['first_name'].replace("'", ""),
                   'last_name': person['last_name'].replace("'", ""),
                   'role': person['role'],
                   'register_type': person['register_type'],
                   'register_id': person['register_id']}
    if person['gender'] in known_genders:
        feature_set['gender'] = person['gender']

    first_words = person['first_name'].split() if person['first_name'] else []
    last_words = person['last_name'].split() if person['last_name'] else []

    if not (first_words and last_words):
        for key in ('gender', 'soundex', 'block_key',
                    'f3f', 'f3l', 'l2l', 'l2f'):
            feature_set[key] = ''
        return feature_set

    first, last = first_words[0], last_words[0]

    if person['gender'] not in known_genders:
        # Infer the gender from the first name; 'female' wins when the
        # name is listed under both genders.
        if first in gender_names['female']:
            feature_set['gender'] = "female"
        elif first in gender_names['male']:
            feature_set['gender'] = "male"
        else:
            feature_set['gender'] = "unknown"

    feature_set['f3f'] = first[:3].replace("'", "")
    feature_set['l2f'] = first[-2:].replace("'", "")

    feature_set['f3l'] = last[:3].replace("'", "")
    feature_set['l2l'] = last[-2:].replace("'", "")

    feature_set['soundex'] = (jellyfish.soundex(first.replace("'", ""))
                              + '_'
                              + jellyfish.soundex(last.replace("'", "")))

    feature_set['block_key'] = '_'.join(
        feature_set.get(key, '')
        for key in ('gender', 'f3f', 'f3l', 'l2f', 'l2l', 'soundex'))

    return feature_set
Пример #28
0
def phonetic(s, method):
    """
    Phonetically encode the values in the Series.

    The values are first passed through ``clean`` (removing every run of
    characters other than ``-``, ``_``, letters and digits), upper-cased
    and then encoded; null values are kept as NaN.

    :param s: The Series with the values to encode.
    :param method: The algorithm that is used to phonetically encode the values. The possible options are 'soundex' (`wikipedia <https://en.wikipedia.org/wiki/Soundex>`_) and 'nysiis' (`wikipedia <https://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System>`_).
    :type method: str

    :return: A Series with phonetic encoded values.
    :rtype: pandas.Series

    :raises ImportError: if the 'jellyfish' package is not installed.
    :raises Exception: if ``method`` is not a supported algorithm.

    .. note::

        The 'soundex' and 'nysiis' algorithms use the package 'jellyfish'.
        It can be installed with pip (``pip install jellyfish``).

    """

    try:
        import jellyfish
    except ImportError:
        # Fail here with a clear message instead of a NameError later on.
        raise ImportError("Install jellyfish to use string encoding.")

    # Raw string: '\-' and '\_' are not valid string escape sequences.
    s = clean(s, replace_by_none=r'[^\-\_A-Za-z0-9]+')

    if method == 'soundex':
        encode = jellyfish.soundex
    elif method == 'nysiis':
        encode = jellyfish.nysiis
    else:
        raise Exception("Phonetic encoding method not found")

    return s.str.upper().apply(
        lambda x: encode(x) if pandas.notnull(x) else np.nan)
Пример #29
0
def fuzzy(string):
    """Return a JSON response with four phonetic encodings of ``string``.

    The payload maps ``metaphone``, ``soundex``, ``nysiis`` and
    ``match_rating_codex`` to the respective jellyfish encoding.
    """
    encodings = {}
    encodings["metaphone"] = jellyfish.metaphone(string)
    encodings["soundex"] = jellyfish.soundex(string)
    encodings["nysiis"] = jellyfish.nysiis(string)
    encodings["match_rating_codex"] = jellyfish.match_rating_codex(string)
    return jsonify(encodings)
Пример #30
0
def simple_example():
    """Print jellyfish string-comparison and phonetic-encoding results.

    The comparison functions are applied to 'jellyfish' / 'smellyfish',
    the phonetic encoders to 'Jellyfish'.  One line is printed per call.
    """
    # String comparison.
    str1 = u'jellyfish'
    str2 = u'smellyfish'

    result = jellyfish.levenshtein_distance(str1, str2)
    print("jellyfish.levenshtein_distance({}, {}) = {}.".format(
        str1, str2, result))
    result = jellyfish.damerau_levenshtein_distance(str1, str2)
    print("jellyfish.damerau_levenshtein_distance({}, {}) = {}.".format(
        str1, str2, result))
    result = jellyfish.hamming_distance(str1, str2)
    print("jellyfish.hamming_distance({}, {}) = {}.".format(
        str1, str2, result))
    result = jellyfish.jaro_distance(str1, str2)
    print("jellyfish.jaro_distance({}, {}) = {}.".format(
        str1, str2, result))
    result = jellyfish.jaro_similarity(str1, str2)
    print("jellyfish.jaro_similarity({}, {}) = {}.".format(
        str1, str2, result))
    result = jellyfish.jaro_winkler(str1, str2)
    print("jellyfish.jaro_winkler({}, {}) = {}.".format(
        str1, str2, result))
    result = jellyfish.jaro_winkler_similarity(str1, str2)
    print("jellyfish.jaro_winkler_similarity({}, {}) = {}.".format(
        str1, str2, result))
    result = jellyfish.match_rating_comparison(str1, str2)
    print("jellyfish.match_rating_comparison({}, {}) = {}.".format(
        str1, str2, result))

    # Phonetic encoding.
    ss = u'Jellyfish'

    code = jellyfish.metaphone(ss)
    print("jellyfish.metaphone({}) = {}.".format(ss, code))
    code = jellyfish.soundex(ss)
    print("jellyfish.soundex({}) = {}.".format(ss, code))
    code = jellyfish.nysiis(ss)
    print("jellyfish.nysiis({}) = {}.".format(ss, code))
    code = jellyfish.match_rating_codex(ss)
    print("jellyfish.match_rating_codex({}) = {}.".format(ss, code))
    def __init__(self):
        """Load the word-count dictionary and build the lookup indexes.

        Populates the class-level ``dictCountMap`` from
        ``../data/count_1w100k.txt``, adds all counts to ``totalCount``
        and, for every word, registers it in ``invertTriMap`` (under each
        of its grams), ``invertSoundexMap`` (under its Soundex code) and
        ``invertMetaMap`` (under its metaphone code).

        Each index bucket is created independently.  The original reused
        one temporary list, so a new metaphone bucket aliased the word's
        Soundex bucket and the word was appended to that bucket twice.
        """
        SpellChecker.dictCountMap = self.readDitionary(
            '../data/count_1w100k.txt')
        SpellChecker.totalCount = sum(SpellChecker.dictCountMap.values(),
                                      SpellChecker.totalCount)

        for word in SpellChecker.dictCountMap:
            for tgram in self.getGrams(word, SpellChecker.invertMapGram):
                SpellChecker.invertTriMap.setdefault(tgram, []).append(word)

            soundexHash = jellyfish.soundex(word)
            SpellChecker.invertSoundexMap.setdefault(
                soundexHash, []).append(word)

            metaHash = jellyfish.metaphone(word)
            SpellChecker.invertMetaMap.setdefault(metaHash, []).append(word)
Пример #32
0
 def soundex_mapping(self, word):
     """Map a word to a number 0-9 through its Soundex code.

     The Soundex code of ``word`` is looked up in a fixed table of ten
     codes.  Words whose code is not in the table, or that cannot be
     encoded, map to 0.
     """
     print("Performinig Soundex Mapping")
     sounds = {"Z600":0,"O500":1,"T000":2,"T600":3,"F600":4,"F100":5,"S200":6,"S150":7,"E230":8,"N500":9}
     try:
         num = sounds[jellyfish.soundex(word)]
     except Exception:
         # Unknown code or un-encodable input: fall back to 0, but let
         # KeyboardInterrupt / SystemExit propagate.
         num = 0
     return num
    def get_match(self, dictList, token):
        """Find the dictionary words that sound like ``token``.

        All entries of ``dictList`` with the same Soundex code as
        ``token`` are candidates.  A single candidate is the best match;
        with several, an n-gram search over the candidates picks the best
        match (if the search returns anything).

        Returns ``(bestMatch, candidates, candidatesGram)`` where
        ``bestMatch`` is "" when nothing was selected and
        ``candidatesGram`` is the n-gram search result (empty unless there
        were several candidates).
        """
        token_code = jellyfish.soundex(token)
        candidates = [
            entry for entry in dictList
            if jellyfish.soundex(entry) == token_code
        ]

        bestMatch = ""
        candidatesGram = []
        if len(candidates) == 1:
            bestMatch = candidates[0]
        elif len(candidates) > 1:
            candidatesGram = ngram.NGram(candidates).search(token)
            if len(candidatesGram) > 0:
                bestMatch = candidatesGram[0][0]

        return bestMatch, candidates, candidatesGram
Пример #34
0
    def get_most_similar(self, word):
        """Return the phonetic neighbours of ``word`` closest in spelling.

        The words stored in ``self.phonetic_map`` under the Soundex code
        of ``word`` are ordered by Levenshtein distance to ``word`` and
        the first ``self.closest_neighbours`` of them are returned.
        """
        bucket = self.phonetic_map[jellyfish.soundex(word)]
        ranked = sorted(
            bucket,
            key=lambda other: jellyfish.levenshtein_distance(other, word))
        return ranked[:self.closest_neighbours]
Пример #35
0
def _word_similarity_score(a, b):
    if a == b:
        return 1.

    # Case and whitespace insenstive comparison.
    if a.lower().strip() == b.lower().strip():
        return 0.95

    # Penalize whitespace matching to non-whitespace.
    if ((_isspace(a) and not _isspace(b)) or
        (not _isspace(a) and _isspace(b))):
        return 0

    # Exceptions to punctuation.
    if _match_ampersand(a, b):
        return 0.85
    # Penalize punctuation matching to non-punctuation.
    if _ispunc(a) and _ispunc(b):
        return 0.95
    if ((_ispunc(a) and not _ispunc(b)) or
        (not _ispunc(a) and _ispunc(b))):
        return 0

    # Problems with phonetic match functions segfaulting on
    # empty strings. Also beneficial to match strings with
    # no alpha characters to each other (e.g., line numbers).
    a_alpha = u''.join([ c for c in a if c.isalpha() ])
    b_alpha = u''.join([ c for c in b if c.isalpha() ])
    if a_alpha == '' and b_alpha == '':
        return 0.85

    # Strings sound alike (approximate phonetic match).
    if jf.match_rating_comparison(a_alpha, b_alpha):
        return 0.9
    if jf.metaphone(a_alpha) == jf.metaphone(b_alpha):
        return 0.9
    if jf.soundex(a_alpha) == jf.soundex(b_alpha):
        return 0.9
    if jf.nysiis(a_alpha) == jf.nysiis(b_alpha):
        return 0.9

    # Use scaled Jaro-Winkler distance.
    return jf.jaro_winkler(a, b)
Пример #36
0
    def correct(self, wrongWord):
        """Rank dictionary words as corrections for ``wrongWord``.

        Candidates are all words sharing at least one gram with
        ``wrongWord`` in ``SpellChecker.invertTriMap``; the word itself
        and words whose length differs by more than 2 are skipped.  Each
        candidate's score combines gram similarity, relative word
        frequency, edit distance and the jellyfish Jaro value, plus 0.1
        for each phonetic encoding on which both words agree.

        Returns the sorted list of ``ScoreRcd`` objects.
        """
        pool = []
        for trigram in self.getGrams(wrongWord, SpellChecker.invertMapGram):
            if trigram in SpellChecker.invertTriMap:
                pool.extend(SpellChecker.invertTriMap[trigram])

        wrong_grams_len = len(wrongWord)
        candidateDistList = []
        for candidate in list(set(pool)):
            too_different = abs(len(candidate) - wrong_grams_len) > 2
            if too_different or wrongWord == candidate:
                continue

            ed = self.compED(candidate, wrongWord)
            jd = jellyfish.jaro_distance(wrongWord, candidate)
            gd = self.getJackSim(
                self.getGrams(candidate, SpellChecker.jackardGram),
                self.getGrams(wrongWord, SpellChecker.jackardGram))
            score = gd * SpellChecker.dictCountMap[
                candidate] / SpellChecker.totalCount * (1 /
                                                        (ed + 1)) * (1 /
                                                                     (jd + 1))

            # Bonus for every phonetic encoding on which the words agree.
            for encode in (jellyfish.metaphone, jellyfish.soundex,
                           jellyfish.nysiis, jellyfish.match_rating_codex):
                if encode(wrongWord) == encode(candidate):
                    score = score + 0.1

            candidateDistList.append(ScoreRcd(candidate, ed, score))

        candidateDistList.sort()
        return candidateDistList
Пример #37
0
def get_hash(word, hash_type):
    """Encode ``word`` with the phonetic scheme named by ``hash_type``.

    :param word: the word to encode
    :param hash_type: one of "SOUNDEX", "NYSIIS", "MRA" or "METAPHONE"
    :return: whatever the matching ``jellyfish`` encoder returns for ``word``
    :raises NotImplementedError: if ``hash_type`` is none of the names above
    """
    if hash_type == "SOUNDEX":
        return jellyfish.soundex(word)
    if hash_type == "NYSIIS":
        return jellyfish.nysiis(word)
    if hash_type == "MRA":
        return jellyfish.match_rating_codex(word)
    if hash_type == "METAPHONE":
        return jellyfish.metaphone(word)

    message = "approach '{}' not implemented".format(hash_type)
    raise NotImplementedError(message)
Пример #38
0
    def test_soundex(self):
        """Compare ``jellyfish.soundex`` output with known word/code pairs."""
        # Includes the empty string, a single letter and an accented word.
        expected_codes = (
            ("Washington", "W252"),
            ("Lee", "L000"),
            ("Gutierrez", "G362"),
            ("Pfister", "P236"),
            ("Jackson", "J250"),
            ("Tymczak", "T522"),
            ("", ""),
            ("A", "A000"),
            (u"Çáŕẗéř", "C636"),
        )

        for word, expected in expected_codes:
            actual = jellyfish.soundex(word)
            self.assertEqual(actual, expected)
Пример #39
0
 def transform(self, data):
     """Return ``soundex`` of ``data`` when ``data`` is a string.

     Any other input yields ``None``.
     """
     if not isinstance(data, basestring):
         return None
     text = unicode(data)
     return soundex(text)
Пример #40
0
def _common_prefix_len(first, second):
    """Return how many leading characters ``first`` and ``second`` share."""
    shared = 0
    for ch_first, ch_second in zip(first, second):
        if ch_first != ch_second:
            break
        shared += 1
    return shared


def extract_feature(name, standard):
    """Build the comparison features for one (name, standard) record.

    The returned list holds ten entries, in this order:

    1. bool -- the first two characters are equal
    2. bool -- the last two characters are equal
    3. bool -- both strings have the same length
    4. int  -- absolute difference of the two lengths
    5. int  -- length of the common prefix
    6. int  -- length of the common suffix
    7. bool -- the jellyfish Soundex codes are equal
    8. bool -- the jellyfish Metaphone codes are equal
    9. bool -- the two words share a non-empty double-metaphone code
    10. int -- length of ``longest_common_substring(name, standard)``

    For ``extract_feature('ARINCK', 'AAFTINK')`` the first six entries are
    ``[False, False, False, 1, 1, 1]``; the rest depend on the encoders.

    :param name: the observed name
    :param standard: the standard name it is compared with
    :return: the feature list, or ``[]`` when either argument is empty
    """
    if not name or not standard:
        return []

    # Imported at call time, as before, so the module itself can be loaded
    # without these packages being importable.
    import jellyfish
    from preModules import metaphone
    from modules.basic_modules.basic import longest_common_substring

    f_list = []  # features list

    # f1 / f2: first two and last two characters match.
    f_list.append(name[:2] == standard[:2])
    f_list.append(name[-2:] == standard[-2:])

    # f3 / f4: same length, and absolute length difference.
    f_list.append(len(name) == len(standard))
    f_list.append(abs(len(name) - len(standard)))

    # f5: length of the common prefix.  Counting matching pairs also gives
    # the right answer when one word is a complete prefix of the other (the
    # former index loop reported one character too few in that case).
    f_list.append(_common_prefix_len(name, standard))

    # f6: length of the common suffix, i.e. the common prefix of the
    # reversed words (same full-match correction as f5).
    f_list.append(_common_prefix_len(name[::-1], standard[::-1]))

    # f7 / f8: Soundex and Metaphone codes match.
    f_list.append(jellyfish.soundex(name) == jellyfish.soundex(standard))
    f_list.append(jellyfish.metaphone(name) == jellyfish.metaphone(standard))

    # f9: any non-empty double-metaphone code in common.  Empty codes are
    # dropped first so that two missing secondary codes never count as a hit.
    standard_codes = [
        code for code in metaphone.doublemetaphone(standard) if code
    ]
    f_list.append(any(
        code in standard_codes
        for code in metaphone.doublemetaphone(name)
        if code
    ))

    # f10: length of the longest common substring.
    f_list.append(len(longest_common_substring(name, standard)))

    return f_list
import jellyfish

# Rough homophone check (not very accurate): two words count as homophones
# when their Metaphone codes match or, failing that, their Soundex codes do.
x, y = input("Enter two words : ").split()
sounds_alike = (
    jellyfish.metaphone(x) == jellyfish.metaphone(y)
    or jellyfish.soundex(x) == jellyfish.soundex(y)
)
print("Homophones !" if sounds_alike else "Not Homophones !")
'''
#check difference between two words
#returns number of changes
print(jellyfish.levenshtein_distance(x,y))
'''
#     Jaro Distance
#     Jaro-Winkler Distance
#     Match Rating Approach Comparison
#     Hamming Distance

# Phonetic encoding:
#     American Soundex
#     Metaphone
#     NYSIIS (New York State Identification and Intelligence System)
#     Match Rating Codex
import jellyfish
# String comparison measures on a pair of near-identical words.
base, variant = 'jellyfish', 'smellyfish'
print(jellyfish.levenshtein_distance(base, variant))  # 2; edit distance
print(jellyfish.jaro_distance(base, variant))  # 0.89629629629629637
# 1; edit distance that also allows swapping two adjacent characters
print(jellyfish.damerau_levenshtein_distance(base, 'jellyfihs'))

# Phonetic encodings of a single word.
sample = 'Jellyfish'
print(jellyfish.metaphone(sample))  # 'JLFX'
print(jellyfish.soundex(sample))  # 'J412'
print(jellyfish.nysiis(sample))  # 'JALYF'
print(jellyfish.match_rating_codex(sample))  # 'JLLFSH'

##################################################################
## Levenshtein
import Levenshtein
# Sample pair shared by most of the calls below.
s1, s2 = 'hello', 'helol'

# Hamming distance: the number of positions at which two equal-length
# strings differ; both arguments must have the same length.
print(Levenshtein.hamming(s1, s2))  # 2

# Edit distance (also called Levenshtein distance): the minimum number of
# insert, delete and substitute operations that turn one string into the
# other.
print(Levenshtein.distance(s1, s2))  # 2
print(Levenshtein.distance('hello world asdf', 'helolaaaa world asdf'))  # 5

# Levenshtein ratio: r = (sum - ldist) / sum, where sum is the combined
# length of both strings and ldist is an edit-like distance.
# Note: ldist is not the plain edit distance above.  There every operation
# costs 1; here insert and delete still cost 1 but a substitution costs 2.
# Rationale: for ratio('a', 'c') sum is 2, and the plain edit distance would
# give (2 - 1) / 2 = 0.5 although 'a' and 'c' have nothing in common, which
# is clearly unreasonable; charging 2 for a substitution resolves that.
print(Levenshtein.ratio(s1, s2))  # 0.8

# Jaro distance; used for health census data.
print(Levenshtein.jaro(s1, s2))  # 0.9333333333333332
# Jaro-Winkler distance.
print(Levenshtein.jaro_winkler(s1, s2))  # 0.9533333333333333
import jellyfish
# String comparison measures.  print(...) with one parenthesised argument
# produces the same output on Python 2 and Python 3, whereas the bare print
# statement is a syntax error on Python 3.
print(jellyfish.levenshtein_distance('jellyfish', 'smellyfish'))
# 2
print(jellyfish.jaro_distance('jellyfish', 'smellyfish'))
# 0.89629629629629637
print(jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs'))
# 1

# Phonetic encodings of the same word.
print(jellyfish.metaphone('Jellyfish'))
# 'JLFX'
print(jellyfish.soundex('Jellyfish'))
# 'J412'
print(jellyfish.nysiis('Jellyfish'))
# 'JALYF'
print(jellyfish.match_rating_codex('Jellyfish'))
# 'JLLFSH'
import jellyfish
# String comparison measures.  print(...) with one parenthesised argument
# produces the same output on Python 2 and Python 3, whereas the bare print
# statement is a syntax error on Python 3.
print(jellyfish.levenshtein_distance('jellyfish', 'smellyfish'))
# 2
print(jellyfish.jaro_distance('jellyfish', 'smellyfish'))
# 0.89629629629629637
print(jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs'))
# 1

# Phonetic encodings of the same word.
print(jellyfish.metaphone('Jellyfish'))
# 'JLFX'
print(jellyfish.soundex('Jellyfish'))
# 'J412'
print(jellyfish.nysiis('Jellyfish'))
# 'JALYF'
print(jellyfish.match_rating_codex('Jellyfish'))