Example #1
File: names.py Project: qianrenjian/beard
def phonetic_tokenize_name(name, phonetic_algorithm="double_metaphone"):
    """Create Double Metaphone tokens from the string.

     Parameters
    ----------
    :param name: string
        Name of the author. Usually it should be in the format:
        surnames, first names.

    :param phonetic algorithm: string
        Which phonetic algorithm will be used. Options:
        -  "double_metaphone"
        -  "nysiis"
        -  "soundex"

    Returns
    -------
    :return: tuple
        The first element is a tuple with the tokens for surnames, the second
        is a tuple with the tokens for first names. The tuple always contains
        exactly two elements. Only the first results of the double metaphone
        algorithm are included in tuples.
    """
    if phonetic_algorithm == "soundex":
        error = (
            "The version of the 'fuzzy' package in use has a buggy soundex"
            " implementation (see https://github.com/yougov/fuzzy/issues/14 ),"
            " downgrade the package to 1.1 (compatible with Python 2 only) if"
            " you want to use the soundex phonetic encoding.")
        try:
            if fuzzy.Soundex(4)("fuzzy") != "F200":
                raise ValueError(error)
        except UnicodeDecodeError:
            raise ValueError(error)

    dm = fuzzy.DMetaphone()
    soundex = fuzzy.Soundex(5)
    phonetic_algorithms = {
        "double_metaphone": lambda y: (dm(y)[0] or b'').decode(),
        "nysiis": lambda y: fuzzy.nysiis(y),
        "soundex": lambda y: soundex(y)
    }

    tokens = tokenize_name(name)
    # Use double metaphone
    tokens = tuple(
        map(
            lambda x: tuple(
                map(lambda y: phonetic_algorithms[phonetic_algorithm](y), x)),
            tokens))

    return tokens
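A minimal usage sketch, assuming the fuzzy package and beard's tokenize_name are importable; the exact codes depend on the installed fuzzy version:

surname_tokens, first_name_tokens = phonetic_tokenize_name("Dupont, René")
print(surname_tokens, first_name_tokens)  # e.g. (('TPNT',), ('RN',))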
Example #2
def match_citations_with_papers(papers):
    """
        Parameters:
            papers: JSON representing the papers. Each paper has a list of references.
        Returns: JSON representing edges in the citation network.

        Uses fuzzy matching to compare paper titles to the reference titles. If there
        is a match then an edge is added to the edge list. This is used to build the
        citations table.
    """
    edges = []
    soundex = fuzzy.Soundex(25)
    i = 1
    for source in papers:
        print "processing paper", i
        i += 1
        for reference in source['references']:
            for target in papers:
                s = remove_stopwords(reference.encode('UTF-8'))
                t = remove_stopwords(target['title'].encode('UTF-8'))
                if soundex(s) == soundex(t):
                    edges.append({
                        'source': source['doi'],
                        'target': target['doi']
                    })
    return edges
Example #3
def seq_matcher(name1, name2):
    # name1 is assumed to already be unicode; name2 arrives as UTF-8 bytes
    # and is decoded first.
    name1 = unicode(
        unicodedata.normalize('NFKD', name1).encode('ascii', 'ignore'),
        'utf-8')
    name2 = unicode(name2, 'utf-8')
    name2 = unicode(
        unicodedata.normalize('NFKD', name2).encode('ascii', 'ignore'),
        'utf-8')

    soundex = fuzzy.Soundex(4)
    name1 = soundex(name1)
    name2 = soundex(name2)

    # dmeta = fuzzy.DMetaphone()
    # name1 = dmeta(name1)[0]
    # name2 = dmeta(name2)[0]

    # name1 = fuzzy.nysiis(name1)
    # name2 = fuzzy.nysiis(name2)

    m = SequenceMatcher(None, name1, name2)
    # Calculate an edit distance
    # print 'm', m.ratio()
    e = editdist.distance(name1, name2)
    # print 'e',e
    sm = StringMatcher(seq1=name1, seq2=name2)
    # return e
    # print sm.distance()
    return sm.distance()
Example #4
    def __diffsoundex(self, query, name, value=4):
        soundex = fuzzy.Soundex(value)
        a = soundex(name)
        b = soundex(query)
        # Compare the numeric tails of the two Soundex codes; add a large
        # penalty when the leading letters differ.
        if a[0] == b[0]:
            return abs(int(a[1:]) - int(b[1:]))
        else:
            return abs(int(a[1:]) - int(b[1:])) + 250
Example #5
def test_phonetic_tokenize_name_python2():
    """Test checking if custom phonetic algorithms from fuzzy packages work."""
    import fuzzy
    soundex = fuzzy.Soundex(5)
    assert phonetic_tokenize_name("Dupont, René", "nysiis") == (
        ((fuzzy.nysiis(u"Dupont"),), (fuzzy.nysiis(u"René"),)))
    assert phonetic_tokenize_name("Dupont, René", "soundex") == (
        # no direct support for unicode in soundex, thus "Rene"
        ((soundex(u"Dupont"),), (soundex(u"Rene"),)))
Example #6
def get_soundex_dict(idf_dict):
    """
    This method precomputes the Soundex code of every unique token in the corpus.
    """
    soundex_dict = {}
    soundex = fuzzy.Soundex(4)
    for term in idf_dict:
        soundex_dict[term] = soundex(term)
    return soundex_dict
Example #7
def compare(input_list, keywords_dictionary, word_weights):
    # Load phonetics functions
    dmeta = fuzzy.DMetaphone()
    metaphone = lambda x: dmeta(x)[0]
    soundex = fuzzy.Soundex(4)
    phonetics_methods = [metaphone, soundex]

    # initiate empty dictionary for scores
    scores = {}

    # Iterate through methods for solving, then iterate through words in
    # scrubbed user input. For each word, compare phonetics to all keywords
    # and add score to the scores dictionary. After, do normal QWERTY and LD
    # analyses
    for method, keywords in keywords_dictionary.iteritems():
        scores[method] = 0
        # print(method)
        # Phonetic Scoring methods
        for phonetic in phonetics_methods:
            formatted_array = np.asarray(map(phonetic, keywords))

            for word in input_list:
                formatted_word = phonetic(word)
                dist_array = \
                    normalized_damerau_levenshtein_distance_withNPArray(
                        formatted_word, formatted_array)

                dist = min(dist_array)

                # Handle cases where "not" was found within the input - add to
                # scores dictionary.
                weight = word_weights.get(word) or 1

                scores[method] += weight * math.sqrt(dist)

        # For QWERTY and Damerau-Levenshtein distances, calculate the differences
        for word in input_list:
            # Do QWERTY Keyboard analysis
            dist_array = normalized_keyboard_word_distance_withNPArray(
                word, keywords)
            dist = min(dist_array)

            # handle weighting for position from "not"
            weight = word_weights.get(word) or 1
            scores[method] += weight * math.sqrt(dist)

            # Do normal LD analysis
            dist_array = normalized_damerau_levenshtein_distance_withNPArray(
                word, np.asarray(keywords))
            dist = min(dist_array)

            weight = word_weights.get(word) or 1
            scores[method] += weight * math.sqrt(dist)

    return scores
Example #8
def phonetic_matching(s1, s2):
    """Computing the phonetic sound of 2 strings and using LD to compute its
    similarity.

    :param s1: First string.
    :param s2: A second string.
    :returns: The LD between the 2 string phonetic representations.
    """
    soundex = fuzzy.Soundex(4)
    return levenshtein_distance(soundex(s1), soundex(s2))
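A hedged usage sketch, assuming levenshtein_distance is python-Levenshtein's distance function (the snippet itself does not show its imports):

import fuzzy
from Levenshtein import distance as levenshtein_distance

print(phonetic_matching("Robert", "Rupert"))  # 0: both encode to R163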
Example #9
def using_soundex():
    soundex = fuzzy.Soundex(4)
    soundex_predics = []
    # 'misspell' and 'dict' are module-level word lists in the original
    # script; note that 'dict' shadows the builtin.
    for element in misspell:
        element_code = soundex(element)
        temp = [elem for elem in dict if soundex(elem) == element_code]
        soundex_predics.append(temp)
    return soundex_predics
Example #10
    def generateSoundexHash(self, dictionary, table=None):
        soundexHash = {} if table is None else table
        soundex = fuzzy.Soundex(4)

        for name, gender in dictionary.iteritems():
            name = self._sanitizeName(name)

            if len(name) > 1:
                soundhash = soundex(name)
                self._appendToDict(soundhash, gender, soundexHash)

        return soundexHash
Example #11
def checkIfSongExists(curr_song, songs_list):
    retVal = False
    matched_song = ""
    song_name = curr_song['name']
    for s in songs_list:
        #print song
        song = songs_list[s]['name']
        if (len(song) > len(song_name)):
            soundex = fuzzy.Soundex(len(song))
        else:
            soundex = fuzzy.Soundex(len(song_name))
        phonetic_distance = fuzz.ratio(soundex(song), soundex(song_name))
        if ('(' in song.lower() and '(' in song_name.lower()):
            parmatch, tryagain = getparanthesismatch(song.lower(),
                                                     song_name.lower())
            if parmatch:
                if (curr_song['artistName'].lower()
                        == songs_list[s]['artistName'].lower() and
                        checkFtArtist(curr_song['featArtists'],
                                      songs_list[s]['featArtists'])):
                    retVal = True
                    #print song_name + ' -------------- ' + song
                    #print "paranthesis match"
                    matched_song = s
                    break
        normal_distance = fuzz.ratio(song.lower(), song_name.lower())
        if (phonetic_distance >= 90 and normal_distance >= 85):
            if (curr_song['artistName'].lower() !=
                    songs_list[s]['artistName'].lower()):
                continue
            if not checkFtArtist(curr_song['featArtists'],
                                 songs_list[s]['featArtists']):
                continue
            retVal = True
            #print song_name + ' -------------- ' + song
            #print songs_list[s]['year']
            #print curr_song['year']
            #print str(phonectic_distance) + " ######### " + str(normal_distance)
            matched_song = s
            break
    return retVal, matched_song
Example #12
class Validator:

    soundex = fuzzy.Soundex(4)

    def __init__(self, key_sentences: dict):
        self.key_sentences = key_sentences

    def validate_phonetic_similarities(self, input_text: str,
                                       key_sentence: str):
        key_length = len(key_sentence)
        if key_length <= 5:
            levenshtein_max_op = 2
        else:
            levenshtein_max_op = 4

        if Levenshtein.distance(input_text,
                                key_sentence) <= levenshtein_max_op:
            return True

        # if self.soundex(key_sentence) == self.soundex(input_text):
        #     return True

        return False

    def validate_wakeupword(self, input_sentence: str):
        self.wake_up_word = self.key_sentences.get("wake_up_word")
        return self.validate_phonetic_similarities(input_sentence,
                                                   self.wake_up_word)

    def validate_input(self, input_text):
        if not isinstance(input_text, str):
            return DataProcessingResult(success=False,
                                        is_wake_up_word=False,
                                        sentence=input_text,
                                        guess="Invalid")

        if self.validate_wakeupword(input_text):
            return DataProcessingResult(success=True,
                                        is_wake_up_word=True,
                                        sentence=input_text,
                                        guess=self.wake_up_word)

        for key_sentence in self.key_sentences.get("commands"):
            if self.validate_phonetic_similarities(input_text, key_sentence):
                return DataProcessingResult(success=True,
                                            is_wake_up_word=False,
                                            sentence=input_text,
                                            guess=key_sentence)

        return DataProcessingResult(success=False,
                                    is_wake_up_word=False,
                                    sentence=input_text,
                                    guess="No guess")
Example #13
    def __init__(self, token=None, email=None, password=None):
        self.soundex = fuzzy.Soundex(4)
        self.token = token
        self.email = email
        self.password = password
        self.access_token = None
        self.circles = {}
        self.people = {}
        self.scan = False
        self.delay = 10
        if self.token is None:
            self.setDefaultToken()
Example #14
def tokens_to_vocab(data, phonetic_encoding="none"):
    """
    Used for token level string edit distance or phonetic encoding.
    Transforms tokens to vocabulary (t1 = a, t2 = b, t3 = c...)
    :param data: matrix, (data_size by 3) one line is [s1 \t s2 \t label], where s1 is made of tokens t1 t2 t3...
    :param labels: Ground truth (whether or not these two strings refer to the same entity)
    :param phonetic_encoding: name of phonetic encoding for string being used.
        "soundex": Soundex encoding
        "nysiis": Nysiis encoding
        "LCS": Longest common subsequence.
        Anything else: Program will shut down.
    :Returns array of size
    :return: (data_size by 3) where each line is s1 \t s2 \t label where s1/2 are encoded appropriately.
    """

    new_data = []
    if phonetic_encoding == "soundex":
        soundex = fuzzy.Soundex(4)
    ct = 0
    for line in data:
        ct += 1
        if ct % 10000 == 0:
            print("{} tokenized!".format(ct))
        tokens_made = []
        new_strings = ["", ""]
        for idx in [0, 1]:
            phrase = line.rstrip().split("\t")[idx]
            for token in phrase.split(" "):
                # PHONETIC ENCODING WILL ONLY WORK WITH STRING EDIT DISTANCE
                if phonetic_encoding == "nysiis":
                    token = fuzzy.nysiis(token)
                elif phonetic_encoding == "soundex":
                    try:
                        token = soundex(token)
                    except Exception:
                        print("Could not apply soundex to {}".format(token))
                else:
                    print("Provide phonetic encoding")
                    sys.exit(1)
                if token not in tokens_made:
                    tokens_made.append(token)
                if len(tokens_made) > 100:
                    print("TOO LONG!", tokens_made)
                new_strings[idx] += chr(ord("!") + tokens_made.index(token))
            # Encode once per string, after all tokens have been mapped.
            new_strings[idx] = new_strings[idx].encode("UTF-8")
        new_data.append(new_strings)
    return new_data
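A small usage sketch; the tab-separated input follows the docstring's format, and the pair of made-up name strings is illustrative only:

sample = ["john smith\tjon smyth\t1"]
print(tokens_to_vocab(sample, phonetic_encoding="nysiis"))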
Example #15
    def determineFromSoundex(self, firstName):
        hashTable = {}
        self.generateSoundexHash(self.firstDict, hashTable)
        self.generateSoundexHash(self.secondDict, hashTable)

        firstName = self._sanitizeName(firstName)
        nameHash = fuzzy.Soundex(4)(firstName)

        if nameHash in hashTable:
            results = hashTable[nameHash]
            gender  = max(results, key=results.get)

            if results[gender] > 0:
                return gender

        return self.options['unknown']
Example #16
def phonetic_tokenize_name(name, phonetic_algorithm="double_metaphone"):
    """Create Double Metaphone tokens from the string.

     Parameters
    ----------
    :param name: string
        Name of the author. Usually it should be in the format:
        surnames, first names.

    :param phonetic algorithm: string
        Which phonetic algorithm will be used. Options:
        -  "double_metaphone"
        -  "nysiis" (only for Python 2)
        -  "soundex" (only for Python 2)

    Returns
    -------
    :return: tuple
        The first element is a tuple with the tokens for surnames, the second
        is a tuple with the tokens for first names. The tuple always contains
        exactly two elements. Only the first results of the double metaphone
        algorithm are included in tuples.
    """
    if sys.version[0] == '2':
        import fuzzy
        dm = fuzzy.DMetaphone()
        soundex = fuzzy.Soundex(5)
        phonetic_algorithms = {
            "double_metaphone": lambda y: dm(y)[0] or '',
            "nysiis": lambda y: fuzzy.nysiis(y),
            "soundex": lambda y: soundex(y)
        }
    else:
        from ..ext.metaphone import dm
        phonetic_algorithms = {"double_metaphone": lambda y: dm(y)[0]}

    tokens = tokenize_name(name)
    # Use double metaphone
    tokens = tuple(
        map(
            lambda x: tuple(
                map(lambda y: phonetic_algorithms[phonetic_algorithm](y), x)),
            tokens))

    return tokens
Example #17
def main():
    dictFp = open(sys.argv[1], 'r')
    inputFp = open(sys.argv[2], 'r')
    soundex = fuzzy.Soundex(5)

    d = []
    #print("Dictionary Loading")
    for line in dictFp:
        d.append(line.strip())

    d = sorted(d)
    s = []

    for a in d:
        s.append(soundex(a))

    #print("Dictionary Loaded")

    done = set()

    for line in inputFp:
        uid, tid, tweet, date = line.split('\t')

        words = tweet.split()
        for word in words:
            if(word in done):
                continue
            done.add(word)
            sdex = soundex(word)

            output = []
            for i in range(len(s)):
                if(s[i] == sdex):
                    output.append(d[i])

            if(len(output)):
                sys.stdout.write("{}: ".format(word))
                i = 0
                for w in output:
                    if w != word:
                        sys.stdout.write(w + " ")
                    i += 1
                    if(i == 10):
                        break
                print("")
Example #18
def match_soundex(token):
    dictSet = getDict()
    candidates = []
    candidatesG = []
    bestMatch = ""

    soundex = fuzzy.Soundex(4)
    soundex_token = soundex(token)

    candidates = [
        match for match in dictSet if soundex(match) == soundex_token
    ]

    if len(candidates) > 1:
        G = ngram.NGram(candidates)
        candidatesG = G.search(token)
        if len(candidatesG) > 0:
            bestMatch = candidatesG[0][0]
    elif len(candidates) == 1:
        bestMatch = candidates[0]

    return bestMatch, candidates, candidatesG
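A standalone sketch of the same lookup, with a made-up word set standing in for getDict():

import fuzzy
import ngram

dictSet = {"color", "colour", "collar", "cooler"}
token = "colur"

encode = fuzzy.Soundex(4)
candidates = [w for w in dictSet if encode(w) == encode(token)]
ranked = ngram.NGram(candidates).search(token)
print(candidates, ranked[0][0] if ranked else "")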
Example #19
def soundex(collection, zero=False):
    """
    Return a Soundex-encoded version of the collection.
    """

    import fuzzy
    from tqdm import tqdm

    soundex = fuzzy.Soundex(4)
    try:
        assert type(collection) == list
    except AssertionError:
        print("Input collection is not a list.")

    collectionEncoded = list()
    for word in tqdm(collection):
        wordEncoded = soundex(word)
        if not zero:    # Optional: remove 0s.
            wordEncoded = wordEncoded.strip('0')
        collectionEncoded.append(wordEncoded)

    return collectionEncoded
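Example call; tqdm only renders a progress bar here, and the sample names are illustrative:

print(soundex(["Robert", "Rupert"]))  # ['R163', 'R163']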
Example #20
def match_levenshtein_soundex(token):
    dictSet = getDict()
    candidates = []
    candidatesG = []
    bestMatch = ""
    minDistance = 3

    for item in dictSet:
        distance = Levenshtein.distance(token.lower(), item.lower())
        if distance == 0:
            return item, [], []
        elif distance < minDistance:
            minDistance = distance
            candidates = []
        if distance == minDistance:
            candidates.append(item.lower())

    soundex = fuzzy.Soundex(4)
    soundex_token = soundex(token)

    soundex_candidates = [
        match for match in candidates if soundex(match) == soundex_token
    ]

    if len(soundex_candidates) != 0:
        candidates = soundex_candidates

    if len(candidates) > 1:
        G = ngram.NGram(candidates)
        candidatesG = G.search(token)
        if len(candidatesG) > 0:
            bestMatch = candidatesG[0][0]
    elif len(candidates) == 1:
        bestMatch = candidates[0]

    return bestMatch, candidates, candidatesG
Example #21
File: index.py Project: kgpayne/book-tools
    def __init__(self, left_on, right_on=None, **kwargs):
        super().__init__(**kwargs)
        self.left_on = listify(left_on)
        self.right_on = listify(right_on) if right_on else self.left_on
        self.soundex = fuzzy.Soundex(4)
Example #22
def fingerprint_word(word):
    return "%s%02d" % (fuzzy.Soundex(5)(word)[1:], len(fuzzy.nysiis(word)))
Example #23
def test_soundex_does_not_mutate_strings():
    phrase = 'FancyFree'
    fuzzy.Soundex(4)(phrase)
    buffer = ctypes.c_char_p(phrase.encode())
    assert buffer.value.decode() == "FancyFree"
Example #24
def test_soundex_result():
    phrase = 'FancyFree'
    res = fuzzy.Soundex(4)(phrase)
    assert res == 'F521'
Example #25
def test_soundex_non_ascii():
    assert fuzzy.Soundex(8)('Jéroboam') == 'J615'
Example #26
def soundex(s, e=fuzzy.Soundex(4)):
    # Return a list to be like metaphone
    return [e(s)]
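Example call; the one-element list keeps the return shape compatible with metaphone-style encoders (fuzzy must already be imported for the default argument):

print(soundex("Knuth"))  # a one-element list such as ['K53']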
Example #27
def test_soundex_Test():
    assert fuzzy.Soundex(8)('Test') == 'T23'
Example #28
 def __init__(self):
     import fuzzy
     self.soundex = fuzzy.Soundex(4)
Example #29
#!/bin/python
import fuzzy

wf = open("test_sents.txt", 'r')
#lf = open("../hinglishData/lang_ids.txt", 'r')
sf = open("test_soundex.txt", 'w')
wlines = wf.readlines()
#llines = lf.readlines()

soundex = fuzzy.Soundex(4)
for i in range(len(wlines)):
    #wlines[i] = "leather bag , belt aur shoe dull dikhne lage ho , to oon par kela ka chhilka ragadne se unme chamak aa jati hain"
    x = wlines[i].strip().split()
    sx = []
    for el in x:
        sxcode = "x_x"
        try:
            sxcode = soundex(el)
            if not sxcode.strip():
                sxcode = "x_x"
        except Exception:
            sxcode = "x_x"
        sx.append(sxcode)
        #print el
        #print sxcode
        #raw_input()
    sx = " ".join(sx)
    sf.write(sx + "\n")
    #print wlines[i].strip()
    #print sx
    #print len(x)
Example #30
import fuzzy
# Soundex encoder that emits codes up to 10 characters long
soundex = fuzzy.Soundex(10)
# Text to process
word = 'phone'
print(soundex(word))

#Doc2vec
#stemming, lemmatization, n-grams, stop word removal etc

#Import packages
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

## Example document (list of sentences)
doc = ["I love data science",
        "I love coding in python",
        "I love building NLP tool",
        "This is a good phone",
        "This is a good TV",
        "This is a good laptop"]

# Tokenization of each document
tokenized_doc = []
for d in doc:
    tokenized_doc.append(word_tokenize(d.lower()))
print(tokenized_doc)

# Convert the tokenized documents into gensim-formatted tagged data
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)]
print(tagged_data)
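A minimal training sketch using the gensim 4.x API; the hyperparameters are illustrative only:

model = Doc2Vec(tagged_data, vector_size=20, window=2, min_count=1, epochs=50)

# Infer a vector for an unseen sentence and list the closest documents.
vector = model.infer_vector(word_tokenize("I love coding in python".lower()))
print(model.dv.most_similar([vector], topn=3))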