Пример #1
0
def keywordize(str):
    """
    Build the set of stemmed keywords found in a string.

    The string is tokenized with ``tokenize``; tokens that are neither purely
    alphabetic nor purely numeric are skipped, as are tokens whose lowercase
    form appears in ``stop_words``. Each remaining token is lowercased,
    encoded to ASCII (characters that cannot be encoded are dropped) and
    Porter-stemmed. Returning a set removes duplicates.
    """
    keywords = set()
    for token in tokenize(str):
        # Keep only purely alphabetic or purely numeric tokens.
        if not (token.isalpha() or token.isdigit()):
            continue
        lowered = token.lower()
        if lowered in stop_words:
            continue
        keywords.add(
            jellyfish.porter_stem(lowered.encode('ascii', 'ignore')))
    return keywords
Пример #2
0
def keywordize(str):
    """
    Reduce a string to its set of distinct, stemmed keywords.

    Tokens produced by ``tokenize`` are kept only when they consist entirely
    of letters or entirely of digits and their lowercase form is not listed
    in ``stop_words``. Kept tokens are lowercased, ASCII-encoded (ignoring
    characters that cannot be encoded) and passed through the Porter stemmer.
    """
    def _is_keyword(token):
        # Same filter as the stemming step expects: word-like and not a stopword.
        return ((token.isalpha() or token.isdigit())
                and token.lower() not in stop_words)

    return {
        jellyfish.porter_stem(token.lower().encode('ascii', 'ignore'))
        for token in tokenize(str)
        if _is_keyword(token)
    }
Пример #3
0
def porter_stemmer(d):
    """Stem every space-separated word of ``d`` with the Porter stemmer.

    Args:
        d: Text whose words are separated by single space characters.

    Returns:
        A string with each word replaced by ``jellyfish.porter_stem(word)``
        and the original spacing (including leading, trailing and repeated
        spaces) left intact.
    """
    # Splitting on the literal ' ' (rather than any whitespace) keeps the
    # separator the same as before. Empty pieces, which arise from
    # consecutive/leading/trailing spaces, are passed through untouched so
    # the stemmer only ever sees real words -- never a word with a leading
    # space, and never skipping the final word.
    return ' '.join(
        jellyfish.porter_stem(word) if word else word
        for word in d.split(' ')
    )
Пример #4
0
def main():
    """Print a short demonstration of several jellyfish functions.

    Runs the phonetic encoders, the Porter stemmer and the string comparison
    functions on a fixed pair of sample words and prints each result to
    standard output. Takes no arguments and returns nothing.
    """
    # declare test strings
    # rem: u prefix is required jellyfish convention
    str1 = u'Jellyfish'
    str2 = u'Smellyfish'

    # test Phonetic Encoding
    print('\nPhonetic Encoding ----------------------------')

    # Metaphone
    r1 = jellyfish.metaphone(str1)
    r2 = jellyfish.metaphone(str2)
    print('Metaphone: ', r1, ", ", r2)

    # American Soundex
    r1 = jellyfish.soundex(str1)
    r2 = jellyfish.soundex(str2)
    print('Soundex: ', r1, ", ", r2)

    # NYSIIS
    r1 = jellyfish.nysiis(str1)
    r2 = jellyfish.nysiis(str2)
    print('NYSIIS: ', r1, ", ", r2)

    # Match Rating Codex
    r1 = jellyfish.match_rating_codex(str1)
    r2 = jellyfish.match_rating_codex(str2)
    print('Match Rating Codex: ', r1, ", ", r2)

    # test Stemming
    print('\nStemming -------------------------------------')
    pStr1 = u'Jellyfished'
    pStr2 = u'Smellyfishing'
    # Stem the inflected sample words defined just above, not the base
    # strings used for the other tests.
    r1 = jellyfish.porter_stem(pStr1)
    r2 = jellyfish.porter_stem(pStr2)
    print('Porter Stemmer: ', r1, ", ", r2)

    # test String Comparison
    print('\nString Comparisons ---------------------------')

    # Levenshtein Distance
    r = jellyfish.levenshtein_distance(str1, str2)
    print('Levenshtein Distance: ', r)

    # Damerau-Levenshtein Distance
    r = jellyfish.damerau_levenshtein_distance(str1, str2)
    print('Damerau-Levenshtein Distance: ', r)

    # Each result below is stored in ``r`` so the following print shows the
    # value just computed rather than a stale one from an earlier test.

    # Hamming Distance
    r = jellyfish.hamming_distance(str1, str2)
    print('Hamming Distance: ', r)

    # Jaro Distance
    r = jellyfish.jaro_distance(str1, str2)
    print('Jaro Distance: ', r)

    # Jaro-Winkler Distance
    r = jellyfish.jaro_winkler(str1, str2)
    print('Jaro-Winkler Distance: ', r)

    # Match Rating Approach (comparison)
    r = jellyfish.match_rating_comparison(str1, str2)
    print('Match Rating Comparison: ', r)

    # end program
    print('Done.')
Пример #5
0
 def test_porter_stem(self):
     """Check porter_stem against every (word, expected stem) row of
     'porter-test.csv', comparing both sides in lowercase."""
     with open('porter-test.csv') as handle:
         for row in csv.reader(handle):
             # Each row is expected to hold exactly two columns.
             word, expected = row
             stemmed = jellyfish.porter_stem(word.lower())
             self.assertEqual(stemmed, expected.lower())
Пример #6
0
def porter_stem(s):
    """Return the Porter stem of ``s``, passing ``None`` through unchanged.

    Args:
        s: The word to stem, or ``None``.

    Returns:
        ``None`` when ``s`` is ``None``; otherwise ``J.porter_stem(s)``.
    """
    # Identity check: ``== None`` can be hijacked by a custom ``__eq__``.
    if s is None:
        return None
    return J.porter_stem(s)
Пример #7
0
import jellyfish

# Demo: stem each space-separated word of a sample sentence and print the
# sentence before and after.
s = 'My doctors are both single students who like sandwiches.'

print('Before: ', s)

# Split on spaces so each word reaches the stemmer without a leading space
# and the final word (which has no space after it) is processed too.
# Punctuation is not stripped: the last token is passed as 'sandwiches.'.
stemmed_words = []
for word in s.split(' '):
    print('Stemming ', word)
    stemmed_words.append(jellyfish.porter_stem(word))
nu_s = ' '.join(stemmed_words)

print('After: ', nu_s)

print('Done.')