Example #1
def compare_distance(self, other):
    '''Return the Jaro-Winkler similarity of the two records' names,
    but only when their addresses already match closely.'''
    jarowinkler = JaroWinkler()
    dist_1 = jarowinkler.similarity(str.lower(self.data['name']),
                                    str.lower(other.data['name']))
    dist_2 = jarowinkler.similarity(str.lower(self.data['address']),
                                    str.lower(other.data['address']))
    if dist_2 > 0.85:
        return dist_1
    return 0
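The method is clearly meant to live on a record-like object whose data dict holds 'name' and 'address' keys. A minimal sketch of such a wrapper, where the Record class and the sample values are invented for illustration, could be:

from strsimpy.jaro_winkler import JaroWinkler  # needed by compare_distance above


class Record:
    def __init__(self, name, address):
        self.data = {'name': name, 'address': address}


a = Record('ACME Corp.', '12 Main Street')
b = Record('Acme Corporation', '12 main street')
# Works because compare_distance above is a plain function here.
print(compare_distance(a, b))  # name similarity, since the addresses match closely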
Example #2
def check_for_text_match(self, str1, strlist):
    jarowinkler = JaroWinkler()
    # First pass: case-sensitive comparison against every candidate.
    similarities = [jarowinkler.similarity(str1, str2) for str2 in strlist]
    index_max = np.argmax(similarities)
    if similarities[index_max] >= 0.70:
        return strlist[index_max]
    # Second pass: retry case-insensitively.
    similarities = [jarowinkler.similarity(str1.lower(), str2.lower())
                    for str2 in strlist]
    index_max = np.argmax(similarities)
    if similarities[index_max] >= 0.70:
        return strlist[index_max]
    return None
Example #3
async def similarity(username: str, password: str):
    # Accept the password only if it is not too similar to the username.
    jw = JaroWinkler()
    res = jw.similarity(username, password)
    return res <= 0.6
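The coroutine above appears to accept a password only when it is sufficiently different from the username. A small hypothetical driver, with made-up credentials:

import asyncio
from strsimpy.jaro_winkler import JaroWinkler  # needed by similarity() above


async def main():
    print(await similarity('alice', 'hunter2'))  # dissimilar strings -> True
    print(await similarity('alice', 'alice1'))   # near-identical strings -> False


asyncio.run(main())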
Example #4
def jaro_winkler(keyword, domain):
    """Compute the Jaro-Winkler similarity between a keyword and a domain.

    Args:
        keyword: (str) keyword to compare
        domain: (str) domain name to compare against

    Returns:
        jarowinkler.similarity: (float) between 0.0 and 1.0

    """
    jarowinkler = JaroWinkler()
    return jarowinkler.similarity(keyword, domain)
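A quick, made-up usage sketch (assuming JaroWinkler has been imported from strsimpy):

print(jaro_winkler('paypal', 'paypa1'))   # high similarity for a lookalike string
print(jaro_winkler('paypal', 'example'))  # noticeably lower for an unrelated string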
Example #5
#     normalized_levenshtein.distance(str1a, str1b),
#     normalized_levenshtein.distance(str2a, str2b),
#     normalized_levenshtein.distance(str3a, str3b)
# ],
# 'Damerau': [
#     damerau.distance(str1a, str1b),
#     damerau.distance(str2a, str2b),
#     damerau.distance(str3a, str3b)
# ],
# 'OptimalStringAlignment': [
#     optimal_string_alignment.distance(str1a, str1b),
#     optimal_string_alignment.distance(str2a, str2b),
#     optimal_string_alignment.distance(str3a, str3b)
# ],
'JaroWinkler':
[jarowinkler.similarity(strA, strB) for strA, strB in stringsAB],
# 'LongestCommonSubsequence': [
#     lcs.distance(str1a, str1b),
#     lcs.distance(str2a, str2b),
#     lcs.distance(str3a, str3b)
# ],
# 'MetricLCS': [
#     metric_lcs.distance(str1a, str1b),
#     metric_lcs.distance(str2a, str2b),
#     metric_lcs.distance(str3a, str3b)
# ],
'NGram(2)': [twogram.distance(strA, strB) for strA, strB in stringsAB],
# 'QGram(2)': [
#     qgram.distance(str1a, str1b),
#     qgram.distance(str2a, str2b),
#     qgram.distance(str3a, str3b)
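The fragment references objects defined elsewhere (jarowinkler, twogram, stringsAB). A minimal self-contained sketch of just the two uncommented entries, with invented sample pairs standing in for stringsAB, might look like:

from strsimpy.jaro_winkler import JaroWinkler
from strsimpy.ngram import NGram

jarowinkler = JaroWinkler()
twogram = NGram(2)

# Invented sample pairs standing in for stringsAB.
stringsAB = [('My string', 'My tsring'), ('ABCD', 'ABTUIO')]

scores = {
    'JaroWinkler': [jarowinkler.similarity(strA, strB) for strA, strB in stringsAB],
    'NGram(2)': [twogram.distance(strA, strB) for strA, strB in stringsAB],
}
print(scores)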
Example #6
else:
    network.get_track(artist, title).unlove()
    logger.write('   LOVE: Unloved the Song on LastFm\n')

############### String Compare ####
jarowinkler = JaroWinkler()

# Check so that this program doesn't scrobble the song multiple times.
if last_song[0] != title:
    last_scrobble = network.get_user(
        lastFmCreds['username']).get_recent_tracks(limit=1)

    logger.write('   LastFM: Last song was %s by %s\n' %
                 (last_scrobble[0][0].title, last_scrobble[0][0].artist))

    # Check that "nobody else" scrobbled the song already.
    if jarowinkler.similarity(str(last_scrobble[0][0].title.lower()),
                              title.lower()) < 0.9:
        unix_timestamp = int(time.mktime(datetime.datetime.now().timetuple()))
        if 'album' in locals():
            network.scrobble(artist=artist,
                             title=title,
                             timestamp=unix_timestamp,
                             album=album)
            network.update_now_playing(artist=artist, title=title, album=album)
        else:

            network.scrobble(artist=artist,
                             title=title,
                             timestamp=unix_timestamp)
            network.update_now_playing(artist=artist, title=title)
        logger.write('   Scrobbled %s by %s\n' % (title, artist))
        with open('last_song.json', 'w') as f:
Example #7
f = open("akcigerhastaligi.txt", encoding="utf8")
df = f.read()


def basic_clean(text):
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words('turkish')
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopwords]


words = basic_clean(df)
unigrams = nltk.ngrams(words, 1)
unigramsFrequency = Counter(unigrams)
valuesOfUnigrams = list(unigramsFrequency.values())
unigramlist = list(unigramsFrequency)

for x in range(0, len(unigramlist)):
    if (valuesOfUnigrams[x] > 4):
        print(unigramlist[x], "is used", valuesOfUnigrams[x], "times")

jarowinkler = JaroWinkler()
print(jarowinkler.similarity('öksürük', 'öksürk'))
print(jarowinkler.similarity('akciğer', 'akciğr'))
print(jarowinkler.similarity('kanser', 'akciğr'))
print(jarowinkler.similarity('kanser', 'öksürk'))
print(jarowinkler.similarity('akciğer', 'öksürk'))

print('öksürk     ' 'öksürük')
print('akciğr     ' 'akciğer')
print('kansr      ' 'kanser')
Example #8
def string_distance(a, b):
    # Combined score: Jaro-Winkler similarity plus a jaccard() helper defined elsewhere.
    jarowinkler = JaroWinkler()
    return jarowinkler.similarity(a, b) + jaccard(a, b)
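jaccard is not defined in the snippet; one plausible stand-in, using strsimpy's Jaccard class (the shingle size of 2 is an assumption), would be:

from strsimpy.jaccard import Jaccard

_jaccard = Jaccard(2)  # hypothetical helper; the 2-shingle size is an assumption


def jaccard(a, b):
    # Jaccard similarity over 2-character shingles.
    return _jaccard.similarity(a, b)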
Example #9
cosine = Cosine(2)
sorensenDice = SorensenDice(2)
jaccard = Jaccard(2)
qgram = QGram(2)
jaroWinkler = JaroWinkler()
normalizedLevenshtein = NormalizedLevenshtein()
stringSimilarity = StringDistance()
# Two Chinese institution names used as test strings.
s0 = '烟台大学人文学院'  # School of Humanities, Yantai University
s1 = '江西农业大学'  # Jiangxi Agricultural University
# print(cosine.get_profile(s1))
# print(sorensenDice.get_profile(s1))
print(cosine.similarity(s0, s1))
print(sorensenDice.similarity(s0, s1))
print(jaccard.similarity(s0, s1))
print(jaroWinkler.similarity(s0, s1))
print(normalizedLevenshtein.similarity(s0, s1))
# print(qgram.distance(s0, s1))
similarity_list = [jaroWinkler, cosine, jaccard, normalizedLevenshtein]

er_process_with_similarity(path_o1, path_o2, path_t,
                           'result_cn_with_similarity', similarity_list)
# er_process(path_o, path_t, 'result_cn', segment)

# seg = seg.iterator()
# tmp_list = []
# for i in seg:
#     tmp_list.append(str(i))
#
# print(tmp_list)
#
Example #10
from strsimpy.damerau import Damerau
from strsimpy.jaro_winkler import JaroWinkler
from strsimpy.ngram import NGram
from strsimpy.normalized_levenshtein import NormalizedLevenshtein
from strsimpy.optimal_string_alignment import OptimalStringAlignment
from strsimpy.qgram import QGram

qgram = QGram(2)
print(qgram.distance('ABCD', 'ABCE'))

twogram = NGram(2)
print(twogram.distance('ABCD', 'ABTUIO'))

s1 = 'Adobe CreativeSuite 5 Master Collection from cheap 4zp'
s2 = 'Adobe CreativeSuite 5 Master Collection from cheap d1x'
fourgram = NGram(4)
print(fourgram.distance(s1, s2))

jarowinkler = JaroWinkler()
print(jarowinkler.similarity('My string', 'My tsring'))
print(jarowinkler.similarity('My string', 'My ntrisg'))

optimal_string_alignment = OptimalStringAlignment()
print(optimal_string_alignment.distance('CA', 'ABC'))

damerau = Damerau()
print(damerau.distance('ABCDEF', 'ABDCEF'))
print(damerau.distance('ABCDEF', 'BACDFE'))
print(damerau.distance('ABCDEF', 'ABCDE'))
print(damerau.distance('ABCDEF', 'BCDEF'))
print(damerau.distance('ABCDEF', 'ABCGDEF'))
print(damerau.distance('ABCDEF', 'POIU'))

normalized_levenshtein = NormalizedLevenshtein()
print(normalized_levenshtein.distance('My string', 'My $string'))