예제 #1
0
 def jaro_sim(self):
     """Collect the pairs of group members that are similar enough.

     Every member of ``self.group`` is converted with ``str`` and compared
     with each later member through ``jf.jaro_similarity``.  A pair whose
     score is at least ``self.threshold`` is stored as a two-item list of
     strings.

     Returns:
         The freshly rebuilt ``self.cluster`` list, in discovery order.
     """
     self.cluster = []
     total = len(self.group)
     for first in range(total):
         for second in range(first + 1, total):
             # Convert once per pair and reuse for both the test and the result.
             left, right = str(self.group[first]), str(self.group[second])
             score = jf.jaro_similarity(left, right)
             if self.threshold <= score:
                 self.cluster.append([left, right])
     return self.cluster
예제 #2
0
def simple_example():
    """Print a tour of jellyfish comparison and phonetic-encoding functions.

    Each line shows the call as text followed by the value the call returned.
    Functions are looked up on the ``jellyfish`` module one at a time, right
    before they are used.
    """
    # String comparison.
    left, right = u'jellyfish', u'smellyfish'
    # NOTE(review): 'jaro_distance' and 'jaro_winkler' look like older names
    # for the *_similarity functions -- confirm the installed jellyfish
    # release still provides them.
    comparison_names = (
        'levenshtein_distance',
        'damerau_levenshtein_distance',
        'hamming_distance',
        'jaro_distance',
        'jaro_similarity',
        'jaro_winkler',
        'jaro_winkler_similarity',
        'match_rating_comparison',
    )
    for name in comparison_names:
        compare = getattr(jellyfish, name)
        print("jellyfish.{}({}, {}) = {}.".format(
            name, left, right, compare(left, right)))

    #--------------------
    # Phonetic encoding.
    word = u'Jellyfish'
    for name in ('metaphone', 'soundex', 'nysiis', 'match_rating_codex'):
        encode = getattr(jellyfish, name)
        print("jellyfish.{}({}) = {}.".format(name, word, encode(word)))
예제 #3
0
def getClosestPlayerName(playerName: str) -> str:
    """Return the saved player name closest to ``playerName``.

    The candidate with the smallest Levenshtein distance is picked from the
    ``playerNames`` collection defined outside this function (the first one
    wins on ties).  It is accepted only when its Jaro similarity to
    ``playerName`` is above 0.7.

    Args:
        playerName: The name to look up.

    Returns:
        The closest saved name, or the string ``"No Match"`` when nothing is
        similar enough or when ``playerNames`` is empty.
    """
    # Nothing to compare against: report "No Match" instead of failing.
    if not playerNames:
        return "No Match"
    # min() finds the nearest name in one pass; no full sort is needed.
    closest = min(
        playerNames,
        key=lambda savedName: jellyfish.levenshtein_distance(
            savedName, playerName))
    if jellyfish.jaro_similarity(closest, playerName) > 0.7:
        return closest
    return "No Match"
예제 #4
0
    def jaro_apply(x):
        """Return the Jaro similarity of ``x[0]`` and ``x[1]``.

        If the comparison raises and either value is null according to
        ``pandas.isnull``, ``np.nan`` is returned; any other failure is
        re-raised unchanged.
        """
        try:
            score = jaro_similarity(x[0], x[1])
        except Exception:
            has_missing = pandas.isnull(x[0]) or pandas.isnull(x[1])
            if not has_missing:
                raise
            return np.nan
        return score
예제 #5
0
def correct_state(df):
    """Clean the ``state`` column of ``df`` against a fixed list of state codes.

    For every row, the ``state`` value is compared with each known code using
    ``jellyfish.jaro_similarity`` (both sides passed through ``str``).  A
    non-``None`` value that is not one of the known codes is replaced by
    ``None``; otherwise a value other than ``'sa'``/``'wa'`` is overwritten
    by any code whose score lies strictly between 0.62 and 1.

    NOTE(review): the first branch catches every value missing from
    ``state_list``, so the ``elif`` can only fire for values that already
    are valid codes -- confirm this is the intended behaviour.
    NOTE(review): ``df.state[index] = ...`` is chained assignment, which the
    pandas documentation warns about; confirm it writes through to ``df`` on
    the pandas version in use.

    :param df: DataFrame with a ``state`` column; writes go through
        ``df.state[index]``.
    :return: the same ``df`` object that was passed in.
    """
    state_list = ['sa','wa','nsw','qld','tas','vic','act','jbt','nt','cx','cc','hm']
    for index, row in df.iterrows():
        # index1 is not used below; only the candidate code (row2) matters.
        for index1, row2 in enumerate(state_list):
            # Re-read on every pass; whether this reflects the writes made
            # below depends on whether iterrows() handed out a view or a copy.
            value = row['state']
            score = jellyfish.jaro_similarity(str(value),str(row2))
            if(value != None and value not in (state_list)):
                df.state[index] = None
            # 'sa' and 'wa' are too close to each other, so they are not compared.
            elif(score > 0.62 and score < 1 and value != None  and value != 'sa' and value != 'wa'):
                df.state[index] = row2
    return df
예제 #6
0
	async def on_raw_message_edit(self, payload: discord.RawMessageUpdateEvent):
		"""Post an 'Edit' embed when a user rewrites a message substantially.

		An edit is reported when the Jaro similarity between the cached and the
		new content is at most 0.45, or when the new content is at most one
		fifth as long as the cached content.

		Nothing happens when the message is not in the cache, when its author
		is a bot, or when the raw payload carries no ``'content'`` key.

		:param payload: Raw message-update event received from the gateway.
		"""
		message = payload.cached_message
		# cached_message is None when the edited message was not in the cache.
		if message is None or message.author.bot:
			return
		# The raw data can be partial, so 'content' is not guaranteed.
		new_content = payload.data.get('content')
		if new_content is None:
			return
		old_content = message.content
		similarity = jellyfish.jaro_similarity(old_content, new_content)
		if similarity <= 0.45 or len(new_content) * 5 <= len(old_content):
			embed = await self.createEmbed(message, 'Edit')
			embed.description = "edit '{0}' to '{1}'".format(
				old_content, new_content)
			await message.channel.send(embed=embed)
def measure_distance(word1, word2, distance_type):
    """Return the distance between two words for the selected metric.

    Args:
        word1: First word.
        word2: Second word.
        distance_type: One of ``'lv'`` (Levenshtein), ``'dlv'``
            (Damerau-Levenshtein), ``'jw'`` (Jaro-Winkler), ``'j'`` (Jaro)
            or ``'hm'`` (Hamming).

    Returns:
        The metric value.  Jaro and Jaro-Winkler are similarities, so their
        value is negated to make smaller results mean "closer", like the
        true distances.

    Raises:
        ValueError: If ``distance_type`` is not one of the supported codes.
    """
    if distance_type == 'lv':
        return Levenshtein.eval(word1, word2)
    if distance_type == 'dlv':
        return jellyfish.damerau_levenshtein_distance(word1, word2)
    if distance_type == 'jw':
        # Jaro-Winkler measures similarity, so the sign is flipped.
        return -jellyfish.jaro_winkler_similarity(word1, word2)
    if distance_type == 'j':
        # Same sign flip for plain Jaro similarity.
        return -jellyfish.jaro_similarity(word1, word2)
    if distance_type == 'hm':
        return jellyfish.hamming_distance(word1, word2)
    raise ValueError(
        "unknown distance_type {!r}; expected one of "
        "'lv', 'dlv', 'jw', 'j', 'hm'".format(distance_type))
예제 #8
0
def find_similar_file(origin_file_name, files):
    """Find the ``.json`` file whose base name best matches a given name.

    Only entries whose extension is exactly ``json`` are considered.  Each
    base name is scored with ``jellyfish.jaro_similarity`` scaled to 0-100;
    scores below 70 are ignored and, on equal scores, the later entry wins.

    Args:
        origin_file_name: Name to match, without extension.
        files: Iterable of file names.

    Returns:
        A ``(file_name, score)`` tuple, or ``('', 0)`` when nothing matched.
    """
    best_name, best_score = '', 0

    for entry in files:
        stem, suffix = os.path.splitext(entry)
        extension = suffix[1:]  # drop the leading dot
        if extension != 'json':
            continue

        score = jellyfish.jaro_similarity(origin_file_name, stem) * 100
        if score >= 70 and score >= best_score:
            best_name, best_score = stem + '.' + extension, score

    return best_name, best_score
예제 #9
0
    def comparacion_pares(self, texto1, texto2, tipo="levenshtein", norm=None):
        """
        Compare two input texts using the selected distance or similarity.

        :param texto1: First text to compare.
        :type texto1: str
        :param texto2: Second text to compare.
        :type texto2: str
        :param tipo: Comparison criterion, matched case-insensitively by
            substring. Defaults to `'levenshtein'`.
        :type tipo: {'damerau_levenshtein', 'levenshtein', 'hamming',
            'jaro_winkler', 'jaro'}, optional
        :param norm: Normalizes distance results by text length. With
            `norm = 1` the result is divided by the length of the shorter
            text, with `norm = 2` by the length of the longer one. Jaro and
            Jaro-Winkler similarities are never normalized.
        :type norm: {1,2}, optional
        :return: (float) Result of comparing `texto1` with `texto2`, or
            `None` (after printing a message) when `tipo` is not recognized.
        :raises ZeroDivisionError: If normalization is requested and the
            selected text length is zero.
        """
        tipo = tipo.lower()
        # Only edit distances are length-normalized.  The decision follows
        # the branch actually taken, so that e.g. tipo="winkler" is treated
        # as a similarity just like "jaro_winkler".
        is_distance = True
        if "damerau" in tipo:
            salida = jellyfish.damerau_levenshtein_distance(texto1, texto2)
        elif "levenshtein" in tipo:
            salida = jellyfish.levenshtein_distance(texto1, texto2)
        elif "hamming" in tipo:
            salida = jellyfish.hamming_distance(texto1, texto2)
        elif "winkler" in tipo:
            salida = jellyfish.jaro_winkler_similarity(texto1, texto2)
            is_distance = False
        elif "jaro" in tipo:
            salida = jellyfish.jaro_similarity(texto1, texto2)
            is_distance = False
        else:
            print(
                (
                    "Por favor seleccione un criterio válido "
                    "para comparar los strings."
                )
            )
            return None
        if is_distance and norm in (1, 2):
            pick_length = min if norm == 1 else max
            salida /= pick_length(len(texto1), len(texto2))
        return salida
예제 #10
0
def extract_closest_match(search_key, target_list, score_cutoff=0):
    """Return the entry of ``target_list`` most similar to ``search_key``.

    Similarity is measured with ``jellyfish.jaro_similarity``.  An entry is
    accepted when its score is at least the best score seen so far (starting
    from ``score_cutoff``), so on equal scores the later entry wins.

    Args:
        search_key (str): String to look for.
        target_list (list): Strings to compare against.
        score_cutoff (float): Minimum score (between 0 and 1) a match must
            reach.

    Returns:
        The best-scoring entry, or ``None`` when no entry reaches the cutoff.
    """
    best_score, best_key = score_cutoff, None

    for candidate in target_list:
        candidate_score = jellyfish.jaro_similarity(search_key, candidate)
        if not candidate_score >= best_score:
            continue
        best_score, best_key = candidate_score, candidate

    return best_key
예제 #11
0
 def word_deduplication(self, threshold=0.5):
     """Drop candidate keywords that are near-duplicates of earlier ones.

     Candidates are visited in order.  A candidate is kept only when its
     ``jaro_similarity`` to every keyword kept so far is not above
     ``threshold``; the first candidate is therefore always kept.  The
     filtered list replaces ``self.candidateKeywords``.

     Args:
         threshold: Similarity above which two keywords count as duplicates.
     """
     kept = []
     for candidate in self.candidateKeywords:
         # Compare only with keywords already kept, never with the candidate
         # itself, otherwise every candidate would look like a duplicate.
         is_duplicate = any(
             jaro_similarity(candidate, existing) > threshold
             for existing in kept)
         if not is_duplicate:
             kept.append(candidate)
     self.candidateKeywords = kept
예제 #12
0
    def find_double(self, with_fusion=True):
        """Look for probable duplicates in ``self.pows`` and optionally merge them.

        Every ordered pair of entries is compared.  A pair is suspected when
        the Jaro similarity of the lower-cased titles is above 0.97, the
        years are equal and the ids differ.  Each suspicion is logged.  With
        ``with_fusion`` enabled, ``self.fusion`` is called with the entry of
        lower-or-equal ``quality_score()`` first.

        :param with_fusion: Whether suspected duplicates are merged.
        :return: Number of ``self.fusion`` calls that returned a truthy value.
        """
        log("Recherche des doublons sur les films")
        merged_count = 0
        for left in self.pows:
            for right in self.pows:
                score = jellyfish.jaro_similarity(left.title.lower(),
                                                  right.title.lower())
                suspected = (score > 0.97 and left.year == right.year
                             and left.id != right.id)
                if not suspected:
                    continue
                log("Suspission de doublon entre " + str(left) + " et " +
                    str(right))
                if not with_fusion:
                    continue
                if left.quality_score() > right.quality_score():
                    merged = self.fusion(right, left)
                else:
                    merged = self.fusion(left, right)
                if merged:
                    log("Fusion réalisée")
                    merged_count += 1

        return merged_count
예제 #13
0
    def word_deduplication(self, threshold=.8):
        """Return the candidate keywords with near-duplicates removed, sorted.

        The first item of every entry in ``self.candidateKeywords`` is taken
        as the keyword text.  Entries are visited in order and a keyword is
        kept only when its case-insensitive ``jaro_similarity`` to every
        keyword kept so far is not above ``threshold``.  Only the first
        candidate is kept unconditionally.

        :param threshold: Similarity above which two keywords count as
            duplicates.
        :return: Sorted list of the kept keyword strings.
        """
        kept = []
        for candidate in self.candidateKeywords:
            text = candidate[0]
            lowered = text.lower()
            is_duplicate = any(
                jaro_similarity(existing.lower(), lowered) > threshold
                for existing in kept)
            if not is_duplicate:
                kept.append(text)
        return sorted(kept)
예제 #14
0
    def comparacion_pares(self, texto1, texto2, tipo='levenshtein', norm=None):
        """Compare two input texts using the selected distance or similarity.

        :param texto1: (str) First text to compare.
        :param texto2: (str) Second text to compare.
        :param tipo: (str) {'damerau_levenshtein', 'levenshtein', 'hamming',
            'jaro_winkler', 'jaro'} Default: 'levenshtein'. Comparison
            criterion, matched case-insensitively by substring.
        :param norm: (int) {1, 2} Default: None. Normalizes distance results
            by text length. With norm=1 the result is divided by the length
            of the shorter text, with norm=2 by the length of the longer
            one. Jaro and Jaro-Winkler similarities are never normalized.
        :return: (float or int) Result of the comparison, or None (after
            printing a message) when ``tipo`` is not recognized.
        :raises ZeroDivisionError: If normalization is requested and the
            selected text length is zero.
        """
        tipo = tipo.lower()
        # Only edit distances are length-normalized.  The decision follows
        # the branch actually taken, so that e.g. tipo='winkler' is treated
        # as a similarity just like 'jaro_winkler'.
        is_distance = True
        if 'damerau' in tipo:
            salida = jellyfish.damerau_levenshtein_distance(texto1, texto2)
        elif 'levenshtein' in tipo:
            salida = jellyfish.levenshtein_distance(texto1, texto2)
        elif 'hamming' in tipo:
            salida = jellyfish.hamming_distance(texto1, texto2)
        elif 'winkler' in tipo:
            salida = jellyfish.jaro_winkler_similarity(texto1, texto2)
            is_distance = False
        elif 'jaro' in tipo:
            salida = jellyfish.jaro_similarity(texto1, texto2)
            is_distance = False
        else:
            print(
                'Por favor seleccione un criterio válido para comparar los strings.'
            )
            return None
        if is_distance and norm in (1, 2):
            pick_length = min if norm == 1 else max
            salida /= pick_length(len(texto1), len(texto2))
        return salida
예제 #15
0
def jaro_distance(A, B):
    """Return one minus the ``jaro_similarity`` of ``A`` and ``B``."""
    similarity = jaro_similarity(A, B)
    return 1 - similarity
예제 #16
0
def diff(user, music_filter, spotify, output, min_threshold, max_threshold):
    """Report Spotify tracks that have no local track with the same slug.

    Spotify and local tracks are keyed by a slug built from artist and title.
    Spotify slugs missing locally are passed to ``output_tracks``.  For each
    of them the local slug with the highest Jaro similarity is looked up, and
    the pair is recorded when that score lies within
    ``[min_threshold, max_threshold]`` and the local track is not tagged
    ``'spotify-error'``.  The recorded pairs go to ``print_distances``,
    followed by summary counters printed to stdout.

    Returns early, without any output, when the number of local tracks
    equals the number of Spotify slugs that were found locally.
    """
    spotify_tracks = spotify.tracks()
    spotify_tracks_by_slug = {}
    for t in spotify_tracks:
        slug = slugify(
            f"""{t['track']['artists'][0]['name']}-{t['track']['name']}""",
            stopwords=STOPWORDS,
            replacements=REPLACEMENTS)  # type: ignore
        spotify_tracks_by_slug[slug] = t

    local_tracks = user.do_filter(music_filter)
    local_tracks_by_slug = {}
    for t in local_tracks:
        slug = slugify(
            f"""{t['artist']}-{t['title']}""",
            stopwords=STOPWORDS,
            replacements=REPLACEMENTS)  # type: ignore
        local_tracks_by_slug[slug] = t

    # Slugs known to Spotify but absent locally, kept in sorted order.
    spotify_differences = (spotify_tracks_by_slug.keys()
                           - local_tracks_by_slug.keys())
    spotify_slug_tracks = collections.OrderedDict()
    for missing_slug in sorted(spotify_differences):
        spotify_slug_tracks[missing_slug] = spotify_tracks_by_slug[missing_slug]

    local_tracks_found = len(spotify_tracks_by_slug) - len(spotify_differences)
    if local_tracks_found == len(local_tracks):
        return

    output_tracks(output, spotify_slug_tracks.values())
    distances_tracks = []
    for spotify_slug, spotify_track in spotify_slug_tracks.items():
        scores = {}
        for local_slug in local_tracks_by_slug:
            scores[local_slug] = jellyfish.jaro_similarity(
                spotify_slug, local_slug)
        if not scores:
            continue
        # The first local slug with the highest score wins.
        closest_local_slug = max(scores, key=scores.get)
        closest_distance = scores[closest_local_slug]

        if not (min_threshold <= closest_distance <= max_threshold):
            continue
        closest_local_track = local_tracks_by_slug[closest_local_slug]
        if 'spotify-error' in closest_local_track['keywords']:
            continue
        distances_tracks.append({
            'local_track': closest_local_track,
            'local_slug': closest_local_slug,
            'spotify_track': spotify_track,
            'spotify_slug': spotify_slug,
            'distance': closest_distance,
        })
    print_distances(distances_tracks)
    print(f"min threshold : {min_threshold}")
    print(f"max threshold : {max_threshold}")
    print(f"spotify tracks : {len(spotify_tracks)}")
    print(f"spotify slugs: {len(spotify_tracks_by_slug)}")
    print(f"local tracks : {len(local_tracks)}")
    print(f"local tracks slugs : {len(local_tracks_by_slug)}")
    print(f"found in local     : {local_tracks_found}")
    print(f"not found in local : {len(spotify_differences)}")
예제 #17
0
import pandas as pd
import jellyfish

# Load the importer list from a hard-coded local path and keep the distinct
# values of its NAME column.
importer_list = pd.read_csv(
    r'C:\Users\S\PycharmProjects\CompanyNames\HMRC\importsNames.csv')

importer_names = importer_list[['NAME']].drop_duplicates()

# Disabled code below: it appears to be how matched.csv was produced
# (a 100-row sample of names present in both files) -- TODO confirm.
# sample_df = pd.read_csv(r'C:\Users\S\PycharmProjects\CompanyNames\data\raw\company_names.csv')
#
#
# x= pd.merge(sample_df,importer_names,how='inner',left_on = ['CompanyName'],right_on=['NAME'])
# x=x[['NAME']].sample(100)
# x.to_csv('matched.csv',index=None )

# Reference name: the NAME entry labelled 0 (the first row, assuming the
# default index produced by read_csv).
x = pd.read_csv(r'./HMRC/matched.csv')
y = x['NAME'][0]

# Score every other name against the reference: z with Jaro similarity,
# z3 with the match rating comparison. Names equal to the reference are skipped.
z = [[i, jellyfish.jaro_similarity(i, y)] for i in x['NAME'] if y != i]
z3 = [[i, jellyfish.match_rating_comparison(i, y)] for i in x['NAME']
      if y != i]
# Tabular form of the Jaro scores (z3 is not used further in this snippet).
z2 = pd.DataFrame(z)
예제 #18
0
def jaro_similarity(s1, s2):
    """Return the Jaro similarity of two strings, or ``None`` if either is ``None``.

    The actual comparison is delegated to ``J.jaro_similarity``.
    """
    # Identity test: "== None" would call a custom __eq__ on the argument.
    if s1 is None or s2 is None:
        return None
    return J.jaro_similarity(s1, s2)
예제 #19
0
def getClosestTeamName(teamName: str) -> str:
    """Return the saved team name most similar to ``teamName``.

    Candidates come from the ``teamNames`` collection defined outside this
    function and are ranked by ``jellyfish.jaro_similarity``; the first one
    wins on ties.

    Args:
        teamName: The name to look up.

    Returns:
        The most similar saved name.

    Raises:
        IndexError: If ``teamNames`` is empty.
    """
    if not teamNames:
        # Same exception type that indexing the empty sorted list produced.
        raise IndexError("teamNames is empty; there is no closest team name")
    # max() finds the best candidate in one pass; no full sort is needed.
    return max(
        teamNames,
        key=lambda savedName: jellyfish.jaro_similarity(savedName, teamName))
예제 #20
0
        return float(numerator) / denominator


def textToVector(text):
    """Count how often each token matched by ``WORD`` occurs in ``text``."""
    return Counter(WORD.findall(text))

# Read both inputs inside context managers so the files are closed again.
# The handle names are unchanged: "second" reads file 1 (-> text1) and
# "first" reads file 2 (-> text2).
with open("reinterpreted_file_1.txt", "r") as second:
    text1 = second.read()
with open("reinterpreted_file_2.txt", "r") as first:
    text2 = first.read()


vector1 = textToVector(text1)
vector2 = textToVector(text2)

cosine = calculateCosineSimilarity(vector1, vector2)

# One entry per metric; the jellyfish metrics compare the raw texts, the
# cosine similarity compares the word-count vectors.
data = {
    'cosine': cosine,
    'jaro_similarity': jellyfish.jaro_similarity(text1, text2),
    'jaro_winkler_similarity': jellyfish.jaro_winkler_similarity(text1, text2),
    'levenshtein_distance': jellyfish.levenshtein_distance(text1, text2),
    'damerau_levenshtein_distance': jellyfish.damerau_levenshtein_distance(text1, text2),
    'hamming_distance': jellyfish.hamming_distance(text1, text2),
}

# Persist all metrics as a single JSON object.
with open('results.txt', 'w') as outfile:
    json.dump(data, outfile)
예제 #21
0
 def jaro_similarity(self, a, b):
     """Return ``jellyfish.jaro_similarity`` of ``a`` and ``b``."""
     return jellyfish.jaro_similarity(a, b)