Exemplo n.º 1
0
def closest_matches(value, ref, num=5, thresh=1):
    """Return the candidate(s) in ``ref`` closest to ``value``.

    Candidates are ranked by their Damerau-Levenshtein distance to
    ``value``.

    Args:
        value: String to look up.
        ref: Iterable of candidate strings.
        num: Maximum number of candidates returned when there is no
            confident match.
        thresh: Largest distance at which the best candidate is treated
            as a confident match.

    Returns:
        The single best candidate when its distance is ``<= thresh``;
        otherwise the list of up to ``num`` closest candidates, which is
        empty when ``ref`` is empty or ``num`` is not positive.
    """
    def distance(candidate):
        return jellyfish.damerau_levenshtein_distance(value, candidate)

    res = heapq.nsmallest(num, ref, key=distance)
    # Nothing to rank: hand back the empty list instead of failing on res[0].
    if not res:
        return res
    if distance(res[0]) <= thresh:
        return res[0]
    return res
Exemplo n.º 2
0
 def correct_sentence(self, sentence, window=2, topn=10):
     """Return a corrected copy of a tokenized sentence.

     Every token missing from the model vocabulary is compared with the
     words predicted from its surrounding context; the predicted word
     with the smallest Damerau-Levenshtein distance replaces the token,
     but only when that distance is below 3.

     Args:
         sentence (:obj:`list` of :obj:`str`): tokens of the sentence
         window (int): number of tokens taken on each side as context
         topn (int): number of predicted candidates to consider
     Returns:
         :obj:`list` of :obj:`str`: the corrected tokens; the input
         list itself is not modified
     """
     corrected = copy.deepcopy(sentence)
     for position, token in enumerate(corrected):
         if token in self.model.wv.vocab:
             continue
         lower = max(0, position - window)
         upper = min(len(corrected), position + window + 1)
         candidates = predict_output_word(
             self.model, corrected[lower:upper], topn=topn)
         # No prediction available for this context: keep the token.
         if candidates is None:
             continue
         closest = min(
             candidates,
             key=lambda pair: damerau_levenshtein_distance(pair[0], token))[0]
         if damerau_levenshtein_distance(token, closest) < 3:
             corrected[position] = closest
     return corrected
Exemplo n.º 3
0
 def extract(self, x, y):
     """Compare two values with the Damerau-Levenshtein distance.

     Both values are converted with ``unicode`` before being compared.

     Returns:
         ``0`` when either value is ``None``.  Otherwise, when
         ``self.similarity`` is truthy, one minus the distance divided
         by the longer converted length (``1.0`` for two empty values);
         when it is falsy, the raw distance.
     """
     if x is None or y is None:
         return 0
     left, right = unicode(x), unicode(y)
     distance = damerau_levenshtein_distance(left, right)
     if not self.similarity:
         return distance
     longest = max(len(left), len(right))
     if longest == 0:
         # Two empty strings are identical; avoid dividing by zero.
         return 1.0
     return 1 - float(distance) / longest
Exemplo n.º 4
0
def measure_distance(log_data, simulation_data):
    """Pair each simulated trace with its closest remaining log trace.

    For every entry of ``simulation_data`` the log entry whose 'profile'
    has the smallest Damerau-Levenshtein distance is selected (the first
    one on ties), scored, and then removed from the working copy so it
    cannot be matched again.

    Returns:
        list of dict: one record per simulated trace with the keys
        caseid, sim_order, log_order, sim_tbtw, log_tbtw, sim_score_t,
        sim_score and abs_err.
    """
    remaining = log_data.copy()
    similarity = []
    for sim_instance in simulation_data:
        sim_profile = sim_instance['profile']
        best_idx = 0
        best_dist = jf.damerau_levenshtein_distance(
            sim_profile, remaining[0]['profile'])
        for idx, log_instance in enumerate(remaining):
            dist = jf.damerau_levenshtein_distance(
                sim_profile, log_instance['profile'])
            if dist < best_dist:
                best_dist, best_idx = dist, idx
        match = remaining[best_idx]
        abs_err = abs(match['tbtw'] - sim_instance['tbtw'])
        # Four-argument variant defined elsewhere; it also receives the
        # 'tbtw_list' values of both traces.
        dl_t = damerau_levenshtein_distance(sim_profile,
                                            match['profile'],
                                            sim_instance['tbtw_list'],
                                            match['tbtw_list'])
        length = np.max([len(sim_profile), len(match['profile'])])
        similarity.append({
            'caseid': sim_instance['caseid'],
            'sim_order': sim_profile,
            'log_order': match['profile'],
            'sim_tbtw': sim_instance['tbtw_list'],
            'log_tbtw': match['tbtw_list'],
            'sim_score_t': (1 - (dl_t / length)),
            'sim_score': (1 - (best_dist / length)),
            'abs_err': abs_err,
        })
        del remaining[best_idx]
    return similarity
def impute_qty(instance, ingredients):
    """Return the mean quantity of the ingredients similar to ``instance``.

    Similar ingredients are obtained from ``get_similar`` using a
    distance that adds the Damerau-Levenshtein distances of the "base"
    and "unit" fields; their ``qty`` values are converted to float and
    averaged.
    """
    def base_and_unit_distance(x, y):
        base_gap = jellyfish.damerau_levenshtein_distance(
            x["base"], y["base"])
        unit_gap = jellyfish.damerau_levenshtein_distance(
            x["unit"], y["unit"])
        return base_gap + unit_gap

    neighbours = get_similar(instance, ingredients, base_and_unit_distance)
    quantities = neighbours.qty.apply(float)
    return quantities.mean()
 def test_corpus(self):
     twoWords = re.compile('([A-Za-z]+)\s+([A-Za-z]+)')
     with open(os.path.dirname(os.path.realpath(__file__))+os.path.sep+"birkbeck_spelling_error_corpus/ABODAT.643") as f:
         pairs = f.read().split(',')
         for p in pairs:
             if twoWords.search(p):
                 fst,snd = twoWords.search(p).groups()
                 print fst.lower(),snd.lower()
                 print jellyfish.damerau_levenshtein_distance(unicode(fst.lower()),unicode(snd.lower())),dl_dist(fst.lower(),snd.lower())
Exemplo n.º 7
0
 def gen(metric, serie1, serie2, oracle, r):
     """Build the pairwise distance table between two series of traces.

     Args:
         metric (str): one of 'tsd', 'dl', 'mae' or 'dl_mae'
         serie1 (list): dicts describing the first series; the keys read
             depend on the metric ('profile', 'proc_act_norm',
             'wait_act_norm', 'start_time', 'end_time')
         serie2 (list): dicts describing the second series
         oracle: passed through to ``tsd_alpha`` (metric 'tsd' only)
         r (list): two dicts whose 'min' entries offset the i / j indices
     Returns:
         pandas.DataFrame with one row per (serie1, serie2) pair, or
         ``None`` when anything fails (the traceback is printed); that
         includes an unsupported ``metric``.
     """
     def dl_distance(row):
         return jf.damerau_levenshtein_distance(
             ''.join(row.s_1), ''.join(row.s_2)) / row.length

     def time_distance(row):
         return ae_distance(row.et_1, row.et_2, row.st_1, row.st_2)

     try:
         df_matrix = list()
         for i, s1_ele in enumerate(serie1):
             for j, s2_ele in enumerate(serie2):
                 element = {'i': r[0]['min'] + i, 'j': r[1]['min'] + j}
                 if metric in ['tsd', 'dl', 'dl_mae']:
                     element['s_1'] = s1_ele['profile']
                     element['s_2'] = s2_ele['profile']
                     element['length'] = max(len(s1_ele['profile']),
                                             len(s2_ele['profile']))
                 if metric == 'tsd':
                     element['p_1'] = s1_ele['proc_act_norm']
                     element['p_2'] = s2_ele['proc_act_norm']
                     element['w_1'] = s1_ele['wait_act_norm']
                     element['w_2'] = s2_ele['wait_act_norm']
                 if metric in ['mae', 'dl_mae']:
                     element['et_1'] = s1_ele['end_time']
                     element['et_2'] = s2_ele['end_time']
                     element['st_1'] = s1_ele['start_time']
                     element['st_2'] = s2_ele['start_time']
                 df_matrix.append(element)
         df_matrix = pd.DataFrame(df_matrix)
         if metric == 'tsd':
             df_matrix['distance'] = df_matrix.apply(
                 lambda x: tsd_alpha(x.s_1, x.s_2, x.p_1, x.p_2, x.w_1,
                                     x.w_2, oracle) / x.length,
                 axis=1)
         # Equality, not ``in 'dl'``: a substring test would also accept
         # 'd', 'l' and the empty string.
         elif metric == 'dl':
             df_matrix['distance'] = df_matrix.apply(dl_distance, axis=1)
         elif metric == 'mae':
             df_matrix['distance'] = df_matrix.apply(time_distance, axis=1)
         elif metric == 'dl_mae':
             df_matrix['dl_distance'] = df_matrix.apply(dl_distance, axis=1)
             df_matrix['mae_distance'] = df_matrix.apply(
                 time_distance, axis=1)
         else:
             raise ValueError(metric)
         return df_matrix
     except Exception:
         # Best effort: report the failure and signal it with None.
         traceback.print_exc()
         return None
Exemplo n.º 8
0
def get_closest_damerau_levenshtein(needle, haystack):
    """Return the item of ``haystack`` closest to ``needle``.

    Closeness is the Damerau-Levenshtein distance; on ties the earliest
    item wins.  Returns ``None`` when ``haystack`` yields nothing.
    """
    best_item = None
    best_distance = None
    for candidate in haystack:
        distance = jellyfish.damerau_levenshtein_distance(needle, candidate)
        if best_distance is None or distance < best_distance:
            best_item, best_distance = candidate, distance
    return best_item
Exemplo n.º 9
0
def score(df):
    """Score every pair of rows by name distance and keep the close pairs.

    All 2-combinations of the index labels of ``df`` are formed; for
    each pair the first-name distance and the last-name distance are
    added together.

    Args:
        df: DataFrame with ``first_name`` and ``last_name`` columns.

    Returns:
        DataFrame with columns ``left``, ``right`` and ``score``,
        restricted to the pairs whose score is below 4.
    """
    pairs = list(itertools.combinations(df.index.tolist(), 2))
    xs = [left for left, _ in pairs]
    ys = [right for _, right in pairs]
    xdf = df.loc[xs]
    ydf = df.loc[ys]
    # NOTE(review): this distance function is called with whole columns,
    # so it is presumably a vectorised variant - confirm.
    first_name_cmp = damerau_levenshtein_distance(xdf.first_name,
                                                  ydf.first_name)
    # Last names must be compared with last names, not with first names.
    last_name_cmp = damerau_levenshtein_distance(xdf.last_name,
                                                 ydf.last_name)
    total = first_name_cmp + last_name_cmp
    result = DataFrame(dict(left=xs, right=ys, score=total))
    return result[result.score < 4]
def get_closest_damerau_levenshtein(needle,haystack):
    """Pick the entry of ``haystack`` nearest to ``needle``.

    Entries are compared by Damerau-Levenshtein distance and the first
    entry with the smallest distance is returned.  An empty ``haystack``
    gives ``None``.
    """
    winner = None
    winning_distance = None
    for entry in haystack:
        current = jellyfish.damerau_levenshtein_distance(needle, entry)
        if winning_distance is None or current < winning_distance:
            winner = entry
            winning_distance = current
    return winner
Exemplo n.º 11
0
def comparator(element):
    """
    Extract similarity features for one pair of records.

    ``element`` holds the two records under 'record_a' and 'record_b'.
    The result carries both donor ids plus the Jaro-Winkler and
    Damerau-Levenshtein values of the 'name' and 'address' fields.
    """
    rec_a = element['record_a']
    rec_b = element['record_b']

    features = {}
    features['donor_id1'] = rec_a['donor_id']
    features['donor_id2'] = rec_b['donor_id']
    features['jaro_name'] = jf.jaro_winkler(rec_a['name'], rec_b['name'])
    features['damerau_name'] = jf.damerau_levenshtein_distance(
        rec_a['name'], rec_b['name'])
    features['jaro_address'] = jf.jaro_winkler(
        rec_a['address'], rec_b['address'])
    features['damerau_address'] = jf.damerau_levenshtein_distance(
        rec_a['address'], rec_b['address'])
    return features
Exemplo n.º 12
0
 def gen(metric: Metric, serie1, serie2, oracle, r):
     """Build the pairwise distance table for two series of traces.

     One row is produced per (serie1, serie2) pair; the columns filled
     in and the distance column(s) computed depend on ``metric``.
     Returns the DataFrame, or ``None`` after printing the traceback
     when anything raises (including an unsupported metric).
     """
     def dl_distance(row):
         return jf.damerau_levenshtein_distance(
             ''.join(row.s_1), ''.join(row.s_2)) / row.length

     def time_distance(row):
         return ae_distance(row.et_1, row.et_2, row.st_1, row.st_2)

     try:
         needs_profile = metric in [Metric.TSD, Metric.DL, Metric.DL_MAE]
         needs_times = metric in [Metric.MAE, Metric.DL_MAE]
         rows = []
         for i, s1_ele in enumerate(serie1):
             for j, s2_ele in enumerate(serie2):
                 row = {'i': r[0]['min'] + i, 'j': r[1]['min'] + j}
                 if needs_profile:
                     row.update(
                         s_1=s1_ele['profile'],
                         s_2=s2_ele['profile'],
                         length=max(len(s1_ele['profile']),
                                    len(s2_ele['profile'])))
                 if metric is Metric.TSD:
                     row.update(
                         p_1=s1_ele['proc_act_norm'],
                         p_2=s2_ele['proc_act_norm'],
                         w_1=s1_ele['wait_act_norm'],
                         w_2=s2_ele['wait_act_norm'])
                 if needs_times:
                     row.update(
                         et_1=s1_ele['end_time'],
                         et_2=s2_ele['end_time'],
                         st_1=s1_ele['start_time'],
                         st_2=s2_ele['start_time'])
                 rows.append(row)
         df_matrix = pd.DataFrame(rows)
         if metric is Metric.TSD:
             df_matrix['distance'] = df_matrix.apply(
                 lambda x: tsd_alpha(x.s_1, x.s_2, x.p_1, x.p_2, x.w_1,
                                     x.w_2, oracle) / x.length,
                 axis=1)
         elif metric is Metric.DL:
             df_matrix['distance'] = df_matrix.apply(dl_distance, axis=1)
         elif metric is Metric.MAE:
             df_matrix['distance'] = df_matrix.apply(time_distance, axis=1)
         elif metric is Metric.DL_MAE:
             df_matrix['dl_distance'] = df_matrix.apply(dl_distance, axis=1)
             df_matrix['mae_distance'] = df_matrix.apply(
                 time_distance, axis=1)
         else:
             raise ValueError(metric)
         return df_matrix
     except Exception:
         traceback.print_exc()
Exemplo n.º 13
0
    def TMDB_get(self, response_search, film_number):
        """
        Sub-method used by getAPI_tmdb to confirm the right movie was picked.

        The search results are scanned in order.  Each result released in
        the film's recorded year, or up to three years earlier, is looked
        up on the TMDB credits endpoint.  The first credited person whose
        name is within a Damerau-Levenshtein distance of 1 from
        ``self.data["name"][film_number]`` has their id appended to
        ``self.list_id_indiv`` and the scan stops.  The "crew" list is
        searched when the category is "Directing", the "cast" list
        otherwise.

        Args:
            response_search (dict): decoded search response; its
                "results" list is read.
            film_number (int): index of the film in the ``self.data``
                columns.

        Returns:
            bool: True if the individual was found in the credits of a
            matching movie, False otherwise.

        Raises:
            SystemExit: if the credits request fails.
        """
        url_MDB_credit = "http://api.tmdb.org/3/movie/{}/credits?api_key={}"
        # NOTE(review): hard-coded API key; it should be read from
        # configuration instead of living in the source.
        API_KEY_MDB = "a68690ebf69567801e68c26ee82d7787"
        try:
            for result in response_search["results"]:
                release_date = result.get("release_date")
                if release_date in (None, 0, ''):
                    continue  # no usable release date
                year = self.data["year"][film_number]
                if int(release_date[0:4]) not in (year, year - 1, year - 2, year - 3):
                    continue
                response_credit = json.loads(
                    requests.get(url_MDB_credit.format(result["id"], API_KEY_MDB)).text)
                # The API signals a failed lookup with "success": false.
                if response_credit.get("success") != False:
                    if self.data["category"][film_number] == "Directing":
                        credit_key = "crew"
                    else:
                        credit_key = "cast"
                    target_name = self.data["name"][film_number]
                    for person in response_credit[credit_key]:
                        if jellyfish.damerau_levenshtein_distance(person["name"], target_name) < 2:
                            self.list_id_indiv.append(person["id"])
                            # Stop at the first match so the same person
                            # is not recorded several times.
                            return True

        # Error handling
        except requests.exceptions.RequestException as e:
            print("There was an error while requesting the TMDB API. Please retry or check your connection or the status of the website. See next the error message: ", e)
            raise SystemExit(e)

        # No movie-individual pair was found
        return False
Exemplo n.º 14
0
def jelly():
    import jellyfish
    a = u'Korle Bu Teaching Hospital Sickle Cell Dept'
    b = u'Korle Bu Teaching Hospital'
    # a = u'x'
    # b = u'a'
    print jellyfish.levenshtein_distance(a, b)
    print jellyfish.jaro_distance(a, b)
    print jellyfish.damerau_levenshtein_distance(a, b)
    # print jellyfish.match_rating_comparison(a,b)

    from fuzzywuzzy import fuzz

    print fuzz.ratio(a, b)
Exemplo n.º 15
0
 def damerau_sim(self):
     """Collect the pairs of group members that are similar enough.

     The similarity of two members is one minus the Damerau-Levenshtein
     distance of their string forms divided by the longer string length
     (two empty strings count as fully similar).  Every pair whose
     similarity is at least ``self.threshold`` is kept.

     Returns:
         list: ``[member_i, member_j]`` pairs, also stored in
         ``self.cluster``.
     """
     self.cluster = []
     members = self.group
     for i in range(len(members)):
         left = str(members[i])
         for j in range(i + 1, len(members)):
             right = str(members[j])
             longest = max(len(left), len(right))
             if longest == 0:
                 # Both strings are empty: identical, and nothing to divide by.
                 similarity = 1.0
             else:
                 # Each member is converted separately and passed as its
                 # own argument to the distance function.
                 similarity = 1 - float(
                     jf.damerau_levenshtein_distance(left, right)) / longest
             if self.threshold <= similarity:
                 self.cluster.append([members[i], members[j]])
     return self.cluster
Exemplo n.º 16
0
def dl_preprocess_words(words1, words2):
    """Replace each long word of ``words1`` by its closest word in ``words2``.

    Words shorter than 5 characters are copied unchanged, and words of
    ``words2`` shorter than 5 characters are never used as replacements.
    A replacement is accepted when its Damerau-Levenshtein distance is
    below 40% of the shorter of the two lengths and lower than any
    distance accepted so far for the same word.  The module-level
    ``DL_COUNT`` is incremented for every accepted candidate.

    Returns:
        list: the words of ``words1`` in order, replaced where a close
        match was found.
    """
    global DL_COUNT
    min_threshold = 0.4
    new_words = []
    for word1 in words1:
        l1 = len(word1)
        if l1 < 5:
            new_words.append(word1)
            continue
        closest_word = word1
        # Reset for every word: otherwise a very close match found for an
        # earlier word would hide acceptable matches for later words.
        min_distance = 100
        for word2 in words2:
            l2 = len(word2)
            if l2 < 5:
                continue
            try:
                d = damerau_levenshtein_distance(word1, word2)
            except Exception:
                # Best effort: a pair that cannot be compared never matches.
                d = 100
            if d < min_threshold * min(l1, l2) and d < min_distance:
                min_distance = d
                closest_word = word2
                DL_COUNT += 1
                logging.debug('count: %d, word1: %s, word2: %s, distance: %d',
                              DL_COUNT, word1, word2, d)
        new_words.append(closest_word)
    return new_words
Exemplo n.º 17
0
def dameraulevenshtein(seq1, seq2):
    """Return the Damerau-Levenshtein distance between two sequences.

    The distance counts the single-character insertions, deletions and
    substitutions, plus transpositions of two adjacent characters,
    needed to turn one sequence into the other.

    Both str and unicode arguments are accepted; str values are decoded
    as UTF-8 first.

    >>> dameraulevenshtein('ba', 'abc')
    2
    >>> dameraulevenshtein('fee', 'deed')
    2
    >>> dameraulevenshtein(u'abcd', u'bacde')
    2
    >>> dameraulevenshtein(u'number e', u'number \u03c0')
    1
    """
    seq1, seq2 = [
        unicode(seq, 'utf-8') if isinstance(seq, str) else seq
        for seq in (seq1, seq2)
    ]

    # The C implementation rejects some code points with ValueError; use
    # the pure-Python implementation for those.
    # https://github.com/jamesturk/jellyfish/issues/55#issuecomment-312509263
    try:
        distance = jellyfish.damerau_levenshtein_distance(seq1, seq2)
    except ValueError:
        distance = py_jellyfish.damerau_levenshtein_distance(seq1, seq2)
    return distance
def get_damerau_levenshtein_avg(row1, row2):
    """Return the average similarity of columns 1 to 14 of two rows.

    The similarity of a column pair is one minus the Damerau-Levenshtein
    distance divided by the longer value's length.  Two empty values
    count as fully similar (1.0) instead of dividing by zero.

    Args:
        row1, row2: indexable rows holding strings at positions 1..14.

    Returns:
        float: mean of the 14 column similarities.
    """
    total = 0.0
    for column_index in range(1, 15):
        a = row1[column_index]
        b = row2[column_index]
        longest = max(len(a), len(b))
        if longest == 0:
            total += 1.0
            continue
        total += 1 - jellyfish.damerau_levenshtein_distance(a, b) / float(longest)
    return total / 14.0
def impute_unit(instance, ingredients):
    """Return the mode of the units of the ingredients similar to ``instance``.

    Similar ingredients are obtained from ``get_similar`` using the
    Damerau-Levenshtein distance between the "base" fields.
    """
    def base_distance(x, y):
        return jellyfish.damerau_levenshtein_distance(x["base"], y["base"])

    neighbours = get_similar(instance, ingredients, base_distance)
    return neighbours.unit.mode()
Exemplo n.º 20
0
def dist_calc(author_pair):
    """Append the distance to a pair of names when they are close.

    Returns ``author_pair`` itself, extended in place with the
    Damerau-Levenshtein distance of its first two items, when that
    distance is at most 3; returns ``False`` otherwise.
    """
    first, second = author_pair[0], author_pair[1]
    dist = jellyfish.damerau_levenshtein_distance(first, second)
    if dist > 3:
        return False
    author_pair.append(dist)
    return author_pair
Exemplo n.º 21
0
def dameraulevenshtein(seq1, seq2):
    """Return the Damerau-Levenshtein distance between two strings.

    The distance counts the additions, deletions, substitutions and
    transpositions (exchanges of *consecutive* characters) needed to
    transform the first string into the second.  Both arguments are
    encoded as UTF-8 and handed to jellyfish.

    >>> dameraulevenshtein('ba', 'abc')
    2
    >>> dameraulevenshtein('fee', 'deed')
    2
    >>> dameraulevenshtein('abcd', 'bacde')
    3

    Note: the real answer is 2: abcd->bacd->bacde
          but this algorithm is apparently doing abcd->acd->bacd->bacde

    NOTE(review): the examples above look inherited from an earlier
    implementation - confirm them against the jellyfish result.
    """
    encoded_first = seq1.encode('utf-8')
    encoded_second = seq2.encode('utf-8')
    return jellyfish.damerau_levenshtein_distance(encoded_first,
                                                  encoded_second)
Exemplo n.º 22
0
def dameraulevenshtein(seq1, seq2):
    """Return the Damerau-Levenshtein distance between two strings.

    The distance counts the additions, deletions, substitutions and
    transpositions (exchanges of *consecutive* characters) needed to
    transform the first string into the second.

    Both arguments must provide ``.encode``: each is encoded as UTF-8
    before being handed to jellyfish, so arbitrary sequences such as
    lists are not accepted.

    >>> dameraulevenshtein('ba', 'abc')
    2
    >>> dameraulevenshtein('fee', 'deed')
    2
    """
    first_bytes = seq1.encode('utf-8')
    second_bytes = seq2.encode('utf-8')
    return jellyfish.damerau_levenshtein_distance(first_bytes, second_bytes)
Exemplo n.º 23
0
def check_last_name(last_names, officer_name):
    """Return the entries of ``last_names`` that match an officer name part.

    An entry matches when the Damerau-Levenshtein distance between its
    first item and a part of ``officer_name`` is 0.  An entry is listed
    once per matching part.
    """
    return [
        candidate
        for candidate in last_names
        for part in officer_name
        if js.damerau_levenshtein_distance(candidate[0], part) == 0
    ]
Exemplo n.º 24
0
def check_last_name(last_names, officer_name):
    """Collect the last-name entries at distance 0 from an officer name part.

    For every entry of ``last_names`` and every part of ``officer_name``
    the entry is added to the result when the Damerau-Levenshtein
    distance between the entry's first item and the part is 0.
    """
    matches = []
    for entry in last_names:
        surname = entry[0]
        matches.extend(
            entry for part in officer_name
            if js.damerau_levenshtein_distance(surname, part) == 0)
    return matches
def spell_suggest(word, possibilities):
    r"""Return a ordered list of spelling suggestions for `word`.

    Suggestions are drawn from `possibilities`. If the word is too dissimilar
    the list will be empty; it is also empty when there are no
    possibilities at all.

    The closest possibility (smallest Damerau-Levenshtein distance, the
    earliest one on ties) is suggested when its distance is at most 3
    and at most half of its own length.

    >>> possible='''title subtitle client project author recipients version date
    ... tnc toc toc-depth title subtitle client project author recipients
    ... confidential tnc toc toc-depth'''.split()
    >>> spell_suggest('recipient', possible)
    ['recipients']
    >>> spell_suggest('t&c', possible)
    ['tnc']
    >>> spell_suggest('foobar', possible)
    []
    >>> spell_suggest('foobar', [])
    []
    """
    # pylint: disable=E1101
    if len(possibilities) == 0:
        # min() below would raise on an empty sequence.
        return []
    plain_word = deunicode(word)
    dist, best_i = min(
        (jellyfish.damerau_levenshtein_distance(plain_word, poss), index)
        for index, poss in enumerate(possibilities))
    # Floor division keeps the threshold identical on Python 2 and 3.
    if dist <= min(3, len(possibilities[best_i]) // 2):
        return [possibilities[best_i]]
    return []
Exemplo n.º 26
0
def code_generator(digits, max_value, min_distance):
    """Yield numeric codes that are far enough apart from each other.

    Candidate numbers are visited in shuffled order; a code is yielded
    only when its Damerau-Levenshtein distance to every code yielded
    before it is at least `min_distance`.

    Parameters
    ----------
    digits : int
        Number of characters of a code; shorter numbers are padded on
        the left with zeroes.

    max_value : int
        Largest numeric value a code may take.

    min_distance : int
        Minimal Damerau-Levenshtein distance between generated codes.

    Yields
    ------
    str
        A zero-padded numeric code.

    """
    accepted = []

    lower_bound = largest_int_with_less_digits(max_value) + 1
    pool = list(range(lower_bound, max_value + 1))
    shuffle(pool)

    for number in pool:
        code = str(number).zfill(digits)
        far_enough = all(
            damerau_levenshtein_distance(code, other) >= min_distance
            for other in accepted)
        if far_enough:
            accepted.append(code)
            yield code
Exemplo n.º 27
0
    def calculate_distances(self, serie1, serie2, id1, id2):
        """
        Compare one trace of each series by profile and by duration.

        Parameters
        ----------
        serie1 : list
        serie2 : list
        id1 : index of the trace in list 1
        id2 : index of the trace in list 2

        Returns
        -------
        dl : Damerau-Levenshtein distance of the joined profiles divided
            by the longer profile length
        ae : absolute difference, in seconds, between the two
            end_time - start_time spans
        """
        trace_1, trace_2 = serie1[id1], serie2[id2]
        length = np.max([len(trace_1['profile']),
                         len(trace_2['profile'])])
        raw_distance = jf.damerau_levenshtein_distance(
            ''.join(trace_1['profile']),
            ''.join(trace_2['profile']))
        d_l = raw_distance/length

        durations = [
            (trace['end_time'] - trace['start_time']).total_seconds()
            for trace in (trace_1, trace_2)
        ]
        ae = np.abs(durations[0] - durations[1])
        return d_l, ae
Exemplo n.º 28
0
def alldist(filex, filey):
    xread = open(filex, 'r').read()
    yread = open(filey, 'r').read()
    lvd = jellyfish.levenshtein_distance(xread,yread)
    dlvd= jellyfish.damerau_levenshtein_distance(xread,yread)
    spsum = spamsum.match(xread,yread)
    spsum = 100 - spsum
    spsum = float(spsum/100.00)
#    print lvd
    res = float( lvd / 100.00 )
    dres= float(dlvd / 100.00 )
#    print res
#    print "Levenshtein Distance=",res
    jaro = jellyfish.jaro_distance(xread,yread)
## Added jaro-winkler distance by fahim 20111011
    jarowink = jellyfish.jaro_winkler(xread,yread)
    jaro = 1.0 - jaro
    jarowink = 1.0 - jarowink
#   print "Jaro Distance = ",jaro
    ham = jellyfish.hamming_distance(xread,yread)
    ham = float ( ham / 100.00)
    print "Hamming Distance = ", ham
#	print "KL-divergence between d1 and d2:", kldiv(tokenize(d1), tokenize(d2))
#	print "KL-divergence between d2 and d1:", kldiv(tokenize(d2), tokenize(d1))
#    print "Spamsum Match score: ", spsum
    kl = kldiv(tokenize(xread), tokenize(yread))

    return res, dres , jaro, jarowink, ham, kl, spsum
Exemplo n.º 29
0
def measure_distance(not_conformant, conformant):
    """Find the closest conformant trace for every non-conformant trace.

    Traces are compared through the Damerau-Levenshtein distance of
    their 'profile' values; on ties the earliest conformant trace wins.

    Returns:
        list of dict: one record per non-conformant trace with the keys
        caseid, sim_caseid and sim_score (one minus the distance divided
        by the longer profile length).
    """
    candidates = conformant.copy()
    similarity = []
    for not_con_trace in not_conformant:
        profile = not_con_trace['profile']
        best_idx = 0
        best_dist = jf.damerau_levenshtein_distance(
            profile, candidates[0]['profile'])
        for idx, candidate in enumerate(candidates):
            dist = jf.damerau_levenshtein_distance(
                profile, candidate['profile'])
            if dist < best_dist:
                best_dist, best_idx = dist, idx
        closest = candidates[best_idx]
        length = np.max([len(profile), len(closest['profile'])])
        similarity.append({
            'caseid': not_con_trace['caseid'],
            'sim_caseid': closest['caseid'],
            'sim_score': (1 - (best_dist / length)),
        })
    return similarity
Exemplo n.º 30
0
def dameraulevenshtein(seq1, seq2):
    """Compute the Damerau-Levenshtein distance between two strings.

    This is the number of additions, deletions, substitutions and
    transpositions (exchanges of *consecutive* characters) needed to
    turn the first string into the second.  Each argument is encoded as
    UTF-8 and the pair is passed to jellyfish.

    >>> dameraulevenshtein('ba', 'abc')
    2
    >>> dameraulevenshtein('fee', 'deed')
    2
    >>> dameraulevenshtein('abcd', 'bacde')
    3

    Note: the real answer is 2: abcd->bacd->bacde
          but this algorithm is apparently doing abcd->acd->bacd->bacde

    NOTE(review): the examples above look inherited from an earlier
    implementation - confirm them against the jellyfish result.
    """
    utf8_pair = (seq1.encode('utf-8'), seq2.encode('utf-8'))
    return jellyfish.damerau_levenshtein_distance(*utf8_pair)
Exemplo n.º 31
0
 def _name_distance_indicator(pkg_name_1, pkg_name_2):
     """Return a length-normalised edit distance between two package names.

     Identical names yield infinity so that a package is never reported
     as being very close to itself (e.g. numpy versus numpy).  Otherwise
     the result is twice the Damerau-Levenshtein distance divided by the
     sum of both name lengths.
     """
     if pkg_name_1 != pkg_name_2:
         edits = jellyfish.damerau_levenshtein_distance(
             pkg_name_1, pkg_name_2)
         combined_length = len(pkg_name_1) + len(pkg_name_2)
         return 2 * edits / combined_length
     return float("inf")
Exemplo n.º 32
0
def get_closest_matches(s, candidates, top_n=1):
    """Return the ``top_n`` candidates most similar to ``s``, best first.

    ``candidates`` holds pairs whose first item is the text compared
    with ``s``.  The score of a candidate is one minus its
    Damerau-Levenshtein distance to ``s`` divided by ``len(s)``.

    Returns:
        list of ``(score, candidate)`` tuples.
    """
    similarities = []
    for text, _ in candidates:
        similarities.append(
            1 - jellyfish.damerau_levenshtein_distance(s, text) / len(s))
    scores = np.array(similarities)
    # argsort is ascending: take the tail and reverse it for best-first.
    best_first = np.argsort(scores)[-top_n:][::-1]
    return [(scores[idx], candidates[idx]) for idx in best_first]
Exemplo n.º 33
0
def simple_example():
    """Print the result of each jellyfish function on sample strings.

    The string-comparison functions are applied to a pair of words and
    the phonetic-encoding functions to a single word.
    """
    def show(template, func, *args):
        # Fill the template with the arguments followed by the result.
        print(template.format(*(args + (func(*args),))))

    # String comparison.
    str1, str2 = u'jellyfish', u'smellyfish'

    show("jellyfish.levenshtein_distance({}, {}) = {}.",
         jellyfish.levenshtein_distance, str1, str2)
    show("jellyfish.damerau_levenshtein_distance({}, {}) = {}.",
         jellyfish.damerau_levenshtein_distance, str1, str2)
    show("jellyfish.hamming_distance({}, {}) = {}.",
         jellyfish.hamming_distance, str1, str2)
    show("jellyfish.jaro_distance({}, {}) = {}.",
         jellyfish.jaro_distance, str1, str2)
    show("jellyfish.jaro_similarity({}, {}) = {}.",
         jellyfish.jaro_similarity, str1, str2)
    show("jellyfish.jaro_winkler({}, {}) = {}.",
         jellyfish.jaro_winkler, str1, str2)
    show("jellyfish.jaro_winkler_similarity({}, {}) = {}.",
         jellyfish.jaro_winkler_similarity, str1, str2)
    show("jellyfish.match_rating_comparison({}, {}) = {}.",
         jellyfish.match_rating_comparison, str1, str2)

    #--------------------
    # Phonetic encoding.
    ss = u'Jellyfish'

    show("jellyfish.metaphone({}) = {}.", jellyfish.metaphone, ss)
    show("jellyfish.soundex({}) = {}.", jellyfish.soundex, ss)
    show("jellyfish.nysiis({}) = {}.", jellyfish.nysiis, ss)
    show("jellyfish.match_rating_codex({}) = {}.",
         jellyfish.match_rating_codex, ss)
Exemplo n.º 34
0
def subtract(filename):
    """Split the peptides of a file by closeness to the control peptides.

    A peptide is "close" when its Damerau-Levenshtein distance to at
    least one line of the ``CONTROL`` file is below the module-level
    ``cutoff``.  Each close peptide is listed once, in order of first
    appearance; every other peptide is "different".  Non-empty results
    are written next to the input file with the suffixes
    ``_.controlsubtracted`` and ``_.hitscontrol``.

    Args:
        filename: path of the peptide file, one sequence per line.

    Returns:
        tuple: ``(different, close)`` lists of sequences.
    """
    # import control file as list
    with open(CONTROL, "r") as control:
        control_list = [line.strip() for line in control]

    # import other files to subtract from
    with open(filename, "r") as peptides:
        peptides_list = [line.strip() for line in peptides]

    close = []
    # Set mirror of ``close`` for constant-time membership tests.
    close_seen = set()

    for peptide in peptides_list:
        if peptide in close_seen:
            continue  # only unique entries are recorded
        for reference in control_list:
            metric = jellyfish.damerau_levenshtein_distance(
                str(peptide), str(reference))
            if metric < cutoff:
                close.append(peptide)
                close_seen.add(peptide)
                break  # one hit is enough

    # screens out any peptides that had a hit in the control
    different = [
        peptide for peptide in peptides_list if peptide not in close_seen
    ]

    # save the filtered lists and the removed hits if they have content
    if len(different) > 0:
        np.savetxt(filename+"_.controlsubtracted", different, fmt="%s", delimiter="\n")
    if len(close) > 0:
        np.savetxt(filename+"_.hitscontrol", close, fmt="%s", delimiter="\n")

    return different, close
Exemplo n.º 35
0
def find_approx(cmd_input: str,
                cmd_map: Optional[Iterable[str]]) -> Iterable[str]:
    """Finds the closest commands to the passed cmd, this is used in case
    we cannot find an exact match for the cmd.

    Two methods are used: prefix match and Damerau-Levenshtein distance
    match.  The input is lower-cased before comparing.  Prefix matches
    take precedence and are returned sorted by name; otherwise the
    commands longer than one character and within a distance of 2 are
    returned, sorted by distance and then by name.

    Returns an empty list when ``cmd_map`` is ``None`` or nothing
    matches.
    """
    if cmd_map is None:
        return []

    needle = str(cmd_input).lower()
    prefix_suggestions = set()
    levenshtein_suggestions = {}

    for another_command in cmd_map:
        if str(another_command).startswith(needle):
            prefix_suggestions.add(another_command)
        #  removing single letter levenshtein suggestions
        #  such as `?`, `q` etc
        elif len(another_command) > 1:
            distance = jellyfish.damerau_levenshtein_distance(
                needle, another_command)
            if distance <= 2:
                levenshtein_suggestions[another_command] = distance

    if prefix_suggestions:
        return sorted(prefix_suggestions)
    # sort suggestions by levenshtein distance and then by name
    return [
        k for k, _ in sorted(levenshtein_suggestions.items(),
                             key=lambda i: (i[1], i[0]))
    ]
def alldist(filex, filey):
    """Compare the full text of two files with edit-distance metrics.

    Both files are read completely into memory. The Levenshtein and
    Damerau-Levenshtein distances between the two texts are computed and
    each is divided by 100.

    Args:
        filex: path of the first file.
        filey: path of the second file.

    Returns:
        A 6-tuple ``(res, dres, jaro, jarowink, ham, kl)`` where ``res`` and
        ``dres`` are the scaled Levenshtein and Damerau-Levenshtein
        distances. See the NOTE above the ``return`` about the other four.
    """
    # NOTE(review): the file objects are never closed explicitly.
    xread = open(filex, "r").read()
    yread = open(filey, "r").read()
    lvd = jellyfish.levenshtein_distance(xread, yread)
    dlvd = jellyfish.damerau_levenshtein_distance(xread, yread)

    #    print lvd
    # Scale both raw distances by a fixed factor of 100.
    res = float(lvd / 100.00)
    dres = float(dlvd / 100.00)
    #    print res
    # print "Levenshtein Distance=",lv_d
    #    jaro = jellyfish.jaro_distance(xread,yread)
    ## Added jaro-winkler distance by fahim 20111011
    #    jarowink = jellyfish.jaro_winkler(xread,yread)
    #    jaro = 1.0 - jaro
    #    jarowink = 1.0 - jarowink
    # 	print "Jaro Distance = ",jaro
    #    ham = jellyfish.hamming_distance(xread,yread)
    #    ham = float ( ham / 100.00)
    # 	print "Hamming Distance = ", ham
    # 	print "KL-divergence between d1 and d2:", kldiv(tokenize(d1), tokenize(d2))
    # 	print "KL-divergence between d2 and d1:", kldiv(tokenize(d2), tokenize(d1))
    #    kl = kldiv(tokenize(xread), tokenize(yread))

    # NOTE(review): jaro, jarowink, ham and kl are only assigned in the
    # commented-out lines above. Unless they exist as module-level globals,
    # this return raises NameError -- confirm and either restore the
    # computations or drop the names from the tuple.
    return res, dres, jaro, jarowink, ham, kl
Exemplo n.º 37
0
 def similarity(self, a, b):
     """Return how alike ``a`` and ``b`` are as an integer percentage.

     The Damerau-Levenshtein distance is divided by the length of the
     longer input (at least 1) and turned into a 0-100 score. If the
     computation runs out of memory a neutral 50 is returned.
     """
     try:
         edits = damerau_levenshtein_distance(a, b)
         longest = max(len(a), len(b), 1)
         ratio = 1.0 - (edits / longest)
         return int(100 * ratio)
     except MemoryError:
         # Inputs too long to compare: report them as only moderately similar.
         return 50
Exemplo n.º 38
0
 def similarity(self, a, b):
     """Score the likeness of two strings on a 0-100 integer scale.

     Uses the Damerau-Levenshtein distance normalised by the longer
     string's length (never less than 1). A ``MemoryError`` during the
     computation yields the fallback score 50.
     """
     try:
         denominator = max(len(a), len(b), 1)
         dissimilarity = damerau_levenshtein_distance(a, b) / denominator
         score = int(100 * (1.0 - dissimilarity))
     except MemoryError:
         # Strings too long to process: treat them as not very similar.
         score = 50
     return score
Exemplo n.º 39
0
def package_conflicts(packages, max_similarity_ratio=1/3, max_distance=2):
    """Yield pairs of package names that are confusingly similar.

    Every ordered pair ``(x, y)`` with ``x > y`` is examined, so each
    unordered pair is considered exactly once and a name is never compared
    with itself.

    Args:
        packages: iterable of package names; it is traversed twice by
            ``itertools.product``, which materialises it first.
        max_similarity_ratio: upper bound on
            ``distance / len(shorter name)``.
        max_distance: upper bound on the Damerau-Levenshtein distance.

    Yields:
        ``(package_x, package_y)`` tuples that satisfy both bounds.
    """
    for package_x, package_y in product(packages, repeat=2):
        if package_x <= package_y:
            continue
        min_len = min(len(package_x), len(package_y))
        if min_len == 0:
            # distance / 0 is unbounded, so an empty name can never satisfy
            # the ratio bound; skipping also avoids ZeroDivisionError.
            continue
        distance = jellyfish.damerau_levenshtein_distance(package_x, package_y)
        if distance <= max_distance and distance / min_len <= max_similarity_ratio:
            yield package_x, package_y
Exemplo n.º 40
0
 def mapperSimilarity(self, _, line):
     """Emit a similarity record for the first two words of ``line``.

     The line is split on single spaces. The Damerau-Levenshtein distance
     between the first two words is passed, together with both word
     lengths, to ``self.normalizeDistanceIndex``. A record
     ``(word1, [word2, sim])`` is yielded when the result exceeds the
     threshold.
     """
     SIMILARITY_THRESHOLD = -1.0
     tokens = line.split(' ')
     left, right = tokens[0], tokens[1]
     edit_distance = damerau_levenshtein_distance(left, right)
     sim = self.normalizeDistanceIndex(len(left), len(right), edit_distance)
     if sim > SIMILARITY_THRESHOLD:
         yield (left, [right, sim])
Exemplo n.º 41
0
    def damerau_levenshtein_apply(x):
        """Return the normalised Damerau-Levenshtein similarity of ``x[0]`` and ``x[1]``.

        The score is ``1 - distance / longer length``. If the computation
        fails and either value is null according to ``pandas.isnull``,
        ``np.nan`` is returned; any other failure is re-raised.
        """
        try:
            edits = jellyfish.damerau_levenshtein_distance(x[0], x[1])
            longest = np.max([len(x[0]), len(x[1])])
            return 1 - edits / longest
        except Exception:
            # A missing value is an expected cause of failure; anything else
            # is a real error and must propagate.
            if pandas.isnull(x[0]) or pandas.isnull(x[1]):
                return np.nan
            raise
Exemplo n.º 42
0
def check_other_names(officer_name, poss_names):
    """Count exact name-part matches between an officer and candidates.

    A Damerau-Levenshtein distance of 0 means the two strings are
    identical, so plain equality is used instead of running the edit
    distance algorithm for every pair.

    Args:
        officer_name: iterable of the officer's name parts.
        poss_names: iterable of entries where ``entry[0]`` is an iterable
            of candidate name parts and ``entry[1]`` is the key the
            matches are counted under.

    Returns:
        A ``Counter`` mapping each ``entry[1]`` to the number of
        (candidate part, officer part) pairs that are equal. Keys with no
        match are absent.
    """
    poss_match_dict = Counter()
    # Materialise once so a one-shot iterable can be scanned repeatedly.
    officer_parts = list(officer_name)
    for p in poss_names:
        hits = sum(1 for n in p[0] for o in officer_parts if n == o)
        if hits:
            poss_match_dict[p[1]] += hits

    return poss_match_dict
Exemplo n.º 43
0
def get_matching_mov_title(seq,gt_rows,col_name):
	"""Return the ``col_name`` value of the row closest to ``seq``.

	Both sides are compared after dropping non-alphanumeric characters and
	upper-casing. Closeness is the Damerau-Levenshtein distance. On a tie
	the earliest row wins. The returned value is the row's original,
	un-normalised text. An empty ``gt_rows`` raises ``IndexError``.
	"""
	def _normalise(text):
		# Keep letters and digits only, case-insensitively.
		return ''.join(ch for ch in text if ch.isalnum()).upper()

	target = _normalise(seq)
	candidates = list(gt_rows)
	winner = candidates[0][col_name]
	lowest = jellyfish.damerau_levenshtein_distance(target, _normalise(winner))
	for candidate in candidates[1:]:
		title = candidate[col_name]
		score = jellyfish.damerau_levenshtein_distance(target, _normalise(title))
		if score < lowest:
			lowest, winner = score, title
	return winner
Exemplo n.º 44
0
    def test_damerau_levenshtein_distance(self):
        """Check the Damerau-Levenshtein distance on a few known pairs."""
        # (first string, second string, expected distance)
        expectations = (
            ("", "", 0),
            ("abc", "", 3),
            ("bc", "abc", 1),
            ("abc", "acb", 1),
        )

        for left, right, expected in expectations:
            actual = jellyfish.damerau_levenshtein_distance(left, right)
            self.assertEqual(actual, expected)
Exemplo n.º 45
0
 def similarity(self, first, second):
     """Return the similarity of two strings as an integer from 0 to 100.

     The distance comes from ``damerau_levenshtein_distance``; when that
     raises ``ValueError`` the ``py_damerau_levenshtein_distance``
     fallback is used. A ``MemoryError`` yields the neutral score 50.
     """
     try:
         try:
             edits = damerau_levenshtein_distance(first, second)
         except ValueError:
             # The C version (default) fails on unicode chars, see
             # https://github.com/jamesturk/jellyfish/issues/55
             edits = py_damerau_levenshtein_distance(first, second)
     except MemoryError:
         # Strings too long to compare: mark them as not much similar.
         return 50
     longest = max(len(first), len(second), 1)
     dissimilarity = float(edits) / longest
     return int(100 * (1.0 - dissimilarity))
Exemplo n.º 46
0
def distance(string_1, string_2):
    """Return several string-comparison metrics for two strings via ``jsonify``.

    The payload holds one entry per metric, keyed by the metric's name.
    """
    pair = (string_1, string_2)
    metrics = {}
    metrics["levenshtein"] = jellyfish.levenshtein_distance(*pair)
    metrics["damerau-levenshtein"] = jellyfish.damerau_levenshtein_distance(*pair)
    metrics["jaro"] = jellyfish.jaro_distance(*pair)
    metrics["jaro-winkler"] = jellyfish.jaro_winkler(*pair)
    # The key says "codex", but the value is the match-rating *comparison*.
    metrics["match_rating_codex"] = jellyfish.match_rating_comparison(*pair)
    metrics["sift3"] = pymailcheck.sift3_distance(*pair)
    return jsonify(metrics)
Exemplo n.º 47
0
def stringDistance(str1, str2):
  """
  Return a combined score for two strings.

  The score is a weighted sum:
    0.5 * jaro + 0.25 * (1 - levenshtein / norm) + 0.25 * (1 - damerau / norm)
  where norm is the length of the longer string after decoding.

  Byte strings are decoded as UTF-8; text that is already decoded is used
  as is. If either input is empty the result is 0.
  """
  distance = 0
  if len(str1) > 0 and len(str2) > 0:
    # Only byte strings need decoding; calling .decode on text fails.
    if isinstance(str1, bytes):
      str1 = str1.decode('utf-8')
    if isinstance(str2, bytes):
      str2 = str2.decode('utf-8')

    jaro = jellyfish.jaro_distance(str1, str2)
    leven = jellyfish.levenshtein_distance(str1, str2)
    damerau = jellyfish.damerau_levenshtein_distance(str1, str2)

    # float() forces true division; with integer division the two
    # normalised terms would collapse to 0 or 1 on Python 2.
    norm = float(max(len(str1), len(str2)))
    distance = 0.5 * jaro + 0.25 * (1 - leven / norm)   \
                          + 0.25 * (1 - damerau / norm)

  return distance
Exemplo n.º 48
0
def find_distances(file1, file2):
    """Compute all pairwise Damerau-Levenshtein distances between two files' lines.

    Each file is read line by line with surrounding whitespace stripped.
    Every line of ``file1`` is compared with every line of ``file2``.

    Returns:
        ``(distances, mean, stdv)``: the flat list of distances (file1-major
        order) and its ``np.mean`` and ``np.std``.
    """
    with open(file1, "r") as first_handle:
        first_lines = [raw.strip() for raw in first_handle]

    with open(file2, "r") as second_handle:
        second_lines = [raw.strip() for raw in second_handle]

    # Outer loop over file1, inner loop over file2.
    distances = [
        jellyfish.damerau_levenshtein_distance(left, right)
        for left in first_lines
        for right in second_lines
    ]

    return distances, np.mean(distances), np.std(distances)
Exemplo n.º 49
0
def calc_distances(product, company):
    """Match each company-name token to its closest product-name token.

    Both names (``product['name']`` and ``company['name']``) are split on
    single spaces. For each company token, the product token with the
    smallest Damerau-Levenshtein distance is chosen; on a tie the earliest
    product token wins.

    Returns:
        A list with one dict per company token, holding the keys
        ``company_token``, ``product_token`` and ``lowest_distance``.
    """
    product_tokens = product['name'].split(' ')
    company_tokens = company['name'].split(' ')

    matches = []
    for company_token in company_tokens:
        scored = [
            (jellyfish.damerau_levenshtein_distance(candidate, company_token),
             candidate)
            for candidate in product_tokens
        ]
        # min() keeps the first of equal scores; str.split never returns an
        # empty list, so there is always at least one candidate.
        lowest_distance, matched_token = min(scored, key=lambda pair: pair[0])
        matches.append({
            'company_token': company_token,
            'product_token': matched_token,
            'lowest_distance': lowest_distance
        })
    return matches
Exemplo n.º 50
0
def get_matching_seq(target_seq, gt_seqs_dict):
    """Return the key of the sequence in ``gt_seqs_dict`` closest to ``target_seq``.

    Sequences are compared after removing non-alphanumeric characters and
    upper-casing, using the Damerau-Levenshtein distance. On a tie the
    first key encountered wins.

    Args:
        target_seq: the sequence to look up.
        gt_seqs_dict: mapping of key -> reference sequence.

    Returns:
        The key of the best match, or ``0`` when ``gt_seqs_dict`` is empty.
    """
    target_seq = ''.join(c for c in target_seq if c.isalnum()).upper()

    # None means "nothing seen yet", so the first candidate always wins,
    # however large its distance is.
    best_score = None
    best_matching_index = 0

    # .items() works on both Python 2 and 3 (.iteritems() is Python 2 only).
    for key, gt_seq in gt_seqs_dict.items():
        gt_seq = ''.join(c for c in gt_seq if c.isalnum()).upper()
        curr_score = jellyfish.damerau_levenshtein_distance(target_seq, gt_seq)

        if best_score is None or curr_score < best_score:
            best_score = curr_score
            best_matching_index = key

    return best_matching_index
Exemplo n.º 51
0
def measure_string_distance(s1, s2, method):
    '''
    Compare two strings with the method selected by ``method``.

    Method codes:
        1. jaro-winkler distance
        2. damerau-levenshtein distance, normalised as
           1 - distance / longer length
        3. Metaphone
        4. NYSIIS
        5. match_rating_codex

    Methods 1 and 2 compare the strings themselves; methods 3, 4 and 5
    compare phonetic encodings and return 1 (encodings equal) or
    0 (encodings differ).

    Returns 0 when either string is empty, when ``method`` is not one of
    the codes above, or when method 2 fails with an exception.
    '''
    result = 0

    if s1 == '' or s2 == '':
        return result

    if method == 1:
        result = jellyfish.jaro_winkler(s1, s2)
    elif method == 2:
        try:
            diff = jellyfish.damerau_levenshtein_distance(s1, s2)
            # float() forces true division on Python 2 as well.
            result = 1 - (diff / float(max(len(s1), len(s2))))
        except Exception:
            # Best effort: a pair that cannot be compared scores 0.
            # KeyboardInterrupt and SystemExit are no longer swallowed.
            result = 0
    elif method == 3:
        result = 1 if jellyfish.metaphone(s1) == jellyfish.metaphone(s2) else 0
    elif method == 4:
        result = 1 if jellyfish.nysiis(s1) == jellyfish.nysiis(s2) else 0
    elif method == 5:
        result = 1 if jellyfish.match_rating_codex(
            s1) == jellyfish.match_rating_codex(s2) else 0

    return result
Exemplo n.º 52
0
    def populate_topics_from_phantom_forms(cls):
        """Create Topic rows from the '$TOPIC' options found in legislator forms.

        Steps:
          1. Fetch the form elements for every legislator and count how often
             each (stripped) topic option occurs.
          2. Pass each lower-cased topic to ``select_solver.choose``. A
             ``None`` result marks the topic as failed; a truthy result
             creates the Topic; any other result is ignored.
          3. For each failed topic, find the parentless Topic whose name has
             the smallest Damerau-Levenshtein distance and create the failed
             topic with that Topic as its ``wikipedia_parent``.

        NOTE(review): this is Python 2 code (print statement, ``iteritems``,
        ``has_key``).
        """
        all_forms = phantom_on_the_capitol.retrieve_form_elements([x.bioguide_id for x in Legislator.query.all()])
        # Maps stripped topic option -> number of occurrences across all forms.
        all_topics = {}
        for legislator, req in all_forms.iteritems():
            for key, val in req.iteritems():
                for step in val:
                    if step['value'] == '$TOPIC':
                        # options_hash is either a dict (its keys are used)
                        # or something iterated directly.
                        if type(step['options_hash']) is dict:
                            keys = step['options_hash'].keys()
                        else:
                            keys = step['options_hash']
                        for k in keys:
                            k = k.strip()
                            if all_topics.has_key(k):
                                all_topics[k] += 1
                            else:
                                all_topics[k] = 1

        failed_topics = []
        # The occurrence count is not used below; only the topic names matter.
        for topic, count in all_topics.iteritems():
            result = select_solver.choose('test', [topic.lower()])
            if result is None:
                failed_topics.append(topic.lower())
            elif result:
                db_first_or_create(Topic, name=topic.lower())

        # all_topics is rebound here: from the count dict to a query of
        # Topic rows that have no wikipedia_parent.
        all_topics = Topic.query.filter_by(wikipedia_parent=None)

        for f_topic in failed_topics:
            try:
                # (closest topic so far, its distance)
                lowest = (None, None)
                for topic in all_topics:
                    print topic.name, f_topic
                    d = jellyfish.damerau_levenshtein_distance(unicode(str(topic.name)), unicode(str(f_topic)))
                    if lowest[0] is None or lowest[1] > d:
                        lowest = (topic, d)
                print 'Adding ' + f_topic + ' with parent ' + lowest[0].name
                db_first_or_create(Topic, name=f_topic, wikipedia_parent=lowest[0].id)
            except:
                # NOTE(review): the bare except silently skips the topic on
                # any error, including the AttributeError raised when no
                # candidate topic exists (lowest[0] is None).
                continue
Exemplo n.º 53
0
    def find(self, name_alias_id, fuzzy=False):
        """
        Look up a security by name, alias, ISIN id or Yahoo id.

        :param name_alias_id: text to search for
        :param fuzzy: when false, the name and aliases are searched with a
            ``contains`` filter and the ISIN / Yahoo ids by exact match;
            when true, every security's name and aliases are compared
            case-insensitively by Damerau-Levenshtein distance
        :return: exact mode: the first matching security, or None;
            fuzzy mode: a tuple ``(security, score)`` for the closest
            security with a distance below 2.5, or ``(None, 2.5)`` when
            there is none
        """
        if not fuzzy:
            matches = Security.objects.filter(name__contains=name_alias_id) |\
                      Security.objects.filter(aliases__contains=name_alias_id) |\
                      Security.objects.filter(isin_id=name_alias_id) |\
                      Security.objects.filter(yahoo_id=name_alias_id)
            if not matches:
                return None
            return matches[0]

        # Only distances strictly below this starting value count as a hit.
        best_score = 2.5
        best_security = None
        for sec in Security.objects.all():
            if isinstance(sec.aliases, list):
                candidate_names = sec.aliases + [sec.name]
            else:
                candidate_names = [sec.name]
            for alias in candidate_names:
                score = jellyfish.damerau_levenshtein_distance(name_alias_id.lower(), alias.lower())
                if score < best_score:
                    best_score = score
                    best_security = sec
        return best_security, best_score
Exemplo n.º 54
0
def dameraulevenshtein(seq1, seq2):
    """Calculate the Damerau-Levenshtein distance between two strings.

    This distance is the number of additions, deletions, substitutions,
    and transpositions needed to transform the first string into the
    second. Transpositions are exchanges of *consecutive* characters.

    Both arguments are encoded to UTF-8 and the work is delegated to
    ``jellyfish.damerau_levenshtein_distance``. Because ``.encode`` is
    called on each argument, only string inputs are supported; arbitrary
    sequences such as lists are not.

    >>> dameraulevenshtein('ba', 'abc')
    2
    >>> dameraulevenshtein('fee', 'deed')
    2
    """
    encoded_first = seq1.encode("utf-8")
    encoded_second = seq2.encode("utf-8")
    return jellyfish.damerau_levenshtein_distance(encoded_first, encoded_second)
Exemplo n.º 55
0
def similarityMeasures(row1, row2):
	"""Return a CSV line of similarity scores for two rows.

	Columns 1..14 of both rows are compared (column 0, the id, is skipped).
	The output holds, in order: the mean Jaro score, the mean Jaro-Winkler
	score, the mean normalised Levenshtein similarity, the mean normalised
	Damerau-Levenshtein similarity, then the 14 per-column Jaro scores.
	Every number is formatted with six decimals.
	"""
	jaro_scores = []
	jaro_winkler_sum = 0
	levenshtein_sum = 0
	damerau_levenshtein_sum = 0

	for idx in range(1,15): #skips id column
		left = row1[idx]
		right = row2[idx]
		jaro_scores.append(jellyfish.jaro_distance(left, right))
		jaro_winkler_sum += jellyfish.jaro_winkler(left, right)
		levenshtein_sum += 1 - jellyfish.levenshtein_distance(left, right) / float(max(len(left), len(right)))
		damerau_levenshtein_sum += 1 - jellyfish.damerau_levenshtein_distance(left, right) / float(max(len(left), len(right)))

	# The four averages come first, followed by the per-column Jaro scores.
	fields = [
		sum(jaro_scores) / 14.0,
		jaro_winkler_sum / 14.0,
		levenshtein_sum / 14.0,
		damerau_levenshtein_sum / 14.0,
	]
	fields.extend(jaro_scores)

	return ",".join("%.6f" % value for value in fields)
# Demo script: prints the results of the jellyfish and python-Levenshtein
# string-comparison functions for a few sample words.
#
# String comparison:
#     Levenshtein Distance
#     Damerau-Levenshtein Distance
#     Jaro Distance
#     Jaro-Winkler Distance
#     Match Rating Approach Comparison
#     Hamming Distance

# Phonetic encoding:
#     American Soundex
#     Metaphone
#     NYSIIS (New York State Identification and Intelligence System)
#     Match Rating Codex
import jellyfish
print(jellyfish.levenshtein_distance('jellyfish', 'smellyfish'))  # 2; edit distance
print(jellyfish.jaro_distance('jellyfish', 'smellyfish'))  # 0.89629629629629637
print(jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs'))  # 1; edit distance that also counts transpositions
print(jellyfish.metaphone('Jellyfish'))  # 'JLFX'
print(jellyfish.soundex('Jellyfish'))  # 'J412'
print(jellyfish.nysiis('Jellyfish'))  # 'JALYF'
print(jellyfish.match_rating_codex('Jellyfish'))  # 'JLLFSH'

##################################################################
## Levenshtein
import Levenshtein
print(Levenshtein.hamming('hello', 'helol'))  # 2; Hamming distance; str1 and str2 must have the same length; it is the number of positions at which two equal-length strings differ
print(Levenshtein.distance('hello', 'helol'))  # 2; edit distance (also called Levenshtein distance); the minimum number of operations needed to turn one string into the other, where the operations are insertion, deletion and substitution
print(Levenshtein.distance('hello world asdf', 'helolaaaa world asdf'))  # 5
print(Levenshtein.ratio('hello', 'helol'))  # 0.8; Levenshtein ratio; formula r = (sum - ldist) / sum, where sum is the combined length of str1 and str2 and ldist is a modified edit distance
# Note: this modified edit distance is not the edit distance computed by distance() above. There each of the three operations costs 1; here deletion and insertion still cost 1, but substitution costs 2
# Rationale: for ratio('a', 'c'), sum=2; with the plain edit distance the result would be (2-1)/2 = 0.5, which is unreasonable because 'a' and 'c' have nothing in common; charging 2 for a substitution solves this
print(Levenshtein.jaro('hello', 'helol'))  # 0.9333333333333332; Jaro distance; used in health census record matching
				damerau_levenshtein_avg = 0
				levenshtein_avg = 0
				for columnIndex in xrange(1,15):
					a = table[index1][columnIndex][1:]
					b = table[index2][columnIndex][1:]
					


					if a=="" or b=="":
						numColumns -= 1
					else:
						jaro_tmp = jellyfish.jaro_distance(a, b) 	
						jaro[columnIndex] = jaro_tmp
						jaro_avg += jaro_tmp
						jaro_winkler_avg += jellyfish.jaro_winkler(a, b)
						damerau_levenshtein_avg += 1 - jellyfish.damerau_levenshtein_distance(a, b) / float(max(len(a), len(b)))
						levenshtein_avg += 1 - jellyfish.levenshtein_distance(a, b) / float(max(len(a), len(b)))


				jaro_avg /= numColumns
				jaro_winkler_avg /= numColumns
				damerau_levenshtein_avg /= numColumns

				# apply the learned rules from the trained model:
				#if jaro_winkler_avg >= 0.844955 or ((damerau_levenshtein_avg >= 0.650227) and (jaro_winkler_avg >= 0.833977)):
				#	results_file.write(table[index1][0] + "\t" + table[index2][0] + "\n")
				
				'''
				#duplicate = [int(table[index1][0]), int(table[index2][0])]

				isDuplicateInReal = duplicate in trueDuplicates'''