Example #1
    def get_celldifference(self):
        conn = sqlite3.connect('ipython.db')
        c = conn.cursor()
        lines = c.execute('SELECT script, cell, line_content FROM ipython ')
        lines = list(lines)
        lines_copy = copy.copy(lines)
        cell_differences = {}
        repo_cell = ""
        for l in range(0, len(lines)):
            line = lines[l]
            line_diff = []
            # compare with every later row; use a separate index so the cursor `c` is not shadowed
            for m in range(l + 1, len(lines_copy)):
                line_copy = lines_copy[m]
                if line[0] == line_copy[0] and line[1] != line_copy[1]:
                    difference = distance.levenshtein(line[2], line_copy[2])
                    same = self.cell_analysis(len(line[2]), len(line_copy[2]), difference)
                    line_diff.append(same)

            repo_cell = str(line[0]) + ": " + str(line[1])
            if repo_cell in cell_differences:
                cell_differences[repo_cell].extend(line_diff)
            else:
                cell_differences[repo_cell] = line_diff

        return cell_differences
Example #2
def kskip_ngram_cuoccurrence(answer, example, ng, kskip, question_index):
    distance_over_ngrams = 0
    # nltk skipgrams function doesn't include unigrams, so they are calculated separately:
    summ_distance = 0
    for word_ex in example:
        dist = 0
        for word_an in answer:
            dist_tmp = distance.levenshtein(word_ex, word_an)
            if dist_tmp > dist:
                dist = dist_tmp
        summ_distance += dist
    try:
        distance_over_ngrams += summ_distance / len(example)
    except ZeroDivisionError:  # empty example
        print("zero")
        print(question_index)
    for n in range(2, ng + 1):
        answer_ngramed = list(skipgrams(answer, n, kskip))
        example_ngramed = list(skipgrams(example, n, kskip))
        summ_distance = 0
        for ngram_of_example in example_ngramed:
            dist = 0
            for ngram_of_answer in answer_ngramed:
                dist_tmp = distance.levenshtein(ngram_of_example, ngram_of_answer)
                if dist_tmp > dist:
                    dist = dist_tmp
            summ_distance += dist
        try:
            distance_over_ngrams += summ_distance / len(example_ngramed)
        except ZeroDivisionError:  # no n-grams at this order
            pass
    return distance_over_ngrams
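
For reference, this is what nltk's skipgrams yields at n=2, k=1; a quick illustrative check, not part of the original module:

# Illustrative only:
from nltk.util import skipgrams
tokens = ["a", "b", "c"]
print(list(skipgrams(tokens, 2, 1)))  # [('a', 'b'), ('a', 'c'), ('b', 'c')]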
Example #3
def run_cal():
    db = MySQLdb.connect(host="localhost", port=3308, user="******",
                         passwd="your_password", db="MovieSizer", charset="utf8")
    cursor = db.cursor()
    sql = "select * from movies_movieinfo"
    cursor.execute(sql)
    res = cursor.fetchall()
    id = 1
    print('\n\n cal_similar_gry.py\n\n')
    # limit the size of the similarity matrix
    for i in range(0, int(len(res)/100)):
        for j in range(i+1, len(res)):
            i_id = res[i][0]
            j_id = res[j][0]
            moviename_length = distance.levenshtein(res[i][1], res[j][1])
            nation_length = distance.levenshtein(res[i][3], res[j][3])
            directors_length = distance.levenshtein(res[i][4], res[j][4])
            leadactors_length = distance.levenshtein(res[i][5], res[j][5])
            editors_length = distance.levenshtein(res[i][6], res[j][6])
            length = moviename_length + nation_length + \
                directors_length + leadactors_length + editors_length
            # identical rows would give length == 0; treat them as maximally similar
            similar = 1 / length if length else 1.0
            sql = "INSERT INTO movies_moviesimilar VALUES (%s, %s, %s, %s)"
            cursor.execute(sql, (id, i_id, j_id, similar))
            db.commit()
            id = id + 1
        print('current : ', i)
    db.close()
    print('DONE !')
Example #4
    def testLD(self):
        str1 = "hello my name is jose"
        str2 = "hello m yname is jose"
        str3 = "hlelo my name is jose"
        self.assertEqual(distance.levenshtein(str1, str2), 2)
        self.assertEqual(distance.levenshtein(str1, str3), 2)
        self.assertEqual(distance.levenshtein(str1, str2),
                         distance.levenshtein(str2, str1))
Example #5
def valid_text(text):
    text = clean_text(text)
    # print(text)
    if text == "":
        return False, ''
    p = re.compile(r'((\d{3})-(\d{3})-(\d{2}))')
    if p.search(text) is not None:
        found = p.search(text).group()
        print(f"Matched regex 1: {found}")
        return True, found
    p = re.compile(r'(LPN\d{8})')
    if p.match(text) is not None:
        # print(f"Matched regex 2: {p.match(text).group()}")
        return True, p.match(text).group()
    else:
        if len(text) > 3:
            min_distance_levenshtein = 2
            index = text.find('L')
            if index >= 0:
                #print('traces ', distance.levenshtein('LPN', text[max(index-2, 0):index]))
                if distance.levenshtein(
                        'LPN', text[index:min(index + 2, len(text)
                                              )]) <= min_distance_levenshtein:
                    text_aux = text[index + 3:-1]
                    text_aux = 'LPN' + text_aux
                    #print("NUEVO LPN L ", text_aux)
                    if p.match(text_aux) is not None:
                        # print(f"Matched regex 2: {p.match(text_aux).group()}")
                        return True, p.match(text_aux).group()

            index = text.find('P')
            if index >= 0:
                #print('traces ', distance.levenshtein('LPN', text[max(index-2, 0):index]))
                if distance.levenshtein(
                        'LPN',
                        text[max(index - 1, 0):min(index + 1, len(text))]
                ) <= min_distance_levenshtein:
                    text_aux = text[index + 2:-1]
                    text_aux = 'LPN' + text_aux
                    #print("NUEVO LPN P ", text_aux)
                    if p.match(text_aux) is not None:
                        # print(f"Matched regex 2: {p.match(text_aux).group()}")
                        return True, p.match(text_aux).group()

            index = text.find('N')
            if index >= 0:
                #print('traces ', distance.levenshtein('LPN', text[max(index-2, 0):index]))
                if distance.levenshtein(
                        'LPN',
                        text[max(index -
                                 2, 0):index]) <= min_distance_levenshtein:
                    text_aux = text[index + 1:-1]
                    text_aux = 'LPN' + text_aux
                    #print("NUEVO LPN N ", text_aux)
                    if p.match(text_aux) is not None:
                        # print(f"Matched regex 2: {p.match(text_aux).group()}")
                        return True, p.match(text_aux).group()
    return False, ''
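
A couple of hedged illustrative calls (clean_text is defined elsewhere in the original project; here it is assumed to be a pass-through normalizer):

# Illustrative only:
print(valid_text("ref 123-456-78 ok"))  # (True, '123-456-78')
print(valid_text("LPN12345678"))        # (True, 'LPN12345678')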
Example #6
    def predict_oov_v2(self, n):
        if len(self.masked_index) != 0:
            print('Masked sentence ' + ' '.join(self.tokenized_text) +
                  ' and the masked index is ' + str(self.masked_index))
            indexed_tokens = self.tokenizer.convert_tokens_to_ids(
                self.tokenized_text)
            segments_ids = [0] * len(self.tokenized_text)
            tokens_tensor = torch.tensor([indexed_tokens])
            segments_tensors = torch.tensor([segments_ids])

            #print (indexed_tokens)
            with torch.no_grad():
                outputs = self.model(tokens_tensor,
                                     token_type_ids=segments_tensors)
                predictions_1 = outputs[0]
            #predicted_index = torch.argmax(predictions_1[0, masked_index]).item()

            bestsent = list(self.tokenized_text)  # copy, so the original token list is not mutated

            # looping the masked index
            for mei in self.masked_index:

                sorted_index = torch.argsort(predictions_1[0, mei],
                                             descending=True).tolist()
                sorted_probability = [
                    predictions_1[0, mei][i] for i in sorted_index
                ]

                returnstr = []

                for loop in range(n):
                    predicted_token = self.tokenizer.convert_ids_to_tokens(
                        [sorted_index[loop]])[0]
                    returnstr.append(predicted_token)
                    #returnstr.append(' '.join(self.tokenized_text).replace('[MASK]', predicted_token))
                    #print(predicted_token)

                bestd = 100
                bestw = ''

                for v in returnstr:
                    # compute the distance once per candidate
                    d = distance.levenshtein(
                        v.lower(), self.original_sentence[mei].lower())
                    if d < bestd:
                        bestd = d
                        bestw = v

                bestsent[mei] = bestw

            print('Recovered sentence ' + ' '.join(bestsent))
            return ' '.join(bestsent), self.get_score(bestsent)

        else:
            return ' '.join(self.tokenized_text), self.get_score(
                self.tokenized_text)
Example #7
def compare_mt(usertrans, referencetrans, machinetrans):
	"""
	Compare if user translation better or worst
	than machine translation
	"""
	
	evaluation = {'better': ['Congratulations, you did better than the machine translation!', \
							 'Be proud, you were better than the machine translation!', \
							 'You are the best, even better than the machine translation!'], \
					'same': ['You did as good as the machine translation!', \
							 'This is a tie between you and the machine translation!', \
							 'The machine translation was as good as you!'], \
					'worst': ["The machine translation beat you, let's try to do better!", \
							  "What a shame, you were defeated by the machine translation.", \
							  "Next time, you will beat the machine translation, but not this time!"]}
	
	# encode sentences to UTF-8
	ut = usertrans.encode('utf8')
	tt = referencetrans.encode('utf8')
	mt = machinetrans.encode('utf8')
	
	# remove punctuation
	replace_punctuation = string.maketrans(string.punctuation, ' '*len(string.punctuation))
	
	ut = ut.translate(replace_punctuation).lower()
	tt = tt.translate(replace_punctuation).lower()
	mt = mt.translate(replace_punctuation).lower()
	
	# Levenshtein ratio over characters
	lev_let_ut = lev.ratio(tt, ut)
	lev_let_mt = lev.ratio(tt, mt)
	
	# Levenshtein ratio over tokens
	ut = ut.split()
	tt = tt.split()
	mt = mt.split()
	
	lensum_user = len(ut)+len(tt)
	lensum_machine = len(mt)+len(tt)
	
	lev_tok_ut = float(lensum_user - distance.levenshtein(tt, ut)) / lensum_user
	# compare the machine translation against the reference, not against the user
	lev_tok_mt = float(lensum_machine - distance.levenshtein(tt, mt)) / lensum_machine
	
	# take the best Levenshtein ratio
	ratio_ut = max(lev_let_ut, lev_tok_ut)
	ratio_mt = max(lev_let_mt, lev_tok_mt)
	
	# evaluate whether the user did better than, worse than, or as well as the machine
	if ratio_ut > ratio_mt:
		return random.choice(evaluation['better'])
	elif ratio_ut < ratio_mt:
		return random.choice(evaluation['worse'])
	else:
		return random.choice(evaluation['same'])
Example #8
    def compare_sent(self, sent_true: str, sent_pred: str):
        '''
        :param sent_true: reference sentence
        :param sent_pred: predicted sentence
        :return: tuple of (absolute, normalized) Levenshtein distance
        '''
        return distance.levenshtein(
            sent_true, sent_pred), distance.levenshtein(sent_true,
                                                        sent_pred,
                                                        normalized=True)
Example #9
def get_closest_word(ww, wset):
    bestd = 100  # sentinel, larger than any expected distance
    bestw = ''

    for v in wset:
        # compute the distance once per candidate
        d = distance.levenshtein(v.lower(), ww.lower())
        if d < bestd:
            bestd = d
            bestw = v

    return bestw
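
A quick illustrative call (not from the original source):

# Illustrative only:
print(get_closest_word("helo", ["hello", "help", "world"]))  # "hello" (distance 1; first of the ties wins)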
Example #10
def compare_lex(item1, item2):
    global search_word
    if search_word == item1["point headline"]:
        return 1
    if search_word == item2["point headline"]:
        return -1
    else:
        if levenshtein(item1["point headline"], search_word) < levenshtein(
                item2["point headline"], search_word):
            return 1
        else:
            return -1
Example #11
def getIDs(name, players, ids):
    best = 100000
    bestIndex = -1
    # enumerate avoids re-scanning with players.index(), which breaks on duplicate names
    for i, player in enumerate(players):
        d = distance.levenshtein(name, player)
        if d < best:
            best = d
            bestIndex = i

    if best > 5:
        print(name)
        print(ids[bestIndex])

    return ids[bestIndex]
Example #12
    def test_levenshteinBase(self):

        a = ''
        b = ''
        self.assertAlmostEqual(distance.levenshtein(a, b), 0, places=3)

        a = 'abc'
        b = 'abc'
        self.assertAlmostEqual(distance.levenshtein(a, b), 0, places=3)

        a = 'abcd'
        b = 'abc'
        self.assertAlmostEqual(distance.levenshtein(a, b), 0.25, places=3)

        a = 'abc'
        b = 'abd'
        self.assertAlmostEqual(distance.levenshtein(a, b), 0.33333, places=3)

        a = 'ab'
        b = 'abc'
        self.assertAlmostEqual(distance.levenshtein(a, b), 0.33333, places=3)

        a = ''
        b = 'abc'
        self.assertAlmostEqual(distance.levenshtein(a, b), 1, places=3)

        a = 'abc'
        b = ''
        self.assertAlmostEqual(distance.levenshtein(a, b), 1, places=3)

        a = 'abc'
        b = 'xyz'
        self.assertAlmostEqual(distance.levenshtein(a, b), 1, places=3)
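
These expectations divide the raw edit distance by the length of the longer string (for example, 'abcd' vs 'abc' is one edit over a maximum length of four, hence 0.25), so the function under test is a normalized Levenshtein. With the distance package this behavior is available via the normalized=True flag:

# Illustrative only:
import distance
print(distance.levenshtein('abcd', 'abc', normalized=True))  # 0.25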
Example #13
def calculateSimilarity_WithDistancePackage(createdSentence):  # createdSentence is a list of words

    levinDist = {}
    jaccardDist = {}
    bestValues = {}
    for i in range(len(X_train)):
        currSentence = X_train[i]
        sentence_str = [index_to_word[x] for x in currSentence[1:-1]]  # sentence_str is a list of words
        # Levenshtein distance
        dist = distance.levenshtein(createdSentence, sentence_str)
        dist2 = distance.jaccard(createdSentence, sentence_str)
        if dist > 0:
            levinDist[i] = dist
        jaccardDist[i] = dist2

    # take the best (smallest) value of each
    levinMin = min(levinDist.values())
    jaccardMin = min(jaccardDist.values())

    print("Best Distance Levenshtein: %f" % levinMin)
    print("Best Distance Jaccard: %f" % jaccardMin)
    bestValues["Jaccard"] = jaccardMin
    bestValues["Levin"] = levinMin
    return bestValues
Example #14
def main(args):
    parameters = process_args(args)

    target_formulas_file = parameters.target_file
    predicted_formulas_file = parameters.predicted_file

    with open(target_formulas_file) as f:
        target_formulas = f.readlines()
    with open(predicted_formulas_file) as f:
        predicted_formulas = f.readlines()

    i = 0

    total_len = 0
    total_edit_distance = 0
    if len(target_formulas) != len(predicted_formulas):
        print("number of formulas doesn't match")
        return
    n = len(target_formulas)
    for tf, pf in zip(target_formulas, predicted_formulas):
        i += 1
        if i % 2000 == 0:
            print("{}/{}".format(i, n))

        tf_ = tf.strip()
        pf_ = pf.strip()
        true_token = tf_.split(' ')
        predicted_tokens = pf_.split(' ')
        l = max(len(true_token), len(predicted_tokens))
        edit_distance = distance.levenshtein(true_token, predicted_tokens)
        total_len += l
        total_edit_distance += edit_distance
    print("{}/{}".format(n, n))
    print('Edit Distance Accuracy: %f' %
          (1. - float(total_edit_distance) / total_len))
Example #15
def cluster_similar_responses(output_path):
    max_count = get_max_socket_message_count(output_path)
    listing = glob.glob(output_path + '*-%s.log' % max_count)

    messages = [open(filename).read() for filename in listing]  # open(), not the Python 2 file()
    messages = [extract_description_from_message(m) for m in messages]
    messages = np.asarray(messages)

    print()
    print('Clustering %s responses...(this might take a while)' %
          len(messages))
    print()

    lev_similarity = -1 * np.array(
        [[distance.levenshtein(m1, m2) for m1 in messages] for m2 in messages])

    affprop = sklearn.cluster.AffinityPropagation(affinity='precomputed',
                                                  damping=0.5)
    affprop.fit(lev_similarity)

    print('Generated clusters:')
    print()

    for cluster_id in np.unique(affprop.labels_):
        exemplar = messages[affprop.cluster_centers_indices_[cluster_id]]
        cluster = np.unique(
            messages[np.nonzero(affprop.labels_ == cluster_id)])
        cluster_str = ', '.join(cluster)
        print('-' * 80)
        print(' - *%s:* %s' % (exemplar, cluster_str))
        print('-' * 80)
        print()
Example #16
def dist(word1, word2):  # measure how similar two tracks are
	import distance

	# print distance.hamming(word1, word2, normalized=True)
	# Hamming distance can't be used here: it is only defined for strings of equal length

	# Levenshtein distance: the minimum number of single-character insertions, deletions
	# and substitutions needed to turn one string into the other
	return distance.levenshtein(word1, word2, normalized=True)
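
A quick sanity check of the normalized variant (illustrative, not from the original script):

# Illustrative only:
import distance
print(distance.levenshtein("kitten", "sitting"))                   # 3
print(distance.levenshtein("kitten", "sitting", normalized=True))  # 3/7 ~ 0.43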
Example #17
    def similarity(self, word1, word2):
        # The score function may vary and can surely be improved;
        # the distance module is used here
        ldist = distance.levenshtein(word1, word2)
        hdist = 0  # distance.hamming(word1, word2) requires equal length, e.g. padding
        score = ldist * 1  # + hdist * 0
        return score
Example #18
    def initUI(self):
        inputGroups = self.inputText.split()
        solutionGroups = self.solutionText.split()

        numLetters = 0.0
        numErrors = 0.0
        for idx, _ in enumerate(solutionGroups):
            if idx >= len(inputGroups):
                inputGroups.append("")
            numLetters += len(solutionGroups[idx])
            numErrors += levenshtein(solutionGroups[idx], inputGroups[idx])
        percentage = numErrors / numLetters * 100.0

        solutionLabel = QLabel(self.createEvaluationRichText(solutionGroups, inputGroups))
        errorLabel = QLabel("Error rate (Levenshtein): %02.2f%%" % percentage)

        layout = QVBoxLayout()
        layout.addWidget(solutionLabel)
        layout.addWidget(errorLabel)

        if percentage < 10.0:
            successLabel = QLabel("Error rate lower than 10%! <br/> Proceed to next lesson!")
            layout.addWidget(successLabel)

        self.setLayout(layout)
        self.setWindowTitle('Evaluation')
Example #20
def checkNames(df):
    df = df[df['mins'] >= 15]
    df['hours'] = 1. / 60. * df['mins']
    res = df.groupby('name')['hours'].agg(np.sum)

    nPart = len(res)

    names = res.index.tolist()

    import sklearn.cluster
    import distance

    #words = "YOUR WORDS HERE".split(" ") #Replace this line
    words = names[0:100]
    words = np.asarray(words)  #So that indexing with a list will work
    lev_similarity = -1 * np.array(
        [[distance.levenshtein(w1, w2) for w1 in words] for w2 in words])
    print(lev_similarity)

    vals = []
    for idx, x in np.ndenumerate(lev_similarity):
        vals.append([x, idx])

    vals = sorted(vals, key=lambda s: s[0])
    for v in vals:
        print(v[0], words[v[1][0]], ' , ', words[v[1][1]])
Example #21
def ubica(bot, update, user_data):
    conn = sqlite3.connect('aemet.db')
    c = conn.cursor()
    sitio = update.message.text.title()
    c.execute('SELECT * FROM municipio WHERE nombre=?', (sitio, ))
    base = c.fetchone()
    if base is None:
        update.message.reply_text(
            'Parece que no está bien escrito, déjame ver...')
        c.execute('SELECT * FROM municipio')
        flag = 0

        # iterate over every municipality row instead of hard-coding the row count
        for base in c.fetchall():
            if distance.levenshtein(unidecode(sitio.lower()),
                                    unidecode(base[0].lower()),
                                    normalized=True) < 0.2:
                flag = 1
                break
        if flag == 0:
            update.message.reply_text(
                'Ubicación no encontrada. Por favor, indica el municipio que quieres buscar:'
            )
            return LUGAR

    cod[update.message.chat.id] = base[1]

    update.message.reply_text("Se ha elegido el municipio %s, en %s" %
                              (base[0], base[2]),
                              reply_markup=markup)
    place[update.message.chat.id] = base[1]

    return CHOOSING
Example #22
def inlevenshtein(seq1, seqs, max_dist=0.1):
    for seq2 in seqs:
        # with max_dist set, distance.levenshtein returns -1 as soon as the distance exceeds it
        dist1 = distance.levenshtein(seq1, seq2, max_dist=2)
        if dist1 != -1:
            dist2 = distance.nlevenshtein(seq1, seq2)
            if dist2 <= max_dist:
                yield dist2, seq2
Example #23
def distance_calculating(train_tuple, test_tuple):
    """
    Calculate the Levenshtein distance between two items and return a
    dictionary with the index of closest matches as a list as its value and
    the n_test item as the key.

    This function returns a dictionary of the form:
        n_test: [distance, [n_train...n_train]]
    """
    data = dict()

    for n_test, test_item in enumerate(test_tuple):

        # set default value for the data dict.
        data.setdefault(n_test, [-1, []])

        # remove the class label
        test_item = test_item[:-1]

        for n_train, train_item in enumerate(train_tuple):
            # remove the class label
            train_item = train_item[:-1]

            min_distance = distance.levenshtein(test_item, train_item)

            # if there is an exact match
            if min_distance == 0:
                data[n_test] = [min_distance, [n_train]]
                break
            elif min_distance == data[n_test][0]:
                data[n_test][1].append(n_train)
            elif min_distance < data[n_test][0] or data[n_test][0] == -1:
                data[n_test] = [min_distance, [n_train]]

    return data
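
A small worked call (illustrative only; the last element of each tuple is treated as the class label):

# Illustrative only:
train = (("a", "b", "c", "yes"), ("x", "y", "z", "no"))
test = (("a", "b", "d", "?"),)
print(distance_calculating(train, test))  # {0: [1, [0]]}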
Example #24
def knn(test, orig, k):
    index = []
    dist = []
    ct = 1
    y = test[0:8]
    oc = test[8]
    for i in orig:
        x = i[0:8]
        # compare the feature slices, leaving out the class label
        dist.append(distance.levenshtein(y, x))
        index.append(i[8])

    dist, index = pairsort(dist, index)
    cp = {}
    for i in dist:
        if ct <= k:

            if index[ct - 1] in cp.keys():
                cp[index[ct - 1]] += 1
            else:
                cp[index[ct - 1]] = 1
        else:
            break
        ct += 1
    predicted = max(cp.items(), key=operator.itemgetter(1))[0]
    return predicted, oc
Example #25
def calcDist(Mots, AllWordsList, nbSyll):
    nb9 = nbSyll // 9  # integer division: nb9 is used as a slice index below
    # compute each pseudo-word's minimum Levenshtein distance to the base
    WordDist = []
    for i, mot in enumerate(Mots):  # pseudo-words
        minD = 1000000
        minWord = ''
        nbD = 0
        for word in AllWordsList:  # base vocabulary
            d = distance.levenshtein(mot, word)
            if d < minD:
                minD = d
                minWord = word
                nbD = 1
            elif d == minD:
                nbD += 1
        WordDist.append([mot, minWord, minD, nbD])
    distList = [x[2] for x in WordDist]
    # most common distance value
    [dist, nb] = Counter(distList).most_common(1)[0]
    if nb < 12:
        return -1
    L1 = distList[0:nb9]
    L2 = distList[nb9:2 * nb9]
    L3 = distList[2 * nb9:3 * nb9]
    if L1.count(dist) < 4 or L2.count(dist) < 4 or L3.count(dist) < 4:
        return -1
    L1Res = [x for x in WordDist[0:nb9] if x[2] == dist]
    L2Res = [x for x in WordDist[nb9:2 * nb9] if x[2] == dist]
    L3Res = [x for x in WordDist[2 * nb9:3 * nb9] if x[2] == dist]
    return L1Res + L2Res + L3Res
Example #27
    def get_conceptual_independence(data):
        comm = []
        count = 0
        comm_data = data["clusters"]
        for i in comm_data:
            temp = []
            for j in i["nodes"]:
                if find_node_type(data, j):
                    name = find_node(data, j)
                    name = name.split(".")[-1]
                    temp.append(name)
            comm.append(temp)

        community_score = []
        for i in comm:
            score = 0
            words = np.asarray(i)
            for j in words:
                for k in words:
                    # print (distance.levenshtein(j, k)/max(len(j), len(k)))
                    score += 1 - distance.levenshtein(j, k) / max(
                        len(j), len(k))
            community_score.append(score / len(i)**2)

        total_score = 0
        for i in community_score:
            total_score += i

        total_score = total_score / len(community_score)

        return total_score
Example #28
    def get_repodifference(self):
        conn = sqlite3.connect('ipython.db')
        c = conn.cursor()
        lines = c.execute(
            'SELECT repository, script, line_content FROM ipython ')
        lines = list(lines)
        lines_copy = copy.copy(lines)
        repo_differences = {}
        for l in range(0, len(lines)):
            line = lines[l]
            line_diff = []
            # compare with every later row; use a separate index so the cursor `c` is not shadowed
            for m in range(l + 1, len(lines_copy)):
                line_copy = lines_copy[m]
                if line[0] != line_copy[0] and line[1] != line_copy[1]:
                    difference = distance.levenshtein(line[2], line_copy[2])
                    same = self.cell_analysis(len(line[2]), len(line_copy[2]),
                                              difference)
                    line_diff.append(same)

            repo_cell = str(line[0]) + ": " + str(line[1])
            if repo_cell in repo_differences:
                repo_differences[repo_cell].extend(line_diff)
            else:
                repo_differences[repo_cell] = line_diff

        return repo_differences
Example #29
def get_title(author, talk_names):
    for name in talk_names:
        if name.startswith(author):
            matches = re.search(r'(.*) "(.*)"', name)

            if matches:
                return matches.group(2)
            else:
                return name

    # Use Levenshtein distance to get the closest title
    min_title = ''
    min_distance = float('inf')
    for subauth, subtitle in filter(lambda x: x[0],
                                    map(get_author, talk_names)):
        cur_dis = distance.levenshtein(subauth, author)

        if cur_dis < min_distance:
            min_distance = cur_dis
            min_title = subtitle

    if min_title:
        return min_title
    else:
        return 'not_found. Is it no educativa?'
Example #30
def assign_third(first, second, tags):
    first_out = []
    second_out = []
    third_out = []
    
    B = len(first)
    attempts = min(B-1,ATTEMPTS)
    for i in range(B): 
        ti = first[i]
        si = second[i]
        hi, tidi, sidi = tags[i]
        flag = attempts
        checked = []
        while (flag): 
            j = random.randrange(B)
            if j in checked:
                continue
            checked.append(j)
            flag -= 1
            hj = tags[j][0]
            if distance.levenshtein(hi, hj) > MIN_LEV_DIST: 
                if (random.getrandbits(1)): 
                    tj = first[j] 
                    newdi = tags[j][1]
                else: 
                    tj = second[j] 
                    newdi = tags[j][2]
                if (newdi != tidi ) & (newdi != sidi): 
                    first_out.append(ti)
                    second_out.append(si)
                    third_out.append(tj)
                    flag = 0
                    
    return (first_out, second_out, third_out)
Example #31
def RatingLevenshtein(revista):

    #Get path file
    dirname = os.path.dirname(__file__)
    loc = os.path.join(dirname, r'JCR2018.xlsx')

    #Initialize reader
    workbook = xlrd.open_workbook(loc)
    sheet = workbook.sheet_by_index(0)

    tuplas = []

    start_time = time()

    for i in range(sheet.nrows):
        valor = (sheet.cell_value(i, 1),
                 levenshtein(revista, sheet.cell_value(i, 1)))
        tuplas.append(valor)

    final_time = time()
    execution_time = round(final_time - start_time, 2)

    tuplas.sort(key=lambda revista: revista[1])

    top_10 = tuplas[:10]

    result = (top_10, execution_time)

    return result
Example #32
    def score(self):
        count = []
        j_coeff = []
        j_coeff_feat = []
        l_dist = []
        # pred_feat_list = []
        # pred_feat_accuracy = []
        bleu_score = []
        for i in range(self.df.shape[0]):
            curr_feat = np.array([self.df.iloc[i, 0:4]])
            path, label, decision_feature = self.predict(curr_feat)
            print('actual vs predicted: ', self.df.iloc[i, 4], ' vs ',
                  ' '.join(path), 'labels: ', self.df.iloc[i, 5], label[0])
            count.append(self.df.iloc[i, 5] == label[0])
            actual_path = self.df.iloc[i, 4].split()
            actual_path_tok = [self.char_indices[char] for char in actual_path]
            pred_path_tok = [self.char_indices[char] for char in path]
            # print('actual_path--', actual_path)
            # print('path--', path)
            bleu_score.append(sentence_bleu([actual_path], path))
            j_coeff.append(self.get_j_coeff(actual_path_tok, pred_path_tok))
            j_coeff_feat.append(
                self.get_j_coeff(self.df.iloc[i, 6], decision_feature))
            l_dist.append(
                distance.levenshtein(self.df.iloc[i, 4].replace(' ', ''),
                                     ''.join(path)))

            print('Actual vs predicted features: ', self.df.iloc[i, 6], 'vs',
                  decision_feature, '\n')

        print('\nLabel accuracy - ', np.mean(count))
        print('Path metric (Jaccard) - ', np.mean(j_coeff))
        print('Path metric (Levenshtein) - ', np.mean(l_dist))
        print('Decision feature metric (Jaccard) - ', np.mean(j_coeff_feat))
        print('Bleu score of paths - ', np.mean(bleu_score))
Example #33
def edit_distance(references, hypotheses):
    d_leven, len_tot = 0, 0
    for ref, hypo in zip(references, hypotheses):
        d_leven += distance.levenshtein(ref, hypo)
        len_tot += float(max(len(ref), len(hypo)))

    return 1. - d_leven / len_tot
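
A small worked call (illustrative only): one substituted token out of three gives 1 - 1/3:

# Illustrative only:
refs = [["a", "b", "c"]]
hyps = [["a", "x", "c"]]
print(edit_distance(refs, hyps))  # 0.666...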
Example #34
def update1(k1, k2):
	i = 0
	while i < len(k1):
		t1 = k1[i]['T']
		j = 0
		flag = 0
		while j < len(k2):
			t2 = k2[j]['T']
			dist = int(distance.levenshtein(t1, t2))
			# flag the first title within edit distance 18
			if dist <= 18:
				flag = 1
				break
			j = j + 1
		i = i + 1
Example #35
def align(sen1, sen2):
	"""finds the best mapping of words from one sentence to the other"""
	#find lengths
	sen1 = list(map(preprocess_word, sen1.split()))
	sen2 = list(map(preprocess_word, sen2.split()))
	lengthDif = len(sen1) - len(sen2)
	if lengthDif > 0:
		shorter = sen2
		longer = sen1
	else:
		shorter = sen1
		longer = sen2
		lengthDif = abs(lengthDif)
	shorter += ["emptyWord"] * lengthDif

	#create matrix	
	matrix = np.zeros((len(longer), len(longer)))
	for i in range(len(longer)):
		for j in range(len(longer) - lengthDif):
			matrix[i,j] = distance.levenshtein(longer[i], shorter[j])
	print(matrix)
	
	#compare with munkres
	m = Munkres()
	indexes = m.compute(matrix)
	print("mapping is:",[(shorter[i], longer[j]) for (i,j) in indexes])
Example #36
    def comp_string(self):  # self.comp_string()

        allOriginalWords = []
        allNLD = []

        # rec = j[count][wordPos].split() #j is entry, count is dict key, wordPos is sentence called

        for i in self.original:
            split = i.split()  # this splits one whole sentence to words
            allOriginalWords.append(split)

        for i in allOriginalWords:
            groupNLD = []
            allNLD.append(groupNLD)

        for i in self.recognized:
            for j in range(25):  # the original iterates over a fixed set of 25 sentences
                sentence = i[j].split()
                ld = d.levenshtein(allOriginalWords[j], sentence)
                # normalized Levenshtein distance
                nld = ld / max(len(allOriginalWords[j]), len(sentence))
                allNLD[j].append(nld)
        for i in allNLD:
            s.distances.append(i)
            #print(i)
        print("finished distance NLD")
Example #37
def analyse_titles(author, liste_titles, dict_titles2ark, zipname, outputfile):
    """
    Si un auteur a plusieurs titres associés : 
    on compare chaque titre avec chacun des titres suivants dans la liste
    en ne tenant pas compte des variantes sur les chiffres
    """
    i = 0
    for title in liste_titles[i:]:
        title_chars = clean_titles_int(title)
        for foll_title in liste_titles[i + 1:]:
            foll_title_chars = clean_titles_int(foll_title)
            dist = distance.levenshtein(title, foll_title)
            if dist in (1, 2) and title_chars != foll_title_chars:
                nbark_title1 = len(dict_titles2ark[title])
                nbark_title2 = len(dict_titles2ark[foll_title])
                ark_title = " ".join(list(dict_titles2ark[title]))
                ark_foll_title = " ".join(list(dict_titles2ark[foll_title]))
                if nbark_title1 > nbark_title2:
                    line = [
                        str(author), ark_title, title, ark_foll_title,
                        foll_title,
                        str(dist), zipname
                    ]
                    line2report(line, outputfile)
                else:
                    line = [
                        str(author), ark_foll_title, foll_title, ark_title,
                        title,
                        str(dist), zipname
                    ]
                    line2report(line, outputfile)
        i += 1
Example #39
    def __init__(self, mapping=None, weights=None):
        self._distanceMetrics = {
            'euclidean': lambda a, b: distance.euclidean([a], [b]),
            'manhattan': distance.manhattanScalar,
            'levenshtein': lambda a, b: distance.levenshtein([a], [b]),
            'needleman_wunsch':
            lambda a, b: distance.needleman_wunsch([a], [b]),
            'jaccard': distance.jaccard,
            'dice': distance.dice
        }

        self._mapping = mapping
        if self._mapping is None:
            self._mapping = [None] * NUM_FEATURES
            self._mapping[STARS] = self._distanceMetrics['manhattan']
            self._mapping[TOTAL_REVIEW_COUNT] = self._distanceMetrics[
                'manhattan']
            self._mapping[AVAILABLE_REVIEW_COUNT] = self._distanceMetrics[
                'manhattan']
            self._mapping[MEAN_REVIEW_LEN] = self._distanceMetrics['manhattan']
            self._mapping[MEAN_WORD_LEN] = self._distanceMetrics['manhattan']
            self._mapping[NUM_WORDS] = self._distanceMetrics['manhattan']
            self._mapping[MEAN_WORD_COUNT] = self._distanceMetrics['manhattan']
            self._mapping[TOTAL_HOURS] = self._distanceMetrics['manhattan']
            self._mapping[ATTRIBUTES] = self._distanceMetrics['jaccard']
            self._mapping[CATEGORIES] = self._distanceMetrics['jaccard']
            self._mapping[TOP_WORDS] = self._distanceMetrics['jaccard']
            self._mapping[KEY_WORDS] = self._distanceMetrics['jaccard']
            self._mapping[OPEN_HOURS] = self._distanceMetrics['jaccard']

        self._weights = weights
        if self._weights is None:
            self._weights = [1] * NUM_FEATURES
Example #40
def ajax():
    if not request.json:
        abort(400)
    else:
        ingredient1 = request.json.get("ingredient1")
        ingredient2 = request.json.get("ingredient2")
        edit_distance = distance.levenshtein(ingredient1, ingredient2)
        return jsonify({"distance": edit_distance})
Example #41
    def leven(self):
        all_dis = []
        small_list = Strings_comp(self).clear_data()
        for i in range(0, len(small_list) - 1):
            cursor = small_list[i]
            # go up to len(small_list) so the last element is not skipped
            for j in range(i + 1, len(small_list)):
                dis = distance.levenshtein(cursor, small_list[j])
                all_dis.append(dis)
        return all_dis
Example #42
        def successful(entry, word_list=word_list):
            new_word_list = []
            # sort lexical entries by address (Python 3 lambdas can't unpack tuples)
            for address, subtree in sorted(entry.lexical, key=lambda item: item[0]):
                new_word = subtree.symbol
                if new_word not in word_list:
                    return False

                new_word_list.append(new_word)
            return levenshtein(new_word_list, word_list) == 0
Example #43
def test_similarity(sentence):
    if sentence in similarity_memo:
        return similarity_memo[sentence]

    similarity = distance.levenshtein(target_sentence, sentence)
    similarity_memo[sentence] = similarity

    return similarity
Example #44
File: test.py Project: comex/wk
    def meaning_answer_qual(self, entered):
        qual = 0
        for meaning in word.meaning:  # `word` comes from the enclosing test context
            meaning = normalize(meaning)
            # arbitrary threshold: allow up to 40% of the characters to differ
            ok_dist = round(0.4 * len(meaning))
            dist = distance.levenshtein(entered, meaning)
            qual = max(qual, 2 if dist == 0 else 1 if dist <= ok_dist else 0)
        return qual
Example #45
def levenhstein_cluster(words):
	import distance
	import numpy as np
	import sklearn.cluster

	words = np.asarray(words)  # so that indexing with a list will work
	lev_similarity = -1 * np.array([[distance.levenshtein(w1, w2) for w1 in words] for w2 in words])
	affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed", damping=0.5)
	affprop.fit(lev_similarity)
	for cluster_id in np.unique(affprop.labels_):
		exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
		cluster = np.unique(words[np.nonzero(affprop.labels_==cluster_id)])
		cluster_str = ", ".join(cluster)
		print(" - *%s:* %s" % (exemplar, cluster_str))
Example #46
def wikiMatches(word, maxDistance=5):
    import distance
    import re
    word = word.lower()
    response = keywordSearch(word, maxHits=20)
    results = response["results"]
    for result in results:
        cleanLabel = re.sub(r'\(.*\)$', '', result["label"]).lower().strip()
        result["labelDistance"] = distance.levenshtein(word, cleanLabel)
    return [result for result in results if result["labelDistance"] <= maxDistance]
Example #47
    def get_close_matches(self, test_string, n=5, cutoff=0.5):
        candidate_keys = difflib.get_close_matches(test_string.lower(), self.long_titles.keys(), n=n, cutoff=cutoff)
        candidates = []
        scores = []

        for candidate_key in candidate_keys:
            candidate = self.long_titles[candidate_key]
            candidates.append(candidate)
            # normalized distance keeps the score in [0, 1]; the raw edit count would not
            scores.append(1.0 - distance.levenshtein(test_string, candidate, normalized=True))

        return zip(candidates, scores)
Example #48
def edit_distance():
    if not request.json:
        raise InvalidUsage('Requires JSON')

    data = request.get_json()
    edit_distance = levenshtein(data['word1'], data['word2'])
    response = {
        'word1': data['word1'], 
        'word2': data['word2'], 
        'edit_distance': edit_distance
    }
    return jsonify(**response)
Example #49
    def search(self, term, k, results=None):
        if results is None:
            results = []
        d = distance.levenshtein(self.term, term)
        counter = 1
        if d <= k:
            results.append(self.term)
        # BK-tree property: only children whose edge label lies within d +/- k can match
        for i in range(max(0, d - k), d + k + 1):
            child = self.children.get(i)
            if child:
                counter += child.search(term, k, results)
        return counter
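
The method above assumes a BK-tree node with a term attribute and a children dict keyed by edit distance. A minimal hypothetical insert for such a node could look like this; it is a sketch under those assumptions, not code from the original project:

# Hypothetical companion method; Node is assumed to be the class search() is defined on.
def insert(self, term):
    d = distance.levenshtein(self.term, term)
    child = self.children.get(d)
    if child is None:
        # assumes Node(term) initializes `term` and an empty `children` dict
        self.children[d] = Node(term)
    else:
        child.insert(term)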
Example #50
def build_json(df):
    node_list = []
    edge_list = []
    name_list = list(df.name)
    for name in name_list:
        node_list.append({'name': name})
    for i in range(0, len(name_list)):
        for j in range(i + 1, len(name_list)):  # start at i + 1, so no self-pairs to filter out
            d = distance.levenshtein(name_list[i], name_list[j])
            if d < 3:
                edge_list.append({'source': i, 'target': j, 'value': d + 1})
    n = {'nodes': node_list, 'edges': edge_list}
    return n
Example #51
def calculateDistance(word_array, vocabulary):
    distance_score_min_levenshstein = {}

    for voc in vocabulary:
        temp_score_lev = 9999
        for word in word_array:
            score = distance.levenshtein(word, voc)
            if temp_score_lev > score:
                temp_score_lev = score
        distance_score_min_levenshstein[voc] = temp_score_lev
    return distance_score_min_levenshstein
Example #52
    def extract(self, query, document, params=None):
        '''
        Extracts features for a given pair of query and document

        Args:
            query(str): The query string
            document(str)
            params: parameters for the feature extractor

        Returns
            float -- feature value
        '''
        return distance.levenshtein(self.tokenize(query),
                                    self.tokenize(document),
                                    normalized=True)
Example #53
def align(sen1, sen2, string=True):
	"""finds the best mapping of words from one sentence to the other
	string = a boolean represents if sentences are given as strings or as list of ucca terminal nodes
	returns list of word tuples and the corresponding list of indexes tuples"""
	if string:
		sen1 = list(map(preprocess_word, sen1.split()))
		sen2 = list(map(preprocess_word, sen2.split()))
	else:
		sen1 = [preprocess_word(terminal.text) for terminal in sen1]
		sen2 = [preprocess_word(terminal.text) for terminal in sen2]

	#find lengths
	length_dif = len(sen1) - len(sen2)
	if length_dif > 0:
		shorter = sen2
		longer = sen1
		switched = False
	else:
		shorter = sen1
		longer = sen2
		switched = True
		length_dif = abs(length_dif)
	shorter += ["emptyWord"] * length_dif

	#create matrix	
	matrix = np.zeros((len(longer), len(longer)))
	for i in range(len(longer)):
		for j in range(len(longer) - length_dif):
			matrix[i,j] = distance.levenshtein(longer[i], shorter[j]) + float(abs(i-j))/len(longer)
	
	#compare with munkres
	m = Munkres()
	indexes = m.compute(matrix)

	#remove indexing for emptywords and create string mapping
	refactored_indexes = []
	mapping = []
	start = 0 if string else 1
	for i, j in indexes:
		if j >= len(longer) - length_dif:
			j = -1 - start
		if switched:
			refactored_indexes.append((j + start, i + start))
			mapping.append((shorter[j], longer[i]))
		else:
			refactored_indexes.append((i + start, j + start))
			mapping.append((longer[i], shorter[j]))
	return mapping, refactored_indexes
Example #54
def getSuggestedWords(word, wordFrequencies):
    suggestedWords = []
    print(word)
    for item in wordFrequencies:
        possibleWord = item[0]

        #quick check for possibility based on length
        if abs(len(word) - len(possibleWord)) > 2:
            continue

        #compute levenshtein distance
        editDistance = distance.levenshtein(word, possibleWord)
        if editDistance <= 2:
            suggestedWords.append(possibleWord)

    return suggestedWords
Example #55
def findWordVariants(tset, wordVariants):
  dist = 0
  for ind in range(len(wordVariants)):
    #its a set
    entry = wordVariants[ind]
    dist = 0
    for cand in tset:
      if cand not in entry:
        for toCheck in entry:
          dist += levenshtein(cand, toCheck)
        dist /= (len(tset) * len(entry) * 1.0)
        if dist < 1.3:
          #print dist, len(tset), len(entry)
          return ind
      else:
        return ind

  return -1
Example #56
    def testCloseMatch(self):
        for k1 in self.keys.iterkeys():
            timer_start("close_match")
            d = self.tree.close_match(k1, 4, None, DictAction())
            timer_end("close_match")
            for k2 in self.keys.iterkeys():
                distance = levenshtein(k1, k2)
                if distance <= 4:
                    self.assert_(k2 in d, "Missing match for %s: %s (distance = %i)" % (k1, k2, distance))
                    self.assert_(
                        k2 in d and d[k2][0] == distance,
                        "Wrong distance for %s and %s: %i != %s" % (k1, k2, distance, d.get(k2)[0]),
                    )
                else:
                    self.assert_(
                        k2 not in d,
                        "Wrong distance for %s and %s: %i > 4 but %s was found" % (k1, k2, distance, d.get(k2)),
                    )
Example #57
def getMinSequence(string, slist):
  mind = None
  mdist = None
  i = 0
  for tup in slist:
    tdist = 0
    for entry in tup:
      tdist += levenshtein(string, entry)

    dist = tdist / (len(tup) * 1.0)
    # check for None first: comparing a number against None raises TypeError in Python 3
    if mdist is None or dist < mdist:
      mdist = dist
      mind = i
    i += 1

  if mdist is not None and mdist < 3:
    return mind
  else:
    return None
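
A small illustrative call (not from the original code): tuple 0 has mean distance (0 + 1) / 2 = 0.5, tuple 1 is much farther away.

# Illustrative only:
print(getMinSequence("abc", [("abc", "abd"), ("xyz", "xxz")]))  # 0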
Example #58
def compare_ref(usertrans, targettrans):
	"""
	Takes the target translation and the user translation as inputs.
	Based on their edit distance returns an evaluation.
	@ targettrans: target translation (ideal translation of a text)
	@ usertrans: translation provided by the user
	"""
	
	evaluation = {'very good': ['Superb translation!', 'Great work!', 'Perfect score!', 'High five!'], \
			'good': ['Good translation!', 'Nice work!', 'Almost perfect!'], \
			'fair': ['Not bad!', 'Almost there!'], \
			'average': ['You can do better!', 'Shall we practice a little more?']
			}
	
	# encode to UTF-8
	tt = targettrans.encode('utf-8')
	ut = usertrans.encode('utf-8')
	
	# remove punctuation
	replace_punctuation = string.maketrans(string.punctuation, ' '*len(string.punctuation))
	
	tt = tt.translate(replace_punctuation).lower()	
	ut = ut.translate(replace_punctuation).lower()
	
	lev_let = lev.ratio(tt, ut)
	
	tt = tt.split()
	ut = ut.split()
	
	lensum = len(tt) + len(ut)
	# float() guards against Python 2 integer division flooring the ratio to 0
	lev_tok = float(lensum - distance.levenshtein(tt, ut)) / lensum
	
	ratio = max(lev_tok, lev_let)
	
	if ratio >= 0.9:
	    return random.choice(evaluation['very good'])
	elif ratio >= 0.75:
	    return random.choice(evaluation['good'])
	elif ratio >= 0.6:
	    return random.choice(evaluation['fair'])
	else:
	    return random.choice(evaluation['average'])
Example #59
def lexem_best_match(token, lexems):

    if len(lexems) == 0:
        return token

    # print "TOKEN: ", token
    # print lexems

    best_match = None
    best_count = len(token)
    for match in lexems:
        if match[1] != 'P' and match[1] != 'V':
            lexem = match[0].lower()
            count_diff = distance.levenshtein(token, lexem)
            if count_diff < best_count:
                best_count = count_diff
                best_match = lexem

    # print "BEST_MATCH: ", best_match, best_count

    if best_match:
        return strip_diacritics_1(best_match.lower())
    return None
Example #60
def main_batches():  
    g = Graph(is_training=False)
    
    # Load data
    nums, X, ys = load_test_data()
    pnyn2idx, idx2pnyn, hanzi2idx, idx2hanzi = load_vocab()
    
    with g.graph.as_default():    
        sv = tf.train.Supervisor()
        with sv.managed_session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            # Restore parameters
            sv.saver.restore(sess, tf.train.latest_checkpoint(hp.logdir)); print("Restored!")

            # Get model
            mname = open(hp.logdir + '/checkpoint', 'r').read().split('"')[1] # model name
            
            with codecs.open('eval/{}_{}.csv'.format(mname, "qwerty" if hp.isqwerty else "nine"), 'w', 'utf-8') as fout:
                fout.write("NUM,EXPECTED,{}_{},# characters,edit distance\n".format(mname, "qwerty" if hp.isqwerty else "nine"))
                
                total_edit_distance, num_chars = 0, 0
                for step in range(len(X)//hp.batch_size):
                    num = nums[step*hp.batch_size:(step+1)*hp.batch_size] #number batch
                    x = X[step*hp.batch_size:(step+1)*hp.batch_size] # input batch
                    y = ys[step*hp.batch_size:(step+1)*hp.batch_size] # batch of ground truth strings
                    
                    preds = sess.run(g.preds, {g.x: x})
                    for n, xx, pred, expected in zip(num, x, preds, y): # sentence-wise
                        #got = "".join(idx2hanzi[str(idx)] for idx in pred)[:np.count_nonzero(xx)].replace("_", "")
                        got = "".join(idx2hanzi[idx] for idx in pred)[:np.count_nonzero(xx)].replace("_", "")
                        edit_distance = distance.levenshtein(expected, got)
                        total_edit_distance += edit_distance
                        num_chars += len(expected)
                
                        fout.write(u"{},{},{},{},{}\n".format(n, expected, got, len(expected), edit_distance))
                fout.write(u"Total CER: {}/{}={},,,,\n".format(total_edit_distance, 
                                                        num_chars, 
                                                        round(float(total_edit_distance)/num_chars, 2)))