def extract_features(df):
    features = pd.DataFrame()

    print('extracting space-split sequence features...')
    df['q1_words'] = df.question1.map(space_split)
    df['q2_words'] = df.question2.map(space_split)
    features['str_leven1'] = df.apply(
        lambda r: distance.nlevenshtein(r.q1_words, r.q2_words, method=1), axis=1)
    features['str_leven2'] = df.apply(
        lambda r: distance.nlevenshtein(r.q1_words, r.q2_words, method=2), axis=1)
    features['str_jaccard'] = df.apply(
        lambda r: distance.jaccard(r.q1_words, r.q2_words), axis=1)
    # features['str_hamming'] = df.apply(lambda r: distance.hamming(r.q1_words, r.q2_words, normalized=True), axis=1)
    # features['str_sorensen'] = df.apply(lambda r: distance.jaccard(r.question1, r.question2), axis=1)

    print('extracting stemmed word sequence features...')
    df['q1_stems'] = df.question1.map(stem)
    df['q2_stems'] = df.question2.map(stem)
    features['stem_leven1'] = df.apply(
        lambda r: distance.nlevenshtein(r.q1_stems, r.q2_stems, method=1), axis=1)
    features['stem_leven2'] = df.apply(
        lambda r: distance.nlevenshtein(r.q1_stems, r.q2_stems, method=2), axis=1)
    features['stem_jaccard'] = df.apply(
        lambda r: distance.jaccard(r.q1_stems, r.q2_stems), axis=1)

    return features.fillna(0.0)
def thelevenstein():
    verbose = 0
    mypath = '~/test'
    ## get the list of all files
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    ## read all the txt files once so they are not read repeatedly
    fileData = [
        ' '.join(open(join(mypath, f), 'r').read().split()[1:])
        for f in onlyfiles
    ]
    ## initialize an empty dataframe
    new_df = pd.DataFrame()
    ## iterate over file pairs and compute the similarity metric
    i = 0
    for f1 in fileData:
        # print 'currently processing ', onlyfiles[i]
        j = 0
        for f2 in fileData:
            if i <= j:
                new_df.loc[i, j] = distance.nlevenshtein(
                    f1.lower().strip(), f2.lower().strip(), method=2)
                if verbose and (j % 100 == 0):
                    print('currently processing', onlyfiles[i], ' with ', onlyfiles[j])
            j += 1
        i += 1
    new_df.columns = onlyfiles
    new_df.index = onlyfiles
    print('all calculations made. Exporting to csv')
    new_df.to_csv('document_similarity_levenstein_business.csv', encoding='utf-8')
    print('Export to csv done!')
def names_are_similar(db_name, patents_name):
    if patents_name is None:
        return False
    patents_name = standardize_name(patents_name)
    dist1 = distance.nlevenshtein(db_name, patents_name, method=1)
    dist2 = distance.nlevenshtein(db_name, patents_name.split(" ")[0], method=1)
    dist3 = distance.nlevenshtein(db_name.split(" ")[0], patents_name, method=1)
    response = sum([
        dist1 < 0.2,
        dist2 < 0.2,
        dist3 < 0.2,
        (dist2 == 0) * 2,
        (dist3 == 0) * 2
    ]) > 1
    if response:
        print("--Matched:", patents_name)
    return response
def get_pairs(*lists, **options):
    pairs = options.get('pairs', [])
    method = options.get('method', 1)  # method 1 for shortest alignment, 2 for longest
    # cache the result because it is always time-consuming
    use_cache = options.get('use_cache', True)
    if use_cache and os.path.exists(CACHE_FILENAME):
        with open(CACHE_FILENAME, 'r') as f:
            cache = f.read().splitlines()
            for line in cache:
                pairs.append(list(filter(lambda x: x.strip(),
                                         map(lambda x: x.strip("' \""), line.split('***')))))
    else:
        for prime in lists[0]:
            pair = [prime]
            for minors in lists[1:]:
                # calculate each minor's edit distance to the prime
                distances = list(map(
                    lambda minor: distance.nlevenshtein(prime, minor, method), minors))
                # get the value whose levenshtein distance to the prime is the minimum
                most_matched = lambda l: minors[l.index(min(l))]
                candidate = most_matched(distances)
                pair.append(candidate)
            pairs.append(pair)
        with open(CACHE_FILENAME, 'w') as f:
            for pair in pairs:
                f.write('***'.join(pair))  # write to file to cache
                f.write(os.linesep)
    return pairs
def lev(doc1, doc2):
    txt1 = open(doc1).read()
    txt2 = open(doc2).read()
    p = distance.nlevenshtein(txt1.lower().strip(), txt2.lower().strip(), method=2)
    return p
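# Usage sketch (illustrative file names, not from the original project): lev() returns
# a normalized distance in [0, 1], so 0.0 means the two files contain identical text
# after lowercasing/stripping, and values near 1.0 mean they share almost nothing.
# d = lev('report_a.txt', 'report_b.txt')
# print('normalized distance between the documents:', d)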
def val(net, test_dataset, criterion, max_iter=2):
    print('Start val')

    for p in crnn.parameters():
        p.requires_grad = False
    net.eval()
    data_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=opt.batchSize,
        num_workers=int(opt.workers),
        sampler=dataset.randomSequentialSampler(test_dataset, opt.batchSize),
        collate_fn=dataset.alignCollate(imgH=opt.imgH, imgW=opt.imgW, keep_ratio=opt.keep_ratio))
    val_iter = iter(data_loader)
    i = 0
    n_correct = 0
    loss_avg = utils.averager()
    test_distance = 0
    max_iter = min(max_iter, len(data_loader))
    for i in range(max_iter):
        data = val_iter.next()
        i += 1
        cpu_images, cpu_texts = data
        batch_size = cpu_images.size(0)
        utils.loadData(image, cpu_images)
        if ifUnicode:
            cpu_texts = [clean_txt(tx.decode('utf-8')) for tx in cpu_texts]
        t, l = converter.encode(cpu_texts)
        utils.loadData(text, t)
        utils.loadData(length, l)
        preds = crnn(image)
        preds_size = Variable(torch.IntTensor([preds.size(0)] * batch_size))
        cost = criterion(preds, text, preds_size, length) / batch_size
        loss_avg.add(cost)
        _, preds = preds.max(2)
        # preds = preds.squeeze(2)
        preds = preds.transpose(1, 0).contiguous().view(-1)
        sim_preds = converter.decode(preds.data, preds_size.data, raw=False)
        for pred, target in zip(sim_preds, cpu_texts):
            if pred.strip() == target.strip():
                n_correct += 1
            # print(distance.levenshtein(pred.strip(), target.strip()))
            test_distance += distance.nlevenshtein(pred.strip(), target.strip(), method=2)

    raw_preds = converter.decode(preds.data, preds_size.data, raw=True)[:opt.n_test_disp]
    for raw_pred, pred, gt in zip(raw_preds, sim_preds, cpu_texts):
        print('%-20s => %-20s, gt: %-20s' % (raw_pred, pred, gt))
    accuracy = n_correct / float(max_iter * opt.batchSize)
    test_distance = test_distance / float(max_iter * opt.batchSize)
    testLoss = loss_avg.val()
    # print('Test loss: %f, accuracy: %f' % (testLoss, accuracy))
    return testLoss, accuracy, test_distance
def inlevenshtein(seq1, seqs, max_dist=0.1):
    for seq2 in seqs:
        dist1 = distance.levenshtein(seq1, seq2, max_dist=2)
        if dist1 != -1:
            dist2 = distance.nlevenshtein(seq1, seq2)
            if dist2 <= max_dist:
                yield dist2, seq2
def identify_keywords(self, key_words, headers):
    key_words = [x.lower() for x in key_words]
    headers = [x.lower() for x in headers]
    all_words = []
    all_words.extend(key_words)
    all_words.extend(headers)
    print(all_words)
    featers = []
    for column_word in all_words:
        featuer_dict = {}
        featuer_dict['word'] = column_word
        for word in key_words:
            featuer_dict[word] = distance.nlevenshtein(word, column_word, method=1)
        featers.append(featuer_dict)
    data_frame = pd.DataFrame(featers)
    train_df = data_frame.drop(['word'], axis=1)
    kmeans = KMeans(n_clusters=len(key_words), random_state=0).fit(train_df)
    clusters = kmeans.labels_.tolist()
    duplicates = set([x for x in clusters if clusters.count(x) > 1])
    df = pd.DataFrame({'header': all_words, 'cluster': clusters})
    header_pairs = []
    for duplicate in duplicates:
        header_pairs.append(df['header'][df['cluster'] == duplicate])
    return header_pairs
def find_pairs(*lists):
    pairs = []
    cache_file = 'cache.txt'
    if os.path.exists(cache_file):
        with open(cache_file, 'r') as f:
            cache = f.read().splitlines()
            for line in cache:
                pair = list(filter(lambda x: x.strip(), line.split('***')))
                if pair:  # skip empty lists
                    pairs.append(pair)
    else:
        with open(cache_file, 'w') as f:
            for prime in lists[0]:
                pair = [prime]
                for minors in lists[1:]:
                    similarities = list(map(
                        lambda minor: distance.nlevenshtein(prime, minor, method=2), minors))
                    most_matched = lambda l: minors[l.index(min(l))]
                    candidate = most_matched(similarities)
                    pair.append(candidate)
                pairs.append(pair)
                f.write('***'.join(pair))
                f.write(os.linesep)
    return pairs
def evaluate_pretain(searcher, voc, test_x, test_y):
    ### Format input sentence as a batch
    # words -> indexes
    x_indexes_batch = [indexesFromSentence(voc, test_x)]
    y_indexes_batch = indexesFromFPs(voc, test_y)
    # y_indexes_batch = indexesFromSentence(voc, test_y)
    # Create lengths tensor
    lengths = torch.Tensor([len(indexes) for indexes in x_indexes_batch])
    # Transpose dimensions of batch to match models' expectations
    input_batch = torch.LongTensor(x_indexes_batch).transpose(0, 1)
    # Use appropriate device
    input_batch = input_batch.to(device)
    lengths = lengths.to(device)
    # Decode sentence with searcher
    tokens, scores = searcher(input_batch, lengths, 100)
    tokens = tokens[:-1]
    # indexes -> words
    decoded_words = [voc.index2word[token.item()] for token in tokens]
    # print(test_x)
    # print(x_indexes_batch)
    # print([voc.index2word[token] for token in x_indexes_batch[0]])
    # print(test_y)
    # print(decoded_words)
    reference = [decoded_words]
    candidate = [voc.index2word[token] for token in y_indexes_batch][:-1]
    print(test_x)
    print(''.join(decoded_words))
    print(''.join(candidate))
    # print(test_y)
    score = sentence_bleu(reference, candidate)
    dis = 1 - distance.nlevenshtein(decoded_words, candidate)
    print(dis)
    print('-' * 80)
    return score, 1 if dis >= 0.6 else 0
def levenshtein_long(self, string_one, questions_list, print_flag=False):
    best = 1
    frase = ""
    index = -1
    i = 0
    for element in questions_list:
        compare = distance.nlevenshtein(string_one, element.lower(), method=2)
        # print "Score: " + str(distance.levenshtein(string_one, element))
        if print_flag:
            print "Normalized: " + str(compare)
            print "Sentence: " + element
            print "Index number: ", i, "\n"
        if compare < best:
            best = compare
            frase = element
            index = i
        i += 1
    ans = frase, index
    return ans
def compute_similarity(X):
    """
    Compute similarity matrix with mean of 3 distances
    :param X: List of contracts ssdeep hashes
    :return: Similarity matrix
    """
    jaccard_matrix = pdist(X, lambda x, y: distance.jaccard(x[0], y[0]))
    np.savetxt("../data/jaccard_matrix.csv",
               np.asarray(squareform(jaccard_matrix)), delimiter=",")
    sorensen_matrix = pdist(X, lambda x, y: distance.sorensen(x[0], y[0]))
    np.savetxt("../data/sorensen_matrix.csv",
               np.asarray(squareform(sorensen_matrix)), delimiter=",")
    # normalized, so that the results can be meaningfully compared
    # method=1 means the shortest alignment between the sequences is taken as factor
    levenshtein_matrix = pdist(
        X, lambda x, y: distance.nlevenshtein(x[0], y[0], method=1))
    np.savetxt("../data/levenshtein_matrix.csv",
               np.asarray(squareform(levenshtein_matrix)), delimiter=",")
    mean_matrix = 1 - np.mean(np.array(
        [jaccard_matrix, sorensen_matrix, levenshtein_matrix]), axis=0)
    np.savetxt("../data/similarity_matrix.csv",
               np.asarray(mean_matrix), delimiter=",")
    print("Similarity matrix computed.")
    return mean_matrix
def str_levenshtein_1(str1, str2):
    # str1_list = str1.split(' ')
    # str2_list = str2.split(' ')
    res = distance.nlevenshtein(str1, str2, method=1)
    return res
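# Usage sketch (illustrative inputs, not from the original project): distance.nlevenshtein
# treats a plain string as a sequence of characters, so str_levenshtein_1 returns a
# character-level normalized distance in [0, 1]: 0.0 for identical strings and 1.0 for
# equal-length strings with no characters in common.
import distance

assert distance.nlevenshtein("kitten", "kitten", method=1) == 0.0
assert distance.nlevenshtein("abc", "xyz", method=1) == 1.0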
def norm_edist(df):
    id_u = sorted(list(set(df.loc[:, 'id'])))
    srcs_l = []
    for idx in range(0, len(id_u)):
        if idx == len(id_u) - 1:
            break
        else:
            print 'source', id_u[idx]
            id_bool = df.loc[:, 'id'] == id_u[idx]
            src = df.loc[:, 'content'][id_bool]
            src_l = []
            i = 0
            for s in src:
                i += 1
                res_mat = np.zeros((len(id_u) - (idx + 1), 5))
                ii = 0
                for iii in range(idx + 1, len(id_u)):
                    trgt = df.loc[:, 'content'][df.loc[:, 'id'] == id_u[iii]]
                    d = []
                    for t in trgt:
                        d.append(distance.nlevenshtein(s, t, method=1))
                    res_mat[ii, 0] = i
                    res_mat[ii, 1] = id_u[idx]
                    res_mat[ii, 2] = id_u[iii]
                    res_mat[ii, 3] = np.min(d)
                    res_mat[ii, 4] = np.std(d)
                    ii += 1
                src_l.append(res_mat)
            src_mat = np.vstack(src_l)
            srcs_l.append(src_mat)
    srcs_mat = np.vstack(srcs_l)
    return srcs_mat
def str_levenshtein_1(str1, str2):
    str1_list = str1.split(' ')
    str2_list = str2.split(' ')
    res = distance.nlevenshtein(str1, str2, method=1)
    return res
def calcTitleHashFeats(title1, title2, featVector):
    if title1 is None or title2 is None or title1 == '' or title2 == '':
        featVector.append(1)
        return
    title1 = '%x' % Simhash(get_features(normalize(title1))).value
    title2 = '%x' % Simhash(get_features(normalize(title2))).value
    t2 = distance.nlevenshtein(title1, title2)
    featVector.append(t2)
def get_similar(seq, dismatched, max_norm_distance=0.5):
    measured = [
        distance.nlevenshtein(seq, line[0], method=2)
        for line in dismatched
    ]
    if measured and min(measured) < max_norm_distance:
        return dismatched.pop(measured.index(min(measured)))
    else:
        return None
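# Usage sketch (hypothetical data, assumes get_similar above and the distance package):
# each entry's first element is compared against seq, and the closest entry is popped
# from the list when its method=2 normalized distance is below max_norm_distance.
candidates = [("apple", 1), ("banana", 2)]
match = get_similar("appel", candidates)                       # -> ("apple", 1); "banana" stays
no_match = get_similar("zzzz", candidates, max_norm_distance=0.1)  # -> None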
def calcAbstractHashFeats(abstract1, abstract2, featVector):
    if abstract1 is None or abstract2 is None or abstract1 == '' or abstract2 == '':
        featVector.append(1)
        return
    abstract1 = '%x' % Simhash(get_features(abstract1)).value
    abstract2 = '%x' % Simhash(get_features(abstract2)).value
    t2 = distance.nlevenshtein(abstract1, abstract2)
    featVector.append(t2)
def title_similarity_np(row1, row2, method="difflib"):
    if method.lower() == "levenshtein":
        return 1 - distance.nlevenshtein(row1[1], row2[1], method=1)
    if method.lower() == "sorensen":
        return 1 - distance.sorensen(row1[1], row2[1])
    if method.lower() == "jaccard":
        return 1 - distance.jaccard(row1[1], row2[1])
    return difflib.SequenceMatcher(None, row1[1], row2[1]).quick_ratio()
def compare_files(similar_fp, base_content):
    similar_content = parser.from_file(similar_fp)['content']
    similar_content = tika_compare_clean(similar_content)
    leven_dist = distance.nlevenshtein(base_content, similar_content)
    if leven_dist <= .001:
        return similar_fp
def train(tfidf_matrix_train, dictTrain, tfidf_matrix_trainBigrams, dictTrainBigrams, lenGram, delete=[]):
    allTrainX = list()
    allTrainY = list()
    with open("./data/train.csv") as f:
        for line in f:
            lin = line.split(",")
            if len(lin) == 3:
                st1 = lin[0].lower()
                st2 = lin[1].lower()
                temp = [
                    1. - (lev.distance(st1, st2) * 2 / (len(st1) + len(st2))),
                    lev.jaro(st1, st2),
                    lev.jaro_winkler(st1, st2),
                    lev.ratio(st1, st2),
                    distance.sorensen(st1, st2),
                    jaccard(set(st1), set(st2)),
                    1. - distance.nlevenshtein(st1, st2, method=1),
                    1. - distance.nlevenshtein(st1, st2, method=2),
                    dice_coefficient(st1, st2, lenGram=2),
                    dice_coefficient(st1, st2, lenGram=3),
                    dice_coefficient(st1, st2, lenGram=4),
                    cosineWords(st1, st2, dictTrain, tfidf_matrix_train),
                    cosineBigrams(st1, st2, dictTrainBigrams, tfidf_matrix_trainBigrams, lenGram)
                ]
                if len(delete) > 0:
                    for elem in delete:
                        temp[elem] = 0.
                allTrainX.append(temp)
                allTrainY.append(int(lin[2]))
    X = np.array(allTrainX, dtype=float)
    y = np.array(allTrainY, dtype=float)
    clf = svm.LinearSVC(C=1., dual=False, loss='l2', penalty='l1')
    clf2 = linear_model.LogisticRegression(C=1., dual=False, penalty='l1')
    clf.fit(X, y)
    clf2.fit(X, y)
    weights = np.array(clf.coef_[0])
    print(weights)
    weights = np.array(clf2.coef_[0])
    print(weights)
    return clf, clf2
def calculate_nlevenshtein(actual: ndarray, predicted: ndarray) -> float:
    distances = []
    for row in range(actual.shape[0]):
        distances.append(
            nlevenshtein(np.array2string(actual[row]),
                         np.array2string(predicted[row])))
    return float(np.mean(distances))
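# Usage sketch (illustrative arrays, not from the original project): each row is rendered
# with np.array2string and the rendered rows are compared character by character, so
# identical rows contribute 0.0 and a single differing digit gives a small nonzero value.
import numpy as np

actual = np.array([[1, 2, 3], [4, 5, 6]])
predicted = np.array([[1, 2, 3], [4, 5, 7]])
print(calculate_nlevenshtein(actual, predicted))   # small mean distance: one character differs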
def DL_Distance(str1, str2):
    print(str1, str2)
    print("distance 1: ", distance.nlevenshtein(str1, str2))
    print("distance 2: ", damerau_levenshtein_distance(str1, str2))
    dls = (damerau_levenshtein_distance(str1, str2) / max(len(str1), len(str2)))
    print("distance 3: ", dls)
    print("distance 4: ", distance.jaccard(str1, str2))
def levenshtein(self, other):
    """
    Computes the edit distance between this log and the other one, not on name
    sequences but on the entire log.
    """
    a = [str(version) for version in self.iter_versions()]
    b = [str(version) for version in other.iter_versions()]
    return nlevenshtein(a, b)
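# Usage sketch (hypothetical version strings, not from the original project): when given
# lists instead of strings, nlevenshtein compares whole elements, so one missing version
# out of three is a single deletion and yields roughly 1/3 with the default method=1.
from distance import nlevenshtein

print(nlevenshtein(["1.0", "1.1", "2.0"], ["1.0", "2.0"]))   # ~0.33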
def extract_basic_distance_feat(df):
    ## jaccard coef/dice dist of n-gram
    print "generate jaccard coef and dice dist for n-gram"
    dists = ["jaccard_coef", "dice_dist"]
    grams = ["unigram", "bigram", "trigram"]
    feat_names = ["origsent", "candsent"]
    for stem in ["", "_stem"]:
        for dist in dists:
            for gram in grams:
                for i in range(len(feat_names) - 1):
                    for j in range(i + 1, len(feat_names)):
                        target_name = feat_names[i]
                        obs_name = feat_names[j]
                        df["%s_of_%s_between_%s_%s%s" % (dist, gram, target_name, obs_name, stem)] = list(
                            df.apply(lambda x: compute_dist(
                                x[target_name + "_" + gram + stem],
                                x[obs_name + "_" + gram + stem], dist), axis=1))

    print "generate remaining features"
    gram_ext = [
        "_unigram", "_bigram", "_trigram",
        "_char_unigram", "_char_bigram", "_char_trigram"
    ]
    for stem in ["", "_stem"]:
        for gram in gram_ext:
            df["levenshtein_%s%s" % (gram, stem)] = list(
                df.apply(lambda x: distance.nlevenshtein(
                    x["origsent" + gram + stem], x["candsent" + gram + stem],
                    method=2), axis=1))
            df["sorensen_%s%s" % (gram, stem)] = list(
                df.apply(lambda x: distance.sorensen(
                    x["origsent" + gram + stem], x["candsent" + gram + stem]), axis=1))
            df["cosine_%s%s" % (gram, stem)] = list(
                df.apply(lambda x: cosine(x["origsent" + gram + stem],
                                          x["candsent" + gram + stem]), axis=1))
            df["precision_%s%s" % (gram, stem)] = list(
                df.apply(lambda x: precision_recall(
                    x["origsent" + gram + stem], x["candsent" + gram + stem],
                    x["origsent" + gram + stem]), axis=1))
            df["recall1gram_%s%s" % (gram, stem)] = list(
                df.apply(lambda x: precision_recall(
                    x["origsent" + gram + stem], x["candsent" + gram + stem],
                    x["candsent" + gram + stem]), axis=1))
            df["f1gram_%s%s" % (gram, stem)] = list(
                df.apply(
                    lambda x: fmeasure(x["precision_%s%s" % (gram, stem)],
                                       x["recall1gram_%s%s" % (gram, stem)]),
                    axis=1))
def getPairFeatures(session):
    totalTime = 1.0 + (session[-1][QTIME] - session[0][QTIME]).total_seconds()
    for i in range(len(session) - 1):
        for j in range(i + 1, len(session)):
            e1 = session[i]
            e2 = session[j]
            jaccard = 1.0 - distance.jaccard(e1[QUERY].split(), e2[QUERY].split())
            edit = 1.0 - distance.nlevenshtein(e1[QUERY].split(), e2[QUERY].split())
            timeDiff = ((e2[QTIME] - e1[QTIME]).total_seconds()) / totalTime * 1.0
            # normalized distance
            dist = (j - i) * 1.0 / len(session)
            urlMatch = -1
            if CLICKU in e1 and CLICKU in e2:
                urlMatch = 1.0 - distance.nlevenshtein(e1[CLICKU], e2[CLICKU])
            cosine = get_cosine(text_to_vector(e1[QUERY]), text_to_vector(e2[QUERY]))
            edgeScore = (.20 * cosine + .20 * jaccard + .20 * edit +
                         .15 * dist + .15 * timeDiff + .10 * urlMatch)
            yield i, j, edgeScore, cosine, jaccard, edit, dist, timeDiff, urlMatch
def title_similarity_pd(row, method='difflib'):
    if method.lower() == "levenshtein":
        return 1 - distance.nlevenshtein(row["title"], row["title_R"], method=1)
    if method.lower() == "sorensen":
        return 1 - distance.sorensen(row["title"], row["title_R"])
    if method.lower() == "jaccard":
        return 1 - distance.jaccard(row["title"], row["title_R"])
    return difflib.SequenceMatcher(None, row["title"], row["title_R"]).quick_ratio()
def is_similar_by_levenstein(first, second, threshold):
    """Check whether two lists are similar by the Levenshtein similarity metric up to threshold

    :param first: one of the lists
    :param second: another list
    :param threshold: similarity threshold
    :return: boolean value, whether the elements are similar up to threshold
    """
    if (1 - distance.nlevenshtein(first, second)) >= threshold:
        return True
    return False
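# Usage sketch (illustrative event lists, assumes the distance package is imported):
# identical lists have similarity 1.0, and lists with no common elements have 0.0.
assert is_similar_by_levenstein(["login", "view", "logout"], ["login", "view", "logout"], 0.9)
assert not is_similar_by_levenstein(["login", "view"], ["pay", "ship"], 0.5)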
def categorize_peptide_distance(annotation1, annotation2):
    # Determining if it is an I/L substitution
    if annotation1.replace("I", "L") == annotation2.replace("I", "L"):
        # I/L Substitution
        return "I/L Substitution"

    annotation1_sequence_only = re.sub(r'[0-9.+-]+', '', annotation1)
    annotation2_sequence_only = re.sub(r'[0-9.+-]+', '', annotation2)
    string_distance = distance.nlevenshtein(annotation1_sequence_only,
                                            annotation2_sequence_only, method=1)

    # Detecting site localization of PTMs
    if string_distance < 0.01:
        return "PTM Localization"

    hamming_distance = 0
    if len(annotation1_sequence_only) == len(annotation2_sequence_only):
        hamming_distance = distance.hamming(annotation1_sequence_only,
                                            annotation2_sequence_only)
    if hamming_distance == 2:
        return "Double Amino Substitution"
    if hamming_distance == 1:
        # Seeing if it is a deamidation
        annotation1_contains_deamidation = False
        annotation2_contains_deamidation = False
        if annotation1.find("+0.984") != -1:
            annotation1_contains_deamidation = True
        if annotation2.find("+0.984") != -1:
            annotation2_contains_deamidation = True
        if annotation1_contains_deamidation != annotation2_contains_deamidation:
            # Probably should also check for Q->E
            return "Deamidation"
        # Checking for Q->K Substitution

    # Determining String Distance
    string_distance = distance.nlevenshtein(annotation1, annotation2, method=1)
    return "UNKNOWN"
def accuracy(first, second):
    """Calculates similarity metrics for two lists (the order of parameters doesn't matter)

    :param first: list with predicted events
    :param second: list with true events
    :return: tuple of specified accuracy metrics (similarities):
             normalized Levenshtein, Damerau-Levenshtein and Jaccard
    """
    n_levenstein = 1 - distance.nlevenshtein(first, second)
    # normalize the Damerau-Levenshtein distance by the longer sequence before
    # converting it into a similarity
    n_damerau_levenshtein = 1 - damerau_levenshtein_distance(
        first, second) / max(len(first), len(second))
    jaccard = 1 - distance.jaccard(first, second)
    return n_levenstein, n_damerau_levenshtein, jaccard
def adjusted_similarity(dfp1, dfp2):
    """
    Compute the similarity between two data function items.
    :param dfp1:
    :param dfp2:
    :return:
    """
    if dfp1 in dfp2 or dfp2 in dfp1:
        return 1
    return 1 - distance.nlevenshtein(dfp1, dfp2, method=1)
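# Usage sketch (English stand-ins for the data items, not from the original project):
# containment short-circuits to 1, otherwise the similarity is 1 minus the method=1
# normalized edit distance.
print(adjusted_similarity("export user data", "export"))   # 1 (substring match)
print(adjusted_similarity("abc", "abd"))                    # ~0.67 (one substitution out of three)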
def compareList(l1, l2):
    result = 1000
    for i in l1:
        for j in l2:
            current = distance.nlevenshtein(i, j, method=2)
            if result > current:
                result = current
            if result == 0:
                break
    return result
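# Usage sketch (illustrative lists, assumes compareList above and the distance package):
# the result is the smallest method=2 normalized distance over all cross pairs, so an
# exact match anywhere drives it to 0.0.
print(compareList(["apple", "pear"], ["grape", "pear"]))   # 0.0, since "pear" appears in both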
def levenshtein_similarity(str1, str2):
    '''
    Implements the basic Levenshtein algorithm, providing a similarity measure between
    two strings: the actual / possible levenshtein distance gives a 0-1 range,
    normalised by the length of the longest sequence.
    '''
    # sim_score = self.load_sim_from_memory(str1, str2)
    # if sim_score is None:
    from distance import nlevenshtein
    dist = nlevenshtein(str1, str2, method=1)
    sim_score = 1 - dist
    return sim_score
def compare_word(word1, word2: str) -> float:
    if len(word1) <= 2 or len(word2) <= 2:
        return 1000.0
    if len(word1) == 3 and len(word2) == 3:
        if word1 == word2:
            return 0.001
        else:
            return 1000.0
    else:
        return ds.nlevenshtein(word1, word2, method=1)
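# Usage sketch (illustrative words, assumes the distance module is imported as ds):
# very short words are rejected with the 1000.0 sentinel, equal three-letter words get
# 0.001, and longer words fall through to the normalized Levenshtein distance.
print(compare_word("hi", "hello"))      # 1000.0 (too short)
print(compare_word("cat", "cat"))       # 0.001 (exact three-letter match)
print(compare_word("hello", "hallo"))   # 0.2 (one substitution out of five)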
def get_features(raw_data):
    fet_data = pd.DataFrame()

    print "extracting count features..."
    fet_data["q_len"] = raw_data["query"].map(word_len)
    fet_data["t_len"] = raw_data["product_title"].map(word_len)
    fet_data["d_len"] = raw_data["product_description"].map(word_len)

    print "extracting basic distance features from q and t..."
    fet_data["nleven1"] = raw_data.apply(lambda x: distance.nlevenshtein(x.q, x.t, method=1), axis=1)
    fet_data["nleven2"] = raw_data.apply(lambda x: distance.nlevenshtein(x.q, x.t, method=2), axis=1)
    fet_data["sorensen"] = raw_data.apply(lambda x: distance.sorensen(x.q, x.t), axis=1)
    fet_data["jaccard"] = raw_data.apply(lambda x: distance.jaccard(x.q, x.t), axis=1)
    fet_data["ncd"] = raw_data.apply(lambda x: ncd(x.q, x.t), axis=1)

    print "extracting basic distance features from q_ex and t..."
    fet_data["sorensen_ex"] = raw_data.apply(lambda x: distance.sorensen(get_uniq_words_text(x.q_ex), x.t), axis=1)
    print "extracting basic distance features from q_ex and t..."
    fet_data["jaccard_ex"] = raw_data.apply(lambda x: distance.jaccard(get_uniq_words_text(x.q_ex), x.t), axis=1)
    print "extracting basic distance features from q_ex and t..."
    fet_data["ncd_ex"] = raw_data.apply(lambda x: ncd(get_uniq_words_text(x.q_ex), x.t), axis=1)

    return fet_data
def levenshtein_similarity(str1, str2):
    '''
    Implements the basic Levenshtein algorithm, providing a similarity measure between
    two strings: the actual / possible levenshtein distance gives a 0-1 range,
    normalised by the length of the longest sequence.
    e.g., http://www.pris.net.cn/wp-content/uploads/2013/12/PRIS2013.notebook.pdf
    '''
    # sim_score = self.load_sim_from_memory(str1, str2)
    # if sim_score is None:
    from distance import nlevenshtein
    dist = nlevenshtein(str1, str2, method=1)
    sim_score = 1 - dist
    return sim_score
def check_text(text, entry=None):
    text_results = []
    text_contexts = set()
    if text != "":
        for keyword in keywords:
            result = nlevenshtein(text, keyword, method=2)
            if result <= 0.4:
                print "Match! " + text + " is close to " + keyword + " (" + str(result) + ")"
                if entry and "phish_detail_url" in entry:
                    text_contexts.add(entry["phish_detail_url"])
                else:
                    text_contexts.add(keyword)
                text_results.append(text)
                break
    return text_results, text_contexts
def test_edges(scheme_exp, scheme_obs):
    import pystats
    edges_exp = scheme2edges(scheme_exp)
    edges_obs = scheme2edges(scheme_obs)
    # print scheme_exp, scheme_obs
    # return pystats.mean([int(e in edges_obs) for e in set(edges_exp)])
    import distance
    dist = distance.nlevenshtein(scheme_exp, scheme_obs)
    # for s1, s2 in zip(scheme_exp, scheme_obs):
    #     if 0 in [s1, s2] and {s1, s2} != {0}:
    #         dist += 2
    print scheme_exp, '\t', scheme_obs, '\t', dist
    # if scheme_exp == (1, 1) and scheme_obs != (1, 1):
    #     dist += 2
    dist = dist * 10**(1/len(scheme_exp))
    return dist
def cell_difference(self, cell1, cell2):
    """
    Return a single value indicating the extent to which cell 1 is like cell 2.

    :param cell1: a list of lines of code
    :param cell2: a list of lines of code
    :return:
    """
    cell1_concatenation = ""
    cell2_concatenation = ""
    for line_in_cell1 in cell1:
        cell1_concatenation += line_in_cell1
    for line_in_cell2 in cell2:
        cell2_concatenation += line_in_cell2
    difference = distance.nlevenshtein(cell1_concatenation, cell2_concatenation)
    return difference
def detect_trend(df):
    with open('shadow_words.txt', 'r') as f:
        shadow_tags = f.read().splitlines()
    with open('bad_list_total.txt', 'r') as f:
        shadow_tags += f.read().splitlines()
    shadow_tags.append('')

    top_tags = df.loc[df[df.shape[1] - 1] > 5000]
    drop_tags = [x for x in shadow_tags if x in top_tags.index]
    top_tags = top_tags.drop(drop_tags)

    # model = pycast.methods.ExponentialSmoothing(smoothingFactor=0.1, valuesToForecast=1)
    model = pycast.methods.HoltWintersMethod(seasonLength=4)
    forecast = []
    prev_times_s = map(lambda x: (x - prev_times[0]).total_seconds(), prev_times)
    top_tags.fillna(method='pad', inplace=True, axis=0)
    for index in top_tags.index:
        ts = zip(prev_times_s, df.ix[index])
        preds = model.execute(ts)
        pred = preds[-1][1] / preds[-10][1]
        forecast.append((index, pred))
    forecast.sort(key=lambda x: x[1], reverse=True)

    candidates = [x[0] for x in forecast[0:500]]
    candidates.sort(key=len, reverse=True)
    print candidates

    top = []
    for t1 in candidates:
        skip = False
        for t2 in top:
            # print t1, t2
            # print distance.nlevenshtein(t1, t2, method=1)
            if t1 != t2 and distance.nlevenshtein(t1, t2, method=1) < 0.4:
                skip = True
                break
        if skip:
            continue
        top.append(t1)

    top_sorted = []
    for tag in forecast[0:500]:
        if tag[0] in top:
            top_sorted.append(tag[0])
        if len(top_sorted) >= NUM_TOP_TAGS:
            break
    return top_sorted
def getTags(self, keywords):
    predtags = set([])
    keywordlist = []
    for keyword in keywords:
        keywordlist.append(keyword.split(' '))
    ################### check if a tag is completely in a keyword #################################
    for wordlist in self.taglist:
        flag = 0
        for keyword in keywordlist:
            if set(wordlist[0]) <= set(keyword):
                flag = 1
                break
        if flag == 1:
            predtags.add(self.tags[wordlist[1]])
    ############################################## step completed ###############################
    for keyword in keywords:
        for tag in self.taglem:
            score = distance.nlevenshtein(tag[0], keyword)
            if 0.2 > score:
                predtags.add(self.tags[tag[1]])
    return set(predtags)
def template_distance(template1, template2):
    return distance.nlevenshtein(
        template1.raw_str.strip().split(),
        template2.raw_str.strip().split()
    )
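# Usage sketch (hypothetical Template stand-in, not from the original project):
# template_distance splits raw_str on whitespace, so the edit distance is computed over
# word tokens rather than characters.
from collections import namedtuple
import distance

Template = namedtuple('Template', 'raw_str')   # minimal stand-in for illustration
t1 = Template('connection from <IP> closed')
t2 = Template('connection from <IP> refused')
print(template_distance(t1, t2))   # 0.25: one differing token out of four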
def stats(tfidf_matrix_train, dictTrain, tfidf_matrix_trainBigrams, dictTrainBigrams, lenGram, delete=[], plotX=False):
    with open("./data/stats.csv") as infile:
        for i, line in enumerate(infile):
            pass
    dimMatrix = 16
    predict = np.zeros((i + 1, dimMatrix))
    clf1, clf2 = train(tfidf_matrix_train, dictTrain, tfidf_matrix_trainBigrams,
                       dictTrainBigrams, lenGram, delete=delete)
    with open("./data/stats.csv") as infile:
        for i, line in enumerate(infile):
            a = line.rstrip().split("\t")
            ## create the same vector with more distances
            st1 = a[0].lower()
            st2 = a[1].lower()
            temp = [
                1. - (lev.distance(st1, st2) * 2 / (len(st1) + len(st2))),
                lev.jaro(st1, st2),
                lev.jaro_winkler(st1, st2),
                lev.ratio(st1, st2),
                distance.sorensen(st1, st2),
                jaccard(set(st1), set(st2)),
                1. - distance.nlevenshtein(st1, st2, method=1),
                1. - distance.nlevenshtein(st1, st2, method=2),
                dice_coefficient(st1, st2, lenGram=2),
                dice_coefficient(st1, st2, lenGram=3),
                dice_coefficient(st1, st2, lenGram=4),
                cosineWords(st1, st2),
                cosineBigrams(st1, st2)]
            if len(delete) > 0:
                for elem in delete:
                    temp[elem] = 0.
            predict[i, :-3] = temp
            predict[i, -3] = clf1.decision_function(np.array(temp, dtype=float))
            predict[i, -2] = clf2.decision_function(np.array(temp, dtype=float))
            predict[i, -1] = a[-1]

    if plotX:
        labelsM = ["Lev", "Jaro", "Jaro-Winkler", "Ratio", "Sorensen", "Jaccard",
                   "Lev1", "Lev2", "Dice_2", "Dice_3", "Dice_4",
                   "cosineWords", "cosineBigrams", "SVM", "Logit"]
        f1matrix = np.zeros((100, dimMatrix - 1))

        fig = plt.figure()
        fig.set_size_inches(9, 6)
        ax = fig.add_subplot(111)
        iC = -1
        for i in np.linspace(0, 1, 100):
            iC += 1
            for j in range(dimMatrix - 1):
                t = np.array(predict[:, j])
                if j >= dimMatrix - 3:
                    t = (t - np.min(t)) / (np.max(t) - np.min(t))
                f1matrix[iC, j] = f1_score(y_pred=t > i, y_true=predict[:, -1])
        F1scores = []
        for j in range(dimMatrix - 1):
            F1scores.append(np.max(f1matrix[:, j]))
            # ax.plot(np.linspace(0, 1, 100), f1matrix[:, j], label=labelsM[j], color=tableau20[j])
        ax.bar(range(dimMatrix - 1), F1scores)
        plt.xticks(np.arange(dimMatrix - 1) + 0.5, labelsM, rotation=45)
        ax.set_ylabel("F1 score")
        ax.set_xlabel("Parameter")
        plt.legend(loc=2)
        customaxis(ax)
        plt.savefig("f1_bar.pdf")
        plt.show()

        fig = plt.figure()
        fig.set_size_inches(9, 6)
        ax = fig.add_subplot(111)
        AUCScores = []
        for j in range(dimMatrix - 1):
            # Compute ROC curve and area under the curve
            fpr, tpr, thresholds = roc_curve(predict[:, -1], predict[:, j])
            AUCScores.append(auc(fpr, tpr))
            # Plot ROC curve
            ax.plot(fpr, tpr, label=labelsM[j], color=tableau20[j])
        ax.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('ROC Curve')
        plt.legend(loc=2)
        customaxis(ax)
        plt.savefig("roc.pdf")
        plt.show()

        fig = plt.figure()
        fig.set_size_inches(9, 6)
        ax = fig.add_subplot(111)
        ax.bar(range(dimMatrix - 1), AUCScores)
        ax.set_ylabel('Area Under Curve')
        plt.xticks(np.arange(dimMatrix - 1) + 0.5, labelsM, rotation=45)
        customaxis(ax)
        plt.savefig("roc_bar.pdf")
        plt.show()
def search():
    """ search the database for names or passwords """
    admin.list_titulo = ''  # delete page title
    app.logger.debug('entering search')
    try:
        query_name = request.form['inputName'] or None
        query_password = request.form['inputPassword'] or None
        query_distance = request.form['inputDistance'] or '40'
        query_distance = 1.0 - (float(query_distance) / 100.0)
        query_name_len = query_password_len = 0
        app.logger.debug(u"{0} {1}".format(query_name, query_password))
        if query_name is not None:
            query_name = query_name.strip()
            query_name_len = len(query_name)
        if query_password is not None:
            query_password = normalize_passport(query_password)
            query_password_len = len(query_password)
        score = []
        start_time = timeit.default_timer()
        if (query_name is not None) or (query_password is not None):
            query_st = u'select rowid, word, distance from spell_{0} where word match ? order by distance limit 10'
            param = ''
            query_spell_ref = ''
            root_filter = '/admin/entity/?flt0_5='
            query_filter_entity = ''
            if query_name is not None:
                query_stc = query_st.format(u'whole_name', query_distance)
                query_spell_ref = 'select entity_id from names where spell_ref=? limit 1'
                param = query_name
            else:
                query_stc = query_st.format(u'passport', query_distance)
                query_spell_ref = 'select entity_id from passports where spell_ref=? limit 1'
                param = query_password
            app.logger.debug(u'' + query_stc + ' ' + param)
            cursor = apsw_con.cursor()
            cursor2 = apsw_con.cursor()
            for rowid, word, distance in cursor.execute(query_stc, (param,)):
                # normalized distance (method=2: longest alignment)
                d = nlevenshtein(param.upper(), word.upper(), method=2)
                app.logger.debug(u'Distance between {0} and {1} is {2}'.format(param, word, d))
                if d <= query_distance:
                    # find spell reference
                    for rf in cursor2.execute(query_spell_ref, (rowid,)):
                        if len(query_filter_entity) == 0:
                            query_filter_entity = rf[0]
                        else:
                            query_filter_entity += '%2C' + rf[0]
                    score.append((rowid, word, (1 - d) * 100))
            et = u'Execution time: {0} s'.format(timeit.default_timer() - start_time)
            admin.ent_ctrl.list_titulo = u"Results for {0} with {1}% of similarity. ({2})".format(
                param, (1 - query_distance) * 100, et)
            return redirect(root_filter + query_filter_entity)
            # http://localhost:5000/admin/entity/?flt2_5=EU40%2CUN40
            """
            return render_template(
                'index.html',
                query_name=u"{0}".format(param),
                score=score,
                similarity=(1 - query_distance) * 100,
                execution_time=et)
            """
        else:
            return redirect('/admin')
    except Exception, e:
        msg = "Rendering error: {0}".format(e)
        app.logger.error(msg)
        return render_template('400.html', msg=msg)
def findEditDistance(self, qFeat):
    # print self.query, qFeat.query, distance.nlevenshtein(self.query, qFeat.query, method=1), distance.nlevenshtein(self.query, qFeat.query, method=2)
    edit = 1.0 - distance.nlevenshtein(self.query, qFeat.query, method=1)
    return edit
import sys
import nltk
import numpy as np
import sklearn.cluster
import distance

f = open(sys.argv[1], "r")
read_line = f.readlines()
words = []
z = []
for line in read_line:
    line = line.translate(None, '\n')
    r = line.split(',')
    o = r[0] + r[1]
    z = []
    z.append(r[0])
    z.append(r[1])
    print nltk.pos_tag(z)
    words.append(o)

lev_similarity = -1 * np.array([[distance.nlevenshtein(w1, w2, method=2) for w1 in words] for w2 in words])

affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed", damping=0.5)
affprop.fit(lev_similarity)
for cluster_id in np.unique(affprop.labels_):
    exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
    y = np.nonzero(affprop.labels_ == cluster_id)
    output = ""
    for j in y:
        for k in j:
            output = output + ',' + words[k]
    print exemplar, ":", output
    # cluster = np.unique(words[np.nonzero(affprop.labels_ == cluster_id)])
    # cluster_str = ", ".join(cluster)
    # print(" - *%s:* %s" % (exemplar, cluster_str))
###### The code below gives suggestions in a spell checker ############
import nltk
import distance

# Array dict_words stores the words in the English dictionary
dict_words = nltk.corpus.words.words('en')

# Array to store similar words
sim_words = []
# Array to store similarity values for each word in the sim_words array
sim_dist = []

for word in dict_words:
    if distance.nlevenshtein(word, "applicablity", method=1) <= 0.25:
        sim_words.append(word)
        sim_dist.append(distance.nlevenshtein(word, "applicablity", method=1))

# Minimum value among all the sim_dist values, which represents the highest similarity
min_value = min(sim_dist)
# Extracting the index of the min_value
index_min_val = sim_dist.index(min(sim_dist))
# Suggesting a word for the wrong spelling
print(sim_words[index_min_val])
def logline_distance(logline1, logline2):
    return distance.nlevenshtein(
        logline1.text.strip().split(),
        logline2.text.strip().split())