def test_distance():
    """Check levenshtein() against the fixture strings.

    Each fixture pair must be 3 edits apart in both argument orders, and
    every fixture must be 0 edits away from itself.
    """
    fixture_pairs = ((S1, S2), (U1, U2), (S2, S1), (U2, U1))
    for left, right in fixture_pairs:
        assert_equal(levenshtein(left, right), 3)

    for fixture in (S1, S2, U1, U2):
        assert_equal(levenshtein(fixture, fixture), 0)
def file_editdistance_eval(path1, path2):
    """Return the Levenshtein distance between the contents of two files.

    Both files are read in text mode and passed through strip_whitespace()
    before they are compared.
    """
    stripped = []
    for path in (path1, path2):
        with open(path, 'r') as handle:
            stripped.append(strip_whitespace(handle.read()))
    first, second = stripped
    return levenshtein(first, second)
def distanceMetric(start, end):
    """Compute Levenshtein distances for a slice of the pickled action lists.

    Reads ``actionList.pkl`` from the working directory, encodes every action
    sequence as a string (one character per action code, with -1 mapped to
    255), and for each sequence index in ``[start, end)`` computes the
    distance to every sequence.  The resulting rows are pickled to
    ``distMetric<start>.pkl``.

    start -- index of the first sequence to process.
    end   -- one past the last index to process; clamped to the number of
             sequences.
    """
    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # file; only run this on an actionList.pkl from a trusted source.
    with open("actionList.pkl", "rb") as source:
        actions = pickle.load(source)

    actionStrings = [
        "".join(chr(255 if code == -1 else code) for code in item)
        for item in actions
    ]
    end = min(end, len(actions))

    distanceMatrix = []
    for i in range(start, end):
        # A single pre-formatted argument prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("process:  %s" % i)
        distanceMatrix.append(
            [levenshtein(actionStrings[i], other) for other in actionStrings])

    # The context manager closes the output file even if pickling fails.
    with open("distMetric" + str(start) + ".pkl", "wb") as output:
        pickle.dump(distanceMatrix, output)
def feature_TITLE(self, lnk, re_label_text, features):
    """Add title-versus-label comparison features to ``features`` in place.

    lnk           -- mapping with a "label" entry (the query) and a "title"
                     entry (the candidate article title).
    re_label_text -- regex pattern searched for inside the article title.
    features      -- dict that receives the keys NCT, TCN, TEN,
                     SUBSTRING_MATCH_1, SUBSTRING_MATCH_2, WORD_MATCH,
                     WORD_MISS and EDIT_DISTANCE.
    """
    label_text = unicode(lnk["label"])
    re_title = stringUtils.ngramToPattern(lnk['title'])
    article_title = unicode(lnk['title'])

    # NCT: the title pattern occurs in the label; TCN: the label pattern
    # occurs in the title; TEN: title and label are exactly equal.
    features["NCT"] = 0 if re.search(re_title, label_text) is None else 1
    features["TCN"] = \
        0 if re.search(re_label_text, article_title) is None else 1
    features["TEN"] = 1 if article_title == label_text else 0

    # Irritatingly enough, split() can give you an empty value as the last
    # element; drop it.
    split_label = self.re_non_word_chars.split(label_text)
    if split_label[-1] == '':
        split_label.pop()
    split_title = self.re_non_word_chars.split(article_title)
    if split_title[-1] == '':
        split_title.pop()

    # Dropping the trailing empty value can leave a list empty (for example
    # for an empty label); the positional comparisons below then count as
    # "no match" instead of raising IndexError.
    both_have_words = bool(split_label) and bool(split_title)

    # I: the first word of the candidate title equals the first word of the
    # query (e.g. "Cambridge, Massachusetts" and "Cambridge")
    features["SUBSTRING_MATCH_1"] = \
        1 if both_have_words and split_title[0] == split_label[0] else 0

    # II: the last word of the candidate title equals the last word of the
    # query (e.g. "Venice-Simplon Orient Express" and "Orient Express")
    features["SUBSTRING_MATCH_2"] = \
        1 if both_have_words and split_title[-1] == split_label[-1] else 0

    # collections.Counter() converts a list to a dict of words and their
    # frequencies; "&" keeps only the words present in both.
    cSplitLabel = collections.Counter(split_label)
    cSplitTitle = collections.Counter(split_title)

    # Number of distinct words shared by the candidate title and the query
    features['WORD_MATCH'] = len(cSplitLabel & cSplitTitle)

    # Number of words of the title and the query left over after removing
    # the shared ones from both sides
    features['WORD_MISS'] = len(split_label) + len(split_title) \
        - (2 * features['WORD_MATCH'])

    # Levenshtein distance between the query and the candidate title
    features["EDIT_DISTANCE"] = levenshtein(label_text, article_title)
def test_normalize():
    """Check the result type of levenshtein() and its ``normalize`` option.

    The plain distance must be an integral number; with ``normalize=True``
    two empty strings must score 0 and the S1/S2 fixtures must score 3 / 7.
    """
    plain_distance = levenshtein(S2, S1)
    assert_true(isinstance(plain_distance, numbers.Integral))

    empty_score = levenshtein("", "", normalize=True)
    assert_equal(empty_score, 0)

    fixture_score = levenshtein(S1, S2, normalize=True)
    assert_equal(fixture_score, 3 / 7)
def metric_levenshtein(predictions, labels):
    """Return the mean per-sample similarity between predictions and labels.

    The predictions go through softmax and a top-k over axis 2 before both
    sides are decoded with decode().  Each pair contributes
    ``(len(label) - distance) / len(label)``; the sum of the contributions is
    divided by ``len(labels)``.
    """
    top_predictions = predictions.softmax().topk(axis=2).asnumpy()
    decoded_labels = decode(labels.asnumpy())
    decoded_predictions = decode(top_predictions)

    total = 0
    for label, pred in zip(decoded_labels, decoded_predictions):
        total += (len(label) - levenshtein(label, pred)) / len(label)
    return total / len(labels)
def lev_metric(x, y):
    """Return the Levenshtein distance between two entries of ``data``.

    ``x`` and ``y`` only carry indices: their first element is converted to
    int and used to look the strings up in the enclosing-scope ``data``.
    """
    first_index = int(x[0])
    second_index = int(y[0])
    return levenshtein(data[first_index], data[second_index])
r = requests.get('http://0.0.0.0:8000/circle?_f=%s&_t=%s' % (f, t)) except: print('smurf hug api started?') print('try: hug -m smurf.hug_api') sys.exit(-1) rj = r.json() rjc = cycle(rj) l = len(rj) p = n = None print('circle:edit distances should be 1') while l: if not p: p = next(rjc) n = next(rjc) print(p, n, leven.levenshtein(p, n)) p = n l -= 1 print('random:edit distances should not be always 1') l = len(rj) p = n = None random.shuffle(rj) rjc = cycle(rj) while l: if not p: p = next(rjc) n = next(rjc) print(p, n, leven.levenshtein(p, n)) p = n l -= 1
def _closestAuthorAffiliation(authorName, article):
    """Find the affiliation of the author of ``article`` closest to a name.

    Returns an ``(ambiguous, affiliation)`` pair.  ``ambiguous`` is True when
    several authors tie for the smallest edit distance to ``authorName``.
    ``affiliation`` is the cleaned affiliation of the single closest author
    when that distance is below 3, and None otherwise.
    """
    lDists = [levenshtein(unicode(authorName), unicode(authSimple(name)))
              for name in article.author]
    closest = min(lDists)
    good = [idx for idx, dist in enumerate(lDists) if dist == closest]
    if len(good) > 1:
        return True, None
    # Subscript with a plain int: a one-element NumPy index array cannot be
    # used to index a Python list on current NumPy versions.
    best = good[0]
    if lDists[best] < 3:
        return False, affClean(article.aff[best])
    return False, None


def checkAuthorMatch(article1, article2, authorName=None, yearGap=2,
                     nCommonAuths=3, matchThresh=0.70, absMatchLimit=0.5,
                     titleMatchLimit=0.5, keywordMatchTol=2):
    """
    Check if two articles are probably from the same author.

    They are considered a match as soon as one of these holds, tested in
    this order:

    1) article1 lists more than one author and both author sets are
       identical
    2) both have a ``year``, the years are at most yearGap apart, and the
       affiliations of the author of interest match according to
       checkAffMatch.  matchThresh sets the threshold for that fuzzy string
       comparison (so, per the original author's note, 'Univeristy of
       Washington' will match 'University of Washington, Astronomy
       Department')
    3) they share nCommonAuths or more authors
    4) they share more than keywordMatchTol keywords and the author of
       interest appears exactly once in each author list, spelled the same

    Within step 2 the function returns False right away when several authors
    of an article are equally close to authorName, because the author of
    interest cannot be identified.

    authorName defaults to the simplified first author of article1.
    absMatchLimit and titleMatchLimit are accepted but not used by this
    function.

    returns
    -------
    bool
    """
    if authorName is None:
        authorName = authSimple(article1.author[0])

    # If author list is >1 and identical
    if len(article1.authorset) > 1 and \
            article1.authorset == article2.authorset:
        return True

    # If published within yearGap from same location
    if hasattr(article1, 'year') and hasattr(article2, 'year'):
        if abs(int(article1.year) - int(article2.year)) <= yearGap:
            ambiguous, aff1 = _closestAuthorAffiliation(authorName, article1)
            if ambiguous:
                # Can't tell which one is the author, can't link it up.
                return False
            ambiguous, aff2 = _closestAuthorAffiliation(authorName, article2)
            if ambiguous:
                return False
            if aff1 is not None and aff2 is not None:
                if checkAffMatch(aff1, aff2, matchThresh=matchThresh):
                    return True

    # Check if they have enough common authors
    commonAuthors = article1.authorset.intersection(article2.authorset)
    if len(commonAuthors) >= nCommonAuths:
        return True

    # If the keywords are in common
    if article1.keyword is not None and article2.keyword is not None:
        sharedKeywords = set(article1.keyword) & set(article2.keyword)
        if len(sharedKeywords) > keywordMatchTol:
            # If the author is spelled exactly the same, and there are
            # matching keywords
            target = authSimple(authorName)
            a1 = [author for author in article1.author
                  if authSimple(author) == target]
            a2 = [author for author in article2.author
                  if authSimple(author) == target]
            if len(a1) == 1 and len(a2) == 1 and a1[0] == a2[0]:
                return True

    return False
def filter_by_levenshtein(self, word_spell, distance):
    """Yield the entries of ``self.words`` close to ``word_spell``.

    An entry is yielded when the Levenshtein distance between ``word_spell``
    and the entry's ``entry.form`` is at most ``distance``.
    """
    for candidate in self.words:
        edits = levenshtein(word_spell, candidate.entry.form)
        if edits <= distance:
            yield candidate
def generate_sequences(self, inputs, begin_states, sentence):
    """Pick the best corrected version of ``sentence`` among sampled ones.

    Candidates come from ``self.sampler(inputs, begin_states)`` and are
    narrowed down by successive heuristics:

    1. if every token of ``sentence`` is already acceptable, return it as is;
    2. keep the candidates with the fewest penalised tokens;
    3. of those, keep the ones closest to ``sentence`` in edit distance;
    4. of those, keep the ones with the fewest tokens;
    5. of those, take the one with the lowest language-model perplexity;
    6. cut the result when it is much longer than ``sentence``.

    Returns the chosen sentence as a string.  ``sentence`` itself is returned
    when the sampler yields no candidate at all.
    """
    # The entity strings are assembled from pieces so that tooling which
    # decodes HTML entities in source text cannot corrupt these literals.
    amp = "&"
    quot_entity = amp + "quot;"
    apos_entity = amp + "apos;"
    amp_entity = amp + "amp;"

    def unescape(token):
        # Undo the HTML escaping of double quotes, apostrophes and
        # ampersands in a single token.
        return (token.replace(quot_entity, '"')
                .replace(apos_entity, "'")
                .replace(amp_entity, amp))

    samples, _, valid_lengths = self.sampler(inputs, begin_states)
    samples = samples[0].asnumpy()
    valid_lengths = valid_lengths[0].asnumpy()

    # Heuristic #1
    # If the sentence is correct, let's not try to change it
    accepted_contractions = (
        "don't", "doesn't", "can't", "won't", "ain't", "couldn't", "i'd",
        "you'd", "he's", "she's", "it's", "i've", "you've", "she'd")
    sentence_correct = True
    for token in (unescape(raw) for raw in self.tokenizer(sentence)):
        known = token in self.vocab and self.vocab[token] <= 400000
        if not known and token.lower() not in accepted_contractions:
            sentence_correct = False
            break
    if sentence_correct:
        return sentence

    # Heuristic #2
    # We want the sentences that have the most in-vocabulary words and we
    # penalise out-of-vocabulary words that do not start with a capital
    # letter (capitalised words are treated as names).
    max_score = -10e20
    best_tokens = []
    for idx, sample in enumerate(samples):
        decoded = decode_char(sample[:valid_lengths[idx]])
        tokens = [unescape(raw) for raw in self.tokenizer(decoded)]
        score = 0
        for t in tokens:
            in_vocab = t in self.vocab and self.vocab[t] < 450000
            looks_like_name = len(t) > 0 and t.istitle()
            if not (in_vocab or looks_like_name):
                score -= 1
        if score == max_score:
            best_tokens.append(tokens)
        elif score > max_score:
            max_score = score
            best_tokens = [tokens]

    if not best_tokens:
        # The sampler produced nothing to choose from.
        return sentence

    # Heuristic #3
    # Smallest edit distance to the original sentence.  Starting from
    # infinity guarantees at least one candidate survives, however large the
    # distances are.
    best_dist = float("inf")
    closest = []
    for tokens in best_tokens:
        dist = leven.levenshtein(sentence,
                                 ' '.join(self.detokenizer(tokens)))
        if dist < best_dist:
            best_dist = dist
            closest = [tokens]
        elif dist == best_dist:
            closest.append(tokens)

    # Heuristic #4
    # We take the sentences with the smallest number of tokens to avoid
    # splitting up composed words
    min_len = min(len(tokens) for tokens in closest)
    shortest = [tokens for tokens in closest if len(tokens) == min_len]

    # Heuristic #5
    # If we still have ties we take the sentence with the lowest perplexity
    # according to the language model.  Single-token candidates are not
    # scored, so the first heuristic-2 candidate stays the fallback.
    output_tokens = best_tokens[0]
    best_ppl = 10e20
    for tokens in shortest:
        if len(tokens) > 1:
            token_ids = self.vocab[tokens]
            hidden = self.language_model.begin_state(batch_size=1,
                                                     func=mx.nd.zeros,
                                                     ctx=self.ctx_nlp)
            output, _ = self.language_model(
                mx.nd.array(token_ids).expand_dims(axis=1).as_in_context(
                    self.ctx_nlp), hidden)
            output = output.softmax()
            neg_log_likelihood = 0
            for pos in range(1, len(token_ids)):
                neg_log_likelihood += \
                    -output[pos - 1][0][token_ids[pos]].log()
            ppl = (neg_log_likelihood / len(token_ids)).exp()
            if ppl < best_ppl:
                output_tokens = tokens
                best_ppl = ppl
    output = ' '.join(self.detokenizer(output_tokens))

    # Heuristic #6
    # Sometimes there are artefacts at the end of the corrected sentence;
    # we cut the end of the sentence
    if len(output) > len(sentence) + 10:
        output = output[:len(sentence) + 2]
    return output
def affinity_propagation(self,
                         entity_group: list,
                         metric: str = None,
                         damping: float = None,
                         preference: int = None,
                         embeddings: list = None,
                         entity_name: str = None,
                         selected_base_models: list = None):
    """
    Cluster ``entity_group`` with affinity propagation.

    In contrast to other traditional clustering methods, affprop does not
    require you to specify the number of clusters. Data points exchange
    messages about how well suited each point is to serve as the exemplar of
    another until a consensus is reached; all points with the same exemplar
    are placed in the same cluster.

    :param entity_group: the strings to cluster (company names, locations,
        or unknown_soup for everything else).
    :param metric: "jaro" or "levenshtein". Only used when
        ``self.representation`` is "graph_representation"; any other value
        raises SystemExit in that mode.
    :param damping: damps the responsibility and availability messages to
        avoid numerical oscillations. None falls back to 0.5, the
        scikit-learn default, because the estimator needs a number.
    :param preference: preferences for each point - points with larger
        values are more likely to be chosen as exemplars. If None,
        scikit-learn uses the median of the input similarities.
    :param embeddings: the vectors to cluster in case of vector
        representation.
    :param entity_name: only used in the name of the results file.
    :param selected_base_models: only used in the name of the results file;
        replaced by '' for the graph representation.
    :return: dict mapping each exemplar to a ", "-joined string of the
        unique members of its cluster.
    """
    words = np.asarray(entity_group)
    if damping is None:
        damping = 0.5

    # graph representation aka string similarity - requires a precomputed
    # matrix of similarities between the strings
    if self.representation == "graph_representation":
        affinity = "precomputed"
        selected_base_models = ''
        if metric == "jaro":
            # NOTE(review): used as-is, so jaro() is assumed to return a
            # similarity (larger = more alike) - confirm.
            features = np.array([[jaro(w1, w2) for w1 in words]
                                 for w2 in words])
        elif metric == "levenshtein":
            # Negated so that a larger value means "more similar".
            features = -1 * np.array(
                [[levenshtein(w1, w2) for w1 in words] for w2 in words])
        else:
            raise SystemExit(
                f"[ERROR]: function affinity_propagation() -> Provide one of the available metrics: "
                f"['jaro', 'levenshtein']")
    else:
        # in this case we are dealing with embeddings; only euclidean
        # distance is supported
        affinity = 'euclidean'
        metric = affinity
        features = embeddings

    affprop = AffinityPropagation(affinity=affinity,
                                  damping=damping,
                                  preference=preference,
                                  random_state=None)
    affprop.fit(features)

    clusters = {}
    for cluster_id in np.unique(affprop.labels_):
        exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
        members = np.unique(words[np.nonzero(affprop.labels_ == cluster_id)])
        cluster_str = ", ".join(members)
        clusters[exemplar] = cluster_str
        if self.print_output:
            print(f"- **{exemplar}** --> {cluster_str}")

    if self.save_output:
        # name of json is dynamic - all parameter values are integrated in
        # the name of the file itself
        with open(
                f"{str(Path.cwd())}/results/{self.representation[0]}_affinity_"
                f"{metric}_{str(damping)}_{preference}_{entity_name}_{selected_base_models}.json",
                "w+") as out:
            json.dump(clusters, out, indent=4, sort_keys=True)
    return clusters
def agglomerative(self,
                  entity_group: list = None,
                  metric: str = None,
                  linkage: str = None,
                  distance_threshold: float = None,
                  compute_full_tree: bool = True,
                  n_clusters: int = None,
                  embeddings: list = None,
                  entity_name: str = None,
                  selected_base_models: list = None):
    """
    Cluster ``entity_group`` with agglomerative (bottom-up) clustering.

    Each item starts in its own cluster and the two most similar items or
    clusters are merged, two at a time, until the stopping criterion is met.

    :param entity_group: the strings to cluster (company names, locations,
        unknown soup).
    :param metric: for the graph representation, "jaro" or "levenshtein"
        (any other value raises SystemExit); for embeddings, the affinity
        handed to scikit-learn, falling back to "euclidean" when None.
    :param linkage: {"ward", "complete", "average", "single"} - which
        distance to use between sets of observations. The algorithm merges
        the pair of clusters that minimises this criterion.
        * ward minimises the variance of the clusters being merged.
        * average uses the average of the distances of each observation of
          the two sets.
        * complete uses the maximum distance between all observations of the
          two sets.
        * single uses the minimum distance between all observations of the
          two sets.
    :param distance_threshold: the linkage distance above which clusters are
        not merged. If not None, n_clusters must be None and
        compute_full_tree must be True.
    :param n_clusters: the number of clusters to find. It must be None if
        distance_threshold is not None.
    :param compute_full_tree: 'auto' or bool. It must be True if
        distance_threshold is not None.
    :param embeddings: the vectors to cluster in case of vector
        representation.
    :param entity_name: only used in the name of the results file.
    :param selected_base_models: only used in the name of the results file;
        replaced by '' for the graph representation.
    :return: dict mapping each integer cluster label to the list of its
        members.
    """
    data = np.asarray(list(entity_group))

    # graph representation aka string similarity - requires a precomputed
    # matrix of distances between the strings
    if self.representation == "graph_representation":
        affinity = "precomputed"
        selected_base_models = ''
        if metric == "jaro":
            # NOTE(review): a precomputed affinity must be a distance matrix;
            # if jaro() returns a similarity it needs converting - confirm.
            features = np.array([[jaro(w1, w2) for w1 in data]
                                 for w2 in data])
        elif metric == "levenshtein":
            # Edit distances are passed unchanged: with a precomputed
            # affinity scikit-learn expects distances, and negating them
            # would make the most different strings merge first.
            features = np.array(
                [[levenshtein(w1, w2) for w1 in data] for w2 in data])
        else:
            raise SystemExit(
                f"[ERROR]: function agglomerative() -> Provide one of the available metrics: "
                f"['jaro', 'levenshtein']")
    else:
        # in this case we are dealing with embeddings; fall back to the
        # euclidean distance when no metric was given
        affinity = metric if metric is not None else 'euclidean'
        metric = affinity
        features = embeddings

    agg = AgglomerativeClustering(affinity=affinity,
                                  linkage=linkage,
                                  distance_threshold=distance_threshold,
                                  compute_full_tree=compute_full_tree,
                                  n_clusters=n_clusters)
    agg.fit(features)

    clusters = {}
    for idx, label in enumerate(agg.labels_):
        # labels are NumPy integers; json needs plain int keys
        clusters.setdefault(int(label), []).append(data[idx])

    if self.print_output:
        for key, item in clusters.items():
            print(key, item)

    if self.save_output:
        # name of json is dynamic - all parameter values are integrated in
        # the name of the file itself
        with open(
                f"{str(Path.cwd())}/results/{self.representation[0]}_agglomarative_"
                f"{str(metric)}_{linkage}_{distance_threshold}_{compute_full_tree}_"
                f"{entity_name}_{selected_base_models}.json", "w+") as out:
            json.dump(clusters, out, indent=4, sort_keys=True)
    return clusters
def dbscan(self,
           entity_group: list = None,
           metric: str = None,
           epsilon: float = None,
           min_samples: int = None,
           embeddings: list = None,
           entity_name: str = None,
           selected_base_models: list = None):
    """
    Cluster ``entity_group`` with DBSCAN (density-based clustering).

    Density-based clustering identifies "dense" groups of points, which lets
    it learn clusters of arbitrary shape and identify outliers. For some
    real-valued epsilon > 0 and some point p, the epsilon-neighbourhood of p
    is the set of points that are at most distance epsilon away from p.

    :param entity_group: the strings to cluster (company names, locations,
        unknown soup).
    :param metric: for the graph representation, "jaro" or "levenshtein"
        (any other value raises SystemExit); for embeddings, the metric
        handed to scikit-learn, falling back to "euclidean" when None.
    :param epsilon: the radius of the neighbourhood around a data point.
    :param min_samples: the minimum number of data points that have to be
        within that neighbourhood for a point to be considered a core point
        - the cluster density threshold.
    :param embeddings: the vectors to cluster in case of vector
        representation.
    :param entity_name: only used in the name of the results file.
    :param selected_base_models: only used in the name of the results file;
        replaced by '' for the graph representation.
    :return: dict mapping each integer cluster label to the list of its
        members.
    """
    words = list(entity_group)

    # graph representation aka string similarity - requires a precomputed
    # matrix of distances between the strings
    if self.representation == "graph_representation":
        affinity = "precomputed"
        selected_base_models = ''
        if metric == "jaro":
            # NOTE(review): a precomputed metric must be a distance matrix;
            # if jaro() returns a similarity it needs converting - confirm.
            features = np.array([[jaro(w1, w2) for w1 in words]
                                 for w2 in words])
        elif metric == "levenshtein":
            features = np.array(
                [[levenshtein(w1, w2) for w1 in words] for w2 in words])
        else:
            raise SystemExit(
                f"[ERROR]: function dbscan() -> Provide one of the available metrics: "
                f"['jaro', 'levenshtein']")
    else:
        # in this case we are dealing with embeddings; fall back to the
        # euclidean distance when no metric was given
        affinity = metric if metric is not None else 'euclidean'
        metric = affinity
        features = embeddings

    model = DBSCAN(eps=epsilon, min_samples=min_samples, metric=affinity)
    model.fit(np.array(features))

    clusters = {}
    for idx, label in enumerate(model.labels_):
        # labels are NumPy int64 values; json needs plain int keys
        clusters.setdefault(int(label), []).append(words[idx])

    if self.print_output:
        for key, item in clusters.items():
            print(key, item)

    if self.save_output:
        # name of json is dynamic - all parameter values are integrated in
        # the name of the file itself
        with open(f"{str(Path.cwd())}/results/{self.representation[0]}_dbscan_"
                  f"{metric}_{str(epsilon)}_{str(min_samples)}_{entity_name}_{selected_base_models}.json", "w+") \
                as out:
            json.dump(clusters, out, indent=4, sort_keys=True)
    return clusters
def lev_metric(x, y):
    """Return the edit distance between the ``data`` entries x and y refer to.

    Only the first element of each argument is read; it is converted to int
    and used as an index into the enclosing-scope ``data``.
    """
    left = data[int(x[0])]
    right = data[int(y[0])]
    return levenshtein(left, right)
def feature_NORMALIZATION(self, lnk):
    """Return the label-to-text edit distance relative to the text length.

    The distance is computed between the unicode forms of ``lnk["label"]``
    and ``lnk["text"]`` and divided by ``len(lnk["text"])``.
    """
    label = unicode(lnk["label"])
    text = unicode(lnk["text"])
    edit_distance = levenshtein(label, text)
    return float(edit_distance) / len(lnk["text"])
def _get_seq_match_score(seq_1, seq_2):
    """
    Score how well two sequences match.

    Any reasonable matching criterion would do; for now the score is the
    edit distance computed by leven.levenshtein, which is more than is
    strictly needed.
    """
    score = leven.levenshtein(seq_1, seq_2)
    return score
def get_edit_distance(self, row, geoname):
    """Return the Levenshtein distance from ``geoname`` to ``row['NAME']``."""
    return levenshtein(geoname, row['NAME'])
rightNE=[] with open('/home/ahmad/duplicate-detection/eventregistrydata/pairs/StanfordNErightAttro2enesde_'+str(_c)+'.txt','r') as myfile: rightNE=myfile.readlines() leftNE=[item.replace("[u'","").replace("[","").replace("]","").replace("\', u\'"," ").replace("\'","").split() for item in leftNE] rightNE=[item.replace("[u'","").replace("[","").replace("]","").replace("\', u\'"," ").replace("\'","").split() for item in rightNE] for _i,_leftNE,_rightNE in zip(range(len(leftNE)),leftNE,rightNE): if Alllangpairs[n+_i][0]==Alllangpairs[n+_i][1]: score.append(-1) continue nintersection=0 for _iteml in set(_leftNE): for _itemr in set(_rightNE): if 1.0 * levenshtein(_iteml, _itemr)/(len(_iteml)+len(_itemr))<0.25: nintersection+=1 nunion=len(set(_leftNE))+len(set(_rightNE)) if nunion>0: score.append(1.0*nintersection/nunion) n+=len(leftNE) len(Alllangpairs),len(score),len(Allsentpairs),len(Allisdup_labels) i=score.index(-1) sum([True for _s, _l in zip(score, Alllangpairs[540000:540000+len(score)]) if _l[0]==_l[1] and _s==-1]) sum([True for _s in score if _s==-1])
def _closestAuthorAffiliation(authorName, article):
    """Find the affiliation of the author of ``article`` closest to a name.

    Returns an ``(ambiguous, affiliation)`` pair.  ``ambiguous`` is True when
    several authors tie for the smallest edit distance to ``authorName``.
    ``affiliation`` is the cleaned affiliation of the single closest author
    when that distance is below 3, and None otherwise.
    """
    lDists = [levenshtein(unicode(authorName), unicode(authSimple(name)))
              for name in article.author]
    closest = min(lDists)
    good = [idx for idx, dist in enumerate(lDists) if dist == closest]
    if len(good) > 1:
        return True, None
    # Subscript with a plain int: a one-element NumPy index array cannot be
    # used to index a Python list on current NumPy versions.
    best = good[0]
    if lDists[best] < 3:
        return False, affClean(article.aff[best])
    return False, None


def checkAuthorMatch(article1,article2,authorName=None,
                     yearGap=2,
                     nCommonAuths=3, matchThresh=0.70,
                     absMatchLimit=0.5, titleMatchLimit=0.5,
                     keywordMatchTol=2):
    """
    Check if two articles are probably from the same author.

    They are considered a match as soon as one of these holds, tested in
    this order:

    1) article1 lists more than one author and both author sets are
       identical
    2) both have a ``year``, the years are at most yearGap apart, and the
       affiliations of the author of interest match according to
       checkAffMatch.  matchThresh sets the threshold for that fuzzy string
       comparison (so, per the original author's note, 'Univeristy of
       Washington' will match 'University of Washington, Astronomy
       Department')
    3) they share nCommonAuths or more authors
    4) they share more than keywordMatchTol keywords and the author of
       interest appears exactly once in each author list, spelled the same

    Within step 2 the function returns False right away when several authors
    of an article are equally close to authorName, because the author of
    interest cannot be identified.

    authorName defaults to the simplified first author of article1.
    absMatchLimit and titleMatchLimit are accepted but not used by this
    function.

    returns
    -------
    bool
    """
    if authorName is None:
        authorName = authSimple(article1.author[0])

    # If author list is >1 and identical
    if len(article1.authorset) > 1 and \
            article1.authorset == article2.authorset:
        return True

    # If published within yearGap from same location
    if hasattr(article1,'year') and hasattr(article2,'year'):
        if abs(int(article1.year)-int(article2.year)) <= yearGap:
            ambiguous, aff1 = _closestAuthorAffiliation(authorName, article1)
            if ambiguous:
                # Can't tell which one is the author, can't link it up.
                return False
            ambiguous, aff2 = _closestAuthorAffiliation(authorName, article2)
            if ambiguous:
                return False
            if aff1 is not None and aff2 is not None:
                if checkAffMatch(aff1,aff2,matchThresh=matchThresh):
                    return True

    # Check if they have enough common authors
    commonAuthors = article1.authorset.intersection(article2.authorset)
    if len(commonAuthors) >= nCommonAuths:
        return True

    # If the keywords are in common
    if article1.keyword is not None and article2.keyword is not None:
        sharedKeywords = set(article1.keyword) & set(article2.keyword)
        if len(sharedKeywords) > keywordMatchTol:
            # If the author is spelled exactly the same, and there are
            # matching keywords
            target = authSimple(authorName)
            a1 = [author for author in article1.author
                  if authSimple(author) == target]
            a2 = [author for author in article2.author
                  if authSimple(author) == target]
            if len(a1) == 1 and len(a2) == 1 and a1[0] == a2[0]:
                return True

    return False
def lev_dist(X):
    """Return the matrix of Levenshtein distances between all pairs in X.

    Entry ``[row][col]`` of the returned ``len(X)`` x ``len(X)`` float array
    holds ``levenshtein(X[row], X[col])``.
    """
    size = len(X)
    D = np.zeros((size, size))
    for row in range(size):
        for col in range(size):
            D[row, col] = levenshtein(X[row], X[col])
    return D