def get_similarities(Features, url_input):
    """
    similarity metrics include: Levenshtein, Jaro-Winkler, Damerau-Levenshtein,
    normalized Damerau-Levenshtein, and Hamming distance
    :param Features: input dictionary to add things to
    :param url_input: URL string to compare against known names
    :return: Features: after adding all similarity metrics
    """
    for n in itertools.chain(product_domain_names, brand_names):
        Features['url_levenshtein_distance_' + n] = Levenshtein.distance(
            url_input, n)
        Features['fqdn_levenshtein_distance_' + n] = Levenshtein.distance(
            Features['fqdn'], n)
        Features['url_jaro_winkler_distance_' + n] = jw.get_jaro_distance(
            url_input, n)
        Features['fqdn_jaro_winkler_distance_' + n] = jw.get_jaro_distance(
            Features['fqdn'], n)
        Features['url_damerau_levenshtein_distance_' + n] = \
            dl.damerau_levenshtein_distance(url_input, n)
        Features['fqdn_damerau_levenshtein_distance_' + n] = \
            dl.damerau_levenshtein_distance(Features['fqdn'], n)
        Features['url_damerau_levenshtein_normalized_distance_' + n] = \
            dl.normalized_damerau_levenshtein_distance(url_input, n)
        Features['fqdn_damerau_levenshtein_normalized_distance_' + n] = \
            dl.normalized_damerau_levenshtein_distance(Features['fqdn'], n)
        if len(n) == len(url_input):
            Features['url_length_equals_' + n] = 1
            Features['url_hamming_distance_' + n] = hamming(url_input, n)
            Features['fqdn_hamming_distance_' + n] = hamming(
                Features['fqdn'], n)
        else:
            Features['url_length_equals_' + n] = 0
    return Features

def spell_score(misspelling, candidates, method=1):
    """
    Calculates the edit distance between a misspelling and each candidate
    according to the chosen method
    :param misspelling: misspelling
    :param candidates: list of candidates
    :param method: chosen method from [1, 2, 3, 4]
    :return: list of edit distances between misspelling and each candidate
    """
    lexical_scores = [damerau_levenshtein_distance(misspelling, candidate)
                      for candidate in candidates]
    if method == 1:
        return lexical_scores
    else:
        phonetic_scores = [damerau_levenshtein_distance(dm(misspelling)[0], dm(candidate)[0])
                           for candidate in candidates]
        if method == 2:
            return [phonetic_score if phonetic_score != 0 else 1
                    for phonetic_score in phonetic_scores]
        elif method == 3:
            return [0.5 * (a + b) for a, b in zip(lexical_scores, phonetic_scores)]
        elif method == 4:
            return [(2 * a + b) ** 2 for a, b in zip(lexical_scores, phonetic_scores)]
        else:
            raise ValueError('Method must be element from [1, 2, 3, 4]')

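# Hypothetical usage sketch (not part of the original source): spell_score()
# above needs damerau_levenshtein_distance from pyxdameraulevenshtein and a
# Double Metaphone function aliased as dm; the metaphone package's
# doublemetaphone is one possible choice for that alias.
from pyxdameraulevenshtein import damerau_levenshtein_distance
from metaphone import doublemetaphone as dm

candidates = ['hello', 'help', 'halo']
print(spell_score('helo', candidates, method=1))  # lexical distances only
print(spell_score('helo', candidates, method=4))  # combined (2*lex + phon)**2
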
def riclassifica_per_tag(p, tags, classi_da_riclassificare):
    parola = p[0]
    tags.append(parola)
    res = {}
    if len(classi_da_riclassificare) > 0:
        print 'reclassifying by tag only for selected classes'
        # only the classes with a similar or equal distance
        for classe in classi_da_riclassificare:
            dam = []
            for campo_misurato in dictionary[classe]:
                for tag in tags:
                    dam.append(
                        damerau_levenshtein_distance(tag, campo_misurato))
            res[classe] = min(dam)
            # print 'dam', classe, '(', min(dam), ')-->', dam
        return res
    else:
        print 'reclassifying by tag for all classes'
        for classe in dictionary.keys():  # all classes
            # note: dam is a list of lists here, so min() compares lists
            # lexicographically  # TODO !?!?!??!
            dam = [[(damerau_levenshtein_distance(tag, campo_misurato))
                    for tag in tags] for campo_misurato in dictionary[classe]]
            res[classe] = min(dam)
            # print 'dam', classe, '----->', dam
        return res

def noisychannel_ranking(self, detection_list, candidates_list):
    """
    An approximate implementation of the ranking method described in
    (Lai et al. 2015)
    :param detection_list: list of misspellings
    :param candidates_list: list of candidate list per misspelling
    :param frequency_dict: corpus frequencies from training data
    :param k_best: if True, return k highest ranked candidates instead of single one
    :return: list with corrections or k-best corrections
    """
    correction_list = []
    confidences = []
    for misspelling, candidates in zip(detection_list, candidates_list):
        score_list = []
        for candidate in candidates:
            orthographic_edit_distance = damerau_levenshtein_distance(misspelling, candidate)
            phonetic_edit_distance = damerau_levenshtein_distance(dm(misspelling)[0], dm(candidate)[0])
            spell_score = (2 * orthographic_edit_distance + phonetic_edit_distance) ** 2  # P(m|c)
            try:
                frequency = self.frequency_dict[candidate]
            except KeyError:
                frequency = 1
            frequency_score = 1 / (1 + log(frequency))  # P(c)
            score = spell_score * frequency_score  # P(c|m) = P(m|c)*P(c)
            score_list.append(score)
        score_list = np.array(score_list)
        if len(score_list) > 1:
            sorted_distances = [score_list[i] for i in np.argsort(score_list)]
            top1 = sorted_distances[0]
            top2 = sorted_distances[1]
            confidence = abs(top1 - top2) / top1
            confidences.append(confidence)
        else:
            confidences.append(0)
        if self.k == 1:
            try:
                correction_list.append(candidates[np.argmin(score_list)])
            except ValueError:
                correction_list.append('')
        elif self.k > 1:
            correction_list.append([candidates[i] for i in np.argsort(score_list)[:self.k]])
        else:
            raise ValueError('k must be positive natural number')
    self.confidences = confidences
    return correction_list

def merge(ci, cj):
    cluster1 = cluster_list[ci]
    cluster2 = cluster_list[cj]
    # get lcs between patrons and receive difference ids
    pi = cluster1.p  # patron of cluster1
    pj = cluster2.p
    # pi_ids has ids from events in (pi - p)
    (p, pi_ids, pj_ids) = lcs(pi, pj)
    # get common patron
    p_ = copy.deepcopy(p)
    # Candidate Events Ec = (Pi - P) U (Pj - P)
    cE = candidateEvents(pi, pj, pi_ids, pj_ids)
    # Sort Ec by frequency in desc order
    e, f = frecuencySort(cE)  # e: event id, f: frequency
    L = -1
    ## Gi U Gj
    #G_u = np.concatenate(cluster1.g, cluster2.g)
    gi = cluster1.g  # event sequences ids forming cluster1
    gj = cluster2.g
    G_u = gi + gj
    n_edits_gi = sum([
        damerau_levenshtein_distance(list(Secuences[e_]), list(pi)) for e_ in gi
    ])
    n_edits_gj = sum([
        damerau_levenshtein_distance(list(Secuences[e_]), list(pj)) for e_ in gj
    ])
    # Pattern buildup phase
    for e in cE:
        p = add(p, e)
        n_edits_p = sum([
            damerau_levenshtein_distance(list(Secuences[e_]), list(p)) for e_ in G_u
        ])
        L_ = len(pi) + len(pj) - len(p) + alpha * (n_edits_gi) + alpha * (
            n_edits_gj) - alpha * (n_edits_p) + lamda
        if L_ < 0 or L_ < L:
            break
        else:
            L = L_
            p_ = p
    c_ = Cluster(p_, G_u)
    return (L, c_)

def score(network, dataset):
    score = 0.
    for x, y in dataset:
        predict = unconvert(network.activate(x))
        score += damerau_levenshtein_distance(predict, unconvert(y))
    score /= float(len(dataset))
    return score

def is_same(u1, u2):
    """Determine whether D_jk < 0.2 × min(|T_j|, |T_k|)"""
    D_jk = damerau_levenshtein_distance(u1, u2)
    t_j = len(u1)
    t_k = len(u2)
    min_ = min(t_j, t_k)
    return D_jk < 0.2 * min_

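# Hypothetical usage sketch (not part of the original source): two strings are
# treated as "the same" when their Damerau-Levenshtein distance stays under 20%
# of the shorter string's length.
from pyxdameraulevenshtein import damerau_levenshtein_distance

print(is_same('organisation', 'organization'))  # distance 1 < 0.2 * 12 -> True
print(is_same('organisation', 'organism'))      # distance too large -> False
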
def scan(self, filePath):
    '''
    Read the content of filename, extract the comments and preprocess them.
    Find the Damerau-Levenshtein distance between the preprocessed file content
    and the license text.

    :param filePath: Path of the file to scan
    :return: Returns the license's short name with the least Damerau-Levenshtein distance
    '''
    processedData = super().loadFile(filePath)
    temp = exactMatcher(processedData, self.licenseList)
    if temp == -1:
        # Classify the license with minimum distance with scanned file
        globalDistance = sys.maxsize
        result = 0
        for idx in range(len(self.licenseList)):
            distance = damerau_levenshtein_distance(
                processedData.split(" "),
                self.licenseList.iloc[idx]['processed_text'].split(" "))
            if self.verbose > 0:
                print(str(idx) + " " + self.licenseList.iloc[idx]['shortname'] + " " + str(distance))
            if distance < globalDistance:
                globalDistance = distance
                result = idx
        return str(self.licenseList.iloc[result]['shortname'])
    else:
        return temp[0]

def correct(string, typo_dictionary):
    corrections_dict = {}
    min_correct_len = float('inf')
    queue = sorted(list(set([string] + generate_deletes(string, threshold_levensthein))),
                   key=len, reverse=True)
    while len(queue) > 0:
        q_item = queue.pop(0)
        if ((len(corrections_dict) > 0) and ((len(string) - len(q_item)) > min_correct_len)):
            break
        if (q_item in typo_dictionary) and (q_item not in corrections_dict):
            if (typo_dictionary[q_item][1] > 0):
                corrections_dict[q_item] = (typo_dictionary[q_item][1], len(string) - len(q_item))
                if len(string) == len(q_item):
                    break
                elif (len(string) - len(q_item)) < min_correct_len:
                    min_correct_len = len(string) - len(q_item)
            for sc_item in typo_dictionary[q_item][0]:
                if (sc_item not in corrections_dict):
                    if len(q_item) == len(string):
                        item_dist = len(sc_item) - len(q_item)
                    item_dist = damerau_levenshtein_distance(sc_item, string)
                    if item_dist > min_correct_len:
                        pass
                    elif item_dist <= threshold_levensthein:
                        corrections_dict[sc_item] = (typo_dictionary[sc_item][1], item_dist)
                        if item_dist < min_correct_len:
                            min_correct_len = item_dist
                    corrections_dict = {k: v for k, v in corrections_dict.items()
                                        if v[1] <= min_correct_len}
    return corrections_dict

def calcola_precisione_classe(lista_cluster):
    totale_campi = len(lista_cluster)
    classi_attribuite = {}
    for elem in lista_cluster:
        # Add this word's true class to the count of actual classes
        # TODO does the actual class refer to the cluster or to the field ?!!
        count_metriche[classe_reale(elem)]['effettive'] += 1
        res = {}
        for classe in dictionary.keys():
            dam = (damerau_levenshtein_distance(elem, classe))
            res[classe] = dam
        classe_attribuita = min(res, key=res.get)
        count_metriche[classe_attribuita]['classificate'] += 1
        if classe_reale(elem) == classe_attribuita:
            count_metriche[classe_reale(elem)]['classificate_esatte'] += 1
        if classi_attribuite.get(classe_attribuita) is None:
            classi_attribuite[classe_attribuita] = 1
        else:
            classi_attribuite[classe_attribuita] += 1
    classe_piu_frequente = max(classi_attribuite, key=classi_attribuite.get)
    # 100 : totale_campi = x : classi_attribuite[classe_piu_frequente]
    percentuale = (100 * classi_attribuite[classe_piu_frequente]) / totale_campi
    return classe_piu_frequente, percentuale

def calcola_precisione_dizionario(lista_cluster):
    totale_campi = len(lista_cluster)
    classi_attribuite = {}
    # riempi_dizionario('coppieCampoClasse.csv')
    riempi_dizionario('training_set_1.csv')
    for elem in lista_cluster:
        res = {}
        for classe in dictionary.keys():
            dam = [(damerau_levenshtein_distance(elem, campo_misurato))
                   for campo_misurato in dictionary[classe]]
            res[classe] = min(dam)
        classe_attribuita = min(res, key=res.get)
        if classi_attribuite.get(classe_attribuita) is None:
            classi_attribuite[classe_attribuita] = 1
        else:
            classi_attribuite[classe_attribuita] += 1
    classe_piu_frequente = max(classi_attribuite, key=classi_attribuite.get)
    # 100 : totale_campi = x : classi_attribuite[classe_piu_frequente]
    percentuale = (100 * classi_attribuite[classe_piu_frequente]) / totale_campi
    return classe_piu_frequente, percentuale

def rectify(self, word, prev_word):
    """ Speller predictions """
    # choose the number of candidates based on the query length
    self.n_candidates = 350 if len(word) <= 4 else 250 if len(word) <= 7 else self.n_candidates
    # compute the Levenshtein distance for every term
    candidates = sorted(self.words_list.find(word, 7))
    # look for a "good" correction among the top candidates
    # use the modified Damerau-Levenshtein distance (with transpositions)
    # and also prefer the word that introduces the fewest new letters
    suggests = list()
    for _, sugg in candidates[:self.n_candidates]:
        dist = damerau_levenshtein_distance(sugg, word)
        context_list = self.voc_vectorizer.transform([f"{prev_word} {sugg}"]).tocoo().col.tolist()
        if dist <= 5:
            suggs = [(sugg, dist, 0.0)]
            if context_list:
                suggs = [(sugg, dist, self.voc.get(context, 0.0)) for context in context_list]
            suggests.extend(suggs)
    suggests = sorted(suggests, key=lambda tup: tup[1])
    minimal_distance = min(suggest[1] for suggest in suggests)
    candidates = sorted(
        [(suggest[0], suggest[2]) for suggest in suggests
         if suggest[1] == minimal_distance and set(suggest[0]) == set(word)],
        key=lambda tup: -tup[1])
    return candidates[0][0] if candidates and candidates[0][1] > 0 else suggests[0][0]

def makeNet(learning_rate):
    ds = SupervisedDataSet(20, 20)
    with open('data/misspellingssmall.csv', 'rbU') as f:
        reader = csv.reader(f)
        for row in reader:
            ds.addSample(convert(row[0]), convert(row[1]))
    # split off a held-out portion so the scoring loop below has data to evaluate on
    testds, trainds = ds.splitWithProportion(0.2)
    net = buildNetwork(20, 20, 20)
    #trainer = BackpropTrainer(net, dataset=trainds, learningrate=learning_rate)
    trainer = BackpropTrainer(net, dataset=ds, learningrate=learning_rate)
    #trainer.train()
    #trainer.trainEpochs(5)
    trainer.trainUntilConvergence()
    score = 0
    for x, y in testds:
        predict = unconvert(net.activate(x))
        score += damerau_levenshtein_distance(predict, unconvert(y))
    global lastNet
    lastNet = net
    global netNum
    netNum += 1
    print "Network " + str(netNum) + " done with score " + str(score)
    return score

def candidate_search(self, We, threshold):
    candidate = {}
    for Wc in self.dictionary:
        dist = damerau_levenshtein_distance(Wc, We)
        if dist <= threshold:
            candidate[Wc] = self.dictionary[Wc]
    return candidate

def wordsim2(word1, word2):
    '''Return a similarity score between the two words,
    attempting to use a behavioral model of edit distances.
    '''
    dist = damerau_levenshtein_distance(str(word1), str(word2))
    if dist > 3:
        return 0
    edit = editops(str(word1), str(word2))
    return match_model(edit)

def __call__(cls, *args, **kwargs):
    """
    Checks for misprints in argument names
    """
    obj = super(UnitRegistry, cls).__call__(*args, **kwargs)
    if hasattr(cls, "DISABLE_KWARGS_CHECK") or not UnitRegistry.enabled:
        return obj

    def warning(*largs):
        obj.warning(*largs)
        if root.common.trace.misprints:
            obj.warning(
                "Stack trace:\n%s",
                "".join(format_list(extract_stack(
                    inspect.currentframe().f_back.f_back))))

    # Build the matrix of differences
    matrix = {}
    matched = set()
    for given_kwarg in kwargs:
        for kwattr in cls.KWATTRS:
            if (kwattr, given_kwarg) in matrix:
                continue
            matrix[(given_kwarg, kwattr)] = d = \
                damerau_levenshtein_distance(given_kwarg, kwattr)
            if d == 0:
                # perfect match, stop further comparisons
                matched.add(given_kwarg)
                break
    if len(matched) < len(kwargs):
        # Find replacement candidates with distance = 1
        ignored_kwargs = set()
        for given_kwarg in set(kwargs).difference(matched):
            candidates = []
            for kwattr in cls.KWATTRS:
                d = matrix.get((given_kwarg, kwattr))
                if d == 1:
                    candidates.append(kwattr)
            if len(candidates) == 0:
                ignored_kwargs.add(given_kwarg)
            else:
                warning(
                    "Creating %s: potential misprint in keyword argument "
                    "name: expected %s - got %s",
                    obj, " or ".join(candidates), given_kwarg)
        try:
            __IPYTHON__  # pylint: disable=E0602
            from IPython.terminal.interactiveshell import InteractiveShell
            ignored_kwargs -= set(InteractiveShell.instance().user_ns)
        except NameError:
            pass
        if len(ignored_kwargs) > 0:
            warning(
                "Creating %s: ignored the following keyword arguments: %s",
                obj, ", ".join(sorted(ignored_kwargs)))
    return obj

def noisychannel_ranking(self, candidates_list):
    """
    An approximate implementation of the ranking method described in
    Lai et al. (2015), 'Automated Misspelling Detection and Correction
    in Clinical Free-Text Records'
    :param candidates_list: list of candidate list per misspelling
    :return: list with corrections or k-best corrections
    """
    correction_list = []
    confidences = []
    for misspelling, candidates in zip(self.misspellings, candidates_list):
        if not candidates:
            correction_list.append('')
            continue
        score_list = []
        for candidate in candidates:
            orthographic_edit_distance = damerau_levenshtein_distance(
                misspelling, candidate)
            phonetic_edit_distance = damerau_levenshtein_distance(
                dm(misspelling)[0], dm(candidate)[0])
            spell_score = (2 * orthographic_edit_distance +
                           phonetic_edit_distance) ** 2  # P(m|c)
            try:
                frequency = self.frequency_dict[candidate]
            except KeyError:
                frequency = 1
            frequency_score = 1 / (1 + log(frequency))  # P(c)
            score = spell_score * frequency_score  # P(c|m) = P(m|c)*P(c)
            score_list.append(score)
        score_list = np.array(score_list)
        if self.k == 1:
            try:
                correction_list.append(candidates[np.argmin(score_list)])
            except ValueError:
                correction_list.append('')
        else:
            correction_list.append(
                [candidates[i] for i in np.argsort(score_list)[:self.k]])
    return correction_list

def best(string, typo_dictionary):
    for word in attention_words:
        if damerau_levenshtein_distance(word, string) <= threshold_levensthein:
            return word
    try:
        as_list = correct(string, typo_dictionary).items()
        outlist = sorted(as_list, key=lambda item: (item[1][1], -item[1][0]))
        return outlist[0][0]
    except:
        return string

def classificate(stream, x, y):
    # res is a dict that will hold, for each word under test, every class
    # mapped to the minimum distance found (against the words that 'compose' it)
    res = {}
    # **************** FIRST CLASSIFICATION *****************
    # For each class, compute the distance between the words it is composed of
    # and the word under test
    for classe in dictionary.keys():
        # For each term in the dictionary for each class save the DL distance,
        # then pick the minimum per class
        dam = [(damerau_levenshtein_distance(stream, campo_misurato))
               for campo_misurato in dictionary[classe]]  # array of distances per class
        if len(dam) > 0:
            res[classe] = min(dam)  # pick the minimum distance for each class
        # else:
        #     res[classe] = 50  # to be changed, but for now it avoids problems with classes that have no words
    distanza_minima = res[min(res, key=res.get)]
    classi_con_stessa_distanza_minima = []
    # fill a list to check whether the minimum distance found is duplicated
    for key, value in res.iteritems():  # TODO handle cases with equal distances !!
        if value == distanza_minima:
            # print 'distanza minima =', key
            classi_con_stessa_distanza_minima.append(key)
    if distanza_minima == 0:
        # TODO not sure whether to reclassify -> it may yield the same result
        if len(classi_con_stessa_distanza_minima) > 1:
            # more than one class was found with distance 0 -> reclassify for those classes
            #XXX res = riclassifica_per_tag(p, tags, classi_con_stessa_distanza_minima)
            res = res  # remove this
    else:
        """
        At this point, two conditions are checked:
        - whether the minimum distance found across all classes is greater than x% of len(stream)
        - whether there are two very similar distances whose difference is y% of the length
        """
        percent_lunghezza = (len(stream) * x) / 100
        # if the condition is not met, the minimum is accepted as good
        if distanza_minima > percent_lunghezza:
            # reclassify only for some classes !?
            # TODO find the classes with distances similar to the minimum distance
            # add to the similar-minimum-distance list the classes with different but similar distances
            for classe, dist in res.iteritems():
                diff = (abs((distanza_minima - dist)) * y) / 100
                if diff < percent_lunghezza and (dist != distanza_minima):
                    classi_con_stessa_distanza_minima.append(classe)
            #XXX res = riclassifica_per_tag(p, tags, classi_con_stessa_distanza_minima)
    # Finally decide the class and check whether it is right or wrong
    classe_attribuita = min(res, key=res.get)
    return classe_attribuita

def wordsim1(word1, word2):
    '''Return a similarity score between the two words.
    TODO: stem the words
    '''
    probs = [1, 0.7669349429912811, 0.1784037558685446, 0.03386988598256204,
             0.015090543259557344, 0.004024144869215292, 0.001676727028839705]
    dist = damerau_levenshtein_distance(str(word1), str(word2))
    if dist < len(probs):
        return probs[dist]
    else:
        return 0

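# Hypothetical usage sketch (not part of the original source): wordsim1() maps
# the Damerau-Levenshtein distance onto a fixed probability table, so the
# similarity drops quickly as the edit distance grows.
from pyxdameraulevenshtein import damerau_levenshtein_distance

print(wordsim1('colour', 'color'))       # distance 1 -> ~0.77
print(wordsim1('cat', 'hippopotamus'))   # distance >= 7 -> 0
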
def get_candidates(self, word):
    """
    Damerau-Levenshtein edit distance is used to generate a candidate set of this word.
    :param word: source word used to generate a candidate set
    :return: the candidate set of this word
    """
    candidates = dict()
    for word_list_item in self.vocab_list:
        edit_distance = damerau_levenshtein_distance(word, word_list_item)
        if edit_distance <= 1:
            candidates[word_list_item] = edit_distance
    return sorted(candidates, key=candidates.get, reverse=False)

def rectify(self, word, prev_word):
    """ Speller predictions """
    if word == ',,':
        return ','
    if word == '..':
        return '...'
    # the query converted into character n-grams
    char_ngrams_list = self.vectorizer.transform([word]).tocoo().col
    # choose the number of candidates based on the query length
    self.n_candidates = 350 if len(word) <= 4 else 250 if len(word) <= 7 else self.n_candidates
    # count the n-gram overlap for every term
    counter = Counter()
    for token_id in char_ngrams_list:
        for word_id in self.index[token_id]:
            counter[word_id] += 1
    # look for a "good" correction among the top n-gram matches
    # use the modified Levenshtein distance (with transpositions)
    # and also prefer the word that introduces the fewest new letters
    suggests = list()
    for suggest in counter.most_common(n=self.n_candidates):
        sugg = self.words_list[suggest[0]]
        dist = damerau_levenshtein_distance(sugg, word)
        context_list = self.voc_vectorizer.transform([f"{prev_word} {sugg}"]).tocoo().col.tolist()
        if dist <= 5:
            suggs = [(sugg, dist, 0.0)]
            if context_list:
                suggs = [(sugg, dist, self.voc.get(context, 0.0)) for context in context_list]
            suggests.extend(suggs)
    suggests = sorted(suggests, key=lambda tup: tup[1])
    minimal_distance = min(suggest[1] for suggest in suggests)
    candidates = sorted(
        [(suggest[0], suggest[2]) for suggest in suggests
         if suggest[1] == minimal_distance and set(suggest[0]) == set(word)],
        key=lambda tup: -tup[1])
    return candidates[0][0] if candidates and candidates[0][1] > 0 else suggests[0][0]

def compare(a, b):
    results = {
        'editdistance': editdistance.eval(a, b),
        'pylev': pylev.levenshtein(a, b),
        'python-Levenshtein': Levenshtein.distance(a, b),
        'pyxdameraulevenshtein': pyxdameraulevenshtein.damerau_levenshtein_distance(a, b),
    }
    return results

def calculate_similarity(docs_list, similarity_type, threshold):
    ''' Calculate vector similarity of all possible pairs in list '''
    results = list()
    counter = 0
    # Get all possible combinations of tweets that have same NER
    all_combinations = list(combinations(docs_list, 2))
    # Filter handles, hashtags, emoticons, etc.
    for tweet_pair in all_combinations:
        tweet_pair[0].filter("*")
        tweet_pair[1].filter("*")
        # Filter out pairs with exact sentences
        if tweet_pair[0].clean_text != tweet_pair[1].clean_text:
            # Filter out sentences shorter than 4 words
            if tweet_pair[0].tweet_len() > 3 and tweet_pair[1].tweet_len() > 3:
                # Filter out those combinations with excessive word number differences
                if abs(tweet_pair[0].tweet_len() - tweet_pair[1].tweet_len()) < 4:
                    if similarity_type == "jaccard":
                        settext1 = tweet_pair[0].word_set()
                        settext2 = tweet_pair[1].word_set()
                        d = jaccard_distance(settext1, settext2)
                    if similarity_type == "jaro_winkler":
                        d = 1 - distance.jaro_winkler_similarity(tweet_pair[0].clean_text,
                                                                 tweet_pair[1].clean_text)
                    if similarity_type == "levenshtein":
                        d = damerau_levenshtein_distance(tweet_pair[0].clean_text,
                                                         tweet_pair[1].clean_text)
                    # Only return those results above the threshold
                    if d < threshold:
                        # Put in source sentences with more oov words and extra filter target
                        if tweet_pair[0].oov_words() > tweet_pair[1].oov_words():
                            bi_combination = tweet_pair[0].source_filter(), tweet_pair[1].target_filter()
                        else:
                            bi_combination = tweet_pair[1].source_filter(), tweet_pair[0].target_filter()
                        if bi_combination not in results:
                            results.append(bi_combination)
                            counter += 1
                            sys.stdout.write(f"\rAdding combinations...")
                            sys.stdout.flush()
    return results

def dl_ngram_dist(ngram1, ngram2):
    """
    Compute distance between ngrams by summing the Damerau-Levenshtein distance
    for consecutive words in ngrams.

    Params:
        ngram1: [tuple] Tuple of words.
        ngram2: [tuple] Tuple of words.

    Returns:
        distance [int] Measure of distance between two ngrams.
    """
    return sum(damerau_levenshtein_distance(w1, w2)
               for w1, w2 in zip(ngram1, ngram2))

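# Hypothetical usage sketch (not part of the original source): the word-by-word
# sum only compares aligned positions, so ngrams of different lengths are
# truncated to the shorter one by zip().
from pyxdameraulevenshtein import damerau_levenshtein_distance

print(dl_ngram_dist(('new', 'york', 'city'), ('new', 'yrok', 'city')))  # 1 (one transposition)
print(dl_ngram_dist(('quick', 'fox'), ('quick', 'dog')))                # 2
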
def convert_candidates(metaphone_candidates, detection, metaphone_dict):
    """
    :param metaphone_candidates: replacement candidates in Double Metaphone representation
    :param detection: misspelling
    :param metaphone_dict: output of load_metaphones()
    :return: candidates converted from Double Metaphone representation to normal lexical representation
    """
    converted_candidates = []
    for candidate in metaphone_candidates:
        for item in metaphone_dict[candidate]:
            # have at least one character in common
            if len(set(item).intersection(set(candidate))) >= 1:
                if damerau_levenshtein_distance(item, detection) <= 3:  # enough overlap
                    converted_candidates.append(item)
    return converted_candidates

def calculate_global_dissimilarity_score(test_fingerprint, sampled_fingerprints):
    # store scores between each (test_fingerprint, sampled_fingerprint) pair
    scores_list = []
    test_fingerprint_word, sampled_fingerprint_word_list = fingerprint2word(
        test_fingerprint, sampled_fingerprints)
    for sampled_fingerprint_word in sampled_fingerprint_word_list:
        distance = damerau_levenshtein_distance(sampled_fingerprint_word,
                                                test_fingerprint_word)
        scores_list.append(distance)
    # normalize each distance by the largest one observed (element-wise, since
    # scores_list is a plain Python list)
    normalized_scores_list = [score / float(max(scores_list)) for score in scores_list]
    global_score = sum(normalized_scores_list)
    return global_score

def compute_distances(file_path):
    hashes_dic = read_fuzzies("ris_androdump_safe/hashes.txt")
    hashes_dic_db = read_fuzzies("ris_androdump_safe/hashes_database.txt")
    with open(file_path, "w") as h_file:
        for apk_path, fuzzies_list in hashes_dic.items():
            for apk_path2, fuzzies_list2 in hashes_dic_db.items():
                sim_list = list()
                fam = apk_path.split("/")[-2]
                fam2 = apk_path2.split("/")[-2]
                if fam != fam2:
                    for fuzzy in fuzzies_list:
                        for fuzzy2 in fuzzies_list2:
                            # Compute edit distance between two sub-fuzzies
                            dist = damerau_levenshtein_distance(fuzzy, fuzzy2)
                            if 0 < dist < 5:
                                sim_list.append((dist, (fuzzy, fuzzy2)))
                    # write down couple with delimiter
                    couple_str = fam + "/" + apk_path.split("/")[-1] + "@" + fam2 + "/" + apk_path2.split("/")[-1]
                    h_file.write(str((couple_str, sim_list)) + "\n")

def processtxn(txn, choices):
    maxscoreJ = 0
    matchstrJ = ""
    maxscoreDL = 0
    matchstrDL = ""
    maxscoreNDL = 0
    matchstrNDL = ""
    for c in choices:
        scoreJ = jaro.jaro_metric(txn, c)
        scoreDL = 1000 - damerau_levenshtein_distance(txn, c)
        scoreNDL = 1 - normalized_damerau_levenshtein_distance(txn, c)
        if scoreJ > maxscoreJ:
            matchstrJ = c
            maxscoreJ = scoreJ
        if scoreDL > maxscoreDL:
            matchstrDL = c
            maxscoreDL = scoreDL
        if scoreNDL > maxscoreNDL:
            matchstrNDL = c
            maxscoreNDL = scoreNDL
    return {'jaro': matchstrJ, 'dl': matchstrDL, 'ndl': matchstrNDL}

def CorrectSpelling(speech, vocab_init, vocab_endanlegt):
    """Use Damerau-Levenshtein distance to correct the spelling in the intermediate texts"""
    for word in vocab_init:
        #word_dict={}
        replaced = 0
        for w_endanlegt in vocab_endanlegt:
            #dist=MinEditDist(word,w_endanlegt)
            dist = damerau_levenshtein_distance(word, w_endanlegt)
            if dist == 1:
                speech = re.sub(r"\b%s\b" % word, w_endanlegt, speech)
                replaced = 1
                break
    #         else:
    #             word_dict[dist]=w_endanlegt
    #     # Need to find the min dist and substitute if not already substituted
    #     if replaced == 0:
    #         speech = re.sub(r"\b%s\b" % word,word_dict[min(word_dict,key=int)],speech)
    return speech

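# Hypothetical usage sketch (not part of the original source): vocab_init holds
# the vocabulary of the intermediate text and vocab_endanlegt the target
# vocabulary; words at Damerau-Levenshtein distance 1 from a target word are
# substituted in place.
import re
from pyxdameraulevenshtein import damerau_levenshtein_distance

speech = "thit is a smal test"
print(CorrectSpelling(speech, ["thit", "smal"], ["this", "small", "test"]))
# -> "this is a small test"
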
def levenshtein_candidates(word, vocab_dict, editdistance=2):
    """
    Generates candidates
    :param word: the misspelling for which to generate replacement candidates
    :param vocab_dict: the output of load_vocab()
    :param editdistance: the maximum Damerau-Levenshtein edit distance
    :return: list of unique candidate words within the edit distance threshold
    """
    candidates = []
    word_len = len(word)
    set_len = len(set(word))
    if word_len <= 2:
        word_lengths = range(word_len, word_len + 1 + editdistance)
    else:
        word_lengths = range(word_len - editdistance, word_len + 1 + editdistance)
    if set_len - editdistance > 0:
        set_lengths = range(set_len - editdistance, set_len + 1 + editdistance)
    else:
        set_lengths = range(set_len, set_len + 1 + editdistance)
    selection = []
    for i in word_lengths:
        key = vocab_dict[i]
        for j in set_lengths:
            selection += key[j]
    for item in set(selection):
        if damerau_levenshtein_distance(word, item) <= editdistance:
            candidates.append(item)
    full_candidates = list(set(candidates))
    return full_candidates

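# Hypothetical usage sketch (not part of the original source): load_vocab() is
# not shown here, so the expected nested structure is built by hand -- an outer
# mapping keyed on word length whose values map the number of distinct
# characters to a list of vocabulary words.
from collections import defaultdict
from pyxdameraulevenshtein import damerau_levenshtein_distance

vocab_dict = defaultdict(lambda: defaultdict(list))
for vocab_word in ['patient', 'patients', 'patience', 'present', 'parent']:
    vocab_dict[len(vocab_word)][len(set(vocab_word))].append(vocab_word)

print(levenshtein_candidates('patiant', vocab_dict))
# -> ['patient', 'patients'] (order may vary)
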
def calculate_global_dissimilarity_score(test_fingerprint, sampled_fingerprints):
    # store scores between each (test_fingerprint, sampled_fingerprint) pair
    scores_list = np.array([])
    test_fingerprint_word, sampled_fingerprint_word_list = fingerprint2word(
        test_fingerprint, sampled_fingerprints)
    cheat_flag = False  # true if the testing sample itself is sampled
    for sampled_fingerprint_word in sampled_fingerprint_word_list:
        distance = damerau_levenshtein_distance(sampled_fingerprint_word,
                                                test_fingerprint_word)
        if distance == 0:
            cheat_flag = True
        scores_list = np.append(scores_list, distance)
    normalized_scores_list = scores_list / float(max(scores_list))
    global_score = sum(normalized_scores_list)
    return global_score, cheat_flag

def rectify(self, word):
    """ Speller predictions """
    # the query converted into character n-grams
    char_ngrams_list = self.vectorizer.transform([word]).tocoo().col
    # choose the number of candidates based on the query length
    self.n_candidates = 350 if len(word) <= 4 else 250 if len(word) <= 7 else self.n_candidates
    # count the n-gram overlap for every term
    counter = Counter()
    for token_id in char_ngrams_list:
        for word_id in self.index[token_id]:
            counter[word_id] += 1
    # look for a "good" correction among the top n-gram matches
    # use the modified Levenshtein distance (with transpositions)
    # and also prefer the word that introduces the fewest new letters
    suggests = list()
    for suggest in counter.most_common(n=self.n_candidates):
        sugg = self.words_list[suggest[0]]
        dl_distance = damerau_levenshtein_distance(sugg, word)
        fitted_sugg_list = self.voc_vectorizer.transform([sugg]).tocoo().col
        if dl_distance <= 5:
            suggests.append((sugg, dl_distance,
                             self.voc[fitted_sugg_list[0]] if fitted_sugg_list else 0))
    suggests = sorted(suggests, key=lambda tup: tup[1])
    minimal_distance = min([suggest[1] for suggest in suggests])
    swap_words = sorted(
        [(suggest[0], suggest[2]) for suggest in suggests
         if suggest[1] == minimal_distance and set(suggest[0]) == set(word)],
        key=lambda tup: -tup[1])
    return swap_words[0][0] if swap_words and swap_words[0][1] > 0 else suggests[0][0]

def classificazione(x, y):
    parole = get_words('test_set_1.csv')
    for p in parole:
        # Extract parts of the record
        parola = p[0]
        classe_effettiva = p[1]
        if len(p) > 4:
            tags = []
            for i in range(4, len(p)):
                tags.append(p[i])
        # res is a dict that will hold, for each word under test, every class
        # mapped to the minimum distance found (against the words that 'compose' it)
        res = {}
        # **************** FIRST CLASSIFICATION *****************
        # For each class, compute the distance between the words it is composed of
        # and the word under test
        for classe in dictionary.keys():
            if classe == classe_effettiva:
                # for each class in the dictionary, count how many words really belong to it
                count_metriche[classe]['effettive'] += 1
            # For each term in the dictionary for each class save the DL distance,
            # then pick the minimum per class
            dam = [(damerau_levenshtein_distance(p[0], campo_misurato))
                   for campo_misurato in dictionary[classe]]  # array of distances per class
            res[classe] = min(dam)  # pick the minimum distance for each class
        distanza_minima = res[min(res, key=res.get)]
        classi_con_stessa_distanza_minima = []
        # fill a list to check whether the minimum distance found is duplicated
        print 'WORD', parola, 'CLASS', classe_effettiva, '-', x, '%', (len(parola) * x) / 100, 'minimum distance:', distanza_minima
        for key, value in res.iteritems():  # TODO handle cases with equal distances !!
            if value == distanza_minima:
                # print 'distanza minima =', key
                classi_con_stessa_distanza_minima.append(key)
        lista_distanze = []
        for c in res.keys():
            lista_distanze.append(res[c])
        if distanza_minima == 0:
            # print 'LA DISTANZA MINIMA è 0'
            # TODO not sure whether to reclassify -> it may yield the same result
            if lista_distanze.count(0) > 1:
                # more than one class was found with distance 0 -> reclassify for those classes
                #XXX res = riclassifica_per_tag(p, tags, classi_con_stessa_distanza_minima)
                res = res  # remove this
        else:
            # print 'LA DISTANZA MINIMA NON è 0'
            """
            At this point, two conditions are checked:
            - whether the minimum distance found across all classes is greater than x% of len(parola)
            - whether there are two very similar distances whose difference is y% of the length
            """
            percent_lunghezza = (len(parola) * x) / 100
            if distanza_minima > percent_lunghezza:
                # reclassify only for some classes !?
                # TODO find the classes with distances similar to the minimum distance
                # add to the similar-minimum-distance list the classes with different but similar distances
                for classe, dist in res.iteritems():
                    diff = (abs((distanza_minima - dist)) * y) / 100
                    if diff < percent_lunghezza and (dist != distanza_minima):
                        classi_con_stessa_distanza_minima.append(classe)
                #XXX res = riclassifica_per_tag(p, tags, classi_con_stessa_distanza_minima)
        # Finally decide the class and check whether it is right or wrong
        classe_attribuita = min(res, key=res.get)
        count_metriche[classe_attribuita]['classificate'] += 1
        if classe_effettiva == classe_attribuita:
            count_metriche[classe_attribuita]['classificate_esatte'] += 1
        print 'ASSIGNED CLASS', classe_attribuita, 'distance', res[classe_attribuita]
    return count_metriche

                                   'effettive': 0, 'classificate_esatte': 0}

file2 = open('risultatoDictionary.csv', 'a+')

# For each word to test, compute the distance between that word and every
# field name stored in the dictionary
for p in parole:
    classe_effettiva = p[1]
    # res is a dict that will hold, for each word under test, every class
    # mapped to the minimum distance found (against the words that 'compose' it)
    res = {}
    # For each class, compute the distance between the words it is composed of
    # and the word under test
    for classe in dictionary.keys():
        if classe == classe_effettiva:
            # among the whole word set, how many really belong to this class
            count_metriche[classe]['effettive'] += 1
        dam = [(damerau_levenshtein_distance(p[0], campo_misurato))
               for campo_misurato in dictionary[classe]]  # array of distances per class
        res[classe] = min(dam)
        # if len(dam) != 0:
        #     res[classe] = min(dam)
    # print res
    classe_attribuita = min(res, key=res.get)
    # print 'Classe attribuita -->', classe_attribuita, '\n'
    # how many times this class is assigned (right or wrong)
    count_metriche[classe_attribuita]['classificate'] += 1
    # file2.write(str(p[0]) + ',' + ',' + str(p[1]) + ',' + str(min(res, key=res.get)) + '\n')
    if classe_effettiva == classe_attribuita:
        count_metriche[classe_attribuita]['classificate_esatte'] += 1

print count_metriche

def text_edit_ratio(doc, method=u'text_edit_ratio', ground_truth=None,
                    xml_in=True, gt_format='tei', clean_in=True, clean_gt=True,
                    divert=True):
    """
    Calculates the similarity of the input documents and a given ground truth
    using the Damerau-Levenshtein distance. The result is a value between 0.0
    (no commonality) and 1.0 (identical strings).

    Args:
        doc (unicode, unicode): The input document tuple
        method (unicode): The suffix string appended to the output file.
        ground_truth (unicode): Ground truth location tuple or a list of ground
                                truths to choose from. When more than one is
                                given, the file sharing the longest prefix with
                                the input document is chosen.
        xml_in (bool): Switch to treat input as an TEI-XML document.
        gt_format (unicode): Switch to select ground truth format. Valid values
                             are 'tei', 'hocr', and 'text'.
        clean_in (bool): Normalize to NFD and strip input data. (DO NOT DISABLE!)
        clean_gt (bool): Normalize to NFD and strip ground truth. (DO NOT DISABLE!)
        divert (bool): Switch selecting output diversion. If enabled the output
                       will be added to the tracking arguments and the input
                       document will be returned as the result of the task. Use
                       this to insert a statistical measure into a chain without
                       affecting the results.

    Returns:
        (unicode, unicode): Storage tuple of the output document
    """
    input_path = storage.get_abs_path(*doc[0])
    output_path = storage.insert_suffix(input_path, method,
                                        os.path.basename(input_path))
    if not isinstance(ground_truth[0], basestring):
        ground_truth = find_matching(doc, ground_truth)
    with storage.StorageFile(*ground_truth) as fp:
        if gt_format == 'tei':
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            gt = t.getvalue()
        elif gt_format == 'hocr':
            gt = html.parse(fp).text_content()
        elif gt_format == 'text':
            gt = fp.read()
        else:
            raise NidabaInvalidParameterException('Input format ' + gt_format + ' unknown.')
    with storage.StorageFile(*doc) as fp:
        if xml_in:
            tei = TEIFacsimile()
            tei.read(fp)
            t = StringIO.StringIO()
            tei.write_text(t)
            text = t.getvalue()
        else:
            text = fp.read()
    if clean_in:
        text = cleanup(text)
    if clean_gt:
        gt = cleanup(gt)
    logger.debug('Recognition result: \n{}'.format(text))
    logger.debug('Ground truth: \n{}'.format(gt))
    edist = 1.0 - normalized_damerau_levenshtein_distance(text, gt)
    logger.debug('Edit distance: {}'.format(damerau_levenshtein_distance(text, gt)))
    logger.debug('Accuracy: {}'.format(edist))
    if not divert:
        storage.write_text(*storage.get_storage_path(output_path),
                           text=unicode(edist))
        return output_path
    else:
        return {'edit_ratio': edist, 'ground_truth': ground_truth, 'doc': doc}

def error_prob(self, error, poss):
    dist = damerau_levenshtein_distance(error, poss)
    prob = 1 / (2 ** dist)
    return prob

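# Hypothetical usage sketch (not part of the original source): the error model
# halves the probability for every additional edit, i.e. P(error | poss) = 1 / 2**d.
# (On Python 2 the integer division would truncate to 0, so this assumes Python 3.)
from pyxdameraulevenshtein import damerau_levenshtein_distance

print(error_prob(None, 'teh', 'the'))      # distance 1 -> 0.5
print(error_prob(None, 'thier', 'their'))  # distance 1 -> 0.5
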
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""
from pyxdameraulevenshtein import damerau_levenshtein_distance, normalized_damerau_levenshtein_distance
import random
import string
import timeit

print('#edit distances (low edit distance means words are similar):')
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('smtih', 'smith', damerau_levenshtein_distance('smtih', 'smith')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('snapple', 'apple', damerau_levenshtein_distance('snapple', 'apple')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('testing', 'testtn', damerau_levenshtein_distance('testing', 'testtn')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('saturday', 'sunday', damerau_levenshtein_distance('saturday', 'sunday')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('Saturday', 'saturday', damerau_levenshtein_distance('Saturday', 'saturday')))
print("damerau_levenshtein_distance('%s', '%s') = %d" % ('orange', 'pumpkin', damerau_levenshtein_distance('orange', 'pumpkin')))
print("damerau_levenshtein_distance('%s', '%s') = %d #unicode example\n" % ('Sjöstedt', 'Sjostedt', damerau_levenshtein_distance('Sjöstedt', 'Sjostedt')))  #unicode example

print('#normalized edit distances (low ratio means words are similar):')
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('smtih', 'smith', normalized_damerau_levenshtein_distance('smtih', 'smith')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('snapple', 'apple', normalized_damerau_levenshtein_distance('snapple', 'apple')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('testing', 'testtn', normalized_damerau_levenshtein_distance('testing', 'testtn')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('saturday', 'sunday', normalized_damerau_levenshtein_distance('saturday', 'sunday')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('Saturday', 'saturday', normalized_damerau_levenshtein_distance('Saturday', 'saturday')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f" % ('orange', 'pumpkin', normalized_damerau_levenshtein_distance('orange', 'pumpkin')))
print("normalized_damerau_levenshtein_distance('%s', '%s') = %f #unicode example\n" % ('Sjöstedt', 'Sjostedt', normalized_damerau_levenshtein_distance('Sjöstedt', 'Sjostedt')))  #unicode example

def test_damerau_levenshtein_distance(self):
    assert damerau_levenshtein_distance('smtih', 'smith') == 1
    assert damerau_levenshtein_distance('snapple', 'apple') == 2
    assert damerau_levenshtein_distance('testing', 'testtn') == 2
    assert damerau_levenshtein_distance('saturday', 'sunday') == 3
    assert damerau_levenshtein_distance('Saturday', 'saturday') == 1
    assert damerau_levenshtein_distance('orange', 'pumpkin') == 7
    assert damerau_levenshtein_distance('gifts', 'profit') == 5
    assert damerau_levenshtein_distance('Sjöstedt', 'Sjostedt') == 1
    assert damerau_levenshtein_distance([1, 2, 3], [1, 3, 2]) == 1
    assert damerau_levenshtein_distance((1, 2, 3), (1, 3, 2)) == 1
    assert damerau_levenshtein_distance((1, 2, 3), [1, 3, 2]) == 1
    assert damerau_levenshtein_distance([], []) == 0
    assert damerau_levenshtein_distance(list(range(10)), list(range(1, 11))) == 2
    assert damerau_levenshtein_distance([1, 2, 3, 4, 5, 6], [7, 8, 9, 7, 10, 11, 4]) == 7

def descr_damerau_levenshtein(row):
    return damerau_levenshtein_distance(row['description_1'], row['description_2'])

def title_damerau_levenshtein(row):
    return damerau_levenshtein_distance(row['title_1'], row['title_2'])

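# Hypothetical usage sketch (not part of the original source): both helpers
# expect a mapping-like row (e.g. a dict or pandas Series) and are typically
# applied row-wise, for instance with DataFrame.apply(..., axis=1).
from pyxdameraulevenshtein import damerau_levenshtein_distance

row = {'title_1': 'iphone 7 32gb', 'title_2': 'iphone 7 64gb',
       'description_1': 'brand new', 'description_2': 'brand new, boxed'}
print(title_damerau_levenshtein(row))  # 2
print(descr_damerau_levenshtein(row))  # 7
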
def score_g():
    for row in row_g:
        row[target] = damerau_levenshtein_distance(row[column], string)
        yield row

def edit_distance_norm(word1, word2):
    dmdist = float(damerau_levenshtein_distance(word1, word2))
    #return dmdist / (float((len(word1) + len(word2))) / 2)  # mean of lengths
    #return dmdist  # pure
    #return dmdist / (float(max(len(word1), len(word2))))  # max
    return dmdist / (float((len(word1) + len(word2))))

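# Hypothetical usage sketch (not part of the original source): the active
# variant normalises by the sum of both lengths, giving smaller values than the
# max-length or mean-length normalisations kept above as comments.
from pyxdameraulevenshtein import damerau_levenshtein_distance

print(edit_distance_norm('kitten', 'sitting'))  # 3 / 13 ~= 0.23
print(edit_distance_norm('abc', 'abc'))         # 0.0
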
        if classi_attribuite.get(classe_attribuita) is None:
            classi_attribuite[classe_attribuita] = 1
        else:
            classi_attribuite[classe_attribuita] += 1
    classe_piu_frequente = max(classi_attribuite, key=classi_attribuite.get)
    # 100 : totale_campi = x : classi_attribuite[classe_piu_frequente]
    percentuale = (100 * classi_attribuite[classe_piu_frequente]) / totale_campi
    return classe_piu_frequente, percentuale


# Affinity propagation
words = np.asarray(parole)  # So that indexing with a list will work
dam = np.array([[(damerau_levenshtein_distance(w1, w2)) for w1 in words]
                for w2 in words])
distance_matrix = dam  # matrix with the distances
affinity_matrix = 1 - distance_matrix

# AFFINITY PROPAGATION CLUSTERING #
mymat = -1 * distance_matrix
print mymat

# Perform Affinity Propagation Clustering of data
affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed", damping=0.5)
# Create affinity matrix from negative euclidean distances, then apply affinity
# propagation clustering.
affprop.fit(mymat)
percentuali = 0
for cluster_id in np.unique(affprop.labels_):
    exemplar = words[affprop.cluster_centers_indices_[cluster_id]]

import numpy as np  # from numpy package
import sklearn.cluster  # from sklearn package
import distance  # from distance package
import jaro
from pyxdameraulevenshtein import damerau_levenshtein_distance
import sys
from bozza import words  # TODO remove

#words = []
# words = np.asarray(words)  # So that indexing with a list will work
words = np.asarray(words)  # So that indexing with a list will work
dam = np.array([[(damerau_levenshtein_distance(w1, w2)) for w1 in words]
                for w2 in words])  # Damerau-Levenshtein
distance_matrix = dam  # matrix with the distances
affinity_matrix = 1 - distance_matrix

## AFFINITY PROPAGATION CLUSTERING ##
mymat = -1 * distance_matrix
print mymat

# Perform Affinity Propagation Clustering of data
affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed", damping=0.5)
# Create affinity matrix from negative euclidean distances, then apply affinity
# propagation clustering.
affprop.fit(mymat)
for cluster_id in np.unique(affprop.labels_):
    exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
    cluster = np.unique(words[np.nonzero(affprop.labels_ == cluster_id)])
    cluster_str = ", ".join(cluster)
    #print(" - *%s:* %s" % (exemplar, cluster_str))  # (exemplar, cluster_str))
    print exemplar + "-----------"

def classificazione(test_set, x, y, number, current_prefix):
    l = len(test_set) - 1
    progress = 0
    for stream in test_set:
        printProgress(progress, l, prefix=current_prefix, suffix='Complete', barLength=50)
        field = stream["field_name"].strip().replace(' ', '_')
        #print "Classifico field: " + field
        # res is a dict that will hold, for each word under test, every class
        # mapped to the minimum distance found (against the words that 'compose' it)
        res = {}
        # **************** FIRST CLASSIFICATION *****************
        # For each class, compute the distance between the words it is composed of
        # and the word under test
        for classe in dictionary.keys():
            if classe == stream["field_class"]:
                # for each class in the dictionary, count how many words really belong to it
                count_metriche[classe]['effettive'] += 1
            # For each term in the dictionary for each class save the DL distance,
            # then pick the minimum per class
            dam = [(damerau_levenshtein_distance(field, campo_misurato))
                   for campo_misurato in dictionary[classe]]  # array of distances per class
            if len(dam) > 0:
                res[classe] = min(dam)  # pick the minimum distance for each class
            # else:
            #     res[classe] = 50  # to be changed, but for now it avoids problems with classes that have no words
        distanza_minima = res[min(res, key=res.get)]
        classi_con_stessa_distanza_minima = []
        # fill a list to check whether the minimum distance found is duplicated
        for key, value in res.iteritems():  # TODO handle cases with equal distances !!
            if value == distanza_minima:
                # print 'distanza minima =', key
                classi_con_stessa_distanza_minima.append(key)
        '''
        lista_distanze = []
        for c in res.keys():
            lista_distanze.append(res[c])
        print "lista_distanze", lista_distanze
        '''
        if distanza_minima == 0:
            # TODO not sure whether to reclassify -> it may yield the same result
            if len(classi_con_stessa_distanza_minima) > 1:
                # more than one class was found with distance 0 -> reclassify for those classes
                #XXX res = riclassifica_per_tag(p, tags, classi_con_stessa_distanza_minima)
                res = res  # remove this
        else:
            """
            At this point, two conditions are checked:
            - whether the minimum distance found across all classes is greater than x% of len(field)
            - whether there are two very similar distances whose difference is y% of the length
            """
            percent_lunghezza = (len(field) * x) / 100
            # if the condition is not met, the minimum is accepted as good
            if distanza_minima > percent_lunghezza:
                # reclassify only for some classes !?
                # TODO find the classes with distances similar to the minimum distance
                # add to the similar-minimum-distance list the classes with different but similar distances
                for classe, dist in res.iteritems():
                    diff = (abs((distanza_minima - dist)) * y) / 100
                    if diff < percent_lunghezza and (dist != distanza_minima):
                        classi_con_stessa_distanza_minima.append(classe)
                #XXX res = riclassifica_per_tag(p, tags, classi_con_stessa_distanza_minima)
        # Finally decide the class and check whether it is right or wrong
        classe_attribuita = min(res, key=res.get)
        count_metriche[classe_attribuita]['classificate'] += 1
        if stream["field_class"] == classe_attribuita:
            count_metriche[classe_attribuita]['classificate_esatte'] += 1
        y_real.append(stream["field_class"])
        y_assigned.append(classe_attribuita)
        #print 'CLASSE ATTRIBUITA', classe_attribuita, 'distanza', res[classe_attribuita]
        db.store_classification(stream, classe_attribuita, res[classe_attribuita], number,
                                "ClassificationRelevant")
        progress = progress + 1
