import sys
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
from similarity.normalized_levenshtein import NormalizedLevenshtein


def main():
    if len(sys.argv) != 2:
        print("Usage: python ML-Agglomerative.py path-name")
        sys.exit()
    path = sys.argv[1]  # "../ames/test.csv"
    df = pd.read_csv(path)
    arr1 = df.columns.values
    arr2 = df.columns.values
    NL = NormalizedLevenshtein()
    # Pairwise normalized Levenshtein distances between column names
    vectors = pd.DataFrame([[NL.distance(i, j) for j in arr2] for i in arr1])
    clusters = 5
    agg = AgglomerativeClustering(affinity='euclidean',
                                  compute_full_tree='auto',
                                  connectivity=None,
                                  linkage='ward',
                                  memory=None,
                                  n_clusters=clusters,
                                  pooling_func='deprecated').fit(vectors)
    for i in range(clusters):
        print("Cluster " + str(i) + ":")
        print(pd.Series(arr1[agg.labels_ == i]))
        print('\n')
def simDif(w1, w2):
    normalized_levenshtein = NormalizedLevenshtein()
    dis = normalized_levenshtein.distance(w1, w2)
    sim = normalized_levenshtein.similarity(w1, w2)
    print('distance: ' + str(dis) + ' similarity: ' + str(sim))
    # Separator printed after comparing a segment of the fixed text with each segment of the active text
    print('----------------------------')
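# A minimal usage sketch (an assumption, not part of the original snippet): with
# NormalizedLevenshtein imported from similarity.normalized_levenshtein, distance and
# similarity are complementary, so the two printed numbers always sum to 1.
# simDif("cat", "cats")
# distance: 0.25 similarity: 0.75
# ----------------------------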
def get_norm_levenshtein(prediction, groundtruth):
    normalized_levenshtein = NormalizedLevenshtein()
    if type(groundtruth) == list:
        if len(groundtruth) == 0:
            return 0
        # Score against the best-matching reference
        return np.max([get_norm_levenshtein(prediction, gt) for gt in groundtruth])
    return normalized_levenshtein.similarity(normalize_answer(prediction),
                                             normalize_answer(groundtruth))
def searchlike(query_str, s1):
    # Literal similarity
    # print(difflib.SequenceMatcher(None, query_str, s1).quick_ratio())
    # Edit distance
    normalized_levenshtein = NormalizedLevenshtein()
    # print(normalized_levenshtein.similarity(query_str, s1))
    # Jaccard similarity
    jaccard_coefficient = Jaccrad(query_str, s1)
    # print(jaccard_coefficient)
    return (difflib.SequenceMatcher(None, query_str, s1).quick_ratio(),
            normalized_levenshtein.similarity(query_str, s1),
            jaccard_coefficient)
def similarity(outputs_batch, labels_batch, dic):
    norm_lev = NormalizedLevenshtein()
    outputs_batch = torch.argmax(outputs_batch, -1)
    avg_sim = 0
    for j in range(outputs_batch.size(-1)):
        pred = [dic[int(k)] for k in outputs_batch[:, j]]
        pred = utils.clear(pred)
        avg_sim += norm_lev.distance(pred, labels_batch[j])
    avg_sim = 1 - avg_sim / outputs_batch.size(-1)
    return avg_sim
def similarity_plus(outputs_batch, labels_batch, dic):
    d = enchant.Dict("en_US")
    norm_lev = NormalizedLevenshtein()
    outputs_batch = torch.argmax(outputs_batch, -1)
    avg_sim = 0
    for j in range(outputs_batch.size(-1)):
        pred = [dic[int(k)] for k in outputs_batch[:, j]]
        pred = utils.clear(pred)
        if not d.check(pred):
            # d.suggest returns a list of candidates; use the top suggestion, if any
            suggestions = d.suggest(pred)
            if suggestions:
                pred = suggestions[0]
        avg_sim += norm_lev.distance(pred, labels_batch[j])
    avg_sim = 1 - avg_sim / outputs_batch.size(-1)
    return avg_sim
def __init__(self):
    self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
    model = T5ForConditionalGeneration.from_pretrained('Parth/result')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # model.eval()
    self.device = device
    self.model = model
    self.nlp = spacy.load('tr')
    self.fdist = FreqDist(brown.words())
    self.normalized_levenshtein = NormalizedLevenshtein()
    self.set_seed(42)
def data_classifier(arr, sources):
    NL = NormalizedLevenshtein()
    # Pairwise normalized Levenshtein distances between field names
    vectors = pd.DataFrame([[NL.distance(i, j) for j in arr] for i in arr])
    clusters = int(len(arr) ** .5)
    if clusters <= 1:
        clusters = 2
    kmeans = KMeans(n_clusters=clusters, random_state=0).fit(vectors)
    field_ids = ["field-" + str(uuid.uuid4()) for field in arr]
    uncert = pd.DataFrame()
    for i in range(clusters):
        uncert[i] = vectors.apply(dist, args=(kmeans.cluster_centers_[i], ), axis=0)
    uncert.index = field_ids
    classifications_obj = {}
    fields_obj = {}
    for i in range(clusters):
        cluster = {}
        fields = pd.Series(field_ids)[kmeans.labels_ == i]
        for field_id in fields:
            cluster[field_id] = uncert.loc[field_id][i]
            idx = field_ids.index(field_id)
            fields_obj[field_id] = {"name": arr[idx], "source": sources[idx]}
        cid_obj = {}
        cid_obj["name"] = "classification" + str(i)
        cid_obj["metadata"] = None
        cid_obj["values"] = cluster
        cid_obj["distribution"] = np.array(list(cluster.values())).mean()
        classifications_obj["classification-" + str(uuid.uuid4())] = cid_obj
    data = {}
    data["Classifications"] = classifications_obj
    data["Fields"] = fields_obj
    return json.dumps(data, sort_keys=True, indent=4)
def __init__(self):
    model_file_1 = "../input/s2v-old/s2v_old"
    self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
    model = T5ForConditionalGeneration.from_pretrained('Parth/result')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # model.eval()
    self.device = device
    self.model = model
    self.nlp = spacy.load('en_core_web_sm')
    self.s2v = Sense2Vec().from_disk('../input/s2v-old/s2v_old')
    self.fdist = FreqDist(brown.words())
    self.normalized_levenshtein = NormalizedLevenshtein()
    self.set_seed(42)
def __init__(self,
             s2v_model_path='s2v_old',
             qg_model_path='Parth/result',
             bq_model_path='ramsrigouthamg/t5_boolean_questions',
             ap_model_path='Parth/boolean',
             t5_tokenizer_path='t5-base'):
    self.tokenizer = T5Tokenizer.from_pretrained(t5_tokenizer_path)
    self.normalized_levenshtein = NormalizedLevenshtein()
    self.rand = random.Random(datetime.now())
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.qg_model = T5ForConditionalGeneration.from_pretrained(qg_model_path).to(self.device)
    self.bq_model = T5ForConditionalGeneration.from_pretrained(bq_model_path).to(self.device)
    self.ap_model = T5ForConditionalGeneration.from_pretrained(ap_model_path).to(self.device)
def __init__(self, lang_code='en', max_questions=20):
    self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
    model = T5ForConditionalGeneration.from_pretrained('Parth/result')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # model.eval()
    self.device = device
    self.model = model
    self.nlp = self.try_load_spacy_model(lang_code)
    self.max_questions = int(max_questions)
    self.s2v = Sense2Vec().from_disk(
        '/Users/dev/Develop/text-to-anki/backend/src/Questgen.ai/Questgen.ai/Questgen/s2v_old')
    self.fdist = FreqDist(brown.words())
    self.normalized_levenshtein = NormalizedLevenshtein()
    self.set_seed(42)
'''
import argparse, os, cv2
from similarity.normalized_levenshtein import NormalizedLevenshtein
from textract.extractor import TextExtractor

# parse arguments
parser = argparse.ArgumentParser()
parser.add_argument('--img_dir', type=str, help='image folder')
parser.add_argument('--gd_dir', type=str, help='ground truth folder')
parser.add_argument('--out_dir', type=str, help='ocr text output folder')
args = parser.parse_args()
if not os.path.exists(args.out_dir):
    os.mkdir(args.out_dir)

normalized_levenshtein = NormalizedLevenshtein()
similarities = {}

# use textract
extractor = TextExtractor()
folder = os.listdir(args.img_dir)
for article in folder:
    article_path = os.path.join(args.img_dir, article)
    if not os.path.isdir(article_path):
        continue
    groudtruth_path = os.path.join(args.gd_dir, article + '.txt')
    if not os.path.exists(groudtruth_path):
        continue
# Test 4: machine learning, I think
from similarity.normalized_levenshtein import NormalizedLevenshtein

normalized_levenshtein = NormalizedLevenshtein()

##perguntas_respostas = {'olá' : 'Como posso ajuda-lo?',
##                       'bom dia' : 'Bom Dia!!',
##                       'qual é seu filme preferido' : 'Ex Machina!',
##                       ' ' : ' ',
##                       ' ' : ' ',
##                       ' ' : ' ',
##                       ' ' : ' ',
##                       ' ' : ' ',
##                       ' ' : ' ',
##                       ' ' : ' ',
##                       ' ' : ' '
##                       }

import os
import pickle

pr_file = './perguntas_respostas.p'
if os.path.isfile(pr_file):
    with open(pr_file, 'rb') as p:
        perguntas_respostas = pickle.load(p)
import re
import numpy as np
from similarity.normalized_levenshtein import NormalizedLevenshtein

NORMALIZED_LEVENSHTEIN = NormalizedLevenshtein()
IS_TRUE = re.compile('true', re.IGNORECASE)


def by_name(data_frame, name):
    "produces a normalized similarity score between 0 and 1"
    def make_score(_name):
        "make_score takes 1 minus the distance to produce the similarity"
        return 1 - NORMALIZED_LEVENSHTEIN.distance(_name, name)
    name_scores = data_frame['name'].map(make_score)
    return name_scores


def by_experience(data_frame, experience):
    "converts the experienced column into a score of 0 or 1"
    params_exper = re.match(IS_TRUE, experience) is not None
    def make_score(_exp):
        "returns 1 if the bool from the query string matches the Series experienced value"
        result = params_exper == _exp
        return 1 if result else 0
    exp_score = data_frame['experienced'].map(make_score)
    return exp_score


def by_distance(data_frame, location_type, value):
    "to normalize for all distances: 1 - distance / max distance"
    distances = np.square(data_frame[location_type] - value)
    max_distance = np.max(distances)
    normalized = 1 - distances / max_distance
def main():
    v1 = 'text'
    v2 = 'text'

    # ----------------------------------------------- Edit based -----------------------------------------------
    print("-------------------------------- Edit based ----------------------------------")
    print("------- HAMMING ---------")
    ed = Hamming()
    # The return value is a float between 0 and 1, where 0 means totally different, and 1 equal.
    print("Hamming Similarity: ", ed.normalized_similarity(v1, v2))
    print("\n-------- MLIPNS --------")
    ed = MLIPNS()
    print("MLIPNS similarity: ", ed.similarity(v1, v2))
    print("\n-------- JaroWinkler --------")
    ed = JaroWinkler()
    print("JaroWinkler similarity: ", ed.similarity(v1, v2))
    print("\n-------- Jaro --------")
    ed = Jaro()
    print("Jaro similarity: ", ed.similarity(v1, v2))

    # ----------------------------------------------- Token based -----------------------------------------------
    print("-------------------------------- Token based ----------------------------------")
    print("\n-------- JACCARD --------")
    ed = Jaccard()
    print("JACCARD similarity: ", ed.similarity(v1, v2))  # takes the number of letters into account
    print("\n-------- Sorensen --------")
    ed = Sorensen()
    print("Sorensen similarity: ", ed.similarity(v1, v2))
    print("\n-------- Tversky --------")
    ed = Tversky()
    print("Tversky similarity: ", ed.similarity(v1, v2))
    print("\n-------- Overlap --------")
    ed = Overlap()
    print("Overlap similarity: ", ed.similarity(v1, v2))
    print("\n-------- Cosine --------")
    ed = Cosine()
    print("Cosine similarity: ", ed.similarity(v1, v2))

    # ----------------------------------------------- Sequence based -----------------------------------------------
    print("-------------------------------- Sequence based ----------------------------------")
    print("\n-------- RatcliffObershelp --------")
    ed = RatcliffObershelp()
    print("RatcliffObershelp similarity: ", ed.similarity(v1, v2))

    # ----------------------------------------------- Compression based -----------------------------------------------
    print("-------------------------------- Compression based ----------------------------------")
    print("\n-------- EntropyNCD --------")
    ed = EntropyNCD()
    print("EntropyNCD similarity: ", ed.similarity(v1, v2))
    print("\n-------- BZ2NCD --------")
    ed = BZ2NCD()
    print("BZ2NCD similarity: ", ed.similarity(v1, v2))
    print("\n-------- LZMANCD --------")
    ed = LZMANCD()
    print("LZMANCD similarity: ", ed.similarity(v1, v2))
    print("\n-------- ZLIBNCD --------")
    ed = ZLIBNCD()
    print("ZLIBNCD similarity: ", ed.similarity(v1, v2))

    # ----------------------------------------------- Simple based -----------------------------------------------
    print("-------------------------------- Simple based ----------------------------------")
    print("\n-------- Prefix --------")
    ed = Prefix()
    print("Prefix similarity: ", ed.similarity(v1, v2))
    print("\n-------- Postfix --------")
    ed = Postfix()
    print("Postfix similarity: ", ed.similarity(v1, v2))

    # ----------------------------------------------- strsim functions -----------------------------------------------
    print("-------------------------------- strsim function ----------------------------------")
    print("\n-------- Normalized Levenshtein --------")
    ed = NormalizedLevenshtein()
    print("Normalized Levenshtein similarity: ", ed.similarity(v1, v2))
    print("\n-------- MetricLCS --------")
    ed = MetricLCS()
    print("MetricLCS similarity: ", ed.distance(v1, v2))
    print("\n-------- NGram --------")
    ed = NGram()
    print("NGram similarity: ", ed.distance(v1, v2))
    print("\n-------- Sorensen --------")
    ed = Sorensen()
    print("Sorensen similarity: ", ed.similarity(v1, v2))
def similaridade(function_name, string_1, string_2):
    if function_name == 'Hamming':
        return Hamming().normalized_similarity(string_1, string_2)
    elif function_name == 'MLIPNS':
        return MLIPNS().similarity(string_1, string_2)
    elif function_name == 'JaroWinkler':
        return JaroWinkler().similarity(string_1, string_2)
    elif function_name == 'Jaro':
        return Jaro().similarity(string_1, string_2)
    elif function_name == 'Jaccard':
        return Jaccard().similarity(string_1, string_2)
    elif function_name == 'Sorensen':
        return Sorensen().similarity(string_1, string_2)
    elif function_name == 'Tversky':
        return Tversky().similarity(string_1, string_2)
    elif function_name == 'Overlap':
        return Overlap().similarity(string_1, string_2)
    elif function_name == 'Cosine':
        return Cosine().similarity(string_1, string_2)
    elif function_name == 'RatcliffObershelp':
        return RatcliffObershelp().similarity(string_1, string_2)
    elif function_name == 'EntropyNCD':
        return EntropyNCD().similarity(string_1, string_2)
    elif function_name == 'BZ2NCD':
        return BZ2NCD().similarity(string_1, string_2)
    elif function_name == 'LZMANCD':
        return LZMANCD().similarity(string_1, string_2)
    elif function_name == 'ZLIBNCD':
        return ZLIBNCD().similarity(string_1, string_2)
    elif function_name == 'Prefix':
        return Prefix().similarity(string_1, string_2)
    elif function_name == 'Postfix':
        return Postfix().similarity(string_1, string_2)
    elif function_name == 'NormalizedLevenshtein':
        return NormalizedLevenshtein().similarity(string_1, string_2)
    elif function_name == 'MetricLCS':
        return MetricLCS().distance(string_1, string_2)
    elif function_name == 'NGram':
        return NGram().distance(string_1, string_2)
    elif function_name == 'StrCmp95':
        return StrCmp95().distance(string_1, string_2)
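# Hypothetical calls to the dispatcher above (the example strings are illustrative, not
# from the original snippet). Note that the MetricLCS, NGram and StrCmp95 branches
# return a distance rather than a similarity.
# similaridade('NormalizedLevenshtein', 'night', 'nacht')   # similarity in [0, 1]
# similaridade('MetricLCS', 'night', 'nacht')               # distance in [0, 1]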
import os
from similarity.normalized_levenshtein import NormalizedLevenshtein
from tqdm import tqdm

normalized_levenshtein = NormalizedLevenshtein()

file_list = os.listdir("../../Final_data")
# Removing items while iterating over the same list skips entries; filter instead
file_list = [f for f in file_list if f.endswith(".csv")]

files_increment = 0
for file_i in file_list:
    for file_j in file_list:
        if file_i != file_j:
            files_increment = files_increment + 1
            file_1 = open("../../Final_data/" + file_i, 'r', encoding='utf8')
            file_2 = open("../../Final_data/" + file_j, 'r', encoding='utf8')
            data_1 = file_1.readlines()
            data_2 = file_2.readlines()
            for i in range(60000):
                data_1[i] = data_1[i].replace("\n", '')
                data_2[i] = data_2[i].replace("\n", '')
                data_1[i] = data_1[i].split(";")
                data_2[i] = data_2[i].split(";")
print(len(clean_asr))


# Definition of function to calculate Levenshtein ratio between 2 sentences
def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()


# Definition of the 5 different types to classify the sentences
equal = 0
close = 0
medium = 0
low = 0
different = 0
normalized_levenshtein = NormalizedLevenshtein()
norm_dist = []

# Loop to analyze each pair of sentences and count the number of occurrences of each type
for sent_nb in range(len(clean)):
    Ratio = normalized_levenshtein.similarity(clean[sent_nb], clean_asr[sent_nb])
    norm_dist.append(Ratio)
    if Ratio == 1:
        equal += 1
    if Ratio > 0.9:
        close += 1
    if Ratio <= 0.9 and Ratio > 0.7:
        medium += 1
    if Ratio <= 0.7 and Ratio > 0.5:
        low += 1
class PostOCRTextCorrection:
    '''
    This class helps to correct spelling errors introduced in texts by OCR.
    To correct errors it combines FastText, Word2vec and Normalized Levenshtein Distance.
    '''

    def __init__(self, word2vec: Word2Vec, fasttext: FastText, lexicon: set) -> None:
        '''Creates a new PostOCRTextCorrection object

        Args:
            word2vec (Word2Vec): Word2Vec pre-trained model: a gensim model trained on dirty data (text without cleaning)
            fasttext (FastText): FastText pre-trained model: a gensim model trained on dirty data (text without cleaning)
            lexicon (set): complete list of words belonging to the language of the corpus documents (including conjugations, ...)
        '''
        self.__n_levenshtein = NormalizedLevenshtein()
        # loads pretrained models
        self.word2vec = word2vec
        self.word2vec_vocab = set(
            [word for word in self.word2vec.wv.index_to_key if len(word) > 3])
        self.fasttext = fasttext
        self.lexicon = set(lexicon)

    def __word_in_word2vec__(self, word: str) -> bool:
        '''Returns True if there is a 'word'-vector in the Word2vec space, otherwise False'''
        return word in self.word2vec_vocab

    def __word_in_lexicon__(self, word: str) -> bool:
        '''Returns True if 'word' belongs to the language of the corpus documents'''
        return word in self.lexicon

    def normalized_levhenstein_similarity(self, word1: str, word2: str) -> float:
        '''Returns the normalized Levenshtein similarity [0,1] between 'word1' and 'word2' '''
        return self.__n_levenshtein.similarity(word1, word2)

    def weighted_normalized_levhenstein_similarity(self, word1: str, word2: str,
                                                   weight: float) -> float:
        '''Returns the normalized Levenshtein similarity [0,1] between 'word1' and 'word2' weighted by 'weight' '''
        return weight * self.normalized_levhenstein_similarity(word1, word2)

    def ocr_correction(self, word: str, topn_we: int = 10, topn_nwe=150,
                       weight_we=0.3, weight_nwe=1.0, topn_co: int = 1) -> list:
        '''Retrieves the most suitable words to correct the wrong one in input

        Args:
            word (str): a wrong word to correct
            topn_we (int, default=10): top N most similar words if 'word' is model embedded
            topn_nwe (int, default=150): top N most similar words if 'word' is not model embedded
            weight_we (float, default=0.3): normalized Levenshtein weight if 'word' is model embedded
            weight_nwe (float, default=1.0): normalized Levenshtein weight if 'word' is not model embedded
            topn_co (int, default=1): top N most suitable words to correct the wrong one in input

        Returns:
            list: top N most suitable words to correct the wrong one in input

        Examples:
            >>> ocr = PostOCRTextCorrection(...)
            >>> word = 'niziitlniente'
            >>> ocr.ocr_correction(word, topn_co=1)
            >>> [('nullatenente', 0.99)]  # output
        '''
        word = word.lower()
        # {corrected word: score}
        is_word2vec_embedded = self.__word_in_word2vec__(word)
        corrections_score = defaultdict(int)
        if is_word2vec_embedded:
            for similar_word, cosine_simalirity in self.word2vec.wv.most_similar(
                    word, topn=topn_we):
                if not self.__word_in_lexicon__(similar_word):
                    continue
                corrections_score[similar_word] = \
                    cosine_simalirity + self.weighted_normalized_levhenstein_similarity(
                        word, similar_word, weight_we)
        else:
            # if not self.is_word2vec_embedded(word)
            # we check on a larger neighborhood in FastText
            topn_we = topn_nwe
        for similar_word, cosine_simalirity in self.fasttext.wv.most_similar(
                word, topn=topn_we):
            if not self.__word_in_word2vec__(similar_word):
                continue
            candidate = list()
            candidate.append(similar_word)
            if is_word2vec_embedded:
                candidate.extend(
                    list(
                        dict(
                            self.word2vec.wv.most_similar(similar_word,
                                                          topn=topn_we))))
            else:
                pass
            for similar_word in candidate:
                if not self.__word_in_lexicon__(similar_word):
                    continue
                if is_word2vec_embedded:
                    sim = self.word2vec.wv.similarity(word, similar_word) + \
                        self.weighted_normalized_levhenstein_similarity(
                            word, similar_word, weight_we)
                else:
                    sim = cosine_simalirity + \
                        self.weighted_normalized_levhenstein_similarity(
                            word, similar_word, weight_nwe)
                if sim > corrections_score[similar_word]:
                    corrections_score[similar_word] = sim
        return sorted(corrections_score.items(),
                      key=lambda x: x[1],
                      reverse=True)[:topn_co]

    def ocr_correction_corpus(
            self,
            data: List[dict],
            key: str = 'text',
            processing_text: callable = lambda x: x,
            processing_correction: callable = lambda x: x.lower(),
            min_len: int = 5,
            topn_we: int = 10,
            topn_nwe=150,
            weight_we=0.3,
            weight_nwe=1.0,
            paragraphs_level: bool = False) -> Tuple[List[dict], dict]:
        '''
        Given a corpus, retrieves OCR errors, estimates corrections for each error,
        and corrects the OCR errors with the estimated corrections.

        Args:
            data (List[dict]): a list of json documents
            key (str, default=text): key to the text field in each document
            processing_text (callable, default=lambda x: x): function to process texts
            processing_correction (callable, default=lambda x: x.lower()): function to process corrections
            min_len (int, default=5): words less than 'min_len' characters long are not corrected
            topn_we (int, default=10): top N most similar words if 'word' is model embedded
            topn_nwe (int, default=150): top N most similar words if 'word' is not model embedded
            weight_we (float, default=0.3): normalized Levenshtein weight if 'word' is model embedded
            weight_nwe (float, default=1.0): normalized Levenshtein weight if 'word' is not model embedded
            paragraphs_level (bool, default=False): whether texts are stored per paragraph under 'paragraphs'

        Returns:
            Tuple[List[dict], dict]: a list of corrected json documents; corrections

        Examples:
            >>> ocr = PostOCRTextCorrection(...)
            >>> key = 'text'
            >>> data = [{'text': '[...] niziitlniente [...]'}, ..., {'text': '[...] preliininari [...]'}]
            >>> ocr.ocr_correction_corpus(data, key)
            >>> [{'text': '[...] nullatenente [...]'}, ..., {'text': '[...] preliminari [...]'}]  # output
        '''
        ocr_errors = set()
        for doc in tqdm(data, position=0, leave=True, desc='Retrieving ocr errors'):
            if paragraphs_level:
                for para_json in doc['paragraphs']:
                    words = set(
                        word for word in processing_text(para_json[key]).split()
                        if len(word) >= min_len and not word.isdigit()
                        and not word[0].isupper())
                    ocr_errors.update(words.difference(self.lexicon))
            else:
                words = set(word for word in processing_text(doc[key]).split()
                            if len(word) >= min_len and not word.isdigit()
                            and not word[0].isupper())
                ocr_errors.update(words.difference(self.lexicon))

        ocr_corrections = dict()
        for ocr_error in tqdm(list(ocr_errors), position=0, leave=True,
                              desc='Retrieving ocr corrections'):
            ocr_correction = self.ocr_correction(ocr_error,
                                                 topn_we=topn_we,
                                                 topn_nwe=topn_nwe,
                                                 weight_we=weight_we,
                                                 weight_nwe=weight_nwe,
                                                 topn_co=1)
            if len(ocr_correction) == 0:
                continue
            ocr_correction = [
                processing_correction(ocr_corr[0]) for ocr_corr in ocr_correction
            ]
            ocr_corrections[ocr_error] = ocr_correction[0]

        new_data = list()
        for doc in tqdm(data, position=0, leave=True, desc='Correcting ocr errors'):
            if '_id' in doc:
                del doc["_id"]
            if paragraphs_level:
                new_para_json = list()
                for para_json in doc['paragraphs']:
                    para_json[key] = " " + processing_text(para_json[key]) + " "
                    for token in para_json[key].split():
                        if token.lower() in ocr_corrections:
                            para_json[key] = para_json[key].replace(
                                " " + token.lower() + " ",
                                " " + ocr_corrections[token.lower()] + " ")
                    new_para_json.append(para_json)
                doc['paragraphs'] = new_para_json
            else:
                doc[key] = " " + processing_text(doc[key]) + " "
                for token in doc[key].split():
                    if token.lower() in ocr_corrections:
                        doc[key] = doc[key].replace(
                            " " + token.lower() + " ",
                            " " + ocr_corrections[token.lower()] + " ")
            new_data.append(doc)
        return new_data, ocr_corrections
p = []
filter_thresh_45 = []
for i in range(len(temp_article)):
    jarowinkler = JaroWinkler()
    sim = jarowinkler.similarity(my_string, temp_article[i])
    if sim > 0.45:
        filter_thresh_45.append(data[i])

normalized_levenshtein = NormalizedLevenshtein()
filter_normalized_levenshtein = []
for i in range(len(filter_thresh_45)):
    sim = normalized_levenshtein.distance(my_string, filter_thresh_45[i][0])
    if sim >= 0.7:
        filter_normalized_levenshtein.append(filter_thresh_45[i])

with open('filtered_levenshtein_human_mobility.txt', 'w', encoding="ISO-8859-1") as outfile:
    json.dump(filter_normalized_levenshtein, outfile)
def similarity(self, question, answer):
    stopword = self.read_from(folder_path + '上证专用停用词.txt')
    stopwords = []
    for sw in stopword:
        sw = sw.strip('\n')
        sw = sw.strip(' ')
        stopwords.append(sw)
    # print(stopwords)
    meaningful_words1 = []
    meaningful_words2 = []
    words2 = jieba.cut(str(question))
    words3 = jieba.cut(str(answer))
    for word in words2:
        if word not in stopwords:
            meaningful_words1.append(word)
    for word in words3:
        if word not in stopwords:
            meaningful_words2.append(word)
    s2 = ''.join(meaningful_words1)
    # print(s2)
    s3 = ''.join(meaningful_words2)

    # instantiate the string-similarity measures
    a1 = Cosine(1)
    b1 = Damerau()
    c1 = Jaccard(1)
    d1 = JaroWinkler()
    e1 = Levenshtein()
    f1 = LongestCommonSubsequence()
    g1 = MetricLCS()
    h1 = NGram(2)
    i1 = NormalizedLevenshtein()
    j1 = OptimalStringAlignment()
    k1 = QGram(1)
    l1 = SorensenDice(2)
    m1 = WeightedLevenshtein(character_substitution=CharSub())

    line_sim = []
    cos_s = a1.similarity(s2, s3)
    line_sim.append(cos_s)
    cos_d = a1.distance(s2, s3)
    line_sim.append(cos_d)
    dam = b1.distance(s2, s3)
    line_sim.append(dam)
    jac_d = c1.distance(s2, s3)
    line_sim.append(jac_d)
    jac_s = c1.similarity(s2, s3)
    line_sim.append(jac_s)
    jar_d = d1.distance(s2, s3)
    line_sim.append(jar_d)
    jar_s = d1.similarity(s2, s3)
    line_sim.append(jar_s)
    lev = e1.distance(s2, s3)
    line_sim.append(lev)
    lon = f1.distance(s2, s3)
    line_sim.append(lon)
    met = g1.distance(s2, s3)
    line_sim.append(met)
    ngr = h1.distance(s2, s3)
    line_sim.append(ngr)
    nor_d = i1.distance(s2, s3)
    line_sim.append(nor_d)
    nor_s = i1.similarity(s2, s3)
    line_sim.append(nor_s)
    opt = j1.distance(s2, s3)
    line_sim.append(opt)
    qgr = k1.distance(s2, s3)
    line_sim.append(qgr)
    sor_d = l1.distance(s2, s3)
    line_sim.append(sor_d)
    sor_s = l1.similarity(s2, s3)
    line_sim.append(sor_s)
    wei = m1.distance(s2, s3)
    line_sim.append(wei)
    return line_sim
from similarity.levenshtein import Levenshtein
from similarity.normalized_levenshtein import NormalizedLevenshtein
from similarity.cosine import Cosine

lev = Levenshtein()
nolev = NormalizedLevenshtein()
cosine = Cosine(4)

str1 = 'I enjoy playing football'
str2 = 'I love to play soccer'

print('Levenshtein distance:')
print(lev.distance(str1, str2))
print('Normalized Levenshtein similarity:')
print(nolev.similarity(str1, str2))
print('Cosine similarity:')
print(cosine.similarity(str1, str2))
from similarity.normalized_levenshtein import NormalizedLevenshtein
from similarity.jaccard import Jaccard

s1 = '中华人民共和国'
s2 = '中国'
normalized_levenshtein = NormalizedLevenshtein()
print('Levenshtein: ', normalized_levenshtein.distance(s1, s2))
jaccard_distance = Jaccard(1)
print('Jaccard: ', jaccard_distance.distance(s1, s2))
# print(jaccard_similarity_score(list(s1), list(s2)))
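# A quick sanity check one could append here (not in the original snippet): for
# NormalizedLevenshtein, similarity is defined as 1 - distance, so the two values
# printed below should sum to 1 for the same pair of strings.
print('Levenshtein similarity: ', normalized_levenshtein.similarity(s1, s2))
print('Levenshtein distance + similarity = ',
      normalized_levenshtein.distance(s1, s2) + normalized_levenshtein.similarity(s1, s2))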
def diff_lev(source, source_asr, target, source_pos, result_source, result_source_asr,
             result_target, result_source_pos, workdir, **kwargs):
    with open(source, 'r', encoding='utf-8') as source, \
            open(source_asr, 'r', encoding='utf-8') as source_asr, \
            open(source_pos, 'r', encoding='utf-8') as source_pos, \
            open(target, 'r', encoding='utf-8') as target, \
            open(result_source, 'w', encoding='utf-8') as result_source, \
            open(result_source_asr, 'w', encoding='utf-8') as result_source_asr, \
            open(result_source_pos, 'w', encoding='utf-8') as result_source_pos, \
            open(result_target, 'w', encoding='utf-8') as result_target, \
            open(workdir / 'distances.txt', 'w', encoding='utf-8') as distances:
        source = source.readlines()
        source_asr = source_asr.readlines()
        target = target.readlines()
        source_pos = source_pos.readlines()

        # different types to classify the sentences
        counter = Counter()
        normalized_levenshtein = NormalizedLevenshtein()
        norm_dist = []

        # Loop to analyze each pair of sentences and count the number of occurrences of each type
        for source_sent, source_asr_sent in tqdm.tqdm(zip(source, source_asr)):
            ratio = normalized_levenshtein.similarity(source_sent, source_asr_sent)
            norm_dist.append(ratio)
            if ratio == 1:
                counter["equal"] += 1
            if ratio > 0.9:
                counter["close"] += 1
            if 0.9 >= ratio > 0.7:
                counter["medium"] += 1
            if 0.7 >= ratio > 0.5:
                counter["low"] += 1
            if 0.5 >= ratio:
                counter["different"] += 1

        # Write the results of the comparisons to the output file
        for dist in norm_dist:
            distances.write(f"{dist}\n")

        # print statistics
        print(f"Equal count: {counter['equal']}, ratio: {counter['equal'] / len(source)}")
        print(f"{counter['close']} {counter['close'] / len(source)}")
        print(f"{counter['medium']} {counter['medium'] / len(source)}")
        print(f"{counter['low']} {counter['low'] / len(source)}")
        print(f"{counter['different']} {counter['different'] / len(source)}")

        # Loop to identify similar sentences and write them to the output files,
        # counting sentences with the same number of tokens between clean and asr
        for ratio, source_sent, source_asr_sent, target_sent, source_pos_sent in tqdm.tqdm(
                zip(norm_dist, source, source_asr, target, source_pos)):
            if float(ratio) >= 0.9:
                result_source.write(source_sent)
                result_source_asr.write(source_asr_sent)
                result_target.write(target_sent)
                result_source_pos.write(source_pos_sent)
                counter["after_filter"] += 1
                if len(source_sent.split(" ")) == len(source_asr_sent.split(" ")):
                    counter["equal_nb_tokens"] += 1

        print(f"Sentences after cleaning {counter['after_filter']}")
        print(f"Sentences with equal number of tokens: {counter['equal_nb_tokens']}")
def get_replacement(self, distance='lsh', threshold=.8):
    if distance == 'edit_distance':
        distance = Levenshtein()
    elif distance == 'normalized_edit_distance':
        distance = NormalizedLevenshtein()
    # for each token, get its bin
    # for each bin, iterate each element and get the groups of satisfied tokens such as
    # [white] = [whit, whie, whit]
    # [whie] = [whine, white]
    replacement = {}
    s = self.uniq_values
    while len(s) > 0:
        token = rd.sample(s, 1)[0]
        s.remove(token)
        m = self._generate_hash(token)
        similarities = self.lsh.query(m)
        similarities = [
            _ for _ in similarities
            if _ not in replacement.values() and _ not in replacement.keys()
        ]
        if len(similarities) > 1:
            scores = {}
            bin_replacement = {}
            if distance != 'lsh':
                for idx, item in enumerate(similarities):
                    count = 0
                    candidates = []
                    for idx_compared in range(idx + 1, len(similarities)):
                        candidate = similarities[idx_compared]
                        if item != candidate and distance.distance(item, candidate) < threshold:
                            if idx not in bin_replacement:
                                bin_replacement[idx] = [idx_compared]
                            else:
                                bin_replacement[idx].append(idx_compared)
                            if idx_compared not in bin_replacement:
                                bin_replacement[idx_compared] = [idx]
                            else:
                                bin_replacement[idx_compared].append(idx)
                for idx_item, candidates in sorted(bin_replacement.items(),
                                                   key=lambda x: -len(x[1])):
                    item = similarities[idx_item]
                    if item in replacement.keys():
                        item = replacement[item]
                    for idx_candidate in candidates:
                        candidate = similarities[idx_candidate]
                        if candidate != item and candidate not in replacement.keys():
                            if item not in replacement.keys():
                                replacement[candidate] = item
                            elif replacement[item] != candidate:
                                replacement[candidate] = replacement[item]
            else:
                for candidate in similarities:
                    if candidate != token:
                        replacement[candidate] = token
    return replacement
import tensorflow as tf
import numpy as np
from similarity.normalized_levenshtein import NormalizedLevenshtein
from tensorflow.keras.metrics import Metric

labels = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
norm_lev = NormalizedLevenshtein()


class LevenshteinMetric(Metric):
    def __init__(self, batch_size, **kwargs):
        super().__init__(**kwargs)
        self.levenshtein_distance_fn = levenshtein_distance_fn
        self.batch_size = batch_size
        self.total = self.add_weight("total", initializer="zeros")
        self.count = self.add_weight("count", initializer="zeros")

    def update_state(self, y_true, y_pred, sample_weight=None):
        metric = self.levenshtein_distance_fn(y_true, y_pred)
        self.total.assign_add(tf.reduce_sum(metric))
        # self.count.assign_add(tf.cast(self.batch_size, tf.float32))
        self.count.assign_add(tf.cast(len(y_true), tf.float32))

    def result(self):
        return self.total / self.count

    def get_config(self):
        base_config = super().get_config()
        return base_config