def simDif(w1, w2):
    normalized_levenshtein = NormalizedLevenshtein()
    dis = normalized_levenshtein.distance(w1, w2)
    sim = normalized_levenshtein.similarity(w1, w2)
    print('distance: ' + str(dis) + ' similarity: ' + str(sim))
    # This line is the separator printed after comparing a segment of the fixed
    # text against each segment of the active text
    print('----------------------------')
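For reference, the NormalizedLevenshtein measure used in these snippets normalizes the plain edit distance by the length of the longer string, and its similarity is one minus that distance. Below is a minimal pure-Python sketch of the same score, useful for checking the values printed by simDif without the library; the helper name is ours, not part of the original code.

def normalized_levenshtein_similarity(a: str, b: str) -> float:
    # similarity = 1 - edit_distance(a, b) / max(len(a), len(b))
    if a == b:
        return 1.0
    if not a or not b:
        return 0.0
    prev = list(range(len(b) + 1))  # DP row for the empty prefix of a
    for i, ca in enumerate(a, start=1):
        curr = [i] + [0] * len(b)
        for j, cb in enumerate(b, start=1):
            cost = 0 if ca == cb else 1
            curr[j] = min(prev[j] + 1,        # deletion
                          curr[j - 1] + 1,    # insertion
                          prev[j - 1] + cost) # substitution
        prev = curr
    return 1.0 - prev[len(b)] / max(len(a), len(b))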
def get_norm_levenshtein(prediction, groundtruth):
    normalized_levenshtein = NormalizedLevenshtein()
    if type(groundtruth) == list:
        if len(groundtruth) == 0:
            return 0
        return np.max([get_norm_levenshtein(prediction, gt) for gt in groundtruth])
    return normalized_levenshtein.similarity(normalize_answer(prediction),
                                             normalize_answer(groundtruth))
def searchlike(query_str, s1):
    # Literal (surface-form) similarity
    # print(difflib.SequenceMatcher(None, query_str, s1).quick_ratio())
    # Edit distance
    normalized_levenshtein = NormalizedLevenshtein()
    # print(normalized_levenshtein.similarity(query_str, s1))
    # Jaccard similarity
    jaccard_coefficient = Jaccrad(query_str, s1)
    # print(jaccard_coefficient)
    return (difflib.SequenceMatcher(None, query_str, s1).quick_ratio(),
            normalized_levenshtein.similarity(query_str, s1),
            jaccard_coefficient)
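The Jaccrad helper called in searchlike is not defined in this snippet. A minimal sketch of a character-level Jaccard coefficient that matches how it is used here (a single similarity value between the two strings) could look like the following; the set-based behaviour is an assumption, not the original implementation.

def Jaccrad(reference: str, candidate: str) -> float:
    # Jaccard coefficient over the character sets of the two strings:
    # |intersection| / |union|, in [0, 1]
    terms_ref = set(reference)
    terms_cand = set(candidate)
    union = terms_ref | terms_cand
    if not union:
        return 0.0
    return len(terms_ref & terms_cand) / len(union)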
def similarity(self, question, answer):
    # Load the stop-word list (上证专用停用词.txt: SSE-specific stop words)
    stopword = self.read_from(folder_path + '上证专用停用词.txt')
    stopwords = []
    for sw in stopword:
        sw = sw.strip('\n')
        sw = sw.strip(' ')
        stopwords.append(sw)
    # print(stopwords)
    # Segment both strings with jieba and drop stop words
    meaningful_words1 = []
    meaningful_words2 = []
    words2 = jieba.cut(str(question))
    words3 = jieba.cut(str(answer))
    for word in words2:
        if word not in stopwords:
            meaningful_words1.append(word)
    for word in words3:
        if word not in stopwords:
            meaningful_words2.append(word)
    s2 = ''.join(meaningful_words1)
    # print(s2)
    s3 = ''.join(meaningful_words2)
    # Instantiate the string-similarity measures
    a1 = Cosine(1)
    b1 = Damerau()
    c1 = Jaccard(1)
    d1 = JaroWinkler()
    e1 = Levenshtein()
    f1 = LongestCommonSubsequence()
    g1 = MetricLCS()
    h1 = NGram(2)
    i1 = NormalizedLevenshtein()
    j1 = OptimalStringAlignment()
    k1 = QGram(1)
    l1 = SorensenDice(2)
    m1 = WeightedLevenshtein(character_substitution=CharSub())
    # Collect the scores in a fixed order: cosine similarity/distance, Damerau,
    # Jaccard distance/similarity, Jaro-Winkler distance/similarity, Levenshtein,
    # LCS, MetricLCS, NGram, normalized Levenshtein distance/similarity, OSA,
    # QGram, Sorensen-Dice distance/similarity, weighted Levenshtein
    line_sim = []
    cos_s = a1.similarity(s2, s3)
    line_sim.append(cos_s)
    cos_d = a1.distance(s2, s3)
    line_sim.append(cos_d)
    dam = b1.distance(s2, s3)
    line_sim.append(dam)
    jac_d = c1.distance(s2, s3)
    line_sim.append(jac_d)
    jac_s = c1.similarity(s2, s3)
    line_sim.append(jac_s)
    jar_d = d1.distance(s2, s3)
    line_sim.append(jar_d)
    jar_s = d1.similarity(s2, s3)
    line_sim.append(jar_s)
    lev = e1.distance(s2, s3)
    line_sim.append(lev)
    lon = f1.distance(s2, s3)
    line_sim.append(lon)
    met = g1.distance(s2, s3)
    line_sim.append(met)
    ngr = h1.distance(s2, s3)
    line_sim.append(ngr)
    nor_d = i1.distance(s2, s3)
    line_sim.append(nor_d)
    nor_s = i1.similarity(s2, s3)
    line_sim.append(nor_s)
    opt = j1.distance(s2, s3)
    line_sim.append(opt)
    qgr = k1.distance(s2, s3)
    line_sim.append(qgr)
    sor_d = l1.distance(s2, s3)
    line_sim.append(sor_d)
    sor_s = l1.similarity(s2, s3)
    line_sim.append(sor_s)
    wei = m1.distance(s2, s3)
    line_sim.append(wei)
    return line_sim
pages = os.listdir(article_path)
pages.sort()  # sort by page number
result_path = os.path.join(args.out_dir, article + '.txt')
result_file = open(result_path, 'a', encoding='utf8')
for page in pages:
    if page.split('.')[-1] not in ['png', 'jpg', 'tiff']:
        continue
    print(page)
    page_img = cv2.imread(os.path.join(article_path, page))
    text = ''.join(extractor.extract(page_img))
    result_file.write(text)
result_file.close()
# read file
result_text = open(result_path, 'r').read().replace('\n', '').replace(' ', '')
groudtruth_text = open(groudtruth_path, 'r').read().replace('\n', '').replace(' ', '')
# calculate the similarity
similarity = normalized_levenshtein.similarity(result_text, groudtruth_text)
similarities[article] = similarity
print('similarity of {} is {}'.format(article, similarity))

print(similarities)
def diff_lev(source, source_asr, target, source_pos, result_source,
             result_source_asr, result_target, result_source_pos, workdir,
             **kwargs):
    with open(source, 'r', encoding='utf-8') as source, \
            open(source_asr, 'r', encoding='utf-8') as source_asr, \
            open(source_pos, 'r', encoding='utf-8') as source_pos, \
            open(target, 'r', encoding='utf-8') as target, \
            open(result_source, 'w', encoding='utf-8') as result_source, \
            open(result_source_asr, 'w', encoding='utf-8') as result_source_asr, \
            open(result_source_pos, 'w', encoding='utf-8') as result_source_pos, \
            open(result_target, 'w', encoding='utf-8') as result_target, \
            open(workdir / 'distances.txt', 'w', encoding='utf-8') as distances:
        source = source.readlines()
        source_asr = source_asr.readlines()
        target = target.readlines()
        source_pos = source_pos.readlines()
        # different types to classify the sentences
        counter = Counter()
        normalized_levenshtein = NormalizedLevenshtein()
        norm_dist = []
        # Loop to analyze each pair of sentences and count the number of
        # occurrences of each type
        for source_sent, source_asr_sent in tqdm.tqdm(zip(source, source_asr)):
            ratio = normalized_levenshtein.similarity(source_sent, source_asr_sent)
            norm_dist.append(ratio)
            if ratio == 1:
                counter["equal"] += 1
            if ratio > 0.9:
                counter["close"] += 1
            if 0.9 >= ratio > 0.7:
                counter["medium"] += 1
            if 0.7 >= ratio > 0.5:
                counter["low"] += 1
            if 0.5 >= ratio:
                counter["different"] += 1
        # Write the results of the comparisons to the output file
        for dist in norm_dist:
            distances.write(f"{dist}\n")
        # Print statistics
        print(f"Equal count: {counter['equal']}, ratio: {counter['equal'] / len(source)}")
        print(f"{counter['close']} {counter['close'] / len(source)}")
        print(f"{counter['medium']} {counter['medium'] / len(source)}")
        print(f"{counter['low']} {counter['low'] / len(source)}")
        print(f"{counter['different']} {counter['different'] / len(source)}")
        # Loop to identify similar sentences and write them to the output files,
        # counting sentences with the same number of tokens in clean and ASR text
        for ratio, source_sent, source_asr_sent, target_sent, source_pos_sent in tqdm.tqdm(
                zip(norm_dist, source, source_asr, target, source_pos)):
            if float(ratio) >= 0.9:
                result_source.write(source_sent)
                result_source_asr.write(source_asr_sent)
                result_target.write(target_sent)
                result_source_pos.write(source_pos_sent)
                counter["after_filter"] += 1
                if len(source_sent.split(" ")) == len(source_asr_sent.split(" ")):
                    counter["equal_nb_tokens"] += 1
        print(f"Sentences after cleaning {counter['after_filter']}")
        print(f"Sentences with equal number of tokens: {counter['equal_nb_tokens']}")
class PostOCRTextCorrection:
    '''
    This class helps to correct spelling errors introduced in texts by OCR.
    To correct errors it combines FastText, Word2vec and the normalized
    Levenshtein distance.
    '''

    def __init__(self, word2vec: Word2Vec, fasttext: FastText,
                 lexicon: set) -> None:
        '''Creates a new PostOCRTextCorrection object

        Args:
            word2vec (Word2Vec): Word2Vec pre-trained model: a gensim model
                trained on dirty data (text without cleaning)
            fasttext (FastText): FastText pre-trained model: a gensim model
                trained on dirty data (text without cleaning)
            lexicon (set): complete list of words belonging to the language of
                the corpus documents (including conjugations, ...)
        '''
        self.__n_levenshtein = NormalizedLevenshtein()
        # loads pretrained models
        self.word2vec = word2vec
        self.word2vec_vocab = set(
            [word for word in self.word2vec.wv.index_to_key if len(word) > 3])
        self.fasttext = fasttext
        self.lexicon = set(lexicon)

    def __word_in_word2vec__(self, word: str) -> bool:
        '''Returns True if there is a 'word'-vector in the Word2vec space, otherwise False'''
        return word in self.word2vec_vocab

    def __word_in_lexicon__(self, word: str) -> bool:
        '''Returns True if 'word' belongs to the language of the corpus documents'''
        return word in self.lexicon

    def normalized_levhenstein_similarity(self, word1: str, word2: str) -> float:
        '''Returns the normalized Levenshtein similarity [0,1] between 'word1' and 'word2' '''
        return self.__n_levenshtein.similarity(word1, word2)

    def weighted_normalized_levhenstein_similarity(self, word1: str,
                                                   word2: str,
                                                   weight: float) -> float:
        '''Returns the normalized Levenshtein similarity [0,1] between 'word1' and 'word2' weighted by 'weight' '''
        return weight * self.normalized_levhenstein_similarity(word1, word2)

    def ocr_correction(self,
                       word: str,
                       topn_we: int = 10,
                       topn_nwe=150,
                       weight_we=0.3,
                       weight_nwe=1.0,
                       topn_co: int = 1) -> list:
        '''Retrieves the most suitable words to correct the wrong one in input

        Args:
            word (str): a wrong word to correct
            topn_we (int, default=10): top N most similar words if 'word' is models embedded
            topn_nwe (int, default=150): top N most similar words if 'word' is not models embedded
            weight_we (float, default=0.3): normalized Levenshtein weight if 'word' is models embedded
            weight_nwe (float, default=1.0): normalized Levenshtein weight if 'word' is not models embedded
            topn_co (int, default=1): top N most suitable words to correct the wrong one in input

        Returns:
            list: top N most suitable words to correct the wrong one in input

        Examples:
            >>> ocr = PostOCRTextCorrection(...)
            >>> word = 'niziitlniente'
            >>> ocr.ocr_correction(word, topn_co=1)
            >>> [('nullatenente', 0.99)]  # output
        '''
        word = word.lower()
        # {corrected word: score}
        is_word2vec_embedded = self.__word_in_word2vec__(word)
        corrections_score = defaultdict(int)
        if is_word2vec_embedded:
            for similar_word, cosine_similarity in self.word2vec.wv.most_similar(
                    word, topn=topn_we):
                if not self.__word_in_lexicon__(similar_word):
                    continue
                corrections_score[similar_word] = \
                    cosine_similarity + self.weighted_normalized_levhenstein_similarity(word, similar_word, weight_we)
        else:
            # if not self.is_word2vec_embedded(word)
            # we check on a larger neighborhood in FastText
            topn_we = topn_nwe
        for similar_word, cosine_similarity in self.fasttext.wv.most_similar(
                word, topn=topn_we):
            if not self.__word_in_word2vec__(similar_word):
                continue
            candidate = list()
            candidate.append(similar_word)
            if is_word2vec_embedded:
                candidate.extend(
                    list(
                        dict(
                            self.word2vec.wv.most_similar(similar_word,
                                                          topn=topn_we))))
            else:
                pass
            for similar_word in candidate:
                if not self.__word_in_lexicon__(similar_word):
                    continue
                if is_word2vec_embedded:
                    sim = self.word2vec.wv.similarity(word, similar_word) + \
                        self.weighted_normalized_levhenstein_similarity(word, similar_word, weight_we)
                else:
                    sim = cosine_similarity + \
                        self.weighted_normalized_levhenstein_similarity(word, similar_word, weight_nwe)
                if sim > corrections_score[similar_word]:
                    corrections_score[similar_word] = sim
        return sorted(corrections_score.items(),
                      key=lambda x: x[1],
                      reverse=True)[:topn_co]

    def ocr_correction_corpus(
            self,
            data: List[dict],
            key: str = 'text',
            processing_text: callable = lambda x: x,
            processing_correction: callable = lambda x: x.lower(),
            min_len: int = 5,
            topn_we: int = 10,
            topn_nwe=150,
            weight_we=0.3,
            weight_nwe=1.0,
            paragraphs_level: bool = False) -> Tuple[List[dict], dict]:
        '''
        Given a corpus, retrieves ocr errors, estimates corrections for each
        error, corrects ocr errors with the estimated corrections.

        Args:
            data (List[dict]): a list of json documents
            key (str, default=text): key to the text field in each document
            processing_text (callable, default=lambda x:x): function to process texts
            processing_correction (callable, default=lambda x:x.lower()): function to process corrections
            min_len (int, default=5): words less than 'min_len' characters long are not corrected
            topn_we (int, default=10): top N most similar words if 'word' is models embedded
            topn_nwe (int, default=150): top N most similar words if 'word' is not models embedded
            weight_we (float, default=0.3): normalized Levenshtein weight if 'word' is models embedded
            weight_nwe (float, default=1.0): normalized Levenshtein weight if 'word' is not models embedded
            paragraphs_level (bool, default=False): correct each paragraph in doc['paragraphs'] instead of doc[key]

        Returns:
            Tuple[List[dict], dict]: a list of json documents corrected; corrections

        Examples:
            >>> ocr = PostOCRTextCorrection(...)
            >>> key = 'text'
            >>> data = [{'text': '[...] niziitlniente [...]'}, ..., {'text': '[...] preliininari [...]'}]
            >>> ocr.ocr_correction_corpus(data, key)
            >>> [{'text': '[...] nullatenente [...]'}, ..., {'text': '[...] preliminari [...]'}]  # output
        '''
        ocr_errors = set()
        for doc in tqdm(data,
                        position=0,
                        leave=True,
                        desc='Retrieving ocr errors'):
            if paragraphs_level:
                for para_json in doc['paragraphs']:
                    words = set(
                        word
                        for word in processing_text(para_json[key]).split()
                        if len(word) >= min_len and not word.isdigit()
                        and not word[0].isupper())
                    ocr_errors.update(words.difference(self.lexicon))
            else:
                words = set(word
                            for word in processing_text(doc[key]).split()
                            if len(word) >= min_len and not word.isdigit()
                            and not word[0].isupper())
                ocr_errors.update(words.difference(self.lexicon))
        ocr_corrections = dict()
        for ocr_error in tqdm(list(ocr_errors),
                              position=0,
                              leave=True,
                              desc='Retrieving ocr corrections'):
            ocr_correction = self.ocr_correction(ocr_error,
                                                 topn_we=topn_we,
                                                 topn_nwe=topn_nwe,
                                                 weight_we=weight_we,
                                                 weight_nwe=weight_nwe,
                                                 topn_co=1)
            if len(ocr_correction) == 0:
                continue
            ocr_correction = [
                processing_correction(ocr_corr[0])
                for ocr_corr in ocr_correction
            ]
            ocr_corrections[ocr_error] = ocr_correction[0]
        new_data = list()
        for doc in tqdm(data,
                        position=0,
                        leave=True,
                        desc='Correcting ocr errors'):
            if '_id' in doc:
                del doc["_id"]
            if paragraphs_level:
                new_para_json = list()
                for para_json in doc['paragraphs']:
                    para_json[key] = " " + processing_text(para_json[key]) + " "
                    for token in para_json[key].split():
                        if token.lower() in ocr_corrections:
                            para_json[key] = para_json[key].replace(
                                " " + token.lower() + " ",
                                " " + ocr_corrections[token.lower()] + " ")
                    new_para_json.append(para_json)
                doc['paragraphs'] = new_para_json
            else:
                doc[key] = " " + processing_text(doc[key]) + " "
                for token in doc[key].split():
                    if token.lower() in ocr_corrections:
                        doc[key] = doc[key].replace(
                            " " + token.lower() + " ",
                            " " + ocr_corrections[token.lower()] + " ")
            new_data.append(doc)
        return new_data, ocr_corrections
from similarity.levenshtein import Levenshtein
from similarity.normalized_levenshtein import NormalizedLevenshtein
from similarity.cosine import Cosine

lev = Levenshtein()
nolev = NormalizedLevenshtein()
cosine = Cosine(4)

str1 = 'I enjoy playing football'
str2 = 'I love to play soccer'

print('Levenshtein distance:')
print(lev.distance(str1, str2))
print('Normalized Levenshtein similarity:')
print(nolev.similarity(str1, str2))
print('Cosine similarity:')
print(cosine.similarity(str1, str2))
data_1[i][1] = data_1[i][1].replace(", ", ',')
data_2[i][1] = data_2[i][1].replace(", ", ',')
data_1[i][1] = data_1[i][1].replace(",;", '')
data_2[i][1] = data_2[i][1].replace(",;", '')
data_1[i][1] = data_1[i][1].split(",")
data_2[i][1] = data_2[i][1].split(",")

sim_file = open(str(file_i.replace('.csv', '')) + '-' +
                str(file_j.replace('.csv', '')) + '.txt', 'w+', encoding="utf8")
print(str(file_i.replace('.csv', '')) + '-' +
      str(file_j.replace('.csv', '')) + '.txt: ' + str(files_increment))
# Compare every word of data_1 against every word of data_2 for the same row
# and keep the pairs whose normalized Levenshtein similarity is at least 0.85
for i in tqdm(range(60000)):
    for w_1 in data_1[i][1]:
        if (w_1 != 'NULL' and w_1 != 'null' and w_1 != ''):
            for w_2 in data_2[i][1]:
                if (w_2 != 'NULL' and w_2 != 'null' and w_2 != ''):
                    simVal = normalized_levenshtein.similarity(w_1, w_2)
                    if (simVal >= 0.85):
                        sim_file.writelines(str(i + 1) + ";" + w_1 + ";" + w_2 +
                                            ';' + str(simVal) + '\n')
sim_file.close()
    return SequenceMatcher(None, a, b).ratio()

# Definition of the 5 different types used to classify the sentences
equal = 0
close = 0
medium = 0
low = 0
different = 0
normalized_levenshtein = NormalizedLevenshtein()
norm_dist = []
# Loop to analyze each pair of sentences and count the number of occurrences of each type
for sent_nb in range(len(clean)):
    Ratio = normalized_levenshtein.similarity(clean[sent_nb], clean_asr[sent_nb])
    norm_dist.append(Ratio)
    if (Ratio == 1):
        equal += 1
    if (Ratio > 0.9):
        close += 1
    if (Ratio <= 0.9 and Ratio > 0.7):
        medium += 1
    if (Ratio <= 0.7 and Ratio > 0.5):
        low += 1
    if (Ratio <= 0.5):
        different += 1
# Write the results of the comparisons to the output file
for dist in norm_dist:
    result.write(str(dist) + "\n")
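As a quick, hedged sanity check of the two scores used above (assuming the same similarity package imported in the earlier snippet), SequenceMatcher.ratio() scores matching blocks while NormalizedLevenshtein.similarity() is one minus the edit distance divided by the longer string's length, so the two values generally differ:

from difflib import SequenceMatcher
from similarity.normalized_levenshtein import NormalizedLevenshtein

a, b = "kitten", "sitting"
print(SequenceMatcher(None, a, b).ratio())       # block-matching ratio
print(NormalizedLevenshtein().similarity(a, b))  # 1 - 3/7 ≈ 0.571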