def match_business_tf(text, list_of_company_names, tf):
    """Find the company name that best matches a block of petitioner text.

    A company whose full name occurs verbatim in ``text`` is returned
    immediately with a score of 1. Otherwise every company is scored word by
    word: for each word of the company name, the most similar word of
    ``text`` (normalized Levenshtein similarity) is found and
    ``similarity / tf[text_word]`` is added to the company's score, which
    starts at -1.

    Parameters:
        text (str): Text to search for a company name.
        list_of_company_names (iterable of str): Candidate company names.
        tf (mapping): Weight lookup indexed by words of ``text``; the inverse
            of the looked-up value scales each word's contribution.

    Returns:
        tuple: ``(score, company_name)`` of the best candidate, or
        ``(-1, nan)`` when no candidate scores above -1 (including when
        ``text`` contains no words or the candidate list is empty).
    """
    # Tokenise once: the text is identical for every company and company word.
    text_words = text.split()

    max_overall_score, max_overall_company_name = -1, np.nan
    for company in list_of_company_names:
        # A verbatim occurrence short-circuits the fuzzy scoring entirely.
        if company in text:
            return 1, company

        # Without any text words there is nothing to compare against; looking
        # up a placeholder in ``tf`` would only raise KeyError.
        if not text_words:
            continue

        max_company_score = -1
        for company_word in company.split():
            # max() keeps the first maximal pair, matching a strict ">" scan.
            best_score, best_word = max(
                (
                    (levenshtein.normalized_similarity(word, company_word), word)
                    for word in text_words
                ),
                key=lambda pair: pair[0],
            )
            max_company_score += (1 / tf[best_word]) * best_score

        if max_company_score > max_overall_score:
            max_overall_score = max_company_score
            max_overall_company_name = company

    return max_overall_score, max_overall_company_name
def check_similar_string(self, translation_string):
    """Return the first stored key that is similar enough to the given string.

    Both the input and each key of ``self._dict_of_strings`` are passed
    through ``self.clean_string`` before being compared with the normalized
    Levenshtein similarity.

    Returns:
        tuple: ``(key, similarity)`` for the first key whose similarity is at
        least ``self.threshold``, or ``(None, None)`` when no key qualifies.
    """
    needle = self.clean_string(translation_string)
    scored_keys = (
        (candidate,
         levenshtein.normalized_similarity(needle, self.clean_string(candidate)))
        for candidate in self._dict_of_strings
    )
    # Lazily scan the keys in iteration order and stop at the first hit.
    hits = (pair for pair in scored_keys if pair[1] >= self.threshold)
    return next(hits, (None, None))
def compare_entities(self, other_entity):
    """Compute a weighted similarity score between this entity and another.

    The score is the sum of four terms weighted by ``entity_weightage``:
    the fraction of attribute pairs that match ('Attributes'), a bonus when
    both entities have the same ``strong_entity`` value ('Type'), the
    normalized Levenshtein similarity of the names ('Name'), and the 'Keys'
    weight, which is currently always granted (key comparison is still TODO).
    """
    # Count every (own attribute, other attribute) pair that compares as equal.
    matching_pairs = sum(
        1
        for own_attr, their_attr in product(self.attributes, other_entity.attributes)
        if own_attr.compare_attributes(their_attr)
    )

    # Guard against division by zero for entities without attributes.
    if self.number_of_attributes > 0:
        attribute_ratio = matching_pairs / self.number_of_attributes
    else:
        attribute_ratio = 0

    total = 0
    total += attribute_ratio * entity_weightage['Attributes']

    same_type = self.strong_entity == other_entity.strong_entity
    if same_type:
        total += entity_weightage['Type']

    name_similarity = lev.normalized_similarity(self.name, other_entity.name)
    total += entity_weightage['Name'] * name_similarity

    # TODO - Compare Keys
    total += entity_weightage['Keys']
    return total
def check_for_changes(self) -> float:
    """Compare the stored page texts at indices 0 and 1.

    Prints a notice containing the page URL whenever the two texts are not
    identical.

    Returns
    -------
    float
        Normalized Levenshtein similarity of the two texts; 1 is returned
        without comparing anything when fewer than two versions are stored.
    """
    # Nothing to compare against when only one version exists.
    if self.num_pages_in_db < 2:
        return 1

    similarity = levenshtein.normalized_similarity(
        self.get_text(0), self.get_text(1))
    texts_differ = similarity < 1
    if texts_differ:
        print(f"{self.url} has changed. Similarity is: {similarity!s}")
    return similarity
def _read_transcription_lines(filepath):
    """Return the lines of ``filepath``, or None (after printing an error) if it cannot be read."""
    try:
        with open(filepath) as handle:
            return handle.readlines()
    except KeyboardInterrupt:
        print("KeyboardInterrupt detected!")
        exit()
    except IOError:
        print("Error: File {} does not appear to exist.".format(filepath))
        return None


def create_validation_file(input_file1, input_file2, prefix_filepath, output_file):
    """
    Given two files containing different transcriptions of audio files, this function
    calculates the normalized Levenshtein similarity between the sentences, saving the
    result in a third file.

        Parameters:
        input_file1 (str): First filepath. The contents of the file must follow the template: "filename | text"
        input_file2 (str): Second filepath. The contents of the file must follow the template: "filename | text"
        prefix_filepath (str): Prefix to be added to the file path within the output file.
        output_file (str): Filepath of the result. Its parent folder is created if missing.
            The content of the file follows the template:
            prefix_filepath/filename | text1 | text2 | similarity

        Returns:
        bool: True when the output file was written; False when an input file could not
            be read, the inputs differ in number of lines, or the output file could not
            be created.
    """
    # Loads the contents of both input files.
    content_file1 = _read_transcription_lines(input_file1)
    if content_file1 is None:
        return False
    content_file2 = _read_transcription_lines(input_file2)
    if content_file2 is None:
        return False

    # Both files must be the same length, otherwise there is an error.
    if len(content_file1) != len(content_file2):
        print("Error: length File {} not equal to File {}.".format(
            input_file1, input_file2))
        return False

    # Creates the output folder if needed. dirname() is '' for a bare file
    # name, which means "current directory" and must not be passed to makedirs.
    output_folderpath = dirname(output_file)
    if output_folderpath and not exists(output_folderpath):
        makedirs(output_folderpath)

    # Opens the output file; only failures to create it are reported here.
    try:
        o_file = open(output_file, 'w')
    except KeyboardInterrupt:
        print("KeyboardInterrupt detected!")
        exit()
    except IOError:
        print("Error: creating File {} problem.".format(output_file))
        return False

    separator = '|'
    # The context manager closes the file even if a line fails to parse.
    with o_file:
        header = separator.join(
            ['filename', 'text', 'transcript', 'similarity'])
        o_file.write(header + '\n')

        # Input files must be csv files with the character "|" as a separator: filename | text
        for line1, line2 in tqdm(zip(content_file1, content_file2), total=len(content_file1)):
            file1, text1 = line1.split('|')
            _file2, text2 = line2.split('|')

            # Clears sentences by removing unwanted characters.
            clean_text1 = clear_sentences(text1)
            clean_text2 = clear_sentences(text2)
            filepath = join(prefix_filepath, file1)

            # Normalized similarity (0-1) between the two cleaned sentences.
            similarity = levenshtein.normalized_similarity(
                clean_text1, clean_text2)

            # Defines the output content and writes to a file.
            line = separator.join(
                [filepath, text1.strip(), text2.strip(), str(similarity)])
            o_file.write(line + '\n')

    return True
def words_similarity(self, w1, w2):
    """Return the mean of the normalized Levenshtein and Jaro-Winkler similarities of two words."""
    edit_score = levenshtein.normalized_similarity(w1, w2)
    jaro_score = jaro_winkler.normalized_similarity(w1, w2)
    return 0.5 * (edit_score + jaro_score)
def compare_attributes(self, other_attr):
    """Return True when the other attribute has a near-identical name and equals this one.

    The names must have a normalized Levenshtein similarity above 0.9, and
    ``self == other_attr`` must hold as well; otherwise False is returned.
    """
    names_match = lev.normalized_similarity(other_attr.name, self.name) > .9
    # Equality is only evaluated once the names are close enough.
    return bool(names_match and self == other_attr)
def __eq__(self, other):
    """Compare two entities by name similarity, type and attribute count.

    Two entities are equal when their names have a normalized Levenshtein
    similarity above 0.9 and both ``strong_entity`` and
    ``number_of_attributes`` are equal. Objects that lack any of the compared
    fields yield ``NotImplemented`` so Python can fall back to its default
    comparison instead of raising ``AttributeError``.
    """
    compared_fields = ("name", "strong_entity", "number_of_attributes")
    if not all(hasattr(other, field) for field in compared_fields):
        return NotImplemented

    if lev.normalized_similarity(other.name, self.name) > .9:
        # TODO - Compare Keys
        if (self.strong_entity == other.strong_entity
                and self.number_of_attributes == other.number_of_attributes):
            return True
    return False