예제 #1
0
def match_business_tf(text, list_of_company_names, tf):
    """Find the best-matching company name for one wall of petitioner text.

    Each word of every candidate company name is matched against every word
    of *text* with normalized Levenshtein similarity; the best per-word
    scores are accumulated, weighted by inverse term frequency
    (``1 / tf[matched_word]``) so rarer words count more.

    Parameters
    ----------
    text : str
        Petitioner name text to search.
    list_of_company_names : iterable of str
        Candidate company names.
    tf : mapping
        Term-frequency lookup keyed by words of *text*.

    Returns
    -------
    tuple
        ``(1, company)`` when a company name occurs verbatim in *text*;
        otherwise ``(best_score, best_company)``; ``(-1, nan)`` when no
        candidate could be scored at all.
    """
    max_overall_score, max_overall_company_name = -1, np.nan
    text_words = text.split()  # hoisted: invariant across all companies
    for company in list_of_company_names:
        # Exact substring containment short-circuits as a perfect match.
        if company in text:
            return 1, company
        # BUG FIX: the accumulator previously started at -1, which biased
        # every fuzzy score down by 1 and made even a perfect per-word match
        # inconsistent with the substring shortcut above.
        company_score, company_name = 0.0, company
        for company_word in company.split():
            best_word_score, best_word = -1, np.nan
            for word in text_words:
                word_similarity_score = levenshtein.normalized_similarity(
                    word, company_word)
                if word_similarity_score > best_word_score:
                    best_word_score = word_similarity_score
                    best_word = word
            if best_word_score < 0:
                # *text* had no words: nothing matched, and looking up
                # tf[nan] would raise a KeyError in the original code.
                continue
            company_score += (1 / tf[best_word]) * best_word_score
        if company_score > max_overall_score:
            max_overall_score = company_score
            max_overall_company_name = company_name
    return max_overall_score, max_overall_company_name
예제 #2
0
 def check_similar_string(self, translation_string):
     """Return the first stored key similar enough to *translation_string*.

     Both sides are normalized with ``self.clean_string`` before comparing.
     Returns ``(key, similarity)`` for the first key whose normalized
     Levenshtein similarity reaches ``self.threshold``; ``(None, None)``
     when no key qualifies.
     """
     needle = self.clean_string(translation_string)
     for candidate in self._dict_of_strings:
         score = levenshtein.normalized_similarity(
             needle, self.clean_string(candidate))
         if score >= self.threshold:
             return candidate, score
     return None, None
예제 #3
0
 def compare_entities(self, other_entity):
     """Score how similar this entity is to *other_entity*.

     The score combines four weighted components from ``entity_weightage``:
     matching attributes, matching entity type (strong/weak), fuzzy name
     similarity, and a keys component (currently always added in full —
     see the TODO below).
     """
     matching_attrs = sum(
         1
         for attr1, attr2 in product(self.attributes, other_entity.attributes)
         if attr1.compare_attributes(attr2))
     if self.number_of_attributes > 0:
         attribute_score = matching_attrs / self.number_of_attributes
     else:
         attribute_score = 0
     similarity = attribute_score * entity_weightage['Attributes']
     if self.strong_entity == other_entity.strong_entity:
         similarity += entity_weightage['Type']
     name_score = lev.normalized_similarity(self.name, other_entity.name)
     similarity += entity_weightage['Name'] * name_score
     # TODO - Compare Keys
     similarity += entity_weightage['Keys']
     return similarity
예제 #4
0
    def check_for_changes(self) -> float:
        """Checks for changes between the last two versions of the page.

        Returns
        -------
        float
            Similarity between 0 and 1.
        """
        # Do not check if only one version exists.
        if self.num_pages_in_db < 2:
            # BUG FIX: return a float (1.0, not int 1) so every path honours
            # the declared ``-> float`` return type.
            return 1.0

        t0 = self.get_text(0)
        t1 = self.get_text(1)
        sim = levenshtein.normalized_similarity(t0, t1)
        if sim < 1:
            print(f"{self.url} has changed. Similarity is: " + str(sim))
        return sim
예제 #5
0
def create_validation_file(input_file1, input_file2, prefix_filepath,
                           output_file):
    """
    Given two files containing different transcriptions of audio files, this function calculates the similarity (levenshtein distance) between the sentences,
    saving the result in a third file.

        Parameters:
        input_file1 (str): First filepath. The contents of the file must follow the template: "filename | text"
        input_file2 (str): Second filepath. The contents of the file must follow the template: "filename | text"
        prefix_filepath: Prefix to be added to the file path within the output file.
        output_file (str): Output filepath. The content of the file follows the template: prefix_filepath/filename | text1 | text2 | similarity

        Returns:
        bool: True on success, False on any I/O or validation error.
    """

    # Loads the contents of the first input file
    try:
        with open(input_file1) as f:
            content_file1 = f.readlines()

    except KeyboardInterrupt:
        print("KeyboardInterrupt detected!")
        exit()

    except IOError:
        print("Error: File {} does not appear to exist.".format(input_file1))
        return False

    # Loads the contents of the second input file
    try:
        with open(input_file2) as g:
            content_file2 = g.readlines()

    except KeyboardInterrupt:
        print("KeyboardInterrupt detected!")
        exit()

    except IOError:
        print("Error: File {} does not appear to exist.".format(input_file2))
        return False

    # Both files must be the same length, otherwise there is an error.
    if not (len(content_file1) == len(content_file2)):
        # BUG FIX: report the file *paths*, not the full contents lists,
        # and fix the "igual" typo in the message.
        print("Error: length File {} not equal to File {}.".format(
            input_file1, input_file2))
        return False

    # Checks if the output folder exists
    output_folderpath = dirname(output_file)

    # BUG FIX: dirname() is '' when output_file has no directory component;
    # makedirs('') raises, so only create a folder when one is named.
    if output_folderpath and not exists(output_folderpath):
        makedirs(output_folderpath)

    # Saves the result to the output file.
    try:
        o_file = open(output_file, 'w')

    except KeyboardInterrupt:
        print("KeyboardInterrupt detected!")
        exit()

    except IOError:
        print("Error: creating File {} problem.".format(output_file))
        # BUG FIX: return before any finally block runs — the original's
        # try/finally closed o_file even when open() failed, raising
        # NameError on the unbound name.
        return False

    # Iterate over the two files content simultaneously to calculate the similarity between the sentences.
    try:
        separator = '|'
        header = separator.join(
            ['filename', 'text', 'transcript', 'similarity'])
        o_file.write(header + '\n')

        # Input files must be csv files with the character "|" as a separator: filename | text
        for line1, line2 in tqdm(zip(content_file1, content_file2),
                                 total=len(content_file1)):

            # BUG FIX: split only on the first '|' so a '|' inside the
            # transcription text does not break the unpacking.
            file1, text1 = line1.split('|', 1)
            file2, text2 = line2.split('|', 1)

            # Clears sentences by removing unwanted characters.
            clean_text1 = clear_sentences(text1)
            clean_text2 = clear_sentences(text2)
            filepath = join(prefix_filepath, file1)

            # Calculates the levenshtein distance to define the normalized similarity (0-1) between two sentences.
            l = levenshtein.normalized_similarity(clean_text1, clean_text2)

            # Defines the output content and writes to a file.
            line = separator.join(
                [filepath, text1.strip(),
                 text2.strip(), str(l)])
            o_file.write(line + '\n')

    finally:
        # Close the handle even if a line fails to parse mid-loop.
        o_file.close()

    return True
예제 #6
0
	def words_similarity(self, w1, w2):
		"""Average the normalized Levenshtein and Jaro-Winkler similarities of two words."""
		lev_score = levenshtein.normalized_similarity(w1, w2)
		jw_score = jaro_winkler.normalized_similarity(w1, w2)
		return 1/2*(lev_score + jw_score)
예제 #7
0
 def compare_attributes(self, other_attr):
     """True when the attribute names are >90% similar and the attributes compare equal."""
     name_similarity = lev.normalized_similarity(other_attr.name, self.name)
     if name_similarity > .9 and self == other_attr:
         return True
     return False
예제 #8
0
 def __eq__(self, other):
     """Entities are equal when their names are >90% similar and their
     structure (strong/weak flag and attribute count) matches."""
     name_similarity = lev.normalized_similarity(other.name, self.name)
     if name_similarity <= .9:
         return False
     # TODO - Compare Keys
     if self.strong_entity == other.strong_entity and self.number_of_attributes == other.number_of_attributes:
         return True
     return False