Пример #1
0
def find_most_similar_str(str_list, s):
    """Return the entry of `str_list` closest to `s` and its distance.

    Every candidate in `str_list` is scored against `s` with
    `lev_distance`; the candidate with the smallest score wins (the
    first one, when several tie).

    Returns:
        A `(best_match, distance)` tuple, where `distance` is the
        element taken from the NumPy array of scores.
    """
    scores = np.array([lev_distance(s, candidate) for candidate in str_list])
    best_ix = scores.argmin()
    return str_list[best_ix], scores[best_ix]
Пример #2
0
    def search(self, filename):
        """Match every window of the fan work in `filename` against the index.

        The file is read as UTF-8 and parsed with `sp`; its token vectors
        (from `mk_vectors`) are grouped into flattened sliding windows of
        `self.window_size` tokens. Each window is looked up with
        `self.engine.neighbours`, and every neighbour closer than
        `self.distance_threshold` yields one record per word of the window.

        Side effects:
            Increments `self._windows_processed` once per window.

        Returns:
            A sorted list of records, one per `(filename, fan_word_ix)`
            position that was matched at least once. Where several matches
            cover the same position, the record with the smallest combined
            distance (`distance * lev_d`, the last field) is kept.
        """
        # Use the same window size the window vectors below are built with;
        # a bare `window_size` name is not defined anywhere in this method.
        window_size = self.window_size

        with open(filename, encoding='utf8') as fan_file:
            fan = sp(fan_file.read())

        # Create the fan windows: one flattened vector per run of
        # `window_size` consecutive token vectors.
        fan_vectors = mk_vectors(fan)
        fan_win_vectors = numpy.array([
            fan_vectors[i:i + window_size, :].ravel()
            for i in range(fan_vectors.shape[0] - window_size + 1)
        ])

        duplicate_records = defaultdict(list)
        for fan_ix, row in enumerate(fan_win_vectors):
            self._windows_processed += 1

            # Extract data about the original script embedded in the
            # engine's results, keeping only sufficiently close matches.
            matches = [(match_ix, match_str, distance)
                       for _vec, (match_ix, match_str), distance
                       in self.engine.neighbours(row)
                       if distance < self.distance_threshold]
            if not matches:
                continue

            # The fan-side text depends only on the window position, so
            # build it once rather than once per match.
            fan_context = str(fan[fan_ix:fan_ix + window_size])

            # Create a new record with original script
            # information and fan work information.
            for match_ix, match_str, distance in matches:
                lev_d = lev_distance(match_str, fan_context)

                for window_ix in range(window_size):
                    fan_word_ix = fan_ix + window_ix
                    fan_word = fan[fan_word_ix].orth_
                    fan_orth_id = fan[fan_word_ix].orth

                    orig_word_ix = match_ix + window_ix
                    orig_word = self.word_lowercase[orig_word_ix]
                    orig_orth_id = self.orth_id[orig_word_ix]
                    char = self.character[orig_word_ix]
                    scene = self.scene[orig_word_ix]

                    duplicate_records[(filename, fan_word_ix)].append(
                        # NOTE: This **must** match the definition
                        #       of `record_structure` (defined elsewhere
                        #       in the file).
                        [
                            filename, fan_word_ix, fan_word, fan_orth_id,
                            orig_word_ix, orig_word, orig_orth_id, char, scene,
                            distance, lev_d, distance * lev_d
                        ])

        # To deduplicate duplicate_records, we pick the single best match
        # per fan word, as measured by the combined distance (field 11) of
        # the n-gram match that produced the record.
        best_combined = itemgetter(11)
        return sorted(min(candidates, key=best_combined)
                      for candidates in duplicate_records.values())
Пример #3
0
def _find_other_entries_with_name(ocr_entries, entry_with_name_bot_left):
    """Collect OCR entries whose text nearly matches the reference entry.

    An entry is kept when the `lev_distance` between its text and the
    reference entry's text is at most 2 and its bounding coordinates
    (`min_x`, `max_x`, `min_y`, `max_y`) are not all equal to the
    reference's, so the reference entry itself is left out.

    Returns:
        A list of the matching entries, in their original order.
    """
    reference = entry_with_name_bot_left

    def _same_bounds(candidate):
        # True only when all four bounding coordinates coincide.
        return (candidate.max_x == reference.max_x
                and candidate.min_x == reference.min_x
                and candidate.max_y == reference.max_y
                and candidate.min_y == reference.min_y)

    similar = []
    for candidate in ocr_entries:
        # Text similarity is checked first; bounds are compared only for
        # entries that pass it.
        text_is_close = lev_distance(candidate.text, reference.text) <= 2
        if text_is_close and not _same_bounds(candidate):
            similar.append(candidate)
    return similar
Пример #4
0
def get_lev_distance(shows):
    """Attach a title-distance lookup to every show in `shows`.

    Each show receives a fresh `lev_distance` dict mapping every other
    show's `japanese_title` to the `lev_distance` between that title and
    the show's own `japanese_title`. A distance already stored on the
    other show is copied instead of being recomputed.

    The shows are modified in place; nothing is returned.
    """
    # Start every show with an empty lookup so the reuse check below can
    # safely inspect any other show's dict.
    for entry in shows:
        entry.lev_distance = {}

    for current in shows:
        for candidate in shows:
            # A show is never compared with itself.
            if candidate == current:
                continue

            known = candidate.lev_distance
            if current.japanese_title in known:
                # The other show already holds this pair's distance.
                value = known[current.japanese_title]
            else:
                value = lev_distance(current.japanese_title,
                                     candidate.japanese_title)
            current.lev_distance[candidate.japanese_title] = value
Пример #5
0
 def get_lev_dist_list(query, data):
     """Return the `lev_distance` of each item of `query` to `data`.

     The result is a list with one distance per item, in iteration order.
     """
     distances = []
     for item in query:
         distances.append(lev_distance(item, data))
     return distances
Пример #6
0
def levenshtein_distance(string1, string2):
  """Return `lev_distance(string1, string2)` unchanged."""
  return lev_distance(string1, string2)