def append_data(self, unigrams):
    """Merge dictionary words found in *unigrams* into ``self.ocrkey_map``.

    Loads the aspell word list, keeps only the unigrams that are real
    dictionary words, indexes them by their OCR-key string, merges that
    index into the existing ``self.ocrkey_map``, and persists the result
    via ``self.save()``.

    Parameters:
        unigrams (dict): Unigram occurrence map; only its keys are used.
    """
    aspell_dict = "models/aspell.en.dict"
    with open(aspell_dict, "r") as f:
        word_set = {line.strip("\r\n") for line in f}

    # Words present both in the unigram map and in the aspell dictionary.
    known_words = set(unigrams.keys()) & word_set

    # Group every known word under its OCR-key string.
    ocr_key_map = {}
    for word in known_words:
        h_str = ocr_key_list_to_str(ocr_key_hash(word))
        ocr_key_map.setdefault(h_str, set()).add(word)

    # BUGFIX: the original used `dict.keys() + dict.keys()` and
    # `dict.items() + dict.items()`, which raises TypeError on Python 3
    # (dict views do not support `+`). Merge with set union instead.
    combine_struct = {key: set() for key in set(self.ocrkey_map) | set(ocr_key_map)}
    for mapping in (self.ocrkey_map, ocr_key_map):
        for key, value in mapping.items():
            combine_struct[key] |= value

    self.ocrkey_map = combine_struct
    self.save()
def append_data(self, unigrams):
    """Add dictionary-backed unigrams to the OCR-key map and save it.

    Reads the aspell word list, intersects it with the unigram keys,
    buckets the surviving words by OCR-key string, unions those buckets
    into ``self.ocrkey_map``, and calls ``self.save()``.

    Parameters:
        unigrams (dict): Unigram occurrence map; only its keys are used.
    """
    aspell_dict = "models/aspell.en.dict"
    with open(aspell_dict, "r") as f:
        dictionary_words = {line.strip("\r\n") for line in f}

    # Compute the intersection once (the original recomputed it twice).
    valid_words = set(unigrams.keys()).intersection(dictionary_words)

    new_map = {}
    for word in valid_words:
        key = ocr_key_list_to_str(ocr_key_hash(word))
        new_map.setdefault(key, set()).add(word)  # add the word to its bucket

    # BUGFIX: `.keys() + .keys()` / `.items() + .items()` is invalid on
    # Python 3 — dict views cannot be concatenated. Merge via setdefault.
    merged = {}
    for mapping in (self.ocrkey_map, new_map):
        for key, words in mapping.items():
            merged.setdefault(key, set()).update(words)

    self.ocrkey_map = merged
    self.save()
def select_ocrsims(token, structures):
    """Select similar words for a given token

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Similar words (keys) along with their score (values)
    """
    delta = 2
    word_hash = ocr_key_hash(token)

    # Candidate lists keyed by OCR-key string; a dict prevents duplicate
    # entries when the same key is generated twice.
    candidates = {}
    for position, (key, value) in enumerate(word_hash):
        variant_hash = deepcopy(word_hash)
        for offset in range(-delta, delta + 1):
            if offset == 0:
                continue  # skip the unmodified hash
            card = max(int(value) + offset, 1)
            variant_hash[position] = (key, card)
            # Rebuild the OCR key string for this variant.
            variant_str = "".join(k + str(v) for k, v in variant_hash)
            if variant_str in structures["ocrkeys"]:
                card_diff = abs(int(value) - card)
                candidates[variant_str] = [
                    (sim_word, card_diff)
                    for sim_word in structures["ocrkeys"][variant_str]
                    if edit_distance(sim_word, token) <= 2
                ]

    # Keep only candidates with a strictly positive similarity score.
    ocr_sims = {}
    for sim_list in candidates.values():
        for sim_word, card_diff in sim_list:
            score = rate_ocr_key(structures["occurence_map"], token, sim_word, card_diff)
            if score > 0:
                ocr_sims[sim_word] = score
    return ocr_sims
def select_ocrsims(token, structures):
    """Select similar words for a given token

    Parameters:
        token (:func:`str`): Cleaned token
        structures (:func:`dict`): Datastructures from file

    Returns:
        :func:`dict` - Similar words (keys) along with their score (values)
    """
    delta = 2
    token_hash = ocr_key_hash(token)

    # Using a dictionary avoids multiple entries if a key is retrieved twice.
    matches_by_key = {}
    for idx, (letter, count) in enumerate(token_hash):
        mutated = deepcopy(token_hash)
        for shift in range(-delta, delta + 1):
            if shift == 0:
                continue  # the original hash itself is not a variant
            new_count = max(int(count) + shift, 1)
            mutated[idx] = (letter, new_count)
            # Serialize the mutated hash back into an OCR key string.
            parts = [k + str(v) for k, v in mutated]
            key_str = "".join(parts)
            if key_str in structures["ocrkeys"]:
                diff = abs(int(count) - new_count)
                close_words = []
                for candidate in structures["ocrkeys"][key_str]:
                    if edit_distance(candidate, token) <= 2:
                        close_words.append((candidate, diff))
                matches_by_key[key_str] = close_words

    # Score each candidate; only positive scores are kept.
    ocr_sims = {}
    for word_list in matches_by_key.values():
        for candidate, diff in word_list:
            rating = rate_ocr_key(structures["occurence_map"], token, candidate, diff)
            if rating > 0:
                ocr_sims[candidate] = rating
    return ocr_sims