from itertools import chain
from os import unlink
from os.path import join

# NOTE: AbstractModel, Dictionary, Unigrams, Bigrams, AltCaseMap, OcrKeyMap,
# AnagramMap and the correction helpers (init_correction_map, select_anagrams,
# select_ocrsims, build_candidates_list, correct_case, rate_corrections,
# extract_paragraph_bigrams, apply_bigram_boost, select_correction,
# select_lower_edit_distance, select_best_alphabetical_word) are assumed to be
# imported from the surrounding project's modules.


class InlineModel(AbstractModel):
    """Model for inline data structures"""

    def __init__(self, app_config):
        super(InlineModel, self).__init__(app_config)

        inline_models_dir = join(
            app_config["root"],
            app_config["dirs"]["models_root"],
            app_config["dirs"]["models"]["inline"],
        )
        inline_models_key = app_config["models"]["inline"]

        self.dictionary = Dictionary(join(inline_models_dir, inline_models_key["dictionary"]))
        self.unigrams = Unigrams(join(inline_models_dir, inline_models_key["unigrams"]))
        self.tmp_unigrams_filename = self.unigrams.filename + app_config["exts"]["tmp"]
        self.bigrams = Bigrams(join(inline_models_dir, inline_models_key["bigrams"]))
        self.altcase_map = AltCaseMap(join(inline_models_dir, inline_models_key["altcase"]))
        self.tmp_altcase_filename = self.altcase_map.filename + app_config["exts"]["tmp"]
        self.ocrkey_map = OcrKeyMap(join(inline_models_dir, inline_models_key["ocr_keys"]))
        self.anagram_map = AnagramMap(join(inline_models_dir, inline_models_key["anagrams"]))

    def load(self, text_data):
        """Load text data into the model

        Args:
            text_data (`Text`): Text data
        """
        if self.is_preprocessed(text_data.filename) != 0:
            self.logger.debug(text_data.filename + " already loaded: skipping it.")
            return

        # Gather unigrams and bigrams from the new text in temporary structures
        tmp_u = Unigrams(self.tmp_unigrams_filename)
        word_list = tmp_u.append_data(text_data)
        self.bigrams.append_data(word_list)

        tmp_ac = AltCaseMap(self.tmp_altcase_filename)
        tmp_ac.append_data(tmp_u.raw_unigrams)
        tmp_u.generate_low_case(tmp_ac.altcase_map)

        self.ocrkey_map.append_data(tmp_u.raw_unigrams)

        # Merge the temporary counts into the persistent model, then prune
        self.unigrams.raw_unigrams += tmp_u.raw_unigrams
        self.unigrams.ngrams += tmp_u.ngrams
        self.unigrams.prune(0.7)
        self.unigrams.save()

        # Merge the two altcase maps, taking the union of each key's variants
        combine_struct = {
            key: set()
            for key in set(tmp_ac.altcase_map) | set(self.altcase_map.altcase_map)
        }
        for key, value in chain(tmp_ac.altcase_map.items(),
                                self.altcase_map.altcase_map.items()):
            combine_struct[key].update(value)

        self.altcase_map.altcase_map = combine_struct
        self.altcase_map.prune(self.unigrams.ngrams_pruned)
        self.altcase_map.save()

        unlink(self.tmp_unigrams_filename)
        unlink(self.tmp_altcase_filename)

        self.anagram_map.append_data(self.bigrams.ngrams_pruned, self.unigrams.ngrams_pruned)
        self.dictionary.append_data(self.unigrams.ngrams_pruned)

        self.logger.info(text_data.filename + "'s data structures loaded")

    def correct(self, text_data):
        """Correct text data

        Args:
            text_data (`Text`): Text data
        """
        correction_data = self.correction_data()

        for paragraph in text_data.text:
            for line in paragraph:
                for token in line.tokens:
                    token[2] = init_correction_map(token[1], correction_data["dictionary"])

                    # Skip the correction steps if the token is too short,
                    # already in the dictionary, or flagged as garbage
                    if token[2] is not None and len(token[2]) == 0:
                        anagrams = select_anagrams(token[1], correction_data)
                        ocr_sims = select_ocrsims(token[1], correction_data)

                        token[2] = build_candidates_list(token[1], anagrams, ocr_sims,
                                                         correction_data)
                        token[2] = correct_case(token[1], token[2], correction_data)
                        token[2] = rate_corrections(token[2])

                        if len(token[2]) == 0:  # No correction has been found
                            token[2] = None

            # Apply the bigram boost to the tokens
            bigrams = extract_paragraph_bigrams(paragraph)
            apply_bigram_boost(paragraph, bigrams, correction_data["occurence_map"])

            # Select the appropriate correction: keep the highest-rated
            # candidates, then break ties by edit distance and, failing that,
            # by alphabetical order
            for line in paragraph:
                for token in line.tokens:
                    token[2] = select_correction(token[1], token[2])

                    if token[2] is not None and len(token[2]) > 1:
                        tkn_list = [tkn for tkn, sc in token[2].items()
                                    if sc == max(token[2].values())]

                        if len(tkn_list) != 1:
                            tkn_list = select_lower_edit_distance(
                                token[1], {tkn: token[2][tkn] for tkn in tkn_list})

                        if len(tkn_list) != 1:
                            tkn_list = [select_best_alphabetical_word(token[1], tkn_list)]

                        token[2] = {tkn: token[2][tkn] for tkn in tkn_list}

    def correction_data(self):
        """Get the correction data

        Returns:
            dict: Correction data
        """
        return {
            "occurence_map": self.unigrams.ngrams + self.bigrams.ngrams,
            "altcase": self.altcase_map.altcase_map,
            "ocrkeys": self.ocrkey_map.ocrkey_map,
            "anagrams": self.anagram_map.anagram_hashmap,
            "alphabet": self.anagram_map.anagram_alphabet,
            "dictionary": self.dictionary.dictionary,
        }
class InlineModel(AbstractModel): """Model for inline data structures """ def __init__(self, app_config): super(InlineModel, self).__init__(app_config) inline_models_dir = join(app_config["root"], app_config["dirs"]["models_root"], app_config["dirs"]["models"]["inline"]) inline_models_key = app_config["models"]["inline"] self.dictionary = Dictionary( join(inline_models_dir, inline_models_key["dictionary"])) self.unigrams = Unigrams( join(inline_models_dir, inline_models_key["unigrams"])) self.tmp_unigrams_filename = self.unigrams.filename + app_config[ "exts"]["tmp"] self.bigrams = Bigrams( join(inline_models_dir, inline_models_key["bigrams"])) self.altcase_map = AltCaseMap( join(inline_models_dir, inline_models_key["altcase"])) self.tmp_altcase_filename = self.altcase_map.filename + app_config[ "exts"]["tmp"] self.ocrkey_map = OcrKeyMap( join(inline_models_dir, inline_models_key["ocr_keys"])) self.anagram_map = AnagramMap( join(inline_models_dir, inline_models_key["anagrams"])) def load(self, text_data): """Load text data to the model Args: text_data (`Text`): Text data """ if self.is_preprocessed(text_data.filename) != 0: self.logger.debug(text_data.filename + " already loaded: skipping it.") return tmp_u = Unigrams(self.tmp_unigrams_filename) word_list = tmp_u.append_data(text_data) self.bigrams.append_data(word_list) tmp_ac = AltCaseMap(self.tmp_altcase_filename) tmp_ac.append_data(tmp_u.raw_unigrams) tmp_u.generate_low_case(tmp_ac.altcase_map) self.ocrkey_map.append_data(tmp_u.raw_unigrams) # Updating data self.unigrams.raw_unigrams += tmp_u.raw_unigrams self.unigrams.ngrams += tmp_u.ngrams self.unigrams.prune(0.7) self.unigrams.save() combine_struct = { key: set() for key in tmp_ac.altcase_map.keys() + self.altcase_map.altcase_map.keys() } for key, value in tmp_ac.altcase_map.items( ) + self.altcase_map.altcase_map.items(): combine_struct[key] = combine_struct[key].union(value) self.altcase_map.altcase_map = combine_struct self.altcase_map.prune(self.unigrams.ngrams_pruned) self.altcase_map.save() unlink(self.tmp_unigrams_filename) unlink(self.tmp_altcase_filename) self.anagram_map.append_data(self.bigrams.ngrams_pruned, self.unigrams.ngrams_pruned) self.dictionary.append_data(self.unigrams.ngrams_pruned) self.logger.info(text_data.filename + "'s datastructures loaded") def correct(self, text_data): """Correct text data Args: text_data (`Text`): Text data """ correction_data = self.correction_data() for paragraph in text_data.text: for line in paragraph: for token in line.tokens: token[2] = init_correction_map( token[1], correction_data["dictionary"]) # Skip some correction steps if the token is too short, in the dictionary or already identified as # garbage if not token[2] is None and len(token[2]) == 0: anagrams = select_anagrams(token[1], correction_data) ocr_sims = select_ocrsims(token[1], correction_data) token[2] = build_candidates_list( token[1], anagrams, ocr_sims, correction_data) token[2] = correct_case(token[1], token[2], correction_data) token[2] = rate_corrections(token[2]) if len(token[2]) == 0: # No correction has been found token[2] = None # Applying the bigram boost to the tokens bigrams = extract_paragraph_bigrams(paragraph) apply_bigram_boost(paragraph, bigrams, correction_data["occurence_map"]) # Select the appropriate correction for line in paragraph: for token in line.tokens: token[2] = select_correction(token[1], token[2]) if token[2] is not None and len(token[2]) > 1: tkn_list = [ tkn for tkn, sc in token[2].items() if sc == max(token[2].values()) ] if 
len(tkn_list) != 1: tkn_list = select_lower_edit_distance( token[1], {tkn: token[2][tkn] for tkn in tkn_list}) if len(tkn_list) != 1: tkn_list = [ select_best_alphabetical_word( token[1], tkn_list) ] token[2] = {tkn: token[2][tkn] for tkn in tkn_list} def correction_data(self): """Get the correction data Returns: dict: Correction data """ return { "occurence_map": self.unigrams.ngrams + self.bigrams.ngrams, "altcase": self.altcase_map.altcase_map, "ocrkeys": self.ocrkey_map.ocrkey_map, "anagrams": self.anagram_map.anagram_hashmap, "alphabet": self.anagram_map.anagram_alphabet, "dictionary": self.dictionary.dictionary }
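

# Usage sketch: a minimal, hypothetical driver for InlineModel. The
# `load_app_config` helper and the `Text("...")` constructor signature are
# assumptions for illustration only; neither is defined in this module.
if __name__ == "__main__":
    app_config = load_app_config("config.yml")  # hypothetical config loader

    model = InlineModel(app_config)

    text = Text("page_001.txt")  # hypothetical wrapper around an OCR'd text
    model.load(text)     # fold the document's n-grams into the model's structures
    model.correct(text)  # annotate each token with its rated corrections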