def __init__(self, app_config):
    """Initialize the inline correction model.

    Builds every datastructure used by the inline corrector (dictionary,
    unigram/bigram tables, alternative-case map, OCR-key map and anagram
    map) from the file locations described in *app_config*.

    Args:
        app_config (dict): Application configuration.
    """
    super(InlineModel, self).__init__(app_config)
    models_dir = join(app_config["root"],
                      app_config["dirs"]["models_root"],
                      app_config["dirs"]["models"]["inline"])
    model_files = app_config["models"]["inline"]
    tmp_ext = app_config["exts"]["tmp"]

    def model_path(key):
        # Absolute location of one inline model file.
        return join(models_dir, model_files[key])

    self.dictionary = Dictionary(model_path("dictionary"))
    self.unigrams = Unigrams(model_path("unigrams"))
    self.tmp_unigrams_filename = self.unigrams.filename + tmp_ext
    self.bigrams = Bigrams(model_path("bigrams"))
    self.altcase_map = AltCaseMap(model_path("altcase"))
    self.tmp_altcase_filename = self.altcase_map.filename + tmp_ext
    self.ocrkey_map = OcrKeyMap(model_path("ocr_keys"))
    self.anagram_map = AnagramMap(model_path("anagrams"))
def correct(self, text_data):
    """Correct text data

    Classifies every line whose grade is not yet set (grade not a
    multiple of 5) with the machine-learning classifier, multiplying the
    predicted class by 5 to map it back onto the grade scale.

    Args:
        text_data (`Text`): Text data
    """
    inline_dir = join(self.config["root"],
                      self.config["dirs"]["models_root"],
                      self.config["dirs"]["models"]["inline"])
    unigrams = Unigrams(
        join(inline_dir, self.config["models"]["inline"]["unigrams"]))
    # NOTE(review): unlike the unigrams path above, this path does not
    # include config["root"] — `train` saves to the same root-less
    # location, so it is internally consistent, but confirm that
    # "models_root" is meant to resolve against the working directory.
    ml_classifier = load(
        join(self.config["dirs"]["models_root"],
             self.config["dirs"]["models"]["learning"],
             self.config["models"]["learning"]["classifier"]))
    if ml_classifier is None:
        # No trained classifier on disk: nothing we can do.
        return
    self.model["algo"].set_classifier(ml_classifier)
    for paragraph in text_data.text:
        for line in paragraph:
            if line.grade % 5 == 0:
                # Line already carries a classified grade: keep it.
                continue
            extractor = MachineLearningFeatures()
            features = extractor.extract_features(
                line, unigrams.ngrams, text_data.stats)
            line.grade = self.model["algo"].classify(features) * 5
def train(self, dataset):
    """Train the model with a dataset

    Extends the persisted training set with features extracted from every
    already-classified line of *dataset*, saves the extended set, then
    retrains the classifier and saves it.

    Args:
        dataset (list): List of training files
    """
    # Get the original training set
    training_set = self.model["algo"].training_set
    # The unigram model is identical for every text: load it once here
    # instead of re-reading it from disk on each loop iteration (it was
    # previously rebuilt inside the `for text in dataset` loop).
    unigrams = Unigrams(
        join(
            self.config["root"],
            self.config["dirs"]["models_root"],
            self.config["dirs"]["models"]["inline"],
            self.config["models"]["inline"]["unigrams"],
        ))
    # Append the new data to it
    for text in dataset:
        self.logger.debug("Processing " + text.filename + "...")
        for p in text.text:
            for line in p:
                if line.grade % 5 != 0:
                    # Unclassified lines are useless for the training
                    continue
                f = MachineLearningFeatures()
                features = f.extract_features(line, unigrams.ngrams,
                                              text.stats)
                # Grades are stored as multiples of 5; the classifier
                # works on the class index (grade / 5).
                result = int(line.grade / 5)
                training_set["features"].append(features)
                training_set["results"].append(result)
    self.logger.debug("Saving training set...")
    save(
        training_set,
        join(self.config["dirs"]["models_root"],
             self.config["dirs"]["models"]["learning"],
             self.config["models"]["learning"]["training_set"]))
    self.logger.debug("Training model...")
    # NOTE(review): loss="log" and class_weight="auto" are only accepted
    # by old scikit-learn releases (loss was renamed "log_loss", "auto"
    # was replaced by "balanced") — confirm the pinned sklearn version.
    ml_classifier = SGDClassifier(loss="log", class_weight="auto")
    self.model["algo"].set_classifier(ml_classifier)
    self.model["algo"].set_training_set(training_set["features"],
                                        training_set["results"])
    self.model["algo"].train()
    save(
        self.model["algo"].classifier,
        join(self.config["dirs"]["models_root"],
             self.config["dirs"]["models"]["learning"],
             self.config["models"]["learning"]["classifier"]))
def load(self, text_data):
    """Load text data to the model

    Merges the unigram, bigram, alternative-case, OCR-key and anagram
    datastructures extracted from *text_data* into the persisted inline
    models, then deletes the temporary files used during the merge.

    Args:
        text_data (`Text`): Text data
    """
    if self.is_preprocessed(text_data.filename) != 0:
        self.logger.debug(text_data.filename +
                          " already loaded: skipping it.")
        return
    tmp_u = Unigrams(self.tmp_unigrams_filename)
    word_list = tmp_u.append_data(text_data)
    self.bigrams.append_data(word_list)
    tmp_ac = AltCaseMap(self.tmp_altcase_filename)
    tmp_ac.append_data(tmp_u.raw_unigrams)
    tmp_u.generate_low_case(tmp_ac.altcase_map)
    self.ocrkey_map.append_data(tmp_u.raw_unigrams)

    # Updating data
    self.unigrams.raw_unigrams += tmp_u.raw_unigrams
    self.unigrams.ngrams += tmp_u.ngrams
    # 0.7 is the pruning threshold — presumably a relative frequency
    # cutoff; see Unigrams.prune for its exact meaning (TODO confirm).
    self.unigrams.prune(0.7)
    self.unigrams.save()

    # Merge the temporary alt-case map into the persisted one, unioning
    # the value sets of keys present in both maps.
    # (The previous implementation concatenated dict.keys()/dict.items()
    # with "+", which only works on Python 2; this form is equivalent
    # and works on both Python 2 and 3.)
    combine_struct = {}
    for altcase_map in (tmp_ac.altcase_map, self.altcase_map.altcase_map):
        for key, value in altcase_map.items():
            combine_struct.setdefault(key, set()).update(value)
    self.altcase_map.altcase_map = combine_struct
    self.altcase_map.prune(self.unigrams.ngrams_pruned)
    self.altcase_map.save()

    # The temporary on-disk structures are no longer needed.
    unlink(self.tmp_unigrams_filename)
    unlink(self.tmp_altcase_filename)

    self.anagram_map.append_data(self.bigrams.ngrams_pruned,
                                 self.unigrams.ngrams_pruned)
    self.dictionary.append_data(self.unigrams.ngrams_pruned)
    self.logger.info(text_data.filename + "'s datastructures loaded")