def train_model(self):
    """Run one full training cycle for this dataset.

    Trains from scratch when no final model file exists yet, otherwise
    re-trains the existing model, then attaches the result to the live
    network dict.

    Returns:
        tuple(bool, str): ``(has_error, message)`` — ``has_error`` reflects
        the train/retrain step only (attach failures are reflected in the
        message, preserving the original contract).
    """
    self.__begin()
    log_debug(self.final_model_path,
              token_id=self.training_token, prefix=self.common_prefix)
    log_debug(os.path.exists(self.final_model_path),
              token_id=self.training_token, prefix=self.common_prefix)
    # A missing final model means this dataset has never been trained.
    should_train = not os.path.exists(self.final_model_path)
    has_error = (self.__train() if should_train
                 else self.__retrain(self.final_model_path))
    # Short-circuit: only attempt to attach the model when training succeeded.
    if not has_error and self.__attach_to_networks():
        message = 'Dataset: {0} training successful'.format(self.dataset_name)
        # Fix: the success message was previously logged via log_error;
        # log it at info level instead.
        log_info(message,
                 token_id=self.training_token, prefix=self.common_prefix)
    else:
        message = 'Dataset: {0} training Failed'.format(self.dataset_name)
        log_error(message,
                  token_id=self.training_token, prefix=self.common_prefix)
    self.__tear_down()
    return has_error, message
def __begin(self):
    """Emit the training banner and dump the active configuration to the debug log."""
    token = self.training_token
    pfx = self.common_prefix
    log_info('\\' * 80, token_id=token, prefix=pfx)
    log_info(
        "Training Begin: Dataset={3}, prefix=multistep_{0}, SetType={1},TrainDocs={2}"
        .format(self.common_prefix, self.increment_type,
                self.training_document_threshold, self.dataset_name),
        token_id=token, prefix=pfx)
    log_info('/' * 80, token_id=token, prefix=pfx)
    rule = '-' * 80
    log_debug(rule, token_id=token, prefix=pfx)
    log_debug('Training config', token_id=token, prefix=pfx)
    log_debug(self._config, token_id=token, prefix=pfx)
    log_debug(rule, token_id=token, prefix=pfx)
    log_debug('enter Complete', token_id=token, prefix=pfx)
def parse_course_department(soup):
    """Parse a department header of the form ``"<Title> (<NAME>)"``.

    Args:
        soup: parsed page passed through to ``get_header_text``.

    Returns:
        dict: ``{"title": ..., "name": ...}`` where ``name`` is the code
        inside the trailing parentheses and ``title`` is everything before it.
    """
    header = get_header_text(soup)
    # Fix: use rfind so a "(" occurring inside the title itself does not
    # truncate it — the department code is the *last* parenthesized token.
    # (Identical to find() when only one "(" exists, including the -1 path.)
    name = header[header.rfind("("):]
    title = header[:-len(name)].rstrip()
    name = name[1:-1]  # strip the surrounding parentheses
    log_debug(header)
    return {"title": title, "name": name}
def parse_degree(soup):
    """Parse a degree page into a dict with name, subject, level, school,
    department and requirements.

    The header is expected to look like ``"<subject>, <level>"``; the level
    is taken as the last comma-separated segment.
    """
    header = get_header_text(soup)
    log_debug(header)
    raw_level = header.split(",")[-1]
    # Drop ", <level>" from the tail of the header to recover the subject.
    subject = header[:-len(raw_level) - 1]
    school, department = parse_school_department(soup)
    requirements = parse_requirements(soup)
    return {
        "name": header,
        "subject": subject,
        "level": raw_level.lstrip(),
        "school": school,
        "department": department,
        "requirements": requirements,
    }
def __can_train(self):
    """Generate training files and decide whether enough documents exist.

    Returns:
        tuple(bool, str): ``(can_proceed, message)`` — True when the number
        of newly generated files meets the configured document threshold.
    """
    opts = self.train_file_options
    log_debug('Training data set file options',
              token_id=self.training_token, prefix=self.prefix)
    log_debug(opts, token_id=self.training_token, prefix=self.prefix)
    new_files = generate_train_file(opts)
    threshold = opts['training_document_threshold']
    can_proceed = new_files >= threshold
    if can_proceed:
        message = "Wrote/appended {0} new files to {1}/train.txt\nAnd logged them in {2}.".format(
            new_files, opts['train_document_path'], opts['train_history_file'])
        log_debug(msg=message,
                  token_id=self.training_token, prefix=self.prefix)
    else:
        message = 'Not enough documents. need {0} more documents.'.format(
            threshold - new_files)
        log_warning(msg="Can not train. " + message,
                    token_id=self.training_token, prefix=self.prefix)
    return can_proceed, message
def __attach_to_networks(self):
    """Swap the freshly trained model into the live network dict and move
    its file into the final model location.

    Keeps a ``.bak`` copy of the previous model file until the new one is
    renamed into place, so a crash mid-swap leaves a recoverable file.

    Returns:
        bool: True on success; False on any failure (details are logged).
    """
    try:
        new_model_path = self.train_stats['finalModel']
        log_debug('Before Update',
                  token_id=self.training_token, prefix=self.common_prefix)
        log_debug('networks id : {0}'.format(id(self.network_dict)),
                  token_id=self.training_token, prefix=self.common_prefix)
        replace_network = self.dataset_name in self.network_dict
        if replace_network:
            # Drop the old in-memory model before inserting the new one.
            del self.network_dict[self.dataset_name]
            gc.collect()
        self.network_dict.update({self.dataset_name: self.trained_model})
        new_model_file = os.path.basename(new_model_path)
        dest_folder = os.path.dirname(self.final_model_path)
        shutil.copy2(new_model_path, dest_folder)
        log_debug(self.network_dict,
                  token_id=self.training_token, prefix=self.common_prefix)
        # Fix: build paths with os.path.join instead of '/'-concatenation.
        copied_path = os.path.join(dest_folder, new_model_file)
        if replace_network:
            # Fix: derive the backup name with splitext instead of the
            # fragile [:-3] slice that assumed a 3-character extension.
            backup_path = os.path.splitext(self.final_model_path)[0] + '.bak'
            os.rename(self.final_model_path, backup_path)
            os.rename(copied_path, self.final_model_path)
            os.remove(backup_path)
        else:
            os.rename(copied_path, self.final_model_path)
        return True
    except Exception:
        log_error('Network replacement failed.',
                  token_id=self.training_token, prefix=self.common_prefix)
        log_error(traceback.format_exc(),
                  token_id=self.training_token, prefix=self.common_prefix)
        return False
def __retrain(self, model_path):
    """Load an existing model from ``model_path`` and continue training it.

    Args:
        model_path: path of the previously saved model to load.

    Returns:
        bool: True when loading or training failed, False on success.
        On success ``self.train_stats`` and ``self.trained_model`` are set.
    """
    has_error = False
    try:
        log_info("============== Re-training {0} ==============".format(
            self.training_token),
            token_id=self.training_token, prefix=self.common_prefix)
        model = BiLSTM(params=self.params,
                       fn_log_info=log_info,
                       fn_log_debug=log_debug,
                       training_token=self.training_token,
                       training_prefix=self.common_prefix)
        log_debug("Loading model %s" % model_path,
                  token_id=self.training_token, prefix=self.common_prefix)
        model.loadModel(model_path, "")
        log_debug("Loaded model %s" % model_path,
                  token_id=self.training_token, prefix=self.common_prefix)
    except Exception:
        log_error('Re-training: model loading failed.',
                  token_id=self.training_token, prefix=self.common_prefix)
        log_error(traceback.format_exc(),
                  token_id=self.training_token, prefix=self.common_prefix)
        has_error = True
    else:
        # Only attempt training when the model loaded cleanly.
        try:
            log_debug(model.mappings.keys())
            with DataGenerator(dataset_name=self.dataset_name,
                               mappings=model.mappings,
                               cols=self.data_columns) as generator:
                log_debug("Train Sentences: %d" %
                          len(generator.data['trainMatrix']))
                log_debug("Dev Sentences: %d" %
                          len(generator.data['devMatrix']))
                log_debug("Test Sentences: %d" %
                          len(generator.data['testMatrix']))
                model.setTrainDataset(generator.data, self.label_key)
                model.modelSavePath = self.transient_model_path
                self.train_stats = model.evaluate()
                log_debug("%s" % self.train_stats,
                          token_id=self.training_token,
                          prefix=self.common_prefix)
                self.trained_model = model
        except Exception:
            log_error('Re-training: failed.',
                      token_id=self.training_token, prefix=self.common_prefix)
            log_error(traceback.format_exc(),
                      token_id=self.training_token, prefix=self.common_prefix)
            has_error = True
        finally:
            gc.collect()
    # Fix: the return previously lived in an outer ``finally`` block, which
    # silently swallows exceptions such as KeyboardInterrupt/SystemExit
    # raised inside the handlers (flake8-bugbear B012).
    return has_error
def __train(self):
    """Train a fresh BiLSTM from scratch on the configured dataset.

    On success ``self.train_stats`` and ``self.trained_model`` are set.

    Returns:
        bool: True when training raised an exception, False on success.
    """
    token = self.training_token
    pfx = self.common_prefix
    has_error = False
    try:
        log_debug("Dataset: %s" % self.dataset_name, token_id=token, prefix=pfx)
        log_debug("Label key: %s" % self.label_key, token_id=token, prefix=pfx)
        log_info("============== Training: {0} ==============".format(token),
                 token_id=token, prefix=pfx)
        # The actual network training starts here.
        model = BiLSTM(params=self.params,
                       fn_log_info=log_info,
                       fn_log_debug=log_debug,
                       training_token=token,
                       training_prefix=pfx)
        with EmbeddingsAndDataGenerator(
                embeddings_path=self.embeddings_path,
                dataset_file=self.dataset_files,
                reuse_embedding=self.reuse_embeddings) as generator:
            data = generator.data
            log_debug(data['mappings'].keys(), token_id=token, prefix=pfx)
            log_info("Train Sentences: %d" % len(data['trainMatrix']),
                     token_id=token, prefix=pfx)
            log_info("Dev Sentences: %d" % len(data['devMatrix']),
                     token_id=token, prefix=pfx)
            log_info("Test Sentences: %d" % len(data['testMatrix']),
                     token_id=token, prefix=pfx)
            model.setMappings(generator.embeddings, data['mappings'])
            model.setTrainDataset(data, self.label_key)
            model.verboseBuild = True
            model.modelSavePath = self.transient_model_path
            self.train_stats = model.evaluate()
            log_debug("%s" % self.train_stats, token_id=token, prefix=pfx)
            self.trained_model = model
    except Exception:
        log_error('Training: failed.', token_id=token, prefix=pfx)
        log_error(traceback.format_exc(), token_id=token, prefix=pfx)
        has_error = True
    finally:
        gc.collect()
    return has_error