def initial_dir(mode, config, model_file_path=None):
    if not os.path.exists(config.log_root):
        os.mkdir(config.log_root)
    if mode == 'train':
        # Name the training directory after the enabled model options.
        _train_name = ""
        if config.pointer_gen:
            _train_name = _train_name + "_pointer_gen"
        if config.is_coverage:
            _train_name = _train_name + "_coverage"
        train_dir = os.path.join(config.log_root, 'train{}'.format(_train_name))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)
        model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(model_dir):
            os.mkdir(model_dir)
        return train_dir, model_dir
    else:
        if model_file_path is None:
            logger.error("error! no model to load")
            raise Exception("empty model file path!", model_file_path)
        # Place the decode directory next to the checkpoint's parent `model/` directory.
        parent_path = os.path.dirname(model_file_path)
        train_path = os.path.dirname(parent_path)
        model_name = os.path.basename(model_file_path)
        decode_path = os.path.join(train_path, 'decode_%s' % model_name)
        if not os.path.exists(decode_path):
            os.mkdir(decode_path)
        return decode_path
def forward(self, graphs, node_feats, node_idx, nodes_num_batch):
    # dgl raises errors when the batch contains only one graph, so pad the
    # list with an empty graph (it contributes no nodes).
    if len(graphs) == 1:
        graphs.append(dgl.DGLGraph())
    g = dgl.batch(graphs)
    if g.number_of_nodes() != len(node_feats):
        logger.error(
            "error: number of nodes in the dgl graph does not equal the number of nodes in the input graph!"
        )
        logger.error(
            f"number of nodes this batch: {sum(nodes_num_batch).item()}, number of nodes in dgl graph: {g.number_of_nodes()}"
        )
    assert g.number_of_nodes() == len(node_feats)
    gnn_feat = self.gnn(g, node_feats)
    b = len(nodes_num_batch)
    n = max(nodes_num_batch)
    h = gnn_feat.shape[1]
    node_features = torch.zeros([b, n, h], device=gnn_feat.device)  # restore to B x max_nodes_num x hidden
    for i in range(len(node_idx) - 1):
        curr_idx = node_idx[i]
        next_idx = node_idx[i + 1]
        mask = torch.arange(curr_idx, next_idx, device=gnn_feat.device)
        output_feat = torch.index_select(gnn_feat, 0, mask)
        if output_feat.shape[0] < n:
            # Pad each example's node features with zeros up to max_nodes_num.
            pad_num = n - output_feat.shape[0]
            extra_zeros = torch.zeros(pad_num, h, device=gnn_feat.device)
            output_feat = torch.cat([output_feat, extra_zeros], 0)
        node_features[i] = output_feat
    return node_features
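# A minimal, self-contained sketch of the re-batching step above, assuming
# `node_idx` holds the prefix offsets of each graph's nodes in the flattened
# feature matrix (e.g. [0, 3, 5] for two graphs with 3 and 2 nodes). The names
# below are illustrative only and are not part of the original module.
import torch

def rebatch_node_feats(gnn_feat, node_idx, max_nodes):
    """Scatter a flattened [total_nodes, H] tensor into a padded [B, max_nodes, H] tensor."""
    b = len(node_idx) - 1
    h = gnn_feat.shape[1]
    out = torch.zeros(b, max_nodes, h, device=gnn_feat.device)
    for i in range(b):
        feats = gnn_feat[node_idx[i]:node_idx[i + 1]]
        out[i, :feats.shape[0]] = feats  # remaining rows stay zero-padded
    return out

# Example: two graphs with 3 and 2 nodes, hidden size 4.
feats = torch.randn(5, 4)
padded = rebatch_node_feats(feats, [0, 3, 5], max_nodes=3)
assert padded.shape == (2, 3, 4)
assert torch.equal(padded[1, :2], feats[3:5])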
def on_exception(self, exception):
    if isinstance(exception, KeyboardInterrupt):
        logger.error("[Error] Caught keyboard interrupt on worker. Stopping supervisor...")
        state = {
            'iter': self.step,
            'encoder_state_dict': self.model.encoder.state_dict(),
            'decoder_state_dict': self.model.decoder.state_dict(),
            'reduce_state_dict': self.model.reduce_state.state_dict(),
            'optimizer': self.optimizer.state_dict(),
            'current_loss': self.running_avg_loss
        }
        model_save_path = os.path.join(self.config.model_path, 'earlystop_step_%d.pkl' % self.step)
        # torch.save(state, model_save_path)
        # self.model.cpu()
        torch.save(self.model, model_save_path)
        # if self.config.use_gpu:
        #     self.model.cuda()
        logger.info('[INFO] Saving early stop model to %s', model_save_path)
        if self.quit_all is True:
            sys.exit(0)  # exit the program immediately
        else:
            pass
    else:
        raise exception  # re-raise any unrecognized exception
def get_metric(self, reset=True):
    logger.info("[INFO] Hyps and Refer number is %d, %d", len(self.prediction), len(self.referece))
    if len(self.prediction) == 0 or len(self.referece) == 0:
        logger.error("During testing, no hyps or refers were selected!")
        return
    rouge = Rouge()
    scores_all = rouge.get_scores(self.prediction, self.referece, avg=True)
    if reset:
        self.prediction = []
        self.referece = []
    logger.info(scores_all)
    scores_all = remend_score(scores_all)
    return scores_all
def get_metric(self, reset=True):
    logger.info("[INFO] Hyps and Refer number is %d, %d", len(self.prediction), len(self.referece))
    if len(self.prediction) == 0 or len(self.referece) == 0:
        logger.error("During testing, no hyps or refers were selected!")
        return
    if isinstance(self.referece[0], list):
        logger.info("Multi Reference summaries!")
        scores_all = pyrouge_score_all_multi(self.prediction, self.referece, self.config)
    else:
        scores_all = pyrouge_score_all(self.prediction, self.referece, self.config)
    if reset:
        self.prediction = []
        self.referece = []
    logger.info(scores_all)
    return scores_all
def on_backward_begin(self, loss):
    """
    :param loss: scalar training loss tensor for the current step
    :return:
    """
    print("|epoch: %d step: %d loss: %.4f|" % (self.epoch, self.step, loss.item()))
    if not np.isfinite(loss.item()):
        logger.error("train Loss is not finite. Stopping.")
        logger.info(loss.item())
        # Dump the gradient sums of all trainable parameters before aborting.
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                logger.info(name)
                logger.info(param.grad.data.sum())
        raise Exception("train Loss is not finite. Stopping.")
    self.running_avg_loss = calc_running_avg_loss(loss.item(), self.running_avg_loss, self.summary_writer, self.step)
def on_backward_begin(self, loss):
    self.loss_update_every.append(loss.item())
    if isinstance(loss, tuple) and not np.isfinite(loss[0].item()):
        logger.error("train Loss is not finite. Stopping.")
        logger.info(loss[0].item())
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                logger.info(name)
                logger.info(param.grad.data.sum())
        raise Exception("train Loss is not finite. Stopping.")
    if self.step % self.update_every == 0:
        assert len(self.loss_update_every) == self.update_every
        loss_batch = sum(self.loss_update_every)
        self.loss_update_every = []
        # report the loss
        if self.step < 10 or self.step % 1000 == 0:
            logger.info("|epoch: %d step: %d log_loss: %.4f |" %
                        (self.epoch, self.step / self.update_every, loss_batch))
        self.running_avg_loss = calc_running_avg_loss(
            loss_batch, self.running_avg_loss, self.step / self.update_every)
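# `calc_running_avg_loss` is referenced above but not defined in this section.
# A minimal sketch of what it presumably computes, following the common
# pointer-generator convention of an exponential moving average clipped for
# readable logging; the decay value and signature here are assumptions, not
# this repository's actual implementation.
def calc_running_avg_loss_sketch(loss, running_avg_loss, step, decay=0.99):
    if running_avg_loss == 0:
        # First step: initialize the running average with the current loss.
        running_avg_loss = loss
    else:
        running_avg_loss = running_avg_loss * decay + (1 - decay) * loss
    # Clip so an occasional loss spike does not make the log unreadable.
    return min(running_avg_loss, 12)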
def initial_dir(mode, model_file_path=None):
    if mode == 'train':
        # Name the training directory with the current timestamp.
        train_dir = os.path.join(config.log_root, 'train_%d' % int(time.time()))
        if not os.path.exists(train_dir):
            os.mkdir(train_dir)
        model_dir = os.path.join(train_dir, 'model')
        if not os.path.exists(model_dir):
            os.mkdir(model_dir)
        return train_dir, model_dir
    else:
        if model_file_path is None:
            logger.error("error! no model to load")
            raise Exception("empty model file path!", model_file_path)
        parent_path = os.path.dirname(model_file_path)
        train_path = os.path.dirname(parent_path)
        model_name = os.path.basename(model_file_path)
        decode_path = os.path.join(train_path, 'decode_%s' % model_name)
        if not os.path.exists(decode_path):
            os.mkdir(decode_path)
        return decode_path
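# Illustrative example of the test-mode path derivation above (the directory
# and file names are made up for illustration): the decode directory is
# created next to the checkpoint's parent `model/` directory and named after
# the checkpoint file.
#   initial_dir('test', 'log/train_1600000000/model/model_step_5000.pkl')
#   -> 'log/train_1600000000/decode_model_step_5000.pkl'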
# Isn't the model passed into the Trainer as an argument? How can changes to
# the model inside the Trainer affect the one out here?
logger.info('[INFO] Saving eval best model to %s', bestmodel_save_path)


def run_test(model_file_path):
    decode_path = initial_dir('test', model_file_path)
    datainfo = set_up_data('test')
    model = Model(vocab=datainfo.vocabs["train"])
    tester = Tester(datainfo.datasets['test'],
                    model=model,
                    metrics=PyRougeMetric(pred='prediction',
                                          art_oovs='article_oovs',
                                          abstract_sentences='abstract_sentences',
                                          config=config,
                                          vocab=datainfo.vocabs["train"]),
                    batch_size=1)
    eval_results = tester.test()
    write_eval_results(decode_path, eval_results)


if __name__ == '__main__':
    torch.cuda.set_device(4)
    mode = sys.argv[1]
    if mode == 'train':
        logger.info("------start mode train------")
        run_train()
    elif mode == 'test':
        logger.info("------start mode test-------")
        model_filename = sys.argv[2]
        run_test(model_filename)
    else:
        logger.error("error: mode is neither train nor test!")
        raise Exception("wrong mode! neither train nor test!", mode)
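# Example invocation (the script name and checkpoint path are placeholders,
# not taken from the repository; a GPU is required since device 4 is selected
# above):
#   python <script>.py train
#   python <script>.py test <train_dir>/model/<checkpoint>.pkl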