예제 #1
0
def initial_dir(mode, config, model_file_path=None):
    """Create and return the working directories for a train or decode run.

    Args:
        mode: ``'train'`` selects the training layout; any other value
            selects the decode layout.
        config: Object exposing ``log_root``, ``pointer_gen`` and
            ``is_coverage``.
        model_file_path: Path of the checkpoint to decode with. Required
            whenever ``mode`` is not ``'train'``.

    Returns:
        ``(train_dir, model_dir)`` in train mode, otherwise the path of the
        decode directory.

    Raises:
        Exception: If ``mode`` is not ``'train'`` and ``model_file_path`` is
            None.
    """
    # makedirs(exist_ok=True) creates missing parents and tolerates another
    # process creating the same directory between a check and the mkdir.
    os.makedirs(config.log_root, exist_ok=True)

    if mode == 'train':
        # The directory name records which model options are enabled.
        suffix = ""
        if config.pointer_gen:
            suffix += "_pointer_gen"
        if config.is_coverage:
            suffix += "_coverage"

        train_dir = os.path.join(config.log_root, 'train{}'.format(suffix))
        model_dir = os.path.join(train_dir, 'model')
        os.makedirs(model_dir, exist_ok=True)
        return train_dir, model_dir

    if model_file_path is None:
        logger.error("error!, no model to load")
        raise Exception("empty model file path!", model_file_path)

    # The decode directory sits two levels above the checkpoint file, i.e.
    # next to the directory that contains the checkpoint.
    train_path = os.path.dirname(os.path.dirname(model_file_path))
    model_name = os.path.basename(model_file_path)
    decode_path = os.path.join(train_path, 'decode_%s' % (model_name))
    os.makedirs(decode_path, exist_ok=True)
    return decode_path
예제 #2
0
    def forward(self, graphs, node_feats, node_idx, nodes_num_batch):
        """Run the GNN on a batch of graphs and regroup node features per graph.

        Args:
            graphs: List of DGL graphs making up the batch.
            node_feats: Node features for the whole batch; its length must
                equal the number of nodes of the batched graph.
            node_idx: Boundary offsets into the GNN output; graph ``i`` owns
                rows ``node_idx[i]:node_idx[i + 1]``.
            nodes_num_batch: Per-graph node counts; its length is used as the
                batch size and its maximum as the padded node dimension.

        Returns:
            Tensor of shape ``[len(nodes_num_batch), max(nodes_num_batch),
            hidden]`` where ``hidden`` is the second dimension of the GNN
            output; unused rows are zero.

        Raises:
            AssertionError: If the batched graph's node count differs from
                ``len(node_feats)``.
        """
        # A single-graph batch is reported to trigger errors in dgl, so an
        # empty graph is added. Work on a copy so the caller's list is not
        # mutated as a side effect.
        if len(graphs) == 1:
            graphs = list(graphs) + [dgl.DGLGraph()]

        g = dgl.batch(graphs)
        if g.number_of_nodes() != len(node_feats):
            logger.error(
                "error: number of nodes in dgl graph do not equal nodes in input graph !!!"
            )
            logger.error(
                f"number of nodes this batch:{sum(nodes_num_batch).item()}, number of num in dgl graph {g.number_of_nodes()}"
            )
            # Raised explicitly so the check survives `python -O`.
            raise AssertionError(
                "number of nodes in dgl graph does not match node_feats")

        gnn_feat = self.gnn(g, node_feats)
        b = len(nodes_num_batch)
        n = max(nodes_num_batch)
        h = gnn_feat.shape[1]
        node_features = torch.zeros([b, n, h], device=gnn_feat.device)
        # Restore to B x max_nodes_num x hidden: copy each graph's rows into
        # its slot; the remaining rows keep their zero padding.
        for i in range(len(node_idx) - 1):
            rows = gnn_feat[node_idx[i]:node_idx[i + 1]]
            node_features[i, :rows.shape[0]] = rows

        return node_features
예제 #3
0
    def on_exception(self, exception):
        """Save an early-stop snapshot on Ctrl-C; re-raise any other exception.

        Args:
            exception: The exception instance that was caught.

        Raises:
            The given ``exception`` itself when it is not a
            ``KeyboardInterrupt``.
            SystemExit: After saving, when ``self.quit_all`` is True.
        """
        if not isinstance(exception, KeyboardInterrupt):
            raise exception  # unknown error: propagate to the caller

        logger.error(
            "[Error] Caught keyboard interrupt on worker. Stopping supervisor..."
        )

        model_save_path = os.path.join(self.config.model_path,
                                       'earlystop_step_%d.pkl' % self.step)

        # The whole model object is serialized, so no separate state-dict
        # snapshot is assembled here.
        torch.save(self.model, model_save_path)

        logger.info('[INFO] Saving early stop model to %s',
                    model_save_path)

        if self.quit_all is True:
            sys.exit(0)  # terminate the program immediately
예제 #4
0
 def get_metric(self, reset=True):
     """Score the collected predictions against the references with Rouge.

     :param reset: when true, clear the stored predictions and references
         after scoring
     :return: the averaged scores passed through ``remend_score``, or None
         when no predictions or no references were collected
     """
     num_hyps = len(self.prediction)
     num_refs = len(self.referece)
     logger.info("[INFO] Hyps and Refer number is %d, %d", num_hyps,
                 num_refs)
     if num_hyps == 0 or num_refs == 0:
         logger.error("During testing, no hyps or refers is selected!")
         return

     avg_scores = Rouge().get_scores(self.prediction, self.referece,
                                     avg=True)
     if reset:
         self.prediction, self.referece = [], []
     logger.info(avg_scores)
     return remend_score(avg_scores)
예제 #5
0
 def get_metric(self, reset=True):
     """Score the collected predictions against the references with pyrouge.

     The multi-reference scorer is used when the first stored reference is
     a list; otherwise the single-reference scorer is used.

     :param reset: when true, clear the stored predictions and references
         after scoring
     :return: the scores produced by the selected scorer, or None when no
         predictions or no references were collected
     """
     num_hyps = len(self.prediction)
     num_refs = len(self.referece)
     logger.info("[INFO] Hyps and Refer number is %d, %d", num_hyps,
                 num_refs)
     if num_hyps == 0 or num_refs == 0:
         logger.error("During testing, no hyps or refers is selected!")
         return

     if isinstance(self.referece[0], list):
         logger.info("Multi Reference summaries!")
         scorer = pyrouge_score_all_multi
     else:
         scorer = pyrouge_score_all
     results = scorer(self.prediction, self.referece, self.config)

     if reset:
         self.prediction, self.referece = [], []
     logger.info(results)
     return results
예제 #6
0
    def on_backward_begin(self, loss):
        """Print the step loss, abort on a non-finite loss, update the average.

        :param loss: loss object whose ``item()`` yields the scalar loss value
        :return: None
        :raises Exception: when the loss value is NaN or infinite
        """
        loss_value = loss.item()
        print("|epoch: %d  step: %d  loss: %.4f|" %
              (self.epoch, self.step, loss_value))
        if not np.isfinite(loss_value):
            logger.error("train Loss is not finite. Stopping.")
            logger.info(loss_value)
            for name, param in self.model.named_parameters():
                if not param.requires_grad:
                    continue
                logger.info(name)
                # A parameter that has not received a gradient yet has
                # grad None; dereferencing it would mask the real error.
                if param.grad is None:
                    logger.info("grad is None")
                else:
                    logger.info(param.grad.data.sum())
            raise Exception("train Loss is not finite. Stopping.")

        self.running_avg_loss = calc_running_avg_loss(loss_value,
                                                      self.running_avg_loss,
                                                      self.summary_writer,
                                                      self.step)
예제 #7
0
    def on_backward_begin(self, loss):
        """Accumulate the step loss and report it every ``update_every`` steps.

        :param loss: loss object exposing ``item()``, or a tuple whose first
            element is such an object
        :return: None
        :raises Exception: when the loss value is NaN or infinite
        """
        # Resolve the scalar first. Previously ``loss.item()`` ran before the
        # tuple check, so the finite check below could never trigger.
        loss_tensor = loss[0] if isinstance(loss, tuple) else loss
        loss_value = loss_tensor.item()
        if not np.isfinite(loss_value):
            logger.error("train Loss is not finite. Stopping.")
            logger.info(loss_value)
            for name, param in self.model.named_parameters():
                if not param.requires_grad:
                    continue
                logger.info(name)
                # A parameter that has not received a gradient yet has
                # grad None; dereferencing it would mask the real error.
                if param.grad is None:
                    logger.info("grad is None")
                else:
                    logger.info(param.grad.data.sum())
            raise Exception("train Loss is not finite. Stopping.")

        self.loss_update_every.append(loss_value)

        if self.step % self.update_every == 0:
            assert len(self.loss_update_every) == self.update_every
            loss_batch = sum(self.loss_update_every)
            self.loss_update_every = []
            # report the loss
            if self.step < 10 or self.step % 1000 == 0:
                logger.info(
                    "|epoch: %d  step: %d  log_loss: %.4f |" %
                    (self.epoch, self.step / self.update_every, loss_batch))
            self.running_avg_loss = calc_running_avg_loss(
                loss_batch, self.running_avg_loss,
                self.step / self.update_every)
예제 #8
0
def initial_dir(mode, model_file_path=None):
    """Create and return the working directories for a train or decode run.

    Args:
        mode: ``'train'`` creates a fresh timestamped training directory
            under the module-level ``config.log_root``; any other value
            selects the decode layout.
        model_file_path: Path of the checkpoint to decode with. Required
            whenever ``mode`` is not ``'train'``.

    Returns:
        ``(train_dir, model_dir)`` in train mode, otherwise the path of the
        decode directory.

    Raises:
        Exception: If ``mode`` is not ``'train'`` and ``model_file_path`` is
            None.
    """
    if mode == 'train':
        train_dir = os.path.join(config.log_root, 'train_%d' % (int(time.time())))
        model_dir = os.path.join(train_dir, 'model')
        # makedirs(exist_ok=True) also creates a missing log root and avoids
        # the race between an existence check and mkdir.
        os.makedirs(model_dir, exist_ok=True)
        return train_dir, model_dir

    if model_file_path is None:
        logger.error("error!, no model to load")
        raise Exception("empty model file path!", model_file_path)

    # The decode directory sits two levels above the checkpoint file, i.e.
    # next to the directory that contains the checkpoint.
    train_path = os.path.dirname(os.path.dirname(model_file_path))
    model_name = os.path.basename(model_file_path)
    decode_path = os.path.join(train_path, 'decode_%s' % (model_name))
    os.makedirs(decode_path, exist_ok=True)
    return decode_path
예제 #9
0
    # 不是作为形参传入到Trainer里面的么,怎么里面的model变化会影响到外面的?
    logger.info('[INFO] Saving eval best model to %s', bestmodel_save_path)


def run_test(model_file_path):
    """Evaluate the model on the test split and write the results.

    The decode directory is derived from ``model_file_path``, the test data
    is scored by a ``Tester`` with a ``PyRougeMetric`` at batch size 1, and
    the results are written into the decode directory.
    """
    decode_path = initial_dir('test', model_file_path)
    datainfo = set_up_data('test')
    model = Model(vocab=datainfo.vocabs["train"])

    test_set = datainfo.datasets['test']
    rouge_metric = PyRougeMetric(
        pred='prediction',
        art_oovs='article_oovs',
        abstract_sentences='abstract_sentences',
        config=config,
        vocab=datainfo.vocabs["train"],
    )
    tester = Tester(test_set, model=model, metrics=rouge_metric, batch_size=1)

    write_eval_results(decode_path, tester.test())


if __name__ == '__main__':
    # Usage: <script> train | <script> test <model_file>
    # NOTE(review): the GPU index is hard-coded; this call fails on hosts
    # that do not expose a CUDA device with index 4.
    torch.cuda.set_device(4)
    # Read the mode defensively so that a missing argument goes through the
    # same error path as an unknown mode instead of a bare IndexError.
    mode = sys.argv[1] if len(sys.argv) > 1 else None
    if mode == 'train':
        logger.info("------start mode train------")
        run_train()
    elif mode == 'test':
        logger.info("------start mode test-------")
        if len(sys.argv) < 3:
            logger.error("error!, no model to load")
            raise Exception("empty model file path!", None)
        model_filename = sys.argv[2]
        run_test(model_filename)
    else:
        logger.error("error: none of the mode is in train or test!")
        raise Exception("wrong mode! neither train nor test!", mode)