Example #1
 def train_dataloader(self):
     # Random sampler for the training set
     args = self.hparams.train.training
     train_sampler = RandomSampler()
     train_loader = DataSetIter(batch_size=args.batch_size,
                                dataset=self.train_dataset, sampler=train_sampler, drop_last=False)
     return train_loader
Example #2
    def test(self):
        # turn on the testing mode; clean up the history
        network = self._model
        self.mode(network, is_test=True)
        self.eval_history.clear()
        output, truths = defaultdict(list), defaultdict(list)
        data_iterator = Batch(self.data,
                              self.batch_size,
                              sampler=RandomSampler(),
                              as_numpy=False)

        with torch.no_grad():
            for batch_x, batch_y in data_iterator:
                prediction = self.data_forward(network, batch_x)
                assert isinstance(prediction, dict)
                for k, v in prediction.items():
                    output[k].append(v)
                for k, v in batch_y.items():
                    truths[k].append(v)
            for k, v in output.items():
                output[k] = itertools.chain(*v)
            for k, v in truths.items():
                truths[k] = itertools.chain(*v)
            args = _build_args(self._evaluator, **output, **truths)
            eval_results = self._evaluator(**args)
        print("[tester] {}".format(self.print_eval_results(eval_results)))
        self.mode(network, is_test=False)
        return eval_results
Example #3
def test_random_sampler():
    sampler = RandomSampler()
    data = [1, 3, 5, 7, 9, 2, 4, 6, 8, 10]
    ans = [data[i] for i in sampler(data)]
    assert len(ans) == len(data)
    for d in ans:
        assert d in data
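The test above treats RandomSampler as a callable that maps a dataset to a permutation of its indices, so every element is visited exactly once in random order. A minimal sketch of a sampler with that contract (an illustration only, not fastNLP's actual implementation) could look like this:

import numpy as np

class SimpleRandomSampler:
    """Minimal sketch: return a random permutation of the dataset's indices."""
    def __call__(self, data_set):
        # every index appears exactly once, in random order
        return list(np.random.permutation(len(data_set)))

With such a sampler, len(ans) == len(data) holds and every sampled element is in data, because the permutation neither drops nor duplicates indices.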
Example #4
 def train_init_dataloder_for_model2(self):
     assert self.hparams.joint_training
     init_sampler = RandomSampler()
     args = self.hparams.train.init.model2
     init_loader = DataSetIter(batch_size=args.batch_size,
                               dataset=self.train_dataset_init_for_model2,
                               sampler=init_sampler,
                               drop_last=False)
     return init_loader
Example #5
    def train_init_dataloader(self):
        if self.train_dataset_init is None:
            return None

        init_sampler = RandomSampler()
        args = self.hparams.train.init if not self.hparams.joint_training else self.hparams.train.init.model1
        init_loader = DataSetIter(batch_size=args.batch_size,
                                   dataset=self.train_dataset_init,
                                   sampler=init_sampler,
                                   drop_last=False)
        return init_loader
Example #6
    def init_data_loader(self, batch_size):

        self.train_data_loader = DataSetIter(self.train_set,
                                             batch_size,
                                             sampler=RandomSampler())
        self.dev_data_loader = DataSetIter(self.dev_set,
                                           batch_size,
                                           sampler=SequentialSampler())
        self.test_data_loader = DataSetIter(self.test_set,
                                            batch_size,
                                            sampler=SequentialSampler())
Example #7
 def __init__(self,
              dataset,
              batch_size,
              sampler=RandomSampler(),
              as_numpy=False):
     self.dataset = dataset
     self.batch_size = batch_size
     self.sampler = sampler
     self.as_numpy = as_numpy
     self.idx_list = None
     self.curidx = 0
     self.num_batches = len(dataset) // batch_size + int(
         len(dataset) % batch_size != 0)
     self.cur_batch_indices = None
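The constructor above only stores state (idx_list, curidx, num_batches, cur_batch_indices); the iteration logic is not shown. A plausible sketch of how such an iterator could consume the sampler, assuming the sampler returns a permutation of indices and each batch is the next batch_size slice of it (an illustration, not the library's exact code):

class BatchIterSketch:
    """Sketch: iterate a dataset in batches using a sampler's index permutation."""
    def __init__(self, dataset, batch_size, sampler):
        self.dataset = dataset
        self.batch_size = batch_size
        self.sampler = sampler

    def __iter__(self):
        idx_list = self.sampler(self.dataset)  # e.g. a random permutation of indices
        for start in range(0, len(idx_list), self.batch_size):
            batch_indices = idx_list[start:start + self.batch_size]
            # yield the examples that belong to this batch
            yield [self.dataset[i] for i in batch_indices]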
Example #8
    def train(self, network, train_data, dev_data=None):
        """General Training Procedure

        :param network: a model
        :param train_data: a DataSet instance, the training data
        :param dev_data: a DataSet instance, the validation data (optional)
        """
        # transfer model to gpu if available
        if torch.cuda.is_available() and self.use_cuda:
            self._model = network.cuda()
            # self._model is used to access model-specific loss
        else:
            self._model = network

        # define Tester over dev data
        if self.validate:
            default_valid_args = {"batch_size": self.batch_size, "pickle_path": self.pickle_path,
                                  "use_cuda": self.use_cuda, "evaluator": self._evaluator}
            validator = self._create_validator(default_valid_args)
            logger.info("validator defined as {}".format(str(validator)))

        # optimizer and loss
        self.define_optimizer()
        logger.info("optimizer defined as {}".format(str(self._optimizer)))
        self.define_loss()
        logger.info("loss function defined as {}".format(str(self._loss_func)))

        # main training procedure
        start = time.time()
        logger.info("training epochs started")
        for epoch in range(1, self.n_epochs + 1):
            logger.info("training epoch {}".format(epoch))

            # turn on network training mode
            self.mode(network, is_test=False)
            # prepare mini-batch iterator
            data_iterator = Batch(train_data, batch_size=self.batch_size, sampler=RandomSampler(),
                                  use_cuda=self.use_cuda)
            logger.info("prepared data iterator")

            # one forward and backward pass
            self._train_step(data_iterator, network, start=start, n_print=self.print_every_step, epoch=epoch)

            # validation
            if self.validate:
                if dev_data is None:
                    raise RuntimeError(
                        "self.validate is True in trainer, but dev_data is None. Please provide the validation data.")
                logger.info("validation started")
                validator.test(network, dev_data)
Example #9
    def train(self):
        """Start Training.

        :return:
        """
        try:
            if torch.cuda.is_available() and self.use_cuda:
                self.model = self.model.cuda()

            self.mode(self.model, is_test=False)

            start = time.time()
            self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S'))
            print("training epochs started " + self.start_time)
            if self.save_path is None:

                class psudoSW:
                    def __getattr__(self, item):
                        def pass_func(*args, **kwargs):
                            pass

                        return pass_func

                self._summary_writer = psudoSW()
            else:
                path = os.path.join(
                    self.save_path,
                    'tensorboard_logs_{}'.format(self.start_time))
                self._summary_writer = SummaryWriter(path)

            epoch = 1
            while epoch <= self.n_epochs:

                data_iterator = Batch(self.train_data,
                                      batch_size=self.batch_size,
                                      sampler=RandomSampler(),
                                      as_numpy=False)

                self._train_epoch(data_iterator, self.model, epoch,
                                  self.dev_data, start)

                # a positive validate_every overrides end-of-epoch validation
                if self.dev_data and self.validate_every <= 0:
                    self.do_validation()
                epoch += 1
        finally:
            self._summary_writer.close()
            del self._summary_writer
Example #10
    def test(self, network, dev_data):
        if torch.cuda.is_available() and self.use_cuda:
            self._model = network.cuda()
        else:
            self._model = network

        # turn on the testing mode; clean up the history
        self.mode(network, is_test=True)
        self.eval_history.clear()
        output_list = []
        truth_list = []

        data_iterator = Batch(dev_data, self.batch_size, sampler=RandomSampler(), use_cuda=self.use_cuda)

        for batch_x, batch_y in data_iterator:
            with torch.no_grad():
                prediction = self.data_forward(network, batch_x)
            output_list.append(prediction)
            truth_list.append(batch_y)
        eval_results = self.evaluate(output_list, truth_list)
        print("[tester] {}".format(self.print_eval_results(eval_results)))
        logger.info("[tester] {}".format(self.print_eval_results(eval_results)))
Example #11
    def _train_epoch(self):
        total_loss = 0
        corrects, samples = 0, 0

        n_tasks = len(self.task_lst)
        task_seq = list(np.random.permutation(n_tasks))
        empty_task = copy.deepcopy(self.empty_tasks)
        self.model.train()
        self.model.zero_grad()

        for cur_step in range(self.n_steps_per_epoch):
            for task_id in task_seq:
                if task_id in empty_task:
                    continue
                task = find_task(task_id, self.task_lst)
                batch = next(task.train_data_loader, None)
                if batch is None:
                    # empty_task.add(task_id)
                    task.train_data_loader = DataSetIter(
                        task.train_set,
                        self.batch_size,
                        sampler=RandomSampler())
                    task.train_data_loader = iter(task.train_data_loader)
                    continue
                x, y = batch
                batch_task_id = x["task_id"].cuda()
                batch_x = x["x"].cuda()
                batch_y = y["y"].cuda()

                self.masker.before_forward(batch_task_id[0].item())
                if "seq_len" in x:
                    seq_len = x["seq_len"].cuda()
                    out = self.model(batch_task_id, batch_x, batch_y, seq_len)
                else:
                    seq_len = None
                    out = self.model(batch_task_id, batch_x, batch_y)
                loss, pred = out["loss"], out["pred"]
                self.steps += 1

                total_loss += loss.item()
                loss = loss / self.accumulation_steps
                loss.backward()
                self.masker.after_forward(batch_task_id[0].item())
                self.metrics[task_id].evaluate(pred, batch_y, seq_len)

                if self.steps % self.accumulation_steps == 0:
                    nn.utils.clip_grad_value_(self.model.parameters(), 5)

                    if self.scheduler is not None:
                        self.scheduler.step()
                    self.optim.step()
                    self.optim.zero_grad()

                if self.steps % self.print_every == 0:
                    self.summary_writer.add_scalar(
                        "train_loss", total_loss / self.print_every,
                        self.steps)
                    score = self.metrics[task_id].get_metric()
                    metric_name = "acc" if "acc" in score else "f1"
                    score = score["acc"] if "acc" in score else score["f"]
                    self.summary_writer.add_scalar("train_acc", score,
                                                   self.steps)
                    self.logger.info(" - Step {}: loss {}\t{}\t{}: {}".format(
                        self.steps,
                        total_loss / self.print_every,
                        task.task_name,
                        metric_name,
                        score,
                    ))
                    total_loss = 0
        if self.epoch_scheduler is not None:
            self.epoch_scheduler.step()
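The loop above divides each loss by accumulation_steps and only steps the optimizer once every accumulation_steps mini-batches. Stripped of the multi-task bookkeeping, the gradient-accumulation pattern looks roughly like this (a sketch with placeholder model, optimizer, and data_loader; the model is assumed to return a dict with a "loss" entry, as in the example above):

import torch

def train_with_accumulation(model, optimizer, data_loader, accumulation_steps=4, clip_value=5.0):
    """Sketch of the gradient-accumulation pattern used in the example above."""
    model.train()
    optimizer.zero_grad()
    for step, (batch_x, batch_y) in enumerate(data_loader, start=1):
        loss = model(batch_x, batch_y)["loss"]
        # scale the loss so the accumulated gradient matches one large effective batch
        (loss / accumulation_steps).backward()
        if step % accumulation_steps == 0:
            torch.nn.utils.clip_grad_value_(model.parameters(), clip_value)
            optimizer.step()
            optimizer.zero_grad()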
Example #12
    model = model.cuda()
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    logger.info('done!')

    logger.info('=========== preparing data: [{}] ==========='.format(args.task))
    data_file = open('data/' + args.task + '.pkl', 'rb')
    data = pickle.load(data_file)
    data_file.close()

    bsz = args.batch_size // args.accumulation_steps

    logger.info('some examples:')
    if args.task == 'MNLI':
        train_ds = text2feature(data['train'], tokenizer, args.task)
        train_dataloader = Batch(train_ds, bsz, sampler=RandomSampler())

        dev_matched_ds = text2feature(data['dev_matched'], tokenizer, args.task)
        dev_matched_dataloader = Batch(dev_matched_ds, bsz, sampler=SequentialSampler())

        dev_mismatched_ds = text2feature(data['dev_mismatched'], tokenizer, args.task)
        dev_mismatched_dataloader = Batch(dev_mismatched_ds, bsz, sampler=SequentialSampler())

        dev_dataloader = [dev_matched_dataloader, dev_mismatched_dataloader]

        test_matched_ds = text2feature(data['test_matched'], tokenizer, args.task, True)
        test_matched_dataloader = Batch(test_matched_ds, bsz, sampler=SequentialSampler())

        test_mismatched_ds = text2feature(data['test_mismatched'], tokenizer, args.task, True)
        test_mismatched_dataloader = Batch(test_mismatched_ds, bsz, sampler=SequentialSampler())
Example #13
    def __init__(self, train_data, model, loss=None, metrics=None, n_epochs=3, batch_size=32, print_every=50,
                 validate_every=-1, dev_data=None, save_path=None, optimizer=Adam(lr=0.01, weight_decay=0),
                 check_code_level=0, metric_key=None, sampler=RandomSampler(), prefetch=False, use_tqdm=True,
                 use_cuda=False, callbacks=None):
        """
        :param DataSet train_data: the training data
        :param torch.nn.modules.module model: a PyTorch model
        :param LossBase loss: a loss object
        :param MetricBase metrics: a metric object or a list of metrics (List[MetricBase])
        :param int n_epochs: the number of training epochs
        :param int batch_size: batch size for training and validation
        :param int print_every: step interval at which to print training information; -1 disables printing.
        :param int validate_every: step interval at which to run validation. Default: -1 (validate at the end of every epoch).
        :param DataSet dev_data: the validation data
        :param str save_path: file path to save models
        :param Optimizer optimizer: an optimizer object
        :param int check_code_level: level of the fastNLP code checker. -1: don't check; 0: ignore; 1: warning; 2: strict.
            `ignore` does not check unused fields; `warning` warns if some fields are not used; `strict` raises an error
            if some fields are not used. The check runs the code on a very small batch (two samples by default); in
            principle this does not modify any parameters, it only verifies that the code can run. However, if (1) the
            model hard-codes the batch size to a fixed value, or (2) the model accumulates a count of forward passes,
            a few extra forward passes may be performed. In those cases it is recommended to set check_code_level to -1.
        :param str metric_key: a single indicator used to decide the best model based on metric results. It must be one
            of the keys returned by the FIRST metric in `metrics`. If the overall result gets better if the indicator gets
            smaller, add "-" in front of the string. For example::

                    metric_key="-PPL"   # language model gets better as perplexity gets smaller
        :param BaseSampler sampler: method used to generate batch data.
        :param bool prefetch: whether to use an extra process to produce batch data.
        :param bool use_tqdm: whether to use tqdm to show train progress.
        :param callbacks: List[Callback]. Callback functions used to adjust the training process; for example, early
            stopping and negative sampling can be implemented through the callback mechanism.
        """
        super(Trainer, self).__init__()

        if not isinstance(train_data, DataSet):
            raise TypeError(f"The type of train_data must be fastNLP.DataSet, got {type(train_data)}.")
        if not isinstance(model, nn.Module):
            raise TypeError(f"The type of model must be torch.nn.Module, got {type(model)}.")

        # check metrics and dev_data
        if (not metrics) and dev_data is not None:
            raise ValueError("No metric for dev_data evaluation.")
        if metrics and (dev_data is None):
            raise ValueError("No dev_data for evaluations, pass dev_data or set metrics to None. ")

        # check save_path
        if not (save_path is None or isinstance(save_path, str)):
            raise ValueError("save_path can only be None or `str`.")
        # prepare evaluate
        metrics = _prepare_metrics(metrics)

        # parse metric_key
        # increase_better is True. It means the exp result gets better if the indicator increases.
        # It is true by default.
        self.increase_better = True
        if metric_key is not None:
            self.increase_better = False if metric_key[0] == "-" else True
            self.metric_key = metric_key[1:] if metric_key[0] == "+" or metric_key[0] == "-" else metric_key
        elif len(metrics) > 0:
            self.metric_key = metrics[0].__class__.__name__.lower().strip('metric')

        # prepare loss
        losser = _prepare_losser(loss)

        # sampler check
        if not isinstance(sampler, BaseSampler):
            raise ValueError("The type of sampler should be fastNLP.BaseSampler, got {}.".format(type(sampler)))

        if check_code_level > -1:
            _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data,
                        metric_key=metric_key, check_level=check_code_level,
                        batch_size=min(batch_size, DEFAULT_CHECK_BATCH_SIZE))

        self.train_data = train_data
        self.dev_data = dev_data  # If None, No validation.
        self.model = model
        self.losser = losser
        self.metrics = metrics
        self.n_epochs = int(n_epochs)
        self.batch_size = int(batch_size)
        self.use_cuda = bool(use_cuda)
        self.save_path = save_path
        self.print_every = int(print_every)
        self.validate_every = int(validate_every) if validate_every!=0 else -1
        self.best_metric_indicator = None
        self.best_dev_epoch = None
        self.best_dev_step = None
        self.best_dev_perf = None
        self.sampler = sampler
        self.prefetch = prefetch
        self.callback_manager = CallbackManager(env={"trainer": self}, callbacks=callbacks)

        if isinstance(optimizer, torch.optim.Optimizer):
            self.optimizer = optimizer
        else:
            self.optimizer = optimizer.construct_from_pytorch(self.model.parameters())

        self.use_tqdm = use_tqdm
        self.print_every = abs(self.print_every)

        if self.dev_data is not None:
            self.tester = Tester(model=self.model,
                                 data=self.dev_data,
                                 metrics=self.metrics,
                                 batch_size=self.batch_size,
                                 use_cuda=self.use_cuda,
                                 verbose=0)

        self.step = 0
        self.start_time = None  # start timestamp
Example #14
    def testENAS(self):
        # read data from the csv file into a DataSet
        sample_path = "tutorials/sample_data/tutorial_sample_dataset.csv"
        dataset = DataSet.read_csv(sample_path,
                                   headers=('raw_sentence', 'label'),
                                   sep='\t')
        print(len(dataset))
        print(dataset[0])
        print(dataset[-3])

        dataset.append(Instance(raw_sentence='fake data', label='0'))
        # convert the raw sentence to lowercase
        dataset.apply(lambda x: x['raw_sentence'].lower(),
                      new_field_name='raw_sentence')
        # convert label to int
        dataset.apply(lambda x: int(x['label']), new_field_name='label')

        # split sentences on whitespace
        def split_sent(ins):
            return ins['raw_sentence'].split()

        dataset.apply(split_sent, new_field_name='words')

        # add sequence-length information
        dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
        print(len(dataset))
        print(dataset[0])

        # filter out data with DataSet.drop(func)
        dataset.drop(lambda x: x['seq_len'] <= 3)
        print(len(dataset))

        # specify which fields in the DataSet should be converted to tensors
        # set target: the gold labels used to compute the loss and to evaluate the model
        dataset.set_target("label")
        # set input: fields fed to the model's forward pass
        dataset.set_input("words", "seq_len")

        # split into test and training sets
        test_data, train_data = dataset.split(0.5)
        print(len(test_data))
        print(len(train_data))

        # build the vocabulary with Vocabulary.add(word)
        vocab = Vocabulary(min_freq=2)
        train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
        vocab.build_vocab()

        # index the sentences with Vocabulary.to_index(word)
        train_data.apply(
            lambda x: [vocab.to_index(word) for word in x['words']],
            new_field_name='words')
        test_data.apply(
            lambda x: [vocab.to_index(word) for word in x['words']],
            new_field_name='words')
        print(test_data[0])

        # these data preprocessing tools can also be used for projects such as reinforcement learning or GANs
        from fastNLP.core.batch import Batch
        from fastNLP.core.sampler import RandomSampler

        batch_iterator = Batch(dataset=train_data,
                               batch_size=2,
                               sampler=RandomSampler())
        for batch_x, batch_y in batch_iterator:
            print("batch_x has: ", batch_x)
            print("batch_y has: ", batch_y)
            break

        from fastNLP.models.enas_model import ENASModel
        from fastNLP.models.enas_controller import Controller
        model = ENASModel(embed_num=len(vocab), num_classes=5)
        controller = Controller()

        from fastNLP.models.enas_trainer import ENASTrainer
        from copy import deepcopy

        # rename DataSet fields so that they match the parameter names of the model's forward method
        train_data.rename_field('words',
                                'word_seq')  # input field matches the forward parameter name
        train_data.rename_field('label', 'label_seq')
        test_data.rename_field('words', 'word_seq')
        test_data.rename_field('label', 'label_seq')

        loss = CrossEntropyLoss(pred="output", target="label_seq")
        metric = AccuracyMetric(pred="predict", target="label_seq")

        trainer = ENASTrainer(model=model,
                              controller=controller,
                              train_data=train_data,
                              dev_data=test_data,
                              loss=CrossEntropyLoss(pred="output",
                                                    target="label_seq"),
                              metrics=AccuracyMetric(pred="predict",
                                                     target="label_seq"),
                              check_code_level=-1,
                              save_path=None,
                              batch_size=32,
                              print_every=1,
                              n_epochs=3,
                              final_epochs=1)
        trainer.train()
        print('Train finished!')

        # use a Tester to evaluate on test_data
        from fastNLP import Tester

        tester = Tester(data=test_data,
                        model=model,
                        metrics=AccuracyMetric(pred="predict",
                                               target="label_seq"),
                        batch_size=4)

        acc = tester.test()
        print(acc)
Example #15
def my_trainer(epochs, batch_size, lr, model_name, optimizer):
    lstm_model.to(device)
    loss_calc = nn.CrossEntropyLoss(reduction='mean')
    batch_iterator = Batch(dataset=train_data,
                           batch_size=batch_size,
                           sampler=RandomSampler())
    batch_iterator2 = Batch(dataset=dev_data,
                            batch_size=batch_size,
                            sampler=RandomSampler())
    loss_list = []
    metric_list = []
    # vali_loss_list = []
    count = 0
    min_perp = 0
    min_perp_epoch = 0
    for epo in range(epochs):
        for batch_x, batch_y in batch_iterator:
            x = batch_x['sentence'].cuda()
            y = batch_y['target'].cuda()
            optimizer.zero_grad()
            output = lstm_model(x)['pred']
            # seq_len = output.shape[2]
            loss = loss_calc(output, y)
            loss.backward()
            optimizer.step()
            with torch.no_grad():
                loss_list.append(loss.item())
                if count % 10 == 0:
                    print("step:", count, ", loss =", loss.item())
            count += 1

        perp = validation(batch_size, batch_iterator2)
        # vali_loss_list.append(vali_loss)
        if epo == 0 or min_perp >= perp:
            min_perp = perp
            # torch.save(lstm_model.state_dict(), model_name)
            min_perp_epoch = epo + 1
        with torch.no_grad():
            metric_list.append(perp)
            print("epochs =", epo + 1, ", perplexity =", perp)
            # print(gen_poem(lstm_model, vocab, "日"))
            # print(gen_poem(lstm_model, vocab, "红"))
            # print(gen_poem(lstm_model, vocab, "山"))
            # print(gen_poem(lstm_model, vocab, "夜"))
            # print(gen_poem(lstm_model, vocab, "湖"))
            # print(gen_poem(lstm_model, vocab, "海"))
            # print(gen_poem(lstm_model, vocab, "月"))

    print("finish train, best model in epoch", min_perp_epoch,
          ", perplexity =", min_perp)
    # torch.save(lstm_model.state_dict(), model_name+"_final")

    plt.plot(range(1, len(loss_list) + 1), loss_list, label='train_loss')
    plt.xlabel('steps')
    plt.ylabel('Loss')
    plt.title('Adam\nlearning_rate=%.1e, betas=(0.5, 0.99)' % (lr))
    plt.legend()
    plt.show()
    plt.plot(range(1, len(metric_list) + 1), metric_list, label='perplexity')
    plt.xlabel('epochs')
    plt.ylabel('Perplexity')
    plt.title('Adam\nlearning_rate=%.1e, betas=(0.5, 0.99)' % (lr))
    plt.legend()
    plt.show()
    return loss_list
Example #16
    def __init__(self,
                 train_data,
                 model,
                 loss=None,
                 metrics=None,
                 n_epochs=3,
                 batch_size=32,
                 print_every=50,
                 validate_every=-1,
                 dev_data=None,
                 use_cuda=False,
                 save_path=None,
                 optimizer=Adam(lr=0.01, weight_decay=0),
                 check_code_level=0,
                 metric_key=None,
                 sampler=RandomSampler(),
                 use_tqdm=True):
        """

        :param DataSet train_data: the training data
        :param torch.nn.modules.module model: a PyTorch model
        :param LossBase loss: a loss object
        :param MetricBase or List[MetricBase] metrics: a metric object or a list of metrics
        :param int n_epochs: the number of training epochs
        :param int batch_size: batch size for training and validation
        :param int print_every: step interval at which to print training information; -1 disables printing.
        :param int validate_every: step interval at which to run validation. Default: -1 (validate at the end of every epoch).
        :param DataSet dev_data: the validation data
        :param bool use_cuda: whether to run the model on GPU when CUDA is available.
        :param save_path: file path to save models
        :param Optimizer optimizer: an optimizer object
        :param int check_code_level: level of the fastNLP code checker. -1: don't check; 0: ignore; 1: warning; 2: strict.
            `ignore` does not check unused fields; `warning` warns if some fields are not used; `strict` raises an error
            if some fields are not used.
        :param str metric_key: a single indicator used to decide the best model based on metric results. It must be one
            of the keys returned by the FIRST metric in `metrics`. If the overall result gets better if the indicator gets
            smaller, add a `-` character in front of the string. For example
                ::
                    metric_key="-PPL"   # language model gets better as perplexity gets smaller
        :param sampler: method used to generate batch data.
        :param use_tqdm: boolean, use tqdm to show train progress.

        """
        super(Trainer, self).__init__()

        if not isinstance(train_data, DataSet):
            raise TypeError(
                f"The type of train_data must be fastNLP.DataSet, got {type(train_data)}."
            )
        if not isinstance(model, nn.Module):
            raise TypeError(
                f"The type of model must be torch.nn.Module, got {type(model)}."
            )

        # check metrics and dev_data
        if (not metrics) and dev_data is not None:
            raise ValueError("No metric for dev_data evaluation.")
        if metrics and (dev_data is None):
            raise ValueError(
                "No dev_data for evaluations, pass dev_data or set metrics to None. "
            )

        # check save_path
        if not (save_path is None or isinstance(save_path, str)):
            raise ValueError("save_path can only be None or `str`.")
        # prepare evaluate
        metrics = _prepare_metrics(metrics)

        # parse metric_key
        # increase_better is True. It means the exp result gets better if the indicator increases.
        # It is true by default.
        self.increase_better = True
        if metric_key is not None:
            self.increase_better = False if metric_key[0] == "-" else True
            self.metric_key = metric_key[1:] if metric_key[
                0] == "+" or metric_key[0] == "-" else metric_key
        elif len(metrics) > 0:
            self.metric_key = metrics[0].__class__.__name__.lower().strip(
                'metric')

        # prepare loss
        losser = _prepare_losser(loss)

        # sampler check
        if not isinstance(sampler, BaseSampler):
            raise ValueError(
                "The type of sampler should be fastNLP.BaseSampler, got {}.".
                format(type(sampler)))

        if check_code_level > -1:
            _check_code(dataset=train_data,
                        model=model,
                        losser=losser,
                        metrics=metrics,
                        dev_data=dev_data,
                        metric_key=metric_key,
                        check_level=check_code_level,
                        batch_size=min(batch_size, DEFAULT_CHECK_BATCH_SIZE))

        self.train_data = train_data
        self.dev_data = dev_data  # If None, No validation.
        self.model = model
        self.losser = losser
        self.metrics = metrics
        self.n_epochs = int(n_epochs)
        self.batch_size = int(batch_size)
        self.use_cuda = bool(use_cuda)
        self.save_path = save_path
        self.print_every = int(print_every)
        self.validate_every = int(validate_every)
        self.best_metric_indicator = None
        self.sampler = sampler

        if isinstance(optimizer, torch.optim.Optimizer):
            self.optimizer = optimizer
        else:
            self.optimizer = optimizer.construct_from_pytorch(
                self.model.parameters())

        self.use_tqdm = use_tqdm
        if self.use_tqdm:
            tester_verbose = 0
            self.print_every = abs(self.print_every)
        else:
            tester_verbose = 1

        if self.dev_data is not None:
            self.tester = Tester(model=self.model,
                                 data=self.dev_data,
                                 metrics=self.metrics,
                                 batch_size=self.batch_size,
                                 use_cuda=self.use_cuda,
                                 verbose=tester_verbose)

        self.step = 0
        self.start_time = None  # start timestamp
Example #17
    def init_data_iterator(self, prop=0.8):
        train_data, test_data = get_text_classification_datasets()
        train_dataset = DataSet()
        valid_dataset = DataSet()
        length = len(train_data.data)
        for i in range(length):
            if i < int(prop * length):
                train_dataset.append(
                    Instance(text=train_data.data[i],
                             label=int(train_data.target[i])))
            else:
                valid_dataset.append(
                    Instance(text=train_data.data[i],
                             label=int(train_data.target[i])))

        test_dataset = DataSet()
        for i in range(len(test_data.data)):
            test_dataset.append(
                Instance(text=test_data.data[i],
                         label=int(test_data.target[i])))

        trans = str.maketrans({key: None for key in string.punctuation})

        train_dataset.apply(lambda x: x['text'].lower().translate(trans),
                            new_field_name='text')
        train_dataset.apply(
            lambda x: re.sub(pattern=r'\s', repl=' ', string=x['text']),
            new_field_name='text')
        train_dataset.apply(lambda x: x['text'].split(' '),
                            new_field_name='text')
        train_dataset.apply(remove_empty, new_field_name='text')
        train_dataset.apply(pad_label, new_field_name='label_pad')

        valid_dataset.apply(lambda x: x['text'].lower().translate(trans),
                            new_field_name='text')
        valid_dataset.apply(
            lambda x: re.sub(pattern=r'\s', repl=' ', string=x['text']),
            new_field_name='text')
        valid_dataset.apply(lambda x: x['text'].split(' '),
                            new_field_name='text')
        valid_dataset.apply(remove_empty, new_field_name='text')
        valid_dataset.apply(pad_label, new_field_name='label_pad')

        test_dataset.apply(lambda x: x['text'].lower().translate(trans),
                           new_field_name='text')
        test_dataset.apply(
            lambda x: re.sub(pattern=r'\s', repl=' ', string=x['text']),
            new_field_name='text')
        test_dataset.apply(lambda x: x['text'].split(' '),
                           new_field_name='text')
        test_dataset.apply(remove_empty, new_field_name='text')
        test_dataset.apply(pad_label, new_field_name='label_pad')

        vocab = Vocabulary(min_freq=10)
        train_dataset.apply(lambda x: [vocab.add(word) for word in x['text']])
        vocab.build_vocab()

        train_dataset.apply(
            lambda x: [vocab.to_index(word) for word in x['text']],
            new_field_name='text_index')
        valid_dataset.apply(
            lambda x: [vocab.to_index(word) for word in x['text']],
            new_field_name='text_index')
        test_dataset.apply(
            lambda x: [vocab.to_index(word) for word in x['text']],
            new_field_name='text_index')

        train_dataset.set_input('text_index')
        train_dataset.set_target('label_pad')

        valid_dataset.set_input('text_index')
        valid_dataset.set_target('label_pad')

        test_dataset.set_input('text_index')
        test_dataset.set_target('label_pad')

        bs = self.args['data']['batch_size']
        train_batch = Batch(dataset=train_dataset,
                            batch_size=bs,
                            sampler=RandomSampler())
        valid_batch = Batch(dataset=valid_dataset,
                            batch_size=bs,
                            sampler=RandomSampler())
        test_batch = Batch(dataset=test_dataset,
                           batch_size=bs,
                           sampler=RandomSampler())

        self.input_dim = len(vocab)

        return train_batch, valid_batch, test_batch
Example #18
    def test_fastnlp_10min_tutorial(self):
        # read data from the csv file into a DataSet
        sample_path = "test/data_for_tests/tutorial_sample_dataset.csv"
        dataset = CSVLoader(headers=['raw_sentence', 'label'], sep='\t')._load(sample_path)
        print(len(dataset))
        print(dataset[0])
        print(dataset[-3])

        dataset.append(Instance(raw_sentence='fake data', label='0'))
        # convert the raw sentence to lowercase
        dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
        # convert label to int
        dataset.apply(lambda x: int(x['label']), new_field_name='label')

        # split sentences on whitespace
        def split_sent(ins):
            return ins['raw_sentence'].split()

        dataset.apply(split_sent, new_field_name='words')

        # add sequence-length information
        dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
        print(len(dataset))
        print(dataset[0])

        # filter out data with DataSet.drop(func)
        dataset.drop(lambda x: x['seq_len'] <= 3, inplace=True)
        print(len(dataset))

        # specify which fields in the DataSet should be converted to tensors
        # set target: the gold labels used to compute the loss and to evaluate the model
        dataset.set_target("label")
        # set input: fields fed to the model's forward pass
        dataset.set_input("words", "seq_len")

        # split into test and training sets
        test_data, train_data = dataset.split(0.5)
        print(len(test_data))
        print(len(train_data))

        # build the vocabulary with Vocabulary.add(word)
        vocab = Vocabulary(min_freq=2)
        train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
        vocab.build_vocab()

        # index the sentences with Vocabulary.to_index(word)
        train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
        test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
        print(test_data[0])

        # these data preprocessing tools can also be used for projects such as reinforcement learning or GANs
        from fastNLP.core.batch import DataSetIter
        from fastNLP.core.sampler import RandomSampler

        batch_iterator = DataSetIter(dataset=train_data, batch_size=2, sampler=RandomSampler())
        for batch_x, batch_y in batch_iterator:
            print("batch_x has: ", batch_x)
            print("batch_y has: ", batch_y)
            break

        from fastNLP.models import CNNText
        model = CNNText((len(vocab), 50), num_classes=5, dropout=0.1)

        from fastNLP import Trainer
        from copy import deepcopy

        # rename DataSet fields so that they match the parameter names of the model's forward method
        train_data.rename_field('label', 'label_seq')
        test_data.rename_field('label', 'label_seq')

        loss = CrossEntropyLoss(target="label_seq")
        metric = AccuracyMetric(target="label_seq")

        # instantiate a Trainer with the model and the data, then train
        # first overfit on test_data (to make sure the model implementation is correct)
        copy_model = deepcopy(model)
        overfit_trainer = Trainer(train_data=test_data, model=copy_model, loss=loss, batch_size=32, n_epochs=5,
                                  dev_data=test_data, metrics=metric, save_path=None)
        overfit_trainer.train()

        # train on train_data, validate on test_data
        trainer = Trainer(model=model, train_data=train_data, dev_data=test_data,
                          loss=CrossEntropyLoss(target="label_seq"),
                          metrics=AccuracyMetric(target="label_seq"),
                          save_path=None,
                          batch_size=32,
                          n_epochs=5)
        trainer.train()
        print('Train finished!')

        # use a Tester to evaluate on test_data
        from fastNLP import Tester

        tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(target="label_seq"),
                        batch_size=4)
        acc = tester.test()
        print(acc)
Example #19
    def __init__(self, train_data, model, loss=None, metrics=None, n_epochs=3, batch_size=32, print_every=50,
                 validate_every=-1, dev_data=None, use_cuda=False, save_path=None,
                 optimizer=Adam(lr=0.01, weight_decay=0), check_code_level=0,
                 metric_key=None, sampler=RandomSampler(), use_tqdm=True):
        super(Trainer, self).__init__()

        if not isinstance(train_data, DataSet):
            raise TypeError(f"The type of train_data must be fastNLP.DataSet, got {type(train_data)}.")
        if not isinstance(model, nn.Module):
            raise TypeError(f"The type of model must be torch.nn.Module, got {type(model)}.")

        # check metrics and dev_data
        if (not metrics) and dev_data is not None:
            raise ValueError("No metric for dev_data evaluation.")
        if metrics and (dev_data is None):
            raise ValueError("No dev_data for evaluations, pass dev_data or set metrics to None. ")

        # check save_path
        if not (save_path is None or isinstance(save_path, str)):
            raise ValueError("save_path can only be None or `str`.")
        # prepare evaluate
        metrics = _prepare_metrics(metrics)

        # parse metric_key
        # increase_better is True. It means the exp result gets better if the indicator increases.
        # It is true by default.
        self.increase_better = True
        if metric_key is not None:
            self.increase_better = False if metric_key[0] == "-" else True
            self.metric_key = metric_key[1:] if metric_key[0] == "+" or metric_key[0] == "-" else metric_key
        elif len(metrics) > 0:
            self.metric_key = metrics[0].__class__.__name__.lower().strip('metric')

        # prepare loss
        losser = _prepare_losser(loss)

        # sampler check
        if not isinstance(sampler, BaseSampler):
            raise ValueError("The type of sampler should be fastNLP.BaseSampler, got {}.".format(type(sampler)))

        if check_code_level > -1:
            _check_code(dataset=train_data, model=model, losser=losser, metrics=metrics, dev_data=dev_data,
                        metric_key=metric_key, check_level=check_code_level,
                        batch_size=min(batch_size, DEFAULT_CHECK_BATCH_SIZE))

        self.train_data = train_data
        self.dev_data = dev_data  # If None, No validation.
        self.model = model
        self.losser = losser
        self.metrics = metrics
        self.n_epochs = int(n_epochs)
        self.batch_size = int(batch_size)
        self.use_cuda = bool(use_cuda)
        self.save_path = save_path
        self.print_every = int(print_every)
        self.validate_every = int(validate_every)
        self.best_metric_indicator = None
        self.sampler = sampler

        if isinstance(optimizer, torch.optim.Optimizer):
            self.optimizer = optimizer
        else:
            self.optimizer = optimizer.construct_from_pytorch(self.model.parameters())

        self.use_tqdm = use_tqdm
        if self.use_tqdm:
            tester_verbose = 0
            self.print_every = abs(self.print_every)
        else:
            tester_verbose = 1

        if self.dev_data is not None:
            self.tester = Tester(model=self.model,
                                 data=self.dev_data,
                                 metrics=self.metrics,
                                 batch_size=self.batch_size,
                                 use_cuda=self.use_cuda,
                                 verbose=tester_verbose)

        self.step = 0
        self.start_time = None  # start timestamp