class Task(object):
    """Container for one task: an id, a human-readable name, and its
    train/dev/test splits, plus lazily-created batch iterators."""

    def __init__(self, task_id, task_name, train_set, dev_set, test_set):
        self.task_id = task_id
        self.task_name = task_name
        self.train_set = train_set
        self.dev_set = dev_set
        self.test_set = test_set
        # Iterators are built on demand by init_data_loader().
        self.train_data_loader = None
        self.dev_data_loader = None
        self.test_data_loader = None

    def init_data_loader(self, batch_size):
        """Build one Batch iterator per split.

        Training data is shuffled (RandomSampler); dev/test keep dataset
        order (SequentialSampler). The train loader is primed immediately
        via init_iter().
        """
        train_loader = Batch(self.train_set, batch_size, sampler=RandomSampler())
        train_loader.init_iter()
        self.train_data_loader = train_loader
        self.dev_data_loader = Batch(self.dev_set, batch_size, sampler=SequentialSampler())
        self.test_data_loader = Batch(self.test_set, batch_size, sampler=SequentialSampler())
def _tqdm_train(self):
    """Training loop with a tqdm progress bar.

    Iterates ``self.n_epochs`` times over ``self.train_data``, performing
    forward/backward/update per batch, logging loss and parameter means to
    the summary writer, and running validation either every
    ``self.validate_every`` steps (if > 0) or once per epoch (if < 0).
    """
    self.step = 0
    data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False)
    total_steps = data_iterator.num_batches * self.n_epochs
    with tqdm(total=total_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar:
        avg_loss = 0
        for epoch in range(1, self.n_epochs + 1):
            pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs))
            for batch_x, batch_y in data_iterator:
                _move_dict_value_to_device(batch_x, batch_y, device=self._model_device)
                prediction = self._data_forward(self.model, batch_x)
                loss = self._compute_loss(prediction, batch_y)
                avg_loss += loss.item()
                self._grad_backward(loss)
                self._update()
                self._summary_writer.add_scalar("loss", loss.item(), global_step=self.step)
                for name, param in self.model.named_parameters():
                    if param.requires_grad:
                        self._summary_writer.add_scalar(name + "_mean", param.mean(), global_step=self.step)
                        # self._summary_writer.add_scalar(name + "_std", param.std(), global_step=self.step)
                        # self._summary_writer.add_scalar(name + "_grad_sum", param.sum(), global_step=self.step)
                # refresh the displayed running-average loss every print_every steps
                if (self.step + 1) % self.print_every == 0:
                    pbar.set_postfix_str("loss:{0:<6.5f}".format(avg_loss / self.print_every))
                    avg_loss = 0
                    pbar.update(self.print_every)
                self.step += 1
                # step-based validation (validate_every > 0)
                if self.validate_every > 0 and self.step % self.validate_every == 0 \
                        and self.dev_data is not None:
                    eval_res = self._do_validation(epoch=epoch, step=self.step)
                    eval_str = "Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, total_steps) + \
                               self.tester._format_eval_results(eval_res)
                    pbar.write(eval_str)
            # epoch-based validation (validate_every < 0)
            if self.validate_every < 0 and self.dev_data:
                eval_res = self._do_validation(epoch=epoch, step=self.step)
                eval_str = "Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, total_steps) + \
                           self.tester._format_eval_results(eval_res)
                pbar.write(eval_str)
            # Batch iterators are single-pass here: rebuild for the next epoch.
            if epoch != self.n_epochs:
                data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler,
                                      as_numpy=False)
        pbar.close()
def init_data_loader(self, batch_size):
    """Create the train/dev/test Batch iterators for this object's splits.

    :param batch_size: number of examples per batch
    """
    # Training is shuffled; dev/test keep the dataset order for evaluation.
    loader = Batch(self.train_set, batch_size, sampler=RandomSampler())
    loader.init_iter()
    self.train_data_loader = loader
    self.dev_data_loader = Batch(self.dev_set, batch_size, sampler=SequentialSampler())
    self.test_data_loader = Batch(self.test_set, batch_size, sampler=SequentialSampler())
def test(self):
    """Evaluate the model on ``self.data`` with every configured metric.

    Runs the predict function over sequential batches under ``no_grad``,
    feeds each batch into each metric, then collects one result dict per
    metric keyed by the metric's class name.

    :return: dict mapping metric class name -> that metric's result dict
    :raises TypeError: if the predict function or a metric returns a non-dict
    """
    # turn on the testing mode; clean up the history
    network = self._model
    self._mode(network, is_test=True)
    data_iterator = Batch(self.data, self.batch_size, sampler=SequentialSampler(), as_numpy=False)
    eval_results = {}
    try:
        with torch.no_grad():
            for batch_x, batch_y in data_iterator:
                _move_dict_value_to_device(batch_x, batch_y, device=self._model_device)
                pred_dict = self._data_forward(self._predict_func, batch_x)
                if not isinstance(pred_dict, dict):
                    raise TypeError(f"The return value of {get_func_signature(self._predict_func)} "
                                    f"must be `dict`, got {type(pred_dict)}.")
                for metric in self.metrics:
                    metric(pred_dict, batch_y)
            for metric in self.metrics:
                eval_result = metric.get_metric()
                if not isinstance(eval_result, dict):
                    raise TypeError(f"The return value of {get_func_signature(metric.get_metric)} must be "
                                    f"`dict`, got {type(eval_result)}")
                metric_name = metric.__class__.__name__
                eval_results[metric_name] = eval_result
    except CheckError as e:
        # NOTE(review): if CheckError is raised before the first batch is
        # processed, pred_dict/batch_y are unbound here — confirm upstream
        # guarantees at least one batch before relying on this path.
        prev_func_signature = get_func_signature(self._predict_func)
        _check_loss_evaluate(prev_func_signature=prev_func_signature, func_signature=e.func_signature,
                             check_res=e.check_res, pred_dict=pred_dict, target_dict=batch_y,
                             dataset=self.data, check_level=0)
    if self.verbose >= 1:
        print("[tester] \n{}".format(self._format_eval_results(eval_results)))
    # restore training mode before returning
    self._mode(network, is_test=False)
    return eval_results
def test(self):
    """Evaluate the model over ``self.data`` and return the evaluator's result.

    Collects every batch's predictions and targets field-by-field, flattens
    them with ``itertools.chain``, and passes the flattened fields to
    ``self._evaluator``.

    :return: whatever ``self._evaluator`` returns for the full dataset
    """
    # turn on the testing mode; clean up the history
    network = self._model
    self.mode(network, is_test=True)
    self.eval_history.clear()
    output, truths = defaultdict(list), defaultdict(list)
    # NOTE(review): RandomSampler is used for evaluation here — order should
    # not matter for metrics, but SequentialSampler would be conventional.
    data_iterator = Batch(self.data, self.batch_size, sampler=RandomSampler(), as_numpy=False)
    with torch.no_grad():
        for batch_x, batch_y in data_iterator:
            prediction = self.data_forward(network, batch_x)
            assert isinstance(prediction, dict)
            for k, v in prediction.items():
                output[k].append(v)
            for k, v in batch_y.items():
                truths[k].append(v)
        # flatten the per-batch lists into single iterators per field
        for k, v in output.items():
            output[k] = itertools.chain(*v)
        for k, v in truths.items():
            truths[k] = itertools.chain(*v)
        args = _build_args(self._evaluator, **output, **truths)
        eval_results = self._evaluator(**args)
    print("[tester] {}".format(self.print_eval_results(eval_results)))
    self.mode(network, is_test=False)
    return eval_results
def predict(self, network, data):
    """Perform inference using the trained model and post-process the output.

    :param network: a PyTorch model (cpu)
    :param data: a DataSet object.
    :return: list of list of strings, [num_examples, tag_seq_length]
    """
    # switch the network into evaluation mode before inference
    self.mode(network, test=True)
    iterator = Batch(data, batch_size=self.batch_size, sampler=SequentialSampler(), use_cuda=False)
    outputs = []
    for batch_x, _ in iterator:
        with torch.no_grad():
            outputs.append(self.data_forward(network, batch_x))
    return self._post_processor(outputs, self.label_vocab)
def test(self, filepath):
    """Evaluate the CWS model on a CoNLL-U file and return scaled scores.

    Rebuilds a preprocessing pipeline (with the tag indexer inserted after
    the first processor), loads and processes the file, then computes
    precision/recall/F1 with BMES tagging.

    :param filepath: path to a CoNLL-U formatted test file
    :return: (f1, pre, rec), each as a percentage rounded to 2 decimals
    """
    tag_proc = self._dict['tag_indexer']
    # second-to-last pipeline stage holds the trained CWS model
    cws_model = self.pipeline.pipeline[-2].model
    pipeline = self.pipeline.pipeline[:5]
    pipeline.insert(1, tag_proc)
    pp = Pipeline(pipeline)
    reader = ConlluCWSReader()
    # te_filename = '/home/hyan/ctb3/test.conllx'
    te_dataset = reader.load(filepath)
    pp(te_dataset)
    batch_size = 64
    te_batcher = Batch(te_dataset, batch_size, SequentialSampler(), use_cuda=False)
    pre, rec, f1 = calculate_pre_rec_f1(cws_model, te_batcher, type='bmes')
    # scale to percentages for reporting
    f1 = round(f1 * 100, 2)
    pre = round(pre * 100, 2)
    rec = round(rec * 100, 2)
    # print("f1:{:.2f}, pre:{:.2f}, rec:{:.2f}".format(f1, pre, rec))
    return f1, pre, rec
def test(self):
    """Hand-built Instances batch into dict-of-LongTensor mini-batches."""
    dataset = DataSet()
    for text, label in zip(texts, labels):
        field_x = TextField(text, is_target=False)
        field_y = LabelField(label, is_target=True)
        dataset.append(Instance(text=field_x, label=field_y))
    # use vocabulary to index data
    dataset.index_field("text", vocab)

    # define naive sampler for batch class
    class SeqSampler:
        def __call__(self, dataset):
            return list(range(len(dataset)))

    # use batch to iterate dataset
    seen = 0
    for batch_x, batch_y in Batch(dataset, 2, SeqSampler(), False):
        seen += batch_x["text"].size(0)
        # every batch is full-sized except possibly the last one
        self.assertTrue(batch_x["text"].size(0) == 2 or seen == len(raw_texts))
        self.assertTrue(isinstance(batch_x, dict))
        self.assertTrue(isinstance(batch_x["text"], torch.LongTensor))
        self.assertTrue(isinstance(batch_y, dict))
        self.assertTrue(isinstance(batch_y["label"], torch.LongTensor))
def process(self, dataset):
    """Run the model over *dataset* in inference mode and write every
    predicted field back into the dataset as a new input field.

    :param dataset: a DataSet instance (asserted)
    :return: the same dataset, with one added field per prediction key
             (plus the sequence-length field if the model did not emit it)
    """
    self.model.eval()
    assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
    data_iterator = Batch(dataset, batch_size=self.batch_size, sampler=SequentialSampler())
    batch_output = defaultdict(list)
    # prefer a dedicated predict() when the model provides one
    if hasattr(self.model, "predict"):
        predict_func = self.model.predict
    else:
        predict_func = self.model.forward
    with torch.no_grad():
        for batch_x, _ in data_iterator:
            refined_batch_x = _build_args(predict_func, **batch_x)
            prediction = predict_func(**refined_batch_x)
            seq_lens = batch_x[self.seq_len_field_name].tolist()
            for key, value in prediction.items():
                tmp_batch = []
                value = value.cpu().numpy()
                if len(value.shape) == 1 or (len(value.shape) == 2 and value.shape[1] == 1):
                    # one scalar per example: no padding to strip
                    batch_output[key].extend(value.tolist())
                else:
                    # trim padded positions using each example's true length
                    for idx, seq_len in enumerate(seq_lens):
                        tmp_batch.append(value[idx, :seq_len])
                    batch_output[key].extend(tmp_batch)
            # FIX: idiomatic `not in` instead of `not x in y`
            if self.seq_len_field_name not in prediction:
                batch_output[self.seq_len_field_name].extend(seq_lens)
    # TODO: the current implementation forces downstream processors to know
    # which output keys the model produces.
    for field_name, fields in batch_output.items():
        dataset.add_field(field_name, fields, is_input=True, is_target=False)
    return dataset
def test(self, network, dev_data):
    """Evaluate *network* batch-by-batch on *dev_data*, logging each step.

    Optionally stores per-batch predictions (``self.save_output``) and
    per-batch eval results (``self.save_loss``).

    :param network: the model to evaluate
    :param dev_data: a DataSet with the validation examples
    """
    if torch.cuda.is_available() and self.use_cuda:
        self._model = network.cuda()
    else:
        self._model = network
    # turn on the testing mode; clean up the history
    self.mode(network, is_test=True)
    self.eval_history.clear()
    self.batch_output.clear()
    # NOTE(review): RandomSampler for evaluation — order should not affect
    # the aggregated metrics, but it makes per-step logs non-reproducible.
    data_iterator = Batch(dev_data, self.batch_size, sampler=RandomSampler(), use_cuda=self.use_cuda)
    step = 0
    for batch_x, batch_y in data_iterator:
        with torch.no_grad():
            prediction = self.data_forward(network, batch_x)
            eval_results = self.evaluate(prediction, batch_y)
        if self.save_output:
            self.batch_output.append(prediction)
        if self.save_loss:
            self.eval_history.append(eval_results)
        print_output = "[test step {}] {}".format(step, eval_results)
        logger.info(print_output)
        if self.print_every_step > 0 and step % self.print_every_step == 0:
            print(self.make_eval_output(prediction, eval_results))
        step += 1
def test_list_of_numpy_to_tensor(self):
    """Mixed-length numpy-array fields should batch without raising."""
    short_rows = [Instance(x=np.array([1, 2]), y=np.array([3, 4])) for _ in range(2)]
    long_rows = [Instance(x=np.array([1, 2, 3, 4]), y=np.array([3, 4, 5, 6])) for _ in range(2)]
    ds = DataSet(short_rows + long_rows)
    ds.set_input("x")
    ds.set_target("y")
    batches = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
    for x, y in batches:
        print(x, y)
def test_sequential_batch(self):
    """A full sequential pass over a generated dataset completes cleanly."""
    num_samples = 1000
    batch_size = 32
    pause_seconds = 0.01
    dataset = generate_fake_dataset(num_samples)
    iterator = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler())
    for batch_x, batch_y in iterator:
        time.sleep(pause_seconds)
def next_batch(self):
    """Return the next training batch, restarting from the beginning of the
    dataset when the current iterator is exhausted.

    :return: the (batch_x, batch_y) pair yielded by the underlying Batch
    """
    try:
        return next(self.train_iter)
    except StopIteration:
        # BUG FIX: next() raises StopIteration, not StopAsyncIteration.
        # The original caught the wrong exception type, so exhaustion
        # escaped to the caller instead of restarting the iterator.
        self.train_iter = iter(
            Batch(dataset=self.train_data,
                  batch_size=self.batch_size,
                  sampler=SequentialSampler()))
        return next(self.train_iter)
def test_numpy_padding(self):
    """Variable-length numpy fields are padded to the batch-wide maximum."""
    xs = np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10)
    ys = np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)
    ds = DataSet({"x": xs, "y": ys})
    ds.set_input("x")
    ds.set_target("y")
    for x, y in Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True):
        # longest row in each batch of 4 has length 4 -> (4, 4) after padding
        self.assertEqual(x["x"].shape, (4, 4))
        self.assertEqual(y["y"].shape, (4, 4))
def test_simple(self):
    """40 examples with batch_size 4 should yield exactly 10 batches."""
    sentence = ["FastNLP", "is", "the", "most", "beautiful", "tool", "in", "the", "world"]
    dataset = construct_dataset([sentence for _ in range(40)])
    dataset.set_target()
    batches = Batch(dataset, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
    count = sum(1 for _ in batches)
    self.assertEqual(count, 10)
def test_list_of_list_to_tensor(self):
    """Python-list fields of different lengths batch into padded Tensors."""
    rows = [Instance(x=[1, 2], y=[3, 4]) for _ in range(2)]
    rows += [Instance(x=[1, 2, 3, 4], y=[3, 4, 5, 6]) for _ in range(2)]
    ds = DataSet(rows)
    ds.set_input("x")
    ds.set_target("y")
    for x, y in Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False):
        self.assertTrue(isinstance(x["x"], torch.Tensor))
        self.assertEqual(tuple(x["x"].shape), (4, 4))
        self.assertTrue(isinstance(y["y"], torch.Tensor))
        self.assertEqual(tuple(y["y"].shape), (4, 4))
def test_dataset_batching(self):
    """With as_numpy=True batches are ndarrays whose rows keep their values."""
    ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
    ds.set_input("x")
    ds.set_target("y")
    for x, y in Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True):
        self.assertTrue(isinstance(x["x"], np.ndarray) and isinstance(y["y"], np.ndarray))
        self.assertEqual(len(x["x"]), 4)
        self.assertEqual(len(y["y"]), 4)
        self.assertListEqual(list(x["x"][-1]), [1, 2, 3, 4])
        self.assertListEqual(list(y["y"][-1]), [5, 6])
def test_numpy_to_tensor(self):
    """With as_numpy=False ragged numpy fields become padded torch Tensors."""
    col_x = np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10)
    col_y = np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)
    ds = DataSet({"x": col_x, "y": col_y})
    ds.set_input("x")
    ds.set_target("y")
    for x, y in Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False):
        self.assertTrue(isinstance(x["x"], torch.Tensor))
        self.assertEqual(tuple(x["x"].shape), (4, 4))
        self.assertTrue(isinstance(y["y"], torch.Tensor))
        self.assertEqual(tuple(y["y"].shape), (4, 4))
def _train(self):
    """ENAS-style training loop: alternate shared-parameter and controller
    training per epoch, with a final stage that trains only the selected
    structure.

    Uses a real tqdm bar when ``self.use_tqdm`` else a pseudo one.
    """
    if not self.use_tqdm:
        from fastNLP.core.utils import _pseudo_tqdm as inner_tqdm
    else:
        inner_tqdm = tqdm
    self.step = 0
    start = time.time()
    # ceil(len(train_data) / batch_size) * n_epochs
    total_steps = (len(self.train_data) // self.batch_size + int(
        len(self.train_data) % self.batch_size != 0)) * self.n_epochs
    with inner_tqdm(total=total_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar:
        avg_loss = 0
        data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler,
                              as_numpy=False, prefetch=self.prefetch)
        for epoch in range(1, self.n_epochs + 1):
            pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs))
            # last final_epochs epochs only train the chosen architecture
            last_stage = (epoch > self.n_epochs + 1 - self.final_epochs)
            if epoch == self.n_epochs + 1 - self.final_epochs:
                print('Entering the final stage. (Only train the selected structure)')
            # early stopping
            self.callback_manager.on_epoch_begin(epoch, self.n_epochs)

            # 1. Training the shared parameters omega of the child models
            self.train_shared(pbar)

            # 2. Training the controller parameters theta
            if not last_stage:
                self.train_controller()

            if ((self.validate_every > 0 and self.step % self.validate_every == 0) or
                (self.validate_every < 0 and self.step % len(data_iterator) == 0)) \
                    and self.dev_data is not None:
                if not last_stage:
                    self.derive()
                eval_res = self._do_validation(epoch=epoch, step=self.step)
                eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step,
                                                                            total_steps) + \
                           self.tester._format_eval_results(eval_res)
                pbar.write(eval_str)

            # lr decay; early stopping
            self.callback_manager.on_epoch_end(epoch, self.n_epochs, self.optimizer)
        # =============== epochs end =================== #
        pbar.close()
def next_batch(self):
    """Return the next *full-sized* training batch.

    A trailing partial batch (fewer than batch_size examples) is treated
    the same as exhaustion: the iterator is rebuilt and iteration restarts
    from the beginning of the dataset.
    """
    candidate = None
    try:
        candidate = next(self.train_iter)
    except StopIteration:
        pass
    if candidate is not None and candidate[0]['word_seq'].shape[0] == self.batch_size:
        return candidate
    # exhausted or partial batch: rebuild and retry from the start
    self.train_iter = iter(
        Batch(dataset=self.train_data,
              batch_size=self.batch_size,
              sampler=SequentialSampler()))
    return self.next_batch()
def __init__(self, path=".data/yelp", dataset="yelp", batch_size=32):
    """Load the Yelp sentiment data, build a vocabulary on the training
    split, and prime a sequential training-batch iterator.

    :param path: directory containing sentiment.<split>.text/.labels files
    :param dataset: dataset name; only "yelp" triggers file loading here
    :param batch_size: batch size for the training iterator
    """
    # NOTE(review): only dataset == "yelp" populates a DataSet; any other
    # value leaves `dataset` as a string and the apply() calls below would
    # fail — confirm callers never pass anything else.
    if dataset == "yelp":
        dataset = DataSet()
        for db_set in ['train']:
            text_file = os.path.join(path, 'sentiment.' + db_set + '.text')
            label_file = os.path.join(path, 'sentiment.' + db_set + '.labels')
            # text and label files are parallel, one example per line
            with io.open(text_file, 'r', encoding="utf-8") as tf, io.open(
                    label_file, 'r', encoding="utf-8") as lf:
                for text in tf:
                    label = lf.readline()
                    dataset.append(Instance(text=text, label=label))
    dataset.apply(lambda x: x['text'].lower(), new_field_name='text')
    dataset.apply(lambda x: ['<start>'] + x['text'].split() + ['<eos>'],
                  new_field_name='words')
    # drop examples longer than <start> + 15 tokens + <eos>
    dataset.drop(lambda x: len(x['words']) > 1 + 15 + 1)
    # pad every example to exactly 17 tokens
    dataset.apply(lambda x: x['words'] + ['<pad>'] * (17 - len(x['words'])),
                  new_field_name='words')
    dataset.apply(lambda x: int(x['label']), new_field_name='label_seq', is_target=True)
    _train_data, _test_data = dataset.split(0.3)
    # vocabulary is built from the training split only
    _vocab = Vocabulary(min_freq=2)
    _train_data.apply(lambda x: [_vocab.add(word) for word in x['words']])
    _vocab.build_vocab()
    _train_data.apply(lambda x: [_vocab.to_index(word) for word in x['words']],
                      new_field_name='word_seq', is_input=True)
    _test_data.apply(lambda x: [_vocab.to_index(word) for word in x['words']],
                     new_field_name='word_seq', is_input=True)
    self.train_data = _train_data
    self.test_data = _test_data
    self.vocab = _vocab
    self.batch_size = batch_size
    self.train_iter = iter(
        Batch(dataset=self.train_data, batch_size=self.batch_size,
              sampler=SequentialSampler()))
def train(self, network, train_data, dev_data=None): """General Training Procedure :param network: a model :param train_data: a DataSet instance, the training data :param dev_data: a DataSet instance, the validation data (optional) """ # transfer model to gpu if available if torch.cuda.is_available() and self.use_cuda: self._model = network.cuda() # self._model is used to access model-specific loss else: self._model = network # define Tester over dev data if self.validate: default_valid_args = {"batch_size": self.batch_size, "pickle_path": self.pickle_path, "use_cuda": self.use_cuda, "evaluator": self._evaluator} validator = self._create_validator(default_valid_args) logger.info("validator defined as {}".format(str(validator))) # optimizer and loss self.define_optimizer() logger.info("optimizer defined as {}".format(str(self._optimizer))) self.define_loss() logger.info("loss function defined as {}".format(str(self._loss_func))) # main training procedure start = time.time() logger.info("training epochs started") for epoch in range(1, self.n_epochs + 1): logger.info("training epoch {}".format(epoch)) # turn on network training mode self.mode(network, is_test=False) # prepare mini-batch iterator data_iterator = Batch(train_data, batch_size=self.batch_size, sampler=RandomSampler(), use_cuda=self.use_cuda) logger.info("prepared data iterator") # one forward and backward pass self._train_step(data_iterator, network, start=start, n_print=self.print_every_step, epoch=epoch) # validation if self.validate: if dev_data is None: raise RuntimeError( "self.validate is True in trainer, but dev_data is None. Please provide the validation data.") logger.info("validation started") validator.test(network, dev_data)
def train(self):
    """Start Training.

    Moves the model to GPU when available, sets up a tensorboard
    SummaryWriter (or a no-op stand-in when ``self.save_path`` is None),
    then runs ``self.n_epochs`` epochs of training with optional
    end-of-epoch validation.

    :return:
    """
    try:
        if torch.cuda.is_available() and self.use_cuda:
            self.model = self.model.cuda()
        self.mode(self.model, is_test=False)
        start = time.time()
        self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S'))
        print("training epochs started " + self.start_time)
        if self.save_path is None:
            # no save path: swallow all summary-writer calls with a stub
            class psudoSW:
                def __getattr__(self, item):
                    def pass_func(*args, **kwargs):
                        pass
                    return pass_func
            self._summary_writer = psudoSW()
        else:
            path = os.path.join(self.save_path, 'tensorboard_logs_{}'.format(self.start_time))
            self._summary_writer = SummaryWriter(path)
        epoch = 1
        while epoch <= self.n_epochs:
            # Batch iterators are single-pass: rebuild each epoch
            data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=RandomSampler(),
                                  as_numpy=False)
            self._train_epoch(data_iterator, self.model, epoch, self.dev_data, start)
            # validate_every override validation at end of epochs
            if self.dev_data and self.validate_every <= 0:
                self.do_validation()
            epoch += 1
    finally:
        # always release the writer, even if training raised
        self._summary_writer.close()
        del self._summary_writer
def _print_train(self):
    """Training loop variant that reports progress via print() instead of
    a progress bar, logging loss/parameter means to the summary writer and
    validating every ``self.validate_every`` steps (or per epoch if <= 0).
    """
    epoch = 1
    start = time.time()
    while epoch <= self.n_epochs:
        data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler,
                              as_numpy=False)
        for batch_x, batch_y in data_iterator:
            # TODO: this may break if the user moves `prediction` to a
            # different device inside the model
            _move_dict_value_to_device(batch_x, batch_y, device=self._model_device)
            prediction = self._data_forward(self.model, batch_x)
            loss = self._compute_loss(prediction, batch_y)
            self._grad_backward(loss)
            self._update()
            self._summary_writer.add_scalar("loss", loss.item(), global_step=self.step)
            for name, param in self.model.named_parameters():
                if param.requires_grad:
                    self._summary_writer.add_scalar(name + "_mean", param.mean(), global_step=self.step)
                    # self._summary_writer.add_scalar(name + "_std", param.std(), global_step=self.step)
                    # self._summary_writer.add_scalar(name + "_grad_sum", param.sum(), global_step=self.step)
            if self.print_every > 0 and self.step % self.print_every == 0:
                end = time.time()
                diff = timedelta(seconds=round(end - start))
                print_output = "[epoch: {:>3} step: {:>4}] train loss: {:>4.6} time: {}".format(
                    epoch, self.step, loss.data, diff)
                print(print_output)
            if (self.validate_every > 0 and self.step % self.validate_every == 0 and
                    self.dev_data is not None):
                self._do_validation(epoch=epoch, step=self.step)
            self.step += 1
        # validate_every override validation at end of epochs
        if self.dev_data and self.validate_every <= 0:
            self._do_validation(epoch=epoch, step=self.step)
        epoch += 1
def test(self, model, dataset):
    """Evaluate *model* on *dataset* and store the metric dict in ``self.res``.

    Runs the model over sequential batches under ``no_grad``, collects each
    per-batch evaluation tensor, concatenates them across the dataset, and
    passes the concatenated tensors to ``model.metrics``.

    :param model: the model to evaluate (moved to GPU if self.use_cuda)
    :param dataset: the evaluation DataSet
    """
    self.model = model.cuda() if self.use_cuda else model
    self.model.eval()
    batchiter = Batch(dataset, self.batch_size, SequentialSampler(), self.use_cuda)
    eval_res = defaultdict(list)
    for batch_x, batch_y in batchiter:
        with torch.no_grad():
            pred_y = self.model(**batch_x)
            eval_one = self.model.evaluate(**pred_y, **batch_y)
        for eval_name, tensor in eval_one.items():
            eval_res[eval_name].append(tensor)
    # FIX: dropped the unused example counter `i` from the original.
    # Concatenate per-batch tensors before computing the final metrics.
    tmp = {eval_name: torch.cat(tensorlist, dim=0)
           for eval_name, tensorlist in eval_res.items()}
    self.res = self.model.metrics(**tmp)
def test(self, network, dev_data):
    """Evaluate *network* on *dev_data* and print/log the aggregated result.

    Collects all batch predictions and targets, then computes metrics over
    the full lists with ``self.evaluate``.

    :param network: the model to evaluate
    :param dev_data: a DataSet with validation examples
    """
    if torch.cuda.is_available() and self.use_cuda:
        self._model = network.cuda()
    else:
        self._model = network
    # turn on the testing mode; clean up the history
    self.mode(network, is_test=True)
    self.eval_history.clear()
    output_list = []
    truth_list = []
    # NOTE(review): RandomSampler for evaluation — aggregated metrics are
    # order-independent, but SequentialSampler would be conventional.
    data_iterator = Batch(dev_data, self.batch_size, sampler=RandomSampler(), use_cuda=self.use_cuda)
    for batch_x, batch_y in data_iterator:
        with torch.no_grad():
            prediction = self.data_forward(network, batch_x)
        output_list.append(prediction)
        truth_list.append(batch_y)
    eval_results = self.evaluate(output_list, truth_list)
    print("[tester] {}".format(self.print_eval_results(eval_results)))
    logger.info("[tester] {}".format(self.print_eval_results(eval_results)))
def predict(self, network, data):
    """Perform inference using the trained model.

    :param network: a PyTorch model (cpu)
    :param data: a DataSet object.
    :return: list of batch outputs
    """
    # evaluation mode; gradients are not needed for inference
    self.mode(network, test=True)
    iterator = Batch(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False)
    outputs = []
    for batch_x, _ in iterator:
        with torch.no_grad():
            outputs.append(self.data_forward(network, batch_x))
    return outputs
def get_reward(self, dag, entropies, hidden, valid_idx=0):
    """Computes the perplexity of a single sampled model on a minibatch of
    validation data.

    :param dag: the sampled architecture to score
    :param entropies: controller entropies (converted to numpy if needed)
    :param hidden: recurrent hidden state, threaded through get_loss
    :param valid_idx: unused here — TODO confirm whether it should select a batch
    :return: (rewards, hidden) where rewards = 80/ppl + 1e-4 * entropies
    """
    if not isinstance(entropies, np.ndarray):
        entropies = entropies.data.cpu().numpy()
    data_iterator = Batch(self.dev_data, batch_size=self.batch_size, sampler=self.sampler,
                          as_numpy=False, prefetch=self.prefetch)
    # NOTE(review): the loop overwrites valid_loss each iteration, so only
    # the LAST batch's loss feeds the reward despite the docstring saying a
    # single minibatch — confirm whether a `break` was intended.
    for inputs, targets in data_iterator:
        valid_loss, hidden, _ = self.get_loss(inputs, targets, hidden, dag)
        valid_loss = utils.to_item(valid_loss.data)
    valid_ppl = math.exp(valid_loss)
    # reward scale constant 80 appears hard-coded — TODO confirm source
    R = 80 / valid_ppl
    rewards = R + 1e-4 * entropies
    return rewards, hidden
def train(self, network, train_data, dev_data=None):
    """General Training Procedure

    :param network: a model
    :param train_data: a DataSet instance, the training data
    :param dev_data: a DataSet instance, the validation data (optional)
    :raises RuntimeError: if ``self.validate`` is set but dev_data is None
    """
    # transfer model to gpu if available
    if torch.cuda.is_available() and self.use_cuda:
        self._model = network.cuda()
        # self._model is used to access model-specific loss
    else:
        self._model = network

    # define Tester over dev data
    if self.validate:
        default_valid_args = {
            "save_output": True,
            "validate_in_training": True,
            "save_dev_input": True,
            "save_loss": True,
            "batch_size": self.batch_size,
            "pickle_path": self.pickle_path,
            "use_cuda": self.use_cuda,
            "print_every_step": 0
        }
        validator = self._create_validator(default_valid_args)
        logger.info("validator defined as {}".format(str(validator)))

    # optimizer and loss
    self.define_optimizer()
    logger.info("optimizer defined as {}".format(str(self._optimizer)))
    self.define_loss()
    logger.info("loss function defined as {}".format(str(self._loss_func)))

    # main training procedure
    start = time.time()
    logger.info("training epochs started")
    for epoch in range(1, self.n_epochs + 1):
        logger.info("training epoch {}".format(epoch))

        # turn on network training mode
        self.mode(network, is_test=False)
        # prepare mini-batch iterator; rebuilt per epoch since Batch is single-pass
        data_iterator = Batch(train_data, batch_size=self.batch_size, sampler=RandomSampler(),
                              use_cuda=self.use_cuda)
        logger.info("prepared data iterator")

        # one forward and backward pass
        self._train_step(data_iterator, network, start=start, n_print=self.print_every_step, epoch=epoch)

        # validation
        if self.validate:
            # FIX (consistency/robustness): fail fast with a clear message when
            # validation is requested without dev data, matching the sibling
            # trainer's behavior, instead of crashing inside validator.test().
            if dev_data is None:
                raise RuntimeError(
                    "self.validate is True in trainer, but dev_data is None. Please provide the validation data.")
            logger.info("validation started")
            validator.test(network, dev_data)

            if self.save_best_dev and self.best_eval_result(validator):
                self.save_model(network, self.model_name)
                print("Saved better model selected by validation.")
                logger.info("Saved better model selected by validation.")

            valid_results = validator.show_metrics()
            print("[epoch {}] {}".format(epoch, valid_results))
            logger.info("[epoch {}] {}".format(epoch, valid_results))
def __init__(self, path='.data/sst/trees', data_type='sst', batch_size=32, split_ratio=0.1,
             seq_len=15, min_freq=2):
    """Load one of three corpora ('yelp', 'sst', 'test'), normalize every
    example to a padded <start>...<eos> token sequence, build a vocabulary
    from a training split, and prime a sequential training-batch iterator.

    :param path: data directory (overridden for 'yelp'/'sst')
    :param data_type: 'yelp', 'sst', or 'test'
    :param batch_size: batch size for the training iterator
    :param split_ratio: ratio passed to DataSet.split
    :param seq_len: maximum content length (excluding <start>/<eos>)
    :param min_freq: minimum word frequency for the vocabulary
    """
    data_set = DataSet()
    if data_type == 'yelp':
        path = '.data/yelp'
        for db_set in ['train']:
            text_file = os.path.join(path, 'sentiment.' + db_set + '.text')
            label_file = os.path.join(path, 'sentiment.' + db_set + '.labels')
            # parallel files: one text line matched with one label line
            with io.open(text_file, 'r', encoding="utf-8") as tf, io.open(
                    label_file, 'r', encoding="utf-8") as lf:
                for text in tf:
                    label = lf.readline()
                    data_set.append(Instance(text=text, label=label))
        data_set.apply(lambda x: ['<start>'] + x['text'].lower().split() + ['<eos>'],
                       new_field_name='words')
        data_set.drop(lambda x: len(x['words']) > seq_len + 2)
    elif data_type == 'sst':
        path = '.data/sst/trees'
        text = data.Field(init_token='<start>', eos_token='<eos>', lower=True, tokenize='spacy',
                          fix_length=16)
        label = data.Field(sequential=False, unk_token='<unk>')
        # NOTE: `filter` shadows the builtin; kept as-is here.
        filter = lambda ex: len(ex.text) <= seq_len and ex.label != 'neutral'
        sst_train = datasets.SST(os.path.join(path, 'train.txt'), text, label, filter_pred=filter)
        sst_dev = datasets.SST(os.path.join(path, 'dev.txt'), text, label, filter_pred=filter)
        sst_test = datasets.SST(os.path.join(path, 'test.txt'), text, label, filter_pred=filter)
        # merge all three SST splits into one DataSet; binary labels only
        for ex in sst_train.examples + sst_dev.examples + sst_test.examples:
            data_set.append(Instance(words=ex.text, label={'negative': 0, 'positive': 1}[ex.label]))
        data_set.apply(lambda x: ['<start>'] + [w.lower() for w in x['words']] + ['<eos>'],
                       new_field_name='words')
    elif data_type == 'test':
        # tiny local trial files for smoke testing
        with io.open('fasttrial1.pos', 'r', encoding="utf-8") as f:
            for text in f:
                data_set.append(Instance(text=text, label=1))
        with io.open('fasttrial1.neg', 'r', encoding="utf-8") as f:
            for text in f:
                data_set.append(Instance(text=text, label=0))
        data_set.apply(lambda x: ['<start>'] + x['text'].lower().split() + ['<eos>'],
                       new_field_name='words')
        data_set.drop(lambda x: len(x['words']) > seq_len + 2)
    # pad every example to exactly seq_len + 2 tokens (<start> + body + <eos>)
    data_set.apply(lambda x: x['words'] + ['<pad>'] * (seq_len + 2 - len(x['words'])),
                   new_field_name='words')
    _train_data, _ = data_set.split(split_ratio)
    # vocabulary is built on the (first) training split only
    _vocab = Vocabulary(min_freq=min_freq)
    _train_data.apply(lambda x: [_vocab.add(word) for word in x['words']])
    _vocab.build_vocab()
    data_set.apply(lambda x: [_vocab.to_index(w) for w in x['words']],
                   new_field_name='word_seq', is_input=True)
    # decoder target: input shifted left by one, 0-padded at the end
    data_set.apply(lambda x: x['word_seq'][1:] + [0], new_field_name='dec_target', is_target=True)
    data_set.apply(lambda x: int(x['label']), new_field_name='label_seq', is_target=True)
    # NOTE(review): split() is called twice; the second split re-partitions
    # the full indexed data_set — TODO confirm this double split is intended.
    _train_data, _test_data = data_set.split(split_ratio)
    self.train_data = _train_data
    self.test_data = _test_data
    self.vocab = _vocab
    self.batch_size = batch_size
    self.train_iter = iter(
        Batch(dataset=self.train_data, batch_size=self.batch_size,
              sampler=SequentialSampler()))