def init_data_loader(self, batch_size):
    self.train_data_loader = DataSetIter(self.train_set, batch_size, sampler=RandomSampler())
    self.dev_data_loader = DataSetIter(self.dev_set, batch_size, sampler=SequentialSampler())
    self.test_data_loader = DataSetIter(self.test_set, batch_size, sampler=SequentialSampler())
def predict(self, network, data): """Perform inference using the trained model. :param network: a PyTorch model (cpu) :param data: a DataSet object. :return: list of list of strings, [num_examples, tag_seq_length] """ # transform strings into DataSet object # data = self.prepare_input(data) # turn on the testing mode; clean up the history self.mode(network, test=True) batch_output = [] data_iterator = Batch(data, batch_size=self.batch_size, sampler=SequentialSampler(), use_cuda=False) for batch_x, _ in data_iterator: with torch.no_grad(): prediction = self.data_forward(network, batch_x) batch_output.append(prediction) return self._post_processor(batch_output, self.label_vocab)
def test(self, filepath):
    tag_proc = self._dict['tag_indexer']
    cws_model = self.pipeline.pipeline[-2].model
    pipeline = self.pipeline.pipeline[:5]
    pipeline.insert(1, tag_proc)
    pp = Pipeline(pipeline)

    reader = ConlluCWSReader()
    # te_filename = '/home/hyan/ctb3/test.conllx'
    te_dataset = reader.load(filepath)
    pp(te_dataset)

    batch_size = 64
    te_batcher = Batch(te_dataset, batch_size, SequentialSampler(), use_cuda=False)
    pre, rec, f1 = calculate_pre_rec_f1(cws_model, te_batcher, type='bmes')
    f1 = round(f1 * 100, 2)
    pre = round(pre * 100, 2)
    rec = round(rec * 100, 2)
    # print("f1:{:.2f}, pre:{:.2f}, rec:{:.2f}".format(f1, pre, rec))

    return f1, pre, rec
def test(self):
    # turn on the testing mode; clean up the history
    network = self._model
    self._mode(network, is_test=True)
    data_iterator = Batch(self.data, self.batch_size, sampler=SequentialSampler(), as_numpy=False)
    eval_results = {}
    try:
        with torch.no_grad():
            for batch_x, batch_y in data_iterator:
                _move_dict_value_to_device(batch_x, batch_y, device=self._model_device)
                pred_dict = self._data_forward(self._predict_func, batch_x)
                if not isinstance(pred_dict, dict):
                    raise TypeError(f"The return value of {get_func_signature(self._predict_func)} "
                                    f"must be `dict`, got {type(pred_dict)}.")
                for metric in self.metrics:
                    metric(pred_dict, batch_y)
            for metric in self.metrics:
                eval_result = metric.get_metric()
                if not isinstance(eval_result, dict):
                    raise TypeError(f"The return value of {get_func_signature(metric.get_metric)} must be "
                                    f"`dict`, got {type(eval_result)}")
                metric_name = metric.__class__.__name__
                eval_results[metric_name] = eval_result
    except CheckError as e:
        prev_func_signature = get_func_signature(self._predict_func)
        _check_loss_evaluate(prev_func_signature=prev_func_signature, func_signature=e.func_signature,
                             check_res=e.check_res, pred_dict=pred_dict, target_dict=batch_y,
                             dataset=self.data, check_level=0)

    if self.verbose >= 1:
        print("[tester] \n{}".format(self._format_eval_results(eval_results)))
    self._mode(network, is_test=False)
    return eval_results
def process(self, dataset):
    self.model.eval()
    assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
    data_iterator = Batch(dataset, batch_size=self.batch_size, sampler=SequentialSampler())
    batch_output = defaultdict(list)
    if hasattr(self.model, "predict"):
        predict_func = self.model.predict
    else:
        predict_func = self.model.forward

    with torch.no_grad():
        for batch_x, _ in data_iterator:
            refined_batch_x = _build_args(predict_func, **batch_x)
            prediction = predict_func(**refined_batch_x)
            seq_lens = batch_x[self.seq_len_field_name].tolist()

            for key, value in prediction.items():
                tmp_batch = []
                value = value.cpu().numpy()
                if len(value.shape) == 1 or (len(value.shape) == 2 and value.shape[1] == 1):
                    batch_output[key].extend(value.tolist())
                else:
                    for idx, seq_len in enumerate(seq_lens):
                        tmp_batch.append(value[idx, :seq_len])
                    batch_output[key].extend(tmp_batch)

            if self.seq_len_field_name not in prediction:
                batch_output[self.seq_len_field_name].extend(seq_lens)

    # TODO: with the current implementation, downstream processors must know the keys of the model output
    for field_name, fields in batch_output.items():
        dataset.add_field(field_name, fields, is_input=True, is_target=False)

    return dataset
def test_list_of_numpy_to_tensor(self):
    ds = DataSet([Instance(x=np.array([1, 2]), y=np.array([3, 4])) for _ in range(2)] +
                 [Instance(x=np.array([1, 2, 3, 4]), y=np.array([3, 4, 5, 6])) for _ in range(2)])
    ds.set_input("x")
    ds.set_target("y")
    iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
    for x, y in iter:
        print(x, y)
def next_batch(self):
    try:
        return next(self.train_iter)
    except StopIteration:
        # the sequential pass over train_data is exhausted: rebuild the iterator and start over
        self.train_iter = iter(
            Batch(dataset=self.train_data, batch_size=self.batch_size, sampler=SequentialSampler()))
        return next(self.train_iter)
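The same restart-on-exhaustion pattern in a self-contained form (a minimal sketch only; make_train_iter is a hypothetical stand-in for rebuilding the Batch over train_data and is not part of the original code):

def make_train_iter():
    # stand-in for: iter(Batch(dataset=self.train_data, batch_size=..., sampler=SequentialSampler()))
    return iter([{"word_seq": 1}, {"word_seq": 2}, {"word_seq": 3}])

train_iter = make_train_iter()

def next_batch():
    global train_iter
    try:
        return next(train_iter)
    except StopIteration:
        # the pass is exhausted: start a new sequential pass from the beginning
        train_iter = make_train_iter()
        return next(train_iter)

print([next_batch()["word_seq"] for _ in range(5)])  # prints [1, 2, 3, 1, 2]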
def test_numpy_padding(self):
    ds = DataSet({"x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10),
                  "y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)})
    ds.set_input("x")
    ds.set_target("y")
    iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
    for x, y in iter:
        self.assertEqual(x["x"].shape, (4, 4))
        self.assertEqual(y["y"].shape, (4, 4))
def test_sequential_batch(self):
    batch_size = 32
    pause_seconds = 0.01
    num_samples = 1000
    dataset = generate_fake_dataset(num_samples)

    batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler())
    for batch_x, batch_y in batch:
        time.sleep(pause_seconds)
def test_simple(self):
    dataset = construct_dataset(
        [["FastNLP", "is", "the", "most", "beautiful", "tool", "in", "the", "world"] for _ in range(40)])
    dataset.set_target()
    batch = Batch(dataset, batch_size=4, sampler=SequentialSampler(), as_numpy=True)

    cnt = 0
    for _, _ in batch:
        cnt += 1
    self.assertEqual(cnt, 10)
def test_dataset_batching(self):
    ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
    ds.set_input("x")
    ds.set_target("y")
    iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
    for x, y in iter:
        self.assertTrue(isinstance(x["x"], np.ndarray) and isinstance(y["y"], np.ndarray))
        self.assertEqual(len(x["x"]), 4)
        self.assertEqual(len(y["y"]), 4)
        self.assertListEqual(list(x["x"][-1]), [1, 2, 3, 4])
        self.assertListEqual(list(y["y"][-1]), [5, 6])
def test_numpy_to_tensor(self):
    ds = DataSet({"x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10),
                  "y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)})
    ds.set_input("x")
    ds.set_target("y")
    iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
    for x, y in iter:
        self.assertTrue(isinstance(x["x"], torch.Tensor))
        self.assertEqual(tuple(x["x"].shape), (4, 4))
        self.assertTrue(isinstance(y["y"], torch.Tensor))
        self.assertEqual(tuple(y["y"].shape), (4, 4))
def test_list_of_list_to_tensor(self):
    ds = DataSet([Instance(x=[1, 2], y=[3, 4]) for _ in range(2)] +
                 [Instance(x=[1, 2, 3, 4], y=[3, 4, 5, 6]) for _ in range(2)])
    ds.set_input("x")
    ds.set_target("y")
    iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
    for x, y in iter:
        self.assertTrue(isinstance(x["x"], torch.Tensor))
        self.assertEqual(tuple(x["x"].shape), (4, 4))
        self.assertTrue(isinstance(y["y"], torch.Tensor))
        self.assertEqual(tuple(y["y"].shape), (4, 4))
def next_batch(self):
    try:
        _next_batch = next(self.train_iter)
        if _next_batch[0]['word_seq'].shape[0] != self.batch_size:
            raise StopIteration
        return _next_batch
    except StopIteration:
        self.train_iter = iter(
            Batch(dataset=self.train_data, batch_size=self.batch_size, sampler=SequentialSampler()))
        return self.next_batch()
def __init__(self, path=".data/yelp", dataset="yelp", batch_size=32): if dataset == "yelp": dataset = DataSet() for db_set in ['train']: text_file = os.path.join(path, 'sentiment.' + db_set + '.text') label_file = os.path.join(path, 'sentiment.' + db_set + '.labels') with io.open(text_file, 'r', encoding="utf-8") as tf, io.open( label_file, 'r', encoding="utf-8") as lf: for text in tf: label = lf.readline() dataset.append(Instance(text=text, label=label)) dataset.apply(lambda x: x['text'].lower(), new_field_name='text') dataset.apply( lambda x: ['<start>'] + x['text'].split() + ['<eos>'], new_field_name='words') dataset.drop(lambda x: len(x['words']) > 1 + 15 + 1) dataset.apply(lambda x: x['words'] + ['<pad>'] * (17 - len(x['words'])), new_field_name='words') dataset.apply(lambda x: int(x['label']), new_field_name='label_seq', is_target=True) _train_data, _test_data = dataset.split(0.3) _vocab = Vocabulary(min_freq=2) _train_data.apply( lambda x: [_vocab.add(word) for word in x['words']]) _vocab.build_vocab() _train_data.apply( lambda x: [_vocab.to_index(word) for word in x['words']], new_field_name='word_seq', is_input=True) _test_data.apply( lambda x: [_vocab.to_index(word) for word in x['words']], new_field_name='word_seq', is_input=True) self.train_data = _train_data self.test_data = _test_data self.vocab = _vocab self.batch_size = batch_size self.train_iter = iter( Batch(dataset=self.train_data, batch_size=self.batch_size, sampler=SequentialSampler()))
def test(self, model, dataset):
    self.model = model.cuda() if self.use_cuda else model
    self.model.eval()
    batchiter = Batch(dataset, self.batch_size, SequentialSampler(), self.use_cuda)
    eval_res = defaultdict(list)
    i = 0
    for batch_x, batch_y in batchiter:
        with torch.no_grad():
            pred_y = self.model(**batch_x)
            eval_one = self.model.evaluate(**pred_y, **batch_y)
        i += self.batch_size
        for eval_name, tensor in eval_one.items():
            eval_res[eval_name].append(tensor)
    tmp = {}
    for eval_name, tensorlist in eval_res.items():
        tmp[eval_name] = torch.cat(tensorlist, dim=0)
    self.res = self.model.metrics(**tmp)
def predict(self, network, data): """Perform inference using the trained model. :param network: a PyTorch model (cpu) :param data: a DataSet object. :return: list of batch outputs """ # turn on the testing mode; clean up the history self.mode(network, test=True) batch_output = [] data_iterator = Batch(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False) for batch_x, _ in data_iterator: with torch.no_grad(): prediction = self.data_forward(network, batch_x) batch_output.append(prediction) return batch_output
def __init__(self, path='.data/sst/trees', data_type='sst', batch_size=32,
             split_ratio=0.1, seq_len=15, min_freq=2):
    data_set = DataSet()
    if data_type == 'yelp':
        path = '.data/yelp'
        for db_set in ['train']:
            text_file = os.path.join(path, 'sentiment.' + db_set + '.text')
            label_file = os.path.join(path, 'sentiment.' + db_set + '.labels')
            with io.open(text_file, 'r', encoding="utf-8") as tf, \
                    io.open(label_file, 'r', encoding="utf-8") as lf:
                for text in tf:
                    label = lf.readline()
                    data_set.append(Instance(text=text, label=label))
        data_set.apply(lambda x: ['<start>'] + x['text'].lower().split() + ['<eos>'],
                       new_field_name='words')
        data_set.drop(lambda x: len(x['words']) > seq_len + 2)
    elif data_type == 'sst':
        path = '.data/sst/trees'
        text = data.Field(init_token='<start>', eos_token='<eos>', lower=True,
                          tokenize='spacy', fix_length=16)
        label = data.Field(sequential=False, unk_token='<unk>')
        filter = lambda ex: len(ex.text) <= seq_len and ex.label != 'neutral'
        sst_train = datasets.SST(os.path.join(path, 'train.txt'), text, label, filter_pred=filter)
        sst_dev = datasets.SST(os.path.join(path, 'dev.txt'), text, label, filter_pred=filter)
        sst_test = datasets.SST(os.path.join(path, 'test.txt'), text, label, filter_pred=filter)
        for ex in sst_train.examples + sst_dev.examples + sst_test.examples:
            data_set.append(Instance(words=ex.text,
                                     label={'negative': 0, 'positive': 1}[ex.label]))
        data_set.apply(lambda x: ['<start>'] + [w.lower() for w in x['words']] + ['<eos>'],
                       new_field_name='words')
    elif data_type == 'test':
        with io.open('fasttrial1.pos', 'r', encoding="utf-8") as f:
            for text in f:
                data_set.append(Instance(text=text, label=1))
        with io.open('fasttrial1.neg', 'r', encoding="utf-8") as f:
            for text in f:
                data_set.append(Instance(text=text, label=0))
        data_set.apply(lambda x: ['<start>'] + x['text'].lower().split() + ['<eos>'],
                       new_field_name='words')
        data_set.drop(lambda x: len(x['words']) > seq_len + 2)

    data_set.apply(lambda x: x['words'] + ['<pad>'] * (seq_len + 2 - len(x['words'])),
                   new_field_name='words')

    _train_data, _ = data_set.split(split_ratio)
    _vocab = Vocabulary(min_freq=min_freq)
    _train_data.apply(lambda x: [_vocab.add(word) for word in x['words']])
    _vocab.build_vocab()

    data_set.apply(lambda x: [_vocab.to_index(w) for w in x['words']],
                   new_field_name='word_seq', is_input=True)
    data_set.apply(lambda x: x['word_seq'][1:] + [0],
                   new_field_name='dec_target', is_target=True)
    data_set.apply(lambda x: int(x['label']),
                   new_field_name='label_seq', is_target=True)

    _train_data, _test_data = data_set.split(split_ratio)
    self.train_data = _train_data
    self.test_data = _test_data
    self.vocab = _vocab
    self.batch_size = batch_size
    self.train_iter = iter(Batch(dataset=self.train_data, batch_size=self.batch_size,
                                 sampler=SequentialSampler()))
def _check_code(dataset, model, batch_size=DEFAULT_CHECK_BATCH_SIZE, dev_data=None,
                check_level=WARNING_CHECK_LEVEL):
    # check the get_loss method
    model_name = model.__class__.__name__
    if not hasattr(model, 'get_loss'):
        raise AttributeError("{} has to have a 'get_loss' function.".format(model_name))

    batch = Batch(dataset=dataset, batch_size=batch_size, sampler=SequentialSampler())
    for batch_count, (batch_x, batch_y) in enumerate(batch):
        _syn_model_data(model, batch_x, batch_y)
        # forward check
        if batch_count == 0:
            _check_forward_error(model_func=model.forward, check_level=check_level, batch_x=batch_x)

        refined_batch_x = _build_args(model.forward, **batch_x)
        output = model(**refined_batch_x)
        func_signature = get_func_signature(model.forward)
        assert isinstance(output, dict), "The return value of {} should be dict.".format(func_signature)

        # loss check
        if batch_count == 0:
            _check_loss_evaluate(prev_func=model.forward, func=model.get_loss, check_level=check_level,
                                 output=output, batch_y=batch_y)
        loss_input = _build_args(model.get_loss, **output, **batch_y)
        loss = model.get_loss(**loss_input)

        # check loss output
        if batch_count == 0:
            if not isinstance(loss, torch.Tensor):
                raise ValueError("The return value of {}.get_loss() should be torch.Tensor, "
                                 "but got {}.".format(model_name, type(loss)))
            if len(loss.size()) != 0:
                raise ValueError("The size of return value of {}.get_loss() is {}, "
                                 "should be torch.size([])".format(model_name, loss.size()))
        loss.backward()
        model.zero_grad()
        if batch_count + 1 >= DEFAULT_CHECK_NUM_BATCH:
            break

    if dev_data is not None:
        if not hasattr(model, 'evaluate'):
            raise AttributeError("{} has to have an 'evaluate' function to do evaluation. "
                                 "Or set dev_data to 'None'.".format(model_name))
        outputs, truths = defaultdict(list), defaultdict(list)
        dev_batch = Batch(dataset=dataset, batch_size=batch_size, sampler=SequentialSampler())
        with torch.no_grad():
            for batch_count, (batch_x, batch_y) in enumerate(dev_batch):
                _syn_model_data(model, batch_x, batch_y)

                if hasattr(model, 'predict'):
                    refined_batch_x = _build_args(model.predict, **batch_x)
                    prev_func = model.predict
                    output = prev_func(**refined_batch_x)
                    func_signature = get_func_signature(model.predict)
                    assert isinstance(output, dict), \
                        "The return value of {} should be dict.".format(func_signature)
                else:
                    refined_batch_x = _build_args(model.forward, **batch_x)
                    prev_func = model.forward
                    output = prev_func(**refined_batch_x)
                for k, v in output.items():
                    outputs[k].append(v)
                for k, v in batch_y.items():
                    truths[k].append(v)
                if batch_count + 1 > DEFAULT_CHECK_NUM_BATCH:
                    break
            for k, v in outputs.items():
                outputs[k] = itertools.chain(*v)
            for k, v in truths.items():
                truths[k] = itertools.chain(*v)
            _check_loss_evaluate(prev_func=prev_func, func=model.evaluate, check_level=check_level,
                                 output=outputs, batch_y=truths)
            refined_input = _build_args(model.evaluate, **outputs, **truths)
            metrics = model.evaluate(**refined_input)
            func_signature = get_func_signature(model.evaluate)
            assert isinstance(metrics, dict), \
                "The return value of {} should be dict.".format(func_signature)
def _check_code(dataset, model, losser, metrics, batch_size=DEFAULT_CHECK_BATCH_SIZE,
                dev_data=None, metric_key=None, check_level=0):
    # check the get_loss method
    model_device = model.parameters().__next__().device
    batch = Batch(dataset=dataset, batch_size=batch_size, sampler=SequentialSampler())
    for batch_count, (batch_x, batch_y) in enumerate(batch):
        _move_dict_value_to_device(batch_x, batch_y, device=model_device)
        # forward check
        if batch_count == 0:
            info_str = ""
            input_fields = _get_value_info(batch_x)
            target_fields = _get_value_info(batch_y)
            if len(input_fields) > 0:
                info_str += "input fields after batch(if batch size is {}):\n".format(batch_size)
                info_str += "\n".join(input_fields)
                info_str += '\n'
            else:
                raise RuntimeError("There is no input field.")
            if len(target_fields) > 0:
                info_str += "target fields after batch(if batch size is {}):\n".format(batch_size)
                info_str += "\n".join(target_fields)
                info_str += '\n'
            else:
                info_str += 'There is no target field.'
            print(info_str)
            _check_forward_error(forward_func=model.forward, dataset=dataset,
                                 batch_x=batch_x, check_level=check_level)

        refined_batch_x = _build_args(model.forward, **batch_x)
        pred_dict = model(**refined_batch_x)
        func_signature = get_func_signature(model.forward)
        if not isinstance(pred_dict, dict):
            raise TypeError(f"The return value of {func_signature} should be `dict`, not `{type(pred_dict)}`.")

        # loss check
        try:
            loss = losser(pred_dict, batch_y)
            # check loss output
            if batch_count == 0:
                if not isinstance(loss, torch.Tensor):
                    raise TypeError(f"The return value of {get_func_signature(losser.get_loss)} should be "
                                    f"`torch.Tensor`, but got `{type(loss)}`.")
                if len(loss.size()) != 0:
                    raise ValueError(f"The size of return value of {get_func_signature(losser.get_loss)} is "
                                     f"{loss.size()}, should be torch.size([])")
            loss.backward()
        except CheckError as e:
            # TODO: another error raised if CheckError caught
            pre_func_signature = get_func_signature(model.forward)
            _check_loss_evaluate(prev_func_signature=pre_func_signature, func_signature=e.func_signature,
                                 check_res=e.check_res, pred_dict=pred_dict, target_dict=batch_y,
                                 dataset=dataset, check_level=check_level)
        model.zero_grad()
        if batch_count + 1 >= DEFAULT_CHECK_NUM_BATCH:
            break

    if dev_data is not None:
        tester = Tester(data=dataset[:batch_size * DEFAULT_CHECK_NUM_BATCH], model=model,
                        metrics=metrics, batch_size=batch_size, verbose=-1)
        evaluate_results = tester.test()
        _check_eval_results(metrics=evaluate_results, metric_key=metric_key, metric_list=metrics)
logger.info('done!')
logger.info('=========== preparing data: [{}] ==========='.format(args.task))
data_file = open('data/' + args.task + '.pkl', 'rb')
data = pickle.load(data_file)
data_file.close()
bsz = args.batch_size // args.accumulation_steps

logger.info('some examples:')
if args.task == 'MNLI':
    train_ds = text2feature(data['train'], tokenizer, args.task)
    train_dataloader = Batch(train_ds, bsz, sampler=RandomSampler())

    dev_matched_ds = text2feature(data['dev_matched'], tokenizer, args.task)
    dev_matched_dataloader = Batch(dev_matched_ds, bsz, sampler=SequentialSampler())
    dev_mismatched_ds = text2feature(data['dev_mismatched'], tokenizer, args.task)
    dev_mismatched_dataloader = Batch(dev_mismatched_ds, bsz, sampler=SequentialSampler())
    dev_dataloader = [dev_matched_dataloader, dev_mismatched_dataloader]

    test_matched_ds = text2feature(data['test_matched'], tokenizer, args.task, True)
    test_matched_dataloader = Batch(test_matched_ds, bsz, sampler=SequentialSampler())
    test_mismatched_ds = text2feature(data['test_mismatched'], tokenizer, args.task, True)
    test_mismatched_dataloader = Batch(test_mismatched_ds, bsz, sampler=SequentialSampler())
    test_dataloader = [test_matched_dataloader, test_mismatched_dataloader]

    logger.info(train_ds[0])
def test_sequential_sampler():
    sampler = SequentialSampler()
    data = [1, 3, 5, 7, 9, 2, 4, 6, 8, 10]
    for idx, i in enumerate(sampler(data)):
        assert idx == i
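A minimal standalone sketch of the contract this test exercises (the import path and the list literal are assumptions for illustration): calling the sampler on a sized collection yields its indices in their original order, so batching over it visits the data without shuffling.

from fastNLP import SequentialSampler  # import path assumed

sampler = SequentialSampler()
indices = list(sampler(["a", "b", "c", "d"]))
print(indices)  # expected: [0, 1, 2, 3] -- a sequential, unshuffled pass over the data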
def __init__(self, path=".data/sst2/", dataset="sst2", batch_size=32): if dataset == "sst2": dataset = DataSet() for db_set in ['full']: text_file = os.path.join(path, 'sst2-' + db_set + '.text') label_file = os.path.join(path, 'sst2-' + db_set + '.labels') with io.open(text_file, 'r', encoding="utf-8") as tf, io.open( label_file, 'r', encoding="utf-8") as lf: for text in tf: label = lf.readline() dataset.append(Instance(text=text, label=label)) dataset.apply(lambda x: x['text'].lower(), new_field_name='text') dataset.apply( lambda x: ['<start>'] + x['text'].split() + ['<eos>'], new_field_name='words') dataset.drop(lambda x: len(x['words']) > 17) dataset.apply(lambda x: x['words'] + ['<pad>'] * (17 - len(x['words'])), new_field_name='words') dataset.apply(lambda x: int(x['label']), new_field_name='label_seq', is_target=True) # 不能随机分 # _train_data, _test_data = dataset.split(0.3) _train_data = dataset dataset2 = DataSet() for db_set in ['test']: text_file = os.path.join(path, 'sst2-' + db_set + '.text') label_file = os.path.join(path, 'sst2-' + db_set + '.labels') with io.open(text_file, 'r', encoding="utf-8") as tf, io.open( label_file, 'r', encoding="utf-8") as lf: for text in tf: label = lf.readline() dataset2.append(Instance(text=text, label=label)) dataset2.apply(lambda x: x['text'].lower(), new_field_name='text') dataset2.apply( lambda x: ['<start>'] + x['text'].split() + ['<eos>'], new_field_name='words') dataset2.drop(lambda x: len(x['words']) > 17) dataset2.apply(lambda x: x['words'] + ['<pad>'] * (17 - len(x['words'])), new_field_name='words') dataset2.apply(lambda x: int(x['label']), new_field_name='label_seq', is_target=True) _test_data = dataset2 # _vocab = Vocabulary(min_freq=2) # TODO:设置min_freq会使得counter较大,影响vector _vocab = Vocabulary() _train_data.apply( lambda x: [_vocab.add(word) for word in x['words']]) _vocab.build_vocab() del _vocab.word_count['<unk>'] del _vocab.word_count['<pad>'] del _vocab.word_count['<start>'] del _vocab.word_count['<eos>'] _train_data.apply( lambda x: [_vocab.to_index(word) for word in x['words']], new_field_name='word_seq', is_input=True) _test_data.apply( lambda x: [_vocab.to_index(word) for word in x['words']], new_field_name='word_seq', is_input=True) self.train_data = _train_data self.test_data = _test_data self.vocab = _vocab self.n_vocab = len(_vocab.idx2word) - 3 self.batch_size = batch_size self.train_iter = iter( Batch(dataset=self.train_data, batch_size=self.batch_size, sampler=SequentialSampler()))