Example #1
    def init_data_loader(self, batch_size):

        self.train_data_loader = DataSetIter(self.train_set,
                                             batch_size,
                                             sampler=RandomSampler())
        self.dev_data_loader = DataSetIter(self.dev_set,
                                           batch_size,
                                           sampler=SequentialSampler())
        self.test_data_loader = DataSetIter(self.test_set,
                                            batch_size,
                                            sampler=SequentialSampler())
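
The loaders built above are consumed like the Batch iterators in the later examples: each step yields a (batch_x, batch_y) pair of dicts mapping field names to padded tensors. A minimal, hypothetical consumption loop follows; model, optimizer and loss_fn are placeholder names, not part of the example:

        # Hypothetical usage of the loaders created above; `model`, `optimizer`
        # and `loss_fn` are placeholders and are not defined in the example.
        for batch_x, batch_y in self.train_data_loader:
            optimizer.zero_grad()
            pred = model(**batch_x)        # batch_x: dict of input-field tensors
            loss = loss_fn(pred, batch_y)  # batch_y: dict of target-field tensors
            loss.backward()
            optimizer.step()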
Example #2
    def predict(self, network, data):
        """Perform inference using the trained model.

        :param network: a PyTorch model (cpu)
        :param data: a DataSet object.
        :return: list of list of strings, [num_examples, tag_seq_length]
        """
        # transform strings into DataSet object
        # data = self.prepare_input(data)

        # turn on the testing mode; clean up the history
        self.mode(network, test=True)
        batch_output = []

        data_iterator = Batch(data,
                              batch_size=self.batch_size,
                              sampler=SequentialSampler(),
                              use_cuda=False)

        for batch_x, _ in data_iterator:
            with torch.no_grad():
                prediction = self.data_forward(network, batch_x)
            batch_output.append(prediction)

        return self._post_processor(batch_output, self.label_vocab)
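
A hypothetical call site for the predict() method above; predictor, model and test_set are illustrative names, and the return shape follows the docstring:

    # Hypothetical usage; `predictor` is an instance of the class that defines
    # predict() above, `model` a trained CPU PyTorch model, and `test_set` a
    # DataSet whose input fields have been set.
    tag_sequences = predictor.predict(network=model, data=test_set)
    # tag_sequences: list of list of strings, [num_examples, tag_seq_length]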
Example #3
    def test(self, filepath):

        tag_proc = self._dict['tag_indexer']
        cws_model = self.pipeline.pipeline[-2].model
        pipeline = self.pipeline.pipeline[:5]

        pipeline.insert(1, tag_proc)
        pp = Pipeline(pipeline)

        reader = ConlluCWSReader()

        # te_filename = '/home/hyan/ctb3/test.conllx'
        te_dataset = reader.load(filepath)
        pp(te_dataset)

        batch_size = 64
        te_batcher = Batch(te_dataset,
                           batch_size,
                           SequentialSampler(),
                           use_cuda=False)
        pre, rec, f1 = calculate_pre_rec_f1(cws_model, te_batcher, type='bmes')
        f1 = round(f1 * 100, 2)
        pre = round(pre * 100, 2)
        rec = round(rec * 100, 2)
        # print("f1:{:.2f}, pre:{:.2f}, rec:{:.2f}".format(f1, pre, rec))

        return f1, pre, rec
Example #4
    def test(self):
        # turn on the testing mode; clean up the history
        network = self._model
        self._mode(network, is_test=True)
        data_iterator = Batch(self.data, self.batch_size, sampler=SequentialSampler(), as_numpy=False)
        eval_results = {}
        try:
            with torch.no_grad():
                for batch_x, batch_y in data_iterator:
                    _move_dict_value_to_device(batch_x, batch_y, device=self._model_device)
                    pred_dict = self._data_forward(self._predict_func, batch_x)
                    if not isinstance(pred_dict, dict):
                        raise TypeError(f"The return value of {get_func_signature(self._predict_func)} " 
                                                         f"must be `dict`, got {type(pred_dict)}.")
                    for metric in self.metrics:
                        metric(pred_dict, batch_y)
                for metric in self.metrics:
                    eval_result = metric.get_metric()
                    if not isinstance(eval_result, dict):
                        raise TypeError(f"The return value of {get_func_signature(metric.get_metric)} must be "
                                        f"`dict`, got {type(eval_result)}")
                    metric_name = metric.__class__.__name__
                    eval_results[metric_name] = eval_result
        except CheckError as e:
            prev_func_signature = get_func_signature(self._predict_func)
            _check_loss_evaluate(prev_func_signature=prev_func_signature, func_signature=e.func_signature,
                                 check_res=e.check_res, pred_dict=pred_dict, target_dict=batch_y,
                                 dataset=self.data, check_level=0)

        if self.verbose >= 1:
            print("[tester] \n{}".format(self._format_eval_results(eval_results)))
        self._mode(network, is_test=False)
        return eval_results
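
A hypothetical call site for the test() method above. The constructor arguments mirror the Tester construction that appears in Example #20; the concrete dataset, model and metric are placeholders (AccuracyMetric is assumed to be fastNLP's built-in accuracy metric and is not shown in these examples):

    # Hypothetical usage; `dev_set` and `model` are placeholders.
    tester = Tester(data=dev_set, model=model, metrics=AccuracyMetric(),
                    batch_size=32, verbose=1)
    eval_results = tester.test()   # e.g. {"AccuracyMetric": {"acc": ...}}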
Example #5
    def process(self, dataset):
        self.model.eval()
        assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
        data_iterator = Batch(dataset, batch_size=self.batch_size, sampler=SequentialSampler())

        batch_output = defaultdict(list)
        if hasattr(self.model, "predict"):
            predict_func = self.model.predict
        else:
            predict_func = self.model.forward
        with torch.no_grad():
            for batch_x, _ in data_iterator:
                refined_batch_x = _build_args(predict_func, **batch_x)
                prediction = predict_func(**refined_batch_x)
                seq_lens = batch_x[self.seq_len_field_name].tolist()

                for key, value in prediction.items():
                    tmp_batch = []
                    value = value.cpu().numpy()
                    if len(value.shape) == 1 or (len(value.shape) == 2 and value.shape[1] == 1):
                        batch_output[key].extend(value.tolist())
                    else:
                        for idx, seq_len in enumerate(seq_lens):
                            tmp_batch.append(value[idx, :seq_len])
                        batch_output[key].extend(tmp_batch)
                if self.seq_len_field_name not in prediction:
                    batch_output[self.seq_len_field_name].extend(seq_lens)

        # TODO: with the current implementation, downstream processors need to know which output keys the model produces
        for field_name, fields in batch_output.items():
            dataset.add_field(field_name, fields, is_input=True, is_target=False)

        return dataset
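
The effect of process() above is to write the model's outputs back into the dataset as new input fields; a hypothetical usage sketch, where proc and ds are illustrative names:

        # Hypothetical usage; `proc` is an instance of the processor class that
        # defines process() above, `ds` a DataSet with its input fields set.
        ds = proc.process(ds)
        # ds now holds one new input field per key of the model's output dict
        # (plus the sequence-length field), as noted in the TODO above.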
Example #6
 def test_list_of_numpy_to_tensor(self):
     ds = DataSet([Instance(x=np.array([1, 2]), y=np.array([3, 4])) for _ in range(2)] +
                  [Instance(x=np.array([1, 2, 3, 4]), y=np.array([3, 4, 5, 6])) for _ in range(2)])
     ds.set_input("x")
     ds.set_target("y")
     iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
     for x, y in iter:
         print(x, y)
Example #7
 def next_batch(self):
     try:
         return next(self.train_iter)
     except StopIteration:
         self.train_iter = iter(
             Batch(dataset=self.train_data,
                   batch_size=self.batch_size,
                   sampler=SequentialSampler()))
         return next(self.train_iter)
Example #8
 def test_numpy_padding(self):
     ds = DataSet({"x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10),
                   "y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)})
     ds.set_input("x")
     ds.set_target("y")
     iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
     for x, y in iter:
         self.assertEqual(x["x"].shape, (4, 4))
         self.assertEqual(y["y"].shape, (4, 4))
Example #9
    def test_sequential_batch(self):
        batch_size = 32
        pause_seconds = 0.01
        num_samples = 1000
        dataset = generate_fake_dataset(num_samples)

        batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler())
        for batch_x, batch_y in batch:
            time.sleep(pause_seconds)
Example #10
    def test_simple(self):
        dataset = construct_dataset(
            [["FastNLP", "is", "the", "most", "beautiful", "tool", "in", "the", "world"] for _ in range(40)])
        dataset.set_target()
        batch = Batch(dataset, batch_size=4, sampler=SequentialSampler(), as_numpy=True)

        cnt = 0
        for _, _ in batch:
            cnt += 1
        self.assertEqual(cnt, 10)
Example #11
 def test_dataset_batching(self):
     ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
     ds.set_input("x")
     ds.set_target("y")
     iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
     for x, y in iter:
         self.assertTrue(isinstance(x["x"], np.ndarray) and isinstance(y["y"], np.ndarray))
         self.assertEqual(len(x["x"]), 4)
         self.assertEqual(len(y["y"]), 4)
         self.assertListEqual(list(x["x"][-1]), [1, 2, 3, 4])
         self.assertListEqual(list(y["y"][-1]), [5, 6])
Example #12
 def test_numpy_to_tensor(self):
     ds = DataSet({"x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10),
                   "y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)})
     ds.set_input("x")
     ds.set_target("y")
     iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
     for x, y in iter:
         self.assertTrue(isinstance(x["x"], torch.Tensor))
         self.assertEqual(tuple(x["x"].shape), (4, 4))
         self.assertTrue(isinstance(y["y"], torch.Tensor))
         self.assertEqual(tuple(y["y"].shape), (4, 4))
Example #13
 def test_list_of_list_to_tensor(self):
     ds = DataSet([Instance(x=[1, 2], y=[3, 4]) for _ in range(2)] +
                  [Instance(x=[1, 2, 3, 4], y=[3, 4, 5, 6]) for _ in range(2)])
     ds.set_input("x")
     ds.set_target("y")
     iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
     for x, y in iter:
         self.assertTrue(isinstance(x["x"], torch.Tensor))
         self.assertEqual(tuple(x["x"].shape), (4, 4))
         self.assertTrue(isinstance(y["y"], torch.Tensor))
         self.assertEqual(tuple(y["y"].shape), (4, 4))
Example #14
 def next_batch(self):
     try:
         _next_batch = next(self.train_iter)
         if _next_batch[0]['word_seq'].shape[0] != self.batch_size:
             raise StopIteration
         return _next_batch
     except StopIteration:
         self.train_iter = iter(
             Batch(dataset=self.train_data,
                   batch_size=self.batch_size,
                   sampler=SequentialSampler()))
         return self.next_batch()
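
The helper above always returns full-size batches: an undersized tail batch raises StopIteration, which rebuilds the iterator and recurses. A hypothetical driver loop follows; loader and num_steps are illustrative names:

     # Hypothetical usage of next_batch() above; `loader` is an instance of
     # the surrounding class and `num_steps` an arbitrary step count.
     for step in range(num_steps):
         batch_x, batch_y = loader.next_batch()
         assert batch_x['word_seq'].shape[0] == loader.batch_size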
Example #15
    def __init__(self, path=".data/yelp", dataset="yelp", batch_size=32):

        if dataset == "yelp":
            dataset = DataSet()

            for db_set in ['train']:
                text_file = os.path.join(path, 'sentiment.' + db_set + '.text')
                label_file = os.path.join(path,
                                          'sentiment.' + db_set + '.labels')
                with io.open(text_file, 'r', encoding="utf-8") as tf, io.open(
                        label_file, 'r', encoding="utf-8") as lf:
                    for text in tf:
                        label = lf.readline()
                        dataset.append(Instance(text=text, label=label))

            dataset.apply(lambda x: x['text'].lower(), new_field_name='text')
            dataset.apply(
                lambda x: ['<start>'] + x['text'].split() + ['<eos>'],
                new_field_name='words')
            dataset.drop(lambda x: len(x['words']) > 1 + 15 + 1)
            dataset.apply(lambda x: x['words'] + ['<pad>'] *
                          (17 - len(x['words'])),
                          new_field_name='words')
            dataset.apply(lambda x: int(x['label']),
                          new_field_name='label_seq',
                          is_target=True)

            _train_data, _test_data = dataset.split(0.3)

            _vocab = Vocabulary(min_freq=2)
            _train_data.apply(
                lambda x: [_vocab.add(word) for word in x['words']])
            _vocab.build_vocab()

            _train_data.apply(
                lambda x: [_vocab.to_index(word) for word in x['words']],
                new_field_name='word_seq',
                is_input=True)
            _test_data.apply(
                lambda x: [_vocab.to_index(word) for word in x['words']],
                new_field_name='word_seq',
                is_input=True)

        self.train_data = _train_data
        self.test_data = _test_data
        self.vocab = _vocab
        self.batch_size = batch_size
        self.train_iter = iter(
            Batch(dataset=self.train_data,
                  batch_size=self.batch_size,
                  sampler=SequentialSampler()))
Example #16
    def test(self, model, dataset):
        self.model = model.cuda() if self.use_cuda else model
        self.model.eval()
        batchiter = Batch(dataset, self.batch_size, SequentialSampler(),
                          self.use_cuda)
        eval_res = defaultdict(list)
        i = 0
        for batch_x, batch_y in batchiter:
            with torch.no_grad():
                pred_y = self.model(**batch_x)
                eval_one = self.model.evaluate(**pred_y, **batch_y)
            i += self.batch_size
            for eval_name, tensor in eval_one.items():
                eval_res[eval_name].append(tensor)
        tmp = {}
        for eval_name, tensorlist in eval_res.items():
            tmp[eval_name] = torch.cat(tensorlist, dim=0)

        self.res = self.model.metrics(**tmp)
Example #17
    def predict(self, network, data):
        """Perform inference using the trained model.

        :param network: a PyTorch model (cpu)
        :param data: a DataSet object.
        :return: list of batch outputs
        """
        # turn on the testing mode; clean up the history
        self.mode(network, test=True)
        batch_output = []

        data_iterator = Batch(data,
                              batch_size=self.batch_size,
                              sampler=SequentialSampler(),
                              as_numpy=False)

        for batch_x, _ in data_iterator:
            with torch.no_grad():
                prediction = self.data_forward(network, batch_x)
            batch_output.append(prediction)

        return batch_output
Example #18
    def __init__(self,
                 path='.data/sst/trees',
                 data_type='sst',
                 batch_size=32,
                 split_ratio=0.1,
                 seq_len=15,
                 min_freq=2):

        data_set = DataSet()
        if data_type == 'yelp':
            path = '.data/yelp'
            for db_set in ['train']:
                text_file = os.path.join(path, 'sentiment.' + db_set + '.text')
                label_file = os.path.join(path,
                                          'sentiment.' + db_set + '.labels')

                with io.open(text_file, 'r', encoding="utf-8") as tf, io.open(
                        label_file, 'r', encoding="utf-8") as lf:
                    for text in tf:
                        label = lf.readline()
                        data_set.append(Instance(text=text, label=label))

            data_set.apply(
                lambda x: ['<start>'] + x['text'].lower().split() + ['<eos>'],
                new_field_name='words')
            data_set.drop(lambda x: len(x['words']) > seq_len + 2)

        elif data_type == 'sst':
            path = '.data/sst/trees'
            text = data.Field(init_token='<start>',
                              eos_token='<eos>',
                              lower=True,
                              tokenize='spacy',
                              fix_length=16)
            label = data.Field(sequential=False, unk_token='<unk>')
            filter = lambda ex: len(ex.text) <= seq_len and ex.label != 'neutral'
            sst_train = datasets.SST(os.path.join(path, 'train.txt'),
                                     text,
                                     label,
                                     filter_pred=filter)
            sst_dev = datasets.SST(os.path.join(path, 'dev.txt'),
                                   text,
                                   label,
                                   filter_pred=filter)
            sst_test = datasets.SST(os.path.join(path, 'test.txt'),
                                    text,
                                    label,
                                    filter_pred=filter)
            for ex in sst_train.examples + sst_dev.examples + sst_test.examples:
                data_set.append(
                    Instance(words=ex.text,
                             label={
                                 'negative': 0,
                                 'positive': 1
                             }[ex.label]))

            data_set.apply(
                lambda x: ['<start>'] + [w.lower()
                                         for w in x['words']] + ['<eos>'],
                new_field_name='words')

        elif data_type == 'test':
            with io.open('fasttrial1.pos', 'r', encoding="utf-8") as f:
                for text in f:
                    data_set.append(Instance(text=text, label=1))
            with io.open('fasttrial1.neg', 'r', encoding="utf-8") as f:
                for text in f:
                    data_set.append(Instance(text=text, label=0))

            data_set.apply(
                lambda x: ['<start>'] + x['text'].lower().split() + ['<eos>'],
                new_field_name='words')
            data_set.drop(lambda x: len(x['words']) > seq_len + 2)

        data_set.apply(lambda x: x['words'] + ['<pad>'] *
                       (seq_len + 2 - len(x['words'])),
                       new_field_name='words')

        _train_data, _ = data_set.split(split_ratio)

        _vocab = Vocabulary(min_freq=min_freq)
        _train_data.apply(lambda x: [_vocab.add(word) for word in x['words']])
        _vocab.build_vocab()

        data_set.apply(lambda x: [_vocab.to_index(w) for w in x['words']],
                       new_field_name='word_seq',
                       is_input=True)
        data_set.apply(lambda x: x['word_seq'][1:] + [0],
                       new_field_name='dec_target',
                       is_target=True)
        data_set.apply(lambda x: int(x['label']),
                       new_field_name='label_seq',
                       is_target=True)
        _train_data, _test_data = data_set.split(split_ratio)

        self.train_data = _train_data
        self.test_data = _test_data
        self.vocab = _vocab
        self.batch_size = batch_size
        self.train_iter = iter(
            Batch(dataset=self.train_data,
                  batch_size=self.batch_size,
                  sampler=SequentialSampler()))
Example #19
def _check_code(dataset,
                model,
                batch_size=DEFAULT_CHECK_BATCH_SIZE,
                dev_data=None,
                check_level=WARNING_CHECK_LEVEL):
    # check the get_loss method
    model_name = model.__class__.__name__
    if not hasattr(model, 'get_loss'):
        raise AttributeError(
            "{} has to have a 'get_loss' function.".format(model_name))

    batch = Batch(dataset=dataset,
                  batch_size=batch_size,
                  sampler=SequentialSampler())
    for batch_count, (batch_x, batch_y) in enumerate(batch):
        _syn_model_data(model, batch_x, batch_y)
        # forward check
        if batch_count == 0:
            _check_forward_error(model_func=model.forward,
                                 check_level=check_level,
                                 batch_x=batch_x)

        refined_batch_x = _build_args(model.forward, **batch_x)
        output = model(**refined_batch_x)
        func_signature = get_func_signature(model.forward)
        assert isinstance(
            output, dict), "The return value of {} should be dict.".format(
                func_signature)

        # loss check
        if batch_count == 0:
            _check_loss_evaluate(prev_func=model.forward,
                                 func=model.get_loss,
                                 check_level=check_level,
                                 output=output,
                                 batch_y=batch_y)
        loss_input = _build_args(model.get_loss, **output, **batch_y)
        loss = model.get_loss(**loss_input)

        # check loss output
        if batch_count == 0:
            if not isinstance(loss, torch.Tensor):
                raise ValueError(
                    "The return value of {}.get_loss() should be torch.Tensor, but {} got."
                    .format(model_name, type(loss)))
            if len(loss.size()) != 0:
                raise ValueError(
                    "The size of return value of {}.get_loss() is {}, should be torch.size([])"
                    .format(model_name, loss.size()))
        loss.backward()
        model.zero_grad()
        if batch_count + 1 >= DEFAULT_CHECK_NUM_BATCH:
            break

    if dev_data is not None:
        if not hasattr(model, 'evaluate'):
            raise AttributeError(
                "{} has to have a 'evaluate' function to do evaluation. Or set"
                "dev_data to 'None'.".format(model_name))
        outputs, truths = defaultdict(list), defaultdict(list)
        dev_batch = Batch(dataset=dataset,
                          batch_size=batch_size,
                          sampler=SequentialSampler())
        with torch.no_grad():
            for batch_count, (batch_x, batch_y) in enumerate(dev_batch):
                _syn_model_data(model, batch_x, batch_y)

                if hasattr(model, 'predict'):
                    refined_batch_x = _build_args(model.predict, **batch_x)
                    prev_func = model.predict
                    output = prev_func(**refined_batch_x)
                    func_signature = get_func_signature(model.predict)
                    assert isinstance(
                        output,
                        dict), "The return value of {} should be dict.".format(
                            func_signature)
                else:
                    refined_batch_x = _build_args(model.forward, **batch_x)
                    prev_func = model.forward
                    output = prev_func(**refined_batch_x)
                for k, v in output.items():
                    outputs[k].append(v)
                for k, v in batch_y.items():
                    truths[k].append(v)
                if batch_count + 1 > DEFAULT_CHECK_NUM_BATCH:
                    break
            for k, v in outputs.items():
                outputs[k] = itertools.chain(*v)
            for k, v in truths.items():
                truths[k] = itertools.chain(*v)
            _check_loss_evaluate(prev_func=prev_func,
                                 func=model.evaluate,
                                 check_level=check_level,
                                 output=outputs,
                                 batch_y=truths)
            refined_input = _build_args(model.evaluate, **outputs, **truths)
            metrics = model.evaluate(**refined_input)
            func_signature = get_func_signature(model.evaluate)
            assert isinstance(metrics, dict), "The return value of {} should be dict.". \
                format(func_signature)
Example #20
def _check_code(dataset,
                model,
                losser,
                metrics,
                batch_size=DEFAULT_CHECK_BATCH_SIZE,
                dev_data=None,
                metric_key=None,
                check_level=0):
    # check the get_loss method
    model_device = model.parameters().__next__().device

    batch = Batch(dataset=dataset,
                  batch_size=batch_size,
                  sampler=SequentialSampler())
    for batch_count, (batch_x, batch_y) in enumerate(batch):
        _move_dict_value_to_device(batch_x, batch_y, device=model_device)
        # forward check
        if batch_count == 0:
            info_str = ""
            input_fields = _get_value_info(batch_x)
            target_fields = _get_value_info(batch_y)
            if len(input_fields) > 0:
                info_str += "input fields after batch(if batch size is {}):\n".format(
                    batch_size)
                info_str += "\n".join(input_fields)
                info_str += '\n'
            else:
                raise RuntimeError("There is no input field.")
            if len(target_fields) > 0:
                info_str += "target fields after batch(if batch size is {}):\n".format(
                    batch_size)
                info_str += "\n".join(target_fields)
                info_str += '\n'
            else:
                info_str += 'There is no target field.'
            print(info_str)
            _check_forward_error(forward_func=model.forward,
                                 dataset=dataset,
                                 batch_x=batch_x,
                                 check_level=check_level)

        refined_batch_x = _build_args(model.forward, **batch_x)
        pred_dict = model(**refined_batch_x)
        func_signature = get_func_signature(model.forward)
        if not isinstance(pred_dict, dict):
            raise TypeError(
                f"The return value of {func_signature} should be `dict`, not `{type(pred_dict)}`."
            )

        # loss check
        try:
            loss = losser(pred_dict, batch_y)
            # check loss output
            if batch_count == 0:
                if not isinstance(loss, torch.Tensor):
                    raise TypeError(
                        f"The return value of {get_func_signature(losser.get_loss)} should be `torch.Tensor`, "
                        f"but got `{type(loss)}`.")
                if len(loss.size()) != 0:
                    raise ValueError(
                        f"The size of return value of {get_func_signature(losser.get_loss)} is {loss.size()}, "
                        f"should be torch.size([])")
            loss.backward()
        except CheckError as e:
            # TODO: another error raised if CheckError caught
            pre_func_signature = get_func_signature(model.forward)
            _check_loss_evaluate(prev_func_signature=pre_func_signature,
                                 func_signature=e.func_signature,
                                 check_res=e.check_res,
                                 pred_dict=pred_dict,
                                 target_dict=batch_y,
                                 dataset=dataset,
                                 check_level=check_level)
        model.zero_grad()
        if batch_count + 1 >= DEFAULT_CHECK_NUM_BATCH:
            break

    if dev_data is not None:
        tester = Tester(data=dataset[:batch_size * DEFAULT_CHECK_NUM_BATCH],
                        model=model,
                        metrics=metrics,
                        batch_size=batch_size,
                        verbose=-1)
        evaluate_results = tester.test()
        _check_eval_results(metrics=evaluate_results,
                            metric_key=metric_key,
                            metric_list=metrics)
Example #21
    logger.info('done!')

    logger.info('=========== preparing data: [{}] ==========='.format(args.task))
    data_file = open('data/' + args.task + '.pkl', 'rb')
    data = pickle.load(data_file)
    data_file.close()

    bsz = args.batch_size // args.accumulation_steps

    logger.info('some examples:')
    if args.task == 'MNLI':
        train_ds = text2feature(data['train'], tokenizer, args.task)
        train_dataloader = Batch(train_ds, bsz, sampler=RandomSampler())

        dev_matched_ds = text2feature(data['dev_matched'], tokenizer, args.task)
        dev_matched_dataloader = Batch(dev_matched_ds, bsz, sampler=SequentialSampler())

        dev_mismatched_ds = text2feature(data['dev_mismatched'], tokenizer, args.task)
        dev_mismatched_dataloader = Batch(dev_mismatched_ds, bsz, sampler=SequentialSampler())

        dev_dataloader = [dev_matched_dataloader, dev_mismatched_dataloader]

        test_matched_ds = text2feature(data['test_matched'], tokenizer, args.task, True)
        test_matched_dataloader = Batch(test_matched_ds, bsz, sampler=SequentialSampler())

        test_mismatched_ds = text2feature(data['test_mismatched'], tokenizer, args.task, True)
        test_mismatched_dataloader = Batch(test_mismatched_ds, bsz, sampler=SequentialSampler())

        test_dataloader = [test_matched_dataloader, test_mismatched_dataloader]

        logger.info(train_ds[0])
Example #22
def test_sequential_sampler():
    sampler = SequentialSampler()
    data = [1, 3, 5, 7, 9, 2, 4, 6, 8, 10]
    for idx, i in enumerate(sampler(data)):
        assert idx == i
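
SequentialSampler simply yields the indices 0..len(data)-1 in order, which is why the assertion above holds. A minimal sketch of plugging it into Batch, assuming DataSet, Batch and SequentialSampler are imported from fastNLP as in the surrounding tests; the field names and batch size are illustrative:

def sequential_batch_demo():
    # Same pattern as the tests above; all "x" rows have equal length, so the
    # padded batches come out as fixed-shape numpy arrays.
    ds = DataSet({"x": [[1, 2, 3, 4]] * 8, "y": [[5, 6]] * 8})
    ds.set_input("x")
    ds.set_target("y")
    for batch_x, batch_y in Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True):
        print(batch_x["x"].shape, batch_y["y"].shape)  # (4, 4) (4, 2)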
Example #23
    def __init__(self, path=".data/sst2/", dataset="sst2", batch_size=32):
        if dataset == "sst2":
            dataset = DataSet()

            for db_set in ['full']:
                text_file = os.path.join(path, 'sst2-' + db_set + '.text')
                label_file = os.path.join(path, 'sst2-' + db_set + '.labels')
                with io.open(text_file, 'r', encoding="utf-8") as tf, io.open(
                        label_file, 'r', encoding="utf-8") as lf:
                    for text in tf:
                        label = lf.readline()
                        dataset.append(Instance(text=text, label=label))

            dataset.apply(lambda x: x['text'].lower(), new_field_name='text')
            dataset.apply(
                lambda x: ['<start>'] + x['text'].split() + ['<eos>'],
                new_field_name='words')
            dataset.drop(lambda x: len(x['words']) > 17)
            dataset.apply(lambda x: x['words'] + ['<pad>'] *
                          (17 - len(x['words'])),
                          new_field_name='words')
            dataset.apply(lambda x: int(x['label']),
                          new_field_name='label_seq',
                          is_target=True)

            # cannot be split randomly
            # _train_data, _test_data = dataset.split(0.3)
            _train_data = dataset

            dataset2 = DataSet()

            for db_set in ['test']:
                text_file = os.path.join(path, 'sst2-' + db_set + '.text')
                label_file = os.path.join(path, 'sst2-' + db_set + '.labels')
                with io.open(text_file, 'r', encoding="utf-8") as tf, io.open(
                        label_file, 'r', encoding="utf-8") as lf:
                    for text in tf:
                        label = lf.readline()
                        dataset2.append(Instance(text=text, label=label))

            dataset2.apply(lambda x: x['text'].lower(), new_field_name='text')
            dataset2.apply(
                lambda x: ['<start>'] + x['text'].split() + ['<eos>'],
                new_field_name='words')
            dataset2.drop(lambda x: len(x['words']) > 17)
            dataset2.apply(lambda x: x['words'] + ['<pad>'] *
                           (17 - len(x['words'])),
                           new_field_name='words')
            dataset2.apply(lambda x: int(x['label']),
                           new_field_name='label_seq',
                           is_target=True)

            _test_data = dataset2

            # _vocab = Vocabulary(min_freq=2)
            # TODO: setting min_freq makes the counter larger, which affects the vectors
            _vocab = Vocabulary()
            _train_data.apply(
                lambda x: [_vocab.add(word) for word in x['words']])

            _vocab.build_vocab()
            del _vocab.word_count['<unk>']
            del _vocab.word_count['<pad>']
            del _vocab.word_count['<start>']
            del _vocab.word_count['<eos>']

            _train_data.apply(
                lambda x: [_vocab.to_index(word) for word in x['words']],
                new_field_name='word_seq',
                is_input=True)
            _test_data.apply(
                lambda x: [_vocab.to_index(word) for word in x['words']],
                new_field_name='word_seq',
                is_input=True)

        self.train_data = _train_data
        self.test_data = _test_data
        self.vocab = _vocab
        self.n_vocab = len(_vocab.idx2word) - 3
        self.batch_size = batch_size
        self.train_iter = iter(
            Batch(dataset=self.train_data,
                  batch_size=self.batch_size,
                  sampler=SequentialSampler()))