예제 #1
0
    def test_case_1(self):
        """Smoke-test SeqLabelTrainer end-to-end on a tiny toy dataset.

        Builds a six-example sequence-labeling dataset, trains a SeqLabeling
        model for 3 epochs (dev set == train set), then removes the pickle
        directory the trainer created. Passing == not raising.
        """
        import shutil  # local import: only needed for the final cleanup

        args = {
            "epochs": 3,
            "batch_size": 2,
            "validate": False,
            "use_cuda": False,
            "pickle_path": "./save/",
            "save_best_dev": True,
            "model_name": "default_model_name.pkl",
            "loss": Loss("cross_entropy"),
            "optimizer": Optimizer("Adam", lr=0.001, weight_decay=0),
            "vocab_size": 10,
            "word_emb_dim": 100,
            "rnn_hidden_units": 100,
            "num_classes": 5,
            "evaluator": SeqLabelEvaluator()
        }
        trainer = SeqLabelTrainer(**args)

        # Each example is [token_seq, label_seq]; every label_seq is identical
        # so the model has a trivially learnable target.
        train_data = [
            [['a', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
            [['a', '@', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
            [['a', 'b', '#', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
            [['a', 'b', 'c', '?', 'e'], ['a', '@', 'c', 'd', 'e']],
            [['a', 'b', 'c', 'd', '$'], ['a', '@', 'c', 'd', 'e']],
            [['!', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
        ]
        vocab = {
            'a': 0,
            'b': 1,
            'c': 2,
            'd': 3,
            'e': 4,
            '!': 5,
            '@': 6,
            '#': 7,
            '$': 8,
            '?': 9
        }
        label_vocab = {'a': 0, '@': 1, 'c': 2, 'd': 3, 'e': 4}

        data_set = DataSet()
        for example in train_data:
            text, label = example[0], example[1]
            # keyword-explicit is_target (was a bare positional `False`)
            x = TextField(text, is_target=False)
            x_len = LabelField(len(text), is_target=False)
            y = TextField(label, is_target=False)
            ins = Instance(word_seq=x, truth=y, word_seq_origin_len=x_len)
            data_set.append(ins)

        data_set.index_field("word_seq", vocab)
        data_set.index_field("truth", label_vocab)

        model = SeqLabeling(args)

        trainer.train(network=model, train_data=data_set, dev_data=data_set)
        # If this can run, everything is OK.

        # Portable cleanup: replaces `os.system("rm -rf save")`, which shells
        # out (injection-prone pattern) and silently does nothing on Windows.
        shutil.rmtree("./save/", ignore_errors=True)
        print("pickle path deleted")
예제 #2
0
    def load(self, path):
        """Read a CoNLL-style file and wrap it in a DataSet.

        Sentences are separated by blank lines; lines starting with '#' are
        comments. Every other line is tab-split into one token row.

        :param path: str, path to the CoNLL file (UTF-8).
        :return: a DataSet named 'conll' with word/pos/head fields.
        """
        sentences = []
        with open(path, 'r', encoding='utf-8') as f:
            current = []
            for line in f:
                if line.startswith('#'):
                    # comment line: skip entirely
                    continue
                if line.startswith('\n'):
                    # blank line ends the current sentence
                    sentences.append(current)
                    current = []
                else:
                    current.append(line.split('\t'))
            if current:
                # file may not end with a blank line
                sentences.append(current)

        ds = DataSet(name='conll')
        for sentence in sentences:
            res = self.get_one(sentence)
            instance = Instance(
                word_seq=TextField(res[0], is_target=False),
                pos_seq=TextField(res[1], is_target=False),
                head_indices=SeqLabelField(res[2], is_target=True),
                head_labels=TextField(res[3], is_target=True))
            ds.append(instance)

        return ds
예제 #3
0
 def convert(self, data):
     """Wrap raw (words, pos, heads, head_tags) samples in a DataSet.

     Each sequence is padded with BOS/EOS sentinels (heads get 0 at both
     ends). `heads` is stored twice: once as a plain feature (gold_heads)
     and once as the training target (head_indices).
     """
     ds = DataSet()
     for sample in data:
         words = [BOS] + sample[0] + [EOS]
         tags = [BOS] + sample[1] + [EOS]
         arcs = [0] + [int(h) for h in sample[2]] + [0]
         rels = [BOS] + sample[3] + [EOS]
         instance = Instance(
             word_seq=TextField(words, is_target=False),
             pos_seq=TextField(tags, is_target=False),
             gold_heads=SeqLabelField(arcs, is_target=False),
             head_indices=SeqLabelField(arcs, is_target=True),
             head_labels=TextField(rels, is_target=True))
         ds.append(instance)
     return ds
예제 #4
0
파일: dataset.py 프로젝트: ssttv/fastNLP
 def convert_with_vocabs(self, data, vocabs):
     """Append (word_seq, label_seq) pairs to this DataSet and index them.

     :param data: iterable of 2-item examples: [token list, label list].
     :param vocabs: dict with "word_vocab" and "label_vocab" mappings.
     """
     for example in data:
         tokens, tags = example[0], example[1]
         # build all three fields in one Instance call
         instance = Instance(
             word_seq=TextField(tokens, is_target=False),
             truth=TextField(tags, is_target=False),
             word_seq_origin_len=LabelField(len(tokens), is_target=False))
         self.append(instance)
     self.index_field("word_seq", vocabs["word_vocab"])
     self.index_field("truth", vocabs["label_vocab"])
예제 #5
0
파일: test_batch.py 프로젝트: yhcc/fastNLP
    def test(self):
        """Iterate a DataSet with Batch and check tensor shapes/types."""
        data = DataSet()
        for sentence, tag in zip(texts, labels):
            ins = Instance(text=TextField(sentence, is_target=False),
                           label=LabelField(tag, is_target=True))
            data.append(ins)

        # map tokens to integer indices before batching
        data.index_field("text", vocab)

        # minimal sampler: original order, no shuffling
        class SeqSampler:
            def __call__(self, dataset):
                return list(range(len(dataset)))

        # batch size 2; last batch may be smaller
        data_iterator = Batch(data, 2, SeqSampler(), False)
        seen = 0
        for batch_x, batch_y in data_iterator:
            seen += batch_x["text"].size(0)
            # every batch is full except possibly the last one
            self.assertTrue(batch_x["text"].size(0) == 2
                            or seen == len(raw_texts))
            self.assertTrue(isinstance(batch_x, dict))
            self.assertTrue(isinstance(batch_x["text"], torch.LongTensor))
            self.assertTrue(isinstance(batch_y, dict))
            self.assertTrue(isinstance(batch_y["label"], torch.LongTensor))
예제 #6
0
파일: preprocess.py 프로젝트: ssttv/fastNLP
    def convert_to_dataset(self, data, vocab, label_vocab):
        """Convert (words, label) examples into an indexed DataSet object.

        :param data: list of 2-item examples; example[0] is a token list,
                example[1] is either a list of labels or a single label string.
        :param vocab: a dict, mapping string (token) to index (int).
        :param label_vocab: a dict, mapping string (label) to index (int).
        :return data_set: a DataSet object
        """
        has_word_seq = False
        has_label_seq = False
        has_label_str = False

        # build one Instance per example, choosing fields by label type
        data_set = DataSet()
        for example in data:
            words, label = example[0], example[1]
            instance = Instance()

            # words must be a token list; anything else is unsupported
            if not isinstance(words, list):
                raise NotImplementedError("words is a {}".format(type(words)))
            instance.add_field("word_seq", TextField(words, is_target=False))
            has_word_seq = True

            if isinstance(label, list):
                # sequence labeling: one label per token
                instance.add_field("label_seq", TextField(label, is_target=True))
                has_label_seq = True
            elif isinstance(label, str):
                # classification: a single label per example
                instance.add_field("label", LabelField(label, is_target=True))
                has_label_str = True
            else:
                raise NotImplementedError("label is a {}".format(type(label)))
            data_set.append(instance)

        # convert strings to indices for whichever fields were used
        if has_word_seq:
            data_set.index_field("word_seq", vocab)
        if has_label_seq:
            data_set.index_field("label_seq", label_vocab)
        if has_label_str:
            data_set.index_field("label", label_vocab)

        return data_set
예제 #7
0
 def test(self):
     """create_dataset_from_lists should return the same container type
     as a manually assembled DataSet."""
     manual = DataSet()
     for sentence in texts:
         manual.append(Instance(text=TextField(sentence, is_target=False)))
     built = create_dataset_from_lists(texts, vocab, has_target=False)
     self.assertTrue(type(manual) == type(built))
예제 #8
0
파일: dataset.py 프로젝트: ssttv/fastNLP
 def convert_for_infer(self, data, vocabs):
     """Append unlabeled token sequences to this DataSet and index them.

     :param data: iterable of token lists (no labels — inference input).
     :param vocabs: dict with a "word_vocab" mapping.
     """
     for token_seq in data:
         self.append(Instance(word_seq=TextField(token_seq, is_target=False)))
     self.index_field("word_seq", vocabs["word_vocab"])
예제 #9
0
    def test_case_1(self):
        """Smoke-test SeqLabelTester on a tiny toy dataset.

        Builds a six-example sequence-labeling dataset, runs the tester once,
        then removes the pickle directory. Passing == not raising.
        """
        import shutil  # local import: only needed for the final cleanup

        model_args = {
            "vocab_size": 10,
            "word_emb_dim": 100,
            "rnn_hidden_units": 100,
            "num_classes": 5
        }
        valid_args = {"save_output": True, "validate_in_training": True, "save_dev_input": True,
                      "save_loss": True, "batch_size": 2, "pickle_path": "./save/",
                      "use_cuda": False, "print_every_step": 1}

        # Each example is [token_seq, label_seq]; every label_seq is identical.
        train_data = [
            [['a', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
            [['a', '@', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
            [['a', 'b', '#', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
            [['a', 'b', 'c', '?', 'e'], ['a', '@', 'c', 'd', 'e']],
            [['a', 'b', 'c', 'd', '$'], ['a', '@', 'c', 'd', 'e']],
            [['!', 'b', 'c', 'd', 'e'], ['a', '@', 'c', 'd', 'e']],
        ]
        vocab = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, '!': 5, '@': 6, '#': 7, '$': 8, '?': 9}
        label_vocab = {'a': 0, '@': 1, 'c': 2, 'd': 3, 'e': 4}

        data_set = DataSet()
        for example in train_data:
            text, label = example[0], example[1]
            # keyword-explicit is_target (was a bare positional `False`)
            x = TextField(text, is_target=False)
            y = TextField(label, is_target=True)
            ins = Instance(word_seq=x, label_seq=y)
            data_set.append(ins)

        data_set.index_field("word_seq", vocab)
        data_set.index_field("label_seq", label_vocab)

        model = SeqLabeling(model_args)

        tester = SeqLabelTester(**valid_args)
        tester.test(network=model, dev_data=data_set)
        # If this can run, everything is OK.

        # Portable cleanup: replaces `os.system("rm -rf save")`, which shells
        # out (injection-prone pattern) and silently does nothing on Windows.
        shutil.rmtree("./save/", ignore_errors=True)
        print("pickle path deleted")
예제 #10
0
파일: dataset.py 프로젝트: ssttv/fastNLP
 def convert_with_vocabs(self, data, vocabs):
     """Append (word_seq, label) classification examples and index them.

     :param data: iterable of 2-item examples: [token list, label string].
     :param vocabs: dict with "word_vocab" and "label_vocab" mappings.
     """
     for example in data:
         tokens, tag = example[0], example[1]
         instance = Instance(word_seq=TextField(tokens, is_target=False),
                             label=LabelField(tag, is_target=True))
         self.append(instance)
     self.index_field("word_seq", vocabs["word_vocab"])
     self.index_field("label", vocabs["label_vocab"])
예제 #11
0
파일: dataset.py 프로젝트: ssttv/fastNLP
    def convert(self, data):
        """Convert lists of strings into Instances with Fields.

        Also grows self.word_vocab / self.label_vocab from the data, then
        indexes the new fields with them.

        :param data: 3-level lists. Entries are strings.
        """
        bar = ProgressBar(total=len(data))
        for example in data:
            tokens, tags = example[0], example[1]
            # grow vocabularies before indexing
            self.word_vocab.update(tokens)
            self.label_vocab.update(tags)
            instance = Instance(
                word_seq=TextField(tokens, is_target=False),
                truth=TextField(tags, is_target=False),
                word_seq_origin_len=LabelField(len(tokens), is_target=False))
            self.append(instance)
            bar.move()
        self.index_field("word_seq", self.word_vocab)
        self.index_field("truth", self.label_vocab)
예제 #12
0
파일: dataset.py 프로젝트: ssttv/fastNLP
 def convert(self, data):
     """Append (word_seq, label) classification examples, growing the
     vocabularies from the data, then index both fields."""
     for example in data:
         tokens, tag = example[0], example[1]
         # grow vocabularies before indexing
         self.word_vocab.update(tokens)
         self.label_vocab.update(tag)
         instance = Instance(word_seq=TextField(tokens, is_target=False),
                             label=LabelField(tag, is_target=True))
         self.append(instance)
     self.index_field("word_seq", self.word_vocab)
     self.index_field("label", self.label_vocab)
예제 #13
0
파일: dataset.py 프로젝트: 2017alan/fastNLP
def create_labeled_dataset_from_lists(str_lists, word_vocab, label_vocab):
    """Create an DataSet instance that contains labels.

    :param str_lists: list of list of strings, [num_examples, 2, *].
            ::
            [
                [[word_11, word_12, ...], [label_11, label_12, ...]],
                ...
            ]

    :param word_vocab: dict of (str: int), which means (word: index).
    :param label_vocab: dict of (str: int), which means (word: index).
    :return data_set: a DataSet instance.

    """
    data_set = DataSet()
    for example in str_lists:
        tokens, tags = example[0], example[1]
        instance = Instance(word_seq=TextField(tokens, is_target=False),
                            label_seq=TextField(tags, is_target=True))
        data_set.append(instance)
    data_set.index_field("word_seq", word_vocab)
    data_set.index_field("label_seq", label_vocab)
    return data_set
예제 #14
0
파일: dataset.py 프로젝트: 2017alan/fastNLP
def create_unlabeled_dataset_from_lists(str_lists, word_vocab):
    """Create an DataSet instance that contains no labels.

    :param str_lists: list of list of strings, [num_examples, *].
            ::
            [
                [word_11, word_12, ...],
                ...
            ]

    :param word_vocab: dict of (str: int), which means (word: index).
    :return data_set: a DataSet instance.

    """
    data_set = DataSet()
    for tokens in str_lists:
        data_set.append(Instance(word_seq=TextField(tokens, is_target=False)))
    data_set.index_field("word_seq", word_vocab)
    return data_set
예제 #15
0
    """
    # Tiny in-memory corpus: three sentences with binary labels.
    texts = ["i am a cat", "this is a test of new batch", "haha"]
    labels = [0, 1, 0]

    # prepare vocabulary
    # each unseen token gets the next free integer index
    vocab = {}
    for text in texts:
        for tokens in text.split():
            if tokens not in vocab:
                vocab[tokens] = len(vocab)
    print("vocabulary: ", vocab)

    # prepare input dataset
    # one Instance per sentence: tokenized text + integer label target
    data = DataSet()
    for text, label in zip(texts, labels):
        x = TextField(text.split(), False)
        y = LabelField(label, is_target=True)
        ins = Instance(text=x, label=y)
        data.append(ins)

    # use vocabulary to index data
    data.index_field("text", vocab)

    # define naive sampler for batch class
    # yields indices in original order (no shuffling)
    class SeqSampler:
        def __call__(self, dataset):
            return list(range(len(dataset)))

    # use batch to iterate dataset
    # batch size 2; loop body continues beyond this excerpt
    data_iterator = Batch(data, 2, SeqSampler(), False)
    for epoch in range(1):