Example #1
    def test_init_with_nested_field_as_nesting_field(self):
        nesting_field = data.NestedField(data.Field())

        with pytest.raises(ValueError) as excinfo:
            data.NestedField(nesting_field)
        assert "nesting field must not be another NestedField" in str(
            excinfo.value)
Example #2
    def test_pad_when_fix_length_is_not_none(self):
        nesting_field = data.Field(tokenize=list, unk_token="<cunk>", pad_token="<cpad>",
                                   init_token="<w>", eos_token="</w>")
        CHARS = data.NestedField(
            nesting_field, init_token="<s>", eos_token="</s>", fix_length=3)
        minibatch = [
            ["john", "loves", "mary"],
            ["mary", "cries"]
        ]
        expected = [
            [
                ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                ["<w>"] + list("john") + ["</w>", "<cpad>"],
                ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
            ],
            [
                ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
            ]
        ]

        assert CHARS.pad(minibatch) == expected

        # test include length
        nesting_field = data.Field(tokenize=list, unk_token="<cunk>", pad_token="<cpad>",
                                   init_token="<w>", eos_token="</w>")
        CHARS = data.NestedField(nesting_field, init_token="<s>",
                                 eos_token="</s>", include_lengths=True, fix_length=3)
        arr, seq_len, words_len = CHARS.pad(minibatch)
        assert arr == expected
        assert seq_len == [3, 3]
        assert words_len == [[3, 6, 3], [3, 6, 3]]
Example #3
    def __init__(self):
        super(DMNIterator, self).__init__()
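        # Note: InfoField and NestedInfoField are assumed to be custom Field / NestedField subclasses defined elsewhere on this class.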
        # Define text nested field
        self.text_sent = data.Field(sequential=True,
                                    lower=True,
                                    tokenize=lambda x: x.split(" "))
        self.text_doc = data.NestedField(self.text_sent,
                                         tokenize=lambda x: x.split("<EOS>"),
                                         include_lengths=True)

        # Define entity nested field
        self.entity_sent = data.Field(sequential=True,
                                      tokenize=lambda x: x.split(" "),
                                      unk_token=None)
        self.entity_doc = data.NestedField(self.entity_sent,
                                           tokenize=lambda x: x.split("<EOS>"))

        # Define label nested field
        self.label_sent = data.Field(sequential=True,
                                     tokenize=lambda x: x.split(" "),
                                     unk_token=None)
        self.label_doc = data.NestedField(self.label_sent,
                                          tokenize=lambda x: x.split("<EOS>"))

        # Define offset nested field
        self.offset_sent = self.InfoField(sequential=True,
                                          tokenize=lambda x: x.split(" "),
                                          use_vocab=False)
        self.offset_doc = self.NestedInfoField(self.offset_sent,
                                               tokenize=lambda x: x.split("<EOS>"),
                                               use_vocab=False)

        # Define length nested field
        self.length_sent = self.InfoField(sequential=True,
                                          tokenize=lambda x: x.split(" "),
                                          use_vocab=False,
                                          pad_token=None)
        self.length_doc = self.NestedInfoField(self.length_sent,
                                               tokenize=lambda x: x.split("<EOS>"),
                                               use_vocab=False)

        # Define word attention field
        self.word_attn_sent = self.InfoField(sequential=True,
                                             tokenize=lambda x: x.split(" "),
                                             use_vocab=False)
        self.word_attn_doc = self.NestedInfoField(self.word_attn_sent,
                                                  tokenize=lambda x: x.split("<EOS>"),
                                                  use_vocab=False)
        # Define sentence attention field
        self.sent_attn_doc = self.InfoField(sequential=True,
                                            tokenize=lambda x: x.split("<EOS>"),
                                            use_vocab=False)

        # Define doc id field
        self.doc_id = self.InfoField(sequential=False, use_vocab=False)

        self.vectors = None
Example #4
File: train.py  Project: kahne/examples
def get_data_iter():
    WORD = data.Field(init_token='<bos>', eos_token='<eos>', include_lengths=True)
    UD_TAG = data.Field(init_token='<bos>', eos_token='<eos>')
    PTB_TAG = data.Field(init_token='<bos>', eos_token='<eos>')
    CHAR_NESTING = data.Field(tokenize=list, init_token='<bos>', eos_token='<eos>')
    CHAR = data.NestedField(CHAR_NESTING, init_token='<bos>', eos_token='<eos>', include_lengths=True)

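    # The ('word', 'char') name tuple maps the same raw token column to both WORD and CHAR,
    # so every token gets a word-level index and, through the NestedField, a character-level index sequence.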
    train, val, test = datasets.UDPOS.splits(
        fields=((('word', 'char'), (WORD, CHAR)), ('tag', UD_TAG), ('ptbtag', PTB_TAG)),
        root='.data',
        train='en-ud-tag.v2.train.txt',
        validation='en-ud-tag.v2.dev.txt',
        test='en-ud-tag.v2.test.txt'
    )

    WORD.build_vocab(train, min_freq=args.word_min_freq)
    UD_TAG.build_vocab(train)
    PTB_TAG.build_vocab(train)
    CHAR.build_vocab(train)

    args.word2idx = WORD.vocab.stoi
    args.tag2idx = PTB_TAG.vocab.stoi
    args.char2idx = CHAR.vocab.stoi
    args.tag_bos = PTB_TAG.init_token
    args.tag_eos = PTB_TAG.eos_token
    args.tag_pad = PTB_TAG.pad_token

    train_iter, val_iter, test_iter = data.BucketIterator.splits((train, val, test),
                                                                 batch_sizes=(args.train_batch_size, args.val_batch_size, args.val_batch_size),
                                                                 device=args.device, repeat=False)

    return train_iter, val_iter, test_iter
Example #5
    def __init__(self):
        self.CHAR_NESTING = data.Field(batch_first=True, tokenize=list)
        self.char_field = data.NestedField(self.CHAR_NESTING,
                                           tokenize=lambda x: x.split(),
                                           fix_length=60)
        # Build the Field objects
        self.TEXT = data.Field(batch_first=True,
                               lower=True,
                               tokenize=lambda x: x.split(),
                               fix_length=60)
        self.bigram = data.Field(batch_first=True,
                                 lower=True,
                                 tokenize=lambda x: n_gram_tokenizer(x, 2),
                                 fix_length=60)
        self.trigram = data.Field(batch_first=True,
                                  lower=True,
                                  tokenize=lambda x: n_gram_tokenizer(x, 3),
                                  fix_length=60)
        # The label field really must be a LabelField!!!! (learned this the hard way)
        self.LABEL = data.LabelField(use_vocab=True, dtype=torch.long)
        self.WORD_FIELD = [("sentence_word", self.TEXT), ("label", self.LABEL)]
        self.CHAR_FIELD = [("sentence_char", self.char_field),
                           ("sentence_word", self.TEXT), ("label", self.LABEL)]
        self.BIGRAM_FIELD = [("sentence_word", self.TEXT),
                             ("sentence_bigram", self.bigram),
                             ("label", self.LABEL)]
Example #6
def get_dataset(base_path,
                batch_size,
                pretrained_embedding=None,
                is_inference=False):
    sentence = data.Field(lower=False, include_lengths=True, batch_first=True)
    char_nesting = data.Field(lower=False, tokenize=list)
    char_sentence = data.NestedField(char_nesting, include_lengths=True)
    tags = data.Field(batch_first=True)

    train, val, test = SequenceTaggingDataset.splits(
        path=base_path,
        train="train.txt",
        validation="dev.txt",
        test="test.txt",
        fields=[(("sentence", "char_sentence"), (sentence, char_sentence)),
                ("tags", tags)])
    tags.build_vocab(train.tags)
    if not pretrained_embedding:
        sentence.build_vocab(train.sentence, min_freq=5)
    else:
        sentence.build_vocab(train.sentence, vectors=pretrained_embedding)
    char_sentence.build_vocab(train.char_sentence)

    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train, val, test), [batch_size] * 3,
        repeat=False,
        shuffle=True,
        sort_key=lambda x: len(x.sentence),
        sort_within_batch=True)

    return sentence, char_sentence, tags, val_iter, train_iter, test_iter
Example #7
    def test_serialization(self):
        nesting_field = data.Field(batch_first=True)
        field = data.NestedField(nesting_field)
        ex1 = data.Example.fromlist(["john loves mary"], [("words", field)])
        ex2 = data.Example.fromlist(["mary cries"], [("words", field)])
        dataset = data.Dataset([ex1, ex2], [("words", field)])
        field.build_vocab(dataset)
        examples_data = [[
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
            ["<w>"] + list("john") + ["</w>", "<cpad>"],
            ["<w>"] + list("loves") + ["</w>"],
            ["<w>"] + list("mary") + ["</w>", "<cpad>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
        ],
                         [
                             ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                             ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                             ["<w>"] + list("cries") + ["</w>"],
                             ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
                             ["<cpad>"] * 7,
                         ]]

        field_pickle_filename = "char_field.pl"
        field_pickle_path = os.path.join(self.test_dir, field_pickle_filename)
        torch.save(field, field_pickle_path)

        loaded_field = torch.load(field_pickle_path)
        assert loaded_field == field

        original_numericalization = field.numericalize(examples_data)
        pickled_numericalization = loaded_field.numericalize(examples_data)

        assert torch.all(
            torch.eq(original_numericalization, pickled_numericalization))
Example #8
def load_data(data_path):
    input_word = data.Field(init_token="<bos>",
                            eos_token="<eos>",
                            batch_first=True,
                            lower=True,
                            include_lengths=True)

    input_char_nesting = data.Field(tokenize=list,
                                    init_token="<bos>",
                                    eos_token="<eos>",
                                    batch_first=True)

    input_char = data.NestedField(input_char_nesting,
                                  init_token="<bos>",
                                  eos_token="<eos>")

    label = data.Field(init_token="<bos>", eos_token="<eos>", batch_first=True)

    fields = [(('input_word', 'input_char'), (input_word, input_char)),
              ('label', label)]

    dataset = read_data(data_path, fields)
    for item in dataset:
        print(item.input_word)
        print(item.input_char)
        print(item.label)
        break
Example #9
    def __init__(self, args_dict):
        """
        Params:
            arg_dict: ...
        The dataset json is read and splitted into three jsons: "train.json", "val.json", "test.json".
        
        """

        self.args_dict = args_dict

        # Create data field
        self.ID = data.Field()
        self.LABEL = data.LabelField()

        if self.args_dict['net_type'] == 'han':
            max_sent_len = self.args_dict['max_sent_len'] if self.args_dict[
                'max_sent_len'] != 0 else None
            max_doc_len = self.args_dict[
                'max_doc_len'] if self.args_dict['max_doc_len'] != 0 else None
            # nested sentence tokens
            nest_field = data.Field(
                pad_token='<pad>', fix_length=max_sent_len
            )  # fix num of words in each sent (fix max_sent_len)
            self.TEXT = data.NestedField(
                nest_field,
                fix_length=max_doc_len)  # fix num of sents (fix max_doc_len)
        else:
            self.TEXT = data.Field()  # word tokens

        # Modify rob name
        self.rob_item = self.args_dict['rob_item']

        self.under_sample_ratio = self.args_dict['under_sample_ratio']
Example #10
    def test_preprocess(self):
        nesting_field = data.Field(
            tokenize=list, preprocessing=lambda xs: [x.upper() for x in xs])
        field = data.NestedField(nesting_field, preprocessing=lambda xs: reversed(xs))
        preprocessed = field.preprocess("john loves mary")

        assert preprocessed == [list("MARY"), list("LOVES"), list("JOHN")]
Example #11
    def test_init_when_nesting_field_has_include_lengths_equal_true(self):
        nesting_field = data.Field(include_lengths=True)

        with pytest.raises(ValueError) as excinfo:
            data.NestedField(nesting_field)
        assert "nesting field cannot have include_lengths=True" in str(
            excinfo.value)
Example #12
    def test_numericalize(self):
        nesting_field = data.Field(batch_first=True)
        field = data.NestedField(nesting_field)
        ex1 = data.Example.fromlist(["john loves mary"], [("words", field)])
        ex2 = data.Example.fromlist(["mary cries"], [("words", field)])
        dataset = data.Dataset([ex1, ex2], [("words", field)])
        field.build_vocab(dataset)
        examples_data = [[
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
            ["<w>"] + list("john") + ["</w>", "<cpad>"],
            ["<w>"] + list("loves") + ["</w>"],
            ["<w>"] + list("mary") + ["</w>", "<cpad>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
        ],
                         [
                             ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                             ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                             ["<w>"] + list("cries") + ["</w>"],
                             ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
                             ["<cpad>"] * 7,
                         ]]
        numericalized = field.numericalize(examples_data, device=-1)

        assert numericalized.dim() == 3
        assert numericalized.size(0) == len(examples_data)
        for example, numericalized_example in zip(examples_data,
                                                  numericalized):
            verify_numericalized_example(field,
                                         example,
                                         numericalized_example,
                                         batch_first=True)
Example #13
    def test_pad_when_pad_first_is_true(self):
        nesting_field = data.Field(tokenize=list,
                                   unk_token="<cunk>",
                                   pad_token="<cpad>",
                                   init_token="<w>",
                                   eos_token="</w>")
        CHARS = data.NestedField(nesting_field,
                                 init_token="<s>",
                                 eos_token="</s>",
                                 pad_first=True)
        minibatch = [
            [list("john"), list("loves"),
             list("mary")],
            [list("mary"), list("cries")],
        ]
        expected = [[
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
            ["<w>"] + list("john") + ["</w>", "<cpad>"],
            ["<w>"] + list("loves") + ["</w>"],
            ["<w>"] + list("mary") + ["</w>", "<cpad>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
        ],
                    [
                        ["<cpad>"] * 7,
                        ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                        ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                        ["<w>"] + list("cries") + ["</w>"],
                        ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
                    ]]

        assert CHARS.pad(minibatch) == expected
Example #14
    def test_pad_when_nesting_field_has_fix_length(self):
        nesting_field = data.Field(tokenize=list,
                                   unk_token="<cunk>",
                                   pad_token="<cpad>",
                                   init_token="<w>",
                                   eos_token="</w>",
                                   fix_length=5)
        CHARS = data.NestedField(nesting_field,
                                 init_token="<s>",
                                 eos_token="</s>")
        minibatch = [["john", "loves", "mary"], ["mary", "cries"]]
        expected = [[
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 2,
            ["<w>"] + list("joh") + ["</w>"],
            ["<w>"] + list("lov") + ["</w>"],
            ["<w>"] + list("mar") + ["</w>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 2,
        ],
                    [
                        ["<w>", "<s>", "</w>"] + ["<cpad>"] * 2,
                        ["<w>"] + list("mar") + ["</w>"],
                        ["<w>"] + list("cri") + ["</w>"],
                        ["<w>", "</s>", "</w>"] + ["<cpad>"] * 2,
                        ["<cpad>"] * 5,
                    ]]

        assert CHARS.pad(minibatch) == expected
Example #15
    def test_pad_when_pad_first_is_true(self):
        nesting_field = data.Field(tokenize=list,
                                   unk_token="<cunk>",
                                   pad_token="<cpad>",
                                   init_token="<w>",
                                   eos_token="</w>")
        CHARS = data.NestedField(nesting_field,
                                 init_token="<s>",
                                 eos_token="</s>",
                                 pad_first=True)
        minibatch = [
            [list("john"), list("loves"),
             list("mary")],
            [list("mary"), list("cries")],
        ]
        expected = [[
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
            ["<w>"] + list("john") + ["</w>", "<cpad>"],
            ["<w>"] + list("loves") + ["</w>"],
            ["<w>"] + list("mary") + ["</w>", "<cpad>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
        ],
                    [
                        ["<cpad>"] * 7,
                        ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                        ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                        ["<w>"] + list("cries") + ["</w>"],
                        ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
                    ]]

        assert CHARS.pad(minibatch) == expected

        # test include_length
        nesting_field = data.Field(tokenize=list,
                                   unk_token="<cunk>",
                                   pad_token="<cpad>",
                                   init_token="<w>",
                                   eos_token="</w>")
        CHARS = data.NestedField(nesting_field,
                                 init_token="<s>",
                                 eos_token="</s>",
                                 include_lengths=True,
                                 pad_first=True)
        arr, seq_len, words_len = CHARS.pad(minibatch)
        assert arr == expected
        assert seq_len == [5, 4]
        assert words_len == [[3, 6, 7, 6, 3], [0, 3, 6, 7, 3]]
Example #16
def load_data_word_lstm_char(path_file_data,
                             name_file_train,
                             name_file_test=None,
                             min_freq_word=1,
                             min_freq_char=1,
                             batch_size=2):

    inputs_word = data.Field(init_token="<bos>", eos_token="<eos>", batch_first=True)

    inputs_char_nesting = data.Field(tokenize=list, init_token="<bos>", eos_token="<eos>", batch_first=True)

    inputs_char = data.NestedField(inputs_char_nesting,
                                   init_token="<bos>", eos_token="<eos>")

    labels = data.LabelField(sequential=False)

    fields = ([(('inputs_word', 'inputs_char'), (inputs_word, inputs_char)), ('labels', labels)])

    if name_file_test is not None:
        train, test = data.TabularDataset.splits(path=path_file_data,
                                                 train=name_file_train,
                                                 test=name_file_test,
                                                 fields=tuple(fields),
                                                 format='csv',
                                                 skip_header=True,
                                                 csv_reader_params={'delimiter': '|'})

        inputs_word.build_vocab(train.inputs_word, test.inputs_word, min_freq=min_freq_word)
        inputs_char.build_vocab(train.inputs_char, test.inputs_char, min_freq=min_freq_char)
        labels.build_vocab(train.labels)

        train_iter, test_iter = data.BucketIterator.splits(datasets=(train, test),
                                                           batch_size=batch_size,
                                                           sort_key=lambda x: len(x.inputs_word),
                                                           device=torch.device("cuda:0"
                                                                               if torch.cuda.is_available() else "cpu"))
        dict_return = {'iters': (train_iter, test_iter),
                       'vocabs': (inputs_word.vocab, inputs_char.vocab, labels.vocab)}
    else:
        path_file_data_train = path_file_data + name_file_train
        train = data.TabularDataset(path_file_data_train,
                                    fields=tuple(fields),
                                    format='csv',
                                    skip_header=True,
                                    csv_reader_params={'delimiter': '|'})

        inputs_word.build_vocab(train.inputs_word, min_freq=min_freq_word)
        inputs_char.build_vocab(train.inputs_char, min_freq=min_freq_char)
        labels.build_vocab(train.labels)
        train_iter = data.BucketIterator(train,
                                         batch_size=batch_size,
                                         sort_key=lambda x: len(x.inputs_word),
                                         device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"))

        dict_return = {'iters': (train_iter),
                       'vocabs': (inputs_word.vocab, inputs_char.vocab, labels.vocab)}

    return dict_return
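A short usage sketch (hedged: the directory and file names below are illustrative; the CSVs are expected to be '|'-delimited with a header row, per the reader params above):

# Hypothetical call
bundle = load_data_word_lstm_char('data/', 'train.csv', name_file_test='test.csv', batch_size=32)
train_iter, test_iter = bundle['iters']
word_vocab, char_vocab, label_vocab = bundle['vocabs']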
Example #17
    def __init__(self, args):
        path = '.data/squad'
        dataset_path = path + '/torchtext/'
        train_examples_path = dataset_path + 'train_examples.pt'
        dev_examples_path = dataset_path + 'dev_examples.pt'

        self.RAW = data.RawField()
        self.CHAR_NESTING = data.Field(batch_first=True,
                                       tokenize=list,
                                       lower=True)
        self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize=word_tokenize)
        self.WORD = data.Field(batch_first=True,
                               tokenize=word_tokenize,
                               lower=True,
                               include_lengths=True)
        self.LABEL = data.Field(sequential=False,
                                unk_token=None,
                                use_vocab=False)

        dict_fields = {
            'id': ('id', self.RAW),
            's_idx': ('s_idx', self.LABEL),
            'e_idx': ('e_idx', self.LABEL),
            'context': [('c_word', self.WORD), ('c_char', self.CHAR)],
            'question': [('q_word', self.WORD), ('q_char', self.CHAR)]
        }

        list_fields = [('id', self.RAW), ('s_idx', self.LABEL),
                       ('e_idx', self.LABEL), ('c_word', self.WORD),
                       ('c_char', self.CHAR), ('q_word', self.WORD),
                       ('q_char', self.CHAR)]

        if os.path.exists(dataset_path):
            print("loading splits...")
            dev_examples = torch.load(dev_examples_path)

            self.dev = data.Dataset(examples=dev_examples, fields=list_fields)
        else:
            print("building splits...")
            self.dev = data.TabularDataset(path=path + f'/dev-v1.1.jsonl',
                                           format='json',
                                           fields=dict_fields)

            os.makedirs(dataset_path)
            torch.save(self.dev.examples, dev_examples_path)

        print("building vocab...")
        self.CHAR.build_vocab(self.dev, min_freq=10000)
        self.WORD.build_vocab(self.dev,
                              vectors=GloVe(name='6B', dim=args.word_dim),
                              max_size=80000)
        device = torch.device(
            f"cuda:0" if torch.cuda.is_available() else "cpu")
        self.dev_iter = \
            data.BucketIterator(self.dev,
                                batch_size=60,
                                device=device,
                                sort=True,
                                sort_key=lambda x: len(x.c_word))
Example #18
def load_data():
    input_word = data.Field(init_token="<bos>",
                            eos_token="<eos>",
                            batch_first=True,
                            lower=True,
                            include_lengths=True)

    input_char_nesting = data.Field(tokenize=list,
                                    init_token="<bos>",
                                    eos_token="<eos>",
                                    batch_first=True)

    input_char = data.NestedField(input_char_nesting,
                                  init_token="<bos>",
                                  eos_token="<eos>")

    label = data.Field(init_token="<bos>", eos_token="<eos>", batch_first=True)

    fields = [(('input_word', 'input_char'), (input_word, input_char)),
              (None, None), ('label', label)]

    train, valid, test = datasets.CoNLL2000Chunking.splits(fields)

    for item in train:
        print(item.__dict__.keys())
        print(item.input_word)
        print(item.input_char)
        print(item.label)
        break

    input_word.build_vocab(train.input_word,
                           test.input_word,
                           valid.input_word,
                           vectors=GloVe(name='6B', dim=300))
    input_char.build_vocab(train.input_char, test.input_char, valid.input_char)
    label.build_vocab(train.label)

    vocab_word = input_word.vocab
    print(vocab_word.vectors)
    # word_embeddings = vocab_word.vectors
    # vocab_word_size = len(vocab_word)
    #
    # vocab_char = input_char.vocab
    # vocab_char_size = len(vocab_char)

    train_iter, test_iter, valid_iter = data.BucketIterator.splits(
        (train, test, valid),
        batch_size=32,
        sort_key=lambda x: len(x.input_word),
        repeat=False,
        sort_within_batch=True,
        shuffle=True)

    return {
        'iter': (train_iter, valid_iter, test_iter),
        'vocabs': (input_word.vocab, input_char.vocab, label.vocab)
    }
Example #19
File: data.py  Project: jangdn/nlpserver
	def __init__(self,args):
		path = './data/squad'
		dataset_path = path +'/torchtext/'
		train_examples_path = dataset_path + 'train_examples.pt'
		dev_examples_path = dataset_path +'dev_examples.pt'

		print ("[+] Preprocessing data files..")

		if not os.path.exists(f'{path}/{args.train_file}l'):  # the trailing 'l' refers to the '.jsonl' file that preprocess_file writes
			self.preprocess_file(f'{path}/{args.train_file}')
		if not os.path.exists(f'{path}/{args.dev_file}l'):
			self.preprocess_file(f'{path}/{args.dev_file}')

		self.RAW = data.RawField()
		self.CHAR_NESTING = data.Field(batch_first=True, tokenize=list, lower=True)  # tokenize=list turns each word into a character list: 'char' -> ['c', 'h', 'a', 'r']
		self.CHAR = data.NestedField(self.CHAR_NESTING, tokenize=word_tokenize)
		# NestedField wraps another field: the outer tokenizer (word_tokenize) splits the text into words,
		# and the nesting field (self.CHAR_NESTING) then tokenizes each of those words into characters.
		self.WORD = data.Field(batch_first=True, tokenize=word_tokenize, lower=True, include_lengths=True)
		self.LABEL = data.Field(sequential=False,unk_token=None,use_vocab=False)

		dict_field = { 'id' : ('id',self.RAW) ,
					's_idx':('s_idx',self.LABEL),
					'e_idx':('e_idx',self.LABEL),
					'context': [('c_word',self.WORD),('c_char',self.CHAR)],
					'questions':[('q_word',self.WORD),('q_char',self.CHAR)]}
		list_field = [ ('id',self.RAW) ,('s_idx',self.LABEL),('e_idx',self.LABEL),
					('c_word',self.WORD),('c_char',self.CHAR),('q_word',self.WORD),('q_char',self.CHAR)]

		if os.path.exists(dataset_path):
			print ("[+] Loading splits....")
			train_examples = torch.load(train_examples_path)
			dev_examples = torch.load(dev_examples_path)

			self.train = data.Dataset(examples=train_examples,fields=list_field)
			self.dev = data.Dataset(examples=dev_examples,fields=list_field)

		else:
			print ('[+] building splits...')
			self.train,self.dev = data.TabularDataset.splits(
								path=path,train=f'{args.train_file}l',
								validation=f'{args.dev_file}l',
								format='json',
								fields=dict_field)
			os.makedirs(dataset_path)
			torch.save(self.train.examples,train_examples_path)
			torch.save(self.dev.examples,dev_examples_path)
		
		# cut too long context in the training set for efficiency 

		if args.context_threshold > 0: 
			self.train.examples = [e for e in self.train.examples if len(e.c_word) <= args.context_threshold]

		print ("building iterators...")
		self.train_iter, self.dev_iter = \
			data.BucketIterator.splits((self.train, self.dev),
								batch_sizes=(args.train_batch_size, args.dev_batch_size),
								device=args.gpu_num, sort_key=lambda x: len(x.c_word))
Example #20
    def test_pad_when_nesting_field_has_fix_length(self):
        nesting_field = data.Field(tokenize=list,
                                   unk_token="<cunk>",
                                   pad_token="<cpad>",
                                   init_token="<w>",
                                   eos_token="</w>",
                                   fix_length=5)
        CHARS = data.NestedField(nesting_field,
                                 init_token="<s>",
                                 eos_token="</s>")
        minibatch = [["john", "loves", "mary"], ["mary", "cries"]]
        expected = [[
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 2,
            ["<w>"] + list("joh") + ["</w>"],
            ["<w>"] + list("lov") + ["</w>"],
            ["<w>"] + list("mar") + ["</w>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 2,
        ],
                    [
                        ["<w>", "<s>", "</w>"] + ["<cpad>"] * 2,
                        ["<w>"] + list("mar") + ["</w>"],
                        ["<w>"] + list("cri") + ["</w>"],
                        ["<w>", "</s>", "</w>"] + ["<cpad>"] * 2,
                        ["<cpad>"] * 5,
                    ]]

        assert CHARS.pad(minibatch) == expected

        # test include length
        nesting_field = data.Field(tokenize=list,
                                   unk_token="<cunk>",
                                   pad_token="<cpad>",
                                   init_token="<w>",
                                   eos_token="</w>",
                                   fix_length=5)
        CHARS = data.NestedField(nesting_field,
                                 init_token="<s>",
                                 eos_token="</s>",
                                 include_lengths=True)
        arr, seq_len, words_len = CHARS.pad(minibatch)
        assert arr == expected
        assert seq_len == [5, 4]
        assert words_len == [[3, 5, 5, 5, 3], [3, 5, 5, 3, 0]]
Example #21
def get_input_processor_words(vocab_word,
                              vocab_char=None,
                              convert_digits=True):
    """
    Returns a function that converts text into a processed batch. Required during
    inference.
    Parameters:
        vocab_word: Instance of torchtext.Vocab for input word vocabulary
        vocab_char[optional]: Instance of torchtext.Vocab for input per-word 
                              character vocabulary
        convert_digits: If True will convert numbers to single 0's
    """
    inputs_word = data.Field(
        init_token="<bos>",
        eos_token="<eos>",
        batch_first=True,
        lower=True,
        preprocessing=data.Pipeline(lambda w: '0'
                                    if convert_digits and w.isdigit() else w))
    # Set the vocab object manually without building from training dataset
    inputs_word.vocab = vocab_word

    if vocab_char is not None:
        inputs_char_nesting = data.Field(tokenize=list,
                                         init_token="<bos>",
                                         eos_token="<eos>",
                                         batch_first=True)

        inputs_char = data.NestedField(inputs_char_nesting,
                                       init_token="<bos>",
                                       eos_token="<eos>")
        # Set the vocab object manually without building from training dataset
        inputs_char.vocab = inputs_char_nesting.vocab = vocab_char

        fields = [(('inputs_word', 'inputs_char'), (inputs_word, inputs_char))]
    else:
        fields = [('inputs_word', inputs_word)]

    def input_processor_fn(inputs):
        if not isinstance(inputs, list):
            inputs = [inputs]

        examples = []
        for line in inputs:
            examples.append(data.Example.fromlist([line], fields))

        dataset = data.Dataset(examples, fields)
        # Entire input in one batch
        return data.Batch(
            data=dataset,
            dataset=dataset,
            device=torch.device(
                "cuda:0" if torch.cuda.is_available() else "cpu"))

    return input_processor_fn
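A minimal usage sketch for the processor returned above (hedged: vocab_word and vocab_char are assumed to be vocabularies loaded from a previously trained model, and the sentence is only illustrative):

# Hypothetical usage, not part of the original example
process_fn = get_input_processor_words(vocab_word, vocab_char)
batch = process_fn(["John lives in New York"])  # a single string or list of strings becomes one batch
word_ids = batch.inputs_word   # LongTensor of word indices, shape (1, seq_len)
char_ids = batch.inputs_char   # LongTensor of char indices, shape (1, seq_len, max_word_len)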
Example #22
    def __init__(self,
                 data_path,
                 glove_size,
                 batch_size,
                 train_file='train.csv',
                 dev_file='dev.csv'):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Defining the Fields
        self.RAW = data.RawField(is_target=False)
        self.WORDS = data.Field(batch_first=True,
                                tokenize=post_ptbtokenizer,
                                lower=True,
                                include_lengths=True)
        self.CHAR = data.NestedField(data.Field(batch_first=True,
                                                tokenize=list,
                                                lower=True),
                                     tokenize=post_ptbtokenizer)

        self.INDEX = data.Field(sequential=False,
                                unk_token=None,
                                use_vocab=False)

        fields = {
            'id': ('id', self.RAW),
            'context_ptb_tok': [('context_words', self.WORDS),
                                ('context_char', self.CHAR)],
            'question_ptb_tok': [('question_words', self.WORDS),
                                 ('question_char', self.CHAR)],
            'answer_ptb_tok': [('answer_words', self.WORDS),
                               ('answer_char', self.CHAR)],
            'start_idx': ('start_idx', self.INDEX),
            'end_idx': ('end_idx', self.INDEX)
        }

        print('Loading CSV Data Into Torch Tabular Dataset')
        self.train, self.dev = data.TabularDataset.splits(path=data_path,
                                                          train=train_file,
                                                          validation=dev_file,
                                                          format='csv',
                                                          fields=fields)

        print('Building Vocabulary')
        self.CHAR.build_vocab(self.train, self.dev)
        self.WORDS.build_vocab(self.train,
                               self.dev,
                               vectors=GloVe(name='6B', dim=glove_size))

        print('Creating Iterators')
        self.train_iter = PreprocessData.create_train_iterator(
            self.train, device, batch_size)
        self.dev_iter = PreprocessData.create_dev_iterator(
            self.dev, device, batch_size)
Example #23
def read_files(args):
    target_path = args.target_path
    if args.is_from_scratch:
        plausible_path = args.plausible_path
        implausible_path = args.implausible_path
        prepare_tsv(plausible_path,
                    implausible_path,
                    target_path,
                    option='combined')

    nesting_field = data.Field(batch_first=True,
                               tokenize=word_tokenizer,
                               unk_token='<unk>',
                               include_lengths=False,
                               sequential=True,
                               fix_length=args.word_max_len)
    text_field = data.NestedField(nesting_field,
                                  tokenize=sent_tokenize,
                                  fix_length=args.sent_max_len)
    label_field = data.Field(sequential=False,
                             use_vocab=False,
                             batch_first=True,
                             dtype=torch.float)
    fields = [('text', text_field), ('label', label_field)]

    train_path = os.path.join(target_path, 'train.tsv')
    logger.debug('Reading training samples from {}'.format(train_path))
    train = PlausibleDataset(path=train_path,
                             format='tsv',
                             skip_header=True,
                             fields=fields)

    dev_path = os.path.join(target_path, 'dev.tsv')
    logger.debug('Reading dev samples from {}'.format(dev_path))
    dev = PlausibleDataset(path=dev_path,
                           format='tsv',
                           skip_header=True,
                           fields=fields)

    test_path = os.path.join(target_path, 'test.tsv')
    logger.debug('Reading test samples from {}'.format(test_path))
    test = PlausibleDataset(path=test_path,
                            format='tsv',
                            skip_header=True,
                            fields=fields)

    logging.info('Initializing the vocabulary...')
    text_field.build_vocab(train,
                           max_size=args.max_vocab_size,
                           vectors=get_embeddings(args.embedding_name),
                           unk_init=torch.Tensor.normal_)

    return train, dev, test, text_field, label_field
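A minimal sketch of how read_files might be driven (the argument names are taken from the attribute accesses above; PlausibleDataset, word_tokenizer, sent_tokenize and get_embeddings are assumed to be defined in the same module, and all defaults are hypothetical):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--target_path', default='data/plausible')     # hypothetical default
parser.add_argument('--is_from_scratch', action='store_true')
parser.add_argument('--plausible_path', default=None)
parser.add_argument('--implausible_path', default=None)
parser.add_argument('--word_max_len', type=int, default=50)        # hypothetical default
parser.add_argument('--sent_max_len', type=int, default=30)        # hypothetical default
parser.add_argument('--max_vocab_size', type=int, default=25000)   # hypothetical default
parser.add_argument('--embedding_name', default='glove.6B.300d')   # hypothetical default
args = parser.parse_args()

train, dev, test, text_field, label_field = read_files(args)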
Example #24
File: test_field.py  Project: ppuliu/text
    def test_build_vocab_from_iterable(self):
        nesting_field = data.Field(unk_token="<cunk>", pad_token="<cpad>")
        CHARS = data.NestedField(nesting_field)
        CHARS.build_vocab(
            [[list("aaa"), list("bbb"), ["c"]], [list("bbb"), list("aaa")]],
            [[list("ccc"), list("bbb")], [list("bbb")]],
        )

        expected = "a b c <cunk> <cpad>".split()
        assert len(CHARS.vocab) == len(expected)
        for c in expected:
            assert c in CHARS.vocab.stoi
Example #25
    def test_pad_when_nesting_field_is_not_sequential(self):
        nesting_field = data.Field(sequential=False, unk_token="<cunk>",
                                   pad_token="<cpad>", init_token="<w>", eos_token="</w>")
        CHARS = data.NestedField(nesting_field, init_token="<s>", eos_token="</s>")
        minibatch = [
            ["john", "loves", "mary"],
            ["mary", "cries"]
        ]
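        # With a non-sequential nesting field the words are not split into characters,
        # so pad() falls back to flat word-level padding using the outer field's default pad_token ("<pad>").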
        expected = [
            ["<s>", "john", "loves", "mary", "</s>"],
            ["<s>", "mary", "cries", "</s>", "<pad>"],
        ]

        assert CHARS.pad(minibatch) == expected
Example #26
File: test_field.py  Project: ppuliu/text
    def test_build_vocab_from_dataset(self):
        nesting_field = data.Field(tokenize=list, unk_token="<cunk>", pad_token="<cpad>",
                                   init_token="<w>", eos_token="</w>")
        CHARS = data.NestedField(nesting_field, init_token="<s>", eos_token="</s>")
        ex1 = data.Example.fromlist(["aaa bbb c"], [("chars", CHARS)])
        ex2 = data.Example.fromlist(["bbb aaa"], [("chars", CHARS)])
        dataset = data.Dataset([ex1, ex2], [("chars", CHARS)])

        CHARS.build_vocab(dataset, min_freq=2)

        expected = "a b <w> </w> <s> </s> <cunk> <cpad>".split()
        assert len(CHARS.vocab) == len(expected)
        for c in expected:
            assert c in CHARS.vocab.stoi
Example #27
    def test_build_vocab(self):
        nesting_field = data.Field(tokenize=list, init_token="<w>", eos_token="</w>")

        field = data.NestedField(nesting_field, init_token='<s>', eos_token='</s>',
                                 include_lengths=True,
                                 pad_first=True)

        sources = [[['a'], ['s', 'e', 'n', 't', 'e', 'n', 'c', 'e'], ['o', 'f'],
                    ['d', 'a', 't', 'a'], ['.']],
                   [['y', 'e', 't'], ['a', 'n', 'o', 't', 'h', 'e', 'r']],
                   [['o', 'n', 'e'], ['l', 'a', 's', 't'], ['s', 'e', 'n', 't']]]

        field.build_vocab(sources, vectors='glove.6B.50d',
                          unk_init=init.normal_,
                          vectors_cache=".vector_cache")
Example #28
def load_iters(batch_size=32,
               device="cpu",
               data_path='data',
               vectors=None,
               word2lower=True):
    zero_char_in_word = lambda ex: [re.sub(r'\d', '0', w) for w in ex]
    zero_char = lambda w: [re.sub(r'\d', '0', c) for c in w]
    WORD_TEXT = data.Field(lower=word2lower,
                           batch_first=True,
                           include_lengths=True,
                           preprocessing=zero_char_in_word)
    CHAR_NESTING = data.Field(
        tokenize=list, preprocessing=zero_char)  # tokenize each word into a character list
    CHAR_TEXT = data.NestedField(CHAR_NESTING)  # nested field: characters of every word in a sentence
    LABEL = data.Field(unk_token=None, pad_token="O", batch_first=True)
    train_data = ConllDataset(WORD_TEXT, CHAR_TEXT, LABEL,
                              os.path.join(data_path, "train.txt"))
    dev_data = ConllDataset(WORD_TEXT, CHAR_TEXT, LABEL,
                            os.path.join(data_path, "dev.txt"))
    test_data = ConllDataset(WORD_TEXT, CHAR_TEXT, LABEL,
                             os.path.join(data_path, "test.txt"))

    if vectors is not None:
        WORD_TEXT.build_vocab(train_data.word,
                              vectors=vectors,
                              unk_init=unk_init)
    else:
        WORD_TEXT.build_vocab(train_data.word)
    CHAR_TEXT.build_vocab(train_data.char)
    LABEL.build_vocab(train_data.label)

    train_iter, dev_iter = BucketIterator.splits(
        (train_data, dev_data),
        batch_sizes=(batch_size, batch_size),
        device=device,
        sort_key=lambda x: len(x.word),
        sort_within_batch=True,
        repeat=False,
        shuffle=True)

    test_iter = Iterator(test_data,
                         batch_size=batch_size,
                         device=device,
                         sort=False,
                         sort_within_batch=False,
                         repeat=False,
                         shuffle=False)
    return train_iter, dev_iter, test_iter, WORD_TEXT, CHAR_TEXT, LABEL
Example #29
    def cadec(self, opt, tag_type='ner'):
        """
           cadec: CADEC (Parser only. You must place the files)
           Extract CADEC dataset using torchtext.
        """
        logger.info('---------- CADEC = %s ---------' % (tag_type))
        train_file = mapping_files[opt.lang]
        # Setup fields with batch dimension first
        inputs_word = data.Field(
            batch_first=True,
            fix_length=opt.maxlen,
            lower=opt.lower,
            preprocessing=data.Pipeline(
                lambda w: '0' if opt.convert_digits and w.isdigit() else w))

        inputs_char_nesting = data.Field(tokenize=list,
                                         batch_first=True,
                                         fix_length=opt.maxlen)
        inputs_char = data.NestedField(inputs_char_nesting)

        inputs_case = data.Field(
            batch_first=True,
            fix_length=opt.maxlen,
            preprocessing=data.Pipeline(lambda w: self.getCasing(w)))

        labels = data.Field(batch_first=True,
                            unk_token=None,
                            fix_length=opt.maxlen)  # pad_token=None,
        # preprocessing=data.Pipeline(lambda w: labels_map[w]))

        id = data.Field(batch_first=True, use_vocab=False)

        self.fields = ([(('inputs_word', 'inputs_char', 'inputs_case'),
                         (inputs_word, inputs_char, inputs_case))] +
                       [('labels', labels) if label == tag_type else
                        (None, None) for label in ['ner']] + [('id', id)])

        # Load the data
        datafile = NERDataset.splits(path='.',
                                     train=train_file,
                                     separator='\t',
                                     encoding='utf-8',
                                     fields=tuple(self.fields))[0]

        self.train, self.val, self.test = datafile.split(
            split_ratio=[5610, 1000, 1000])
        return inputs_word, inputs_char, inputs_case, labels
Example #30
    def __init__(self, args):
        print("args=",args)
        path = './data/squad'

        logging.info("Preprocessing Data - First Phase  :: Reading And Transforming")

        self.preprocess('{}/{}'.format(path, args.train_file),draft=args.draft)
        self.preprocess('{}/{}'.format(path, args.dev_file),draft=args.draft)

        self.RAW = data.RawField(); self.RAW.is_target = False

        self.CHAR_NESTING  = data.Field(batch_first = True, tokenize = list, lower=True)
        self.CHAR  = data.NestedField(self.CHAR_NESTING, tokenize = word_tokenize)
        self.WORD  = data.Field(batch_first = True, tokenize = word_tokenize, lower = True, include_lengths = True)
        self.LABEL = data.Field(sequential = False, unk_token = None, use_vocab = False)

        dict_fields = {'qid'      : ('qid', self.RAW),
                       'start_idx': ('start_idx', self.LABEL),
                       'end_idx'  : ('end_idx', self.LABEL),
                       'context'  : [('c_word', self.WORD), ('c_char', self.CHAR)],
                       'question' : [('q_word', self.WORD), ('q_char', self.CHAR)]}
        
        logging.info("Preprocessing Data - Second Phase :: To Torchtext")
        
        self.train, self.dev = data.TabularDataset.splits(path=path, train=args.train_file + 'l',  \
                                                          validation=args.dev_file + 'l', format='json', fields=dict_fields)
        if args.max_token_len > 0:
            self.train.examples = [e for e in self.train.examples if len(e.c_word) <= args.max_token_len]

        logging.info("Preprocessing Data - Third Phase  :: Building Vocabulary")
        
        self.CHAR.build_vocab(self.train, self.dev)
        self.WORD.build_vocab(self.train, self.dev, vectors=GloVe(name='6B', dim=args.word_dim))

        logging.info("Preprocessing Data - Fourth Phase :: Building Itertors")

        device = torch.device("cuda:{}".format(args.GPU) if torch.cuda.is_available() else "cpu")
        
        self.train_iter = data.BucketIterator(dataset = self.train, batch_size = args.train_batch_size, \
                                              sort_key = lambda x : len(x.c_word), device=device)
        
        self.dev_iter   = data.BucketIterator(dataset = self.dev, batch_size = args.dev_batch_size, sort_key = lambda x : len(x.c_word), device=device)