Example No. 1
    def tokenize(self):
        ENGLISH = Field(sequential=True,
                        use_vocab=True,
                        tokenize=str.split,
                        lower=True,
                        init_token="<sos>",
                        eos_token="<eos>")
        FRENCH = Field(sequential=True,
                        use_vocab=True,
                        tokenize=str.split,
                        lower=True,
                        init_token="<sos>",
                        eos_token="<eos>")

        """
        in order for this to work, change
        "csv.field_size_limit(sys.maxsize)" in torchtext/utils.py to "csv.field_size_limit(maxInt)"
        """
        train, test = TabularDataset.splits(path='./data/', train='train.csv', test='test.csv',
                                            format='csv', fields=[('en',ENGLISH),('fr',FRENCH)])
        ENGLISH.build_vocab(train, test)
        FRENCH.build_vocab(train, test)
        self.en_vocab = ENGLISH
        self.fr_vocab = FRENCH
        self.en_vocabsize = len(ENGLISH.vocab)
        self.fr_vocabsize = len(FRENCH.vocab)

        if self.config.debug :
            train_loader, test_loader = Iterator.splits((train, test), batch_size=2, device="cuda", shuffle=False,
                                                        sort_key=lambda x : len(x.en), sort_within_batch=False)
        else :
            train_loader, test_loader = Iterator.splits((train, test), batch_size=self.config.batchsize, device="cuda", shuffle=False,
                                                        sort_key=lambda x : len(x.en), sort_within_batch=False)
        return train_loader, test_loader
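Rather than editing torchtext/utils.py as the docstring above suggests, the same limit can usually be raised from the calling code before the TabularDataset is built. A minimal sketch (the back-off loop is a common workaround for platforms where sys.maxsize overflows a C long):

import csv
import sys

# Raise the csv field size limit without patching torchtext.
max_int = sys.maxsize
while True:
    try:
        csv.field_size_limit(max_int)
        break
    except OverflowError:
        # sys.maxsize is too large for a C long on this platform; back off.
        max_int = int(max_int / 10)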
Example No. 2
def data_split(text_field, label_field, dataset, mode=False):
    if mode == 'init':
        for index, c in enumerate(dataset):
            partial = NLPDataLoader(c,
                                    text_field=text_field,
                                    label_field=label_field,
                                    test=False)
            if index == 0:
                text_field.build_vocab(partial)
                label_field.build_vocab(list(range(13)))
            else:
                text_counter = text_field.vocab.freqs
                for example in partial.examples:
                    text_counter.update(example.text)
                text_field.vocab = text_field.vocab_cls(
                    text_counter, specials=['<unk>', '<pad>'])
        return
    elif mode is False:
        dataset = NLPDataLoader(dataset,
                                text_field=text_field,
                                label_field=label_field,
                                test=False)
        return Iterator.splits((dataset, ), batch_size=20)
    elif mode is True:
        dataset = NLPDataLoader(dataset,
                                text_field=text_field,
                                label_field=label_field,
                                test=True)
        return Iterator.splits((dataset, ), batch_size=20, shuffle=False)
Example No. 3
    def __init__(self, config):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.batch_size = config['batch_size']
        self.pad_id = self.tokenizer._convert_token_to_id("[PAD]")

        # Objects in which the data will be stored.
        self.text = Field(sequential=True,
                          lower=True,
                          tokenize=self.tokenizer.tokenize,
                          batch_first=True,
                          pad_token='[PAD]',
                          unk_token='[UNK]')
        self.labels = Field(sequential=False, is_target=True)

        self.train, self.dev, self.test = MultiNLI.splits(
            self.text, self.labels)

        # Builds vocabulary for the data.
        self.text.build_vocab(self.train, self.dev, self.test)
        self.labels.build_vocab(self.train)

        # Standard torchtext iterators, these do not return input suitable for BERT.
        self.train_iter, self.dev_iter, self.test_iter = Iterator.splits(
            (self.train, self.dev, self.test),
            batch_size=config['batch_size'],
            device=config['device'])
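The comment above flags a real gap: these batches hold torchtext vocabulary indices, not BERT wordpiece ids. A common workaround (a sketch only, not part of this class) is to skip the vocabulary and let the Field emit the tokenizer's own ids; an alternative field defined in the same __init__ might look like this:

        # Hypothetical alternative to self.text: numericalize with BERT's own
        # wordpiece ids so the iterator output can be fed to the model directly.
        self.bert_text = Field(sequential=True,
                               use_vocab=False,
                               tokenize=self.tokenizer.tokenize,
                               preprocessing=self.tokenizer.convert_tokens_to_ids,
                               batch_first=True,
                               pad_token=self.pad_id,
                               unk_token=self.tokenizer._convert_token_to_id("[UNK]"))

The [CLS] and [SEP] ids can be supplied the same way through init_token and eos_token, and an attention mask can be rebuilt at batch time by comparing token ids against pad_id.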
Example No. 4
    def iters(cls,
              path,
              vectors_name,
              vectors_cache,
              batch_size=64,
              vectors=None,
              unk_init=uniform_unk_init(),
              device="cuda:0",
              train="train.tsv",
              dev="dev.tsv",
              test="test.tsv"):
        if vectors is None:
            vectors = Vectors(name=vectors_name,
                              cache=vectors_cache,
                              unk_init=unk_init)

        train, val, test = cls.splits(path, train=train, dev=dev, test=test)
        cls.TEXT_FIELD.build_vocab(train, val, test, vectors=vectors)
        sort_within_batch = False
        if sort_within_batch:
            print("SORTING WITHIN BATCH!!!!!!!!!!!!!!!!!!!!!!!")
        return Iterator.splits((train, val, test),
                               batch_size=batch_size,
                               repeat=False,
                               sort_within_batch=sort_within_batch,
                               device=device,
                               sort=False)
Example No. 5
    def __init__(self, options):
        print("preparing the dataset for training...")
        self.TEXT = Field(lower=True, tokenize='spacy', batch_first=True)
        self.LABEL = Field(sequential=False, unk_token=None, is_target=True)

        # Since MNLI does not provide public test data
        # self.dev - MultiNLI Matched data
        # self.test - MultiNLI Mismatched data
        # To evaluate your system on the full test set, use the following Kaggle in Class competitions.
        # https://www.kaggle.com/c/multinli-matched-open-evaluation
        # https://www.kaggle.com/c/multinli-mismatched-open-evaluation

        self.train, self.dev, self.test = datasets.MultiNLI.splits(
            self.TEXT, self.LABEL)

        self.TEXT.build_vocab(self.train, self.dev)
        self.LABEL.build_vocab(self.train)

        vector_cache_loc = '.vector_cache/multinli_vectors.pt'
        if os.path.isfile(vector_cache_loc):
            self.TEXT.vocab.vectors = torch.load(vector_cache_loc)
        else:
            self.TEXT.vocab.load_vectors('glove.840B.300d')
            makedirs(os.path.dirname(vector_cache_loc))
            torch.save(self.TEXT.vocab.vectors, vector_cache_loc)

        self.train_iter, self.dev_iter, self.test_iter = Iterator.splits(
            (self.train, self.dev, self.test),
            batch_size=options['batch_size'],
            device=options['device'],
            sort_key=lambda x: len(x.premise),
            sort_within_batch=False,
            shuffle=True)
Example No. 6
    def create_iter(self, batch_size):
        """
        构建迭代器
        :param batch_size: 每批的大小
        :return: iter
        """
        # 定义torchtext中的Field
        fields = [('english', self.english), ('chinese', self.chinese)]
        examples = []
        # 构建中英文example
        for en, ch in zip(self.english_list, self.chinese_list):
            item = [en, ch]
            examples.append(data.Example().fromlist(item, fields))
        # 划分训练集,测试集
        train, test = Dataset(examples=examples,
                              fields=fields).split(split_ratio=0.8)
        self.english.build_vocab(train)
        self.chinese.build_vocab(train)
        self.english_voca_size = len(self.english.vocab)
        self.chinese_voca_size = len(self.chinese.vocab)
        train_iter, test_iter = Iterator.splits(
            (train, test),
            batch_sizes=(batch_size, len(test)),
            sort_key=lambda x: len(x.english),
            sort_within_batch=True,
            device=-1)

        return train_iter, test_iter
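For reference, batches produced by these iterators expose one attribute per field named above; a minimal consumption sketch (shapes assume the default sequence-first layout, since batch_first is not set on the fields):

# Hypothetical usage of the iterators returned by create_iter().
for batch in train_iter:
    src = batch.english   # LongTensor of shape (src_len, batch_size)
    trg = batch.chinese   # LongTensor of shape (trg_len, batch_size)
    # ... feed src/trg into the model here
    break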
Example No. 7
  def __init__(self, batch_size):

    self.text = Field(
        lower=True,
        tokenize=lambda x: [tok.text for tok in spacy_en.tokenizer(x)],
        batch_first=True)
    self.label = Field(sequential=False, unk_token=None, is_target=True)

    self.train, self.dev, self.test = SNLI.splits(self.text, self.label)
    self.sizes = {
        'train': len(self.train),
        'val': len(self.dev),
        'test': len(self.test)
    }
    self.text.build_vocab(self.train, self.dev)
    self.label.build_vocab(self.train)

    vector_cache_loc = '.vector_cache/snli_vectors.pt'
    if os.path.isfile(vector_cache_loc):
      self.text.vocab.vectors = torch.load(vector_cache_loc)
    else:
      self.text.vocab.load_vectors('glove.840B.300d')
      torch.save(self.text.vocab.vectors, vector_cache_loc)

    # Batching
    self.train_iter, self.dev_iter, self.test_iter = Iterator.splits(
        (self.train, self.dev, self.test),
        batch_size=batch_size,
        device='cuda:0' if torch.cuda.is_available() else 'cpu')

    self.vocab_size = len(self.text.vocab)
    self.out_dim = len(self.label.vocab)
    self.labels = self.label.vocab.stoi
Example No. 8
    def __init__(self, batch_size):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.pad_id = self.tokenizer._convert_token_to_id("[PAD]")
        self.batch_size = batch_size
        # Objects in which the data will be stored.
        self.text = Field(sequential=True,
                          lower=True,
                          tokenize=self.tokenizer.tokenize,
                          batch_first=True,
                          pad_token='[PAD]',
                          unk_token='[UNK]')
        self.labels = Field(sequential=False, is_target=True)

        self.train, self.dev, self.test = MultiNLI.splits(
            self.text, self.labels)

        # Builds vocabulary for the data.
        self.text.build_vocab(self.train, self.dev, self.test)
        self.labels.build_vocab(self.train)

        self.train_size = len(self.train)
        self.val_size = len(self.dev)
        self.test_size = len(self.test)
        self.name = 'mnli'
        # Standard torchtext iterators, these do not return input suitable for BERT.
        self.train_iter, self.dev_iter, self.test_iter = Iterator.splits(
            (self.train, self.dev, self.test),
            batch_size=self.batch_size,
            device=torch.device(
                'cuda:0' if torch.cuda.is_available() else 'cpu'))
Example No. 9
    def dataloader(self):
        train_iter, valid_iter, test_iter = Iterator.splits(
            (self.train_data, self.valid_data, self.test_data),
            sort_within_batch=True,
            sort_key=lambda x: len(x.kor),
            batch_size=self.args.batch_size,
            device=device)

        return train_iter, valid_iter, test_iter
Example No. 10
def load_dataset(text, label, args, **kwargs):
    train_dataset, dev_dataset, test_dataset = get_dataset(
        '../data', text, label)
    text.build_vocab(train_dataset, dev_dataset, test_dataset)
    label.build_vocab(train_dataset, dev_dataset)
    train_data, dev_data, test_data = Iterator.splits(
        (train_dataset, dev_dataset, test_dataset),
        batch_sizes=(args.batch_size, len(dev_dataset), len(test_dataset)),
        sort_key=lambda x: len(x.text),
        **kwargs)
    return train_data, dev_data, test_data
def binary_classification(obj):
    tokenize = lambda x: x.split()
    TEXT = Field(sequential=True,
                 tokenize=tokenize,
                 lower=True,
                 batch_first=True,
                 fix_length=obj.fix_length)

    LABEL = Field(sequential=False,
                  dtype=torch.float,
                  batch_first=True,
                  use_vocab=False)

    fields = [
        ('id', None),
        ('content', TEXT),
        ('trump_percentage', LABEL),
    ]

    train_csv = 'twitter_pollster_' + str(
        obj.days) + '_days_train_trump_percentage.csv'
    test_csv = 'twitter_pollster_' + str(
        obj.days) + '_days_test_trump_percentage.csv'

    train_dataset = TabularDataset(path=obj.data_path + '/' + train_csv,
                                   format='csv',
                                   skip_header=True,
                                   fields=fields)

    test_dataset = TabularDataset(path=obj.data_path + '/' + test_csv,
                                  format='csv',
                                  skip_header=True,
                                  fields=fields)

    TEXT.build_vocab(train_dataset,
                     vectors=GloVe(name=obj.Glove_name, dim=obj.embedding_dim))
    vocab_size = len(TEXT.vocab)
    word_embeddings = TEXT.vocab.vectors
    print("vector size of text vocabulary: ", TEXT.vocab.vectors.size())

    train_iter, test_iter = Iterator.splits(
        (train_dataset, test_dataset),
        sort_key=lambda x: len(x.content),
        batch_sizes=(obj.train_batch_size, obj.test_batch_size),
        device=torch.device(obj.device),
        sort_within_batch=True,
        repeat=False)

    train_iter_ = BatchWrapper(train_iter, 'content', ['trump_percentage'])
    test_iter_ = BatchWrapper(test_iter, 'content', ['trump_percentage'])

    return TEXT, vocab_size, word_embeddings, train_iter_, test_iter_
Example No. 12
    def create_iterators(self):
        '''

        train_iter, dev_iter, test_iter = BucketIterator.splits(
            (self.datasets['train'], self.datasets['dev'], self.datasets['test']),
            # batch_sizes=(self.args.batch_size, len(self.datasets['dev']), len(self.datasets['test'])),
            batch_sizes=(self.args.batch_size, self.args.batch_size, self.args.batch_size),
            device=self.args.device)
        '''

        # Iterator.splits expects a tuple of datasets; for a single dataset the
        # plain Iterator constructor returns the iterator directly.
        train_iter = Iterator(self.datasets['train'],
                              batch_size=self.args.batch_size,
                              device=self.args.device)

        dev_iter = Iterator(self.datasets['dev'],
                            batch_size=len(self.datasets['dev']),
                            device=self.args.device)

        test_iter = Iterator(self.datasets['test'],
                             batch_size=len(self.datasets['test']),
                             device=self.args.device)

        return train_iter, dev_iter, test_iter
    def __prepare_train_data(self, X, y, sample_weight):
        self.__text_field = Field(lower=True)
        self.__label_field = Field(sequential=False)
        self.__text_field.tokenize = self.__tokenize
        sample_weight = None if sample_weight is None else list(sample_weight)
        sw = [1 for yi in y] if sample_weight is None else sample_weight
        s = y if Counter(y).most_common()[-1][1] > 1 else None
        X_t, X_d, y_t, y_d, w_t, _ = split(X,
                                           y,
                                           sw,
                                           shuffle=True,
                                           stratify=s,
                                           random_state=self.random_state,
                                           train_size=self.split_ratio)
        fields = [("text", self.__text_field), ("label", self.__label_field)]
        examples = [[X_t[i], y_t[i]] for i in range(len(X_t))]
        examples = [Example.fromlist(example, fields) for example in examples]
        weights = compute_sample_weight(self.class_weight, y_t)
        weights = [weights[i] * w_t[i] for i in range(len(y_t))]
        min_weight = min(weights)
        weights = [int(round(weight / min_weight)) for weight in weights]

        for i in range(len(X_t)):
            Xi = [X_t[i] for j in range(weights[i] - 1)]
            examples += [Example.fromlist([x, y_t[i]], fields) for x in Xi]

        train_data = Dataset(examples, fields)
        dev_data = [[X_d[i], y_d[i]] for i in range(len(X_d))]
        dev_data = [Example.fromlist(example, fields) for example in dev_data]
        dev_data = Dataset(dev_data, fields)

        self.__text_field.build_vocab(train_data,
                                      dev_data,
                                      vectors=self.vectors)
        self.__label_field.build_vocab(train_data, dev_data)

        batch_sizes = (self.batch_size, len(dev_data))
        return Iterator.splits((train_data, dev_data),
                               batch_sizes=batch_sizes,
                               sort_key=lambda ex: len(ex.text),
                               repeat=False)
Example No. 14
    def __init__(self, options):
        self.TEXT = Field(lower=True, tokenize='spacy', batch_first=True)
        self.LABEL = Field(sequential=False, unk_token=None, is_target=True)

        self.train, self.dev, self.test = datasets.SNLI.splits(
            self.TEXT, self.LABEL)

        self.TEXT.build_vocab(self.train, self.dev)
        self.LABEL.build_vocab(self.train)

        vector_cache_loc = '.vector_cache/snli_vectors.pt'
        if os.path.isfile(vector_cache_loc):
            self.TEXT.vocab.vectors = torch.load(vector_cache_loc)
        else:
            self.TEXT.vocab.load_vectors('glove.840B.300d')
            makedirs(os.path.dirname(vector_cache_loc))
            torch.save(self.TEXT.vocab.vectors, vector_cache_loc)

        self.train_iter, self.dev_iter, self.test_iter = Iterator.splits(
            (self.train, self.dev, self.test),
            batch_size=options['batch_size'],
            device=options['device'])
Example No. 15
def load_dataset(config, device):

    label_dict = {"observing": 0, "against": 1, "for": 2}
    LABEL = Field(use_vocab = False, sequential = False,\
     dtype = torch.long, preprocessing = lambda x: label_dict[x.strip()])

    SEQ = Field(dtype = torch.long, lower = True, batch_first = True,\
     preprocessing = lambda x:x[:45], include_lengths = True)
    SENT = Field(dtype = torch.long, lower = True, batch_first = True,\
     preprocessing = lambda x:x[:45], include_lengths = False)

    DOC = NestedField(SENT, tokenize = lambda s:s.strip().split(' </s> '), \
     preprocessing = lambda s:[x for x in s[:45] if x], dtype = torch.long,\
     include_lengths = True)

    fields = [('label', LABEL), ('claim', SEQ), ('hline', SEQ),\
     ('abst', SEQ), ('body', DOC)]

    train, test = TabularDataset.splits(path="../stance_data/", format = "tsv",\
     fields = fields, train = config.train_file, test = config.test_file)
    train, val = train.split(split_ratio=0.80)

    vectors = GloVe(name="6B",
                    dim=config.embed_dim,
                    cache='/users4/jwduan/vectors/')
    DOC.build_vocab(train, val, test, vectors=vectors)

    SEQ.build_vocab()
    SEQ.vocab = DOC.vocab

    config.vocab_size = len(DOC.vocab)
    train_loader, val_loader, test_loader = Iterator.splits((train, val, test),\
     batch_sizes = (config.batch_size, 256, 256), sort_key = lambda x:len(x.body), sort = True,
      device = device, shuffle = True, repeat = False)

    return (train_loader, val_loader, test_loader), DOC.vocab.vectors
Example No. 16
    skip_header=True,
    fields=test_data_fields)

''' Get embedding from cache '''
vectors = GloVe(name='6B', dim=100, cache='..\.vector_cache')

''' Build vocabulary and embed it '''
text_field.build_vocab(train_data_set, test_dataset, vectors=vectors)
label_field.build_vocab(valid_data_set)

''' Define Bucket Iterators '''
train_iter, val_iter = Iterator.splits(
    (train_data_set, valid_data_set),
    batch_sizes=(64, 64),
    device=device,
    sort_key=lambda x: len(x.text_field),
    sort_within_batch=False,
    repeat=False,
    shuffle=True
)

test_iter = Iterator(test_dataset,
                     batch_size=64,
                     device=device,
                     sort=False,
                     sort_within_batch=False,
                     repeat=False)


''' Define model '''
Example No. 17
def basic_meta_data(obj):
    tokenize = lambda x: x.split()
    TEXT = Field(sequential=True,
                 tokenize=tokenize,
                 lower=True,
                 batch_first=True,
                 fix_length=obj.fix_length)

    VARIABLE = Field(sequential=False,
                  dtype=torch.float,
                  batch_first=True,
                  use_vocab=False)
    
    LABEL = Field(sequential=False,
                  dtype=torch.float,
                  batch_first=True,
                  use_vocab=False)
    
    fields = [#('id', None),
              ('content', TEXT),
              ('avg_followers',VARIABLE),
              ('avg_following', VARIABLE),
              ('avg_left', VARIABLE),
              ('avg_news', VARIABLE),
              ('avg_right', VARIABLE),
              ('time', VARIABLE),
              ('baseline_pred_left', VARIABLE),
              ('baseline_pred_mid', VARIABLE),
              ('baseline_pred_right', VARIABLE),
              ('left', LABEL),
             ('mid', LABEL),
             ('right', LABEL),
             ('7', None),
             ('8', None),
             ('9', None)]
    
    #train_csv = 'twitter_pollster_'+str(obj.days)+'_days_train_small.csv'
    #test_csv = 'twitter_pollster_'+str(obj.days)+'_days_test_small.csv'
    train_csv = 'train1.csv'
    test_csv = 'test1.csv'
    
    train_dataset = TabularDataset(path=obj.data_path+'/'+train_csv,
                                   format='csv',
                                   skip_header=True,
                                   fields=fields)
    
    test_dataset = TabularDataset(path=obj.data_path+'/'+test_csv,
                                  format='csv',
                                  skip_header=True,
                                  fields=fields)
    
    TEXT.build_vocab(train_dataset, vectors=GloVe(name=obj.Glove_name,
                                                  dim=obj.embedding_dim, 
                                                 cache=glove_path))
    vocab_size = len(TEXT.vocab)
    word_embeddings = TEXT.vocab.vectors
    print ("vector size of text vocabulary: ", TEXT.vocab.vectors.size())
    
    train_iter, test_iter = Iterator.splits(
            (train_dataset, test_dataset),
            sort_key=lambda x: len(x.content), 
            batch_sizes=(obj.train_batch_size, obj.test_batch_size),
            device=torch.device(obj.device),
            sort_within_batch=True,
            repeat=False)
    
    train_iter_ = BatchWrapper(train_iter, ['content', 'avg_followers', 'avg_following', 'avg_left', 'avg_news', 'avg_right', 'time', 'baseline_pred_left', 'baseline_pred_mid', 'baseline_pred_right'], ['left', 'mid', 'right'])
    test_iter_ = BatchWrapper(test_iter, ['content', 'avg_followers', 'avg_following', 'avg_left', 'avg_news', 'avg_right', 'time', 'baseline_pred_left', 'baseline_pred_mid', 'baseline_pred_right'], ['left', 'mid', 'right'])
    
    return TEXT, vocab_size, word_embeddings, train_iter_, test_iter_
Example No. 18
    test_dataset = TabularDataset(path='mydata/'+test_csv,
                                  format='csv',
                                  skip_header=True,
                                  fields=fields)

    TEXT.build_vocab(train_dataset, vectors=GloVe(name='twitter.27B',
                                                  dim=25))
    vocab_size = len(TEXT.vocab)
    word_embeddings = TEXT.vocab.vectors
    print ("vector size of text vocabulary: ", TEXT.vocab.vectors.size())

    train_iter, test_iter = Iterator.splits(
            (train_dataset, test_dataset),
            sort_key=lambda x: len(x.content), 
            batch_sizes=(7, 7),
            device=torch.device('cpu'),
            sort_within_batch=True,
            repeat=False)
    
    print(train_csv, test_csv)
    train_iter_ = BatchWrapper(train_iter, ['content', 'avg_followers', 'avg_following', 'avg_left', 'avg_news', 'avg_right', 'time', 'baseline_pred_left', 'baseline_pred_mid', 'baseline_pred_right'], ['left', 'mid', 'right'])
    test_iter_ = BatchWrapper(test_iter, ['content', 'avg_followers', 'avg_following', 'avg_left', 'avg_news', 'avg_right', 'time', 'baseline_pred_left', 'baseline_pred_mid', 'baseline_pred_right'], ['left', 'mid', 'right'])

    batch0 = None
    batch1 = None
    batch2 = None
    for iter, batch in enumerate(train_iter_, 1):
        if iter==1:
            #print(iter, batch)
            batch0 = batch[0]
Example No. 19
# 2. Build the tabular dataset
ds_train, ds_test = TabularDataset.splits(path='./data/',
                                          train='train.tsv',
                                          test='test.tsv',
                                          format='tsv',
                                          fields=[('label', LABEL),
                                                  ('text', TEXT)],
                                          skip_header=False)

# 3. Build the vocabulary
TEXT.build_vocab(ds_train)

# 4. Build the data pipeline iterators
train_iter, test_iter = Iterator.splits((ds_train, ds_test),
                                        sort_within_batch=True,
                                        sort_key=lambda x: len(x.text),
                                        batch_sizes=(BATCH_SIZE, BATCH_SIZE),
                                        device='cuda:4')


# Wrap the data pipeline so that it yields (features, label) pairs, similar to torch.utils.data.DataLoader
class DataLoader:
    def __init__(self, data_iter):
        self.data_iter = data_iter
        self.length = len(data_iter)

    def __len__(self):
        return self.length

    def __iter__(self):
        # Note: adjust features to batch first here, and adjust the label's shape and dtype
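        # The snippet is truncated here; the lines below are an assumed
        # completion consistent with the comment above, not the original code.
        for batch in self.data_iter:
            features = batch.text.t()                 # (seq_len, batch) -> (batch, seq_len)
            labels = batch.label.float().view(-1, 1)  # adjust the label's shape and dtype
            yield (features, labels)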
Example No. 20
def main(language, hidden_dim, dropout, proc, letter_proc, objective, operator,
         alpha, lr, momentum, optimizer, batch_size, n_epochs,
         pretrained_embeddings, letter_hidden_dim, letter_embedding_dim,
         n_samples, pad_edge, augment, _seed, _run, _log):
    if objective not in ['erm', 'nll']:
        raise ValueError("`objective` should be in ['erm', 'nll'],"
                         "got %s" % objective)

    # Technical
    device = init_system()

    if pad_edge:
        init_token = '<init>'
        eos_token = '<end>'
    else:
        init_token = None
        eos_token = None
    # Data loading using torchtext abstraction
    tags = ttdata.Field(sequential=True,
                        include_lengths=True,
                        preprocessing=iob1_iobes,
                        init_token=init_token,
                        eos_token=eos_token,
                        pad_token=None,
                        unk_token=None,
                        batch_first=True)
    sentences = ttdata.Field(sequential=True,
                             include_lengths=False,
                             batch_first=True,
                             init_token=init_token,
                             eos_token=eos_token,
                             preprocessing=zero_num)
    letter = ttdata.Field(sequential=True,
                          tokenize=list,
                          include_lengths=True,
                          init_token=None,
                          eos_token=None,
                          preprocessing=zero_num,
                          batch_first=True)
    letters = NestedField(
        letter,
        use_vocab=True,
        tensor_type=torch.FloatTensor,
        init_token=init_token,
        eos_token=eos_token,
    )

    if language == 'en':
        fields = [[('sentences', sentences), ('letters', letters)], ('', None),
                  ('', None), ('tags', tags)]
    elif language == 'de':
        fields = [[('sentences', sentences), ('letters', letters)], ('', None),
                  ('', None), ('', None), ('tags', tags)]
    elif language in ['es', 'nl']:
        fields = [[('sentences', sentences), ('letters', letters)], ('', None),
                  ('tags', tags)]
    else:
        raise ValueError('Wrong language')

    tagger_languages = {'en': 'eng', 'nl': 'ned', 'de': 'deu', 'es': 'esp'}

    train_data, val_data, test_data = SequenceTaggingDataset.splits(
        path=expanduser('~/data/sdtw_data/conll'),
        train='%s.train' % tagger_languages[language],
        validation='%s.testa' % tagger_languages[language],
        test='%s.testb' % tagger_languages[language],
        n_samples=n_samples,
        fields=fields)

    letters.build_vocab(train_data, val_data, test_data)
    tags.build_vocab(train_data)
    tag_itos = tags.vocab.itos
    if pad_edge:
        eos_idx = tags.vocab.stoi[tags.eos_token]
        init_idx = tags.vocab.stoi[tags.init_token]
        tag_itos[eos_idx] = 'O'
        tag_itos[init_idx] = 'O'
    else:
        eos_idx = None
        init_idx = None

    if isinstance(pretrained_embeddings, int):
        sentences.build_vocab(train_data, val_data, test_data)
        embedding_dim = pretrained_embeddings
    else:
        if pretrained_embeddings == 'ner':
            vectors = CaseInsensitiveVectors(
                expanduser('~/data/sdtw_data/ner/%s' %
                           tagger_languages[language]),
                unk_init=lambda x: x.normal_(0, 1),
                cache=expanduser('~/cache'))
        elif 'glove' in pretrained_embeddings:
            _, name, dim = pretrained_embeddings.split('.')
            dim = dim[:-1]
            GloVe.__getitem__ = CaseInsensitiveVectors.__getitem__
            vectors = GloVe(name=name, dim=dim, cache=expanduser('~/cache'))
        elif pretrained_embeddings == 'fasttext':
            FastText.__getitem__ = CaseInsensitiveVectors.__getitem__
            FastText.cache = CaseInsensitiveVectors.cache
            vectors = FastText(language=language, cache=expanduser('~/cache'))
        # extend vocab with words of test/val set that has embeddings in
        # pre-trained embedding
        # A prod-version would do it dynamically at inference time
        counter = Counter()
        sentences.build_vocab(val_data, test_data)
        for word in sentences.vocab.stoi:
            if word in vectors.stoi or word.lower() in vectors.stoi or \
                    re.sub(r'\d', '0', word.lower()) in vectors.stoi:
                counter[word] = 1
        eval_vocab = Vocab(counter)
        print("%i/%i eval/test word in pretrained" %
              (len(counter), len(sentences.vocab.stoi)))
        sentences.build_vocab(train_data)
        prev_vocab_size = len(sentences.vocab.stoi)
        sentences.vocab.extend(eval_vocab)
        new_vocab_size = len(sentences.vocab.stoi)
        print('New vocab size: %i (was %i)' %
              (new_vocab_size, prev_vocab_size))
        sentences.vocab.load_vectors(vectors)
        embedding_dim = sentences.vocab.vectors.shape[1]
    artifact_dir = _run.info['artifact_dir']
    vocab_dict = {
        'sentences': sentences.vocab,
        'tags': tags.vocab,
        'letters': letter.vocab
    }
    torch.save(vocab_dict, open(join(artifact_dir, 'vocab.pt'), 'wb+'))

    unk_idx = sentences.vocab.stoi[sentences.unk_token]
    padding_idx = sentences.vocab.stoi[sentences.pad_token]
    singleton_idx = [
        tags.vocab.stoi[singleton] for singleton in tags.vocab.stoi
        if 'S-' in singleton
    ]
    tagset_size = len(tags.vocab)
    vocab_size = len(sentences.vocab)
    letter_size = len(letters.vocab)

    device_iter = -1 if device.type == 'cpu' else device.index
    train_iter, val_iter, test_iter = Iterator.splits(
        (train_data, val_data, test_data),
        sort_within_batch=True,
        batch_sizes=(batch_size, 512, 512),
        device=device_iter)
    train_test_iter = Iterator(train_data,
                               sort_within_batch=True,
                               batch_size=512,
                               shuffle=True,
                               device=device_iter)
    eval_iter = {
        'val': val_iter,
        'test': test_iter,
        'train_test': [next(iter(train_test_iter))]
    }

    model = Tagger(embedding_dim,
                   vocab_size,
                   tagset_size,
                   hidden_dim=hidden_dim,
                   proc=proc,
                   padding_idx=padding_idx,
                   letter_proc=letter_proc,
                   letter_embedding_dim=letter_embedding_dim,
                   letter_hidden_dim=letter_hidden_dim,
                   letter_size=letter_size,
                   dropout=dropout,
                   eos_idx=eos_idx,
                   init_idx=init_idx,
                   alpha=alpha,
                   operator=operator)

    # Load vectors
    if hasattr(sentences.vocab, 'vectors'):
        model.embedder.word_embeddings.weight.data = sentences.vocab.vectors
        model.embedder.word_embeddings.weight.data[padding_idx].fill_(0.)

    model = model.to(device=device)

    if operator == 'softmax':
        loss_function = OurNLLLoss()
    else:
        loss_function = BinaryMSELoss()

    score_function = functools.partial(ner_score,
                                       tag_itos=tag_itos,
                                       format='iobes')

    if optimizer == 'sgd':
        optimizer = torch.optim.SGD(params=model.parameters(),
                                    lr=lr * batch_size,
                                    momentum=momentum)
    elif optimizer == 'adam':
        optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)
    else:
        raise ValueError()
    scheduler = ReduceLROnPlateau(optimizer,
                                  mode='min',
                                  factor=0.5,
                                  patience=5,
                                  threshold=1e-3,
                                  cooldown=2)

    for fold in eval_iter:
        _run.info['%s_loss' % fold] = []
        _run.info['%s_prec' % fold] = []
        _run.info['%s_recall' % fold] = []
        _run.info['%s_f1' % fold] = []
    _run.info['epochs'] = []
    _run.info['time'] = []
    last_epoch = floor(train_iter.epoch)
    t0 = time.clock()
    total_time = 0

    for batch in train_iter:
        epoch = floor(train_iter.epoch)
        if epoch > last_epoch:
            t1 = time.clock()
            elapsed = t1 - t0
            total_time += elapsed
            model.eval()
            _log.info("epoch %i, time/epoch %.3f s" % (epoch, elapsed))
            if epoch % 10 == 0:
                dump_model(model, 'model_%i.pt' % epoch)
            for fold in eval_iter:
                this_iter = eval_iter[fold]
                this_iter = iter(this_iter)
                loss, prec, recall, f1 = validate(model, this_iter,
                                                  score_function, objective,
                                                  loss_function)
                if fold == 'val':
                    scheduler.step(loss.item(), epoch=epoch)
                _log.info("%s: loss %.4f, prec %.4f, recall %.4f, f1 %.4f" %
                          (fold, loss, prec, recall, f1))
                _run.info['%s_loss' % fold].append(loss.item())
                _run.info['%s_prec' % fold].append(prec)
                _run.info['%s_recall' % fold].append(recall)
                _run.info['%s_f1' % fold].append(f1)
            _run.info['time'].append(total_time)
            _run.info['epochs'].append(epoch)
            if epoch > n_epochs:
                break
            t0 = time.clock()
        data = make_data(batch,
                         augment=augment,
                         unk_idx=unk_idx,
                         singleton_idx=singleton_idx)
        model.train()
        model.zero_grad()
        loss = compute_loss(model, data, objective, loss_function)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 5, norm_type=2)
        optimizer.step()
        last_epoch = epoch
    dump_model(model, 'model_final.pt')

    return _run.info['test_f1'][-1]
Example No. 21
def create_dataset(config: Config, device: torch.device) -> Tuple[Vocab, Iterator, Iterator, Iterator]:

    fields = dict()
    raw_field = RawField()
    # torchtext 0.3.1
    # AttributeError: 'RawField' object has no attribute 'is_target'
    raw_field.is_target = False
    fields[SeqType.ArticleID.value] = (SeqType.ArticleID.value, raw_field)

    time_field = Field(use_vocab=False, batch_first=True, sequential=False)
    fields['jst_hour'] = (SeqType.Time.value, time_field)

    token_field = \
        Field(use_vocab=True,
              init_token=SpecialToken.BOS.value,
              eos_token=SpecialToken.EOS.value,
              pad_token=SpecialToken.Padding.value,
              unk_token=SpecialToken.Unknown.value) \
        if config.use_init_token_tag \
        else Field(use_vocab=True,
                   eos_token=SpecialToken.EOS.value,
                   pad_token=SpecialToken.Padding.value,
                   unk_token=SpecialToken.Unknown.value)

    fields['processed_tokens'] = (SeqType.Token.value, token_field)

    seqtypes = [SeqType.RawShort, SeqType.RawLong,
                SeqType.MovRefShort, SeqType.MovRefLong,
                SeqType.NormMovRefShort, SeqType.NormMovRefLong,
                SeqType.StdShort, SeqType.StdLong]

    for (ric, seqtype) in itertools.product(config.rics, seqtypes):
        n = N_LONG_TERM \
            if seqtype.value.endswith('long') \
            else N_SHORT_TERM
        price_field = Field(use_vocab=False,
                            fix_length=n,
                            batch_first=True,
                            pad_token=0.0,
                            preprocessing=lambda xs: [float(x) for x in xs],
                            dtype=torch.float)
        key = stringify_ric_seqtype(ric, seqtype)
        fields[key] = (key, price_field)

    train, val, test = \
        TabularDataset.splits(path=str(config.dir_output),
                              format='json',
                              train='alignment-train.json',
                              validation='alignment-valid.json',
                              test='alignment-test.json',
                              fields=fields)

    token_field.build_vocab(train, min_freq=config.token_min_freq)

    batch_size = config.batch_size
    train_iter, val_iter, test_iter = \
        Iterator.splits((train, val, test),
                        batch_sizes=(batch_size, batch_size, batch_size),
                        device=-1 if device.type == 'cpu' else device,
                        repeat=False,
                        sort=False)

    return (token_field.vocab, train_iter, val_iter, test_iter)
Example No. 22
def main(params):
    try:
        output_dir = os.path.join(
            params['outf'], datetime.strftime(datetime.now(), "%Y%m%d_%H%M"))
        os.makedirs(output_dir)
    except OSError:
        pass

    if torch.cuda.is_available() and not params['cuda']:
        print(
            "WARNING: You have a CUDA device, so you should probably run with --cuda"
        )

    writer = SummaryWriter(output_dir)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    SOS_token = '<sos>'
    EOS_token = '<eos>'
    PAD_token = '<pad>'

    TEXT = Field(sequential=True,
                 use_vocab=True,
                 tokenize=tokenizer,
                 lower=True,
                 batch_first=True,
                 init_token=SOS_token,
                 eos_token=EOS_token)
    # LABEL = Field(sequential=True, use_vocab=True, tokenize=tokenizer, is_target=True, batch_first=True, init_token='#', eos_token='$')
    IMG_IND = Field(sequential=False, use_vocab=False, batch_first=True)

    fields = {
        'ans': ('ans', TEXT),
        'img_ind': ('img_ind', IMG_IND),
        'question': ('question', TEXT)
    }

    train, val = TabularDataset.splits(
        path=params['dataroot'],
        train=params['input_train'],
        validation=params['input_test'],
        format='csv',
        skip_header=False,
        fields=fields,
    )

    print("Train data")
    print(train[0].__dict__.keys())
    print(train[0].ans, train[0].img_ind, train[0].question)

    print("Validation data")
    print(val[0].__dict__.keys())
    print(val[0].ans, val[0].img_ind, val[0].question)

    print("Building Vocabulary ..")
    TEXT.build_vocab(train, vectors='glove.6B.100d')
    vocab = TEXT.vocab

    PAD_token_ind = vocab.stoi[PAD_token]
    SOS_token_ind = vocab.stoi[SOS_token]
    EOS_token_ind = vocab.stoi[EOS_token]

    print("Creating Embedding from vocab vectors ..")
    txt_embed = nn.Embedding.from_pretrained(vocab.vectors)
    print("Text Embeddings are generated of size ", txt_embed.weight.size())

    print("Loading Image embeddings ..")
    with open(params['image_embeddings'], 'rb') as f:
        img_embs = pkl.load(f)['image_features']

    img_embed = nn.Embedding.from_pretrained(torch.FloatTensor(img_embs))

    print("Creating Encoder_attn ..")
    encoder = Encoder(img_embed, txt_embed, params)
    print(encoder)

    print("Creating Decoder ..")
    decoder = Decoder(txt_embed, params)
    print(decoder)

    criterion = torch.nn.PairwiseDistance(keepdim=False)
    criterion.to(device)

    ## [Completed] TODO(Jay) : Remove this check and use .to(device)
    # if params['cuda']:
    #     encoder.cuda()
    #     decoder.cuda()
    #     criterion.cuda()

    encoder_optimizer = torch.optim.Adam(encoder.parameters(),
                                         lr=params['lr'],
                                         weight_decay=1e-5,
                                         amsgrad=True)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(),
                                         lr=params['lr'],
                                         weight_decay=1e-5,
                                         amsgrad=True)

    encoder_LR_scheduler = ReduceLROnPlateau(encoder_optimizer,
                                             'min',
                                             patience=1)
    decoder_LR_scheduler = ReduceLROnPlateau(decoder_optimizer,
                                             'min',
                                             patience=1)

    if params['use_checkpoint']:
        checkpoint = torch.load(params['enc_dec_model'])
        encoder.load_state_dict(checkpoint['encoder_state_dict'])
        decoder.load_state_dict(checkpoint['decoder_state_dict'])
        encoder_optimizer.load_state_dict(
            checkpoint['encoder_optimizer_state_dict'])
        decoder_optimizer.load_state_dict(
            checkpoint['decoder_optimizer_state_dict'])
        encoder_LR_scheduler.load_state_dict(
            checkpoint['encoder_LR_scheduler'])
        decoder_LR_scheduler.load_state_dict(
            checkpoint['decoder_LR_scheduler'])

    encoder.to(device)
    decoder.to(device)

    train_iter, val_iter = Iterator.splits(
        (train, val),
        batch_sizes=(params['batch_size'], params['batch_size']),
        sort=False,
        shuffle=True,
        device=device)

    for epoch in range(params['niter']):

        for is_train in (True, False):
            print('Is Training: ', is_train)
            if is_train:
                encoder.train()
                decoder.train()
                data_iter = train_iter
            else:
                encoder.eval()
                decoder.eval()
                data_iter = val_iter

            total_loss = 0
            total_acc = 0

            with torch.set_grad_enabled(is_train):

                for i, row in enumerate(data_iter, 1):

                    if len(row) < params['batch_size']:
                        continue

                    encoder.zero_grad()
                    decoder.zero_grad()

                    ans, img_ind, question = row.ans, row.img_ind, row.question
                    batch_size = params['batch_size']

                    ## target_length-1 since we are not predicting SOS token
                    target_length = ans.shape[1] - 1

                    encoder.hidden = encoder.init_hidden(params)

                    ans = ans.to(device)
                    img_ind = img_ind.to(device)
                    question = question.to(device)
                    encoder.hidden = (encoder.hidden[0].to(device),
                                      encoder.hidden[1].to(device))

                    ans_embed = txt_embed(ans)

                    encoder_output = encoder(img_ind, question)

                    decoder_input = ans_embed[:, 0].reshape(
                        (batch_size, 1, -1))  ## (batch_size, 1) check again
                    ans_embed = ans_embed[:, 1:]  ## removed the SOS token
                    ans = ans[:, 1:]  ## removed the SOS token

                    decoder_hidden = decoder.init_hidden(
                        encoder_output, params)

                    if params['cuda']:
                        decoder_hidden = (decoder_hidden[0].cuda(),
                                          decoder_hidden[1].cuda())

                    outputs = torch.zeros(batch_size, target_length,
                                          params['txt_emb_size'])

                    ## [Completed] TODO(Jay) : remove the sos token from the ans and ans_embed before calc loss and acc
                    for di in range(target_length - 1):
                        decoder_output, decoder_hidden = decoder(
                            decoder_input, decoder_hidden)

                        ## TODO(Jay) : Detach the input from history
                        decoder_input = decoder_output

                        outputs[:, di, :] = decoder_output.reshape(
                            batch_size, -1)

                    filtered_labels, filtered_label_embeds, filtered_outputs = filterOutput(
                        outputs.reshape(batch_size * target_length, -1),
                        ans.reshape(batch_size * target_length, -1),
                        ans_embed.reshape(batch_size * target_length, -1),
                        PAD_token_ind)

                    filtered_label_embeds = filtered_label_embeds.to(device)
                    filtered_outputs = filtered_outputs.to(device)

                    batch_loss = maskedLoss(filtered_label_embeds,
                                            filtered_outputs, criterion)

                    batch_acc = word_accuracy(filtered_outputs,
                                              vocab.vectors.to(device),
                                              filtered_labels)

                    total_loss += batch_loss.item()
                    total_acc += batch_acc

                    if is_train:
                        if i % 1000 == 0:
                            print(
                                '[%d/%d][%d/%d] train_loss: %.4f, Accuracy: %.4f'
                                % (epoch, params['niter'], i, len(data_iter),
                                   total_loss / i, total_acc / i))

                        batch_loss.backward()
                        encoder_optimizer.step()
                        decoder_optimizer.step()

                avg_loss = total_loss / len(data_iter)
                avg_acc = total_acc / len(data_iter)

                if is_train:
                    PATH = os.path.join(output_dir, 'enc_dec_model.pth')
                    torch.save(
                        {
                            'encoder_state_dict':
                            encoder.state_dict(),
                            'decoder_state_dict':
                            decoder.state_dict(),
                            'encoder_optimizer_state_dict':
                            encoder_optimizer.state_dict(),
                            'decoder_optimizer_state_dict':
                            decoder_optimizer.state_dict(),
                            'encoder_LR_scheduler':
                            encoder_LR_scheduler.state_dict(),
                            'decoder_LR_scheduler':
                            decoder_LR_scheduler.state_dict(),
                        }, PATH)

                    writer.add_scalars('data', {
                        'train_loss': avg_loss,
                        'train_acc': avg_acc
                    }, epoch)
                else:
                    print('Calculating Validation loss')
                    print('val_loss: %.4f, Accuracy: %.4f' %
                          (avg_loss, avg_acc))

                    encoder_LR_scheduler.step(avg_loss)
                    decoder_LR_scheduler.step(avg_loss)

                    writer.add_scalars('data', {
                        'val_loss': avg_loss,
                        'val_acc': avg_acc
                    }, epoch)

    writer.close()
Example No. 23
            # predicted: (batch, trg_len-1, trg_vocab_size)
            predicted = predicted.reshape(-1, predicted.shape[-1])
            # predicted: (*, trg_vocab_size)
            trg = trg[:, 1:].reshape(-1)
            # trg: (*, )

            loss = criterion(predicted, trg)

            epoch_loss += loss.item()

    return epoch_loss / len(data_iter)


train_data, val_data, test_data, SRC, TRG = utils.read_data(batch_first=True)
train_iter, val_iter = Iterator.splits((train_data, val_data),
                                       batch_size=BATCH_SIZE,
                                       shuffle=True,
                                       sort=False)
test_iter = Iterator(test_data,
                     batch_size=BATCH_SIZE,
                     shuffle=False,
                     sort=False)

SRC_PAD_IDX = SRC.vocab.stoi['<pad>']
TRG_PAD_IDX = TRG.vocab.stoi['<pad>']

model = Transformer(len(SRC.vocab), len(TRG.vocab), MAX_LEN, MODEL_SIZE,
                    FF_SIZE, KEY_SIZE, VALUE_SIZE, NUM_HEADS, NUM_LAYERS,
                    DROPOUT, SRC_PAD_IDX, TRG_PAD_IDX).to(DEVICE)
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
opt = AdamWrapper(model.parameters(), MODEL_SIZE, WARMUP)
Example No. 24
def main(params):
    try:
        output_dir = os.path.join(
            params['outf'], datetime.strftime(datetime.now(), "%Y%m%d_%H%M"))
        os.makedirs(output_dir)
    except OSError:
        pass

    writer = SummaryWriter(output_dir)
    if torch.cuda.is_available() and not params['cuda']:
        print(
            "WARNING: You have a CUDA device, so you should probably run with --cuda"
        )

    TEXT = Field(sequential=True,
                 use_vocab=True,
                 tokenize=tokenizer,
                 lower=True,
                 batch_first=True)
    LABEL = Field(sequential=False,
                  use_vocab=False,
                  is_target=True,
                  batch_first=True)
    IMG_IND = Field(sequential=False, use_vocab=False, batch_first=True)

    fields = {
        'ans': ('ans', LABEL),
        'img_ind': ('img_ind', IMG_IND),
        'question': ('question', TEXT)
    }

    train, val = TabularDataset.splits(path=params['dataroot'],
                                       train=params['input_train'],
                                       validation=params['input_test'],
                                       format='csv',
                                       skip_header=False,
                                       fields=fields)

    print("Train data")
    print(train[0].__dict__.keys())
    print(train[0].ans, train[0].img_ind, train[0].question)

    print("Validation data")
    print(val[0].__dict__.keys())
    print(val[0].ans, val[0].img_ind, val[0].question)

    print("Building Vocabulary ..")
    TEXT.build_vocab(train, vectors='glove.6B.100d')
    vocab = TEXT.vocab

    print("Creating Embedding from vocab vectors ..")
    params['vocab'] = vocab

    vqa_model = model.Model(params)

    print(vqa_model)

    if params['use_checkpoint']:
        checkpoint = torch.load(params['mcq_model'])
        vqa_model.load_state_dict(checkpoint['model_state_dict'])
        vqa_model.hidden = checkpoint['lstm_hidden']

    criterion = torch.nn.CrossEntropyLoss()

    if params['cuda']:
        vqa_model.cuda()
        criterion.cuda()

    optimizer = torch.optim.Adam(vqa_model.parameters(), lr=params['lr'])

    train_iter, val_iter = Iterator.splits(
        (train, val),
        batch_sizes=(params['batch_size'], params['batch_size']),
        sort_within_batch=False,
        sort=False)

    for epoch in range(1, params['niter'] + 1):

        total_val_loss = 0
        total_val_matches = 0
        total_train_loss = 0
        total_train_matches = 0

        for i, row in enumerate(train_iter):

            vqa_model.train()
            # Starting each batch, we detach the hidden state from how it was previously produced.
            # If we didn't, the model would try backpropagating all the way to start of the dataset.
            if len(row) < params['batch_size']:
                continue
            vqa_model.hidden = repackage_hidden(vqa_model.hidden)
            vqa_model.zero_grad()
            ans, img_ind, question = row.ans, row.img_ind, row.question

            batch_size = ans.size(0)

            if params['cuda']:
                ans = ans.cuda()
                img_ind = img_ind.cuda()
                question = question.cuda()
                vqa_model.hidden = tuple([v.cuda() for v in vqa_model.hidden])

            ans_var = Variable(ans)
            img_ind_var = Variable(img_ind)
            question_var = Variable(question)

            pred_ans = vqa_model(img_ind_var, question_var)

            train_loss = criterion(pred_ans, ans_var)

            pred_ind = pred_ans.max(dim=1)[1]
            train_acc = (pred_ind == ans_var).sum()

            total_train_loss += train_loss.item()
            total_train_matches += train_acc.item()

            train_loss.backward()
            optimizer.step()

            if i % 1000 == 0:
                print('[%d/%d][%d/%d] train_loss: %.4f' %
                      (epoch, params['niter'], i + 1, len(train_iter),
                       train_loss))

        vqa_model.eval()
        for row in val_iter:

            if len(row) < params['batch_size']:
                continue

            vqa_model.hidden = repackage_hidden(vqa_model.hidden)
            vqa_model.zero_grad()
            ans, img_ind, question = row.ans, row.img_ind, row.question

            batch_size = ans.size(0)

            if params['cuda']:
                ans = ans.cuda()
                img_ind = img_ind.cuda()
                question = question.cuda()
                vqa_model.hidden = tuple([v.cuda() for v in vqa_model.hidden])

            ans_var = Variable(ans)
            img_ind_var = Variable(img_ind)
            question_var = Variable(question)

            pred_ans = vqa_model(img_ind_var, question_var)

            val_loss = criterion(pred_ans, ans_var)
            pred_ind = pred_ans.max(dim=1)[1]
            val_acc = (pred_ind == ans_var).sum()
            total_val_loss += val_loss.item()
            total_val_matches += val_acc.item()

        print(
            '[%d/%d] train_loss: %.4f val_loss: %.4f train_acc: %.4f val_acc: %.4f'
            % (epoch, params['niter'], total_train_loss / len(train_iter),
               total_val_loss / len(val_iter), total_train_matches * 100 /
               len(train_iter) / params['batch_size'],
               total_val_matches * 100 / len(val_iter) / params['batch_size']))

        writer.add_scalars(
            'data', {
                'train_loss':
                train_loss,
                'train_acc':
                total_train_matches * 100 / len(train_iter) /
                params['batch_size'],
                'val_loss':
                total_val_loss / len(val_iter),
                'val_acc':
                total_val_matches * 100 / len(val_iter) / params['batch_size']
            }, epoch)

        torch.save(
            {
                'lstm_hidden': vqa_model.hidden,
                'model_state_dict': vqa_model.state_dict()
            }, '%s/baseline_%d.pth' % (output_dir, epoch))

    writer.close()
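The training and validation loops above call a repackage_hidden helper that is not shown in this example. A sketch consistent with the detaching rationale in the comments (mirroring the standard PyTorch language-model example) could look like this:

def repackage_hidden(h):
    # Detach hidden state tensors from their history so gradients do not
    # propagate back across batch boundaries.
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)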
Example No. 25
### 3 Create the iterators

train_iter, valid_iter = BucketIterator.splits(
    (train, valid),
    batch_size=64,
    device='cpu',
    sort_key=lambda x: len(x.comment_text),  # which function is used to group the data
    sort_within_batch=False,
    repeat=False)

print(train_iter)
print(next(train_iter.__iter__()))

# A single dataset does not need Iterator.splits; the plain constructor returns one iterator.
test_iter = Iterator(test,
                     batch_size=64,
                     device=-1,
                     sort=False,
                     sort_within_batch=False,
                     repeat=False)
"""
This is for converting the numericalized sequences back to the original text
TEXT = ReversibleField(sequential=True, lower=True, include_lengths=True)
for data in train_iter:
    (x, x_lengths), y = data.Text, data.Description
    orig_text = TEXT.reverse(x.data)
    print(orig_text)
"""


class BatchWrapper():
    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars  # we pass in the list of attributes for x and y
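
# The snippet above is cut off after __init__. For reference, a minimal sketch of how
# such a wrapper is commonly completed (this completion is an assumption based on the
# usual torchtext tutorial pattern, not part of the original example): __iter__ yields
# (x, y) tensor pairs pulled out of each torchtext Batch, and __len__ delegates to the
# wrapped iterator.
import torch


class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars

    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var)  # the text tensor
            if self.y_vars is not None:
                # stack the label columns into a single (batch, n_labels) tensor
                y = torch.cat([getattr(batch, feat).unsqueeze(1)
                               for feat in self.y_vars], dim=1).float()
            else:
                y = torch.zeros((1,))
            yield (x, y)

    def __len__(self):
        return len(self.dl)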
Exemplo n.º 26
0
    def test_multinli(self):
        batch_size = 4

        # create fields
        TEXT = ParsedTextField()
        TREE = ShiftReduceField()
        GENRE = LabelField()
        LABEL = LabelField()

        # create train/val/test splits
        train, val, test = MultiNLI.splits(TEXT, LABEL, TREE, GENRE)

        # check all are MultiNLI datasets
        assert type(train) == type(val) == type(test) == MultiNLI

        # check all have correct number of fields
        assert len(train.fields) == len(val.fields) == len(test.fields) == 6

        # check fields are the correct type
        assert type(train.fields['premise']) == ParsedTextField
        assert type(train.fields['premise_transitions']) == ShiftReduceField
        assert type(train.fields['hypothesis']) == ParsedTextField
        assert type(train.fields['hypothesis_transitions']) == ShiftReduceField
        assert type(train.fields['label']) == LabelField
        assert type(train.fields['genre']) == LabelField

        assert type(val.fields['premise']) == ParsedTextField
        assert type(val.fields['premise_transitions']) == ShiftReduceField
        assert type(val.fields['hypothesis']) == ParsedTextField
        assert type(val.fields['hypothesis_transitions']) == ShiftReduceField
        assert type(val.fields['label']) == LabelField
        assert type(val.fields['genre']) == LabelField

        assert type(test.fields['premise']) == ParsedTextField
        assert type(test.fields['premise_transitions']) == ShiftReduceField
        assert type(test.fields['hypothesis']) == ParsedTextField
        assert type(test.fields['hypothesis_transitions']) == ShiftReduceField
        assert type(test.fields['label']) == LabelField
        assert type(test.fields['genre']) == LabelField

        # check each is the correct length
        assert len(train) == 392702
        assert len(val) == 9815
        assert len(test) == 9832

        # build vocabulary
        TEXT.build_vocab(train)
        LABEL.build_vocab(train)
        GENRE.build_vocab(train)

        # ensure vocabulary has been created
        assert hasattr(TEXT, 'vocab')
        assert hasattr(TEXT.vocab, 'itos')
        assert hasattr(TEXT.vocab, 'stoi')

        # create iterators
        train_iter, val_iter, test_iter = Iterator.splits(
            (train, val, test), batch_size=batch_size)

        # get a batch to test
        batch = next(iter(train_iter))

        # split premise and hypothesis from tuples to tensors
        premise, premise_transitions = batch.premise
        hypothesis, hypothesis_transitions = batch.hypothesis
        label = batch.label
        genre = batch.genre

        # check each is actually a tensor
        assert type(premise) == torch.Tensor
        assert type(premise_transitions) == torch.Tensor
        assert type(hypothesis) == torch.Tensor
        assert type(hypothesis_transitions) == torch.Tensor
        assert type(label) == torch.Tensor
        assert type(genre) == torch.Tensor

        # check have the correct batch dimension
        assert premise.shape[-1] == batch_size
        assert premise_transitions.shape[-1] == batch_size
        assert hypothesis.shape[-1] == batch_size
        assert hypothesis_transitions.shape[-1] == batch_size
        assert label.shape[-1] == batch_size
        assert genre.shape[-1] == batch_size

        # repeat the same tests with iters instead of split
        train_iter, val_iter, test_iter = MultiNLI.iters(batch_size=batch_size,
                                                         trees=True)

        # get a fresh batch from the new iterators to test
        batch = next(iter(train_iter))

        # split premise and hypothesis from tuples to tensors
        premise, premise_transitions = batch.premise
        hypothesis, hypothesis_transitions = batch.hypothesis
        label = batch.label

        # check each is actually a tensor
        assert type(premise) == torch.Tensor
        assert type(premise_transitions) == torch.Tensor
        assert type(hypothesis) == torch.Tensor
        assert type(hypothesis_transitions) == torch.Tensor
        assert type(label) == torch.Tensor

        # check have the correct batch dimension
        assert premise.shape[-1] == batch_size
        assert premise_transitions.shape[-1] == batch_size
        assert hypothesis.shape[-1] == batch_size
        assert hypothesis_transitions.shape[-1] == batch_size
        assert label.shape[-1] == batch_size

        # remove downloaded multinli directory
        shutil.rmtree('.data/multinli')
Exemplo n.º 27
0
def train(args: Dict):
    """ Train the NMT Model.
    @param args (Dict): args from cmd line
    """
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')

    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    # The entire train/val/test split is done here in code, rather than in a separate preprocessing script.

    #data preprocessing for Qs and As.
    spacy_en = spacy.load('en')

    def tokenizer(text):  # create a tokenizer function
        return [tok.text for tok in spacy_en.tokenizer(text)]

    TEXT = Field(sequential=True,
                 tokenize=tokenizer,
                 lower=True,
                 include_lengths=True,
                 init_token='<s>',
                 eos_token='</s>')
    analogies_datafields = [("abc", TEXT), ("d", TEXT)]

    train, val, test = TabularDataset.splits(
        path="data",  # the root directory where the data lies
        train='ngram_train.csv',
        validation="ngram_val.csv",
        test='ngram_test.csv',
        format='csv',
        skip_header=False,  # if your csv file has a header row, set this so it doesn't get processed as data!
        fields=analogies_datafields)

    pretrained_vecs = torchtext.vocab.Vectors('../GloVe-1.2/life_vectors.txt')
    # build the vocabulary from the training split (a dataset argument is required;
    # otherwise the vocab would contain only the special tokens)
    TEXT.build_vocab(train, vectors=pretrained_vecs)  # specials=['<pad>', '<s>', '</s>']

    if args['--cuda'] == 'cpu':
        torch_text_device = -1
    else:
        torch_text_device = 0

    training_iter, val_iter, test_iter = Iterator.splits(
        (train, val, test),
        sort_key=lambda x: len(x.abc),
        batch_sizes=(100, 20, 1),
        device=torch_text_device,
        sort_within_batch=True)

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=TEXT.vocab)
    model.train()  #sets training = True

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' %
              (uniform_init, uniform_init),
              file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    # treat '--cuda cpu' (or a missing flag) as CPU, anything else as the first GPU
    device = torch.device("cuda:0" if args['--cuda'] and args['--cuda'] != 'cpu' else "cpu")
    print('use device: %s' % device, file=sys.stderr)

    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')

    writer = SummaryWriter('logs')
    is_better_count = 0  #TODO: Remove this and debug the nonstopping part
    while True:
        epoch += 1

        for _, data in enumerate(training_iter):
            (src_sents, src_lengths), (tgt_sents, _) = data.abc, data.d

            train_iter += 1

            optimizer.zero_grad()

            batch_size = src_sents.shape[1]

            example_losses = model(src_sents, src_lengths,
                                   tgt_sents)  # (batch_size,)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size

            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       clip_grad)

            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(
                len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter,
                                                                                         report_loss / report_examples,
                                                                                         math.exp(report_loss / report_tgt_words),
                                                                                         cum_examples,
                                                                                         report_tgt_words / (time.time() - train_time),
                                                                                         time.time() - begin_time), file=sys.stderr)

                writer.add_scalar('Train/AvgLoss',
                                  report_loss / report_examples, epoch)
                writer.add_scalar('Train/AvgPPL',
                                  math.exp(report_loss / report_tgt_words),
                                  epoch)

                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print(
                    'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d'
                    % (epoch, train_iter, cum_loss / cum_examples,
                       np.exp(cum_loss / cum_tgt_words), cum_examples),
                    file=sys.stderr)

                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)

                # compute dev. ppl and bleu
                dev_ppl, val_loss = evaluate_ppl(
                    model, val_iter)  # dev batch size can be a bit larger
                valid_metric = -dev_ppl

                print('validation: iter %d, dev. ppl %f, dev loss %f' %
                      (train_iter, dev_ppl, val_loss),
                      file=sys.stderr)
                writer.add_scalar('Val/AvgPPL', dev_ppl, epoch)
                writer.add_scalar('Val/AvgLoss', val_loss, epoch)

                is_better = len(hist_valid_scores
                                ) == 0 or valid_metric > max(hist_valid_scores)
                print(hist_valid_scores)
                print(valid_metric)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' %
                          model_save_path,
                          file=sys.stderr)
                    model.save(model_save_path)
                    is_better_count = is_better_count + 1
                    print(is_better_count)
                    # also save the optimizers' state
                    torch.save(optimizer.state_dict(),
                               model_save_path + '.optim')
                    if is_better_count > 3:
                        print('validation improved more than 3 times; stopping early (see TODO above)',
                              file=sys.stderr)
                        writer.close()
                        exit(0)

                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * float(
                            args['--lr-decay'])
                        print(
                            'load previously best model and decay learning rate to %f'
                            % lr,
                            file=sys.stderr)

                        # load model
                        params = torch.load(
                            model_save_path,
                            map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers',
                              file=sys.stderr)
                        optimizer.load_state_dict(
                            torch.load(model_save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

                if epoch == int(args['--max-epoch']):
                    print('reached maximum number of epochs!', file=sys.stderr)
                    writer.close()
                    exit(0)
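
# evaluate_ppl is called above but not shown in this example. A minimal sketch of how
# it could be written, assuming the same batch layout as the training loop (fields
# `abc`/`d`, a model returning per-example losses); the original project's version may differ.
import math

import torch


def evaluate_ppl(model, data_iter):
    """Return (perplexity per target word, average loss per example) over an iterator."""
    was_training = model.training
    model.eval()

    cum_loss = 0.0
    cum_tgt_words = 0
    cum_examples = 0

    with torch.no_grad():
        for data in data_iter:
            (src_sents, src_lengths), (tgt_sents, _) = data.abc, data.d
            example_losses = model(src_sents, src_lengths, tgt_sents)  # (batch_size,)
            cum_loss += example_losses.sum().item()
            cum_tgt_words += sum(len(s[1:]) for s in tgt_sents)  # omit the leading <s>
            cum_examples += src_sents.shape[1]

    if was_training:
        model.train()

    return math.exp(cum_loss / cum_tgt_words), cum_loss / cum_examples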
Exemplo n.º 28
0
def train(train_dir: str, config: Dict, force: bool = False,
          metric_logger: Optional[Callable] = None, device: Optional[torch.device] = None,
          verbose: bool = True):
    train_dir = Path(train_dir)
    if train_dir.exists() and force:
        shutil.rmtree(train_dir)
    train_dir.mkdir(parents=True, exist_ok=False)

    params_file = train_dir / "config.jsonnet"
    with params_file.open('w') as fp:
        json.dump(config, fp, indent=4)
    params = Params(config)
    pprint(f"Config:")
    pprint(config)

    writer = SummaryWriter(logdir=str(train_dir))

    training_params = params.pop('training')
    dataset_params = params.pop('dataset')
    sampling_params = params.pop('sampling')
    sampling_temperatures = sampling_params.get('temperature', [1.0])
    if isinstance(sampling_temperatures, (int, float)):
        sampling_temperatures = [sampling_temperatures]

    dataset_name = dataset_params.pop('name', "PTB")
    # TODO: unify datasets creation
    if dataset_name == "PTB":
        TEXT = Field(sequential=True, use_vocab=True, lower=True,
                     init_token=SOS_TOKEN, eos_token=EOS_TOKEN,
                     pad_token=PAD_TOKEN, unk_token=UNK_TOKEN,
                     tokenize=lambda x: x.strip().split(), include_lengths=True)
        fields = (('inp', TEXT), ('trg', TEXT))
        train_data, dev_data, test_data = PTB.splits(fields=fields)
    elif dataset_name == "YelpReview":
        TEXT = Field(sequential=True, use_vocab=True, lower=True,
                     init_token=SOS_TOKEN, eos_token=EOS_TOKEN,
                     pad_token=PAD_TOKEN, unk_token=UNK_TOKEN,
                     tokenize="spacy", include_lengths=True)
        fields = (('inp', TEXT), ('trg', TEXT))
        train_data, dev_data, test_data = YelpReview.splits(fields=fields,
                                                            num_samples=120_000,
                                                            split_ratio=[100_000, 10_000, 10_000],
                                                            max_len=150,
                                                            verbose=verbose)
    elif dataset_name == "YahooAnswers":
        TEXT = Field(sequential=True, use_vocab=True, lower=True,
                     init_token=SOS_TOKEN, eos_token=EOS_TOKEN,
                     pad_token=PAD_TOKEN, unk_token=UNK_TOKEN,
                     tokenize="spacy", include_lengths=True)
        fields = (('inp', TEXT), ('trg', TEXT))
        train_data, dev_data, test_data = YahooAnswers.splits(fields=fields,
                                                              num_samples=120_000,
                                                              split_ratio=[100_000, 10_000, 10_000],
                                                              max_len=200,
                                                              verbose=verbose)
    else:
        raise ValueError(f"Dataset {dataset_name} is not supported!")

    TEXT.build_vocab(train_data, max_size=20_000)

    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"Running on device: {device}")

    train_iter, dev_iter, test_iter = Iterator.splits(
        datasets=(train_data, dev_data, test_data),
        batch_sizes=(training_params.batch_size,
                     training_params.test_batch_size,
                     training_params.test_batch_size),
        shuffle=True,
        sort_within_batch=True,
        sort_key=lambda x: len(x.inp),
        device=device
    )

    model_params = params.pop('model')
    model_type = model_params.pop('model_type')
    if model_type == 'svae':
        model = RecurrentVAE(vocab=TEXT.vocab, params=model_params)
    elif model_type == 'ivae':
        model = DilatedConvVAE(vocab=TEXT.vocab, params=model_params)
    else:
        raise ValueError(f"Unsupported model type: {model_type}")
    model.to(device)
    optimizer = optim.Adam(params=model.parameters(), **training_params.pop('optimizer'))

    scheduler = None
    scheduler_params = training_params.pop('lr_scheduler', None)
    if scheduler_params is not None:
        scheduler = WarmUpDecayLR(optimizer=optimizer, **scheduler_params)

    iters = 0
    for epoch in range(training_params.epochs):
        if verbose:
            print("#" * 20)
            print(f"EPOCH {epoch}\n")
        # Training
        model.train()
        for batch in tqdm(train_iter, desc='Training', disable=not verbose):
            iters += 1
            output = model(batch)
            loss = output['loss']
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()
            writer.add_scalar('train/ELBO', -output['rec_loss'] - output['kl_loss'], iters)
            writer.add_scalar('train/rec_loss', output['rec_loss'], iters)
            writer.add_scalar('train/kl_loss', output['kl_loss'], iters)
            writer.add_scalar('train/kl_weight', output['kl_weight'], iters)
        metrics = model.get_metrics(reset=True)
        for metric, value in metrics.items():
            writer.add_scalar(f'train/{metric}', value, epoch)
        if metric_logger is not None:
            metric_logger({f"train_{key}": value for key, value in metrics.items()}, epoch)
        # Validation
        model.eval()
        with torch.no_grad():
            for batch in tqdm(dev_iter, desc='Validation', disable=not verbose):
                _ = model(batch)
            valid_metrics = model.get_metrics(reset=True)
            for metric, value in valid_metrics.items():
                writer.add_scalar(f'dev/{metric}', value, epoch)
            if metric_logger is not None:
                metric_logger({f"valid_{key}": value for key, value in valid_metrics.items()}, epoch)
        if verbose:
            for temperature in sampling_temperatures:
                print("#" * 20)
                print(f"Sentence samples. Temperature: {temperature}")
                samples = model.sample(num_samples=10,
                                       temperature=temperature,
                                       device=device,
                                       max_len=sampling_params.get('max_len', 50))
                print(*samples, sep='\n')
        if scheduler_params is not None:
            scheduler.step()

    with (train_dir / 'TEXT.Field').open("wb") as fp:
        dill.dump(TEXT, fp)
    save_checkpoint(model.state_dict(), train_dir)

    if params.get('eval_on_test', False):
        if verbose:
            print("Evaluating model on test data...")
        model.eval()
        with torch.no_grad():
            for batch in tqdm(test_iter, desc='Test set evaluation', disable=not verbose):
                _ = model(batch)
            test_metrics = model.get_metrics(reset=True)
            if verbose:
                for metric, value in test_metrics.items():
                    print(f"{metric}: {value}")
            if metric_logger is not None:
                metric_logger(test_metrics)

    writer.close()
Exemplo n.º 29
0
def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and save the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param args (Dict): args from cmd line
    """
    spacy_en = spacy.load('en')

    def tokenizer(text):  # create a tokenizer function
        return [tok.text for tok in spacy_en.tokenizer(text)]

    TEXT = Field(sequential=True,
                 tokenize=tokenizer,
                 lower=True,
                 include_lengths=True,
                 init_token='<s>',
                 eos_token='</s>')
    analogies_datafields = [("abc", TEXT), ("d", TEXT)]

    train, val, test = TabularDataset.splits(
        path="data",  # the root directory where the data lies
        train='ngram_train.csv',
        validation="ngram_val.csv",
        test='ngram_test.csv',
        format='csv',
        skip_header=False,  # if your csv file has a header row, set this so it doesn't get processed as data!
        fields=analogies_datafields)

    pretrained_vecs = torchtext.vocab.Vectors('../GloVe-1.2/life_vectors.txt')
    # build the vocabulary from the training split (a dataset argument is required;
    # otherwise the vocab would contain only the special tokens)
    TEXT.build_vocab(train, vectors=pretrained_vecs)  # specials=['<pad>', '<s>', '</s>']

    if args['--cuda'] == 'cpu':
        torch_text_device = -1
    else:
        torch_text_device = 0

    training_iter, val_iter, test_iter = Iterator.splits(
        (train, val, test),
        sort_key=lambda x: len(x.abc),
        batch_sizes=(100, 20, 1),
        device=torch_text_device,
        sort_within_batch=True)

    print("load test source sentences from [{}]".format(
        args['TEST_SOURCE_FILE']),
          file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(
            args['TEST_TARGET_FILE']),
              file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'])

    if args['--cuda'] and args['--cuda'] != 'cpu':
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(model,
                             test_iter,
                             beam_size=int(args['--beam-size']),
                             max_decoding_time_step=int(
                                 args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt,
                                                     top_hypotheses)

        #accuracy (unigrams)
        perfectly_correct = 0
        for index, hyp in enumerate(top_hypotheses):
            if hyp.value[0] == test_data_tgt[index][1]:
                perfectly_correct += 1
        print('Note: accuracy is only meaningful for unigram targets')
        print('Accuracy: {}'.format(perfectly_correct / len(test_data_tgt)),
              file=sys.stderr)
        print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')
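
# compute_corpus_level_bleu_score is referenced above but not defined in this example.
# A minimal sketch of such a helper, assuming NLTK is available and that each hypothesis
# stores its tokens in a `.value` attribute (as used above); the original project's
# implementation may differ.
from nltk.translate.bleu_score import corpus_bleu


def compute_corpus_level_bleu_score(references, hypotheses):
    """references: list of gold token lists; hypotheses: list of beam-search Hypothesis objects."""
    # strip <s> / </s> markers from the references if present
    if references and references[0][0] == '<s>':
        references = [ref[1:-1] for ref in references]
    # corpus_bleu expects one list of reference translations per hypothesis
    return corpus_bleu([[ref] for ref in references],
                       [hyp.value for hyp in hypotheses])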
Exemplo n.º 30
0
    def test_xnli(self):
        batch_size = 4

        # create fields
        TEXT = Field()
        GENRE = LabelField()
        LABEL = LabelField()
        LANGUAGE = LabelField()

        # create val/test splits, XNLI does not have a test set
        val, test = XNLI.splits(TEXT, LABEL, GENRE, LANGUAGE)

        # check both are XNLI datasets
        assert type(val) == type(test) == XNLI

        # check all have the correct number of fields
        assert len(val.fields) == len(test.fields) == 5

        # check fields are the correct type
        assert type(val.fields['premise']) == Field
        assert type(val.fields['hypothesis']) == Field
        assert type(val.fields['label']) == LabelField
        assert type(val.fields['genre']) == LabelField
        assert type(val.fields['language']) == LabelField

        assert type(test.fields['premise']) == Field
        assert type(test.fields['hypothesis']) == Field
        assert type(test.fields['label']) == LabelField
        assert type(test.fields['genre']) == LabelField
        assert type(test.fields['language']) == LabelField

        # check each is the correct length
        assert len(val) == 37350
        assert len(test) == 75150

        # build vocabulary
        TEXT.build_vocab(val)
        LABEL.build_vocab(val)
        GENRE.build_vocab(val)
        LANGUAGE.build_vocab(val)

        # ensure vocabulary has been created
        assert hasattr(TEXT, 'vocab')
        assert hasattr(TEXT.vocab, 'itos')
        assert hasattr(TEXT.vocab, 'stoi')

        # create iterators
        val_iter, test_iter = Iterator.splits((val, test),
                                              batch_size=batch_size)

        # get a batch to test
        batch = next(iter(val_iter))

        # split premise and hypothesis from tuples to tensors
        premise = batch.premise
        hypothesis = batch.hypothesis
        label = batch.label
        genre = batch.genre
        language = batch.language

        # check each is actually a tensor
        assert type(premise) == torch.Tensor
        assert type(hypothesis) == torch.Tensor
        assert type(label) == torch.Tensor
        assert type(genre) == torch.Tensor
        assert type(language) == torch.Tensor

        # check have the correct batch dimension
        assert premise.shape[-1] == batch_size
        assert hypothesis.shape[-1] == batch_size
        assert label.shape[-1] == batch_size
        assert genre.shape[-1] == batch_size
        assert language.shape[-1] == batch_size

        # xnli cannot use the iters method, ensure raises error
        with self.assertRaises(NotImplementedError):
            val_iter, test_iter = XNLI.iters(batch_size=batch_size)

        # remove downloaded xnli directory
        shutil.rmtree('.data/xnli')