def tokenize(self):
    ENGLISH = Field(sequential=True, use_vocab=True, tokenize=str.split, lower=True,
                    init_token="<sos>", eos_token="<eos>")
    FRENCH = Field(sequential=True, use_vocab=True, tokenize=str.split, lower=True,
                   init_token="<sos>", eos_token="<eos>")
    # In order for this to work, change "csv.field_size_limit(sys.maxsize)"
    # in torchtext/utils.py to "csv.field_size_limit(maxInt)".
    train, test = TabularDataset.splits(path='./data/', train='train.csv', test='test.csv',
                                        format='csv', fields=[('en', ENGLISH), ('fr', FRENCH)])
    ENGLISH.build_vocab(train, test)
    FRENCH.build_vocab(train, test)
    self.en_vocab = ENGLISH
    self.fr_vocab = FRENCH
    self.en_vocabsize = len(ENGLISH.vocab)
    self.fr_vocabsize = len(FRENCH.vocab)
    if self.config.debug:
        train_loader, test_loader = Iterator.splits((train, test), batch_size=2, device="cuda",
                                                    shuffle=False, sort_key=lambda x: len(x.en),
                                                    sort_within_batch=False)
    else:
        train_loader, test_loader = Iterator.splits((train, test), batch_size=self.config.batchsize,
                                                    device="cuda", shuffle=False,
                                                    sort_key=lambda x: len(x.en),
                                                    sort_within_batch=False)
    return train_loader, test_loader
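# Rather than patching torchtext/utils.py as the note above suggests, the csv field-size
# limit can usually be raised from the calling code before the TabularDataset is built.
# A minimal sketch (assuming the standard-library csv module is the one torchtext uses):
import csv
import sys

max_int = sys.maxsize
while True:
    # csv.field_size_limit raises OverflowError if the value exceeds a C long,
    # so back off by a factor of ten until it is accepted.
    try:
        csv.field_size_limit(max_int)
        break
    except OverflowError:
        max_int = int(max_int / 10)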
def data_split(text_field, label_field, dataset, mode=False):
    if mode == 'init':
        for index, c in enumerate(dataset):
            partial = NLPDataLoader(c, text_field=text_field, label_field=label_field, test=False)
            if index == 0:
                text_field.build_vocab(partial)
                label_field.build_vocab(list(range(13)))
            else:
                text_counter = text_field.vocab.freqs
                for example in partial.examples:
                    text_counter.update(example.text)
                text_field.vocab = text_field.vocab_cls(text_counter, specials=['<unk>', '<pad>'])
        return
    elif mode is False:
        dataset = NLPDataLoader(dataset, text_field=text_field, label_field=label_field, test=False)
        return Iterator.splits((dataset, ), batch_size=20)
    elif mode is True:
        dataset = NLPDataLoader(dataset, text_field=text_field, label_field=label_field, test=True)
        return Iterator.splits((dataset, ), batch_size=20, shuffle=False)
def __init__(self, config):
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    self.batch_size = config['batch_size']
    self.pad_id = self.tokenizer._convert_token_to_id("[PAD]")

    # Objects in which the data will be stored.
    self.text = Field(sequential=True, lower=True, tokenize=self.tokenizer.tokenize,
                      batch_first=True, pad_token='[PAD]', unk_token='[UNK]')
    self.labels = Field(sequential=False, is_target=True)

    self.train, self.dev, self.test = MultiNLI.splits(self.text, self.labels)

    # Builds vocabulary for the data.
    self.text.build_vocab(self.train, self.dev, self.test)
    self.labels.build_vocab(self.train)

    # Standard torchtext iterators; these do not return input suitable for BERT.
    self.train_iter, self.dev_iter, self.test_iter = Iterator.splits(
        (self.train, self.dev, self.test),
        batch_size=config['batch_size'],
        device=config['device'])
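# The comment above notes that plain torchtext batches are not directly usable by BERT:
# they carry torchtext vocabulary ids and lack the [CLS]/[SEP] markers. A minimal sketch
# of the kind of post-processing such a loader might add (the helper name and the
# premise/hypothesis layout are assumptions, not part of the original example):
def to_bert_input(self, batch):
    """Hypothetical helper: rebuild WordPiece input ids from a torchtext MultiNLI batch."""
    itos = self.text.vocab.itos
    input_ids = []
    for prem_row, hyp_row in zip(batch.premise.tolist(), batch.hypothesis.tolist()):
        prem = [itos[i] for i in prem_row if itos[i] != '[PAD]']
        hyp = [itos[i] for i in hyp_row if itos[i] != '[PAD]']
        tokens = ['[CLS]'] + prem + ['[SEP]'] + hyp + ['[SEP]']
        input_ids.append(self.tokenizer.convert_tokens_to_ids(tokens))
    return input_ids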
def iters(cls, path, vectors_name, vectors_cache, batch_size=64, vectors=None,
          unk_init=uniform_unk_init(), device="cuda:0",
          train="train.tsv", dev="dev.tsv", test="test.tsv"):
    if vectors is None:
        vectors = Vectors(name=vectors_name, cache=vectors_cache, unk_init=unk_init)
    train, val, test = cls.splits(path, train=train, dev=dev, test=test)
    cls.TEXT_FIELD.build_vocab(train, val, test, vectors=vectors)
    sort_within_batch = False
    if sort_within_batch:
        print("SORTING WITHIN BATCH!!!!!!!!!!!!!!!!!!!!!!!")
    return Iterator.splits((train, val, test), batch_size=batch_size, repeat=False,
                           sort_within_batch=sort_within_batch, device=device, sort=False)
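# The unk_init default above comes from a uniform_unk_init() factory that is not shown in
# this snippet. A plausible sketch of such a factory (an assumption, not the original
# definition): torchtext calls unk_init(tensor) on every out-of-vocabulary embedding,
# so the factory just closes over a uniform range.
def uniform_unk_init(a=-0.25, b=0.25):
    return lambda tensor: tensor.uniform_(a, b)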
def __init__(self, options): print("preparing the dataset for training...") self.TEXT = Field(lower=True, tokenize='spacy', batch_first=True) self.LABEL = Field(sequential=False, unk_token=None, is_target=True) # Since MNLI does not provide public test data # self.dev - MultiNLI Matched data # self.test - MultiNLI Mismatched data # To evaluate your system on the full test set, use the following Kaggle in Class competitions. # https://www.kaggle.com/c/multinli-matched-open-evaluation # https://www.kaggle.com/c/multinli-mismatched-open-evaluation self.train, self.dev, self.test = datasets.MultiNLI.splits( self.TEXT, self.LABEL) self.TEXT.build_vocab(self.train, self.dev) self.LABEL.build_vocab(self.train) vector_cache_loc = '.vector_cache/multinli_vectors.pt' if os.path.isfile(vector_cache_loc): self.TEXT.vocab.vectors = torch.load(vector_cache_loc) else: self.TEXT.vocab.load_vectors('glove.840B.300d') makedirs(os.path.dirname(vector_cache_loc)) torch.save(self.TEXT.vocab.vectors, vector_cache_loc) self.train_iter, self.dev_iter, self.test_iter = Iterator.splits( (self.train, self.dev, self.test), batch_size=options['batch_size'], device=options['device'], sort_key=lambda x: len(x.premise), sort_within_batch=False, shuffle=True)
def create_iter(self, batch_size):
    """Build the iterators.

    :param batch_size: size of each batch
    :return: iter
    """
    # Define the torchtext Fields.
    fields = [('english', self.english), ('chinese', self.chinese)]
    examples = []
    # Build the English-Chinese examples.
    for en, ch in zip(self.english_list, self.chinese_list):
        item = [en, ch]
        examples.append(data.Example.fromlist(item, fields))
    # Split into training and test sets.
    train, test = Dataset(examples=examples, fields=fields).split(split_ratio=0.8)
    self.english.build_vocab(train)
    self.chinese.build_vocab(train)
    self.english_voca_size = len(self.english.vocab)
    self.chinese_voca_size = len(self.chinese.vocab)
    train_iter, test_iter = Iterator.splits(
        (train, test),
        batch_sizes=(batch_size, len(test)),
        sort_key=lambda x: len(x.english),
        sort_within_batch=True,
        device=-1)
    return train_iter, test_iter
def __init__(self, batch_size):
    self.text = Field(
        lower=True,
        tokenize=lambda x: [tok.text for tok in spacy_en.tokenizer(x)],
        batch_first=True)
    self.label = Field(sequential=False, unk_token=None, is_target=True)

    self.train, self.dev, self.test = SNLI.splits(self.text, self.label)
    self.sizes = {
        'train': len(self.train),
        'val': len(self.dev),
        'test': len(self.test)
    }
    self.text.build_vocab(self.train, self.dev)
    self.label.build_vocab(self.train)

    vector_cache_loc = '.vector_cache/snli_vectors.pt'
    if os.path.isfile(vector_cache_loc):
        self.text.vocab.vectors = torch.load(vector_cache_loc)
    else:
        self.text.vocab.load_vectors('glove.840B.300d')
        torch.save(self.text.vocab.vectors, vector_cache_loc)

    # Batching
    self.train_iter, self.dev_iter, self.test_iter = Iterator.splits(
        (self.train, self.dev, self.test),
        batch_size=batch_size,
        device='cuda:0' if torch.cuda.is_available() else 'cpu')

    self.vocab_size = len(self.text.vocab)
    self.out_dim = len(self.label.vocab)
    self.labels = self.label.vocab.stoi
def __init__(self, batch_size): self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') self.pad_id = self.tokenizer._convert_token_to_id("[PAD]") self.batch_size = batch_size # Objects in which the data will be stored. self.text = Field(sequential=True, lower=True, tokenize=self.tokenizer.tokenize, batch_first=True, pad_token='[PAD]', unk_token='[UNK]') self.labels = Field(sequential=False, is_target=True) self.train, self.dev, self.test = MultiNLI.splits( self.text, self.labels) # Builds vocabulary for the data. self.text.build_vocab(self.train, self.dev, self.test) self.labels.build_vocab(self.train) self.train_size = len(self.train) self.val_size = len(self.dev) self.test_size = len(self.test) self.name = 'mnli' # Standard torchtext iterators, these do not return input suitable for BERT. self.train_iter, self.dev_iter, self.test_iter = Iterator.splits( (self.train, self.dev, self.test), batch_size=self.batch_size, device=torch.device( 'cuda:0' if torch.cuda.is_available() else 'cpu'))
def dataloader(self):
    train_iter, valid_iter, test_iter = Iterator.splits(
        (self.train_data, self.valid_data, self.test_data),
        sort_within_batch=True,
        sort_key=lambda x: len(x.kor),
        batch_size=self.args.batch_size,
        device=device)
    return train_iter, valid_iter, test_iter
def load_dataset(text, label, args, **kwargs):
    train_dataset, dev_dataset, test_dataset = get_dataset('../data', text, label)
    text.build_vocab(train_dataset, dev_dataset, test_dataset)
    label.build_vocab(train_dataset, dev_dataset)
    train_data, dev_data, test_data = Iterator.splits(
        (train_dataset, dev_dataset, test_dataset),
        batch_sizes=(args.batch_size, len(dev_dataset), len(test_dataset)),
        sort_key=lambda x: len(x.text),
        **kwargs)
    return train_data, dev_data, test_data
def binary_classification(obj): tokenize = lambda x: x.split() TEXT = Field(sequential=True, tokenize=tokenize, lower=True, batch_first=True, fix_length=obj.fix_length) LABEL = Field(sequential=False, dtype=torch.float, batch_first=True, use_vocab=False) fields = [ ('id', None), ('content', TEXT), ('trump_percentage', LABEL), ] train_csv = 'twitter_pollster_' + str( obj.days) + '_days_train_trump_percentage.csv' test_csv = 'twitter_pollster_' + str( obj.days) + '_days_test_trump_percentage.csv' train_dataset = TabularDataset(path=obj.data_path + '/' + train_csv, format='csv', skip_header=True, fields=fields) test_dataset = TabularDataset(path=obj.data_path + '/' + test_csv, format='csv', skip_header=True, fields=fields) TEXT.build_vocab(train_dataset, vectors=GloVe(name=obj.Glove_name, dim=obj.embedding_dim)) vocab_size = len(TEXT.vocab) word_embeddings = TEXT.vocab.vectors print("vector size of text vocabulary: ", TEXT.vocab.vectors.size()) train_iter, test_iter = Iterator.splits( (train_dataset, test_dataset), sort_key=lambda x: len(x.content), batch_sizes=(obj.train_batch_size, obj.test_batch_size), device=torch.device(obj.device), sort_within_batch=True, repeat=False) train_iter_ = BatchWrapper(train_iter, 'content', ['trump_percentage']) test_iter_ = BatchWrapper(test_iter, 'content', ['trump_percentage']) return TEXT, vocab_size, word_embeddings, train_iter_, test_iter_
def create_iterators(self):
    '''
    train_iter, dev_iter, test_iter = BucketIterator.splits(
        (self.datasets['train'], self.datasets['dev'], self.datasets['test']),
        # batch_sizes=(self.args.batch_size, len(self.datasets['dev']), len(self.datasets['test'])),
        batch_sizes=(self.args.batch_size, self.args.batch_size, self.args.batch_size),
        device=self.args.device)
    '''
    # Iterator.splits expects a tuple of datasets; for a single dataset use Iterator directly.
    train_iter = Iterator(self.datasets['train'],
                          batch_size=self.args.batch_size,
                          device=self.args.device)
    dev_iter = Iterator(self.datasets['dev'],
                        batch_size=len(self.datasets['dev']),
                        device=self.args.device)
    test_iter = Iterator(self.datasets['test'],
                         batch_size=len(self.datasets['test']),
                         device=self.args.device)
    return train_iter, dev_iter, test_iter
def __prepare_train_data(self, X, y, sample_weight): self.__text_field = Field(lower=True) self.__label_field = Field(sequential=False) self.__text_field.tokenize = self.__tokenize sample_weight = None if sample_weight is None else list(sample_weight) sw = [1 for yi in y] if sample_weight is None else sample_weight s = y if Counter(y).most_common()[-1][1] > 1 else None X_t, X_d, y_t, y_d, w_t, _ = split(X, y, sw, shuffle=True, stratify=s, random_state=self.random_state, train_size=self.split_ratio) fields = [("text", self.__text_field), ("label", self.__label_field)] examples = [[X_t[i], y_t[i]] for i in range(len(X_t))] examples = [Example.fromlist(example, fields) for example in examples] weights = compute_sample_weight(self.class_weight, y_t) weights = [weights[i] * w_t[i] for i in range(len(y_t))] min_weight = min(weights) weights = [int(round(weight / min_weight)) for weight in weights] for i in range(len(X_t)): Xi = [X_t[i] for j in range(weights[i] - 1)] examples += [Example.fromlist([x, y_t[i]], fields) for x in Xi] train_data = Dataset(examples, fields) dev_data = [[X_d[i], y_d[i]] for i in range(len(X_d))] dev_data = [Example.fromlist(example, fields) for example in dev_data] dev_data = Dataset(dev_data, fields) self.__text_field.build_vocab(train_data, dev_data, vectors=self.vectors) self.__label_field.build_vocab(train_data, dev_data) batch_sizes = (self.batch_size, len(dev_data)) return Iterator.splits((train_data, dev_data), batch_sizes=batch_sizes, sort_key=lambda ex: len(ex.text), repeat=False)
def __init__(self, options): self.TEXT = Field(lower=True, tokenize='spacy', batch_first=True) self.LABEL = Field(sequential=False, unk_token=None, is_target=True) self.train, self.dev, self.test = datasets.SNLI.splits( self.TEXT, self.LABEL) self.TEXT.build_vocab(self.train, self.dev) self.LABEL.build_vocab(self.train) vector_cache_loc = '.vector_cache/snli_vectors.pt' if os.path.isfile(vector_cache_loc): self.TEXT.vocab.vectors = torch.load(vector_cache_loc) else: self.TEXT.vocab.load_vectors('glove.840B.300d') makedirs(os.path.dirname(vector_cache_loc)) torch.save(self.TEXT.vocab.vectors, vector_cache_loc) self.train_iter, self.dev_iter, self.test_iter = Iterator.splits( (self.train, self.dev, self.test), batch_size=options['batch_size'], device=options['device'])
def load_dataset(config, device): label_dict = {"observing": 0, "against": 1, "for": 2} LABEL = Field(use_vocab = False, sequential = False,\ dtype = torch.long, preprocessing = lambda x: label_dict[x.strip()]) SEQ = Field(dtype = torch.long, lower = True, batch_first = True,\ preprocessing = lambda x:x[:45], include_lengths = True) SENT = Field(dtype = torch.long, lower = True, batch_first = True,\ preprocessing = lambda x:x[:45], include_lengths = False) DOC = NestedField(SENT, tokenize = lambda s:s.strip().split(' </s> '), \ preprocessing = lambda s:[x for x in s[:45] if x], dtype = torch.long,\ include_lengths = True) fields = [('label', LABEL), ('claim', SEQ), ('hline', SEQ),\ ('abst', SEQ), ('body', DOC)] train, test = TabularDataset.splits(path="../stance_data/", format = "tsv",\ fields = fields, train = config.train_file, test = config.test_file) train, val = train.split(split_ratio=0.80) vectors = GloVe(name="6B", dim=config.embed_dim, cache='/users4/jwduan/vectors/') DOC.build_vocab(train, val, test, vectors=vectors) SEQ.build_vocab() SEQ.vocab = DOC.vocab config.vocab_size = len(DOC.vocab) train_loader, val_loader, test_loader = Iterator.splits((train, val, test),\ batch_sizes = (config.batch_size, 256, 256), sort_key = lambda x:len(x.body), sort = True, device = device, shuffle = True, repeat = False) return (train_loader, val_loader, test_loader), DOC.vocab.vectors
                             skip_header=True, fields=test_data_fields)

''' Get embedding from cache '''
vectors = GloVe(name='6B', dim=100, cache='..\.vector_cache')

''' Build vocabulary and embed it '''
text_field.build_vocab(train_data_set, test_dataset, vectors=vectors)
label_field.build_vocab(valid_data_set)

''' Define Bucket Iterators '''
train_iter, val_iter = Iterator.splits(
    (train_data_set, valid_data_set),
    batch_sizes=(64, 64),
    device=device,
    sort_key=lambda x: len(x.text_field),
    sort_within_batch=False,
    repeat=False,
    shuffle=True
)

test_iter = Iterator(test_dataset, batch_size=64, device=device,
                     sort=False, sort_within_batch=False, repeat=False)

''' Define model '''
'''
def basic_meta_data(obj): tokenize = lambda x: x.split() TEXT = Field(sequential=True, tokenize=tokenize, lower=True, batch_first=True, fix_length=obj.fix_length) VARIABLE = Field(sequential=False, dtype=torch.float, batch_first=True, use_vocab=False) LABEL = Field(sequential=False, dtype=torch.float, batch_first=True, use_vocab=False) fields = [#('id', None), ('content', TEXT), ('avg_followers',VARIABLE), ('avg_following', VARIABLE), ('avg_left', VARIABLE), ('avg_news', VARIABLE), ('avg_right', VARIABLE), ('time', VARIABLE), ('baseline_pred_left', VARIABLE), ('baseline_pred_mid', VARIABLE), ('baseline_pred_right', VARIABLE), ('left', LABEL), ('mid', LABEL), ('right', LABEL), ('7', None), ('8', None), ('9', None)] #train_csv = 'twitter_pollster_'+str(obj.days)+'_days_train_small.csv' #test_csv = 'twitter_pollster_'+str(obj.days)+'_days_test_small.csv' train_csv = 'train1.csv' test_csv = 'test1.csv' train_dataset = TabularDataset(path=obj.data_path+'/'+train_csv, format='csv', skip_header=True, fields=fields) test_dataset = TabularDataset(path=obj.data_path+'/'+test_csv, format='csv', skip_header=True, fields=fields) TEXT.build_vocab(train_dataset, vectors=GloVe(name=obj.Glove_name, dim=obj.embedding_dim, cache=glove_path)) vocab_size = len(TEXT.vocab) word_embeddings = TEXT.vocab.vectors print ("vector size of text vocabulary: ", TEXT.vocab.vectors.size()) train_iter, test_iter = Iterator.splits( (train_dataset, test_dataset), sort_key=lambda x: len(x.content), batch_sizes=(obj.train_batch_size, obj.test_batch_size), device=torch.device(obj.device), sort_within_batch=True, repeat=False) train_iter_ = BatchWrapper(train_iter, ['content', 'avg_followers', 'avg_following', 'avg_left', 'avg_news', 'avg_right', 'time', 'baseline_pred_left', 'baseline_pred_mid', 'baseline_pred_right'], ['left', 'mid', 'right']) test_iter_ = BatchWrapper(test_iter, ['content', 'avg_followers', 'avg_following', 'avg_left', 'avg_news', 'avg_right', 'time', 'baseline_pred_left', 'baseline_pred_mid', 'baseline_pred_right'], ['left', 'mid', 'right']) return TEXT, vocab_size, word_embeddings, train_iter_, test_iter_
test_dataset = TabularDataset(path='mydata/'+test_csv, format='csv', skip_header=True, fields=fields) TEXT.build_vocab(train_dataset, vectors=GloVe(name='twitter.27B', dim=25)) vocab_size = len(TEXT.vocab) word_embeddings = TEXT.vocab.vectors print ("vector size of text vocabulary: ", TEXT.vocab.vectors.size()) train_iter, test_iter = Iterator.splits( (train_dataset, test_dataset), sort_key=lambda x: len(x.content), batch_sizes=(7, 7), device=torch.device('cpu'), sort_within_batch=True, repeat=False) print(train_csv, test_csv) train_iter_ = BatchWrapper(train_iter, ['content', 'avg_followers', 'avg_following', 'avg_left', 'avg_news', 'avg_right', 'time', 'baseline_pred_left', 'baseline_pred_mid', 'baseline_pred_right'], ['left', 'mid', 'right']) test_iter_ = BatchWrapper(test_iter, ['content', 'avg_followers', 'avg_following', 'avg_left', 'avg_news', 'avg_right', 'time', 'baseline_pred_left', 'baseline_pred_mid', 'baseline_pred_right'], ['left', 'mid', 'right']) batch0 = None batch1 = None batch2 = None for iter, batch in enumerate(train_iter_, 1): if iter==1: #print(iter, batch) batch0 = batch[0]
# 2. Build the tabular dataset.
ds_train, ds_test = TabularDataset.splits(path='./data/',
                                          train='train.tsv', test='test.tsv', format='tsv',
                                          fields=[('label', LABEL), ('text', TEXT)],
                                          skip_header=False)

# 3. Build the vocabulary.
TEXT.build_vocab(ds_train)

# 4. Build the data-pipeline iterators.
train_iter, test_iter = Iterator.splits((ds_train, ds_test),
                                        sort_within_batch=True,
                                        sort_key=lambda x: len(x.text),
                                        batch_sizes=(BATCH_SIZE, BATCH_SIZE),
                                        device='cuda:4')

# Wrap the pipeline so it yields (features, label) pairs like a torch.utils.data.DataLoader.
class DataLoader:
    def __init__(self, data_iter):
        self.data_iter = data_iter
        self.length = len(data_iter)

    def __len__(self):
        return self.length

    def __iter__(self):
        # Note: make the features batch-first here, and adjust the label's shape and dtype.
def main(language, hidden_dim, dropout, proc, letter_proc, objective, operator, alpha, lr, momentum, optimizer, batch_size, n_epochs, pretrained_embeddings, letter_hidden_dim, letter_embedding_dim, n_samples, pad_edge, augment, _seed, _run, _log): if objective not in ['erm', 'nll']: raise ValueError("`objective` should be in ['erm', 'nll']," "got %s" % objective) # Technical device = init_system() if pad_edge: init_token = '<init>' eos_token = '<end>' else: init_token = None eos_token = None # Data loading using torchtext abstraction tags = ttdata.Field(sequential=True, include_lengths=True, preprocessing=iob1_iobes, init_token=init_token, eos_token=eos_token, pad_token=None, unk_token=None, batch_first=True) sentences = ttdata.Field(sequential=True, include_lengths=False, batch_first=True, init_token=init_token, eos_token=eos_token, preprocessing=zero_num) letter = ttdata.Field(sequential=True, tokenize=list, include_lengths=True, init_token=None, eos_token=None, preprocessing=zero_num, batch_first=True) letters = NestedField( letter, use_vocab=True, tensor_type=torch.FloatTensor, init_token=init_token, eos_token=eos_token, ) if language == 'en': fields = [[('sentences', sentences), ('letters', letters)], ('', None), ('', None), ('tags', tags)] elif language == 'de': fields = [[('sentences', sentences), ('letters', letters)], ('', None), ('', None), ('', None), ('tags', tags)] elif language in ['es', 'nl']: fields = [[('sentences', sentences), ('letters', letters)], ('', None), ('tags', tags)] else: raise ValueError('Wrong language') tagger_languages = {'en': 'eng', 'nl': 'ned', 'de': 'deu', 'es': 'esp'} train_data, val_data, test_data = SequenceTaggingDataset.splits( path=expanduser('~/data/sdtw_data/conll'), train='%s.train' % tagger_languages[language], validation='%s.testa' % tagger_languages[language], test='%s.testb' % tagger_languages[language], n_samples=n_samples, fields=fields) letters.build_vocab(train_data, val_data, test_data) tags.build_vocab(train_data) tag_itos = tags.vocab.itos if pad_edge: eos_idx = tags.vocab.stoi[tags.eos_token] init_idx = tags.vocab.stoi[tags.init_token] tag_itos[eos_idx] = 'O' tag_itos[init_idx] = 'O' else: eos_idx = None init_idx = None if isinstance(pretrained_embeddings, int): sentences.build_vocab(train_data, val_data, test_data) embedding_dim = pretrained_embeddings else: if pretrained_embeddings == 'ner': vectors = CaseInsensitiveVectors( expanduser('~/data/sdtw_data/ner/%s' % tagger_languages[language]), unk_init=lambda x: x.normal_(0, 1), cache=expanduser('~/cache')) elif 'glove' in pretrained_embeddings: _, name, dim = pretrained_embeddings.split('.') dim = dim[:-1] GloVe.__getitem__ = CaseInsensitiveVectors.__getitem__ vectors = GloVe(name=name, dim=dim, cache=expanduser('~/cache')) elif pretrained_embeddings == 'fasttext': FastText.__getitem__ = CaseInsensitiveVectors.__getitem__ FastText.cache = CaseInsensitiveVectors.cache vectors = FastText(language=language, cache=expanduser('~/cache')) # extend vocab with words of test/val set that has embeddings in # pre-trained embedding # A prod-version would do it dynamically at inference time counter = Counter() sentences.build_vocab(val_data, test_data) for word in sentences.vocab.stoi: if word in vectors.stoi or word.lower() in vectors.stoi or \ re.sub('\d', '0', word.lower()) in vectors.stoi: counter[word] = 1 eval_vocab = Vocab(counter) print("%i/%i eval/test word in pretrained" % (len(counter), len(sentences.vocab.stoi))) sentences.build_vocab(train_data) prev_vocab_size = 
len(sentences.vocab.stoi) sentences.vocab.extend(eval_vocab) new_vocab_size = len(sentences.vocab.stoi) print('New vocab size: %i (was %i)' % (new_vocab_size, prev_vocab_size)) sentences.vocab.load_vectors(vectors) embedding_dim = sentences.vocab.vectors.shape[1] artifact_dir = _run.info['artifact_dir'] vocab_dict = { 'sentences': sentences.vocab, 'tags': tags.vocab, 'letters': letter.vocab } torch.save(vocab_dict, open(join(artifact_dir, 'vocab.pt'), 'wb+')) unk_idx = sentences.vocab.stoi[sentences.unk_token] padding_idx = sentences.vocab.stoi[sentences.pad_token] singleton_idx = [ tags.vocab.stoi[singleton] for singleton in tags.vocab.stoi if 'S-' in singleton ] tagset_size = len(tags.vocab) vocab_size = len(sentences.vocab) letter_size = len(letters.vocab) device_iter = -1 if device.type == 'cpu' else device.index train_iter, val_iter, test_iter = Iterator.splits( (train_data, val_data, test_data), sort_within_batch=True, batch_sizes=(batch_size, 512, 512), device=device_iter) train_test_iter = Iterator(train_data, sort_within_batch=True, batch_size=512, shuffle=True, device=device_iter) eval_iter = { 'val': val_iter, 'test': test_iter, 'train_test': [next(iter(train_test_iter))] } model = Tagger(embedding_dim, vocab_size, tagset_size, hidden_dim=hidden_dim, proc=proc, padding_idx=padding_idx, letter_proc=letter_proc, letter_embedding_dim=letter_embedding_dim, letter_hidden_dim=letter_hidden_dim, letter_size=letter_size, dropout=dropout, eos_idx=eos_idx, init_idx=init_idx, alpha=alpha, operator=operator) # Load vectors if hasattr(sentences.vocab, 'vectors'): model.embedder.word_embeddings.weight.data = sentences.vocab.vectors model.embedder.word_embeddings.weight.data[padding_idx].fill_(0.) model = model.to(device=device) if operator == 'softmax': loss_function = OurNLLLoss() else: loss_function = BinaryMSELoss() score_function = functools.partial(ner_score, tag_itos=tag_itos, format='iobes') if optimizer == 'sgd': optimizer = torch.optim.SGD(params=model.parameters(), lr=lr * batch_size, momentum=momentum) elif optimizer == 'adam': optimizer = torch.optim.Adam(params=model.parameters(), lr=lr) else: raise ValueError() scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, threshold=1e-3, cooldown=2) for fold in eval_iter: _run.info['%s_loss' % fold] = [] _run.info['%s_prec' % fold] = [] _run.info['%s_recall' % fold] = [] _run.info['%s_f1' % fold] = [] _run.info['epochs'] = [] _run.info['time'] = [] last_epoch = floor(train_iter.epoch) t0 = time.clock() total_time = 0 for batch in train_iter: epoch = floor(train_iter.epoch) if epoch > last_epoch: t1 = time.clock() elapsed = t1 - t0 total_time += elapsed model.eval() _log.info("epoch %i, time/epoch %.3f s" % (epoch, elapsed)) if epoch % 10 == 0: dump_model(model, 'model_%i.pt' % epoch) for fold in eval_iter: this_iter = eval_iter[fold] this_iter = iter(this_iter) loss, prec, recall, f1 = validate(model, this_iter, score_function, objective, loss_function) if fold == 'val': scheduler.step(loss.item(), epoch=epoch) _log.info("%s: loss %.4f, prec %.4f, recall %.4f, f1 %.4f" % (fold, loss, prec, recall, f1)) _run.info['%s_loss' % fold].append(loss.item()) _run.info['%s_prec' % fold].append(prec) _run.info['%s_recall' % fold].append(recall) _run.info['%s_f1' % fold].append(f1) _run.info['time'].append(total_time) _run.info['epochs'].append(epoch) if epoch > n_epochs: break t0 = time.clock() data = make_data(batch, augment=augment, unk_idx=unk_idx, singleton_idx=singleton_idx) model.train() model.zero_grad() loss = 
compute_loss(model, data, objective, loss_function) loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 5, norm_type=2) optimizer.step() last_epoch = epoch dump_model(model, 'model_final.pt') return _run.info['test_f1'][-1]
def create_dataset(config: Config, device: torch.device) -> Tuple[Vocab, Iterator, Iterator, Iterator]: fields = dict() raw_field = RawField() # torchtext 0.3.1 # AttributeError: 'RawField' object has no attribute 'is_target' raw_field.is_target = False fields[SeqType.ArticleID.value] = (SeqType.ArticleID.value, raw_field) time_field = Field(use_vocab=False, batch_first=True, sequential=False) fields['jst_hour'] = (SeqType.Time.value, time_field) token_field = \ Field(use_vocab=True, init_token=SpecialToken.BOS.value, eos_token=SpecialToken.EOS.value, pad_token=SpecialToken.Padding.value, unk_token=SpecialToken.Unknown.value) \ if config.use_init_token_tag \ else Field(use_vocab=True, eos_token=SpecialToken.EOS.value, pad_token=SpecialToken.Padding.value, unk_token=SpecialToken.Unknown.value) fields['processed_tokens'] = (SeqType.Token.value, token_field) seqtypes = [SeqType.RawShort, SeqType.RawLong, SeqType.MovRefShort, SeqType.MovRefLong, SeqType.NormMovRefShort, SeqType.NormMovRefLong, SeqType.StdShort, SeqType.StdLong] for (ric, seqtype) in itertools.product(config.rics, seqtypes): n = N_LONG_TERM \ if seqtype.value.endswith('long') \ else N_SHORT_TERM price_field = Field(use_vocab=False, fix_length=n, batch_first=True, pad_token=0.0, preprocessing=lambda xs: [float(x) for x in xs], dtype=torch.float) key = stringify_ric_seqtype(ric, seqtype) fields[key] = (key, price_field) train, val, test = \ TabularDataset.splits(path=str(config.dir_output), format='json', train='alignment-train.json', validation='alignment-valid.json', test='alignment-test.json', fields=fields) token_field.build_vocab(train, min_freq=config.token_min_freq) batch_size = config.batch_size train_iter, val_iter, test_iter = \ Iterator.splits((train, val, test), batch_sizes=(batch_size, batch_size, batch_size), device=-1 if device.type == 'cpu' else device, repeat=False, sort=False) return (token_field.vocab, train_iter, val_iter, test_iter)
def main(params): try: output_dir = os.path.join( params['outf'], datetime.strftime(datetime.now(), "%Y%m%d_%H%M")) os.makedirs(output_dir) except OSError: pass if torch.cuda.is_available() and not params['cuda']: print( "WARNING: You have a CUDA device, so you should probably run with --cuda" ) writer = SummaryWriter(output_dir) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") SOS_token = '<sos>' EOS_token = '<eos>' PAD_token = '<pad>' TEXT = Field(sequential=True, use_vocab=True, tokenize=tokenizer, lower=True, batch_first=True, init_token=SOS_token, eos_token=EOS_token) # LABEL = Field(sequential=True, use_vocab=True, tokenize=tokenizer, is_target=True, batch_first=True, init_token='#', eos_token='$') IMG_IND = Field(sequential=False, use_vocab=False, batch_first=True) fields = { 'ans': ('ans', TEXT), 'img_ind': ('img_ind', IMG_IND), 'question': ('question', TEXT) } train, val = TabularDataset.splits( path=params['dataroot'], train=params['input_train'], validation=params['input_test'], format='csv', skip_header=False, fields=fields, ) print("Train data") print(train[0].__dict__.keys()) print(train[0].ans, train[0].img_ind, train[0].question) print("Validation data") print(val[0].__dict__.keys()) print(val[0].ans, val[0].img_ind, val[0].question) print("Building Vocabulary ..") TEXT.build_vocab(train, vectors='glove.6B.100d') vocab = TEXT.vocab PAD_token_ind = vocab.stoi[PAD_token] SOS_token_ind = vocab.stoi[SOS_token] EOS_token_ind = vocab.stoi[EOS_token] print("Creating Embedding from vocab vectors ..") txt_embed = nn.Embedding.from_pretrained(vocab.vectors) print("Text Embeddings are generated of size ", txt_embed.weight.size()) print("Loading Image embeddings ..") with open(params['image_embeddings'], 'rb') as f: img_embs = pkl.load(f)['image_features'] img_embed = nn.Embedding.from_pretrained(torch.FloatTensor(img_embs)) print("Creating Encoder_attn ..") encoder = Encoder(img_embed, txt_embed, params) print(encoder) print("Creating Decoder ..") decoder = Decoder(txt_embed, params) print(decoder) criterion = torch.nn.PairwiseDistance(keepdim=False) criterion.to(device) ## [Completed] TODO(Jay) : Remove this check and use .to(device) # if params['cuda']: # encoder.cuda() # decoder.cuda() # criterion.cuda() encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=params['lr'], weight_decay=1e-5, amsgrad=True) decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=params['lr'], weight_decay=1e-5, amsgrad=True) encoder_LR_scheduler = ReduceLROnPlateau(encoder_optimizer, 'min', patience=1) decoder_LR_scheduler = ReduceLROnPlateau(decoder_optimizer, 'min', patience=1) if params['use_checkpoint']: checkpoint = torch.load(params['enc_dec_model']) encoder.load_state_dict(checkpoint['encoder_state_dict']) decoder.load_state_dict(checkpoint['decoder_state_dict']) encoder_optimizer.load_state_dict( checkpoint['encoder_optimizer_state_dict']) decoder_optimizer.load_state_dict( checkpoint['decoder_optimizer_state_dict']) encoder_LR_scheduler.load_state_dict( checkpoint['encoder_LR_scheduler']) decoder_LR_scheduler.load_state_dict( checkpoint['decoder_LR_scheduler']) encoder.to(device) decoder.to(device) train_iter, val_iter = Iterator.splits( (train, val), batch_sizes=(params['batch_size'], params['batch_size']), sort=False, shuffle=True, device=device) for epoch in range(params['niter']): for is_train in (True, False): print('Is Training: ', is_train) if is_train: encoder.train() decoder.train() data_iter = train_iter else: encoder.eval() decoder.eval() data_iter 
= val_iter total_loss = 0 total_acc = 0 with torch.set_grad_enabled(is_train): for i, row in enumerate(data_iter, 1): if len(row) < params['batch_size']: continue encoder.zero_grad() decoder.zero_grad() ans, img_ind, question = row.ans, row.img_ind, row.question batch_size = params['batch_size'] ## target_length-1 since we are not predicting SOS token target_length = ans.shape[1] - 1 encoder.hidden = encoder.init_hidden(params) ans = ans.to(device) img_ind = img_ind.to(device) question = question.to(device) encoder.hidden = (encoder.hidden[0].to(device), encoder.hidden[1].to(device)) ans_embed = txt_embed(ans) encoder_output = encoder(img_ind, question) decoder_input = ans_embed[:, 0].reshape( (batch_size, 1, -1)) ## (batch_size, 1) check again ans_embed = ans_embed[:, 1:] ## removed the SOS token ans = ans[:, 1:] ## removed the SOS token decoder_hidden = decoder.init_hidden( encoder_output, params) if params['cuda']: decoder_hidden = (decoder_hidden[0].cuda(), decoder_hidden[1].cuda()) outputs = torch.zeros(batch_size, target_length, params['txt_emb_size']) ## [Completed] TODO(Jay) : remove the sos token from the ans and ans_embed before calc loss and acc for di in range(target_length - 1): decoder_output, decoder_hidden = decoder( decoder_input, decoder_hidden) ## TODO(Jay) : Detach the input from history decoder_input = decoder_output outputs[:, di, :] = decoder_output.reshape( batch_size, -1) filtered_labels, filtered_label_embeds, filtered_outputs = filterOutput( outputs.reshape(batch_size * target_length, -1), ans.reshape(batch_size * target_length, -1), ans_embed.reshape(batch_size * target_length, -1), PAD_token_ind) filtered_label_embeds = filtered_label_embeds.to(device) filtered_outputs = filtered_outputs.to(device) batch_loss = maskedLoss(filtered_label_embeds, filtered_outputs, criterion) batch_acc = word_accuracy(filtered_outputs, vocab.vectors.to(device), filtered_labels) total_loss += batch_loss.item() total_acc += batch_acc if is_train: if i % 1000 == 0: print( '[%d/%d][%d/%d] train_loss: %.4f, Accuracy: %.4f' % (epoch, params['niter'], i, len(data_iter), total_loss / i, total_acc / i)) batch_loss.backward() encoder_optimizer.step() decoder_optimizer.step() avg_loss = total_loss / len(data_iter) avg_acc = total_acc / len(data_iter) if is_train: PATH = os.path.join(output_dir, 'enc_dec_model.pth') torch.save( { 'encoder_state_dict': encoder.state_dict(), 'decoder_state_dict': decoder.state_dict(), 'encoder_optimizer_state_dict': encoder_optimizer.state_dict(), 'decoder_optimizer_state_dict': decoder_optimizer.state_dict(), 'encoder_LR_scheduler': encoder_LR_scheduler.state_dict(), 'decoder_LR_scheduler': decoder_LR_scheduler.state_dict(), }, PATH) writer.add_scalars('data', { 'train_loss': avg_loss, 'train_acc': avg_acc }, epoch) else: print('Calculating Validation loss') print('val_loss: %.4f, Accuracy: %.4f' % (avg_loss, avg_acc)) encoder_LR_scheduler.step(avg_loss) decoder_LR_scheduler.step(avg_loss) writer.add_scalars('data', { 'val_loss': avg_loss, 'val_acc': avg_acc }, epoch) writer.close()
        # predicted: (batch, trg_len-1, trg_vocab_size)
        predicted = predicted.reshape(-1, predicted.shape[-1])
        # predicted: (*, trg_vocab_size)
        trg = trg[:, 1:].reshape(-1)
        # trg: (*, )
        loss = criterion(predicted, trg)
        epoch_loss += loss.item()
    return epoch_loss / len(data_iter)


train_data, val_data, test_data, SRC, TRG = utils.read_data(batch_first=True)
train_iter, val_iter = Iterator.splits((train_data, val_data), batch_size=BATCH_SIZE,
                                       shuffle=True, sort=False)
test_iter = Iterator(test_data, batch_size=BATCH_SIZE, shuffle=False, sort=False)

SRC_PAD_IDX = SRC.vocab.stoi['<pad>']
TRG_PAD_IDX = TRG.vocab.stoi['<pad>']

model = Transformer(len(SRC.vocab), len(TRG.vocab), MAX_LEN, MODEL_SIZE, FF_SIZE,
                    KEY_SIZE, VALUE_SIZE, NUM_HEADS, NUM_LAYERS, DROPOUT,
                    SRC_PAD_IDX, TRG_PAD_IDX).to(DEVICE)
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
opt = AdamWrapper(model.parameters(), MODEL_SIZE, WARMUP)
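# AdamWrapper above is defined elsewhere in this example's codebase. A hedged sketch of
# what such a Noam-style warmup wrapper typically looks like (an assumption, not the
# original implementation): the learning rate grows linearly for `warmup` steps and then
# decays with the inverse square root of the step number.
import torch


class AdamWrapper:
    def __init__(self, parameters, model_size, warmup, betas=(0.9, 0.98), eps=1e-9):
        self.optimizer = torch.optim.Adam(parameters, lr=0.0, betas=betas, eps=eps)
        self.model_size = model_size
        self.warmup = warmup
        self.step_num = 0

    def rate(self):
        # lr = d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
        return self.model_size ** -0.5 * min(self.step_num ** -0.5,
                                             self.step_num * self.warmup ** -1.5)

    def step(self):
        self.step_num += 1
        for group in self.optimizer.param_groups:
            group['lr'] = self.rate()
        self.optimizer.step()

    def zero_grad(self):
        self.optimizer.zero_grad()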
def main(params): try: output_dir = os.path.join( params['outf'], datetime.strftime(datetime.now(), "%Y%m%d_%H%M")) os.makedirs(output_dir) except OSError: pass writer = SummaryWriter(output_dir) if torch.cuda.is_available() and not params['cuda']: print( "WARNING: You have a CUDA device, so you should probably run with --cuda" ) TEXT = Field(sequential=True, use_vocab=True, tokenize=tokenizer, lower=True, batch_first=True) LABEL = Field(sequential=False, use_vocab=False, is_target=True, batch_first=True) IMG_IND = Field(sequential=False, use_vocab=False, batch_first=True) fields = { 'ans': ('ans', LABEL), 'img_ind': ('img_ind', IMG_IND), 'question': ('question', TEXT) } train, val = TabularDataset.splits(path=params['dataroot'], train=params['input_train'], validation=params['input_test'], format='csv', skip_header=False, fields=fields) print("Train data") print(train[0].__dict__.keys()) print(train[0].ans, train[0].img_ind, train[0].question) print("Validation data") print(val[0].__dict__.keys()) print(val[0].ans, val[0].img_ind, val[0].question) print("Building Vocabulary ..") TEXT.build_vocab(train, vectors='glove.6B.100d') vocab = TEXT.vocab print("Creating Embedding from vocab vectors ..") params['vocab'] = vocab vqa_model = model.Model(params) print(vqa_model) if params['use_checkpoint']: checkpoint = torch.load(params['mcq_model']) vqa_model.load_state_dict(checkpoint['model_state_dict']) vqa_model.hidden = checkpoint['lstm_hidden'] criterion = torch.nn.CrossEntropyLoss() if params['cuda']: vqa_model.cuda() criterion.cuda() optimizer = torch.optim.Adam(vqa_model.parameters(), lr=params['lr']) train_iter, val_iter = Iterator.splits( (train, val), batch_sizes=(params['batch_size'], params['batch_size']), sort_within_batch=False, sort=False) for epoch in range(1, params['niter'] + 1): total_val_loss = 0 total_val_matches = 0 total_train_loss = 0 total_train_matches = 0 for i, row in enumerate(train_iter): vqa_model.train() # Starting each batch, we detach the hidden state from how it was previously produced. # If we didn't, the model would try backpropagating all the way to start of the dataset. 
if len(row) < params['batch_size']: continue vqa_model.hidden = repackage_hidden(vqa_model.hidden) vqa_model.zero_grad() ans, img_ind, question = row.ans, row.img_ind, row.question batch_size = ans.size(0) if params['cuda']: ans = ans.cuda() img_ind = img_ind.cuda() question = question.cuda() vqa_model.hidden = tuple([v.cuda() for v in vqa_model.hidden]) ans_var = Variable(ans) img_ind_var = Variable(img_ind) question_var = Variable(question) pred_ans = vqa_model(img_ind_var, question_var) train_loss = criterion(pred_ans, ans_var) pred_ind = pred_ans.max(dim=1)[1] train_acc = (pred_ind == ans_var).sum() total_train_loss += train_loss.item() total_train_matches += train_acc.item() train_loss.backward() optimizer.step() if i % 1000 == 0: print('[%d/%d][%d/%d] train_loss: %.4f' % (epoch, params['niter'], i + 1, len(train_iter), train_loss)) vqa_model.eval() for row in val_iter: if len(row) < params['batch_size']: continue vqa_model.hidden = repackage_hidden(vqa_model.hidden) vqa_model.zero_grad() ans, img_ind, question = row.ans, row.img_ind, row.question batch_size = ans.size(0) if params['cuda']: ans = ans.cuda() img_ind = img_ind.cuda() question = question.cuda() vqa_model.hidden = tuple([v.cuda() for v in vqa_model.hidden]) ans_var = Variable(ans) img_ind_var = Variable(img_ind) question_var = Variable(question) pred_ans = vqa_model(img_ind_var, question_var) val_loss = criterion(pred_ans, ans_var) pred_ind = pred_ans.max(dim=1)[1] val_acc = (pred_ind == ans_var).sum() total_val_loss += val_loss.item() total_val_matches += val_acc.item() print( '[%d/%d] train_loss: %.4f val_loss: %.4f train_acc: %.4f val_acc: %.4f' % (epoch, params['niter'], total_train_loss / len(train_iter), total_val_loss / len(val_iter), total_train_matches * 100 / len(train_iter) / params['batch_size'], total_val_matches * 100 / len(val_iter) / params['batch_size'])) writer.add_scalars( 'data', { 'train_loss': train_loss, 'train_acc': total_train_matches * 100 / len(train_iter) / params['batch_size'], 'val_loss': total_val_loss / len(val_iter), 'val_acc': total_val_matches * 100 / len(val_iter) / params['batch_size'] }, epoch) torch.save( { 'lstm_hidden': vqa_model.hidden, 'model_state_dict': vqa_model.state_dict() }, '%s/baseline_%d.pth' % (output_dir, epoch)) writer.close()
### 3. Create the iterators
train_iter, valid_iter = BucketIterator.splits(
    (train, valid),
    batch_size=64,
    device='cpu',
    sort_key=lambda x: len(x.comment_text),  # key used to group examples of similar length.
    sort_within_batch=False,
    repeat=False)
print(train_iter)
print(next(train_iter.__iter__()))

# A single dataset goes through Iterator directly rather than Iterator.splits.
test_iter = Iterator(test, batch_size=64, device=-1, sort=False,
                     sort_within_batch=False, repeat=False)

"""
This converts the numericalized sequences back to the original text:
TEXT = ReversibleField(sequential=True, lower=True, include_lengths=True)
for data in train_iter:
    (x, x_lengths), y = data.Text, data.Description
    orig_text = TEXT.reverse(x.data)
    print(orig_text)
"""

class BatchWrapper():
    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars
        # we pass in the list of attributes for x and y
def test_multinli(self): batch_size = 4 # create fields TEXT = ParsedTextField() TREE = ShiftReduceField() GENRE = LabelField() LABEL = LabelField() # create train/val/test splits train, val, test = MultiNLI.splits(TEXT, LABEL, TREE, GENRE) # check all are MultiNLI datasets assert type(train) == type(val) == type(test) == MultiNLI # check all have correct number of fields assert len(train.fields) == len(val.fields) == len(test.fields) == 6 # check fields are the correct type assert type(train.fields['premise']) == ParsedTextField assert type(train.fields['premise_transitions']) == ShiftReduceField assert type(train.fields['hypothesis']) == ParsedTextField assert type(train.fields['hypothesis_transitions']) == ShiftReduceField assert type(train.fields['label']) == LabelField assert type(train.fields['genre']) == LabelField assert type(val.fields['premise']) == ParsedTextField assert type(val.fields['premise_transitions']) == ShiftReduceField assert type(val.fields['hypothesis']) == ParsedTextField assert type(val.fields['hypothesis_transitions']) == ShiftReduceField assert type(val.fields['label']) == LabelField assert type(val.fields['genre']) == LabelField assert type(test.fields['premise']) == ParsedTextField assert type(test.fields['premise_transitions']) == ShiftReduceField assert type(test.fields['hypothesis']) == ParsedTextField assert type(test.fields['hypothesis_transitions']) == ShiftReduceField assert type(test.fields['label']) == LabelField assert type(test.fields['genre']) == LabelField # check each is the correct length assert len(train) == 392702 assert len(val) == 9815 assert len(test) == 9832 # build vocabulary TEXT.build_vocab(train) LABEL.build_vocab(train) GENRE.build_vocab(train) # ensure vocabulary has been created assert hasattr(TEXT, 'vocab') assert hasattr(TEXT.vocab, 'itos') assert hasattr(TEXT.vocab, 'stoi') # create iterators train_iter, val_iter, test_iter = Iterator.splits( (train, val, test), batch_size=batch_size) # get a batch to test batch = next(iter(train_iter)) # split premise and hypothesis from tuples to tensors premise, premise_transitions = batch.premise hypothesis, hypothesis_transitions = batch.hypothesis label = batch.label genre = batch.genre # check each is actually a tensor assert type(premise) == torch.Tensor assert type(premise_transitions) == torch.Tensor assert type(hypothesis) == torch.Tensor assert type(hypothesis_transitions) == torch.Tensor assert type(label) == torch.Tensor assert type(genre) == torch.Tensor # check have the correct batch dimension assert premise.shape[-1] == batch_size assert premise_transitions.shape[-1] == batch_size assert hypothesis.shape[-1] == batch_size assert hypothesis_transitions.shape[-1] == batch_size assert label.shape[-1] == batch_size assert genre.shape[-1] == batch_size # repeat the same tests with iters instead of split train_iter, val_iter, test_iter = MultiNLI.iters(batch_size=batch_size, trees=True) # split premise and hypothesis from tuples to tensors premise, premise_transitions = batch.premise hypothesis, hypothesis_transitions = batch.hypothesis label = batch.label # check each is actually a tensor assert type(premise) == torch.Tensor assert type(premise_transitions) == torch.Tensor assert type(hypothesis) == torch.Tensor assert type(hypothesis_transitions) == torch.Tensor assert type(label) == torch.Tensor # check have the correct batch dimension assert premise.shape[-1] == batch_size assert premise_transitions.shape[-1] == batch_size assert hypothesis.shape[-1] == batch_size assert 
hypothesis_transitions.shape[-1] == batch_size assert label.shape[-1] == batch_size # remove downloaded multinli directory shutil.rmtree('.data/multinli')
def train(args: Dict): """ Train the NMT Model. @param args (Dict): args from cmd line """ train_data_src = read_corpus(args['--train-src'], source='src') train_data_tgt = read_corpus(args['--train-tgt'], source='tgt') dev_data_src = read_corpus(args['--dev-src'], source='src') dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt') train_data = list(zip(train_data_src, train_data_tgt)) dev_data = list(zip(dev_data_src, dev_data_tgt)) train_batch_size = int(args['--batch-size']) clip_grad = float(args['--clip-grad']) valid_niter = int(args['--valid-niter']) log_every = int(args['--log-every']) model_save_path = args['--save-to'] #prefer to do our entire train,test,val split in the code itself as opposed to our previous script # remove these comments #data preprocessing for Qs and As. spacy_en = spacy.load('en') def tokenizer(text): # create a tokenizer function return [tok.text for tok in spacy_en.tokenizer(text)] TEXT = Field(sequential=True, tokenize=tokenizer, lower=True, include_lengths=True, init_token='<s>', eos_token='</s>') analogies_datafields = [("abc", TEXT), ("d", TEXT)] train, val, test = TabularDataset.splits( path="data", # the root directory where the data lies train='ngram_train.csv', validation="ngram_val.csv", test='ngram_test.csv', format='csv', skip_header= False, # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data! fields=analogies_datafields) pretrained_vecs = torchtext.vocab.Vectors('../GloVe-1.2/life_vectors.txt') TEXT.build_vocab( vectors=pretrained_vecs) # specials=['<pad>', '<s>', '</s>'] if args['--cuda'] == 'cpu': torch_text_device = -1 else: torch_text_device = 0 training_iter, val_iter, test_iter = Iterator.splits( (train, val, test), sort_key=lambda x: len(x.abc), batch_sizes=(100, 20, 1), device=torch_text_device, sort_within_batch=True) model = NMT(embed_size=int(args['--embed-size']), hidden_size=int(args['--hidden-size']), dropout_rate=float(args['--dropout']), vocab=TEXT.vocab) model.train() #sets training = True uniform_init = float(args['--uniform-init']) if np.abs(uniform_init) > 0.: print('uniformly initialize parameters [-%f, +%f]' % (uniform_init, uniform_init), file=sys.stderr) for p in model.parameters(): p.data.uniform_(-uniform_init, uniform_init) device = torch.device("cuda:0" if args['--cuda'] else "cpu") print('use device: %s' % device, file=sys.stderr) model = model.to(device) optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr'])) num_trial = 0 train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0 cum_examples = report_examples = epoch = valid_num = 0 hist_valid_scores = [] train_time = begin_time = time.time() print('begin Maximum Likelihood training') writer = SummaryWriter('logs') is_better_count = 0 #TODO: Remove this and debug the nonstopping part while True: epoch += 1 for _, data in enumerate(training_iter): (src_sents, src_lengths), (tgt_sents, _) = data.abc, data.d train_iter += 1 optimizer.zero_grad() batch_size = src_sents.shape[1] example_losses = model(src_sents, src_lengths, tgt_sents) # (batch_size,) batch_loss = example_losses.sum() loss = batch_loss / batch_size loss.backward() # clip gradient grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad) optimizer.step() batch_losses_val = batch_loss.item() report_loss += batch_losses_val cum_loss += batch_losses_val tgt_words_num_to_predict = sum( len(s[1:]) for s in tgt_sents) # omitting leading `<s>` report_tgt_words += tgt_words_num_to_predict 
cum_tgt_words += tgt_words_num_to_predict report_examples += batch_size cum_examples += batch_size if train_iter % log_every == 0: print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f ' \ 'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' % (epoch, train_iter, report_loss / report_examples, math.exp(report_loss / report_tgt_words), cum_examples, report_tgt_words / (time.time() - train_time), time.time() - begin_time), file=sys.stderr) writer.add_scalar('Train/AvgLoss', report_loss / report_examples, epoch) writer.add_scalar('Train/AvgPPL', math.exp(report_loss / report_tgt_words), epoch) train_time = time.time() report_loss = report_tgt_words = report_examples = 0. # perform validation if train_iter % valid_niter == 0: print( 'epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' % (epoch, train_iter, cum_loss / cum_examples, np.exp(cum_loss / cum_tgt_words), cum_examples), file=sys.stderr) cum_loss = cum_examples = cum_tgt_words = 0. valid_num += 1 print('begin validation ...', file=sys.stderr) # compute dev. ppl and bleu dev_ppl, val_loss = evaluate_ppl( model, val_iter) # dev batch size can be a bit larger valid_metric = -dev_ppl print('validation: iter %d, dev. ppl %f, dev loss %f' % (train_iter, dev_ppl, val_loss), file=sys.stderr) writer.add_scalar('Val/AvgPPL', dev_ppl, epoch) writer.add_scalar('Val/AvgLoss', val_loss, epoch) is_better = len(hist_valid_scores ) == 0 or valid_metric > max(hist_valid_scores) print(hist_valid_scores) print(valid_metric) hist_valid_scores.append(valid_metric) if is_better: patience = 0 print('save currently the best model to [%s]' % model_save_path, file=sys.stderr) model.save(model_save_path) is_better_count = is_better_count + 1 print(is_better_count) # also save the optimizers' state torch.save(optimizer.state_dict(), model_save_path + '.optim') if is_better_count > 3: print('reached maximum number of epochs!', file=sys.stderr) writer.close() exit(0) elif patience < int(args['--patience']): patience += 1 print('hit patience %d' % patience, file=sys.stderr) if patience == int(args['--patience']): num_trial += 1 print('hit #%d trial' % num_trial, file=sys.stderr) if num_trial == int(args['--max-num-trial']): print('early stop!', file=sys.stderr) exit(0) # decay lr, and restore from previously best checkpoint lr = optimizer.param_groups[0]['lr'] * float( args['--lr-decay']) print( 'load previously best model and decay learning rate to %f' % lr, file=sys.stderr) # load model params = torch.load( model_save_path, map_location=lambda storage, loc: storage) model.load_state_dict(params['state_dict']) model = model.to(device) print('restore parameters of the optimizers', file=sys.stderr) optimizer.load_state_dict( torch.load(model_save_path + '.optim')) # set new lr for param_group in optimizer.param_groups: param_group['lr'] = lr # reset patience patience = 0 if epoch == int(args['--max-epoch']): print('reached maximum number of epochs!', file=sys.stderr) writer.close() exit(0)
def train(train_dir: str, config: Dict, force: bool = False, metric_logger: Optional[Callable] = None, device: Optional[torch.device] = None, verbose: bool = True): train_dir = Path(train_dir) if train_dir.exists() and force: shutil.rmtree(train_dir) train_dir.mkdir(parents=True, exist_ok=False) params_file = train_dir / f"config.jsonnet" with params_file.open('w') as fp: json.dump(config, fp, indent=4) params = Params(config) pprint(f"Config:") pprint(config) writer = SummaryWriter(logdir=str(train_dir)) training_params = params.pop('training') dataset_params = params.pop('dataset') sampling_params = params.pop('sampling') sampling_temperatures = sampling_params.get('temperature', [1.0]) if isinstance(sampling_temperatures, (int, float)): sampling_temperatures = [sampling_temperatures] dataset_name = dataset_params.pop('name', "PTB") # TODO: unify datasets creation if dataset_name == "PTB": TEXT = Field(sequential=True, use_vocab=True, lower=True, init_token=SOS_TOKEN, eos_token=EOS_TOKEN, pad_token=PAD_TOKEN, unk_token=UNK_TOKEN, tokenize=lambda x: x.strip().split(), include_lengths=True) fields = (('inp', TEXT), ('trg', TEXT)) train_data, dev_data, test_data = PTB.splits(fields=fields) elif dataset_name == "YelpReview": TEXT = Field(sequential=True, use_vocab=True, lower=True, init_token=SOS_TOKEN, eos_token=EOS_TOKEN, pad_token=PAD_TOKEN, unk_token=UNK_TOKEN, tokenize="spacy", include_lengths=True) fields = (('inp', TEXT), ('trg', TEXT)) train_data, dev_data, test_data = YelpReview.splits(fields=fields, num_samples=120_000, split_ratio=[100_000, 10_000, 10_000], max_len=150, verbose=verbose) elif dataset_name == "YahooAnswers": TEXT = Field(sequential=True, use_vocab=True, lower=True, init_token=SOS_TOKEN, eos_token=EOS_TOKEN, pad_token=PAD_TOKEN, unk_token=UNK_TOKEN, tokenize="spacy", include_lengths=True) fields = (('inp', TEXT), ('trg', TEXT)) train_data, dev_data, test_data = YahooAnswers.splits(fields=fields, num_samples=120_00, split_ratio=[100_000, 10_000, 10_000], max_len=200, verbose=verbose) else: raise ValueError(f"Dataset {dataset_name} is not supported!") TEXT.build_vocab(train_data, max_size=20_000) if device is None: device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') print(f"Running on device: {device}") train_iter, dev_iter, test_iter = Iterator.splits( datasets=(train_data, dev_data, test_data), batch_sizes=(training_params.batch_size, training_params.test_batch_size, training_params.test_batch_size), shuffle=True, sort_within_batch=True, sort_key=lambda x: len(x.inp), device=device ) model_params = params.pop('model') model_type = model_params.pop('model_type') if model_type == 'svae': model = RecurrentVAE(vocab=TEXT.vocab, params=model_params) elif model_type == 'ivae': model = DilatedConvVAE(vocab=TEXT.vocab, params=model_params) else: raise ValueError(f"Unsupported model type: {model_type}") model.to(device) optimizer = optim.Adam(params=model.parameters(), **training_params.pop('optimizer')) scheduler = None scheduler_params = training_params.pop('lr_scheduler', None) if scheduler_params is not None: scheduler = WarmUpDecayLR(optimizer=optimizer, **scheduler_params) iters = 0 for epoch in range(training_params.epochs): if verbose: print("#" * 20) print(f"EPOCH {epoch}\n") # Training model.train() for batch in tqdm(train_iter, desc='Training', disable=not verbose): iters += 1 output = model(batch) loss = output['loss'] optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), 5) optimizer.step() 
writer.add_scalar('train/ELBO', -output['rec_loss'] - output['kl_loss'], iters) writer.add_scalar('train/rec_loss', output['rec_loss'], iters) writer.add_scalar('train/kl_loss', output['kl_loss'], iters) writer.add_scalar('train/kl_weight', output['kl_weight'], iters) metrics = model.get_metrics(reset=True) for metric, value in metrics.items(): writer.add_scalar(f'train/{metric}', value, epoch) if metric_logger is not None: metric_logger({f"train_{key}": value for key, value in metrics.items()}, epoch) # Validation model.eval() with torch.no_grad(): for batch in tqdm(dev_iter, desc='Validation', disable=not verbose): _ = model(batch) valid_metrics = model.get_metrics(reset=True) for metric, value in valid_metrics.items(): writer.add_scalar(f'dev/{metric}', value, epoch) if metric_logger is not None: metric_logger({f"valid_{key}": value for key, value in valid_metrics.items()}, epoch) if verbose: for temperature in sampling_temperatures: print("#" * 20) print(f"Sentence samples. Temperature: {temperature}") samples = model.sample(num_samples=10, temperature=temperature, device=device, max_len=sampling_params.get('max_len', 50)) print(*samples, sep='\n') if scheduler_params is not None: scheduler.step() with (train_dir / 'TEXT.Field').open("wb") as fp: dill.dump(TEXT, fp) save_checkpoint(model.state_dict(), train_dir) if params.get('eval_on_test', False): if verbose: print("Evaluating model on test data...") model.eval() with torch.no_grad(): for batch in tqdm(test_iter, desc='Test set evaluation', disable=not verbose): _ = model(batch) test_metrics = model.get_metrics(reset=True) if verbose: for metric, value in test_metrics.items(): print(f"{metric}: {value}") if metric_logger is not None: metric_logger(test_metrics) writer.close()
def decode(args: Dict[str, str]): """ Performs decoding on a test set, and save the best-scoring decoding results. If the target gold-standard sentences are given, the function also computes corpus-level BLEU score. @param args (Dict): args from cmd line """ spacy_en = spacy.load('en') def tokenizer(text): # create a tokenizer function return [tok.text for tok in spacy_en.tokenizer(text)] TEXT = Field(sequential=True, tokenize=tokenizer, lower=True, include_lengths=True, init_token='<s>', eos_token='</s>') analogies_datafields = [("abc", TEXT), ("d", TEXT)] train, val, test = TabularDataset.splits( path="data", # the root directory where the data lies train='ngram_train.csv', validation="ngram_val.csv", test='ngram_test.csv', format='csv', skip_header= False, # if your csv header has a header, make sure to pass this to ensure it doesn't get proceesed as data! fields=analogies_datafields) pretrained_vecs = torchtext.vocab.Vectors('../GloVe-1.2/life_vectors.txt') TEXT.build_vocab( vectors=pretrained_vecs) # specials=['<pad>', '<s>', '</s>'] if args['--cuda'] == 'cpu': torch_text_device = -1 else: torch_text_device = 0 training_iter, val_iter, test_iter = Iterator.splits( (train, val, test), sort_key=lambda x: len(x.abc), batch_sizes=(100, 20, 1), device=torch_text_device, sort_within_batch=True) print("load test source sentences from [{}]".format( args['TEST_SOURCE_FILE']), file=sys.stderr) test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src') if args['TEST_TARGET_FILE']: print("load test target sentences from [{}]".format( args['TEST_TARGET_FILE']), file=sys.stderr) test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt') print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr) model = NMT.load(args['MODEL_PATH']) if args['--cuda']: model = model.to(torch.device("cuda:0")) hypotheses = beam_search(model, test_iter, beam_size=int(args['--beam-size']), max_decoding_time_step=int( args['--max-decoding-time-step'])) if args['TEST_TARGET_FILE']: top_hypotheses = [hyps[0] for hyps in hypotheses] bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses) #accuracy (unigrams) perfectly_correct = 0 for index, hyp in enumerate(top_hypotheses): if hyp.value[0] == test_data_tgt[index][1]: perfectly_correct += 1 print('Ignore accuracy for non unigrams') print('Accuracy: {}'.format(perfectly_correct / len(test_data_tgt)), file=sys.stderr) print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr) with open(args['OUTPUT_FILE'], 'w') as f: for src_sent, hyps in zip(test_data_src, hypotheses): top_hyp = hyps[0] hyp_sent = ' '.join(top_hyp.value) f.write(hyp_sent + '\n')
def test_xnli(self): batch_size = 4 # create fields TEXT = Field() GENRE = LabelField() LABEL = LabelField() LANGUAGE = LabelField() # create val/test splits, XNLI does not have a test set val, test = XNLI.splits(TEXT, LABEL, GENRE, LANGUAGE) # check both are XNLI datasets assert type(val) == type(test) == XNLI # check all have the correct number of fields assert len(val.fields) == len(test.fields) == 5 # check fields are the correct type assert type(val.fields['premise']) == Field assert type(val.fields['hypothesis']) == Field assert type(val.fields['label']) == LabelField assert type(val.fields['genre']) == LabelField assert type(val.fields['language']) == LabelField assert type(test.fields['premise']) == Field assert type(test.fields['hypothesis']) == Field assert type(test.fields['label']) == LabelField assert type(test.fields['genre']) == LabelField assert type(test.fields['language']) == LabelField # check each is the correct length assert len(val) == 37350 assert len(test) == 75150 # build vocabulary TEXT.build_vocab(val) LABEL.build_vocab(val) GENRE.build_vocab(val) LANGUAGE.build_vocab(val) # ensure vocabulary has been created assert hasattr(TEXT, 'vocab') assert hasattr(TEXT.vocab, 'itos') assert hasattr(TEXT.vocab, 'stoi') # create iterators val_iter, test_iter = Iterator.splits((val, test), batch_size=batch_size) # get a batch to test batch = next(iter(val_iter)) # split premise and hypothesis from tuples to tensors premise = batch.premise hypothesis = batch.hypothesis label = batch.label genre = batch.genre language = batch.language # check each is actually a tensor assert type(premise) == torch.Tensor assert type(hypothesis) == torch.Tensor assert type(label) == torch.Tensor assert type(genre) == torch.Tensor assert type(language) == torch.Tensor # check have the correct batch dimension assert premise.shape[-1] == batch_size assert hypothesis.shape[-1] == batch_size assert label.shape[-1] == batch_size assert genre.shape[-1] == batch_size assert language.shape[-1] == batch_size # xnli cannot use the iters method, ensure raises error with self.assertRaises(NotImplementedError): val_iter, test_iter = XNLI.iters(batch_size=batch_size) # remove downloaded xnli directory shutil.rmtree('.data/xnli')