def create_dataset(path_to_dataset, batch_size, split_ratio=0.7, min_vocab_freq=10, max_vocab_size=4000):
    text_field = Field(tokenize="spacy", tokenizer_language="en", batch_first=True,
                       init_token="<sos>", eos_token="<eos>", lower=True)

    def transform(caption):
        caption = caption.strip().lower().split()
        return caption

    dataset = CocoCaptions(annFile=os.path.join(path_to_dataset, "captions_train2014.json"),
                           text_field=text_field, transform=transform)
    train, val = dataset.split(split_ratio=split_ratio)
    test = CocoCaptions(annFile=os.path.join(path_to_dataset, "captions_val2014.json"),
                        text_field=text_field, transform=transform)
    print("Dataset loaded")
    print("Train set size:", len(train))

    text_field.build_vocab(dataset.text, min_freq=min_vocab_freq, max_size=max_vocab_size)
    SOS_TOKEN = text_field.vocab.stoi['<sos>']
    EOS_TOKEN = text_field.vocab.stoi['<eos>']
    UNK_TOKEN = text_field.vocab.stoi['<unk>']
    PAD_TOKEN = text_field.vocab.stoi['<pad>']
    print("Vocabulary built")
    print("Vocabulary statistics")
    print("\nMost common words in the vocabulary:\n", text_field.vocab.freqs.most_common(10))
    print("Size of the vocabulary:", len(text_field.vocab))
    print("Max sequence length:", dataset.max_seq_len)

    train_iter, val_iter = BucketIterator.splits((train, val), repeat=False, batch_size=batch_size)
    test_iter = BucketIterator(test, batch_size=batch_size, repeat=False, train=False)

    vocab_dict = text_field.vocab.stoi
    return {"data_iters": (train_iter, val_iter, test_iter),
            "fields": text_field,
            "word_to_num_vocab": vocab_dict,
            "num_to_word_vocab": {y: x for x, y in vocab_dict.items()},
            "num_classes": len(text_field.vocab),
            "tokens": (SOS_TOKEN, EOS_TOKEN, UNK_TOKEN, PAD_TOKEN),
            "max_seq_len": dataset.max_seq_len}
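# Minimal usage sketch (not part of the original code; the dataset path is a placeholder):
# unpack the dictionary returned by create_dataset.
data = create_dataset("/path/to/coco", batch_size=32)
train_iter, val_iter, test_iter = data["data_iters"]
SOS_TOKEN, EOS_TOKEN, UNK_TOKEN, PAD_TOKEN = data["tokens"]
print("Classes:", data["num_classes"], "max sequence length:", data["max_seq_len"])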
def load_dataset(file_name):
    """Loads contents from a file in the *data* directory into a
    torchtext.data.TabularDataset instance.
    """
    file_path = join(DATA_DIR, file_name)
    text_field = Field(pad_token=None, tokenize=_tokenize_str)

    dataset = TabularDataset(
        path=file_path,
        format='csv',
        fields=[('text', text_field)])

    text_field.build_vocab(dataset)
    return dataset
def load_dataset(batch_size):
    spacy_de = spacy.load('de')
    spacy_en = spacy.load('en')
    url = re.compile('(<url>.*</url>)')

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))]

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]

    DE = Field(tokenize=tokenize_de, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')
    EN = Field(tokenize=tokenize_en, include_lengths=True,
               init_token='<sos>', eos_token='<eos>')
    train, val, test = Multi30k.splits(exts=('.de', '.en'), fields=(DE, EN))
    DE.build_vocab(train.src, min_freq=2)
    EN.build_vocab(train.trg, max_size=10000)
    train_iter, val_iter, test_iter = BucketIterator.splits(
        (train, val, test), batch_size=batch_size, repeat=False)
    return train_iter, val_iter, test_iter, DE, EN
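# Minimal usage sketch (an assumption, not part of the original snippet): with
# include_lengths=True, each batch attribute is a (padded_tensor, lengths) pair.
train_iter, val_iter, test_iter, DE, EN = load_dataset(batch_size=32)
for batch in train_iter:
    src, src_lengths = batch.src  # src: [src_len, batch_size], src_lengths: [batch_size]
    trg, trg_lengths = batch.trg
    break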
#EN = Field(tokenize=tokenize_en, batch_first=True, init_token="<SOS>", eos_token="<EOS>")
#DE = Field(tokenize=tokenize_de, batch_first=True, init_token="<SOS>", eos_token="<EOS>")
EN = Field(tokenize="spacy", tokenizer_language="en", batch_first=True,
           init_token="<SOS>", eos_token="<EOS>")
DE = Field(tokenize="spacy", tokenizer_language="de", batch_first=True,
           init_token="<SOS>", eos_token="<EOS>")

# multi30k dataloader
train, val, test = datasets.Multi30k.splits(exts=(".en", ".de"), fields=(EN, DE), root=data_path)

# wmt14 dataloader (preferred over datasets.WMT14.splits, which is slow)
#train,val,test = datasets.TranslationDataset.splits(exts=(".en",".de"),fields=[("src",EN),("trg",DE)],path=os.path.join(data_path,"wmt14"),
#                                                    train="train.tok.clean.bpe.32000",validation="newstest2013.tok.bpe.32000",test="newstest2014.tok.bpe.32000")

print("Dataset loaded")
EN.build_vocab(train.src, min_freq=3)
DE.build_vocab(train.trg, max_size=50000)
print("Vocabularies built")

train_iter, val_iter = BucketIterator.splits((train, val), batch_size=3)
test_iter = BucketIterator(test, batch_size=3)

print("Start iterating through data")
for i, batch in enumerate(train_iter):
    print(batch.src)  # the source language
    print(batch.trg)  # the target language
    break
for i, batch in enumerate(val_iter):
class WordToPhonemeModel: '''Contains pytorch model for converting words to phonemes.''' UNK_TOKEN = '<unk>' PAD_TOKEN = '<pad>' SOS_TOKEN = '<sos>' EOS_TOKEN = '<eos>' def __init__(self, model_dir=None, device=None, **load_kwargs): self._logger = logging.getLogger(__class__.__name__) self.device = device self.best_test_loss = float('inf') if model_dir is not None: self.load_model(model_dir, **load_kwargs) def word2phonemes(self, word: str, lower: bool = True): if lower: word = word.lower() tokenized = WordToPhonemeModel.tokenize_word(word) tokenized = [WordToPhonemeModel.SOS_TOKEN ] + tokenized + [WordToPhonemeModel.EOS_TOKEN] numericalized = [self.src_field.vocab.stoi[t] for t in tokenized] src = torch.LongTensor(numericalized).unsqueeze(1).to(self.device) self.model.eval() output = self.model(src, None, teacher_forcing_ratio=0)[1:] predicted = torch.argmax(output.squeeze(1), 1) tokenized = [ self.trg_field.vocab.itos[int(i)] for i in predicted if i != self.eos_idx ] return tokenized # ------------------------------------------------------------------------- def load_dataset(self, csv_path: str, lower=True) -> None: '''Loads a CSV dataset of the form WORD,PH ON EM ES''' self.src_field = Field(tokenize=WordToPhonemeModel.tokenize_word, init_token=WordToPhonemeModel.SOS_TOKEN, eos_token=WordToPhonemeModel.EOS_TOKEN, lower=lower) self.trg_field = Field(tokenize=WordToPhonemeModel.tokenize_phonemes, init_token=WordToPhonemeModel.SOS_TOKEN, eos_token=WordToPhonemeModel.EOS_TOKEN, lower=lower) self._logger.debug(f'Loading dataset from {csv_path}') self.dataset = TabularDataset(path=csv_path, format='csv', fields=[('src', self.src_field), ('trg', self.trg_field)]) self.train_data, self.test_data = self.dataset.split() self._logger.debug(f'Training examples: {len(self.train_data)}') self._logger.debug(f'Testing examples: {len(self.test_data)}') self._logger.debug( f'Building vocabulary from {len(self.train_data)} example(s)') self.src_field.build_vocab(self.train_data, min_freq=1) self.trg_field.build_vocab(self.train_data, min_freq=1) self.model = self._make_model() self._logger.debug(self.model) # ------------------------------------------------------------------------- def save_vocabulary(self, model_dir): # Source with open(os.path.join(model_dir, 'src-vocab.txt'), 'w') as src_vocab_file: for symbol in self.src_field.vocab.itos: print(symbol, file=src_vocab_file) with open(os.path.join(model_dir, 'src-freqs.txt'), 'w') as src_freq_file: for symbol, count in self.src_field.vocab.freqs.items(): print(symbol, count, file=src_freq_file) # Target with open(os.path.join(model_dir, 'trg-vocab.txt'), 'w') as trg_vocab_file: for symbol in self.trg_field.vocab.itos: print(symbol, file=trg_vocab_file) with open(os.path.join(model_dir, 'trg-freqs.txt'), 'w') as trg_freq_file: for symbol, count in self.trg_field.vocab.freqs.items(): print(symbol, count, file=trg_freq_file) self._logger.debug(f'Saved vocabulary to {model_dir}') # ------------------------------------------------------------------------- def train(self, epochs, save_path, load_previous=True, clip=10, batch_size=128): save_dir = os.path.split(save_path)[0] os.makedirs(save_dir, exist_ok=True) if load_previous and os.path.exists(save_path): self._logger.debug(f'Loading model state from {save_path}') self.model.load_state_dict(torch.load(save_path)) train_iterator, test_iterator = BucketIterator.splits( (self.train_data, self.test_data), batch_size=batch_size, device=self.device, sort_key=lambda x: len(x.src)) optimizer = 
optim.Adam(self.model.parameters()) trg_pad_idx = self.trg_field.vocab.stoi[WordToPhonemeModel.PAD_TOKEN] criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx) # Training loop self._logger.debug(f'Beginning training for {epochs} epoch(s)') for epoch in range(epochs): train_loss = self._train_iter(train_iterator, optimizer, criterion, clip) test_loss = self._evaluate_iter(test_iterator, criterion) if test_loss < self.best_test_loss: # Save model if better self.best_test_loss = test_loss torch.save(self.model.state_dict(), save_path) self._logger.debug( f'| Epoch: {epoch+1:03} | Train Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f} | Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |' ) # Save model torch.save(self.model.state_dict(), save_path) self._logger.info(save_path) return self.best_test_loss # ------------------------------------------------------------------------- def _train_iter(self, iterator, optimizer, criterion, clip) -> float: self.model.train() epoch_loss = 0 for i, batch in enumerate(iterator): src = batch.src trg = batch.trg optimizer.zero_grad() output = self.model(src, trg) #trg = [sent len, batch size] #output = [sent len, batch size, output dim] #reshape to: #trg = [(sent len - 1) * batch size] #output = [(sent len - 1) * batch size, output dim] loss = criterion(output[1:].view(-1, output.shape[2]), trg[1:].view(-1)) loss.backward() torch.nn.utils.clip_grad_norm_(self.model.parameters(), clip) optimizer.step() epoch_loss += loss.item() return epoch_loss / len(iterator) def _evaluate_iter(self, iterator, criterion): self.model.eval() epoch_loss = 0 with torch.no_grad(): for i, batch in enumerate(iterator): src = batch.src trg = batch.trg output = self.model(src, trg, 0) #turn off teacher forcing loss = criterion(output[1:].view(-1, output.shape[2]), trg[1:].view(-1)) epoch_loss += loss.item() return epoch_loss / len(iterator) # ------------------------------------------------------------------------- def load_model(self, model_dir, lower=True, no_state=False): self._logger.debug(f'Loading vocabulary files from {model_dir}') self.src_field = Field(tokenize=WordToPhonemeModel.tokenize_word, init_token=WordToPhonemeModel.SOS_TOKEN, eos_token=WordToPhonemeModel.EOS_TOKEN, lower=lower) self.src_field.vocab = WordToPhonemeModel.load_vocab( os.path.join(model_dir, 'src-freqs.txt')) self.trg_field = Field(tokenize=WordToPhonemeModel.tokenize_phonemes, init_token=WordToPhonemeModel.SOS_TOKEN, eos_token=WordToPhonemeModel.EOS_TOKEN, lower=lower) self.trg_field.vocab = WordToPhonemeModel.load_vocab( os.path.join(model_dir, 'trg-freqs.txt')) self.model = self._make_model() self._logger.debug(self.model) if not no_state: state_path = os.path.join(model_dir, 'g2p-model.pt') if os.path.exists(state_path): self._logger.debug(f'Loading model state from {state_path}') self.model.load_state_dict(torch.load(state_path)) else: self._logger.warning( f'Missing model state file at {state_path}!') # ------------------------------------------------------------------------- def _make_model(self): input_dim = len(self.src_field.vocab) output_dim = len(self.trg_field.vocab) enc_emb_dim = 256 dec_emb_dim = 256 hid_dim = 512 enc_dropout = 0.5 dec_dropout = 0.5 self.sos_idx = self.trg_field.vocab.stoi[WordToPhonemeModel.SOS_TOKEN] self.eos_idx = self.trg_field.vocab.stoi[WordToPhonemeModel.EOS_TOKEN] self.pad_idx = self.src_field.vocab.stoi[WordToPhonemeModel.PAD_TOKEN] enc = Encoder(input_dim, enc_emb_dim, hid_dim, enc_dropout) dec = Decoder(output_dim, 
dec_emb_dim, hid_dim, dec_dropout) return Seq2Seq(enc, dec, self.pad_idx, self.sos_idx, self.eos_idx, self.device) # ------------------------------------------------------------------------- @classmethod def tokenize_word(cls, word): return list(word) @classmethod def tokenize_phonemes(cls, text): return re.split(r'\s+', text) @classmethod def load_vocab(cls, freqs_path): counter = Counter() with open(freqs_path, 'r') as freqs_file: for line in freqs_file: name, freq = re.split(r'\s+', line.strip(), maxsplit=1) counter[name] = int(freq) return Vocab(counter, specials=[ WordToPhonemeModel.UNK_TOKEN, WordToPhonemeModel.PAD_TOKEN, WordToPhonemeModel.SOS_TOKEN, WordToPhonemeModel.EOS_TOKEN ])
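# Hypothetical usage sketch (the model directory name is an assumption, not from the
# original code): load a trained grapheme-to-phoneme model and convert a word.
g2p = WordToPhonemeModel(model_dir='g2p-model-dir', device=torch.device('cpu'))
phonemes = g2p.word2phonemes('hello')
print(phonemes)  # list of predicted phoneme strings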
        super().__init__(examples, fields)

# In[9]:
train = MyDataset(trainset, text_field=TEXT, label_field=LABEL, test=False)
valid = MyDataset(validset, text_field=TEXT, label_field=LABEL, test=False)

# In[10]:
from torchtext.vocab import Vectors

vectors = Vectors(name='./sgns.sogou.word')  # use pretrained word vectors, 300 dimensions
TEXT.build_vocab(train, vectors=vectors)  # build the vocabulary
LABEL.build_vocab(train)

# In[11]:
import torch
from torchtext.data import BucketIterator

batchsize = 64
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iter = data.BucketIterator(dataset=train, batch_size=batchsize, shuffle=True,
                                 sort_key=lambda x: len(x.text), device=DEVICE,
                                 sort_within_batch=False, repeat=False)
valid_iter = data.BucketIterator(dataset=valid, batch_size=batchsize, shuffle=True,
                                 sort_key=lambda x: len(x.text),
german = Field(tokenize=tokenizer_ger, lower=True, init_token="<sos>", eos_token="<eos>")
english = Field(tokenize=tokenizer_eng, lower=True, init_token="<sos>", eos_token="<eos>")

train_data, valid_data, test_data = Multi30k.splits(exts=(".de", ".en"),
                                                    fields=(german, english), root='data')

german.build_vocab(train_data, max_size=10000, min_freq=2)
english.build_vocab(train_data, max_size=10000, min_freq=2)


class Transformer(nn.Module):
    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
fields_dataset = [("query_title", TEXT), ("query_description", TEXT), ("doc_text", TEXT), ("label", LABEL)] train_data = Dataset(torch_examples, fields_dataset) save_examples(train_data, "../traindata.json") exit(0) else: TEXT = Field(tokenize=tokenize_en, batch_first=True, include_lengths=True) LABEL = LabelField(dtype=torch.float, batch_first=True) fields_dataset = [("query_title", TEXT), ("query_description", TEXT), ("doc_text", TEXT), ("label", LABEL)] train_data = Dataset( load_examples("../traindata.json", fields_dataset), fields_dataset) print("build_vocabulary...") TEXT.build_vocab(train_data, min_freq=1, vectors="glove.6B.300d") LABEL.build_vocab(train_data) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') print("build_iterator...") train_iterator, vaild_iterator = BucketIterator.splits( (train_data, train_data), batch_size=64, sort_key=lambda x: len(x.doc_text), sort_within_batch=False, device=device) size_of_vocab = len(TEXT.vocab) embedding_dim = 300 num_hidden_nodes = 128 num_layers = 2 num_output_nodes = 1
src_field = Field(tokenize=english_tokenizer, init_token='<sos>', eos_token='<eos>', lower=True) #initialize the field for trg language trg_field = Field(tokenize=hindi_tokenizer, init_token='<sos>', eos_token='<eos>', lower=True) train_data, valid_data, test_data = load_datasets( model_config['global']['dataset_path'], model_config['global']['dataset_file_names'], model_config['global']['translate_pair'], model_config['global']['lang_extensions'], [src_field, trg_field]) #initialize the vocabulary src_field.build_vocab(train_data, min_freq=1) trg_field.build_vocab(train_data, min_freq=1) #display dataset stats print_dataset_statistics(train_data, valid_data, test_data, model_config['global']['lang_extensions'], [src_field, trg_field]) train_iterator, valid_iterator, test_iterator = BucketIterator.splits( (train_data, valid_data, test_data), batch_size=model_config['global']['batch_size'], device=device) cache_file_name = "%s-%s-%s-epoch-%s.pt" % ( model_config['global']['name'], model_config['global']['lang_extensions'][0], model_config['global']['lang_extensions'][1], model_config['global']['epochs'])
             lower=True, fix_length=MAX_LEN, postprocessing=filter_low_freq_words)
LABEL = Field(sequential=False, use_vocab=False)

# 2. Build the tabular dataset
ds_train, ds_test = TabularDataset.splits(path='./data/', train='train.tsv', test='test.tsv',
                                          format='tsv',
                                          fields=[('label', LABEL), ('text', TEXT)],
                                          skip_header=False)

# 3. Build the vocabulary
TEXT.build_vocab(ds_train)

# 4. Build the data pipeline iterators
train_iter, test_iter = Iterator.splits((ds_train, ds_test),
                                        sort_within_batch=True,
                                        sort_key=lambda x: len(x.text),
                                        batch_sizes=(BATCH_SIZE, BATCH_SIZE),
                                        device='cuda:4')

# Wrap the data pipeline so it yields (features, label) pairs,
# similar to torch.utils.data.DataLoader
class DataLoader:
    def __init__(self, data_iter):
        self.data_iter = data_iter
        self.length = len(data_iter)
def iters(cls, lower=True, example_mode='sentence', use_wiki=False, n_wiki_sentences=5, replace_title_mentions='', batch_size=128, device=-1, root='.data', vectors='glove.6B.300d', unigrams=True, bigrams=False, trigrams=False, combined_ngrams=True, combined_max_vocab_size=None, unigram_max_vocab_size=None, bigram_max_vocab_size=None, trigram_max_vocab_size=None, **kwargs): QNUM = LongField() SENT = LongField() PAGE = Field(sequential=False, tokenize=str_split) if combined_ngrams: tokenizer = create_qb_tokenizer(unigrams=unigrams, bigrams=bigrams, trigrams=trigrams) TEXT = QBTextField(batch_first=True, tokenize=tokenizer, include_lengths=True, lower=lower) train, val, dev = cls.splits( qnum_field=QNUM, sent_field=SENT, text_field=TEXT, page_field=PAGE, root=root, example_mode=example_mode, use_wiki=use_wiki, n_wiki_sentences=n_wiki_sentences, replace_title_mentions=replace_title_mentions, **kwargs) TEXT.build_vocab(train, vectors=vectors, max_size=combined_max_vocab_size) PAGE.build_vocab(train) else: if unigrams: unigram_tokenizer = create_qb_tokenizer(unigrams=True, bigrams=False, trigrams=False) UNIGRAM_TEXT = QBTextField(batch_first=True, tokenize=unigram_tokenizer, include_lengths=True, lower=lower) else: UNIGRAM_TEXT = None if bigrams: bigram_tokenizer = create_qb_tokenizer(unigrams=False, bigrams=True, trigrams=False) BIGRAM_TEXT = QBTextField(batch_first=True, tokenize=bigram_tokenizer, include_lengths=True, lower=lower) else: BIGRAM_TEXT = None if trigrams: trigram_tokenizer = create_qb_tokenizer(unigrams=False, bigrams=False, trigrams=True) TRIGRAM_TEXT = QBTextField(batch_first=True, tokenize=trigram_tokenizer, include_lengths=True, lower=lower) else: TRIGRAM_TEXT = None train, val, dev = cls.splits( qnum_field=QNUM, sent_field=SENT, page_field=PAGE, unigram_field=UNIGRAM_TEXT, bigram_field=BIGRAM_TEXT, trigram_field=TRIGRAM_TEXT, root=root, example_mode=example_mode, use_wiki=use_wiki, n_wiki_sentences=n_wiki_sentences, replace_title_mentions=replace_title_mentions, **kwargs) if UNIGRAM_TEXT is not None: UNIGRAM_TEXT.build_vocab(train, vectors=vectors, max_size=unigram_max_vocab_size) if BIGRAM_TEXT is not None: BIGRAM_TEXT.build_vocab(train, max_size=bigram_max_vocab_size) if TRIGRAM_TEXT is not None: TRIGRAM_TEXT.build_vocab(train, max_size=trigram_max_vocab_size) PAGE.build_vocab(train) return BucketIterator.splits((train, val, dev), batch_size=batch_size, device=-1, repeat=False)
for src_line, trg_line in zip(src_file, trg_file):
    src_line, trg_line = src_line.strip(), trg_line.strip()
    if src_line != '' and trg_line != '':
        # TODO: note how `fields` is used below
        temp = data.Example.fromlist([src_line, trg_line], fields)
        examples.append(temp)

print(vars(examples[0]))

# Build a Dataset from the collected examples to obtain trainData
allData = tDataset(examples, fields)
trainData, validData, testData = allData.split(split_ratio=[0.8, 0.1, 0.1])  # then split into three parts

# Build the vocabulary from the training data (trainData) only
SRC.build_vocab(trainData)
TRG.build_vocab(trainData)

BATCH_SIZE = 12
device = t.device("cuda:0" if t.cuda.is_available() else "cpu")
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (trainData, validData, testData), batch_size=BATCH_SIZE, sort=False, device=device)

INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
def test_single_gpu_batch_parse(): trainer = Trainer(gpus=1) trainer.accelerator_backend = GPUAccelerator(trainer) # non-transferrable types primitive_objects = [ None, {}, [], 1.0, "x", [None, 2], { "x": (1, 2), "y": None } ] for batch in primitive_objects: data = trainer.accelerator_backend.batch_to_device( batch, torch.device('cuda:0')) assert data == batch # batch is just a tensor batch = torch.rand(2, 3) batch = trainer.accelerator_backend.batch_to_device( batch, torch.device('cuda:0')) assert batch.device.index == 0 and batch.type() == 'torch.cuda.FloatTensor' # tensor list batch = [torch.rand(2, 3), torch.rand(2, 3)] batch = trainer.accelerator_backend.batch_to_device( batch, torch.device('cuda:0')) assert batch[0].device.index == 0 and batch[0].type( ) == 'torch.cuda.FloatTensor' assert batch[1].device.index == 0 and batch[1].type( ) == 'torch.cuda.FloatTensor' # tensor list of lists batch = [[torch.rand(2, 3), torch.rand(2, 3)]] batch = trainer.accelerator_backend.batch_to_device( batch, torch.device('cuda:0')) assert batch[0][0].device.index == 0 and batch[0][0].type( ) == 'torch.cuda.FloatTensor' assert batch[0][1].device.index == 0 and batch[0][1].type( ) == 'torch.cuda.FloatTensor' # tensor dict batch = [{'a': torch.rand(2, 3), 'b': torch.rand(2, 3)}] batch = trainer.accelerator_backend.batch_to_device( batch, torch.device('cuda:0')) assert batch[0]['a'].device.index == 0 and batch[0]['a'].type( ) == 'torch.cuda.FloatTensor' assert batch[0]['b'].device.index == 0 and batch[0]['b'].type( ) == 'torch.cuda.FloatTensor' # tuple of tensor list and list of tensor dict batch = ([torch.rand(2, 3) for _ in range(2)], [{ 'a': torch.rand(2, 3), 'b': torch.rand(2, 3) } for _ in range(2)]) batch = trainer.accelerator_backend.batch_to_device( batch, torch.device('cuda:0')) assert batch[0][0].device.index == 0 and batch[0][0].type( ) == 'torch.cuda.FloatTensor' assert batch[1][0]['a'].device.index == 0 assert batch[1][0]['a'].type() == 'torch.cuda.FloatTensor' assert batch[1][0]['b'].device.index == 0 assert batch[1][0]['b'].type() == 'torch.cuda.FloatTensor' # namedtuple of tensor BatchType = namedtuple('BatchType', ['a', 'b']) batch = [ BatchType(a=torch.rand(2, 3), b=torch.rand(2, 3)) for _ in range(2) ] batch = trainer.accelerator_backend.batch_to_device( batch, torch.device('cuda:0')) assert batch[0].a.device.index == 0 assert batch[0].a.type() == 'torch.cuda.FloatTensor' # non-Tensor that has `.to()` defined class CustomBatchType: def __init__(self): self.a = torch.rand(2, 2) def to(self, *args, **kwargs): self.a = self.a.to(*args, **kwargs) return self batch = trainer.accelerator_backend.batch_to_device( CustomBatchType(), torch.device('cuda:0')) assert batch.a.type() == 'torch.cuda.FloatTensor' # torchtext.data.Batch samples = [{ 'text': 'PyTorch Lightning is awesome!', 'label': 0 }, { 'text': 'Please make it work with torchtext', 'label': 1 }] text_field = Field() label_field = LabelField() fields = {'text': ('text', text_field), 'label': ('label', label_field)} examples = [Example.fromdict(sample, fields) for sample in samples] dataset = Dataset(examples=examples, fields=fields.values()) # Batch runs field.process() that numericalizes tokens, but it requires to build dictionary first text_field.build_vocab(dataset) label_field.build_vocab(dataset) batch = Batch(data=examples, dataset=dataset) batch = trainer.accelerator_backend.batch_to_device( batch, torch.device('cuda:0')) assert batch.text.type() == 'torch.cuda.LongTensor' assert batch.label.type() == 'torch.cuda.LongTensor'
                            eos_token='<eos>', lower=True)

train_data, valid_data, test_data = TranslationDataset.splits(
    path='IITB_small', validation='dev', exts=('.en', '.hi'), fields=(SRC, TRG))

print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

vars(train_data.examples[0])

SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2, specials=['<pad>', '<sop>', '<eop>'])

print(f"Unique tokens in source (en) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (hi) vocabulary: {len(TRG.vocab)}")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

BATCH_SIZE = 2
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE, device=device)

"""# EnCoder Parameters"""
                              'val')
from_txt_to_dataframe_and_csv('toy-revert', 'src-test.txt', 'tgt-test.txt', 'test')

data_fields = [('src', TEXT), ('trg', TRG_TEXT)]

# load the dataset in csv format
train_data, valid_data, test_data = TabularDataset.splits(
    path='toy-revert', train='train.csv', validation='val.csv', test='test.csv',
    format='csv', fields=data_fields, skip_header=True)

TEXT.build_vocab(train_data)
TRG_TEXT.build_vocab(train_data)
SRC, TRG = TEXT, TRG_TEXT

BATCH_SIZE = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device)

#######################################
class SequenceDataLoader(CommonDataLoader): def __init__(self, data_config): super(SequenceDataLoader, self).__init__(data_config) self.__build_field() self._load_data() pass def __build_field(self): self.TEXT = Field(sequential=True, use_vocab=True, tokenize=tokenizer, include_lengths=True) self.TAG = Field(sequential=True, use_vocab=True, tokenize=tokenizer, is_target=True) self._fields = [ ('text', self.TEXT), ('tag', self.TAG) ] self._fields_test = [('text', self.TEXT)] pass @timeit def _load_data(self): self.train_data = REDataset(path=self._config.data.chip_relation.train_path, fields=self._fields) self.valid_data = REDataset(path=self._config.data.chip_relation.valid_path, fields=self._fields) self.test_data = REDataset(path=self._config.data.chip_relation.test_path, fields=self._fields_test) self.__build_vocab(self.train_data, self.valid_data, self.test_data) self.__build_iterator(self.train_data, self.valid_data, self.test_data) pass def __build_vocab(self, *dataset): """ :param dataset: train_data, valid_data, test_data :return: text_vocab, tag_vocab """ self.TEXT.build_vocab(*dataset) self.TAG.build_vocab(*dataset[:-1]) self.word_vocab = self.TEXT.vocab self.tag_vocab = self.TAG.vocab pass def __build_iterator(self, *dataset): self._train_iter = BucketIterator( dataset[0], batch_size=self._config.data.train_batch_size, shuffle=True, sort_key=lambda x: len(x.text), sort_within_batch=True, device=self._config.device) self._valid_iter = BucketIterator( dataset[1], batch_size=self._config.data.train_batch_size, shuffle=False, sort_key=lambda x: len(x.text), sort_within_batch=True, device=self._config.device) self._test_iter = BucketIterator( dataset[2], batch_size=self._config.data.train_batch_size, shuffle=False, sort_key=lambda x: len(x.text), sort_within_batch=True, device=self._config.device) def load_train(self): return self._train_iter pass def load_test(self): return self._test_iter pass def load_valid(self): return self._valid_iter pass
print('load data')
configfile = open('../config.yaml')
config = AttrDict(yaml.load(configfile, Loader=yaml.FullLoader))
trainSet = TIMIT(config.data.data_root, mode='train')
devSet = TIMIT(config.data.data_root, mode='test')

TEXT = Field(lower=True, include_lengths=True, batch_first=True, unk_token=None)

print('build vocab')
sents = ['iy', 'ix', 'eh', 'ae', 'ax', 'uw', 'uh', 'ao', 'ey', 'ay', 'oy', 'aw', 'ow',
         'er', 'l', 'r', 'w', 'y', 'm', 'n', 'ng', 'v', 'f', 'dh', 'th', 'z', 's', 'zh',
         'jh', 'ch', 'b', 'p', 'd', 'dx', 't', 'g', 'k', 'hh', 'h#']
sents = [[i] for i in sents]
TEXT.build_vocab(sents, specials=['<blank>'])
assert config.data.vocabSize == len(TEXT.vocab)
assert config.data.pad_idx == TEXT.vocab.stoi['<pad>']
assert config.data.blank_idx == TEXT.vocab.stoi['<blank>']


def my_collate(batch):
    '''
    inputs: [N,L]
    targets: [N,L]
    '''
    txt_seqs, seqs_len = TEXT.process([item[1] for item in batch])
    inputs = txt_seqs[:, :-1]
    targets = txt_seqs[:, 1:]
    return {'inputs': inputs, 'targets': targets}
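# Hedged usage sketch (not part of the original snippet; the batch size here is arbitrary):
# wire my_collate into a standard torch DataLoader so each batch yields shifted
# input/target phoneme sequences.
from torch.utils.data import DataLoader as TorchDataLoader

train_loader = TorchDataLoader(trainSet, batch_size=32, shuffle=True, collate_fn=my_collate)
for batch in train_loader:
    inputs, targets = batch['inputs'], batch['targets']  # both [N, L-1]
    break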
class DataLoader(): """ This is the dataloader class that takes in a path and return a generator that could be iterated through init: path: path of the data to read in (assumes CSV format) config: a Config object that contains the parameters to be used shuffle: whether to shuffle the data or not (true by default) """ def __init__(self, config, split, type_="train", lang="en"): assert config.extension in ["json"] # Only supports csv now self.config = config self.extension = self.config.extension self.max_length = self.config.max_length self.max_tweets = self.config.max_tweets self.lang = lang if self.lang == "zh": print("Doing RD for chinese") nlp = nlp_chinese # <------------ Running some defined functions -----------> if type_ == "train": # self.data_folder_path = self.config.data_folder + "_{}/".format(split) self.data_folder_path = self.config.data_folder self.train_file_path = self.config.train_file_path self.test_1_file_path = self.config.test_1_file_path self.test_2_file_path = self.config.test_2_file_path self.run_pipeline() def get_data(self, type_, return_id=False): assert type_ in ["train", "train_test", "test_1", "test_2", "test"] max_batch_size = self.config.batch_size if type_ == "train" else self.config.batch_size_test if type_ == "train_test" else self.config.batch_size_test if type_ == "test" else self.config.batch_size_test if type_ == "test_1" else self.config.batch_size_test if type_ == "test_2" else "something is wrong" data = self.train_batch if type_ == "train" else self.train_test_batch if type_ == "train_test" else self.test_batch if type_ == "test" else self.test_1_batch if type_ == "test_1" else self.test_2_batch if type_ == "test_2" else "something is wrong" for batch in data: id_ = getattr(batch, self.config.keys_order["post_id"]) X = getattr(batch, self.config.keys_order["content"]) y = getattr(batch, self.config.keys_order["label"]) structure = getattr(batch, self.config.keys_order["structure"]) time_delay = getattr(batch, self.config.keys_order["time_delay"]) # <-------- Getting the sizes ---------> batch_size, num_articles, num_words, = X.shape # <-------- Getting the word_pos tensor ---------> word_pos = np.repeat(np.expand_dims(np.repeat(np.expand_dims( np.arange(num_words), axis=0), num_articles, axis=0), axis=0), batch_size, axis=0) word_pos = torch.from_numpy(word_pos) # <-------- Getting the attention_mask vector (for words) ---------> # The mask has 1 for real tokens and 0 for padding / unknown tokens. Only real tokens + last pad are attended to # <pad> has an index of 1 attention_mask_word = torch.where( (X == 1), torch.zeros(1), torch.ones(1)).type(torch.FloatTensor) check = torch.sum(torch.where((X == 1), torch.ones(1), torch.zeros(1)), dim=-1) # <-------- Getting the attention_mask vector (for posts) ---------> attention_mask_post = torch.where( (check == self.config.max_length), torch.zeros(1), torch.ones(1)).type(torch.FloatTensor) if batch_size >= len(self.config.gpu_idx): if return_id: yield id_, X, y, word_pos, time_delay, structure, attention_mask_word, attention_mask_post else: yield X, y, word_pos, time_delay, structure, attention_mask_word, attention_mask_post @staticmethod def clean_text(text): """ This function cleans the text in the following ways: 1. Replace websites with URL 1. 
Replace 's with <space>'s (eg, her's --> her 's) """ text = re.sub( r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", "URL", text) # Replace urls with special token text = text.replace("\'s", "") text = text.replace("\'", "") text = text.replace("n\'t", " n\'t") text = text.replace("@", "") text = text.replace("#", "") text = text.replace("_", " ") text = text.replace("-", " ") text = text.replace("&", "") text = text.replace(">", "") text = text.replace("\"", "") text = text.replace(".", "") text = text.replace(",", "") text = text.replace("(", "") text = text.replace(")", "") text = ' '.join(text.split()) return text.strip() @staticmethod def clean_tokenized_text(text_lst): if len(text_lst) <= 1: return text_lst idx = 0 cleaned_token_lst = [] while idx < len(text_lst) - 1: current_token = text_lst[idx] next_token = text_lst[idx + 1] if current_token != next_token: cleaned_token_lst.append(current_token) idx += 1 else: last_idx = max([ i + idx for i, val in enumerate(text_lst[idx:]) if val == current_token ]) + 1 cleaned_token_lst.append(current_token) idx = last_idx if cleaned_token_lst[-1] != text_lst[-1]: cleaned_token_lst.append(text_lst[-1]) return cleaned_token_lst @staticmethod def tokenize_structure(structure_lst): return structure_lst @staticmethod def tokenize_text(text): text = DataLoader.clean_text(text) token_lst = [token.text.lower() for token in nlp(text)] token_lst = DataLoader.clean_tokenized_text(token_lst) return token_lst # Step 1: Define the data fields def define_fields(self): self.id_field = Field(sequential=False, tokenize=lambda x: x, use_vocab=True) self.tweet_field = Field(sequential=True, tokenize=DataLoader.tokenize_text, include_lengths=False, lower=True, fix_length=self.max_length, use_vocab=True) self.timestamp_field = Field(sequential=False, include_lengths=False, use_vocab=False) self.structure_field = Field( sequential=True, tokenize=lambda x: DataLoader.tokenize_structure(x), include_lengths=False, fix_length=self.config.max_tweets, pad_token=self.config.num_structure_index, use_vocab=False) self.label_field = Field(sequential=False, use_vocab=False) self.tweet_lst_field = NestedField(self.tweet_field, fix_length=self.config.max_tweets) self.timestamp_lst_field = NestedField( self.timestamp_field, pad_token=str(self.config.size), fix_length=self.config.max_tweets) self.structure_lst_field = NestedField( self.structure_field, fix_length=self.config.max_tweets) data_fields = {} for key, val in self.config.keys_order.items(): if key == "post_id": data_fields[val] = (val, self.id_field) if key == "content": data_fields[val] = (val, self.tweet_lst_field) elif key == "label": data_fields[val] = (val, self.label_field) elif key == "time_delay": data_fields[val] = (val, self.timestamp_lst_field) elif key == "structure": data_fields[val] = (val, self.structure_lst_field) self.data_fields = data_fields # Step 2: Reading the data def read_data(self, path): data = TabularDataset(path=path, format=self.extension, fields=self.data_fields) return data # Step 3: Building the vectors def build_vectors(self): # specify the path to the localy saved vectors (Glove in this case) vec = vocab.Vectors(name=self.config.glove_file, cache=self.config.glove_directory) self.id_field.build_vocab( getattr(self.train, self.config.keys_order["post_id"]), getattr(self.test_1, self.config.keys_order["post_id"]), getattr(self.test_2, self.config.keys_order["post_id"])) # Build the vocabulary (for tweets) using the train and test dataset 
self.tweet_field.build_vocab( getattr(self.train, self.config.keys_order["content"]), getattr(self.test_1, self.config.keys_order["content"]), getattr(self.test_2, self.config.keys_order["content"]), max_size=self.config.max_vocab, vectors=vec) # Step 4: Loading the data in batches def load_batches(self, dataset, batch_size): data = BucketIterator.splits( datasets=(dataset, ), # specify data batch_sizes=(batch_size, ), # batch size sort_key=lambda x: len( getattr(x, self.config.keys_order["content"]) ), # on what attribute the text should be sorted sort_within_batch=True, repeat=False) return data[0] def load_vocab_vectors(self, vocab): self.tweet_field.vocab = vocab def run_pipeline(self): """ Pipeline to run all the necessary steps in sequence Note: DO NOT CHANGE THE SEQUENCE OF EXECUTION """ # Step 1 : Define the fields self.define_fields() # Step 2: Read data self.train = self.read_data( os.path.join(self.data_folder_path, self.train_file_path)) self.test_1 = self.read_data( os.path.join(self.data_folder_path, self.test_1_file_path)) self.test_2 = self.read_data( os.path.join(self.data_folder_path, self.test_2_file_path)) # Step 3: Building the vectors self.build_vectors() # Step 4: Batching the data self.train_batch = self.load_batches(self.train, self.config.batch_size) self.train_test_batch = self.load_batches(self.train, self.config.batch_size_test) self.test_1_batch = self.load_batches(self.test_1, self.config.batch_size_test) self.test_2_batch = self.load_batches(self.test_2, self.config.batch_size_test)
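# Hedged usage sketch (the Config instance comes from elsewhere in the project and is an
# assumption here, as is the split argument): build the loader, run its pipeline, and
# iterate over training batches.
loader = DataLoader(config, split=0, type_="train", lang="en")
for X, y, word_pos, time_delay, structure, mask_word, mask_post in loader.get_data("train"):
    pass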
def run(self): print("Running on", self.a.device) self.set_device(self.a.device) np.random.seed(self.a.seed) torch.manual_seed(self.a.seed) torch.backends.cudnn.benchmark = True #################### loading event extraction dataset #################### if self.a.train_ee: log('loading event extraction corpus from %s' % self.a.train_ee) # both for grounding and ee WordsField = Field(lower=True, include_lengths=True, batch_first=True) PosTagsField = Field(lower=True, batch_first=True) EntityLabelsField = MultiTokenField(lower=False, batch_first=True) AdjMatrixField = SparseField(sequential=False, use_vocab=False, batch_first=True) EntitiesField = EntityField(lower=False, batch_first=True, use_vocab=False) # only for ee LabelField = Field(lower=False, batch_first=True, pad_token='0', unk_token=None) EventsField = EventField(lower=False, batch_first=True) if self.a.amr: colcc = 'amr-colcc' else: colcc = 'stanford-colcc' print(colcc) train_ee_set = ACE2005Dataset(path=self.a.train_ee, fields={"words": ("WORDS", WordsField), "pos-tags": ("POSTAGS", PosTagsField), "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField), colcc: ("ADJM", AdjMatrixField), "golden-event-mentions": ("LABEL", LabelField), "all-events": ("EVENT", EventsField), "all-entities": ("ENTITIES", EntitiesField)}, amr=self.a.amr, keep_events=1) dev_ee_set = ACE2005Dataset(path=self.a.dev_ee, fields={"words": ("WORDS", WordsField), "pos-tags": ("POSTAGS", PosTagsField), "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField), colcc: ("ADJM", AdjMatrixField), "golden-event-mentions": ("LABEL", LabelField), "all-events": ("EVENT", EventsField), "all-entities": ("ENTITIES", EntitiesField)}, amr=self.a.amr, keep_events=0) test_ee_set = ACE2005Dataset(path=self.a.test_ee, fields={"words": ("WORDS", WordsField), "pos-tags": ("POSTAGS", PosTagsField), "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField), colcc: ("ADJM", AdjMatrixField), "golden-event-mentions": ("LABEL", LabelField), "all-events": ("EVENT", EventsField), "all-entities": ("ENTITIES", EntitiesField)}, amr=self.a.amr, keep_events=0) if self.a.webd: pretrained_embedding = Vectors(self.a.webd, ".", unk_init=partial(torch.nn.init.uniform_, a=-0.15, b=0.15)) LabelField.build_vocab(train_ee_set.LABEL, dev_ee_set.LABEL, vectors=pretrained_embedding) EventsField.build_vocab(train_ee_set.EVENT, dev_ee_set.EVENT, vectors=pretrained_embedding) else: LabelField.build_vocab(train_ee_set.LABEL, dev_ee_set.LABEL) EventsField.build_vocab(train_ee_set.EVENT, dev_ee_set.EVENT) # add role mask self.a.role_mask = event_role_mask(self.a.train_ee, self.a.dev_ee, LabelField.vocab.stoi, EventsField.vocab.stoi, self.device) #################### loading SR dataset #################### # both for grounding and sr if self.a.train_sr: log('loading corpus from %s' % self.a.train_sr) transform = transforms.Compose([ transforms.Resize(256), transforms.RandomHorizontalFlip(), transforms.RandomCrop(224), transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))]) vocab_noun = Vocab(os.path.join(self.a.vocab, 'vocab_situation_noun.pkl'), load=True) vocab_role = Vocab(os.path.join(self.a.vocab, 'vocab_situation_role.pkl'), load=True) vocab_verb = Vocab(os.path.join(self.a.vocab, 'vocab_situation_verb.pkl'), load=True) # train_sr_loader = imsitu_loader(self.a.image_dir, self.vocab_noun, self.vocab_role, self.vocab_verb, self.a.imsitu_ontology_file, # self.a.train_sr, self.a.verb_mapping_file, self.a.role_mapping_file, # self.a.object_class_map_file, 
self.a.object_detection_pkl_file, # self.a.object_detection_threshold, # transform, self.a.batch, shuffle=self.a.shuffle, num_workers=1) #self.a.shuffle # dev_sr_loader = imsitu_loader(self.a.image_dir, self.vocab_noun, self.vocab_role, self.vocab_verb, self.a.imsitu_ontology_file, # self.a.dev_sr, self.a.verb_mapping_file, self.a.role_mapping_file, # self.a.object_class_map_file, self.a.object_detection_pkl_file, # self.a.object_detection_threshold, # transform, self.a.batch, shuffle=self.a.shuffle, num_workers=1) # test_sr_loader = imsitu_loader(self.a.image_dir, self.vocab_noun, self.vocab_role, self.vocab_verb, self.a.imsitu_ontology_file, # self.a.test_sr, self.a.verb_mapping_file, self.a.role_mapping_file, # self.a.object_class_map_file, self.a.object_detection_pkl_file, # self.a.object_detection_threshold, # transform, self.a.batch, shuffle=self.a.shuffle, num_workers=1) train_sr_set = ImSituDataset(self.a.image_dir, vocab_noun, vocab_role, vocab_verb, LabelField.vocab.stoi, EventsField.vocab.stoi, self.a.imsitu_ontology_file, self.a.train_sr, self.a.verb_mapping_file, self.a.object_class_map_file, self.a.object_detection_pkl_file, self.a.object_detection_threshold, transform, filter_irrelevant_verbs=self.a.filter_irrelevant_verbs, load_object=self.a.add_object, filter_place=self.a.filter_place) dev_sr_set = ImSituDataset(self.a.image_dir, vocab_noun, vocab_role, vocab_verb, LabelField.vocab.stoi, EventsField.vocab.stoi, self.a.imsitu_ontology_file, self.a.dev_sr, self.a.verb_mapping_file, self.a.object_class_map_file, self.a.object_detection_pkl_file, self.a.object_detection_threshold, transform, filter_irrelevant_verbs=self.a.filter_irrelevant_verbs, load_object=self.a.add_object, filter_place=self.a.filter_place) test_sr_set = ImSituDataset(self.a.image_dir, vocab_noun, vocab_role, vocab_verb, LabelField.vocab.stoi, EventsField.vocab.stoi, self.a.imsitu_ontology_file, self.a.test_sr, self.a.verb_mapping_file, self.a.object_class_map_file, self.a.object_detection_pkl_file, self.a.object_detection_threshold, transform, filter_irrelevant_verbs=self.a.filter_irrelevant_verbs, load_object=self.a.add_object, filter_place=self.a.filter_place) #################### loading grounding dataset #################### if self.a.train_grounding: log('loading grounding corpus from %s' % self.a.train_grounding) # only for grounding IMAGEIDField = SparseField(sequential=False, use_vocab=False, batch_first=True) SENTIDField = SparseField(sequential=False, use_vocab=False, batch_first=True) # IMAGEField = SparseField(sequential=False, use_vocab=False, batch_first=True) train_grounding_set = GroundingDataset(path=self.a.train_grounding, img_dir=self.a.img_dir_grounding, fields={"id": ("IMAGEID", IMAGEIDField), "sentence_id": ("SENTID", SENTIDField), "words": ("WORDS", WordsField), "pos-tags": ("POSTAGS", PosTagsField), "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField), colcc: ("ADJM", AdjMatrixField), "all-entities": ("ENTITIES", EntitiesField), # "image": ("IMAGE", IMAGEField), }, transform=transform, amr=self.a.amr, load_object=self.a.add_object, object_ontology_file=self.a.object_class_map_file, object_detection_pkl_file=self.a.object_detection_pkl_file_g, object_detection_threshold=self.a.object_detection_threshold, ) dev_grounding_set = GroundingDataset(path=self.a.dev_grounding, img_dir=self.a.img_dir_grounding, fields={"id": ("IMAGEID", IMAGEIDField), "sentence_id": ("SENTID", SENTIDField), "words": ("WORDS", WordsField), "pos-tags": ("POSTAGS", PosTagsField), 
"golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField), colcc: ("ADJM", AdjMatrixField), "all-entities": ("ENTITIES", EntitiesField), # "image": ("IMAGE", IMAGEField), }, transform=transform, amr=self.a.amr, load_object=self.a.add_object, object_ontology_file=self.a.object_class_map_file, object_detection_pkl_file=self.a.object_detection_pkl_file_g, object_detection_threshold=self.a.object_detection_threshold, ) test_grounding_set = GroundingDataset(path=self.a.test_grounding, img_dir=self.a.img_dir_grounding, fields={"id": ("IMAGEID", IMAGEIDField), "sentence_id": ("SENTID", SENTIDField), "words": ("WORDS", WordsField), "pos-tags": ("POSTAGS", PosTagsField), "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField), colcc: ("ADJM", AdjMatrixField), "all-entities": ("ENTITIES", EntitiesField), # "image": ("IMAGE", IMAGEField), }, transform=transform, amr=self.a.amr, load_object=self.a.add_object, object_ontology_file=self.a.object_class_map_file, object_detection_pkl_file=self.a.object_detection_pkl_file_g, object_detection_threshold=self.a.object_detection_threshold, ) #################### build vocabulary #################### if self.a.webd: pretrained_embedding = Vectors(self.a.webd, ".", unk_init=partial(torch.nn.init.uniform_, a=-0.15, b=0.15)) WordsField.build_vocab(train_ee_set.WORDS, dev_ee_set.WORDS, train_grounding_set.WORDS, dev_grounding_set.WORDS, vectors=pretrained_embedding) else: WordsField.build_vocab(train_ee_set.WORDS, dev_ee_set.WORDS, train_grounding_set.WORDS, dev_grounding_set.WORDS) PosTagsField.build_vocab(train_ee_set.POSTAGS, dev_ee_set.POSTAGS, train_grounding_set.POSTAGS, dev_grounding_set.POSTAGS) EntityLabelsField.build_vocab(train_ee_set.ENTITYLABELS, dev_ee_set.ENTITYLABELS, train_grounding_set.ENTITYLABELS, dev_grounding_set.ENTITYLABELS) consts.O_LABEL = LabelField.vocab.stoi[consts.O_LABEL_NAME] # print("O label is", consts.O_LABEL) consts.ROLE_O_LABEL = EventsField.vocab.stoi[consts.ROLE_O_LABEL_NAME] # print("O label for AE is", consts.ROLE_O_LABEL) dev_ee_set1 = ACE2005Dataset(path=self.a.dev_ee, fields={"words": ("WORDS", WordsField), "pos-tags": ("POSTAGS", PosTagsField), "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField), colcc: ("ADJM", AdjMatrixField), "golden-event-mentions": ("LABEL", LabelField), "all-events": ("EVENT", EventsField), "all-entities": ("ENTITIES", EntitiesField)}, amr=self.a.amr, keep_events=1, only_keep=True) test_ee_set1 = ACE2005Dataset(path=self.a.test_ee, fields={"words": ("WORDS", WordsField), "pos-tags": ("POSTAGS", PosTagsField), "golden-entity-mentions": ("ENTITYLABELS", EntityLabelsField), colcc: ("ADJM", AdjMatrixField), "golden-event-mentions": ("LABEL", LabelField), "all-events": ("EVENT", EventsField), "all-entities": ("ENTITIES", EntitiesField)}, amr=self.a.amr, keep_events=1, only_keep=True) print("train set length", len(train_ee_set)) print("dev set length", len(dev_ee_set)) print("dev set 1/1 length", len(dev_ee_set1)) print("test set length", len(test_ee_set)) print("test set 1/1 length", len(test_ee_set1)) # sr model initialization if not self.a.sr_hps_path: self.a.sr_hps = eval(self.a.sr_hps) embeddingMatrix_noun = torch.FloatTensor(np.load(self.a.wnebd)).to(self.device) embeddingMatrix_verb = torch.FloatTensor(np.load(self.a.wvebd)).to(self.device) embeddingMatrix_role = torch.FloatTensor(np.load(self.a.wrebd)).to(self.device) if "wvemb_size" not in self.a.sr_hps: self.a.sr_hps["wvemb_size"] = len(vocab_verb.id2word) if "wremb_size" not in self.a.sr_hps: self.a.sr_hps["wremb_size"] = 
len(vocab_role.id2word) if "wnemb_size" not in self.a.sr_hps: self.a.sr_hps["wnemb_size"] = len(vocab_noun.id2word) self.a.ee_label_weight = torch.ones([len(LabelField.vocab.itos)]) * 5 self.a.ee_label_weight[consts.O_LABEL] = 1.0 self.a.ee_arg_weight = torch.ones([len(EventsField.vocab.itos)]) * 5 self.a.ee_hps = eval(self.a.ee_hps) if "wemb_size" not in self.a.ee_hps: self.a.ee_hps["wemb_size"] = len(WordsField.vocab.itos) if "pemb_size" not in self.a.ee_hps: self.a.ee_hps["pemb_size"] = len(PosTagsField.vocab.itos) if "psemb_size" not in self.a.ee_hps: # self.a.ee_hps["psemb_size"] = max([train_grounding_set.longest(), dev_grounding_set.longest(), test_grounding_set.longest()]) + 2 self.a.ee_hps["psemb_size"] = max([train_ee_set.longest(), dev_ee_set.longest(), test_ee_set.longest(), train_grounding_set.longest(), dev_grounding_set.longest(), test_grounding_set.longest()]) + 2 if "eemb_size" not in self.a.ee_hps: self.a.ee_hps["eemb_size"] = len(EntityLabelsField.vocab.itos) if "oc" not in self.a.ee_hps: self.a.ee_hps["oc"] = len(LabelField.vocab.itos) if "ae_oc" not in self.a.ee_hps: self.a.ee_hps["ae_oc"] = len(EventsField.vocab.itos) if "oc" not in self.a.sr_hps: self.a.sr_hps["oc"] = len(LabelField.vocab.itos) if "ae_oc" not in self.a.sr_hps: self.a.sr_hps["ae_oc"] = len(EventsField.vocab.itos) ee_tester = EDTester(LabelField.vocab.itos, EventsField.vocab.itos, self.a.ignore_time_test) sr_tester = SRTester() g_tester = GroundingTester() j_tester = JointTester(self.a.ignore_place_sr_test, self.a.ignore_time_test) ace_classifier = ACEClassifier(2 * self.a.ee_hps["lstm_dim"], self.a.ee_hps["oc"], self.a.ee_hps["ae_oc"], self.device) if self.a.finetune_ee: log('init ee model from ' + self.a.finetune_ee) ee_model = load_ee_model(self.a.ee_hps, self.a.finetune_ee, WordsField.vocab.vectors, self.device, ace_classifier) log('ee model loaded, there are %i sets of params' % len(ee_model.parameters_requires_grads())) else: ee_model = load_ee_model(self.a.ee_hps, None, WordsField.vocab.vectors, self.device, ace_classifier) log('ee model created from scratch, there are %i sets of params' % len(ee_model.parameters_requires_grads())) if self.a.finetune_sr: log('init sr model from ' + self.a.finetune_sr) sr_model = load_sr_model(self.a.sr_hps, embeddingMatrix_noun, embeddingMatrix_verb, embeddingMatrix_role, self.a.finetune_sr, self.device, ace_classifier, add_object=self.a.add_object, load_partial=True) log('sr model loaded, there are %i sets of params' % len(sr_model.parameters_requires_grads())) else: sr_model = load_sr_model(self.a.sr_hps, embeddingMatrix_noun, embeddingMatrix_verb, embeddingMatrix_role, None, self.device, ace_classifier, add_object=self.a.add_object, load_partial=True) log('sr model created from scratch, there are %i sets of params' % len(sr_model.parameters_requires_grads())) model = GroundingModel(ee_model, sr_model, self.get_device()) # ee_model = torch.nn.DataParallel(ee_model) # sr_model = torch.nn.DataParallel(sr_model) # model = torch.nn.DataParallel(model) if self.a.optimizer == "adadelta": optimizer_constructor = partial(torch.optim.Adadelta, params=model.parameters_requires_grads(), weight_decay=self.a.l2decay) elif self.a.optimizer == "adam": optimizer_constructor = partial(torch.optim.Adam, params=model.parameters_requires_grads(), weight_decay=self.a.l2decay) else: optimizer_constructor = partial(torch.optim.SGD, params=model.parameters_requires_grads(), weight_decay=self.a.l2decay, momentum=0.9) log('optimizer in use: %s' % str(self.a.optimizer)) if not 
os.path.exists(self.a.out): os.mkdir(self.a.out) with open(os.path.join(self.a.out, "word.vec"), "wb") as f: pickle.dump(WordsField.vocab, f) with open(os.path.join(self.a.out, "pos.vec"), "wb") as f: pickle.dump(PosTagsField.vocab.stoi, f) with open(os.path.join(self.a.out, "entity.vec"), "wb") as f: pickle.dump(EntityLabelsField.vocab.stoi, f) with open(os.path.join(self.a.out, "label.vec"), "wb") as f: pickle.dump(LabelField.vocab.stoi, f) with open(os.path.join(self.a.out, "role.vec"), "wb") as f: pickle.dump(EventsField.vocab.stoi, f) with open(os.path.join(self.a.out, "ee_hyps.json"), "w") as f: json.dump(self.a.ee_hps, f) with open(os.path.join(self.a.out, "sr_hyps.json"), "w") as f: json.dump(self.a.sr_hps, f) log('init complete\n') # ee mappings self.a.ee_word_i2s = WordsField.vocab.itos self.a.ee_label_i2s = LabelField.vocab.itos self.a.ee_role_i2s = EventsField.vocab.itos # sr mappings self.a.sr_word_i2s = vocab_noun.id2word self.a.sr_label_i2s = vocab_verb.id2word # LabelField.vocab.itos self.a.sr_role_i2s = vocab_role.id2word writer = SummaryWriter(os.path.join(self.a.out, "exp")) self.a.writer = writer joint_train( model_ee=ee_model, model_sr=sr_model, model_g=model, train_set_g=train_grounding_set, dev_set_g=dev_grounding_set, test_set_g=test_grounding_set, train_set_ee=train_ee_set, dev_set_ee=dev_ee_set, test_set_ee=test_ee_set, train_set_sr=train_sr_set, dev_set_sr=dev_sr_set, test_set_sr=test_sr_set, optimizer_constructor=optimizer_constructor, epochs=self.a.epochs, ee_tester=ee_tester, sr_tester=sr_tester, g_tester=g_tester, j_tester=j_tester, parser=self.a, other_testsets={ "dev ee 1/1": dev_ee_set1, "test ee 1/1": test_ee_set1, }, transform=transform, vocab_objlabel=vocab_noun.word2id ) log('Done!')
                    batch_first=True, dtype=torch.float)
text_field = Field(tokenize='spacy', batch_first=True, include_lengths=True, lower=True)
fields = [('Class', label_field), ('Text', text_field)]

train, valid = TabularDataset.splits(path="./", train='processed_train.csv',
                                     validation='processed_val.csv', format='CSV',
                                     fields=fields, skip_header=True)

text_field.build_vocab(train, min_freq=2, vectors='glove.840B.300d')

# In[10]:
with open("text_field", "wb") as f:
    dill.dump(text_field, f)

batch_size = 48
train_iter = BucketIterator(train, batch_size=batch_size, sort_key=lambda x: len(x.Text),
                            device=device, sort=True, sort_within_batch=True, shuffle=True)
valid_iter = Iterator(valid, sort=False,
eos_token="<eos>") # associate the text in the 'Question' column with the Q_TEXT field, # and 'Answer' with A_TEXT field data_fields = [('Question', Q_TEXT), ('Answer', A_TEXT)] # train, val = TabularDataset.splits(path=PATH, train='train.csv', validation='val.csv', format='csv', # fields=data_fields, skip_header=True) tab_dataset = TabularDataset(path=f'{args.path}/all.csv', format='csv', fields=data_fields, skip_header=True) train, val, test = tab_dataset.split(split_ratio=[0.5, 0.2, 0.3], random_state=random.getstate()) Q_TEXT.build_vocab(train) A_TEXT.build_vocab(train) print('Question Tokenize') print(list(Q_TEXT.vocab.stoi.items())) print('Answer Tokenize') print(list(A_TEXT.vocab.stoi.items())) # print(list(A_TEXT.vocab.itos)) INPUT_DIM = len(Q_TEXT.vocab) OUTPUT_DIM = len(A_TEXT.vocab) # BATCH_SIZE = 512 # ENC_EMB_DIM = 256 # 256 # DEC_EMB_DIM = 256 # 256 # HID_DIM = 512 # 512 # N_LAYERS = 2
def init(model_config, device='cpu'): logging.critical("[CRITICAL] %s device is selected" % device) logging.info( '[INFO] Using directory %s for the translation pair with filename %s' % (os.path.abspath(model_config['global']['dataset_path']), model_config['global']['translate_pair'])) #initialize the field for src language src_field = Field(tokenize=english_tokenizer, init_token='<sos>', eos_token='<eos>', lower=True) #initialize the field for trg language trg_field = Field(tokenize=hindi_tokenizer, init_token='<sos>', eos_token='<eos>', lower=True) train_data, valid_data, test_data = load_datasets( model_config['global']['dataset_path'], model_config['global']['dataset_file_names'], model_config['global']['translate_pair'], model_config['global']['lang_extensions'], [src_field, trg_field]) #initialize the vocabulary src_field.build_vocab(train_data, min_freq=1) trg_field.build_vocab(train_data, min_freq=1) #display dataset stats print_dataset_statistics(train_data, valid_data, test_data, model_config['global']['lang_extensions'], [src_field, trg_field]) model = create_seq2seq_model(model_config, len(src_field.vocab), len(trg_field.vocab), device) optimizer = optim.Adam(model.parameters()) #defining the loss function loss_function = nn.CrossEntropyLoss( ignore_index=trg_field.vocab.stoi[trg_field.pad_token]) logging.info(model.apply(init_weights)) logging.info('[INFO] Model has %s trainable parameters' % (count_parameters(model))) logging.info('[INFO] About to start the primary training loop') train_iterator, valid_iterator, test_iterator = BucketIterator.splits( (train_data, valid_data, test_data), batch_size=model_config['global']['batch_size'], device=device) cache_file_name = "%s-%s-%s-epoch-%s.pt" % ( model_config['global']['name'], model_config['global']['lang_extensions'][0], model_config['global']['lang_extensions'][1], model_config['global']['epochs']) cache_file_path = os.path.join(model_config['global']['cache_path'], cache_file_name) stats = execute_training_loop( model, train_iterator, valid_iterator, loss_function, optimizer, model_config['global']['clip_value'], src_field, trg_field, epochs=model_config['global']['epochs'], model_cache_path=os.path.abspath(cache_file_path)) stats_file_name = "%s-%s-%s-epoch-%s-stats.pickle" % ( model_config['global']['name'], model_config['global']['lang_extensions'][0], model_config['global']['lang_extensions'][1], model_config['global']['epochs']) store_object( stats, os.path.join(model_config['global']['cache_path'], stats_file_name)) logging.info("[INFO] loading the model %s" % (cache_file_name)) model.load_state_dict(torch.load(os.path.abspath(cache_file_path))) test_loss, test_bleu = evaluate_model(model, test_iterator, loss_function, src_field, trg_field) logging.info( f'[INFO] | Test Loss: {test_loss:.3f} Test Bleu: {test_bleu:.3f} | Test PPL: {math.exp(test_loss):7.3f} |' )
def load_dataset(batch_size): spacy_de = spacy.load( 'de') #run it on your env or virtrual env:#python -m spacy download de spacy_en = spacy.load( 'en') #run it on your env or virtrual env:#python -m spacy download en url = re.compile('(<url>.*</url>)') def tokenize_de(text): return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))] def tokenize_en(text): return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))] # create dataset according to Field object. # Field define the basic token and tokenize. # Field can create vocab. # If you don't define init_token and eos_token, you will not get these token when you get training batch data from train_iter # Because you define the init_token and eos_token in here, you can get init_token + sentence + eos_token when you create train, val, test from TranslationDataset.splits DE = Field(tokenize=tokenize_de, include_lengths=True, init_token='<sos>', eos_token='<eos>') EN = Field(tokenize=tokenize_en, include_lengths=True, init_token='<sos>', eos_token='<eos>') #you can find: len(val.examples)=1014; len(test.examples)=1000; len(train.examples)=29000 in Multi30k.splits... #train, val, test = Multi30k.splits(exts=('.de', '.en'), fields=(DE, EN)) #I download the data and read it directly: #if your file name is not the same as defualt, you must change the function input parameter: train='train', validation='val', test='test' #exts parameter is the data file ext name. #So the data file depends on the parameter:path+(train\validation\test)+exts train, val, test = TranslationDataset.splits(path='./data/', exts=('.de', '.en'), fields=(DE, EN)) #build vocabury #You can find one word from: DE.vocab.itos[0], it will depend on the order of frenquency #You can also find the index of word from: DE.vocab.stoi['word name'] #It will automatically create the '<pad>' into vocab even you never use it. The '<pad>' sometimes only be used after creating iterators. #It is the same to unkonw_token '<pad>'. If you want: init_token='<sos>', eos_token='<eos>', #you need to give a arguement in creating the Field object. DE.build_vocab( train.src, min_freq=2 ) # you can just use DE.build_vocab(train, min_freq=2), but not: DE.build_vocab(train.trg, min_freq=2) EN.build_vocab(train.trg, max_size=10000 ) # you can just use EN.build_vocab(train, max_size=10000) # Create batch and make the length of every sentence in one batch become the same # If repeat=True, program will forever run in: 'for b, batch in enumerate(train_iter):' train_iter, val_iter, test_iter = BucketIterator.splits( (train, val, test), batch_size=batch_size, repeat=False) return train_iter, val_iter, test_iter, DE, EN # import re # import spacy # import torch # from torchtext.data import Field, BucketIterator # from torchtext.datasets import Multi30k, TranslationDataset # spacy_de = spacy.load('de')#run it on your env or virtrual env:#python -m spacy download de # spacy_en = spacy.load('en')#run it on your env or virtrual env:#python -m spacy download en # url = re.compile('(<url>.*</url>)') # def tokenize_de(text): # return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))] # def tokenize_en(text): # return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))] # DE = Field(tokenize=tokenize_de, include_lengths=True, # init_token='<sos>', eos_token='<eos>') # EN = Field(tokenize=tokenize_en, include_lengths=True, # init_token='<sos>', eos_token='<eos>') # #you can find: len(val.examples)=1014; len(test.examples)=1000; len(train.examples)=29000 in Multi30k.splits... 
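# A minimal usage sketch of load_dataset above (the batch size is illustrative, and the
# spacy models plus the ./data/ files are assumed to be present). Because the Fields were
# created with include_lengths=True, each batch attribute is a (tokens, lengths) tuple
# rather than a bare tensor.
train_iter, val_iter, test_iter, DE, EN = load_dataset(batch_size=32)

for batch in train_iter:
    src, src_lengths = batch.src          # include_lengths=True -> (token tensor, lengths)
    trg, trg_lengths = batch.trg
    print(src.shape, src_lengths.shape)   # (max_src_len, batch) and (batch,)
    break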
def tokenize(text):
    return nltk.tokenize.word_tokenize(text)


TEXT = Field(tokenize=tokenize, init_token='<sos>', eos_token='<eos>', include_lengths=True, lower=True)
LABEL = Field(tokenize=tokenize, lower=True)

train_data, valid_data, test_data = SNLI.splits(TEXT, LABEL)
# train_data, valid_data, test_data = MultiNLI.splits(TEXT, LABEL)

TEXT.build_vocab(train_data, min_freq=2, specials=[u'<esos>', u'<nsos>', u'<csos>'], vectors='glove.42B.300d')
LABEL.build_vocab(train_data, min_freq=2)

BATCH_SIZE = 32
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    sort_key=lambda x: len(x.premise),
    device=device)

print("Preparing data completed!")
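# Because build_vocab above was given vectors='glove.42B.300d', the pretrained vectors are
# exposed as TEXT.vocab.vectors. A minimal, illustrative sketch of copying them into an
# embedding layer (the layer and variable names below are not from the original code):
import torch.nn as nn

embedding = nn.Embedding(len(TEXT.vocab), 300)        # glove.42B.300d -> 300-dimensional vectors
embedding.weight.data.copy_(TEXT.vocab.vectors)

pad_idx = TEXT.vocab.stoi['<pad>']
embedding.weight.data[pad_idx] = torch.zeros(300)     # keep the padding embedding at zero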
def main(args): print('start ..!') BATCH_SIZE = args.batch_size device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') TEXT = Field( sequential=True, # text: sequential data tokenize=str.split, batch_first=True, fix_length=56, # padding size: max length of data text lower=True) LABEL = LabelField(sequential=False, dtype=torch.float) w2v = KeyedVectors.load_word2vec_format( './model/GoogleNews-vectors-negative300.bin.gz', binary=True) data_dir = args.data_dir train_paths, val_paths = build_data(data_dir) N_EPOCHS = args.epochs EMBEDDING_DIM = args.embedding N_FILTERS = args.n_filters FILTER_SIZES = args.filter_sizes OUTPUT_DIM = 1 DROPOUT = args.dropout test_acc_lists = [] for kfold in range(10): # make datasets train_path = train_paths[kfold] val_path = val_paths[kfold] train_data = TabularDataset(path=train_path, skip_header=True, format='csv', fields=[('label', LABEL), ('text', TEXT)]) test_data = TabularDataset(path=val_path, skip_header=True, format='csv', fields=[('label', LABEL), ('text', TEXT)]) TEXT.build_vocab(train_data) LABEL.build_vocab(train_data) # for pretrained embedding vectors w2v_vectors = [] for token, idx in TEXT.vocab.stoi.items(): # pad token -> zero if idx == 1: w2v_vectors.append(torch.zeros(EMBEDDING_DIM)) # if word in word2vec vocab -> replace with pretrained word2vec elif token in w2v.wv.vocab.keys(): w2v_vectors.append(torch.FloatTensor(w2v[token])) # oov -> randomly initialized uniform distribution else: w2v_vectors.append( torch.distributions.Uniform(-0.25, +0.25).sample( (EMBEDDING_DIM, ))) TEXT.vocab.set_vectors(TEXT.vocab.stoi, w2v_vectors, EMBEDDING_DIM) pretrained_embeddings = torch.FloatTensor(TEXT.vocab.vectors) # make iterators train_iterator, test_iterator = BucketIterator.splits( (train_data, test_data), batch_size=BATCH_SIZE, device=device, sort=False, shuffle=True) # define a model INPUT_DIM = len(TEXT.vocab) model = CNN1d(pretrained_embeddings, INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT) optimizer = optim.Adadelta(model.parameters(), rho=0.95) criterion = nn.BCEWithLogitsLoss() model = model.to(device) criterion = criterion.to(device) # train best_test_acc = -float('inf') model_name = './model/model' + str(kfold) + '.pt' print('kfold', kfold) for epoch in range(N_EPOCHS): start_time = time.time() train_loss, train_acc = train(model, train_iterator, optimizer, criterion) test_loss, test_acc = evaluate(model, test_iterator, criterion) end_time = time.time() epoch_mins, epoch_secs = epoch_time(start_time, end_time) if test_acc > best_test_acc: best_test_acc = test_acc torch.save(model.state_dict(), model_name) # print(f'\tEpoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s') # print(f'\t\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%') # print(f'\t\tTest. Loss: {test_loss:.3f} | Val. Acc: {test_acc * 100:.2f}%') logging.info( f'\tEpoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s' ) logging.info( f'\t\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%' ) logging.info( f'\t\tTest. Loss: {test_loss:.3f} | Val. Acc: {test_acc * 100:.2f}%' ) model.load_state_dict(torch.load(model_name)) test_loss, test_acc = evaluate(model, test_iterator, criterion) test_acc_lists.append(test_acc) logging.info(f'============== last test accuracy: {test_acc}') # print(f'============== last test accuracy: {test_acc}') print() return test_acc_lists
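# epoch_time is called in the training loop above but not defined in this snippet.
# A minimal sketch matching how it is called (an assumption, not the original
# implementation): it splits an elapsed wall-clock interval into whole minutes and seconds.
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs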
batch_first=True) tgt = Field(sequential=True, use_vocab=True, pad_token=PAD, tokenize=tokenizer_de, lower=True, init_token=BOS, eos_token=EOS, batch_first=True) prefix_f = './escape.en-de.tok.5k' parallel_dataset = TranslationDataset(path=prefix_f, exts=('.en', '.de'), fields=[('src', src), ('tgt', tgt)]) src.build_vocab(parallel_dataset, min_freq=5, max_size=15000) tgt.build_vocab(parallel_dataset, min_freq=5, max_size=15000) train, valid = parallel_dataset.split(split_ratio=0.97) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') BATCH_SIZE = 20 train_iterator, valid_iterator = BucketIterator.splits((train, valid), batch_size=BATCH_SIZE, sort_key=lambda x: interleave_keys(len(x.src), len(x.tgt)), device=device) class Encoder(nn.Module):
# Getting the path to the Data folder.
pwd = os.getcwd()
pwd = pwd.replace('Utils', 'Data')

# spacy's tokenizer is very good, but it takes some time to run, so a plain split is used here.
TEXT = Field(sequential=True, tokenize=lambda x: x.split(), lower=True)
# Set use_vocab=False when the data is already numerical.
LABEL = Field(sequential=False, use_vocab=False)

datafields = [("id", None), ("conversation", TEXT), ("category", LABEL)]

# If skip_header is set to False, the header row also gets processed!
trn = TabularDataset(path=pwd + "/train_custom.csv", format='csv', skip_header=True, fields=datafields)
tst = TabularDataset(path=pwd + '/test_custom.csv', format='csv', skip_header=True, fields=datafields)

# Creating the vocabulary using GloVe embeddings.
TEXT.build_vocab(trn, vectors="glove.42B.300d")

train_iter = BucketIterator(
    dataset=trn,                              # the dataset we want the iterator to draw data from
    batch_size=64,
    device=device,
    sort_key=lambda x: len(x.conversation),   # the BucketIterator needs to be told how to group the data
    sort_within_batch=False,
    repeat=False,                             # we wrap this iterator ourselves later, so don't repeat forever
    shuffle=False,                            # experiment with this to see if you get improved performance
    train=True                                # whether the dataset is a training set or not
)
test_iter = Iterator(tst, batch_size=64, device=device, sort=False, sort_within_batch=False, repeat=False, shuffle=False)


class BatchWrapper:
fields = (SRC, TRG))

import torch
from torchtext import data

print(vars(train_data.examples[0]))

SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 128

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size = BATCH_SIZE,
# Pad a tokenized sentence with <pad> so that it is at least 5 tokens long
# (the function name is taken from its use as preprocessing below).
def pad_under_five(toknized):
    toknized.extend(["<pad>"] * (5 - len(toknized)))
    return toknized


TEXT = Field(tokenize=tagger.morphs, lower=True, include_lengths=False, batch_first=True, preprocessing=pad_under_five)
LABEL = Field(sequential=False, use_vocab=True, unk_token=None)

train_data, test_data = TabularDataset.splits(
    path=DATA_PATH + '/nsmc/',
    train='ratings_train.txt',
    test='ratings_test.txt',
    format='tsv',
    skip_header=True,
    fields=[('id', None), ('text', TEXT), ('label', LABEL)],
    filter_pred=lambda x: True if len(x.text) > 1 else False)  # only keep sentences longer than one token

TEXT.build_vocab(train_data, min_freq=2)
LABEL.build_vocab(train_data)

# print(TEXT.vocab)
# print(len(TEXT.vocab), len(LABEL.vocab))
# print(TEXT.vocab.itos[:5])
# print(LABEL.vocab.itos)

train_loader, test_loader = BucketIterator.splits(
    (train_data, test_data),
    sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    repeat=False,
    shuffle=True,
    batch_size=32,
    device=DEVICE)

for batch in train_loader:
    break
def getData_old_method(USE_BPE):
    if USE_BPE == False:
        german = Field(tokenize=tokenize_ger, lower=True, init_token="<sos>",
                       eos_token="<eos>", pad_token="<pad>", unk_token="<unk>")
        english = Field(tokenize=tokenize_eng, lower=True, init_token="<sos>",
                        eos_token="<eos>", pad_token="<pad>", unk_token="<unk>")

        # print("===============================before ")
        train_data, valid_data, test_data = Multi30k.splits(
            exts=(".de", ".en"), fields=(german, english),
            # root='.data',
            train='train',
            validation='val',
            test='test2016',
            path='../../../data/multi30k')

        # train_data, valid_data, test_data = Multi30k.splits(
        #     exts=(".src", ".tgt"), fields=(german, english),
        #     # root='.data',
        #     train='train',
        #     validation='valid',
        #     test='test',
        #     path='/data/chaudhryz/uwstudent1/GDATA')

        german.build_vocab(train_data, max_size=10000, min_freq=2)
        english.build_vocab(train_data, max_size=10000, min_freq=2)

        german.vocab.init_token = "<sos>"
        german.vocab.eos_token = "<eos>"
        english.vocab.init_token = "<sos>"
        english.vocab.eos_token = "<eos>"

        # print("Train")
        # for i in range(10):
        #     # print(train_data[i].src, train_data[i].trg)
        #     printSent(train_data[i].src)
        #     printSent(train_data[i].trg)

        # print("Test")
        # for i in range(10):
        #     # print(train_data[i].src, train_data[i].trg)
        #     printSent(test_data[i].src)
        #     printSent(test_data[i].trg)
        # exit()

        print("train_data ", len(train_data.examples))
        print("valid_data ", len(valid_data.examples))
        print("test_data ", len(test_data.examples))

        return german.vocab, english.vocab, train_data, valid_data, test_data
    else:
        print("Not Implemented")
        exit()
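# A brief, hedged sketch of what the returned Vocab objects provide (the sample tokens are
# illustrative only, and the data files/tokenizers used above are assumed to exist):
# stoi maps tokens to indices, itos maps indices back to tokens, and out-of-vocabulary
# words fall back to the <unk> index.
german_vocab, english_vocab, train_data, valid_data, test_data = getData_old_method(USE_BPE=False)

tokens = ["ein", "mann", "schläft"]                   # illustrative German tokens
ids = [german_vocab.stoi[t] for t in tokens]          # unknown tokens map to the <unk> index
recovered = [german_vocab.itos[i] for i in ids]
print(ids, recovered)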
device eng=spacy.load('en') ger=spacy.load('de_core_news_sm') def Tokenize_eng(text): return [a.text for a in eng.tokenizer(text)] def Tokenize_german(text): return [b.text for b in ger.tokenizer(text)] german=Field(tokenize=Tokenize_german,lower=True,init_token='<sos>',eos_token='<eos>') english=Field(tokenize=Tokenize_eng,lower=True,init_token='<sos>',eos_token='<eos>') Train,Val,Test=Multi30k.splits(exts=('.de','.en'),fields=(german,english)) german.build_vocab(Train,max_size=10000,min_freq=2) english.build_vocab(Train,max_size=10000,min_freq=2) ##building encoder class Encode(Module): def __init__(self,inp_size,emd_size,hidden_size): super(Encode,self).__init__() self.inp_size=inp_size self.emd_size=emd_size self.hidden_size=hidden_size self.embed=Embedding(self.inp_size,self.emd_size) self.lstm=LSTM(self.emd_size,self.hidden_size,num_layers=2,dropout=0.3) def forward(self,x): x=self.embed(x) x,(h,c)=self.lstm(x) return h,c
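# A small usage sketch of the Encode module above (batch size and dimensions are illustrative,
# and the usual torch/torchtext imports are assumed). With the default batch_first=False,
# batch.src has shape (seq_len, batch).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iter, val_iter, test_iter = BucketIterator.splits((Train, Val, Test), batch_size=32, device=device)

enc = Encode(inp_size=len(german.vocab), emd_size=300, hidden_size=512).to(device)

for batch in train_iter:
    h, c = enc(batch.src)       # final hidden/cell states of the 2-layer LSTM
    print(h.shape, c.shape)     # (num_layers, batch, hidden_size)
    break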
unk_token=tokenizer.unk_token, pad_first=False, batch_first=True) LABEL = Field(use_vocab=False, sequential=False) datafields = [('text', TEXT), ('label', LABEL)] trn, cv = TabularDataset.splits(path='.', train='train.csv', validation='cv.csv', format='csv', skip_header=True, fields=datafields) TEXT.build_vocab(trn, cv) stoi = dict(tokenizer.vocab) itos = list(stoi.keys()) TEXT.vocab.stoi = stoi TEXT.vocab.itos = itos train_iter, val_iter = BucketIterator.splits((trn, cv), batch_sizes=(64, 64), device=device, sort_key=lambda x: len(x.text), sort_within_batch=False, repeat=False) vocab_sz = len(tokenizer.vocab) print(vocab_sz) hidden_sz = 50
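# A small, hedged sanity check (illustrative only): after overriding TEXT.vocab.stoi/itos
# with the tokenizer's vocabulary above, the ids produced by the iterator can be mapped
# back to the tokenizer's wordpieces.
for batch in train_iter:
    ids = batch.text[0]                                   # first example in the batch (batch_first=True)
    print([TEXT.vocab.itos[int(i)] for i in ids[:10]])
    break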
) fields = {"Lithuanian": ("src", lithuanian), "English": ("trg", english)} # Convert into Tabular Dataset train_data, valid_data, test_data = TabularDataset.splits( path="", train="train.json", validation="valid.json", test="test.json", format="json", fields=fields, ) # Create separate Vocab english.build_vocab(train_data, max_size=10000, min_freq=2) lithuanian.build_vocab(train_data, max_size=10000, min_freq=2) # Prebuild transformer class from pytorch class Transformer(nn.Module): def __init__( self, embedding_size, src_vocab_size, trg_vocab_size, src_pad_idx, num_heads, num_encoder_layers, num_decoder_layers, forward_expansion,
class Data(object):
    WORDS_NAME = "words"
    LAB_NAME = "lab"
    CHAR_NAME = "char"

    def __init__(
        self,
        train_path: str,
        unlabeled_path: str,
        semi_supervised: bool,
        dev_path: str = None,
        test_path: str = None,
        batch_size: int = 32,
        device: object = None,
        logger: typing.Optional[logging.Logger] = None,
    ) -> None:
        if logger is None:
            logger = logging.getLogger(__name__)
            logger.setLevel(logging.INFO)
            handler = logging.StreamHandler()
            handler.setFormatter(
                logging.Formatter('%(levelname)s - %(name)s - %(message)s'))
            logger.addHandler(handler)
        self.train_path = train_path
        self.dev_path = dev_path
        self.test_path = test_path
        self.unlabeled_path = unlabeled_path
        self.batch_size = batch_size
        self.semi_supervised = semi_supervised
        self.device = device
        self.logger = logger

    def initialize(self):
        ## initialize fields and create dataset ##
        self._init_fields()
        self._read_sentences()
        self.train = self._make_bucket_iterator(self._make_dataset(False),
                                                batch_size=self.batch_size,
                                                device=self.device)
        self.dev = self._make_bucket_iterator(self._make_dataset(False, which="dev"),
                                              batch_size=self.batch_size,
                                              device=self.device)
        self.test = self._make_bucket_iterator(self._make_dataset(False, which="test"),
                                               batch_size=self.batch_size,
                                               device=self.device)
        # self.unlabeled_train = self._make_bucket_iterator(self._make_dataset(True),
        #     batch_size=self.batch_size, device=self.device)
        self.unlabeled_data = self._make_dataset(True)
        self._build_vocabularies()

    def _read_sentences(self):
        self.train_sentences = []
        with open(self.train_path) as f:
            for line in f:
                self.train_sentences.append(line.replace("\n", ""))
        self.logger.info('{} train sentences successfully read'.format(
            len(self.train_sentences)))

        self.dev_sentences = []
        with open(self.dev_path) as f:
            for line in f:
                self.dev_sentences.append(line.replace("\n", ""))
        self.logger.info('{} dev sentences successfully read'.format(
            len(self.dev_sentences)))

        self.unlabeled_sentences = []
        temp = []
        with open(self.unlabeled_path) as f:
            for line in f:
                sen_len = len(line.split())
                if sen_len > 0 and sen_len <= 20:
                    temp.append(line.replace("\n", ""))
        # self.unlabeled_sentences = random.sample(temp, 101420)
        self.unlabeled_sentences = temp
        self.logger.info('{} unlabeled sentences successfully read'.format(
            len(self.unlabeled_sentences)))

        self.test_sentences = []
        with open(self.test_path) as f:
            for line in f:
                self.test_sentences.append(line.replace("\n", ""))
        self.logger.info('{} test sentences successfully read'.format(
            len(self.test_sentences)))

    def _init_fields(self):
        self.words = Field(batch_first=True, init_token='<s>', eos_token='</s>')
        self.lab = Field(batch_first=True, unk_token=None, pad_token=None)
        # self.char = NestedField(Field(batch_first=True, tokenize=list, unk_token='<cunk>'),
        #                         init_token='<s>', eos_token='</s>')
        self.char = NestedField(Field(batch_first=True,
                                      tokenize=list,
                                      unk_token='<cunk>',
                                      init_token='<w>',
                                      eos_token='</w>'),
                                init_token='<s>',
                                eos_token='</s>')
        self.labeled_fields = [(self.WORDS_NAME, self.words),
                               (self.CHAR_NAME, self.char),
                               (self.LAB_NAME, self.lab)]
        self.unlabeled_fields = [(self.WORDS_NAME, self.words),
                                 (self.CHAR_NAME, self.char)]
        self.logger.info('fields initialized successfully')

    def _make_dataset(self, unlabeled, which=None) -> Dataset:
        if not unlabeled:
            sentences = self.train_sentences
            if which == "dev":
                sentences = self.dev_sentences
            elif which == "test":
                sentences = self.test_sentences
            examples = [self._make_example(s) for s in sentences]
            return Dataset(examples, self.labeled_fields)
        else:
            sentences = self.unlabeled_sentences
            examples = [self._make_example_unlabeled(s) for s in sentences]
            return Dataset(examples, self.unlabeled_fields)

    def _make_example(self, sent) -> Example:
        cols = sent.split("\t")
        words = [word for word in cols[0].split()]
        tags = [tag for tag in cols[1].split()]
        return Example.fromlist([words, words, tags], self.labeled_fields)

    def _make_example_unlabeled(self, sent) -> Example:
        words = [word for word in sent.split()]
        return Example.fromlist([words, words], self.unlabeled_fields)

    def _make_bucket_iterator(self, data, batch_size=32, device=None):
        # return BucketIterator(
        #     dataset=data, batch_size=batch_size,
        #     sort=False, sort_within_batch=True,
        #     sort_key=lambda x: len(x.words),
        #     device=device, repeat=False)
        return GroupedBucketIterator(data,
                                     batch_size,
                                     lambda ex: len(ex.words),
                                     device=device)

    def _build_vocabularies(self):
        self.words.build_vocab(self.train.dataset)
        self.lab.build_vocab(self.train.dataset)
        self.char.build_vocab(self.train.dataset)

        self.num_words = len(self.words.vocab)
        self.num_tags = len(self.lab.vocab)
        self.num_char = len(self.char.vocab)
        self.logger.info(
            'Found %d words, %d chars, and %d tags for both the labeled and unlabeled dataset',
            self.num_words, self.num_char, self.num_tags)

    def _get_unlabeled_sentences(self):
        while True:
            for us in self.unlabeled_sentences:
                yield us

    def _get_unlabeled_examples(self):
        # while True:
        lines = []
        for words in self._get_unlabeled_sentences():
            lines.append(words)
            if len(lines) >= 10142:
                yield [self._make_example_unlabeled(line) for line in lines]
                lines = []

    def _endless_unlabeled(self):
        # while True:
        for ex in self._get_unlabeled_examples():
            unlabeled_iterator = self._make_bucket_iterator(
                Dataset(ex, self.unlabeled_fields),
                batch_size=self.batch_size,
                device=self.device)
            yield unlabeled_iterator
            del unlabeled_iterator
            torch.cuda.empty_cache()

    def _endless_minibatch(self, data):
        while True:
            for i, batch in enumerate(data):
                yield batch

    def get_alternating_minibatch(self):
        # self._create_dataset()
        while True:
            for unlabeled_iter in self._endless_unlabeled():
                for mb in unlabeled_iter:
                    yield next(self._endless_minibatch(self.train)), "labeled"
                    if self.semi_supervised:
                        yield mb, "unlabeled"

    def get_input_sizes(self):
        return self.num_words, self.num_char, self.num_tags

    def get_pad_token_id(self):
        return self.char.vocab.stoi[self.char.pad_token]

    def get_unk_token_id(self):
        return self.char.vocab.stoi[self.char.unk_token]

    def get_train_sentences_length(self):
        return len(self.train_sentences)
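# A hedged usage sketch of the Data container above (file paths and batch size are placeholders,
# and GroupedBucketIterator plus the torchtext fields are assumed to be importable as in this module).
data = Data(
    train_path="train.tsv",
    unlabeled_path="unlabeled.txt",
    semi_supervised=True,
    dev_path="dev.tsv",
    test_path="test.tsv",
    batch_size=32,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)
data.initialize()

num_words, num_char, num_tags = data.get_input_sizes()

# Draw a few alternating labeled/unlabeled minibatches.
for (batch, kind), _ in zip(data.get_alternating_minibatch(), range(4)):
    print(kind, batch.words.shape)    # kind is "labeled" or "unlabeled"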
def build_dataset_and_vocab(sentences: List[str]): """ Define source and target fields, iterate over the list of sentences to create list of Examples, and return: - training and validation dataset (split 90-10%) - source and target fields with Vocab object """ # Minimum and maximum length for sentences to be included in the dataset min_length, max_length = 4, 10 # Define source and target fields bos_word = '<s>' eos_word = '</s>' pad_word = '<pad>' src_field = Field(tokenize=tokenize_en, pad_token=pad_word, lower=True) tgt_field = Field(tokenize=tokenize_en, init_token=bos_word, eos_token=eos_word, pad_token=pad_word, lower=True) # Create list of Examples from the list of sentences examples = [] sent_count = 0 for sentence in sentences: sentence_split = sentence.split(' ') sentence_length = len(sentence_split) if sentence_length <= min_length or sentence_length >= max_length: continue sent_count += 1 # If sent length is less than 8 if sentence_length <= min_length + 4: # Src length is 3 src_length = min_length - 1 else: # Src length is 5 src_length = min_length + 1 for i in range(0, sentence_length - src_length, src_length): src = ' '.join(sentence_split[i:i + src_length]) tgt = ' '.join(sentence_split[i + src_length:]) example = Example.fromlist(data=[src, tgt], fields=[('src', src_field), ('tgt', tgt_field)]) examples.append(example) print( f'Total {sent_count} sentences processed into {len(examples)} examples.' ) train_dataset, valid_dataset = Dataset(examples=examples, fields=[ ('src', src_field), ('tgt', tgt_field) ]).split(split_ratio=[0.9, 0.1]) # Set the minimum frequency needed to include a token in the vocabulary min_freq = 2 src_field.build_vocab(train_dataset, min_freq=min_freq) tgt_field.build_vocab(train_dataset, min_freq=min_freq) return train_dataset, valid_dataset, src_field, tgt_field
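# A short, hedged sketch of consuming the return values of build_dataset_and_vocab
# (the batch size is illustrative; `sentences` is assumed to be a list of raw English
# sentences loaded elsewhere, and BucketIterator/torch are assumed to be imported as in
# the rest of this file).
train_dataset, valid_dataset, src_field, tgt_field = build_dataset_and_vocab(sentences)

print(len(src_field.vocab), len(tgt_field.vocab))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iterator, valid_iterator = BucketIterator.splits(
    (train_dataset, valid_dataset),
    batch_size=32,
    sort_key=lambda x: len(x.src),   # a plain Dataset has no default sort_key, so provide one
    sort_within_batch=True,
    device=device,
)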