def preprocess(self, train_csv):
    '''Returns the Dataset'''
    # Create the fields
    self.fields = {
        'query_text': ('query', self.que_f),
        'program_text': ('program', self.prog_f)
    }
    # Create dataset object
    train_data = TabularDataset.splits(path="./", train=train_csv,
                                       format="csv", fields=self.fields)[0]
    # Build vocabulary
    self.que_f.build_vocab(train_data, max_size=100, min_freq=1)
    self.prog_f.build_vocab(train_data, max_size=100, min_freq=1,
                            specials=['<nxt>'])
    return train_data
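# Usage sketch (assumed, not from this snippet): `proc` stands for an instance
# of the class that owns que_f / prog_f above; its constructor is hypothetical.
# The check confirms the '<nxt>' special token landed in the program vocab.
train_data = proc.preprocess('train.csv')
assert '<nxt>' in proc.prog_f.vocab.stoi  # the special token was registered
print(len(proc.que_f.vocab), len(proc.prog_f.vocab))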
def __init__(self, train_data='offenseval-training-v1.tsv',
             trained_cnn_model='MIDAS_CNN.pt',
             trained_blstm_model='MIDAS_BLSTM.pt',
             trained_blstmGru_model='MIDAS_BLSTM-GRU.pt'):
    self.tokenize = lambda x: nltk.word_tokenize(x.lower())
    self.TEXT = Field(sequential=True, tokenize=self.tokenize, lower=True,
                      include_lengths=True)
    self.LABEL = Field(sequential=False, use_vocab=False, dtype=torch.float)
    self.ID = Field(sequential=False, use_vocab=False)
    off_datafields = [('id', None), ('text', self.TEXT), ('label', self.LABEL),
                      ('is_target', None), ('target', None)]
    trn = TabularDataset.splits(path='.', train=train_data, format='tsv',
                                fields=off_datafields)[0]
    self.TEXT.build_vocab(trn, vectors='glove.6B.200d')
    self.BATCH_SIZE = 64
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # load pre-trained models
    self.cnn_model = torch.load(trained_cnn_model)
    self.blstm_model = torch.load(trained_blstm_model)
    self.blstmGru_model = torch.load(trained_blstmGru_model)
def preprocess_couplet():
    SRC = Field(include_lengths=True, init_token="<sos>", eos_token="<eos>",
                pad_token="<pad>", unk_token="<unk>", lower=True,
                batch_first=False, tokenize=lambda text: text.split())
    TRG = Field(include_lengths=True, init_token="<sos>", eos_token="<eos>",
                pad_token="<pad>", unk_token="<unk>", lower=True,
                batch_first=False, tokenize=lambda text: text.split())
    _train, _test = TabularDataset.splits(
        path="data/couplet", root="data", train="train.tsv", test="test.tsv",
        format='csv', skip_header=False, fields=[("src", SRC), ("trg", TRG)],
        csv_reader_params={"quoting": csv.QUOTE_NONE, "delimiter": "\t"})
    # Build one shared vocabulary over both sides and reuse it for TRG
    SRC.build_vocab(_train.src, _train.trg, min_freq=1)
    TRG.vocab = SRC.vocab
    return _train, _test, SRC, TRG
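# Follow-on sketch (assumed; batch size illustrative): wrap the returned
# datasets in BucketIterators from the same legacy torchtext API used above.
from torchtext.data import BucketIterator

_train, _test, SRC, TRG = preprocess_couplet()
train_iter, test_iter = BucketIterator.splits(
    (_train, _test), batch_size=64,
    sort_key=lambda ex: len(ex.src), sort_within_batch=True)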
def init_dataset(self, root_path, train_path, dev_path, test_path, isSkipHead):
    if self.isBertCat:
        if train_path and dev_path and test_path:
            return BertTabularDataset_MultipleChoice.splits(
                path=root_path, train=train_path, validation=dev_path,
                test=test_path, format='tsv', question_fix_length=40,
                fields=self.dataset_field, bert_fields=self.bert_field,
                skip_header=isSkipHead)
        else:
            return BertTabularDataset_MultipleChoice(
                path=os.path.join(root_path, train_path), format='tsv',
                question_fix_length=40, fields=self.dataset_field,
                bert_fields=self.bert_field,
                skip_header=isSkipHead), None, None
    else:
        if train_path and dev_path and test_path:
            return TabularDataset.splits(
                path=root_path, train=train_path, validation=dev_path,
                test=test_path, format='tsv', fields=self.dataset_field,
                skip_header=isSkipHead)
        else:
            return TabularDataset(
                path=os.path.join(root_path, train_path), format='tsv',
                fields=self.dataset_field,
                skip_header=isSkipHead), None, None
def get(self):
    REVIEW = Field(tokenize=self.en_tokenizer, init_token='<sos>',
                   eos_token='<eos>', stop_words=STOP_WORDS, use_vocab=True)
    INPUT_H = Field(sequential=False, use_vocab=False, pad_token=None,
                    unk_token=None, dtype=torch.float32)
    INPUT_F = Field(sequential=False, use_vocab=False, pad_token=None,
                    unk_token=None, dtype=torch.float32)
    OUTPUT = Field(sequential=False, use_vocab=False, pad_token=None,
                   unk_token=None, dtype=torch.float32)
    fields = {
        'Review': ('r', REVIEW),
        'Input Hidden': ('h', INPUT_H),
        'Input Final': ('f', INPUT_F),
        'Output': ('o', OUTPUT)
    }
    trainds, valds, testds = TabularDataset.splits(
        path='./', train='train.json', validation='val.json',
        test='test.json', format='json', fields=fields)
    REVIEW.build_vocab(trainds, valds)
    length_of_vocab = len(REVIEW.vocab)
    return trainds, valds, testds, length_of_vocab
def load_dataset(config, device):
    label_dict = {"observing": 0, "against": 1, "for": 2}
    LABEL = Field(use_vocab=False, sequential=False, dtype=torch.long,
                  preprocessing=lambda x: label_dict[x.strip()])
    SEQ = Field(dtype=torch.long, lower=True, batch_first=True,
                preprocessing=lambda x: x[:45], include_lengths=True)
    SENT = Field(dtype=torch.long, lower=True, batch_first=True,
                 preprocessing=lambda x: x[:45], include_lengths=False)
    DOC = NestedField(SENT, tokenize=lambda s: s.strip().split(' </s> '),
                      preprocessing=lambda s: [x for x in s[:45] if x],
                      dtype=torch.long, include_lengths=True)
    fields = [('label', LABEL), ('claim', SEQ), ('hline', SEQ),
              ('abst', SEQ), ('body', DOC)]
    train, test = TabularDataset.splits(path="../stance_data/", format="tsv",
                                        fields=fields, train=config.train_file,
                                        test=config.test_file)
    train, val = train.split(split_ratio=0.80)
    vectors = GloVe(name="6B", dim=config.embed_dim,
                    cache='/users4/jwduan/vectors/')
    DOC.build_vocab(train, val, test, vectors=vectors)
    # Share the document vocabulary with the flat sequence field
    SEQ.build_vocab()
    SEQ.vocab = DOC.vocab
    config.vocab_size = len(DOC.vocab)
    train_loader, val_loader, test_loader = Iterator.splits(
        (train, val, test), batch_sizes=(config.batch_size, 256, 256),
        sort_key=lambda x: len(x.body), sort=True, device=device,
        shuffle=True, repeat=False)
    return (train_loader, val_loader, test_loader), DOC.vocab.vectors
def generate_equation_for_torch(allowed_operators: Iterable, min_value: int,
                                max_value: int, train_size: int,
                                validation_size: int, test_size: int,
                                x: Field, y: Field):
    train_samples = generate_equations(allowed_operators, train_size,
                                       min_value, max_value)
    test_samples = generate_equations(allowed_operators, test_size,
                                      min_value, max_value)
    validation_samples = generate_equations(allowed_operators, validation_size,
                                            min_value, max_value)
    with open('tmp_train.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['x', 'y'])
        writer.writerows(train_samples)
    with open('tmp_validation.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['x', 'y'])
        writer.writerows(validation_samples)
    with open('tmp_test.csv', 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['x', 'y'])
        writer.writerows(test_samples)
    train, validation, test = TabularDataset.splits(
        path='', train='tmp_train.csv', validation='tmp_validation.csv',
        test='tmp_test.csv', fields=[('x', x), ('y', y)], format='csv',
        skip_header=True)
    os.remove('tmp_train.csv')
    os.remove('tmp_validation.csv')
    os.remove('tmp_test.csv')
    return train, validation, test
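# Calling sketch (assumed): the Field settings below are illustrative, and
# generate_equations is the helper referenced inside the function above.
from torchtext.data import Field

x_field = Field(tokenize=list, init_token='<sos>', eos_token='<eos>')
y_field = Field(tokenize=list, init_token='<sos>', eos_token='<eos>')
train, validation, test = generate_equation_for_torch(
    allowed_operators=['+', '-'], min_value=0, max_value=99,
    train_size=1000, validation_size=100, test_size=100,
    x=x_field, y=y_field)
x_field.build_vocab(train)
y_field.build_vocab(train)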
def load_dataset(config, device):
    LABEL = Field(sequential=False, dtype=torch.long, use_vocab=False,
                  batch_first=True,
                  preprocessing=lambda x: 1 if float(x) > 0. else 0)
    TARGET = Field(batch_first=True, lower=True, dtype=torch.long,
                   preprocessing=lambda x: x[0].split('_'),
                   include_lengths=True)
    TEXT = Field(dtype=torch.long, lower=True, batch_first=True,
                 preprocessing=lambda x: x[:50])
                 # alternative: [w for w in x if w not in stopwords_set][:50]
    LEADS = NestedField(TEXT, dtype=torch.long, include_lengths=True,
                        tokenize=lambda s: s.split('</s>'),
                        preprocessing=lambda x: x[-5:])
    DOC = NestedField(TEXT, dtype=torch.long, include_lengths=True,
                      tokenize=lambda s: s.split('</s>'),
                      preprocessing=lambda x: [s for s in x[1:50] if s])
    DOCS = NestNestedField(DOC, dtype=torch.long, include_lengths=True,
                           tokenize=lambda s: s.split('</p>'),
                           preprocessing=lambda x: x[-5:])
    fields = [('label', LABEL), ('target', TARGET), ('leads', LEADS),
              ('docs', DOCS)]
    train, val, test = TabularDataset.splits(
        path="../abrt_data/", format="tsv", fields=fields,
        train=config.train_file, validation=config.dev_file,
        test=config.test_file)
    TARGET.build_vocab(train, val, test)
    DOCS.build_vocab(train, val, test)
    config.wvocab_size = len(DOCS.vocab)
    config.tvocab_size = len(TARGET.vocab)
    # sort = False,
    train_loader, val_loader, test_loader = BucketIterator.splits(
        (train, val, test), sort_key=lambda x: len(x.docs), sort=True,
        batch_sizes=(config.batch_size, 32, 32), device=device, repeat=False)
    return (train_loader, val_loader, test_loader)
def pad_under_five(toknized):
    """
    The model uses 5-gram filters, so sentences shorter than
    5 tokens are padded with <pad>.
    """
    if len(toknized) < 5:
        toknized.extend(["<pad>"] * (5 - len(toknized)))
    return toknized

TEXT = Field(tokenize=tagger.morphs, lower=True, include_lengths=False,
             batch_first=True, preprocessing=pad_under_five)
LABEL = Field(sequential=False, use_vocab=True, unk_token=None)

train_data, test_data = TabularDataset.splits(
    path=DATA_PATH + '/nsmc/', train='ratings_train.txt',
    test='ratings_test.txt', format='tsv', skip_header=True,
    fields=[('id', None), ('text', TEXT), ('label', LABEL)],
    # keep only sentences longer than one token
    filter_pred=lambda x: True if len(x.text) > 1 else False)

TEXT.build_vocab(train_data, min_freq=2)
LABEL.build_vocab(train_data)

# print(TEXT.vocab)
# print(len(TEXT.vocab), len(LABEL.vocab))
# print(TEXT.vocab.itos[:5])
# print(LABEL.vocab.itos)

train_loader, test_loader = BucketIterator.splits(
    (train_data, test_data), sort_key=lambda x: len(x.text),
    sort_within_batch=True,
    batch_size=32)  # batch_size assumed; the original call was truncated here
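# Quick sanity check of pad_under_five defined above (tokens are made up):
assert pad_under_five(["good", "movie"]) == \
    ["good", "movie", "<pad>", "<pad>", "<pad>"]
assert len(pad_under_five(["a", "b", "c", "d", "e", "f"])) == 6  # untouched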
def train_data():
    tokenize = lambda x: x.split()
    Text_src = Field(sequential=True, tokenize=tokenize, eos_token='<EOS>',
                     include_lengths=True, lower=True)
    Answer = Field(sequential=True, tokenize=tokenize, eos_token='<EOS>',
                   include_lengths=True, lower=True)
    Text_tgt = Field(sequential=True, tokenize=tokenize, eos_token='<EOS>',
                     include_lengths=True, init_token='<SOS>', lower=True)
    trn_datafields = [("source", Text_src), ("target", Text_tgt),
                      ("answer", Answer)]
    trn, val = TabularDataset.splits(
        path="../data/" + str(data_name),  # the root directory where the data lies
        train='train.json',
        validation='validation.json',
        format='json',
        # skip_header=True,  # pass this if the file has a header row so it
        #                    # isn't processed as data
        fields={'source': trn_datafields[0],
                'target': trn_datafields[1],
                'answer': trn_datafields[2]})
    # Text_src.build_vocab(trn, max_size=vocab_size)
    Text_src.build_vocab(trn, max_size=src_vocab_size)
    Text_tgt.build_vocab(trn, max_size=tgt_vocab_size)
    Answer.build_vocab(trn)
    Text_src.vocab.load_vectors("glove.840B.300d")
    Text_tgt.vocab.load_vectors("glove.840B.300d")
    train_iter, val_iter = BucketIterator.splits(
        (trn, val),  # the datasets the iterator draws data from
        batch_sizes=(batch_size, batch_size),
        device=-1,  # to use the GPU, specify the GPU number here
        sort_key=lambda x: len(x.source),  # how BucketIterator groups the data
        sort_within_batch=True,
        shuffle=True,
        repeat=False)

    Text_tgt_r = ReversibleField(sequential=True, include_lengths=True,
                                 eos_token='<EOS>', init_token='<SOS>',
                                 lower=True)
    Text_tgt_r.vocab = Text_tgt.vocab
    Text_src_r = ReversibleField(sequential=True, include_lengths=True,
                                 eos_token='<EOS>', lower=True)
    Text_src_r.vocab = Text_src.vocab
    Text_ans_r = ReversibleField(sequential=True, tokenize=tokenize,
                                 eos_token='<EOS>', include_lengths=True,
                                 lower=True)
    Text_ans_r.vocab = Answer.vocab

    src_pad = Text_src.vocab.stoi['<pad>']
    src_unk = Text_src.vocab.stoi['<unk>']
    src_eos = Text_src.vocab.stoi['<EOS>']
    src_special = [src_pad, src_unk, src_eos]
    ans_pad = Answer.vocab.stoi['<pad>']
    ans_unk = Answer.vocab.stoi['<unk>']
    ans_eos = Answer.vocab.stoi['<EOS>']
    ans_special = [ans_pad, ans_unk, ans_eos]
    tgt_pad = Text_tgt.vocab.stoi['<pad>']
    tgt_unk = Text_tgt.vocab.stoi['<unk>']
    tgt_eos = Text_tgt.vocab.stoi['<EOS>']
    tgt_sos = Text_tgt.vocab.stoi['<SOS>']
    tgt_special = [tgt_pad, tgt_unk, tgt_eos, tgt_sos]

    # discriminator data iterator
    passage = Field(sequential=True, tokenize=tokenize, eos_token='<EOS>',
                    include_lengths=True, lower=True)
    ans = Field(sequential=True, tokenize=tokenize, eos_token='<EOS>',
                include_lengths=True, lower=True)
    ques = Field(sequential=True, tokenize=tokenize, eos_token='<EOS>',
                 include_lengths=True, lower=True)
    target = Field(sequential=False, use_vocab=False)
    disc_trn_datafields = [("question", ques), ("answer", ans),
                           ("passage", passage), ("target", target)]
    disc_trn = TabularDataset(
        path="../data/" + str(data_name) + "/disc.json",
        format='json',
        # skip_header=True,
        fields={'question': disc_trn_datafields[0],
                'answer': disc_trn_datafields[1],
                'passage': disc_trn_datafields[2],
                'target': disc_trn_datafields[3]})
    passage.vocab = Text_src.vocab
    ans.vocab = Answer.vocab
    ques.vocab = Text_tgt.vocab
    disc_train_iter = BucketIterator(
        dataset=disc_trn,
        batch_size=batch_size,
        device=-1,
        sort_key=lambda x: len(x.question),
        sort_within_batch=True,
        shuffle=True,
        repeat=False)

    # raw data iterator
    Text_tgt_raw = ReversibleField(sequential=True, tokenize=tokenize,
                                   include_lengths=True, lower=True)
    trn_datafields = [("source", Text_tgt_raw), ("target", Text_tgt_raw)]
    trn_raw, val_raw = TabularDataset.splits(
        path="../data/" + str(data_name),
        train='train.json',
        validation='validation.json',
        format='json',
        # skip_header=True,
        fields={'source': trn_datafields[0], 'target': trn_datafields[1]})
    Text_tgt_raw.build_vocab(val_raw)
    train_iter_raw, val_iter_raw = BucketIterator.splits(
        (trn_raw, val_raw),
        batch_sizes=(batch_size, batch_size),
        device=-1,
        sort_key=lambda x: len(x.source),
        sort_within_batch=True,
        shuffle=True,
        repeat=False)

    return train_iter, val_iter, src_special, tgt_special, Text_tgt_r, \
        val_iter_raw, Text_tgt_raw, Text_src_r, Text_src, Text_tgt, \
        ans_special, Text_ans_r, disc_train_iter
def tokenize_en(sentence):
    return [tok.text for tok in en.tokenizer(sentence)]

def tokenize_fr(sentence):
    return [tok.text for tok in fr.tokenizer(sentence)]

EN_TEXT = Field(tokenize=tokenize_en)
FR_TEXT = Field(tokenize=tokenize_fr, init_token='<sos>', eos_token='<eos>')

# associate the text in the 'English' column with the EN_TEXT field,
# and 'French' with FR_TEXT
data_fields = [('English', EN_TEXT), ('French', FR_TEXT)]
train, val = TabularDataset.splits(path='data_small', train='train.csv',
                                   validation='val.csv', format='csv',
                                   fields=data_fields)
FR_TEXT.build_vocab(train, val)
EN_TEXT.build_vocab(train, val)

max_src_in_batch, max_tgt_in_batch = 100, 100

def batch_size_fn(new, count, sofar):
    "Keep augmenting batch and calculate total number of tokens + padding."
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = 0
        max_tgt_in_batch = 0
    # The original snippet was cut off here; the rest of the body below is a
    # completion following the standard dynamic-batching pattern (the +2 on
    # the target accounts for the <sos>/<eos> tokens).
    max_src_in_batch = max(max_src_in_batch, len(new.English))
    max_tgt_in_batch = max(max_tgt_in_batch, len(new.French) + 2)
    src_elements = count * max_src_in_batch
    tgt_elements = count * max_tgt_in_batch
    return max(src_elements, tgt_elements)
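# Sketch (assumed; the token budget of 1200 is illustrative): legacy torchtext
# iterators accept batch_size_fn directly, so batches can be capped by token
# count rather than by example count.
from torchtext.data import BucketIterator

train_iter = BucketIterator(train, batch_size=1200, train=True,
                            sort_key=lambda x: (len(x.English), len(x.French)),
                            batch_size_fn=batch_size_fn,
                            sort_within_batch=True)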
def tokenize(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

# tokenize = lambda x: x.split()

# preprocess
quote = Field(sequential=True, use_vocab=True, tokenize=tokenize, lower=True)
score = Field(sequential=False, use_vocab=False)
fields = {'quote': ('q', quote), 'score': ('s', score)}

train_data, test_data = TabularDataset.splits(
    path='data',
    train='train.json',
    test='test.json',
    # validation='validation.json',
    format='json',
    fields=fields)

print(train_data[0].__dict__.keys())
print(train_data[0].__dict__.values())

# Alternative csv loading, kept from the original (where it sat inside a
# triple-quoted block; the second call was truncated in the source):
# train_data, test_data = TabularDataset.splits(
#     path='data', train='train.csv', test='test.csv',
#     format='csv', fields=fields)
# train_data, test_data = TabularDataset.splits(
#     path='data', ...
from utils.custom_utils import create_sentence, tokenize_text, modified_bleu, foldify
from utils.utils import save_checkpoint, load_checkpoint

input_text = Field(tokenize=tokenize_text, lower=True,
                   init_token="<sos>", eos_token="<eos>")
output_text = Field(tokenize=tokenize_text, lower=True,
                    init_token="<sos>", eos_token="<eos>")
fields = {'Input': ('i', input_text), 'Output': ('o', output_text)}

big_data = TabularDataset.splits(path="", train="./shuffledgutenberg.json",
                                 format='json', fields=fields)

input_text.build_vocab(big_data[0], max_size=20_000, min_freq=8)   # , vectors='fasttext.simple.300d'
output_text.build_vocab(big_data[0], max_size=20_000, min_freq=8)  # , vectors='fasttext.simple.300d'

print("Input Vocab Size: {}".format(len(input_text.vocab)))
print("Output Vocab Size: {}".format(len(output_text.vocab)))

# We're ready to define everything we need for training our Seq2Seq model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
load_model = True
save_model = True
DEVICE = 0 if USE_CUDA else -1
batch_size = 64

# Tokenizer
tagger = Mecab()
tagger = tagger.morphs

# Make Field
REVIEW = Field(tokenize=tagger, use_vocab=True, lower=True,
               # init_token="<s>", eos_token="</s>",
               include_lengths=True, batch_first=True)
LABEL = Field(sequential=False, use_vocab=False,
              preprocessing=lambda x: int(x))

# Get train/test data
train_data, test_data = TabularDataset.splits(
    path="./data/", train='train_docs.txt', validation="test_docs.txt",
    format='tsv', fields=[('review', REVIEW), ('label', LABEL)])

# Build vocabulary
REVIEW.build_vocab(train_data)
len(REVIEW.vocab)

# Make iterator for splits
train_iter, test_iter = BucketIterator.splits(
    (train_data, test_data), batch_size=batch_size,
    device=DEVICE,  # device -1: cpu, device 0: a free gpu
    sort_key=lambda x: len(x.review),  # sort by review length
    sort_within_batch=True, repeat=False)

# parameters
V = len(REVIEW.vocab)
D = 100
H = 200
def train(args: Dict):
    """ Train the NMT Model.
    @param args (Dict): args from cmd line
    """
    train_data_src = read_corpus(args['--train-src'], source='src')
    train_data_tgt = read_corpus(args['--train-tgt'], source='tgt')
    dev_data_src = read_corpus(args['--dev-src'], source='src')
    dev_data_tgt = read_corpus(args['--dev-tgt'], source='tgt')

    train_data = list(zip(train_data_src, train_data_tgt))
    dev_data = list(zip(dev_data_src, dev_data_tgt))

    train_batch_size = int(args['--batch-size'])
    clip_grad = float(args['--clip-grad'])
    valid_niter = int(args['--valid-niter'])
    log_every = int(args['--log-every'])
    model_save_path = args['--save-to']

    # data preprocessing for Qs and As
    spacy_en = spacy.load('en')

    def tokenizer(text):  # create a tokenizer function
        return [tok.text for tok in spacy_en.tokenizer(text)]

    TEXT = Field(sequential=True, tokenize=tokenizer, lower=True,
                 include_lengths=True, init_token='<s>', eos_token='</s>')
    analogies_datafields = [("abc", TEXT), ("d", TEXT)]
    train, val, test = TabularDataset.splits(
        path="data",  # the root directory where the data lies
        train='ngram_train.csv', validation="ngram_val.csv",
        test='ngram_test.csv', format='csv',
        skip_header=False,  # pass True if the csv has a header row
        fields=analogies_datafields)

    pretrained_vecs = torchtext.vocab.Vectors('../GloVe-1.2/life_vectors.txt')
    TEXT.build_vocab(vectors=pretrained_vecs)  # specials=['<pad>', '<s>', '</s>']

    if args['--cuda'] == 'cpu':
        torch_text_device = -1
    else:
        torch_text_device = 0

    training_iter, val_iter, test_iter = Iterator.splits(
        (train, val, test), sort_key=lambda x: len(x.abc),
        batch_sizes=(100, 20, 1), device=torch_text_device,
        sort_within_batch=True)

    model = NMT(embed_size=int(args['--embed-size']),
                hidden_size=int(args['--hidden-size']),
                dropout_rate=float(args['--dropout']),
                vocab=TEXT.vocab)
    model.train()  # sets training = True

    uniform_init = float(args['--uniform-init'])
    if np.abs(uniform_init) > 0.:
        print('uniformly initialize parameters [-%f, +%f]' %
              (uniform_init, uniform_init), file=sys.stderr)
        for p in model.parameters():
            p.data.uniform_(-uniform_init, uniform_init)

    device = torch.device("cuda:0" if args['--cuda'] else "cpu")
    print('use device: %s' % device, file=sys.stderr)
    model = model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=float(args['--lr']))

    num_trial = 0
    train_iter = patience = cum_loss = report_loss = cum_tgt_words = report_tgt_words = 0
    cum_examples = report_examples = epoch = valid_num = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    print('begin Maximum Likelihood training')
    writer = SummaryWriter('logs')
    is_better_count = 0  # TODO: remove this and debug the non-stopping part

    while True:
        epoch += 1
        for _, data in enumerate(training_iter):
            (src_sents, src_lengths), (tgt_sents, _) = data.abc, data.d
            train_iter += 1
            optimizer.zero_grad()
            batch_size = src_sents.shape[1]
            example_losses = model(src_sents, src_lengths, tgt_sents)  # (batch_size,)
            batch_loss = example_losses.sum()
            loss = batch_loss / batch_size
            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
            optimizer.step()

            batch_losses_val = batch_loss.item()
            report_loss += batch_losses_val
            cum_loss += batch_losses_val

            tgt_words_num_to_predict = sum(len(s[1:]) for s in tgt_sents)  # omitting leading `<s>`
            report_tgt_words += tgt_words_num_to_predict
            cum_tgt_words += tgt_words_num_to_predict
            report_examples += batch_size
            cum_examples += batch_size

            if train_iter % log_every == 0:
                print('epoch %d, iter %d, avg. loss %.2f, avg. ppl %.2f '
                      'cum. examples %d, speed %.2f words/sec, time elapsed %.2f sec' %
                      (epoch, train_iter,
                       report_loss / report_examples,
                       math.exp(report_loss / report_tgt_words),
                       cum_examples,
                       report_tgt_words / (time.time() - train_time),
                       time.time() - begin_time), file=sys.stderr)
                writer.add_scalar('Train/AvgLoss', report_loss / report_examples, epoch)
                writer.add_scalar('Train/AvgPPL', math.exp(report_loss / report_tgt_words), epoch)
                train_time = time.time()
                report_loss = report_tgt_words = report_examples = 0.

            # perform validation
            if train_iter % valid_niter == 0:
                print('epoch %d, iter %d, cum. loss %.2f, cum. ppl %.2f cum. examples %d' %
                      (epoch, train_iter,
                       cum_loss / cum_examples,
                       np.exp(cum_loss / cum_tgt_words),
                       cum_examples), file=sys.stderr)
                cum_loss = cum_examples = cum_tgt_words = 0.
                valid_num += 1

                print('begin validation ...', file=sys.stderr)
                # compute dev. ppl and bleu
                dev_ppl, val_loss = evaluate_ppl(model, val_iter)  # dev batch size can be a bit larger
                valid_metric = -dev_ppl
                print('validation: iter %d, dev. ppl %f, dev loss %f' %
                      (train_iter, dev_ppl, val_loss), file=sys.stderr)
                writer.add_scalar('Val/AvgPPL', dev_ppl, epoch)
                writer.add_scalar('Val/AvgLoss', val_loss, epoch)

                is_better = len(hist_valid_scores) == 0 or valid_metric > max(hist_valid_scores)
                print(hist_valid_scores)
                print(valid_metric)
                hist_valid_scores.append(valid_metric)

                if is_better:
                    patience = 0
                    print('save currently the best model to [%s]' % model_save_path,
                          file=sys.stderr)
                    model.save(model_save_path)
                    is_better_count = is_better_count + 1
                    print(is_better_count)
                    # also save the optimizer's state
                    torch.save(optimizer.state_dict(), model_save_path + '.optim')
                    if is_better_count > 3:
                        print('reached maximum number of epochs!', file=sys.stderr)
                        writer.close()
                        exit(0)
                elif patience < int(args['--patience']):
                    patience += 1
                    print('hit patience %d' % patience, file=sys.stderr)

                    if patience == int(args['--patience']):
                        num_trial += 1
                        print('hit #%d trial' % num_trial, file=sys.stderr)
                        if num_trial == int(args['--max-num-trial']):
                            print('early stop!', file=sys.stderr)
                            exit(0)

                        # decay lr, and restore from previously best checkpoint
                        lr = optimizer.param_groups[0]['lr'] * float(args['--lr-decay'])
                        print('load previously best model and decay learning rate to %f' % lr,
                              file=sys.stderr)

                        # load model
                        params = torch.load(model_save_path,
                                            map_location=lambda storage, loc: storage)
                        model.load_state_dict(params['state_dict'])
                        model = model.to(device)

                        print('restore parameters of the optimizers', file=sys.stderr)
                        optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                        # set new lr
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr

                        # reset patience
                        patience = 0

            if epoch == int(args['--max-epoch']):
                print('reached maximum number of epochs!', file=sys.stderr)
                writer.close()
                exit(0)
# (reconstructed head: the label_field definition was truncated in the source;
#  the arguments below mirror the identical snippet later in this collection)
label_field = Field(sequential=False, use_vocab=False, batch_first=True,
                    dtype=torch.float)
text_field = Field(tokenize='spacy', lower=True, include_lengths=True,
                   batch_first=True)
fields = [('label', label_field), ('title', text_field),
          ('text', text_field), ('titletext', text_field)]

# TabularDataset
train, valid, test = TabularDataset.splits(
    path=source_folder, train='train.csv', validation='valid.csv',
    test='test.csv', format='CSV', fields=fields, skip_header=True)

# Iterators
train_iter = BucketIterator(train, batch_size=32,
                            sort_key=lambda x: len(x.text), device=device,
                            sort=True, sort_within_batch=True)
valid_iter = BucketIterator(valid, batch_size=32,
                            sort_key=lambda x: len(x.text), device=device,
                            sort=True, sort_within_batch=True)
# (the valid_iter call was cut off in the source; it is completed above to
#  mirror train_iter)
def __init__(self, config):
    # logger
    self.logger = logging.getLogger(config["name"])
    # data loader params
    self.config = config["data_loader"]["args"]
    data_path = self.config["data_path"]
    ensure_dir(data_path)
    self.train_path = os.path.join(data_path, self.config["train_file"])
    self.valid_path = os.path.join(data_path, self.config["valid_file"])
    self.test_path = os.path.join(data_path, self.config["test_file"])
    # limit max text length
    self.context_threshold = self.config["context_threshold"]

    self.logger.info("preprocessing data files...")
    if not os.path.exists(self.train_path) or not os.path.exists(self.valid_path):
        self.preprocess(type="train")
    if not os.path.exists(self.test_path):
        self.preprocess(type="test")

    # define fields
    TEXT = Field(sequential=True, use_vocab=True, tokenize=lambda x: x,
                 lower=True, include_lengths=True, batch_first=True)
    LABEL = Field(sequential=False, use_vocab=False)

    # build dataset
    self.logger.info("building dataset...")
    train_dict_fields = {'text': ('text', TEXT), 'label': ('label', LABEL)}
    self.train, self.valid, self.test = TabularDataset.splits(
        path=data_path,  # data root path
        format="json",
        train=self.config["train_file"],
        validation=self.config["valid_file"],
        test=self.config["test_file"],
        fields=train_dict_fields)

    # build vocab
    self.logger.info("building vocab...")
    TEXT.build_vocab(self.train, self.valid)

    # load pretrained embeddings
    self.logger.info("loading pretrained embeddings...")
    vectors = vocab.Vectors(self.config["pretrain_emd_file"])
    TEXT.vocab.load_vectors(vectors)
    # keep the vocab handy
    self.vocab = TEXT.vocab

    # build iterators
    self.logger.info("building iterators...")
    self.train_iter, self.valid_iter = BucketIterator.splits(
        (self.train, self.valid),
        batch_sizes=(self.config["train_batch_size"],
                     self.config["valid_batch_size"]),
        device=self.config["device"],
        sort_key=lambda x: len(x.text),
        sort_within_batch=False)
    self.test_iter = BucketIterator(
        self.test,
        batch_size=self.config["test_batch_size"],
        device=self.config["device"],
        sort_key=lambda x: len(x.text),
        sort_within_batch=False)
    self.logger.info("building iterators done!")
    self.logger.info(
        "Total train data set is: {}, valid data set is: {}, test "
        "data is: {}".format(len(self.train), len(self.valid), len(self.test)))
def handle(self, *args, **kwargs):
    min_freq = kwargs['min_freq']
    batch_size = kwargs['batch_size']
    num_epochs = kwargs['num_epochs']
    embedding_output = kwargs['embedding_output']
    hidden_size = kwargs['hidden_size']
    num_layers = kwargs['num_layers']
    bi_lstm = kwargs['bi_lstm']

    self.stdout.write("Loading Dataset ... ")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.stdout.write("creating fields ...")
    # Fields
    label_field = Field(sequential=False, use_vocab=False, batch_first=True,
                        dtype=torch.float)
    text_field = Field(tokenize='moses', lower=True, include_lengths=True,
                       batch_first=True)
    fields = [('label', label_field), ('title', text_field),
              ('text', text_field), ('titletext', text_field)]

    # TabularDataset
    self.stdout.write('creating TabularDataset...')
    train, valid, test = TabularDataset.splits(
        path='./data/preprocessed/', train='train.csv',
        validation='valid.csv', test='test.csv', format='CSV',
        fields=fields, skip_header=True)

    # Iterators
    self.stdout.write("Creating iterators...")
    train_iter = BucketIterator(train, batch_size=batch_size,
                                sort_key=lambda x: len(x.text), device=device,
                                sort=False, sort_within_batch=True)
    valid_iter = BucketIterator(valid, batch_size=batch_size,
                                sort_key=lambda x: len(x.text), device=device,
                                sort=False, sort_within_batch=True)
    test_iter = BucketIterator(test, batch_size=batch_size,
                               sort_key=lambda x: len(x.text), device=device,
                               sort=False, sort_within_batch=True)

    # Vocabulary
    self.stdout.write("Creating vocabulary")
    text_field.build_vocab(train, min_freq=min_freq)

    class FakeNewsNet(nn.Module):
        def __init__(self, vocab_size=len(text_field.vocab), hidden_size=300,
                     num_layers=1, bi_lstm=True):
            super(FakeNewsNet, self).__init__()
            self.vocab_size = vocab_size
            self.hidden_size = hidden_size
            self.num_layers = num_layers
            self.bi_lstm = bi_lstm
            self.embedding = nn.Embedding(self.vocab_size, embedding_output)
            self.LSTM = nn.LSTM(input_size=embedding_output,
                                hidden_size=self.hidden_size,
                                num_layers=self.num_layers,
                                bidirectional=self.bi_lstm,
                                batch_first=True)
            self.drop = nn.Dropout(p=0.5)
            if bi_lstm:
                self.out = nn.Linear(2 * self.hidden_size, 1)
            else:
                self.out = nn.Linear(self.hidden_size, 1)

        def forward(self, inp, input_len):
            embeded_text = self.embedding(inp)
            packed_input = pack_padded_sequence(embeded_text, input_len,
                                                batch_first=True,
                                                enforce_sorted=False)
            packed_output, _ = self.LSTM(packed_input)
            output, _ = pad_packed_sequence(packed_output, batch_first=True)
            # last time step of the forward direction, first of the reverse
            out_forward = output[range(len(output)), input_len - 1, :self.hidden_size]
            out_reverse = output[:, 0, self.hidden_size:]
            out_reduced = torch.cat((out_forward, out_reverse), 1)
            text_fea = self.drop(out_reduced)
            text_fea = self.out(text_fea)
            text_fea = torch.squeeze(text_fea, 1)
            text_out = torch.sigmoid(text_fea)
            return text_out

    def save_checkpoint(save_path, model, optimizer, valid_loss):
        if save_path is None:
            return
        state_dict = {'model_state_dict': model.state_dict(),
                      'optimizer_state_dict': optimizer.state_dict(),
                      'valid_loss': valid_loss}
        torch.save(state_dict, save_path)
        self.stdout.write(f'Model saved to: {save_path}')

    def load_checkpoint(load_path, model, optimizer):
        if load_path is None:
            return
        state_dict = torch.load(load_path, map_location=device)
        self.stdout.write(f'Model loaded from: {load_path}')
        model.load_state_dict(state_dict['model_state_dict'])
        optimizer.load_state_dict(state_dict['optimizer_state_dict'])
        return state_dict['valid_loss']

    def save_metrics(save_path, train_loss_list, valid_loss_list,
                     global_steps_list):
        if save_path is None:
            return
        state_dict = {'train_loss_list': train_loss_list,
                      'valid_loss_list': valid_loss_list,
                      'global_steps_list': global_steps_list}
        torch.save(state_dict, save_path)
        self.stdout.write(f'Metrics saved to: {save_path}')

    def load_metrics(load_path):
        if load_path is None:
            return
        state_dict = torch.load(load_path, map_location=device)
        self.stdout.write(f'Metrics loaded from: {load_path}')
        return (state_dict['train_loss_list'], state_dict['valid_loss_list'],
                state_dict['global_steps_list'])

    def train(model, optimizer, criterion=nn.BCELoss(),
              train_loader=train_iter, valid_loader=valid_iter,
              num_epochs=100, eval_every=len(train_iter) // 2,
              file_path='./saved', best_valid_loss=float("Inf")):
        # initialize running values
        running_loss = 0.0
        valid_running_loss = 0.0
        global_step = 0
        train_loss_list = []
        valid_loss_list = []
        global_steps_list = []

        # training loop
        self.stdout.write("training ...")
        model.train()
        for epoch in range(num_epochs):
            for (labels, (title, title_len), (text, text_len),
                 (titletext, titletext_len)), _ in train_loader:
                labels = labels.to(device)
                titletext = titletext.to(device)
                titletext_len = titletext_len.to(device)
                output = model(titletext, titletext_len)
                loss = criterion(output, labels)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # update running values
                running_loss += loss.item()
                global_step += 1

                # evaluation step
                if global_step % eval_every == 0:
                    model.eval()
                    with torch.no_grad():
                        # validation loop
                        for (labels, (title, title_len), (text, text_len),
                             (titletext, titletext_len)), _ in valid_loader:
                            labels = labels.to(device)
                            titletext = titletext.to(device)
                            titletext_len = titletext_len.to(device)
                            output = model(titletext, titletext_len)
                            loss = criterion(output, labels)
                            valid_running_loss += loss.item()

                    # evaluation
                    average_train_loss = running_loss / eval_every
                    average_valid_loss = valid_running_loss / len(valid_loader)
                    train_loss_list.append(average_train_loss)
                    valid_loss_list.append(average_valid_loss)
                    global_steps_list.append(global_step)

                    # resetting running values
                    running_loss = 0.0
                    valid_running_loss = 0.0
                    model.train()

                    # print progress
                    self.stdout.write(
                        'Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                        .format(epoch + 1, num_epochs, global_step,
                                num_epochs * len(train_loader),
                                average_train_loss, average_valid_loss))

                    # checkpoint
                    if best_valid_loss > average_valid_loss:
                        best_valid_loss = average_valid_loss
                        save_checkpoint(file_path + '/model.pt', model,
                                        optimizer, best_valid_loss)
                        save_metrics(file_path + '/metrics.pt', train_loss_list,
                                     valid_loss_list, global_steps_list)

        save_metrics(file_path + '/metrics.pt', train_loss_list,
                     valid_loss_list, global_steps_list)
        self.stdout.write('Finished Training!')

    model = FakeNewsNet(hidden_size=hidden_size, num_layers=num_layers,
                        bi_lstm=bi_lstm).to(device)
    self.stdout.write(str(model))  # stdout.write expects a string
    optimizer = optim.Adam(model.parameters(), lr=0.01, eps=1e-6)
    train(model=model, optimizer=optimizer, num_epochs=num_epochs, eval_every=2)
def main():
    # ----------------- get train, val and test data --------------------
    train_data, test_data = TabularDataset.splits(
        path=r"D:\ruin\data\csv_file\imdb_split",
        train='train_data.csv', test='test_data.csv', format='csv',
        fields=[('review', TEXT), ('sentiment', LABEL)], skip_header=True)
    train_data, eval_data = train_data.split(random_state=random.seed(RANDOM_SEED))

    print('Number of train data {}'.format(len(train_data)))
    print('Number of val data {}'.format(len(eval_data)))
    print('Number of test data {}'.format(len(test_data)))

    TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE,
                     vectors="glove.6B.100d", min_freq=10)
    LABEL.build_vocab(train_data)
    print('Unique tokens in TEXT vocabulary {}'.format(len(TEXT.vocab)))  # 250002 (<unk>, <pad>)
    print(TEXT.vocab.itos)
    print('Unique tokens in LABEL vocabulary {}'.format(len(LABEL.vocab)))
    print(LABEL.vocab.itos)
    print('Top 20 words by frequency: \n {}'.format(TEXT.vocab.freqs.most_common(20)))
    print('Embedding shape {}'.format(TEXT.vocab.vectors.size()))
    print('Done')

    # generate dataloaders
    train_iter = data.BucketIterator(train_data, batch_size=BATCH_SIZE,
                                     device=device, shuffle=True)
    eval_iter, test_iter = data.BucketIterator.splits(
        (eval_data, test_data), batch_size=BATCH_SIZE, device=device,
        sort_key=lambda x: len(x.review), sort_within_batch=True)
    # https://stackoverflow.com/questions/58241313/understanding-typeerror-not-supported-between-instances-of-example-and-e
    # The link above explains why sort_key=lambda x: len(x.review) is needed.

    for batch_data in train_iter:
        print(batch_data.review)     # text, text_length
        print(batch_data.sentiment)  # label
        break

    # construct model
    VOCAB_SIZE = len(TEXT.vocab)
    HIDDEN_SIZE = 256
    OUTPUT_SIZE = 1
    NUM_LAYER = 2
    BIDIRECTIONAL = True
    DROPOUT = 0.5
    EMBEDDING_DIM = 100
    PAD_INDEX = TEXT.vocab.stoi[TEXT.pad_token]
    UNK_INDEX = TEXT.vocab.stoi[TEXT.unk_token]

    model = BiLSTMSentiment(vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM,
                            hidden_size=HIDDEN_SIZE, output_size=OUTPUT_SIZE,
                            num_layer=NUM_LAYER, bidirectional=BIDIRECTIONAL,
                            dropout=DROPOUT, pad_index=PAD_INDEX)

    # load pretrained weights into the embedding layer
    pretrained_embedding = TEXT.vocab.vectors
    print(pretrained_embedding)
    pretrained_embedding[PAD_INDEX] = 0
    pretrained_embedding[UNK_INDEX] = 0
    print(pretrained_embedding)
    model.embedding.weight.data.copy_(pretrained_embedding)

    # optimizer
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
    # criterion
    criterion = nn.BCEWithLogitsLoss()
    scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
    model = model.to(device)

    EPOCH = 5
    # MODEL_PATH = './output/bilstm_model.pth'
    # BEST_MODEL_PATH = './output/bilstm_model_best.pth'
    best_eval_loss = float('inf')
    for epoch in range(EPOCH):
        print('{}/{}'.format(epoch, EPOCH))
        train_acc, train_loss = train(model, train_iter, optimizer=optimizer,
                                      criterion=criterion)
        eval_acc, eval_loss = test(model, eval_iter, criterion=criterion)
        print('Train => acc {:.3f}, loss {:4f}'.format(train_acc, train_loss))
        print('Valid => acc {:.3f}, loss {:4f}'.format(eval_acc, eval_loss))
        scheduler.step()

        # save model
        state = {
            'vocab_size': VOCAB_SIZE,
            'embedding_dim': EMBEDDING_DIM,
            'hidden_size': HIDDEN_SIZE,
            'output_size': OUTPUT_SIZE,
            'num_layer': NUM_LAYER,
            'bidirectional': BIDIRECTIONAL,
            'dropout': DROPOUT,
            'state_dict': model.state_dict(),
            'pad_index': PAD_INDEX,
            'unk_index': UNK_INDEX,
            'text_vocab': TEXT.vocab.stoi,
        }
        # os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
        # torch.save(state, MODEL_PATH)
        if eval_loss < best_eval_loss:
            # shutil.copy(MODEL_PATH, BEST_MODEL_PATH)
            best_eval_loss = eval_loss

    test_acc, test_loss = test(model, test_iter, criterion=criterion)
    print('Test Eval => acc {:.3f}, loss {:4f}'.format(test_acc, test_loss))
# (the head of the SRC Field definition was truncated in the source; the
#  arguments below are an assumed reconstruction mirroring TGT, except that
#  only the unk_token line is original)
SRC = Field(use_vocab=False,
            tokenize=g_bert_tokenizer.tokenize,
            preprocessing=g_bert_tokenizer.convert_tokens_to_ids,
            pad_token=g_bert_tokenizer.pad_token_id,
            unk_token=g_bert_tokenizer.unk_token_id)
TGT = Field(use_vocab=False,
            tokenize=g_gpt_tokenizer.tokenize,
            preprocessing=g_gpt_tokenizer.convert_tokens_to_ids,
            init_token=g_gpt_tokenizer.bos_token_id,
            eos_token=g_gpt_tokenizer.eos_token_id,
            pad_token=g_gpt_tokenizer.eos_token_id,
            unk_token=g_gpt_tokenizer.unk_token_id)
g_data_fields = [('src', SRC), ('tgt', TGT)]

train_data, validation_data, test_data = TabularDataset.splits(
    path='datasets/', format='csv',
    train='chat_corpus_train.csv',
    validation='chat_corpus_validation.csv',
    test='chat_corpus_test.csv',
    skip_header=False, fields=g_data_fields)

g_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iterator, validation_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.src),  # function used to group the data
    sort_within_batch=False,
    device=g_device)

class TransBertEncoder(nn.Module):
# Pad or truncate every text to fix_length
TEXT = data.Field(sequential=True, use_vocab=True, tokenize=okt.morphs,
                  lower=True, batch_first=True, fix_length=20)
LABEL = data.Field(sequential=False, use_vocab=False,
                   is_target=True)  # marks the target variable
# (assumed: the ID field was not defined in this snippet)
ID = data.Field(sequential=False, use_vocab=False)

train_data, test_data = TabularDataset.splits(
    path='.', train='ratings_train.txt', test='ratings_test.txt',
    format='tsv', fields=[('id', ID), ('text', TEXT), ('label', LABEL)],
    skip_header=True)
print(vars(train_data[0]))

# word2index for words that appear at least 10 times
TEXT.build_vocab(train_data, min_freq=10, max_size=70000)

batch_size = 50
train_loader = Iterator(dataset=train_data, batch_size=batch_size, shuffle=True)
test_loader = Iterator(dataset=test_data, batch_size=batch_size)
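# Sketch: one pass over the iterator above. With batch_first=True and
# fix_length=20, batch.text is [batch_size, 20] for full batches.
for batch in train_loader:
    x, y = batch.text, batch.label
    print(x.shape)  # torch.Size([50, 20])
    break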
def decode(args: Dict[str, str]):
    """ Performs decoding on a test set, and saves the best-scoring decoding results.
    If the target gold-standard sentences are given, the function also computes
    corpus-level BLEU score.
    @param args (Dict): args from cmd line
    """
    spacy_en = spacy.load('en')

    def tokenizer(text):  # create a tokenizer function
        return [tok.text for tok in spacy_en.tokenizer(text)]

    TEXT = Field(sequential=True, tokenize=tokenizer, lower=True,
                 include_lengths=True, init_token='<s>', eos_token='</s>')
    analogies_datafields = [("abc", TEXT), ("d", TEXT)]
    train, val, test = TabularDataset.splits(
        path="data",  # the root directory where the data lies
        train='ngram_train.csv', validation="ngram_val.csv",
        test='ngram_test.csv', format='csv',
        skip_header=False,  # pass True if the csv has a header row
        fields=analogies_datafields)

    pretrained_vecs = torchtext.vocab.Vectors('../GloVe-1.2/life_vectors.txt')
    TEXT.build_vocab(vectors=pretrained_vecs)  # specials=['<pad>', '<s>', '</s>']

    if args['--cuda'] == 'cpu':
        torch_text_device = -1
    else:
        torch_text_device = 0

    training_iter, val_iter, test_iter = Iterator.splits(
        (train, val, test), sort_key=lambda x: len(x.abc),
        batch_sizes=(100, 20, 1), device=torch_text_device,
        sort_within_batch=True)

    print("load test source sentences from [{}]".format(args['TEST_SOURCE_FILE']),
          file=sys.stderr)
    test_data_src = read_corpus(args['TEST_SOURCE_FILE'], source='src')
    if args['TEST_TARGET_FILE']:
        print("load test target sentences from [{}]".format(args['TEST_TARGET_FILE']),
              file=sys.stderr)
        test_data_tgt = read_corpus(args['TEST_TARGET_FILE'], source='tgt')

    print("load model from {}".format(args['MODEL_PATH']), file=sys.stderr)
    model = NMT.load(args['MODEL_PATH'])
    if args['--cuda']:
        model = model.to(torch.device("cuda:0"))

    hypotheses = beam_search(
        model, test_iter, beam_size=int(args['--beam-size']),
        max_decoding_time_step=int(args['--max-decoding-time-step']))

    if args['TEST_TARGET_FILE']:
        top_hypotheses = [hyps[0] for hyps in hypotheses]
        bleu_score = compute_corpus_level_bleu_score(test_data_tgt, top_hypotheses)
        # accuracy (unigrams)
        perfectly_correct = 0
        for index, hyp in enumerate(top_hypotheses):
            if hyp.value[0] == test_data_tgt[index][1]:
                perfectly_correct += 1
        print('Ignore accuracy for non unigrams')
        print('Accuracy: {}'.format(perfectly_correct / len(test_data_tgt)),
              file=sys.stderr)
        print('Corpus BLEU: {}'.format(bleu_score * 100), file=sys.stderr)

    with open(args['OUTPUT_FILE'], 'w') as f:
        for src_sent, hyps in zip(test_data_src, hypotheses):
            top_hyp = hyps[0]
            hyp_sent = ' '.join(top_hyp.value)
            f.write(hyp_sent + '\n')
decoy_strength = args.decoy_strength
seed(p)
s = S(p)
out_name = str(args.which_adversarial) + p._str(p)
use_individual = True  # XXX
torch.cuda.set_device(args.gpu)

inputs = data.Field(lower=True)
answers = data.Field(sequential=False, unk_token=None)
tv_datafields = [("text", inputs), ("label", answers)]
train, dev, test = TabularDataset.splits(
    path=dataset_path,  # the root directory where the data lies
    train='train_bias_SST.csv', validation="dev_bias_SST.csv",
    test="test_bias_SST.csv", format='csv',
    skip_header=False,  # pass True if the csv has a header row
    fields=tv_datafields)

inputs.build_vocab(train, dev, test)
if args.word_vectors:
    if os.path.isfile(args.vector_cache):
        inputs.vocab.vectors = torch.load(args.vector_cache)
    else:
        inputs.vocab.load_vectors(args.word_vectors)
        os.makedirs(os.path.dirname(args.vector_cache), exist_ok=True)
        torch.save(inputs.vocab.vectors, args.vector_cache)
answers.build_vocab(train)

class_decoy = (inputs.vocab.stoi['a'], inputs.vocab.stoi['the'])
patience = 0
decoy_strength = args.decoy_strength
seed(p)
s = S(p)
out_name = str(args.which_adversarial) + p._str(p)
torch.cuda.set_device(args.gpu)

inputs = data.Field(lower=True)
answers = data.Field(sequential=False, unk_token=None)
tv_datafields = [("text", inputs), ("label", answers)]
train, dev, test = TabularDataset.splits(
    path=dataset_path,
    train='train_bias_SST.csv', validation="dev_bias_SST.csv",
    test="test_bias_SST.csv", format='csv', skip_header=False,
    fields=tv_datafields)

inputs.build_vocab(train, dev, test)
if args.word_vectors:
    if os.path.isfile(args.vector_cache):
        inputs.vocab.vectors = torch.load(args.vector_cache)
    else:
        inputs.vocab.load_vectors(args.word_vectors)
        os.makedirs(os.path.dirname(args.vector_cache), exist_ok=True)
        torch.save(inputs.vocab.vectors, args.vector_cache)
answers.build_vocab(train)

class_decoy = (inputs.vocab.stoi['a'], inputs.vocab.stoi['the'])
def main():
    train_data, test_data = TabularDataset.splits(
        path=r"D:\ruin\data\csv_file\imdb_split",
        train='train_data.csv', test='test_data.csv', format='csv',
        fields=[('review', TEXT), ('sentiment', LABEL)], skip_header=True)
    train_data, valid_data = train_data.split(random_state=random.seed(SEED))

    print('Number of train data {}'.format(len(train_data)))
    print('Number of val data {}'.format(len(valid_data)))
    print('Number of test data {}'.format(len(test_data)))

    MAX_VOCAB_SIZE = 25_000
    TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE,
                     vectors='glove.6B.100d', unk_init=torch.Tensor.normal_)
    LABEL.build_vocab(train_data)

    BATCH_SIZE = 64
    train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
        (train_data, valid_data, test_data), batch_size=BATCH_SIZE,
        sort_key=lambda x: len(x.review), device=device)

    INPUT_DIM = len(TEXT.vocab)
    EMBEDDING_DIM = 100
    N_FILTERS = 100
    FILTER_SIZES = [3, 4, 5]
    OUTPUT_DIM = 1
    DROPOUT = 0.5
    PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

    model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, FILTER_SIZES,
                OUTPUT_DIM, DROPOUT, PAD_IDX)

    pretrained_embeddings = TEXT.vocab.vectors
    model.embedding.weight.data.copy_(pretrained_embeddings)
    UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
    model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
    model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.BCEWithLogitsLoss()
    model = model.to(device)
    criterion = criterion.to(device)

    def epoch_time(start_time, end_time):
        elapsed_time = end_time - start_time
        elapsed_mins = int(elapsed_time / 60)
        elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
        return elapsed_mins, elapsed_secs

    best_valid_loss = float('inf')
    N_EPOCHS = 5
    for epoch in range(N_EPOCHS):
        start_time = time.time()
        train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
        print(f'Epoch: {epoch + 1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc * 100:.2f}%')

    test_loss, test_acc = evaluate(model, test_iterator, criterion)
    print(f'\t Test. Loss: {test_loss:.3f} | Test. Acc: {test_acc * 100:.2f}%')
# Declaring Fields
tokenize = lambda x: x.split()
TEXT = Field(sequential=True, tokenize=tokenize, lower=True)
LABEL = Field(sequential=False, use_vocab=False)

# Creating the Dataset
tv_datafields = [("id", None), ("comment_text", TEXT), ("toxic", LABEL),
                 ("severe_toxic", LABEL), ("threat", LABEL),
                 ("obscene", LABEL), ("insult", LABEL),
                 ("identity_hate", LABEL)]
train_data, valid_data = TabularDataset.splits(
    path="data",  # the root directory where the data lies
    train="train.csv", validation="valid.csv", format='csv',
    skip_header=True, fields=tv_datafields)

tst_datafields = [("id", None), ("comment_text", TEXT)]
test_data = TabularDataset(path="data/test.csv", format='csv',
                           skip_header=True, fields=tst_datafields)

TEXT.build_vocab(train_data)
TEXT.vocab.freqs.most_common(10)
print(train_data[0])
print(train_data[0].__dict__.keys())
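# Hedged sketch (batch size illustrative): batching the multi-label dataset
# above with legacy torchtext iterators; the test set is left unsorted.
from torchtext.data import BucketIterator, Iterator

train_iter, valid_iter = BucketIterator.splits(
    (train_data, valid_data), batch_size=64,
    sort_key=lambda x: len(x.comment_text), sort_within_batch=False)
test_iter = Iterator(test_data, batch_size=64, train=False, sort=False)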
def load_dataset(source_folder, device, tokenizer, MAX_SEQ_LEN=128,
                 BATCH_SIZE=16, name_of_train_dataset='train.csv',
                 name_of_validation_dataset='valid.csv',
                 name_of_test_dataset='test.csv'):
    PAD_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
    UNK_INDEX = tokenizer.convert_tokens_to_ids(tokenizer.unk_token)

    # Fields
    label_field = Field(sequential=False, use_vocab=False, batch_first=True,
                        dtype=torch.float)
    text_field = Field(use_vocab=False, tokenize=tokenizer.encode, lower=False,
                       include_lengths=False, batch_first=True,
                       fix_length=MAX_SEQ_LEN, pad_token=PAD_INDEX,
                       unk_token=UNK_INDEX)
    # note: the fields must be in the same order as the input csv columns
    fields = [('clause_text', text_field), ('label', label_field)]

    # TabularDataset
    train, valid, test = TabularDataset.splits(
        path=source_folder, train=name_of_train_dataset,
        validation=name_of_validation_dataset, test=name_of_test_dataset,
        format='CSV', fields=fields, skip_header=True)

    # Iterators
    train_iter = BucketIterator(train, batch_size=BATCH_SIZE,
                                sort_key=lambda x: len(x.clause_text),
                                device=device, train=True, sort=True,
                                sort_within_batch=True)
    valid_iter = BucketIterator(valid, batch_size=BATCH_SIZE,
                                sort_key=lambda x: len(x.clause_text),
                                device=device, train=True, sort=True,
                                sort_within_batch=True)
    test_iter = Iterator(test, batch_size=BATCH_SIZE, device=device,
                         train=False, shuffle=False, sort=False)
    return train_iter, valid_iter, test_iter
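# Calling sketch (assumed): pairing load_dataset with a HuggingFace tokenizer;
# the model name and data folder below are illustrative, not from the source.
import torch
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iter, valid_iter, test_iter = load_dataset(
    source_folder='./data', device=device, tokenizer=tokenizer)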
def run_smiles_generator(test_file):
    src = Field(sequential=True, tokenize=tokenize_drug,
                init_token='<sos>', eos_token='<eos>')
    trg = Field(sequential=True, tokenize=tokenize_drug,
                init_token='<sos>', eos_token='<eos>')

    # Get the train and test set in torchtext format
    datafields = [("src", src), ("trg", trg)]
    train, test = TabularDataset.splits(
        path='../data/SMILES_Autoencoder/',
        train='all_smiles_revised_final.csv', test=test_file,
        format='csv', skip_header=True, fields=datafields)

    # Split the dataset into train and validation set
    train_data, valid_data = train.split(split_ratio=0.99)
    print(f"Number of examples: {len(train_data.examples)}")
    src.build_vocab(train_data, min_freq=2)
    trg.build_vocab(train_data, min_freq=2)

    # Total no of unique words in our vocabulary
    print(f"Unique tokens in source vocabulary: {len(src.vocab)}")
    print(f"Unique tokens in target vocabulary: {len(trg.vocab)}")
    TRG_PAD_IDX = trg.vocab.stoi[trg.pad_token]
    print("Padding Id: ", TRG_PAD_IDX)

    # Create the iterator to traverse over test samples for which we need to
    # generate the latent space
    BATCH_SIZE = 128
    (train_iterator, test_iterator) = BucketIterator.splits(
        (train_data, test), batch_size=BATCH_SIZE, device=DEVICE,
        sort=False, shuffle=False)
    print(src.vocab.stoi)
    print(trg.vocab.stoi)

    # Define the model once again
    INPUT_DIM = len(src.vocab)
    OUTPUT_DIM = len(trg.vocab)
    ENC_EMB_DIM = 128
    DEC_EMB_DIM = 128
    HID_DIM = 256
    N_LAYERS = 1
    ENC_DROPOUT = 0.0
    DEC_DROPOUT = 0.0
    enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
    dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
    model = Seq2Seq(enc, dec, device=DEVICE).to(DEVICE)
    model.apply(init_weights)
    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss().to(DEVICE)
    model.load_state_dict(
        torch.load('../models/lstm_out/torchtext_checkpoint.pt',
                   map_location=torch.device('cpu')))

    # Get latent space for all drugs
    model.eval()
    epoch_loss = 0
    ls_list = []
    encode_list = []
    decode_list = []
    error_list = []
    with torch.no_grad():
        for j, batch in enumerate(test_iterator):
            new_src = batch.src
            new_trg = batch.trg

            # Get output
            outputs = model(new_src, new_trg, 1)  # turn on teacher forcing
            output = outputs[0]
            hidden = outputs[1]
            cell_state = outputs[2]

            # Get latent space
            o1 = torch.argmax(torch.softmax(output, dim=2), dim=2)
            h1 = torch.mean(hidden, dim=0).cpu().detach().tolist()
            c1 = torch.mean(cell_state, dim=0).cpu().detach().tolist()
            for i in range(len(h1)):
                temp_ls = h1[i]
                temp_encode = new_trg[:, i].cpu().detach().tolist()
                temp_decode = o1[:, i].cpu().detach().tolist()
                try:
                    index_1 = temp_decode.index(1)
                except:
                    index_1 = len(temp_decode)
                temp_error = np.array(temp_encode) - np.array(temp_decode)
                error = sum(np.absolute(temp_error[1:index_1]) > 0) / len(temp_error)
                error_list.append(error)
                ls_list.append(temp_ls)
                encode_list.append(temp_encode)
                decode_list.append(temp_decode)

            output_dim = output.shape[-1]
            output = output[1:].view(-1, output_dim)
            rev_trg = new_trg[1:].view(-1)
            loss = criterion(output, rev_trg)
            print("Reconstruction loss for iteration " + str(j) + " is: " +
                  str(round(loss.item(), 3)))
            epoch_loss += loss.item()

    # Print overall average error
    print("Average reconstruction error: ", epoch_loss / len(test_iterator))
    torch.cuda.empty_cache()

    final_list, only_smiles_list = [], []
    for i in range(len(encode_list)):
        temp_encode = encode_list[i]
        temp_decode = decode_list[i]
        temp_encode_str, temp_decode_str, temp_mol_str, temp_error_str = '', '', '', ''
        # Get original string
        for j in range(1, len(temp_encode)):
            # Break when it sees padding
            if temp_encode[j] == 1:
                break
            # Don't append the end-of-sentence token
            if temp_encode[j] != 3:
                temp_encode_str += src.vocab.itos[temp_encode[j]]
        # Get decoded string
        for j in range(1, len(temp_decode)):
            if temp_decode[j] == 1:
                break
            if temp_decode[j] != 3:
                temp_decode_str += src.vocab.itos[temp_decode[j]]
        # m = Chem.MolFromSmiles(temp_decode_str)
        # if m is not None:
        #     temp_mol_str = '1'
        # else:
        #     temp_mol_str = '0'
        # string_list = [temp_encode_str, temp_decode_str, temp_mol_str, str(error_list[i])]
        # only_smiles_list.append(string_list)
        # string_list_with_ls = string_list + ls_list[i]
        # final_list.append(string_list_with_ls)

    colids = ['LS_' + str(x) for x in range(len(ls_list[0]))]
    final_out_df = pd.DataFrame(ls_list, columns=colids)
    return final_out_df
a = 'Hello World!'
print(a)
print('Tokenization: ', tokenizer(a))

# define fields
TEXT = Field(sequential=True, tokenize=tokenizer, lower=True)
LABEL = Field(sequential=False, use_vocab=False, dtype=torch.float)
fields = [("question_text", TEXT), ("label", LABEL)]

# load datasets
train_data, valid_data, test_data = TabularDataset.splits(
    path="data/", train='train.csv', validation='validation.csv',
    test='test.csv', format='csv', fields=fields, skip_header=True)
# print(len(train_data))
# print(len(valid_data))
# print(len(test_data))

# build vocabulary
TEXT.build_vocab(train_data)
print('Vocabulary size: ', len(TEXT.vocab))
print('First example: ', vars(train_data.examples[0]))
# print(train_data[0])
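# Follow-on sketch (assumed, not from the source; batch size illustrative):
# wrap the datasets in BucketIterators so similar-length questions batch together.
import torch
from torchtext.data import BucketIterator

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_iter, valid_iter, test_iter = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=64,
    sort_key=lambda x: len(x.question_text), sort_within_batch=True,
    device=device)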
def main():
    # Add ckp
    parser = argparse.ArgumentParser(
        description='PyTorch PennTreeBank RNN/LSTM Language Model')
    parser.add_argument('--data', type=str, default='/input',
                        help='location of the data corpus')
    parser.add_argument('--checkpoint', type=str, default='',
                        help='model checkpoint to use')
    parser.add_argument('--model', type=str, default='LSTM',
                        help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
    parser.add_argument('--emsize', type=int, default=200,
                        help='size of word embeddings')
    parser.add_argument('--nhid', type=int, default=200,
                        help='number of hidden units per layer')
    parser.add_argument('--nlayers', type=int, default=2,
                        help='number of layers')
    parser.add_argument('--lr', type=float, default=0.001,
                        help='initial learning rate')
    parser.add_argument('--clip', type=float, default=0.25,
                        help='gradient clipping')
    parser.add_argument('--epochs', type=int, default=40,
                        help='upper epoch limit')
    parser.add_argument('--batch_size', type=int, default=256, metavar='N',
                        help='batch size')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout applied to layers (0 = no dropout)')
    parser.add_argument('--tied', action='store_true',
                        help='tie the word embedding and softmax weights')
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--cuda', action='store_true', help='use CUDA')
    parser.add_argument('--log-interval', type=int, default=200, metavar='N',
                        help='report interval')
    parser.add_argument('--save', type=str, default='/output/model.pt',
                        help='path to save the final model')
    args = parser.parse_args()

    # Set the random seed manually for reproducibility.
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print("WARNING: You have a CUDA device, so you should probably run with --cuda")
        else:
            torch.cuda.manual_seed(args.seed)

    # Load checkpoint
    build_vocab = False
    if args.checkpoint != '' and os.path.exists(args.checkpoint):
        print(f'Loading field from {args.checkpoint}')
        save_dict = torch.load(args.checkpoint)
        field = save_dict['field']
        start_epoch = save_dict['start_epoch']
    else:
        save_dict = None
        field = Field(tokenize=split_tokenize, init_token='<init>')
        build_vocab = True
        start_epoch = 0

    ###########################################################################
    # Load data
    ###########################################################################
    train_data, val_data, test_data = TabularDataset.splits(
        path=args.data, train='train.txt', validation='valid.txt',
        test='test.txt', format='tsv', fields=[('text', field)])
    print(train_data, len(train_data), val_data, len(val_data),
          test_data, len(test_data))

    if build_vocab:
        field.eos_token = '<eos>'
        field.build_vocab(train_data, val_data, min_freq=1000)
        field.eos_token = None
    eos_id = field.vocab.stoi['<eos>']
    pad_id = field.vocab.stoi[field.pad_token]

    train_iter = BucketIterator(train_data, args.batch_size, train=True,
                                repeat=False,
                                device='cuda:0' if args.cuda else 'cpu:0')
    val_iter = Iterator(val_data, args.batch_size, repeat=False,
                        device='cuda:0' if args.cuda else 'cpu:0')
    test_iter = Iterator(test_data, args.batch_size, repeat=False,
                         device='cuda:0' if args.cuda else 'cpu:0')
    print(train_iter, len(train_iter), val_iter, len(val_iter),
          test_iter, len(test_iter))

    ###########################################################################
    # Build the model
    ###########################################################################
    ntokens = len(field.vocab)
    model = RNNModel(args.model, ntokens, args.emsize, args.nhid,
                     args.nlayers, args.dropout, args.tied)
    if save_dict is not None:
        model.load_state_dict(save_dict['model'])
    if args.cuda:
        model.cuda()
    else:
        model.cpu()
    print(model)

    if save_dict:
        opt = save_dict['optimizer']
    else:
        opt = torch.optim.Adam(model.parameters(), lr=args.lr)

    if args.checkpoint:
        torch.save(dict(field=field, model=model.state_dict(), optimizer=opt,
                        start_epoch=start_epoch), args.checkpoint)

    ###########################################################################
    # Training code
    ###########################################################################
    criterion = torch.nn.CrossEntropyLoss(ignore_index=pad_id)

    def make_target(text):
        batch_size = text.size()[1]
        eos_vector = torch.full((1, batch_size), eos_id, dtype=text.dtype,
                                device='cuda:0' if args.cuda else 'cpu:0')
        target = torch.cat((text[1:], eos_vector), dim=0)
        return target

    def compute_loss(output, text):
        output_flat = output.view(-1, ntokens)
        target = make_target(text)
        target_flat = target.view(-1)
        return criterion(output_flat, target_flat)

    def evaluate(data_source):
        # Turn on evaluation mode, which disables dropout.
        with torch.no_grad():
            model.eval()
            total_loss = 0
            for batch in data_source:
                output, hidden = model(batch.text)
                loss = compute_loss(output, batch.text)
                total_loss += loss.item()
            return total_loss / len(data_source)

    def train():
        # Turn on training mode, which enables dropout.
        model.train()
        total_loss = 0
        start_time = time.time()
        for i, batch in enumerate(train_iter):
            model.zero_grad()
            output, hidden = model(batch.text)
            target = make_target(batch.text)
            loss = compute_loss(output, batch.text)
            loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem
            # in RNNs / LSTMs.
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)
            opt.step()

            total_loss += loss.item()
            if i % args.log_interval == 0 and i > 0:
                cur_loss = total_loss / args.log_interval
                elapsed = time.time() - start_time
                print('| epoch {:3d} | {:5d}/{:5d} batches | ms/batch {:5.2f} | '
                      'loss {:5.2f} | ppl {:8.2f}'.format(
                          epoch, i, len(train_iter),
                          elapsed * 1000 / args.log_interval,
                          cur_loss, math.exp(cur_loss)))
                total_loss = 0
                start_time = time.time()

    # Loop over epochs.
    best_val_loss = None

    # At any point you can hit Ctrl + C to break out of training early.
    try:
        for epoch in range(start_epoch, args.epochs):
            epoch_start_time = time.time()
            train()
            val_loss = evaluate(val_iter)
            print('-' * 89)
            print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
                  'valid ppl {:8.2f}'.format(epoch,
                                             (time.time() - epoch_start_time),
                                             val_loss, math.exp(val_loss)))
            print('-' * 89)
            # Save the model if the validation loss is the best we've seen so far.
            if not best_val_loss or val_loss < best_val_loss:
                if args.checkpoint:
                    torch.save(dict(field=field, model=model.state_dict(),
                                    optimizer=opt, start_epoch=epoch),
                               args.checkpoint)
                best_val_loss = val_loss
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

    torch.save(dict(vocab=field.vocab.itos, model=model.state_dict(),
                    settings=dict(rnn_type=args.model, emsize=args.emsize,
                                  nhid=args.nhid, nlayers=args.nlayers)),
               args.save)

    # Load the best saved model.
    # with open(args.save, 'rb') as f:
    #     save_dict = torch.load(f)
    #     field = save_dict['field']
    #     if save_dict is not None:
    #         model.load_state_dict(save_dict['model'])
    #     if args.cuda:
    #         model.cuda()
    #     else:
    #         model.cpu()

    # Run on test data.
    test_loss = evaluate(test_iter)
    print('=' * 89)
    print('| End of training | test loss {:5.2f} | test ppl {:8.2f}'.format(
        test_loss, math.exp(test_loss)))
    print('=' * 89)
data_test.FULLNAME = data_test.FULLNAME.apply(split_fio)
data_test.head()
data_test.to_csv('C:/help_files/comp_test.csv', index=None)

tokenize = lambda x: x.split(' ')
TEXT = Field(sequential=True, tokenize=tokenize, lower=True)
LABEL = Field(sequential=False, use_vocab=False, is_target=True)
nation_fields = [('FULLNAME', TEXT), ('NATION', LABEL)]

trn, vld = TabularDataset.splits(path='C:/help_files/', train='train.csv',
                                 validation="test.csv", format='csv',
                                 skip_header=True, fields=nation_fields)
TEXT.build_vocab(trn)
TEXT.vocab.freqs.most_common(10)
TEXT.vocab.stoi

batch_size = 256
train_iter, val_iter = BucketIterator.splits(
    (trn, vld), batch_sizes=(batch_size, batch_size),
    device=device)  # call closed here; the original snippet was cut off