def test(self):
    """
    Run a trained classifier model on the test set.

    Loads the model checkpoint from ``self.opt.load_test_model``, runs one
    evaluation epoch over the test split (n_docs=1, unshuffled), and logs
    overall loss / accuracy / rating-diff plus per-rating accuracy to a
    TensorBoard writer under ``<save_dir>/tensorboard/test/``.
    """
    #
    # Setup data, logging
    #
    self.dataset = SummDatasetFactory.get(self.opt.dataset)
    test_iter = self.dataset.get_data_loader(split='test',
                                             sample_reviews=False,
                                             n_docs=1,
                                             batch_size=self.hp.batch_size,
                                             shuffle=False)
    tb_path = os.path.join(self.save_dir, 'tensorboard/test/')
    # makedirs (not mkdir) so missing intermediate directories don't raise
    if not os.path.exists(tb_path):
        os.makedirs(tb_path)
    self.tb_test_writer = SummaryWriter(tb_path)

    #
    # Get model and loss
    #
    # BUG FIX: original read the module-level `opt`; use self.opt so the
    # method does not depend on a global being in scope.
    self.model = torch.load(self.opt.load_test_model)['model']
    if self.hp.clf_mse:
        self.loss_fn = nn.MSELoss()
    else:
        self.loss_fn = nn.CrossEntropyLoss()
    if torch.cuda.is_available():
        self.model.cuda()
    if len(self.opt.gpus) > 1:
        self.model = nn.DataParallel(self.model)
    n_params = sum(p.nelement() for p in self.model.parameters())
    print('Number of parameters: {}'.format(n_params))

    #
    # Test
    #
    self.model.eval()
    with torch.no_grad():
        loss_avg, acc_avg, rating_diff_avg, per_rating_acc = self.run_epoch(
            test_iter, len(test_iter), 0, 'test',
            tb_writer=self.tb_test_writer, save_intermediate=False)
    self.tb_test_writer.add_scalar('overall/loss', loss_avg, 0)
    self.tb_test_writer.add_scalar('overall/acc', acc_avg, 0)
    self.tb_test_writer.add_scalar('overall/rating_diff', rating_diff_avg, 0)
    for r, acc in per_rating_acc.items():
        self.tb_test_writer.add_scalar(
            'overall/per_rating_acc_{}_stars'.format(r), acc, 0)
def main(opt):
    """
    Build a subword tokenizer vocabulary and save it to disk.

    Token counts come from one of two sources:
      - ``opt.dataset``: reviews are dumped to ``/tmp/<dataset>_data.txt``
        and counted from there;
      - ``opt.corpus_filepattern``: counted directly from matching files.

    Raises:
        ValueError: if neither --dataset nor --corpus_filepattern is given.

    Side effects: writes ``<output_fn>.txt`` (vocab counts) and
    ``<output_fn>.pkl`` (pickled encoder) into ``opt.output_dir``.
    """
    if opt.dataset:
        dataset = SummDatasetFactory.get(opt.dataset)
        dl = dataset.get_data_loader(split='lm', n_docs=opt.n_docs,
                                     sample_reviews=False,
                                     batch_size=1, num_workers=0,
                                     shuffle=False)
        print('Writing reviews to file')
        with open('/tmp/{}_data.txt'.format(opt.dataset), 'w',
                  encoding='utf-8') as f:
            for texts, ratings, metadata in dl:
                f.write('{}\n'.format(texts[0]))
        print('Creating token counts')
        token_counts = tokenizer.corpus_token_counts(
            '/tmp/{}_data.txt'.format(opt.dataset),
            opt.corpus_max_lines,
            split_on_newlines=True)
    elif opt.corpus_filepattern:
        token_counts = tokenizer.corpus_token_counts(opt.corpus_filepattern,
                                                     opt.corpus_max_lines,
                                                     split_on_newlines=True)
    else:
        raise ValueError(
            'Must provide --dataset or provide --corpus_filepattern')

    print('Building to target size')
    encoder = text_encoder.SubwordTextEncoder.build_to_target_size(
        opt.target_size, token_counts, 0, 1e9, reserved_tokens=RESERVED_TOKENS)

    print('Saving tokenizer')
    vocab_fp = os.path.join(opt.output_dir, opt.output_fn + '.txt')  # stores vocab counts
    encoder.store_to_file(vocab_fp)
    enc_fp = os.path.join(opt.output_dir, opt.output_fn + '.pkl')
    save_file(encoder, enc_fp, verbose=True)
    # BUG FIX: removed leftover `pdb.set_trace()` debugging breakpoint that
    # halted the script after every successful run.
Calculate and print some statistics on the original dataset """
        # NOTE(review): the enclosing `def` lies outside this chunk;
        # indentation below is reconstructed — verify against the full file.
        # Lazily load all reviews on first call.
        if self.reviews is None:
            self.reviews, self.item_to_reviews = AmazonDataset.load_all_reviews()
        # Subword-token length of each review's text.
        lens = []
        for rev in self.reviews:
            lens.append(len(self.subwordenc.encode(rev['reviewText'])))
        # Median / 75th / 90th percentile of review lengths.
        print(np.median(lens))
        print(np.percentile(lens, 75))
        print(np.percentile(lens, 90))
        # NOTE(review): leftover debugging breakpoint — consider removing.
        pdb.set_trace()


if __name__ == '__main__':
    from data_loaders.summ_dataset_factory import SummDatasetFactory

    hp = HParams()
    ds = SummDatasetFactory.get('amazon')
    # ds.save_processed_splits()
    ds.print_original_data_stats()
    # ds.print_filtered_data_stats()
    # test_dl = ds.get_data_loader(split='test', n_docs=8, sample_reviews=True,
    #                              category='Electronics',
    #                              batch_size=4, shuffle=True)
    # for texts, ratings, metadata in test_dl:
    #     x, lengths, labels = ds.prepare_batch(texts, ratings)
    #     pdb.set_trace()
def train(self):
    """
    Main train loop for the language model.

    Builds train/val data loaders, constructs the model (mLSTM stack or
    decoder-only Transformer) with its matching loss, optionally wraps both
    for multi-GPU, sets up the optimizer ('normal' Adam or Noam schedule),
    then trains for ``self.hp.max_nepochs`` epochs with per-epoch validation
    and checkpointing via ``save_model``.

    Raises:
        NotImplementedError: if ``self.opt.load_model`` is set (resuming
            from a checkpoint is not supported yet).
    """
    #
    # Get data, setup
    #
    self.dataset = SummDatasetFactory.get(self.opt.dataset,
                                          '../datasets/yelp_dataset/')
    subwordenc = self.dataset.subwordenc
    train_iter = self.dataset.get_data_loader(
        split='train', n_docs=self.hp.n_docs, sample_reviews=True,
        batch_size=self.hp.batch_size, shuffle=True)
    train_nbatches = len(train_iter)
    val_iter = self.dataset.get_data_loader(
        split='val', n_docs=self.hp.n_docs, sample_reviews=False,
        batch_size=self.hp.batch_size, shuffle=False)
    val_nbatches = len(val_iter)

    tb_path = os.path.join(self.save_dir, 'tensorboard/')
    print('Tensorboard events will be logged to: {}'.format(tb_path))
    # makedirs(exist_ok=True) so a rerun into an existing dir doesn't crash
    os.makedirs(tb_path + 'train/', exist_ok=True)
    os.makedirs(tb_path + 'val/', exist_ok=True)
    self.tb_tr_writer = SummaryWriter(tb_path + 'train/')
    self.tb_val_writer = SummaryWriter(tb_path + 'val/')

    #
    # Get model and loss
    #
    if len(self.opt.load_model) > 0:
        raise NotImplementedError(
            'Need to save run to same directory, handle changes in hp, etc.')
        # checkpoint = torch.load(opt.load_model)
        # self.model = checkpoint['model']
    else:
        if self.hp.model_type == 'mlstm':
            embed = nn.Embedding(subwordenc.vocab_size, self.hp.emb_size)
            lstm = StackedLSTM(mLSTM, self.hp.lstm_layers, self.hp.emb_size,
                               self.hp.hidden_size, subwordenc.vocab_size,
                               self.hp.lstm_dropout,
                               layer_norm=self.hp.lstm_ln)
            self.model = StackedLSTMEncoder(embed, lstm)
            # PAD tokens must not contribute to the LM loss
            self.loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_ID)
        elif self.hp.model_type == 'transformer':
            self.model = make_model(subwordenc.vocab_size,
                                    subwordenc.vocab_size,
                                    N=self.hp.tsfr_blocks,
                                    d_model=self.hp.hidden_size,
                                    d_ff=self.hp.tsfr_ff_size,
                                    dropout=self.hp.tsfr_dropout,
                                    tie_embs=self.hp.tsfr_tie_embs,
                                    decoder_only=True)
            self.loss_fn = LabelSmoothing(
                size=subwordenc.vocab_size,
                smoothing=self.hp.tsfr_label_smooth)

    if torch.cuda.is_available():
        self.model.cuda()
    self.ngpus = 1
    if len(self.opt.gpus) > 1:
        self.ngpus = len(self.opt.gpus.split(','))
        self.model = DataParallelModel(self.model)
        self.loss_fn = DataParallelCriterion(self.loss_fn)

    n_params = sum(p.nelement() for p in self.model.parameters())
    print('Number of parameters: {}'.format(n_params))

    #
    # Get optimizer
    #
    if self.hp.optim == 'normal':
        self.optimizer = OptWrapper(
            self.model, self.hp.lm_clip,
            optim.Adam(self.model.parameters(), lr=self.hp.lm_lr))
    elif self.hp.optim == 'noam':
        # DataParallel wraps the model in .module, hence the two paths
        d_model = self.model.module.tgt_embed[0].d_model if self.ngpus > 1 \
            else self.model.tgt_embed[0].d_model
        self.optimizer = NoamOpt(
            d_model, 2, self.hp.noam_warmup,
            torch.optim.Adam(self.model.parameters(), lr=0,
                             betas=(0.9, 0.98), eps=1e-9))

    #
    # Train epochs
    #
    # BUG FIX: was `range(hp.max_nepochs)`, which silently relied on a
    # module-level `hp` instead of this instance's hyperparameters.
    for e in range(self.hp.max_nepochs):
        try:
            self.model.train()
            loss_avg = self.run_epoch(train_iter, train_nbatches, e, 'train',
                                      optimizer=self.optimizer,
                                      tb_writer=self.tb_tr_writer)
            self.tb_tr_writer.add_scalar('overall_stats/loss_avg', loss_avg, e)
        except KeyboardInterrupt:
            # Ctrl-C skips the rest of this training epoch but still
            # validates and checkpoints below (original behavior).
            print('Exiting from training early')

        self.model.eval()
        loss_avg = self.run_epoch(val_iter, val_nbatches, e, 'val',
                                  optimizer=None)
        self.tb_val_writer.add_scalar('overall_stats/loss_avg', loss_avg, e)
        save_model(self.save_dir, self.model, self.optimizer, e, self.opt,
                   loss_avg)
def train(self):
    """
    Main train loop for the review-rating classifier.

    Builds train/val loaders (n_docs=1 so each example is a single review),
    constructs the CNN text classifier with an MSE or cross-entropy loss,
    then trains for ``self.hp.max_nepochs`` epochs, logging loss / accuracy /
    rating-diff and per-rating accuracy to TensorBoard and checkpointing
    after each epoch.

    Raises:
        NotImplementedError: if ``self.opt.load_train_model`` is set
            (resuming from a checkpoint is not supported yet).
    """
    #
    # Get data, setup
    #
    # NOTE: Use n_docs=1 so we can classify one review
    self.dataset = SummDatasetFactory.get(self.opt.dataset, self.opt.dir_path)
    train_iter = self.dataset.get_data_loader(split='train',
                                              sample_reviews=True,
                                              n_docs=1,
                                              batch_size=self.hp.batch_size,
                                              shuffle=True)
    val_iter = self.dataset.get_data_loader(split='val',
                                            sample_reviews=False,
                                            n_docs=1,
                                            batch_size=self.hp.batch_size,
                                            shuffle=False)

    self.tb_tr_writer = None
    self.tb_val_writer = None
    tb_path = os.path.join(self.save_dir, 'tensorboard/')
    print('Tensorboard events will be logged to: {}'.format(tb_path))
    # makedirs(exist_ok=True) so a rerun into an existing dir doesn't crash
    os.makedirs(tb_path + 'train/', exist_ok=True)
    os.makedirs(tb_path + 'val/', exist_ok=True)
    self.tb_tr_writer = SummaryWriter(tb_path + 'train/')
    self.tb_val_writer = SummaryWriter(tb_path + 'val/')

    #
    # Get model and loss
    #
    if len(self.opt.load_train_model) > 0:
        raise NotImplementedError(
            'Need to save run to same directory, handle changes in hp, etc.')
        # checkpoint = torch.load(opt.load_model)
        # self.model = checkpoint['model']
    else:
        if self.hp.model_type == 'cnn':
            cnn_output_size = self.hp.cnn_n_feat_maps * \
                len(self.hp.cnn_filter_sizes)
            self.model = TextClassifier(self.dataset.subwordenc.vocab_size,
                                        self.hp.emb_size,
                                        self.hp.cnn_filter_sizes,
                                        self.hp.cnn_n_feat_maps,
                                        self.hp.cnn_dropout,
                                        cnn_output_size,
                                        self.dataset.n_ratings_labels,
                                        onehot_inputs=self.hp.clf_onehot,
                                        mse=self.hp.clf_mse)
        if self.hp.clf_mse:
            self.loss_fn = nn.MSELoss()
        else:
            self.loss_fn = nn.CrossEntropyLoss()

    if torch.cuda.is_available():
        self.model.cuda()
    if len(self.opt.gpus) > 1:
        self.model = nn.DataParallel(self.model)

    n_params = sum(p.nelement() for p in self.model.parameters())
    print('Number of parameters: {}'.format(n_params))

    #
    # Get optimizer
    #
    self.optimizer = OptWrapper(
        self.model, self.hp.clf_clip,
        optim.Adam(self.model.parameters(), lr=self.hp.clf_lr))

    #
    # Train epochs
    #
    # BUG FIX: was `range(hp.max_nepochs)`, which silently relied on a
    # module-level `hp` instead of this instance's hyperparameters.
    for e in range(self.hp.max_nepochs):
        try:
            self.model.train()
            loss_avg, acc_avg, rating_diff_avg, per_rating_acc = \
                self.run_epoch(train_iter, len(train_iter), e, 'train',
                               optimizer=self.optimizer,
                               tb_writer=self.tb_tr_writer)
            self.tb_tr_writer.add_scalar('overall/loss', loss_avg, e)
            self.tb_tr_writer.add_scalar('overall/acc', acc_avg, e)
            self.tb_tr_writer.add_scalar('overall/rating_diff',
                                         rating_diff_avg, e)
            for r, acc in per_rating_acc.items():
                self.tb_tr_writer.add_scalar(
                    'overall/per_rating_acc_{}_stars'.format(r), acc, e)
        except KeyboardInterrupt:
            # Ctrl-C skips the rest of this training epoch but still
            # validates and checkpoints below (original behavior).
            print('Exiting from training early')

        self.model.eval()
        loss_avg, acc_avg, rating_diff_avg, per_rating_acc = self.run_epoch(
            val_iter, len(val_iter), e, 'val', optimizer=None)
        self.tb_val_writer.add_scalar('overall/loss', loss_avg, e)
        self.tb_val_writer.add_scalar('overall/acc', acc_avg, e)
        self.tb_val_writer.add_scalar('overall/rating_diff',
                                      rating_diff_avg, e)
        for r, acc in per_rating_acc.items():
            # CONSISTENCY FIX: val tag was 'overall/per_rating_acc_{}',
            # which split the curves away from the train/test
            # '..._{}_stars' tags in TensorBoard.
            self.tb_val_writer.add_scalar(
                'overall/per_rating_acc_{}_stars'.format(r), acc, e)

        # Checkpoint name encodes loss / accuracy / rating-diff
        fn_str = 'l{:.4f}_a{:.4f}_d{:.4f}'.format(loss_avg, acc_avg,
                                                  rating_diff_avg)
        # Unwrap DataParallel so the saved module loads on a single GPU
        model_to_save = self.model.module if len(self.opt.gpus) > 1 \
            else self.model
        save_model(self.save_dir, model_to_save, self.optimizer, e,
                   self.opt, fn_str)
def __init__(self, hp, opt):
    """Keep hyperparameters and run options, and resolve the dataset.

    Args:
        hp: hyperparameter container.
        opt: parsed command-line options; ``opt.dataset`` selects which
            dataset the factory loads.
    """
    self.opt = opt
    self.hp = hp
    self.dataset = SummDatasetFactory.get(opt.dataset)
rating_to_count[ratings[i].item()] += 1
        # NOTE(review): the line above is the tail of a loop whose header
        # lies outside this chunk; indentation reconstructed — verify
        # against the full file.
        print('Number of reviews per star rating:')
        for rating, count in sorted(rating_to_count.items()):
            # NOTE(review): '{:.2f} reviews' formats an integer count as a
            # float, and '{} of dataset' prints a raw fraction — the intent
            # looks like '{} reviews; {:.2%} of dataset'. Confirm and fix.
            print('-- {} stars: {:.2f} reviews; {} of dataset'.format(
                rating, count, float(count) / len(all_rev_lens)))
        print('Length of review:')
        print('-- mean: {}'.format(np.mean(all_rev_lens)))
        print('-- 75th percentile: {}'.format(np.percentile(all_rev_lens, 75)))
        print('-- 90th percentile: {}'.format(np.percentile(all_rev_lens, 90)))


if __name__ == '__main__':
    from data_loaders.summ_dataset_factory import SummDatasetFactory

    hp = HParams()
    ds = SummDatasetFactory.get('yelp')
    ds.save_processed_splits()
    # ds.print_original_data_stats()
    # ds.print_filtered_data_stats()

    # Variable batch size and n_docs
    # test_dl = ds.get_data_loader(split='test', n_docs_min=4, n_docs_max=16, sample_reviews=True,
    #                              batch_size=1, shuffle=False)
    # test_dl = ds.get_data_loader(split='test', n_docs=8, sample_reviews=False,
    #                              batch_size=1, shuffle=False)
    # for texts, ratings, metadata in test_dl:
    #     x, lengths, labels = ds.prepare_batch(texts, ratings)
    #     pdb.set_trace()