def extractive_baseline(self, data_iter, clf_model=None):
    """
    Run an extractive method
    """
    evaluator = EvalMetrics(remove_stopwords=self.hp.remove_stopwords,
                            use_stemmer=self.hp.use_stemmer, store_all=True)
    summarizer = CentroidW2VSummarizer(WORD2VEC_PATH, length_limit=2,
                                       topic_threshold=0.3, sim_threshold=0.95,
                                       reordering=True, subtract_centroid=False,
                                       keep_first=False,
                                       bow_param=0, length_param=0, position_param=0,
                                       debug=False)

    summaries = []
    accuracy = 0.0
    per_rating_counts = defaultdict(int)
    per_rating_acc = defaultdict(int)
    for i, (texts, ratings, metadata) in enumerate(data_iter):
        for j, text in enumerate(texts):
            # texts is a list of length batch_size;
            # each item in texts is a str, i.e. n_docs documents concatenated together
            src_docs = SummDataset.split_docs(text)

            # limit is the number of words; concatenate documents without the edok token
            summary = summarizer.summarize(
                SummDataset.concat_docs(src_docs, edok_token=False),
                limit=self.dataset.conf.extractive_max_len)
            evaluator.batch_update_avg_rouge([summary], [src_docs])

            acc, per_rating_counts, per_rating_acc, pred_ratings, pred_probs = \
                classify_summ_batch(clf_model, [summary], [ratings[j]], self.dataset,
                                    per_rating_counts, per_rating_acc)
            if acc is None:
                print('Summary was too short to classify')
                pred_rating, pred_prob = None, None
            else:
                # classify_summ_batch was called with a batch of size 1
                pred_rating, pred_prob = pred_ratings[0].item(), pred_probs[0].item()
                accuracy = update_moving_avg(accuracy, acc, i * len(texts) + j + 1)

            dic = {'docs': text, 'summary': summary, 'rating': ratings[j].item(),
                   'pred_rating': pred_rating, 'pred_prob': pred_prob}
            for k, values in metadata.items():
                dic[k] = values[j]
            summaries.append(dic)

    return evaluator, summaries, accuracy.item(), per_rating_acc
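# Illustrative note (not from the original code): update_moving_avg is assumed throughout
# these baselines to maintain an incremental mean, i.e. the n-th update is
# avg + (x - avg) / n. A minimal stand-alone sketch under that assumption, with a
# hypothetical name:
def _running_mean_sketch(prev_avg, new_value, n):
    """Return the running mean after incorporating the n-th value (1-indexed)."""
    return prev_avg + (new_value - prev_avg) / n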
def run_clf_baseline(self):
    """
    Calculate the classification accuracy when the input is all the reviews concatenated
    together. This provides a sort of ceiling on how well each of the summarization
    methods can do, as the classification model is not perfect either.
    """
    print('\n', '=' * 50)
    print('Running classifier baseline')

    # Load classifier
    clf_model = torch.load(self.opt.load_clf)['model']
    clf_model = clf_model.module if isinstance(clf_model, nn.DataParallel) else clf_model
    if torch.cuda.is_available():
        clf_model.cuda()
    if len(self.opt.gpus) > 1:
        clf_model = nn.DataParallel(clf_model)

    summaries = []
    accuracy = 0.0
    per_rating_counts = defaultdict(int)
    per_rating_acc = defaultdict(int)
    dl = self.get_test_set_data_iter(self.hp.batch_size)
    for i, (texts, ratings_batch, metadata) in enumerate(dl):
        summaries_batch = []
        for j, text in enumerate(texts):
            # texts is a list of length batch_size;
            # each item in texts is a str, i.e. n_docs documents concatenated together
            # Concatenate documents without the edok token
            src_docs = SummDataset.split_docs(text)
            summary = SummDataset.concat_docs(src_docs, edok_token=False)
            summaries_batch.append(summary)

        acc, per_rating_counts, per_rating_acc, pred_ratings, pred_probs = \
            classify_summ_batch(clf_model, summaries_batch, ratings_batch, self.dataset,
                                per_rating_counts, per_rating_acc)
        accuracy = update_moving_avg(accuracy, acc, i + 1)

        for j in range(len(summaries_batch)):
            dic = {'docs': summaries_batch[j],
                   'rating': ratings_batch[j].item(),
                   'pred_rating': pred_ratings[j].item(),
                   'pred_prob': pred_probs[j].item()}
            for k, values in metadata.items():
                dic[k] = values[j]
            summaries.append(dic)

    # Calculate NLL of summaries using fixed, pretrained LM
    pretrained_lm = torch.load(self.opt.load_lm)['model']  # StackedLSTMEncoder
    pretrained_lm = pretrained_lm.module if isinstance(pretrained_lm, nn.DataParallel) \
        else pretrained_lm
    avg_nll = 0.0
    batch_size = self.hp.batch_size
    for i in range(0, len(summaries), batch_size):
        batch_summs = summaries[i:i + batch_size]
        batch_texts = [d['docs'] for d in batch_summs]
        dummy_ratings = [torch.LongTensor([0]) for _ in range(len(batch_texts))]
        batch_x, _, _ = self.dataset.prepare_batch(batch_texts, dummy_ratings)
        nll = calc_lm_nll(pretrained_lm, batch_x)
        # i counts items, not batches, so convert it to a 1-indexed batch count
        avg_nll = update_moving_avg(avg_nll, nll.item(), i // batch_size + 1)

    # Print and save accuracies, summaries, etc.
    print('NLL: ', avg_nll)
    print('Accuracy: ', accuracy.item())
    print('Per rating accuracy: ', per_rating_acc)

    dataset_dir = self.opt.dataset if self.opt.az_cat is None \
        else 'amazon_{}'.format(self.opt.az_cat)
    out_dir = os.path.join(OUTPUTS_EVAL_DIR, dataset_dir,
                           'n_docs_{}'.format(self.hp.n_docs), 'clf_baseline')
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    out_fp = os.path.join(out_dir, 'summaries.json')
    save_file(summaries, out_fp)
    out_fp = os.path.join(out_dir, 'stats.json')
    save_file({'acc': accuracy.item(),
               'per_rating_acc': per_rating_acc,
               'nll': avg_nll}, out_fp)
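# Illustrative sketch (hypothetical helper, not part of the project): the
# load-checkpoint / unwrap-DataParallel / move-to-GPU pattern above recurs in several
# baselines and could be factored out roughly like this, assuming checkpoints are saved
# as {'model': module}:
def _load_model_sketch(ckpt_path, gpus=None):
    """Load a {'model': module} checkpoint, unwrap nn.DataParallel, and move it to GPU."""
    model = torch.load(ckpt_path)['model']
    if isinstance(model, nn.DataParallel):
        model = model.module
    if torch.cuda.is_available():
        model.cuda()
    if gpus is not None and len(gpus) > 1:
        model = nn.DataParallel(model)
    return model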
def lm_autoenc_baseline(self, data_iter, clf_model=None):
    """
    Use the pretrained language model to initialize an encoder-decoder model. This is
    basically the unsupervised abstractive summarization model without training.
    """
    # Load encoder-decoder by initializing with the language model
    docs_enc = torch.load(self.opt.load_lm)['model']  # StackedLSTMEncoder
    docs_enc = docs_enc.module if isinstance(docs_enc, nn.DataParallel) else docs_enc
    summ_dec = StackedLSTMDecoder(copy.deepcopy(docs_enc.embed), copy.deepcopy(docs_enc.rnn))

    # Create Summarizer so that we can use run_epoch()
    # Copy hp and opt as we're modifying some params. This way there won't be any
    # unexpected errors if they're used by another method
    hp = copy.deepcopy(self.hp)
    hp.sum_cycle = False
    hp.autoenc_docs = False
    hp.sum_clf = False
    opt = copy.deepcopy(self.opt)
    opt.print_every_nbatches = float('inf')
    summarizer = Summarizer(hp, opt, '/tmp/')
    summarizer.tb_val_sub_writer = None
    summarizer.tau = self.hp.tau
    summarizer.ngpus = 1 if len(self.opt.gpus) == 1 else len(self.opt.gpus.split(','))
    summarizer.sum_model = torch.load(self.opt.load_lm)  # placeholder; replaced below
    summarizer.dataset = self.dataset
    summarizer.fixed_lm = torch.load(self.opt.load_lm)['model']  # StackedLSTMEncoder
    summarizer.fixed_lm = summarizer.fixed_lm.module \
        if isinstance(summarizer.fixed_lm, nn.DataParallel) else summarizer.fixed_lm

    # Create SummarizationModel
    docs_autodec, combine_encs_h_net, combine_encs_c_net = None, None, None
    summ_enc, docs_dec, discrim_model, clf_model_arg, fixed_lm = None, None, None, None, None
    summarizer.sum_model = SummarizationModel(docs_enc, docs_autodec, combine_encs_h_net,
                                              combine_encs_c_net, summ_dec, summ_enc,
                                              docs_dec, discrim_model, clf_model_arg,
                                              fixed_lm, hp, self.dataset)
    if torch.cuda.is_available():
        summarizer.sum_model.cuda()
    if summarizer.ngpus > 1:
        summarizer.sum_model = DataParallelModel(summarizer.sum_model)

    summarizer.sum_model.eval()
    with torch.no_grad():
        stats_avgs, evaluator, summaries = summarizer.run_epoch(
            data_iter, data_iter.__len__(), 0, 'test',
            store_all_rouges=True, store_all_summaries=True,
            save_intermediate=False, run_val_subset=False)

    #
    # Pass summaries through classifier
    #
    # Note: the SummarizationModel already calculates the classification accuracy when
    # sum_clf=True, so technically this could be refactored so everything computed here is
    # done in the forward pass and added to stats(). However, it's cleaner and easier to do
    # it all here, especially when adding more things like per-rating counts and accuracy.
    # Plus, it's just one pass through the test set, run infrequently to evaluate a trained
    # model, so the extra time is fine.
    results = []
    accuracy = 0.0
    per_rating_counts = defaultdict(int)
    per_rating_acc = defaultdict(int)
    for i, (texts, ratings_batch, metadata) in enumerate(data_iter):
        summaries_batch = summaries[i * self.hp.batch_size:
                                    i * self.hp.batch_size + len(texts)]
        acc, per_rating_counts, per_rating_acc, pred_ratings, pred_probs = \
            classify_summ_batch(clf_model, summaries_batch, ratings_batch, self.dataset,
                                per_rating_counts, per_rating_acc)
        if acc is None:
            print('Summary was too short to classify')
            pred_ratings = [None for _ in range(len(summaries_batch))]
            pred_probs = [None for _ in range(len(summaries_batch))]
        else:
            accuracy = update_moving_avg(accuracy, acc, i + 1)

        for j in range(len(summaries_batch)):
            # Guard the .item() calls: pred_ratings / pred_probs hold None when the
            # batch couldn't be classified
            pred_rating = pred_ratings[j].item() if pred_ratings[j] is not None else None
            pred_prob = pred_probs[j].item() if pred_probs[j] is not None else None
            dic = {'docs': texts[j],
                   'summary': summaries_batch[j],
                   'rating': ratings_batch[j].item(),
                   'pred_rating': pred_rating,
                   'pred_prob': pred_prob}
            for k, values in metadata.items():
                dic[k] = values[j]
            results.append(dic)

    return evaluator, results, accuracy.item(), per_rating_acc
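# Illustrative note (assumption about shapes, not the project's API): run_epoch() is
# assumed to return one summary per input in iteration order, so the slicing above lines
# a flat list of summaries back up with each batch from data_iter. A hypothetical sketch:
def _batch_slice_sketch(flat_items, batch_idx, batch_size, batch_len):
    """Return the items belonging to batch `batch_idx` from a flat, in-order list."""
    start = batch_idx * batch_size
    return flat_items[start:start + batch_len]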
def best_or_worst_review_baseline(self, data_iter, method='best', clf_model=None):
    """
    When summarizing n_docs reviews, calculate the average ROUGE1-F for each review as if
    it was the summary. Choose the document with the best / worst score.

    Note: it'd be far more efficient to calculate best and worst at the same time, as all
    the ROUGE scores are already being calculated...
    """
    evaluator = EvalMetrics(remove_stopwords=self.hp.remove_stopwords,
                            use_stemmer=self.hp.use_stemmer, store_all=True)
    summaries = []
    accuracy = 0.0
    per_rating_counts = defaultdict(int)
    per_rating_acc = defaultdict(int)
    for i, (texts, ratings, metadata) in enumerate(data_iter):
        # texts is a list of length batch_size;
        # each item in texts is a str, i.e. n_docs documents concatenated together
        for j, text in enumerate(texts):
            bw_evaluator = None
            bw_rouge1_f = 0.0 if method == 'best' else 1.0
            bw_doc = None

            # Set each document as the summary and find the best / worst one
            src_docs = SummDataset.split_docs(text)
            for doc in src_docs:
                cur_evaluator = EvalMetrics(remove_stopwords=self.hp.remove_stopwords,
                                            use_stemmer=self.hp.use_stemmer,
                                            store_all=True)
                avg_rouges, _, _, _ = cur_evaluator.batch_update_avg_rouge([doc], [src_docs])
                is_better_worse = \
                    (method == 'best' and (avg_rouges['rouge1']['f'] >= bw_rouge1_f)) or \
                    (method == 'worst' and (avg_rouges['rouge1']['f'] <= bw_rouge1_f))
                if is_better_worse:
                    bw_evaluator = cur_evaluator
                    bw_rouge1_f = avg_rouges['rouge1']['f']
                    bw_doc = doc
            evaluator.update_with_evaluator(bw_evaluator)

            acc = None  # so the check below works even if classification fails
            try:
                acc, per_rating_counts, per_rating_acc, pred_ratings, pred_probs = \
                    classify_summ_batch(clf_model, [bw_doc], [ratings[j]], self.dataset,
                                        per_rating_counts, per_rating_acc)
            except Exception:
                # worst_review in the Amazon dataset has a rare edge case where the worst
                # review is an empty string. No reviews should be empty, but it appears to
                # affect just one or two reviews.
                pass

            if acc is None:
                print('Summary was too short to classify')
                pred_rating, pred_prob = None, None
            else:
                # classify_summ_batch was called with a batch of size 1
                pred_rating, pred_prob = pred_ratings[0].item(), pred_probs[0].item()
                accuracy = update_moving_avg(accuracy, acc, i * len(texts) + j + 1)

            dic = {'docs': text, 'summary': bw_doc, 'rating': ratings[j].item(),
                   'pred_rating': pred_rating, 'pred_prob': pred_prob}
            for k, values in metadata.items():
                dic[k] = values[j]
            summaries.append(dic)

    return evaluator, summaries, accuracy.item(), per_rating_acc
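# Illustrative note (simplified, hypothetical helper): the best/worst selection above is
# just an argmax/argmin over each candidate document's ROUGE-1 F score against all the
# source docs. A sketch, assuming score_fn maps a doc to that score:
def _select_doc_sketch(docs, score_fn, method='best'):
    """Pick the doc with the highest (method='best') or lowest ('worst') score_fn value."""
    pick = max if method == 'best' else min
    return pick(docs, key=score_fn)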
def ledes_baseline(self, data_iter, n=1, clf_model=None):
    """
    Add up to the first n sentences from each review, or until the maximum review length
    is exceeded
    """
    evaluator = EvalMetrics(remove_stopwords=self.hp.remove_stopwords,
                            use_stemmer=self.hp.use_stemmer, store_all=True)
    summaries = []
    accuracy = 0.0
    per_rating_counts = defaultdict(int)
    per_rating_acc = defaultdict(int)
    for i, (texts, ratings, metadata) in enumerate(data_iter):
        # texts is a list of length batch_size;
        # each item in texts is a str, i.e. n_docs documents concatenated together
        for j, text in enumerate(texts):
            src_docs = SummDataset.split_docs(text)
            summary = []
            doc_sents = [nltk.sent_tokenize(doc) for doc in src_docs]
            summary_len = 0
            doc_idx, sent_idx = 0, 0
            # Keep adding sentences as long as the summary isn't over the maximum length
            # and there are still sentences to add
            while (summary_len < self.dataset.conf.review_max_len) and (sent_idx < n):
                # Current document has at least this many sentences
                if sent_idx < len(doc_sents[doc_idx]):
                    sent = doc_sents[doc_idx][sent_idx]
                    sent_tok_len = len(nltk.word_tokenize(sent))
                    # Adding the sentence won't exceed the maximum length
                    if summary_len + sent_tok_len <= self.dataset.conf.review_max_len:
                        summary.append(sent)
                        summary_len += sent_tok_len
                # Move on to the next document
                doc_idx = (doc_idx + 1) % len(src_docs)
                if doc_idx == 0:
                    # Back to the first doc: all sentences at this index have been added
                    sent_idx += 1
            summary = ' '.join(summary)

            evaluator.batch_update_avg_rouge([summary], [src_docs])

            acc, per_rating_counts, per_rating_acc, pred_ratings, pred_probs = \
                classify_summ_batch(clf_model, [summary], [ratings[j]], self.dataset,
                                    per_rating_counts, per_rating_acc)
            if acc is None:
                print('Summary was too short to classify')
                pred_rating, pred_prob = None, None
            else:
                # classify_summ_batch was called with a batch of size 1
                pred_rating, pred_prob = pred_ratings[0].item(), pred_probs[0].item()
                accuracy = update_moving_avg(accuracy, acc, i * len(texts) + j + 1)

            dic = {'docs': text, 'summary': summary, 'rating': ratings[j].item(),
                   'pred_rating': pred_rating, 'pred_prob': pred_prob}
            for k, values in metadata.items():
                dic[k] = values[j]
            summaries.append(dic)

    return evaluator, summaries, accuracy.item(), per_rating_acc
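# Illustrative sketch (hypothetical, simplified version of the loop above): the ledes
# baseline visits sentence 0 of every doc, then sentence 1, and so on up to n, skipping
# any sentence that would push the summary past max_len tokens. Uses the same nltk
# tokenizers as above.
import itertools

def _round_robin_ledes_sketch(docs, n, max_len):
    """Build a summary from up to the first n sentences of each doc, capped at max_len tokens."""
    doc_sents = [nltk.sent_tokenize(d)[:n] for d in docs]
    summary, length = [], 0
    for sent in itertools.chain.from_iterable(itertools.zip_longest(*doc_sents)):
        if sent is None:  # this doc has fewer than n sentences
            continue
        sent_len = len(nltk.word_tokenize(sent))
        if length + sent_len > max_len:  # skip sentences that don't fit, like the loop above
            continue
        summary.append(sent)
        length += sent_len
    return ' '.join(summary)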