def process_one_example(example): raw_article_sents, groundtruth_summ_sents, example_idx, pretty_html_path, out_full_dir = example article_sent_tokens = [util.process_sent(sent, whitespace=True) for sent in raw_article_sents] doc_indices = None if doc_indices is None or (FLAGS.dataset_name != 'duc_2004' and len(doc_indices) != len( util.flatten_list_of_lists(article_sent_tokens))): doc_indices = [0] * len(util.flatten_list_of_lists(article_sent_tokens)) doc_indices = [int(doc_idx) for doc_idx in doc_indices] summary_sent_tokens = [util.process_sent(sent, whitespace=True) for sent in groundtruth_summ_sents] writer = open(os.path.join(out_full_dir, '%06d_ssi.tsv' % example_idx), 'wb') if len(article_sent_tokens) == 0: print('Skipping because empty') writer.write('\n') return ''' This is the main function that finds the article sentences that were fused to create the given summary sentence''' simple_similar_source_indices, lcs_paths_list, smooth_article_paths_list = ssi_functions.get_simple_source_indices_list( summary_sent_tokens, article_sent_tokens, vocab=None, sentence_limit=FLAGS.sentence_limit, min_matched_tokens=FLAGS.min_matched_tokens) highlight_line = '\t'.join([','.join([str(src_idx) for src_idx in source_indices]) for source_indices in simple_similar_source_indices]) + '\n' writer.write(highlight_line.encode()) if example_idx < 1: f_pretty_html = open(pretty_html_path, 'wb') extracted_sents_in_article_html = ssi_functions.html_highlight_sents_in_article(summary_sent_tokens, simple_similar_source_indices, article_sent_tokens, doc_indices, lcs_paths_list, smooth_article_paths_list) f_pretty_html.write(extracted_sents_in_article_html.encode())
def load_and_evaluate_example(ex): example, example_idx, single_feat_len, pair_feat_len = ex print(example_idx) # example_idx += 1 raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text = util.unpack_tf_example( example, names_to_types) article_sent_tokens = [ util.process_sent(sent) for sent in raw_article_sents ] groundtruth_summ_sents = [[ sent.strip() for sent in groundtruth_summary_text.strip().split('\n') ]] groundtruth_summ_sent_tokens = [ sent.split(' ') for sent in groundtruth_summ_sents[0] ] # summ_sent_tokens = [sent.strip().split() for sent in summary_text.strip().split('\n')] temp_in_path = os.path.join(temp_in_dir, '%06d.txt' % example_idx) temp_out_path = os.path.join(temp_out_dir, '%06d.txt' % example_idx) if importance: summary_sents, similar_source_indices_list, summary_sents_for_html = generate_summary_importance( raw_article_sents, article_sent_tokens, temp_in_path, temp_out_path, single_feat_len, pair_feat_len) else: summary_sents = generate_summary(raw_article_sents, article_sent_tokens, temp_in_path, temp_out_path, single_feat_len, pair_feat_len) if example_idx <= 100: summary_sent_tokens = [ sent.split(' ') for sent in summary_sents_for_html ] extracted_sents_in_article_html = html_highlight_sents_in_article( summary_sent_tokens, similar_source_indices_list, article_sent_tokens) write_highlighted_html(extracted_sents_in_article_html, html_dir, example_idx) groundtruth_similar_source_indices_list, lcs_paths_list, article_lcs_paths_list = get_simple_source_indices_list( groundtruth_summ_sent_tokens, article_sent_tokens, None, sentence_limit, min_matched_tokens) groundtruth_highlighted_html = html_highlight_sents_in_article( groundtruth_summ_sent_tokens, groundtruth_similar_source_indices_list, article_sent_tokens, lcs_paths_list=lcs_paths_list, article_lcs_paths_list=article_lcs_paths_list) all_html = '<u>System Summary</u><br><br>' + extracted_sents_in_article_html + '<u>Groundtruth Summary</u><br><br>' + groundtruth_highlighted_html write_highlighted_html(all_html, html_dir, example_idx) rouge_eval_references.write_for_rouge(groundtruth_summ_sents, summary_sents, example_idx, ref_dir, dec_dir)
def add_sents_to_article(sentences, article, raw_article_sents, doc_indices, doc_idx): for orig_sent in sentences: tokenized_sent = util.process_sent(orig_sent) if is_quote(tokenized_sent): continue sent = ' '.join(tokenized_sent) article += sent + ' ' doc_indices_for_tokens = [doc_idx] * len(tokenized_sent) doc_indices_str = ' '.join(str(x) for x in doc_indices_for_tokens) doc_indices += doc_indices_str + ' ' raw_article_sents.append(orig_sent) return article, raw_article_sents, doc_indices
def main(unused_argv): if len(unused_argv ) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) if FLAGS.dataset_split == 'all': dataset_splits = ['test', 'val', 'train'] else: dataset_splits = [FLAGS.dataset_split] vocab_counter = collections.Counter() for dataset_split in dataset_splits: source_dir = os.path.join(FLAGS.data_root, FLAGS.dataset_name) source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*')) total = len(source_files) * 1000 example_generator = data.example_generator(source_dir + '/' + dataset_split + '*', True, False, should_check_valid=False) for example_idx, example in enumerate( tqdm(example_generator, total=total)): raw_article_sents, article, abstracts, doc_indices = util.unpack_tf_example( example, names_to_types) article_sent_tokens = [ util.process_sent(sent) for sent in raw_article_sents ] # groundtruth_summ_sent_tokens = [sent.strip().split() for sent in groundtruth_summary_text.strip().split('\n')] groundtruth_summ_sent_tokens = [[ token for token in abstract.strip().split() if token not in ['<s>', '</s>'] ] for abstract in abstracts] all_tokens = util.flatten_list_of_lists( article_sent_tokens) + util.flatten_list_of_lists( groundtruth_summ_sent_tokens) vocab_counter.update(all_tokens) print("Writing vocab file...") with open(os.path.join('logs', "vocab_" + FLAGS.dataset_name), 'w') as writer: for word, count in vocab_counter.most_common(VOCAB_SIZE): writer.write(word + ' ' + str(count) + '\n') print("Finished writing vocab file")
def load_and_evaluate_example(ex): example, example_idx, single_feat_len, pair_feat_len, singles_and_pairs = ex print(example_idx) # example_idx += 1 raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs = util.unpack_tf_example(example, names_to_types) article_sent_tokens = [util.process_sent(sent) for sent in raw_article_sents] groundtruth_summ_sents = [[sent.strip() for sent in groundtruth_summary_text.strip().split('\n')]] groundtruth_summ_sent_tokens = [sent.split(' ') for sent in groundtruth_summ_sents[0]] # summ_sent_tokens = [sent.strip().split() for sent in summary_text.strip().split('\n')] temp_in_path = os.path.join(temp_in_dir, '%06d.txt' % example_idx) temp_out_path = os.path.join(temp_out_dir, '%06d.txt' % example_idx) if importance: summary_sents, similar_source_indices_list, summary_sents_for_html = generate_summary_importance(raw_article_sents, article_sent_tokens, corefs, temp_in_path, temp_out_path, single_feat_len, pair_feat_len, singles_and_pairs)
def text_generator(self, example_generator): """Generates article and abstract text from tf.Example. Args: example_generator: a generator of tf.Examples from file. See data.example_generator""" # i = 0 while True: e = next(example_generator) # e is a tf.Example try: names_to_types = [('raw_article_sents', 'string_list'), ('similar_source_indices', 'delimited_list_of_tuples'), ('summary_text', 'string_list')] if self._hps.dataset_name == 'duc_2004': names_to_types[2] = ('summary_text', 'string_list') raw_article_sents, ssi, groundtruth_summary_sents = util.unpack_tf_example( e, names_to_types) groundtruth_summary_text = '\n'.join(groundtruth_summary_sents) article_sent_tokens = [util.process_sent(sent) for sent in raw_article_sents] article_text = ' '.join([' '.join(sent) for sent in article_sent_tokens]) if self._hps.dataset_name == 'duc_2004': abstract_sentences = [['<s> ' + sent.strip() + ' </s>' for sent in gt_summ_text.strip().split('\n')] for gt_summ_text in groundtruth_summary_text] abstract_sentences = [abs_sents[:max_dec_sents] for abs_sents in abstract_sentences] abstract_texts = [' '.join(abs_sents) for abs_sents in abstract_sentences] else: abstract_sentences = ['<s> ' + sent.strip() + ' </s>' for sent in groundtruth_summary_text.strip().split('\n')] abstract_sentences = abstract_sentences[:max_dec_sents] abstract_texts = [' '.join(abstract_sentences)] if 'doc_indices' not in e.features.feature or len(e.features.feature['doc_indices'].bytes_list.value) == 0: num_words = len(article_text.split()) doc_indices_text = '0 ' * num_words else: doc_indices_text = e.features.feature['doc_indices'].bytes_list.value[0] sentence_limit = 1 if self._hps.singles_and_pairs == 'singles' else 2 ssi = util.enforce_sentence_limit(ssi, sentence_limit) ssi = ssi[:max_dec_sents] ssi = util.make_ssi_chronological(ssi) except: logging.error('Failed to get article or abstract from example') raise if len(article_text)==0: # See https://github.com/abisee/pointer-generator/issues/1 logging.warning('Found an example with empty article text. Skipping it.\n*********************************************') elif len(article_text.strip().split()) < 3 and self._hps.skip_with_less_than_3: print('Article has less than 3 tokens, so skipping\n*********************************************') elif len(abstract_texts[0].strip().split()) < 3 and self._hps.skip_with_less_than_3: print('Abstract has less than 3 tokens, so skipping\n*********************************************') else: yield (article_text, abstract_texts, doc_indices_text, raw_article_sents, ssi)
def write_to_lambdamart_examples_to_file(ex): example, example_idx, single_feat_len, pair_feat_len, singles_and_pairs = ex print(example_idx) # example_idx += 1 temp_in_path = os.path.join(temp_in_dir, '%06d.txt' % example_idx) if not FLAGS.start_over and os.path.exists(temp_in_path): return raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs, doc_indices = util.unpack_tf_example( example, names_to_types) article_sent_tokens = [ util.process_sent(sent) for sent in raw_article_sents ] if doc_indices is None: doc_indices = [0] * len( util.flatten_list_of_lists(article_sent_tokens)) doc_indices = [int(doc_idx) for doc_idx in doc_indices] if len(doc_indices) != len( util.flatten_list_of_lists(article_sent_tokens)): doc_indices = [0] * len( util.flatten_list_of_lists(article_sent_tokens)) rel_sent_indices, _, _ = get_rel_sent_indices(doc_indices, article_sent_tokens) groundtruth_similar_source_indices_list = util.enforce_sentence_limit( groundtruth_similar_source_indices_list, sentence_limit) groundtruth_summ_sents = [[ sent.strip() for sent in groundtruth_summary_text.strip().split('\n') ]] groundtruth_summ_sent_tokens = [ sent.split(' ') for sent in groundtruth_summ_sents[0] ] # summ_sent_tokens = [sent.strip().split() for sent in summary_text.strip().split('\n')] if FLAGS.dataset_name == 'duc_2004': first_k_indices = get_indices_of_first_k_sents_of_each_article( rel_sent_indices, FLAGS.first_k) else: first_k_indices = [idx for idx in range(len(raw_article_sents))] if importance: get_instances(example_idx, raw_article_sents, article_sent_tokens, corefs, rel_sent_indices, first_k_indices, temp_in_path, temp_out_path, single_feat_len, pair_feat_len, singles_and_pairs)
def convert_singpairmix_to_tf_examples(dataset_name, processed_data_dir, tf_example_dir, dataset_split='all'): out_dir = os.path.join(tf_example_dir, dataset_name) out_full_dir = os.path.join(out_dir, 'all') util.create_dirs(out_full_dir) if dataset_split == 'all': if dataset_name == 'duc_2004': dataset_splits = ['test'] else: dataset_splits = ['test', 'val', 'train'] else: dataset_splits = [dataset_split] for dataset_split in dataset_splits: processed_data_path = os.path.join(processed_data_dir, dataset_name, dataset_split) articles_path = os.path.join(processed_data_path,'articles.tsv') abstracts_path = os.path.join(processed_data_path,'summaries.tsv') highlight_path = os.path.join(processed_data_path,'highlight.tsv') f_art = open(articles_path) f_abs = open(abstracts_path) f_hl = open(highlight_path) writer = open(os.path.join(out_full_dir, dataset_split + '.bin'), 'wb') total = util.num_lines_in_file(articles_path) for example_idx in tqdm(range(total)): raw_article_sents = f_art.readline().strip().split('\t') groundtruth_summ_sents = f_abs.readline().strip().split('\t') summary_text = '\n'.join(groundtruth_summ_sents) article_sent_tokens = [util.process_sent(sent, whitespace=True) for sent in raw_article_sents] doc_indices = None if doc_indices is None or (dataset_name != 'duc_2004' and len(doc_indices) != len( util.flatten_list_of_lists(article_sent_tokens))): doc_indices = [0] * len(util.flatten_list_of_lists(article_sent_tokens)) doc_indices_str = ' '.join([str(idx) for idx in doc_indices]) similar_source_indices = [source_indices.split(',') for source_indices in f_hl.readline().split('\t')] write_bert_tf_example(similar_source_indices, raw_article_sents, summary_text, None, doc_indices_str, None, writer, dataset_name) writer.close() if dataset_name == 'cnn_dm' or dataset_name == 'newsroom' or dataset_name == 'xsum': chunk_size = 1000 else: chunk_size = 1 util.chunk_file(dataset_split, out_full_dir, out_dir, chunk_size=chunk_size)
def gigaword_generator(dataset_name, dataset_split): article_path = os.path.join(kaiqiang_data_dir, dataset_name, dataset_split + '.Ndocument') abstract_path = os.path.join(kaiqiang_data_dir, dataset_name, dataset_split + '.Nsummary') giga_article_lines = [line.strip() for line in open(article_path).readlines()] giga_abstract_lines = [line.strip() for line in open(abstract_path).readlines()] if len(giga_article_lines) != len(giga_abstract_lines): util.print_vars(giga_article_lines, giga_abstract_lines) raise Exception('len(article_lines) != len(abstract_lines)') for article_idx in range(len(giga_abstract_lines)): article_line = giga_article_lines[article_idx] abstract_line = giga_abstract_lines[article_idx] article = '' doc_indices = '' raw_article_sents = [] orig_sent = article_line tokenized_sent = util.process_sent(orig_sent) # if is_quote(tokenized_sent): # continue sent = ' '.join(tokenized_sent) article += sent + ' ' doc_indices_for_tokens = [0] * len(tokenized_sent) doc_indices_str = ' '.join(str(x) for x in doc_indices_for_tokens) doc_indices += doc_indices_str + ' ' raw_article_sents.append(orig_sent) article = article.strip() abstracts_unprocessed = [[abstract_line]] abstracts = [] for abstract_lines in abstracts_unprocessed: abstract = process_abstract(abstract_lines) abstracts.append(abstract) # yield article, abstracts, doc_indices, raw_article_sents example = make_example(article, abstracts, doc_indices, raw_article_sents, None) yield example
def evaluate_example(ex): example, example_idx, qid_ssi_to_importances, _, _ = ex print(example_idx) # Read example from dataset raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, doc_indices = util.unpack_tf_example(example, names_to_types) article_sent_tokens = [util.process_sent(sent) for sent in raw_article_sents] enforced_groundtruth_ssi_list = util.enforce_sentence_limit(groundtruth_similar_source_indices_list, sentence_limit) groundtruth_summ_sents = [[sent.strip() for sent in groundtruth_summary_text.strip().split('\n')]] groundtruth_summ_sent_tokens = [sent.split(' ') for sent in groundtruth_summ_sents[0]] if FLAGS.upper_bound: # If upper bound, then get the groundtruth singletons/pairs replaced_ssi_list = util.replace_empty_ssis(enforced_groundtruth_ssi_list, raw_article_sents) selected_article_sent_indices = util.flatten_list_of_lists(replaced_ssi_list) summary_sents = [' '.join(sent) for sent in util.reorder(article_sent_tokens, selected_article_sent_indices)] similar_source_indices_list = groundtruth_similar_source_indices_list ssi_length_extractive = len(similar_source_indices_list) else: # Generates summary based on BERT output. This is an extractive summary. summary_sents, similar_source_indices_list, summary_sents_for_html, ssi_length_extractive = generate_summary(article_sent_tokens, qid_ssi_to_importances, example_idx) similar_source_indices_list_trunc = similar_source_indices_list[:ssi_length_extractive] summary_sents_for_html_trunc = summary_sents_for_html[:ssi_length_extractive] if example_idx <= 1: summary_sent_tokens = [sent.split(' ') for sent in summary_sents_for_html_trunc] extracted_sents_in_article_html = html_highlight_sents_in_article(summary_sent_tokens, similar_source_indices_list_trunc, article_sent_tokens, doc_indices=doc_indices) groundtruth_ssi_list, lcs_paths_list, article_lcs_paths_list = get_simple_source_indices_list( groundtruth_summ_sent_tokens, article_sent_tokens, None, sentence_limit, min_matched_tokens) groundtruth_highlighted_html = html_highlight_sents_in_article(groundtruth_summ_sent_tokens, groundtruth_ssi_list, article_sent_tokens, lcs_paths_list=lcs_paths_list, article_lcs_paths_list=article_lcs_paths_list, doc_indices=doc_indices) all_html = '<u>System Summary</u><br><br>' + extracted_sents_in_article_html + '<u>Groundtruth Summary</u><br><br>' + groundtruth_highlighted_html ssi_functions.write_highlighted_html(all_html, html_dir, example_idx) rouge_functions.write_for_rouge(groundtruth_summ_sents, summary_sents, example_idx, ref_dir, dec_dir) return (groundtruth_similar_source_indices_list, similar_source_indices_list, ssi_length_extractive)
def main(unused_argv): print('Running statistics on %s' % FLAGS.dataset_name) if len(unused_argv) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) source_dir = os.path.join(data_dir, FLAGS.dataset_name) source_files = sorted(glob.glob(source_dir + '/' + FLAGS.dataset_split + '*')) total = len(source_files) * 1000 if ('cnn' in FLAGS.dataset_name or 'newsroom' in FLAGS.dataset_name or 'xsum' in FLAGS.dataset_name) else len(source_files) example_generator = data.example_generator(source_dir + '/' + FLAGS.dataset_split + '*', True, False, should_check_valid=False) for example_idx, example in enumerate(tqdm(example_generator, total=total)): raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs, doc_indices = util.unpack_tf_example( example, names_to_types) article_sent_tokens = [util.process_sent(sent) for sent in raw_article_sents] groundtruth_summ_sents = [[sent.strip() for sent in groundtruth_summary_text.strip().split('\n')]] if doc_indices is None: doc_indices = [0] * len(util.flatten_list_of_lists(article_sent_tokens)) doc_indices = [int(doc_idx) for doc_idx in doc_indices] rel_sent_indices, _, _ = preprocess_for_lambdamart_no_flags.get_rel_sent_indices(doc_indices, article_sent_tokens) groundtruth_similar_source_indices_list = util.enforce_sentence_limit(groundtruth_similar_source_indices_list, FLAGS.sentence_limit)
def evaluate_example(ex): example, example_idx, qid_ssi_to_importances, _, _ = ex print(example_idx) # example_idx += 1 qid = example_idx raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs, doc_indices = util.unpack_tf_example( example, names_to_types) article_sent_tokens = [ util.process_sent(sent) for sent in raw_article_sents ] enforced_groundtruth_ssi_list = util.enforce_sentence_limit( groundtruth_similar_source_indices_list, sentence_limit) if FLAGS.dataset_name == 'duc_2004': groundtruth_summ_sents = [[ sent.strip() for sent in gt_summ_text.strip().split('\n') ] for gt_summ_text in groundtruth_summary_text] else: groundtruth_summ_sents = [[ sent.strip() for sent in groundtruth_summary_text.strip().split('\n') ]] groundtruth_summ_sent_tokens = [ sent.split(' ') for sent in groundtruth_summ_sents[0] ] if FLAGS.upper_bound: replaced_ssi_list = util.replace_empty_ssis( enforced_groundtruth_ssi_list, raw_article_sents) selected_article_sent_indices = util.flatten_list_of_lists( replaced_ssi_list) summary_sents = [ ' '.join(sent) for sent in util.reorder( article_sent_tokens, selected_article_sent_indices) ] similar_source_indices_list = groundtruth_similar_source_indices_list ssi_length_extractive = len(similar_source_indices_list) elif FLAGS.lead: lead_ssi_list = [(idx, ) for idx in list( range(util.average_sents_for_dataset[FLAGS.dataset_name]))] lead_ssi_list = lead_ssi_list[:len( raw_article_sents )] # make sure the sentence indices don't go past the total number of sentences in the article selected_article_sent_indices = util.flatten_list_of_lists( lead_ssi_list) summary_sents = [ ' '.join(sent) for sent in util.reorder( article_sent_tokens, selected_article_sent_indices) ] similar_source_indices_list = lead_ssi_list ssi_length_extractive = len(similar_source_indices_list) else: summary_sents, similar_source_indices_list, summary_sents_for_html, ssi_length_extractive = generate_summary( article_sent_tokens, qid_ssi_to_importances, example_idx) similar_source_indices_list_trunc = similar_source_indices_list[: ssi_length_extractive] summary_sents_for_html_trunc = summary_sents_for_html[: ssi_length_extractive] if example_idx <= 100: summary_sent_tokens = [ sent.split(' ') for sent in summary_sents_for_html_trunc ] extracted_sents_in_article_html = html_highlight_sents_in_article( summary_sent_tokens, similar_source_indices_list_trunc, article_sent_tokens, doc_indices=doc_indices) # write_highlighted_html(extracted_sents_in_article_html, html_dir, example_idx) groundtruth_ssi_list, lcs_paths_list, article_lcs_paths_list = get_simple_source_indices_list( groundtruth_summ_sent_tokens, article_sent_tokens, None, sentence_limit, min_matched_tokens) groundtruth_highlighted_html = html_highlight_sents_in_article( groundtruth_summ_sent_tokens, groundtruth_ssi_list, article_sent_tokens, lcs_paths_list=lcs_paths_list, article_lcs_paths_list=article_lcs_paths_list, doc_indices=doc_indices) all_html = '<u>System Summary</u><br><br>' + extracted_sents_in_article_html + '<u>Groundtruth Summary</u><br><br>' + groundtruth_highlighted_html write_highlighted_html(all_html, html_dir, example_idx) rouge_functions.write_for_rouge(groundtruth_summ_sents, summary_sents, example_idx, ref_dir, dec_dir) return (groundtruth_similar_source_indices_list, similar_source_indices_list, ssi_length_extractive)
def text_generator(self, example_generator): """Generates article and abstract text from tf.Example. Args: example_generator: a generator of tf.Examples from file. See data.example_generator""" # i = 0 while True: # i += 1 e = next(example_generator) # e is a tf.Example abstract_texts = [] raw_article_sents = [] # if self._hps.pg_mmr or '_sent' in self._hps.dataset_name: try: # names_to_types = [('raw_article_sents', 'string_list'), ('similar_source_indices', 'delimited_list_of_tuples'), ('summary_text', 'string'), ('corefs', 'json'), ('article_lcs_paths_list', 'delimited_list_of_list_of_lists')] names_to_types = [('raw_article_sents', 'string_list'), ('similar_source_indices', 'delimited_list_of_tuples'), ('summary_text', 'string_list'), ('corefs', 'json'), ('article_lcs_paths_list', 'delimited_list_of_list_of_lists')] if self._hps.dataset_name == 'duc_2004': names_to_types[2] = ('summary_text', 'string_list') # raw_article_sents, ssi, groundtruth_summary_text, corefs, article_lcs_paths_list = util.unpack_tf_example( # e, names_to_types) raw_article_sents, ssi, groundtruth_summary_sents, corefs, article_lcs_paths_list = util.unpack_tf_example( e, names_to_types) groundtruth_summary_text = '\n'.join(groundtruth_summary_sents) article_sent_tokens = [ util.process_sent(sent) for sent in raw_article_sents ] article_text = ' '.join( [' '.join(sent) for sent in article_sent_tokens]) if self._hps.dataset_name == 'duc_2004': abstract_sentences = [[ '<s> ' + sent.strip() + ' </s>' for sent in gt_summ_text.strip().split('\n') ] for gt_summ_text in groundtruth_summary_text] abstract_sentences = [ abs_sents[:max_dec_sents] for abs_sents in abstract_sentences ] abstract_texts = [ ' '.join(abs_sents) for abs_sents in abstract_sentences ] else: abstract_sentences = [ '<s> ' + sent.strip() + ' </s>' for sent in groundtruth_summary_text.strip().split('\n') ] abstract_sentences = abstract_sentences[:max_dec_sents] abstract_texts = [' '.join(abstract_sentences)] if 'doc_indices' not in e.features.feature or len( e.features.feature['doc_indices'].bytes_list.value ) == 0: num_words = len(article_text.split()) doc_indices_text = '0 ' * num_words else: doc_indices_text = e.features.feature[ 'doc_indices'].bytes_list.value[0] sentence_limit = 1 if self._hps.singles_and_pairs == 'singles' else 2 ssi = util.enforce_sentence_limit(ssi, sentence_limit) ssi = ssi[:max_dec_sents] article_lcs_paths_list = util.enforce_sentence_limit( article_lcs_paths_list, sentence_limit) article_lcs_paths_list = article_lcs_paths_list[:max_dec_sents] ssi, article_lcs_paths_list = util.make_ssi_chronological( ssi, article_lcs_paths_list) except: logging.error('Failed to get article or abstract from example') raise # continue # else: # try: # article_text = e.features.feature['article'].bytes_list.value[0] # the article text was saved under the key 'article' in the data files # for abstract in e.features.feature['abstract'].bytes_list.value: # abstract_texts.append(abstract) # the abstract text was saved under the key 'abstract' in the data files # if 'doc_indices' not in e.features.feature or len(e.features.feature['doc_indices'].bytes_list.value) == 0: # num_words = len(article_text.split()) # doc_indices_text = '0 ' * num_words # else: # doc_indices_text = e.features.feature['doc_indices'].bytes_list.value[0] # for sent in e.features.feature['raw_article_sents'].bytes_list.value: # raw_article_sents.append(sent) # the abstract text was saved under the key 'abstract' in the data files # ssi = None # article_lcs_paths_list = None # except ValueError: # logging.error('Failed to get article or abstract from example\n*********************************************') # raise # # continue if len( article_text ) == 0: # See https://github.com/abisee/pointer-generator/issues/1 logging.warning( 'Found an example with empty article text. Skipping it.\n*********************************************' ) elif len(article_text.strip().split() ) < 3 and self._hps.skip_with_less_than_3: print( 'Article has less than 3 tokens, so skipping\n*********************************************' ) elif len(abstract_texts[0].strip().split() ) < 3 and self._hps.skip_with_less_than_3: print( 'Abstract has less than 3 tokens, so skipping\n*********************************************' ) else: # print i yield (article_text, abstract_texts, doc_indices_text, raw_article_sents, ssi, article_lcs_paths_list)
def fill_example_queue(self): """Reads data from file and processes into Examples which are then placed into the example queue.""" if self._example_generator is None: input_gen = self.text_generator( data.example_generator(self._data_path, self._single_pass, self._cnn_500_dm_500, is_original=('with_coref' not in self._data_path))) else: input_gen = self.text_generator(self._example_generator) if self._hps.pg_mmr and self._hps.ssi_data_path != '': # if use pg_mmr and bert print(util.bcolors.OKGREEN + "Loading SSI from BERT at %s" % os.path.join(self._hps.ssi_data_path, 'ssi.pkl') + util.bcolors.ENDC) with open(os.path.join(self._hps.ssi_data_path, 'ssi.pkl')) as f: ssi_triple_list = pickle.load(f) # ssi_list = [ssi_triple[1] for ssi_triple in ssi_triple_list] else: ssi_triple_list = None counter = 0 while True: try: ( article, abstracts, doc_indices_str, raw_article_sents, ssi, article_lcs_paths_list ) = next( input_gen ) # read the next example from file. article and abstract are both strings. except StopIteration: # if there are no more examples: logging.info( "The example generator for this example queue filling thread has exhausted data." ) if self._single_pass: logging.info( "single_pass mode is on, so we've finished reading dataset. This thread is stopping." ) self._finished_reading = True if ssi_triple_list is not None and counter < len( ssi_triple_list): raise Exception( 'Len of ssi list (%d) is greater than number of examples (%d)' % (len(ssi_triple_list), counter)) break else: raise Exception( "single_pass mode is off but the example generator is out of data; error." ) if ssi_triple_list is not None: if counter >= len(ssi_triple_list): raise Exception( 'Len of ssi list (%d) is less than number of examples (>=%d)' % (len(ssi_triple_list), counter)) ssi_length_extractive = ssi_triple_list[counter][2] ssi = ssi_triple_list[counter][1] ssi = ssi[:ssi_length_extractive] article = article abstracts = [abstract for abstract in abstracts] if type(doc_indices_str) != str: doc_indices_str = doc_indices_str raw_article_sents = [sent for sent in raw_article_sents] all_abstract_sentences = [[ sent.strip() for sent in data.abstract2sents(abstract) ] for abstract in abstracts] if len(all_abstract_sentences) != 0: abstract_sentences = all_abstract_sentences[0] else: abstract_sentences = [] doc_indices = [int(idx) for idx in doc_indices_str.strip().split()] # join_separator = ' [SEP] ' if self._hps.sep else ' ' if self._hps.by_instance: # if we are running iteratively on only instances (a singleton/pair + a summary sentence), not the whole article for abs_idx, abstract_sentence in enumerate( abstract_sentences): inst_ssi = ssi[abs_idx] if len(inst_ssi) == 0: continue inst_abstract_sentences = abstract_sentence inst_raw_article_sents = util.reorder( raw_article_sents, inst_ssi) inst_article = ' '.join([ ' '.join(util.process_sent(sent, whitespace=True)) for sent in inst_raw_article_sents ]) inst_doc_indices = [0] * len(inst_article.split()) inst_article_lcs_paths_list = article_lcs_paths_list[ abs_idx] if len( inst_article ) == 0: # See https://github.com/abisee/pointer-generator/issues/1 logging.warning( 'Found an example with empty article text. Skipping it.\n*********************************************' ) elif len(inst_article.strip().split() ) < 3 and self._hps.skip_with_less_than_3: print( 'Article has less than 3 tokens, so skipping\n*********************************************' ) elif len(inst_abstract_sentences.strip().split() ) < 3 and self._hps.skip_with_less_than_3: print( 'Abstract has less than 3 tokens, so skipping\n*********************************************' ) else: inst_example = Example(None, [inst_abstract_sentences], all_abstract_sentences, None, inst_raw_article_sents, None, [inst_article_lcs_paths_list], self._vocab, self._hps) self._example_queue.put(inst_example) else: example = Example(None, abstract_sentences, all_abstract_sentences, None, raw_article_sents, ssi, article_lcs_paths_list, self._vocab, self._hps) # Process into an Example. self._example_queue.put( example) # place the Example in the example queue. # print "example num", counter counter += 1
def __init__(self, article, abstract_sentences, all_abstract_sentences, doc_indices, raw_article_sents, ssi, article_lcs_paths_list, vocab, hps): """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self. Args: article: source text; a string. each token is separated by a single space. abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space. vocab: Vocabulary object hps: hyperparameters """ self.hps = hps # Get ids of special tokens start_decoding = vocab.word2id(data.START_DECODING) stop_decoding = vocab.word2id(data.STOP_DECODING) # # Process the article # article_words = article.split() # if len(article_words) > hps.max_enc_steps: # article_words = article_words[:hps.max_enc_steps] # self.enc_input = [vocab.word2id(w) for w in article_words] # list of word ids; OOVs are represented by the id for UNK token # Process the abstract abstract = ' '.join(abstract_sentences) # string abstract_words = abstract.split() # list of strings abs_ids = [ vocab.word2id(w) for w in abstract_words ] # list of word ids; OOVs are represented by the id for UNK token # Get the decoder input sequence and target sequence self.dec_input, self.target = self.get_dec_inp_targ_seqs( abs_ids, hps.max_dec_steps, start_decoding, stop_decoding) self.dec_len = len(self.dec_input) # If using pointer-generator mode, we need to store some extra info if hps.pointer_gen: if raw_article_sents is not None and len(raw_article_sents) > 0: # self.tokenized_sents = [util.process_sent(sent) for sent in raw_article_sents] self.tokenized_sents = [ util.process_sent(sent, whitespace=True) for sent in raw_article_sents ] if self.hps.sep: for sent in self.tokenized_sents[:-1]: sent.append(data.SEP_TOKEN) # Process the article article_words = util.flatten_list_of_lists( self.tokenized_sents) if len(article_words) > hps.max_enc_steps: article_words = article_words[:hps.max_enc_steps] self.enc_input = [ vocab.word2id(w) for w in article_words ] # list of word ids; OOVs are represented by the id for UNK token if len(all_abstract_sentences) == 1: doc_indices = [0] * len(article_words) self.word_ids_sents, self.article_oovs = data.tokenizedarticle2ids( self.tokenized_sents, vocab) self.enc_input_extend_vocab = util.flatten_list_of_lists( self.word_ids_sents) if len(self.enc_input_extend_vocab) > hps.max_enc_steps: self.enc_input_extend_vocab = self.enc_input_extend_vocab[: hps . max_enc_steps] self.enc_len = len( self.enc_input_extend_vocab ) # store the length after truncation but before padding else: # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves article_str = util.to_unicode(article) raw_article_sents = nltk.tokenize.sent_tokenize(article_str) self.tokenized_sents = [ util.process_sent(sent) for sent in raw_article_sents ] # Process the article article_words = util.flatten_list_of_lists( self.tokenized_sents) if len(article_words) > hps.max_enc_steps: article_words = article_words[:hps.max_enc_steps] self.enc_input = [ vocab.word2id(w) for w in article_words ] # list of word ids; OOVs are represented by the id for UNK token if len(all_abstract_sentences) == 1: doc_indices = [0] * len(article_words) self.word_ids_sents, self.article_oovs = data.tokenizedarticle2ids( self.tokenized_sents, vocab) self.enc_input_extend_vocab = util.flatten_list_of_lists( self.word_ids_sents) # self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab) if len(self.enc_input_extend_vocab) > hps.max_enc_steps: self.enc_input_extend_vocab = self.enc_input_extend_vocab[: hps . max_enc_steps] self.enc_len = len( self.enc_input_extend_vocab ) # store the length after truncation but before padding if self.hps.word_imp_reg: self.enc_importances = self.get_enc_importances( self.tokenized_sents, abstract_words) # Get a verison of the reference summary where in-article OOVs are represented by their temporary article OOV id abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs) # Overwrite decoder target sequence so it uses the temp article OOV ids _, self.target = self.get_dec_inp_targ_seqs( abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding) if ssi is not None: # Translate the similar source indices into masks over the encoder input self.ssi_masks = [] for source_indices in ssi: ssi_sent_mask = [0.] * len(raw_article_sents) for source_idx in source_indices: if source_idx >= len(ssi_sent_mask): a = 0 ssi_sent_mask[source_idx] = 1. ssi_mask = pg_mmr_functions.convert_to_word_level( ssi_sent_mask, self.tokenized_sents) self.ssi_masks.append(ssi_mask) summary_sent_tokens = [ sent.strip().split() for sent in abstract_sentences ] if self.hps.ssi_data_path is None and len( self.ssi_masks) != len(summary_sent_tokens): raise Exception( 'len(self.ssi_masks) != len(summary_sent_tokens)') self.sent_indices = pg_mmr_functions.convert_to_word_level( list(range(len(summary_sent_tokens))), summary_sent_tokens).tolist() if article_lcs_paths_list is not None: if len(article_lcs_paths_list) > 1: raise Exception('Need to implement for non-sent_dataset') article_lcs_paths = article_lcs_paths_list[0] imp_mask = [0] * len(article_words) to_add = 0 for source_idx, word_indices_list in enumerate(article_lcs_paths): if source_idx > 0: to_add += len(self.tokenized_sents[source_idx - 1]) for word_idx in word_indices_list: if word_idx + to_add >= len(imp_mask): if len(imp_mask) == hps.max_enc_steps: continue else: print(self.tokenized_sents, article_lcs_paths) raise Exception( 'word_idx + to_add (%d) is larger than imp_mask size (%d)' % (word_idx + to_add, len(imp_mask))) imp_mask[word_idx + to_add] = 1 self.importance_mask = imp_mask # Store the original strings self.original_article = article self.raw_article_sents = raw_article_sents self.original_abstract = abstract self.original_abstract_sents = abstract_sentences self.all_original_abstract_sents = all_abstract_sentences self.doc_indices = doc_indices self.ssi = ssi self.article_lcs_paths_list = article_lcs_paths_list
def main(unused_argv): print('Running statistics on %s' % FLAGS.dataset_name) if len(unused_argv ) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) if FLAGS.dataset_name == 'all': dataset_names = ['cnn_dm', 'xsum', 'duc_2004'] else: dataset_names = [FLAGS.dataset_name] if not os.path.exists(plot_data_file): all_lists_of_histogram_pairs = [] for dataset_name in dataset_names: FLAGS.dataset_name = dataset_name if dataset_name == 'duc_2004': dataset_splits = ['test'] elif FLAGS.dataset_split == 'all': dataset_splits = ['test', 'val', 'train'] else: dataset_splits = [FLAGS.dataset_split] ssi_list = [] for dataset_split in dataset_splits: ssi_path = os.path.join(ssi_dir, FLAGS.dataset_name, dataset_split + '_ssi.pkl') with open(ssi_path) as f: ssi_list.extend(pickle.load(f)) if FLAGS.dataset_name == 'duc_2004': for abstract_idx in [1, 2, 3]: ssi_path = os.path.join( ssi_dir, FLAGS.dataset_name, dataset_split + '_ssi_' + str(abstract_idx) + '.pkl') with open(ssi_path) as f: temp_ssi_list = pickle.load(f) ssi_list.extend(temp_ssi_list) ssi_2d = util.flatten_list_of_lists(ssi_list) num_extracted = [ len(ssi) for ssi in util.flatten_list_of_lists(ssi_list) ] hist_num_extracted = np.histogram(num_extracted, bins=6, range=(0, 5)) print(hist_num_extracted) print('Histogram of number of sentences merged: ' + util.hist_as_pdf_str(hist_num_extracted)) distances = [ abs(ssi[0] - ssi[1]) for ssi in ssi_2d if len(ssi) >= 2 ] print('Distance between sentences (mean, median): ', np.mean(distances), np.median(distances)) hist_dist = np.histogram(distances, bins=max(distances)) print('Histogram of distances: ' + util.hist_as_pdf_str(hist_dist)) summ_sent_idx_to_number_of_source_sents = [[], [], [], [], [], [], [], [], [], []] for ssi in ssi_list: for summ_sent_idx, source_indices in enumerate(ssi): if len(source_indices) == 0 or summ_sent_idx >= len( summ_sent_idx_to_number_of_source_sents): continue num_sents = len(source_indices) if num_sents > 2: num_sents = 2 summ_sent_idx_to_number_of_source_sents[ summ_sent_idx].append(num_sents) print( "Number of source sents for summary sentence indices (Is the first summary sent more likely to match with a singleton or a pair?):" ) for summ_sent_idx, list_of_numbers_of_source_sents in enumerate( summ_sent_idx_to_number_of_source_sents): if len(list_of_numbers_of_source_sents) == 0: percent_singleton = 0. else: percent_singleton = list_of_numbers_of_source_sents.count( 1) * 1. / len(list_of_numbers_of_source_sents) percent_pair = list_of_numbers_of_source_sents.count( 2) * 1. / len(list_of_numbers_of_source_sents) print str(percent_singleton) + '\t', print '' for summ_sent_idx, list_of_numbers_of_source_sents in enumerate( summ_sent_idx_to_number_of_source_sents): if len(list_of_numbers_of_source_sents) == 0: percent_pair = 0. else: percent_singleton = list_of_numbers_of_source_sents.count( 1) * 1. / len(list_of_numbers_of_source_sents) percent_pair = list_of_numbers_of_source_sents.count( 2) * 1. / len(list_of_numbers_of_source_sents) print str(percent_pair) + '\t', print '' primary_pos = [ssi[0] for ssi in ssi_2d if len(ssi) >= 1] secondary_pos = [ssi[1] for ssi in ssi_2d if len(ssi) >= 2] all_pos = [max(ssi) for ssi in ssi_2d if len(ssi) >= 1] # if FLAGS.dataset_name != 'duc_2004': # plot_positions(primary_pos, secondary_pos, all_pos) if FLAGS.dataset_split == 'all': glob_string = '*.bin' else: glob_string = dataset_splits[0] print('Loading TFIDF vectorizer') with open(tfidf_vec_path, 'rb') as f: tfidf_vectorizer = pickle.load(f) source_dir = os.path.join(data_dir, FLAGS.dataset_name) source_files = sorted( glob.glob(source_dir + '/' + glob_string + '*')) total = len(source_files) * 1000 if ( 'cnn' in FLAGS.dataset_name or 'newsroom' in FLAGS.dataset_name or 'xsum' in FLAGS.dataset_name) else len(source_files) example_generator = data.example_generator( source_dir + '/' + glob_string + '*', True, False, should_check_valid=False) all_possible_singles = 0 all_possible_pairs = [0] all_filtered_pairs = 0 all_all_combinations = 0 all_ssi_pairs = [0] ssi_pairs_with_shared_coref = [0] ssi_pairs_with_shared_word = [0] ssi_pairs_with_either_coref_or_word = [0] all_pairs_with_shared_coref = [0] all_pairs_with_shared_word = [0] all_pairs_with_either_coref_or_word = [0] actual_total = [0] rel_positions_primary = [] rel_positions_secondary = [] rel_positions_all = [] sent_lens = [] all_sent_lens = [] all_pos = [] y = [] normalized_positions_primary = [] normalized_positions_secondary = [] all_normalized_positions_primary = [] all_normalized_positions_secondary = [] normalized_positions_singles = [] normalized_positions_pairs_first = [] normalized_positions_pairs_second = [] primary_pos_duc = [] secondary_pos_duc = [] all_pos_duc = [] all_distances = [] distances_duc = [] tfidf_similarities = [] all_tfidf_similarities = [] average_mmrs = [] all_average_mmrs = [] for example_idx, example in enumerate( tqdm(example_generator, total=total)): # def process(example_idx_example): # # print '0' # example = example_idx_example if FLAGS.num_instances != -1 and example_idx >= FLAGS.num_instances: break raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs, doc_indices = util.unpack_tf_example( example, names_to_types) article_sent_tokens = [ util.process_sent(sent) for sent in raw_article_sents ] article_text = ' '.join(raw_article_sents) groundtruth_summ_sents = [[ sent.strip() for sent in groundtruth_summary_text.strip().split('\n') ]] if doc_indices is None: doc_indices = [0] * len( util.flatten_list_of_lists(article_sent_tokens)) doc_indices = [int(doc_idx) for doc_idx in doc_indices] rel_sent_indices, doc_sent_indices, doc_sent_lens = preprocess_for_lambdamart_no_flags.get_rel_sent_indices( doc_indices, article_sent_tokens) groundtruth_similar_source_indices_list = util.enforce_sentence_limit( groundtruth_similar_source_indices_list, FLAGS.sentence_limit) sent_term_matrix = util.get_doc_substituted_tfidf_matrix( tfidf_vectorizer, raw_article_sents, article_text) sents_similarities = util.cosine_similarity( sent_term_matrix, sent_term_matrix) importances = util.special_squash( util.get_tfidf_importances(tfidf_vectorizer, raw_article_sents)) if FLAGS.dataset_name == 'duc_2004': first_k_indices = lambdamart_scores_to_summaries.get_indices_of_first_k_sents_of_each_article( rel_sent_indices, FLAGS.first_k) else: first_k_indices = [ idx for idx in range(len(raw_article_sents)) ] article_indices = list(range(len(raw_article_sents))) possible_pairs = [ x for x in list(itertools.combinations(article_indices, 2)) ] # all pairs # # # filtered_possible_pairs = preprocess_for_lambdamart_no_flags.filter_pairs_by_criteria(raw_article_sents, possible_pairs, corefs) # if FLAGS.dataset_name == 'duc_2004': # filtered_possible_pairs = [x for x in list(itertools.combinations(first_k_indices, 2))] # all pairs # else: # filtered_possible_pairs = preprocess_for_lambdamart_no_flags.filter_pairs_by_sent_position(possible_pairs) # # removed_pairs = list(set(possible_pairs) - set(filtered_possible_pairs)) # possible_singles = [(i,) for i in range(len(raw_article_sents))] # all_combinations = filtered_possible_pairs + possible_singles # # all_possible_singles += len(possible_singles) # all_possible_pairs[0] += len(possible_pairs) # all_filtered_pairs += len(filtered_possible_pairs) # all_all_combinations += len(all_combinations) # for ssi in groundtruth_similar_source_indices_list: # if len(ssi) > 0: # idx = rel_sent_indices[ssi[0]] # rel_positions_primary.append(idx) # rel_positions_all.append(idx) # if len(ssi) > 1: # idx = rel_sent_indices[ssi[1]] # rel_positions_secondary.append(idx) # rel_positions_all.append(idx) # # # # coref_pairs = preprocess_for_lambdamart_no_flags.get_coref_pairs(corefs) # # DO OVER LAP PAIRS BETTER # overlap_pairs = preprocess_for_lambdamart_no_flags.filter_by_overlap(article_sent_tokens, possible_pairs) # either_coref_or_word = list(set(list(coref_pairs) + overlap_pairs)) # # for ssi in groundtruth_similar_source_indices_list: # if len(ssi) == 2: # all_ssi_pairs[0] += 1 # do_share_coref = ssi in coref_pairs # do_share_words = ssi in overlap_pairs # if do_share_coref: # ssi_pairs_with_shared_coref[0] += 1 # if do_share_words: # ssi_pairs_with_shared_word[0] += 1 # if do_share_coref or do_share_words: # ssi_pairs_with_either_coref_or_word[0] += 1 # all_pairs_with_shared_coref[0] += len(coref_pairs) # all_pairs_with_shared_word[0] += len(overlap_pairs) # all_pairs_with_either_coref_or_word[0] += len(either_coref_or_word) if FLAGS.dataset_name == 'duc_2004': primary_pos_duc.extend([ rel_sent_indices[ssi[0]] for ssi in groundtruth_similar_source_indices_list if len(ssi) >= 1 ]) secondary_pos_duc.extend([ rel_sent_indices[ssi[1]] for ssi in groundtruth_similar_source_indices_list if len(ssi) >= 2 ]) all_pos_duc.extend([ max([rel_sent_indices[sent_idx] for sent_idx in ssi]) for ssi in groundtruth_similar_source_indices_list if len(ssi) >= 1 ]) for ssi in groundtruth_similar_source_indices_list: for sent_idx in ssi: sent_lens.append(len(article_sent_tokens[sent_idx])) if len(ssi) >= 1: orig_val = ssi[0] vals_to_add = get_integral_values_for_histogram( orig_val, rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents) normalized_positions_primary.extend(vals_to_add) if len(ssi) >= 2: orig_val = ssi[1] vals_to_add = get_integral_values_for_histogram( orig_val, rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents) normalized_positions_secondary.extend(vals_to_add) if FLAGS.dataset_name == 'duc_2004': distances_duc.append( abs(rel_sent_indices[ssi[1]] - rel_sent_indices[ssi[0]])) tfidf_similarities.append(sents_similarities[ssi[0], ssi[1]]) average_mmrs.append( (importances[ssi[0]] + importances[ssi[1]]) / 2) for ssi in groundtruth_similar_source_indices_list: if len(ssi) == 1: orig_val = ssi[0] vals_to_add = get_integral_values_for_histogram( orig_val, rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents) normalized_positions_singles.extend(vals_to_add) if len(ssi) >= 2: if doc_sent_indices[ssi[0]] != doc_sent_indices[ ssi[1]]: continue orig_val_first = min(ssi[0], ssi[1]) vals_to_add = get_integral_values_for_histogram( orig_val_first, rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents) normalized_positions_pairs_first.extend(vals_to_add) orig_val_second = max(ssi[0], ssi[1]) vals_to_add = get_integral_values_for_histogram( orig_val_second, rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents) normalized_positions_pairs_second.extend(vals_to_add) # all_normalized_positions_primary.extend(util.flatten_list_of_lists([get_integral_values_for_histogram(single[0], rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents) for single in possible_singles])) # all_normalized_positions_secondary.extend(util.flatten_list_of_lists([get_integral_values_for_histogram(pair[1], rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents) for pair in possible_pairs])) all_sent_lens.extend( [len(sent) for sent in article_sent_tokens]) all_distances.extend([ abs(rel_sent_indices[pair[1]] - rel_sent_indices[pair[0]]) for pair in possible_pairs ]) all_tfidf_similarities.extend([ sents_similarities[pair[0], pair[1]] for pair in possible_pairs ]) all_average_mmrs.extend([ (importances[pair[0]] + importances[pair[1]]) / 2 for pair in possible_pairs ]) # if FLAGS.dataset_name == 'duc_2004': # rel_pos_single = [rel_sent_indices[single[0]] for single in possible_singles] # rel_pos_pair = [[rel_sent_indices[pair[0]], rel_sent_indices[pair[1]]] for pair in possible_pairs] # all_pos.extend(rel_pos_single) # all_pos.extend([max(pair) for pair in rel_pos_pair]) # else: # all_pos.extend(util.flatten_list_of_lists(possible_singles)) # all_pos.extend([max(pair) for pair in possible_pairs]) # y.extend([1 if single in groundtruth_similar_source_indices_list else 0 for single in possible_singles]) # y.extend([1 if pair in groundtruth_similar_source_indices_list else 0 for pair in possible_pairs]) # actual_total[0] += 1 # # p = Pool(144) # # list(tqdm(p.imap(process, example_generator), total=total)) # # # print 'Possible_singles\tPossible_pairs\tFiltered_pairs\tAll_combinations: \n%.2f\t%.2f\t%.2f\t%.2f' % (all_possible_singles*1./actual_total, \ # # all_possible_pairs*1./actual_total, all_filtered_pairs*1./actual_total, all_all_combinations*1./actual_total) # # # # # print 'Relative positions of groundtruth source sentences in document:\nPrimary\tSecondary\tBoth\n%.2f\t%.2f\t%.2f' % (np.mean(rel_positions_primary), np.mean(rel_positions_secondary), np.mean(rel_positions_all)) # # # # print 'SSI Pair statistics:\nShare_coref\tShare_word\tShare_either\n%.2f\t%.2f\t%.2f' \ # # % (ssi_pairs_with_shared_coref[0]*100./all_ssi_pairs[0], ssi_pairs_with_shared_word[0]*100./all_ssi_pairs[0], ssi_pairs_with_either_coref_or_word[0]*100./all_ssi_pairs[0]) # # print 'All Pair statistics:\nShare_coref\tShare_word\tShare_either\n%.2f\t%.2f\t%.2f' \ # # % (all_pairs_with_shared_coref[0]*100./all_possible_pairs[0], all_pairs_with_shared_word[0]*100./all_possible_pairs[0], all_pairs_with_either_coref_or_word[0]*100./all_possible_pairs[0]) # # # hist_all_pos = np.histogram(all_pos, bins=max(all_pos)+1) # # print 'Histogram of all sent positions: ', util.hist_as_pdf_str(hist_all_pos) # # min_sent_len = min(sent_lens) # # hist_sent_lens = np.histogram(sent_lens, bins=max(sent_lens)-min_sent_len+1) # # print 'min, max sent lens:', min_sent_len, max(sent_lens) # # print 'Histogram of sent lens: ', util.hist_as_pdf_str(hist_sent_lens) # # min_all_sent_len = min(all_sent_lens) # # hist_all_sent_lens = np.histogram(all_sent_lens, bins=max(all_sent_lens)-min_all_sent_len+1) # # print 'min, max all sent lens:', min_all_sent_len, max(all_sent_lens) # # print 'Histogram of all sent lens: ', util.hist_as_pdf_str(hist_all_sent_lens) # # # print 'Pearsons r, p value', pearsonr(all_pos, y) # # fig, ax1 = plt.subplots(nrows=1) # # plt.scatter(all_pos, y) # # pp = PdfPages(os.path.join('stuff/plots', FLAGS.dataset_name + '_position_scatter.pdf')) # # plt.savefig(pp, format='pdf',bbox_inches='tight') # # plt.show() # # pp.close() # # # if FLAGS.dataset_name == 'duc_2004': # # plot_positions(primary_pos_duc, secondary_pos_duc, all_pos_duc) # # normalized_positions_all = normalized_positions_primary + normalized_positions_secondary # # plot_histogram(normalized_positions_primary, num_bins=100) # # plot_histogram(normalized_positions_secondary, num_bins=100) # # plot_histogram(normalized_positions_all, num_bins=100) # # sent_lens_together = [sent_lens, all_sent_lens] # # plot_histogram(sent_lens_together, pdf=True, start_at_0=True, max_val=70) # # if FLAGS.dataset_name == 'duc_2004': # distances = distances_duc # sent_distances_together = [distances, all_distances] # # plot_histogram(sent_distances_together, pdf=True, start_at_0=True, max_val=100) # # tfidf_similarities_together = [tfidf_similarities, all_tfidf_similarities] # # plot_histogram(tfidf_similarities_together, pdf=True, num_bins=100) # # average_mmrs_together = [average_mmrs, all_average_mmrs] # # plot_histogram(average_mmrs_together, pdf=True, num_bins=100) # # normalized_positions_primary_together = [normalized_positions_primary, bin_values] # normalized_positions_secondary_together = [normalized_positions_secondary, bin_values] # # plot_histogram(normalized_positions_primary_together, pdf=True, num_bins=100) # # plot_histogram(normalized_positions_secondary_together, pdf=True, num_bins=100) # # # list_of_hist_pairs = [ # { # 'lst': normalized_positions_primary_together, # 'pdf': True, # 'num_bins': 100, # 'y_lim': 3.9, # 'y_label': FLAGS.dataset_name, # 'x_label': 'Sent position (primary)' # }, # { # 'lst': normalized_positions_secondary_together, # 'pdf': True, # 'num_bins': 100, # 'y_lim': 3.9, # 'x_label': 'Sent position (secondary)' # }, # { # 'lst': sent_distances_together, # 'pdf': True, # 'start_at_0': True, # 'max_val': 100, # 'x_label': 'Sent distance' # }, # { # 'lst': sent_lens_together, # 'pdf': True, # 'start_at_0': True, # 'max_val': 70, # 'x_label': 'Sent length' # }, # { # 'lst': average_mmrs_together, # 'pdf': True, # 'num_bins': 100, # 'x_label': 'Average TF-IDF importance' # } # ] normalized_positions_pairs_together = [ normalized_positions_pairs_first, normalized_positions_pairs_second ] list_of_hist_pairs = [ { 'lst': [normalized_positions_singles], 'pdf': True, 'num_bins': 100, # 'y_lim': 3.9, 'x_lim': 1.0, 'y_label': FLAGS.dataset_name, 'x_label': 'Sent Position (Singles)', 'legend_labels': ['Primary'] }, { 'lst': normalized_positions_pairs_together, 'pdf': True, 'num_bins': 100, # 'y_lim': 3.9, 'x_lim': 1.0, 'x_label': 'Sent Position (Pairs)', 'legend_labels': ['Primary', 'Secondary'] } ] all_lists_of_histogram_pairs.append(list_of_hist_pairs) with open(plot_data_file, 'w') as f: cPickle.dump(all_lists_of_histogram_pairs, f) else: with open(plot_data_file) as f: all_lists_of_histogram_pairs = cPickle.load(f) plot_histograms(all_lists_of_histogram_pairs)
def main(unused_argv): print('Running statistics on %s' % FLAGS.dataset_name) if len(unused_argv ) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) if FLAGS.summarizer == 'all': summary_methods = list(summarizers.keys()) else: summary_methods = [FLAGS.summarizer] if FLAGS.dataset_name == 'all': dataset_names = datasets else: dataset_names = [FLAGS.dataset_name] sheets_strs = [] for summary_method in summary_methods: summary_fn = summarizers[summary_method] for dataset_name in dataset_names: FLAGS.dataset_name = dataset_name original_dataset_name = 'xsum' if 'xsum' in dataset_name else 'cnn_dm' if 'cnn_dm' in dataset_name or 'duc_2004' in dataset_name else '' vocab = Vocab('logs/vocab' + '_' + original_dataset_name, 50000) # create a vocabulary source_dir = os.path.join(data_dir, dataset_name) source_files = sorted( glob.glob(source_dir + '/' + FLAGS.dataset_split + '*')) total = len(source_files) * 1000 if ( 'cnn' in dataset_name or 'newsroom' in dataset_name or 'xsum' in dataset_name) else len(source_files) example_generator = data.example_generator( source_dir + '/' + FLAGS.dataset_split + '*', True, False, should_check_valid=False) if dataset_name == 'duc_2004': abs_source_dir = os.path.join( os.path.expanduser('~') + '/data/tf_data/with_coref', dataset_name) abs_example_generator = data.example_generator( abs_source_dir + '/' + FLAGS.dataset_split + '*', True, False, should_check_valid=False) abs_names_to_types = [('abstract', 'string_list')] triplet_ssi_list = [] for example_idx, example in enumerate( tqdm(example_generator, total=total)): raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs, doc_indices = util.unpack_tf_example( example, names_to_types) if dataset_name == 'duc_2004': abs_example = next(abs_example_generator) groundtruth_summary_texts = util.unpack_tf_example( abs_example, abs_names_to_types) groundtruth_summary_texts = groundtruth_summary_texts[0] groundtruth_summ_sents_list = [[ sent.strip() for sent in data.abstract2sents(abstract) ] for abstract in groundtruth_summary_texts] else: groundtruth_summary_texts = [groundtruth_summary_text] groundtruth_summ_sents_list = [] for groundtruth_summary_text in groundtruth_summary_texts: groundtruth_summ_sents = [ sent.strip() for sent in groundtruth_summary_text.strip().split('\n') ] groundtruth_summ_sents_list.append( groundtruth_summ_sents) article_sent_tokens = [ util.process_sent(sent) for sent in raw_article_sents ] if doc_indices is None: doc_indices = [0] * len( util.flatten_list_of_lists(article_sent_tokens)) doc_indices = [int(doc_idx) for doc_idx in doc_indices] groundtruth_similar_source_indices_list = util.enforce_sentence_limit( groundtruth_similar_source_indices_list, FLAGS.sentence_limit) log_dir = os.path.join(log_root, dataset_name + '_' + summary_method) dec_dir = os.path.join(log_dir, 'decoded') ref_dir = os.path.join(log_dir, 'reference') util.create_dirs(dec_dir) util.create_dirs(ref_dir) parser = PlaintextParser.from_string( ' '.join(raw_article_sents), Tokenizer("english")) summarizer = summary_fn() summary = summarizer( parser.document, 5) #Summarize the document with 5 sentences summary = [str(sentence) for sentence in summary] summary_tokenized = [] for sent in summary: summary_tokenized.append(sent.lower()) rouge_functions.write_for_rouge(groundtruth_summ_sents_list, summary_tokenized, example_idx, ref_dir, dec_dir, log=False) decoded_sent_tokens = [ sent.split() for sent in summary_tokenized ] sentence_limit = 2 sys_ssi_list, _, _ = get_simple_source_indices_list( decoded_sent_tokens, article_sent_tokens, vocab, sentence_limit, min_matched_tokens) triplet_ssi_list.append( (groundtruth_similar_source_indices_list, sys_ssi_list, -1)) print('Evaluating Lambdamart model F1 score...') suffix = util.all_sent_selection_eval(triplet_ssi_list) print(suffix) results_dict = rouge_functions.rouge_eval(ref_dir, dec_dir) print(("Results_dict: ", results_dict)) sheets_str = rouge_functions.rouge_log(results_dict, log_dir, suffix=suffix) sheets_strs.append(dataset_name + '_' + summary_method + '\n' + sheets_str) for sheets_str in sheets_strs: print(sheets_str + '\n')
def evaluate_example(ex): example, example_idx, qid_ssi_to_importances, qid_ssi_to_token_scores_and_mappings = ex print(example_idx) # example_idx += 1 qid = example_idx raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs, doc_indices = util.unpack_tf_example( example, names_to_types) article_sent_tokens = [ util.process_sent(sent) for sent in raw_article_sents ] enforced_groundtruth_ssi_list = util.enforce_sentence_limit( groundtruth_similar_source_indices_list, sentence_limit) groundtruth_summ_sent_tokens = [] groundtruth_summ_sents = [[ sent.strip() for sent in groundtruth_summary_text.strip().split('\n') ]] groundtruth_summ_sent_tokens = [ sent.split(' ') for sent in groundtruth_summ_sents[0] ] if FLAGS.upper_bound: replaced_ssi_list = util.replace_empty_ssis( enforced_groundtruth_ssi_list, raw_article_sents) selected_article_sent_indices = util.flatten_list_of_lists( replaced_ssi_list) summary_sents = [ ' '.join(sent) for sent in util.reorder( article_sent_tokens, selected_article_sent_indices) ] similar_source_indices_list = groundtruth_similar_source_indices_list ssi_length_extractive = len(similar_source_indices_list) else: summary_sents, similar_source_indices_list, summary_sents_for_html, ssi_length_extractive, \ article_lcs_paths_list, token_probs_list = generate_summary(article_sent_tokens, qid_ssi_to_importances, example_idx, qid_ssi_to_token_scores_and_mappings) similar_source_indices_list_trunc = similar_source_indices_list[: ssi_length_extractive] summary_sents_for_html_trunc = summary_sents_for_html[: ssi_length_extractive] if example_idx < 100 or (example_idx >= 2000 and example_idx < 2100): summary_sent_tokens = [ sent.split(' ') for sent in summary_sents_for_html_trunc ] if FLAGS.tag_tokens and FLAGS.tag_loss_wt != 0: lcs_paths_list_param = copy.deepcopy(article_lcs_paths_list) else: lcs_paths_list_param = None extracted_sents_in_article_html = html_highlight_sents_in_article( summary_sent_tokens, similar_source_indices_list_trunc, article_sent_tokens, doc_indices=doc_indices, lcs_paths_list=lcs_paths_list_param) # write_highlighted_html(extracted_sents_in_article_html, html_dir, example_idx) groundtruth_ssi_list, gt_lcs_paths_list, gt_article_lcs_paths_list, gt_smooth_article_paths_list = get_simple_source_indices_list( groundtruth_summ_sent_tokens, article_sent_tokens, None, sentence_limit, min_matched_tokens) groundtruth_highlighted_html = html_highlight_sents_in_article( groundtruth_summ_sent_tokens, groundtruth_ssi_list, article_sent_tokens, lcs_paths_list=gt_lcs_paths_list, article_lcs_paths_list=gt_smooth_article_paths_list, doc_indices=doc_indices) all_html = '<u>System Summary</u><br><br>' + extracted_sents_in_article_html + '<u>Groundtruth Summary</u><br><br>' + groundtruth_highlighted_html # all_html = '<u>System Summary</u><br><br>' + extracted_sents_in_article_html write_highlighted_html(all_html, html_dir, example_idx) rouge_functions.write_for_rouge(groundtruth_summ_sents, summary_sents, example_idx, ref_dir, dec_dir) return (groundtruth_similar_source_indices_list, similar_source_indices_list, ssi_length_extractive, token_probs_list)
def __init__(self, article, abstract_sentences, all_abstract_sentences, doc_indices, raw_article_sents, ssi, vocab, hps): """Initializes the Example, performing tokenization and truncation to produce the encoder, decoder and target sequences, which are stored in self. Args: article: source text; a string. each token is separated by a single space. abstract_sentences: list of strings, one per abstract sentence. In each sentence, each token is separated by a single space. vocab: Vocabulary object hps: hyperparameters """ self.hps = hps # Get ids of special tokens start_decoding = vocab.word2id(data.START_DECODING) stop_decoding = vocab.word2id(data.STOP_DECODING) # # Process the article # article_words = article.split() # if len(article_words) > hps.max_enc_steps: # article_words = article_words[:hps.max_enc_steps] # self.enc_input = [vocab.word2id(w) for w in article_words] # list of word ids; OOVs are represented by the id for UNK token # Process the abstract abstract = ' '.join(abstract_sentences) # string abstract_words = abstract.split() # list of strings abs_ids = [vocab.word2id(w) for w in abstract_words] # list of word ids; OOVs are represented by the id for UNK token # Get the decoder input sequence and target sequence self.dec_input, self.target = self.get_dec_inp_targ_seqs(abs_ids, hps.max_dec_steps, start_decoding, stop_decoding) self.dec_len = len(self.dec_input) # If using pointer-generator mode, we need to store some extra info if hps.pointer_gen: if raw_article_sents is not None and len(raw_article_sents) > 0: # self.tokenized_sents = [util.process_sent(sent) for sent in raw_article_sents] self.tokenized_sents = [util.process_sent(sent, whitespace=True) for sent in raw_article_sents] # Process the article article_words = util.flatten_list_of_lists(self.tokenized_sents) if len(article_words) > hps.max_enc_steps: article_words = article_words[:hps.max_enc_steps] self.enc_input = [vocab.word2id(w) for w in article_words] # list of word ids; OOVs are represented by the id for UNK token if len(all_abstract_sentences) == 1: doc_indices = [0] * len(article_words) self.word_ids_sents, self.article_oovs = data.tokenizedarticle2ids(self.tokenized_sents, vocab) self.enc_input_extend_vocab = util.flatten_list_of_lists(self.word_ids_sents) if len(self.enc_input_extend_vocab) > hps.max_enc_steps: self.enc_input_extend_vocab = self.enc_input_extend_vocab[:hps.max_enc_steps] self.enc_len = len(self.enc_input_extend_vocab) # store the length after truncation but before padding else: # Store a version of the enc_input where in-article OOVs are represented by their temporary OOV id; also store the in-article OOVs words themselves article_str = util.to_unicode(article) raw_article_sents = nltk.tokenize.sent_tokenize(article_str) self.tokenized_sents = [util.process_sent(sent) for sent in raw_article_sents] # Process the article article_words = util.flatten_list_of_lists(self.tokenized_sents) if len(article_words) > hps.max_enc_steps: article_words = article_words[:hps.max_enc_steps] self.enc_input = [vocab.word2id(w) for w in article_words] # list of word ids; OOVs are represented by the id for UNK token if len(all_abstract_sentences) == 1: doc_indices = [0] * len(article_words) self.word_ids_sents, self.article_oovs = data.tokenizedarticle2ids(self.tokenized_sents, vocab) self.enc_input_extend_vocab = util.flatten_list_of_lists(self.word_ids_sents) # self.enc_input_extend_vocab, self.article_oovs = data.article2ids(article_words, vocab) if len(self.enc_input_extend_vocab) > hps.max_enc_steps: self.enc_input_extend_vocab = self.enc_input_extend_vocab[:hps.max_enc_steps] self.enc_len = len(self.enc_input_extend_vocab) # store the length after truncation but before padding # Get a verison of the reference summary where in-article OOVs are represented by their temporary article OOV id abs_ids_extend_vocab = data.abstract2ids(abstract_words, vocab, self.article_oovs) # Overwrite decoder target sequence so it uses the temp article OOV ids _, self.target = self.get_dec_inp_targ_seqs(abs_ids_extend_vocab, hps.max_dec_steps, start_decoding, stop_decoding) # Store the original strings self.original_article = article self.raw_article_sents = raw_article_sents self.original_abstract = abstract self.original_abstract_sents = abstract_sentences self.all_original_abstract_sents = all_abstract_sentences self.doc_indices = doc_indices self.ssi = ssi
def fill_example_queue(self): """Reads data from file and processes into Examples which are then placed into the example queue.""" if self._example_generator is None: input_gen = self.text_generator( data.example_generator(self._data_path, self._single_pass, self._cnn_500_dm_500, is_original=False)) else: input_gen = self.text_generator(self._example_generator) counter = 0 while True: try: (article, abstracts, doc_indices_str, raw_article_sents, ssi) = next(input_gen) # read the next example from file. article and abstract are both strings. except StopIteration: # if there are no more examples: logging.info("The example generator for this example queue filling thread has exhausted data.") if self._single_pass: logging.info( "single_pass mode is on, so we've finished reading dataset. This thread is stopping.") self._finished_reading = True break else: raise Exception("single_pass mode is off but the example generator is out of data; error.") article = article abstracts = [abstract for abstract in abstracts] if type(doc_indices_str) != str: doc_indices_str = doc_indices_str raw_article_sents = [sent for sent in raw_article_sents] all_abstract_sentences = [[sent.strip() for sent in data.abstract2sents( abstract)] for abstract in abstracts] if len(all_abstract_sentences) != 0: abstract_sentences = all_abstract_sentences[0] else: abstract_sentences = [] doc_indices = [int(idx) for idx in doc_indices_str.strip().split()] if self._hps.by_instance: # if we are running iteratively on only instances (a singleton/pair + a summary sentence), not the whole article for abs_idx, abstract_sentence in enumerate(abstract_sentences): inst_ssi = ssi[abs_idx] if len(inst_ssi) == 0: continue inst_abstract_sentences = abstract_sentence inst_raw_article_sents = util.reorder(raw_article_sents, inst_ssi) inst_article = ' '.join([' '.join(util.process_sent(sent, whitespace=True)) for sent in inst_raw_article_sents]) inst_doc_indices = [0] * len(inst_article.split()) if len(inst_article) == 0: # See https://github.com/abisee/pointer-generator/issues/1 logging.warning( 'Found an example with empty article text. Skipping it.\n*********************************************') elif len(inst_article.strip().split()) < 3 and self._hps.skip_with_less_than_3: print( 'Article has less than 3 tokens, so skipping\n*********************************************') elif len(inst_abstract_sentences.strip().split()) < 3 and self._hps.skip_with_less_than_3: print( 'Abstract has less than 3 tokens, so skipping\n*********************************************') else: inst_example = Example(inst_article, [inst_abstract_sentences], all_abstract_sentences, inst_doc_indices, inst_raw_article_sents, None, self._vocab, self._hps) self._example_queue.put(inst_example) else: example = Example(article, abstract_sentences, all_abstract_sentences, doc_indices, raw_article_sents, ssi, self._vocab, self._hps) # Process into an Example. self._example_queue.put(example) # place the Example in the example queue. # print "example num", counter counter += 1
def decode_iteratively(self, example_generator, total, names_to_types, ssi_list, hps): attn_vis_idx = 0 for example_idx, example in enumerate( tqdm(example_generator, total=total)): raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs, groundtruth_article_lcs_paths_list = util.unpack_tf_example( example, names_to_types) article_sent_tokens = [ util.process_sent(sent) for sent in raw_article_sents ] groundtruth_summ_sents = [[ sent.strip() for sent in groundtruth_summary_text.strip().split('\n') ]] groundtruth_summ_sent_tokens = [ sent.split(' ') for sent in groundtruth_summ_sents[0] ] if ssi_list is None: # this is if we are doing the upper bound evaluation (ssi_list comes straight from the groundtruth) sys_ssi = groundtruth_similar_source_indices_list sys_alp_list = groundtruth_article_lcs_paths_list if FLAGS.singles_and_pairs == 'singles': sys_ssi = util.enforce_sentence_limit(sys_ssi, 1) sys_alp_list = util.enforce_sentence_limit(sys_alp_list, 1) elif FLAGS.singles_and_pairs == 'both': sys_ssi = util.enforce_sentence_limit(sys_ssi, 2) sys_alp_list = util.enforce_sentence_limit(sys_alp_list, 2) sys_ssi, sys_alp_list = util.replace_empty_ssis( sys_ssi, raw_article_sents, sys_alp_list=sys_alp_list) else: gt_ssi, sys_ssi, ext_len, sys_token_probs_list = ssi_list[ example_idx] sys_alp_list = ssi_functions.list_labels_from_probs( sys_token_probs_list, FLAGS.tag_threshold) if FLAGS.singles_and_pairs == 'singles': sys_ssi = util.enforce_sentence_limit(sys_ssi, 1) sys_alp_list = util.enforce_sentence_limit(sys_alp_list, 1) groundtruth_similar_source_indices_list = util.enforce_sentence_limit( groundtruth_similar_source_indices_list, 1) gt_ssi = util.enforce_sentence_limit(gt_ssi, 1) elif FLAGS.singles_and_pairs == 'both': sys_ssi = util.enforce_sentence_limit(sys_ssi, 2) sys_alp_list = util.enforce_sentence_limit(sys_alp_list, 2) groundtruth_similar_source_indices_list = util.enforce_sentence_limit( groundtruth_similar_source_indices_list, 2) gt_ssi = util.enforce_sentence_limit(gt_ssi, 2) # if gt_ssi != groundtruth_similar_source_indices_list: # raise Exception('Example %d has different groundtruth source indices: ' + str(groundtruth_similar_source_indices_list) + ' || ' + str(gt_ssi)) if FLAGS.dataset_name == 'xsum': sys_ssi = [sys_ssi[0]] final_decoded_words = [] final_decoded_outpus = '' best_hyps = [] highlight_html_total = '<u>System Summary</u><br><br>' for ssi_idx, ssi in enumerate(sys_ssi): # selected_article_lcs_paths = None selected_article_lcs_paths = sys_alp_list[ssi_idx] ssi, selected_article_lcs_paths = util.make_ssi_chronological( ssi, selected_article_lcs_paths) selected_article_lcs_paths = [selected_article_lcs_paths] selected_raw_article_sents = util.reorder( raw_article_sents, ssi) selected_article_text = ' '.join([ ' '.join(sent) for sent in util.reorder(article_sent_tokens, ssi) ]) selected_doc_indices_str = '0 ' * len( selected_article_text.split()) if FLAGS.upper_bound: selected_groundtruth_summ_sent = [[ groundtruth_summ_sents[0][ssi_idx] ]] else: selected_groundtruth_summ_sent = groundtruth_summ_sents batch = create_batch(selected_article_text, selected_groundtruth_summ_sent, selected_doc_indices_str, selected_raw_article_sents, selected_article_lcs_paths, FLAGS.batch_size, hps, self._vocab) original_article = batch.original_articles[0] # string original_abstract = batch.original_abstracts[0] # string article_withunks = data.show_art_oovs(original_article, self._vocab) # string abstract_withunks = data.show_abs_oovs( original_abstract, self._vocab, (batch.art_oovs[0] if FLAGS.pointer_gen else None)) # string # article_withunks = data.show_art_oovs(original_article, self._vocab) # string # abstract_withunks = data.show_abs_oovs(original_abstract, self._vocab, (batch.art_oovs[0] if FLAGS.pointer_gen else None)) # string if FLAGS.first_intact and ssi_idx == 0: decoded_words = selected_article_text.strip().split() decoded_output = selected_article_text else: decoded_words, decoded_output, best_hyp = decode_example( self._sess, self._model, self._vocab, batch, example_idx, hps) best_hyps.append(best_hyp) final_decoded_words.extend(decoded_words) final_decoded_outpus += decoded_output if example_idx < 100 or (example_idx >= 2000 and example_idx < 2100): min_matched_tokens = 2 selected_article_sent_tokens = [ util.process_sent(sent) for sent in selected_raw_article_sents ] highlight_summary_sent_tokens = [decoded_words] highlight_ssi_list, lcs_paths_list, highlight_article_lcs_paths_list, highlight_smooth_article_lcs_paths_list = ssi_functions.get_simple_source_indices_list( highlight_summary_sent_tokens, selected_article_sent_tokens, None, 2, min_matched_tokens) highlighted_html = ssi_functions.html_highlight_sents_in_article( highlight_summary_sent_tokens, highlight_ssi_list, selected_article_sent_tokens, lcs_paths_list=lcs_paths_list, article_lcs_paths_list= highlight_smooth_article_lcs_paths_list) highlight_html_total += highlighted_html + '<br>' if FLAGS.attn_vis and example_idx < 200: self.write_for_attnvis( article_withunks, abstract_withunks, decoded_words, best_hyp.attn_dists, best_hyp.p_gens, attn_vis_idx ) # write info to .json file for visualization tool attn_vis_idx += 1 if len(final_decoded_words) >= 100: break gt_ssi_list, gt_alp_list = util.replace_empty_ssis( groundtruth_similar_source_indices_list, raw_article_sents, sys_alp_list=groundtruth_article_lcs_paths_list) highlight_html_gt = '<u>Reference Summary</u><br><br>' for ssi_idx, ssi in enumerate(gt_ssi_list): selected_article_lcs_paths = gt_alp_list[ssi_idx] try: ssi, selected_article_lcs_paths = util.make_ssi_chronological( ssi, selected_article_lcs_paths) except: util.print_vars(ssi, example_idx, selected_article_lcs_paths) raise selected_raw_article_sents = util.reorder( raw_article_sents, ssi) if example_idx < 100 or (example_idx >= 2000 and example_idx < 2100): min_matched_tokens = 2 selected_article_sent_tokens = [ util.process_sent(sent) for sent in selected_raw_article_sents ] highlight_summary_sent_tokens = [ groundtruth_summ_sent_tokens[ssi_idx] ] highlight_ssi_list, lcs_paths_list, highlight_article_lcs_paths_list, highlight_smooth_article_lcs_paths_list = ssi_functions.get_simple_source_indices_list( highlight_summary_sent_tokens, selected_article_sent_tokens, None, 2, min_matched_tokens) highlighted_html = ssi_functions.html_highlight_sents_in_article( highlight_summary_sent_tokens, highlight_ssi_list, selected_article_sent_tokens, lcs_paths_list=lcs_paths_list, article_lcs_paths_list= highlight_smooth_article_lcs_paths_list) highlight_html_gt += highlighted_html + '<br>' if example_idx < 100 or (example_idx >= 2000 and example_idx < 2100): self.write_for_human(raw_article_sents, groundtruth_summ_sents, final_decoded_words, example_idx) highlight_html_total = ssi_functions.put_html_in_two_columns( highlight_html_total, highlight_html_gt) ssi_functions.write_highlighted_html(highlight_html_total, self._highlight_dir, example_idx) # if example_idx % 100 == 0: # attn_dir = os.path.join(self._decode_dir, 'attn_vis_data') # attn_selections.process_attn_selections(attn_dir, self._decode_dir, self._vocab) rouge_functions.write_for_rouge( groundtruth_summ_sents, None, example_idx, self._rouge_ref_dir, self._rouge_dec_dir, decoded_words=final_decoded_words, log=False ) # write ref summary and decoded summary to file, to eval with pyrouge later # if FLAGS.attn_vis: # self.write_for_attnvis(article_withunks, abstract_withunks, decoded_words, best_hyp.attn_dists, best_hyp.p_gens, example_idx) # write info to .json file for visualization tool example_idx += 1 # this is how many examples we've decoded logging.info("Decoder has finished reading dataset for single_pass.") logging.info("Output has been saved in %s and %s.", self._rouge_ref_dir, self._rouge_dec_dir) if len(os.listdir(self._rouge_ref_dir)) != 0: if FLAGS.dataset_name == 'xsum': l_param = 100 else: l_param = 100 logging.info("Now starting ROUGE eval...") results_dict = rouge_functions.rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir, l_param=l_param) rouge_functions.rouge_log(results_dict, self._decode_dir)
def main(unused_argv): if len(unused_argv) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) if FLAGS.singles_and_pairs == 'both': FLAGS.exp_name = FLAGS.exp_name + '_both' exp_name = _exp_name + '_both' dataset_articles = _dataset_articles else: FLAGS.exp_name = FLAGS.exp_name + '_singles' exp_name = _exp_name + '_singles' dataset_articles = _dataset_articles + '_singles' my_log_dir = os.path.join(log_dir, FLAGS.ssi_exp_name) print('Running statistics on %s' % FLAGS.exp_name) if FLAGS.dataset_name != "": FLAGS.data_path = os.path.join(FLAGS.data_root, FLAGS.dataset_name, FLAGS.dataset_split + '*') if not os.path.exists(os.path.join(FLAGS.data_root, FLAGS.dataset_name)) or len(os.listdir(os.path.join(FLAGS.data_root, FLAGS.dataset_name))) == 0: print(('No TF example data found at %s so creating it from raw data.' % os.path.join(FLAGS.data_root, FLAGS.dataset_name))) convert_data.process_dataset(FLAGS.dataset_name) logging.set_verbosity(logging.INFO) # choose what level of logging you want logging.info('Starting seq2seq_attention in %s mode...', (FLAGS.mode)) # Change log_root to FLAGS.log_root/FLAGS.exp_name and create the dir if necessary FLAGS.exp_name = FLAGS.exp_name if FLAGS.exp_name != '' else FLAGS.dataset_name FLAGS.actual_log_root = FLAGS.log_root FLAGS.log_root = os.path.join(FLAGS.log_root, FLAGS.exp_name) vocab = Vocab(FLAGS.vocab_path, FLAGS.vocab_size) # create a vocabulary # If in decode mode, set batch_size = beam_size # Reason: in decode mode, we decode one example at a time. # On each step, we have beam_size-many hypotheses in the beam, so we need to make a batch of these hypotheses. if FLAGS.mode == 'decode': FLAGS.batch_size = FLAGS.beam_size # If single_pass=True, check we're in decode mode if FLAGS.single_pass and FLAGS.mode!='decode': raise Exception("The single_pass flag should only be True in decode mode") # Make a namedtuple hps, containing the values of the hyperparameters that the model needs hparam_list = ['mode', 'lr', 'adagrad_init_acc', 'rand_unif_init_mag', 'trunc_norm_init_std', 'max_grad_norm', 'hidden_dim', 'emb_dim', 'batch_size', 'max_dec_steps', 'max_enc_steps', 'coverage', 'cov_loss_wt', 'pointer_gen', 'lambdamart_input'] hps_dict = {} for key,val in FLAGS.__flags.items(): # for each flag if key in hparam_list: # if it's in the list hps_dict[key] = val.value # add it to the dict hps = namedtuple("HParams", list(hps_dict.keys()))(**hps_dict) tf.set_random_seed(113) # a seed value for randomness decode_model_hps = hps._replace( max_dec_steps=1) # The model is configured with max_dec_steps=1 because we only ever run one step of the decoder at a time (to do beam search). Note that the batcher is initialized with max_dec_steps equal to e.g. 100 because the batches need to contain the full summaries if len(unused_argv) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) start_time = time.time() np.random.seed(random_seed) source_dir = os.path.join(data_dir, dataset_articles) source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*')) with open(os.path.join(my_log_dir, 'ssi.pkl')) as f: ssi_list = pickle.load(f) total = len(source_files) * 1000 if 'cnn' or 'newsroom' in dataset_articles else len(source_files) example_generator = data.example_generator(source_dir + '/' + dataset_split + '*', True, False, should_check_valid=False) for example_idx, example in enumerate(tqdm(example_generator, total=total)): raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs = util.unpack_tf_example( example, names_to_types) article_sent_tokens = [util.process_sent(sent) for sent in raw_article_sents] groundtruth_summ_sents = [[sent.strip() for sent in groundtruth_summary_text.strip().split('\n')]] gt_ssi, sys_ssi, ext_len = ssi_list[example_idx] if gt_ssi != groundtruth_similar_source_indices_list: raise Exception('Example %d has different groundtruth source indices: '%example_idx + str( groundtruth_similar_source_indices_list) + ' || ' + str(gt_ssi)) if len(groundtruth_summ_sents) == len(groundtruth_similar_source_indices_list): tqdm.write('Example %d has different len groundtruth source indices from len summ sents: '%example_idx + str( groundtruth_similar_source_indices_list) + ' || ' + str(groundtruth_summ_sents)) a=0 print('done') a=0
def convert_article_to_lambdamart_features(ex): # example_idx += 1 # if num_instances != -1 and example_idx >= num_instances: # break example, example_idx, single_feat_len, pair_feat_len, singles_and_pairs, out_path = ex print(example_idx) raw_article_sents, similar_source_indices_list, summary_text, corefs, doc_indices = util.unpack_tf_example( example, names_to_types) article_sent_tokens = [ util.process_sent(sent) for sent in raw_article_sents ] if doc_indices is None: doc_indices = [0] * len( util.flatten_list_of_lists(article_sent_tokens)) doc_indices = [int(doc_idx) for doc_idx in doc_indices] if len(doc_indices) != len( util.flatten_list_of_lists(article_sent_tokens)): doc_indices = [0] * len( util.flatten_list_of_lists(article_sent_tokens)) rel_sent_indices, _, _ = util.get_rel_sent_indices(doc_indices, article_sent_tokens) if FLAGS.singles_and_pairs == 'singles': sentence_limit = 1 else: sentence_limit = 2 similar_source_indices_list = util.enforce_sentence_limit( similar_source_indices_list, sentence_limit) summ_sent_tokens = [ sent.strip().split() for sent in summary_text.strip().split('\n') ] # sent_term_matrix = util.get_tfidf_matrix(raw_article_sents) article_text = ' '.join(raw_article_sents) sent_term_matrix = util.get_doc_substituted_tfidf_matrix( tfidf_vectorizer, raw_article_sents, article_text, pca) doc_vector = np.mean(sent_term_matrix, axis=0) out_str = '' # ssi_idx_cur_inst_id = defaultdict(int) instances = [] if importance: importances = util.special_squash( util.get_tfidf_importances(tfidf_vectorizer, raw_article_sents, pca)) possible_pairs = [ x for x in list( itertools.combinations(list(range(len(raw_article_sents))), 2)) ] # all pairs if FLAGS.use_pair_criteria: possible_pairs = filter_pairs_by_criteria(raw_article_sents, possible_pairs, corefs) if FLAGS.sent_position_criteria: possible_pairs = filter_pairs_by_sent_position( possible_pairs, rel_sent_indices) possible_singles = [(i, ) for i in range(len(raw_article_sents))] possible_combinations = possible_pairs + possible_singles positives = [ssi for ssi in similar_source_indices_list] negatives = [ ssi for ssi in possible_combinations if not (ssi in positives or ssi[::-1] in positives) ] negative_pairs = [ x for x in possible_pairs if not (x in similar_source_indices_list or x[::-1] in similar_source_indices_list) ] negative_singles = [ x for x in possible_singles if not (x in similar_source_indices_list or x[::-1] in similar_source_indices_list) ] random_negative_pairs = np.random.permutation( len(negative_pairs)).tolist() random_negative_singles = np.random.permutation( len(negative_singles)).tolist() qid = example_idx for similar_source_indices in positives: # True sentence single/pair relevance = 1 features = get_features(similar_source_indices, sent_term_matrix, article_sent_tokens, rel_sent_indices, single_feat_len, pair_feat_len, importances, singles_and_pairs) if features is None: continue instances.append( Lambdamart_Instance(features, relevance, qid, similar_source_indices)) a = 0 if FLAGS.dataset_name == 'xsum' and FLAGS.special_xsum_balance: neg_relevance = 0 num_negative = 4 if FLAGS.singles_and_pairs == 'singles': num_neg_singles = num_negative num_neg_pairs = 0 else: num_neg_singles = num_negative / 2 num_neg_pairs = num_negative / 2 for _ in range(num_neg_singles): if len(random_negative_singles) == 0: continue negative_indices = negative_singles[ random_negative_singles.pop()] neg_features = get_features(negative_indices, sent_term_matrix, article_sent_tokens, rel_sent_indices, single_feat_len, pair_feat_len, importances, singles_and_pairs) if neg_features is None: continue instances.append( Lambdamart_Instance(neg_features, neg_relevance, qid, negative_indices)) for _ in range(num_neg_pairs): if len(random_negative_pairs) == 0: continue negative_indices = negative_pairs[ random_negative_pairs.pop()] neg_features = get_features(negative_indices, sent_term_matrix, article_sent_tokens, rel_sent_indices, single_feat_len, pair_feat_len, importances, singles_and_pairs) if neg_features is None: continue instances.append( Lambdamart_Instance(neg_features, neg_relevance, qid, negative_indices)) elif balance: # False sentence single/pair is_pair = len(similar_source_indices) == 2 if is_pair: if len(random_negative_pairs) == 0: continue negative_indices = negative_pairs[ random_negative_pairs.pop()] else: if len(random_negative_singles) == 0: continue negative_indices = negative_singles[ random_negative_singles.pop()] neg_relevance = 0 neg_features = get_features(negative_indices, sent_term_matrix, article_sent_tokens, rel_sent_indices, single_feat_len, pair_feat_len, importances, singles_and_pairs) if neg_features is None: continue instances.append( Lambdamart_Instance(neg_features, neg_relevance, qid, negative_indices)) if not balance: for negative_indices in negatives: neg_relevance = 0 neg_features = get_features(negative_indices, sent_term_matrix, article_sent_tokens, single_feat_len, pair_feat_len, importances, singles_and_pairs) if neg_features is None: continue instances.append( Lambdamart_Instance(neg_features, neg_relevance, qid, negative_indices)) sorted_instances = sorted(instances, key=lambda x: (x.qid, x.source_indices)) assign_inst_ids(sorted_instances) if FLAGS.lr: return sorted_instances else: for instance in sorted_instances: lambdamart_str = format_to_lambdamart(instance, single_feat_len) out_str += lambdamart_str + '\n' with open(os.path.join(out_path, '%06d.txt' % example_idx), 'wb') as f: f.write(out_str)
def decode_iteratively(self, example_generator, total, names_to_types, ssi_list, hps): for example_idx, example in enumerate( tqdm(example_generator, total=total)): raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text = util.unpack_tf_example( example, names_to_types) article_sent_tokens = [ util.process_sent(sent) for sent in raw_article_sents ] groundtruth_summ_sents = [[ sent.strip() for sent in groundtruth_summary_text.strip().split('\n') ]] if ssi_list is None: # this is if we are doing the upper bound evaluation (ssi_list comes straight from the groundtruth) sys_ssi = groundtruth_similar_source_indices_list if FLAGS.singles_and_pairs == 'singles': sys_ssi = util.enforce_sentence_limit(sys_ssi, 1) elif FLAGS.singles_and_pairs == 'both': sys_ssi = util.enforce_sentence_limit(sys_ssi, 2) sys_ssi = util.replace_empty_ssis(sys_ssi, raw_article_sents) else: gt_ssi, sys_ssi, ext_len = ssi_list[example_idx] if FLAGS.singles_and_pairs == 'singles': sys_ssi = util.enforce_sentence_limit(sys_ssi, 1) groundtruth_similar_source_indices_list = util.enforce_sentence_limit( groundtruth_similar_source_indices_list, 1) gt_ssi = util.enforce_sentence_limit(gt_ssi, 1) elif FLAGS.singles_and_pairs == 'both': sys_ssi = util.enforce_sentence_limit(sys_ssi, 2) groundtruth_similar_source_indices_list = util.enforce_sentence_limit( groundtruth_similar_source_indices_list, 2) gt_ssi = util.enforce_sentence_limit(gt_ssi, 2) if gt_ssi != groundtruth_similar_source_indices_list: print( 'Warning: Example %d has different groundtruth source indices: ' + str(groundtruth_similar_source_indices_list) + ' || ' + str(gt_ssi)) if FLAGS.dataset_name == 'xsum': sys_ssi = [sys_ssi[0]] final_decoded_words = [] final_decoded_outpus = '' best_hyps = [] highlight_html_total = '' for ssi_idx, ssi in enumerate(sys_ssi): selected_raw_article_sents = util.reorder( raw_article_sents, ssi) selected_article_text = ' '.join([ ' '.join(sent) for sent in util.reorder(article_sent_tokens, ssi) ]) selected_doc_indices_str = '0 ' * len( selected_article_text.split()) if FLAGS.upper_bound: selected_groundtruth_summ_sent = [[ groundtruth_summ_sents[0][ssi_idx] ]] else: selected_groundtruth_summ_sent = groundtruth_summ_sents batch = create_batch(selected_article_text, selected_groundtruth_summ_sent, selected_doc_indices_str, selected_raw_article_sents, FLAGS.batch_size, hps, self._vocab) decoded_words, decoded_output, best_hyp = decode_example( self._sess, self._model, self._vocab, batch, example_idx, hps) best_hyps.append(best_hyp) final_decoded_words.extend(decoded_words) final_decoded_outpus += decoded_output if example_idx < 1000: min_matched_tokens = 2 selected_article_sent_tokens = [ util.process_sent(sent) for sent in selected_raw_article_sents ] highlight_summary_sent_tokens = [decoded_words] highlight_ssi_list, lcs_paths_list, highlight_smooth_article_lcs_paths_list = ssi_functions.get_simple_source_indices_list( highlight_summary_sent_tokens, selected_article_sent_tokens, None, 2, min_matched_tokens) highlighted_html = ssi_functions.html_highlight_sents_in_article( highlight_summary_sent_tokens, highlight_ssi_list, selected_article_sent_tokens, lcs_paths_list=lcs_paths_list, article_lcs_paths_list= highlight_smooth_article_lcs_paths_list) highlight_html_total += '<u>System Summary</u><br><br>' + highlighted_html + '<br><br>' if len(final_decoded_words) >= 100: break if example_idx < 1000: self.write_for_human(raw_article_sents, groundtruth_summ_sents, final_decoded_words, example_idx) ssi_functions.write_highlighted_html(highlight_html_total, self._highlight_dir, example_idx) rouge_functions.write_for_rouge( groundtruth_summ_sents, None, example_idx, self._rouge_ref_dir, self._rouge_dec_dir, decoded_words=final_decoded_words, log=False ) # write ref summary and decoded summary to file, to eval with pyrouge later example_idx += 1 # this is how many examples we've decoded logging.info("Decoder has finished reading dataset for single_pass.") logging.info("Output has been saved in %s and %s.", self._rouge_ref_dir, self._rouge_dec_dir) if len(os.listdir(self._rouge_ref_dir)) != 0: l_param = 100 logging.info("Now starting ROUGE eval...") results_dict = rouge_functions.rouge_eval(self._rouge_ref_dir, self._rouge_dec_dir, l_param=l_param) rouge_functions.rouge_log(results_dict, self._decode_dir)
def main(unused_argv): print('Running statistics on %s' % FLAGS.dataset_name) if len(unused_argv ) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) if FLAGS.singles_and_pairs == 'singles': FLAGS.sentence_limit = 1 else: FLAGS.sentence_limit = 2 if FLAGS.dataset_name == 'all': dataset_names = ['cnn_dm', 'xsum', 'duc_2004'] else: dataset_names = [FLAGS.dataset_name] for dataset_name in dataset_names: FLAGS.dataset_name = dataset_name source_dir = os.path.join(data_dir, dataset_name) if FLAGS.dataset_split == 'all': if dataset_name == 'duc_2004': dataset_splits = ['test'] else: # dataset_splits = ['val_test', 'test', 'val', 'train'] dataset_splits = ['test', 'val', 'train'] else: dataset_splits = [FLAGS.dataset_split] for dataset_split in dataset_splits: if dataset_split == 'val_test': source_dataset_split = 'val' else: source_dataset_split = dataset_split source_files = sorted( glob.glob(source_dir + '/' + source_dataset_split + '*')) total = len(source_files) * 1000 example_generator = data.example_generator( source_dir + '/' + source_dataset_split + '*', True, False, should_check_valid=False) out_dir = os.path.join('data', 'bert', dataset_name, FLAGS.singles_and_pairs, 'input') util.create_dirs(out_dir) writer = open(os.path.join(out_dir, dataset_split) + '.tsv', 'wb') header_list = [ 'should_merge', 'sent1', 'sent2', 'example_idx', 'inst_id', 'ssi' ] writer.write(('\t'.join(header_list) + '\n').encode()) inst_id = 0 for example_idx, example in enumerate( tqdm(example_generator, total=total)): raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, doc_indices = util.unpack_tf_example( example, names_to_types) article_sent_tokens = [ util.process_sent(sent, whitespace=True) for sent in raw_article_sents ] groundtruth_summ_sents = [[ sent.strip() for sent in groundtruth_summary_text.strip().split('\n') ]] if dataset_name != 'duc_2004' or doc_indices is None or ( dataset_name != 'duc_2004' and len(doc_indices) != len( util.flatten_list_of_lists(article_sent_tokens))): doc_indices = [0] * len( util.flatten_list_of_lists(article_sent_tokens)) doc_indices = [int(doc_idx) for doc_idx in doc_indices] rel_sent_indices, _, _ = ssi_functions.get_rel_sent_indices( doc_indices, article_sent_tokens) similar_source_indices_list = util.enforce_sentence_limit( groundtruth_similar_source_indices_list, FLAGS.sentence_limit) possible_pairs = [ x for x in list( itertools.combinations( list(range(len(raw_article_sents))), 2)) ] # all pairs possible_pairs = filter_pairs_by_sent_position( possible_pairs, rel_sent_indices=rel_sent_indices) possible_singles = [(i, ) for i in range(len(raw_article_sents))] positives = [ssi for ssi in similar_source_indices_list] if dataset_split == 'test' or dataset_split == 'val_test': if FLAGS.singles_and_pairs == 'singles': possible_combinations = possible_singles else: possible_combinations = possible_pairs + possible_singles negatives = [ ssi for ssi in possible_combinations if not (ssi in positives or ssi[::-1] in positives) ] for ssi_idx, ssi in enumerate(positives): if len(ssi) == 0: continue if chronological_ssi and len(ssi) >= 2: if ssi[0] > ssi[1]: ssi = (min(ssi), max(ssi)) writer.write( get_string_bert_example(raw_article_sents, ssi, 1, example_idx, inst_id).encode()) inst_id += 1 for ssi in negatives: writer.write( get_string_bert_example(raw_article_sents, ssi, 0, example_idx, inst_id).encode()) inst_id += 1 else: positive_sents = list( set(util.flatten_list_of_lists(positives))) negative_pairs = [ pair for pair in possible_pairs if not any(i in positive_sents for i in pair) ] negative_singles = [ sing for sing in possible_singles if not sing[0] in positive_sents ] random_negative_pairs = np.random.permutation( len(negative_pairs)).tolist() random_negative_singles = np.random.permutation( len(negative_singles)).tolist() for ssi in similar_source_indices_list: if len(ssi) == 0: continue if chronological_ssi and len(ssi) >= 2: if ssi[0] > ssi[1]: ssi = (min(ssi), max(ssi)) is_pair = len(ssi) == 2 writer.write( get_string_bert_example(raw_article_sents, ssi, 1, example_idx, inst_id).encode()) inst_id += 1 # False sentence single/pair if is_pair: if len(random_negative_pairs) == 0: continue negative_indices = negative_pairs[ random_negative_pairs.pop()] else: if len(random_negative_singles) == 0: continue negative_indices = negative_singles[ random_negative_singles.pop()] article_lcs_paths = None writer.write( get_string_bert_example(raw_article_sents, negative_indices, 0, example_idx, inst_id).encode()) inst_id += 1
def convert_article_to_lambdamart_features(ex): # example_idx += 1 # if num_instances != -1 and example_idx >= num_instances: # break example, example_idx, single_feat_len, pair_feat_len = ex print(example_idx) raw_article_sents, similar_source_indices_list, summary_text = util.unpack_tf_example(example, names_to_types) article_sent_tokens = [util.process_sent(sent) for sent in raw_article_sents] summ_sent_tokens = [sent.strip().split() for sent in summary_text.strip().split('\n')] # sent_term_matrix = util.get_tfidf_matrix(raw_article_sents) article_text = ' '.join(raw_article_sents) sent_term_matrix = util.get_doc_substituted_tfidf_matrix(tfidf_vectorizer, raw_article_sents, article_text) doc_vector = np.mean(sent_term_matrix, axis=0) out_str = '' # ssi_idx_cur_inst_id = defaultdict(int) instances = [] if importance: importances = util.special_squash(util.get_tfidf_importances(tfidf_vectorizer, raw_article_sents)) possible_pairs = [list(x) for x in list(itertools.combinations(list(range(len(raw_article_sents))), 2))] # all pairs possible_singles = [[i] for i in range(len(raw_article_sents))] possible_combinations = possible_pairs + possible_singles positives = [ssi for ssi in similar_source_indices_list] negatives = [ssi for ssi in possible_combinations if not (ssi in positives or ssi[::-1] in positives)] negative_pairs = [x for x in possible_pairs if not (x in similar_source_indices_list or x[::-1] in similar_source_indices_list)] negative_singles = [x for x in possible_singles if not (x in similar_source_indices_list or x[::-1] in similar_source_indices_list)] random_negative_pairs = np.random.permutation(len(negative_pairs)).tolist() random_negative_singles = np.random.permutation(len(negative_singles)).tolist() qid = example_idx for similar_source_indices in positives: # True sentence single/pair relevance = 1 features = get_features(similar_source_indices, sent_term_matrix, article_sent_tokens, single_feat_len, pair_feat_len, importances) if features is None: continue instances.append(Lambdamart_Instance(features, relevance, qid, similar_source_indices)) a=0 if balance: # False sentence single/pair is_pair = len(similar_source_indices) == 2 if is_pair: if len(random_negative_pairs) == 0: continue negative_indices = negative_pairs[random_negative_pairs.pop()] else: if len(random_negative_singles) == 0: continue negative_indices = negative_singles[random_negative_singles.pop()] neg_relevance = 0 neg_features = get_features(negative_indices, sent_term_matrix, article_sent_tokens, single_feat_len, pair_feat_len, importances) if neg_features is None: continue instances.append(Lambdamart_Instance(neg_features, neg_relevance, qid, negative_indices)) if not balance: for negative_indices in negatives: neg_relevance = 0 neg_features = get_features(negative_indices, sent_term_matrix, article_sent_tokens, single_feat_len, pair_feat_len, importances) if neg_features is None: continue instances.append(Lambdamart_Instance(neg_features, neg_relevance, qid, negative_indices)) else: mmr_all = util.calc_MMR_all(raw_article_sents, article_sent_tokens, summ_sent_tokens, None) # the size is (# of summary sents, # of article sents) possible_pairs = [list(x) for x in list(itertools.combinations(list(range(len(raw_article_sents))), 2))] # all pairs possible_singles = [[i] for i in range(len(raw_article_sents))] # negative_pairs = [x for x in possible_pairs if not (x in similar_source_indices_list or x[::-1] in similar_source_indices_list)] # negative_singles = [x for x in possible_singles if not (x in similar_source_indices_list or x[::-1] in similar_source_indices_list)] # # random_negative_pairs = np.random.permutation(len(negative_pairs)).tolist() # random_negative_singles = np.random.permutation(len(negative_singles)).tolist() all_combinations = list(itertools.product(possible_pairs + possible_singles, list(range(len(summ_sent_tokens))))) positives = [(similar_source_indices, summ_sent_idx) for summ_sent_idx, similar_source_indices in enumerate(similar_source_indices_list)] negatives = [(ssi, ssi_idx) for ssi, ssi_idx in all_combinations if not ((ssi, ssi_idx) in positives or (ssi[::-1], ssi_idx) in positives)] for similar_source_indices, ssi_idx in positives: # True sentence single/pair relevance = 1 qid = example_idx * 10 + ssi_idx features = get_features(similar_source_indices, sent_term_matrix, article_sent_tokens, single_feat_len, pair_feat_len, mmr_all[ssi_idx]) if features is None: continue # inst_id = ssi_idx_cur_inst_id[ssi_idx] instances.append(Lambdamart_Instance(features, relevance, qid, similar_source_indices)) # ssi_idx_cur_inst_id[ssi_idx] += 1 a=0 if balance: # False sentence single/pair is_pair = len(similar_source_indices) == 2 if is_pair: if len(random_negative_pairs) == 0: continue negative_indices = possible_pairs[random_negative_pairs.pop()] else: if len(random_negative_singles) == 0: continue negative_indices = possible_singles[random_negative_singles.pop()] neg_relevance = 0 neg_features = get_features(negative_indices, sent_term_matrix, article_sent_tokens, single_feat_len, pair_feat_len) if neg_features is None: continue neg_lambdamart_str = format_to_lambdamart([neg_features, neg_relevance, qid, negative_indices]) out_str += neg_lambdamart_str + '\n' if not balance: for negative_indices, ssi_idx in negatives: neg_relevance = 0 qid = example_idx * 10 + ssi_idx neg_features = get_features(negative_indices, sent_term_matrix, article_sent_tokens, single_feat_len, pair_feat_len, mmr_all[ssi_idx]) if neg_features is None: continue # inst_id = ssi_idx_cur_inst_id[ssi_idx] instances.append(Lambdamart_Instance(neg_features, neg_relevance, qid, negative_indices)) # ssi_idx_cur_inst_id[ssi_idx] += 1 sorted_instances = sorted(instances, key=lambda x: (x.qid, x.source_indices)) assign_inst_ids(sorted_instances) if lr: return sorted_instances else: for instance in sorted_instances: lambdamart_str = format_to_lambdamart(instance, single_feat_len) out_str += lambdamart_str + '\n' # print out_str return out_str
def main(unused_argv): print('Running statistics on %s' % FLAGS.dataset_name) if len(unused_argv ) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) out_dir = os.path.join( os.path.expanduser('~') + '/data/kaiqiang_data', FLAGS.dataset_name) if FLAGS.mode == 'write': util.create_dirs(out_dir) if FLAGS.dataset_name == 'duc_2004': dataset_splits = ['test'] elif FLAGS.dataset_split == 'all': dataset_splits = ['test', 'val', 'train'] else: dataset_splits = [FLAGS.dataset_split] for dataset_split in dataset_splits: if dataset_split == 'test': ssi_data_path = os.path.join( 'logs/%s_bert_both_sentemb_artemb_plushidden' % FLAGS.dataset_name, 'ssi.pkl') print(util.bcolors.OKGREEN + "Loading SSI from BERT at %s" % ssi_data_path + util.bcolors.ENDC) with open(ssi_data_path) as f: ssi_triple_list = pickle.load(f) source_dir = os.path.join(data_dir, FLAGS.dataset_name) source_files = sorted( glob.glob(source_dir + '/' + dataset_split + '*')) total = len(source_files) * 1000 if ( 'cnn' in FLAGS.dataset_name or 'newsroom' in FLAGS.dataset_name or 'xsum' in FLAGS.dataset_name) else len(source_files) example_generator = data.example_generator( source_dir + '/' + dataset_split + '*', True, False, should_check_valid=False) out_document_path = os.path.join(out_dir, dataset_split + '.Ndocument') out_summary_path = os.path.join(out_dir, dataset_split + '.Nsummary') out_example_idx_path = os.path.join(out_dir, dataset_split + '.Nexampleidx') doc_writer = open(out_document_path, 'w') if dataset_split != 'test': sum_writer = open(out_summary_path, 'w') ex_idx_writer = open(out_example_idx_path, 'w') for example_idx, example in enumerate( tqdm(example_generator, total=total)): if FLAGS.num_instances != -1 and example_idx >= FLAGS.num_instances: break raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, doc_indices = util.unpack_tf_example( example, names_to_types) article_sent_tokens = [ util.process_sent(sent) for sent in raw_article_sents ] if FLAGS.dataset_name == 'duc_2004': groundtruth_summ_sents = [[ sent.strip() for sent in gt_summ_text.strip().split('\n') ] for gt_summ_text in groundtruth_summary_text] else: groundtruth_summ_sents = [[ sent.strip() for sent in groundtruth_summary_text.strip().split('\n') ]] if doc_indices is None: doc_indices = [0] * len( util.flatten_list_of_lists(article_sent_tokens)) doc_indices = [int(doc_idx) for doc_idx in doc_indices] # rel_sent_indices, _, _ = preprocess_for_lambdamart_no_flags.get_rel_sent_indices(doc_indices, article_sent_tokens) if dataset_split == 'test': if example_idx >= len(ssi_triple_list): raise Exception( 'Len of ssi list (%d) is less than number of examples (>=%d)' % (len(ssi_triple_list), example_idx)) ssi_length_extractive = ssi_triple_list[example_idx][2] if ssi_length_extractive > 1: a = 0 ssi = ssi_triple_list[example_idx][1] ssi = ssi[:ssi_length_extractive] groundtruth_similar_source_indices_list = ssi else: groundtruth_similar_source_indices_list = util.enforce_sentence_limit( groundtruth_similar_source_indices_list, FLAGS.sentence_limit) for ssi_idx, ssi in enumerate( groundtruth_similar_source_indices_list): if len(ssi) == 0: continue my_article = ' '.join(util.reorder(raw_article_sents, ssi)) doc_writer.write(my_article + '\n') if dataset_split != 'test': sum_writer.write(groundtruth_summ_sents[0][ssi_idx] + '\n') ex_idx_writer.write(str(example_idx) + '\n') elif FLAGS.mode == 'evaluate': summary_dir = '/home/logan/data/kaiqiang_data/logan_ACL/trained_on_' + FLAGS.train_dataset + '/' + FLAGS.dataset_name out_summary_path = os.path.join(summary_dir, 'test' + 'Summary.txt') out_example_idx_path = os.path.join(out_dir, 'test' + '.Nexampleidx') decode_dir = 'logs/kaiqiang_%s_trainedon%s' % (FLAGS.dataset_name, FLAGS.train_dataset) rouge_ref_dir = os.path.join(decode_dir, 'reference') rouge_dec_dir = os.path.join(decode_dir, 'decoded') util.create_dirs(rouge_ref_dir) util.create_dirs(rouge_dec_dir) def num_lines_in_file(file_path): with open(file_path) as f: num_lines = sum(1 for line in f) return num_lines def process_example(sents, ex_idx, groundtruth_summ_sents): final_decoded_words = [] for sent in sents: final_decoded_words.extend(sent.split(' ')) rouge_functions.write_for_rouge(groundtruth_summ_sents, None, ex_idx, rouge_ref_dir, rouge_dec_dir, decoded_words=final_decoded_words, log=False) num_lines_summary = num_lines_in_file(out_summary_path) num_lines_example_indices = num_lines_in_file(out_example_idx_path) if num_lines_summary != num_lines_example_indices: raise Exception( 'Num lines summary != num lines example indices: (%d, %d)' % (num_lines_summary, num_lines_example_indices)) source_dir = os.path.join(data_dir, FLAGS.dataset_name) example_generator = data.example_generator(source_dir + '/' + 'test' + '*', True, False, should_check_valid=False) sum_writer = open(out_summary_path) ex_idx_writer = open(out_example_idx_path) prev_ex_idx = 0 sents = [] for line_idx in tqdm(range(num_lines_summary)): line = sum_writer.readline() ex_idx = int(ex_idx_writer.readline()) if ex_idx == prev_ex_idx: sents.append(line) else: example = example_generator.next() raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, doc_indices = util.unpack_tf_example( example, names_to_types) if FLAGS.dataset_name == 'duc_2004': groundtruth_summ_sents = [[ sent.strip() for sent in gt_summ_text.strip().split('\n') ] for gt_summ_text in groundtruth_summary_text] else: groundtruth_summ_sents = [[ sent.strip() for sent in groundtruth_summary_text.strip().split('\n') ]] process_example(sents, ex_idx, groundtruth_summ_sents) prev_ex_idx = ex_idx sents = [line] example = example_generator.next() raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, doc_indices = util.unpack_tf_example( example, names_to_types) if FLAGS.dataset_name == 'duc_2004': groundtruth_summ_sents = [[ sent.strip() for sent in gt_summ_text.strip().split('\n') ] for gt_summ_text in groundtruth_summary_text] else: groundtruth_summ_sents = [[ sent.strip() for sent in groundtruth_summary_text.strip().split('\n') ]] process_example(sents, ex_idx, groundtruth_summ_sents) print("Now starting ROUGE eval...") if FLAGS.dataset_name == 'xsum': l_param = 100 else: l_param = 100 results_dict = rouge_functions.rouge_eval(rouge_ref_dir, rouge_dec_dir, l_param=l_param) rouge_functions.rouge_log(results_dict, decode_dir) else: raise Exception('mode flag was not evaluate or write.')
def main(unused_argv): print('Running statistics on %s' % FLAGS.exp_name) if len(unused_argv ) != 1: # prints a message if you've entered flags incorrectly raise Exception("Problem with flags: %s" % unused_argv) if FLAGS.all_actions: FLAGS.sent_dataset = True FLAGS.ssi_dataset = True FLAGS.print_output = True FLAGS.highlight = True original_dataset_name = 'xsum' if 'xsum' in FLAGS.dataset_name else 'cnn_dm' if ( 'cnn_dm' in FLAGS.dataset_name or 'duc_2004' in FLAGS.dataset_name) else '' vocab = Vocab(FLAGS.vocab_path + '_' + original_dataset_name, FLAGS.vocab_size) # create a vocabulary source_dir = os.path.join(data_dir, FLAGS.dataset_name) util.create_dirs(html_dir) if FLAGS.dataset_split == 'all': if FLAGS.dataset_name == 'duc_2004': dataset_splits = ['test'] else: dataset_splits = ['test', 'val', 'train'] else: dataset_splits = [FLAGS.dataset_split] for dataset_split in dataset_splits: source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*')) if FLAGS.exp_name == 'reference': # summary_dir = log_dir + default_exp_name + '/decode_test_' + str(max_enc_steps) + \ # 'maxenc_4beam_' + str(min_dec_steps) + 'mindec_' + str(max_dec_steps) + 'maxdec_ckpt-238410/reference' # summary_files = sorted(glob.glob(summary_dir + '/*_reference.A.txt')) summary_dir = source_dir summary_files = source_files else: if FLAGS.exp_name == 'cnn_dm': summary_dir = log_dir + FLAGS.exp_name + '/decode_test_400maxenc_4beam_35mindec_100maxdec_ckpt-238410/decoded' else: ckpt_folder = util.find_largest_ckpt_folder(log_dir + FLAGS.exp_name) summary_dir = log_dir + FLAGS.exp_name + '/' + ckpt_folder + '/decoded' # summary_dir = log_dir + FLAGS.exp_name + '/decode_test_' + str(max_enc_steps) + \ # 'maxenc_4beam_' + str(min_dec_steps) + 'mindec_' + str(max_dec_steps) + 'maxdec_ckpt-238410/decoded' summary_files = sorted(glob.glob(summary_dir + '/*')) if len(summary_files) == 0: raise Exception('No files found in %s' % summary_dir) example_generator = data.example_generator(source_dir + '/' + dataset_split + '*', True, False, is_original=True) pros = { 'annotators': 'dcoref', 'outputFormat': 'json', 'timeout': '5000000' } all_merge_examples = [] num_extracted_list = [] distances = [] relative_distances = [] html_str = '' extracted_sents_in_article_html = '' name = FLAGS.dataset_name + '_' + FLAGS.exp_name if FLAGS.coreference_replacement: name += '_coref' highlight_file_name = os.path.join( html_dir, FLAGS.dataset_name + '_' + FLAGS.exp_name) if FLAGS.consider_stopwords: highlight_file_name += '_stopwords' if FLAGS.highlight: extracted_sents_in_article_html_file = open( highlight_file_name + '_extracted_sents.html', 'wb') if FLAGS.kaiqiang: kaiqiang_article_texts = [] kaiqiang_abstract_texts = [] util.create_dirs(kaiqiang_dir) kaiqiang_article_file = open( os.path.join( kaiqiang_dir, FLAGS.dataset_name + '_' + dataset_split + '_' + str(FLAGS.min_matched_tokens) + '_articles.txt'), 'wb') kaiqiang_abstract_file = open( os.path.join( kaiqiang_dir, FLAGS.dataset_name + '_' + dataset_split + '_' + str(FLAGS.min_matched_tokens) + '_abstracts.txt'), 'wb') if FLAGS.ssi_dataset: if FLAGS.tag_tokens: with_coref_and_ssi_dir = lambdamart_dir + '_and_tag_tokens' else: with_coref_and_ssi_dir = lambdamart_dir lambdamart_out_dir = os.path.join(with_coref_and_ssi_dir, FLAGS.dataset_name) if FLAGS.sentence_limit == 1: lambdamart_out_dir += '_singles' if FLAGS.consider_stopwords: lambdamart_out_dir += '_stopwords' lambdamart_out_full_dir = os.path.join(lambdamart_out_dir, 'all') util.create_dirs(lambdamart_out_full_dir) lambdamart_writer = open( os.path.join(lambdamart_out_full_dir, dataset_split + '.bin'), 'wb') simple_similar_source_indices_list_plus_empty = [] example_idx = -1 instance_idx = 0 total = len(source_files) * 1000 if ( 'cnn' in FLAGS.dataset_name or 'newsroom' in FLAGS.dataset_name or 'xsum' in FLAGS.dataset_name) else len(source_files) random_choices = None if FLAGS.randomize: if FLAGS.dataset_name == 'cnn_dm': list_order = np.random.permutation(11490) random_choices = list_order[:FLAGS.num_instances] for example in tqdm(example_generator, total=total): example_idx += 1 if FLAGS.num_instances != -1 and instance_idx >= FLAGS.num_instances: break if random_choices is not None and example_idx not in random_choices: continue # for file_idx in tqdm(range(len(source_files))): # example = get_tf_example(source_files[file_idx]) article_text = example.features.feature[ 'article'].bytes_list.value[0].decode().lower() if FLAGS.exp_name == 'reference': summary_text, all_summary_texts = get_summary_from_example( example) else: summary_text = get_summary_text(summary_files[example_idx]) article_tokens = split_into_tokens(article_text) if 'raw_article_sents' in example.features.feature and len( example.features.feature['raw_article_sents'].bytes_list. value) > 0: raw_article_sents = example.features.feature[ 'raw_article_sents'].bytes_list.value raw_article_sents = [ sent.decode() for sent in raw_article_sents if sent.decode().strip() != '' ] article_sent_tokens = [ util.process_sent(sent, whitespace=True) for sent in raw_article_sents ] else: # article_text = util.to_unicode(article_text) # sent_pros = {'annotators': 'ssplit', 'outputFormat': 'json', 'timeout': '5000000'} # sents_result_dict = nlp.annotate(str(article_text), properties=sent_pros) # article_sent_tokens = [[token['word'] for token in sent['tokens']] for sent in sents_result_dict['sentences']] raw_article_sents = nltk.tokenize.sent_tokenize(article_text) article_sent_tokens = [ util.process_sent(sent) for sent in raw_article_sents ] if FLAGS.top_n_sents != -1: article_sent_tokens = article_sent_tokens[:FLAGS.top_n_sents] raw_article_sents = raw_article_sents[:FLAGS.top_n_sents] article_sents = [' '.join(sent) for sent in article_sent_tokens] try: article_tokens_string = str(' '.join(article_sents)) except: try: article_tokens_string = str(' '.join( [sent.decode('latin-1') for sent in article_sents])) except: raise if len(article_sent_tokens) == 0: continue summary_sent_tokens = split_into_sent_tokens(summary_text) if 'doc_indices' in example.features.feature and len( example.features.feature['doc_indices'].bytes_list.value ) > 0: doc_indices_str = example.features.feature[ 'doc_indices'].bytes_list.value[0].decode() if '1' in doc_indices_str: doc_indices = [ int(x) for x in doc_indices_str.strip().split() ] rel_sent_positions = importance_features.get_sent_indices( article_sent_tokens, doc_indices) else: num_tokens_total = sum( [len(sent) for sent in article_sent_tokens]) rel_sent_positions = list(range(len(raw_article_sents))) doc_indices = [0] * num_tokens_total else: rel_sent_positions = None doc_indices = None doc_indices_str = None if 'corefs' in example.features.feature and len( example.features.feature['corefs'].bytes_list.value) > 0: corefs_str = example.features.feature[ 'corefs'].bytes_list.value[0] corefs = json.loads(corefs_str) # summary_sent_tokens = limit_to_n_tokens(summary_sent_tokens, 100) similar_source_indices_list_plus_empty = [] simple_similar_source_indices, lcs_paths_list, article_lcs_paths_list, smooth_article_paths_list = ssi_functions.get_simple_source_indices_list( summary_sent_tokens, article_sent_tokens, vocab, FLAGS.sentence_limit, FLAGS.min_matched_tokens, not FLAGS.consider_stopwords, lemmatize=FLAGS.lemmatize, multiple_ssi=FLAGS.multiple_ssi) article_paths_parameter = article_lcs_paths_list if FLAGS.tag_tokens else None article_paths_parameter = smooth_article_paths_list if FLAGS.smart_tags else article_paths_parameter restricted_source_indices = util.enforce_sentence_limit( simple_similar_source_indices, FLAGS.sentence_limit) for summ_sent_idx, summ_sent in enumerate(summary_sent_tokens): if FLAGS.sent_dataset: if len(restricted_source_indices[summ_sent_idx]) == 0: continue merge_example = get_merge_example( restricted_source_indices[summ_sent_idx], article_sent_tokens, summ_sent, corefs, article_paths_parameter[summ_sent_idx]) all_merge_examples.append(merge_example) simple_similar_source_indices_list_plus_empty.append( simple_similar_source_indices) if FLAGS.ssi_dataset: summary_text_to_save = [ s for s in all_summary_texts ] if FLAGS.dataset_name == 'duc_2004' else summary_text write_lambdamart_example(simple_similar_source_indices, raw_article_sents, summary_text_to_save, corefs_str, doc_indices_str, article_paths_parameter, lambdamart_writer) if FLAGS.highlight: highlight_article_lcs_paths_list = smooth_article_paths_list if FLAGS.smart_tags else article_lcs_paths_list # simple_ssi_plus_empty = [ [s[0] for s in sim_source_ind] for sim_source_ind in simple_similar_source_indices] extracted_sents_in_article_html = ssi_functions.html_highlight_sents_in_article( summary_sent_tokens, simple_similar_source_indices, article_sent_tokens, doc_indices, lcs_paths_list, highlight_article_lcs_paths_list) extracted_sents_in_article_html_file.write( extracted_sents_in_article_html.encode()) a = 0 instance_idx += 1 if FLAGS.ssi_dataset: lambdamart_writer.close() if FLAGS.dataset_name == 'cnn_dm' or FLAGS.dataset_name == 'newsroom' or FLAGS.dataset_name == 'xsum': chunk_size = 1000 else: chunk_size = 1 util.chunk_file(dataset_split, lambdamart_out_full_dir, lambdamart_out_dir, chunk_size=chunk_size) if FLAGS.sent_dataset: with_coref_dir = data_dir + '_and_tag_tokens' if FLAGS.tag_tokens else data_dir out_dir = os.path.join(with_coref_dir, FLAGS.dataset_name + '_sent') if FLAGS.sentence_limit == 1: out_dir += '_singles' if FLAGS.consider_stopwords: out_dir += '_stopwords' if FLAGS.coreference_replacement: out_dir += '_coref' if FLAGS.top_n_sents != -1: out_dir += '_n=' + str(FLAGS.top_n_sents) util.create_dirs(out_dir) convert_data.write_with_generator(iter(all_merge_examples), len(all_merge_examples), out_dir, dataset_split) if FLAGS.print_output: # html_str = FLAGS.dataset + ' | ' + FLAGS.exp_name + '<br><br><br>' + html_str # save_fusions_to_file(html_str) ssi_path = os.path.join(ssi_dir, FLAGS.dataset_name) if FLAGS.consider_stopwords: ssi_path += '_stopwords' util.create_dirs(ssi_path) if FLAGS.dataset_name == 'duc_2004' and FLAGS.abstract_idx != 0: abstract_idx_str = '_%d' % FLAGS.abstract_idx else: abstract_idx_str = '' with open( os.path.join( ssi_path, dataset_split + '_ssi' + abstract_idx_str + '.pkl'), 'wb') as f: pickle.dump(simple_similar_source_indices_list_plus_empty, f) if FLAGS.kaiqiang: # kaiqiang_article_file.write('\n'.join(kaiqiang_article_texts)) # kaiqiang_abstract_file.write('\n'.join(kaiqiang_abstract_texts)) kaiqiang_article_file.close() kaiqiang_abstract_file.close() if FLAGS.highlight: extracted_sents_in_article_html_file.close() a = 0