def get_features_all_combinations(raw_article_sents, article_sent_tokens, mmrs, single_feat_len, pair_feat_len):
    # sent_term_matrix = util.get_tfidf_matrix(raw_article_sents)
    article_text = ' '.join(raw_article_sents)
    sent_term_matrix = util.get_doc_substituted_tfidf_matrix(tfidf_vectorizer, raw_article_sents, article_text)
    doc_vector = np.mean(sent_term_matrix, axis=0)

    possible_pairs = [list(x) for x in list(itertools.combinations(list(range(len(raw_article_sents))), 2))]  # all pairs
    possible_singles = [[i] for i in range(len(raw_article_sents))]
    if singles_and_pairs == 'pairs':
        all_combinations = possible_pairs
    elif singles_and_pairs == 'singles':
        all_combinations = possible_singles
    else:
        all_combinations = possible_pairs + possible_singles

    instances = []
    for source_indices in all_combinations:
        features = get_features(source_indices, sent_term_matrix, article_sent_tokens, single_feat_len, pair_feat_len, mmrs)
        instances.append(Lambdamart_Instance(features, 0, 0, source_indices))
    return instances
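
# Illustrative, self-contained sketch (not part of the original pipeline) of the candidate
# enumeration used above: every sentence pair plus every singleton, optionally restricted
# by the singles_and_pairs setting. For example, enumerate_candidates(3) yields
# [[0, 1], [0, 2], [1, 2], [0], [1], [2]].
def enumerate_candidates(num_sents, singles_and_pairs='both'):
    import itertools
    pairs = [list(x) for x in itertools.combinations(range(num_sents), 2)]
    singles = [[i] for i in range(num_sents)]
    if singles_and_pairs == 'pairs':
        return pairs
    if singles_and_pairs == 'singles':
        return singles
    return pairs + singles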
def main(unused_argv):
    print('Running statistics on %s' % exp_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)
    start_time = time.time()
    np.random.seed(random_seed)
    source_dir = os.path.join(data_dir, dataset_articles)
    source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*'))

    # Use a tiny dummy article to determine the feature-vector lengths.
    ex_sents = ['single .', 'sentence .']
    article_text = ' '.join(ex_sents)
    sent_term_matrix = util.get_doc_substituted_tfidf_matrix(tfidf_vectorizer, ex_sents, article_text)
    if singles_and_pairs == 'pairs':
        single_feat_len = 0
    else:
        single_feat_len = len(get_single_sent_features(0, sent_term_matrix, [['single', '.'], ['sentence', '.']], [0, 0]))
    if singles_and_pairs == 'singles':
        pair_feat_len = 0
    else:
        pair_feat_len = len(get_pair_sent_features([0, 1], sent_term_matrix, [['single', '.'], ['sentence', '.']], [0, 0]))

    total = len(source_files) * 1000 if ('cnn' in dataset_articles or 'newsroom' in dataset_articles) else len(source_files)
    example_generator = data.example_generator(source_dir + '/' + dataset_split + '*', True, False, should_check_valid=False)
    ex_gen = example_generator_extended(example_generator, total, single_feat_len, pair_feat_len)
    print('Creating list')
    ex_list = [ex for ex in ex_gen]
    print('Converting...')
    list(futures.map(load_and_evaluate_example, ex_list))
    # for ex in tqdm(ex_list, total=total):
    #     load_and_evaluate_example(ex)

    print('Evaluating ROUGE...')
    results_dict = rouge_eval_references.rouge_eval(ref_dir, dec_dir)
    # print("Results_dict: ", results_dict)
    rouge_eval_references.rouge_log(results_dict, my_log_dir)

    util.print_execution_time(start_time)
def get_features_all_combinations(example_idx, raw_article_sents, article_sent_tokens, corefs, rel_sent_indices,
                                  first_k_indices, mmrs, single_feat_len, pair_feat_len, singles_and_pairs, temp_in_path):
    # sent_term_matrix = util.get_tfidf_matrix(raw_article_sents)
    article_text = ' '.join(raw_article_sents)
    # print 'getting tfidf matrix'
    sent_term_matrix = util.get_doc_substituted_tfidf_matrix(tfidf_vectorizer, raw_article_sents, article_text, pca)
    doc_vector = np.mean(sent_term_matrix, axis=0)
    # print 'got tfidf matrix'

    # print 'getting all pairs...'
    possible_pairs = [x for x in list(itertools.combinations(first_k_indices, 2))]  # all pairs
    # print 'filtering all pairs...'
    if FLAGS.use_pair_criteria:
        possible_pairs = filter_pairs_by_criteria(raw_article_sents, possible_pairs, corefs)
    if FLAGS.sent_position_criteria:
        possible_pairs = filter_pairs_by_sent_position(possible_pairs, rel_sent_indices=rel_sent_indices)
    possible_singles = [(i,) for i in first_k_indices]
    if singles_and_pairs == 'pairs':
        all_combinations = possible_pairs
    elif singles_and_pairs == 'singles':
        all_combinations = possible_singles
    else:
        all_combinations = possible_pairs + possible_singles

    instances = []
    if sum([1 for sent_idx in rel_sent_indices if sent_idx == 0]) > 1:
        comb_list = tqdm(all_combinations)
    else:
        comb_list = all_combinations
    with open(temp_in_path, 'w') as f:
        for inst_id, source_indices in enumerate(comb_list):
            features = get_features(source_indices, sent_term_matrix, article_sent_tokens, rel_sent_indices,
                                    single_feat_len, pair_feat_len, mmrs, singles_and_pairs)
            instance = Lambdamart_Instance(features, 0, example_idx, source_indices)
            instance.inst_id = inst_id
            lambdamart_str = format_to_lambdamart(instance, single_feat_len)
            out_str = lambdamart_str + '\n'
            f.write(out_str)
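
# Hedged sketch (an assumption, not the original format_to_lambdamart): LambdaMART rankers
# such as RankLib read one instance per line in the form
# "<relevance> qid:<qid> 1:<val> 2:<val> ... # <comment>", which is the kind of string the
# loop above writes to temp_in_path.
def example_ranklib_line(relevance, qid, feature_values, comment=''):
    feats = ' '.join('%d:%s' % (i + 1, val) for i, val in enumerate(feature_values))
    return '%d qid:%d %s # %s' % (relevance, qid, feats, comment)

# example_ranklib_line(0, 7, [0.25, 0.5], '3_4') -> '0 qid:7 1:0.25 2:0.5 # 3_4'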
def main(unused_argv):
    print('Running statistics on %s' % exp_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)
    start_time = time.time()
    np.random.seed(random_seed)
    source_dir = os.path.join(data_dir, dataset_articles)
    source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*'))

    ex_sents = ['single .', 'sentence .']
    article_text = ' '.join(ex_sents)
    sent_term_matrix = util.get_doc_substituted_tfidf_matrix(tfidf_vectorizer, ex_sents, article_text)
    if singles_and_pairs == 'pairs':
        single_feat_len = 0
    else:
        single_feat_len = len(get_single_sent_features(0, sent_term_matrix, [['single', '.'], ['sentence', '.']], [0, 0]))
    if singles_and_pairs == 'singles':
        pair_feat_len = 0
    else:
        pair_feat_len = len(get_pair_sent_features([0, 1], sent_term_matrix, [['single', '.'], ['sentence', '.']], [0, 0]))
def main(unused_argv):
    print('Running statistics on %s' % exp_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)
    if FLAGS.singles_and_pairs == 'both':
        in_dataset = FLAGS.dataset_name
        out_dataset = FLAGS.dataset_name + '_both'
    else:
        in_dataset = FLAGS.dataset_name + '_singles'
        out_dataset = FLAGS.dataset_name + '_singles'
    if FLAGS.lr:
        out_dataset = FLAGS.dataset_name + '_lr'

    start_time = time.time()
    np.random.seed(random_seed)
    source_dir = os.path.join(data_dir, in_dataset)

    ex_sents = ['single .', 'sentence .']
    article_text = ' '.join(ex_sents)
    sent_term_matrix = util.get_doc_substituted_tfidf_matrix(tfidf_vectorizer, ex_sents, article_text, pca)
    if FLAGS.singles_and_pairs == 'pairs':
        single_feat_len = 0
    else:
        single_feat_len = len(get_single_sent_features(0, sent_term_matrix, [['single', '.'], ['sentence', '.']], [0, 0], 0))
    if FLAGS.singles_and_pairs == 'singles':
        pair_feat_len = 0
    else:
        pair_feat_len = len(get_pair_sent_features([0, 1], sent_term_matrix, [['single', '.'], ['sentence', '.']], [0, 0], [0, 0]))
    util.print_vars(single_feat_len, pair_feat_len)
    util.create_dirs(temp_dir)

    if FLAGS.dataset_split == 'all':
        dataset_splits = ['test', 'val', 'train']
    elif FLAGS.dataset_split == 'train_val':
        dataset_splits = ['val', 'train']
    else:
        dataset_splits = [FLAGS.dataset_split]
    for split in dataset_splits:
        source_files = sorted(glob.glob(source_dir + '/' + split + '*'))
        out_path = os.path.join(out_dir, out_dataset, split)
        if FLAGS.pca:
            out_path += '_pca'
        util.create_dirs(os.path.join(out_path))
        total = len(source_files) * 1000 if ('cnn' in in_dataset or 'newsroom' in in_dataset or 'xsum' in in_dataset) else len(source_files)
        example_generator = data.example_generator(source_dir + '/' + split + '*', True, False, should_check_valid=False)
        # for example in tqdm(example_generator, total=total):
        ex_gen = example_generator_extended(example_generator, total, single_feat_len, pair_feat_len, FLAGS.singles_and_pairs, out_path)
        print('Creating list')
        ex_list = [ex for ex in ex_gen]
        if FLAGS.num_instances != -1:
            ex_list = ex_list[:FLAGS.num_instances]
        print('Converting...')
        # all_features = pool.map(convert_article_to_lambdamart_features, ex_list)
        # all_features = ray.get([convert_article_to_lambdamart_features.remote(ex) for ex in ex_list])

        if FLAGS.lr:
            all_instances = list(futures.map(convert_article_to_lambdamart_features, ex_list))
            all_instances = util.flatten_list_of_lists(all_instances)
            x = [inst.features for inst in all_instances]
            x = np.array(x)
            y = [inst.relevance for inst in all_instances]
            y = np.expand_dims(np.array(y), 1)
            x_y = np.concatenate((x, y), 1)
            np.save(writer, x_y)
        else:
            list(futures.map(convert_article_to_lambdamart_features, ex_list))
            # writer.write(''.join(all_features))

        # all_features = []
        # for example in tqdm(ex_gen, total=total):
        #     all_features.append(convert_article_to_lambdamart_features(example))
        # all_features = util.flatten_list_of_lists(all_features)
        # num1 = sum(x == 1 for x in all_features)
        # num2 = sum(x == 2 for x in all_features)
        # print 'Single sent: %d instances. Pair sent: %d instances.' % (num1, num2)

        # for example in tqdm(ex_gen, total=total):
        #     features = convert_article_to_lambdamart_features(example)
        #     writer.write(features)

        # Concatenate the per-example shard files into one final output file for this split.
        final_out_path = out_path + '.txt'
        file_names = sorted(glob.glob(os.path.join(out_path, '*')))
        writer = open(final_out_path, 'wb')
        for file_name in tqdm(file_names):
            with open(file_name) as f:
                text = f.read()
            writer.write(text)
        writer.close()

    util.print_execution_time(start_time)
def convert_article_to_lambdamart_features(ex):
    # example_idx += 1
    # if num_instances != -1 and example_idx >= num_instances:
    #     break
    example, example_idx, single_feat_len, pair_feat_len, singles_and_pairs, out_path = ex
    print(example_idx)
    raw_article_sents, similar_source_indices_list, summary_text, corefs, doc_indices = util.unpack_tf_example(example, names_to_types)
    article_sent_tokens = [util.process_sent(sent) for sent in raw_article_sents]
    if doc_indices is None:
        doc_indices = [0] * len(util.flatten_list_of_lists(article_sent_tokens))
    doc_indices = [int(doc_idx) for doc_idx in doc_indices]
    if len(doc_indices) != len(util.flatten_list_of_lists(article_sent_tokens)):
        doc_indices = [0] * len(util.flatten_list_of_lists(article_sent_tokens))
    rel_sent_indices, _, _ = util.get_rel_sent_indices(doc_indices, article_sent_tokens)
    if FLAGS.singles_and_pairs == 'singles':
        sentence_limit = 1
    else:
        sentence_limit = 2
    similar_source_indices_list = util.enforce_sentence_limit(similar_source_indices_list, sentence_limit)
    summ_sent_tokens = [sent.strip().split() for sent in summary_text.strip().split('\n')]

    # sent_term_matrix = util.get_tfidf_matrix(raw_article_sents)
    article_text = ' '.join(raw_article_sents)
    sent_term_matrix = util.get_doc_substituted_tfidf_matrix(tfidf_vectorizer, raw_article_sents, article_text, pca)
    doc_vector = np.mean(sent_term_matrix, axis=0)

    out_str = ''
    # ssi_idx_cur_inst_id = defaultdict(int)
    instances = []

    if importance:
        importances = util.special_squash(util.get_tfidf_importances(tfidf_vectorizer, raw_article_sents, pca))
        possible_pairs = [x for x in list(itertools.combinations(list(range(len(raw_article_sents))), 2))]  # all pairs
        if FLAGS.use_pair_criteria:
            possible_pairs = filter_pairs_by_criteria(raw_article_sents, possible_pairs, corefs)
        if FLAGS.sent_position_criteria:
            possible_pairs = filter_pairs_by_sent_position(possible_pairs, rel_sent_indices)
        possible_singles = [(i,) for i in range(len(raw_article_sents))]
        possible_combinations = possible_pairs + possible_singles
        positives = [ssi for ssi in similar_source_indices_list]
        negatives = [ssi for ssi in possible_combinations if not (ssi in positives or ssi[::-1] in positives)]

        negative_pairs = [x for x in possible_pairs if not (x in similar_source_indices_list or x[::-1] in similar_source_indices_list)]
        negative_singles = [x for x in possible_singles if not (x in similar_source_indices_list or x[::-1] in similar_source_indices_list)]
        random_negative_pairs = np.random.permutation(len(negative_pairs)).tolist()
        random_negative_singles = np.random.permutation(len(negative_singles)).tolist()

        qid = example_idx
        for similar_source_indices in positives:
            # True sentence single/pair
            relevance = 1
            features = get_features(similar_source_indices, sent_term_matrix, article_sent_tokens, rel_sent_indices,
                                    single_feat_len, pair_feat_len, importances, singles_and_pairs)
            if features is None:
                continue
            instances.append(Lambdamart_Instance(features, relevance, qid, similar_source_indices))
            a = 0

            if FLAGS.dataset_name == 'xsum' and FLAGS.special_xsum_balance:
                neg_relevance = 0
                num_negative = 4
                if FLAGS.singles_and_pairs == 'singles':
                    num_neg_singles = num_negative
                    num_neg_pairs = 0
                else:
                    num_neg_singles = num_negative / 2
                    num_neg_pairs = num_negative / 2
                for _ in range(num_neg_singles):
                    if len(random_negative_singles) == 0:
                        continue
                    negative_indices = negative_singles[random_negative_singles.pop()]
                    neg_features = get_features(negative_indices, sent_term_matrix, article_sent_tokens, rel_sent_indices,
                                                single_feat_len, pair_feat_len, importances, singles_and_pairs)
                    if neg_features is None:
                        continue
                    instances.append(Lambdamart_Instance(neg_features, neg_relevance, qid, negative_indices))
                for _ in range(num_neg_pairs):
                    if len(random_negative_pairs) == 0:
                        continue
                    negative_indices = negative_pairs[random_negative_pairs.pop()]
                    neg_features = get_features(negative_indices, sent_term_matrix, article_sent_tokens, rel_sent_indices,
                                                single_feat_len, pair_feat_len, importances, singles_and_pairs)
                    if neg_features is None:
                        continue
                    instances.append(Lambdamart_Instance(neg_features, neg_relevance, qid, negative_indices))
            elif balance:
                # False sentence single/pair
                is_pair = len(similar_source_indices) == 2
                if is_pair:
                    if len(random_negative_pairs) == 0:
                        continue
                    negative_indices = negative_pairs[random_negative_pairs.pop()]
                else:
                    if len(random_negative_singles) == 0:
                        continue
                    negative_indices = negative_singles[random_negative_singles.pop()]
                neg_relevance = 0
                neg_features = get_features(negative_indices, sent_term_matrix, article_sent_tokens, rel_sent_indices,
                                            single_feat_len, pair_feat_len, importances, singles_and_pairs)
                if neg_features is None:
                    continue
                instances.append(Lambdamart_Instance(neg_features, neg_relevance, qid, negative_indices))

        if not balance:
            for negative_indices in negatives:
                neg_relevance = 0
                neg_features = get_features(negative_indices, sent_term_matrix, article_sent_tokens, rel_sent_indices,
                                            single_feat_len, pair_feat_len, importances, singles_and_pairs)
                if neg_features is None:
                    continue
                instances.append(Lambdamart_Instance(neg_features, neg_relevance, qid, negative_indices))

    sorted_instances = sorted(instances, key=lambda x: (x.qid, x.source_indices))
    assign_inst_ids(sorted_instances)
    if FLAGS.lr:
        return sorted_instances
    else:
        for instance in sorted_instances:
            lambdamart_str = format_to_lambdamart(instance, single_feat_len)
            out_str += lambdamart_str + '\n'
        with open(os.path.join(out_path, '%06d.txt' % example_idx), 'wb') as f:
            f.write(out_str)
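
# Minimal, illustrative sketch (not part of the original module) of the negative-sampling
# pattern used above: a shuffled stack of indices into the negative candidates is popped
# once per positive instance, so each positive single/pair is matched by a random negative
# of the same type without replacement.
def sample_balanced_negatives(negative_candidates, num_to_sample, seed=0):
    rng = np.random.RandomState(seed)
    order = rng.permutation(len(negative_candidates)).tolist()
    sampled = []
    for _ in range(num_to_sample):
        if len(order) == 0:
            break
        sampled.append(negative_candidates[order.pop()])
    return sampled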
def main(unused_argv):
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)
    print('Running statistics on %s' % exp_name)
    start_time = time.time()
    np.random.seed(random_seed)
    source_dir = os.path.join(data_dir, dataset_articles)
    source_files = sorted(glob.glob(source_dir + '/' + dataset_split + '*'))

    ex_sents = ['single .', 'sentence .']
    article_text = ' '.join(ex_sents)
    sent_term_matrix = util.get_doc_substituted_tfidf_matrix(tfidf_vectorizer, ex_sents, article_text, pca)
    if FLAGS.singles_and_pairs == 'pairs':
        single_feat_len = 0
    else:
        single_feat_len = len(get_single_sent_features(0, sent_term_matrix, [['single', '.'], ['sentence', '.']], [0, 0], 0))
    if FLAGS.singles_and_pairs == 'singles':
        pair_feat_len = 0
    else:
        pair_feat_len = len(get_pair_sent_features([0, 1], sent_term_matrix, [['single', '.'], ['sentence', '.']], [0, 0], [0, 0]))

    total = len(source_files) * 1000 if ('cnn' in dataset_articles or 'newsroom' in dataset_articles) else len(source_files)
    example_generator = data.example_generator(source_dir + '/' + dataset_split + '*', True, False, should_check_valid=False)

    if FLAGS.mode == 'write_to_file':
        ex_gen = example_generator_extended(example_generator, total, single_feat_len, pair_feat_len, FLAGS.singles_and_pairs)
        print('Creating list')
        ex_list = [ex for ex in ex_gen]
        print('Converting...')
        # if len(sys.argv) > 1 and sys.argv[1] == '-m':
        list(futures.map(write_to_lambdamart_examples_to_file, ex_list))
        # else:
        #     instances_list = []
        #     for ex in tqdm(ex_list):
        #         instances_list.append(write_to_lambdamart_examples_to_file(ex))

        file_names = sorted(glob.glob(os.path.join(temp_in_dir, '*')))
        instances_str = ''
        for file_name in tqdm(file_names):
            with open(file_name) as f:
                instances_str += f.read()
        with open(temp_in_path, 'wb') as f:
            f.write(instances_str)

    # RUN LAMBDAMART SCORING COMMAND HERE

    if FLAGS.mode == 'generate_summaries':
        qid_ssi_to_importances = rank_source_sents(temp_in_path, temp_out_path)
        ex_gen = example_generator_extended(example_generator, total, qid_ssi_to_importances, pair_feat_len, FLAGS.singles_and_pairs)
        print('Creating list')
        ex_list = [ex for ex in ex_gen]
        ssi_list = list(futures.map(evaluate_example, ex_list))

        # save ssi_list
        with open(os.path.join(my_log_dir, 'ssi.pkl'), 'w') as f:
            pickle.dump(ssi_list, f)
        with open(os.path.join(my_log_dir, 'ssi.pkl')) as f:
            ssi_list = pickle.load(f)
        print('Evaluating Lambdamart model F1 score...')
        suffix = util.all_sent_selection_eval(ssi_list)
        #
        # # for ex in tqdm(ex_list, total=total):
        # #     load_and_evaluate_example(ex)
        # print('Evaluating ROUGE...')
        results_dict = rouge_functions.rouge_eval(ref_dir, dec_dir, l_param=l_param)
        # print("Results_dict: ", results_dict)
        rouge_functions.rouge_log(results_dict, my_log_dir, suffix=suffix)

    util.print_execution_time(start_time)
def main(unused_argv):
    print('Running statistics on %s' % FLAGS.dataset_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.dataset_name == 'all':
        dataset_names = ['cnn_dm', 'xsum', 'duc_2004']
    else:
        dataset_names = [FLAGS.dataset_name]

    if not os.path.exists(plot_data_file):
        all_lists_of_histogram_pairs = []
        for dataset_name in dataset_names:
            FLAGS.dataset_name = dataset_name

            if dataset_name == 'duc_2004':
                dataset_splits = ['test']
            elif FLAGS.dataset_split == 'all':
                dataset_splits = ['test', 'val', 'train']
            else:
                dataset_splits = [FLAGS.dataset_split]

            ssi_list = []
            for dataset_split in dataset_splits:
                ssi_path = os.path.join(ssi_dir, FLAGS.dataset_name, dataset_split + '_ssi.pkl')
                with open(ssi_path) as f:
                    ssi_list.extend(pickle.load(f))

            if FLAGS.dataset_name == 'duc_2004':
                for abstract_idx in [1, 2, 3]:
                    ssi_path = os.path.join(ssi_dir, FLAGS.dataset_name, dataset_split + '_ssi_' + str(abstract_idx) + '.pkl')
                    with open(ssi_path) as f:
                        temp_ssi_list = pickle.load(f)
                    ssi_list.extend(temp_ssi_list)

            ssi_2d = util.flatten_list_of_lists(ssi_list)

            num_extracted = [len(ssi) for ssi in util.flatten_list_of_lists(ssi_list)]
            hist_num_extracted = np.histogram(num_extracted, bins=6, range=(0, 5))
            print(hist_num_extracted)
            print('Histogram of number of sentences merged: ' + util.hist_as_pdf_str(hist_num_extracted))

            distances = [abs(ssi[0] - ssi[1]) for ssi in ssi_2d if len(ssi) >= 2]
            print('Distance between sentences (mean, median): ', np.mean(distances), np.median(distances))
            hist_dist = np.histogram(distances, bins=max(distances))
            print('Histogram of distances: ' + util.hist_as_pdf_str(hist_dist))

            summ_sent_idx_to_number_of_source_sents = [[], [], [], [], [], [], [], [], [], []]
            for ssi in ssi_list:
                for summ_sent_idx, source_indices in enumerate(ssi):
                    if len(source_indices) == 0 or summ_sent_idx >= len(summ_sent_idx_to_number_of_source_sents):
                        continue
                    num_sents = len(source_indices)
                    if num_sents > 2:
                        num_sents = 2
                    summ_sent_idx_to_number_of_source_sents[summ_sent_idx].append(num_sents)
            print("Number of source sents for summary sentence indices (Is the first summary sent more likely to match with a singleton or a pair?):")
            for summ_sent_idx, list_of_numbers_of_source_sents in enumerate(summ_sent_idx_to_number_of_source_sents):
                if len(list_of_numbers_of_source_sents) == 0:
                    percent_singleton = 0.
                else:
                    percent_singleton = list_of_numbers_of_source_sents.count(1) * 1. / len(list_of_numbers_of_source_sents)
                    percent_pair = list_of_numbers_of_source_sents.count(2) * 1. / len(list_of_numbers_of_source_sents)
                print str(percent_singleton) + '\t',
            print ''
            for summ_sent_idx, list_of_numbers_of_source_sents in enumerate(summ_sent_idx_to_number_of_source_sents):
                if len(list_of_numbers_of_source_sents) == 0:
                    percent_pair = 0.
                else:
                    percent_singleton = list_of_numbers_of_source_sents.count(1) * 1. / len(list_of_numbers_of_source_sents)
                    percent_pair = list_of_numbers_of_source_sents.count(2) * 1. / len(list_of_numbers_of_source_sents)
                print str(percent_pair) + '\t',
            print ''

            primary_pos = [ssi[0] for ssi in ssi_2d if len(ssi) >= 1]
            secondary_pos = [ssi[1] for ssi in ssi_2d if len(ssi) >= 2]
            all_pos = [max(ssi) for ssi in ssi_2d if len(ssi) >= 1]

            # if FLAGS.dataset_name != 'duc_2004':
            #     plot_positions(primary_pos, secondary_pos, all_pos)

            if FLAGS.dataset_split == 'all':
                glob_string = '*.bin'
            else:
                glob_string = dataset_splits[0]

            print('Loading TFIDF vectorizer')
            with open(tfidf_vec_path, 'rb') as f:
                tfidf_vectorizer = pickle.load(f)

            source_dir = os.path.join(data_dir, FLAGS.dataset_name)
            source_files = sorted(glob.glob(source_dir + '/' + glob_string + '*'))

            total = len(source_files) * 1000 if ('cnn' in FLAGS.dataset_name or 'newsroom' in FLAGS.dataset_name
                                                 or 'xsum' in FLAGS.dataset_name) else len(source_files)
            example_generator = data.example_generator(source_dir + '/' + glob_string + '*', True, False,
                                                       should_check_valid=False)

            all_possible_singles = 0
            all_possible_pairs = [0]
            all_filtered_pairs = 0
            all_all_combinations = 0
            all_ssi_pairs = [0]
            ssi_pairs_with_shared_coref = [0]
            ssi_pairs_with_shared_word = [0]
            ssi_pairs_with_either_coref_or_word = [0]
            all_pairs_with_shared_coref = [0]
            all_pairs_with_shared_word = [0]
            all_pairs_with_either_coref_or_word = [0]
            actual_total = [0]
            rel_positions_primary = []
            rel_positions_secondary = []
            rel_positions_all = []
            sent_lens = []
            all_sent_lens = []
            all_pos = []
            y = []
            normalized_positions_primary = []
            normalized_positions_secondary = []
            all_normalized_positions_primary = []
            all_normalized_positions_secondary = []
            normalized_positions_singles = []
            normalized_positions_pairs_first = []
            normalized_positions_pairs_second = []
            primary_pos_duc = []
            secondary_pos_duc = []
            all_pos_duc = []
            all_distances = []
            distances_duc = []
            tfidf_similarities = []
            all_tfidf_similarities = []
            average_mmrs = []
            all_average_mmrs = []

            for example_idx, example in enumerate(tqdm(example_generator, total=total)):
                # def process(example_idx_example):
                #     # print '0'
                #     example = example_idx_example
                if FLAGS.num_instances != -1 and example_idx >= FLAGS.num_instances:
                    break
                raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs, doc_indices = util.unpack_tf_example(example, names_to_types)
                article_sent_tokens = [util.process_sent(sent) for sent in raw_article_sents]
                article_text = ' '.join(raw_article_sents)
                groundtruth_summ_sents = [[sent.strip() for sent in groundtruth_summary_text.strip().split('\n')]]
                if doc_indices is None:
                    doc_indices = [0] * len(util.flatten_list_of_lists(article_sent_tokens))
                doc_indices = [int(doc_idx) for doc_idx in doc_indices]
                rel_sent_indices, doc_sent_indices, doc_sent_lens = preprocess_for_lambdamart_no_flags.get_rel_sent_indices(doc_indices, article_sent_tokens)
                groundtruth_similar_source_indices_list = util.enforce_sentence_limit(groundtruth_similar_source_indices_list, FLAGS.sentence_limit)

                sent_term_matrix = util.get_doc_substituted_tfidf_matrix(tfidf_vectorizer, raw_article_sents, article_text)
                sents_similarities = util.cosine_similarity(sent_term_matrix, sent_term_matrix)
                importances = util.special_squash(util.get_tfidf_importances(tfidf_vectorizer, raw_article_sents))

                if FLAGS.dataset_name == 'duc_2004':
                    first_k_indices = lambdamart_scores_to_summaries.get_indices_of_first_k_sents_of_each_article(rel_sent_indices, FLAGS.first_k)
                else:
                    first_k_indices = [idx for idx in range(len(raw_article_sents))]
                article_indices = list(range(len(raw_article_sents)))

                possible_pairs = [x for x in list(itertools.combinations(article_indices, 2))]  # all pairs
                # # filtered_possible_pairs = preprocess_for_lambdamart_no_flags.filter_pairs_by_criteria(raw_article_sents, possible_pairs, corefs)
                # if FLAGS.dataset_name == 'duc_2004':
                #     filtered_possible_pairs = [x for x in list(itertools.combinations(first_k_indices, 2))]  # all pairs
                # else:
                #     filtered_possible_pairs = preprocess_for_lambdamart_no_flags.filter_pairs_by_sent_position(possible_pairs)
                # # removed_pairs = list(set(possible_pairs) - set(filtered_possible_pairs))
                # possible_singles = [(i,) for i in range(len(raw_article_sents))]
                # all_combinations = filtered_possible_pairs + possible_singles
                #
                # all_possible_singles += len(possible_singles)
                # all_possible_pairs[0] += len(possible_pairs)
                # all_filtered_pairs += len(filtered_possible_pairs)
                # all_all_combinations += len(all_combinations)

                # for ssi in groundtruth_similar_source_indices_list:
                #     if len(ssi) > 0:
                #         idx = rel_sent_indices[ssi[0]]
                #         rel_positions_primary.append(idx)
                #         rel_positions_all.append(idx)
                #     if len(ssi) > 1:
                #         idx = rel_sent_indices[ssi[1]]
                #         rel_positions_secondary.append(idx)
                #         rel_positions_all.append(idx)

                # coref_pairs = preprocess_for_lambdamart_no_flags.get_coref_pairs(corefs)
                # # DO OVER LAP PAIRS BETTER
                # overlap_pairs = preprocess_for_lambdamart_no_flags.filter_by_overlap(article_sent_tokens, possible_pairs)
                # either_coref_or_word = list(set(list(coref_pairs) + overlap_pairs))
                #
                # for ssi in groundtruth_similar_source_indices_list:
                #     if len(ssi) == 2:
                #         all_ssi_pairs[0] += 1
                #         do_share_coref = ssi in coref_pairs
                #         do_share_words = ssi in overlap_pairs
                #         if do_share_coref:
                #             ssi_pairs_with_shared_coref[0] += 1
                #         if do_share_words:
                #             ssi_pairs_with_shared_word[0] += 1
                #         if do_share_coref or do_share_words:
                #             ssi_pairs_with_either_coref_or_word[0] += 1
                # all_pairs_with_shared_coref[0] += len(coref_pairs)
                # all_pairs_with_shared_word[0] += len(overlap_pairs)
                # all_pairs_with_either_coref_or_word[0] += len(either_coref_or_word)

                if FLAGS.dataset_name == 'duc_2004':
                    primary_pos_duc.extend([rel_sent_indices[ssi[0]] for ssi in groundtruth_similar_source_indices_list if len(ssi) >= 1])
                    secondary_pos_duc.extend([rel_sent_indices[ssi[1]] for ssi in groundtruth_similar_source_indices_list if len(ssi) >= 2])
                    all_pos_duc.extend([max([rel_sent_indices[sent_idx] for sent_idx in ssi]) for ssi in groundtruth_similar_source_indices_list if len(ssi) >= 1])

                for ssi in groundtruth_similar_source_indices_list:
                    for sent_idx in ssi:
                        sent_lens.append(len(article_sent_tokens[sent_idx]))
                    if len(ssi) >= 1:
                        orig_val = ssi[0]
                        vals_to_add = get_integral_values_for_histogram(orig_val, rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents)
                        normalized_positions_primary.extend(vals_to_add)
                    if len(ssi) >= 2:
                        orig_val = ssi[1]
                        vals_to_add = get_integral_values_for_histogram(orig_val, rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents)
                        normalized_positions_secondary.extend(vals_to_add)
                        if FLAGS.dataset_name == 'duc_2004':
                            distances_duc.append(abs(rel_sent_indices[ssi[1]] - rel_sent_indices[ssi[0]]))
                        tfidf_similarities.append(sents_similarities[ssi[0], ssi[1]])
                        average_mmrs.append((importances[ssi[0]] + importances[ssi[1]]) / 2)

                for ssi in groundtruth_similar_source_indices_list:
                    if len(ssi) == 1:
                        orig_val = ssi[0]
                        vals_to_add = get_integral_values_for_histogram(orig_val, rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents)
                        normalized_positions_singles.extend(vals_to_add)
                    if len(ssi) >= 2:
                        if doc_sent_indices[ssi[0]] != doc_sent_indices[ssi[1]]:
                            continue
                        orig_val_first = min(ssi[0], ssi[1])
                        vals_to_add = get_integral_values_for_histogram(orig_val_first, rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents)
                        normalized_positions_pairs_first.extend(vals_to_add)
                        orig_val_second = max(ssi[0], ssi[1])
                        vals_to_add = get_integral_values_for_histogram(orig_val_second, rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents)
                        normalized_positions_pairs_second.extend(vals_to_add)

                # all_normalized_positions_primary.extend(util.flatten_list_of_lists([get_integral_values_for_histogram(single[0], rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents) for single in possible_singles]))
                # all_normalized_positions_secondary.extend(util.flatten_list_of_lists([get_integral_values_for_histogram(pair[1], rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents) for pair in possible_pairs]))
                all_sent_lens.extend([len(sent) for sent in article_sent_tokens])
                all_distances.extend([abs(rel_sent_indices[pair[1]] - rel_sent_indices[pair[0]]) for pair in possible_pairs])
                all_tfidf_similarities.extend([sents_similarities[pair[0], pair[1]] for pair in possible_pairs])
                all_average_mmrs.extend([(importances[pair[0]] + importances[pair[1]]) / 2 for pair in possible_pairs])

                # if FLAGS.dataset_name == 'duc_2004':
                #     rel_pos_single = [rel_sent_indices[single[0]] for single in possible_singles]
                #     rel_pos_pair = [[rel_sent_indices[pair[0]], rel_sent_indices[pair[1]]] for pair in possible_pairs]
                #     all_pos.extend(rel_pos_single)
                #     all_pos.extend([max(pair) for pair in rel_pos_pair])
                # else:
                #     all_pos.extend(util.flatten_list_of_lists(possible_singles))
                #     all_pos.extend([max(pair) for pair in possible_pairs])
                # y.extend([1 if single in groundtruth_similar_source_indices_list else 0 for single in possible_singles])
                # y.extend([1 if pair in groundtruth_similar_source_indices_list else 0 for pair in possible_pairs])

                # actual_total[0] += 1

            # # p = Pool(144)
            # # list(tqdm(p.imap(process, example_generator), total=total))

            # # print 'Possible_singles\tPossible_pairs\tFiltered_pairs\tAll_combinations: \n%.2f\t%.2f\t%.2f\t%.2f' % (all_possible_singles*1./actual_total, \
            # #     all_possible_pairs*1./actual_total, all_filtered_pairs*1./actual_total, all_all_combinations*1./actual_total)
            # #
            # # print 'Relative positions of groundtruth source sentences in document:\nPrimary\tSecondary\tBoth\n%.2f\t%.2f\t%.2f' % (np.mean(rel_positions_primary), np.mean(rel_positions_secondary), np.mean(rel_positions_all))
            # #
            # # print 'SSI Pair statistics:\nShare_coref\tShare_word\tShare_either\n%.2f\t%.2f\t%.2f' \
            # #     % (ssi_pairs_with_shared_coref[0]*100./all_ssi_pairs[0], ssi_pairs_with_shared_word[0]*100./all_ssi_pairs[0], ssi_pairs_with_either_coref_or_word[0]*100./all_ssi_pairs[0])
            # # print 'All Pair statistics:\nShare_coref\tShare_word\tShare_either\n%.2f\t%.2f\t%.2f' \
            # #     % (all_pairs_with_shared_coref[0]*100./all_possible_pairs[0], all_pairs_with_shared_word[0]*100./all_possible_pairs[0], all_pairs_with_either_coref_or_word[0]*100./all_possible_pairs[0])

            # # hist_all_pos = np.histogram(all_pos, bins=max(all_pos)+1)
            # # print 'Histogram of all sent positions: ', util.hist_as_pdf_str(hist_all_pos)
            # # min_sent_len = min(sent_lens)
            # # hist_sent_lens = np.histogram(sent_lens, bins=max(sent_lens)-min_sent_len+1)
            # # print 'min, max sent lens:', min_sent_len, max(sent_lens)
            # # print 'Histogram of sent lens: ', util.hist_as_pdf_str(hist_sent_lens)
            # # min_all_sent_len = min(all_sent_lens)
            # # hist_all_sent_lens = np.histogram(all_sent_lens, bins=max(all_sent_lens)-min_all_sent_len+1)
            # # print 'min, max all sent lens:', min_all_sent_len, max(all_sent_lens)
            # # print 'Histogram of all sent lens: ', util.hist_as_pdf_str(hist_all_sent_lens)

            # # print 'Pearsons r, p value', pearsonr(all_pos, y)
            # # fig, ax1 = plt.subplots(nrows=1)
            # # plt.scatter(all_pos, y)
            # # pp = PdfPages(os.path.join('stuff/plots', FLAGS.dataset_name + '_position_scatter.pdf'))
            # # plt.savefig(pp, format='pdf',bbox_inches='tight')
            # # plt.show()
            # # pp.close()

            # # if FLAGS.dataset_name == 'duc_2004':
            # #     plot_positions(primary_pos_duc, secondary_pos_duc, all_pos_duc)
            # # normalized_positions_all = normalized_positions_primary + normalized_positions_secondary
            # # plot_histogram(normalized_positions_primary, num_bins=100)
            # # plot_histogram(normalized_positions_secondary, num_bins=100)
            # # plot_histogram(normalized_positions_all, num_bins=100)

            # # sent_lens_together = [sent_lens, all_sent_lens]
            # # plot_histogram(sent_lens_together, pdf=True, start_at_0=True, max_val=70)

            # if FLAGS.dataset_name == 'duc_2004':
            #     distances = distances_duc
            # sent_distances_together = [distances, all_distances]
            # # plot_histogram(sent_distances_together, pdf=True, start_at_0=True, max_val=100)

            # tfidf_similarities_together = [tfidf_similarities, all_tfidf_similarities]
            # # plot_histogram(tfidf_similarities_together, pdf=True, num_bins=100)

            # average_mmrs_together = [average_mmrs, all_average_mmrs]
            # # plot_histogram(average_mmrs_together, pdf=True, num_bins=100)

            # normalized_positions_primary_together = [normalized_positions_primary, bin_values]
            # normalized_positions_secondary_together = [normalized_positions_secondary, bin_values]
            # # plot_histogram(normalized_positions_primary_together, pdf=True, num_bins=100)
            # # plot_histogram(normalized_positions_secondary_together, pdf=True, num_bins=100)

            # list_of_hist_pairs = [
            #     {
            #         'lst': normalized_positions_primary_together,
            #         'pdf': True,
            #         'num_bins': 100,
            #         'y_lim': 3.9,
            #         'y_label': FLAGS.dataset_name,
            #         'x_label': 'Sent position (primary)'
            #     },
            #     {
            #         'lst': normalized_positions_secondary_together,
            #         'pdf': True,
            #         'num_bins': 100,
            #         'y_lim': 3.9,
            #         'x_label': 'Sent position (secondary)'
            #     },
            #     {
            #         'lst': sent_distances_together,
            #         'pdf': True,
            #         'start_at_0': True,
            #         'max_val': 100,
            #         'x_label': 'Sent distance'
            #     },
            #     {
            #         'lst': sent_lens_together,
            #         'pdf': True,
            #         'start_at_0': True,
            #         'max_val': 70,
            #         'x_label': 'Sent length'
            #     },
            #     {
            #         'lst': average_mmrs_together,
            #         'pdf': True,
            #         'num_bins': 100,
            #         'x_label': 'Average TF-IDF importance'
            #     }
            # ]

            normalized_positions_pairs_together = [normalized_positions_pairs_first, normalized_positions_pairs_second]
            list_of_hist_pairs = [
                {
                    'lst': [normalized_positions_singles],
                    'pdf': True,
                    'num_bins': 100,
                    # 'y_lim': 3.9,
                    'x_lim': 1.0,
                    'y_label': FLAGS.dataset_name,
                    'x_label': 'Sent Position (Singles)',
                    'legend_labels': ['Primary']
                },
                {
                    'lst': normalized_positions_pairs_together,
                    'pdf': True,
                    'num_bins': 100,
                    # 'y_lim': 3.9,
                    'x_lim': 1.0,
                    'x_label': 'Sent Position (Pairs)',
                    'legend_labels': ['Primary', 'Secondary']
                }
            ]
            all_lists_of_histogram_pairs.append(list_of_hist_pairs)
        with open(plot_data_file, 'w') as f:
            cPickle.dump(all_lists_of_histogram_pairs, f)
    else:
        with open(plot_data_file) as f:
            all_lists_of_histogram_pairs = cPickle.load(f)

    plot_histograms(all_lists_of_histogram_pairs)
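
# Self-contained sketch (illustrative only) of the normalized-position histograms gathered
# above: np.histogram over values in [0, 1], converted to a probability density so that
# datasets of different sizes are comparable.
def position_pdf(normalized_positions, num_bins=100):
    counts, bin_edges = np.histogram(normalized_positions, bins=num_bins, range=(0.0, 1.0))
    total_count = max(counts.sum(), 1)
    return counts.astype(float) / total_count, bin_edges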
def convert_article_to_lambdamart_features(ex):
    # example_idx += 1
    # if num_instances != -1 and example_idx >= num_instances:
    #     break
    example, example_idx, single_feat_len, pair_feat_len = ex
    print(example_idx)
    raw_article_sents, similar_source_indices_list, summary_text = util.unpack_tf_example(example, names_to_types)
    article_sent_tokens = [util.process_sent(sent) for sent in raw_article_sents]
    summ_sent_tokens = [sent.strip().split() for sent in summary_text.strip().split('\n')]

    # sent_term_matrix = util.get_tfidf_matrix(raw_article_sents)
    article_text = ' '.join(raw_article_sents)
    sent_term_matrix = util.get_doc_substituted_tfidf_matrix(tfidf_vectorizer, raw_article_sents, article_text)
    doc_vector = np.mean(sent_term_matrix, axis=0)

    out_str = ''
    # ssi_idx_cur_inst_id = defaultdict(int)
    instances = []

    if importance:
        importances = util.special_squash(util.get_tfidf_importances(tfidf_vectorizer, raw_article_sents))
        possible_pairs = [list(x) for x in list(itertools.combinations(list(range(len(raw_article_sents))), 2))]  # all pairs
        possible_singles = [[i] for i in range(len(raw_article_sents))]
        possible_combinations = possible_pairs + possible_singles
        positives = [ssi for ssi in similar_source_indices_list]
        negatives = [ssi for ssi in possible_combinations if not (ssi in positives or ssi[::-1] in positives)]

        negative_pairs = [x for x in possible_pairs if not (x in similar_source_indices_list or x[::-1] in similar_source_indices_list)]
        negative_singles = [x for x in possible_singles if not (x in similar_source_indices_list or x[::-1] in similar_source_indices_list)]
        random_negative_pairs = np.random.permutation(len(negative_pairs)).tolist()
        random_negative_singles = np.random.permutation(len(negative_singles)).tolist()

        qid = example_idx
        for similar_source_indices in positives:
            # True sentence single/pair
            relevance = 1
            features = get_features(similar_source_indices, sent_term_matrix, article_sent_tokens, single_feat_len, pair_feat_len, importances)
            if features is None:
                continue
            instances.append(Lambdamart_Instance(features, relevance, qid, similar_source_indices))
            a = 0

            if balance:
                # False sentence single/pair
                is_pair = len(similar_source_indices) == 2
                if is_pair:
                    if len(random_negative_pairs) == 0:
                        continue
                    negative_indices = negative_pairs[random_negative_pairs.pop()]
                else:
                    if len(random_negative_singles) == 0:
                        continue
                    negative_indices = negative_singles[random_negative_singles.pop()]
                neg_relevance = 0
                neg_features = get_features(negative_indices, sent_term_matrix, article_sent_tokens, single_feat_len, pair_feat_len, importances)
                if neg_features is None:
                    continue
                instances.append(Lambdamart_Instance(neg_features, neg_relevance, qid, negative_indices))
        if not balance:
            for negative_indices in negatives:
                neg_relevance = 0
                neg_features = get_features(negative_indices, sent_term_matrix, article_sent_tokens, single_feat_len, pair_feat_len, importances)
                if neg_features is None:
                    continue
                instances.append(Lambdamart_Instance(neg_features, neg_relevance, qid, negative_indices))
    else:
        mmr_all = util.calc_MMR_all(raw_article_sents, article_sent_tokens, summ_sent_tokens, None)  # the size is (# of summary sents, # of article sents)

        possible_pairs = [list(x) for x in list(itertools.combinations(list(range(len(raw_article_sents))), 2))]  # all pairs
        possible_singles = [[i] for i in range(len(raw_article_sents))]
        # negative_pairs = [x for x in possible_pairs if not (x in similar_source_indices_list or x[::-1] in similar_source_indices_list)]
        # negative_singles = [x for x in possible_singles if not (x in similar_source_indices_list or x[::-1] in similar_source_indices_list)]
        #
        # random_negative_pairs = np.random.permutation(len(negative_pairs)).tolist()
        # random_negative_singles = np.random.permutation(len(negative_singles)).tolist()

        all_combinations = list(itertools.product(possible_pairs + possible_singles, list(range(len(summ_sent_tokens)))))
        positives = [(similar_source_indices, summ_sent_idx) for summ_sent_idx, similar_source_indices in enumerate(similar_source_indices_list)]
        negatives = [(ssi, ssi_idx) for ssi, ssi_idx in all_combinations if not ((ssi, ssi_idx) in positives or (ssi[::-1], ssi_idx) in positives)]

        for similar_source_indices, ssi_idx in positives:
            # True sentence single/pair
            relevance = 1
            qid = example_idx * 10 + ssi_idx
            features = get_features(similar_source_indices, sent_term_matrix, article_sent_tokens, single_feat_len, pair_feat_len, mmr_all[ssi_idx])
            if features is None:
                continue
            # inst_id = ssi_idx_cur_inst_id[ssi_idx]
            instances.append(Lambdamart_Instance(features, relevance, qid, similar_source_indices))
            # ssi_idx_cur_inst_id[ssi_idx] += 1
            a = 0

            if balance:
                # False sentence single/pair
                is_pair = len(similar_source_indices) == 2
                if is_pair:
                    if len(random_negative_pairs) == 0:
                        continue
                    negative_indices = possible_pairs[random_negative_pairs.pop()]
                else:
                    if len(random_negative_singles) == 0:
                        continue
                    negative_indices = possible_singles[random_negative_singles.pop()]
                neg_relevance = 0
                neg_features = get_features(negative_indices, sent_term_matrix, article_sent_tokens, single_feat_len, pair_feat_len)
                if neg_features is None:
                    continue
                neg_lambdamart_str = format_to_lambdamart([neg_features, neg_relevance, qid, negative_indices])
                out_str += neg_lambdamart_str + '\n'
        if not balance:
            for negative_indices, ssi_idx in negatives:
                neg_relevance = 0
                qid = example_idx * 10 + ssi_idx
                neg_features = get_features(negative_indices, sent_term_matrix, article_sent_tokens, single_feat_len, pair_feat_len, mmr_all[ssi_idx])
                if neg_features is None:
                    continue
                # inst_id = ssi_idx_cur_inst_id[ssi_idx]
                instances.append(Lambdamart_Instance(neg_features, neg_relevance, qid, negative_indices))
                # ssi_idx_cur_inst_id[ssi_idx] += 1

    sorted_instances = sorted(instances, key=lambda x: (x.qid, x.source_indices))
    assign_inst_ids(sorted_instances)
    if lr:
        return sorted_instances
    else:
        for instance in sorted_instances:
            lambdamart_str = format_to_lambdamart(instance, single_feat_len)
            out_str += lambdamart_str + '\n'
        # print out_str
        return out_str
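
# Hedged sketch (an assumption about assign_inst_ids, which is defined elsewhere): after
# sorting by (qid, source_indices), instances are given consecutive ids within each query
# so that ranker output scores can be mapped back to their sentence combinations.
def assign_inst_ids_sketch(sorted_instance_list):
    from collections import defaultdict
    next_id = defaultdict(int)  # qid -> next instance id to hand out
    for inst in sorted_instance_list:
        inst.inst_id = next_id[inst.qid]
        next_id[inst.qid] += 1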