import numpy as np

# CorpusReader, PROCESSED_PATH, readSummaries, aggregateScores and sent2tokens
# are assumed to come from the repo's own modules.


def readDatasets(datasets, samples=9999):
    ### read documents and ref. summaries
    summaries = []
    targets = []
    groups = []
    sentences_of_topics = {}
    sorted_idxs_list = []
    models_list = []
    docs_list = []
    topic_list = []
    for dataset in datasets:
        reader = CorpusReader(PROCESSED_PATH)
        data = reader.get_data(dataset)
        sample_num = samples
        ml = []
        dl = []
        tl = []
        ### read data
        for topic, docs, models in data:
            print('read DATA {}, TOPIC {}'.format(dataset, topic))
            summs, ref_values_dic = readSummaries(dataset, topic,
                                                  'ground_truth', sample_num)
            ref_rewards = aggregateScores(ref_values_dic)
            summs = summs[:len(ref_rewards)]
            print('num of summaries read: {}'.format(len(summs)))
            sentences = [sent2tokens(sentence, 'english')
                         for _, doc in docs for sentence in doc]
            sentences_of_topics[topic] = sentences
            ml.append(models)
            dl.append(docs)
            tl.append(topic)
            # rank this topic's summaries by their reference reward
            sorted_idxs = np.argsort(np.array(ref_rewards))
            sorted_idxs_list.extend(sorted_idxs)
            summaries.extend(summs)
            targets.extend(ref_rewards)
            groups.extend(['{}-{}'.format(dataset, topic)] * len(summs))
        models_list.append(ml)
        docs_list.append(dl)
        topic_list.append(tl)
    summaries = np.array(summaries)
    targets = np.array(targets)
    groups = np.array(groups)
    return summaries, targets, groups, models_list, docs_list, topic_list, \
        sentences_of_topics, sorted_idxs_list
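# Hedged usage sketch, not part of the original repo: one way to drive
# readDatasets. The dataset list and sample cap are illustrative; the
# unpacking order mirrors the return statement above.
if __name__ == '__main__':
    (summaries, targets, groups, models_list, docs_list, topic_list,
     sentences_of_topics, sorted_idxs_list) = readDatasets(['DUC2001'],
                                                           samples=100)
    print('{} summaries across {} dataset-topic groups'.format(
        len(summaries), len(set(groups))))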
def build_duc_vocabulary():
    datasets = ['DUC2001', 'DUC2002', 'DUC2004']
    sample_num = 9999
    cv_fold_num = 10       # unused below, kept from the original setup
    validation_size = 0.1  # unused below, kept from the original setup
    ### read documents and ref. summaries
    reader = CorpusReader(PROCESSED_PATH)
    vocabulary = set()
    for dataset in datasets:
        data = reader.get_data(dataset)
        ### read data
        for topic, docs, models in data:
            print('read DATA {}, TOPIC {}'.format(dataset, topic))
            summs, ref_values_dic = readSummaries(dataset, topic, 'rouge',
                                                  sample_num)
            sentences = [sent2tokens(sentence, 'english')
                         for _, doc in docs for sentence in doc]
            vocabulary.update(
                [token for sentence in sentences for token in sentence])
    return vocabulary
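# Hedged usage sketch: build the vocabulary over all three DUC corpora and
# report its size. Assumes the project's CorpusReader, readSummaries and
# sent2tokens helpers are importable, as in the snippets above.
if __name__ == '__main__':
    vocabulary = build_duc_vocabulary()
    print('vocabulary size: {}'.format(len(vocabulary)))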
import os
from collections import OrderedDict

# data, dataset, sample_num and out_base are set earlier in this script.
all_test_reward_dic = OrderedDict()  # unused below, kept from the original setup
topic_cnt = 0
summaries = []
targets = []
groups = []
models_list = []
docs_list = []

### read data
infersent = InfersentRewardGenerator()
for topic, docs, models in data:
    if topic != 'd112h':  # debugging filter: only process this one topic
        continue
    print('read DATA {}, TOPIC {}'.format(dataset, topic))
    summs, ref_values_dic = readSummaries(dataset, topic, 'rouge', sample_num)
    values = infersent(summs, docs)
    assert len(values) == len(summs)
    # write one '<summary>\t<reward>' line per summary
    out_str = ''
    for ii, vv in enumerate(values):
        out_str += '{}\t{}\n'.format(summs[ii], vv)
    if not os.path.exists(os.path.join(out_base, topic)):
        os.makedirs(os.path.join(out_base, topic))
    fpath = os.path.join(out_base, topic, 'infersent_heuristic')
    with open(fpath, 'w') as ff:
        ff.write(out_str)
            cos_list.append(cosine_similarity(ntf.reshape(1, -1),
                                              tf_idf_matrix[jj, :]))
        # aggregate the per-document cosine similarities into four features
        summary_features[ii].append(np.min(cos_list))
        summary_features[ii].append(np.max(cos_list))
        summary_features[ii].append(np.mean(cos_list))
        summary_features[ii].append(np.std(cos_list))
    return np.array(summary_features)


if __name__ == '__main__':
    dataset = 'DUC2001'
    summary_len = 100
    reader = CorpusReader(PROCESSED_PATH)
    data = reader.get_data(dataset, summary_len)
    topic_cnt = 0
    for topic, docs, models in data:
        topic_cnt += 1
        summaries, heuristic_values_list = readSummaries(dataset, topic,
                                                         'heuristic')
        print('num of summaries read: {}'.format(len(summaries)))
        vec = CrossTopicNgramVectoriser(docs)
        features = vec(summaries)
        print('features.shape {}'.format(features.shape))
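# Standalone sketch of the cosine-feature aggregation used above: compare one
# summary's TF-IDF vector against each document row and keep min/max/mean/std
# as four features. The matrices here are synthetic stand-ins.
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

rng = np.random.RandomState(0)
tf_idf_matrix = rng.rand(5, 50)   # 5 documents over a 50-term vocabulary
ntf = rng.rand(50)                # one summary vector in the same space

cos_list = [cosine_similarity(ntf.reshape(1, -1),
                              tf_idf_matrix[jj].reshape(1, -1))[0, 0]
            for jj in range(tf_idf_matrix.shape[0])]
print([np.min(cos_list), np.max(cos_list), np.mean(cos_list), np.std(cos_list)])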
import itertools

import numpy as np
from scipy.stats import pearsonr


def correlation(features, sizes):
    # build one column name per feature dimension, plus the target column
    names = []
    for f, s in zip(features, sizes):
        if s > 1:
            for i in range(s):
                names.append(f + str(i))
        else:
            names.append(f)
    names.append('rouge_reward')

    dataset = 'DUC2001'  ## DUC2001, DUC2002, DUC2004
    sample_num = 9999
    bin_num = 20      # unused below, kept from the original setup
    cv_fold_num = 10  # unused below, kept from the original setup
    ### read documents and ref. summaries
    reader = CorpusReader(PROCESSED_PATH)
    data = reader.get_data(dataset)
    topic_cnt = 0
    summaries = []
    groups = []
    models_list = []
    docs_list = []
    targets = []
    ### read data
    for topic, docs, models in data:
        print('read DATA {}, TOPIC {}'.format(dataset, topic))
        summs, ref_values_dic = readSummaries(dataset, topic, 'rouge',
                                              sample_num)
        print('num of summaries read: {}'.format(len(summs)))
        ref_rewards = aggregateScores(ref_values_dic)
        models_list.append(models)
        docs_list.append(docs)
        summaries.extend(summs)
        groups.extend([topic] * len(summs))
        targets.extend(ref_rewards)
        topic_cnt += 1
    allFeatures = readFeatures(features, dataset, np.array(summaries), groups,
                               set(groups))
    allFeatures = np.c_[allFeatures, np.array(targets)]

    correlations = {}
    threshold_correlation = {}
    for col1, col2 in itertools.combinations(range(len(names)), 2):
        pcc = pearsonr(allFeatures[:, col1], allFeatures[:, col2])[0]
        correlations[names[col1] + ' ' + names[col2] + ': pcc = '] = pcc
        # store the reversed pair as well, for ease of reading
        correlations[names[col2] + ' ' + names[col1] + ': pcc = '] = pcc
        if pcc < -0.8:
            threshold_correlation[names[col1] + ' ' + names[col2] + ': pcc = '] = pcc
            threshold_correlation[names[col2] + ' ' + names[col1] + ': pcc = '] = pcc
    #for key in sorted(correlations.keys()):
    #    print(key + str(correlations[key]))
    print('Pairs with pcc < -0.8')
    for key in sorted(threshold_correlation.keys()):
        print(key + str(threshold_correlation[key]))
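# Toy illustration of the pairwise-PCC scan in correlation(), on a synthetic
# three-column feature matrix. pearsonr returns (coefficient, p-value); only
# the coefficient is kept, mirroring the [0] index above.
import itertools
import numpy as np
from scipy.stats import pearsonr

X = np.column_stack([np.arange(10.0),
                     -np.arange(10.0),  # perfectly anti-correlated with col 0
                     np.random.RandomState(1).rand(10)])
cols = ['f0', 'f1', 'noise']
for i, j in itertools.combinations(range(X.shape[1]), 2):
    print('{} vs {}: pcc = {:.3f}'.format(
        cols[i], cols[j], pearsonr(X[:, i], X[:, j])[0]))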
import os

dataset = 'DUC2001'  ## DUC2001, DUC2002, DUC2004
sample_num = 9999
out_base = os.path.join(FEATURE_DIR, dataset)
if not os.path.exists(out_base):
    os.makedirs(out_base)

### read documents and ref. summaries
reader = CorpusReader(PROCESSED_PATH)
data = reader.get_data(dataset)

### read data
rewarder = WordEmbeddingRewarder()
for topic, docs, models in data:
    print('read DATA {}, TOPIC {}'.format(dataset, topic))
    summs, _ = readSummaries(dataset, topic, 'rouge', sample_num)
    values = rewarder(docs, summs)
    # the rewarder returns a list of reward lists; values[0] is written below
    assert len(values[0]) == len(summs)
    out_str = ''
    for ii, vv in enumerate(values[0]):
        out_str += '{}\t{}\n'.format(summs[ii], vv)
    if not os.path.exists(os.path.join(out_base, topic)):
        os.makedirs(os.path.join(out_base, topic))
    fpath = os.path.join(out_base, topic, 'dss')
    with open(fpath, 'w') as ff:
        ff.write(out_str)
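# Hedged sketch: reading back one of the tab-separated reward files written
# above (works for both the 'dss' and 'infersent_heuristic' outputs). Each
# line is '<summary>\t<value>'; splitting on the last tab keeps summaries
# that themselves contain tabs intact. read_reward_file is a hypothetical
# helper, not part of the original repo.
def read_reward_file(fpath):
    rewards = []
    with open(fpath) as ff:
        for line in ff:
            summ, value = line.rstrip('\n').rsplit('\t', 1)
            rewards.append((summ, float(value)))
    return rewards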