def get_opinosis_df(test_df, dev_df, config):
    """Score Opinosis system summaries with ROUGE.

    Grid-searches the Opinosis hyperparameters (max summary length,
    redundancy, gap) on the dev split by mean ROUGE-L, then evaluates the
    best run on the test split.

    Args:
        test_df: test-split DataFrame with a 'summary' reference column.
        dev_df: dev-split DataFrame, same schema as test_df.
        config: provides .data (dataset name used in eval paths) and
            .path_opinosis (pickle output path).

    Returns:
        Copy of test_df with 'rouge1', 'rouge2', 'rougeL' columns added;
        the result is also pickled to config.path_opinosis.
    """

    def _read_summaries(df, split, run_id):
        # One '<idx>.<run_id>.system' file per row of df under the
        # eval/<data>/opinosis/<split>/output/<run_id>/ directory.
        dir_out = os.path.join('eval', config.data, 'opinosis', split,
                               'output', run_id)
        summaries = []
        for i in df.index:
            path = os.path.join(dir_out, '{0:03}.{1}.system'.format(i, run_id))
            with open(path, 'r') as f:
                summaries.append(refine_text(f.read()))
        return summaries

    opinosis_df = test_df.copy()

    # Grid search on dev: record mean ROUGE-L per hyperparameter combination.
    run_id_rougeL = {}
    for max_summary in [4, 5, 6]:
        for redundancy in [0, 1]:
            for gap in [2, 3, 4]:
                run_id = 'max-%i-red-%i-gap-%i' % (max_summary, redundancy, gap)
                summary_list = _read_summaries(dev_df, 'dev', run_id)
                run_id_rougeL[run_id] = compute_rouge(
                    dev_df, list(dev_df.summary), summary_list).mean()['rougeL']

    # Best dev run; max() breaks ties by insertion order, matching the
    # original stable descending sort of the items.
    run_id_test = max(run_id_rougeL, key=run_id_rougeL.get)

    summary_list = _read_summaries(opinosis_df, 'test', run_id_test)
    opinosis_df[['rouge1', 'rouge2', 'rougeL']] = compute_rouge(
        opinosis_df, list(opinosis_df.summary), summary_list)
    opinosis_df.to_pickle(config.path_opinosis)
    return opinosis_df
def get_denoisesum_df(config):
    """Load precomputed DenoiseSum outputs and attach ROUGE scores.

    Args:
        config: provides .path_denoisesum_raw, a JSON file with 'summary'
            (reference) and 'denoisesum' (system) columns.

    Returns:
        DataFrame with 'rouge1', 'rouge2', 'rougeL' columns added.
    """
    df = pd.read_json(config.path_denoisesum_raw)
    scores = compute_rouge(df,
                           reference_list=list(df.summary),
                           summary_list=list(df.denoisesum))
    df[['rouge1', 'rouge2', 'rougeL']] = scores
    return df
def get_lead_df(data_df, config):
    """Lead baseline: first sentence of each document, joined per row.

    Args:
        data_df: DataFrame with 'text' (documents separated by '</DOC>')
            and 'summary' (reference) columns.
        config: provides .path_lead (pickle output path).

    Returns:
        Copy of data_df with 'lead' and ROUGE columns added; also pickled
        to config.path_lead.
    """

    def _leading_sentences(text):
        # Take the first sentence of every '</DOC>'-separated document.
        return ' '.join(sent_tokenize(doc)[0].strip()
                        for doc in text.split('</DOC>'))

    lead_df = data_df.copy()
    lead_df['lead'] = lead_df.text.apply(_leading_sentences)
    lead_df[['rouge1', 'rouge2', 'rougeL']] = compute_rouge(
        lead_df,
        reference_list=list(lead_df.summary),
        summary_list=list(lead_df.lead))
    lead_df.to_pickle(config.path_lead)
    return lead_df
def get_meansum_df(config):
    """Load precomputed MeanSum outputs and attach ROUGE scores.

    Args:
        config: provides .path_meansum_raw (input pickle with 'summary'
            and 'meansum' columns) and .path_meansum (output pickle).

    Returns:
        DataFrame with 'rouge1', 'rouge2', 'rougeL' columns added; also
        pickled to config.path_meansum.
    """
    df = pd.read_pickle(config.path_meansum_raw)
    scores = compute_rouge(df,
                           reference_list=list(df.summary),
                           summary_list=list(df.meansum))
    df[['rouge1', 'rouge2', 'rougeL']] = scores
    df.to_pickle(config.path_meansum)
    return df
def get_recursum_df(sess, model, test_df, sys_sum, topk, threshold, truncate, summary_l, num_split):
    """Run RecurSum inference per business and attach summaries and ROUGE.

    Args:
        sess: session passed through to compute_topic_sents_probs.
        model: model whose config supplies batch_size, tree_idxs, topic_idxs.
        test_df: review-level DataFrame; grouped to one row per business_id.
        sys_sum: summary-extraction callable mapped over businesses.
        topk, threshold, truncate, summary_l: sys_sum hyperparameters;
            summary_l also selects which entry of each result to keep.
        num_split: number of worker processes for the sys_sum pool.

    Returns:
        test_df merged with 'recursum', 'summary_idxs', 'topic_sents',
        'topic_tokens', 'probs_topic' and ROUGE columns, index preserved.
    """
    # Collapse to one row per business_id. Assumes these fields are
    # identical across a business's reviews (takes the first) — TODO confirm.
    data_df = test_df.groupby('business_id').agg({
        'doc_l': lambda s: s.values[0],
        'sent_l': lambda s: s.values[0],
        'token_idxs': lambda s: s.values[0],
        'text': lambda s: s.values[0]
    })

    batches = get_batches(data_df, model.config.batch_size)
    topic_sents_list, probs_topic_list, topic_tokens_list = \
        compute_topic_sents_probs(sess, model, batches, mode='eval', sample=False)
    text_list = [row.text.replace('\n', '') for _, row in data_df.iterrows()]

    verbose = False
    args = [(model.config.tree_idxs, topic_sents, text, topk, threshold,
             truncate, summary_l, verbose)
            for topic_sents, text in zip(topic_sents_list, text_list)]
    # Context manager guarantees worker cleanup; the original close()d the
    # pool but never join()ed it.
    with multiprocessing.Pool(processes=num_split) as pool:
        summary_l_sents_list = pool.map(sys_sum, args)

    summary_list = [
        get_text_from_sents(summary_l_sents[summary_l]['sents'])
        for summary_l_sents in summary_l_sents_list
    ]
    # Map each selected topic index back through the model's topic_idxs table.
    summary_idxs_list = [
        [model.config.topic_idxs[topic_index]
         for topic_index in summary_l_sents[summary_l]['indices']]
        for summary_l_sents in summary_l_sents_list
    ]

    data_df['recursum'] = summary_list
    data_df['summary_idxs'] = summary_idxs_list
    data_df['topic_sents'] = topic_sents_list
    data_df['topic_tokens'] = topic_tokens_list
    data_df['probs_topic'] = probs_topic_list

    # Broadcast per-business results back onto the review-level rows.
    recursum_df = pd.merge(
        test_df,
        data_df[['recursum', 'summary_idxs', 'topic_sents', 'topic_tokens',
                 'probs_topic']],
        on='business_id', how='left')
    recursum_df[['rouge1', 'rouge2', 'rougeL']] = compute_rouge(
        recursum_df,
        reference_list=list(recursum_df.summary),
        summary_list=list(recursum_df.recursum))
    recursum_df = recursum_df.set_index(test_df.index)
    # Sanity check: the merge must not have reordered rows vs. test_df.
    assert recursum_df['business_id'].to_dict() == test_df['business_id'].to_dict()
    return recursum_df
def get_copycat_df(data_df, config):
    """Load CopyCat generated summaries, align them to data_df, score ROUGE.

    Args:
        data_df: DataFrame with 'business_id' and 'summary' columns.
        config: provides .data, .path_copycat_raw (JSON of generated
            summaries keyed by category) and .path_copycat (output pickle).

    Returns:
        data_df merged with a 'copycat' column and ROUGE columns, index
        preserved; also pickled to config.path_copycat.
    """
    # The raw Yelp dump anonymized some business ids as '#NAMEn'; map them
    # back to the real ids.
    business_id_dict = {
        '#NAME1': '-zbcosKSMGDhaZYN-CrcVA', '#NAME2': '-i3pCgQi_Y9NiSSWs6G7bw',
        '#NAME3': '-_TSaVr53qiEGqMkwyEMaQ', '#NAME4': '-vCLrTTgw6pBufdarW8ynA',
        '#NAME5': '-K3kqmykKlhlB4arCsLHOw', '#NAME6': '-exEWEQ3iSMVC-QUP_ycPQ',
        '#NAME7': '-_yEVC3_3M6YOsamYfNFEw', '#NAME8': '-NR4KqS6lHseNvJ-GFzfMA',
        '#NAME9': '-ot4Xd6GxSUOqwUj7okZuA', '#NAME10': '-pV9kWNoA9vyHfM_auYecA',
        '#NAME11': '-FNquqGseSCVMWo7KbK-Tg', '#NAME12': '-Qkx7W0itbAApcG5lJuMFQ',
        '#NAME13': '-SJcjOv88ZHjIU44U4vWTQ', '#NAME14': '-isxnIljKLVjc9qEhCiaGg',
        '#NAME15': '-iPc_YSSqvM1CpZxxeUTXw', '#NAME16': '-ADtl9bLp8wNqYX1k3KuxA',
        '#NAME17': '-zEpEmDfFQL-ph0N3BDlXA', '#NAME18': '-oOKqZbYDt08zaWWyLZNIw',
        '#NAME19': '-PbCfkydmvuNcG9VG_ixkQ', '#NAME20': '-pN44P-_PjRpcj4Rk2wMOg',
        '#NAME21': '-dcI8oWvxdMCGp00da8Ksg', '#NAME22': '-MKWJZnMjSit406AUKf7Pg'
    }

    # Use a context manager; the original json.load(open(...)) leaked the
    # file handle.
    with open(config.path_copycat_raw) as f:
        json_copycat = json.load(f)

    # Stack all per-category frames; the last top-level key is skipped, as
    # in the original ([:-1]).
    json_copycat_df = pd.concat([
        pd.DataFrame.from_dict(json_copycat[category], orient='index')
        for category in list(json_copycat)[:-1]
    ])
    if config.data == 'yelp':
        json_copycat_df.index = [
            business_id_dict[raw_id] if 'NAME' in raw_id else raw_id
            for raw_id in json_copycat_df.index
        ]

    # Flatten: one row per generated summary (a business may have several).
    copycat_rows = pd.DataFrame([
        {'business_id': index, 'copycat': ' '.join(copycat_summary)}
        for index, copycat_summary_list in json_copycat_df.gen_summ.to_dict().items()
        for copycat_summary in copycat_summary_list
    ])
    copycat_df = pd.merge(data_df, copycat_rows, how='left')
    copycat_df[['rouge1', 'rouge2', 'rougeL']] = compute_rouge(
        copycat_df,
        reference_list=list(copycat_df.summary),
        summary_list=list(copycat_df.copycat))
    copycat_df.index = data_df.index
    assert len(data_df) == len(copycat_df)
    copycat_df.to_pickle(config.path_copycat)
    return copycat_df
def get_lexrank_df(data_df, n_sents, min_sent_l, config):
    """LexRank-style extractive baseline over per-row tf-idf sentence vectors.

    NOTE(review): sentences are ranked by the sorted eigenVALUES of the
    cosine-similarity matrix, not by eigenvector centrality as in standard
    LexRank — confirm this is intended.

    Args:
        data_df: DataFrame with 'tfidfbows' (sparse per-sentence tf-idf),
            'sent_l', 'token_idxs', and 'summary' columns.
        n_sents: number of sentences to extract per row.
        min_sent_l: keep only sentences strictly longer than this.
        config: provides .path_lexrank (output pickle) and is forwarded
            to idxs_to_sents.

    Returns:
        Copy of data_df with ranking columns, 'lexrank', and ROUGE columns
        added; also pickled to config.path_lexrank.
    """

    def _rank_by_eig(tfidfbows):
        # Eigen-decompose the sentence cosine-similarity matrix and order
        # sentence indices by descending eigenvalue.
        dense = tfidfbows.toarray()
        sim = cosine_similarity(dense, dense)
        values, _ = np.linalg.eig(sim)
        return np.argsort(values)[::-1]

    def _select(row):
        # Keep ranked indices that pass the length filter, top n_sents.
        ranked = [i for i in row.eig_indices if i in row.length_indices]
        return ranked[:n_sents]

    lexrank_df = data_df.copy()
    lexrank_df['eig_indices'] = lexrank_df['tfidfbows'].apply(_rank_by_eig)
    lexrank_df['length_indices'] = data_df.sent_l.apply(
        lambda sent_l: np.where(np.array(sent_l) > min_sent_l)[0])
    lexrank_df['summary_indices'] = lexrank_df.apply(_select, 1)
    lexrank_df['lexrank'] = lexrank_df.apply(
        lambda row: get_text_from_sents(
            idxs_to_sents(np.array(row.token_idxs)[row.summary_indices], config)),
        1)
    lexrank_df[['rouge1', 'rouge2', 'rougeL']] = compute_rouge(
        lexrank_df,
        reference_list=list(lexrank_df.summary),
        summary_list=list(lexrank_df.lexrank))
    lexrank_df.to_pickle(config.path_lexrank)
    return lexrank_df