Example #1
0
def get_opinosis_df(test_df, dev_df, config):
    """Tune Opinosis hyper-parameters on the dev split and score the test split.

    Grid-searches (max_summary, redundancy, gap) by ROUGE-L on the dev set,
    then loads the pre-generated test-set system summaries of the best run
    and attaches ROUGE scores to a copy of ``test_df``.

    Args:
        test_df: test split; its index selects the ``NNN.<run_id>.system`` files.
        dev_df: dev split used for hyper-parameter selection.
        config: provides ``data`` (dataset name) and ``path_opinosis``
            (output pickle path).

    Returns:
        Copy of ``test_df`` with 'rouge1', 'rouge2', 'rougeL' columns added;
        the result is also pickled to ``config.path_opinosis``.
    """
    opinosis_df = test_df.copy()

    def _load_summaries(split_dir, index, run_id):
        # One pre-generated system-summary file per dataframe row.
        summaries = []
        for i in index:
            path = os.path.join(split_dir,
                                '{0:03}.{1}.system'.format(i, run_id))
            with open(path, 'r') as f:
                summaries.append(refine_text(f.read()))
        return summaries

    # Grid-search over the Opinosis hyper-parameters on the dev set.
    run_id_rougeL = {}
    for max_summary in [4, 5, 6]:
        for redundancy in [0, 1]:
            for gap in [2, 3, 4]:
                run_id = 'max-%i-red-%i-gap-%i' % (max_summary, redundancy,
                                                   gap)
                dir_dev = os.path.join('eval', config.data, 'opinosis', 'dev',
                                       'output', run_id)
                summary_list = _load_summaries(dir_dev, dev_df.index, run_id)
                rougeL = compute_rouge(dev_df, list(dev_df.summary),
                                       summary_list).mean()['rougeL']
                run_id_rougeL[run_id] = rougeL

    # Pick the run with the highest dev ROUGE-L
    # (max() replaces the original sort-descending-and-take-first).
    run_id_test = max(run_id_rougeL, key=run_id_rougeL.get)
    dir_test = os.path.join('eval', config.data, 'opinosis', 'test', 'output',
                            run_id_test)
    summary_list = _load_summaries(dir_test, opinosis_df.index, run_id_test)

    opinosis_df[['rouge1', 'rouge2', 'rougeL']] = compute_rouge(
        opinosis_df, list(opinosis_df.summary), summary_list)
    opinosis_df.to_pickle(config.path_opinosis)
    return opinosis_df
Example #2
0
def get_denoisesum_df(config):
    """Load the raw DenoiseSum output and attach ROUGE scores.

    Reads the JSON at ``config.path_denoisesum_raw`` and adds
    'rouge1'/'rouge2'/'rougeL' columns computed against the reference
    summaries.
    """
    df = pd.read_json(config.path_denoisesum_raw)
    rouge_scores = compute_rouge(df,
                                 reference_list=list(df.summary),
                                 summary_list=list(df.denoisesum))
    df[['rouge1', 'rouge2', 'rougeL']] = rouge_scores
    return df
Example #3
0
def get_lead_df(data_df, config):
    """Build a lead-sentence baseline summary per row and score it with ROUGE.

    The 'lead' summary is the first sentence of every '</DOC>'-separated
    document in ``text``, joined by spaces. The scored dataframe is pickled
    to ``config.path_lead`` and returned.
    """
    lead_df = data_df.copy()

    def _leading_sentences(text):
        # First sentence of each constituent document, space-joined.
        return ' '.join(sent_tokenize(doc)[0].strip()
                        for doc in text.split('</DOC>'))

    lead_df['lead'] = lead_df.text.apply(_leading_sentences)
    lead_df[['rouge1', 'rouge2', 'rougeL']] = compute_rouge(
        lead_df,
        reference_list=list(lead_df.summary),
        summary_list=list(lead_df.lead))
    lead_df.to_pickle(config.path_lead)
    return lead_df
Example #4
0
def get_meansum_df(config):
    """Load the raw MeanSum output pickle and attach ROUGE scores.

    Scores the 'meansum' summaries against the reference 'summary' column,
    pickles the result to ``config.path_meansum`` and returns it.
    """
    df = pd.read_pickle(config.path_meansum_raw)
    rouge_scores = compute_rouge(df,
                                 reference_list=list(df.summary),
                                 summary_list=list(df.meansum))
    df[['rouge1', 'rouge2', 'rougeL']] = rouge_scores
    df.to_pickle(config.path_meansum)
    return df
Example #5
0
def get_recursum_df(sess, model, test_df, sys_sum, topk, threshold, truncate,
                    summary_l, num_split):
    """Generate RecurSum summaries per business and attach ROUGE scores.

    Runs the model over one representative row per business, extracts
    topic sentences, builds summaries of length ``summary_l`` in parallel
    via ``sys_sum``, then merges everything back onto ``test_df`` rows and
    scores with ROUGE.

    Args:
        sess, model: TF session and model (provide config/batch settings).
        test_df: per-review dataframe with a 'business_id' column.
        sys_sum: picklable summary-construction function run in worker
            processes.
        topk, threshold, truncate, summary_l: summary-selection parameters
            forwarded to ``sys_sum``.
        num_split: number of worker processes.

    Returns:
        ``test_df`` merged with summary/topic columns and
        'rouge1'/'rouge2'/'rougeL', re-indexed to ``test_df.index``.
    """
    # One representative row per business: all rows of a business share
    # these fields, so taking the first value of each group suffices.
    data_df = test_df.groupby('business_id').agg({
        'doc_l': lambda s: s.values[0],
        'sent_l': lambda s: s.values[0],
        'token_idxs': lambda s: s.values[0],
        'text': lambda s: s.values[0]
    })

    batches = get_batches(data_df, model.config.batch_size)
    topic_sents_list, probs_topic_list, topic_tokens_list = compute_topic_sents_probs(
        sess, model, batches, mode='eval', sample=False)
    text_list = [row.text.replace('\n', '') for _, row in data_df.iterrows()]
    verbose = False

    args = [(model.config.tree_idxs, topic_sents, text, topk, threshold,
             truncate, summary_l, verbose)
            for topic_sents, text in zip(topic_sents_list, text_list)]
    # Context manager guarantees worker cleanup even if sys_sum raises
    # (the original closed the pool but never joined it).
    with multiprocessing.Pool(processes=num_split) as pool:
        summary_l_sents_list = pool.map(sys_sum, args)

    summary_list = [
        get_text_from_sents(summary_l_sents[summary_l]['sents'])
        for summary_l_sents in summary_l_sents_list
    ]
    # Map model-internal topic indices back to config-level topic ids.
    summary_idxs_list = [[
        model.config.topic_idxs[topic_index]
        for topic_index in summary_l_sents[summary_l]['indices']
    ] for summary_l_sents in summary_l_sents_list]

    data_df['recursum'] = summary_list
    data_df['summary_idxs'] = summary_idxs_list
    data_df['topic_sents'] = topic_sents_list
    data_df['topic_tokens'] = topic_tokens_list
    data_df['probs_topic'] = probs_topic_list

    recursum_df = pd.merge(test_df,
                           data_df[[
                               'recursum', 'summary_idxs', 'topic_sents',
                               'topic_tokens', 'probs_topic'
                           ]],
                           on='business_id',
                           how='left')
    recursum_df[['rouge1', 'rouge2', 'rougeL']] = compute_rouge(
        recursum_df,
        reference_list=list(recursum_df.summary),
        summary_list=list(recursum_df.recursum))
    recursum_df = recursum_df.set_index(test_df.index)
    # Sanity check: the merge must not have reordered or dropped rows.
    assert recursum_df['business_id'].to_dict() == test_df['business_id'].to_dict()
    return recursum_df
Example #6
0
def get_copycat_df(data_df, config):
    """Load CopyCat system summaries, align them to businesses, score with ROUGE.

    Reads the raw CopyCat JSON, de-anonymizes Yelp business ids, merges one
    row per generated summary onto ``data_df``, attaches
    'rouge1'/'rouge2'/'rougeL' columns, and pickles the result to
    ``config.path_copycat``.
    """
    # Mapping from anonymized '#NAMEn' placeholders back to Yelp business ids.
    business_id_dict = {
        '#NAME1': '-zbcosKSMGDhaZYN-CrcVA',
        '#NAME2': '-i3pCgQi_Y9NiSSWs6G7bw',
        '#NAME3': '-_TSaVr53qiEGqMkwyEMaQ',
        '#NAME4': '-vCLrTTgw6pBufdarW8ynA',
        '#NAME5': '-K3kqmykKlhlB4arCsLHOw',
        '#NAME6': '-exEWEQ3iSMVC-QUP_ycPQ',
        '#NAME7': '-_yEVC3_3M6YOsamYfNFEw',
        '#NAME8': '-NR4KqS6lHseNvJ-GFzfMA',
        '#NAME9': '-ot4Xd6GxSUOqwUj7okZuA',
        '#NAME10': '-pV9kWNoA9vyHfM_auYecA',
        '#NAME11': '-FNquqGseSCVMWo7KbK-Tg',
        '#NAME12': '-Qkx7W0itbAApcG5lJuMFQ',
        '#NAME13': '-SJcjOv88ZHjIU44U4vWTQ',
        '#NAME14': '-isxnIljKLVjc9qEhCiaGg',
        '#NAME15': '-iPc_YSSqvM1CpZxxeUTXw',
        '#NAME16': '-ADtl9bLp8wNqYX1k3KuxA',
        '#NAME17': '-zEpEmDfFQL-ph0N3BDlXA',
        '#NAME18': '-oOKqZbYDt08zaWWyLZNIw',
        '#NAME19': '-PbCfkydmvuNcG9VG_ixkQ',
        '#NAME20': '-pN44P-_PjRpcj4Rk2wMOg',
        '#NAME21': '-dcI8oWvxdMCGp00da8Ksg',
        '#NAME22': '-MKWJZnMjSit406AUKf7Pg'
    }

    # Bug fix: json.load(open(...)) leaked the file handle.
    with open(config.path_copycat_raw) as f:
        json_copycat = json.load(f)
    # All but the last top-level key are concatenated; the last key is
    # deliberately skipped (presumably not a summary category — verify
    # against the raw file's schema).
    json_copycat_df = pd.concat([
        pd.DataFrame.from_dict(json_copycat[category], orient='index')
        for category in list(json_copycat)[:-1]
    ])
    if config.data == 'yelp':
        # De-anonymize the '#NAMEn' placeholders used in the Yelp release.
        json_copycat_df.index = [
            business_id_dict[raw_id] if 'NAME' in raw_id else raw_id
            for raw_id in json_copycat_df.index
        ]

    # One merge row per (business, generated summary); each summary's
    # sentence list is joined into a single string.
    copycat_rows = [{
        'business_id': index,
        'copycat': ' '.join(copycat_summary)
    } for index, copycat_summary_list in
      json_copycat_df.gen_summ.to_dict().items()
      for copycat_summary in copycat_summary_list]
    copycat_df = pd.merge(data_df, pd.DataFrame(copycat_rows), how='left')
    copycat_df[['rouge1', 'rouge2', 'rougeL']] = compute_rouge(
        copycat_df,
        reference_list=list(copycat_df.summary),
        summary_list=list(copycat_df.copycat))
    copycat_df.index = data_df.index
    assert len(data_df) == len(copycat_df)
    copycat_df.to_pickle(config.path_copycat)
    return copycat_df
Example #7
0
def get_lexrank_df(data_df, n_sents, min_sent_l, config):
    """Build a LexRank-style extractive baseline and attach ROUGE scores.

    Ranks each row's sentences by the eigenvalues of their pairwise
    cosine-similarity matrix, keeps the top ``n_sents`` whose length
    exceeds ``min_sent_l``, decodes them back to text, scores with ROUGE,
    and pickles the result to ``config.path_lexrank``.
    """
    def _rank_sentences(tfidfbows):
        # Sentence indices ordered by descending eigenvalue of the
        # similarity matrix.
        # NOTE(review): classic LexRank centrality uses the principal
        # eigenVECTOR, not the eigenvalues — confirm this ordering is
        # intentional before reusing elsewhere.
        dense = tfidfbows.toarray()
        eig_values, _ = np.linalg.eig(cosine_similarity(dense, dense))
        return np.argsort(eig_values)[::-1]

    lexrank_df = data_df.copy()
    lexrank_df['eig_indices'] = lexrank_df['tfidfbows'].apply(_rank_sentences)
    # Indices of sentences long enough to be eligible for the summary.
    lexrank_df['length_indices'] = data_df.sent_l.apply(
        lambda lengths: np.where(np.array(lengths) > min_sent_l)[0])

    def _select(row):
        eligible = [i for i in row.eig_indices if i in row.length_indices]
        return eligible[:n_sents]

    lexrank_df['summary_indices'] = lexrank_df.apply(_select, 1)
    lexrank_df['lexrank'] = lexrank_df.apply(
        lambda row: get_text_from_sents(
            idxs_to_sents(np.array(row.token_idxs)[row.summary_indices],
                          config)), 1)
    lexrank_df[['rouge1', 'rouge2', 'rougeL']] = compute_rouge(
        lexrank_df,
        reference_list=list(lexrank_df.summary),
        summary_list=list(lexrank_df.lexrank))
    lexrank_df.to_pickle(config.path_lexrank)
    return lexrank_df