def main():
    """Score upper-bound extractive summaries (UB1/UB2) for every topic file.

    Reads every file under ``<iobasedir>/processed/downloads/<data_set>``,
    loads its documents and reference summaries, and logs ROUGE scores for
    the UB1/UB2 oracle algorithms.
    """
    args = get_args()
    # BUG FIX: the original called get_summary_scores with undefined globals
    # `language` and `rouge`; mirror the sibling mains, which pass
    # args.language and a Rouge instance built from the bundled release.
    rouge_dir = os.path.join(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        'rouge/RELEASE-1.5.5/')

    data_path = os.path.join(args.iobasedir, 'processed/downloads',
                             args.data_set)
    log_path = os.path.join(args.iobasedir, 'logs')
    log_file = os.path.join(args.iobasedir, 'logs', 'UB.log')
    mkdirp(log_path)
    set_logger(log_file)

    for filename in os.listdir(data_path):
        data_file = os.path.join(data_path, filename)
        # Topic ID is the filename minus a 5-char extension (e.g. ".json")
        # — TODO confirm against the data layout.
        topic = filename[:-5]

        docs, refs = load_data(data_file)
        if not refs:
            continue

        # Default summary budget: word count of the first reference summary.
        if not args.summary_size:
            summary_size = len(' '.join(refs[0]).split(' '))
        else:
            summary_size = int(args.summary_size)

        logger.info('Topic ID: %s ', topic)
        logger.info('###')
        logger.info('Summmary_len: %d', summary_size)

        rouge = Rouge(rouge_dir)
        algos = ['UB1', 'UB2']
        for algo in algos:
            get_summary_scores(algo, docs, refs, summary_size, args.language,
                               rouge)

        rouge._cleanup()
        logger.info('###')
# --- Example #2 (scraped separator: "Пример #2", vote count 0) ---
def main():
    """Run baseline + oracle summarizers over a review-summarization CSV.

    Loads ``test0.csv`` from the configured split, treats each row's review
    sentences as the document and its summary sentences as the reference,
    and logs the best ROUGE score across the baseline algorithms.
    """
    args = get_args()
    rouge_dir = os.path.join(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        'rouge/RELEASE-1.5.5/')

    data_path = os.path.join(args.iobasedir, 'processed/', args.dataset,
                             args.domain, args.split)
    log_path = os.path.join(args.iobasedir, 'logs')
    log_file = os.path.join(
        args.iobasedir, 'logs', 'baselines_rsumm_%s_%s_%s_%s.log' %
        (args.dataset, args.domain, args.split, str(args.summary_size)))
    mkdirp(log_path)
    set_logger(log_file)

    data_file = os.path.join(data_path, 'test0.csv')
    # header=None + skiprows=1: discard the file's own header row and
    # apply our explicit column names instead.
    df = pd.read_csv(data_file,
                     sep=",",
                     quotechar='"',
                     engine='python',
                     header=None,
                     skiprows=1,
                     names=[
                         "user_id", "product_id", "rating", "review", "nouns",
                         "summary", 'time'
                     ])

    for index, row in df.iterrows():
        topic = row['user_id'] + '_' + row['product_id']
        # One single-sentence "document" per review sentence.
        # NOTE(review): assumes 'review'/'summary' cells are non-null strings
        # — a NaN here would raise AttributeError; verify upstream cleaning.
        docs = [[sent] for sent in sent_tokenize(row['review'].strip())]
        refs = [sent_tokenize(row['summary'].strip())]
        # BUG FIX: `refs` is always a non-empty list (it wraps the token
        # list), so the original `if not refs` never fired. Guard on the
        # tokenized summary itself.
        if not refs[0]:
            continue

        # Default summary budget: word count of the reference summary.
        if not args.summary_size:
            summary_size = len(" ".join(refs[0]).split(' '))
        else:
            summary_size = int(args.summary_size)

        logger.info('Topic ID: %s', topic)
        logger.info('###')
        logger.info('Summmary_len: %d', summary_size)

        rouge = Rouge(rouge_dir)
        algos = [
            'Luhn', 'LexRank', 'TextRank', 'LSA', 'KL', "ICSI", 'UB1', 'UB2'
        ]
        # Track the best-scoring summary across all algorithms for this row.
        best_summary = []
        best_score = 0.0
        for algo in algos:
            best_summary, best_score = get_summary_scores(
                algo, docs, refs, summary_size, args.language, rouge,
                best_summary, best_score)

        rouge._cleanup()
        logger.info('###')
# --- Example #3 (scraped separator: "Пример #3", vote count 0) ---
def main():
    """Run baseline and oracle summarizers over every topic file in a dataset.

    Iterates the files under ``<iobasedir>/<data_setpath>``, scoring each
    topic's documents against its reference summaries with ROUGE.
    """
    args = get_args()
    rouge_dir = os.path.join(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        'rouge/RELEASE-1.5.5/')

    data_path = os.path.join(args.iobasedir, args.data_setpath)
    log_path = os.path.join(args.iobasedir, 'logs')
    log_file = os.path.join(
        args.iobasedir, 'logs',
        'baselines_%s_%s.log' % (args.data_set, args.summary_size))
    mkdirp(log_path)
    set_logger(log_file)

    for filename in os.listdir(data_path):
        data_file = os.path.join(data_path, filename)
        # Topic ID is the filename minus a 5-char extension (e.g. ".json")
        # — TODO confirm against the data layout.
        topic = filename[:-5]

        # BUG FIX: the original `except: pass` left docs/refs undefined on
        # the first failure (NameError) or stale from the previous topic
        # (silently scoring the wrong data). Log and skip the bad file.
        try:
            docs, refs = load_data(data_file)
        except Exception:
            logger.exception('Failed to load %s; skipping', data_file)
            continue
        if not refs:
            continue

        # Default summary budget: word count of the first reference summary.
        if not args.summary_size:
            summary_size = len(" ".join(refs[0]).split(' '))
        else:
            summary_size = int(args.summary_size)

        logger.info('Topic ID: %s', topic)
        logger.info('###')
        logger.info('Summmary_len: %d', summary_size)

        rouge = Rouge(rouge_dir)
        algos = ['UB1', 'UB2', 'ICSI', 'Luhn', 'LexRank', 'LSA', 'KL']
        for algo in algos:
            get_summary_scores(algo, docs, refs, summary_size, args.language,
                               rouge)
        rouge._cleanup()
        logger.info('###')
# --- Example #4 (scraped separator: "Пример #4", vote count 0) ---
def main():
    """CLI entry point for building the summarization corpus.

    Modes:
      fetch_urls   — collect article URLs into the raw data tree.
      download     — download articles into data/<data_type>/downloads/<corpus>.
      archive_urls — archive URLs from the processed tree.
    """
    parser = argparse.ArgumentParser(
        description='Generate the Summarization Corpus')
    parser.add_argument('--corpus', choices=['bbc', 'guardian'], required=True)
    parser.add_argument('--data_type', choices=['raw', 'processed'])
    parser.add_argument('--mode',
                        choices=['fetch_urls', 'download', 'archive_urls'],
                        required=True)
    parser.add_argument('--request_parallelism', type=int, default=1)
    args = parser.parse_args()

    if args.mode == 'fetch_urls':
        # URL fetching always targets the raw tree.
        data_path = path.join(base_dir, 'data/raw/')
        FetchMode(data_path, args.corpus)
    elif args.mode == 'download':
        # BUG FIX: --data_type has no default, so omitting it produced the
        # bogus path 'data/None/'. Fail fast with a usage error instead.
        if args.data_type is None:
            parser.error('--data_type is required when --mode=download')
        data_path = path.join(base_dir, 'data/%s/' % (args.data_type))
        download_path = path.join(data_path, 'downloads/%s' % (args.corpus))
        print("Download Path:", download_path)
        if not os.path.isdir(download_path):
            mkdirp(download_path)
        DownloadMode(data_path, args.corpus)
    elif args.mode == 'archive_urls':
        UrlMode(path.join(base_dir, 'data/processed/'), args.corpus,
                args.request_parallelism)