Exemplo n.º 1
0
    def run(self, args):
        """Download the shared archive and run each setup whose inputs are present.

        The SDS setup always runs on the downloaded archive. The MDS setup and
        the MDS metrics setup only run when every zip file they need was
        supplied on ``args``; otherwise a notice is printed and that step is
        skipped.

        Args:
            args: Parsed command-line arguments. Reads ``output_dir``,
                ``mds_documents_zip``, ``mds_model_summaries_zip``,
                ``mds_peer_summaries_zip`` and ``metrics_zip``.
        """
        self._notify_about_license()

        # Fetch the archive once here so the individual setups can share it.
        # The dataset comes from
        # http://multiling.iit.demokritos.gr/pages/view/1571/datasets
        archive_path = f'{args.output_dir}/multilingpilot2013.tar.bz2'
        download_file_from_google_drive('0B31rakzMfTMZRTZiM29UR3VxYmc',
                                        archive_path)
        sds.setup(archive_path, f'{args.output_dir}/sds')

        # The MDS data is password protected, so the zips must come from the user
        documents_zip = args.mds_documents_zip
        model_summaries_zip = args.mds_model_summaries_zip
        if documents_zip and model_summaries_zip:
            mds.setup(documents_zip, model_summaries_zip,
                      f'{args.output_dir}/mds')
        else:
            print(
                'Skipping setting up MDS task because either documents or model summaries zip not provided'
            )

        # The metrics need the model summaries, the peer summaries and the metrics zip
        model_summaries_zip = args.mds_model_summaries_zip
        peer_summaries_zip = args.mds_peer_summaries_zip
        metrics_zip = args.metrics_zip
        if model_summaries_zip and peer_summaries_zip and metrics_zip:
            mds_metrics.setup(model_summaries_zip, peer_summaries_zip,
                              metrics_zip, f'{args.output_dir}/mds')
        else:
            print(
                'Skipping setting up MDS metrics because either model/peer summaries or metrics zips not provided'
            )
Exemplo n.º 2
0
def download_raw_data(output_dir: str, force: bool) -> None:
    """Download every raw file listed in ``FILE_IDS`` from Google Drive.

    Each file is written to ``{output_dir}/raw/{summarizer_id}/{filename}.txt``.
    Entries whose file id is ``None`` are skipped.

    Args:
        output_dir: Directory under which the ``raw`` tree is created.
        force: Forwarded unchanged to ``download_file_from_google_drive``.
    """
    for summarizer_id, file_ids in FILE_IDS.items():
        summarizer_dir = f'{output_dir}/raw/{summarizer_id}'
        for filename, file_id in file_ids.items():
            if file_id is None:
                # No Drive id is registered for this file
                continue
            destination = f'{summarizer_dir}/{filename}.txt'
            download_file_from_google_drive(file_id, destination, force=force)
Exemplo n.º 3
0
    def run(self, args):
        """Download the summary evaluation archive and hand it to ``eval.setup``.

        Args:
            args: Parsed command-line arguments. Reads ``output_dir``.
        """
        self._notify_about_license()

        # Fetch the archive once here so the setup does not need to repeat it.
        # The link comes from
        # http://multiling.iit.demokritos.gr/pages/view/1650/task-summary-evaluation
        archive_path = f'{args.output_dir}/summary_eval_v2.zip'
        download_file_from_google_drive('1mRlEoqShJxgxrMJO1VgWlUuuaq_SJayy',
                                        archive_path)

        eval.setup(archive_path, args.output_dir)
Exemplo n.º 4
0
    def run(self, args):
        """Download and unpack ROUGE-1.5.5, then check that it runs on a tiny example.

        The archive is downloaded to ``{DATA_ROOT}/metrics/ROUGE-1.5.5.zip``,
        unzipped in place, and deleted. If the shell step fails, the failure is
        reported and the method returns without running the verification, so a
        failure message is never followed by a success message. Otherwise a
        small summary/reference pair is scored with ``Rouge``; an ``IndexError``
        from scoring is reported as a setup failure together with
        troubleshooting instructions.

        Args:
            args: Parsed command-line arguments (not read by this method).
        """
        print('Downloading ROUGE-1.5.5')

        # I believe the ROUGE data files are platform-dependent. We verify it runs later, but in case it is, we have
        # to download the data files for the platform.
        # https://stackoverflow.com/questions/8220108/how-do-i-check-the-operating-system-in-python
        linux_file_id = '1K4J2wHGjAyr3LoSgaQuWZ_YyjtUGf26m'
        darwin_file_id = '1y0rDnTplQ83b2PQu_TgezbFpGOthP0gG'
        # Any platform that is not macOS (including unknown ones) defaults to the Linux files
        file_id = darwin_file_id if sys.platform == 'darwin' else linux_file_id

        download_file_from_google_drive(
            file_id, f'{DATA_ROOT}/metrics/ROUGE-1.5.5.zip')

        commands = [
            f'cd {DATA_ROOT}/metrics',
            'unzip ROUGE-1.5.5.zip',
            'rm ROUGE-1.5.5.zip',
        ]
        command = ' && '.join(commands)

        process = Popen(command, shell=True)
        process.communicate()
        if process.returncode != 0:
            print('ROUGE setup failure')
            # Stop here: continuing to the verification below could print a
            # contradictory "ROUGE setup success" after this failure message
            return

        # ROUGE has data files which may not successfully load. Therefore, if it fails to run on a simple example,
        # the user needs to run some perl code within the ROUGE directory to correct the data file
        try:
            summary = 'Dan walked to the bakery this morning.'
            reference = 'Dan went to buy scones earlier this morning.'
            rouge = Rouge()
            rouge.score(summary, [reference])
            print('ROUGE setup success')
        except IndexError:
            print('ROUGE setup failure')
            print(
                'It is very likely that either (1) you need to install the Perl XML::DOM library or '
                '(2) you need to rebuild the ROUGE database file. The first case is more likely if you '
                'are running on Linux or MacOS. See '
                'https://github.com/danieldeutsch/sacrerouge/blob/master/doc/metrics/rouge.md for '
                'instructions for each of these steps. Afterward, this example should run without failing:'
            )
            print()
            print('>>> from sacrerouge.metrics import Rouge')
            print('>>> ')
            print('>>> summary = "Dan walked to the bakery this morning."')
            print(
                '>>> reference = "Dan went to buy scones earlier this morning."'
            )
            print('>>> ')
            print('>>> rouge = Rouge()')
            print('>>> rouge.score(summary, [reference])')
Exemplo n.º 5
0
def download_tars(output_dir: str, force: bool) -> Tuple[str, str]:
    """Download the CNN and DailyMail "story" tarfiles into ``output_dir``.

    The files are the "story" tarfiles from https://cs.nyu.edu/~kcho/DMQA/.

    Args:
        output_dir: Directory the two ``.tgz`` files are written to.
        force: Forwarded unchanged to ``download_file_from_google_drive``.

    Returns:
        The paths of the CNN tarfile and the DailyMail tarfile, in that order.
    """
    # (Google Drive file id, destination file name), CNN first
    sources = (
        ('0BwmD_VLjROrfTHk4NFg2SndKcjQ', 'cnn_stories.tgz'),
        ('0BwmD_VLjROrfM1BxdkxVaTY2bWs', 'dailymail_stories.tgz'),
    )

    tar_paths = []
    for file_id, tar_name in sources:
        tar_path = f'{output_dir}/{tar_name}'
        download_file_from_google_drive(file_id, tar_path, force=force)
        tar_paths.append(tar_path)

    cnn_tar, dailymail_tar = tar_paths
    return cnn_tar, dailymail_tar
Exemplo n.º 6
0
    def run(self, args):
        """Download the 2017 test data and evaluation archives, then run the SDS setups.

        Both archives are fetched before either setup runs. ``sds.setup``
        receives both archives; ``sds_metrics.setup`` receives only the
        evaluation archive. Both write to ``{args.output_dir}/sds``.

        Args:
            args: Parsed command-line arguments. Reads ``output_dir``.
        """
        self._notify_about_license()

        # (Google Drive file id, archive name); the links were provided by John Conroy
        test_data = ('1dQfEYzJokm0es3xFHJG1J3crBAYU5zTp', '2017_test_data.tgz')
        evaluation = ('1pK7Df5gum5mwC0zYie5mCjqDdqLLmM1j', 'EvaluationML2017.tgz')

        archive_paths = []
        for file_id, archive_name in (test_data, evaluation):
            archive_path = f'{args.output_dir}/{archive_name}'
            download_file_from_google_drive(file_id, archive_path)
            archive_paths.append(archive_path)
        data_archive, eval_archive = archive_paths

        sds.setup(data_archive, eval_archive, f'{args.output_dir}/sds')
        sds_metrics.setup(eval_archive, f'{args.output_dir}/sds')
Exemplo n.º 7
0
    def run(self, args):
        """Clone ROUGE-BEwTE, install its model files, patch the pom and build it with Maven.

        The steps run in order and the method prints ``BEwT-E setup failure``
        and returns as soon as one of the shell steps exits with a non-zero
        code. ``BEwT-E setup success`` is printed only when the final
        ``mvn package`` succeeds.

        Args:
            args: Parsed command-line arguments. Reads ``force``; when it is
                truthy an existing ``ROUGE-BEwTE`` checkout is deleted first.
        """
        assert command_exists('mvn'), 'BEwTE requires Maven to be installed'

        metrics_dir = f'{DATA_ROOT}/metrics'
        repo_dir = f'{metrics_dir}/ROUGE-BEwTE'

        def succeeded(*steps):
            # Run the steps as one `&&`-chained shell command; True on exit code 0
            shell = Popen(' && '.join(steps), shell=True)
            shell.communicate()
            return shell.returncode == 0

        if args.force and os.path.exists(repo_dir):
            shutil.rmtree(repo_dir)

        # We have to clone the ROUGE-BEwTE repository and disable git-lfs, otherwise we may cause the repo
        # to exceed the bandwidth quota, which results in a cloning failure. Afterward, the model files are
        # downloaded and put into place
        if not succeeded(
                f'mkdir -p {metrics_dir}',
                f'cd {metrics_dir}',
                'GIT_LFS_SKIP_SMUDGE=1 git clone https://github.com/igorbrigadir/ROUGE-BEwTE',
        ):
            print('BEwT-E setup failure')
            return

        # Download the models, unzip, and move to the correct directory
        download_file_from_google_drive('1d0DjP8sxNoro_9fXaAlhB5JgqHjWMgst',
                                        f'{repo_dir}/models.zip')
        if not succeeded(
                f'cd {repo_dir}',
                'unzip models.zip',
                'rm models.zip',
                'rm -r src/main/resources/models',
                'mv models src/main/resources/',
        ):
            print('BEwT-E setup failure')
            return

        self._edit_pom(f'{repo_dir}/pom.xml')

        built = succeeded(f'cd {repo_dir}', 'mvn package')
        print('BEwT-E setup success' if built else 'BEwT-E setup failure')
Exemplo n.º 8
0
    def run(self, args):
        """Download the qaeval model files into ``{DATA_ROOT}/metrics/qaeval/models``.

        Four models are fetched from Google Drive: the generation model, the
        answering model (a zip that is extracted and then deleted), the LERC
        model and the LERC pretrained model. A model that is already present is
        skipped unless ``args.force`` is truthy, in which case the existing
        copy is removed and downloaded again.

        The answering model counts as present when its extracted directory
        exists. Its zip is always deleted after extraction, so checking only
        for the zip would re-download the archive on every run.

        Args:
            args: Parsed command-line arguments. Reads ``force``.
        """
        print(
            'This setup command will download the necessary model files. It will not install "qaeval". You must "pip install qaeval" on your own.'
        )

        force = args.force

        def fetch_archive(file_id, path, skip_message):
            # Download a single-file model unless it already exists (and force is off)
            if force and os.path.exists(path):
                os.remove(path)
            if not os.path.exists(path):
                download_file_from_google_drive(file_id, path)
            else:
                print(skip_message)

        fetch_archive(
            '1vVhRgLtsQDAOmxYhY5PMPnxxHUyCOdQU',
            f'{DATA_ROOT}/metrics/qaeval/models/generation/model.tar.gz',
            'Skipping downloading generation model')

        answering_model_id = '1q2Z3FPP9AYNz0RJKHMlaweNhmLQoyPA8'
        answering_model_zip_path = f'{DATA_ROOT}/metrics/qaeval/models/answering/model.zip'
        answering_model_path = f'{DATA_ROOT}/metrics/qaeval/models/answering/model'
        if force:
            if os.path.exists(answering_model_zip_path):
                os.remove(answering_model_zip_path)
            if os.path.exists(answering_model_path):
                shutil.rmtree(answering_model_path)

        if os.path.exists(answering_model_path):
            # Already extracted by a previous run: nothing to download or unzip
            print('Skipping downloading answering model')
        else:
            if not os.path.exists(answering_model_zip_path):
                download_file_from_google_drive(answering_model_id,
                                                answering_model_zip_path)
            else:
                print('Skipping downloading answering model')
            print('Unzipping answering model')
            with zipfile.ZipFile(answering_model_zip_path) as archive:
                archive.extractall(answering_model_path)
        # The zip is only an intermediate artifact, so never leave it behind
        if os.path.exists(answering_model_zip_path):
            os.remove(answering_model_zip_path)

        fetch_archive(
            '193K7v6pjOtuXdlMenQW-RzF6ft-xY2qd',
            f'{DATA_ROOT}/metrics/qaeval/models/lerc/model.tar.gz',
            'Skipping downloading LERC model')

        fetch_archive(
            '1fWBahDT-O1mpsbND300cuZuF73mfObzH',
            f'{DATA_ROOT}/metrics/qaeval/models/lerc/pretrained.tar.gz',
            'Skipping downloading LERC pretrained model')

        print('Downloading models complete')
Exemplo n.º 9
0
    def run(self, args):
        """Download ROUGE-1.5.5 into ``{DATA_ROOT}/metrics`` and unzip it there.

        The zip is removed after a successful unzip. The outcome of the shell
        step is reported by printing a success or failure message.

        Args:
            args: Parsed command-line arguments (not read by this method).
        """
        print('Downloading ROUGE-1.5.5')

        metrics_dir = f'{DATA_ROOT}/metrics'
        download_file_from_google_drive('1y0rDnTplQ83b2PQu_TgezbFpGOthP0gG',
                                        f'{metrics_dir}/ROUGE-1.5.5.zip')

        # Chain the steps with `&&` so a failing step stops the rest
        shell_command = ' && '.join((
            f'cd {metrics_dir}',
            'unzip ROUGE-1.5.5.zip',
            'rm ROUGE-1.5.5.zip',
        ))
        unzip = Popen(shell_command, shell=True)
        unzip.communicate()

        outcome = 'success' if unzip.returncode == 0 else 'failure'
        print(f'ROUGE setup {outcome}')
Exemplo n.º 10
0
    def run(self, args):
        """Download the MSS 2015 archives, run the SDS setups and, if possible, the MDS setup.

        The SDS data and SDS metrics setups always run. The MDS setup only
        runs when both ``args.train_zip`` and ``args.test_zip`` were supplied;
        otherwise a notice is printed and it is skipped.

        Args:
            args: Parsed command-line arguments. Reads ``output_dir``,
                ``train_zip`` and ``test_zip``.
        """
        self._notify_about_license()

        # Fetch the dataset once here so the setup does not need to repeat it.
        # The dataset comes from
        # http://multiling.iit.demokritos.gr/pages/view/1532/task-mss-single-document-summarization-data-and-information
        data_archive = f'{args.output_dir}/multilingMss2015Eval.tar.gz'
        download_file_from_google_drive('0B31rakzMfTMZa0ZIcmgzREstcVE',
                                        data_archive)
        sds.setup(data_archive, f'{args.output_dir}/sds')

        # The SDS metrics file was provided by John Conroy
        metrics_archive = f'{args.output_dir}/Evaluation_MultiLing2015_MSS.tgz'
        download_file_from_google_drive('1j_jV9JAc0t_EulCUMH7Y-dQSpqD9BpFd',
                                        metrics_archive)
        sds_metrics.setup(metrics_archive, f'{args.output_dir}/sds')

        # The MDS data is password protected, so the user must provide the zips
        train_zip = args.train_zip
        test_zip = args.test_zip
        if train_zip and test_zip:
            mds.setup(train_zip, test_zip, f'{args.output_dir}/mds')
        else:
            print('Skipping setting up MDS data because either the training or testing zip is missing')
Exemplo n.º 11
0
def _download_data(output_dir: str, force: bool) -> None:
    """Download the train, val and test ``.jsonl.gz`` files into ``output_dir``.

    Args:
        output_dir: Directory the three files are written to.
        force: Forwarded unchanged to ``download_file_from_google_drive``.
    """
    # (split name, Google Drive file id), downloaded in this order
    split_file_ids = (
        ('train', '1kUjSRXzKnTYdJ732BkKVLg3CFxDKo25u'),
        ('val', '1_kHTZ32jazTbXaFRg0vBeIsVcpI7CTmy'),
        ('test', '1qsd5pOCpeSXsaqNobXCrcAzhcjtG1wA1'),
    )
    for split, file_id in split_file_ids:
        download_file_from_google_drive(file_id,
                                        f'{output_dir}/{split}.jsonl.gz',
                                        force=force)
Exemplo n.º 12
0
def _download_documents_and_summaries(output_dir: str, force: bool) -> None:
    """Download the source and target files for each split into ``output_dir``.

    The three ``*.src.cleaned`` files are downloaded first, followed by the
    three ``*.tgt`` files.

    Args:
        output_dir: Directory the six files are written to.
        force: Forwarded unchanged to ``download_file_from_google_drive``.
    """
    # The source files come from the "Raw data, bad retrievals removed" link on
    # https://github.com/Alex-Fabbri/Multi-News
    source_files = (
        ('train.src.cleaned', '1wHAWDOwOoQWSj7HYpyJ3Aeud8WhhaJ7P'),
        ('val.src.cleaned', '1p_u9_jpz3Zbj0EL05QFX6wvJAahmOn6h'),
        ('test.src.cleaned', '1-n_6fj-1nM7sWtBSNkQCSfl5Rb3zPVfr'),
    )
    # The target files come from the "Raw data" link
    target_files = (
        ('train.tgt', '1QVgswwhVTkd3VLCzajK6eVkcrSWEK6kq'),
        ('val.tgt', '1Y1lBbBU5Q0aJMqLhYEOdEtTqQ85XnRRM'),
        ('test.tgt', '1CX_YcgQ3WwNC1fXBpMfwMXFPCqsd9Lbp'),
    )

    for filename, file_id in source_files + target_files:
        download_file_from_google_drive(file_id,
                                        f'{output_dir}/{filename}',
                                        force=force)