def run(self, args):
    """Download the shared archive and run the SDS, MDS, and MDS-metrics setups.

    The SDS setup always runs. The MDS setup and the MDS-metrics setup each
    run only when every zip path they need is set on ``args``; otherwise a
    message is printed and that setup is skipped.
    """
    self._notify_about_license()

    # Fetch the archive once, up front, so the individual setups do not each
    # need to download it. The dataset comes from
    # http://multiling.iit.demokritos.gr/pages/view/1571/datasets
    archive_path = f'{args.output_dir}/multilingpilot2013.tar.bz2'
    download_file_from_google_drive('0B31rakzMfTMZRTZiM29UR3VxYmc',
                                    archive_path)
    sds.setup(archive_path, f'{args.output_dir}/sds')

    # MDS data is password protected, so the user must provide the data
    documents_zip = args.mds_documents_zip
    model_zip = args.mds_model_summaries_zip
    if documents_zip and model_zip:
        mds.setup(documents_zip, model_zip, f'{args.output_dir}/mds')
    else:
        print(
            'Skipping setting up MDS task because either documents or model summaries zip not provided'
        )

    model_zip = args.mds_model_summaries_zip
    peer_zip = args.mds_peer_summaries_zip
    metrics_zip = args.metrics_zip
    if model_zip and peer_zip and metrics_zip:
        mds_metrics.setup(model_zip, peer_zip, metrics_zip,
                          f'{args.output_dir}/mds')
    else:
        print(
            'Skipping setting up MDS metrics because either model/peer summaries or metrics zips not provided'
        )
def download_raw_data(output_dir: str, force: bool) -> None:
    """Download every file listed in ``FILE_IDS`` under ``{output_dir}/raw``.

    Each file is written to ``{output_dir}/raw/{summarizer_id}/{filename}.txt``.
    Entries whose file id is ``None`` are skipped.

    Args:
        output_dir: Directory under which the ``raw`` tree is created.
        force: Passed through to ``download_file_from_google_drive``.
    """
    for summarizer_id, file_ids in FILE_IDS.items():
        summarizer_dir = f'{output_dir}/raw/{summarizer_id}'
        for filename, file_id in file_ids.items():
            if file_id is None:
                # No Drive id recorded for this file, so there is nothing to fetch
                continue
            destination = f'{summarizer_dir}/{filename}.txt'
            download_file_from_google_drive(file_id, destination, force=force)
def run(self, args):
    """Download ``summary_eval_v2.zip`` and hand it to ``eval.setup``."""
    self._notify_about_license()

    # Download the dataset once here so the setup below does not need to.
    # The link comes from
    # http://multiling.iit.demokritos.gr/pages/view/1650/task-summary-evaluation
    output_dir = args.output_dir
    archive_path = f'{output_dir}/summary_eval_v2.zip'
    download_file_from_google_drive('1mRlEoqShJxgxrMJO1VgWlUuuaq_SJayy',
                                    archive_path)

    eval.setup(archive_path, args.output_dir)
def run(self, args):
    """Download ROUGE-1.5.5 into ``{DATA_ROOT}/metrics`` and smoke-test it.

    The zip to download is chosen from ``sys.platform``. After it is
    unzipped, a tiny example is scored with ``Rouge`` to check that the
    install actually works; if that raises ``IndexError``, instructions for
    repairing the install are printed.

    If downloading/unzipping fails, "ROUGE setup failure" is printed and the
    method returns without running the smoke test, so a failed install can
    never be followed by a misleading "ROUGE setup success".
    """
    print('Downloading ROUGE-1.5.5')

    # I believe the ROUGE data files are platform-dependent. We verify it runs later, but in case it is, we have
    # to download the data files for the platform.
    # https://stackoverflow.com/questions/8220108/how-do-i-check-the-operating-system-in-python
    if sys.platform == 'darwin':
        file_id = '1y0rDnTplQ83b2PQu_TgezbFpGOthP0gG'
    else:
        # 'linux' / 'linux2', and -- having no better idea -- every other
        # platform defaults to the Linux files as well
        file_id = '1K4J2wHGjAyr3LoSgaQuWZ_YyjtUGf26m'

    download_file_from_google_drive(
        file_id, f'{DATA_ROOT}/metrics/ROUGE-1.5.5.zip')

    commands = [
        f'cd {DATA_ROOT}/metrics',
        'unzip ROUGE-1.5.5.zip',
        'rm ROUGE-1.5.5.zip',
    ]
    process = Popen(' && '.join(commands), shell=True)
    process.communicate()
    if process.returncode != 0:
        print('ROUGE setup failure')
        # The unzip did not complete, so there is nothing meaningful to test
        return

    # ROUGE has data files which may not successfully load. Therefore, if it fails to run on a simple example,
    # the user needs to run some perl code within the ROUGE directory to correct the data file
    try:
        summary = 'Dan walked to the bakery this morning.'
        reference = 'Dan went to buy scones earlier this morning.'
        rouge = Rouge()
        rouge.score(summary, [reference])
    except IndexError:
        print('ROUGE setup failure')
        print(
            'It is very likely that either (1) you need to install the Perl XML::DOM library or '
            '(2) you need to rebuild the ROUGE database file. The first case is more likely if you '
            'are running on Linux or MacOS. See '
            'https://github.com/danieldeutsch/sacrerouge/blob/master/doc/metrics/rouge.md for '
            'instructions for each of these steps. \nAfterward, this example should run without failing:'
        )
        print()
        print('>>> from sacrerouge.metrics import Rouge')
        print('>>> ')
        print('>>> summary = "Dan walked to the bakery this morning."')
        print(
            '>>> reference = "Dan went to buy scones earlier this morning."'
        )
        print('>>> ')
        print('>>> rouge = Rouge()')
        print('>>> rouge.score(summary, [reference])')
    else:
        print('ROUGE setup success')
def download_tars(output_dir: str, force: bool) -> Tuple[str, str]:
    """Download the CNN and DailyMail "story" tarfiles into ``output_dir``.

    Args:
        output_dir: Directory the two ``*_stories.tgz`` files are written to.
        force: Passed through to ``download_file_from_google_drive``.

    Returns:
        The paths ``(cnn_tar, dailymail_tar)``, in that order.
    """
    # The "story" tarfiles are the ones from https://cs.nyu.edu/~kcho/DMQA/
    drive_ids = (
        ('cnn', '0BwmD_VLjROrfTHk4NFg2SndKcjQ'),
        ('dailymail', '0BwmD_VLjROrfM1BxdkxVaTY2bWs'),
    )

    tar_paths = {}
    for corpus, file_id in drive_ids:
        tar_paths[corpus] = f'{output_dir}/{corpus}_stories.tgz'
        download_file_from_google_drive(file_id,
                                        tar_paths[corpus],
                                        force=force)

    return tar_paths['cnn'], tar_paths['dailymail']
def run(self, args):
    """Download the 2017 test data and evaluation archives, then run the SDS setups.

    Both archives are saved under ``args.output_dir``; ``sds.setup`` and
    ``sds_metrics.setup`` then write to ``{args.output_dir}/sds``.
    """
    self._notify_about_license()

    # Link provided by John Conroy
    test_data_tgz = f'{args.output_dir}/2017_test_data.tgz'
    download_file_from_google_drive('1dQfEYzJokm0es3xFHJG1J3crBAYU5zTp',
                                    test_data_tgz)

    evaluation_tgz = f'{args.output_dir}/EvaluationML2017.tgz'
    download_file_from_google_drive('1pK7Df5gum5mwC0zYie5mCjqDdqLLmM1j',
                                    evaluation_tgz)

    # The data setup takes both archives; the metrics setup only the evaluation one
    sds.setup(test_data_tgz, evaluation_tgz, f'{args.output_dir}/sds')
    sds_metrics.setup(evaluation_tgz, f'{args.output_dir}/sds')
def run(self, args):
    """Clone ROUGE-BEwTE, install its model files, and build it with Maven.

    Steps, each of which aborts with "BEwT-E setup failure" if its shell
    command exits non-zero:

    1. Clone the repository into ``{DATA_ROOT}/metrics`` (removing an
       existing checkout first when ``args.force`` is set).
    2. Download ``models.zip``, unzip it, and move the models into
       ``src/main/resources``.
    3. Edit ``pom.xml`` via ``self._edit_pom`` and run ``mvn package``.
    """
    assert command_exists('mvn'), 'BEwTE requires Maven to be installed'

    metrics_dir = f'{DATA_ROOT}/metrics'
    repo_dir = f'{metrics_dir}/ROUGE-BEwTE'

    def _shell_succeeds(steps):
        # Chain the steps with '&&' in a single shell and report whether
        # the whole chain exited with status 0.
        proc = Popen(' && '.join(steps), shell=True)
        proc.communicate()
        return proc.returncode == 0

    if args.force and os.path.exists(repo_dir):
        shutil.rmtree(repo_dir)

    # We have to clone the ROUGE-BEwTE repository and disable git-lfs, otherwise we may cause the repo
    # to exceed the bandwidth quota, which results in a cloning failure. Afterward, the model files are downloaded
    # and put into place
    cloned = _shell_succeeds([
        f'mkdir -p {metrics_dir}',
        f'cd {metrics_dir}',
        'GIT_LFS_SKIP_SMUDGE=1 git clone https://github.com/igorbrigadir/ROUGE-BEwTE',
    ])
    if not cloned:
        print('BEwT-E setup failure')
        return

    # Download the models, unzip, and move to the correct directory
    download_file_from_google_drive('1d0DjP8sxNoro_9fXaAlhB5JgqHjWMgst',
                                    f'{repo_dir}/models.zip')
    models_installed = _shell_succeeds([
        f'cd {repo_dir}',
        'unzip models.zip',
        'rm models.zip',
        'rm -r src/main/resources/models',
        'mv models src/main/resources/',
    ])
    if not models_installed:
        print('BEwT-E setup failure')
        return

    self._edit_pom(f'{repo_dir}/pom.xml')

    built = _shell_succeeds([f'cd {repo_dir}', 'mvn package'])
    if built:
        print('BEwT-E setup success')
    else:
        print('BEwT-E setup failure')
def run(self, args):
    """Download the qaeval model files into ``{DATA_ROOT}/metrics/qaeval/models``.

    Four artifacts are fetched: the generation model, the answering model
    (downloaded as a zip, extracted to a directory, zip then deleted), the
    LERC model, and the LERC pretrained model. An artifact that is already
    present is skipped unless ``args.force`` is set, in which case the
    existing copy is removed and downloaded again.

    The answering model counts as present when its *extracted directory*
    exists. The zip is always deleted after extraction, so checking only for
    the zip would re-download it on every run just to delete it again.
    """
    print(
        'This setup command will download the necessary model files. It will not install "qaeval". You must "pip install qaeval" on your own.'
    )

    models_dir = f'{DATA_ROOT}/metrics/qaeval/models'

    def _fetch_archive(file_id, path, skip_message):
        # Download a single-file artifact unless it already exists; with
        # --force the existing file is removed first so it is re-downloaded.
        if args.force and os.path.exists(path):
            os.remove(path)
        if os.path.exists(path):
            print(skip_message)
        else:
            download_file_from_google_drive(file_id, path)

    _fetch_archive('1vVhRgLtsQDAOmxYhY5PMPnxxHUyCOdQU',
                   f'{models_dir}/generation/model.tar.gz',
                   'Skipping downloading generation model')

    answering_model_id = '1q2Z3FPP9AYNz0RJKHMlaweNhmLQoyPA8'
    answering_model_zip_path = f'{models_dir}/answering/model.zip'
    answering_model_path = f'{models_dir}/answering/model'
    if args.force:
        if os.path.exists(answering_model_zip_path):
            os.remove(answering_model_zip_path)
        if os.path.exists(answering_model_path):
            shutil.rmtree(answering_model_path)

    # Nothing to download if the model is already extracted, or if a zip
    # from an earlier (interrupted) run is still waiting to be extracted
    if os.path.exists(answering_model_path) or os.path.exists(
            answering_model_zip_path):
        print('Skipping downloading answering model')
    else:
        download_file_from_google_drive(answering_model_id,
                                        answering_model_zip_path)

    if not os.path.exists(answering_model_path):
        print('Unzipping answering model')
        with zipfile.ZipFile(answering_model_zip_path) as archive:
            archive.extractall(answering_model_path)

    # The zip is only an intermediate file; drop it once the model is in place
    if os.path.exists(answering_model_zip_path):
        os.remove(answering_model_zip_path)

    _fetch_archive('193K7v6pjOtuXdlMenQW-RzF6ft-xY2qd',
                   f'{models_dir}/lerc/model.tar.gz',
                   'Skipping downloading LERC model')
    _fetch_archive('1fWBahDT-O1mpsbND300cuZuF73mfObzH',
                   f'{models_dir}/lerc/pretrained.tar.gz',
                   'Skipping downloading LERC pretrained model')

    print('Downloading models complete')
def run(self, args):
    """Download ROUGE-1.5.5 and unzip it inside ``{DATA_ROOT}/metrics``.

    Prints "ROUGE setup success" when the unzip shell command exits with
    status 0 and "ROUGE setup failure" otherwise.
    """
    print('Downloading ROUGE-1.5.5')

    metrics_dir = f'{DATA_ROOT}/metrics'
    download_file_from_google_drive('1y0rDnTplQ83b2PQu_TgezbFpGOthP0gG',
                                    f'{metrics_dir}/ROUGE-1.5.5.zip')

    # Unzip in place and delete the archive; '&&' stops at the first failure
    shell_command = ' && '.join((
        f'cd {metrics_dir}',
        'unzip ROUGE-1.5.5.zip',
        'rm ROUGE-1.5.5.zip',
    ))
    unzip = Popen(shell_command, shell=True)
    unzip.communicate()

    succeeded = unzip.returncode == 0
    print('ROUGE setup success' if succeeded else 'ROUGE setup failure')
def run(self, args):
    """Download the SDS data and metrics archives, set them up, then set up MDS if possible.

    The SDS data and SDS metrics setups always run. The MDS setup runs only
    when both ``args.train_zip`` and ``args.test_zip`` are set; otherwise a
    message is printed and it is skipped.
    """
    self._notify_about_license()

    sds_dir = f'{args.output_dir}/sds'

    # Fetch the archive once, up front, so the individual setups do not each
    # need to download it. The dataset comes from
    # http://multiling.iit.demokritos.gr/pages/view/1532/task-mss-single-document-summarization-data-and-information
    data_archive = f'{args.output_dir}/multilingMss2015Eval.tar.gz'
    download_file_from_google_drive('0B31rakzMfTMZa0ZIcmgzREstcVE',
                                    data_archive)
    sds.setup(data_archive, sds_dir)

    # The SDS metrics file was provided by John Conroy
    metrics_archive = f'{args.output_dir}/Evaluation_MultiLing2015_MSS.tgz'
    download_file_from_google_drive('1j_jV9JAc0t_EulCUMH7Y-dQSpqD9BpFd',
                                    metrics_archive)
    sds_metrics.setup(metrics_archive, sds_dir)

    # The MDS data is password protected, so the user must provide the zips
    train_zip = args.train_zip
    test_zip = args.test_zip
    if train_zip and test_zip:
        mds.setup(train_zip, test_zip, f'{args.output_dir}/mds')
    else:
        print(
            'Skipping setting up MDS data because either the training or testing zip is missing'
        )
def _download_data(output_dir: str, force: bool) -> None:
    """Download the train, val, and test ``.jsonl.gz`` files into ``output_dir``.

    Args:
        output_dir: Directory the three files are written to.
        force: Passed through to ``download_file_from_google_drive``.
    """
    # (split name, Google Drive file id), downloaded in this order
    drive_ids = (
        ('train', '1kUjSRXzKnTYdJ732BkKVLg3CFxDKo25u'),
        ('val', '1_kHTZ32jazTbXaFRg0vBeIsVcpI7CTmy'),
        ('test', '1qsd5pOCpeSXsaqNobXCrcAzhcjtG1wA1'),
    )
    for split, file_id in drive_ids:
        download_file_from_google_drive(file_id,
                                        f'{output_dir}/{split}.jsonl.gz',
                                        force=force)
def _download_documents_and_summaries(output_dir: str, force: bool) -> None:
    """Download the source (``*.src.cleaned``) and target (``*.tgt``) files into ``output_dir``.

    Args:
        output_dir: Directory the six files are written to.
        force: Passed through to ``download_file_from_google_drive``.
    """
    # (Google Drive file id, file name), downloaded in this order
    downloads = (
        # The source files come from the "Raw data, bad retrievals removed"
        # link on https://github.com/Alex-Fabbri/Multi-News
        ('1wHAWDOwOoQWSj7HYpyJ3Aeud8WhhaJ7P', 'train.src.cleaned'),
        ('1p_u9_jpz3Zbj0EL05QFX6wvJAahmOn6h', 'val.src.cleaned'),
        ('1-n_6fj-1nM7sWtBSNkQCSfl5Rb3zPVfr', 'test.src.cleaned'),
        # The target files come from the "Raw data" link
        ('1QVgswwhVTkd3VLCzajK6eVkcrSWEK6kq', 'train.tgt'),
        ('1Y1lBbBU5Q0aJMqLhYEOdEtTqQ85XnRRM', 'val.tgt'),
        ('1CX_YcgQ3WwNC1fXBpMfwMXFPCqsd9Lbp', 'test.tgt'),
    )
    for file_id, filename in downloads:
        download_file_from_google_drive(file_id,
                                        f'{output_dir}/{filename}',
                                        force=force)