def buildImage(opt):
    dpath = os.path.join(opt['datapath'], 'COCO-IMG')
    version = '1'

    if not build_data.built(dpath, version_string=version):
        print('[building image data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the image data.
        fname1 = 'train2014.zip'
        fname2 = 'val2014.zip'
        fname3 = 'test2015.zip'
        url = 'https://s3.amazonaws.com/fair-data/parlai/COCO-IMG/'
        build_data.download(url + fname1, dpath, fname1)
        build_data.download(url + fname2, dpath, fname2)
        build_data.download(url + fname3, dpath, fname3)
        build_data.untar(dpath, fname1)
        build_data.untar(dpath, fname2)
        build_data.untar(dpath, fname3)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

def build(opt):
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'Twitter')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname1 = "twitter_en_big.txt.gz.partaa"
        fname2 = "twitter_en_big.txt.gz.partab"
        url = 'https://github.com/Marsan-Ma/chat_corpus/raw/master/'
        build_data.download(url + fname1, dpath, fname1)
        build_data.download(url + fname2, dpath, fname2)

        # Reassemble the split gzip archive before decompressing it.
        file1 = os.path.join(dpath, fname1)
        file2 = os.path.join(dpath, fname2)
        file3 = "twitter_en_big.txt.gz"
        outzipfile = os.path.join(dpath, file3)
        build_data.cat(file1, file2, outzipfile)

        import gzip
        with gzip.open(outzipfile, 'r') as f:
            file_content = bytes.decode(f.read())
        data = file_content.split('\n')[2:]
        create_fb_format(data, dpath)
        os.remove(outzipfile)

        # Mark the data as built.
        build_data.mark_done(dpath, version)

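# The Twitter corpus above ships as a gzip archive split into two parts, so
# the parts must be byte-concatenated back into a single .gz file before
# decompression. `build_data.cat` itself is not shown in these snippets; the
# following is a minimal sketch of what such a helper would need to do,
# inferred only from how it is called above (name and signature assumed):
def cat(file1, file2, outfile, chunk_size=1024 * 1024):
    # Concatenate two files byte-for-byte into `outfile`, streaming in chunks
    # so large archives do not have to fit in memory.
    with open(outfile, 'wb') as out:
        for fname in (file1, file2):
            with open(fname, 'rb') as f:
                while True:
                    chunk = f.read(chunk_size)
                    if not chunk:
                        break
                    out.write(chunk)
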
def build(opt):
    dpath = os.path.join(opt['datapath'], 'WikiQA')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'wikiqa.tar.gz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/wikiqa/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        dpext = os.path.join(dpath, 'WikiQACorpus')
        create_fb_format(dpath, 'train', os.path.join(dpext, 'WikiQA-train.tsv'))
        create_fb_format(dpath, 'valid', os.path.join(dpext, 'WikiQA-dev.tsv'))
        create_fb_format(dpath, 'test', os.path.join(dpext, 'WikiQA-test.tsv'))
        create_fb_format(dpath, 'train-filtered', os.path.join(dpext, 'WikiQA-train.tsv'))
        create_fb_format(dpath, 'valid-filtered', os.path.join(dpext, 'WikiQA-dev.tsv'))
        create_fb_format(dpath, 'test-filtered', os.path.join(dpext, 'WikiQA-test.tsv'))

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'VQA-v2')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        # An older version exists, so remove these outdated files.
        if build_data.built(dpath):
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname1 = 'v2_Questions_Train_mscoco.zip'
        fname2 = 'v2_Questions_Val_mscoco.zip'
        fname3 = 'v2_Questions_Test_mscoco.zip'
        fname4 = 'v2_Annotations_Val_mscoco.zip'
        fname5 = 'v2_Annotations_Train_mscoco.zip'
        url = 'http://visualqa.org/data/mscoco/vqa/'
        build_data.download(url + fname1, dpath, fname1)
        build_data.download(url + fname2, dpath, fname2)
        build_data.download(url + fname3, dpath, fname3)
        build_data.download(url + fname4, dpath, fname4)
        build_data.download(url + fname5, dpath, fname5)

        build_data.untar(dpath, fname1)
        build_data.untar(dpath, fname2)
        build_data.untar(dpath, fname3)
        build_data.untar(dpath, fname4)
        build_data.untar(dpath, fname5)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'WebQuestions')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        url = ('https://worksheets.codalab.org/rest/bundles/'
               '0x4a763f8cde224c2da592b75f29e2f5c2/contents/blob/')
        build_data.download(url, dpath, 'train.json')
        url = ('https://worksheets.codalab.org/rest/bundles/'
               '0xe7bac352fce7448c9ef238fb0a297ec2/contents/blob/')
        build_data.download(url, dpath, 'test.json')

        # WebQuestions ships only train/test splits, so the train file also
        # serves as the validation source.
        create_fb_format(dpath, 'train', os.path.join(dpath, 'train.json'))
        create_fb_format(dpath, 'valid', os.path.join(dpath, 'train.json'))
        create_fb_format(dpath, 'test', os.path.join(dpath, 'test.json'))

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'wmt')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fnames = [
            ('train.en', 'train.de', 'en_de_train.txt'),
            ('newstest2014.en', 'newstest2014.de', 'en_de_test.txt'),
        ]
        for (en_fname, de_fname, w_fname) in fnames:
            url_base = 'https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/'
            en_url = url_base + en_fname
            de_url = url_base + de_fname
            build_data.download(en_url, dpath, en_fname)
            build_data.download(de_url, dpath, de_fname)

            # Strip the trailing newline from each parallel line.
            with open(os.path.join(dpath, en_fname), 'r') as f:
                en = [l[:-1] for l in f]
            with open(os.path.join(dpath, de_fname), 'r') as f:
                de = [l[:-1] for l in f]

            # Write one example per line in FB dialog format: "1 <en>\t<de>".
            with open(os.path.join(dpath, w_fname), 'w') as f:
                for de_sent, en_sent in zip(de, en):
                    f.write("1 " + en_sent + "\t" + de_sent + "\n")

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'wmt')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        train_r_fnames = ('train.en', 'train.de')
        train_w_fname = 'en_de_train.txt'
        valid_w_fname = 'en_de_valid.txt'
        test_r_fnames = ('newstest2014.en', 'newstest2014.de')
        test_w_fname = 'en_de_test.txt'

        # Shuffle the training pairs and hold out 30k of them as validation.
        train_zip = readFiles(dpath, train_r_fnames)
        numpy.random.shuffle(train_zip)
        with open(os.path.join(dpath, valid_w_fname), 'w') as f:
            for de_sent, en_sent in train_zip[:30000]:
                f.write("1 " + en_sent + "\t" + de_sent + "\n")
        with open(os.path.join(dpath, train_w_fname), 'w') as f:
            for de_sent, en_sent in train_zip[30000:]:
                f.write("1 " + en_sent + "\t" + de_sent + "\n")

        test_zip = readFiles(dpath, test_r_fnames)
        with open(os.path.join(dpath, test_w_fname), 'w') as f:
            for de_sent, en_sent in test_zip:
                f.write("1 " + en_sent + "\t" + de_sent + "\n")

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

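# `readFiles` is not defined in this snippet. Judging from how its result is
# consumed above (shuffled in place with numpy and unpacked as
# (de_sent, en_sent) pairs) and from the inline download logic of the earlier
# wmt builder, a plausible implementation might look like this sketch (name
# and signature taken from the call sites; everything else is an assumption):
def readFiles(dpath, rfnames):
    en_fname, de_fname = rfnames
    url_base = 'https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/'
    build_data.download(url_base + en_fname, dpath, en_fname)
    build_data.download(url_base + de_fname, dpath, de_fname)

    # Strip the trailing newline from each parallel line.
    with open(os.path.join(dpath, en_fname), 'r') as f:
        en = [l[:-1] for l in f]
    with open(os.path.join(dpath, de_fname), 'r') as f:
        de = [l[:-1] for l in f]

    # Return a mutable (shuffleable) list of (de, en) pairs.
    return list(zip(de, en))
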
def build(opt):
    dpath = os.path.join(opt['datapath'], 'MS_MARCO')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        url = "https://msmarco.blob.core.windows.net/msmarco/"
        fname = "train_v1.1.json.gz"
        build_data.download(url + fname, dpath, 'train.gz')
        fname = "dev_v1.1.json.gz"
        build_data.download(url + fname, dpath, 'valid.gz')
        fname = "test_public_v1.1.json.gz"
        build_data.download(url + fname, dpath, 'test.gz')

        create_fb_format(dpath, "train", os.path.join(dpath, 'train.gz'))
        create_fb_format(dpath, "valid", os.path.join(dpath, 'valid.gz'))
        create_fb_format(dpath, "test", os.path.join(dpath, 'test.gz'))

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'MovieDialog')
    version = '1'

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        dpath2 = os.path.join(dpath, 'movie_dialog_dataset', 'task4_reddit')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        build_data.make_dir(dpath2)

        # Download the data.
        fname = 'moviedialog.tar.gz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/moviedialog/' + fname
        build_data.download(url, dpath, fname)

        url2 = 'http://tinyurl.com/p6tyohj'
        build_data.download(url2, dpath2, 'p6tyohj.tgz')

        build_data.untar(dpath, fname)
        build_data.untar(dpath2, 'p6tyohj.tgz')

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

def build(opt):
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'QACNN')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'cnn.tgz'
        gd_id = '0BwmD_VLjROrfTTljRDVZMFJnVWM'
        build_data.download_from_google_drive(gd_id, os.path.join(dpath, fname))
        build_data.untar(dpath, fname)

        create_fb_format(dpath, 'train', os.path.join(dpath, 'cnn', 'questions', 'training'))
        create_fb_format(dpath, 'valid', os.path.join(dpath, 'cnn', 'questions', 'validation'))
        create_fb_format(dpath, 'test', os.path.join(dpath, 'cnn', 'questions', 'test'))

        # Mark the data as built.
        build_data.mark_done(dpath, version)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'MCTest')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'mctest.tar.gz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/mctest/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        dpext = os.path.join(dpath, 'mctest')
        create_fb_format(dpath, 'train160',
                         os.path.join(dpext, 'MCTest', 'mc160.train'), None)
        create_fb_format(dpath, 'valid160',
                         os.path.join(dpext, 'MCTest', 'mc160.dev'), None)
        create_fb_format(dpath, 'test160',
                         os.path.join(dpext, 'MCTest', 'mc160.test'),
                         os.path.join(dpext, 'MCTestAnswers', 'mc160.test.ans'))
        create_fb_format(dpath, 'train500',
                         os.path.join(dpext, 'MCTest', 'mc500.train'), None)
        create_fb_format(dpath, 'valid500',
                         os.path.join(dpext, 'MCTest', 'mc500.dev'), None)
        create_fb_format(dpath, 'test500',
                         os.path.join(dpext, 'MCTest', 'mc500.test'),
                         os.path.join(dpext, 'MCTestAnswers', 'mc500.test.ans'))

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'NarrativeQA')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'narrative_qa.zip'
        url = NARRATIVE_QA_DOWNLOAD_URL
        build_data.download(url, dpath, fname)

        # Uncompress it.
        build_data.untar(dpath, fname)

        print('downloading stories now')
        base_path = os.path.join(dpath, 'narrativeqa-master')
        download_stories(base_path)

        # Move from tmp to stories.
        tmp_stories_path = os.path.join(base_path, 'tmp')
        new_stories_path = os.path.join(base_path, 'stories')
        shutil.move(tmp_stories_path, new_stories_path)

        # Divide the summaries into train, valid and test sets.
        summaries_csv_path = os.path.join(base_path, 'third_party', 'wikipedia', 'summaries.csv')
        new_path = os.path.join(base_path, 'summaries.csv')
        shutil.move(summaries_csv_path, new_path)
        divide_csv_into_sets(new_path)

        # Divide the questions into sets.
        questions_path = os.path.join(base_path, 'qaps.csv')
        divide_csv_into_sets(questions_path)

        # Divide the documents into sets.
        documents_path = os.path.join(base_path, 'documents.csv')
        divide_csv_into_sets(documents_path)

        # Move each set's files into its own folder.
        make_folders(base_path)
        move_files(base_path)

        # Move narrativeqa-master to narrative_qa.
        new_path = os.path.join(dpath, 'narrative_qa')
        shutil.move(base_path, new_path)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

def build(opt):
    version = 'v0.9'
    dpath = os.path.join(opt['datapath'], 'VisDial-v0.9')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname1 = 'visdial_0.9_train.zip'
        fname2 = 'visdial_0.9_val.zip'
        url = 'https://computing.ece.vt.edu/~abhshkdz/data/visdial/'
        build_data.download(url + fname1, dpath, fname1)
        build_data.download(url + fname2, dpath, fname2)
        build_data.untar(dpath, fname1)
        build_data.untar(dpath, fname2)

        print('processing unpacked files')

        # Use 1000 examples from the training set as validation.
        json1 = os.path.join(dpath, fname1.rsplit('.', 1)[0] + '.json')
        with open(json1) as t_json:
            train_data = json.load(t_json)

        valid_data = train_data.copy()
        valid_data['data'] = train_data['data'].copy()
        valid_data['data']['dialogs'] = []

        # Pick examples at a constant stride, walking backwards so that
        # deleting an entry never shifts the indices still to be visited.
        num_valid = 1000
        total = len(train_data['data']['dialogs'])
        step = total // (num_valid - 1)
        for i in range(total - 1, 0, -step)[:num_valid]:
            valid_data['data']['dialogs'].append(train_data['data']['dialogs'][i])
            del train_data['data']['dialogs'][i]

        train_json = json1.rsplit('.', 1)[0] + '_train.json'
        valid_json = json1.rsplit('.', 1)[0] + '_valid.json'
        with open(train_json, 'w') as t_out, open(valid_json, 'w') as v_out:
            json.dump(train_data, t_out)
            json.dump(valid_data, v_out)
        os.remove(json1)

        # Use the validation data as the test set.
        json2 = os.path.join(dpath, fname2.rsplit('.', 1)[0] + '.json')
        test_json = json2.rsplit('.', 1)[0] + '_test.json'
        build_data.move(json2, test_json)

        # Mark the data as built.
        build_data.mark_done(dpath, version)

def build(opt):
    dpath = os.path.join(opt['datapath'], DATASET_NAME_LOCAL)
    version = 'v1.0'

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
            logging.info('Removed the existing data (old version).')
        build_data.make_dir(dpath)

        _download_with_cloud_storage_client(dpath)
        _untar_dataset_files(dpath)
        _move_valid_files_from_dev_to_valid(dpath)

        build_data.mark_done(dpath, version_string=version)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'C3')
    version = '1.0'

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        build_data.mark_done(dpath, version_string=version)

def build(opt):
    dpath = os.path.join(opt['datapath'])
    airdialogue_path = os.path.join(dpath, 'airdialogue_data')
    version = '1.1'

    if not build_data.built(airdialogue_path, version_string=version):
        print('[building data: ' + airdialogue_path + ']')
        if build_data.built(airdialogue_path):
            build_data.remove_dir(airdialogue_path)

        # Download the data. Note that the resources are fetched into the
        # datapath root; unpacking them creates airdialogue_data/ underneath.
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        build_data.mark_done(airdialogue_path, version_string=version)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'multiwoz')
    version = '1.0'

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        fname = 'MULTIWOZ2.1.zip'
        url = (
            'https://www.repository.cam.ac.uk/bitstream/handle/1810/294507/'
            'MULTIWOZ2.1.zip?sequence=1&isAllowed=y'
        )
        build_data.download(url, dpath, fname)
        build_data.unzip(dpath, fname)

        build_data.mark_done(dpath, version_string=version)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'wizard_of_wikipedia')
    version = '1.0'

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        build_data.mark_done(dpath, version)

def build(opt):
    version = 'v0.9'
    dpath = os.path.join(opt['datapath'], 'VisDial-v0.9')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        print('processing unpacked files')

        # Use 1000 examples from the training set as validation.
        json1 = os.path.join(dpath, RESOURCES[0].file_name.rsplit('.', 1)[0] + '.json')
        with open(json1) as t_json:
            train_data = json.load(t_json)

        valid_data = train_data.copy()
        valid_data['data'] = train_data['data'].copy()
        valid_data['data']['dialogs'] = []

        # Pick examples at a constant stride, walking backwards so that
        # deleting an entry never shifts the indices still to be visited.
        num_valid = 1000
        total = len(train_data['data']['dialogs'])
        step = total // (num_valid - 1)
        for i in range(total - 1, 0, -step)[:num_valid]:
            valid_data['data']['dialogs'].append(train_data['data']['dialogs'][i])
            del train_data['data']['dialogs'][i]

        train_json = json1.rsplit('.', 1)[0] + '_train.json'
        valid_json = json1.rsplit('.', 1)[0] + '_valid.json'
        with open(train_json, 'w') as t_out, open(valid_json, 'w') as v_out:
            json.dump(train_data, t_out)
            json.dump(valid_data, v_out)
        os.remove(json1)

        # Use the validation data as the test set.
        json2 = os.path.join(dpath, RESOURCES[1].file_name.rsplit('.', 1)[0] + '.json')
        test_json = json2.rsplit('.', 1)[0] + '_test.json'
        build_data.move(json2, test_json)

        # Mark the data as built.
        build_data.mark_done(dpath, version)

def build_data_from_path(dpath, version, downloadable_files):
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        for downloadable_file in downloadable_files:
            downloadable_file.download_file(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

    return dpath, version

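# A typical caller of the generic helper above, sketched under the assumption
# that DownloadableFile(url, file_name, hashcode, zipped=...) is the resource
# wrapper whose .download_file(dpath) the loop relies on. The task name, URL,
# and checksum below are hypothetical placeholders, not real resources:
RESOURCES = [
    DownloadableFile(
        'https://example.com/data/my_task.tar.gz',  # placeholder URL
        'my_task.tar.gz',
        'deadbeef',  # placeholder checksum
        zipped=True,
    )
]


def build(opt):
    dpath = os.path.join(opt['datapath'], 'my_task')  # placeholder task dir
    return build_data_from_path(dpath, 'v1.0', RESOURCES)
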
def build(opt):
    dpath = os.path.join(opt['datapath'], 'MCTest')

    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'mctest.tar.gz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/mctest/' + fname
        build_data.download(dpath, url)
        build_data.untar(dpath, fname)

        dpext = os.path.join(dpath, 'mctest')
        create_fb_format(dpath, 'train160',
                         os.path.join(dpext, 'MCTest', 'mc160.train'), None)
        create_fb_format(dpath, 'valid160',
                         os.path.join(dpext, 'MCTest', 'mc160.dev'), None)
        create_fb_format(dpath, 'test160',
                         os.path.join(dpext, 'MCTest', 'mc160.test'),
                         os.path.join(dpext, 'MCTestAnswers', 'mc160.test.ans'))
        create_fb_format(dpath, 'train500',
                         os.path.join(dpext, 'MCTest', 'mc500.train'), None)
        create_fb_format(dpath, 'valid500',
                         os.path.join(dpext, 'MCTest', 'mc500.dev'), None)
        create_fb_format(dpath, 'test500',
                         os.path.join(dpext, 'MCTest', 'mc500.test'),
                         os.path.join(dpext, 'MCTestAnswers', 'mc500.test.ans'))

        # Mark the data as built.
        build_data.mark_done(dpath)

def build(opt):
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'QACNN')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'cnn.tgz'
        gd_id = '0BwmD_VLjROrfTTljRDVZMFJnVWM'
        build_data.download_from_google_drive(gd_id, os.path.join(dpath, fname))
        build_data.untar(dpath, fname)

        create_fb_format(dpath, 'train', os.path.join(dpath, 'cnn', 'questions', 'training'))
        create_fb_format(dpath, 'valid', os.path.join(dpath, 'cnn', 'questions', 'validation'))
        create_fb_format(dpath, 'test', os.path.join(dpath, 'cnn', 'questions', 'test'))

        # Mark the data as built.
        build_data.mark_done(dpath, version)

def build(opt: Opt) -> None:
    dpath = os.path.join(opt['datapath'], DATASET_NAME)

    if build_data.built(dpath, VERSION):
        logging.debug('Data was already built. Skipping the data building.')
        return

    if os.path.exists(dpath):
        logging.debug(f'Removing old/corrupted data in {dpath}.')
        build_data.remove_dir(dpath)

    logging.info(
        f'[building data: {dpath}]\nThis may take a while but only happens once.'
    )
    logging.info(f'Cloning Github repo {GH_REPO}')
    temp_path = os.path.join(dpath, "temp")
    Repo.clone_from(GH_REPO, temp_path)
    build_data.untar(temp_path, 'data.zip')

    # Copy the unzipped data files to dpath.
    for dt in ('train', 'test'):
        fname = f'{dt}.json'
        fsource = os.path.join(temp_path, 'data', fname)
        fdest = os.path.join(dpath, fname)
        os.rename(fsource, fdest)

    # Remove the unused files from the cloned repository.
    build_data.remove_dir(temp_path)

    build_data.mark_done(dpath, VERSION)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'LCCC')
    version = None

    if not build_data.built(dpath, version_string=version):
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        RESOURCES[0].download_file(dpath)

        # Format it for use with ConversationTeacher.
        _create_parlai_format(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

def build(opt): dpath = opt['datapath'] + "/VQA-COCO2014/" if not build_data.built(dpath): print("[building data: " + dpath + "]") build_data.remove_dir(dpath) build_data.make_dir(dpath) # Download the data. fname1 = "Questions_Train_mscoco.zip" fname2 = "Questions_Val_mscoco.zip" fname3 = "Questions_Test_mscoco.zip" fname4 = "Annotations_Val_mscoco.zip" fname5 = "Annotations_Train_mscoco.zip" url = "http://visualqa.org/data/mscoco/vqa/" build_data.download(dpath, url + fname1) build_data.download(dpath, url + fname2) build_data.download(dpath, url + fname3) build_data.download(dpath, url + fname4) build_data.download(dpath, url + fname5) build_data.untar(dpath, fname1) build_data.untar(dpath, fname2) build_data.untar(dpath, fname3) build_data.untar(dpath, fname4) build_data.untar(dpath, fname5) buildImage(dpath) # Mark the data as built. build_data.mark_done(dpath)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'VQA-v2')

    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname1 = 'v2_Questions_Train_mscoco.zip'
        fname2 = 'v2_Questions_Val_mscoco.zip'
        fname3 = 'v2_Questions_Test_mscoco.zip'
        fname4 = 'v2_Annotations_Val_mscoco.zip'
        fname5 = 'v2_Annotations_Train_mscoco.zip'
        url = 'http://visualqa.org/data/mscoco/vqa/'
        build_data.download(dpath, url + fname1)
        build_data.download(dpath, url + fname2)
        build_data.download(dpath, url + fname3)
        build_data.download(dpath, url + fname4)
        build_data.download(dpath, url + fname5)

        build_data.untar(dpath, fname1)
        build_data.untar(dpath, fname2)
        build_data.untar(dpath, fname3)
        build_data.untar(dpath, fname4)
        build_data.untar(dpath, fname5)

        # Mark the data as built.
        build_data.mark_done(dpath)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'CornellMovie')
    version = 'v1.01'

    if not build_data.built(dpath, version_string=version):
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'WikiQA')

    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'wikiqa.tar.gz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/wikiqa/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        dpext = os.path.join(dpath, 'WikiQACorpus')
        create_fb_format(dpath, 'train', os.path.join(dpext, 'WikiQA-train.tsv'))
        create_fb_format(dpath, 'valid', os.path.join(dpext, 'WikiQA-dev.tsv'))
        create_fb_format(dpath, 'test', os.path.join(dpext, 'WikiQA-test.tsv'))
        create_fb_format(dpath, 'train-filtered', os.path.join(dpext, 'WikiQA-train.tsv'))
        create_fb_format(dpath, 'valid-filtered', os.path.join(dpext, 'WikiQA-dev.tsv'))
        create_fb_format(dpath, 'test-filtered', os.path.join(dpext, 'WikiQA-test.tsv'))

        # Mark the data as built.
        build_data.mark_done(dpath)

def build(opt):
    version = MSC_DATASETS_VERSION
    # Resolve the particular variant of the dataset requested by the flags.
    dpath = get_msc_dir_path(opt)

    if not build_data.built(dpath, version):
        logger.warning('[build data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version)

    return dpath

def build(opt):
    # Get the path to the data directory.
    dpath = os.path.join(opt['datapath'], 'wikitext-103')

    # Check whether the data has been built previously.
    if not build_data.built(dpath):
        build_data.make_dir(dpath)

        # Download the data.
        url = 'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip'
        fname = 'wikitext-103.zip'
        build_data.download(url, dpath, fname)

        # Extract the archive, flattening its internal directory structure so
        # that every file lands directly in dpath.
        zip_path = Path(dpath).joinpath(fname)
        with ZipFile(zip_path) as zf:
            for info in zf.infolist():
                if info.filename.endswith('/'):
                    continue  # skip directory entries
                info.filename = os.path.basename(info.filename)
                zf.extract(info, dpath)
        os.remove(zip_path)

        # Convert to ParlAI format.
        ParlAI_format(Path(dpath))

        build_data.mark_done(dpath)

def build(opt): dpath = opt['datapath'] + "/WebQuestions/" if not build_data.built(dpath): print("[building data: " + dpath + "]") build_data.remove_dir(dpath) build_data.make_dir(dpath) # Download the data. url = ("https://worksheets.codalab.org/rest/bundles/" + "0x4a763f8cde224c2da592b75f29e2f5c2/contents/blob/") build_data.download(dpath, url) build_data.move(dpath + 'index.html', dpath + 'train.json') url = ("https://worksheets.codalab.org/rest/bundles/" + "0xe7bac352fce7448c9ef238fb0a297ec2/contents/blob/") build_data.download(dpath, url) build_data.move(dpath + 'index.html', dpath + 'test.json') create_fb_format(dpath, 'train', dpath + 'train.json') create_fb_format(dpath, 'valid', dpath + 'train.json') create_fb_format(dpath, 'test', dpath + 'test.json') # Mark the data as built. build_data.mark_done(dpath)
def build(datapath):
    version = 'v1.0'
    dpath = os.path.join(datapath, 'genderation_bias')

    if not build_data.built(dpath, version):
        logging.info('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version)

def build_download(opt):
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'covid')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data, skipping checksum verification.
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath, check=False)

        # Mark the data as built.
        build_data.mark_done(dpath, version)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'dialog-bAbI-plus')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)

        build_data.mark_done(dpath, version)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'VQA-COCO2014')

    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        # Clear out any partially unpacked subdirectories from a previous run.
        # (Guard on the directory existing, so a fresh datapath does not crash
        # os.listdir.)
        if os.path.isdir(dpath):
            for item in os.listdir(dpath):
                item = os.path.join(dpath, item)
                if os.path.isdir(item):
                    build_data.remove_dir(item)
        build_data.make_dir(dpath)

        # Download the data.
        fname1 = 'Questions_Train_mscoco.zip'
        fname2 = 'Questions_Val_mscoco.zip'
        fname3 = 'Questions_Test_mscoco.zip'
        fname4 = 'Annotations_Val_mscoco.zip'
        fname5 = 'Annotations_Train_mscoco.zip'
        url = 'http://visualqa.org/data/mscoco/vqa/'
        build_data.download(os.path.join(dpath, fname1), url + fname1)
        build_data.download(os.path.join(dpath, fname2), url + fname2)
        build_data.download(os.path.join(dpath, fname3), url + fname3)
        build_data.download(os.path.join(dpath, fname4), url + fname4)
        build_data.download(os.path.join(dpath, fname5), url + fname5)

        build_data.untar(dpath, fname1)
        build_data.untar(dpath, fname2)
        build_data.untar(dpath, fname3)
        build_data.untar(dpath, fname4)
        build_data.untar(dpath, fname5)

        # Mark the data as built.
        build_data.mark_done(dpath)

def buildImage(opt):
    dpath = os.path.join(opt['datapath'], 'COCO-IMG')

    if not build_data.built(dpath, version_string='1'):
        print('[building image data: ' + dpath + ']')
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the image data.
        fname1 = 'train2014.zip'
        fname2 = 'val2014.zip'
        fname3 = 'test2015.zip'
        url1 = 'http://msvocds.blob.core.windows.net/coco2014/'
        url2 = 'http://msvocds.blob.core.windows.net/coco2015/'
        build_data.download(url1 + fname1, dpath, fname1)
        build_data.download(url1 + fname2, dpath, fname2)
        build_data.download(url2 + fname3, dpath, fname3)
        build_data.untar(dpath, fname1)
        build_data.untar(dpath, fname2)
        build_data.untar(dpath, fname3)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string='1')

def build(opt): dpath = opt['datapath'] + "/MCTest/" if not build_data.built(dpath): print("[building data: " + dpath + "]") build_data.remove_dir(dpath) build_data.make_dir(dpath) # Download the data. fname = "mctest.tar.gz" url = "https://s3.amazonaws.com/fair-data/parlai/mctest/" + fname build_data.download(dpath, url) build_data.untar(dpath, fname) dpext = dpath + 'mctest/' create_fb_format(dpath, 'train160', dpext + 'MCTest/mc160.train', None) create_fb_format(dpath, 'valid160', dpext + 'MCTest/mc160.dev', None) create_fb_format(dpath, 'test160', dpext + 'MCTest/mc160.test', dpext + 'MCTestAnswers/mc160.test.ans') create_fb_format(dpath, 'train500', dpext + 'MCTest/mc500.train', None) create_fb_format(dpath, 'valid500', dpext + 'MCTest/mc500.dev', None) create_fb_format(dpath, 'test500', dpext + 'MCTest/mc500.test', dpext + 'MCTestAnswers/mc500.test.ans') # Mark the data as built. build_data.mark_done(dpath)
def build(opt):
    # Get the path to the data directory.
    dpath = os.path.join(opt['datapath'], 'probing', 'wnli')

    # Check whether the data has been built previously.
    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'wnli_orig.zip'
        url = ('https://firebasestorage.googleapis.com/'
               'v0/b/mtl-sentence-representations.appspot.com/'
               'o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf')
        build_data.download(url, dpath, fname)
        build_data.unzip(dpath, fname)

        orig_dpath = os.path.join(dpath, 'wnli_orig')
        os.rename(os.path.join(dpath, 'WNLI'), orig_dpath)

        # Process the data.
        create_probing_format(Path(orig_dpath))

        # Mark the data as built.
        build_data.mark_done(dpath)

def build(opt):
    dpath, version = download(opt)
    if 'light_use_speech_prefix' not in opt:
        opt['light_use_speech_prefix'] = True

    # Resolve the particular variant of the dataset requested by the flags.
    fpath = get_fpath(opt)
    dump_path = os.path.join(opt['datapath'], 'light_dialogue_wild', 'dumps')
    data_path = os.path.join(opt['datapath'], 'light_dialogue_wild', fpath)

    if not build_data.built(data_path, version):
        if build_data.built(data_path):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(data_path)
        build_data.make_dir(data_path)
        build_from_dump(opt, data_path, dump_path)

        # Mark the data as built.
        build_data.mark_done(data_path, version)

def build():
    # Nested helper: `self` is captured from the enclosing scope (the object
    # that owns `self.datapath`).
    version = 'v1.0'
    dpath = os.path.join(self.datapath, 'OffensiveLanguage')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'OffensiveLanguage.txt'
        url = 'http://parl.ai/downloads/offensive_language/' + fname
        build_data.download(url, dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version)

def buildImage(opt):
    dpath = os.path.join(opt['datapath'], 'COCO-IMG-2017')
    version = '1'

    if not build_data.built(dpath, version_string=version):
        print('[building image data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the image data.
        for downloadable_file in RESOURCES[:3]:
            downloadable_file.download_file(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'metalwoz')
    version = '1.0'

    if not build_data.built(dpath, version_string=version):
        if build_data.built(dpath):
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        build_data.make_dir(os.path.join(dpath, 'train', 'dialogues'))
        build_data.make_dir(os.path.join(dpath, 'test', 'dialogues'))

        # Download the data.
        RESOURCES[0].download_file(os.path.join(dpath, 'train'))
        RESOURCES[1].download_file(os.path.join(dpath, 'test'))
        build_data.untar(os.path.join(dpath, 'test'), 'dstc8_metalwoz_heldout.zip')

        build_data.mark_done(dpath, version_string=version)

def build_personality_list(opt: Opt):
    dpath = os.path.join(opt['datapath'], TASK_FOLDER_NAME)
    version = 'v1.0'

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        for downloadable_file in PERSONALITY_LIST_RESOURCES:
            downloadable_file.download_file(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'TriviaQA')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'triviaqa-rc.tar.gz'
        url = 'http://nlp.cs.washington.edu/triviaqa/data/'
        build_data.download(url + fname, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'dialog-bAbI')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'dialog_babi.tar.gz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/dialog_babi/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

def build(opt):
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'Persona-Chat')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'personachat.tgz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/personachat/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'SQuAD')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname1 = 'train-v1.1.json'
        fname2 = 'dev-v1.1.json'
        url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'
        build_data.download(url + fname1, dpath, fname1)
        build_data.download(url + fname2, dpath, fname2)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'CLEVR')
    version = 'v1.0'

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        # An older version exists, so remove these outdated files.
        if build_data.built(dpath):
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'CLEVR_v1.0.zip'
        url = 'https://s3-us-west-1.amazonaws.com/clevr/'
        build_data.download(url + fname, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'OpenSubtitles')
    version = '1'

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        url = 'http://opus.lingfil.uu.se/download.php?f=OpenSubtitles/en.tar.gz'
        build_data.download(url, dpath, 'OpenSubtitles.tar.gz')
        build_data.untar(dpath, 'OpenSubtitles.tar.gz')

        create_fb_format(os.path.join(dpath, 'OpenSubtitles', 'en'), dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

def build(opt):
    data_path = os.path.join(opt['datapath'], 'DialogueQE')
    version = '1501534800'

    if not build_data.built(data_path, version_string=version):
        print('[building data: ' + data_path + ']')
        if build_data.built(data_path):
            build_data.remove_dir(data_path)
        build_data.make_dir(data_path)

        fname = 'data_' + version + '.tar.gz'
        url = 'https://raw.githubusercontent.com/deepmipt/turing-data/master/' + fname
        build_data.download(url, data_path, fname)
        build_data.untar(data_path, fname)

        os.rename(
            os.path.join(data_path, 'data_train_' + version + '.json'),
            os.path.join(data_path, 'train.json'),
        )
        os.rename(
            os.path.join(data_path, 'data_test_' + version + '.json'),
            os.path.join(data_path, 'test.json'),
        )

        build_data.mark_done(data_path, version_string=version)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'personalized-dialog')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data from Dropbox.
        fname = 'personalized-dialog-dataset.tar.gz'
        url = 'https://www.dropbox.com/s/4i9u4y24pt3paba/' + fname + '?dl=1'
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

def build(opt):
    # This task depends on the WikiMovies dataset, so build that first.
    wikimovies_build.build(opt)

    dpath = os.path.join(opt['datapath'], 'DBLL')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'dbll.tgz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/dbll/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'InsuranceQA')
    version = '1'

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data from GitHub.
        fname = 'insuranceqa.zip'
        url = 'https://github.com/shuzi/insuranceQA/archive/master.zip'
        print('[downloading data from: ' + url + ']')
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        ParseInsuranceQAV1.build(dpath)
        ParseInsuranceQAV2.build(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'COPA')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'COPA-resources.tgz'
        url = 'http://people.ict.usc.edu/~gordon/downloads/' + fname
        build_data.download(url, dpath, fname)

        # Uncompress it.
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'negotiation')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        # Make a clean directory if needed.
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data from GitHub.
        fname = 'negotiation.zip'
        url = ('https://github.com/facebookresearch/end-to-end-negotiator/'
               'archive/master.zip')
        print('[downloading data from: ' + url + ']')
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark as done.
        build_data.mark_done(dpath, version_string=version)

def build(opt):
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'SCAN')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'scan.tgz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/scan/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # SCAN ships only train/test splits, so the train file also serves as
        # the validation source.
        create_fb_format(dpath, 'train', os.path.join(dpath, 'tasks_train_simple.txt'))
        create_fb_format(dpath, 'valid', os.path.join(dpath, 'tasks_train_simple.txt'))
        create_fb_format(dpath, 'test', os.path.join(dpath, 'tasks_test_simple.txt'))

        # Mark the data as built.
        build_data.mark_done(dpath, version)

def build(opt):
    dpath = os.path.join(opt['datapath'], 'CornellMovie')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'cornell_movie_dialogs_corpus.zip'
        url = 'http://www.mpi-sws.org/~cristian/data/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        dpext = os.path.join(dpath, 'cornell movie-dialogs corpus')
        create_fb_format(os.path.join(dpext, 'movie_lines.txt'),
                         os.path.join(dpext, 'movie_conversations.txt'),
                         dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

def build(opt): """Create train and validation data for synthetic shapes described by attributes.""" dpath = os.path.join(opt['datapath'], 'taskntalk') if not build_data.built(dpath): print('[building data: ' + dpath + ']') build_data.make_dir(os.path.join(dpath, 'large')) build_data.make_dir(os.path.join(dpath, 'small')) # save training and validation data to_save = { 'attributes': ['color', 'shape', 'style'], 'task_defn': [['color', 'shape'], ['shape', 'color'], ['color', 'style'], ['style', 'color'], ['shape', 'style'], ['style', 'shape']] } split_data = {} # small dataset properties properties = { 'color': ['red', 'green', 'blue', 'purple'], 'shape': ['square', 'triangle', 'circle', 'star'], 'style': ['dotted', 'solid', 'filled', 'dashed'] } to_save['properties'] = properties # properties.values() not used directly to maintain order data_verbose = list(itertools.product(*[properties[key] for key in to_save['attributes']])) # randomly select train and rest of it is valid split_data['valid'] = random.sample(data_verbose, int(0.2 * len(data_verbose))) split_data['train'] = [s for s in data_verbose if s not in split_data['valid']] to_save['data'] = split_data['train'] with open(os.path.join(dpath, 'small', 'train.json'), 'w') as outfile: json.dump(to_save, outfile, indent=4, separators=(',', ': '), sort_keys=True) to_save['data'] = split_data['valid'] with open(os.path.join(dpath, 'small', 'valid.json'), 'w') as outfile: json.dump(to_save, outfile, indent=4, separators=(',', ': '), sort_keys=True) # large dataset properties properties = { 'color': ['red', 'green', 'blue', 'purple', 'yellow', 'cyan', 'orange', 'teal'], 'shape': ['square', 'triangle', 'circle', 'star', 'heart', 'spade', 'club', 'diamond'], 'style': ['dotted', 'solid', 'filled', 'dashed', 'hstripe', 'vstripe', 'hgrad', 'vgrad'] } to_save['properties'] = properties data_verbose = list(itertools.product(*[properties[key] for key in to_save['attributes']])) split_data['valid'] = random.sample(data_verbose, int(0.8 * len(data_verbose))) split_data['train'] = [s for s in data_verbose if s not in split_data['valid']] to_save['data'] = split_data['train'] with open(os.path.join(dpath, 'large', 'train.json'), 'w') as outfile: json.dump(to_save, outfile, indent=4, separators=(',', ': '), sort_keys=True) to_save['data'] = split_data['valid'] with open(os.path.join(dpath, 'large', 'valid.json'), 'w') as outfile: json.dump(to_save, outfile, indent=4, separators=(',', ': '), sort_keys=True) # Mark the data as built. build_data.mark_done(dpath)