def build(opt):
    """Download the QA CNN data and convert each split to FB dialog format."""
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'QACNN')
    if build_data.built(dpath, version):
        return  # already built at this version
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # A stale build from an older version exists; wipe it first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # Fetch and unpack the archive from Google Drive.
    archive = 'cnn.tgz'
    build_data.download_from_google_drive(
        '0BwmD_VLjROrfTTljRDVZMFJnVWM', os.path.join(dpath, archive)
    )
    build_data.untar(dpath, archive)
    # Convert each split directory into ParlAI's FB dialog format.
    questions_dir = os.path.join(dpath, 'cnn', 'questions')
    for split, src in (('train', 'training'),
                       ('valid', 'validation'),
                       ('test', 'test')):
        create_fb_format(dpath, split, os.path.join(questions_dir, src))
    # Record that this version has been built.
    build_data.mark_done(dpath, version)
def build(opt):
    """Download the MovieDialog corpus plus its separately hosted reddit task."""
    dpath = os.path.join(opt['datapath'], 'MovieDialog')
    version = '1'
    if build_data.built(dpath, version_string=version):
        return  # nothing to do
    print('[building data: ' + dpath + ']')
    reddit_path = os.path.join(dpath, 'movie_dialog_dataset', 'task4_reddit')
    if build_data.built(dpath):
        # Remove any outdated build before re-downloading.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    build_data.make_dir(reddit_path)
    # Main dataset archive.
    fname = 'moviedialog.tar.gz'
    build_data.download(
        'https://s3.amazonaws.com/fair-data/parlai/moviedialog/' + fname,
        dpath,
        fname,
    )
    # The reddit portion is hosted behind a short link.
    build_data.download('http://tinyurl.com/' + 'p6tyohj', reddit_path, 'p6tyohj.tgz')
    build_data.untar(dpath, fname)
    build_data.untar(reddit_path, 'p6tyohj.tgz')
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download WikiQA and emit FB-format files for every split."""
    dpath = os.path.join(opt['datapath'], 'WikiQA')
    version = None
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Clear an outdated build first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    archive = 'wikiqa.tar.gz'
    build_data.download(
        'https://s3.amazonaws.com/fair-data/parlai/wikiqa/' + archive, dpath, archive
    )
    build_data.untar(dpath, archive)
    corpus = os.path.join(dpath, 'WikiQACorpus')
    splits = (('train', 'WikiQA-train.tsv'),
              ('valid', 'WikiQA-dev.tsv'),
              ('test', 'WikiQA-test.tsv'))
    for split, tsv in splits:
        create_fb_format(dpath, split, os.path.join(corpus, tsv))
    # The '-filtered' variants are generated from the same source files.
    for split, tsv in splits:
        create_fb_format(dpath, split + '-filtered', os.path.join(corpus, tsv))
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the VQA v2 question and annotation archives."""
    dpath = os.path.join(opt['datapath'], 'VQA-v2')
    version = None
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Drop files left behind by a previous (unversioned) build.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    url = 'http://visualqa.org/data/mscoco/vqa/'
    archives = [
        'v2_Questions_Train_mscoco.zip',
        'v2_Questions_Val_mscoco.zip',
        'v2_Questions_Test_mscoco.zip',
        'v2_Annotations_Val_mscoco.zip',
        'v2_Annotations_Train_mscoco.zip',
    ]
    # Download everything first, then unpack in the same order.
    for archive in archives:
        build_data.download(url + archive, dpath, archive)
    for archive in archives:
        build_data.untar(dpath, archive)
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download MCTest and convert the mc160/mc500 splits to FB format."""
    dpath = os.path.join(opt['datapath'], 'MCTest')
    version = None
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Stale unversioned build present; remove it.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    archive = 'mctest.tar.gz'
    build_data.download(
        'https://s3.amazonaws.com/fair-data/parlai/mctest/' + archive, dpath, archive
    )
    build_data.untar(dpath, archive)
    dpext = os.path.join(dpath, 'mctest')
    for size in ('160', '500'):
        stem = 'mc' + size
        # Train/dev carry answers inline; only the test split has a
        # separate answer file.
        create_fb_format(dpath, 'train' + size,
                         os.path.join(dpext, 'MCTest', stem + '.train'), None)
        create_fb_format(dpath, 'valid' + size,
                         os.path.join(dpext, 'MCTest', stem + '.dev'), None)
        create_fb_format(dpath, 'test' + size,
                         os.path.join(dpext, 'MCTest', stem + '.test'),
                         os.path.join(dpext, 'MCTestAnswers', stem + '.test.ans'))
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download NarrativeQA, fetch its stories, and split the csv files."""
    dpath = os.path.join(opt['datapath'], 'NarrativeQA')
    version = None
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Clear an outdated build.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # Grab and unpack the repository archive.
    archive = 'narrative_qa.zip'
    build_data.download(NARRATIVE_QA_DOWNLOAD_URL, dpath, archive)
    build_data.untar(dpath, archive)
    print('downloading stories now')
    base_path = os.path.join(dpath, 'narrativeqa-master')
    download_stories(base_path)
    # Stories land in tmp/; promote them to stories/.
    shutil.move(os.path.join(base_path, 'tmp'),
                os.path.join(base_path, 'stories'))
    # Move summaries.csv up to the repo root, then split it into sets.
    summaries_dst = os.path.join(base_path, 'summaries.csv')
    shutil.move(
        os.path.join(base_path, 'third_party', 'wikipedia', 'summaries.csv'),
        summaries_dst,
    )
    divide_csv_into_sets(summaries_dst)
    # Questions and documents are split in place.
    divide_csv_into_sets(os.path.join(base_path, 'qaps.csv'))
    divide_csv_into_sets(os.path.join(base_path, 'documents.csv'))
    # Create the per-set folders and distribute the files into them.
    make_folders(base_path)
    move_files(base_path)
    # Final location is <datapath>/NarrativeQA/narrative_qa.
    shutil.move(base_path, os.path.join(dpath, 'narrative_qa'))
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Build VisDial v0.9, carving a 1000-example validation set out of train."""
    version = 'v0.9'
    dpath = os.path.join(opt['datapath'], 'VisDial-v0.9')
    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        # Download the data.
        fname1 = 'visdial_0.9_train.zip'
        fname2 = 'visdial_0.9_val.zip'
        url = 'https://computing.ece.vt.edu/~abhshkdz/data/visdial/'
        build_data.download(url + fname1, dpath, fname1)
        build_data.download(url + fname2, dpath, fname2)
        build_data.untar(dpath, fname1)
        build_data.untar(dpath, fname2)
        print('processing unpacked files')
        # Use 1000 examples from training set as validation.
        json1 = os.path.join(dpath, fname1.rsplit('.', 1)[0] + '.json')
        with open(json1) as t_json:
            train_data = json.load(t_json)
        # Shallow-copy the wrapper dicts so valid_data shares everything
        # with train_data except the dialog list, which starts empty.
        valid_data = train_data.copy()
        valid_data['data'] = train_data['data'].copy()
        valid_data['data']['dialogs'] = []
        # Use constant stride to pick examples.
        num_valid = 1000
        total = len(train_data['data']['dialogs'])
        step = total // (num_valid - 1)
        # Walk indices from the end of the list so the del calls below do
        # not shift positions that are still to be visited.
        for i in range(total - 1, 0, -step)[:num_valid]:
            valid_data['data']['dialogs'].append(train_data['data']['dialogs'][i])
            del train_data['data']['dialogs'][i]
        train_json = json1.rsplit('.', 1)[0] + '_train.json'
        valid_json = json1.rsplit('.', 1)[0] + '_valid.json'
        with open(train_json, 'w') as t_out, open(valid_json, 'w') as v_out:
            json.dump(train_data, t_out)
            json.dump(valid_data, v_out)
        # The original combined train file is no longer needed.
        os.remove(json1)
        # Use validation data as test.
        json2 = os.path.join(dpath, fname2.rsplit('.', 1)[0] + '.json')
        test_json = json2.rsplit('.', 1)[0] + '_test.json'
        build_data.move(json2, test_json)
        # Mark the data as built.
        build_data.mark_done(dpath, version)
def build(opt):
    """Fetch and unpack the Wizard of Wikipedia dataset."""
    dpath = os.path.join(opt['datapath'], 'wizard_of_wikipedia')
    fname = 'wizard_of_wikipedia.tgz'
    version = '1.0'
    if build_data.built(dpath, version):
        return  # current version already on disk
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Older build present: clear it before re-downloading.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    build_data.download(
        'http://parl.ai/downloads/wizard_of_wikipedia/' + fname, dpath, fname
    )
    build_data.untar(dpath, fname)
    build_data.mark_done(dpath, version)
def build(opt):
    """Build the augmented DailyDialog data.

    Downloads the stock dailydialog release, augments the training dialogs
    via augment_dataset, and rewrites train.json in place with the packaged
    augmented dialogs.
    """
    dpath = os.path.join(opt['datapath'], 'dailydialog_augmented')
    version = 'None'
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        # Download the data.
        remote_fname = 'dailydialog.tar.gz'
        local_fname = 'dailydialog_augmented.tar.gz'
        url = 'http://parl.ai/downloads/dailydialog/' + remote_fname
        build_data.download(url, dpath, local_fname)
        build_data.untar(dpath, local_fname)
        fpath = os.path.join(dpath, 'train.json')
        with open(fpath, mode='r+') as f:
            # train.json is JSON-lines: one dialog object per line; keep
            # only the utterance texts for augmentation.
            data = []
            print('augmenting dailydialog')
            for line in f:
                dialog = [obj['text'] for obj in json.loads(line)['dialogue']]
                data.append(dialog)
            augmented_data = augment_dataset(data)

            def package_dialog(dialog):
                # Re-wrap bare utterance strings in the dailydialog schema.
                packaged_utterances = [
                    {'emotion': "", 'act': "", 'text': utterance}
                    for utterance in dialog
                ]
                return {
                    "fold": "train",
                    "topic": "",
                    "dialogue": packaged_utterances,
                }

            augmented_data = list(map(package_dialog, augmented_data))
            # Rewrite the file in place. FIX: truncate after the dump —
            # without it, if the new JSON is shorter than the original
            # content, stale trailing bytes survive and corrupt the file.
            f.seek(0)
            json.dump(augmented_data, f)
            f.truncate()
        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download InsuranceQA (V2) and convert it to FB dialog format."""
    dpath = os.path.join(opt['datapath'], 'InsuranceQA')
    if build_data.built(dpath):
        return
    print('[building data: ' + dpath + ']')
    build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # Download the data from github.
    archive = 'insuranceqa.zip'
    build_data.download(
        'https://github.com/shuzi/insuranceQA/archive/master.zip',
        dpath, archive, redownload=False,
    )
    build_data.untar(dpath, archive)
    # According to the author, V2 holds the latest data
    dpext = os.path.join(dpath, 'insuranceQA-master/V2')
    # read vocab file
    d_vocab = read_vocab(os.path.join(dpext, "vocabulary"))
    # read label2answer file
    d_label_answer = read_label2answer(
        os.path.join(dpext, "InsuranceQA.label2answer.token.encoded.gz"), d_vocab
    )
    # TODO: right now it uses 100 by default, but 500, 1000, 1500
    # (# of label candidates) should also be available
    train_path_gz = os.path.join(
        dpext,
        "InsuranceQA.question.anslabel.token.100.pool.solr.train.encoded.gz",
    )
    valid_path_gz = os.path.join(
        dpext,
        "InsuranceQA.question.anslabel.token.100.pool.solr.valid.encoded.gz",
    )
    test_path_gz = os.path.join(
        dpext,
        "InsuranceQA.question.anslabel.token.100.pool.solr.test.encoded.gz",
    )
    for split, src in (('train', train_path_gz),
                       ('valid', valid_path_gz),
                       ('test', test_path_gz)):
        create_fb_format(dpath, split, src, d_vocab, d_label_answer)
    # Mark the data as built.
    build_data.mark_done(dpath)
def build_data_for_agent(opt):
    """Prepare the working tree for the coreference agent.

    Creates the <model_file>/<language>/agent directory layout, downloads the
    word embeddings, the char vocabulary and (when opt['name'] is
    'pretrain_model') the pretrained model, and creates the report folders.
    Returns None.
    """
    # get path to data directory and create folders tree
    dpath = join(opt['model_file'])
    # define languages
    language = opt['language']
    dpath = join(dpath, language, 'agent')
    build_data.make_dir(dpath)
    build_data.make_dir(join(dpath, 'embeddings'))
    build_data.make_dir(join(dpath, 'vocab'))
    build_data.make_dir(join(dpath, 'logs', opt['name']))

    if not isfile(join(dpath, 'embeddings', 'embeddings_lenta_100.vec')):
        print('[Download the word embeddings]...')
        try:
            embed_url = os.environ['EMBEDDINGS_URL'] + 'embeddings_lenta_100.vec'
            build_data.download(embed_url, join(dpath, 'embeddings'),
                                'embeddings_lenta_100.vec')
            print('[End of download the word embeddings]...')
        except RuntimeWarning:
            # FIX: the original did raise('...'), which raises a TypeError in
            # Python 3 (only BaseException instances can be raised). Raise a
            # real exception carrying the instruction instead.
            # NOTE(review): a missing EMBEDDINGS_URL env var raises KeyError,
            # not RuntimeWarning — confirm the intended failure path.
            raise RuntimeError(
                'To use your own embeddings, please, put the file '
                'embeddings_lenta_100.vec in the folder '
                '{0}'.format(join(dpath, 'embeddings')))

    if not isfile(join(dpath, 'vocab', 'char_vocab.russian.txt')):
        print('[Download the chars vocalibary]...')
        try:
            vocab_url = os.environ['MODELS_URL'] + 'coreference/vocabs/char_vocab.russian.txt'
            build_data.download(vocab_url, join(dpath, 'vocab'),
                                'char_vocab.russian.txt')
            print('[End of download the chars vocalibary]...')
        except RuntimeWarning:
            # FIX: raise a real exception (see note above).
            raise RuntimeError(
                'To use your own char vocalibary, please, put the file '
                'char_vocab.russian.txt in the folder '
                '{0}'.format(join(dpath, 'vocabs')))

    if opt['name'] == 'pretrain_model' and not isdir(join(dpath, 'logs', 'pretrain_model')):
        print('[Download the pretrain model]...')
        try:
            pretrain_url = os.environ['MODELS_URL'] + 'coreference/OpeanAI/pretrain_model.zip'
            build_data.download(pretrain_url, join(dpath, 'logs'), 'pretrain_model.zip')
            build_data.untar(join(dpath, 'logs'), 'pretrain_model.zip')
            print('[End of download pretrain model]...')
        except RuntimeWarning:
            # FIX: raise a real exception (see note above).
            raise RuntimeError(
                'To train your own model, please, change the variable --name '
                'in build.py:train_coreference to anything other than '
                '`pretrain_model`')

    build_data.make_dir(join(dpath, 'reports', 'response_files'))
    build_data.make_dir(join(dpath, 'reports', 'results'))
    build_data.make_dir(join(dpath, 'reports', 'predictions'))
    return None
def build(opt):
    """Download and unpack the dialog-bAbI tasks."""
    dpath = os.path.join(opt['datapath'], 'dialog-bAbI')
    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        # Download the data.
        fname = 'dialog_babi.tar.gz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/dialog_babi/' + fname
        # FIX: build_data.download takes (url, path, fname); the arguments
        # were reversed and the target filename was missing, unlike every
        # other call in this codebase (see the newer dialog-bAbI builder).
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)
        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    """Download and unpack the dialog-bAbI tasks."""
    dpath = opt['datapath'] + "/dialog-bAbI/"
    if not build_data.built(dpath):
        print("[building data: " + dpath + "]")
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        # Download the data.
        fname = "dialog_babi.tar.gz"
        url = "https://s3.amazonaws.com/fair-data/parlai/dialog_babi/" + fname
        # FIX: build_data.download takes (url, path, fname); the arguments
        # were reversed and the target filename was missing.
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)
        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    """Download and unpack the BookTest dataset."""
    dpath = os.path.join(opt['datapath'], 'BookTest')
    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        # Download the data.
        fname = 'booktest.tar.bz2'
        url = 'https://s3.amazonaws.com/fair-data/parlai/booktest/' + fname
        # FIX: build_data.download takes (url, path, fname); the arguments
        # were reversed and the target filename was missing.
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)
        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    """Download and unpack the BookTest dataset."""
    dpath = opt['datapath'] + "/BookTest/"
    if not build_data.built(dpath):
        print("[building data: " + dpath + "]")
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        # Download the data.
        fname = "booktest.tar.bz2"
        url = "https://s3.amazonaws.com/fair-data/parlai/booktest/" + fname
        # FIX: build_data.download takes (url, path, fname); the arguments
        # were reversed and the target filename was missing.
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)
        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    """Download and unpack the Ubuntu dialog corpus."""
    dpath = opt['datapath'] + '/Ubuntu/'
    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        # Download the data.
        fname = 'ubuntu.tar.gz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/ubuntu/' + fname
        # FIX: build_data.download takes (url, path, fname); the arguments
        # were reversed and the target filename was missing.
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)
        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    """Download and unpack the WikiMovies dataset."""
    dpath = os.path.join(opt['datapath'], 'WikiMovies')
    if build_data.built(dpath):
        return
    print('[building data: ' + dpath + ']')
    # Remove any partial build, then recreate the directory.
    build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    archive = 'wikimovies.tar.gz'
    build_data.download(
        'https://s3.amazonaws.com/fair-data/parlai/wikimovies/' + archive,
        dpath, archive,
    )
    build_data.untar(dpath, archive)
    build_data.mark_done(dpath)
def build(opt):
    """Download and unpack the FVQA dataset."""
    version = None
    dpath = os.path.join(opt['datapath'], 'FVQA')
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Remove files left behind by an older build.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # The archive is hosted on Dropbox; saved locally as FVQA.zip.
    build_data.download(
        'https://dl.dropboxusercontent.com/s/iyz6l7jhbt6jb7q/new_dataset_release.zip',  # noqa: E501
        dpath,
        'FVQA.zip',
    )
    build_data.untar(dpath, 'FVQA.zip')
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download MetaLWOz train dialogues and the heldout test set."""
    dpath = os.path.join(opt["datapath"], "metalwoz")
    version = "1.0"
    if build_data.built(dpath, version_string=version):
        return
    if build_data.built(dpath):
        # A differently-versioned build exists; remove it.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    train_dir = os.path.join(dpath, "train")
    test_dir = os.path.join(dpath, "test")
    build_data.make_dir(os.path.join(train_dir, "dialogues"))
    build_data.make_dir(os.path.join(test_dir, "dialogues"))
    # Fetch both resources; only the heldout test set ships zipped.
    RESOURCES[0].download_file(train_dir)
    RESOURCES[1].download_file(test_dir)
    build_data.untar(test_dir, "dstc8_metalwoz_heldout.zip")
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download and unpack the personalized-dialog dataset."""
    dpath = os.path.join(opt['datapath'], 'personalized-dialog')
    if build_data.built(dpath):
        return
    print('[building data: ' + dpath + ']')
    build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # Dropbox direct-download link (the '?dl=1' suffix forces the file
    # itself rather than the preview page).
    fname = 'personalized-dialog-dataset.tar.gz'
    build_data.download(
        'https://www.dropbox.com/s/4i9u4y24pt3paba/' + fname + '?dl=1',
        dpath, fname,
    )
    build_data.untar(dpath, fname)
    build_data.mark_done(dpath)
def build(opt):
    """Build the metalwoz data directory if it is not already current."""
    dpath = os.path.join(opt['datapath'], 'metalwoz')
    version = '1.0'
    if not build_data.built(dpath, version_string=version):
        if build_data.built(dpath):
            # Out-of-date build: start from scratch.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        for split in ('train', 'test'):
            build_data.make_dir(os.path.join(dpath, split, 'dialogues'))
        # Download the data; the heldout test portion needs unzipping.
        RESOURCES[0].download_file(os.path.join(dpath, 'train'))
        RESOURCES[1].download_file(os.path.join(dpath, 'test'))
        build_data.untar(os.path.join(dpath, 'test'), 'dstc8_metalwoz_heldout.zip')
        build_data.mark_done(dpath, version_string=version)
def download_process_wikiqa(data_path='data'):
    """Fetch WikiQA into data_path and write FB-format files for each split."""
    dpath = os.path.join(data_path, 'WikiQA')
    build_data.make_dir(dpath)
    archive = 'wikiqa.tar.gz'
    build_data.download('http://parl.ai/downloads/wikiqa/' + archive, dpath, archive)
    build_data.untar(dpath, archive)
    corpus = os.path.join(dpath, 'WikiQACorpus')
    splits = (('train', 'WikiQA-train.tsv'),
              ('valid', 'WikiQA-dev.tsv'),
              ('test', 'WikiQA-test.tsv'))
    for split, tsv in splits:
        create_fb_format(dpath, split, os.path.join(corpus, tsv))
    # The '-filtered' variants are generated from the same source tsv files.
    for split, tsv in splits:
        create_fb_format(dpath, split + '-filtered', os.path.join(corpus, tsv))
def build(opt):
    """Download and unpack the SimpleQuestions dataset."""
    dpath = os.path.join(opt['datapath'], 'SimpleQuestions')
    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        # Download the data.
        fname = 'simplequestions.tar.gz'
        url = ('https://s3.amazonaws.com/fair-data/parlai/simplequestions/' + fname)
        # FIX: build_data.download takes (url, path, fname); the arguments
        # were reversed and the target filename was missing.
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)
        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    """Download and unpack the SimpleQuestions dataset."""
    dpath = opt['datapath'] + "/SimpleQuestions/"
    if not build_data.built(dpath):
        print("[building data: " + dpath + "]")
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        # Download the data.
        fname = "simplequestions.tar.gz"
        url = ("https://s3.amazonaws.com/fair-data/parlai/simplequestions/" + fname)
        # FIX: build_data.download takes (url, path, fname); the arguments
        # were reversed and the target filename was missing.
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)
        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    """Download either the full extracted wikipedia dump or just summaries."""
    dpath = os.path.join(opt['datapath'], 'wikipedia')
    task = opt.get('task', 'wikipedia:all')
    # 'wikipedia:all' selects the fully extracted articles; any other
    # task suffix gets the summaries-only archive.
    if task.split(':')[-1] == 'all':
        dpath = os.path.join(dpath, 'full')
        fname = 'wiki_full_extracted.tgz'
    else:
        dpath = os.path.join(dpath, 'summary')
        fname = "summaries.tgz"
    if build_data.built(dpath):
        return
    print('[building data: ' + dpath + ']')
    build_data.make_dir(dpath)
    build_data.download('http://parl.ai/downloads/wikipedia/' + fname, dpath, fname)
    build_data.untar(dpath, fname)
    build_data.mark_done(dpath)
def build(opt):
    """Download and unpack dialog-bAbI-plus from Google Drive."""
    dpath = os.path.join(opt['datapath'], 'dialog-bAbI-plus')
    fname = "dialog-bAbI-plus.zip"
    version = None
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An older build is present: remove it before downloading.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    drive_url = "https://drive.google.com/uc?export=download&id=0B2MvoQfXtqZmMTJqclpBdGN2bmc"
    build_data.download(drive_url, dpath, fname)
    build_data.untar(dpath, fname)
    build_data.mark_done(dpath, version)
def build(opt):
    """Download MCTest and emit FB-format files for mc160 and mc500."""
    dpath = os.path.join(opt['datapath'], 'MCTest')
    version = None
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Wipe an outdated build before downloading.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    archive = 'mctest.tar.gz'
    build_data.download('http://parl.ai/downloads/mctest/' + archive, dpath, archive)
    build_data.untar(dpath, archive)
    dpext = os.path.join(dpath, 'mctest')
    questions = os.path.join(dpext, 'MCTest')
    answers = os.path.join(dpext, 'MCTestAnswers')
    for stem in ('mc160', 'mc500'):
        suffix = stem[2:]
        # Only the test split has a separate answer file.
        create_fb_format(
            dpath, 'train' + suffix, os.path.join(questions, stem + '.train'), None
        )
        create_fb_format(
            dpath, 'valid' + suffix, os.path.join(questions, stem + '.dev'), None
        )
        create_fb_format(
            dpath,
            'test' + suffix,
            os.path.join(questions, stem + '.test'),
            os.path.join(answers, stem + '.test.ans'),
        )
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download OpenSubtitles (en) and convert it to FB dialog format."""
    dpath = os.path.join(opt['datapath'], 'OpenSubtitles')
    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        # Download the data.
        fname = 'download.php?f=OpenSubtitles/en.tar.gz'
        url = ('http://opus.lingfil.uu.se/' + fname)
        # FIX: build_data.download takes (url, path, fname); the original
        # passed the destination path first and the URL second. The archive
        # is saved as OpenSubtitles.tar.gz, which the untar below expects.
        build_data.download(url, dpath, 'OpenSubtitles.tar.gz')
        build_data.untar(dpath, 'OpenSubtitles.tar.gz')
        create_fb_format(os.path.join(dpath, 'OpenSubtitles', 'en'), dpath)
        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    """Download personality_captions; fetch images unless a yfcc path is set."""
    dpath = os.path.join(opt['datapath'], 'personality_captions')
    image_path = os.path.join(dpath, 'images')
    fname = 'personality_captions.tgz'
    version = '1.0'
    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # Stale build from an earlier version; remove it.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        build_data.download(
            'http://parl.ai/downloads/personality_captions/' + fname, dpath, fname
        )
        build_data.untar(dpath, fname)
        build_data.mark_done(dpath, version)
    # Images are built separately; skipped when the user supplies a local
    # YFCC path instead.
    if not build_data.built(image_path, version) and not opt.get('yfcc_path'):
        download_images(opt)
def build(opt):
    """Download OpenSubtitles (en) and convert it to FB dialog format."""
    dpath = opt['datapath'] + "/OpenSubtitles/"
    if not build_data.built(dpath):
        print("[building data: " + dpath + "]")
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        # Download the data.
        fname = "download.php?f=OpenSubtitles/en.tar.gz"
        url = ("http://opus.lingfil.uu.se/" + fname)
        # FIX: build_data.download takes (url, path, fname); the arguments
        # were reversed. Saving under an explicit local name also lets us
        # untar the exact file we wrote instead of a percent-encoded guess.
        build_data.download(url, dpath, 'OpenSubtitles.tar.gz')
        build_data.untar(dpath, 'OpenSubtitles.tar.gz')
        create_fb_format(dpath + '/OpenSubtitles/en/', dpath)
        # Mark the data as built.
        build_data.mark_done(dpath)
def download(opt, path, fname, version='1.0'):
    """Download a convai2 model file into models/<path>/<stem>/, untarring archives."""
    # The target directory is named after the filename minus its extension(s).
    stem = fname[:fname.find('.')] if '.' in fname else fname
    dpath = os.path.join(opt['datapath'], 'models', path, stem)
    if build_data.built(dpath, version):
        return
    print('[downloading: ' + dpath + '/' + fname + ']')
    if build_data.built(dpath):
        # An older version exists; clear it out first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    build_data.download(
        'https://s3.amazonaws.com/fair-data/parlai/_models/convai2/' + fname,
        dpath, fname,
    )
    # Unpack compressed archives in place.
    if '.tgz' in fname or '.gz' in fname:
        build_data.untar(dpath, fname)
    build_data.mark_done(dpath, version)
def build(opt):
    """Build DBLL (depends on WikiMovies, which is built first)."""
    # Depends upon another dataset, wikimovies, build that first.
    wikimovies_build.build(opt)
    dpath = opt['datapath'] + "/DBLL/"
    if not build_data.built(dpath):
        print("[building data: " + dpath + "]")
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        # Download the data.
        fname = "dbll.tgz"
        url = "https://s3.amazonaws.com/fair-data/parlai/dbll/" + fname
        # FIX: build_data.download takes (url, path, fname); the arguments
        # were reversed and the target filename was missing.
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)
        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    """Build DBLL (depends on WikiMovies, which is built first)."""
    # Depends upon another dataset, wikimovies, build that first.
    wikimovies_build.build(opt)
    dpath = os.path.join(opt['datapath'], 'DBLL')
    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        # Download the data.
        fname = 'dbll.tgz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/dbll/' + fname
        # FIX: build_data.download takes (url, path, fname); the arguments
        # were reversed and the target filename was missing.
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)
        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    """Download and unpack the TriviaQA reading-comprehension data."""
    dpath = os.path.join(opt['datapath'], 'TriviaQA')
    version = None
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Clear an outdated build.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    archive = 'triviaqa-rc.tar.gz'
    build_data.download(
        'http://nlp.cs.washington.edu/triviaqa/data/' + archive, dpath, archive
    )
    build_data.untar(dpath, archive)
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download and unpack the ConvAI2 dataset."""
    version = 'v3.0'
    dpath = os.path.join(opt['datapath'], 'ConvAI2')
    if build_data.built(dpath, version):
        return  # current version already present
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # A previous version is on disk; clear it out.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    archive = 'convai2.tar.gz'
    build_data.download(
        'https://s3.amazonaws.com/fair-data/parlai/convai2/' + archive, dpath, archive
    )
    build_data.untar(dpath, archive)
    build_data.mark_done(dpath, version)
def build(opt):
    """Download and unpack the Persona-Chat dataset."""
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'Persona-Chat')
    if build_data.built(dpath, version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Remove files from an earlier version before rebuilding.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    archive = 'personachat.tgz'
    build_data.download(
        'https://s3.amazonaws.com/fair-data/parlai/personachat/' + archive,
        dpath, archive,
    )
    build_data.untar(dpath, archive)
    build_data.mark_done(dpath, version)
def build(opt):
    """Download and unpack TriviaQA (reading-comprehension release)."""
    version = None
    dpath = os.path.join(opt['datapath'], 'TriviaQA')
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An outdated copy exists; remove it.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        # Fetch and unpack the archive.
        fname = 'triviaqa-rc.tar.gz'
        base_url = 'http://nlp.cs.washington.edu/triviaqa/data/'
        build_data.download(base_url + fname, dpath, fname)
        build_data.untar(dpath, fname)
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download and unpack the Talk the Walk dataset."""
    dpath = os.path.join(opt['datapath'], 'TalkTheWalk')
    version = None
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Remove leftovers from an older build.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    archive = 'talkthewalk.tgz'
    build_data.download(
        'https://s3.amazonaws.com/fair-data/parlai/projects/talkthewalk/' + archive,
        dpath, archive,
    )
    build_data.untar(dpath, archive)
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download and unpack the dialog-bAbI tasks."""
    dpath = os.path.join(opt['datapath'], 'dialog-bAbI')
    version = None
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Clear a stale build first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    archive = 'dialog_babi.tar.gz'
    build_data.download(
        'https://s3.amazonaws.com/fair-data/parlai/dialog_babi/' + archive,
        dpath, archive,
    )
    build_data.untar(dpath, archive)
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download and unpack the bAbI tasks."""
    dpath = os.path.join(opt['datapath'], 'bAbI')
    version = 'None'
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Remove files from an older build.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    archive = 'babi.tar.gz'
    build_data.download('http://parl.ai/downloads/babi/' + archive, dpath, archive)
    build_data.untar(dpath, archive)
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download and unpack the personalized-dialog dataset (versioned)."""
    dpath = os.path.join(opt['datapath'], 'personalized-dialog')
    version = None
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Remove an outdated build before re-downloading.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # Dropbox direct-download link (the '?dl=1' suffix forces the file).
    # https://www.dropbox.com/s/4i9u4y24pt3paba/personalized-dialog-dataset.tar.gz?dl=1
    archive = 'personalized-dialog-dataset.tar.gz'
    build_data.download(
        'https://www.dropbox.com/s/4i9u4y24pt3paba/' + archive + '?dl=1',
        dpath, archive,
    )
    build_data.untar(dpath, archive)
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download OpenSubtitles (en) and convert it to FB dialog format."""
    version = '1'
    dpath = os.path.join(opt['datapath'], 'OpenSubtitles')
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # An out-of-date build exists; remove it first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # The OPUS endpoint serves the archive via a download.php query;
    # save it locally under a clean name.
    archive = 'OpenSubtitles.tar.gz'
    build_data.download(
        'http://opus.lingfil.uu.se/download.php?f=OpenSubtitles/en.tar.gz',
        dpath, archive,
    )
    build_data.untar(dpath, archive)
    create_fb_format(os.path.join(dpath, 'OpenSubtitles', 'en'), dpath)
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download and unpack the CLEVR v1.0 dataset."""
    dpath = os.path.join(opt['datapath'], 'CLEVR')
    version = 'v1.0'
    if build_data.built(dpath, version_string=version):
        return
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # A previous version exists; clear it before rebuilding.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    archive = 'CLEVR_v1.0.zip'
    build_data.download(
        'https://s3-us-west-1.amazonaws.com/clevr/' + archive, dpath, archive
    )
    build_data.untar(dpath, archive)
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download DialogueQE and rename its versioned json files to train/test."""
    data_path = os.path.join(opt['datapath'], 'DialogueQE')
    version = '1501534800'
    if build_data.built(data_path, version_string=version):
        return
    print('[building data: ' + data_path + ']')
    if build_data.built(data_path):
        # Earlier build present; remove it.
        build_data.remove_dir(data_path)
    build_data.make_dir(data_path)
    archive = 'data_' + version + '.tar.gz'
    build_data.download(
        'https://raw.githubusercontent.com/deepmipt/turing-data/master/' + archive,
        data_path, archive,
    )
    build_data.untar(data_path, archive)
    # Strip the version stamp from the extracted file names.
    for split in ('train', 'test'):
        os.rename(
            os.path.join(data_path, 'data_' + split + '_' + version + '.json'),
            os.path.join(data_path, split + '.json'),
        )
    build_data.mark_done(data_path, version_string=version)
def buildImage(opt):
    """Download and unpack the COCO image archives (train/val/test splits)."""
    dpath = os.path.join(opt['datapath'], 'COCO-IMG')
    version = '1'
    if build_data.built(dpath, version_string=version):
        return  # images already built at this version
    print('[building image data: ' + dpath + ']')
    if build_data.built(dpath):
        # An outdated build exists; remove it before re-downloading.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    url = 'https://s3.amazonaws.com/fair-data/parlai/COCO-IMG/'
    archives = ('train2014.zip', 'val2014.zip', 'test2015.zip')
    # Download every archive first, then extract them all.
    for zipname in archives:
        build_data.download(url + zipname, dpath, zipname)
    for zipname in archives:
        build_data.untar(dpath, zipname)
    # Mark the image data as built.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the DBLL dataset; builds its WikiMovies dependency first."""
    # DBLL depends on the WikiMovies data being present.
    wikimovies_build.build(opt)
    dpath = os.path.join(opt['datapath'], 'DBLL')
    version = None
    if build_data.built(dpath, version_string=version):
        return  # already built
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Drop any stale build before rebuilding.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # Fetch and extract the archive.
    archive = 'dbll.tgz'
    url = 'https://s3.amazonaws.com/fair-data/parlai/dbll/' + archive
    build_data.download(url, dpath, archive)
    build_data.untar(dpath, archive)
    # Record completion.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download and unpack the COPA resources under opt['datapath']/COPA."""
    dpath = os.path.join(opt['datapath'], 'COPA')
    version = None
    if build_data.built(dpath, version_string=version):
        return  # already built
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Remove files left over from an older build.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # Download and extract the dataset archive.
    archive = 'COPA-resources.tgz'
    url = 'http://people.ict.usc.edu/~gordon/downloads/' + archive
    build_data.download(url, dpath, archive)
    build_data.untar(dpath, archive)
    # Flag the build as complete.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the end-to-end negotiator data from its GitHub repository."""
    dpath = os.path.join(opt['datapath'], 'negotiation')
    version = None
    if build_data.built(dpath, version_string=version):
        return  # already built
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Start from a clean directory if an older build exists.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # Pull the repository archive from GitHub.
    archive = 'negotiation.zip'
    url = ('https://github.com/facebookresearch/end-to-end-negotiator/'
           'archive/master.zip')
    print('[downloading data from: ' + url + ']')
    build_data.download(url, dpath, archive)
    build_data.untar(dpath, archive)
    # Record the finished build.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download InsuranceQA from GitHub and parse both dataset versions."""
    dpath = os.path.join(opt['datapath'], 'InsuranceQA')
    version = '1'
    if build_data.built(dpath, version_string=version):
        return  # current version already built
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Remove an outdated build before re-downloading.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # Fetch the repository archive.
    archive = 'insuranceqa.zip'
    url = 'https://github.com/shuzi/insuranceQA/archive/master.zip'
    print('[downloading data from: ' + url + ']')
    build_data.download(url, dpath, archive)
    build_data.untar(dpath, archive)
    # Convert both released versions of the corpus.
    ParseInsuranceQAV1.build(dpath)
    ParseInsuranceQAV2.build(dpath)
    # Stamp the build as done.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the Cornell Movie-Dialogs corpus and convert it to FB format."""
    dpath = os.path.join(opt['datapath'], 'CornellMovie')
    version = None
    if build_data.built(dpath, version_string=version):
        return  # already built
    print('[building data: ' + dpath + ']')
    if build_data.built(dpath):
        # Clear out files from an older build.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)
    # Fetch and unpack the corpus archive.
    archive = 'cornell_movie_dialogs_corpus.zip'
    url = 'http://www.mpi-sws.org/~cristian/data/' + archive
    build_data.download(url, dpath, archive)
    build_data.untar(dpath, archive)
    # The archive extracts into a directory with spaces in its name.
    corpus_dir = os.path.join(dpath, 'cornell movie-dialogs corpus')
    create_fb_format(os.path.join(corpus_dir, 'movie_lines.txt'),
                     os.path.join(corpus_dir, 'movie_conversations.txt'),
                     dpath)
    # Record the finished build.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the SCAN dataset and convert its splits to FB dialog format.

    Builds under opt['datapath']/SCAN; skips all work when the current
    version is already present.

    Note: 'valid' is intentionally generated from the train file
    (tasks_train_simple.txt) — the valid split is carved out of the
    training data downstream.
    """
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'SCAN')
    # Use version_string= keyword for consistency with the other builders.
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'scan.tgz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/scan/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # (Removed an unused 'ext' path left over from the CNN/DailyMail
        # builder this file was copied from.)
        create_fb_format(dpath, 'train',
                         os.path.join(dpath, 'tasks_train_simple.txt'))
        create_fb_format(dpath, 'valid',
                         os.path.join(dpath, 'tasks_train_simple.txt'))
        create_fb_format(dpath, 'test',
                         os.path.join(dpath, 'tasks_test_simple.txt'))

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)