def try_downloading(directory, row):
    document_id, kind, story_url, story_size = row['document_id'], \
        row['kind'], row['story_url'], row['story_file_size']
    story_path = os.path.join(directory, document_id + '.content')

    actual_story_size = 0
    if os.path.exists(story_path):
        with open(story_path, 'rb') as f:
            actual_story_size = len(f.read())

    if actual_story_size <= 19000:
        if kind == 'gutenberg':
            time.sleep(2)
        build_data.download(story_url, directory, document_id + '.content')
    else:
        return True

    file_type = subprocess.check_output(['file', '-b', story_path])
    file_type = file_type.decode('utf-8')
    if 'gzip compressed' in file_type:
        gz_path = os.path.join(directory, document_id + '.content.gz')
        shutil.move(story_path, gz_path)
        build_tools.untar(gz_path)

    return False
def build(opt):
    dpath = os.path.join(opt['datapath'], 'MCTest')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'mctest.tar.gz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/mctest/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        dpext = os.path.join(dpath, 'mctest')
        create_fb_format(dpath, 'train160',
                         os.path.join(dpext, 'MCTest', 'mc160.train'), None)
        create_fb_format(dpath, 'valid160',
                         os.path.join(dpext, 'MCTest', 'mc160.dev'), None)
        create_fb_format(dpath, 'test160',
                         os.path.join(dpext, 'MCTest', 'mc160.test'),
                         os.path.join(dpext, 'MCTestAnswers', 'mc160.test.ans'))
        create_fb_format(dpath, 'train500',
                         os.path.join(dpext, 'MCTest', 'mc500.train'), None)
        create_fb_format(dpath, 'valid500',
                         os.path.join(dpext, 'MCTest', 'mc500.dev'), None)
        create_fb_format(dpath, 'test500',
                         os.path.join(dpext, 'MCTest', 'mc500.test'),
                         os.path.join(dpext, 'MCTestAnswers', 'mc500.test.ans'))

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'MovieDialog')
    version = '1'

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        dpath2 = os.path.join(dpath, 'movie_dialog_dataset', 'task4_reddit')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        build_data.make_dir(dpath2)

        # Download the data.
        fname = 'moviedialog.tar.gz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/moviedialog/' + fname
        build_data.download(url, dpath, fname)

        url2 = 'http://tinyurl.com/' + 'p6tyohj'
        build_data.download(url2, dpath2, 'p6tyohj.tgz')

        build_data.untar(dpath, fname)
        build_data.untar(dpath2, 'p6tyohj.tgz')

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'wmt')
    version = 'None'

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fnames = [('train.en', 'train.de', 'en_de_train.txt'),
                  ('newstest2014.en', 'newstest2014.de', 'en_de_test.txt')]
        for (en_fname, de_fname, w_fname) in fnames:
            url_base = 'https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/'
            en_url = url_base + en_fname
            de_url = url_base + de_fname
            build_data.download(en_url, dpath, en_fname)
            build_data.download(de_url, dpath, de_fname)

            with open(os.path.join(dpath, en_fname), 'r') as f:
                en = [l[:-1] for l in f]
            with open(os.path.join(dpath, de_fname), 'r') as f:
                de = [l[:-1] for l in f]

            with open(os.path.join(dpath, w_fname), 'w') as f:
                for de_sent, en_sent in zip(de, en):
                    f.write("1 " + en_sent + "\t" + de_sent + "\n")

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'VQA-v2')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')

        # An older version exists, so remove these outdated files.
        if build_data.built(dpath):
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname1 = 'v2_Questions_Train_mscoco.zip'
        fname2 = 'v2_Questions_Val_mscoco.zip'
        fname3 = 'v2_Questions_Test_mscoco.zip'
        fname4 = 'v2_Annotations_Val_mscoco.zip'
        fname5 = 'v2_Annotations_Train_mscoco.zip'

        url = 'http://visualqa.org/data/mscoco/vqa/'

        build_data.download(url + fname1, dpath, fname1)
        build_data.download(url + fname2, dpath, fname2)
        build_data.download(url + fname3, dpath, fname3)
        build_data.download(url + fname4, dpath, fname4)
        build_data.download(url + fname5, dpath, fname5)

        build_data.untar(dpath, fname1)
        build_data.untar(dpath, fname2)
        build_data.untar(dpath, fname3)
        build_data.untar(dpath, fname4)
        build_data.untar(dpath, fname5)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'WikiQA')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'wikiqa.tar.gz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/wikiqa/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        dpext = os.path.join(dpath, 'WikiQACorpus')
        create_fb_format(dpath, 'train', os.path.join(dpext, 'WikiQA-train.tsv'))
        create_fb_format(dpath, 'valid', os.path.join(dpext, 'WikiQA-dev.tsv'))
        create_fb_format(dpath, 'test', os.path.join(dpext, 'WikiQA-test.tsv'))
        create_fb_format(dpath, 'train-filtered',
                         os.path.join(dpext, 'WikiQA-train.tsv'))
        create_fb_format(dpath, 'valid-filtered',
                         os.path.join(dpext, 'WikiQA-dev.tsv'))
        create_fb_format(dpath, 'test-filtered',
                         os.path.join(dpext, 'WikiQA-test.tsv'))

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'WebQuestions')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        url = ('https://worksheets.codalab.org/rest/bundles/' +
               '0x4a763f8cde224c2da592b75f29e2f5c2/contents/blob/')
        build_data.download(url, dpath, 'train.json')
        url = ('https://worksheets.codalab.org/rest/bundles/' +
               '0xe7bac352fce7448c9ef238fb0a297ec2/contents/blob/')
        build_data.download(url, dpath, 'test.json')

        create_fb_format(dpath, 'train', os.path.join(dpath, 'train.json'))
        create_fb_format(dpath, 'valid', os.path.join(dpath, 'train.json'))
        create_fb_format(dpath, 'test', os.path.join(dpath, 'test.json'))

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'Twitter')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname1 = "twitter_en_big.txt.gz.partaa"
        fname2 = "twitter_en_big.txt.gz.partab"
        url = 'https://github.com/Marsan-Ma/chat_corpus/raw/master/'
        build_data.download(url + fname1, dpath, fname1)
        build_data.download(url + fname2, dpath, fname2)

        file1 = os.path.join(dpath, fname1)
        file2 = os.path.join(dpath, fname2)
        file3 = "twitter_en_big.txt.gz"
        outzipfile = os.path.join(dpath, file3)
        build_data.cat(file1, file2, outzipfile)

        import gzip
        with gzip.open(outzipfile, 'r') as f:
            file_content = bytes.decode(f.read())
        data = file_content.split('\n')[2:]
        create_fb_format(data, dpath)
        os.remove(outzipfile)

        # Mark the data as built.
        build_data.mark_done(dpath, version)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'NarrativeQA')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # an older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # download the data.
        fname = 'narrative_qa.zip'
        # dataset URL
        url = NARRATIVE_QA_DOWNLOAD_URL
        build_data.download(url, dpath, fname)

        # uncompress it
        build_data.untar(dpath, fname)

        print('downloading stories now')
        base_path = os.path.join(dpath, 'narrativeqa-master')
        download_stories(base_path)

        # move from tmp to stories
        tmp_stories_path = os.path.join(base_path, 'tmp')
        new_stories_path = os.path.join(base_path, 'stories')
        shutil.move(tmp_stories_path, new_stories_path)

        # divide into train, valid and test for summaries
        summaries_csv_path = os.path.join(base_path, 'third_party',
                                          'wikipedia', 'summaries.csv')
        new_path = os.path.join(base_path, 'summaries.csv')
        shutil.move(summaries_csv_path, new_path)
        divide_csv_into_sets(new_path)

        # divide into sets for questions
        questions_path = os.path.join(base_path, 'qaps.csv')
        divide_csv_into_sets(questions_path)

        # divide into sets for documents
        documents_path = os.path.join(base_path, 'documents.csv')
        divide_csv_into_sets(documents_path)

        # move specific set's files into their set's folder
        make_folders(base_path)
        move_files(base_path)

        # move narrativeqa-master to narrative_qa
        new_path = os.path.join(dpath, 'narrative_qa')
        shutil.move(base_path, new_path)

        # mark the data as built
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    version = 'v0.9'
    dpath = os.path.join(opt['datapath'], 'VisDial-v0.9')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname1 = 'visdial_0.9_train.zip'
        fname2 = 'visdial_0.9_val.zip'
        url = 'https://computing.ece.vt.edu/~abhshkdz/data/visdial/'
        build_data.download(url + fname1, dpath, fname1)
        build_data.download(url + fname2, dpath, fname2)
        build_data.untar(dpath, fname1)
        build_data.untar(dpath, fname2)

        print('processing unpacked files')

        # Use 1000 examples from training set as validation.
        json1 = os.path.join(dpath, fname1.rsplit('.', 1)[0] + '.json')
        with open(json1) as t_json:
            train_data = json.load(t_json)

        valid_data = train_data.copy()
        valid_data['data'] = train_data['data'].copy()
        valid_data['data']['dialogs'] = []

        # Use constant stride to pick examples.
        num_valid = 1000
        total = len(train_data['data']['dialogs'])
        step = total // (num_valid - 1)
        for i in range(total - 1, 0, -step)[:num_valid]:
            valid_data['data']['dialogs'].append(train_data['data']['dialogs'][i])
            del train_data['data']['dialogs'][i]

        train_json = json1.rsplit('.', 1)[0] + '_train.json'
        valid_json = json1.rsplit('.', 1)[0] + '_valid.json'
        with open(train_json, 'w') as t_out, open(valid_json, 'w') as v_out:
            json.dump(train_data, t_out)
            json.dump(valid_data, v_out)
        os.remove(json1)

        # Use validation data as test.
        json2 = os.path.join(dpath, fname2.rsplit('.', 1)[0] + '.json')
        test_json = json2.rsplit('.', 1)[0] + '_test.json'
        build_data.move(json2, test_json)

        # Mark the data as built.
        build_data.mark_done(dpath, version)
def readFiles(dpath, rfnames):
    en_fname, de_fname = rfnames
    url_base = 'https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/'
    en_url = url_base + en_fname
    de_url = url_base + de_fname
    build_data.download(en_url, dpath, en_fname)
    build_data.download(de_url, dpath, de_fname)

    with open(os.path.join(dpath, en_fname), 'r') as f:
        # We replace '##AT##-##AT##' as a workaround in order to use the
        # nltk tokenizer specified by DictionaryAgent
        en = [l[:-1].replace("##AT##-##AT##", "__AT__") for l in f]
    with open(os.path.join(dpath, de_fname), 'r') as f:
        de = [l[:-1].replace("##AT##-##AT##", "__AT__") for l in f]

    return list(zip(de, en))
def build():
    version = 'v1.0'
    dpath = os.path.join(datapath, "persona_biases_categories")

    if not build_data.built(dpath, version):
        logging.info(f"building data: {dpath}")
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fnames = ["demographic_groups.txt", "offensive_adjectives.txt"]
        for fname in fnames:
            url = "http://parl.ai/downloads/persona_biases_categories/" + fname
            build_data.download(url, dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version)

    return dpath
def build(opt):
    dpath = os.path.join(opt['datapath'], 'SST')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'stanfordSentimentTreebank.zip'
        url = 'http://nlp.stanford.edu/~socherr/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def download(datapath, version='v1.0'):
    dpath = os.path.join(datapath, 'models', 'bert_models')

    if not build_data.built(dpath, version):
        print('[downloading BERT models: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fnames = ['bert-base-uncased.tar.gz', 'bert-base-uncased-vocab.txt']
        for fname in fnames:
            url = 'https://s3.amazonaws.com/models.huggingface.co/bert/' + fname
            build_data.download(url, dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version)
def build(datapath):
    version = 'v1.0'
    dpath = os.path.join(datapath, 'dialogue_safety')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fnames = [SINGLE_TURN_DATA, MULTI_TURN_DATA]
        for fname in fnames:
            url = 'http://parl.ai/downloads/dialogue_safety/' + fname
            build_data.download(url, dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'dialog-bAbI')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'dialog_babi.tar.gz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/dialog_babi/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'talkthewalk')
    version = 'None'

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'talkthewalk.tgz'
        url = 'http://parl.ai/downloads/projects/talkthewalk/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'TriviaQA')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'triviaqa-rc.tar.gz'
        url = 'http://nlp.cs.washington.edu/triviaqa/data/'
        build_data.download(url + fname, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'dialog-bAbI')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'dialog_babi.tar.gz'
        url = 'http://parl.ai/downloads/dialog_babi/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'Persona-Chat')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'personachat.tgz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/personachat/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'mnist')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'mnist.tar.gz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/mnist/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'QA-SRL')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fnames = ['wiki1.train.qa', 'wiki1.dev.qa', 'wiki1.test.qa']
        for fname in fnames:
            url = 'https://dada.cs.washington.edu/qasrl/data/' + fname
            build_data.download(url, dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'FVQA')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')

        # An older version exists, so remove these outdated files.
        if build_data.built(dpath):
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        build_data.download(
            'https://dl.dropboxusercontent.com/s/iyz6l7jhbt6jb7q/new_dataset_release.zip',  # noqa: E501
            dpath, 'FVQA.zip')
        build_data.untar(dpath, 'FVQA.zip')

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    version = 'v5.0'
    dpath = os.path.join(opt['datapath'], 'ConvAI2')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'convai2_fix_723.tgz'
        url = 'http://parl.ai/downloads/convai2/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version)
def build(opt):
    data_path = os.path.join(opt['datapath'], 'ConvAIChitChat')
    version = '1501534800'

    if not build_data.built(data_path, version_string=version):
        print('[building data: ' + data_path + ']')
        if build_data.built(data_path):
            build_data.remove_dir(data_path)
        build_data.make_dir(data_path)

        fname = 'data_' + version + '.tar.gz'
        url = 'https://raw.githubusercontent.com/deepmipt/turing-data/master/' + fname

        build_data.download(url, data_path, fname)
        build_data.untar(data_path, fname)

        os.rename(os.path.join(data_path, 'data_train_' + version + '.json'),
                  os.path.join(data_path, 'train.json'))
        os.rename(os.path.join(data_path, 'data_test_' + version + '.json'),
                  os.path.join(data_path, 'test.json'))

        # Extract 10% of train.json into valid.json.
        with open(os.path.join(data_path, 'train.json')) as data_file:
            dialogs = json.load(data_file)

        random.seed(0)
        valid_dialogs_idxes = random.sample(range(len(dialogs)),
                                            round(len(dialogs) * 0.1))
        valid_dialogs = [dialogs[idx] for idx in valid_dialogs_idxes]
        train_dialogs_idxes = list(
            set(range(len(dialogs))) - set(valid_dialogs_idxes))
        train_dialogs = [dialogs[idx] for idx in train_dialogs_idxes]

        with open(os.path.join(data_path, 'valid.json'), 'w') as valid_data_file:
            json.dump(valid_dialogs, valid_data_file)
        with open(os.path.join(data_path, 'train.json'), 'w') as train_data_file:
            json.dump(train_dialogs, train_data_file)

        build_data.mark_done(data_path, version_string=version)
def build(opt):
    # Depends upon another dataset, wikimovies, build that first.
    wikimovies_build.build(opt)

    dpath = os.path.join(opt['datapath'], 'MTurkWikiMovies')

    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'mturkwikimovies.tar.gz'
        url = ('https://s3.amazonaws.com/fair-data/parlai/mturkwikimovies/' +
               fname)
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'CLEVR')
    version = 'v1.0'

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')

        # An older version exists, so remove these outdated files.
        if build_data.built(dpath):
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'CLEVR_v1.0.zip'
        url = 'https://s3-us-west-1.amazonaws.com/clevr/'
        build_data.download(url + fname, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'personalized-dialog')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        # https://www.dropbox.com/s/4i9u4y24pt3paba/personalized-dialog-dataset.tar.gz?dl=1
        fname = 'personalized-dialog-dataset.tar.gz'
        url = 'https://www.dropbox.com/s/4i9u4y24pt3paba/' + fname + '?dl=1'
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    data_path = os.path.join(opt['datapath'], 'DialogueQE')
    version = '1501534800'

    if not build_data.built(data_path, version_string=version):
        print('[building data: ' + data_path + ']')
        if build_data.built(data_path):
            build_data.remove_dir(data_path)
        build_data.make_dir(data_path)

        fname = 'data_' + version + '.tar.gz'
        url = 'https://raw.githubusercontent.com/deepmipt/turing-data/master/' + fname

        build_data.download(url, data_path, fname)
        build_data.untar(data_path, fname)

        os.rename(os.path.join(data_path, 'data_train_' + version + '.json'),
                  os.path.join(data_path, 'train.json'))
        os.rename(os.path.join(data_path, 'data_test_' + version + '.json'),
                  os.path.join(data_path, 'test.json'))

        build_data.mark_done(data_path, version_string=version)
def build(opt):
    datapath = os.path.join(opt['datapath'], 'DREAM')
    build_data.make_dir(datapath)
    version = None

    if not build_data.built(datapath, version_string=version):
        print('[building data: ' + datapath + ']')
        if build_data.built(datapath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(datapath)

        # Download the data.
        splits = ['train', 'dev', 'test']
        for split in splits:
            fname = split + '.json'
            url = 'https://raw.githubusercontent.com/nlpdata/dream/master/data/' + fname
            build_data.download(url, datapath, fname)

        # Mark the data as built.
        build_data.mark_done(datapath, version_string=version)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'OpenSubtitles')
    version = '1'

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        url = 'http://opus.lingfil.uu.se/download.php?f=OpenSubtitles/en.tar.gz'
        build_data.download(url, dpath, 'OpenSubtitles.tar.gz')
        build_data.untar(dpath, 'OpenSubtitles.tar.gz')

        create_fb_format(os.path.join(dpath, 'OpenSubtitles', 'en'), dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'SQuAD')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname1 = 'train-v1.1.json'
        fname2 = 'dev-v1.1.json'
        url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'
        build_data.download(url + fname1, dpath, fname1)
        build_data.download(url + fname2, dpath, fname2)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def _build_data(self) -> Tuple[str, str, str]:
    """
    Build data.

    Maybe download the appropriate data.

    :return (bpe_data, json_path, vocab_path):
        bpe data, path to the encoder json, and path to the vocab bpe file
    """
    data_path = os.path.join(self.opt['datapath'], 'gpt2')
    vocab_path = os.path.join(data_path, 'vocab.bpe')
    json_path = os.path.join(data_path, 'encoder.json')
    if not PathManager.exists(vocab_path) or not PathManager.exists(json_path):
        make_dir(data_path)
        download(self.DEFAULT_VOCAB_BPE, data_path, 'vocab.bpe')
        download(self.DEFAULT_ENCODER_JSON, data_path, 'encoder.json')
    with PathManager.open(vocab_path, 'r', encoding="utf-8") as f:
        bpe_data = f.read()

    return bpe_data, json_path, vocab_path
def build(opt):
    dpath = os.path.join(opt['datapath'], 'MutualFriends')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        url = (
            'https://worksheets.codalab.org/rest/bundles/'
            '0x5a4cefea7fd443cea15aa532bb8fcd67/contents/blob/'
        )
        build_data.download(url, dpath, 'data.json')

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    dpath = opt['datapath'] + "/CornellMovie/"

    if not build_data.built(dpath):
        print("[building data: " + dpath + "]")
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = "cornell_movie_dialogs_corpus.zip"
        url = "http://www.mpi-sws.org/~cristian/data/" + fname
        build_data.download(dpath, url)
        build_data.untar(dpath, fname)

        dpext = dpath + '/cornell movie-dialogs corpus/'
        create_fb_format(dpext + 'movie_lines.txt',
                         dpext + 'movie_conversations.txt', dpath)

        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'MCTest')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'mctest.tar.gz'
        url = 'http://parl.ai/downloads/mctest/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        dpext = os.path.join(dpath, 'mctest')
        create_fb_format(dpath, 'train160',
                         os.path.join(dpext, 'MCTest', 'mc160.train'), None)
        create_fb_format(dpath, 'valid160',
                         os.path.join(dpext, 'MCTest', 'mc160.dev'), None)
        create_fb_format(
            dpath,
            'test160',
            os.path.join(dpext, 'MCTest', 'mc160.test'),
            os.path.join(dpext, 'MCTestAnswers', 'mc160.test.ans'),
        )
        create_fb_format(dpath, 'train500',
                         os.path.join(dpext, 'MCTest', 'mc500.train'), None)
        create_fb_format(dpath, 'valid500',
                         os.path.join(dpext, 'MCTest', 'mc500.dev'), None)
        create_fb_format(
            dpath,
            'test500',
            os.path.join(dpext, 'MCTest', 'mc500.test'),
            os.path.join(dpext, 'MCTestAnswers', 'mc500.test.ans'),
        )

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'VisDial-v0.9')

    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname1 = 'visdial_0.9_train.zip'
        fname2 = 'visdial_0.9_val.zip'
        url = 'https://computing.ece.vt.edu/~abhshkdz/data/visdial/'
        build_data.download(dpath, url + fname1)
        build_data.download(dpath, url + fname2)
        build_data.untar(dpath, fname1)
        build_data.untar(dpath, fname2)

        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'CornellMovie')

    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'cornell_movie_dialogs_corpus.zip'
        url = 'http://www.mpi-sws.org/~cristian/data/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        dpext = os.path.join(dpath, 'cornell movie-dialogs corpus')
        create_fb_format(os.path.join(dpext, 'movie_lines.txt'),
                         os.path.join(dpext, 'movie_conversations.txt'),
                         dpath)

        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'WoZ')
    version = 'None'

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fnames = ['woz_test_en.json', 'woz_train_en.json', 'woz_validate_en.json']
        for fname in fnames:
            url = ('https://github.com/nmrksic/'
                   'neural-belief-tracker/raw/master/data/woz/' + fname)
            build_data.download(url, dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'MovieDialog')

    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'moviedialog.tar.gz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/moviedialog/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        dpath2 = os.path.join(dpath, 'movie_dialog_dataset', 'task4_reddit')
        url2 = 'http://tinyurl.com/' + 'p6tyohj'
        build_data.download(url2, dpath2, 'p6tyohj.tgz')
        build_data.untar(dpath2, 'p6tyohj.tgz')

        # Mark the data as built.
        build_data.mark_done(dpath)
def build(datapath):
    dpath = os.path.join(datapath, 'OpenSubtitles')
    version = '2'

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        url = 'http://opus.lingfil.uu.se/download.php?f=OpenSubtitles/en.tar.gz'
        build_data.download(url, dpath, 'OpenSubtitles.tar.gz')
        build_data.untar(dpath, 'OpenSubtitles.tar.gz')

        create_fb_format(os.path.join(dpath, 'OpenSubtitles', 'en'), dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)

    return dpath
def build(opt):
    dpath = os.path.join(opt['datapath'], 'MS_MARCO')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data
        url = "https://msmarco.blob.core.windows.net/msmarco/"

        fname = "train_v1.1.json.gz"
        build_data.download(url + fname, dpath, 'train.gz')

        fname = "dev_v1.1.json.gz"
        build_data.download(url + fname, dpath, 'valid.gz')

        fname = "test_public_v1.1.json.gz"
        build_data.download(url + fname, dpath, 'test.gz')

        create_fb_format(dpath, "train", os.path.join(dpath, 'train.gz'))
        create_fb_format(dpath, "valid", os.path.join(dpath, 'valid.gz'))
        create_fb_format(dpath, "test", os.path.join(dpath, 'test.gz'))

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def buildImage(opt):
    dpath = os.path.join(opt['datapath'], 'COCO-IMG')
    version = '1'

    if not build_data.built(dpath, version_string=version):
        print('[building image data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the image data.
        fname1 = 'train2014.zip'
        fname2 = 'val2014.zip'
        fname3 = 'test2015.zip'
        url = 'https://s3.amazonaws.com/fair-data/parlai/COCO-IMG/'
        build_data.download(url + fname1, dpath, fname1)
        build_data.download(url + fname2, dpath, fname2)
        build_data.download(url + fname3, dpath, fname3)

        build_data.untar(dpath, fname1)
        build_data.untar(dpath, fname2)
        build_data.untar(dpath, fname3)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def download(opt):
    version = 'v2.03'

    # download pickled database
    dpath = os.path.join(opt['datapath'], 'light_dialogue')
    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        url = 'http://parl.ai/downloads/light/' + 'light-dialog-processed-small7.pkl'
        fname = 'light_data.pkl'
        build_data.download(url, dpath, fname)

        # Download the unseen data.
        url = 'http://parl.ai/downloads/light/light-unseen-processed2.pkl'
        fname = 'light_unseen_data.pkl'
        build_data.download(url, dpath, fname)

        # Download the environment dataset.
        url = 'http://parl.ai/downloads/light/light-environment.pkl'
        fname = 'light_environment.pkl'
        build_data.download(url, dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version)

    return dpath, version
def build(opt):
    # Depends upon another dataset, wikimovies, build that first.
    wikimovies_build.build(opt)

    dpath = os.path.join(opt['datapath'], 'DBLL')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'dbll.tgz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/dbll/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'COPA')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # an older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # download the data.
        fname = 'COPA-resources.tgz'
        # dataset URL
        url = 'http://people.ict.usc.edu/~gordon/downloads/' + fname
        build_data.download(url, dpath, fname)

        # uncompress it
        build_data.untar(dpath, fname)

        # mark the data as built
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'negotiation')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')

        # make a clean directory if needed
        if build_data.built(dpath):
            # an older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data from github
        fname = 'negotiation.zip'
        url = ('https://github.com/facebookresearch/end-to-end-negotiator/'
               'archive/master.zip')
        print('[downloading data from: ' + url + ']')
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark as done
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'InsuranceQA')
    version = '1'

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data from github.
        fname = 'insuranceqa.zip'
        url = 'https://github.com/shuzi/insuranceQA/archive/master.zip'
        print('[downloading data from: ' + url + ']')
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        ParseInsuranceQAV1.build(dpath)
        ParseInsuranceQAV2.build(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'SCAN')

    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'scan.tgz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/scan/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # The valid split is carved out of the simple training file.
        create_fb_format(dpath, 'train',
                         os.path.join(dpath, 'tasks_train_simple.txt'))
        create_fb_format(dpath, 'valid',
                         os.path.join(dpath, 'tasks_train_simple.txt'))
        create_fb_format(dpath, 'test',
                         os.path.join(dpath, 'tasks_test_simple.txt'))

        # Mark the data as built.
        build_data.mark_done(dpath, version)
def build(opt):
    dpath = os.path.join(opt['datapath'], 'CornellMovie')
    version = None

    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'cornell_movie_dialogs_corpus.zip'
        url = 'http://www.mpi-sws.org/~cristian/data/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        dpext = os.path.join(dpath, 'cornell movie-dialogs corpus')
        create_fb_format(os.path.join(dpext, 'movie_lines.txt'),
                         os.path.join(dpext, 'movie_conversations.txt'),
                         dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)