def build(opt):
    """Download the WMT14 en-de files and merge each pair into one text file."""
    dpath = os.path.join(opt['datapath'], 'wmt')
    version = 'None'
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # Remove files left over from an older build.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download each English/German file pair and merge them.
        url_base = 'https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/'
        fnames = [
            ('train.en', 'train.de', 'en_de_train.txt'),
            ('newstest2014.en', 'newstest2014.de', 'en_de_test.txt'),
        ]
        for en_fname, de_fname, w_fname in fnames:
            build_data.download(url_base + en_fname, dpath, en_fname)
            build_data.download(url_base + de_fname, dpath, de_fname)

            # Strip the trailing newline from every line of each file.
            with open(os.path.join(dpath, en_fname), 'r') as f:
                en_lines = [line[:-1] for line in f]
            with open(os.path.join(dpath, de_fname), 'r') as f:
                de_lines = [line[:-1] for line in f]

            # Each output line: "1 <english>\t<german>".
            with open(os.path.join(dpath, w_fname), 'w') as f:
                for de_sent, en_sent in zip(de_lines, en_lines):
                    f.write("1 " + en_sent + "\t" + de_sent + "\n")

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the Twitter chat corpus and convert it to FB dialog format."""
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'Twitter')
    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # Clear out files left behind by an older build.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # The corpus is split into two gzip parts; download both.
        url = 'https://github.com/Marsan-Ma/chat_corpus/raw/master/'
        fname1 = "twitter_en_big.txt.gz.partaa"
        fname2 = "twitter_en_big.txt.gz.partab"
        build_data.download(url + fname1, dpath, fname1)
        build_data.download(url + fname2, dpath, fname2)

        # Concatenate the parts back into a single gzip archive.
        file1 = os.path.join(dpath, fname1)
        file2 = os.path.join(dpath, fname2)
        file3 = "twitter_en_big.txt.gz"
        outzipfile = os.path.join(dpath, file3)
        build_data.cat(file1, file2, outzipfile)

        import gzip

        # Decode the whole archive and drop the first two lines.
        with gzip.open(outzipfile, 'r') as f:
            file_content = bytes.decode(f.read())
        data = file_content.split('\n')[2:]
        create_fb_format(data, dpath)
        os.remove(outzipfile)

        # Mark the data as built.
        build_data.mark_done(dpath, version)
def build(opt):
    """Download MCTest and convert every split to FB dialog format."""
    dpath = os.path.join(opt['datapath'], 'MCTest')
    version = None
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # Remove files left over from an older build.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download and unpack the archive.
        fname = 'mctest.tar.gz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/mctest/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        dpext = os.path.join(dpath, 'mctest')
        # (split name, raw file, answer file or None) for both corpus sizes;
        # only the test splits ship a separate answers file.
        splits = [
            ('train160', 'mc160.train', None),
            ('valid160', 'mc160.dev', None),
            ('test160', 'mc160.test', 'mc160.test.ans'),
            ('train500', 'mc500.train', None),
            ('valid500', 'mc500.dev', None),
            ('test500', 'mc500.test', 'mc500.test.ans'),
        ]
        for split, raw, ans in splits:
            ans_path = os.path.join(dpext, 'MCTestAnswers', ans) if ans else None
            create_fb_format(dpath, split, os.path.join(dpext, 'MCTest', raw), ans_path)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download WebQuestions and write train/valid/test in FB format."""
    dpath = os.path.join(opt['datapath'], 'WebQuestions')
    version = None
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # Remove files left over from an older build.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download both CodaLab bundles.
        train_url = (
            'https://worksheets.codalab.org/rest/bundles/'
            + '0x4a763f8cde224c2da592b75f29e2f5c2/contents/blob/'
        )
        build_data.download(train_url, dpath, 'train.json')
        test_url = (
            'https://worksheets.codalab.org/rest/bundles/'
            + '0xe7bac352fce7448c9ef238fb0a297ec2/contents/blob/'
        )
        build_data.download(test_url, dpath, 'test.json')

        # NOTE: the valid split is also generated from the training file.
        create_fb_format(dpath, 'train', os.path.join(dpath, 'train.json'))
        create_fb_format(dpath, 'valid', os.path.join(dpath, 'train.json'))
        create_fb_format(dpath, 'test', os.path.join(dpath, 'test.json'))

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Assemble WMT14 en-de splits, carving a validation set out of train."""
    dpath = os.path.join(opt['datapath'], 'wmt')
    version = 'None'
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # Remove files left over from an older build.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        train_r_fnames = ('train.en', 'train.de')
        test_r_fnames = ('newstest2014.en', 'newstest2014.de')
        train_w_fname = 'en_de_train.txt'
        valid_w_fname = 'en_de_valid.txt'
        test_w_fname = 'en_de_test.txt'

        def _write_pairs(pairs, out_fname):
            # Each output line: "1 <english>\t<german>".
            with open(os.path.join(dpath, out_fname), 'w') as f:
                for de_sent, en_sent in pairs:
                    f.write("1 " + en_sent + "\t" + de_sent + "\n")

        train_zip = readFiles(dpath, train_r_fnames)
        numpy.random.shuffle(train_zip)
        # The first 30k shuffled pairs become the validation set.
        _write_pairs(train_zip[:30000], valid_w_fname)
        _write_pairs(train_zip[30000:], train_w_fname)

        _write_pairs(readFiles(dpath, test_r_fnames), test_w_fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def buildImage(opt):
    """Download and unpack the COCO image archives."""
    dpath = os.path.join(opt['datapath'], 'COCO-IMG')
    version = '1'
    if not build_data.built(dpath, version_string=version):
        print('[building image data: ' + dpath + ']')
        if build_data.built(dpath):
            # Remove files left over from an older build.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download all archives first, then unpack them in the same order.
        url = 'https://s3.amazonaws.com/fair-data/parlai/COCO-IMG/'
        fnames = ['train2014.zip', 'val2014.zip', 'test2015.zip']
        for fname in fnames:
            build_data.download(url + fname, dpath, fname)
        for fname in fnames:
            build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the QACNN data from Google Drive and format each split."""
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'QACNN')
    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # Remove files left over from an older build.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Fetch the archive from Google Drive and unpack it.
        fname = 'cnn.tgz'
        gd_id = '0BwmD_VLjROrfTTljRDVZMFJnVWM'
        build_data.download_from_google_drive(gd_id, os.path.join(dpath, fname))
        build_data.untar(dpath, fname)

        # Convert each question directory into FB dialog format.
        for split, folder in (
            ('train', 'training'),
            ('valid', 'validation'),
            ('test', 'test'),
        ):
            create_fb_format(dpath, split, os.path.join(dpath, 'cnn', 'questions', folder))

        # Mark the data as built.
        build_data.mark_done(dpath, version)
def build(opt):
    """Download the MS MARCO v1.1 archives and format each split."""
    dpath = os.path.join(opt['datapath'], 'MS_MARCO')
    version = None
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # Remove files left over from an older build.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download each split under a canonical local name.
        url = "https://msmarco.blob.core.windows.net/msmarco/"
        downloads = [
            ("train_v1.1.json.gz", 'train.gz'),
            ("dev_v1.1.json.gz", 'valid.gz'),
            ("test_public_v1.1.json.gz", 'test.gz'),
        ]
        for remote, local in downloads:
            build_data.download(url + remote, dpath, local)

        for split in ("train", "valid", "test"):
            create_fb_format(dpath, split, os.path.join(dpath, split + '.gz'))

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the MovieDialog dataset plus the task4 reddit data."""
    dpath = os.path.join(opt['datapath'], 'MovieDialog')
    version = '1'
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        dpath2 = os.path.join(dpath, 'movie_dialog_dataset', 'task4_reddit')
        if build_data.built(dpath):
            # Remove files left over from an older build.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)
        build_data.make_dir(dpath2)

        # Main archive.
        fname = 'moviedialog.tar.gz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/moviedialog/' + fname
        build_data.download(url, dpath, fname)
        # Extra data for task 4, served behind a tinyurl redirect.
        url2 = 'http://tinyurl.com/' + 'p6tyohj'
        build_data.download(url2, dpath2, 'p6tyohj.tgz')
        build_data.untar(dpath, fname)
        build_data.untar(dpath2, 'p6tyohj.tgz')

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download and unzip the VQA v2 question/annotation files."""
    dpath = os.path.join(opt['datapath'], 'VQA-v2')
    version = None
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # Remove files left over from an older build.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download every archive first, then unpack them in the same order.
        url = 'http://visualqa.org/data/mscoco/vqa/'
        fnames = [
            'v2_Questions_Train_mscoco.zip',
            'v2_Questions_Val_mscoco.zip',
            'v2_Questions_Test_mscoco.zip',
            'v2_Annotations_Val_mscoco.zip',
            'v2_Annotations_Train_mscoco.zip',
        ]
        for fname in fnames:
            build_data.download(url + fname, dpath, fname)
        for fname in fnames:
            build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def load(self, path):
    """Load the image at `path` according to opt['image_mode'].

    Returns None (mode 'none'), a PIL RGB image ('raw'), an ascii
    rendering ('ascii'), or a cached/extracted feature array for any
    other mode name.
    """
    mode = self.opt.get('image_mode', 'raw')
    if mode is None or mode == 'none':
        # Images are disabled; nothing to load.
        return None
    if mode == 'raw':
        # Raw RGB pixel values.
        return Image.open(path).convert('RGB')
    if mode == 'ascii':
        # Convert the image to ascii art.
        return self.img_to_ascii(path)

    # Any other mode names a sibling directory of preprocessed features,
    # one .npy file per image.
    prepath, imagefn = os.path.split(path)
    dpath = os.path.join(prepath, mode)
    if not os.path.exists(dpath):
        build_data.make_dir(dpath)
    new_path = os.path.join(prepath, mode, imagefn + '.npy')
    if os.path.isfile(new_path):
        # Cached features already exist on disk.
        return np.load(new_path)
    # Extract features from the raw image and cache them at new_path.
    return self.extract(Image.open(path).convert('RGB'), new_path)
def build(opt):
    """Download WikiQA and emit plain plus '-filtered' FB-format splits."""
    dpath = os.path.join(opt['datapath'], 'WikiQA')
    version = None
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # Remove files left over from an older build.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download and unpack the archive.
        fname = 'wikiqa.tar.gz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/wikiqa/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        dpext = os.path.join(dpath, 'WikiQACorpus')
        sources = {
            'train': 'WikiQA-train.tsv',
            'valid': 'WikiQA-dev.tsv',
            'test': 'WikiQA-test.tsv',
        }
        # The '-filtered' variants are generated from the same tsv inputs.
        for split, tsv in sources.items():
            create_fb_format(dpath, split, os.path.join(dpext, tsv))
        for split, tsv in sources.items():
            create_fb_format(dpath, split + '-filtered', os.path.join(dpext, tsv))

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download and assemble the NarrativeQA dataset.

    Downloads the repository archive, fetches the referenced stories, and
    splits summaries/questions/documents into train/valid/test folders.
    """
    dpath = os.path.join(opt['datapath'], 'NarrativeQA')
    version = None
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # an older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # download the data.
        fname = 'narrative_qa.zip'
        # dataset URL
        url = NARRATIVE_QA_DOWNLOAD_URL
        build_data.download(url, dpath, fname)

        # uncompress it
        build_data.untar(dpath, fname)

        print('downloading stories now')
        base_path = os.path.join(dpath, 'narrativeqa-master')
        # fetches each story listed in documents.csv into base_path/tmp
        download_stories(base_path)

        # move from tmp to stories
        tmp_stories_path = os.path.join(base_path, 'tmp')
        new_stories_path = os.path.join(base_path, 'stories')
        shutil.move(tmp_stories_path, new_stories_path)

        # divide into train, valid and test for summaries
        summaries_csv_path = os.path.join(
            base_path, 'third_party', 'wikipedia', 'summaries.csv'
        )
        new_path = os.path.join(base_path, 'summaries.csv')
        shutil.move(summaries_csv_path, new_path)
        divide_csv_into_sets(new_path)

        # divide into sets for questions
        questions_path = os.path.join(base_path, 'qaps.csv')
        divide_csv_into_sets(questions_path)

        # divide into sets for documents
        documents_path = os.path.join(base_path, 'documents.csv')
        divide_csv_into_sets(documents_path)

        # move specific set's files into their set's folder
        make_folders(base_path)
        move_files(base_path)

        # move narrativeqa-master to narrative_qa
        new_path = os.path.join(dpath, 'narrative_qa')
        shutil.move(base_path, new_path)

        # mark the data as built
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download VisDial v0.9 and split its train file into train/valid json."""
    version = 'v0.9'
    dpath = os.path.join(opt['datapath'], 'VisDial-v0.9')
    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname1 = 'visdial_0.9_train.zip'
        fname2 = 'visdial_0.9_val.zip'
        url = 'https://computing.ece.vt.edu/~abhshkdz/data/visdial/'
        build_data.download(url + fname1, dpath, fname1)
        build_data.download(url + fname2, dpath, fname2)
        build_data.untar(dpath, fname1)
        build_data.untar(dpath, fname2)

        print('processing unpacked files')
        # Use 1000 examples from training set as validation.
        json1 = os.path.join(dpath, fname1.rsplit('.', 1)[0] + '.json')
        with open(json1) as t_json:
            train_data = json.load(t_json)

        # Shallow-copy the metadata, but give valid its own dialog list.
        valid_data = train_data.copy()
        valid_data['data'] = train_data['data'].copy()
        valid_data['data']['dialogs'] = []

        # Use constant stride to pick examples.
        num_valid = 1000
        total = len(train_data['data']['dialogs'])
        step = total // (num_valid - 1)
        # Walk backwards so each deletion does not shift the indices of
        # dialogs that have not been visited yet.
        for i in range(total - 1, 0, -step)[:num_valid]:
            valid_data['data']['dialogs'].append(train_data['data']['dialogs'][i])
            del train_data['data']['dialogs'][i]

        train_json = json1.rsplit('.', 1)[0] + '_train.json'
        valid_json = json1.rsplit('.', 1)[0] + '_valid.json'
        with open(train_json, 'w') as t_out, open(valid_json, 'w') as v_out:
            json.dump(train_data, t_out)
            json.dump(valid_data, v_out)
        # The original combined train json is no longer needed.
        os.remove(json1)

        # Use validation data as test.
        json2 = os.path.join(dpath, fname2.rsplit('.', 1)[0] + '.json')
        test_json = json2.rsplit('.', 1)[0] + '_test.json'
        build_data.move(json2, test_json)

        # Mark the data as built.
        build_data.mark_done(dpath, version)
def save_graph(self, fname):
    """Serialize this graph's private state under <datapath>/graph_world2.

    If `fname` is non-empty it is remembered (with a .gw2 suffix) as
    self._save_fname; otherwise the previously stored path is reused.
    """
    path = os.path.join(self._opt['datapath'], 'graph_world2')
    build_data.make_dir(path)
    if fname != '':
        self._save_fname = os.path.join(path, fname + '.gw2')
    # BUG FIX: previously a non-empty fname was opened as a bare relative
    # path instead of the computed self._save_fname inside `path`.
    fname = self._save_fname

    # Collect every non-callable, single-underscore (not dunder) attribute
    # as the state to persist.
    members = [
        attr
        for attr in dir(self)
        if not callable(getattr(self, attr))
        and (not attr.startswith("__"))
        and (attr.startswith("_"))
    ]
    model = {m: getattr(self, m) for m in members}
    with open(fname, 'wb') as write:
        torch.save(model, write)
def build(opt):
    """Download the Wizard of Wikipedia dataset."""
    dpath = os.path.join(opt['datapath'], 'wizard_of_wikipedia')
    version = '1.0'
    if not build_data.built(dpath, version):
        # FIX: removed a leftover debug print of the dpath variable.
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        for downloadable_file in RESOURCES:
            downloadable_file.download_file(dpath)
        build_data.mark_done(dpath, version)
def build(datapath):
    """Download the genderation-bias data into `datapath`."""
    version = 'v1.0'
    dpath = os.path.join(datapath, 'genderation_bias')
    if not build_data.built(dpath, version):
        logging.info('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # Clear files left behind by an older version.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Fetch every registered resource.
        for resource in RESOURCES:
            resource.download_file(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version)
def build(opt):
    """Download LCCC and convert it for the ConversationTeacher."""
    dpath = os.path.join(opt['datapath'], 'LCCC')
    version = None
    if not build_data.built(dpath, version_string=version):
        if build_data.built(dpath):
            # Clear files left behind by an older version.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        RESOURCES[0].download_file(dpath)

        # Reshape the raw download into ParlAI's conversation format.
        _create_parlai_format(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Build the LIGHT-wild dialogue variant selected by `opt`'s flags."""
    dpath, version = download(opt)
    if 'light_use_speech_prefix' not in opt:
        opt['light_use_speech_prefix'] = True

    # Each flag combination gets its own directory under light_dialogue_wild.
    fpath = get_fpath(opt)
    base = os.path.join(opt['datapath'], 'light_dialogue_wild')
    dump_path = os.path.join(base, 'dumps')
    data_path = os.path.join(base, fpath)

    if not build_data.built(data_path, version):
        if build_data.built(data_path):
            # Clear files left behind by an older version.
            build_data.remove_dir(data_path)
        build_data.make_dir(data_path)
        build_from_dump(opt, data_path, dump_path)
        # Mark the data as built.
        build_data.mark_done(data_path, version)
def build(opt):
    """Download the dialog-bAbI dataset."""
    dpath = os.path.join(opt['datapath'], 'dialog-bAbI')
    if not build_data.built(dpath):
        print("[building data: " + dpath + "]")
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = "dialog_babi.tar.gz"
        url = "https://s3.amazonaws.com/fair-data/parlai/dialog_babi/" + fname
        # BUG FIX: download takes (url, path, fname); the arguments were
        # swapped and the target filename was missing, so untar could not
        # find the archive (cf. the other builders in this file).
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    """Download the WikiMovies dataset."""
    dpath = os.path.join(opt['datapath'], 'WikiMovies')
    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = 'wikimovies.tar.gz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/wikimovies/' + fname
        # BUG FIX: download takes (url, path, fname); the arguments were
        # swapped and the target filename was missing, so untar could not
        # find the archive (cf. the other builders in this file).
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    """Download the dialog-bAbI-plus dataset."""
    dpath = os.path.join(opt['datapath'], 'dialog-bAbI-plus')
    version = "v1.1"
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # Clear files left behind by an older version.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Fetch every registered resource.
        for resource in RESOURCES:
            resource.download_file(dpath)
        build_data.mark_done(dpath, version)
def build_data_from_path(dpath, version, downloadable_files):
    """Download `downloadable_files` into `dpath` unless already built.

    Returns the (dpath, version) pair unchanged, for caller convenience.
    """
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # Clear files left behind by an older version
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Fetch every requested resource
        for resource in downloadable_files:
            resource.download_file(dpath)

        # Mark the data as built
        build_data.mark_done(dpath, version_string=version)

    return dpath, version
def build(opt):
    """Download the DBLL dataset."""
    dpath = os.path.join(opt['datapath'], 'DBLL')
    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Fetch and unpack the archive.
        fname = 'dbll.tgz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/dbll/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath)
def build(datapath):
    """Download the offensive-language word list.

    NOTE(review): the original signature was `def build()` but the body
    referenced an undefined `self.datapath`, raising NameError on every
    call; it now takes the datapath explicitly (cf. the genderation_bias
    builder's signature).
    """
    version = 'v1.0'
    dpath = os.path.join(datapath, 'OffensiveLanguage')
    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the word list.
        fname = 'OffensiveLanguage.txt'
        url = 'http://parl.ai/downloads/offensive_language/' + fname
        build_data.download(url, dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath, version)
def download_process_wikiqa(data_path='data'):
    """Download WikiQA into `data_path` and emit all FB-format splits."""
    dpath = os.path.join(data_path, 'WikiQA')
    build_data.make_dir(dpath)

    # Fetch and unpack the corpus.
    fname = 'wikiqa.tar.gz'
    url = 'http://parl.ai/downloads/wikiqa/' + fname
    build_data.download(url, dpath, fname)
    build_data.untar(dpath, fname)

    dpext = os.path.join(dpath, 'WikiQACorpus')
    sources = {
        'train': 'WikiQA-train.tsv',
        'valid': 'WikiQA-dev.tsv',
        'test': 'WikiQA-test.tsv',
    }
    # The '-filtered' variants are generated from the same tsv inputs.
    for split, tsv in sources.items():
        create_fb_format(dpath, split, os.path.join(dpext, tsv))
    for split, tsv in sources.items():
        create_fb_format(dpath, split + '-filtered', os.path.join(dpext, tsv))
def build(opt):
    """Download the SimpleQuestions dataset."""
    dpath = os.path.join(opt['datapath'], 'SimpleQuestions')
    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Fetch and unpack the archive.
        fname = 'simplequestions.tar.gz'
        url = 'https://s3.amazonaws.com/fair-data/parlai/simplequestions/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath)
def download_stories(path):
    """Download every story listed in documents.csv into `path`/tmp.

    A failed first attempt is followed by up to five retries per document.
    """
    documents_csv = os.path.join(path, 'documents.csv')
    tmp_dir = os.path.join(path, 'tmp')
    build_data.make_dir(tmp_dir)

    with open(documents_csv, 'r') as f:
        for row in csv.DictReader(f, delimiter=','):
            print("Downloading %s (%s)" % (row['wiki_title'], row['document_id']))
            finished = try_downloading(tmp_dir, row)
            attempt = 0
            while not finished and attempt < 5:
                if attempt != 0:
                    print("Retrying (%d retries left)" % (5 - attempt - 1))
                finished = try_downloading(tmp_dir, row)
                attempt += 1
def build(opt):
    """Download the MutualFriends dialogue data."""
    dpath = os.path.join(opt['datapath'], 'MutualFriends')
    version = None
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # Clear files left behind by an older version.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Single CodaLab bundle containing the whole dataset.
        url = (
            'https://worksheets.codalab.org/rest/bundles/'
            '0x5a4cefea7fd443cea15aa532bb8fcd67/contents/blob/'
        )
        build_data.download(url, dpath, 'data.json')

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the personalized-dialog dataset from Dropbox."""
    dpath = os.path.join(opt['datapath'], 'personalized-dialog')
    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Dropbox link; '?dl=1' forces the raw file instead of the web page.
        fname = 'personalized-dialog-dataset.tar.gz'
        url = 'https://www.dropbox.com/s/4i9u4y24pt3paba/' + fname + '?dl=1'
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    """Download the SQuAD v1.1 train and dev json files."""
    dpath = os.path.join(opt['datapath'], 'SQuAD')
    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname1 = 'train-v1.1.json'
        fname2 = 'dev-v1.1.json'
        url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'
        # BUG FIX: download expects (url, path, fname); the arguments were
        # swapped and the local filename was missing (cf. the other
        # builders in this file).
        build_data.download(url + fname1, dpath, fname1)
        build_data.download(url + fname2, dpath, fname2)

        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    """Download the SQuAD v1.1 train and dev json files."""
    dpath = os.path.join(opt['datapath'], 'SQuAD')
    if not build_data.built(dpath):
        print("[building data: " + dpath + "]")
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname1 = "train-v1.1.json"
        fname2 = "dev-v1.1.json"
        url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"
        # BUG FIX: download expects (url, path, fname); the arguments were
        # swapped and the local filename was missing (cf. the other
        # builders in this file).
        build_data.download(url + fname1, dpath, fname1)
        build_data.download(url + fname2, dpath, fname2)

        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    """Download the SimpleQuestions dataset."""
    dpath = os.path.join(opt['datapath'], 'SimpleQuestions')
    if not build_data.built(dpath):
        print("[building data: " + dpath + "]")
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        fname = "simplequestions.tar.gz"
        url = "https://s3.amazonaws.com/fair-data/parlai/simplequestions/" + fname
        # BUG FIX: download expects (url, path, fname); the arguments were
        # swapped and the local filename was missing, so untar could not
        # find the archive (cf. the duplicate builder that gets this right).
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        # Mark the data as built.
        build_data.mark_done(dpath)
def build(opt):
    """Download and unpack the FVQA dataset."""
    dpath = os.path.join(opt['datapath'], 'FVQA')
    version = None
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # Clear files left behind by an older version.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Single zip hosted on Dropbox.
        url = 'https://dl.dropboxusercontent.com/s/iyz6l7jhbt6jb7q/new_dataset_release.zip'  # noqa: E501
        build_data.download(url, dpath, 'FVQA.zip')
        build_data.untar(dpath, 'FVQA.zip')

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the dataset named by CONST.DATASET_NAME."""
    dpath = os.path.join(opt['datapath'], CONST.DATASET_NAME)
    version = '1.0'
    if not build_data.built(dpath, version):
        logging.info(
            # FIX: corrected typo in the log message ("heppens" -> "happens").
            f'[building data: {dpath}]\nThis may take a while but only happens once.'
        )
        if build_data.built(dpath):
            # An older version exists, so remove these outdated files.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        DATASET_FILE.download_file(dpath)
        logging.info('Finished downloading dataset files successfully.')

        build_data.mark_done(dpath, version)
def build(opt):
    """Download a preprocessed wikipedia dump (full articles or summaries)."""
    dpath = os.path.join(opt['datapath'], 'wikipedia')
    task = opt.get('task', 'wikipedia:all')
    # 'wikipedia:all' selects full articles; any other suffix, summaries.
    extract_full = task.split(':')[-1] == 'all'
    if extract_full:
        dpath = os.path.join(dpath, 'full')
    else:
        dpath = os.path.join(dpath, 'summary')

    if not build_data.built(dpath):
        # BUG FIX: removed a leftover debug statement that printed the
        # literal string 'dpath' instead of the variable's value.
        print('[building data: ' + dpath + ']')
        build_data.make_dir(dpath)
        if extract_full:
            RESOURCES[0].download_file(dpath)
        else:
            RESOURCES[1].download_file(dpath)
        build_data.mark_done(dpath)
def build_personality_list(opt: Opt):
    """Download the personality-list resources."""
    dpath = os.path.join(opt['datapath'], TASK_FOLDER_NAME)
    version = 'v1.0'
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # Clear files left behind by an older version
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Fetch every registered resource
        for resource in PERSONALITY_LIST_RESOURCES:
            resource.download_file(dpath)

        # Mark the data as built
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the wikipedia dump archive (full articles or summaries)."""
    dpath = os.path.join(opt['datapath'], 'wikipedia')
    task = opt.get('task', 'wikipedia:all')
    # 'wikipedia:all' selects full articles; any other suffix, summaries.
    if task.split(':')[-1] == 'all':
        dpath = os.path.join(dpath, 'full')
        fname = 'wiki_full_extracted.tgz'
    else:
        dpath = os.path.join(dpath, 'summary')
        fname = "summaries.tgz"

    if not build_data.built(dpath):
        print('[building data: ' + dpath + ']')
        build_data.make_dir(dpath)
        # Fetch and unpack the selected archive.
        build_data.download('http://parl.ai/downloads/wikipedia/' + fname, dpath, fname)
        build_data.untar(dpath, fname)
        build_data.mark_done(dpath)
def buildImage(opt):
    """Download the 2017 COCO image archives."""
    dpath = os.path.join(opt['datapath'], 'COCO-IMG-2017')
    version = '1'
    if not build_data.built(dpath, version_string=version):
        print('[building image data: ' + dpath + ']')
        if build_data.built(dpath):
            # Clear files left behind by an older version.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the first three registered resources.
        for resource in RESOURCES[:3]:
            resource.download_file(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the ConvAI2 (PersonaChat) data."""
    version = 'v5.0'
    dpath = os.path.join(opt['datapath'], 'ConvAI2')
    if not build_data.built(dpath, version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # Clear files left behind by an older version.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Fetch every registered resource.
        for resource in RESOURCES:
            resource.download_file(dpath)

        # Mark the data as built.
        build_data.mark_done(dpath, version)
def build(opt):
    """Download dialog-bAbI-plus from Google Drive and unpack it."""
    dpath = os.path.join(opt['datapath'], 'dialog-bAbI-plus')
    fname = "dialog-bAbI-plus.zip"
    version = None
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # Clear files left behind by an older version.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Direct-download link to the Google Drive archive.
        url = (
            "https://drive.google.com/uc?export=download"
            "&id=0B2MvoQfXtqZmMTJqclpBdGN2bmc"
        )
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)
        build_data.mark_done(dpath, version)
def load(self, path):
    """Load an image (possibly stored inside a zip) per opt['image_mode'].

    Depending on the mode this returns None, a PIL RGB image, an ascii
    rendering, or a torch tensor of cached features read from an hdf5
    file that sits in a sibling '<mode>' directory.
    """
    opt = self.opt
    mode = opt.get('image_mode', 'raw')
    is_zip = False
    if mode is None or mode == 'none':
        # don't need to load images
        return None
    elif '.zip' in path:
        # assume format path/to/file.zip/image_name.jpg
        is_zip = True
        sep = path.index('.zip') + 4
        zipname = path[:sep]
        file_name = path[sep + 1:]
        # NOTE(review): the ZipFile handle opened here is never closed.
        path = ZipFile(zipname, 'r').open(file_name)
        prepath = os.path.join(opt['datapath'], opt['task'])
        # Cache key: last two path components of the zip plus the member name.
        imagefn = ''.join(zipname.strip('.zip').split('/')[-2:]) + path.name
    if mode == 'raw':
        # raw just returns RGB values
        return Image.open(path).convert('RGB')
    elif mode == 'ascii':
        # convert images to ascii ¯\_(ツ)_/¯
        return self.img_to_ascii(path)
    else:
        # otherwise, looks for preprocessed version under 'mode' directory
        if not is_zip:
            prepath, imagefn = os.path.split(path)
        dpath = os.path.join(prepath, mode)
        if not os.path.exists(dpath):
            build_data.make_dir(dpath)
        # Cache file name: image basename (up to the first '.') + '.hdf5'.
        imagefn = imagefn.split('.')[0]
        imagefn = imagefn + '.hdf5'
        new_path = os.path.join(prepath, mode, imagefn)
        if not os.path.isfile(new_path):
            # No cached features yet: extract and cache them at new_path.
            return self.extract(Image.open(path).convert('RGB'), new_path)
        else:
            # NOTE(review): this open() appears to only probe that the file
            # is readable; h5py re-opens it by name — confirm intent.
            with open(new_path): hdf5_file = self.h5py.File(new_path, 'r')
            feature = hdf5_file['feature'].value
            feature = self.torch.from_numpy(feature)
            return feature
def build(opt):
    """Download MCTest (parl.ai mirror) and convert every split to FB format."""
    dpath = os.path.join(opt['datapath'], 'MCTest')
    version = None
    if not build_data.built(dpath, version_string=version):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # Clear files left behind by an older version.
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download and unpack the archive.
        fname = 'mctest.tar.gz'
        url = 'http://parl.ai/downloads/mctest/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        dpext = os.path.join(dpath, 'mctest')
        # (split name, raw file, answer file or None) for both corpus sizes;
        # only the test splits ship a separate answers file.
        splits = [
            ('train160', 'mc160.train', None),
            ('valid160', 'mc160.dev', None),
            ('test160', 'mc160.test', 'mc160.test.ans'),
            ('train500', 'mc500.train', None),
            ('valid500', 'mc500.dev', None),
            ('test500', 'mc500.test', 'mc500.test.ans'),
        ]
        for split, raw, ans in splits:
            ans_path = os.path.join(dpext, 'MCTestAnswers', ans) if ans else None
            create_fb_format(dpath, split, os.path.join(dpext, 'MCTest', raw), ans_path)

        # Mark the data as built.
        build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the OpenSubtitles (en) corpus and convert it to FB format."""
    dpath = os.path.join(opt['datapath'], 'OpenSubtitles')
    if not build_data.built(dpath):
        print("[building data: " + dpath + "]")
        build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Download the data.
        # BUG FIX: download expects (url, path, fname) — the arguments were
        # swapped — and the original local name contained '/' while untar
        # was given a URL-encoded '%2F' variant, so the two never matched.
        # Save under a plain local name instead.
        fname = 'en.tar.gz'
        url = 'http://opus.lingfil.uu.se/download.php?f=OpenSubtitles/' + fname
        build_data.download(url, dpath, fname)
        build_data.untar(dpath, fname)

        create_fb_format(os.path.join(dpath, 'OpenSubtitles', 'en'), dpath)

        # Mark the data as built.
        build_data.mark_done(dpath)
def build_style_labeled_datasets(opt: Opt):
    """Download the style-labeled datasets."""
    dpath = get_style_labeled_data_folder(datapath=opt['datapath'])
    if not build_data.built(dpath, version_string=STYLE_LABELED_DATASETS_VERSION):
        print('[building data: ' + dpath + ']')
        if build_data.built(dpath):
            # Clear files left behind by an older version
            build_data.remove_dir(dpath)
        build_data.make_dir(dpath)

        # Fetch every registered resource
        for resource in STYLE_LABELED_DATASETS_RESOURCES:
            resource.download_file(dpath)

        # Mark the data as built
        build_data.mark_done(dpath, version_string=STYLE_LABELED_DATASETS_VERSION)
def build(opt):
    """Download and unpack the TriviaQA reading-comprehension archive."""
    dpath = os.path.join(opt['datapath'], 'TriviaQA')
    version = None

    if build_data.built(dpath, version_string=version):
        return

    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # An older copy exists on disk; clear it out before rebuilding.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch and extract the archive.
    archive = 'triviaqa-rc.tar.gz'
    build_data.download(
        'http://nlp.cs.washington.edu/triviaqa/data/' + archive, dpath, archive
    )
    build_data.untar(dpath, archive)

    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Fetch and extract the Persona-Chat dataset."""
    version = 'v1.0'
    dpath = os.path.join(opt['datapath'], 'Persona-Chat')

    if build_data.built(dpath, version):
        return

    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # Outdated version present; remove it before re-downloading.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Download and unpack the archive.
    archive = 'personachat.tgz'
    build_data.download(
        'https://s3.amazonaws.com/fair-data/parlai/personachat/' + archive,
        dpath,
        archive,
    )
    build_data.untar(dpath, archive)

    # Mark the data as built.
    build_data.mark_done(dpath, version)
def build(opt):
    """Download and extract the dialog-bAbI tasks."""
    dpath = os.path.join(opt['datapath'], 'dialog-bAbI')
    version = None

    if build_data.built(dpath, version_string=version):
        return

    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # An older copy exists; remove the outdated files first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Download and unpack the archive.
    archive = 'dialog_babi.tar.gz'
    build_data.download(
        'https://s3.amazonaws.com/fair-data/parlai/dialog_babi/' + archive,
        dpath,
        archive,
    )
    build_data.untar(dpath, archive)

    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)
def build(cls, dpath):
    """Parse one insuranceQA release: read vocab and answers, write data files."""
    print(f"building version: {cls.version}")

    # Root of the extracted archive for this release.
    src_root = os.path.join(dpath, f'insuranceQA-master/{cls.version}')

    # Token-id -> word mapping from the release's vocabulary file.
    vocab = cls.read_vocab(os.path.join(src_root, "vocabulary"))

    # Label -> answer mapping, decoded with the vocabulary.
    answers = cls.read_label2answer(
        os.path.join(src_root, cls.label2answer_fname), vocab
    )

    # Write the parsed files into a per-version output directory.
    out_dir = os.path.join(dpath, cls.version)
    build_data.make_dir(out_dir)
    cls.write_data_files(src_root, out_dir, vocab, answers)
def build(opt):
    """Download the DialogueQE data and normalize the split filenames."""
    data_path = os.path.join(opt['datapath'], 'DialogueQE')
    version = '1501534800'

    if build_data.built(data_path, version_string=version):
        return

    print(f'[building data: {data_path}]')
    if build_data.built(data_path):
        # Clear any previous version before rebuilding.
        build_data.remove_dir(data_path)
    build_data.make_dir(data_path)

    archive = f'data_{version}.tar.gz'
    build_data.download(
        'https://raw.githubusercontent.com/deepmipt/turing-data/master/' + archive,
        data_path,
        archive,
    )
    build_data.untar(data_path, archive)

    # The archive ships version-stamped filenames; rename to plain split names.
    for split in ('train', 'test'):
        os.rename(
            os.path.join(data_path, f'data_{split}_{version}.json'),
            os.path.join(data_path, f'{split}.json'),
        )

    build_data.mark_done(data_path, version_string=version)
def build(opt):
    """Download and extract the personalized-dialog dataset (Dropbox-hosted)."""
    dpath = os.path.join(opt['datapath'], 'personalized-dialog')
    version = None

    if build_data.built(dpath, version_string=version):
        return

    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # An older copy exists; remove the outdated files first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    archive = 'personalized-dialog-dataset.tar.gz'
    # '?dl=1' asks Dropbox for a direct download rather than the preview page.
    build_data.download(
        'https://www.dropbox.com/s/4i9u4y24pt3paba/' + archive + '?dl=1',
        dpath,
        archive,
    )
    build_data.untar(dpath, archive)

    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download and unzip the CLEVR v1.0 dataset."""
    dpath = os.path.join(opt['datapath'], 'CLEVR')
    version = 'v1.0'

    if build_data.built(dpath, version_string=version):
        return

    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # A stale version exists; remove it before rebuilding.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Download and unpack the archive.
    archive = 'CLEVR_v1.0.zip'
    build_data.download(
        'https://s3-us-west-1.amazonaws.com/clevr/' + archive, dpath, archive
    )
    build_data.untar(dpath, archive)

    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the OpenSubtitles English corpus and convert it to FB format."""
    dpath = os.path.join(opt['datapath'], 'OpenSubtitles')
    version = '1'

    if build_data.built(dpath, version_string=version):
        return

    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # An older copy exists; clear it out before rebuilding.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Download the archive under a fixed local name and unpack it.
    archive = 'OpenSubtitles.tar.gz'
    build_data.download(
        'http://opus.lingfil.uu.se/download.php?f=OpenSubtitles/en.tar.gz',
        dpath,
        archive,
    )
    build_data.untar(dpath, archive)

    create_fb_format(os.path.join(dpath, 'OpenSubtitles', 'en'), dpath)

    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the SQuAD v1.1 train and dev json files."""
    dpath = os.path.join(opt['datapath'], 'SQuAD')
    version = None

    if build_data.built(dpath, version_string=version):
        return

    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # An older copy exists; remove the outdated files first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Fetch both splits from the SQuAD-explorer site.
    base_url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/'
    for fname in ('train-v1.1.json', 'dev-v1.1.json'):
        build_data.download(base_url + fname, dpath, fname)

    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the DBLL data; WikiMovies is built first since DBLL depends on it."""
    wikimovies_build.build(opt)

    dpath = os.path.join(opt['datapath'], 'DBLL')
    version = None

    if build_data.built(dpath, version_string=version):
        return

    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # An older copy exists; remove the outdated files first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Download and unpack the archive.
    archive = 'dbll.tgz'
    build_data.download(
        'https://s3.amazonaws.com/fair-data/parlai/dbll/' + archive,
        dpath,
        archive,
    )
    build_data.untar(dpath, archive)

    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Fetch and unpack the COPA (Choice of Plausible Alternatives) resources."""
    dpath = os.path.join(opt['datapath'], 'COPA')
    version = None

    if build_data.built(dpath, version_string=version):
        return

    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # A stale copy exists; remove the outdated files first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Download and uncompress the archive.
    archive = 'COPA-resources.tgz'
    build_data.download(
        'http://people.ict.usc.edu/~gordon/downloads/' + archive, dpath, archive
    )
    build_data.untar(dpath, archive)

    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Fetch the end-to-end negotiator data from its github archive."""
    dpath = os.path.join(opt['datapath'], 'negotiation')
    version = None

    if build_data.built(dpath, version_string=version):
        return

    print(f'[building data: {dpath}]')
    # Make a clean directory, clearing any previous version.
    if build_data.built(dpath):
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Download the repository snapshot from github.
    archive = 'negotiation.zip'
    url = (
        'https://github.com/facebookresearch/end-to-end-negotiator/'
        'archive/master.zip'
    )
    print(f'[downloading data from: {url}]')
    build_data.download(url, dpath, archive)
    build_data.untar(dpath, archive)

    # Mark as done.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download insuranceQA from github and parse both released versions."""
    dpath = os.path.join(opt['datapath'], 'InsuranceQA')
    version = '1'

    if build_data.built(dpath, version_string=version):
        return

    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # An older copy exists; remove the outdated files first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Download the repository snapshot from github.
    archive = 'insuranceqa.zip'
    url = 'https://github.com/shuzi/insuranceQA/archive/master.zip'
    print(f'[downloading data from: {url}]')
    build_data.download(url, dpath, archive)
    build_data.untar(dpath, archive)

    # Each parser converts one upstream release into ParlAI format.
    ParseInsuranceQAV1.build(dpath)
    ParseInsuranceQAV2.build(dpath)

    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)
def build(opt):
    """Download the Cornell Movie-Dialogs corpus and convert it to FB format."""
    dpath = os.path.join(opt['datapath'], 'CornellMovie')
    version = None

    if build_data.built(dpath, version_string=version):
        return

    print(f'[building data: {dpath}]')
    if build_data.built(dpath):
        # An older copy exists; remove the outdated files first.
        build_data.remove_dir(dpath)
    build_data.make_dir(dpath)

    # Download and unpack the corpus archive.
    archive = 'cornell_movie_dialogs_corpus.zip'
    build_data.download(
        'http://www.mpi-sws.org/~cristian/data/' + archive, dpath, archive
    )
    build_data.untar(dpath, archive)

    # Convert lines + conversation structure into the FB dialog format.
    corpus = os.path.join(dpath, 'cornell movie-dialogs corpus')
    create_fb_format(
        os.path.join(corpus, 'movie_lines.txt'),
        os.path.join(corpus, 'movie_conversations.txt'),
        dpath,
    )

    # Mark the data as built.
    build_data.mark_done(dpath, version_string=version)