def download(self):
    if self.DATASET_PATH.exists():
        return
    self.DATASET_PATH.mkdir()
    base_url = "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master"
    splits = [
        {
            "name": "Train",
            "checksum": "7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa",
        },
        {
            "name": "Eval",
            "checksum": "4c49e6753b7262c001506b9151135abf722247035ab075dad93acdea5789c01f",
        },
        {
            "name": "Test",
            "checksum": "359acb78c37802208f7fde9e2f6574b8526527c63d6a336f90a53f1932cb4701",
        },
    ]
    for split in splits:
        file = self.DATASET_PATH / f"{split['name']}.txt"
        download_file(f"{base_url}/{split['name']}.txt", str(file), split["checksum"])

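# Note: download_file here (and throughout these snippets) is assumed to be
# best_download's download_file(url, local_path, sha256_checksum), which
# verifies the checksum and skips re-downloading a file that already matches.
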
def reddit_processing(url, sha256sums, dumps_directory, keep_dumps):
    base_name = url.split('/')[-1]
    dump_file_path = os.path.join(dumps_directory, base_name)
    db_done_file = dump_file_path + ".dbdone"
    if os.path.exists(db_done_file):
        return True
    try:
        download_file(url, dump_file_path, sha256sums.get(base_name))
    except Exception as ex:
        logger.info(f"Download failed {ex}, skipping processing.")
        return False
    db_session = get_db_session()
    process_dump_file(dump_file_path, db_session, tqdm.tqdm)
    with open(db_done_file, "w") as fh:
        fh.write("Done!")
    if not keep_dumps:
        os.remove(dump_file_path)
    return True

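# Illustrative driver for reddit_processing() (the pushshift URL and the
# digest map are placeholders, not values from the original code):
#
#   sha256sums = load_sha256sums()  # hypothetical helper: basename -> digest
#   ok = reddit_processing(
#       'https://files.pushshift.io/reddit/comments/RC_2011-01.bz2',
#       sha256sums, dumps_directory='dumps', keep_dumps=False)
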
def download(self):
    year = self.YEAR
    lang = "EN"
    base_path = (
        "http://nlp.uned.es/clef-qa/repository/js/scripts/downloadFile.php?"
        "file=/var/www/html/nlp/clef-qa/repository/resources/QA4MRE/")
    # TODO: add side tasks?
    variable_year_path = {
        2011: '2011/Training_Data/Goldstandard/',
        2012: '2012/Main_Task/Training_Data/Goldstandard/Used_in_Evaluation/',
        2013: '2013/Main_Task/Training_Data/Goldstandard/',
    }
    sha256sums = {
        2011: "6d2524952a3a015f2a82df785b85b5578681e3602ec276b4e72c01f4ebc50034",
        2012: "f9edaf408f8ac93f89a643a0d0b19263a1bb5ce64f19b2af10df279a656dfb24",
        2013: "c60e5aa4ec77e0493ef0b11d46bd1d74d58a499a3a2f871b8cf3af9536f0f094",
    }
    vpath = variable_year_path[year]
    url_path = f"{base_path}{vpath}QA4MRE-{year}-{lang}_GS.xml"
    local_path = f"data/qa4mre/QA4MRE-{year}-{lang}_GS.xml"
    if not os.path.exists("data/qa4mre"):
        os.mkdir("data/qa4mre")
    # Check for the same file we write, so a completed download is not repeated.
    if not os.path.isfile(local_path):
        download_file(url_path, local_path, sha256sums[year])

def download(self):
    file_name, checksum = self.get_file_download_info()
    url = 'https://raw.githubusercontent.com/openai/gpt-3/master/data/' + file_name
    if not os.path.exists(self.directory):
        os.makedirs(self.directory)
    download_file(url, self.directory + file_name, checksum)
    self.set_docs()

def wget(url, to=None, checksum=None):
    # Thin wrapper for best_download.
    if to is None:
        # Derive the local filename from the URL.
        to = os.path.basename(url)
    if not to:
        to = 'index'
    download_file(url, to, checksum)

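# Example (illustrative URL and digest): with no explicit target, the file
# lands in the current directory as 'train.jsonl'.
#
#   wget('https://example.com/data/train.jsonl',
#        checksum='<expected sha256 hex digest>')
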
def download(self):
    # TODO: separate pile val/test out by component so we don't have to scan
    # the entire file once per set.
    os.makedirs("data/pile/", exist_ok=True)
    download_file(
        "https://the-eye.eu/public/AI/pile/val.jsonl.zst",
        self.VAL_PATH,
        "264c875d8bbd355d8daa9d032b75fd8fb91606218bb84dd1155b203fcd5fab92")
    download_file(
        "https://the-eye.eu/public/AI/pile/test.jsonl.zst",
        self.TEST_PATH,
        "0bb28c52d0b5596d389bf179ce2d43bf7f7ffae76b0d2d20b180c97f62e0975e")

def download(self):
    if not self.BASE_PATH.exists():
        self.BASE_PATH.mkdir()
    file = self.BASE_PATH / self.FILENAME
    if not file.exists():
        rawfile = file.parent / (file.name + ".gz")
        base_url = "https://raw.githubusercontent.com/openai/gpt-3/master/data"
        download_file(f"{base_url}/{self.FILENAME}.gz", str(rawfile), self.CHECKSUM)
        extract_gzip(gz=rawfile, to=file)

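# extract_gzip() is defined elsewhere in the repo; a minimal sketch of what it
# presumably does, using only the standard library (the keyword parameters
# match the call above):
import gzip
import shutil


def extract_gzip(gz, to):
    # Stream-decompress the .gz archive to the destination path.
    with gzip.open(gz, 'rb') as src, open(to, 'wb') as dst:
        shutil.copyfileobj(src, dst)
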
def download(self):
    if not os.path.exists('data/sciq'):
        os.mkdir('data/sciq')
    download_file(
        'https://ai2-public-datasets.s3.amazonaws.com/sciq/SciQ.zip',
        'data/sciq/SciQ.zip',
        '7f3312f6ac6b09970b32942d106a8c44ec0dad46a0369f17d635aff8e348a87c',
    )
    with zipfile.ZipFile("data/sciq/SciQ.zip", "r") as zf:
        zf.extractall("data/sciq/")

def download(self):
    if self.DATASET_PATH.exists():
        return
    self.DATASET_PATH.mkdir()
    url = "https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip"
    checksum = "39d2278a29fd729de301b111a45f434c24834f40df8f4ff116d864589e3249d6"
    zip_path = self.DATASET_PATH / "drop_dataset.zip"
    download_file(url, str(zip_path), checksum)
    with ZipFile(zip_path, "r") as zf:
        zf.extractall(self.DATASET_PATH)

def download(self): sh("mkdir -p data/lambada") try: download_file( "http://eaidata.bmk.sh/data/lambada_test.jsonl", "data/lambada/lambada_test.jsonl", "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226" ) except: # fallback - for some reason best_download doesnt work all the time here sh("wget http://eaidata.bmk.sh/data/lambada_test.jsonl -O data/lambada/lambada_test.jsonl") sh('echo "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226 data/lambada/lambada_test.jsonl" | sha256sum --check')
def download(self):
    if not os.path.exists('data/triviaqa/unfiltered-web-train.jsonl'):
        os.makedirs("data/triviaqa/", exist_ok=True)
        download_file(
            "http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz",
            "data/triviaqa/triviaqa-unfiltered.tar.gz",
            "adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e")
        sh("""
        cd data/triviaqa/
        tar -xf triviaqa-unfiltered.tar.gz
        """)

def download(self):
    if not os.path.exists('data/ethics/done'):
        sh("mkdir -p data")
        download_file(
            "https://people.eecs.berkeley.edu/~hendrycks/ethics.tar",
            "data/ethics.tar",
            "40acbf1ac0da79a2aabef394d58889136b8d38b05be09482006de2453fb06333")
        sh("""
        tar -xf data/ethics.tar -C data/
        rm data/ethics.tar
        touch data/ethics/done
        """)

def download(self):
    if not (self.DATASET_PATH / 'test').exists() or not (self.DATASET_PATH / 'done').exists():
        sh(f"mkdir -p {self.DATASET_PATH}")
        download_file(
            "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar",
            f"{self.DATASET_PATH}.tar",
            "01256fd7cd5430596fdf07e6e6a5827111b5235b7ffed679c662a12f898932da")
        sh(f"""
        tar -xf {self.DATASET_PATH}.tar -C data/ && touch {self.DATASET_PATH / 'done'}
        rm {self.DATASET_PATH}.tar
        """)

def download(fname, checksum, sources, extract=False):
    if os.path.exists(fname + '.done'):
        return
    print('Finding source for', fname)
    parentdir = Path(fname).parent
    os.makedirs(parentdir, exist_ok=True)
    for source in sources:
        try:
            # TODO: implement torrent handling
            if source.type == 'direct':
                download_file(source.url, fname, checksum)
            elif source.type == 'gdrive':
                if os.path.exists(fname):
                    try:
                        print(fname, 'already exists.')
                        sha256sum(fname, expected=checksum)
                        touch(fname + '.done')
                        return
                    except AssertionError:
                        print('{} exists but doesn\'t match checksum!'.format(fname))
                        rm_if_exists(fname)
                gdown.download(source.url, fname, quiet=False)
                sha256sum(fname, expected=checksum)
            elif source.type == 'gcloud':
                raise NotImplementedError('gcloud download not implemented!')
            if extract:
                tar_xf(fname)
                rm_if_exists(fname)
            touch(fname + '.done')
            return
        except SystemExit:
            raise
        except KeyboardInterrupt:
            raise
        except Exception:
            import traceback
            traceback.print_exc()
            print('Download method [{}] {} failed, trying next option'.format(
                source.type, source.url))
            # rm_if_exists(fname)
            continue
    raise Exception('Failed to download {} from any source'.format(fname))

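# The loop above leans on a few one-line helpers defined elsewhere; plausible
# minimal versions (names and signatures match the calls above, bodies are
# assumptions):
import hashlib
import tarfile


def sha256sum(fname, expected=None):
    # Hash the file in 1 MiB chunks; assert against the expected digest.
    h = hashlib.sha256()
    with open(fname, 'rb') as fh:
        for block in iter(lambda: fh.read(1 << 20), b''):
            h.update(block)
    digest = h.hexdigest()
    assert expected is None or digest == expected, \
        'checksum mismatch for {}'.format(fname)
    return digest


def touch(fname):
    open(fname, 'a').close()


def rm_if_exists(fname):
    if os.path.exists(fname):
        os.remove(fname)


def tar_xf(fname):
    # Extract next to the archive, mirroring `tar xf` semantics.
    with tarfile.open(fname) as tf:
        tf.extractall(Path(fname).parent)
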
def download(self):
    if not (self.DATASET_PATH / 'done').exists():
        sh("mkdir -p data")
        download_file(
            "https://people.eecs.berkeley.edu/~hendrycks/data.tar",
            "data/data.tar",
            "78a804365a59028188fb19bd1adcadc5e0c260b220a9d8b2e33a5ea7d5fbe3b4")
        sh("""
        tar -xf data/data.tar -C data/
        rm data/data.tar
        mv data/data data/hendrycksTest
        touch data/hendrycksTest/done
        """)

def download(self):
    coqa_train_filepath = 'data/coqa/coqa-train-v1.0.json'
    coqa_dev_filepath = 'data/coqa/coqa-dev-v1.0.json'
    sh("mkdir -p data/coqa")
    download_file(
        "http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-train-v1.0.json",
        coqa_train_filepath,
        "b0fdb2bc1bd38dd3ca2ce5fa2ac3e02c6288ac914f241ac409a655ffb6619fa6")
    download_file(
        "http://downloads.cs.stanford.edu/nlp/data/coqa/coqa-dev-v1.0.json",
        coqa_dev_filepath,
        "dfa367a9733ce53222918d0231d9b3bedc2b8ee831a2845f62dfc70701f2540a")

def download(self):
    if self.BASE_PATH.exists():
        return
    self.BASE_PATH.mkdir(parents=True)
    master_zip = Path("data/master.zip")
    download_file(
        "https://github.com/Nealcly/MuTual/archive/master.zip",
        str(master_zip),
        "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9")
    with zipfile.ZipFile(master_zip, 'r') as zf:
        zf.extractall("data")
    Path("data/MuTual-master/data").rename(str(self.BASE_PATH))
    # Remove leftover files and directories.
    master_zip.unlink()
    shutil.rmtree("data/MuTual-master")

def download(self): sh("mkdir -p data/lambada") download_file( "http://eaidata.bmk.sh/data/lambada_test.jsonl", "data/lambada/lambada_test.jsonl", "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226")
in_path = 'pile'
out_path = 'langlen_stage1'


def lengths(doc):
    global tok
    return {
        'len_char': len(doc),
        'len_utf8bytes': len(doc.encode('utf-8')),
        'len_words': len(re.split(r'\s+', doc)),
        'len_tokens': len(tok.encode(doc)),
    }


download_file(
    'https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin',
    'lid.176.bin',
    '7e69ec5451bc261cc7844e49e4792a85d7f09c06789ec800fc4a44aec362764e')


def language(doc):
    global langdet
    details = langdet.predict(doc.replace('\n', ' '), k=1)
    return {'lang': details[0][0].replace('__label__', '')}


def writef(f, lines):
    with open(f, 'wb') as fh:
        cctx = zstandard.ZstdCompressor(level=3, threads=8)
        compressor = cctx.stream_writer(fh)
        # lines is expected to yield pre-encoded bytes.
        for line in tqdm(lines):
            compressor.write(line)
        compressor.flush(zstandard.FLUSH_FRAME)

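# lengths() and language() above read the module globals `tok` and `langdet`,
# which are set up elsewhere. A plausible initialization (assumed, not from
# the original script): a GPT-2 tokenizer for token counts, plus the fastText
# language-ID model downloaded above.
import fasttext
from transformers import GPT2TokenizerFast

tok = GPT2TokenizerFast.from_pretrained('gpt2')
langdet = fasttext.load_model('lid.176.bin')
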
def download(self):
    if not os.path.exists('data/wikitext/wikitext-2-raw/wiki.valid.raw'):
        os.makedirs("data/wikitext/", exist_ok=True)
        download_file(
            "https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip",
            "data/wikitext/wikitext-2-raw-v1.zip",
            "ef7edb566e3e2b2d31b29c1fdb0c89a4cc683597484c3dc2517919c615435a11")
        sh("cd data/wikitext/ && unzip wikitext-2-raw-v1.zip")

def download(self): sh("mkdir -p data/lambada") download_file( "https://storage.googleapis.com/gpt-2/data/lambada_test.jsonl", "data/lambada/lambada_test.jsonl", "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226")