def main(args):
    num_process = min(multiprocessing.cpu_count(), args.num_process)
    if args.mode == 'download':
        download_wikicorpus(args.lang, args.date, args.output)
    elif args.mode == 'format':
        format_wikicorpus(args.input, args.output, args.bytes, num_process,
                          args.num_out_files)
    elif args.mode == 'download+format':
        downloaded_file = download_wikicorpus(args.lang, args.date, args.output)
        format_wikicorpus(downloaded_file, args.output, args.bytes, num_process,
                          args.num_out_files)
    elif args.mode == 'download_prepared':
        url = _URLS['wikipedia-en-20200620']
        file_hash = _URL_FILE_STATS[url]
        target_download_location = os.path.join(args.output,
                                                os.path.basename(url))
        download(url, target_download_location, sha1_hash=file_hash)
        tar = tarfile.open(target_download_location)
        names = tar.getnames()
        print('Start unarchiving raw text files')
        start_time = time.time()
        for name in names:
            tar.extract(name, path=args.output)
        tar.close()
        print('Done unarchiving within {:.2f} seconds'.format(
            time.time() - start_time))
    else:
        raise NotImplementedError
def convert_config(args, converted):
    print('converting cfg...')
    # download config
    gluon_cfg = Gluon_T5.get_cfg(T5_PRETRAINED_MODEL_MAP[args.model_name])
    with tempfile.TemporaryDirectory() as temp_dir:
        hf_cfg_path = os.path.join(temp_dir, 'config.json')
        download(url=T5_PRETRAINED_CONFIG_MAP[args.model_name],
                 path=hf_cfg_path)
        with open(hf_cfg_path, 'r') as f:
            hf_cfg = json.load(f)
        os.remove(hf_cfg_path)
    # update attributes
    cfg = gluon_cfg.clone()
    cfg.defrost()
    cfg.MODEL.vocab_size = hf_cfg['vocab_size']
    cfg.MODEL.d_model = hf_cfg['d_model']
    cfg.MODEL.d_kv = hf_cfg['d_kv']
    cfg.MODEL.d_ff = hf_cfg['d_ff']
    cfg.MODEL.num_layers = hf_cfg['num_layers']
    cfg.MODEL.num_heads = hf_cfg['num_heads']
    cfg.MODEL.layer_norm_eps = hf_cfg['layer_norm_epsilon']
    cfg.MODEL.dropout_prob = hf_cfg['dropout_rate']
    cfg.INITIALIZER.init_factor = hf_cfg['initializer_factor']
    cfg.freeze()
    # save config
    config_path = os.path.join(args.dest_dir, 'model.yml')
    with open(config_path, 'w') as f:
        f.write(cfg.dump())
    converted['config'] = config_path
    return cfg
def convert_vocab(args, converted):
    print('converting vocab...')
    # At this step we don't add <extra_id> tokens into the vocab; we just save
    # the original binary file directly. Those special tokens are added only
    # when instantiating a T5Tokenizer.
    vocab_path = os.path.join(args.dest_dir, 't5.vocab')
    download(url=PRETRAINED_VOCAB_MAP[args.model_name], path=vocab_path)
    converted['vocab'] = vocab_path
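# NOTE: minimal driver sketch, not part of the original script. The attribute
# names `model_name` and `dest_dir` mirror those accessed in convert_config/
# convert_vocab above; the default value below is a hypothetical placeholder.
import argparse
import os

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', required=True)
    parser.add_argument('--dest_dir', default='converted_t5')  # placeholder default
    args = parser.parse_args()
    os.makedirs(args.dest_dir, exist_ok=True)
    converted = {}  # maps artifact name -> saved path
    cfg = convert_config(args, converted)
    convert_vocab(args, converted)
    print(converted)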
def main(args):
    url = _URLS['books1']
    file_hash = _URL_FILE_STATS[url]
    target_download_location = os.path.join(args.cache_path,
                                            os.path.basename(url))
    download(url, target_download_location, sha1_hash=file_hash)
    tar = tarfile.open(target_download_location)
    names = tar.getnames()
    print('Start unarchiving raw text files')
    start_time = time.time()
    for name in names:
        tar.extract(name, path=args.output)
    tar.close()
    print('Done unarchiving within {:.2f} seconds'.format(
        time.time() - start_time))
    print('Start merging into one article per line')
    input_name = os.path.join(args.output, 'books1/epubtxt/')
    output_name = os.path.join(args.output, 'bookcorpus.txt')
    # Avoid shadowing the builtin `format`
    formatter = BookscorpusTextFormatting(input_name, output_name)
    formatter.merge()
    print('Done merging')
    if args.segment_sentences:
        print('Start converting bookcorpus to one sentence per line')
        t1 = time.time()
        input_name = os.path.join(args.output, 'bookcorpus.txt')
        output_name = os.path.join(args.output, 'one_sentence_per_line/')
        if not os.path.exists(output_name):
            os.mkdir(output_name)
        sharding = Sharding([input_name], output_name, 128, 1, 0,
                            args.segment_num_worker)
        sharding.load_articles()
        sharding.segment_articles_into_sentences()
        t2 = time.time()
        print('Sentence segmentation took {:.2f} seconds'.format(t2 - t1))
def main(args):
    def extract(gz_path):
        logging.warning(
            f'Extracting {gz_path}; this may take a long time because the file is large')
        try:
            f_name = gz_path.replace('.gz', '')
            with gzip.GzipFile(gz_path) as g_file, open(f_name, 'wb') as out_file:
                out_file.write(g_file.read())
            os.remove(gz_path)
        except Exception as e:
            print(e)

    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
    for url in _URLS.values():
        file_name = url[url.rfind('/') + 1:]
        file_hash = _URL_FILE_STATS[url]
        download(url, path=os.path.join(args.cache_path, file_name),
                 sha1_hash=file_hash)
        if not os.path.exists(os.path.join(args.save_path, file_name)) \
                or (args.overwrite and args.save_path != args.cache_path):
            os.symlink(os.path.join(args.cache_path, file_name),
                       os.path.join(args.save_path, file_name))
        if args.extract:
            extract(os.path.join(args.save_path, file_name))
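# NOTE: illustrative alternative, not part of the original script. The
# extract() helper above reads the whole .gz into memory, which can be costly
# for multi-GB archives; a streaming variant with shutil.copyfileobj keeps
# memory bounded, using only the standard library.
import gzip
import shutil

def extract_streaming(gz_path):
    """Decompress gz_path next to itself without loading it fully into memory."""
    out_path = gz_path[:-len('.gz')] if gz_path.endswith('.gz') else gz_path + '.out'
    with gzip.open(gz_path, 'rb') as src, open(out_path, 'wb') as dst:
        shutil.copyfileobj(src, dst)  # copies in fixed-size chunks
    return out_path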
def test_sentencepiece_tokenizer():
    with tempfile.TemporaryDirectory() as dir_path:
        model_path = os.path.join(dir_path, 'spm.model')
        download(url=get_repo_url()
                 + 'tokenizer_test_models/sentencepiece/case1/test_ende-a9bee4.model',
                 path=model_path)
        # Case 1
        tokenizer = SentencepieceTokenizer(model_path)
        gt_tokenized = [
            ['▁Hel', 'lo', ',', '▁y', "'", 'all', '!', '▁How', '▁are', '▁you',
             '▁', 'VI', 'II', '▁', '😁', '▁', '😁', '▁', '😁', '▁?'],
            ['▁G', 'lu', 'on', 'N', 'L', 'P', '▁is', '▁great', '!', '!', '!',
             '!', '!', '!'],
            ['▁G', 'lu', 'on', 'N', 'L', 'P', '-', 'A', 'ma', 'zo', 'n', '-',
             'H', 'ai', 'bin', '-', 'L', 'e', 'on', 'ard', '-', 'S', 'hen',
             'g', '-', 'S', 'hu', 'ai', '-', 'X', 'ing', 'j', 'ian', '.', '.',
             '.', '.', '.', '/', ':', '!', '@', '#', '▁', "'", 'ab', 'c', "'"]]
        gt_offsets = [
            [(0, 3), (3, 5), (5, 6), (6, 8), (8, 9), (9, 12), (12, 13),
             (13, 17), (17, 21), (21, 25), (25, 26), (26, 26), (26, 27),
             (27, 28), (28, 29), (29, 30), (30, 31), (31, 32), (32, 33),
             (33, 35)],
            [(0, 1), (1, 3), (3, 5), (5, 6), (6, 7), (7, 8), (8, 11),
             (11, 17), (17, 18), (18, 19), (19, 20), (20, 21), (21, 22),
             (22, 23)],
            [(0, 1), (1, 3), (3, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 10),
             (10, 12), (12, 14), (14, 15), (15, 16), (16, 17), (17, 19),
             (19, 22), (22, 23), (23, 24), (24, 25), (25, 27), (27, 30),
             (30, 31), (31, 32), (32, 35), (35, 36), (36, 37), (37, 38),
             (38, 40), (40, 42), (42, 43), (43, 44), (44, 47), (47, 48),
             (48, 51), (51, 52), (52, 53), (53, 54), (54, 55), (55, 56),
             (56, 57), (57, 58), (58, 59), (59, 60), (60, 61), (61, 62),
             (62, 63), (63, 65), (65, 66), (66, 67)]]
        gt_int_decode = ['Hello, y ⁇ all! How are you VIII ⁇ ⁇ ⁇ ?',
                         'GluonNLP is great!!!!!!',
                         'GluonNLP-Amazon-Haibin-Leonard-Sheng-Shuai-Xingjian...../:! ⁇ # ⁇ abc ⁇ ']
        verify_encode_token(tokenizer, SUBWORD_TEST_SAMPLES, gt_tokenized)
        verify_pickleble(tokenizer, SentencepieceTokenizer)
        verify_encode_token_with_offsets(tokenizer, SUBWORD_TEST_SAMPLES, gt_offsets)
        verify_decode_spm(tokenizer, SUBWORD_TEST_SAMPLES, gt_int_decode)
        # Case 2, lower_case
        gt_lower_case_int_decode = ['hello, y ⁇ all! how are you viii ⁇ ⁇ ⁇ ?',
                                    'gluonnlp is great!!!!!!',
                                    'gluonnlp-amazon-haibin-leonard-sheng-shuai-xingjian...../:! ⁇ # ⁇ abc ⁇ ']
        tokenizer = SentencepieceTokenizer(model_path, lowercase=True)
        verify_decode_spm(tokenizer, SUBWORD_TEST_SAMPLES, gt_lower_case_int_decode)
        # Case 3, use the sentencepiece regularization (sampling) options and
        # test whether we can obtain different encoding results across runs
        tokenizer = SentencepieceTokenizer(model_path, lowercase=True,
                                           nbest=-1, alpha=1.0)
        has_different_encode_out = False
        encode_out = None
        for _ in range(10):
            if encode_out is None:
                encode_out = tokenizer.encode(SUBWORD_TEST_SAMPLES[0])
            else:
                ele_out = tokenizer.encode(SUBWORD_TEST_SAMPLES[0])
                if ele_out != encode_out:
                    has_different_encode_out = True
                    break
        assert has_different_encode_out
        os.remove(model_path)
def main(args):
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
    for url in _URLS.values():
        file_name = url[url.rfind('/') + 1:]
        file_hash = _URL_FILE_STATS[url]
        download(url, path=os.path.join(args.cache_path, file_name),
                 sha1_hash=file_hash)
        if not os.path.exists(os.path.join(args.save_path, file_name)) \
                or (args.overwrite and args.save_path != args.cache_path):
            os.symlink(os.path.join(args.cache_path, file_name),
                       os.path.join(args.save_path, file_name))
def test_huggingface_wordpiece_tokenizer():
    with tempfile.TemporaryDirectory() as dir_path:
        vocab_path = os.path.join(dir_path, 'hf_wordpiece.vocab')
        download(url=get_repo_url()
                 + 'tokenizer_test_models/hf_wordpiece/test_hf_wordpiece.vocab',
                 path=vocab_path)
        hf_vocab_path = os.path.join(dir_path, 'hf_wordpiece.hf_vocab')
        download(url=get_repo_url()
                 + 'tokenizer_test_models/hf_wordpiece/test_hf_wordpiece.hf_vocab',
                 path=hf_vocab_path)
        # Case 1, lowercase=True
        tokenizer = HuggingFaceWordPieceTokenizer(vocab_path, lowercase=True)
        gt_tokenized = [
            ['hello', ',', 'y', "'", 'all', '!', 'how', 'are', 'you', '<unk>',
             '<unk>', '<unk>', '<unk>', '?'],
            ['gl', '##uo', '##nn', '##l', '##p', 'is', 'great', '\uff01',
             '\uff01', '\uff01', '!', '!', '!'],
            ['gl', '##uo', '##nn', '##l', '##p', '-', 'amazon', '-', 'hai',
             '##bin', '-', 'leonard', '-', 'shen', '##g', '-', 'shu', '##ai',
             '-', 'xin', '##g', '##ji', '##an', '.', '.', '.', '.', '.', '/',
             ':', '!', '@', '#', "'", 'abc', "'"]]
        gt_offsets = [
            [(0, 5), (5, 6), (7, 8), (8, 9), (9, 12), (12, 13), (14, 17),
             (18, 21), (22, 25), (26, 27), (28, 29), (30, 31), (32, 33),
             (34, 35)],
            [(0, 2), (2, 4), (4, 6), (6, 7), (7, 8), (9, 11), (12, 17),
             (17, 18), (18, 19), (19, 20), (20, 21), (21, 22), (22, 23)],
            [(0, 2), (2, 4), (4, 6), (6, 7), (7, 8), (8, 9), (9, 15),
             (15, 16), (16, 19), (19, 22), (22, 23), (23, 30), (30, 31),
             (31, 35), (35, 36), (36, 37), (37, 40), (40, 42), (42, 43),
             (43, 46), (46, 47), (47, 49), (49, 51), (51, 52), (52, 53),
             (53, 54), (54, 55), (55, 56), (56, 57), (57, 58), (58, 59),
             (59, 60), (60, 61), (62, 63), (63, 66), (66, 67)]]
        gt_decode = ["hello, y'all! how are you?",
                     'gluonnlp is great ! ! !!!!',
                     "gluonnlp - amazon - haibin - leonard - sheng - shuai - xingjian..... / :! @ #'abc '"]
        verify_encode_token(tokenizer, SUBWORD_TEST_SAMPLES, gt_tokenized)
        verify_pickleble(tokenizer, HuggingFaceWordPieceTokenizer)
        verify_encode_token_with_offsets(tokenizer, SUBWORD_TEST_SAMPLES, gt_offsets)
        verify_decode_hf(tokenizer, SUBWORD_TEST_SAMPLES, gt_decode)
        # Case 2, lowercase=False
        gt_lowercase_decode = [", y'all! are you?",
                               'is great ! ! !!!!',
                               "- - - - - -..... / :! @ #'abc '"]
        tokenizer = HuggingFaceWordPieceTokenizer(vocab_path, lowercase=False)
        verify_decode_hf(tokenizer, SUBWORD_TEST_SAMPLES, gt_lowercase_decode)
        # Case 3, using original hf vocab
        tokenizer = HuggingFaceWordPieceTokenizer(hf_vocab_path, lowercase=True)
        verify_encode_token(tokenizer, SUBWORD_TEST_SAMPLES, gt_tokenized)
        verify_pickleble(tokenizer, HuggingFaceWordPieceTokenizer)
        verify_encode_token_with_offsets(tokenizer, SUBWORD_TEST_SAMPLES, gt_offsets)
        verify_decode_hf(tokenizer, SUBWORD_TEST_SAMPLES, gt_decode)
        os.remove(vocab_path)
        os.remove(hf_vocab_path)
def download_wikicorpus(lang, date, output):
    """Download a Wikipedia dump.

    lang: the language code, such as en or zh
    date: string, the date of the Wikipedia dump in YYYYMMDD format, or 'latest'.
    """
    if not os.path.exists(output):
        os.makedirs(output)
    if lang not in __LANGUAGES_BANK:
        raise ValueError('Unsupported language code')
    language = lang.replace('-', '_')
    output_file = os.path.join(output, 'download', language, date,
                               'wikicorpus.xml.bz2')
    download(get_url(language, date), output_file)
    return output_file
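# Illustrative call, not part of the original script; the date below is an
# example value, not a pinned release.
dump_path = download_wikicorpus(lang='en', date='20200620', output='./wiki_data')
# -> ./wiki_data/download/en/20200620/wikicorpus.xml.bz2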
def main(args):
    url = _URLS['gutenberg']
    file_hash = _URL_FILE_STATS[url]
    target_download_location = os.path.join(args.cache_path,
                                            os.path.basename(url))
    download(url, target_download_location, sha1_hash=file_hash)
    save_dir = args.dataset if args.save_dir is None else args.save_dir
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
    with zipfile.ZipFile(target_download_location) as f:
        for name in f.namelist():
            if name.endswith('.txt'):
                # Extract into save_dir directly; ZipFile.extract keeps the
                # archive-internal path below the given directory. (The
                # original passed os.path.join(save_dir, filename) here, which
                # nested each file under a directory named after itself.)
                f.extract(name, save_dir)
def verify_download(url, sha1_hash, overwrite):
    with tempfile.TemporaryDirectory() as root:
        download_path = os.path.join(root, 'dat0')
        # First, verify that we are able to download the data correctly
        download(url, sha1_hash=sha1_hash, path=download_path, overwrite=overwrite)
        assert sha1sum(download_path) == sha1_hash
        os.remove(download_path)
        # Second, verify that we are able to download with multiprocessing
        download_path = os.path.join(root, 'dat1')
        with multiprocessing.Pool(2) as pool:
            pool.map(functools.partial(download, sha1_hash=sha1_hash,
                                       path=download_path, overwrite=overwrite),
                     [url for _ in range(2)])
        assert sha1sum(download_path) == sha1_hash
        os.remove(download_path)
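# Hypothetical smoke test (the URL and SHA-1 below are placeholders, not real
# artifacts): both pool workers race to fetch the same destination path, so
# this exercises the downloader's handling of concurrent writes to one file.
verify_download(url='https://example.com/data.bin',
                sha1_hash='0123456789abcdef0123456789abcdef01234567',
                overwrite=True)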
def main(args):
    num_process = min(multiprocessing.cpu_count(), args.num_process)
    if args.mode == 'download':
        download_wikicorpus(args.lang, args.date, args.output)
    elif args.mode == 'format':
        format_wikicorpus(args.input, args.output, args.bytes, num_process,
                          args.num_out_files, args.quiet)
    elif args.mode == 'download+format':
        downloaded_file = download_wikicorpus(args.lang, args.date, args.output)
        format_wikicorpus(downloaded_file, args.output, args.bytes, num_process,
                          args.num_out_files, args.quiet)
    elif args.mode == 'download_prepared':
        url = _URLS['wikipedia-en-20200620']
        file_hash = _URL_FILE_STATS[url]
        target_download_location = os.path.join(args.output,
                                                os.path.basename(url))
        download(url, target_download_location, sha1_hash=file_hash)
        tar = tarfile.open(target_download_location)
        names = tar.getnames()
        print('Start unarchiving raw text files')
        start_time = time.time()
        for name in names:
            tar.extract(name, path=args.output)
        tar.close()
        print('Done unarchiving within {:.2f} seconds'.format(
            time.time() - start_time))
    else:
        raise NotImplementedError
    if args.segment_sentences:
        # Note: the original message said "bookcorpus" here, copied from the
        # bookcorpus script; this converts the prepared wikipedia text.
        print('Start converting wikipedia to one sentence per line')
        t1 = time.time()
        segmenter = NLTKSegmenter()
        original_name = os.path.join(args.output, 'prepared_wikipedia')
        output_name = os.path.join(args.output, 'one_sentence_per_line/')
        if not os.path.exists(output_name):
            os.mkdir(output_name)
        input_names = [os.path.join(original_name, name)
                       for name in os.listdir(original_name)]
        sharding = Sharding(input_names, output_name, 256, 1, 0,
                            args.segment_num_worker)
        sharding.load_articles()
        sharding.segment_articles_into_sentences()
        t2 = time.time()
        print('Sentence segmentation took {:.2f} seconds'.format(t2 - t1))
def main(args):
    train_url = _URLS[args.version]['train']
    dev_url = _URLS[args.version]['dev']
    train_file_name = train_url[train_url.rfind('/') + 1:]
    dev_file_name = dev_url[dev_url.rfind('/') + 1:]
    download(train_url, path=os.path.join(args.cache_path, train_file_name))
    download(dev_url, path=os.path.join(args.cache_path, dev_file_name))
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
    if not os.path.exists(os.path.join(args.save_path, train_file_name)) \
            or (args.overwrite and args.save_path != args.cache_path):
        os.symlink(os.path.join(args.cache_path, train_file_name),
                   os.path.join(args.save_path, train_file_name))
    if not os.path.exists(os.path.join(args.save_path, dev_file_name)) \
            or (args.overwrite and args.save_path != args.cache_path):
        os.symlink(os.path.join(args.cache_path, dev_file_name),
                   os.path.join(args.save_path, dev_file_name))
def main(args):
    url = _URLS['gutenberg']
    file_hash = _URL_FILE_STATS[url]
    target_download_location = os.path.join(args.cache_path,
                                            os.path.basename(url))
    download(url, target_download_location, sha1_hash=file_hash)
    save_dir = args.dataset if args.save_dir is None else args.save_dir
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
    print(f'Save to {save_dir}')
    with zipfile.ZipFile(target_download_location) as f:
        for name in f.namelist():
            if name.endswith('.txt'):
                filename = os.path.basename(name)
                with f.open(name) as in_file:
                    with open(os.path.join(save_dir, filename.replace(' ', '_')),
                              'wb') as out_file:
                        shutil.copyfileobj(in_file, out_file)
def try_import_wikiextractor():
    try:
        sys.path.append(_CURR_DIR)
        import WikiExtractor
    except ImportError:
        try:
            download(
                'https://raw.githubusercontent.com/attardi/wikiextractor/master/WikiExtractor.py',
                path=os.path.join(_CURR_DIR, 'WikiExtractor.py'),
                sha1_hash='3c4896a837b75c476d23c037e8d6c7fdfd9a29eb')
            sys.path.append(_CURR_DIR)
            import WikiExtractor
        except BaseException:
            raise ImportError(
                'Cannot import WikiExtractor! You can download "WikiExtractor.py"'
                ' from https://github.com/attardi/wikiextractor to {}'.format(
                    _CURR_DIR))
    return WikiExtractor
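# Usage sketch: the helper vendors the single-file WikiExtractor dependency at
# runtime (downloading it next to this script on first use), so callers just
# resolve the module once and reuse it.
WikiExtractor = try_import_wikiextractor()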
def test_huggingface_bytebpe_tokenizer_v08():
    """Test for huggingface bytebpe tokenizer >=0.8"""
    with tempfile.TemporaryDirectory() as dir_path:
        model_path = os.path.join(dir_path, 'hf_bytebpe_new_0.8.model')
        download(url=get_repo_url()
                 + 'tokenizer_test_models/hf_bytebpe_new_0.8/hf_bytebpe.model',
                 path=model_path,
                 sha1_hash='a1c4da1f6c21df923e150f56dbb5b7a53c61808b')
        vocab_path = os.path.join(dir_path, 'hf_bytebpe_new_0.8.vocab')
        download(url=get_repo_url()
                 + 'tokenizer_test_models/hf_bytebpe_new_0.8/hf_bytebpe.vocab',
                 path=vocab_path,
                 sha1_hash='7831b19078a3222f450e65b2188dc0770473123b')
        tokenizer = HuggingFaceTokenizer(model_path, vocab_path)
        gt_tokenized = [
            ['He', 'llo', ',', 'Ġy', "'", 'all', '!', 'ĠHow', 'Ġare', 'Ġyou',
             'Ġâ', 'ħ', '§', 'Ġ', 'ð', 'Ł', 'ĺ', 'ģ', 'Ġ', 'ð', 'Ł', 'ĺ', 'ģ',
             'Ġ', 'ð', 'Ł', 'ĺ', 'ģ', 'Ġ?'],
            ['G', 'l', 'u', 'on', 'N', 'L', 'P', 'Ġis', 'Ġgreat', 'ï', '¼',
             'ģ', 'ï', '¼', 'ģ', 'ï', '¼', 'ģ', '!', '!', '!'],
            ['G', 'l', 'u', 'on', 'N', 'L', 'P', '-', 'Am', 'az', 'on', '-',
             'Ha', 'ib', 'in', '-', 'Le', 'on', 'ard', '-', 'S', 'hen', 'g',
             '-', 'Sh', 'u', 'ai', '-', 'X', 'ing', 'j', 'ian', '..', '...',
             '/', ':', '!', '@', '#', 'Ġ', "'", 'ab', 'c', "'"]]
        gt_offsets = [
            [(0, 2), (2, 5), (5, 6), (6, 8), (8, 9), (9, 12), (12, 13),
             (13, 17), (17, 21), (21, 25), (25, 27), (26, 27), (26, 27),
             (27, 28), (28, 29), (28, 29), (28, 29), (28, 29), (29, 30),
             (30, 31), (30, 31), (30, 31), (30, 31), (31, 32), (32, 33),
             (32, 33), (32, 33), (32, 33), (33, 35)],
            [(0, 1), (1, 2), (2, 3), (3, 5), (5, 6), (6, 7), (7, 8), (8, 11),
             (11, 17), (17, 18), (17, 18), (17, 18), (18, 19), (18, 19),
             (18, 19), (19, 20), (19, 20), (19, 20), (20, 21), (21, 22),
             (22, 23)],
            [(0, 1), (1, 2), (2, 3), (3, 5), (5, 6), (6, 7), (7, 8), (8, 9),
             (9, 11), (11, 13), (13, 15), (15, 16), (16, 18), (18, 20),
             (20, 22), (22, 23), (23, 25), (25, 27), (27, 30), (30, 31),
             (31, 32), (32, 35), (35, 36), (36, 37), (37, 39), (39, 40),
             (40, 42), (42, 43), (43, 44), (44, 47), (47, 48), (48, 51),
             (51, 53), (53, 56), (56, 57), (57, 58), (58, 59), (59, 60),
             (60, 61), (61, 62), (62, 63), (63, 65), (65, 66), (66, 67)]]
        gt_decode = ["Hello, y'all! How are you Ⅷ 😁 😁 😁 ?",
                     'GluonNLP is great!!!!!!',
                     "GluonNLP-Amazon-Haibin-Leonard-Sheng-Shuai-Xingjian...../:!@# 'abc'"]
        verify_encode_token(tokenizer, SUBWORD_TEST_SAMPLES, gt_tokenized)
        verify_pickleble(tokenizer, HuggingFaceTokenizer)
        verify_encode_token_with_offsets(tokenizer, SUBWORD_TEST_SAMPLES, gt_offsets)
        verify_decode_hf(tokenizer, SUBWORD_TEST_SAMPLES, gt_decode)
def test_huggingface_wordpiece_tokenizer_v08():
    """Test for huggingface tokenizer >=0.8"""
    with tempfile.TemporaryDirectory() as dir_path:
        model_path = os.path.join(dir_path, 'hf_wordpiece_new_0.8.model')
        download(url=get_repo_url()
                 + 'tokenizer_test_models/hf_wordpiece_new_0.8/hf_wordpiece.model',
                 path=model_path,
                 sha1_hash='66ccadf6e5e354ff9604e4a82f107a2ac873abd5')
        vocab_path = os.path.join(dir_path, 'hf_wordpiece_new_0.8.vocab')
        download(url=get_repo_url()
                 + 'tokenizer_test_models/hf_wordpiece_new_0.8/hf_wordpiece.vocab',
                 path=vocab_path,
                 sha1_hash='dd6fdf4bbc74eaa8806d12cb3d38a4d9a306aea8')
        tokenizer = HuggingFaceTokenizer(model_path, vocab_path)
        gt_tokenized = [
            ['Hel', '##lo', ',', 'y', '[UNK]', 'all', '!', 'How', 'are',
             'you', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '?'],
            ['Gl', '##u', '##on', '##N', '##L', '##P', 'is', 'great',
             '[UNK]', '[UNK]', '[UNK]', '!', '!', '!'],
            ['Gl', '##u', '##on', '##N', '##L', '##P', '-', 'Am', '##az',
             '##on', '-', 'Ha', '##ibi', '##n', '-', 'Leon', '##ard', '-',
             'She', '##n', '##g', '-', 'Sh', '##ua', '##i', '-', 'X',
             '##ing', '##j', '##ian', '.', '.', '.', '.', '.', '/', ':', '!',
             '@', '#', '[UNK]', 'ab', '##c', '[UNK]']]
        gt_offsets = [
            [(0, 3), (3, 5), (5, 6), (7, 8), (8, 9), (9, 12), (12, 13),
             (14, 17), (18, 21), (22, 25), (26, 27), (28, 29), (30, 31),
             (32, 33), (34, 35)],
            [(0, 2), (2, 3), (3, 5), (5, 6), (6, 7), (7, 8), (9, 11),
             (12, 17), (17, 18), (18, 19), (19, 20), (20, 21), (21, 22),
             (22, 23)],
            [(0, 2), (2, 3), (3, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 11),
             (11, 13), (13, 15), (15, 16), (16, 18), (18, 21), (21, 22),
             (22, 23), (23, 27), (27, 30), (30, 31), (31, 34), (34, 35),
             (35, 36), (36, 37), (37, 39), (39, 41), (41, 42), (42, 43),
             (43, 44), (44, 47), (47, 48), (48, 51), (51, 52), (52, 53),
             (53, 54), (54, 55), (55, 56), (56, 57), (57, 58), (58, 59),
             (59, 60), (60, 61), (62, 63), (63, 65), (65, 66), (66, 67)]]
        gt_decode = ['Hello, y all! How are you?',
                     'GluonNLP is great!!!',
                     'GluonNLP - Amazon - Haibin - Leonard - Sheng - Shuai - Xingjian..... / '
                     ':! @ # abc']
        verify_encode_token(tokenizer, SUBWORD_TEST_SAMPLES, gt_tokenized)
        verify_pickleble(tokenizer, HuggingFaceTokenizer)
        verify_encode_token_with_offsets(tokenizer, SUBWORD_TEST_SAMPLES, gt_offsets)
        verify_decode_hf(tokenizer, SUBWORD_TEST_SAMPLES, gt_decode)
def test_yttm_tokenizer():
    with tempfile.TemporaryDirectory() as dir_path:
        model_path = os.path.join(dir_path, 'yttm.model')
        download(url=get_repo_url()
                 + 'tokenizer_test_models/yttm/test_ende_yttm-6f2c39.model',
                 path=model_path)
        tokenizer = YTTMTokenizer(model_path=model_path)
        gt_tokenized = [
            ['▁He', 'll', 'o', ',', '▁y', "'", 'all', '!', '▁How', '▁are',
             '▁you', '▁', 'Ⅷ', '▁', '😁', '▁', '😁', '▁', '😁', '▁?'],
            ['▁Gl', 'u', 'on', 'N', 'L', 'P', '▁is', '▁great', '!', '!', '!',
             '!', '!', '!'],
            ['▁Gl', 'u', 'on', 'N', 'L', 'P', '-A', 'm', 'az', 'on', '-H',
             'a', 'ib', 'in', '-L', 'e', 'on', 'ard', '-S', 'hen', 'g', '-S',
             'h', 'u', 'ai', '-', 'X', 'ing', 'j', 'ian', '.', '.', '.', '.',
             '.', '/', ':', '!', '@', '#', '▁', "'", 'ab', 'c', "'"]]
        gt_offsets = [
            [(0, 2), (2, 4), (4, 5), (5, 6), (6, 8), (8, 9), (9, 12),
             (12, 13), (13, 17), (17, 21), (21, 25), (25, 26), (26, 27),
             (27, 28), (28, 29), (29, 30), (30, 31), (31, 32), (32, 33),
             (33, 35)],
            [(0, 2), (2, 3), (3, 5), (5, 6), (6, 7), (7, 8), (8, 11),
             (11, 17), (17, 18), (18, 19), (19, 20), (20, 21), (21, 22),
             (22, 23)],
            [(0, 2), (2, 3), (3, 5), (5, 6), (6, 7), (7, 8), (8, 10),
             (10, 11), (11, 13), (13, 15), (15, 17), (17, 18), (18, 20),
             (20, 22), (22, 24), (24, 25), (25, 27), (27, 30), (30, 32),
             (32, 35), (35, 36), (36, 38), (38, 39), (39, 40), (40, 42),
             (42, 43), (43, 44), (44, 47), (47, 48), (48, 51), (51, 52),
             (52, 53), (53, 54), (54, 55), (55, 56), (56, 57), (57, 58),
             (58, 59), (59, 60), (60, 61), (61, 62), (62, 63), (63, 65),
             (65, 66), (66, 67)]]
        gt_int_decode = ['Hello, y<UNK>all! How are you <UNK> <UNK> <UNK> <UNK> ?',
                         'GluonNLP is great!!!!!!',
                         'GluonNLP-Amazon-Haibin-Leonard-Sheng-Shuai-Xingjian...../:!@# <UNK>abc<UNK>']
        gt_str_decode = ["Hello, y'all! How are you Ⅷ 😁 😁 😁 ?",
                         'GluonNLP is great!!!!!!',
                         "GluonNLP-Amazon-Haibin-Leonard-Sheng-Shuai-Xingjian...../:!@# 'abc'"]
        verify_encode_token(tokenizer, SUBWORD_TEST_SAMPLES, gt_tokenized)
        verify_pickleble(tokenizer, YTTMTokenizer)
        verify_encode_token_with_offsets(tokenizer, SUBWORD_TEST_SAMPLES, gt_offsets)
        # Begin to verify decode
        for sample_sentences, ele_gt_int_decode, ele_gt_str_decode in [
                (SUBWORD_TEST_SAMPLES[0], gt_int_decode[0], gt_str_decode[0]),
                (SUBWORD_TEST_SAMPLES, gt_int_decode, gt_str_decode)]:
            int_decode = tokenizer.decode(tokenizer.encode(sample_sentences, int))
            str_decode = tokenizer.decode(tokenizer.encode(sample_sentences, str))
            assert int_decode == ele_gt_int_decode
            assert str_decode == ele_gt_str_decode
        os.remove(model_path)
        assert tokenizer.decode([]) == ''
        assert tokenizer.decode([[]]) == ['']
def test_subword_nmt_tokenizer():
    with tempfile.TemporaryDirectory() as dir_path:
        model_path = os.path.join(dir_path, 'subword_nmt.model')
        download(url=get_repo_url()
                 + 'tokenizer_test_models/subword-nmt/test_ende-d189ff.model',
                 path=model_path)
        vocab_path = os.path.join(dir_path, 'subword_nmt.vocab')
        download(url=get_repo_url()
                 + 'tokenizer_test_models/subword-nmt/test_ende_vocab-900f81.json',
                 path=vocab_path)
        # Case 1
        tokenizer = SubwordNMTTokenizer(model_path, vocab_path)
        gt_tokenized = [
            ['Hel', 'lo', ',</w>', 'y', "'", 'all', '!</w>', 'How</w>',
             'are</w>', 'you</w>', 'Ⅷ</w>', '😁</w>', '😁</w>', '😁</w>',
             '?</w>'],
            ['Gl', 'u', 'on', 'N', 'L', 'P</w>', 'is</w>', 'great', '!', '!',
             '!', '!!', '!</w>'],
            ['Gl', 'u', 'on', 'N', 'L', 'P', '-', 'Amaz', 'on-', 'H', 'ai',
             'b', 'in-', 'Le', 'on', 'ard', '-', 'Sh', 'eng', '-', 'Sh', 'u',
             'ai', '-', 'X', 'ing', 'ji', 'an', '..', '...', '/', ':', '!',
             '@', '#</w>', "'", 'ab', 'c', "'</w>"]]
        gt_offsets = [
            [(0, 3), (3, 5), (5, 6), (7, 8), (8, 9), (9, 12), (12, 13),
             (14, 17), (18, 21), (22, 25), (26, 27), (28, 29), (30, 31),
             (32, 33), (34, 35)],
            [(0, 2), (2, 3), (3, 5), (5, 6), (6, 7), (7, 8), (9, 11),
             (12, 17), (17, 18), (18, 19), (19, 20), (20, 22), (22, 23)],
            [(0, 2), (2, 3), (3, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 13),
             (13, 16), (16, 17), (17, 19), (19, 20), (20, 23), (23, 25),
             (25, 27), (27, 30), (30, 31), (31, 33), (33, 36), (36, 37),
             (37, 39), (39, 40), (40, 42), (42, 43), (43, 44), (44, 47),
             (47, 49), (49, 51), (51, 53), (53, 56), (56, 57), (57, 58),
             (58, 59), (59, 60), (60, 61), (62, 63), (63, 65), (65, 66),
             (66, 67)]]
        gt_int_decode = ["Hello, y'all! How are you Ⅷ 😁 😁 😁 ?",
                         'GluonNLP is great!!!!!!',
                         "GluonNLP-Amazon-Haibin-Leonard-Sheng-Shuai-Xingjian...../:!@# 'abc'"]
        gt_str_decode = SUBWORD_TEST_SAMPLES
        verify_encode_token(tokenizer, SUBWORD_TEST_SAMPLES, gt_tokenized)
        verify_pickleble(tokenizer, SubwordNMTTokenizer)
        verify_encode_token_with_offsets(tokenizer, SUBWORD_TEST_SAMPLES, gt_offsets)
        verify_decode_subword_nmt(tokenizer, SUBWORD_TEST_SAMPLES,
                                  gt_int_decode, gt_str_decode)
        # Case 2, bpe_dropout
        # We use str decode here because we may not perfectly recover the
        # original sentence with int decode.
        tokenizer = SubwordNMTTokenizer(model_path, vocab_path, bpe_dropout=0.5)
        verify_decode(tokenizer, SUBWORD_TEST_SAMPLES, out_type=str)
        os.remove(model_path)
        os.remove(vocab_path)
def test_huggingface_bpe_tokenizer_v08():
    """Test for huggingface BPE tokenizer >=0.8"""
    with tempfile.TemporaryDirectory() as dir_path:
        model_path = os.path.join(dir_path, 'hf_bpe_new_0.8.model')
        download(url=get_repo_url()
                 + 'tokenizer_test_models/hf_bpe_new_0.8/hf_bpe.model',
                 path=model_path,
                 sha1_hash='ecda90979561ca4c5a8d769b5e3c9fa2270d5317')
        vocab_path = os.path.join(dir_path, 'hf_bpe_new_0.8.vocab')
        download(url=get_repo_url()
                 + 'tokenizer_test_models/hf_bpe_new_0.8/hf_bpe.vocab',
                 path=vocab_path,
                 sha1_hash='b92dde0b094f405208f3ec94b5eae88430bf4262')
        tokenizer = HuggingFaceTokenizer(model_path, vocab_path)
        gt_tokenized = [
            ['H', 'ello</w>', ',</w>', 'y</w>', 'all</w>', '!</w>', 'How</w>',
             'are</w>', 'you</w>', '?</w>'],
            ['G', 'lu', 'on', 'N', 'L', 'P</w>', 'is</w>', 'great</w>',
             '!</w>', '!</w>', '!</w>'],
            ['G', 'lu', 'on', 'N', 'L', 'P</w>', '-</w>', 'Amaz', 'on</w>',
             '-</w>', 'Ha', 'i', 'bin</w>', '-</w>', 'Leon', 'ard</w>',
             '-</w>', 'Sh', 'eng</w>', '-</w>', 'S', 'hu', 'ai</w>', '-</w>',
             'X', 'ing', 'j', 'ian</w>', '.</w>', '.</w>', '.</w>', '.</w>',
             '.</w>', '/</w>', ':</w>', '!</w>', '@</w>', '#</w>', 'ab',
             'c</w>']]
        gt_offsets = [
            [(0, 1), (1, 5), (5, 6), (7, 8), (9, 12), (12, 13), (14, 17),
             (18, 21), (22, 25), (34, 35)],
            [(0, 1), (1, 3), (3, 5), (5, 6), (6, 7), (7, 8), (9, 11),
             (12, 17), (20, 21), (21, 22), (22, 23)],
            [(0, 1), (1, 3), (3, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 13),
             (13, 15), (15, 16), (16, 18), (18, 19), (19, 22), (22, 23),
             (23, 27), (27, 30), (30, 31), (31, 33), (33, 36), (36, 37),
             (37, 38), (38, 40), (40, 42), (42, 43), (43, 44), (44, 47),
             (47, 48), (48, 51), (51, 52), (52, 53), (53, 54), (54, 55),
             (55, 56), (56, 57), (57, 58), (58, 59), (59, 60), (60, 61),
             (63, 65), (65, 66)]]
        gt_decode = ['Hello , y all ! How are you ?',
                     'GluonNLP is great ! ! !',
                     'GluonNLP - Amazon - Haibin - Leonard - Sheng - Shuai - Xingjian'
                     ' . . . . . / : ! @ # abc']
        verify_encode_token(tokenizer, SUBWORD_TEST_SAMPLES, gt_tokenized)
        verify_pickleble(tokenizer, HuggingFaceTokenizer)
        verify_encode_token_with_offsets(tokenizer, SUBWORD_TEST_SAMPLES, gt_offsets)
        verify_decode_hf(tokenizer, SUBWORD_TEST_SAMPLES, gt_decode)
def main(args):
    url = _URLS[args.dataset]
    file_hash = _URL_FILE_STATS[url]
    target_download_location = os.path.join(args.cache_path,
                                            os.path.basename(url))
    download(url, target_download_location, sha1_hash=file_hash)
    save_dir = args.dataset if args.save_dir is None else args.save_dir
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
    if args.dataset == 'gutenberg':
        if args.mode == 'raw':
            with zipfile.ZipFile(target_download_location) as f:
                for name in f.namelist():
                    if name.endswith('.txt'):
                        # Extract into save_dir directly; the original passed
                        # os.path.join(save_dir, filename), which nested each
                        # file under a directory named after itself.
                        f.extract(name, save_dir)
        else:
            # TODO(zheyuye), format for pretraining
            raise NotImplementedError
    else:
        raise NotImplementedError
def main(args):
    train_url = _URLS[args.version]['train']
    dev_url = _URLS[args.version]['dev']
    train_file_name = train_url[train_url.rfind('/') + 1:]
    dev_file_name = dev_url[dev_url.rfind('/') + 1:]
    download(train_url, path=os.path.join(args.cache_path, train_file_name))
    download(dev_url, path=os.path.join(args.cache_path, dev_file_name))
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
    if not os.path.exists(os.path.join(args.save_path, train_file_name)) \
            or (args.overwrite and args.save_path != args.cache_path):
        shutil.copyfile(os.path.join(args.cache_path, train_file_name),
                        os.path.join(args.save_path, train_file_name))
    else:
        print(f'Found {os.path.join(args.save_path, train_file_name)}...skip')
    if not os.path.exists(os.path.join(args.save_path, dev_file_name)) \
            or (args.overwrite and args.save_path != args.cache_path):
        shutil.copyfile(os.path.join(args.cache_path, dev_file_name),
                        os.path.join(args.save_path, dev_file_name))
    else:
        print(f'Found {os.path.join(args.save_path, dev_file_name)}...skip')
def test_huggingface_bytebpe_tokenizer():
    with tempfile.TemporaryDirectory() as dir_path:
        model_path = os.path.join(dir_path, 'hf_bytebpe.model')
        download(url=get_repo_url()
                 + 'tokenizer_test_models/hf_bytebpe/test_hf_bytebpe.model',
                 path=model_path)
        vocab_path = os.path.join(dir_path, 'hf_bytebpe.vocab')
        download(url=get_repo_url()
                 + 'tokenizer_test_models/hf_bytebpe/test_hf_bytebpe.vocab',
                 path=vocab_path)
        hf_vocab_path = os.path.join(dir_path, 'hf_bytebpe.hf_vocab')
        download(url=get_repo_url()
                 + 'tokenizer_test_models/hf_bytebpe/test_hf_bytebpe.hf_vocab',
                 path=hf_vocab_path)
        # Case 1, default lowercase=False
        tokenizer = HuggingFaceByteBPETokenizer(model_path, vocab_path)
        gt_tokenized = [
            ['Hello', ',', 'Ġy', "'", 'all', '!', 'ĠHow', 'Ġare', 'Ġyou',
             'Ġâ', 'ħ', '§', 'ĠðŁĺ', 'ģ', 'ĠðŁĺ', 'ģ', 'ĠðŁĺ', 'ģ', 'Ġ?'],
            ['Gl', 'u', 'on', 'N', 'LP', 'Ġis', 'Ġgreat', 'ï¼', 'ģ', 'ï¼',
             'ģ', 'ï¼', 'ģ', '!!!'],
            ['Gl', 'u', 'on', 'N', 'LP', '-', 'Amazon', '-', 'Ha', 'ib', 'in',
             '-', 'Le', 'on', 'ard', '-', 'She', 'ng', '-', 'Sh', 'u', 'ai',
             '-', 'X', 'ing', 'j', 'ian', '.....', '/', ':', '!', '@', '#',
             "Ġ'", 'ab', 'c', "'"]]
        # The definition of the offsets at the byte level is not entirely clear
        gt_offsets = [
            [(0, 5), (5, 6), (6, 8), (8, 9), (9, 12), (12, 13), (13, 17),
             (17, 21), (21, 25), (25, 27), (26, 27), (26, 27), (27, 29),
             (28, 29), (29, 31), (30, 31), (31, 33), (32, 33), (33, 35)],
            [(0, 2), (2, 3), (3, 5), (5, 6), (6, 8), (8, 11), (11, 17),
             (17, 18), (17, 18), (18, 19), (18, 19), (19, 20), (19, 20),
             (20, 23)],
            [(0, 2), (2, 3), (3, 5), (5, 6), (6, 8), (8, 9), (9, 15),
             (15, 16), (16, 18), (18, 20), (20, 22), (22, 23), (23, 25),
             (25, 27), (27, 30), (30, 31), (31, 34), (34, 36), (36, 37),
             (37, 39), (39, 40), (40, 42), (42, 43), (43, 44), (44, 47),
             (47, 48), (48, 51), (51, 56), (56, 57), (57, 58), (58, 59),
             (59, 60), (60, 61), (61, 63), (63, 65), (65, 66), (66, 67)]]
        gt_decode = ["Hello, y'all! How are you Ⅷ 😁 😁 😁 ?",
                     'GluonNLP is great!!!!!!',
                     "GluonNLP-Amazon-Haibin-Leonard-Sheng-Shuai-Xingjian...../:!@# 'abc'"]
        verify_encode_token(tokenizer, SUBWORD_TEST_SAMPLES, gt_tokenized)
        verify_pickleble(tokenizer, HuggingFaceByteBPETokenizer)
        verify_encode_token_with_offsets(tokenizer, SUBWORD_TEST_SAMPLES, gt_offsets)
        verify_decode_hf(tokenizer, SUBWORD_TEST_SAMPLES, gt_decode)
        # Case 2, lowercase=True
        gt_lowercase_int_decode = ["hello, y'all! how are you ⅷ 😁 😁 😁 ?",
                                   'gluonnlp is great!!!!!!',
                                   "gluonnlp-amazon-haibin-leonard-sheng-shuai-xingjian...../:!@# 'abc'"]
        tokenizer = HuggingFaceByteBPETokenizer(model_path, vocab_path, lowercase=True)
        verify_decode_hf(tokenizer, SUBWORD_TEST_SAMPLES, gt_lowercase_int_decode)
        # Case 3, using original hf vocab
        tokenizer = HuggingFaceByteBPETokenizer(model_path, hf_vocab_path)
        verify_encode_token(tokenizer, SUBWORD_TEST_SAMPLES, gt_tokenized)
        verify_pickleble(tokenizer, HuggingFaceByteBPETokenizer)
        verify_encode_token_with_offsets(tokenizer, SUBWORD_TEST_SAMPLES, gt_offsets)
        verify_decode_hf(tokenizer, SUBWORD_TEST_SAMPLES, gt_decode)
        os.remove(model_path)
        os.remove(vocab_path)
        os.remove(hf_vocab_path)
def main(args):
    # Download the data
    url = _URLS[args.dataset]
    file_hash = _URL_FILE_STATS[url]
    target_download_location = os.path.join(args.cache_path,
                                            os.path.basename(url))
    download(url, target_download_location, sha1_hash=file_hash)
    if args.save_dir is None:
        save_dir = args.dataset
    else:
        save_dir = args.save_dir
    if not args.overwrite and os.path.exists(save_dir):
        print('{} found, skip! Turn on --overwrite to force overwrite'.format(
            save_dir))
        # The original fell through and extracted anyway; actually skip here.
        return
    print('Extract the data from {} into {}'.format(target_download_location,
                                                    save_dir))
    # The tar-based and zip-based datasets are extracted the same way, so the
    # per-dataset branches are grouped by archive type.
    if args.dataset in ('lmd_full', 'lmd_matched', 'lmd_aligned', 'clean_midi'):
        with tarfile.open(target_download_location) as f:
            f.extractall(save_dir)
    elif args.dataset in ('maestro_v1', 'maestro_v2', 'geocities'):
        with zipfile.ZipFile(target_download_location, 'r') as fobj:
            fobj.extractall(save_dir)
    else:
        raise NotImplementedError
def test_huggingface_bpe_tokenizer():
    with tempfile.TemporaryDirectory() as dir_path:
        model_path = os.path.join(dir_path, 'test_hf_bpe.model')
        download(url=get_repo_url()
                 + 'tokenizer_test_models/hf_bpe/test_hf_bpe.model',
                 path=model_path)
        vocab_path = os.path.join(dir_path, 'test_hf_bpe.vocab')
        download(url=get_repo_url()
                 + 'tokenizer_test_models/hf_bpe/test_hf_bpe.vocab',
                 path=vocab_path)
        hf_vocab_path = os.path.join(dir_path, 'test_hf_bpe.hf_vocab')
        download(url=get_repo_url()
                 + 'tokenizer_test_models/hf_bpe/test_hf_bpe.hf_vocab',
                 path=hf_vocab_path)
        # Case 1, default lowercase=False
        tokenizer = HuggingFaceBPETokenizer(model_path, vocab_path)
        gt_tokenized = [
            ['Hello</w>', ',</w>', 'y</w>', "'</w>", 'all</w>', '!</w>',
             'How</w>', 'are</w>', 'you</w>', '<unk>', '<unk>', '<unk>',
             '<unk>', '?</w>'],
            ['Gl', 'u', 'on', 'N', 'LP</w>', 'is</w>', 'great</w>', '!</w>',
             '!</w>', '!</w>', '!</w>', '!</w>', '!</w>'],
            ['Gl', 'u', 'on', 'N', 'LP</w>', '-</w>', 'Amazon</w>', '-</w>',
             'H', 'ai', 'bin</w>', '-</w>', 'Leonard</w>', '-</w>', 'Sh',
             'en', 'g</w>', '-</w>', 'Sh', 'u', 'ai</w>', '-</w>', 'X', 'ing',
             'j', 'ian</w>', '.</w>', '.</w>', '.</w>', '.</w>', '.</w>',
             '/</w>', ':</w>', '!</w>', '@</w>', '#</w>', "'</w>", 'ab',
             'c</w>', "'</w>"]]
        gt_offsets = [
            [(0, 5), (5, 6), (7, 8), (8, 9), (9, 12), (12, 13), (14, 17),
             (18, 21), (22, 25), (26, 27), (28, 29), (30, 31), (32, 33),
             (34, 35)],
            [(0, 2), (2, 3), (3, 5), (5, 6), (6, 8), (9, 11), (12, 17),
             (17, 18), (18, 19), (19, 20), (20, 21), (21, 22), (22, 23)],
            [(0, 2), (2, 3), (3, 5), (5, 6), (6, 8), (8, 9), (9, 15),
             (15, 16), (16, 17), (17, 19), (19, 22), (22, 23), (23, 30),
             (30, 31), (31, 33), (33, 35), (35, 36), (36, 37), (37, 39),
             (39, 40), (40, 42), (42, 43), (43, 44), (44, 47), (47, 48),
             (48, 51), (51, 52), (52, 53), (53, 54), (54, 55), (55, 56),
             (56, 57), (57, 58), (58, 59), (59, 60), (60, 61), (62, 63),
             (63, 65), (65, 66), (66, 67)]]
        # For HF, gt_int_decode == gt_str_decode; HF removes the unk tokens
        # from the decode result.
        gt_decode = ["Hello , y ' all ! How are you ?",
                     'GluonNLP is great ! ! ! ! ! !',
                     "GluonNLP - Amazon - Haibin - Leonard - Sheng - Shuai - Xingjian . . . . . / : ! @ # ' abc '"]
        verify_encode_token(tokenizer, SUBWORD_TEST_SAMPLES, gt_tokenized)
        verify_pickleble(tokenizer, HuggingFaceBPETokenizer)
        verify_encode_token_with_offsets(tokenizer, SUBWORD_TEST_SAMPLES, gt_offsets)
        verify_decode_hf(tokenizer, SUBWORD_TEST_SAMPLES, gt_decode)
        # Case 2, lowercase=True
        gt_lowercase_decode = ["hello , y ' all ! how are you ?",
                               'gluonnlp is great ! ! ! ! ! !',
                               "gluonnlp - amazon - haibin - leonard - sheng - shuai - xingjian . . . . . / : ! @ # ' abc '"]
        tokenizer = HuggingFaceBPETokenizer(model_path, vocab_path, lowercase=True)
        verify_decode_hf(tokenizer, SUBWORD_TEST_SAMPLES, gt_lowercase_decode)
        # Case 3, using original hf vocab
        tokenizer = HuggingFaceBPETokenizer(model_path, hf_vocab_path)
        verify_encode_token(tokenizer, SUBWORD_TEST_SAMPLES, gt_tokenized)
        verify_pickleble(tokenizer, HuggingFaceBPETokenizer)
        verify_encode_token_with_offsets(tokenizer, SUBWORD_TEST_SAMPLES, gt_offsets)
        verify_decode_hf(tokenizer, SUBWORD_TEST_SAMPLES, gt_decode)
        os.remove(model_path)
        os.remove(vocab_path)
        os.remove(hf_vocab_path)
def main(args):
    def extract(tar_path, target_path):
        try:
            tar = tarfile.open(tar_path, 'r:gz')
            file_names = tar.getnames()
            for file_name in file_names:
                tar.extract(file_name, target_path)
            tar.close()
        except Exception as e:
            print(e)

    tar_url = _URLS[args.type]
    file_name = tar_url[tar_url.rfind('/') + 1:]
    file_hash = _URL_FILE_STATS[tar_url]
    download(tar_url, path=os.path.join(args.cache_path, file_name),
             sha1_hash=file_hash)
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
    if not os.path.exists(os.path.join(args.save_path, file_name)) \
            or (args.overwrite and args.save_path != args.cache_path):
        os.symlink(os.path.join(args.cache_path, file_name),
                   os.path.join(args.save_path, file_name))
    extract(os.path.join(args.save_path, file_name), args.save_path)
def format_mrpc(data_dir):
    mrpc_dir = os.path.join(data_dir, "mrpc")
    os.makedirs(mrpc_dir, exist_ok=True)
    mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
    mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
    download(GLUE_TASK2PATH["mrpc"]['train'], mrpc_train_file,
             sha1_hash=_URL_FILE_STATS[GLUE_TASK2PATH["mrpc"]['train']])
    download(GLUE_TASK2PATH["mrpc"]['test'], mrpc_test_file,
             sha1_hash=_URL_FILE_STATS[GLUE_TASK2PATH["mrpc"]['test']])
    assert os.path.isfile(mrpc_train_file), \
        "Train data not found at %s" % mrpc_train_file
    assert os.path.isfile(mrpc_test_file), \
        "Test data not found at %s" % mrpc_test_file
    download(GLUE_TASK2PATH["mrpc"]['dev'],
             os.path.join(mrpc_dir, "dev_ids.tsv"),
             sha1_hash=_URL_FILE_STATS[GLUE_TASK2PATH["mrpc"]['dev']])
    dev_ids = []
    with open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding="utf8") as ids_fh:
        for row in ids_fh:
            dev_ids.append(row.strip().split("\t"))
    with open(mrpc_train_file, encoding="utf8") as data_fh, \
            open(os.path.join(mrpc_dir, "train.tsv"), "w", encoding="utf8") as train_fh, \
            open(os.path.join(mrpc_dir, "dev.tsv"), "w", encoding="utf8") as dev_fh:
        header = data_fh.readline()
        train_fh.write(header)
        dev_fh.write(header)
        for row in data_fh:
            label, id1, id2, s1, s2 = row.strip().split("\t")
            if [id1, id2] in dev_ids:
                dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
            else:
                train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
    with open(mrpc_test_file, encoding="utf8") as data_fh, \
            open(os.path.join(mrpc_dir, "test.tsv"), "w", encoding="utf8") as test_fh:
        header = data_fh.readline()
        test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
        for idx, row in enumerate(data_fh):
            label, id1, id2, s1, s2 = row.strip().split("\t")
            test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
def main(args):
    os.makedirs(args.cache_path, exist_ok=True)
    os.makedirs(args.data_dir, exist_ok=True)
    if args.tasks == 'all':
        tasks = list(TASK2PATH.keys())
    else:
        tasks = args.tasks.split(',')
    # Extracted sub-directory and text-column name for the CSV-based tasks;
    # the per-task branches below are otherwise identical, so they are grouped.
    csv_subdir = {
        'ag': ('ag_news_csv', 'content'),
        'dbpedia': ('dbpedia_csv', 'content'),
        'amazon2': ('amazon_review_polarity_csv', 'review'),
        'amazon5': ('amazon_review_full_csv', 'review'),
        'yelp2': ('yelp_review_polarity_csv', 'review'),
        'yelp5': ('yelp_review_full_csv', 'review'),
    }
    for task in tasks:
        task_dir_path = os.path.join(args.data_dir, task)
        os.makedirs(task_dir_path, exist_ok=True)
        file_url = TASK2PATH[task]
        sha1_hash = _URL_FILE_STATS[file_url]
        download_path = download(file_url, args.cache_path, sha1_hash=sha1_hash)
        with tarfile.open(download_path) as f:
            f.extractall(task_dir_path)
        if task == 'imdb':
            # Already stored as parquet; just move the files up one level
            shutil.move(os.path.join(task_dir_path, 'imdb', 'train.parquet'),
                        os.path.join(task_dir_path, 'train.parquet'))
            shutil.move(os.path.join(task_dir_path, 'imdb', 'test.parquet'),
                        os.path.join(task_dir_path, 'test.parquet'))
            train_data = pd.read_parquet(
                os.path.join(task_dir_path, 'train.parquet'))
            test_data = pd.read_parquet(
                os.path.join(task_dir_path, 'test.parquet'))
        elif task in ('ag', 'dbpedia', 'amazon2', 'amazon5'):
            subdir, text_col = csv_subdir[task]
            train_data = pd.read_csv(
                os.path.join(task_dir_path, subdir, 'train.csv'), header=None)
            test_data = pd.read_csv(
                os.path.join(task_dir_path, subdir, 'test.csv'), header=None)
            # Column 0 is the label; columns 1 and 2 (title/body) are joined
            train_data = pd.DataFrame({
                'label': train_data[0],
                text_col: train_data[1] + ' ' + train_data[2]
            })
            test_data = pd.DataFrame({
                'label': test_data[0],
                text_col: test_data[1] + ' ' + test_data[2]
            })
            train_data.to_parquet(os.path.join(task_dir_path, 'train.parquet'))
            test_data.to_parquet(os.path.join(task_dir_path, 'test.parquet'))
        elif task in ('yelp2', 'yelp5'):
            subdir, _ = csv_subdir[task]
            train_data = pd.read_csv(
                os.path.join(task_dir_path, subdir, 'train.csv'), header=None)
            test_data = pd.read_csv(
                os.path.join(task_dir_path, subdir, 'test.csv'), header=None)
            train_data.columns = ['label', 'review']
            test_data.columns = ['label', 'review']
            train_data.to_parquet(os.path.join(task_dir_path, 'train.parquet'))
            test_data.to_parquet(os.path.join(task_dir_path, 'test.parquet'))
        else:
            raise NotImplementedError
        print('Task={}, #Train={}, #Test={}'.format(task, len(train_data),
                                                    len(test_data)))
def _download_with_mirror(url, path, sha1_hash):
    # Prefer a repo-hosted mirror when one is registered for this URL
    return download(
        get_repo_url() + _WMT_MIRROR_URL_MAP[url] if url in _WMT_MIRROR_URL_MAP else url,
        path=path,
        sha1_hash=sha1_hash
    )
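# Illustrative call (the URL and hash below are placeholders, not entries from
# the real _WMT_MIRROR_URL_MAP): callers keep passing the canonical upstream
# URL, and the repo-hosted copy is fetched transparently when a mirror exists.
_download_with_mirror('http://data.statmt.org/example/training.tgz',
                      path='wmt_data/training.tgz',
                      sha1_hash='0123456789abcdef0123456789abcdef01234567')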
def main(args):
    if args.data_dir is None:
        args.data_dir = args.benchmark
    args.cache_path = os.path.join(args.cache_path, args.benchmark)
    print('Downloading {} to {}. Selected tasks = {}'.format(
        args.benchmark, args.data_dir, args.tasks))
    os.makedirs(args.cache_path, exist_ok=True)
    os.makedirs(args.data_dir, exist_ok=True)
    tasks = get_tasks(args.benchmark, args.tasks)
    if args.benchmark == 'glue':
        TASK2PATH = GLUE_TASK2PATH
        TASK2READER = GLUE_READERS
    elif args.benchmark == 'superglue':
        TASK2PATH = SUPERGLUE_TASK2PATH
        TASK2READER = SUPERGLUE_READER
    else:
        raise NotImplementedError
    for task in tasks:
        print('Processing {}...'.format(task))
        # ('diagnostic' in task) also covers task == 'diagnostic'
        if 'diagnostic' in task:
            if args.benchmark == 'glue':
                reader = TASK2READER[task]
                base_dir = os.path.join(args.data_dir, 'rte_diagnostic')
                os.makedirs(base_dir, exist_ok=True)
                download(TASK2PATH['diagnostic'][0],
                         path=os.path.join(base_dir, 'diagnostic.tsv'),
                         sha1_hash=_URL_FILE_STATS[TASK2PATH['diagnostic'][0]])
                download(TASK2PATH['diagnostic'][1],
                         path=os.path.join(base_dir, 'diagnostic-full.tsv'),
                         sha1_hash=_URL_FILE_STATS[TASK2PATH['diagnostic'][1]])
                df = reader(base_dir)
                df.to_parquet(os.path.join(base_dir, 'diagnostic-full.parquet'))
            else:
                for key, name in [('broadcoverage-diagnostic', 'AX-b'),
                                  ('winogender-diagnostic', 'AX-g')]:
                    data_file = os.path.join(args.cache_path, '{}.zip'.format(key))
                    url = TASK2PATH[key]
                    reader = TASK2READER[key]
                    download(url, data_file, sha1_hash=_URL_FILE_STATS[url])
                    with zipfile.ZipFile(data_file) as zipdata:
                        zipdata.extractall(args.data_dir)
                    df = reader(os.path.join(args.data_dir, name))
                    df.to_parquet(os.path.join(args.data_dir, name,
                                               '{}.parquet'.format(name)))
        elif task == 'mrpc':
            reader = TASK2READER[task]
            format_mrpc(args.data_dir)
            df_dict, meta_data = reader(os.path.join(args.data_dir, 'mrpc'))
            for key, df in df_dict.items():
                if key == 'val':
                    key = 'dev'
                df.to_parquet(os.path.join(args.data_dir, 'mrpc',
                                           '{}.parquet'.format(key)))
            with open(os.path.join(args.data_dir, 'mrpc', 'metadata.json'),
                      'w') as f:
                json.dump(meta_data, f)
        else:
            # Download data
            data_file = os.path.join(args.cache_path, '{}.zip'.format(task))
            url = TASK2PATH[task]
            reader = TASK2READER[task]
            download(url, data_file, sha1_hash=_URL_FILE_STATS[url])
            base_dir = os.path.join(args.data_dir, task)
            if os.path.exists(base_dir):
                print('Found!')
                continue
            with zipfile.ZipFile(data_file) as zipdata:
                zip_dir_name = os.path.dirname(zipdata.infolist()[0].filename)
                zipdata.extractall(args.data_dir)
            shutil.move(os.path.join(args.data_dir, zip_dir_name), base_dir)
            df_dict, meta_data = reader(base_dir)
            for key, df in df_dict.items():
                if key == 'val':
                    key = 'dev'
                df.to_parquet(os.path.join(base_dir, '{}.parquet'.format(key)))
            if meta_data is not None:
                with open(os.path.join(base_dir, 'metadata.json'), 'w') as f:
                    json.dump(meta_data, f)
        print('\tCompleted!')