Example #1
def main(args):
    num_process = min(multiprocessing.cpu_count(), args.num_process)
    if args.mode == 'download':
        download_wikicorpus(args.lang, args.date, args.output)
    elif args.mode == 'format':
        format_wikicorpus(args.input, args.output, args.bytes, num_process,
                          args.num_out_files)
    elif args.mode == 'download+format':
        downloaded_file = download_wikicorpus(args.lang, args.date,
                                              args.output)
        format_wikicorpus(downloaded_file, args.output, args.bytes,
                          num_process, args.num_out_files)
    elif args.mode == 'download_prepared':
        url = _URLS['wikipedia-en-20200620']
        file_hash = _URL_FILE_STATS[url]
        target_download_location = os.path.join(args.output,
                                                os.path.basename(url))
        download(url, target_download_location, sha1_hash=file_hash)
        tar = tarfile.open(target_download_location)
        names = tar.getnames()
        print('Start unarchiving raw text files')
        start_time = time.time()
        for name in names:
            tar.extract(name, path=args.output)
        tar.close()
        print("Done unarchiving within {:.2f} seconds".format(time.time() -
                                                              start_time))
    else:
        raise NotImplementedError
Example #2
def convert_config(args, converted):
    print('converting cfg...')
    # download config
    gluon_cfg = Gluon_T5.get_cfg(T5_PRETRAINED_MODEL_MAP[args.model_name])
    with tempfile.TemporaryDirectory() as temp_dir:
        hf_cfg_path = os.path.join(temp_dir, 'config.json')
        download(url=T5_PRETRAINED_CONFIG_MAP[args.model_name],
                 path=hf_cfg_path)
        with open(hf_cfg_path, 'r') as f:
            hf_cfg = json.load(f)
        os.remove(hf_cfg_path)
    # update attributes
    cfg = gluon_cfg.clone()
    cfg.defrost()
    cfg.MODEL.vocab_size = hf_cfg['vocab_size']
    cfg.MODEL.d_model = hf_cfg['d_model']
    cfg.MODEL.d_kv = hf_cfg['d_kv']
    cfg.MODEL.d_ff = hf_cfg['d_ff']
    cfg.MODEL.num_layers = hf_cfg['num_layers']
    cfg.MODEL.num_heads = hf_cfg['num_heads']
    cfg.MODEL.layer_norm_eps = hf_cfg['layer_norm_epsilon']
    cfg.MODEL.dropout_prob = hf_cfg['dropout_rate']
    cfg.INITIALIZER.init_factor = hf_cfg['initializer_factor']
    cfg.freeze()
    # save config
    config_path = os.path.join(args.dest_dir, 'model.yml')
    with open(config_path, 'w') as f:
        f.write(cfg.dump())
    converted['config'] = config_path
    return cfg
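For context, here is a minimal sketch of a driver that wires up convert_config as used above. The argument names (model_name, dest_dir) mirror the attributes the function reads, but the parser itself is an assumption about the surrounding script, not the project's actual code, and it presumes convert_config and its imports from the example above are in scope.

import argparse

def parse_args():
    # Hypothetical argument parser; only the attributes read by convert_config are modeled.
    parser = argparse.ArgumentParser(description='Convert a Hugging Face T5 config to a Gluon config')
    parser.add_argument('--model_name', required=True)
    parser.add_argument('--dest_dir', default='converted_t5')
    return parser.parse_args()

if __name__ == '__main__':
    args = parse_args()
    converted = {}  # collects paths of the converted artifacts, keyed by artifact name
    cfg = convert_config(args, converted)
    print('Wrote config to', converted['config'])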
Example #3
def convert_vocab(args, converted):
    print('converting vocab...')
    # At this step we don't add <extra_id> tokens to the vocab; we just save the original binary file directly.
    # Those sentinel tokens are added only when instantiating a T5Tokenizer
    # (a sketch of how they might be appended appears after this example).
    vocab_path = os.path.join(args.dest_dir, 't5.vocab')
    download(url=PRETRAINED_VOCAB_MAP[args.model_name], path=vocab_path)
    converted['vocab'] = vocab_path
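As referenced in the comment above, the following is a minimal sketch of how T5-style <extra_id_*> sentinel tokens might be appended when the tokenizer is built. The helper name and the default count of 100 sentinels are assumptions based on the standard T5 convention, not this project's exact implementation.

def build_vocab_with_sentinels(base_tokens, num_extra_ids=100):
    # Hypothetical helper: extend a base vocabulary with T5 sentinel tokens.
    # T5 conventionally reserves 100 sentinels, <extra_id_0> ... <extra_id_99>.
    vocab = list(base_tokens)
    vocab.extend('<extra_id_{}>'.format(i) for i in range(num_extra_ids))
    return vocab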
Example #4
def main(args):
    url = _URLS['books1']
    file_hash = _URL_FILE_STATS[url]
    target_download_location = os.path.join(args.cache_path,
                                            os.path.basename(url))
    download(url, target_download_location, sha1_hash=file_hash)
    tar = tarfile.open(target_download_location)
    names = tar.getnames()
    print('Start unarchiving raw text files')
    start_time = time.time()
    for name in names:
        tar.extract(name, path=args.output)
    tar.close()
    print("Done unarchiving within {:.2f} seconds".format(time.time() - start_time))
    print("start transfer to one article per line")
    input_name = os.path.join(args.output, 'books1/epubtxt/')
    output_name = os.path.join(args.output,'bookcorpus.txt' )
    format = BookscorpusTextFormatting(input_name, output_name)
    format.merge()
    print("end format")
    if args.segment_sentences:
        print("start to transfer bookcorpus to one sentence per line")
        t1 = time.time()

        input_name = os.path.join(args.output, 'bookcorpus.txt')
        output_name = os.path.join(args.output, 'one_sentence_per_line/')
        if not os.path.exists(output_name):
            os.mkdir(output_name)
        sharding = Sharding([input_name], output_name, 128, 1, 0, args.segment_num_worker)

        sharding.load_articles()
        sharding.segment_articles_into_sentences()
        t2 = time.time()
        print("transfer cost:{}".format(t2-t1))
Example #5
def main(args):
    def extract(gz_path):
        logging.warning(
            f'Extracting {gz_path}, this can take a long time because the file is large'
        )
        try:
            f_name = gz_path.replace(".gz", "")
            with gzip.GzipFile(gz_path) as g_file, open(f_name, "wb") as out_file:
                out_file.write(g_file.read())
            os.remove(gz_path)
        except Exception as e:
            print(e)

    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    for url in _URLS.values():
        file_name = url[url.rfind('/') + 1:]
        file_hash = _URL_FILE_STATS[url]
        download(url,
                 path=os.path.join(args.cache_path, file_name),
                 sha1_hash=file_hash)
        if not os.path.exists(os.path.join(args.save_path, file_name))\
                or (args.overwrite and args.save_path != args.cache_path):
            os.symlink(os.path.join(args.cache_path, file_name),
                       os.path.join(args.save_path, file_name))
        if args.extract:
            extract(os.path.join(args.save_path, file_name))
Example #6
def test_sentencepiece_tokenizer():
    with tempfile.TemporaryDirectory() as dir_path:
        model_path = os.path.join(dir_path, 'spm.model')
        download(url=get_repo_url()
                     + 'tokenizer_test_models/sentencepiece/case1/test_ende-a9bee4.model',
                 path=model_path)
        # Case1
        tokenizer = SentencepieceTokenizer(model_path)
        gt_tokenized = [['▁Hel', 'lo', ',', '▁y', "'", 'all', '!', '▁How', '▁are', '▁you',
                         '▁', 'VI', 'II', '▁', '😁', '▁', '😁', '▁', '😁', '▁?'],
                        ['▁G', 'lu', 'on', 'N', 'L', 'P', '▁is', '▁great', '!', '!', '!', '!',
                         '!', '!'],
                        ['▁G', 'lu', 'on', 'N', 'L', 'P', '-', 'A', 'ma', 'zo', 'n', '-', 'H', 'ai',
                         'bin', '-', 'L', 'e', 'on', 'ard', '-', 'S', 'hen', 'g', '-', 'S', 'hu', 'ai',
                         '-', 'X', 'ing', 'j', 'ian', '.', '.', '.', '.', '.', '/', ':', '!', '@',
                         '#', '▁', "'", 'ab', 'c', "'"]]
        gt_offsets = [[(0, 3), (3, 5), (5, 6), (6, 8), (8, 9), (9, 12), (12, 13), (13, 17), (17, 21),
                       (21, 25), (25, 26), (26, 26), (26, 27), (27, 28), (28, 29), (29, 30), (30, 31),
                       (31, 32), (32, 33), (33, 35)],
                      [(0, 1), (1, 3), (3, 5), (5, 6), (6, 7), (7, 8), (8, 11), (11, 17), (17, 18),
                       (18, 19), (19, 20), (20, 21), (21, 22), (22, 23)],
                      [(0, 1), (1, 3), (3, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 10), (10, 12),
                       (12, 14), (14, 15), (15, 16), (16, 17), (17, 19), (19, 22), (22, 23), (23, 24),
                       (24, 25), (25, 27), (27, 30), (30, 31), (31, 32), (32, 35), (35, 36), (36, 37),
                       (37, 38), (38, 40), (40, 42), (42, 43), (43, 44), (44, 47), (47, 48), (48, 51),
                       (51, 52), (52, 53), (53, 54), (54, 55), (55, 56), (56, 57), (57, 58), (58, 59),
                       (59, 60), (60, 61), (61, 62), (62, 63), (63, 65), (65, 66), (66, 67)]]
        gt_int_decode = ['Hello, y ⁇ all! How are you VIII  ⁇   ⁇   ⁇  ?',
                         'GluonNLP is great!!!!!!',
                         'GluonNLP-Amazon-Haibin-Leonard-Sheng-Shuai-Xingjian...../:! ⁇ #  ⁇ abc ⁇ ']
        verify_encode_token(tokenizer, SUBWORD_TEST_SAMPLES, gt_tokenized)
        verify_pickleble(tokenizer, SentencepieceTokenizer)
        verify_encode_token_with_offsets(tokenizer, SUBWORD_TEST_SAMPLES, gt_offsets)
        verify_decode_spm(tokenizer, SUBWORD_TEST_SAMPLES, gt_int_decode)

        # Case2, lower_case
        gt_lower_case_int_decode = ['hello, y ⁇ all! how are you viii  ⁇   ⁇   ⁇  ?',
                                    'gluonnlp is great!!!!!!',
                                    'gluonnlp-amazon-haibin-leonard-sheng-shuai-xingjian...../:! ⁇ #  ⁇ abc ⁇ ']
        tokenizer = SentencepieceTokenizer(model_path, lowercase=True)
        verify_decode_spm(tokenizer, SUBWORD_TEST_SAMPLES, gt_lower_case_int_decode)

        # Case3, use the sentencepiece regularization options and test whether we can obtain different encoding results
        tokenizer = SentencepieceTokenizer(model_path, lowercase=True, nbest=-1, alpha=1.0)
        has_different_encode_out = False
        encode_out = None
        for _ in range(10):
            if encode_out is None:
                encode_out = tokenizer.encode(SUBWORD_TEST_SAMPLES[0])
            else:
                ele_out = tokenizer.encode(SUBWORD_TEST_SAMPLES[0])
                if ele_out != encode_out:
                    has_different_encode_out = True
                    break
        assert has_different_encode_out
        os.remove(model_path)
Example #7
def main(args):
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
    for url in _URLS.values():
        file_name = url[url.rfind('/') + 1:]
        file_hash = _URL_FILE_STATS[url]
        download(url, path=os.path.join(args.cache_path, file_name), sha1_hash=file_hash)
        if not os.path.exists(os.path.join(args.save_path, file_name))\
                or (args.overwrite and args.save_path != args.cache_path):
            os.symlink(os.path.join(args.cache_path, file_name),
                       os.path.join(args.save_path, file_name))
Example #8
def test_huggingface_wordpiece_tokenizer():
    with tempfile.TemporaryDirectory() as dir_path:
        vocab_path = os.path.join(dir_path, 'hf_wordpiece.vocab')
        download(url=get_repo_url()
                     + 'tokenizer_test_models/hf_wordpiece/test_hf_wordpiece.vocab',
                 path=vocab_path)
        hf_vocab_path = os.path.join(dir_path, 'hf_wordpiece.hf_vocab')
        download(url=get_repo_url()
                     + 'tokenizer_test_models/hf_wordpiece/test_hf_wordpiece.hf_vocab',
                 path=hf_vocab_path)

        # Case 1, lowercase=True
        tokenizer = HuggingFaceWordPieceTokenizer(vocab_path, lowercase=True)
        gt_tokenized = [["hello", ",", "y", "'", "all", "!", "how", "are", "you",
                         "<unk>", "<unk>", "<unk>", "<unk>", "?"],
                        ["gl", "##uo", "##nn", "##l", "##p", "is", "great", "\uff01",
                         "\uff01", "\uff01", "!", "!", "!"],
                        ["gl", "##uo", "##nn", "##l", "##p", "-", "amazon", "-", "hai",
                         "##bin", "-", "leonard", "-", "shen", "##g", "-", "shu", "##ai", "-",
                         "xin", "##g", "##ji", "##an", ".", ".", ".", ".", ".", "/", ":", "!",
                         "@", "#", "'", "abc", "'"]]
        gt_offsets = [[(0, 5), (5, 6), (7, 8), (8, 9), (9, 12), (12, 13), (14, 17), (18, 21),
                       (22, 25), (26, 27), (28, 29), (30, 31), (32, 33), (34, 35)],
                      [(0, 2), (2, 4), (4, 6), (6, 7), (7, 8), (9, 11), (12, 17), (17, 18),
                       (18, 19), (19, 20), (20, 21), (21, 22), (22, 23)],
                      [(0, 2), (2, 4), (4, 6), (6, 7), (7, 8), (8, 9), (9, 15), (15, 16), (16, 19),
                       (19, 22), (22, 23), (23, 30), (30, 31), (31, 35), (35, 36), (36, 37), (37, 40),
                       (40, 42), (42, 43), (43, 46), (46, 47), (47, 49), (49, 51), (51, 52), (52, 53),
                       (53, 54), (54, 55), (55, 56), (56, 57), (57, 58), (58, 59), (59, 60), (60, 61),
                       (62, 63), (63, 66), (66, 67)]]
        gt_decode = ["hello, y'all! how are you?",
                     "gluonnlp is great ! ! !!!!",
                     "gluonnlp - amazon - haibin - leonard - sheng - shuai - xingjian..... / :! @ #'abc '"]
        verify_encode_token(tokenizer, SUBWORD_TEST_SAMPLES, gt_tokenized)
        verify_pickleble(tokenizer, HuggingFaceWordPieceTokenizer)
        verify_encode_token_with_offsets(tokenizer, SUBWORD_TEST_SAMPLES, gt_offsets)
        verify_decode_hf(tokenizer, SUBWORD_TEST_SAMPLES, gt_decode)

        # Case 2, lowercase=False
        gt_lowercase_decode = [", y'all! are you?",
                               "is great ! ! !!!!",
                               "- - - - - -..... / :! @ #'abc '"]
        tokenizer = HuggingFaceWordPieceTokenizer(vocab_path, lowercase=False)
        verify_decode_hf(tokenizer, SUBWORD_TEST_SAMPLES, gt_lowercase_decode)

        # Case 3, using original hf vocab
        tokenizer = HuggingFaceWordPieceTokenizer(hf_vocab_path, lowercase=True)
        verify_encode_token(tokenizer, SUBWORD_TEST_SAMPLES, gt_tokenized)
        verify_pickleble(tokenizer, HuggingFaceWordPieceTokenizer)
        verify_encode_token_with_offsets(tokenizer, SUBWORD_TEST_SAMPLES, gt_offsets)
        verify_decode_hf(tokenizer, SUBWORD_TEST_SAMPLES, gt_decode)

        os.remove(vocab_path)
        os.remove(hf_vocab_path)
Example #9
def download_wikicorpus(lang, date, output):
    """
    lang: the language code such as en, zh
    date: string, the date of the Wikipedia with format of YYYYMMDD, or 'latest'.
    """
    if not os.path.exists(output):
        os.makedirs(output)
    if lang not in __LANGUAGES_BANK:
        raise ValueError('Unsupported language code')
    language = lang.replace('-', '_')
    output_file = os.path.join(output, 'download', language, date,
                               'wikicorpus.xml.bz2')
    download(get_url(language, date), output_file)
    return output_file
Example #10
def main(args):
    url = _URLS['gutenberg']
    file_hash = _URL_FILE_STATS[url]
    target_download_location = os.path.join(args.cache_path,
                                            os.path.basename(url))
    download(url, target_download_location, sha1_hash=file_hash)
    save_dir = args.dataset if args.save_dir is None else args.save_dir
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
    with zipfile.ZipFile(target_download_location) as f:
        for name in f.namelist():
            if name.endswith('.txt'):
                filename = os.path.basename(name)
                f.extract(name, os.path.join(save_dir, filename))
Example #11
def verify_download(url, sha1_hash, overwrite):
    with tempfile.TemporaryDirectory() as root:
        download_path = os.path.join(root, 'dat0')
        # First, verify that we are able to download the data correctly
        download(url, sha1_hash=sha1_hash, path=download_path, overwrite=overwrite)
        assert sha1sum(download_path) == sha1_hash
        os.remove(download_path)

        # Secondly, verify that we are able to download with multiprocessing
        download_path = os.path.join(root, 'dat1')
        with multiprocessing.Pool(2) as pool:
            pool.map(functools.partial(download, sha1_hash=sha1_hash,
                                       path=download_path, overwrite=overwrite),
                     [url for _ in range(2)])
        assert sha1sum(download_path) == sha1_hash
        os.remove(download_path)
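The test above compares the downloaded file against the expected hash via a sha1sum helper. Below is a minimal sketch of such a helper, assuming it simply hashes the file contents in chunks; it is not necessarily this project's implementation.

import hashlib

def sha1sum(path, chunk_size=1024 * 1024):
    # Compute the SHA-1 hex digest of a file, reading it in chunks to bound memory use.
    sha1 = hashlib.sha1()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            sha1.update(chunk)
    return sha1.hexdigest()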
Example #12
def main(args):
    num_process = min(multiprocessing.cpu_count(), args.num_process)
    if args.mode == 'download':
        download_wikicorpus(args.lang, args.date, args.output)
    elif args.mode == 'format':
        format_wikicorpus(args.input, args.output, args.bytes, num_process,
                          args.num_out_files, args.quiet)
    elif args.mode == 'download+format':
        downloaded_file = download_wikicorpus(args.lang, args.date,
                                              args.output)
        format_wikicorpus(downloaded_file, args.output, args.bytes,
                          num_process, args.num_out_files, args.quiet)
    elif args.mode == 'download_prepared':
        url = _URLS['wikipedia-en-20200620']
        file_hash = _URL_FILE_STATS[url]
        target_download_location = os.path.join(args.output,
                                                os.path.basename(url))
        download(url, target_download_location, sha1_hash=file_hash)
        tar = tarfile.open(target_download_location)
        names = tar.getnames()
        print('Start unarchiving raw text files')
        start_time = time.time()
        for name in names:
            tar.extract(name, path=args.output)
        tar.close()
        print("Done unarchiving within {:.2f} seconds".format(time.time() -
                                                              start_time))
    else:
        raise NotImplementedError
    if args.segment_sentences:
        print("start to transfer bookcorpus to one sentence per line")
        t1 = time.time()
        segmenter = NLTKSegmenter()
        original_name = os.path.join(args.output, 'prepared_wikipedia')
        output_name = os.path.join(args.output, 'one_sentence_per_line/')
        if not os.path.exists(output_name):
            os.mkdir(output_name)
        input_names = os.listdir(original_name)
        for i in range(len(input_names)):
            input_names[i] = os.path.join(original_name, input_names[i])
        sharding = Sharding(input_names, output_name, 256, 1, 0,
                            args.segment_num_worker)

        sharding.load_articles()
        sharding.segment_articles_into_sentences()
        t2 = time.time()
        print("transfer cost:{}".format(t2 - t1))
Example #13
def main(args):
    train_url = _URLS[args.version]['train']
    dev_url = _URLS[args.version]['dev']
    train_file_name = train_url[train_url.rfind('/') + 1:]
    dev_file_name = dev_url[dev_url.rfind('/') + 1:]
    download(train_url, path=os.path.join(args.cache_path, train_file_name))
    download(dev_url, path=os.path.join(args.cache_path, dev_file_name))
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
    if not os.path.exists(os.path.join(args.save_path, train_file_name))\
            or (args.overwrite and args.save_path != args.cache_path):
        os.symlink(os.path.join(args.cache_path, train_file_name),
                   os.path.join(args.save_path, train_file_name))
    if not os.path.exists(os.path.join(args.save_path, dev_file_name))\
            or (args.overwrite and args.save_path != args.cache_path):
        os.symlink(os.path.join(args.cache_path, dev_file_name),
                   os.path.join(args.save_path, dev_file_name))
Example #14
def main(args):
    url = _URLS['gutenberg']
    file_hash = _URL_FILE_STATS[url]
    target_download_location = os.path.join(args.cache_path,
                                            os.path.basename(url))
    download(url, target_download_location, sha1_hash=file_hash)
    save_dir = args.dataset if args.save_dir is None else args.save_dir
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
    print(f'Save to {save_dir}')
    with zipfile.ZipFile(target_download_location) as f:
        for name in f.namelist():
            if name.endswith('.txt'):
                filename = os.path.basename(name)
                with f.open(name) as in_file:
                    with open(os.path.join(save_dir, filename.replace(' ', '_')), 'wb') as out_file:
                        shutil.copyfileobj(in_file, out_file)
Example #15
def try_import_wikiextractor():
    try:
        sys.path.append(_CURR_DIR)
        import WikiExtractor
    except ImportError:
        try:
            download(
                'https://raw.githubusercontent.com/attardi/wikiextractor/master/WikiExtractor.py',
                path=os.path.join(_CURR_DIR, 'WikiExtractor.py'),
                sha1_hash='3c4896a837b75c476d23c037e8d6c7fdfd9a29eb')
            sys.path.append(_CURR_DIR)
            import WikiExtractor
        except BaseException:
            raise ImportError(
                'Cannot import WikiExtractor! You can download "WikiExtractor.py"'
                ' from https://github.com/attardi/wikiextractor and place it in {}'.format(
                    _CURR_DIR))
    return WikiExtractor
Example #16
def test_huggingface_bytebpe_tokenizer_v08():
    """Test for huggingface bytebpe tokenizer >=0.8"""
    with tempfile.TemporaryDirectory() as dir_path:
        model_path = os.path.join(dir_path, 'hf_bytebpe_new_0.8.model')
        download(url=get_repo_url() +
                     'tokenizer_test_models/hf_bytebpe_new_0.8/hf_bytebpe.model',
                 path=model_path,
                 sha1_hash='a1c4da1f6c21df923e150f56dbb5b7a53c61808b')
        vocab_path = os.path.join(dir_path, 'hf_bytebpe_new_0.8.vocab')
        download(url=get_repo_url() +
                     'tokenizer_test_models/hf_bytebpe_new_0.8/hf_bytebpe.vocab',
                 path=vocab_path,
                 sha1_hash='7831b19078a3222f450e65b2188dc0770473123b')
        tokenizer = HuggingFaceTokenizer(model_path, vocab_path)
        gt_tokenized = [['He', 'llo', ',', 'Ġy', "'", 'all', '!', 'ĠHow', 'Ġare', 'Ġyou',
                         'Ġâ', 'ħ', '§', 'Ġ', 'ð', 'Ł', 'ĺ', 'ģ', 'Ġ', 'ð', 'Ł', 'ĺ',
                         'ģ', 'Ġ', 'ð', 'Ł', 'ĺ', 'ģ', 'Ġ?'],
                        ['G', 'l', 'u', 'on', 'N', 'L', 'P', 'Ġis', 'Ġgreat', 'ï', '¼', 'ģ',
                         'ï', '¼', 'ģ', 'ï', '¼', 'ģ', '!', '!', '!'],
                        ['G', 'l', 'u', 'on', 'N', 'L', 'P', '-', 'Am', 'az', 'on', '-',
                         'Ha', 'ib', 'in', '-', 'Le', 'on', 'ard', '-', 'S', 'hen', 'g', '-',
                         'Sh', 'u', 'ai', '-', 'X', 'ing', 'j', 'ian',
                         '..', '...', '/', ':', '!', '@', '#', 'Ġ', "'", 'ab', 'c', "'"]]
        gt_offsets = [[(0, 2), (2, 5), (5, 6), (6, 8), (8, 9), (9, 12), (12, 13), (13, 17),
                       (17, 21), (21, 25), (25, 27), (26, 27), (26, 27), (27, 28), (28, 29),
                       (28, 29), (28, 29), (28, 29), (29, 30), (30, 31), (30, 31), (30, 31),
                       (30, 31), (31, 32), (32, 33), (32, 33), (32, 33), (32, 33), (33, 35)],
                      [(0, 1), (1, 2), (2, 3), (3, 5), (5, 6), (6, 7), (7, 8), (8, 11), (11, 17),
                       (17, 18), (17, 18), (17, 18), (18, 19), (18, 19), (18, 19), (19, 20),
                       (19, 20), (19, 20), (20, 21), (21, 22), (22, 23)],
                      [(0, 1), (1, 2), (2, 3), (3, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 11),
                       (11, 13), (13, 15), (15, 16), (16, 18), (18, 20), (20, 22), (22, 23),
                       (23, 25), (25, 27), (27, 30), (30, 31), (31, 32), (32, 35), (35, 36),
                       (36, 37), (37, 39), (39, 40), (40, 42), (42, 43), (43, 44),
                       (44, 47), (47, 48), (48, 51), (51, 53), (53, 56), (56, 57),
                       (57, 58), (58, 59), (59, 60), (60, 61), (61, 62), (62, 63),
                       (63, 65), (65, 66), (66, 67)]]
        gt_decode = ["Hello, y'all! How are you Ⅷ 😁 😁 😁 ?",
                     'GluonNLP is great!!!!!!',
                     "GluonNLP-Amazon-Haibin-Leonard-Sheng-Shuai-Xingjian...../:!@# 'abc'"]
        verify_encode_token(tokenizer, SUBWORD_TEST_SAMPLES, gt_tokenized)
        verify_pickleble(tokenizer, HuggingFaceTokenizer)
        verify_encode_token_with_offsets(tokenizer, SUBWORD_TEST_SAMPLES, gt_offsets)
        verify_decode_hf(tokenizer, SUBWORD_TEST_SAMPLES, gt_decode)
Example #17
def test_huggingface_wordpiece_tokenizer_v08():
    """Test for huggingface tokenizer >=0.8"""
    with tempfile.TemporaryDirectory() as dir_path:
        model_path = os.path.join(dir_path, 'hf_wordpiece_new_0.8.model')
        download(url=get_repo_url() +
                     'tokenizer_test_models/hf_wordpiece_new_0.8/hf_wordpiece.model',
                 path=model_path,
                 sha1_hash='66ccadf6e5e354ff9604e4a82f107a2ac873abd5')
        vocab_path = os.path.join(dir_path, 'hf_wordpiece_new_0.8.vocab')
        download(url=get_repo_url() +
                     'tokenizer_test_models/hf_wordpiece_new_0.8/hf_wordpiece.vocab',
                 path=vocab_path,
                 sha1_hash='dd6fdf4bbc74eaa8806d12cb3d38a4d9a306aea8')
        tokenizer = HuggingFaceTokenizer(model_path, vocab_path)
        gt_tokenized = [['Hel', '##lo', ',', 'y', '[UNK]', 'all', '!',
                         'How', 'are', 'you', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '?'],
                        ['Gl', '##u', '##on', '##N', '##L', '##P', 'is', 'great', '[UNK]',
                         '[UNK]', '[UNK]', '!', '!', '!'],
                        ['Gl', '##u', '##on', '##N', '##L', '##P', '-',
                         'Am', '##az', '##on', '-', 'Ha', '##ibi', '##n', '-', 'Leon', '##ard',
                         '-', 'She', '##n', '##g', '-', 'Sh', '##ua', '##i', '-', 'X',
                         '##ing', '##j', '##ian', '.', '.', '.', '.', '.', '/', ':', '!',
                         '@', '#', '[UNK]', 'ab', '##c', '[UNK]']]
        gt_offsets = [[(0, 3), (3, 5), (5, 6), (7, 8), (8, 9), (9, 12), (12, 13),
                       (14, 17), (18, 21), (22, 25), (26, 27), (28, 29), (30, 31),
                       (32, 33), (34, 35)],
                      [(0, 2), (2, 3), (3, 5), (5, 6), (6, 7), (7, 8), (9, 11), (12, 17),
                       (17, 18), (18, 19), (19, 20), (20, 21), (21, 22), (22, 23)],
                      [(0, 2), (2, 3), (3, 5), (5, 6), (6, 7), (7, 8), (8, 9),
                       (9, 11), (11, 13), (13, 15), (15, 16), (16, 18), (18, 21),
                       (21, 22), (22, 23), (23, 27), (27, 30), (30, 31), (31, 34),
                       (34, 35), (35, 36), (36, 37), (37, 39), (39, 41), (41, 42),
                       (42, 43), (43, 44), (44, 47), (47, 48), (48, 51), (51, 52),
                       (52, 53), (53, 54), (54, 55), (55, 56), (56, 57), (57, 58),
                       (58, 59), (59, 60), (60, 61), (62, 63), (63, 65), (65, 66),
                       (66, 67)]]
        gt_decode = ['Hello, y all! How are you?',
                     'GluonNLP is great!!!',
                     'GluonNLP - Amazon - Haibin - Leonard - Sheng - Shuai - Xingjian..... / '
                     ':! @ # abc']
        verify_encode_token(tokenizer, SUBWORD_TEST_SAMPLES, gt_tokenized)
        verify_pickleble(tokenizer, HuggingFaceTokenizer)
        verify_encode_token_with_offsets(tokenizer, SUBWORD_TEST_SAMPLES, gt_offsets)
        verify_decode_hf(tokenizer, SUBWORD_TEST_SAMPLES, gt_decode)
Example #18
def test_yttm_tokenizer():
    with tempfile.TemporaryDirectory() as dir_path:
        model_path = os.path.join(dir_path, 'yttm.model')
        download(url=get_repo_url() + 'tokenizer_test_models/yttm/test_ende_yttm-6f2c39.model',
                 path=model_path)
        tokenizer = YTTMTokenizer(model_path=model_path)
        gt_tokenized = [['▁He', 'll', 'o', ',', '▁y', "'", 'all', '!', '▁How', '▁are', '▁you', '▁',
                         'Ⅷ', '▁', '😁', '▁', '😁', '▁', '😁', '▁?'],
                        ['▁Gl', 'u', 'on', 'N', 'L', 'P', '▁is', '▁great', '!', '!', '!', '!',
                         '!', '!'],
                        ['▁Gl', 'u', 'on', 'N', 'L', 'P', '-A', 'm', 'az', 'on', '-H', 'a', 'ib',
                         'in', '-L', 'e', 'on', 'ard', '-S', 'hen', 'g', '-S', 'h', 'u', 'ai',
                         '-', 'X', 'ing', 'j', 'ian', '.', '.', '.', '.', '.', '/', ':', '!',
                         '@', '#', '▁', "'", 'ab', 'c', "'"]]
        gt_offsets = [[(0, 2), (2, 4), (4, 5), (5, 6), (6, 8), (8, 9), (9, 12), (12, 13), (13, 17),
                       (17, 21), (21, 25), (25, 26), (26, 27), (27, 28), (28, 29), (29, 30), (30, 31),
                       (31, 32), (32, 33), (33, 35)],
                      [(0, 2), (2, 3), (3, 5), (5, 6), (6, 7), (7, 8), (8, 11), (11, 17), (17, 18),
                       (18, 19), (19, 20), (20, 21), (21, 22), (22, 23)],
                      [(0, 2), (2, 3), (3, 5), (5, 6), (6, 7), (7, 8), (8, 10), (10, 11), (11, 13),
                       (13, 15), (15, 17), (17, 18), (18, 20), (20, 22), (22, 24), (24, 25), (25, 27),
                       (27, 30), (30, 32), (32, 35), (35, 36), (36, 38), (38, 39), (39, 40), (40, 42),
                       (42, 43), (43, 44), (44, 47), (47, 48), (48, 51), (51, 52), (52, 53), (53, 54),
                       (54, 55), (55, 56), (56, 57), (57, 58), (58, 59), (59, 60), (60, 61), (61, 62),
                       (62, 63), (63, 65), (65, 66), (66, 67)]]
        gt_int_decode = ['Hello, y<UNK>all! How are you <UNK> <UNK> <UNK> <UNK> ?',
                         'GluonNLP is great!!!!!!',
                         'GluonNLP-Amazon-Haibin-Leonard-Sheng-Shuai-Xingjian...../:!@# <UNK>abc<UNK>']
        gt_str_decode = ["Hello, y'all! How are you Ⅷ 😁 😁 😁 ?",
                         'GluonNLP is great!!!!!!',
                         "GluonNLP-Amazon-Haibin-Leonard-Sheng-Shuai-Xingjian...../:!@# 'abc'"]
        verify_encode_token(tokenizer, SUBWORD_TEST_SAMPLES, gt_tokenized)
        verify_pickleble(tokenizer, YTTMTokenizer)
        verify_encode_token_with_offsets(tokenizer, SUBWORD_TEST_SAMPLES, gt_offsets)
        # Begin to verify decode
        for sample_sentences, ele_gt_int_decode, ele_gt_str_decode in [(SUBWORD_TEST_SAMPLES[0], gt_int_decode[0], gt_str_decode[0]),
                                                                       (SUBWORD_TEST_SAMPLES, gt_int_decode, gt_str_decode)]:
            int_decode = tokenizer.decode(tokenizer.encode(sample_sentences, int))
            str_decode = tokenizer.decode(tokenizer.encode(sample_sentences, str))
            assert int_decode == ele_gt_int_decode
            assert str_decode == ele_gt_str_decode
        os.remove(model_path)
        assert tokenizer.decode([]) == ''
        assert tokenizer.decode([[]]) == ['']
Example #19
def test_subword_nmt_tokenizer():
    with tempfile.TemporaryDirectory() as dir_path:
        model_path = os.path.join(dir_path, 'subword_nmt.model')
        download(url=get_repo_url() + 'tokenizer_test_models/subword-nmt/test_ende-d189ff.model',
                 path=model_path)
        vocab_path = os.path.join(dir_path, 'subword_nmt.vocab')
        download(url=get_repo_url() + 'tokenizer_test_models/subword-nmt/test_ende_vocab-900f81.json',
                 path=vocab_path)

        # Case 1
        tokenizer = SubwordNMTTokenizer(model_path, vocab_path)
        gt_tokenized = [["Hel", "lo", ",</w>", "y", "\'", "all", "!</w>", "How</w>", "are</w>", "you</w>",
                         "Ⅷ</w>", "😁</w>", "😁</w>", "😁</w>", "?</w>"],
                        ["Gl", "u", "on", "N", "L", "P</w>", "is</w>", "great", "!", "!", "!", "!!",
                         "!</w>"],
                        ["Gl", "u", "on", "N", "L", "P", "-", "Amaz", "on-", "H", "ai", "b", "in-", "Le",
                         "on", "ard", "-", "Sh", "eng", "-", "Sh", "u", "ai", "-", "X", "ing", "ji",
                         "an", "..", "...", "/", ":", "!", "@", "#</w>", "\'", "ab", "c", "\'</w>"]]
        gt_offsets = [[(0, 3), (3, 5), (5, 6), (7, 8), (8, 9), (9, 12), (12, 13), (14, 17), (18, 21),
                       (22, 25), (26, 27), (28, 29), (30, 31), (32, 33), (34, 35)],
                      [(0, 2), (2, 3), (3, 5), (5, 6), (6, 7), (7, 8), (9, 11), (12, 17), (17, 18),
                       (18, 19), (19, 20), (20, 22), (22, 23)],
                      [(0, 2), (2, 3), (3, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 13), (13, 16),
                       (16, 17), (17, 19), (19, 20), (20, 23), (23, 25), (25, 27), (27, 30), (30, 31),
                       (31, 33), (33, 36), (36, 37), (37, 39), (39, 40), (40, 42), (42, 43), (43, 44),
                       (44, 47), (47, 49), (49, 51), (51, 53), (53, 56), (56, 57), (57, 58), (58, 59),
                       (59, 60), (60, 61), (62, 63), (63, 65), (65, 66), (66, 67)]]
        gt_int_decode = ["Hello, y\'all! How are you Ⅷ 😁 😁 😁 ?",
                         "GluonNLP is great!!!!!!",
                         "GluonNLP-Amazon-Haibin-Leonard-Sheng-Shuai-Xingjian...../:!@# \'abc\'"]
        gt_str_decode = SUBWORD_TEST_SAMPLES
        verify_encode_token(tokenizer, SUBWORD_TEST_SAMPLES, gt_tokenized)
        verify_pickleble(tokenizer, SubwordNMTTokenizer)
        verify_encode_token_with_offsets(tokenizer, SUBWORD_TEST_SAMPLES, gt_offsets)
        verify_decode_subword_nmt(tokenizer, SUBWORD_TEST_SAMPLES, gt_int_decode, gt_str_decode)

        # Case 2, bpe_dropout
        # We use str decode here because we may not perfectly recover the original sentence with int decode.
        tokenizer = SubwordNMTTokenizer(model_path, vocab_path, bpe_dropout=0.5)
        verify_decode(tokenizer, SUBWORD_TEST_SAMPLES, out_type=str)

        os.remove(model_path)
        os.remove(vocab_path)
Example #20
def test_huggingface_bpe_tokenizer_v08():
    """Test for huggingface BPE tokenizer >=0.8"""
    with tempfile.TemporaryDirectory() as dir_path:
        model_path = os.path.join(dir_path, 'hf_bpe_new_0.8.model')
        download(url=get_repo_url() +
                     'tokenizer_test_models/hf_bpe_new_0.8/hf_bpe.model',
                 path=model_path,
                 sha1_hash='ecda90979561ca4c5a8d769b5e3c9fa2270d5317')
        vocab_path = os.path.join(dir_path, 'hf_bpe_new_0.8.vocab')
        download(url=get_repo_url() +
                     'tokenizer_test_models/hf_bpe_new_0.8/hf_bpe.vocab',
                 path=vocab_path,
                 sha1_hash='b92dde0b094f405208f3ec94b5eae88430bf4262')
        tokenizer = HuggingFaceTokenizer(model_path, vocab_path)
        gt_tokenized = [['H', 'ello</w>', ',</w>', 'y</w>', 'all</w>', '!</w>',
                         'How</w>', 'are</w>', 'you</w>', '?</w>'],
                        ['G', 'lu', 'on', 'N', 'L', 'P</w>', 'is</w>', 'great</w>',
                         '!</w>', '!</w>', '!</w>'],
                        ['G', 'lu', 'on', 'N', 'L', 'P</w>', '-</w>', 'Amaz', 'on</w>',
                         '-</w>', 'Ha', 'i', 'bin</w>', '-</w>', 'Leon', 'ard</w>', '-</w>',
                         'Sh', 'eng</w>', '-</w>', 'S', 'hu', 'ai</w>', '-</w>', 'X', 'ing',
                         'j', 'ian</w>', '.</w>', '.</w>', '.</w>', '.</w>', '.</w>', '/</w>',
                         ':</w>', '!</w>', '@</w>', '#</w>', 'ab', 'c</w>']]
        gt_offsets = [[(0, 1), (1, 5), (5, 6), (7, 8), (9, 12), (12, 13), (14, 17),
                       (18, 21), (22, 25), (34, 35)],
                      [(0, 1), (1, 3), (3, 5), (5, 6), (6, 7), (7, 8), (9, 11), (12, 17),
                       (20, 21), (21, 22), (22, 23)],
                      [(0, 1), (1, 3), (3, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 13), (13, 15),
                       (15, 16), (16, 18), (18, 19), (19, 22), (22, 23), (23, 27), (27, 30),
                       (30, 31), (31, 33), (33, 36), (36, 37), (37, 38), (38, 40), (40, 42),
                       (42, 43), (43, 44), (44, 47), (47, 48), (48, 51), (51, 52), (52, 53),
                       (53, 54), (54, 55), (55, 56), (56, 57), (57, 58), (58, 59), (59, 60),
                       (60, 61), (63, 65), (65, 66)]]
        gt_decode = ['Hello , y all ! How are you ?',
                     'GluonNLP is great ! ! !',
                     'GluonNLP - Amazon - Haibin - Leonard - Sheng - Shuai - Xingjian'
                     ' . . . . . / : ! @ # abc']
        verify_encode_token(tokenizer, SUBWORD_TEST_SAMPLES, gt_tokenized)
        verify_pickleble(tokenizer, HuggingFaceTokenizer)
        verify_encode_token_with_offsets(tokenizer, SUBWORD_TEST_SAMPLES, gt_offsets)
        verify_decode_hf(tokenizer, SUBWORD_TEST_SAMPLES, gt_decode)
Example #21
def main(args):
    url = _URLS[args.dataset]
    file_hash = _URL_FILE_STATS[url]
    target_download_location = os.path.join(args.cache_path,
                                            os.path.basename(url))
    download(url, target_download_location, sha1_hash=file_hash)
    save_dir = args.dataset if args.save_dir is None else args.save_dir
    if not os.path.exists(save_dir):
        os.makedirs(save_dir, exist_ok=True)
    if args.dataset == 'gutenberg':
        if args.mode == 'raw':
            with zipfile.ZipFile(target_download_location) as f:
                for name in f.namelist():
                    if name.endswith('.txt'):
                        filename = os.path.basename(name)
                        f.extract(name, os.path.join(save_dir, filename))
        else:
            # TODO(zheyuye), format for pretraining
            raise NotImplementedError
    else:
        raise NotImplementedError
Example #22
def main(args):
    train_url = _URLS[args.version]['train']
    dev_url = _URLS[args.version]['dev']
    train_file_name = train_url[train_url.rfind('/') + 1:]
    dev_file_name = dev_url[dev_url.rfind('/') + 1:]
    download(train_url, path=os.path.join(args.cache_path, train_file_name))
    download(dev_url, path=os.path.join(args.cache_path, dev_file_name))
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
    if not os.path.exists(os.path.join(args.save_path, train_file_name)) \
            or (args.overwrite and args.save_path != args.cache_path):
        shutil.copyfile(os.path.join(args.cache_path, train_file_name),
                        os.path.join(args.save_path, train_file_name))
    else:
        print(f'Found {os.path.join(args.save_path, train_file_name)}...skip')
    if not os.path.exists(os.path.join(args.save_path, dev_file_name)) \
            or (args.overwrite and args.save_path != args.cache_path):
        shutil.copyfile(os.path.join(args.cache_path, dev_file_name),
                        os.path.join(args.save_path, dev_file_name))
    else:
        print(f'Found {os.path.join(args.save_path, dev_file_name)}...skip')
Example #23
def test_huggingface_bytebpe_tokenizer():
    with tempfile.TemporaryDirectory() as dir_path:
        model_path = os.path.join(dir_path, 'hf_bytebpe.model')
        download(url=get_repo_url() + 'tokenizer_test_models/hf_bytebpe/test_hf_bytebpe.model',
                 path=model_path)
        vocab_path = os.path.join(dir_path, 'hf_bytebpe.vocab')
        download(url=get_repo_url() + 'tokenizer_test_models/hf_bytebpe/test_hf_bytebpe.vocab',
                 path=vocab_path)
        hf_vocab_path = os.path.join(dir_path, 'hf_bytebpe.hf_vocab')
        download(url=get_repo_url() + 'tokenizer_test_models/hf_bytebpe/test_hf_bytebpe.hf_vocab',
                 path=hf_vocab_path)

        # Case 1, default lowercase=False
        tokenizer = HuggingFaceByteBPETokenizer(model_path, vocab_path)
        gt_tokenized = [['Hello', ',', 'Ġy', "'", 'all', '!', 'ĠHow', 'Ġare', 'Ġyou',
                         'Ġâ', 'ħ', '§', 'ĠðŁĺ', 'ģ', 'ĠðŁĺ', 'ģ', 'ĠðŁĺ', 'ģ', 'Ġ?'],
                        ['Gl', 'u', 'on', 'N', 'LP', 'Ġis', 'Ġgreat', 'ï¼', 'ģ', 'ï¼',
                         'ģ', 'ï¼', 'ģ', '!!!'],
                        ['Gl', 'u', 'on', 'N', 'LP', '-', 'Amazon', '-', 'Ha', 'ib', 'in',
                         '-', 'Le', 'on', 'ard', '-', 'She', 'ng', '-', 'Sh', 'u',
                         'ai', '-', 'X', 'ing', 'j', 'ian', '.....', '/', ':', '!', '@',
                         '#', "Ġ'", 'ab', 'c', "'"]]
        # the definition of byte-level offsets is not entirely clear
        gt_offsets = [[(0, 5), (5, 6), (6, 8), (8, 9), (9, 12), (12, 13), (13, 17), (17, 21),
                       (21, 25), (25, 27), (26, 27), (26, 27), (27, 29), (28, 29), (29, 31),
                       (30, 31), (31, 33), (32, 33), (33, 35)],
                      [(0, 2), (2, 3), (3, 5), (5, 6), (6, 8), (8, 11), (11, 17), (17, 18),
                       (17, 18), (18, 19), (18, 19), (19, 20), (19, 20), (20, 23)],
                      [(0, 2), (2, 3), (3, 5), (5, 6), (6, 8), (8, 9), (9, 15), (15, 16),
                       (16, 18), (18, 20), (20, 22), (22, 23), (23, 25), (25, 27), (27, 30),
                       (30, 31), (31, 34), (34, 36), (36, 37), (37, 39), (39, 40), (40, 42),
                       (42, 43), (43, 44), (44, 47), (47, 48), (48, 51), (51, 56),
                       (56, 57), (57, 58), (58, 59), (59, 60), (60, 61), (61, 63),
                       (63, 65), (65, 66), (66, 67)]]
        gt_decode = ["Hello, y'all! How are you Ⅷ 😁 😁 😁 ?",
                     'GluonNLP is great!!!!!!',
                     "GluonNLP-Amazon-Haibin-Leonard-Sheng-Shuai-Xingjian...../:!@# 'abc'"]
        verify_encode_token(tokenizer, SUBWORD_TEST_SAMPLES, gt_tokenized)
        verify_pickleble(tokenizer, HuggingFaceByteBPETokenizer)
        verify_encode_token_with_offsets(tokenizer, SUBWORD_TEST_SAMPLES, gt_offsets)
        verify_decode_hf(tokenizer, SUBWORD_TEST_SAMPLES, gt_decode)

        # Case 2, lowercase=True
        gt_lowercase_int_decode = ["hello, y'all! how are you ⅷ 😁 😁 😁 ?",
                                   'gluonnlp is great!!!!!!',
                                   "gluonnlp-amazon-haibin-leonard-sheng-shuai-xingjian...../:!@# 'abc'"]
        tokenizer = HuggingFaceByteBPETokenizer(model_path, vocab_path, lowercase=True)
        verify_decode_hf(tokenizer, SUBWORD_TEST_SAMPLES, gt_lowercase_int_decode)

        # Case 3, using original hf vocab
        tokenizer = HuggingFaceByteBPETokenizer(model_path, hf_vocab_path)
        verify_encode_token(tokenizer, SUBWORD_TEST_SAMPLES, gt_tokenized)
        verify_pickleble(tokenizer, HuggingFaceByteBPETokenizer)
        verify_encode_token_with_offsets(tokenizer, SUBWORD_TEST_SAMPLES, gt_offsets)
        verify_decode_hf(tokenizer, SUBWORD_TEST_SAMPLES, gt_decode)

        os.remove(model_path)
        os.remove(vocab_path)
        os.remove(hf_vocab_path)
Example #24
def main(args):
    # Download the data
    url = _URLS[args.dataset]
    file_hash = _URL_FILE_STATS[url]
    target_download_location = os.path.join(args.cache_path,
                                            os.path.basename(url))
    download(url, target_download_location, sha1_hash=file_hash)
    if args.save_dir is None:
        save_dir = args.dataset
    else:
        save_dir = args.save_dir
    if not args.overwrite and os.path.exists(save_dir):
        print('{} found, skip! Turn on --overwrite to force overwrite'.format(
            save_dir))
        return
    print('Extract the data from {} into {}'.format(target_download_location,
                                                    save_dir))
    if args.dataset in ('lmd_full', 'lmd_matched', 'lmd_aligned', 'clean_midi'):
        with tarfile.open(target_download_location) as f:
            f.extractall(save_dir)
    elif args.dataset in ('maestro_v1', 'maestro_v2', 'geocities'):
        with zipfile.ZipFile(target_download_location, 'r') as fobj:
            fobj.extractall(save_dir)
    else:
        raise NotImplementedError
Example #25
def test_huggingface_bpe_tokenizer():
    with tempfile.TemporaryDirectory() as dir_path:
        model_path = os.path.join(dir_path, 'test_hf_bpe.model')
        download(url=get_repo_url() + 'tokenizer_test_models/hf_bpe/test_hf_bpe.model',
                 path=model_path)
        vocab_path = os.path.join(dir_path, 'test_hf_bpe.vocab')
        download(url=get_repo_url() + 'tokenizer_test_models/hf_bpe/test_hf_bpe.vocab',
                 path=vocab_path)
        hf_vocab_path = os.path.join(dir_path, 'test_hf_bpe.hf_vocab')
        download(url=get_repo_url() + 'tokenizer_test_models/hf_bpe/test_hf_bpe.hf_vocab',
                 path=hf_vocab_path)

        # Case 1, default lowercase=False
        tokenizer = HuggingFaceBPETokenizer(model_path, vocab_path)
        gt_tokenized = [['Hello</w>', ',</w>', 'y</w>', "'</w>", 'all</w>', '!</w>', 'How</w>',
                         'are</w>', 'you</w>', '<unk>', '<unk>', '<unk>', '<unk>', '?</w>'],
                        ['Gl', 'u', 'on', 'N', 'LP</w>', 'is</w>', 'great</w>', '!</w>', '!</w>',
                         '!</w>', '!</w>', '!</w>', '!</w>'],
                        ['Gl', 'u', 'on', 'N', 'LP</w>', '-</w>', 'Amazon</w>', '-</w>', 'H', 'ai',
                         'bin</w>', '-</w>', 'Leonard</w>', '-</w>', 'Sh', 'en', 'g</w>', '-</w>',
                         'Sh', 'u', 'ai</w>', '-</w>', 'X', 'ing', 'j', 'ian</w>', '.</w>', '.</w>',
                         '.</w>', '.</w>', '.</w>', '/</w>', ':</w>', '!</w>', '@</w>', '#</w>',
                         "'</w>", 'ab', 'c</w>', "'</w>"]]
        gt_offsets = [[(0, 5), (5, 6), (7, 8), (8, 9), (9, 12), (12, 13), (14, 17), (18, 21), (22, 25),
                       (26, 27), (28, 29), (30, 31), (32, 33), (34, 35)],
                      [(0, 2), (2, 3), (3, 5), (5, 6), (6, 8), (9, 11), (12, 17), (17, 18), (18, 19),
                       (19, 20), (20, 21), (21, 22), (22, 23)],
                      [(0, 2), (2, 3), (3, 5), (5, 6), (6, 8), (8, 9), (9, 15), (15, 16), (16, 17),
                       (17, 19), (19, 22), (22, 23), (23, 30), (30, 31), (31, 33), (33, 35), (35, 36),
                       (36, 37), (37, 39), (39, 40), (40, 42), (42, 43), (43, 44), (44, 47), (47, 48),
                       (48, 51), (51, 52), (52, 53), (53, 54), (54, 55), (55, 56), (56, 57), (57, 58),
                       (58, 59), (59, 60), (60, 61), (62, 63), (63, 65), (65, 66), (66, 67)]]
        # gt_int_decode == gt_str_decode for hf
        # hf removes the unk tokens in the decode result
        gt_decode = ["Hello , y ' all ! How are you ?",
                     'GluonNLP is great ! ! ! ! ! !',
                     "GluonNLP - Amazon - Haibin - Leonard - Sheng - Shuai - Xingjian . . . . . / : ! @ # ' abc '"]
        verify_encode_token(tokenizer, SUBWORD_TEST_SAMPLES, gt_tokenized)
        verify_pickleble(tokenizer, HuggingFaceBPETokenizer)
        verify_encode_token_with_offsets(tokenizer, SUBWORD_TEST_SAMPLES, gt_offsets)
        verify_decode_hf(tokenizer, SUBWORD_TEST_SAMPLES, gt_decode)

        # Case 2, lowercase=True
        gt_lowercase_decode = ["hello , y ' all ! how are you ?",
                               'gluonnlp is great ! ! ! ! ! !',
                               "gluonnlp - amazon - haibin - leonard - sheng - shuai - xingjian . . . . . / : ! @ # ' abc '"]
        tokenizer = HuggingFaceBPETokenizer(model_path, vocab_path, lowercase=True)
        verify_decode_hf(tokenizer, SUBWORD_TEST_SAMPLES, gt_lowercase_decode)

        # Case 3, using original hf vocab
        tokenizer = HuggingFaceBPETokenizer(model_path, hf_vocab_path)
        verify_encode_token(tokenizer, SUBWORD_TEST_SAMPLES, gt_tokenized)
        verify_pickleble(tokenizer, HuggingFaceBPETokenizer)
        verify_encode_token_with_offsets(tokenizer, SUBWORD_TEST_SAMPLES, gt_offsets)
        verify_decode_hf(tokenizer, SUBWORD_TEST_SAMPLES, gt_decode)

        os.remove(model_path)
        os.remove(vocab_path)
        os.remove(hf_vocab_path)
Example #26
def main(args):
    def extract(tar_path, target_path):
        try:
            tar = tarfile.open(tar_path, "r:gz")
            file_names = tar.getnames()
            for file_name in file_names:
                tar.extract(file_name, target_path)
            tar.close()
        except Exception as e:
            print(e)

    tar_url = _URLS[args.type]
    file_name = tar_url[tar_url.rfind('/') + 1:]
    file_hash = _URL_FILE_STATS[tar_url]
    download(tar_url,
             path=os.path.join(args.cache_path, file_name),
             sha1_hash=file_hash)
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)
    if not os.path.exists(os.path.join(args.save_path, file_name))\
            or (args.overwrite and args.save_path != args.cache_path):
        os.symlink(os.path.join(args.cache_path, file_name),
                   os.path.join(args.save_path, file_name))
    extract(os.path.join(args.save_path, file_name), args.save_path)
Example #27
def format_mrpc(data_dir):
    mrpc_dir = os.path.join(data_dir, "mrpc")
    os.makedirs(mrpc_dir, exist_ok=True)
    mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
    mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
    download(GLUE_TASK2PATH["mrpc"]['train'],
             mrpc_train_file,
             sha1_hash=_URL_FILE_STATS[GLUE_TASK2PATH["mrpc"]['train']])
    download(GLUE_TASK2PATH["mrpc"]['test'],
             mrpc_test_file,
             sha1_hash=_URL_FILE_STATS[GLUE_TASK2PATH["mrpc"]['test']])
    assert os.path.isfile(
        mrpc_train_file), "Train data not found at %s" % mrpc_train_file
    assert os.path.isfile(
        mrpc_test_file), "Test data not found at %s" % mrpc_test_file
    download(GLUE_TASK2PATH["mrpc"]['dev'],
             os.path.join(mrpc_dir, "dev_ids.tsv"),
             sha1_hash=_URL_FILE_STATS[GLUE_TASK2PATH["mrpc"]['dev']])

    dev_ids = []
    with open(os.path.join(mrpc_dir, "dev_ids.tsv"),
              encoding="utf8") as ids_fh:
        for row in ids_fh:
            dev_ids.append(row.strip().split("\t"))

    with open(mrpc_train_file, encoding="utf8") as data_fh, open(
            os.path.join(mrpc_dir, "train.tsv"), "w",
            encoding="utf8") as train_fh, open(os.path.join(
                mrpc_dir, "dev.tsv"),
                                               "w",
                                               encoding="utf8") as dev_fh:
        header = data_fh.readline()
        train_fh.write(header)
        dev_fh.write(header)
        for row in data_fh:
            label, id1, id2, s1, s2 = row.strip().split("\t")
            if [id1, id2] in dev_ids:
                dev_fh.write("%s\t%s\t%s\t%s\t%s\n" %
                             (label, id1, id2, s1, s2))
            else:
                train_fh.write("%s\t%s\t%s\t%s\t%s\n" %
                               (label, id1, id2, s1, s2))

    with open(mrpc_test_file, encoding="utf8") as data_fh, \
            open(os.path.join(mrpc_dir, "test.tsv"), "w", encoding="utf8") as test_fh:
        header = data_fh.readline()
        test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
        for idx, row in enumerate(data_fh):
            label, id1, id2, s1, s2 = row.strip().split("\t")
            test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
Example #28
def main(args):
    os.makedirs(args.cache_path, exist_ok=True)
    os.makedirs(args.data_dir, exist_ok=True)
    if args.tasks == 'all':
        tasks = list(TASK2PATH.keys())
    else:
        tasks = args.tasks.split(',')
    for task in tasks:
        task_dir_path = os.path.join(args.data_dir, task)
        os.makedirs(task_dir_path, exist_ok=True)
        file_url = TASK2PATH[task]
        sha1_hash = _URL_FILE_STATS[file_url]
        download_path = download(file_url,
                                 args.cache_path,
                                 sha1_hash=sha1_hash)
        with tarfile.open(download_path) as f:
            f.extractall(task_dir_path)
        if task == 'imdb':
            shutil.move(os.path.join(task_dir_path, 'imdb', 'train.parquet'),
                        os.path.join(task_dir_path, 'train.parquet'))
            shutil.move(os.path.join(task_dir_path, 'imdb', 'test.parquet'),
                        os.path.join(task_dir_path, 'test.parquet'))
            train_data = pd.read_parquet(
                os.path.join(task_dir_path, 'train.parquet'))
            test_data = pd.read_parquet(
                os.path.join(task_dir_path, 'test.parquet'))
        elif task == 'ag':
            train_data = pd.read_csv(os.path.join(task_dir_path, 'ag_news_csv',
                                                  'train.csv'),
                                     header=None)
            test_data = pd.read_csv(os.path.join(task_dir_path, 'ag_news_csv',
                                                 'test.csv'),
                                    header=None)
            train_data = pd.DataFrame({'label': train_data[0],
                                       'content': train_data[1] + ' ' + train_data[2]})
            test_data = pd.DataFrame({'label': test_data[0],
                                      'content': test_data[1] + ' ' + test_data[2]})
            train_data.to_parquet(os.path.join(task_dir_path, 'train.parquet'))
            test_data.to_parquet(os.path.join(task_dir_path, 'test.parquet'))
        elif task == 'dbpedia':
            train_data = pd.read_csv(os.path.join(task_dir_path, 'dbpedia_csv',
                                                  'train.csv'),
                                     header=None)
            test_data = pd.read_csv(os.path.join(task_dir_path, 'dbpedia_csv',
                                                 'test.csv'),
                                    header=None)
            train_data = pd.DataFrame({'label': train_data[0],
                                       'content': train_data[1] + ' ' + train_data[2]})
            test_data = pd.DataFrame({'label': test_data[0],
                                      'content': test_data[1] + ' ' + test_data[2]})
            train_data.to_parquet(os.path.join(task_dir_path, 'train.parquet'))
            test_data.to_parquet(os.path.join(task_dir_path, 'test.parquet'))
        elif task == 'yelp2':
            train_data = pd.read_csv(os.path.join(task_dir_path,
                                                  'yelp_review_polarity_csv',
                                                  'train.csv'),
                                     header=None)
            test_data = pd.read_csv(os.path.join(task_dir_path,
                                                 'yelp_review_polarity_csv',
                                                 'test.csv'),
                                    header=None)
            train_data.columns = ['label', 'review']
            test_data.columns = ['label', 'review']
            train_data.to_parquet(os.path.join(task_dir_path, 'train.parquet'))
            test_data.to_parquet(os.path.join(task_dir_path, 'test.parquet'))
        elif task == 'yelp5':
            train_data = pd.read_csv(os.path.join(task_dir_path,
                                                  'yelp_review_full_csv',
                                                  'train.csv'),
                                     header=None)
            test_data = pd.read_csv(os.path.join(task_dir_path,
                                                 'yelp_review_full_csv',
                                                 'test.csv'),
                                    header=None)
            train_data.columns = ['label', 'review']
            test_data.columns = ['label', 'review']
            train_data.to_parquet(os.path.join(task_dir_path, 'train.parquet'))
            test_data.to_parquet(os.path.join(task_dir_path, 'test.parquet'))
        elif task == 'amazon2':
            train_data = pd.read_csv(os.path.join(
                task_dir_path, 'amazon_review_polarity_csv', 'train.csv'),
                                     header=None)
            test_data = pd.read_csv(os.path.join(task_dir_path,
                                                 'amazon_review_polarity_csv',
                                                 'test.csv'),
                                    header=None)
            train_data = pd.DataFrame({'label': train_data[0],
                                       'review': train_data[1] + ' ' + train_data[2]})
            test_data = pd.DataFrame({'label': test_data[0],
                                      'review': test_data[1] + ' ' + test_data[2]})
            train_data.to_parquet(os.path.join(task_dir_path, 'train.parquet'))
            test_data.to_parquet(os.path.join(task_dir_path, 'test.parquet'))
        elif task == 'amazon5':
            train_data = pd.read_csv(os.path.join(task_dir_path,
                                                  'amazon_review_full_csv',
                                                  'train.csv'),
                                     header=None)
            test_data = pd.read_csv(os.path.join(task_dir_path,
                                                 'amazon_review_full_csv',
                                                 'test.csv'),
                                    header=None)
            train_data = pd.DataFrame({'label': train_data[0],
                                       'review': train_data[1] + ' ' + train_data[2]})
            test_data = pd.DataFrame({'label': test_data[0],
                                      'review': test_data[1] + ' ' + test_data[2]})
            train_data.to_parquet(os.path.join(task_dir_path, 'train.parquet'))
            test_data.to_parquet(os.path.join(task_dir_path, 'test.parquet'))
        else:
            raise NotImplementedError
        print('Task={}, #Train={}, #Test={}'.format(task, len(train_data),
                                                    len(test_data)))
Example #29
def _download_with_mirror(url, path, sha1_hash):
    return download(
        get_repo_url() + _WMT_MIRROR_URL_MAP[url] if url in _WMT_MIRROR_URL_MAP else url,
        path=path,
        sha1_hash=sha1_hash
    )
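To illustrate the fallback behaviour of the helper above, here is a hedged sketch with a hypothetical mirror-map entry; the URL and relative path are invented for illustration only and do not correspond to real entries in the project's _WMT_MIRROR_URL_MAP.

# Hypothetical mirror-map entry, for illustration only.
_WMT_MIRROR_URL_MAP = {
    'http://example.org/wmt/train.tgz': 'mirror/wmt/train.tgz',
}

# Mapped URL: fetched from get_repo_url() + 'mirror/wmt/train.tgz'
# _download_with_mirror('http://example.org/wmt/train.tgz', 'train.tgz', sha1_hash=None)
# Unmapped URL: fetched directly from the original location
# _download_with_mirror('http://example.org/wmt/dev.tgz', 'dev.tgz', sha1_hash=None)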
Example #30
def main(args):
    if args.data_dir is None:
        args.data_dir = args.benchmark
    args.cache_path = os.path.join(args.cache_path, args.benchmark)
    print('Downloading {} to {}. Selected tasks = {}'.format(
        args.benchmark, args.data_dir, args.tasks))
    os.makedirs(args.cache_path, exist_ok=True)
    os.makedirs(args.data_dir, exist_ok=True)
    tasks = get_tasks(args.benchmark, args.tasks)
    if args.benchmark == 'glue':
        TASK2PATH = GLUE_TASK2PATH
        TASK2READER = GLUE_READERS
    elif args.benchmark == 'superglue':
        TASK2PATH = SUPERGLUE_TASK2PATH
        TASK2READER = SUPERGLUE_READER
    else:
        raise NotImplementedError
    for task in tasks:
        print('Processing {}...'.format(task))
        if 'diagnostic' in task:
            if args.benchmark == 'glue':
                reader = TASK2READER[task]
                base_dir = os.path.join(args.data_dir, 'rte_diagnostic')
                os.makedirs(base_dir, exist_ok=True)
                download(TASK2PATH['diagnostic'][0],
                         path=os.path.join(base_dir, 'diagnostic.tsv'),
                         sha1_hash=_URL_FILE_STATS[TASK2PATH['diagnostic'][0]])
                download(TASK2PATH['diagnostic'][1],
                         path=os.path.join(base_dir, 'diagnostic-full.tsv'),
                         sha1_hash=_URL_FILE_STATS[TASK2PATH['diagnostic'][1]])
                df = reader(base_dir)
                df.to_parquet(os.path.join(base_dir,
                                           'diagnostic-full.parquet'))
            else:
                for key, name in [('broadcoverage-diagnostic', 'AX-b'),
                                  ('winogender-diagnostic', 'AX-g')]:
                    data_file = os.path.join(args.cache_path,
                                             "{}.zip".format(key))
                    url = TASK2PATH[key]
                    reader = TASK2READER[key]
                    download(url, data_file, sha1_hash=_URL_FILE_STATS[url])
                    with zipfile.ZipFile(data_file) as zipdata:
                        zipdata.extractall(args.data_dir)
                    df = reader(os.path.join(args.data_dir, name))
                    df.to_parquet(
                        os.path.join(args.data_dir, name,
                                     '{}.parquet'.format(name)))
        elif task == 'mrpc':
            reader = TASK2READER[task]
            format_mrpc(args.data_dir)
            df_dict, meta_data = reader(os.path.join(args.data_dir, 'mrpc'))
            for key, df in df_dict.items():
                if key == 'val':
                    key = 'dev'
                df.to_parquet(
                    os.path.join(args.data_dir, 'mrpc',
                                 '{}.parquet'.format(key)))
            with open(os.path.join(args.data_dir, 'mrpc', 'metadata.json'),
                      'w') as f:
                json.dump(meta_data, f)
        else:
            # Download data
            data_file = os.path.join(args.cache_path, "{}.zip".format(task))
            url = TASK2PATH[task]
            reader = TASK2READER[task]
            download(url, data_file, sha1_hash=_URL_FILE_STATS[url])
            base_dir = os.path.join(args.data_dir, task)
            if os.path.exists(base_dir):
                print('Found!')
                continue
            zip_dir_name = None
            with zipfile.ZipFile(data_file) as zipdata:
                if zip_dir_name is None:
                    zip_dir_name = os.path.dirname(
                        zipdata.infolist()[0].filename)
                zipdata.extractall(args.data_dir)
            shutil.move(os.path.join(args.data_dir, zip_dir_name), base_dir)
            df_dict, meta_data = reader(base_dir)
            for key, df in df_dict.items():
                if key == 'val':
                    key = 'dev'
                df.to_parquet(os.path.join(base_dir, '{}.parquet'.format(key)))
            if meta_data is not None:
                with open(os.path.join(base_dir, 'metadata.json'), 'w') as f:
                    json.dump(meta_data, f)
        print("\tCompleted!")