Example No. 1
def main():
    parser = ArgumentParser(description="Training tokenizer on text files.")
    parser.add_argument("text_dir", nargs="?", help="Path to the directory containgin the text files (any .txt file).")
    parser.add_argument("-t", "--tokenizer_path", default=TOKENIZER_PATH, help="Path to the saved trained tokenizer.")
    args = parser.parse_args()
    text_dir = args.text_dir
    tokenizer_path = args.tokenizer_path
    if Path(tokenizer_path).exists():
        paths = [str(x) for x in Path(text_dir).glob("**/*.txt")]
        tokenizer = ByteLevelBPETokenizer()
        tokenizer.pre_tokenizer = ByteLevel()  # instantiate the pre-tokenizer; ByteLevelBPETokenizer already applies byte-level pre-tokenization by default
        tokenizer.train(
            files=paths,
            vocab_size=config.vocab_size,
            min_frequency=2,
            special_tokens=[
                "<s>",
                "<pad>",
                "</s>",
                "<unk>",  # probably not needed if using ByteLevel pretokenization
                "<mask>",
            ]
        )
        tokenizer.save_model(tokenizer_path)
    else:
        print(f"{tokenizer_path} does not exists, will not be able to save tokenizer. Create dir first and re-run the command.")
Example No. 2
def tokenize_cards(
        files=['./dataset/cards_train.txt', './dataset/cards_val.txt'],
        output_dir='./tokenizer'):
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.pre_tokenizer = Whitespace()

    tokenizer.train(files=files, special_tokens=SPECIAL_TOKENS + OTHER_TOKENS)
    tokenizer.save_model(output_dir)
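The SPECIAL_TOKENS and OTHER_TOKENS lists come from the surrounding module. A hedged sketch of that context and of a call, using placeholder token values rather than the original project's lists:

from tokenizers import ByteLevelBPETokenizer
from tokenizers.pre_tokenizers import Whitespace

SPECIAL_TOKENS = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]  # placeholder values
OTHER_TOKENS = ["<card>", "</card>"]                          # placeholder values

# Trains on the two card files and writes vocab.json / merges.txt
# into ./tokenizer (the output directory must already exist).
tokenize_cards()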
Example No. 3
def load_tokenizer(vocab='./tokenizer/vocab.json', merges='./tokenizer/merges.txt', gpt=False, load_from=None):
    if gpt:
        if load_from:
            tokenizer = GPT2Tokenizer.from_pretrained(load_from)
        else:
            tokenizer = GPT2Tokenizer(
                vocab, merges, 
                bos_token=CARD_BEGIN, eos_token=CARD_END, sep_token=CARD_END,
                unk_token=UNK, pad_token=CARD_PAD, mask_token=CARD_MASK, padding_side="left"
            )
    else:
        tokenizer = ByteLevelBPETokenizer(vocab, merges)
        tokenizer.add_special_tokens(SPECIAL_TOKENS + OTHER_TOKENS)
        tokenizer.mask_token = CARD_MASK
    
    tokenizer.pre_tokenizer = Whitespace()
    return tokenizer
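This loader depends on several special-token constants as well as the token lists from Example No. 2. A sketch of how it might be called, with placeholder string values for the constants (the names mirror the code; the values are assumptions):

from tokenizers import ByteLevelBPETokenizer
from tokenizers.pre_tokenizers import Whitespace
from transformers import GPT2Tokenizer

CARD_BEGIN, CARD_END = "<s>", "</s>"                   # placeholder values
CARD_PAD, CARD_MASK, UNK = "<pad>", "<mask>", "<unk>"  # placeholder values
SPECIAL_TOKENS = [CARD_BEGIN, CARD_END, CARD_PAD, UNK]
OTHER_TOKENS = [CARD_MASK]

# Load the raw ByteLevelBPETokenizer trained earlier ...
bpe_tokenizer = load_tokenizer()
# ... or wrap the same vocab.json / merges.txt in a transformers GPT2Tokenizer.
gpt_tokenizer = load_tokenizer(gpt=True)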
Example No. 4
def get_french_vocab(model_name):
    root = Path(os.getcwd()).parent.parent.parent
    french_corpus = "Datasets/corpora/fr/text"
    fr_corpus_path = os.path.join(root, french_corpus)
    files = []
    for dir_ in os.listdir(fr_corpus_path):
        fr_corpus_dir = os.path.join(fr_corpus_path, dir_)
        for text_file in os.listdir(fr_corpus_dir):
            text_file = os.path.join(fr_corpus_dir, text_file)
            files.append(text_file)

    tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)
    tokenizer.pre_tokenizer = Whitespace()

    tokenizer.train(files,
                    vocab_size=20000,
                    min_frequency=2,
                    show_progress=True,
                    special_tokens=["<sos>", "<pad>", "<eos>", "<unk>"])

    print(tokenizer.encode("c'est la meilleure des phrases françaises").tokens)
    tokenizer.save(model_name)
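A possible usage of this function, assuming the corpus layout under Datasets/corpora/fr/text exists. With recent versions of the tokenizers library, save() writes a single JSON file that can be reloaded via Tokenizer.from_file:

import os
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer, Tokenizer
from tokenizers.pre_tokenizers import Whitespace

get_french_vocab("fr_bpe_tokenizer.json")

# Reload the saved tokenizer and re-run the sanity-check sentence.
reloaded = Tokenizer.from_file("fr_bpe_tokenizer.json")
print(reloaded.encode("c'est la meilleure des phrases françaises").tokens)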