from transformers import BertTokenizerFast

def get_tokenizer(vocab_file):
    # Load a fast BERT tokenizer from a trained WordPiece vocab file.
    tokenizer = BertTokenizerFast(
        vocab_file=vocab_file,
        do_basic_tokenize=True,
    )
    # Register the custom boundary markers as special tokens so they
    # are never split during tokenization.
    special_tokens_dict = {'additional_special_tokens': ["<end>", "<begin>"]}
    tokenizer.add_special_tokens(special_tokens_dict)
    return tokenizer
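# A minimal usage sketch (not from the source), assuming a vocab.txt produced
# by the WordPiece training step below; the sample sentence is illustrative.
if __name__ == "__main__":
    tok = get_tokenizer("/opt/ml/code/KBOBERT/vocab.txt")
    # "<begin>" and "<end>" stay whole because get_tokenizer registered
    # them as additional special tokens.
    print(tok.tokenize("<begin> example text <end>"))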
from tokenizers import BertWordPieceTokenizer
from transformers import BertConfig, BertTokenizerFast

# Train a WordPiece vocabulary from scratch on the raw KBO corpus.
# Constructor settings are assumed: lowercase=False to stay consistent
# with do_lower_case=False in the tokenizer below.
wp_tokenizer = BertWordPieceTokenizer(lowercase=False)
wp_tokenizer.train(
    files='/opt/ml/code/KBOBERT/KBOBERT_Data.txt',
    vocab_size=32000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    wordpieces_prefix="##",
)
wp_tokenizer.save_model('./')  # writes vocab.txt to the current directory

# Wrap the trained vocab in a Hugging Face fast tokenizer.
tokenizer = BertTokenizerFast(
    vocab_file="/opt/ml/code/KBOBERT/vocab.txt",
    max_len=512,
    do_lower_case=False,
)
tokenizer.add_special_tokens({'mask_token': '[MASK]'})

# https://huggingface.co/transformers/model_doc/bert.html#bertconfig
config = BertConfig(
    vocab_size=32000,
    hidden_size=256,
    num_hidden_layers=6,
    num_attention_heads=4,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=2,
    pad_token_id=0,
    position_embedding_type="absolute",
)
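# A minimal follow-up sketch (an assumption, not shown in the source): build a
# fresh BERT masked-LM from this config and resize its embedding matrix so it
# covers any tokens added to the tokenizer above.
from transformers import BertForMaskedLM

model = BertForMaskedLM(config)
model.resize_token_embeddings(len(tokenizer))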