help="Directory containing config.json of data") parser.add_argument('--model_dir', default='experiments/base_model', help="Directory containing config.json of model") args = argparse.Namespace(data_dir='data', model_dir='experiments/base_model') if __name__ == '__main__': args = parser.parse_args() data_dir = Path(args.data_dir) model_dir = Path(args.model_dir) data_config = Config(json_path=data_dir / 'config.json') model_config = Config(json_path=model_dir / 'config.json') # tokenizer ptr_tokenizer = BertTokenizer.from_pretrained( 'pretrained/vocab.korean.rawtext.list', do_lower_case=False) with open('pretrained/vocab.pkl', mode='rb') as io: vocab = pickle.load(io) pad_sequence = PadSequence(length=model_config.length, pad_val=vocab.to_indices(vocab.padding_token)) preprocessor = PreProcessor(vocab=vocab, split_fn=ptr_tokenizer.tokenize, pad_fn=pad_sequence) # model config = BertConfig('pretrained/bert_config.json') model = BertClassifier(config, num_classes=model_config.num_classes, vocab=preprocessor.vocab) bert_pretrained = torch.load('pretrained/pytorch_model.bin') model.load_state_dict(bert_pretrained, strict=False)
args = parser.parse_args()
ptr_dir = Path('pretrained')
data_dir = Path(args.data_dir)
model_dir = Path(args.model_dir)
ptr_config = Config(ptr_dir / 'config_{}.json'.format(args.type))
data_config = Config(data_dir / 'config.json')
model_config = Config(model_dir / 'config.json')

# vocab
with open(ptr_config.vocab, mode='rb') as io:
    vocab = pickle.load(io)

# tokenizer: both checkpoint types pad the same way, so build the padder once
pad_sequence = PadSequence(length=model_config.length,
                           pad_val=vocab.to_indices(vocab.padding_token))

if args.type == 'etri':
    # the ETRI checkpoint ships a WordPiece-style tokenizer; pass its
    # tokenize method as the split function
    ptr_tokenizer = ETRITokenizer.from_pretrained(ptr_config.tokenizer,
                                                  do_lower_case=False)
    preprocessor = PreProcessor(vocab=vocab, split_fn=ptr_tokenizer.tokenize,
                                pad_fn=pad_sequence)
elif args.type == 'skt':
    # the SKT checkpoint ships a SentencePiece model; the tokenizer
    # instance itself is callable
    ptr_tokenizer = SentencepieceTokenizer(ptr_config.tokenizer)
    preprocessor = PreProcessor(vocab=vocab, split_fn=ptr_tokenizer,
                                pad_fn=pad_sequence)
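# --- illustration (not part of the original script) -------------------------
# What the PadSequence used above is expected to do, written as a tiny
# stand-alone function: clip a sequence of token ids to a fixed length and
# right-pad short ones with pad_val. This only mirrors the length/pad_val
# arguments passed above; the repository's implementation may differ.
def pad_sequence_sketch(indices, length, pad_val):
    clipped = indices[:length]  # truncate if too long
    return clipped + [pad_val] * (length - len(clipped))  # right-pad if short

assert pad_sequence_sketch([5, 8, 2], length=5, pad_val=0) == [5, 8, 2, 0, 0]
assert pad_sequence_sketch([5, 8, 2, 9, 1, 7], length=5, pad_val=0) == [5, 8, 2, 9, 1]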
# tail of the 'skt' branch: the tokenizer model is already on disk, so only
# record the artefact paths in a config file
print('You already have pytorch_model_skt_tokenizer.model')

ptr_config = Config({'config': str(ptr_config_path),
                     'bert': str(ptr_bert_path),
                     'tokenizer': str(ptr_tokenizer_path),
                     'vocab': str(ptr_vocab_path.with_suffix('.pkl'))})
ptr_config.save(ptr_dir / "config_skt.json")

if args.type == 'etri':
    # load the ETRI tokenizer
    ptr_config_path = ptr_dir / 'bert_config_etri.json'
    ptr_tokenizer_path = ptr_dir / "vocab.korean.rawtext.list"
    ptr_bert_path = ptr_dir / "pytorch_model_etri.bin"
    ptr_tokenizer = ETRITokenizer.from_pretrained(ptr_tokenizer_path,
                                                  do_lower_case=False)

    # generate vocab from the tokenizer's token table
    idx_to_token = list(ptr_tokenizer.vocab.keys())
    token_to_idx = {token: idx for idx, token in enumerate(idx_to_token)}
    vocab = Vocab(
        idx_to_token,
        padding_token="[PAD]",
        unknown_token="[UNK]",
        bos_token=None,
        eos_token=None,
        reserved_tokens=["[CLS]", "[SEP]", "[MASK]"],
        token_to_idx=token_to_idx,
    )
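    # --- usage sketch (not part of the original script) ---------------------
    # idx_to_token and token_to_idx are inverse mappings, so lookups
    # round-trip:
    assert token_to_idx[idx_to_token[0]] == 0
    # The script presumably pickles this vocab next to the other ETRI
    # artefacts, mirroring the '.pkl' path recorded for the skt branch above.
    # The filename below is a hypothetical stand-in, not one taken from the
    # repository.
    ptr_vocab_path = ptr_dir / 'pytorch_model_etri_vocab.pkl'  # hypothetical
    with open(ptr_vocab_path, mode='wb') as io:
        pickle.dump(vocab, io)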