Example #1
                    help="Directory containing config.json of data")
parser.add_argument('--model_dir',
                    default='experiments/base_model',
                    help="Directory containing config.json of model")

# module-level defaults (presumably for interactive use); overwritten by
# parse_args() below when run as a script
args = argparse.Namespace(data_dir='data', model_dir='experiments/base_model')

if __name__ == '__main__':
    args = parser.parse_args()
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)
    data_config = Config(json_path=data_dir / 'config.json')
    model_config = Config(json_path=model_dir / 'config.json')
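    # NOTE: Config is a project-local helper, not a library class. A minimal
    # sketch of the behavior these snippets rely on (hypothetical
    # reimplementation): a JSON file, or a plain dict as in Example #3,
    # exposed as attributes, plus a save() that writes it back out.
    #
    #     import json
    #
    #     class Config:
    #         def __init__(self, json_path):
    #             params = (json_path if isinstance(json_path, dict)
    #                       else json.load(open(json_path)))
    #             self.__dict__.update(params)
    #
    #         def save(self, json_path):
    #             with open(json_path, 'w') as io:
    #                 json.dump(self.__dict__, io, indent=4)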

    # tokenizer
    ptr_tokenizer = BertTokenizer.from_pretrained(
        'pretrained/vocab.korean.rawtext.list', do_lower_case=False)
    with open('pretrained/vocab.pkl', mode='rb') as io:
        vocab = pickle.load(io)
    pad_sequence = PadSequence(length=model_config.length,
                               pad_val=vocab.to_indices(vocab.padding_token))
    preprocessor = PreProcessor(vocab=vocab,
                                split_fn=ptr_tokenizer.tokenize,
                                pad_fn=pad_sequence)
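    # PadSequence is also project-local. Judging by its arguments, it pads or
    # truncates a list of token indices to a fixed length, roughly:
    #
    #     class PadSequence:
    #         def __init__(self, length, pad_val):
    #             self.length, self.pad_val = length, pad_val
    #
    #         def __call__(self, indices):
    #             padded = indices[:self.length]
    #             return padded + [self.pad_val] * (self.length - len(padded))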

    # model
    config = BertConfig('pretrained/bert_config.json')
    model = BertClassifier(config,
                           num_classes=model_config.num_classes,
                           vocab=preprocessor.vocab)
    bert_pretrained = torch.load('pretrained/pytorch_model.bin')
    model.load_state_dict(bert_pretrained, strict=False)
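    # strict=False lets the encoder weights load even though the checkpoint
    # has no parameters for the classifier head. In recent PyTorch versions
    # load_state_dict returns the mismatch lists, so the effect is easy to
    # inspect:
    #
    #     result = model.load_state_dict(bert_pretrained, strict=False)
    #     print(result.missing_keys)     # e.g. the freshly initialized head
    #     print(result.unexpected_keys)  # checkpoint keys the model ignores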
Example #2

    args = parser.parse_args()
    ptr_dir = Path('pretrained')
    data_dir = Path(args.data_dir)
    model_dir = Path(args.model_dir)

    ptr_config = Config(ptr_dir / 'config_{}.json'.format(args.type))
    data_config = Config(data_dir / 'config.json')
    model_config = Config(model_dir / 'config.json')

    # vocab
    with open(ptr_config.vocab, mode='rb') as io:
        vocab = pickle.load(io)

    # tokenizer: only the split function differs between the two checkpoints
    if args.type == 'etri':
        ptr_tokenizer = ETRITokenizer.from_pretrained(ptr_config.tokenizer,
                                                      do_lower_case=False)
        split_fn = ptr_tokenizer.tokenize
    elif args.type == 'skt':
        # the SentencePiece tokenizer instance is itself callable
        ptr_tokenizer = SentencepieceTokenizer(ptr_config.tokenizer)
        split_fn = ptr_tokenizer

    pad_sequence = PadSequence(length=model_config.length,
                               pad_val=vocab.to_indices(vocab.padding_token))
    preprocessor = PreProcessor(vocab=vocab,
                                split_fn=split_fn,
                                pad_fn=pad_sequence)
Example #3
            print('You already have pytorch_model_skt_tokenizer.model')

        ptr_config = Config({'config': str(ptr_config_path),
                             'bert': str(ptr_bert_path),
                             'tokenizer': str(ptr_tokenizer_path),
                             'vocab': str(ptr_vocab_path.with_suffix('.pkl'))})
        ptr_config.save(ptr_dir / "config_skt.json")
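        # config_skt.json then looks roughly like this (exact paths come from
        # the truncated code above):
        #
        #     {
        #       "config": "pretrained/...json",
        #       "bert": "pretrained/...bin",
        #       "tokenizer": "pretrained/pytorch_model_skt_tokenizer.model",
        #       "vocab": "pretrained/....pkl"
        #     }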

    if args.type == 'etri':
        # loading BertTokenizer
        ptr_config_path = ptr_dir / 'bert_config_etri.json'
        ptr_tokenizer_path = ptr_dir / "vocab.korean.rawtext.list"
        ptr_bert_path = ptr_dir / "pytorch_model_etri.bin"

        ptr_tokenizer = ETRITokenizer.from_pretrained(
            ptr_tokenizer_path, do_lower_case=False
        )
        # generate vocab, keeping the tokenizer's token order so every index
        # still lines up with the pretrained embedding rows
        idx_to_token = list(ptr_tokenizer.vocab.keys())
        token_to_idx = {token: idx for idx, token in enumerate(idx_to_token)}

        vocab = Vocab(
            idx_to_token,
            padding_token="[PAD]",
            unknown_token="[UNK]",
            bos_token=None,
            eos_token=None,
            reserved_tokens=["[CLS]", "[SEP]", "[MASK]"],
            token_to_idx=token_to_idx,
        )
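        # Quick sanity check (assuming to_indices maps a single token to its
        # index, as the pad_val usage in the other snippets suggests):
        #
        #     assert vocab.to_indices(vocab.padding_token) == token_to_idx["[PAD]"]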