import os
from argparse import ArgumentParser
from collections import OrderedDict

import numpy
import torch
from tqdm import tqdm
from transformers import AutoConfig

# NOTE: project-local helpers used below (Model, load_labels, ner, datasets,
# collate, map2device, convert2npy) are assumed to be imported elsewhere in
# this module; they are not defined here.


def deploy_model_4_1(args):
    """Pack a trained checkpoint into the LTP 4.1.0 archive format.

    Legacy variant that hard-codes the version string; the later definition
    taking (args, version) supersedes it when both are defined in the same
    module.
    """
    from argparse import Namespace

    model = Model.load_from_checkpoint(args.resume_from_checkpoint, hparams=args)

    model_state_dict = model.state_dict()
    model_config = Namespace(**model.hparams)

    ltp_model = {
        'version': "4.1.0",
        'model': model_state_dict,
        'model_config': model_config,
        'transformer_config': model.transformer.config.to_dict(),
        'seg': ['I-W', 'B-W'],
        'pos': load_labels(os.path.join(args.pos_data_dir, 'pos_labels.txt')),
        'ner': load_labels(os.path.join(args.ner_data_dir, 'ner_labels.txt')),
        'srl': load_labels(os.path.join(args.srl_data_dir, 'srl_labels.txt')),
        'dep': load_labels(os.path.join(args.dep_data_dir, 'dep_labels.txt')),
        'sdp': load_labels(os.path.join(args.sdp_data_dir, 'deps_labels.txt')),
    }

    os.makedirs(args.ltp_model, exist_ok=True)
    torch.save(ltp_model, os.path.join(args.ltp_model, 'ltp.model'))

    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.transformer)
    tokenizer.save_pretrained(args.ltp_model)
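# Illustrative only: a minimal sketch of reading back the archive written by
# deploy_model_4_1. The dict keys and the 'ltp.model' filename mirror the
# torch.save call above; the function name and the CPU map_location are
# assumptions for this sketch, not part of LTP.
def _inspect_ltp_archive(ltp_model_dir):
    """Load a serialized bundle and report which task vocabularies it carries."""
    ckpt = torch.load(os.path.join(ltp_model_dir, 'ltp.model'), map_location='cpu')
    print('version:', ckpt['version'])
    for task in ('pos', 'ner', 'srl', 'dep', 'sdp'):
        print(task, '->', len(ckpt[task]), 'labels')
    return ckpt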
def build_ner_distill_dataset(args):
    """Run a frozen teacher over the NER training split and dump its logits
    (plus the CRF transition parameters, when present) for distillation."""
    model = Model.load_from_checkpoint(
        args.resume_from_checkpoint, hparams=args
    )
    model.eval()
    model.freeze()

    dataset, metric = ner.build_dataset(
        model, args.ner_data_dir, ner.task_info.task_name
    )
    train_dataloader = torch.utils.data.DataLoader(
        dataset[datasets.Split.TRAIN],
        batch_size=args.batch_size,
        collate_fn=collate,
        num_workers=args.num_workers
    )

    output = os.path.join(args.ner_data_dir, ner.task_info.task_name, 'output.npz')

    # Move batches to the GPU for the forward pass, but store results on the CPU.
    if torch.cuda.is_available():
        model.cuda()
        map2cpu = lambda x: map2device(x)
        map2cuda = lambda x: map2device(x, model.device)
    else:
        map2cpu = lambda x: x
        map2cuda = lambda x: x

    with torch.no_grad():
        batchs = []
        for batch in tqdm(train_dataloader):
            batch = map2cuda(batch)
            logits = model.forward(task='ner', **batch).logits
            batch.update(logits=logits)
            batchs.append(map2cpu(batch))
        try:
            # Also export the CRF parameters so a student can reproduce the
            # teacher's structured decoding.
            numpy.savez(
                output,
                data=convert2npy(batchs),
                extra=convert2npy({
                    'transitions': model.ner_classifier.crf.transitions,
                    'start_transitions': model.ner_classifier.crf.start_transitions,
                    'end_transitions': model.ner_classifier.crf.end_transitions
                })
            )
        except Exception:
            # Fall back to logits only, e.g. when the NER head has no CRF.
            numpy.savez(output, data=convert2npy(batchs))

    print("Done")
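# Illustrative only: how a student trainer might read the distillation archive
# produced by build_ner_distill_dataset. The 'data'/'extra' keys mirror the
# numpy.savez call above; allow_pickle is needed because the saved arrays wrap
# nested Python objects. The function name and unpacking are assumptions.
def _load_ner_distill_archive(output_path):
    archive = numpy.load(output_path, allow_pickle=True)
    batchs = archive['data']  # teacher batches with attached logits
    # CRF transitions are only present when the teacher's NER head had a CRF.
    crf_params = archive['extra'].item() if 'extra' in archive else None
    return batchs, crf_params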
def deploy_model_4_0(args, version):
    """Pack a trained checkpoint into the LTP 4.0.0 archive format, renaming
    training-time modules to the names the 4.0 runtime expects."""
    # Longest prefixes first, so specific heads (e.g. 'ner_classifier.classifier')
    # are rewritten before the generic 'ner_classifier' pass can touch them.
    ltp_adapter_mapper = sorted([
        ('transformer', 'pretrained'),
        ('seg_classifier', 'seg_decoder'),
        ('pos_classifier', 'pos_decoder'),
        ('ner_classifier', 'ner_decoder'),
        ('ner_classifier.classifier', 'ner_decoder.mlp'),
        ('ner_classifier.relative_transformer', 'ner_decoder.transformer'),
        ('srl_classifier', 'srl_decoder'),
        ('srl_classifier.rel_atten', 'srl_decoder.biaffine'),
        ('srl_classifier.crf', 'srl_decoder.crf'),
        ('dep_classifier', 'dep_decoder'),
        ('sdp_classifier', 'sdp_decoder'),
    ], key=lambda x: len(x[0]), reverse=True)

    model = Model.load_from_checkpoint(args.resume_from_checkpoint, hparams=args)

    model_state_dict = OrderedDict(model.state_dict().items())
    for prefix, target_prefix in ltp_adapter_mapper:
        # Anchor the replacement at the start of the key: a bare
        # str.replace(prefix, target, 1) would also rewrite already-mapped
        # keys such as 'ner_decoder.transformer.*' when the
        # ('transformer', 'pretrained') rule runs last.
        model_state_dict = {
            (target_prefix + key[len(prefix):] if key.startswith(prefix) else key): value
            for key, value in model_state_dict.items()
        }

    pos_labels = load_labels(args.pos_data_dir, 'vocabs', 'xpos.txt')
    ner_labels = load_labels(args.ner_data_dir, 'ner_labels.txt')
    srl_labels = load_labels(args.srl_data_dir, 'srl_labels.txt')
    dep_labels = load_labels(args.dep_data_dir, 'vocabs', 'deprel.txt')
    sdp_labels = load_labels(args.sdp_data_dir, 'vocabs', 'deps.txt')

    ltp_model = {
        'version': '4.0.0',
        'code_version': version,
        'seg': ['I-W', 'B-W'],
        'pos': pos_labels,
        'ner': ner_labels,
        'srl': srl_labels,
        'dep': dep_labels,
        'sdp': sdp_labels,
        'pretrained_config': model.transformer.config,
        'model_config': {
            'class': 'SimpleMultiTaskModel',
            'init': {
                'seg': {'label_num': args.seg_num_labels},
                'pos': {'label_num': args.pos_num_labels},
                'ner': {
                    'label_num': args.ner_num_labels,
                    'decoder': 'RelativeTransformer',
                    'RelativeTransformer': {
                        'num_heads': args.ner_num_heads,
                        'num_layers': args.ner_num_layers,
                        'hidden_size': args.ner_hidden_size,
                        'dropout': args.dropout
                    }
                },
                'dep': {
                    'label_num': args.dep_num_labels,
                    'decoder': 'Graph',
                    'Graph': {
                        'arc_hidden_size': args.dep_arc_hidden_size,
                        'rel_hidden_size': args.dep_rel_hidden_size,
                        'dropout': args.dropout
                    }
                },
                'sdp': {
                    'label_num': args.sdp_num_labels,
                    'decoder': 'Graph',
                    'Graph': {
                        'arc_hidden_size': args.sdp_arc_hidden_size,
                        'rel_hidden_size': args.sdp_rel_hidden_size,
                        'dropout': args.dropout
                    }
                },
                'srl': {
                    'label_num': args.srl_num_labels,
                    'decoder': 'BiLinearCRF',
                    'BiLinearCRF': {
                        'hidden_size': args.srl_hidden_size,
                        'dropout': args.dropout
                    }
                }
            }
        },
        'model': model_state_dict
    }

    os.makedirs(args.ltp_model, exist_ok=True)
    torch.save(ltp_model, os.path.join(args.ltp_model, 'ltp.model'))

    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.transformer)
    tokenizer.save_pretrained(args.ltp_model)
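# Illustrative only: what the adapter mapping in deploy_model_4_0 does to a
# few sample state-dict keys (the keys themselves are made up for this sketch):
#
#   'transformer.embeddings.word_embeddings.weight'
#       -> 'pretrained.embeddings.word_embeddings.weight'
#   'ner_classifier.relative_transformer.layers.0.weight'
#       -> 'ner_decoder.transformer.layers.0.weight'
#   'srl_classifier.crf.transitions'
#       -> 'srl_decoder.crf.transitions'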
def deploy_model_4_1(args, version):
    """Pack a trained checkpoint into the LTP 4.1 archive format, dropping the
    classifier heads of any task whose label vocabulary is empty."""
    from argparse import Namespace

    # Re-parse the model-specific arguments so the checkpoint is restored with
    # a consistent hyper-parameter namespace.
    fake_parser = ArgumentParser()
    fake_parser = Model.add_model_specific_args(fake_parser)
    model_args, _ = fake_parser.parse_known_args(namespace=args)

    transformer_config = AutoConfig.from_pretrained(model_args.transformer)
    model = Model.load_from_checkpoint(
        args.resume_from_checkpoint, strict=False,
        hparams=model_args, config=transformer_config
    )
    model_config = Namespace(**model.hparams)

    # LOAD VOCAB
    pos_labels = load_labels(args.pos_data_dir, 'vocabs', 'xpos.txt')
    ner_labels = load_labels(args.ner_data_dir, 'ner_labels.txt')
    srl_labels = load_labels(args.srl_data_dir, 'srl_labels.txt')
    dep_labels = load_labels(args.dep_data_dir, 'vocabs', 'deprel.txt')
    sdp_labels = load_labels(args.sdp_data_dir, 'vocabs', 'deps.txt')

    # MODEL CLIP: remove heads for tasks without labels so the archive only
    # ships decoders it can actually use.
    if not len(pos_labels):
        del model.pos_classifier
        model_config.pos_num_labels = 0
    if not len(ner_labels):
        del model.ner_classifier
        model_config.ner_num_labels = 0
    if not len(srl_labels):
        del model.srl_classifier
        model_config.srl_num_labels = 0
    if not len(dep_labels):
        del model.dep_classifier
        model_config.dep_num_labels = 0
    if not len(sdp_labels):
        del model.sdp_classifier
        model_config.sdp_num_labels = 0

    model_state_dict = OrderedDict(model.state_dict().items())

    ltp_model = {
        'version': version,
        'model': model_state_dict,
        'model_config': model_config,
        'transformer_config': model.transformer.config.to_dict(),
        'seg': ['I-W', 'B-W'],
        'pos': pos_labels,
        'ner': ner_labels,
        'srl': srl_labels,
        'dep': dep_labels,
        'sdp': sdp_labels,
    }

    os.makedirs(args.ltp_model, exist_ok=True)
    torch.save(ltp_model, os.path.join(args.ltp_model, 'ltp.model'))

    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.transformer)
    tokenizer.save_pretrained(args.ltp_model)
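# Illustrative only: one way these entry points could be dispatched from the
# command line. Every flag below is an assumption for this sketch; in practice
# the training script's parser also supplies the checkpoint path, data
# directories, and hyper-parameters that `args` must already carry.
if __name__ == '__main__':
    parser = ArgumentParser()
    parser = Model.add_model_specific_args(parser)
    parser.add_argument('--task', default='deploy_4_1',
                        choices=['deploy_4_0', 'deploy_4_1', 'ner_distill'])
    parser.add_argument('--code_version', default='4.1.0')
    cli_args, _ = parser.parse_known_args()
    if cli_args.task == 'deploy_4_0':
        deploy_model_4_0(cli_args, cli_args.code_version)
    elif cli_args.task == 'deploy_4_1':
        deploy_model_4_1(cli_args, cli_args.code_version)
    else:
        build_ner_distill_dataset(cli_args)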