Example #1
def deploy_model_4_1(args):
    from argparse import Namespace

    model = Model.load_from_checkpoint(args.resume_from_checkpoint,
                                       hparams=args)
    model_state_dict = model.state_dict()
    model_config = Namespace(**model.hparams)

    ltp_model = {
        'version': "4.1.0",
        'model': model_state_dict,
        'model_config': model_config,
        'transformer_config': model.transformer.config.to_dict(),
        'seg': ['I-W', 'B-W'],
        'pos': load_labels(os.path.join(args.pos_data_dir, 'pos_labels.txt')),
        'ner': load_labels(os.path.join(args.ner_data_dir, 'ner_labels.txt')),
        'srl': load_labels(os.path.join(args.srl_data_dir, 'srl_labels.txt')),
        'dep': load_labels(os.path.join(args.dep_data_dir, 'dep_labels.txt')),
        'sdp': load_labels(os.path.join(args.sdp_data_dir, 'deps_labels.txt')),
    }
    os.makedirs(args.ltp_model, exist_ok=True)
    torch.save(ltp_model, os.path.join(args.ltp_model, 'ltp.model'))

    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.transformer)
    tokenizer.save_pretrained(args.ltp_model)
Example #2
File: multitask.py  Project: zmjm4/ltp
def build_ner_distill_dataset(args):
    model = Model.load_from_checkpoint(
        args.resume_from_checkpoint, hparams=args
    )

    model.eval()
    model.freeze()

    dataset, metric = ner.build_dataset(
        model, args.ner_data_dir,
        ner.task_info.task_name
    )
    train_dataloader = torch.utils.data.DataLoader(
        dataset[datasets.Split.TRAIN],
        batch_size=args.batch_size,
        collate_fn=collate,
        num_workers=args.num_workers
    )

    output = os.path.join(args.ner_data_dir, ner.task_info.task_name, 'output.npz')

    if torch.cuda.is_available():
        model.cuda()
        map2cpu = lambda x: map2device(x)
        map2cuda = lambda x: map2device(x, model.device)
    else:
        map2cpu = lambda x: x
        map2cuda = lambda x: x

    with torch.no_grad():
        batches = []
        for batch in tqdm(train_dataloader):
            batch = map2cuda(batch)
            logits = model.forward(task='ner', **batch).logits
            batch.update(logits=logits)
            batches.append(map2cpu(batch))
        try:
            # Also export the CRF transition parameters so the distilled model
            # can reuse them for decoding.
            numpy.savez(
                output,
                data=convert2npy(batches),
                extra=convert2npy({
                    'transitions': model.ner_classifier.crf.transitions,
                    'start_transitions': model.ner_classifier.crf.start_transitions,
                    'end_transitions': model.ner_classifier.crf.end_transitions
                })
            )
        except Exception:
            numpy.savez(output, data=convert2npy(batches))

    print("Done")
Example #3
def deploy_model_4_0(args, version):
    ltp_adapter_mapper = sorted([
        ('transformer', 'pretrained'),
        ('seg_classifier', 'seg_decoder'),
        ('pos_classifier', 'pos_decoder'),
        ('ner_classifier', 'ner_decoder'),
        ('ner_classifier.classifier', 'ner_decoder.mlp'),
        ('ner_classifier.relative_transformer', 'ner_decoder.transformer'),
        ('srl_classifier', 'srl_decoder'),
        ('srl_classifier.rel_atten', 'srl_decoder.biaffine'),
        ('srl_classifier.crf', 'srl_decoder.crf'),
        ('dep_classifier', 'dep_decoder'),
        ('sdp_classifier', 'sdp_decoder'),
    ],
                                key=lambda x: len(x[0]),
                                reverse=True)

    model = Model.load_from_checkpoint(args.resume_from_checkpoint,
                                       hparams=args)
    model_state_dict = OrderedDict(model.state_dict().items())
    for prefix, target_prefix in ltp_adapter_mapper:
        model_state_dict = {
            key.replace(prefix, target_prefix, 1): value
            for key, value in model_state_dict.items()
        }

    pos_labels = load_labels(args.pos_data_dir, 'vocabs', 'xpos.txt')
    ner_labels = load_labels(args.ner_data_dir, 'ner_labels.txt')
    srl_labels = load_labels(args.srl_data_dir, 'srl_labels.txt')
    dep_labels = load_labels(args.dep_data_dir, 'vocabs', 'deprel.txt')
    sdp_labels = load_labels(args.sdp_data_dir, 'vocabs', 'deps.txt')

    ltp_model = {
        'version': '4.0.0',
        'code_version': version,
        'seg': ['I-W', 'B-W'],
        'pos': pos_labels,
        'ner': ner_labels,
        'srl': srl_labels,
        'dep': dep_labels,
        'sdp': sdp_labels,
        'pretrained_config': model.transformer.config,
        'model_config': {
            'class': 'SimpleMultiTaskModel',
            'init': {
                'seg': {
                    'label_num': args.seg_num_labels
                },
                'pos': {
                    'label_num': args.pos_num_labels
                },
                'ner': {
                    'label_num': args.ner_num_labels,
                    'decoder': 'RelativeTransformer',
                    'RelativeTransformer': {
                        'num_heads': args.ner_num_heads,
                        'num_layers': args.ner_num_layers,
                        'hidden_size': args.ner_hidden_size,
                        'dropout': args.dropout
                    }
                },
                'dep': {
                    'label_num': args.dep_num_labels,
                    'decoder': 'Graph',
                    'Graph': {
                        'arc_hidden_size': args.dep_arc_hidden_size,
                        'rel_hidden_size': args.dep_rel_hidden_size,
                        'dropout': args.dropout
                    }
                },
                'sdp': {
                    'label_num': args.sdp_num_labels,
                    'decoder': 'Graph',
                    'Graph': {
                        'arc_hidden_size': args.sdp_arc_hidden_size,
                        'rel_hidden_size': args.sdp_rel_hidden_size,
                        'dropout': args.dropout
                    }
                },
                'srl': {
                    'label_num': args.srl_num_labels,
                    'decoder': 'BiLinearCRF',
                    'BiLinearCRF': {
                        'hidden_size': args.srl_hidden_size,
                        'dropout': args.dropout
                    }
                }
            }
        },
        'model': model_state_dict
    }
    os.makedirs(args.ltp_model, exist_ok=True)
    torch.save(ltp_model, os.path.join(args.ltp_model, 'ltp.model'))

    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.transformer)
    tokenizer.save_pretrained(args.ltp_model)
Example #4
def deploy_model_4_1(args, version):
    from argparse import ArgumentParser, Namespace
    from transformers import AutoConfig

    fake_parser = ArgumentParser()
    fake_parser = Model.add_model_specific_args(fake_parser)
    model_args, _ = fake_parser.parse_known_args(namespace=args)

    transformer_config = AutoConfig.from_pretrained(model_args.transformer)
    model = Model.load_from_checkpoint(args.resume_from_checkpoint,
                                       strict=False,
                                       hparams=model_args,
                                       config=transformer_config)

    model_config = Namespace(**model.hparams)
    # LOAD VOCAB
    pos_labels = load_labels(args.pos_data_dir, 'vocabs', 'xpos.txt')
    ner_labels = load_labels(args.ner_data_dir, 'ner_labels.txt')
    srl_labels = load_labels(args.srl_data_dir, 'srl_labels.txt')
    dep_labels = load_labels(args.dep_data_dir, 'vocabs', 'deprel.txt')
    sdp_labels = load_labels(args.sdp_data_dir, 'vocabs', 'deps.txt')

    # MODEL CLIP
    if not len(pos_labels):
        del model.pos_classifier
        model_config.pos_num_labels = 0

    if not len(ner_labels):
        del model.ner_classifier
        model_config.ner_num_labels = 0

    if not len(srl_labels):
        del model.srl_classifier
        model_config.srl_num_labels = 0

    if not len(dep_labels):
        del model.dep_classifier
        model_config.dep_num_labels = 0

    if not len(sdp_labels):
        del model.sdp_classifier
        model_config.sdp_num_labels = 0

    model_state_dict = OrderedDict(model.state_dict().items())

    ltp_model = {
        'version': version,
        'model': model_state_dict,
        'model_config': model_config,
        'transformer_config': model.transformer.config.to_dict(),
        'seg': ['I-W', 'B-W'],
        'pos': pos_labels,
        'ner': ner_labels,
        'srl': srl_labels,
        'dep': dep_labels,
        'sdp': sdp_labels,
    }
    os.makedirs(args.ltp_model, exist_ok=True)
    torch.save(ltp_model, os.path.join(args.ltp_model, 'ltp.model'))

    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.transformer)
    tokenizer.save_pretrained(args.ltp_model)