def main(args):
    """Evaluate an MGDST checkpoint on the dataset selected by ``args.dataset``.

    Imports the dataset-specific utilities, tokenizes the test split,
    restores the model weights and runs ``model_evaluation``.
    """
    # Each supported corpus ships its own preprocessing / labelling helpers.
    if args.dataset == 'sim-R':
        from BERTDST_utils.simR_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    elif args.dataset == 'sim-M':
        from BERTDST_utils.simM_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    elif args.dataset == 'DSTC2':
        from BERTDST_utils.DSTC2_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    elif args.dataset == 'WOZ2.0':
        from BERTDST_utils.WOZ_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    elif args.dataset == 'MultiWOZ2.1':
        from BERTDST_utils.MultiWOZ_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, OP, make_slot_meta
        # MultiWOZ derives its slot list from the ontology file instead of
        # shipping a fixed SLOT constant.
        ontology = json.load(open(args.ontology_data_path))
        SLOT, ontology = make_slot_meta(ontology)
    else:
        # Previously an unknown dataset fell through to a NameError on
        # SLOT/OP below; fail with an explicit message instead.
        raise ValueError('Unsupported dataset: {}'.format(args.dataset))

    slot_meta = SLOT
    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)
    data = prepare_dataset(1.0, args.test_data_path, tokenizer, slot_meta,
                           args.test_size_window, args.max_seq_length,
                           args.test_MG)

    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = 0.1
    op2id = OP
    model = MGDST(model_config, len(op2id), len(slot_meta))
    ckpt = torch.load(args.model_ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt)

    model.eval()
    model.to(device)  # NOTE(review): `device` is assumed to be a module-level global — confirm

    model_evaluation(make_turn_label, postprocessing, state_equal, OP, model,
                     data, tokenizer, slot_meta, 0, args.test_size_window,
                     args.test_MG)
예제 #2
0
    def __init__(self):
        """Load a frozen BERT encoder plus its masked-LM head for inference.

        Prefers the system-wide model directory and falls back to the
        repository-relative copy when that is absent.
        """
        model_dir = '/var/model/bert'
        if not os.path.isdir(model_dir):
            # Fall back to the checkout-local model directory.
            model_dir = os.path.abspath(os.path.dirname(__file__) + '/../../var/model/bert')

        self.use_gpu: bool = torch.cuda.is_available()
        self.config: BertConfig = BertConfig.from_json_file(model_dir + '/config.json')
        self.tokenizer: BertTokenizer = BertTokenizer.from_pretrained(model_dir + '/vocab.txt', do_lower_case=False)

        self.model_masked: BertForMaskedLM = BertForMaskedLM.from_pretrained(model_dir + '/model.bin', config=self.config)
        # The plain encoder is the `.bert` submodule of the MLM model.
        self.model: BertModel = self.model_masked.bert

        # Inference only: freeze every parameter and switch to eval mode.
        for module in (self.model, self.model_masked):
            for weight in module.parameters():
                weight.requires_grad = False

        self.model.encoder.output_hidden_states = True
        self.model.eval()
        self.model_masked.eval()

        if self.use_gpu:
            self.model.cuda()
            self.model_masked.cuda()
예제 #3
0
    def __init__(self,
                 *,
                 pretrained_model_name=None,
                 config_filename=None,
                 vocab_size=None,
                 hidden_size=768,
                 num_hidden_layers=12,
                 num_attention_heads=12,
                 intermediate_size=3072,
                 hidden_act="gelu",
                 max_position_embeddings=512,
                 random_init=False,
                 **kwargs):
        """Build a BERT encoder from exactly one configuration source.

        Exactly one of ``pretrained_model_name`` (download published
        weights), ``config_filename`` (build from a JSON config) or
        ``vocab_size`` (build from the explicit hyperparameters) must be
        given.  ``random_init=True`` re-initializes all weights afterwards.
        """
        TrainableNM.__init__(self, **kwargs)

        # Enforce that exactly one of the three mutually exclusive
        # configuration sources was supplied.
        total = sum(source is not None for source in
                    (pretrained_model_name, config_filename, vocab_size))

        if total != 1:
            raise ValueError(
                "Only one of pretrained_model_name, vocab_size, " +
                "or config_filename should be passed into the " +
                "BERT constructor.")

        if vocab_size is not None:
            config = BertConfig(
                vocab_size_or_config_json_file=vocab_size,
                hidden_size=hidden_size,
                num_hidden_layers=num_hidden_layers,
                num_attention_heads=num_attention_heads,
                intermediate_size=intermediate_size,
                hidden_act=hidden_act,
                max_position_embeddings=max_position_embeddings)
            model = BertModel(config)
        elif pretrained_model_name is not None:
            model = BertModel.from_pretrained(pretrained_model_name)
        elif config_filename is not None:
            config = BertConfig.from_json_file(config_filename)
            model = BertModel(config)
        else:
            # Unreachable given the `total != 1` check above; kept as a
            # defensive guard.  Fixed the missing space that previously
            # produced "mustbe passed".
            raise ValueError(
                "Either pretrained_model_name or vocab_size must " +
                "be passed into the BERT constructor")

        model.to(self._device)

        self.add_module("bert", model)
        self.config = model.config

        if random_init:
            self.apply(
                lambda module: transformer_weights_init(module, xavier=False))
예제 #4
0
def start(check_accr=False):
    """Run the full cloze pipeline: build the model, load pretrained
    weights, generate probabilities and write the final result file.

    When ``check_accr`` is true, also verify the produced results.
    """
    cloze_model = BertCloze(BertConfig.from_json_file(config.bert_config_root),
                            num_choices=10)
    load_model(cloze_model, config.pretrained_bert_root)
    generate_prob(cloze_model)
    generate_result(i_range=5)
    if check_accr:
        check_result()
    print("程序运行完成")
예제 #5
0
파일: CNN.py 프로젝트: Inaguma1110/span_NER
    def __init__(self, config, vocab):
        """Wrap a published Japanese SentencePiece BERT: load its JSON
        config, tokenizer vocabulary and pretrained encoder weights."""
        super(BERT_PRETRAINED_MODEL_JAPANESE, self).__init__()

        self.config = config
        self.vocab = vocab

        # Published-model artefacts (config / vocab / weights) live outside
        # the repository.
        self.BERT_config = BertConfig.from_json_file('../published_model/bert_spm/bert_config.json')
        self.tokenizer = BertTokenizer.from_pretrained('./spm_model/wiki-ja.vocab.txt')
        self.pretrained_BERT_model = BertModel.from_pretrained(
            '../published_model/bert_spm/pytorch_model.bin', config=self.BERT_config)
예제 #6
0
def main(args):
    """Evaluate a trained TransformerDST checkpoint on the test split.

    With ``args.eval_all`` set, sweeps every combination of the three
    ground-truth flags (gt_op, gt_p_state, gt_gen); otherwise evaluates
    once with the flags taken from ``args``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    ontology = json.load(open(os.path.join(args.data_root,
                                           args.ontology_data)))
    slot_meta, _ = make_slot_meta(ontology)

    tokenizer = BertTokenizer.from_pretrained(args.bert_config)
    special_tokens = ['[SLOT]', '[NULL]']
    special_tokens_dict = {'additional_special_tokens': special_tokens}
    tokenizer.add_special_tokens(special_tokens_dict)

    data = prepare_dataset(data_path=os.path.join(args.data_root,
                                                  args.test_data),
                           data_list=None,
                           tokenizer=tokenizer,
                           slot_meta=slot_meta,
                           n_history=args.n_history,
                           max_seq_length=args.max_seq_length,
                           op_code=args.op_code)

    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = 0.1
    op2id = OP_SET[args.op_code]
    model = TransformerDST(model_config, len(op2id), len(domain2id),
                           op2id['update'])
    ckpt = torch.load(args.model_ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt)

    model.eval()
    model.to(device)

    if args.eval_all:
        # (gt_op, gt_p_state, gt_gen) combinations, kept in the original
        # reporting order (note it is not plain lexicographic order).
        flag_combos = [
            (False, False, False), (False, False, True),
            (False, True, False), (False, True, True),
            (True, False, False), (True, True, False),
            (True, False, True), (True, True, True),
        ]
        for gt_op, gt_p_state, gt_gen in flag_combos:
            model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                             gt_op, gt_p_state, gt_gen)
    else:
        model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                         args.gt_op, args.gt_p_state, args.gt_gen)
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file,
                                     pytorch_dump_path):
    """Convert a TensorFlow BERT checkpoint into a PyTorch state dict.

    Builds a ``BertForPreTraining`` model from ``bert_config_file``, copies
    the TF weights into it, and saves the result at ``pytorch_dump_path``.
    """
    # Build an (uninitialised) PyTorch model from the JSON configuration.
    bert_config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(bert_config)))
    pt_model = BertForPreTraining(bert_config)

    # Copy every tensor from the TF checkpoint into the PyTorch model.
    load_tf_weights_in_bert(pt_model, bert_config, tf_checkpoint_path)

    # Persist only the weights (state dict), not the module object.
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(pt_model.state_dict(), pytorch_dump_path)
예제 #8
0
    def __init__(self, large, temp_dir, finetune=False):
        """Load a BERT encoder: a local 'bert-large' checkpoint when
        ``large`` is true, otherwise the published multilingual base model.

        ``finetune`` is stored for later use by the owner of this module.
        """
        super(Bert, self).__init__()

        if large:
            # Local checkpoint with an explicit JSON config.
            config = BertConfig.from_json_file('bert-large/config.json')
            self.model = BertModel.from_pretrained('bert-large',
                                                   cache_dir=None,
                                                   config=config)
        else:
            self.model = BertModel.from_pretrained(
                'bert-base-multilingual-cased', cache_dir=temp_dir)

        self.finetune = finetune
예제 #9
0
    def __init__(self,
                 *,
                 backbone,
                 neck,
                 rpn_head,
                 text_bbox_roi_extractor,
                 text_bbox_head,
                 text_mask_roi_extractor,
                 text_mask_head,
                 char_bbox_roi_extractor,
                 char_bbox_head,
                 crm_cfg,
                 train_cfg=None,
                 test_cfg=None,
                 pretrained=None,
                 lm_cfg=None):
        """Assemble the AE_TextSpotter detector from its config dicts.

        Builds the shared backbone/neck/RPN, the text detection heads, the
        character-based recognition heads, and — only when ``lm_cfg`` is
        given — an optional BERT-backed language module.  Weight
        initialisation happens last via ``init_weights``.
        """
        super(AE_TextSpotter, self).__init__()
        # Shared feature extraction and region proposals.
        self.backbone = builder.build_backbone(backbone)
        self.neck = builder.build_neck(neck)
        self.rpn_head = builder.build_head(rpn_head)

        # text detection module
        self.text_bbox_roi_extractor = builder.build_roi_extractor(
            text_bbox_roi_extractor)
        self.text_bbox_head = builder.build_head(text_bbox_head)
        self.text_mask_roi_extractor = builder.build_roi_extractor(
            text_mask_roi_extractor)
        self.text_mask_head = builder.build_head(text_mask_head)

        # character-based recognition module
        self.char_bbox_roi_extractor = builder.build_roi_extractor(
            char_bbox_roi_extractor)
        self.char_bbox_head = builder.build_head(char_bbox_head)
        self.crm_cfg = crm_cfg
        # Maps predicted class labels back to characters.
        self.label2char = mmcv.load(crm_cfg.char_dict_file)['label2char']

        # language module
        # Optional: only constructed when a language-model config is given;
        # presumably downstream code checks for the presence of `lm_cfg`.
        if lm_cfg is not None:
            self.lm_cfg = lm_cfg
            self.dictmap = mmcv.load(lm_cfg.dictmap_file)
            self.bert_tokenizer = BertTokenizer.from_pretrained(
                lm_cfg.bert_vocab_file)
            self.bert_model = BertModel.from_pretrained(
                lm_cfg.bert_model_file,
                config=BertConfig.from_json_file(lm_cfg.bert_cfg_file))
            self.lang_model = GRUFC(**lm_cfg.lang_model)

        self.train_cfg = train_cfg
        self.test_cfg = test_cfg
        # Initialise weights after all submodules exist.
        self.init_weights(pretrained=pretrained)
예제 #10
0
def convert_ckpt_compatible(ckpt_path, config_path):
    """Convert a legacy BERT checkpoint to the current naming scheme.

    Renames LayerNorm parameters (``gamma`` -> ``weight``, ``beta`` ->
    ``bias``), loads the result into a ``BertForPreTraining`` model, and
    returns the state dict of its encoder (``.bert``) submodule.
    """
    ckpt = torch.load(ckpt_path, map_location='cpu')
    # Snapshot the keys first: the dict is mutated while renaming.
    for key in list(ckpt.keys()):
        if 'LayerNorm' not in key:
            continue
        if 'gamma' in key:
            renamed = key.replace('gamma', 'weight')
        else:
            renamed = key.replace('beta', 'bias')
        ckpt[renamed] = ckpt.pop(key)

    model = BertForPreTraining(BertConfig.from_json_file(config_path))
    model.load_state_dict(ckpt)
    return model.bert.state_dict()
예제 #11
0
    def __init__(self, args, temp_dir, finetune=False):
        """Load the BERT encoder named by ``args.pretrained_model_type``.

        Supports the published uncased (multilingual) models and a local
        'rubert-deeppavlov' checkpoint resolved through ``mapper``.  Resizes
        the token embeddings to match the project tokenizer.

        Raises:
            NotImplementedError: if the model type is not recognised.
        """
        super(Bert, self).__init__()

        # Initialise to None so an unmatched model type hits the explicit
        # NotImplementedError below instead of raising AttributeError on
        # `self.model`.
        self.model = None
        if args.pretrained_model_type in ['bert-base-uncased', 'bert-base-multilingual-uncased']:
            self.model = BertModel.from_pretrained(args.pretrained_model_type, cache_dir=temp_dir)
        if args.pretrained_model_type in ['rubert-deeppavlov']:
            name = args.pretrained_model_type
            config = BertConfig.from_json_file(mapper(name, 'config'))
            self.model = BertModel.from_pretrained(mapper(name, 'model'), config=config)

        if not self.model:
            raise NotImplementedError("self.model")

        # Match the embedding table to the (possibly extended) tokenizer.
        bert_data = BertData(args)
        self.model.resize_token_embeddings(len(bert_data.tokenizer))
        self.finetune = finetune
예제 #12
0
def main(args):
    """Evaluate a trained TransformerDST checkpoint on the test split.

    With ``args.eval_all`` set, sweeps every combination of the three
    ground-truth flags (gt_op, gt_p_state, gt_gen); otherwise evaluates
    once with the flags taken from ``args``.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    ontology = json.load(open(os.path.join(args.data_root,
                                           args.ontology_data)))
    slot_meta, _ = make_slot_meta(ontology)
    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)
    data = prepare_dataset(os.path.join(args.data_root,
                                        args.test_data), tokenizer, slot_meta,
                           args.n_history, args.max_seq_length, args.op_code)

    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = 0.1
    op2id = OP_SET[args.op_code]
    model = TransformerDST(model_config, len(op2id), len(domain2id),
                           op2id['update'])
    ckpt = torch.load(args.model_ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt)

    model.eval()
    model.to(device)

    if args.eval_all:
        # (gt_op, gt_p_state, gt_gen) combinations, kept in the original
        # reporting order (note it is not plain lexicographic order).
        flag_combos = [
            (False, False, False), (False, False, True),
            (False, True, False), (False, True, True),
            (True, False, False), (True, True, False),
            (True, False, True), (True, True, True),
        ]
        for gt_op, gt_p_state, gt_gen in flag_combos:
            model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                             gt_op, gt_p_state, gt_gen)
    else:
        model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code,
                         args.gt_op, args.gt_p_state, args.gt_gen)
def _get_custom_bert(pretrained_weights):
    """Load a plain ``BertModel`` from a directory of converted weights.

    Converts the original checkpoint on first use, strips the ``bert.``
    prefix from encoder keys and drops the pretraining (``cls``) head
    before loading the state dict.
    """
    model_fname = 'pytorch_model.bin'
    model_fpath = os.path.join(pretrained_weights, model_fname)
    # Check the target path directly instead of scanning the whole
    # directory listing (os.listdir also raised if the dir was missing).
    if not os.path.exists(model_fpath):
        convert(pretrained_weights)
    config_fpath = os.path.join(pretrained_weights, 'bert_config.json')
    config = BertConfig.from_json_file(config_fpath)
    custom_bert = BertModel(config)
    state_dict = torch.load(model_fpath)

    def _remove_prefix(string):
        # Checkpoints saved from BertForPreTraining prefix encoder weights
        # with 'bert.'.
        prefix = 'bert.'
        if string.startswith(prefix):
            string = string[len(prefix):]
        return string

    # Keep only encoder weights, renamed to BertModel's key layout.
    state_dict = {
        _remove_prefix(k): v
        for k, v in state_dict.items() if not k.startswith('cls')
    }
    custom_bert.load_state_dict(state_dict)
    return custom_bert
예제 #14
0
def main(args):
    """Train a TransformerDST model on MultiWOZ-style data.

    Prepares (and caches) the train/dev/test splits, builds the model on
    top of a BERT encoder with an extended (size-4) token-type vocabulary,
    trains for ``args.n_epochs`` epochs and periodically evaluates on dev,
    running the test split whenever the dev joint accuracy improves.
    """
    assert args.use_one_optim is True

    if args.use_cls_only:
        args.no_dial = True

    print("### use_cls_only: {:}".format(args.use_cls_only))
    print("### no_dial: {:}".format(args.no_dial))

    if args.recover_e > 0:
        raise NotImplementedError("This option is from my oldest code version. "
                                  "I have not checked it for this code version.")

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)
        print("### mkdir {:}".format(args.save_dir))

    def worker_init_fn(worker_id):
        # Give each DataLoader worker a distinct, reproducible seed.
        np.random.seed(args.random_seed + worker_id)

    n_gpu = 0
    if torch.cuda.is_available() and (not args.use_cpu):
        n_gpu = torch.cuda.device_count()
        device = torch.device('cuda')
        print("### Device: {:}".format(device))
    else:
        print("### Use CPU (Debugging)")
        device = torch.device("cpu")

    if args.random_seed < 0:
        print("### Pick a random seed")
        args.random_seed = random.sample(list(range(0, 100000)), 1)[0]

    print("### Random Seed: {:}".format(args.random_seed))
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    rng = random.Random(args.random_seed)
    torch.manual_seed(args.random_seed)

    if n_gpu > 0:
        if args.random_seed >= 0:
            torch.cuda.manual_seed(args.random_seed)
            torch.cuda.manual_seed_all(args.random_seed)

        # Trade speed for reproducibility on CUDA.
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)

    ontology = json.load(open(args.ontology_data))
    slot_meta, ontology = make_slot_meta(ontology)
    op2id = OP_SET[args.op_code]
    print(op2id)

    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)

    # Preprocessed splits are cached as .pt files to skip re-tokenization.
    train_path = os.path.join(args.data_root, "train.pt")
    dev_path = os.path.join(args.data_root, "dev.pt")
    test_path = os.path.join(args.data_root, "test.pt")

    if not os.path.exists(test_path):
        test_data_raw = prepare_dataset(data_path=args.test_data_path,
                                        tokenizer=tokenizer,
                                        slot_meta=slot_meta,
                                        n_history=args.n_history,
                                        max_seq_length=args.max_seq_length,
                                        op_code=args.op_code)
        torch.save(test_data_raw, test_path)
    else:
        test_data_raw = torch.load(test_path)

    print("# test examples %d" % len(test_data_raw))

    if not os.path.exists(train_path):
        train_data_raw = prepare_dataset(data_path=args.train_data_path,
                                         tokenizer=tokenizer,
                                         slot_meta=slot_meta,
                                         n_history=args.n_history,
                                         max_seq_length=args.max_seq_length,
                                         op_code=args.op_code)

        torch.save(train_data_raw, train_path)
    else:
        train_data_raw = torch.load(train_path)

    train_data = MultiWozDataset(train_data_raw,
                                 tokenizer,
                                 slot_meta,
                                 args.max_seq_length,
                                 rng,
                                 ontology,
                                 args.word_dropout,
                                 args.shuffle_state,
                                 args.shuffle_p, pad_id=tokenizer.convert_tokens_to_ids(['[PAD]'])[0],
                                 slot_id=tokenizer.convert_tokens_to_ids(['[SLOT]'])[0],
                                 decoder_teacher_forcing=args.decoder_teacher_forcing,
                                 use_full_slot=args.use_full_slot,
                                 use_dt_only=args.use_dt_only, no_dial=args.no_dial,
                                 use_cls_only=args.use_cls_only)

    print("# train examples %d" % len(train_data_raw))

    if not os.path.exists(dev_path):
        dev_data_raw = prepare_dataset(data_path=args.dev_data_path,
                                       tokenizer=tokenizer,
                                       slot_meta=slot_meta,
                                       n_history=args.n_history,
                                       max_seq_length=args.max_seq_length,
                                       op_code=args.op_code)
        torch.save(dev_data_raw,  dev_path)
    else:
        dev_data_raw = torch.load(dev_path)

    print("# dev examples %d" % len(dev_data_raw))

    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = args.dropout
    model_config.attention_probs_dropout_prob = args.attention_probs_dropout_prob
    model_config.hidden_dropout_prob = args.hidden_dropout_prob

    # Four token types: BERT's original two plus two added segments.
    type_vocab_size = 4
    dec_config = args
    model = TransformerDST(model_config, dec_config, len(op2id), len(domain2id),
                           op2id['update'],
                           tokenizer.convert_tokens_to_ids(['[MASK]'])[0],
                           tokenizer.convert_tokens_to_ids(['[SEP]'])[0],
                           tokenizer.convert_tokens_to_ids(['[PAD]'])[0],
                           tokenizer.convert_tokens_to_ids(['-'])[0],
                           type_vocab_size, args.exclude_domain)

    if not os.path.exists(args.bert_ckpt_path):
        args.bert_ckpt_path = download_ckpt(args.bert_ckpt_path, args.bert_config_path, 'assets')

    state_dict = torch.load(args.bert_ckpt_path, map_location='cpu')
    # Grow the pretrained 2-row token-type embedding to 4 rows, seeding the
    # two new rows from row 0.
    _k = 'embeddings.token_type_embeddings.weight'
    print("config.type_vocab_size != state_dict[bert.embeddings.token_type_embeddings.weight] ({0} != {1})".format(
            type_vocab_size, state_dict[_k].shape[0]))
    state_dict[_k].resize_(
        type_vocab_size, state_dict[_k].shape[1])
    state_dict[_k].data[2, :].copy_(state_dict[_k].data[0, :])
    state_dict[_k].data[3, :].copy_(state_dict[_k].data[0, :])
    model.bert.load_state_dict(state_dict)
    print("\n### Done Load BERT")
    sys.stdout.flush()

    # re-initialize added special tokens ([SLOT], [NULL], [EOS])
    model.bert.embeddings.word_embeddings.weight.data[1].normal_(mean=0.0, std=0.02)
    model.bert.embeddings.word_embeddings.weight.data[2].normal_(mean=0.0, std=0.02)
    model.bert.embeddings.word_embeddings.weight.data[3].normal_(mean=0.0, std=0.02)

    # re-initialize seg-2, seg-3
    model.bert.embeddings.token_type_embeddings.weight.data[2].normal_(mean=0.0, std=0.02)
    model.bert.embeddings.token_type_embeddings.weight.data[3].normal_(mean=0.0, std=0.02)
    model.to(device)

    num_train_steps = int(len(train_data_raw) / args.batch_size * args.n_epochs)

    if args.use_one_optim:
        print("### Use One Optim")
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(
                nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(
                nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.enc_lr)
        scheduler = WarmupLinearSchedule(optimizer, int(num_train_steps * args.enc_warmup),
                                             t_total=num_train_steps)
    else:
        # Separate optimizers: BERT encoder vs everything else.
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        enc_param_optimizer = list(model.bert.named_parameters())  # TODO: For BERT only
        print('### Optim BERT: {:}'.format(len(enc_param_optimizer)))
        enc_optimizer_grouped_parameters = [
            {'params': [p for n, p in enc_param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in enc_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]

        enc_optimizer = AdamW(enc_optimizer_grouped_parameters, lr=args.enc_lr)
        enc_scheduler = WarmupLinearSchedule(enc_optimizer, int(num_train_steps * args.enc_warmup),
                                             t_total=num_train_steps)

        dec_param_optimizer = list(model.named_parameters())  # TODO:  For other parameters
        print('### Optim All: {:}'.format(len(dec_param_optimizer)))
        dec_param_optimizer = [p for (n, p) in dec_param_optimizer if 'bert' not in n]
        print('### Optim OTH: {:}'.format(len(dec_param_optimizer)))
        dec_optimizer = AdamW(dec_param_optimizer, lr=args.dec_lr)
        dec_scheduler = WarmupLinearSchedule(dec_optimizer, int(num_train_steps * args.dec_warmup),
                                             t_total=num_train_steps)

    if args.recover_e > 0:
        model_recover, enc_recover, dec_recover = load(args, str(args.recover_e))
        print("### Recover Model E{:}".format(args.recover_e))
        sys.stdout.flush()
        model.load_state_dict(model_recover)
        print("### Recover Optim E{:}".format(args.recover_e))
        sys.stdout.flush()
        enc_optimizer.load_state_dict(enc_recover)
        # BUGFIX: previously passed the optimizer itself instead of the
        # recovered state dict (dec_optimizer.load_state_dict(dec_optimizer)).
        dec_optimizer.load_state_dict(dec_recover)

    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.batch_size,
                                  collate_fn=train_data.collate_fn,
                                  num_workers=args.num_workers,
                                  worker_init_fn=worker_init_fn)

    loss_fnc = nn.CrossEntropyLoss()
    best_score = {'epoch': 0, 'joint_acc': 0, 'op_acc': 0, 'final_slot_f1': 0}

    start_time = time.time()

    for epoch in range(args.n_epochs):
        batch_loss = []
        model.train()
        for step, batch in enumerate(train_dataloader):

            # Move tensors to the device; leave ints/dicts/lists/arrays as-is.
            batch = [b.to(device) if (not isinstance(b, int)) and (not isinstance(b, dict) and (not isinstance(b, list)) and (not isinstance(b, np.ndarray))) else b for b in batch]

            input_ids_p, segment_ids_p, input_mask_p, \
            state_position_ids, op_ids, domain_ids, input_ids_g, segment_ids_g, position_ids_g, input_mask_g, \
            masked_pos, masked_weights, lm_label_ids, id_n_map, gen_max_len, n_total_pred = batch

            domain_scores, state_scores, loss_g = model(input_ids_p, segment_ids_p, input_mask_p, state_position_ids,
                input_ids_g, segment_ids_g, position_ids_g, input_mask_g,
                masked_pos, masked_weights, lm_label_ids, id_n_map, gen_max_len, only_pred_op=args.only_pred_op, n_gpu=n_gpu)

            # Normalise the generation loss by the number of predictions.
            if n_total_pred > 0:
                loss_g = loss_g.sum() / n_total_pred
            else:
                loss_g = 0

            loss_s = loss_fnc(state_scores.view(-1, len(op2id)), op_ids.view(-1))

            if args.only_pred_op:
                loss = loss_s
            else:
                loss = loss_s + loss_g

            if args.exclude_domain is not True:
                loss_d = loss_fnc(domain_scores.view(-1, len(domain2id)), domain_ids.view(-1))
                loss = loss + loss_d

            batch_loss.append(loss.item())

            loss.backward()

            if args.use_one_optim:
                optimizer.step()
                scheduler.step()
            else:
                enc_optimizer.step()
                enc_scheduler.step()
                dec_optimizer.step()
                dec_scheduler.step()

            model.zero_grad()

            if step % 100 == 0:
                # loss_g is a tensor unless n_total_pred was 0 (then it is 0).
                try:
                    loss_g = loss_g.item()
                except AttributeError:
                    loss_g = loss_g

                if args.exclude_domain is not True:
                    print("time %.1f min, [%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f, dom_loss : %.3f" \
                          % ((time.time()-start_time)/60, epoch+1, args.n_epochs, step,
                             len(train_dataloader), np.mean(batch_loss),
                             loss_s.item(), loss_g, loss_d.item()))
                else:
                    print("time %.1f min, [%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f" \
                          % ((time.time()-start_time)/60, epoch+1, args.n_epochs, step,
                             len(train_dataloader), np.mean(batch_loss),
                             loss_s.item(), loss_g))

                sys.stdout.flush()
                batch_loss = []

        if args.use_one_optim:
            save(args, epoch + 1, model, optimizer)
        else:
            save(args, epoch + 1, model, enc_optimizer, dec_optimizer)

        if ((epoch+1) % args.eval_epoch == 0) and (epoch+1 >= 8):
            eval_res = model_evaluation(model, dev_data_raw, tokenizer, slot_meta, epoch+1, args.op_code,
                                        use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only, no_dial=args.no_dial, use_cls_only=args.use_cls_only, n_gpu=n_gpu)
            print("### Epoch {:} Score : ".format(epoch+1), eval_res)

            if eval_res['joint_acc'] > best_score['joint_acc']:
                best_score = eval_res
                print("### Best Joint Acc: {:} ###".format(best_score['joint_acc']))
                print('\n')

                if epoch+1 >= 8:  # To speed up
                    eval_res_test = model_evaluation(model, test_data_raw, tokenizer, slot_meta, epoch + 1, args.op_code,
                                                     use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only, no_dial=args.no_dial, use_cls_only=args.use_cls_only, n_gpu=n_gpu)
                    print("### Epoch {:} Test Score : ".format(epoch + 1), eval_res_test)
예제 #15
0
def main(args):
    """Train SOM-DST on MultiWOZ: fine-tune a BERT encoder with a state-operation
    classifier and a copy-based value decoder, evaluate on dev every
    ``args.eval_epoch`` epochs, and finally run a suite of oracle/non-oracle
    evaluations on the test set with the best dev checkpoint.

    Args:
        args: parsed command-line namespace; fields read here include data
            paths, ``random_seed``, ``batch_size``, ``n_epochs``, ``op_code``,
            dropout rates, learning rates/warmups and ``exclude_domain``.
    """
    def worker_init_fn(worker_id):
        # Give each DataLoader worker its own reproducible numpy seed.
        np.random.seed(args.random_seed + worker_id)

    n_gpu = 0
    if torch.cuda.is_available():
        n_gpu = torch.cuda.device_count()
    # Seed every RNG in play (numpy, stdlib random, a private Random for
    # teacher forcing, torch CPU/GPU) for reproducibility.
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    rng = random.Random(args.random_seed)
    torch.manual_seed(args.random_seed)
    if n_gpu > 0:
        torch.cuda.manual_seed(args.random_seed)
        torch.cuda.manual_seed_all(args.random_seed)
        # Trade cuDNN autotuning speed for deterministic kernels.
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)

    # Slot meta / operation vocabulary derived from the ontology and op_code.
    ontology = json.load(open(args.ontology_data))
    slot_meta, ontology = make_slot_meta(ontology)
    op2id = OP_SET[args.op_code]
    print(op2id)
    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)

    train_data_raw = prepare_dataset(data_path=args.train_data_path,
                                     tokenizer=tokenizer,
                                     slot_meta=slot_meta,
                                     n_history=args.n_history,
                                     max_seq_length=args.max_seq_length,
                                     op_code=args.op_code)

    # Training dataset applies word dropout and (optional) slot-order shuffling.
    train_data = MultiWozDataset(train_data_raw,
                                 tokenizer,
                                 slot_meta,
                                 args.max_seq_length,
                                 rng,
                                 ontology,
                                 args.word_dropout,
                                 args.shuffle_state,
                                 args.shuffle_p)
    print("# train examples %d" % len(train_data_raw))

    dev_data_raw = prepare_dataset(data_path=args.dev_data_path,
                                   tokenizer=tokenizer,
                                   slot_meta=slot_meta,
                                   n_history=args.n_history,
                                   max_seq_length=args.max_seq_length,
                                   op_code=args.op_code)
    print("# dev examples %d" % len(dev_data_raw))

    test_data_raw = prepare_dataset(data_path=args.test_data_path,
                                    tokenizer=tokenizer,
                                    slot_meta=slot_meta,
                                    n_history=args.n_history,
                                    max_seq_length=args.max_seq_length,
                                    op_code=args.op_code)
    print("# test examples %d" % len(test_data_raw))

    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = args.dropout
    model_config.attention_probs_dropout_prob = args.attention_probs_dropout_prob
    model_config.hidden_dropout_prob = args.hidden_dropout_prob

    model = SomDST(model_config, len(op2id), len(domain2id), op2id['update'], args.exclude_domain)

    # Download the pretrained BERT checkpoint if it is not available locally.
    if not os.path.exists(args.bert_ckpt_path):
        args.bert_ckpt_path = download_ckpt(args.bert_ckpt_path, args.bert_config_path, 'assets')

    ckpt = torch.load(args.bert_ckpt_path, map_location='cpu')
    model.encoder.bert.load_state_dict(ckpt)

    # re-initialize added special tokens ([SLOT], [NULL], [EOS])
    model.encoder.bert.embeddings.word_embeddings.weight.data[1].normal_(mean=0.0, std=0.02)
    model.encoder.bert.embeddings.word_embeddings.weight.data[2].normal_(mean=0.0, std=0.02)
    model.encoder.bert.embeddings.word_embeddings.weight.data[3].normal_(mean=0.0, std=0.02)
    model.to(device)

    num_train_steps = int(len(train_data_raw) / args.batch_size * args.n_epochs)

    # Separate optimizers/schedules for the BERT encoder (weight-decayed,
    # except bias/LayerNorm) and the value decoder.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    enc_param_optimizer = list(model.encoder.named_parameters())
    enc_optimizer_grouped_parameters = [
        {'params': [p for n, p in enc_param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in enc_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

    enc_optimizer = AdamW(enc_optimizer_grouped_parameters, lr=args.enc_lr)
    enc_scheduler = WarmupLinearSchedule(enc_optimizer, int(num_train_steps * args.enc_warmup),
                                         t_total=num_train_steps)

    dec_param_optimizer = list(model.decoder.parameters())
    dec_optimizer = AdamW(dec_param_optimizer, lr=args.dec_lr)
    dec_scheduler = WarmupLinearSchedule(dec_optimizer, int(num_train_steps * args.dec_warmup),
                                         t_total=num_train_steps)

    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.batch_size,
                                  collate_fn=train_data.collate_fn,
                                  num_workers=args.num_workers,
                                  worker_init_fn=worker_init_fn)

    loss_fnc = nn.CrossEntropyLoss()
    best_score = {'epoch': 0, 'joint_acc': 0, 'op_acc': 0, 'final_slot_f1': 0}
    for epoch in range(args.n_epochs):
        batch_loss = []
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch = [b.to(device) if not isinstance(b, int) else b for b in batch]
            input_ids, input_mask, segment_ids, state_position_ids, op_ids,\
            domain_ids, gen_ids, max_value, max_update = batch

            # With probability decoder_teacher_forcing, feed gold value tokens
            # to the decoder instead of its own predictions.
            if rng.random() < args.decoder_teacher_forcing:  # teacher forcing
                teacher = gen_ids
            else:
                teacher = None

            domain_scores, state_scores, gen_scores = model(input_ids=input_ids,
                                                            token_type_ids=segment_ids,
                                                            state_positions=state_position_ids,
                                                            attention_mask=input_mask,
                                                            max_value=max_value,
                                                            op_ids=op_ids,
                                                            max_update=max_update,
                                                            teacher=teacher)

            # Total loss = operation classification + value generation
            # (+ domain classification unless excluded).
            loss_s = loss_fnc(state_scores.view(-1, len(op2id)), op_ids.view(-1))
            loss_g = masked_cross_entropy_for_value(gen_scores.contiguous(),
                                                    gen_ids.contiguous(),
                                                    tokenizer.vocab['[PAD]'])
            loss = loss_s + loss_g
            if args.exclude_domain is not True:
                loss_d = loss_fnc(domain_scores.view(-1, len(domain2id)), domain_ids.view(-1))
                loss = loss + loss_d
            batch_loss.append(loss.item())

            loss.backward()
            enc_optimizer.step()
            enc_scheduler.step()
            dec_optimizer.step()
            dec_scheduler.step()
            model.zero_grad()

            # Log (and reset) the running mean loss every 100 steps.
            if step % 100 == 0:
                if args.exclude_domain is not True:
                    print("[%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f, dom_loss : %.3f" \
                          % (epoch+1, args.n_epochs, step,
                             len(train_dataloader), np.mean(batch_loss),
                             loss_s.item(), loss_g.item(), loss_d.item()))
                else:
                    print("[%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f" \
                          % (epoch+1, args.n_epochs, step,
                             len(train_dataloader), np.mean(batch_loss),
                             loss_s.item(), loss_g.item()))
                batch_loss = []

        # Periodic dev evaluation; keep only the best joint-accuracy checkpoint.
        if (epoch+1) % args.eval_epoch == 0:
            eval_res = model_evaluation(model, dev_data_raw, tokenizer, slot_meta, epoch+1, args.op_code)
            if eval_res['joint_acc'] > best_score['joint_acc']:
                best_score = eval_res
                model_to_save = model.module if hasattr(model, 'module') else model
                save_path = os.path.join(args.save_dir, 'model_best.bin')
                torch.save(model_to_save.state_dict(), save_path)
            print("Best Score : ", best_score)
            print("\n")

    # Final test evaluation: reload the best dev checkpoint into a fresh
    # (non-DataParallel) model and sweep all ground-truth oracle combinations
    # (operations / previous state / generated values).
    print("Test using best model...")
    best_epoch = best_score['epoch']
    ckpt_path = os.path.join(args.save_dir, 'model_best.bin')
    model = SomDST(model_config, len(op2id), len(domain2id), op2id['update'], args.exclude_domain)
    ckpt = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt)
    model.to(device)

    model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
                     is_gt_op=False, is_gt_p_state=False, is_gt_gen=False)
    model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
                     is_gt_op=False, is_gt_p_state=False, is_gt_gen=True)
    model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
                     is_gt_op=False, is_gt_p_state=True, is_gt_gen=False)
    model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
                     is_gt_op=False, is_gt_p_state=True, is_gt_gen=True)
    model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
                     is_gt_op=True, is_gt_p_state=False, is_gt_gen=False)
    model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
                     is_gt_op=True, is_gt_p_state=True, is_gt_gen=False)
    model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
                     is_gt_op=True, is_gt_p_state=False, is_gt_gen=True)
    model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
                     is_gt_op=True, is_gt_p_state=True, is_gt_gen=True)
def main():
    """Train and/or run prediction for a BERT-based token-labelling model on
    SQuAD-formatted data.

    Command-line driven: requires --bert_config_file, --vocab_file and
    --output_dir, plus --do_train and/or --do_predict with the matching data
    files.  Training writes per-epoch checkpoints ("saved_model.<epoch>");
    prediction writes "predictions.txt" with per-word scores.
    """
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--bert_config_file", default=None, type=str, required=True,
        help="The config json file corresponding to the pre-trained BERT model. "
             "This specifies the model architecture.")
    parser.add_argument(
        "--vocab_file", default=None, type=str, required=True,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument(
        "--output_dir", default=None, type=str, required=True,
        help="The output directory where the model checkpoints will be written.")

    # Other parameters
    parser.add_argument(
        "--train_file", default=None, type=str,
        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--predict_file", default=None, type=str,
        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
    parser.add_argument(
        "--init_checkpoint", default=None, type=str,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    # NOTE(review): default=True with action='store_true' means this flag can
    # never be switched off from the command line; kept for CLI compatibility.
    parser.add_argument(
        "--do_lower_case", default=True, action='store_true',
        help="Whether to lower case the input text. Should be True for uncased "
             "models and False for cased models.")
    parser.add_argument(
        "--max_seq_length", default=384, type=int,
        help="The maximum total input sequence length after WordPiece tokenization. "
             "Sequences longer than this will be truncated, and sequences shorter "
             "than this will be padded.")
    parser.add_argument(
        "--doc_stride", default=128, type=int,
        help="When splitting up a long document into chunks, how much stride to "
             "take between chunks.")
    parser.add_argument(
        "--max_query_length", default=64, type=int,
        help="The maximum number of tokens for the question. Questions longer "
             "than this will be truncated to this length.")
    parser.add_argument("--do_train", default=False, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict", default=False, action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=32, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=8, type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=5e-5, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    # BUG FIX: argparse interpolates help strings with the % operator, so a
    # bare '%' crashed `--help`; it must be escaped as '%%'.
    parser.add_argument(
        "--warmup_proportion", default=0.1, type=float,
        help="Proportion of training to perform linear learning rate warmup "
             "for. E.g., 0.1 = 10%% of training.")
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--save_checkpoints_steps", default=1000, type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--iterations_per_loop", default=1000, type=int,
                        help="How many steps to make in each estimator call.")
    parser.add_argument(
        "--n_best_size", default=20, type=int,
        help="The total number of n-best predictions to generate in the "
             "nbest_predictions.json output file.")
    parser.add_argument(
        "--max_answer_length", default=30, type=int,
        help="The maximum length of an answer that can be generated. This is "
             "needed because the start and end predictions are not conditioned "
             "on one another.")
    parser.add_argument(
        "--verbose_logging", default=False, action='store_true',
        help="If true, all of the warnings related to data processing will be "
             "printed. A number of warnings are expected for a normal SQuAD "
             "evaluation.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        "--accumulate_gradients", type=int, default=1,
        help="Number of steps to accumulate gradient on (divide the batch_size "
             "and accumulate)")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps', type=int, default=1,
        help="Number of updates steps to accumualte before performing a "
             "backward/update pass.")
    parser.add_argument("--checkpoint", default=None, type=str,
                        help="The checkpoint file.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer")
    parser.add_argument(
        '--fp16', action='store_true',
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) "
             "instead of 32-bit")
    # BUG FIX: the fp16 branch reads args.fp16_opt_level, but no such argument
    # was declared, so --fp16 always crashed with AttributeError.
    parser.add_argument(
        '--fp16_opt_level', type=str, default='O1',
        help="Apex AMP optimization level: one of ['O0', 'O1', 'O2', 'O3'].")

    args = parser.parse_args()

    # Device selection: single-process (possibly multi-GPU via DataParallel)
    # vs. one-GPU-per-process distributed training.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs.
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.accumulate_gradients < 1:
        raise ValueError(
            "Invalid accumulate_gradients parameter: {}, should be >= 1".format(
                args.accumulate_gradients))

    # Shrink the per-step batch so the effective (accumulated) batch size
    # matches the requested --train_batch_size.
    args.train_batch_size = int(args.train_batch_size /
                                args.accumulate_gradients)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")
    if args.do_train and not args.train_file:
        raise ValueError(
            "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict and not args.predict_file:
        raise ValueError(
            "If `do_predict` is True, then `predict_file` must be specified.")

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    # BUG FIX: the original message had an empty "()" placeholder — say which
    # directory is the problem.
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory (%s) already exists and is not empty." %
            args.output_dir)
    os.makedirs(args.output_dir, exist_ok=True)

    # NOTE(review): args.vocab_file / args.do_lower_case are ignored here and
    # the stock uncased tokenizer is used instead — confirm this is intended.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = read_label_examples(input_file=args.train_file,
                                             is_training=True)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size *
            args.num_train_epochs)

    model = BertForLabelling(bert_config, args.train_batch_size)
    # (os.makedirs above already guarantees output_dir exists.)
    output_model_file = os.path.join(args.output_dir, "saved_model")

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    model.to(device)

    optimizer = None
    scheduler = None
    if args.do_train:
        no_decay = ['bias', 'gamma', 'beta']
        # BUG FIX: the schedule horizon is the number of optimizer steps, not
        # the number of training examples (the original omitted the division
        # by batch size, stretching warmup/decay by a factor of batch_size).
        t_total = int(len(train_examples) / args.train_batch_size /
                      args.gradient_accumulation_steps * args.num_train_epochs)
        # BUG FIX: parameter names are dotted paths (e.g. "bert....beta"), so
        # the original exact-membership test `n not in no_decay` never matched
        # and weight decay was applied to *every* parameter.  Use a substring
        # match against the no-decay keywords instead.
        optimizer_parameters = [
            {'params': [p for n, p in model.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in model.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0},
        ]
        logger.info("***** Preparing optimizer *****")
        optimizer = AdamW(optimizer_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        # BUG FIX: `optimizer` only exists when training; the original raised
        # NameError for --fp16 --do_predict runs.
        if optimizer is not None:
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=args.fp16_opt_level)
        else:
            model = amp.initialize(model, opt_level=args.fp16_opt_level)

    if args.do_train:
        train_features = convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)
        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_seq = torch.tensor([f.label_seq for f in train_features],
                                     dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_seq)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)

        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        global_step = 0
        for i in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_seq = batch
                loss = model(input_ids, segment_ids, input_mask, label_seq)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    # Scale the loss so fp16 gradients do not underflow.
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # We have accumulated enough gradients
                    # BUG FIX: the warmup schedule was created but never
                    # stepped, freezing the LR at its initial value.
                    scheduler.step()
                    model.zero_grad()
                    global_step += 1

            # Save a checkpoint at the end of every epoch.
            torch.save(model.state_dict(), output_model_file + ".{}".format(i))

    if args.do_predict:
        if args.checkpoint:
            state_dict = torch.load(args.checkpoint)
            model.load_state_dict(state_dict)
        eval_examples = read_label_examples(input_file=args.predict_file,
                                            is_training=False)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0),
                                         dtype=torch.long)

        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_example_index)
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        output_prediction_file = os.path.join(args.output_dir,
                                              "predictions.txt")
        # BUG FIX: the output file was opened but never closed; use a context
        # manager so it is flushed/closed even if evaluation raises.
        with open(output_prediction_file, 'w', encoding='utf-8') as writer:
            for input_ids, input_mask, segment_ids, example_indices in tqdm(
                    eval_dataloader, desc="Evaluating"):
                if len(all_results) % 1000 == 0:
                    logger.info("Processing example: %d" % (len(all_results)))
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                with torch.no_grad():
                    batch_logits = model(input_ids, segment_ids, input_mask)
                for i, example_index in enumerate(example_indices):
                    logits = batch_logits[i].detach().cpu().tolist()
                    eval_feature = eval_features[example_index.item()]
                    words = eval_examples[example_index.item()].doc_tokens
                    # Project sub-token scores back onto original words,
                    # keeping the max score among each word's sub-tokens.
                    words_scores = [-1] * len(words)
                    for token_id in range(len(logits)):
                        if token_id not in eval_feature.token_to_orig_map:
                            continue
                        orig_id = eval_feature.token_to_orig_map[token_id]
                        words_scores[orig_id] = max(words_scores[orig_id],
                                                    logits[token_id])

                    # Only words actually covered by this feature are emitted.
                    words_num = max(eval_feature.token_to_orig_map.values()) + 1
                    writer.write(' '.join(words[:words_num]))
                    writer.write('\t')
                    writer.write(' '.join(
                        [str(ws) for ws in words_scores[:words_num]]))
                    writer.write('\n')
예제 #17
0
# Add the (optional) model-config flag to the CLI parser defined earlier in
# this script, then parse all arguments.
parser.add_argument("--config_file",
                    default=None,
                    type=str,
                    help="The BERT model config")
args = parser.parse_args()

# Build the NeMo factory: it owns device placement (local_rank), the AMP
# optimization level, the TensorBoard writer, and a timestamped work/log dir
# into which this script file is copied for reproducibility.
nf = nemo.core.NeuralModuleFactory(backend=nemo.core.Backend.PyTorch,
                                   local_rank=args.local_rank,
                                   optimization_level=args.amp_opt_level,
                                   log_dir=args.work_dir,
                                   create_tb_writer=True,
                                   files_to_copy=[__file__],
                                   add_time_to_log_dir=True)

# When a BERT config JSON is supplied, its values override the individual
# size/architecture flags on args.
if args.config_file is not None:
    config = BertConfig.from_json_file(args.config_file).to_dict()
    args.vocab_size = config['vocab_size']
    args.hidden_size = config['hidden_size']
    args.num_hidden_layers = config['num_hidden_layers']
    args.num_attention_heads = config['num_attention_heads']
    args.intermediate_size = config['intermediate_size']
    args.hidden_act = config['hidden_act']
    args.max_seq_length = config['max_position_embeddings']

# Raw-text path: build the pretraining data description (and, below, a
# tokenizer) from train.txt with the standard BERT special tokens.
if not args.preprocessed_data:
    special_tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]']
    data_desc = BERTPretrainingDataDesc(args.dataset_name, args.data_dir,
                                        args.vocab_size, args.sample_size,
                                        special_tokens, 'train.txt')
    if args.tokenizer == "sentence-piece":
        nf.logger.info("To use SentencePieceTokenizer.")
def main(args):
    """Evaluate saved TransformerDST checkpoints on the MultiWOZ test set.

    Builds (or reloads a cached copy of) the preprocessed test data, restores
    a BERT backbone with expanded token-type embeddings, then, for every epoch
    listed in ``args.load_epoch`` (a '-'-separated list such as "8-9-10"),
    loads the corresponding fine-tuned checkpoint and runs model_evaluation.
    """
    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)
        print("### mkdir {:}".format(args.save_dir))

    n_gpu = 0
    # BUG FIX: the original condition was `... and args.use_cpu`, which
    # selected the GPU only when the user explicitly asked for CPU (and fell
    # back to CPU otherwise) — the flag's meaning was inverted.
    if torch.cuda.is_available() and not args.use_cpu:
        n_gpu = torch.cuda.device_count()
        device = torch.device('cuda')
        print("### Device: {:}".format(device))
    else:
        print("### Use CPU (Debugging)")
        device = torch.device("cpu")

    # Negative seed means "pick one at random" (and report it for repro).
    if args.random_seed < 0:
        print("### Pick a random seed")
        args.random_seed = random.sample(list(range(1, 100000)), 1)[0]

    print("### Random Seed: {:}".format(args.random_seed))
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    torch.manual_seed(args.random_seed)

    if n_gpu > 0:
        # args.random_seed is guaranteed non-negative here (resampled above),
        # so the original's redundant `>= 0` re-check is dropped.
        torch.cuda.manual_seed(args.random_seed)
        torch.cuda.manual_seed_all(args.random_seed)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    ontology = json.load(open(args.ontology_data))
    slot_meta, ontology = make_slot_meta(ontology)
    op2id = OP_SET[args.op_code]
    print(op2id)

    tokenizer = BertTokenizer.from_pretrained(args.bert_config)

    special_tokens = ['[SLOT]', '[NULL]']
    special_tokens_dict = {'additional_special_tokens': special_tokens}
    tokenizer.add_special_tokens(special_tokens_dict)

    # Cache the (slow) preprocessing result so repeated runs reload instantly.
    test_path = os.path.join(args.data_root_test, "test.pt")
    if not os.path.exists(test_path):
        test_data_raw = prepare_dataset_for_inference(data_path=args.test_data_path,
                                                      data_list=None,
                                                      tokenizer=tokenizer,
                                                      slot_meta=slot_meta,
                                                      n_history=args.n_history,
                                                      max_seq_length=args.max_seq_length,
                                                      op_code=args.op_code)
        torch.save(test_data_raw, test_path)
    else:
        test_data_raw = torch.load(test_path)

    print("# test examples %d" % len(test_data_raw))

    # All dropout off: this script only does inference.
    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = 0.
    model_config.attention_probs_dropout_prob = 0.
    model_config.hidden_dropout_prob = 0.

    type_vocab_size = 4
    dec_config = args
    model = TransformerDST(model_config, dec_config, len(op2id), len(domain2id),
                           op2id['update'],
                           tokenizer.convert_tokens_to_ids(['[MASK]'])[0],
                           tokenizer.convert_tokens_to_ids(['[SEP]'])[0],
                           tokenizer.convert_tokens_to_ids(['[PAD]'])[0],
                           tokenizer.convert_tokens_to_ids(['-'])[0],
                           type_vocab_size, args.exclude_domain)

    # The pretrained BERT ships fewer token-type (segment) embeddings than the
    # 4 this model expects: tile the existing rows up to size, copying row 0
    # into rows 2/3 (they are randomly re-initialized again below).
    state_dict = torch.load(args.bert_ckpt_path, map_location='cpu')
    _k = 'bert.embeddings.token_type_embeddings.weight'
    print("config.type_vocab_size != state_dict[bert.embeddings.token_type_embeddings.weight] ({0} != {1})".format(
        type_vocab_size, state_dict[_k].shape[0]))
    state_dict[_k] = state_dict[_k].repeat(int(type_vocab_size / state_dict[_k].shape[0]), 1)
    state_dict[_k].data[2, :].copy_(state_dict[_k].data[0, :])
    state_dict[_k].data[3, :].copy_(state_dict[_k].data[0, :])
    model.bert.load_state_dict(state_dict, strict=False)

    # re-initialize added special tokens ([SLOT], [NULL], [EOS])
    model.bert.embeddings.word_embeddings.weight.data[1].normal_(mean=0.0, std=0.02)
    model.bert.embeddings.word_embeddings.weight.data[2].normal_(mean=0.0, std=0.02)
    model.bert.embeddings.word_embeddings.weight.data[3].normal_(mean=0.0, std=0.02)

    # re-initialize seg-2, seg-3
    model.bert.embeddings.token_type_embeddings.weight.data[2].normal_(mean=0.0, std=0.02)
    model.bert.embeddings.token_type_embeddings.weight.data[3].normal_(mean=0.0, std=0.02)
    model.bert.resize_token_embeddings(len(tokenizer))

    test_epochs = [int(e) for e in args.load_epoch.strip().lower().split('-')]
    for best_epoch in test_epochs:
        print("### Epoch {:}...".format(best_epoch))
        sys.stdout.flush()
        # NOTE(review): the checkpoint root is hard-coded for a SageMaker-style
        # container layout; parameterize if running elsewhere.
        ckpt_path = os.path.join('/opt/ml/code/transformer_dst', args.save_dir, 'model.e{:}.bin'.format(best_epoch))
        ckpt = torch.load(ckpt_path, map_location='cpu')
        model.load_state_dict(ckpt)
        model.to(device)

        eval_res = model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
                                    use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only,
                                    no_dial=args.no_dial, n_gpu=n_gpu,
                                    is_gt_op=False, is_gt_p_state=False, is_gt_gen=False, submission=True)

        print("### Epoch {:} Test Score : ".format(best_epoch), eval_res)
        print('\n' * 2)
        sys.stdout.flush()
예제 #19
0
def main():
    """Train and/or evaluate the joint image-text (CXR + report) model.

    Flow: parse args -> validate them -> resolve the data-split file and the
    output/report/tensorboard/checkpoint directory layout -> configure
    logging -> optionally train (``args.do_train``) -> optionally evaluate
    one or all checkpoints (``args.do_eval``).
    """
    args = parser.get_args()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    assert torch.cuda.is_available(), "No GPU/CUDA is detected!"
    # Training on CPU is almost infeasible,
    # but evaluation/inference can be done on CPU

    '''
    Do initial argument checks
    '''
    if args.id == 'dummy':
        # If no ID is specified, then we will generate a random ID
        # as the folder name of this run
        args.id = str(uuid.uuid4())

    if args.training_mode not in ('supervised',
                                  'semisupervised_phase1',
                                  'semisupervised_phase2'):
        raise Exception('You can do either supervised or semisupervised training')
        # 'semisupervised_phase1' is essentially unsupervised learning of the joint model
        # on chest radiographs and radiology reports
        # 'semisupervised_phase2' is supervised learning with the initialization
        # from the training results of semisupervised_phase1

    if args.semisupervised_training_data not in ('allCXR', 'allCHF'):
        raise Exception('You can train the model on all MIMIC-CXR (allCXR) or \
            the congestive heart failure cohort (allCHF)')

    if args.training_mode == 'semisupervised_phase2':
        if not os.path.isdir(args.joint_semisupervised_pretrained_checkpoint):
            raise Exception('The joint_semisupervised_pretrained_checkpoint directory \
                has to exist for the model initialization of semisupervised_phase2')

    if args.output_channel_encoding not in ('multilabel', 'multiclass'):
        raise Exception('You can select either multilabel or multiclass classification')

    if args.data_split_mode not in ('cross_val', 'testing'):
        raise Exception('You can do either cross-validation (cross_val) or testing (testing), \
            which determine how the dataset is going to be split')

    if args.joint_loss_method not in ('l2', 'cosine', 'dot', 'ranking'):
        raise Exception('You can have either l2, cosine, dot or ranking \
            as the joint loss calculation between the img-txt embedding')

    if args.joint_loss_similarity_function not in ('l2', 'cosine', 'dot'):
        raise Exception('You can have either l2, cosine, or dot \
            as the similarity function for the ranking loss in the img-txt embedding. \
            You had %s' % args.joint_loss_similarity_function)

    if not args.do_train and not args.do_eval:
        raise Exception('Either do_train or do_eval flag must be set as true')

    '''
    Select the right data split file based on the argument setting
    '''
    # TODO: release the data split file (including our labels)
    if args.training_mode == 'supervised' or args.training_mode == 'semisupervised_phase2':
        data_split_file_postfix = ''
        # Supervised training does not need unlabeled data
    elif args.semisupervised_training_data == 'allCHF':
        data_split_file_postfix = '-allCHF'
    elif args.semisupervised_training_data == 'allCXR':
        data_split_file_postfix = '-allCXR'

    if not args.use_data_split_path:
        if args.data_split_mode == 'testing' and args.do_eval:
            # When evaluating in the testing mode, you should use the expert labels
            # that are included in the test set
            args.data_split_path = os.path.join(args.data_split_path,
                                                'mimic-cxr-sub-img-edema-split-manualtest.csv')
        else:
            args.data_split_path = os.path.join(
                args.data_split_path,
                'mimic-cxr-sub-img-edema-split{}.csv'.format(data_split_file_postfix))

    '''
    Set the output directory structure
    '''
    # TODO: revisit the code related to masked text (may want to delete it)
    if args.use_masked_txt:
        args.text_data_dir = os.path.join(args.text_data_dir, 'masked')
    if not args.use_text_data_dir:
        args.text_data_dir = os.path.join(args.text_data_dir, args.output_channel_encoding)

    args.model = 'model'
    # TODO: consider deleting this

    # NOTE(review): 'supervised_masking' is rejected by the mode check above,
    # so that half of these conditions looks dead -- verify before simplifying.
    if not args.use_text_data_dir:
        if args.training_mode == 'supervised' or args.training_mode == 'supervised_masking':
            args.text_data_dir = os.path.join(args.text_data_dir, 'supervised', 'full')
        elif 'semisupervised' in args.training_mode:
            args.text_data_dir = os.path.join(args.text_data_dir, 'semisupervised',
                                              args.semisupervised_training_data, 'full')

    if args.training_mode == 'supervised' or args.training_mode == 'supervised_masking':
        args.output_dir = os.path.join(args.output_dir, args.data_split_mode,
                                       args.model, args.training_mode, args.id)
    elif 'semisupervised' in args.training_mode:
        args.output_dir = os.path.join(args.output_dir, args.data_split_mode, args.model,
                                       args.training_mode, args.semisupervised_training_data,
                                       args.id)

    args.reports_dir = os.path.join(args.output_dir, 'eval_reports')
    args.tsbd_dir = os.path.join(args.output_dir, 'tsbd_dir')
    args.checkpoints_dir = os.path.join(args.output_dir, 'checkpoints')

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and \
            args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty." \
            " Use".format(args.output_dir)+" --overwrite_output_dir to overcome.")

    '''
    Create the necessary directories.
    Make sure no argument updating after this point.
    '''
    directories = [args.output_dir, args.reports_dir, args.tsbd_dir, args.checkpoints_dir]
    for directory in directories:
        if not os.path.exists(directory):
            os.makedirs(directory)
    if not os.path.exists(args.data_split_path):
        raise Exception('The data split path %s does not exist! Please check' \
            % args.data_split_path)

    # TODO: the name of "reports_dir" can be confusing; need to rename it
    if args.do_eval:
        # Each evaluation run gets its own numbered sub-directory.
        args.reports_dir = os.path.join(args.reports_dir,
            'eval_report_{}'.format(len(os.listdir(args.reports_dir))))
        if not os.path.exists(args.reports_dir):
            os.makedirs(args.reports_dir)
        main_utils.to_json_file(vars(args), os.path.join(args.reports_dir, 'eval_args.json'))
        print('Location of the evaluation result directory: %s' % args.reports_dir)

    '''
    Print some important arguments
    '''
    print('Classification type: {}'.format(args.output_channel_encoding))
    print('Loss method in the image-text embedding space: {}'.format(args.joint_loss_method))
    if args.joint_loss_method == 'ranking':
        print('Similarity function for the ranking loss in the img-txt embedding:',
              args.joint_loss_similarity_function)
    print('Currently doing **{}**'.format(args.data_split_mode))
    print('Training mode: {}'.format(args.training_mode))
    print('Doing training: {}'.format(args.do_train))
    print('Doing eval: {}'.format(args.do_eval))
    print('Cuda is available: {}'.format(torch.cuda.is_available()))
    print('Device used: ', device)
    print('Scheduler used: ', args.scheduler)
    print('Initial learning Rate: ', args.learning_rate)
    print('Number of training epochs: ', args.num_train_epochs)
    print('Text data directory: ', args.text_data_dir)
    if 'semisupervised' in args.training_mode:
        print('Training data for semisupervised learning: ', args.semisupervised_training_data)
    print('Using all Sequences in BERT last layer rather than just [CLS]: ',
          args.bert_pool_last_hidden)
    if args.bert_pool_last_hidden:
        print('Using img embedding for computing attention scores: ',
              args.bert_pool_use_img)
    print('Pretrained BERT model directory: {}'.format(args.bert_pretrained_dir))

    '''
    Set logging and tensorboard directories
    '''
    if args.do_train:
        args.tsbd_dir = os.path.join(
            args.tsbd_dir,
            'tsbd_{}'.format(len(os.listdir(args.tsbd_dir))))
        if not os.path.exists(args.tsbd_dir):
            os.makedirs(args.tsbd_dir)
        print('Location of the tensorboard directory: %s' % args.tsbd_dir)
        log_file = os.path.join(args.output_dir, 'training.log')
    if args.do_eval:
        # NOTE: when both do_train and do_eval are set, the eval log file wins.
        log_file = os.path.join(args.reports_dir, 'evaluating.log')
    print('Logging in: {}'.format(log_file))
    logging.basicConfig(filename=log_file, level=logging.INFO, filemode='w',
                        format='%(asctime)s - %(name)s %(message)s',
                        datefmt='%m-%d %H:%M')
    logger = logging.getLogger(__name__)
    logger.info("Current git commit sha: %s", sha)

    '''
    Set text tokenizer
    '''
    tokenizer = BertTokenizer.from_pretrained(args.bert_pretrained_dir)
    # tokenizer is not something that constantly needs to be saved
    # because only the pre-trained bert model determines this.

    '''
    Train the model
    '''
    if args.do_train:
        start_time = time.time()
        # BUG FIX: Logger.setLevel() returns None, so the original
        # `logger = logging.getLogger(...).setLevel(...)` rebound `logger`
        # to None. Configure the library logger without rebinding ours.
        logging.getLogger('pytorch_transformers.modeling_utils').setLevel(logging.INFO)

        '''
        Load a pretrained joint model or pretrained BERT model
        '''
        config = BertConfig.from_json_file(os.path.join(args.bert_pretrained_dir,
                                                        args.config_name))
        config.num_labels = 3 if args.output_channel_encoding == 'multilabel' else 4
        if args.training_mode == 'semisupervised_phase2' or args.use_pretrained_checkpoint:
            # Both cases initialize from the same pretrained joint checkpoint,
            # so the two formerly duplicated branches are merged here.
            model = ImageTextModel.from_pretrained(
                args.joint_semisupervised_pretrained_checkpoint)
            print('Pretrained model:\t {}'.\
                format(args.joint_semisupervised_pretrained_checkpoint))
        else:
            model = ImageTextModel(config=config,
                                   pretrained_bert_dir=args.bert_pretrained_dir)
            print('No pretrained joint model, loading pretrained BERT model:\t {}'.\
                format(args.bert_pretrained_dir))

        '''
        Perform model training
        '''
        model.to(device)
        loss_info = main_utils.train(args, device, model, tokenizer)

        '''
        Reset the logger now
        '''
        logger = logging.getLogger(__name__)
        logger.info("Saving model checkpoint to %s", args.output_dir)

        '''
        Take care of distributed/parallel training
        '''
        model_to_save = model.module if hasattr(model, 'module') else model

        '''
        Save model training results
        '''
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
        main_utils.to_json_file(loss_info, os.path.join(args.output_dir, 'loss_info.json'))
        end_time = time.time()

    '''
    Evaluate the model
    '''
    results_txt = {}
    results_img = {}
    # eval should assume that the train ids already contain the necessary folders
    # will deal with this later. Just copy eval images here
    if args.do_eval:
        start_time = time.time()

        checkpoints = [args.output_dir]
        # The final checkpoint is in the args.output_dir

        if args.eval_all_checkpoints:
            checkpoints = list(os.path.dirname(c) for c in sorted(
                glob.glob(args.output_dir + '/**/' + args.weights_name, recursive=True)))

        logger = logging.getLogger(__name__)
        logger.info("Evaluate %d checkpoints ", len(checkpoints))
        for checkpoint in checkpoints:
            epoch_number = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
            # BUG FIX: same setLevel()-returns-None issue as in training above;
            # do not rebind `logger` here.
            logging.getLogger('joint_img_txt.model.model').setLevel(logging.INFO)
            model = ImageTextModel.from_pretrained(checkpoint)
            model.to(device)
            dump_prediction_files = False
            if checkpoint == args.output_dir:
                # Only the final checkpoint dumps prediction files.
                dump_prediction_files = True
                epoch_number = 'final'
            print('***    Epoch {}'.format(epoch_number))
            print('\t\t\t Checkpoint: {}'.format(checkpoint))
            result_txt, result_img = main_utils.evaluate(
                args, device, model, tokenizer,
                dump_prediction_files, prefix=epoch_number)
            # Suffix every metric key with the epoch so checkpoints don't collide.
            result_txt = dict((k + '_{}'.format(epoch_number), v) for k, v in result_txt.items())
            result_img = dict((k + '_{}'.format(epoch_number), v) for k, v in result_img.items())
            results_txt.update(result_txt)
            results_img.update(result_img)

        main_utils.to_json_file(results_txt, os.path.join(args.reports_dir, 'results_txt.json'))
        main_utils.to_json_file(results_img, os.path.join(args.reports_dir, 'results_img.json'))
        end_time = time.time()

    print("\n\nTotal time to run:", round((end_time-start_time)/3600.0, 2))
예제 #20
0
    segment_idx = token_idx * 0
    segment_idx[(sep_idx + 1):] = 1
    mask = (token_idx != 0)
    return token_idx.unsqueeze(0), segment_idx.unsqueeze(0), mask.unsqueeze(0)


if __name__ == '__main__':
    # Interactive masked-LM demo: read a message, wrap it in [CLS]/[SEP],
    # and run it through a BERT masked-LM head.
    args = parser.parse_args()
    assert os.path.exists(args.bert_model), '{} does not exist'.format(args.bert_model)
    assert os.path.exists(args.bert_vocab), '{} does not exist'.format(args.bert_vocab)
    assert args.topk > 0, '{} should be positive'.format(args.topk)

    print('Initialize BERT vocabulary from {}...'.format(args.bert_vocab))
    bert_tokenizer = BertTokenizer(vocab_file=args.bert_vocab)
    print('Initialize BERT model from {}...'.format(args.bert_model))
    config = BertConfig.from_json_file('./bert-base-uncased/config.json')
    bert_model = BertForMaskedLM.from_pretrained('./bert-base-uncased/pytorch_model.bin', config=config)

    while True:
        pieces = bert_tokenizer.tokenize(input('Enter your message: ').strip())
        if not pieces:
            continue
        # Guarantee the [CLS] ... [SEP] framing BERT expects.
        if pieces[0] != CLS:
            pieces.insert(0, CLS)
        if pieces[-1] != SEP:
            pieces.append(SEP)
        input_ids, seg_ids, attn_mask = to_bert_input(pieces, bert_tokenizer)
        with torch.no_grad():
            logits = bert_model(input_ids, seg_ids, attn_mask, masked_lm_labels=None)
        # Drop the batch dimension: (1, seq_len, vocab) -> (seq_len, vocab).
        logits = np.squeeze(logits[0], axis=0)
예제 #21
0
    return vocab


# Build the token->id map from the SentencePiece vocab file.
vocab = load_vocab('./spm_model/wiki-ja.vocab')
mask_indx = 12  # position in `spmed` that will be replaced by [MASK]

# BUG FIX: a `pdb.set_trace()` debugger breakpoint was left here, which
# halts the script and waits for interactive input; removed so it runs
# unattended.

# `spmed` (the SentencePiece token list) is produced by code above this
# excerpt -- presumably the tokenized input sentence; verify upstream.
spmed[0] = '[CLS]'
spmed.append('[SEP]')
spmed[mask_indx] = '[MASK]'

# Map tokens to ids, falling back to <unk> for out-of-vocabulary pieces.
indx_tokens = [vocab[s] if s in vocab else vocab['<unk>'] for s in spmed]

tokens_tensor = torch.tensor([indx_tokens])

config = BertConfig.from_json_file(
    '../published_model/bert_spm/bert_config.json')
model = BertModel.from_pretrained(
    '../published_model/bert_spm/pytorch_model.bin', config=config)
model2 = BertForMaskedLM.from_pretrained(
    '../published_model/bert_spm/pytorch_model.bin', config=config)
model3 = BertModel.from_pretrained(
    '../published_model/bert_spm/pytorch_model.bin', config=config)

model.eval()
model2.eval()
model3.eval()

tokens_tensor = tokens_tensor.to('cuda')

model.to('cuda')
model2.to('cuda')
# NOTE(review): `model3` is loaded and eval()-ed but never moved to CUDA and
# is unused in this excerpt -- confirm whether it is needed before deleting.
예제 #22
0
def main(args):
    """Evaluate saved TransformerDST checkpoints on the test set.

    For every epoch listed in ``args.load_epoch`` (dash-separated, e.g.
    ``"10-20-30"``), loads ``model.e<epoch>.bin`` from ``args.save_dir`` and
    runs ``model_evaluation`` on the raw test examples.
    """
    assert args.use_one_optim is True

    if args.recover_e > 0:
        raise NotImplementedError("This option is from my oldest code version. "
                                  "I have not checked it for this code version.")

    # Create the checkpoint directory up front if it is missing.
    # (The original repeated this check a second time later; once is enough.)
    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)
        print("### mkdir {:}".format(args.save_dir))

    n_gpu = 0
    if torch.cuda.is_available() and (not args.use_cpu):
        n_gpu = torch.cuda.device_count()
        device = torch.device('cuda')
        print("### Device: {:}".format(device))
    else:
        print("### Use CPU (Debugging)")
        device = torch.device("cpu")

    if args.random_seed < 0:
        print("### Pick a random seed")
        args.random_seed = random.sample(list(range(1, 100000)), 1)[0]

    print("### Random Seed: {:}".format(args.random_seed))
    # Seed every RNG source for reproducibility.
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    torch.manual_seed(args.random_seed)

    if n_gpu > 0:
        if args.random_seed >= 0:
            torch.cuda.manual_seed(args.random_seed)
            torch.cuda.manual_seed_all(args.random_seed)
        # Deterministic cuDNN trades a little speed for reproducibility.
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    # Slot metadata and the operation vocabulary come from the ontology.
    ontology = json.load(open(args.ontology_data))
    slot_meta, ontology = make_slot_meta(ontology)
    op2id = OP_SET[args.op_code]
    print(op2id)

    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)

    # Pre-tokenized examples are stored as torch pickles.
    # NOTE(review): train data is loaded (capped at 5000) but only printed;
    # the train-set evaluation below is commented out in the original.
    train_path = os.path.join(args.data_root, "train.pt")
    train_data_raw = torch.load(train_path)[:5000]
    print("# train examples %d" % len(train_data_raw))

    test_path = os.path.join(args.data_root, "test.pt")
    test_data_raw = torch.load(test_path)
    print("# test examples %d" % len(test_data_raw))

    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = args.dropout
    model_config.attention_probs_dropout_prob = args.attention_probs_dropout_prob
    model_config.hidden_dropout_prob = args.hidden_dropout_prob

    type_vocab_size = 4  # segment-embedding vocabulary size passed to the model
    dec_config = args
    model = TransformerDST(model_config, dec_config, len(op2id), len(domain2id),
                           op2id['update'],
                           tokenizer.convert_tokens_to_ids(['[MASK]'])[0],
                           tokenizer.convert_tokens_to_ids(['[SEP]'])[0],
                           tokenizer.convert_tokens_to_ids(['[PAD]'])[0],
                           tokenizer.convert_tokens_to_ids(['-'])[0],
                           type_vocab_size, args.exclude_domain)

    test_epochs = [int(e) for e in args.load_epoch.strip().lower().split('-')]
    for best_epoch in test_epochs:
        print("### Epoch {:}...".format(best_epoch))
        sys.stdout.flush()
        ckpt_path = os.path.join(args.save_dir, 'model.e{:}.bin'.format(best_epoch))
        ckpt = torch.load(ckpt_path, map_location='cpu')
        model.load_state_dict(ckpt)
        model.to(device)

        eval_res = model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
                                    use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only, no_dial=args.no_dial, n_gpu=n_gpu,
                                    is_gt_op=False, is_gt_p_state=False, is_gt_gen=False)

        print("### Epoch {:} Test Score : ".format(best_epoch), eval_res)
        print('\n'*2)
        sys.stdout.flush()
예제 #23
0
def main(args):
    """Train the MGDST model and evaluate the best checkpoint on the test set.

    Dataset-specific helpers (``prepare_dataset``, ``MultiWozDataset``, label
    construction, post-processing, the slot list ``SLOT`` and operation set
    ``OP``) are imported according to ``args.dataset``.  Training tracks dev
    joint accuracy, early-stops using ``args.patience``, saves the best
    checkpoint, and reloads it for a final test-set evaluation.

    NOTE(review): ``device`` is not defined in this function; it must be a
    module-level global set elsewhere in this file -- verify.
    """
    def worker_init_fn(worker_id):
        # Give each DataLoader worker its own reproducible seed.
        np.random.seed(args.random_seed + worker_id)

    # Each supported dataset ships its own preprocessing/label utilities.
    if args.dataset == 'sim-R':
        from BERTDST_utils.simR_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    if args.dataset == 'sim-M':
        from BERTDST_utils.simM_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    if args.dataset == 'DSTC2':
        from BERTDST_utils.DSTC2_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    if args.dataset == 'WOZ2.0':
        from BERTDST_utils.WOZ_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    if args.dataset == 'MultiWOZ2.1':
        from BERTDST_utils.MultiWOZ_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, OP, make_slot_meta
        # MultiWOZ derives its slot list from the ontology file.
        ontology = json.load(open(args.ontology_data_path))
        SLOT, ontology = make_slot_meta(ontology)

    n_gpu = 0
    if torch.cuda.is_available():
        n_gpu = torch.cuda.device_count()

    # Seed every RNG source for reproducibility.
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    rng = random.Random(args.random_seed)
    torch.manual_seed(args.random_seed)
    if n_gpu > 0:
        torch.cuda.manual_seed(args.random_seed)
        torch.cuda.manual_seed_all(args.random_seed)
        # Deterministic cuDNN trades a little speed for reproducibility.
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)

    slot_meta = SLOT
    op2id = OP
    print(op2id)
    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)

    # Train data may be sub-sampled via args.train_scale; dev/test use it all.
    train_data_raw = prepare_dataset(data_scale=args.train_scale,
                                     data_path=args.train_data_path,
                                     tokenizer=tokenizer,
                                     slot_meta=slot_meta,
                                     size_window=args.train_size_window,
                                     max_seq_length=args.max_seq_length,
                                     multi_granularity=args.train_MG,
                                     data_type='train')

    train_data = MultiWozDataset(train_data_raw, tokenizer, slot_meta,
                                 args.max_seq_length, rng, args.word_dropout)
    print("# train examples %d" % len(train_data_raw))

    dev_data_raw = prepare_dataset(data_scale=1.0,
                                   data_path=args.dev_data_path,
                                   tokenizer=tokenizer,
                                   slot_meta=slot_meta,
                                   size_window=args.test_size_window,
                                   max_seq_length=args.max_seq_length,
                                   multi_granularity=args.test_MG,
                                   data_type='dev')
    print("# dev examples %d" % len(dev_data_raw))

    test_data_raw = prepare_dataset(data_scale=1.0,
                                    data_path=args.test_data_path,
                                    tokenizer=tokenizer,
                                    slot_meta=slot_meta,
                                    size_window=args.test_size_window,
                                    max_seq_length=args.max_seq_length,
                                    multi_granularity=args.test_MG,
                                    data_type='test')
    print("# test examples %d" % len(test_data_raw))

    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = args.dropout
    model_config.attention_probs_dropout_prob = args.attention_probs_dropout_prob
    model_config.hidden_dropout_prob = args.hidden_dropout_prob
    model = MGDST(model_config, len(op2id), len(slot_meta))

    # Load pretrained BERT weights into the encoder, renaming the legacy
    # gamma/beta LayerNorm keys and dropping the MLM head ('cls.*').
    ckpt = torch.load(args.bert_ckpt_path, map_location='cpu')
    ckpt1 = {
        k.replace('bert.', '').replace('gamma',
                                       'weight').replace('beta', 'bias'): v
        for k, v in ckpt.items() if 'cls.' not in k
    }
    model.encoder.bert.load_state_dict(ckpt1)
    #model.encoder.bert.from_pretrained(args.bert_ckpt_path)

    model.to(device)

    num_train_steps = int(
        len(train_data_raw) / args.batch_size * args.n_epochs)

    # Encoder gets weight decay (except bias/LayerNorm) + warmup schedule.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    enc_param_optimizer = list(model.encoder.named_parameters())
    enc_optimizer_grouped_parameters = [{
        'params': [
            p for n, p in enc_param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in enc_param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    enc_optimizer = AdamW(enc_optimizer_grouped_parameters, lr=args.enc_lr)
    enc_scheduler = WarmupLinearSchedule(enc_optimizer,
                                         int(num_train_steps *
                                             args.enc_warmup),
                                         t_total=num_train_steps)

    # Decoder uses its own learning rate and warmup schedule.
    dec_param_optimizer = list(model.decoder.parameters())
    dec_optimizer = AdamW(dec_param_optimizer, lr=args.dec_lr)
    dec_scheduler = WarmupLinearSchedule(dec_optimizer,
                                         int(num_train_steps *
                                             args.dec_warmup),
                                         t_total=num_train_steps)

    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.batch_size,
                                  collate_fn=train_data.collate_fn,
                                  num_workers=args.num_workers,
                                  worker_init_fn=worker_init_fn)

    loss_fnc = nn.CrossEntropyLoss()
    best_score = {'epoch': 0, 'joint_acc': 0, 'op_acc': 0, 'final_slot_f1': 0}
    total_step = 0
    for epoch in range(args.n_epochs):
        batch_loss = []
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch = [
                b.to(device) if not isinstance(b, int) else b for b in batch
            ]
            input_ids, input_mask, segment_ids, op_ids, gen_ids = batch

            state_scores, span_scores = model(input_ids=input_ids,
                                              token_type_ids=segment_ids,
                                              attention_mask=input_mask)

            loss_state = loss_fnc(
                state_scores.contiguous().view(-1, len(op2id)),
                op_ids.contiguous().view(-1))
            try:
                loss_span = masked_cross_entropy_for_value(
                    span_scores.contiguous(), gen_ids.contiguous(),
                    tokenizer.vocab['[PAD]'])
            except Exception as e:
                # BUG FIX: the original only printed the exception, leaving
                # `loss_span` undefined and crashing with a NameError on the
                # combined-loss line below. Fall back to a zero span loss so
                # training continues on the state loss alone.
                print(e)
                loss_span = torch.zeros_like(loss_state)
            # Fixed 0.8/0.2 weighting between state-op and span losses.
            loss = loss_state * 0.8 + loss_span * 0.2
            batch_loss.append(loss.item())

            loss.backward()
            enc_optimizer.step()
            enc_scheduler.step()
            dec_optimizer.step()
            dec_scheduler.step()
            model.zero_grad()

            total_step += 1

            if step % 100 == 0:
                print("[%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, span_loss : %.3f" \
                          % (epoch+1, args.n_epochs, step,
                             len(train_dataloader), np.mean(batch_loss),
                             loss_state.item(), loss_span.item()))
                batch_loss = []

        if (epoch + 1) % args.eval_epoch == 0:
            print('total_step: ', total_step)
            eval_res = model_evaluation(make_turn_label, postprocessing,
                                        state_equal, OP, model, dev_data_raw,
                                        tokenizer, slot_meta, epoch + 1,
                                        args.test_size_window, args.test_MG)
            if eval_res['joint_acc'] > best_score['joint_acc']:
                # New best dev joint accuracy: checkpoint it.
                best_score = eval_res
                model_to_save = model.module if hasattr(model,
                                                        'module') else model
                save_path = os.path.join(
                    args.save_dir,
                    'model_best_gran[%s]_scale[%s]_seed[%s].bin' %
                    (str(args.train_size_window), str(
                        args.train_scale), args.random_seed))
                torch.save(model_to_save.state_dict(), save_path)
            print("Best Score : ", best_score)
            print("\n")

            # Early stopping once no improvement for `patience` epochs.
            if epoch > args.patience_start_epoch and best_score[
                    'epoch'] + args.patience < epoch:
                print("out of patience...")
                break

    print("Test using best model...")
    best_epoch = best_score['epoch']
    ckpt_path = os.path.join(
        args.save_dir, 'model_best_gran[%s]_scale[%s]_seed[%s].bin' %
        (str(args.train_size_window), str(args.train_scale), args.random_seed))
    # Rebuild a clean model and load the best checkpoint for final testing.
    model = MGDST(model_config, len(op2id), len(slot_meta))
    ckpt = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt)
    model.to(device)

    model_evaluation(make_turn_label, postprocessing, state_equal, OP, model,
                     test_data_raw, tokenizer, slot_meta, best_epoch,
                     args.test_size_window, args.test_MG)