def main(args): if args.dataset == 'sim-R': from BERTDST_utils.simR_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP if args.dataset == 'sim-M': from BERTDST_utils.simM_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP if args.dataset == 'DSTC2': from BERTDST_utils.DSTC2_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP if args.dataset == 'WOZ2.0': from BERTDST_utils.WOZ_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP if args.dataset == 'MultiWOZ2.1': from BERTDST_utils.MultiWOZ_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, OP, make_slot_meta ontology = json.load(open(args.ontology_data_path)) SLOT, ontology = make_slot_meta(ontology) slot_meta = SLOT tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True) data = prepare_dataset(1.0, args.test_data_path, tokenizer, slot_meta, args.test_size_window, args.max_seq_length, args.test_MG) model_config = BertConfig.from_json_file(args.bert_config_path) model_config.dropout = 0.1 op2id = OP model = MGDST(model_config, len(op2id), len(slot_meta)) ckpt = torch.load(args.model_ckpt_path, map_location='cpu') model.load_state_dict(ckpt) model.eval() model.to(device) model_evaluation(make_turn_label, postprocessing, state_equal, OP, model, data, tokenizer, slot_meta, 0, args.test_size_window, args.test_MG)
def __init__(self):
    model_dir = '/var/model/bert'
    if not os.path.isdir(model_dir):
        model_dir = os.path.abspath(os.path.dirname(__file__) + '/../../var/model/bert')
    self.use_gpu: bool = torch.cuda.is_available()
    self.config: BertConfig = BertConfig.from_json_file(model_dir + '/config.json')
    self.tokenizer: BertTokenizer = BertTokenizer.from_pretrained(model_dir + '/vocab.txt', do_lower_case=False)
    self.model_masked: BertForMaskedLM = BertForMaskedLM.from_pretrained(model_dir + '/model.bin', config=self.config)
    self.model: BertModel = self.model_masked.bert
    # freeze bert encoder
    for param in self.model.parameters():
        param.requires_grad = False
    for param in self.model_masked.parameters():
        param.requires_grad = False
    self.model.encoder.output_hidden_states = True
    self.model.eval()
    self.model_masked.eval()
    if self.use_gpu:
        self.model.cuda()
        self.model_masked.cuda()
def __init__(self, *, pretrained_model_name=None, config_filename=None, vocab_size=None, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, hidden_act="gelu", max_position_embeddings=512, random_init=False, **kwargs): TrainableNM.__init__(self, **kwargs) # Check that only one of pretrained_model_name, config_filename, and # vocab_size was passed in total = 0 if pretrained_model_name is not None: total += 1 if config_filename is not None: total += 1 if vocab_size is not None: total += 1 if total != 1: raise ValueError( "Only one of pretrained_model_name, vocab_size, " + "or config_filename should be passed into the " + "BERT constructor.") if vocab_size is not None: config = BertConfig( vocab_size_or_config_json_file=vocab_size, hidden_size=hidden_size, num_hidden_layers=num_hidden_layers, num_attention_heads=num_attention_heads, intermediate_size=intermediate_size, hidden_act=hidden_act, max_position_embeddings=max_position_embeddings) model = BertModel(config) elif pretrained_model_name is not None: model = BertModel.from_pretrained(pretrained_model_name) elif config_filename is not None: config = BertConfig.from_json_file(config_filename) model = BertModel(config) else: raise ValueError( "Either pretrained_model_name or vocab_size must" + "be passed into the BERT constructor") model.to(self._device) self.add_module("bert", model) self.config = model.config if random_init: self.apply( lambda module: transformer_weights_init(module, xavier=False))
def start(check_accr=False):
    bert_config = BertConfig.from_json_file(config.bert_config_root)
    model = BertCloze(bert_config, num_choices=10)
    load_model(model, config.pretrained_bert_root)
    generate_prob(model)
    generate_result(i_range=5)
    if check_accr:
        check_result()
    print("Program finished.")
def __init__(self, config, vocab):
    super(BERT_PRETRAINED_MODEL_JAPANESE, self).__init__()
    self.config = config
    self.vocab = vocab
    self.BERT_config = BertConfig.from_json_file(
        '../published_model/bert_spm/bert_config.json')
    self.tokenizer = BertTokenizer.from_pretrained(
        './spm_model/wiki-ja.vocab.txt')
    self.pretrained_BERT_model = BertModel.from_pretrained(
        '../published_model/bert_spm/pytorch_model.bin', config=self.BERT_config)
def main(args): device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') ontology = json.load(open(os.path.join(args.data_root, args.ontology_data))) slot_meta, _ = make_slot_meta(ontology) tokenizer = BertTokenizer.from_pretrained(args.bert_config) special_tokens = ['[SLOT]', '[NULL]'] special_tokens_dict = {'additional_special_tokens': special_tokens} tokenizer.add_special_tokens(special_tokens_dict) data = prepare_dataset(data_path=os.path.join(args.data_root, args.test_data), data_list=None, tokenizer=tokenizer, slot_meta=slot_meta, n_history=args.n_history, max_seq_length=args.max_seq_length, op_code=args.op_code) model_config = BertConfig.from_json_file(args.bert_config_path) model_config.dropout = 0.1 op2id = OP_SET[args.op_code] model = TransformerDST(model_config, len(op2id), len(domain2id), op2id['update']) ckpt = torch.load(args.model_ckpt_path, map_location='cpu') model.load_state_dict(ckpt) model.eval() model.to(device) if args.eval_all: model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, False, False, False) model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, False, False, True) model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, False, True, False) model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, False, True, True) model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, True, False, False) model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, True, True, False) model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, True, False, True) model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, True, True, True) else: model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, args.gt_op, args.gt_p_state, args.gt_gen)
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_bert(model, config, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
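# Hedged usage sketch (not part of the original snippet): the three paths below are
# placeholders for a Google-style TF checkpoint, its bert_config.json, and the
# desired PyTorch output file.
if __name__ == "__main__":
    convert_tf_checkpoint_to_pytorch(
        "uncased_L-12_H-768_A-12/bert_model.ckpt",
        "uncased_L-12_H-768_A-12/bert_config.json",
        "uncased_L-12_H-768_A-12/pytorch_model.bin",
    )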
def __init__(self, large, temp_dir, finetune=False):
    super(Bert, self).__init__()
    if large:
        # self.model = BertModel.from_pretrained('bert-base-multilingual-cased', cache_dir=temp_dir)
        config = BertConfig.from_json_file('bert-large/config.json')
        self.model = BertModel.from_pretrained('bert-large', cache_dir=None, config=config)
    else:
        self.model = BertModel.from_pretrained(
            'bert-base-multilingual-cased', cache_dir=temp_dir)
    self.finetune = finetune
def __init__(self, backbone, neck, rpn_head, text_bbox_roi_extractor, text_bbox_head, text_mask_roi_extractor, text_mask_head, char_bbox_roi_extractor, char_bbox_head, crm_cfg, train_cfg=None, test_cfg=None, pretrained=None, lm_cfg=None): super(AE_TextSpotter, self).__init__() self.backbone = builder.build_backbone(backbone) self.neck = builder.build_neck(neck) self.rpn_head = builder.build_head(rpn_head) # text detection module self.text_bbox_roi_extractor = builder.build_roi_extractor( text_bbox_roi_extractor) self.text_bbox_head = builder.build_head(text_bbox_head) self.text_mask_roi_extractor = builder.build_roi_extractor( text_mask_roi_extractor) self.text_mask_head = builder.build_head(text_mask_head) # character-based recognition module self.char_bbox_roi_extractor = builder.build_roi_extractor( char_bbox_roi_extractor) self.char_bbox_head = builder.build_head(char_bbox_head) self.crm_cfg = crm_cfg self.label2char = mmcv.load(crm_cfg.char_dict_file)['label2char'] # language module if lm_cfg is not None: self.lm_cfg = lm_cfg self.dictmap = mmcv.load(lm_cfg.dictmap_file) self.bert_tokenizer = BertTokenizer.from_pretrained( lm_cfg.bert_vocab_file) self.bert_model = BertModel.from_pretrained( lm_cfg.bert_model_file, config=BertConfig.from_json_file(lm_cfg.bert_cfg_file)) self.lang_model = GRUFC(**lm_cfg.lang_model) self.train_cfg = train_cfg self.test_cfg = test_cfg self.init_weights(pretrained=pretrained)
def convert_ckpt_compatible(ckpt_path, config_path):
    ckpt = torch.load(ckpt_path, map_location='cpu')
    keys = list(ckpt.keys())
    for key in keys:
        if 'LayerNorm' in key:
            # Rename old TF-style LayerNorm parameters: gamma -> weight, beta -> bias
            if 'gamma' in key:
                ckpt[key.replace('gamma', 'weight')] = ckpt.pop(key)
            else:
                ckpt[key.replace('beta', 'bias')] = ckpt.pop(key)
    model_config = BertConfig.from_json_file(config_path)
    model = BertForPreTraining(model_config)
    model.load_state_dict(ckpt)
    new_ckpt = model.bert.state_dict()
    return new_ckpt
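# Hedged usage sketch (assumed file names, not from the original snippet): convert an
# old-style checkpoint, save the bare BERT encoder weights, and reload them into a
# plain BertModel built from the same config.
if __name__ == "__main__":
    new_ckpt = convert_ckpt_compatible("old_pytorch_model.bin", "bert_config.json")
    torch.save(new_ckpt, "bert_encoder_compatible.bin")

    config = BertConfig.from_json_file("bert_config.json")
    bert = BertModel(config)
    bert.load_state_dict(new_ckpt)  # keys match because new_ckpt came from model.bert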
def __init__(self, args, temp_dir, finetune=False):
    super(Bert, self).__init__()
    self.model = None  # so the check below raises NotImplementedError instead of AttributeError
    if args.pretrained_model_type in ['bert-base-uncased', 'bert-base-multilingual-uncased']:
        self.model = BertModel.from_pretrained(args.pretrained_model_type, cache_dir=temp_dir)
    if args.pretrained_model_type in ['rubert-deeppavlov']:
        name = args.pretrained_model_type
        config = BertConfig.from_json_file(mapper(name, 'config'))
        self.model = BertModel.from_pretrained(mapper(name, 'model'), config=config)
    if not self.model:
        raise NotImplementedError("self.model")
    bert_data = BertData(args)
    self.model.resize_token_embeddings(len(bert_data.tokenizer))
    self.finetune = finetune
def main(args): device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') ontology = json.load(open(os.path.join(args.data_root, args.ontology_data))) slot_meta, _ = make_slot_meta(ontology) tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True) data = prepare_dataset(os.path.join(args.data_root, args.test_data), tokenizer, slot_meta, args.n_history, args.max_seq_length, args.op_code) model_config = BertConfig.from_json_file(args.bert_config_path) model_config.dropout = 0.1 op2id = OP_SET[args.op_code] model = TransformerDST(model_config, len(op2id), len(domain2id), op2id['update']) ckpt = torch.load(args.model_ckpt_path, map_location='cpu') model.load_state_dict(ckpt) model.eval() model.to(device) if args.eval_all: model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, False, False, False) model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, False, False, True) model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, False, True, False) model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, False, True, True) model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, True, False, False) model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, True, True, False) model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, True, False, True) model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, True, True, True) else: model_evaluation(model, data, tokenizer, slot_meta, 0, args.op_code, args.gt_op, args.gt_p_state, args.gt_gen)
def _get_custom_bert(pretrained_weights):
    model_fname = 'pytorch_model.bin'
    if model_fname not in os.listdir(pretrained_weights):
        convert(pretrained_weights)
    model_fpath = os.path.join(pretrained_weights, model_fname)
    config_fpath = os.path.join(pretrained_weights, 'bert_config.json')
    config = BertConfig.from_json_file(config_fpath)
    custom_bert = BertModel(config)
    state_dict = torch.load(model_fpath)

    def _remove_prefix(string):
        prefix = 'bert.'
        if string.startswith(prefix):
            string = string[len(prefix):]
        return string

    # Drop the 'cls' head weights and strip the 'bert.' prefix so the checkpoint
    # keys match a bare BertModel.
    state_dict = {
        _remove_prefix(k): v
        for k, v in state_dict.items() if not k.startswith('cls')
    }
    custom_bert.load_state_dict(state_dict)
    return custom_bert
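# Hedged usage sketch: the directory name is illustrative and is assumed to contain
# bert_config.json plus pytorch_model.bin (or a checkpoint that convert() can turn into one).
if __name__ == "__main__":
    bert = _get_custom_bert("./custom_bert_weights")
    bert.eval()  # switch to inference mode before extracting features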
def main(args): assert args.use_one_optim is True if args.use_cls_only: args.no_dial = True print("### use_cls_only: {:}".format(args.use_cls_only)) print("### no_dial: {:}".format(args.no_dial)) if args.recover_e > 0: raise NotImplementedError("This option is from my oldest code version. " "I have not checked it for this code version.") if not os.path.exists(args.save_dir): os.mkdir(args.save_dir) print("### mkdir {:}".format(args.save_dir)) def worker_init_fn(worker_id): np.random.seed(args.random_seed + worker_id) n_gpu = 0 if torch.cuda.is_available() and (not args.use_cpu): n_gpu = torch.cuda.device_count() device = torch.device('cuda') print("### Device: {:}".format(device)) else: print("### Use CPU (Debugging)") device = torch.device("cpu") if args.random_seed < 0: print("### Pick a random seed") args.random_seed = random.sample(list(range(0, 100000)), 1)[0] print("### Random Seed: {:}".format(args.random_seed)) np.random.seed(args.random_seed) random.seed(args.random_seed) rng = random.Random(args.random_seed) torch.manual_seed(args.random_seed) if n_gpu > 0: if args.random_seed >= 0: torch.cuda.manual_seed(args.random_seed) torch.cuda.manual_seed_all(args.random_seed) torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True if not os.path.exists(args.save_dir): os.mkdir(args.save_dir) ontology = json.load(open(args.ontology_data)) slot_meta, ontology = make_slot_meta(ontology) op2id = OP_SET[args.op_code] print(op2id) tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True) train_path = os.path.join(args.data_root, "train.pt") dev_path = os.path.join(args.data_root, "dev.pt") test_path = os.path.join(args.data_root, "test.pt") if not os.path.exists(test_path): test_data_raw = prepare_dataset(data_path=args.test_data_path, tokenizer=tokenizer, slot_meta=slot_meta, n_history=args.n_history, max_seq_length=args.max_seq_length, op_code=args.op_code) torch.save(test_data_raw, test_path) else: test_data_raw = torch.load(test_path) print("# test examples %d" % len(test_data_raw)) if not os.path.exists(train_path): train_data_raw = prepare_dataset(data_path=args.train_data_path, tokenizer=tokenizer, slot_meta=slot_meta, n_history=args.n_history, max_seq_length=args.max_seq_length, op_code=args.op_code) torch.save(train_data_raw, train_path) else: train_data_raw = torch.load(train_path) train_data = MultiWozDataset(train_data_raw, tokenizer, slot_meta, args.max_seq_length, rng, ontology, args.word_dropout, args.shuffle_state, args.shuffle_p, pad_id=tokenizer.convert_tokens_to_ids(['[PAD]'])[0], slot_id=tokenizer.convert_tokens_to_ids(['[SLOT]'])[0], decoder_teacher_forcing=args.decoder_teacher_forcing, use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only, no_dial=args.no_dial, use_cls_only=args.use_cls_only) print("# train examples %d" % len(train_data_raw)) if not os.path.exists(dev_path): dev_data_raw = prepare_dataset(data_path=args.dev_data_path, tokenizer=tokenizer, slot_meta=slot_meta, n_history=args.n_history, max_seq_length=args.max_seq_length, op_code=args.op_code) torch.save(dev_data_raw, dev_path) else: dev_data_raw = torch.load(dev_path) print("# dev examples %d" % len(dev_data_raw)) model_config = BertConfig.from_json_file(args.bert_config_path) model_config.dropout = args.dropout model_config.attention_probs_dropout_prob = args.attention_probs_dropout_prob model_config.hidden_dropout_prob = args.hidden_dropout_prob type_vocab_size = 4 dec_config = args model = TransformerDST(model_config, dec_config, len(op2id), len(domain2id), 
op2id['update'], tokenizer.convert_tokens_to_ids(['[MASK]'])[0], tokenizer.convert_tokens_to_ids(['[SEP]'])[0], tokenizer.convert_tokens_to_ids(['[PAD]'])[0], tokenizer.convert_tokens_to_ids(['-'])[0], type_vocab_size, args.exclude_domain) if not os.path.exists(args.bert_ckpt_path): args.bert_ckpt_path = download_ckpt(args.bert_ckpt_path, args.bert_config_path, 'assets') state_dict = torch.load(args.bert_ckpt_path, map_location='cpu') _k = 'embeddings.token_type_embeddings.weight' print("config.type_vocab_size != state_dict[bert.embeddings.token_type_embeddings.weight] ({0} != {1})".format( type_vocab_size, state_dict[_k].shape[0])) state_dict[_k].resize_( type_vocab_size, state_dict[_k].shape[1]) state_dict[_k].data[2, :].copy_(state_dict[_k].data[0, :]) state_dict[_k].data[3, :].copy_(state_dict[_k].data[0, :]) model.bert.load_state_dict(state_dict) print("\n### Done Load BERT") sys.stdout.flush() # re-initialize added special tokens ([SLOT], [NULL], [EOS]) model.bert.embeddings.word_embeddings.weight.data[1].normal_(mean=0.0, std=0.02) model.bert.embeddings.word_embeddings.weight.data[2].normal_(mean=0.0, std=0.02) model.bert.embeddings.word_embeddings.weight.data[3].normal_(mean=0.0, std=0.02) # re-initialize seg-2, seg-3 model.bert.embeddings.token_type_embeddings.weight.data[2].normal_(mean=0.0, std=0.02) model.bert.embeddings.token_type_embeddings.weight.data[3].normal_(mean=0.0, std=0.02) model.to(device) num_train_steps = int(len(train_data_raw) / args.batch_size * args.n_epochs) if args.use_one_optim: print("### Use One Optim") param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any( nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any( nd in n for nd in no_decay)], 'weight_decay': 0.0} ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.enc_lr) scheduler = WarmupLinearSchedule(optimizer, int(num_train_steps * args.enc_warmup), t_total=num_train_steps) else: no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] enc_param_optimizer = list(model.bert.named_parameters()) # TODO: For BERT only print('### Optim BERT: {:}'.format(len(enc_param_optimizer))) enc_optimizer_grouped_parameters = [ {'params': [p for n, p in enc_param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in enc_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] enc_optimizer = AdamW(enc_optimizer_grouped_parameters, lr=args.enc_lr) enc_scheduler = WarmupLinearSchedule(enc_optimizer, int(num_train_steps * args.enc_warmup), t_total=num_train_steps) dec_param_optimizer = list(model.named_parameters()) # TODO: For other parameters print('### Optim All: {:}'.format(len(dec_param_optimizer))) dec_param_optimizer = [p for (n, p) in dec_param_optimizer if 'bert' not in n] print('### Optim OTH: {:}'.format(len(dec_param_optimizer))) dec_optimizer = AdamW(dec_param_optimizer, lr=args.dec_lr) dec_scheduler = WarmupLinearSchedule(dec_optimizer, int(num_train_steps * args.dec_warmup), t_total=num_train_steps) if args.recover_e > 0: model_recover, enc_recover, dec_recover = load(args, str(args.recover_e)) print("### Recover Model E{:}".format(args.recover_e)) sys.stdout.flush() model.load_state_dict(model_recover) print("### Recover Optim E{:}".format(args.recover_e)) sys.stdout.flush() enc_optimizer.load_state_dict(enc_recover) 
dec_optimizer.load_state_dict(dec_optimizer) if n_gpu > 1: model = torch.nn.DataParallel(model) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size, collate_fn=train_data.collate_fn, num_workers=args.num_workers, worker_init_fn=worker_init_fn) loss_fnc = nn.CrossEntropyLoss() best_score = {'epoch': 0, 'joint_acc': 0, 'op_acc': 0, 'final_slot_f1': 0} start_time = time.time() for epoch in range(args.n_epochs): batch_loss = [] model.train() for step, batch in enumerate(train_dataloader): batch = [b.to(device) if (not isinstance(b, int)) and (not isinstance(b, dict) and (not isinstance(b, list)) and (not isinstance(b, np.ndarray))) else b for b in batch] input_ids_p, segment_ids_p, input_mask_p, \ state_position_ids, op_ids, domain_ids, input_ids_g, segment_ids_g, position_ids_g, input_mask_g, \ masked_pos, masked_weights, lm_label_ids, id_n_map, gen_max_len, n_total_pred = batch domain_scores, state_scores, loss_g = model(input_ids_p, segment_ids_p, input_mask_p, state_position_ids, input_ids_g, segment_ids_g, position_ids_g, input_mask_g, masked_pos, masked_weights, lm_label_ids, id_n_map, gen_max_len, only_pred_op=args.only_pred_op, n_gpu=n_gpu) if n_total_pred > 0: loss_g = loss_g.sum() / n_total_pred else: loss_g = 0 loss_s = loss_fnc(state_scores.view(-1, len(op2id)), op_ids.view(-1)) if args.only_pred_op: loss = loss_s else: loss = loss_s + loss_g if args.exclude_domain is not True: loss_d = loss_fnc(domain_scores.view(-1, len(domain2id)), domain_ids.view(-1)) loss = loss + loss_d batch_loss.append(loss.item()) loss.backward() if args.use_one_optim: optimizer.step() scheduler.step() else: enc_optimizer.step() enc_scheduler.step() dec_optimizer.step() dec_scheduler.step() model.zero_grad() if step % 100 == 0: try: loss_g = loss_g.item() except AttributeError: loss_g = loss_g if args.exclude_domain is not True: print("time %.1f min, [%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f, dom_loss : %.3f" \ % ((time.time()-start_time)/60, epoch+1, args.n_epochs, step, len(train_dataloader), np.mean(batch_loss), loss_s.item(), loss_g, loss_d.item())) else: print("time %.1f min, [%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f" \ % ((time.time()-start_time)/60, epoch+1, args.n_epochs, step, len(train_dataloader), np.mean(batch_loss), loss_s.item(), loss_g)) sys.stdout.flush() batch_loss = [] if args.use_one_optim: save(args, epoch + 1, model, optimizer) else: save(args, epoch + 1, model, enc_optimizer, dec_optimizer) if ((epoch+1) % args.eval_epoch == 0) and (epoch+1 >= 8): eval_res = model_evaluation(model, dev_data_raw, tokenizer, slot_meta, epoch+1, args.op_code, use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only, no_dial=args.no_dial, use_cls_only=args.use_cls_only, n_gpu=n_gpu) print("### Epoch {:} Score : ".format(epoch+1), eval_res) if eval_res['joint_acc'] > best_score['joint_acc']: best_score = eval_res print("### Best Joint Acc: {:} ###".format(best_score['joint_acc'])) print('\n') if epoch+1 >= 8: # To speed up eval_res_test = model_evaluation(model, test_data_raw, tokenizer, slot_meta, epoch + 1, args.op_code, use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only, no_dial=args.no_dial, use_cls_only=args.use_cls_only, n_gpu=n_gpu) print("### Epoch {:} Test Score : ".format(epoch + 1), eval_res_test)
def main(args): def worker_init_fn(worker_id): np.random.seed(args.random_seed + worker_id) n_gpu = 0 if torch.cuda.is_available(): n_gpu = torch.cuda.device_count() np.random.seed(args.random_seed) random.seed(args.random_seed) rng = random.Random(args.random_seed) torch.manual_seed(args.random_seed) if n_gpu > 0: torch.cuda.manual_seed(args.random_seed) torch.cuda.manual_seed_all(args.random_seed) torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True if not os.path.exists(args.save_dir): os.mkdir(args.save_dir) ontology = json.load(open(args.ontology_data)) slot_meta, ontology = make_slot_meta(ontology) op2id = OP_SET[args.op_code] print(op2id) tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True) train_data_raw = prepare_dataset(data_path=args.train_data_path, tokenizer=tokenizer, slot_meta=slot_meta, n_history=args.n_history, max_seq_length=args.max_seq_length, op_code=args.op_code) train_data = MultiWozDataset(train_data_raw, tokenizer, slot_meta, args.max_seq_length, rng, ontology, args.word_dropout, args.shuffle_state, args.shuffle_p) print("# train examples %d" % len(train_data_raw)) dev_data_raw = prepare_dataset(data_path=args.dev_data_path, tokenizer=tokenizer, slot_meta=slot_meta, n_history=args.n_history, max_seq_length=args.max_seq_length, op_code=args.op_code) print("# dev examples %d" % len(dev_data_raw)) test_data_raw = prepare_dataset(data_path=args.test_data_path, tokenizer=tokenizer, slot_meta=slot_meta, n_history=args.n_history, max_seq_length=args.max_seq_length, op_code=args.op_code) print("# test examples %d" % len(test_data_raw)) model_config = BertConfig.from_json_file(args.bert_config_path) model_config.dropout = args.dropout model_config.attention_probs_dropout_prob = args.attention_probs_dropout_prob model_config.hidden_dropout_prob = args.hidden_dropout_prob model = SomDST(model_config, len(op2id), len(domain2id), op2id['update'], args.exclude_domain) if not os.path.exists(args.bert_ckpt_path): args.bert_ckpt_path = download_ckpt(args.bert_ckpt_path, args.bert_config_path, 'assets') ckpt = torch.load(args.bert_ckpt_path, map_location='cpu') model.encoder.bert.load_state_dict(ckpt) # re-initialize added special tokens ([SLOT], [NULL], [EOS]) model.encoder.bert.embeddings.word_embeddings.weight.data[1].normal_(mean=0.0, std=0.02) model.encoder.bert.embeddings.word_embeddings.weight.data[2].normal_(mean=0.0, std=0.02) model.encoder.bert.embeddings.word_embeddings.weight.data[3].normal_(mean=0.0, std=0.02) model.to(device) num_train_steps = int(len(train_data_raw) / args.batch_size * args.n_epochs) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] enc_param_optimizer = list(model.encoder.named_parameters()) enc_optimizer_grouped_parameters = [ {'params': [p for n, p in enc_param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in enc_param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] enc_optimizer = AdamW(enc_optimizer_grouped_parameters, lr=args.enc_lr) enc_scheduler = WarmupLinearSchedule(enc_optimizer, int(num_train_steps * args.enc_warmup), t_total=num_train_steps) dec_param_optimizer = list(model.decoder.parameters()) dec_optimizer = AdamW(dec_param_optimizer, lr=args.dec_lr) dec_scheduler = WarmupLinearSchedule(dec_optimizer, int(num_train_steps * args.dec_warmup), t_total=num_train_steps) if n_gpu > 1: model = torch.nn.DataParallel(model) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, 
sampler=train_sampler, batch_size=args.batch_size, collate_fn=train_data.collate_fn, num_workers=args.num_workers, worker_init_fn=worker_init_fn) loss_fnc = nn.CrossEntropyLoss() best_score = {'epoch': 0, 'joint_acc': 0, 'op_acc': 0, 'final_slot_f1': 0} for epoch in range(args.n_epochs): batch_loss = [] model.train() for step, batch in enumerate(train_dataloader): batch = [b.to(device) if not isinstance(b, int) else b for b in batch] input_ids, input_mask, segment_ids, state_position_ids, op_ids,\ domain_ids, gen_ids, max_value, max_update = batch if rng.random() < args.decoder_teacher_forcing: # teacher forcing teacher = gen_ids else: teacher = None domain_scores, state_scores, gen_scores = model(input_ids=input_ids, token_type_ids=segment_ids, state_positions=state_position_ids, attention_mask=input_mask, max_value=max_value, op_ids=op_ids, max_update=max_update, teacher=teacher) loss_s = loss_fnc(state_scores.view(-1, len(op2id)), op_ids.view(-1)) loss_g = masked_cross_entropy_for_value(gen_scores.contiguous(), gen_ids.contiguous(), tokenizer.vocab['[PAD]']) loss = loss_s + loss_g if args.exclude_domain is not True: loss_d = loss_fnc(domain_scores.view(-1, len(domain2id)), domain_ids.view(-1)) loss = loss + loss_d batch_loss.append(loss.item()) loss.backward() enc_optimizer.step() enc_scheduler.step() dec_optimizer.step() dec_scheduler.step() model.zero_grad() if step % 100 == 0: if args.exclude_domain is not True: print("[%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f, dom_loss : %.3f" \ % (epoch+1, args.n_epochs, step, len(train_dataloader), np.mean(batch_loss), loss_s.item(), loss_g.item(), loss_d.item())) else: print("[%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, gen_loss : %.3f" \ % (epoch+1, args.n_epochs, step, len(train_dataloader), np.mean(batch_loss), loss_s.item(), loss_g.item())) batch_loss = [] if (epoch+1) % args.eval_epoch == 0: eval_res = model_evaluation(model, dev_data_raw, tokenizer, slot_meta, epoch+1, args.op_code) if eval_res['joint_acc'] > best_score['joint_acc']: best_score = eval_res model_to_save = model.module if hasattr(model, 'module') else model save_path = os.path.join(args.save_dir, 'model_best.bin') torch.save(model_to_save.state_dict(), save_path) print("Best Score : ", best_score) print("\n") print("Test using best model...") best_epoch = best_score['epoch'] ckpt_path = os.path.join(args.save_dir, 'model_best.bin') model = SomDST(model_config, len(op2id), len(domain2id), op2id['update'], args.exclude_domain) ckpt = torch.load(ckpt_path, map_location='cpu') model.load_state_dict(ckpt) model.to(device) model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code, is_gt_op=False, is_gt_p_state=False, is_gt_gen=False) model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code, is_gt_op=False, is_gt_p_state=False, is_gt_gen=True) model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code, is_gt_op=False, is_gt_p_state=True, is_gt_gen=False) model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code, is_gt_op=False, is_gt_p_state=True, is_gt_gen=True) model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code, is_gt_op=True, is_gt_p_state=False, is_gt_gen=False) model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code, is_gt_op=True, is_gt_p_state=True, is_gt_gen=False) model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code, is_gt_op=True, 
is_gt_p_state=False, is_gt_gen=True) model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code, is_gt_op=True, is_gt_p_state=True, is_gt_gen=True)
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--bert_config_file", default=None, type=str, required=True, help= "The config json file corresponding to the pre-trained BERT model. This specifies the model architecture." ) parser.add_argument( "--vocab_file", default=None, type=str, required=True, help="The vocabulary file that the BERT model was trained on.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json") parser.add_argument( "--predict_file", default=None, type=str, help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json" ) parser.add_argument( "--init_checkpoint", default=None, type=str, help="Initial checkpoint (usually from a pre-trained BERT model).") parser.add_argument( "--do_lower_case", default=True, action='store_true', help="Whether to lower case the input text. Should be True for uncased " "models and False for cased models.") parser.add_argument( "--max_seq_length", default=384, type=int, help= "The maximum total input sequence length after WordPiece tokenization. Sequences " "longer than this will be truncated, and sequences shorter than this will be padded." ) parser.add_argument( "--doc_stride", default=128, type=int, help= "When splitting up a long document into chunks, how much stride to take between chunks." ) parser.add_argument( "--max_query_length", default=64, type=int, help= "The maximum number of tokens for the question. Questions longer than this will " "be truncated to this length.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% " "of training.") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--save_checkpoints_steps", default=1000, type=int, help="How often to save the model checkpoint.") parser.add_argument("--iterations_per_loop", default=1000, type=int, help="How many steps to make in each estimator call.") parser.add_argument( "--n_best_size", default=20, type=int, help= "The total number of n-best predictions to generate in the nbest_predictions.json " "output file.") parser.add_argument( "--max_answer_length", default=30, type=int, help= "The maximum length of an answer that can be generated. This is needed because the start " "and end predictions are not conditioned on one another.") parser.add_argument( "--verbose_logging", default=False, action='store_true', help= "If true, all of the warnings related to data processing will be printed. 
" "A number of warnings are expected for a normal SQuAD evaluation.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument( "--accumulate_gradients", type=int, default=1, help= "Number of steps to accumulate gradient on (divide the batch_size and accumulate)" ) parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument("--checkpoint", default=None, type=str, help="The checkpoint file.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer") parser.add_argument( '--fp16', action='store_true', help= "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit" ) args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) if args.accumulate_gradients < 1: raise ValueError( "Invalid accumulate_gradients parameter: {}, should be >= 1". format(args.accumulate_gradients)) args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_predict: raise ValueError( "At least one of `do_train` or `do_predict` must be True.") if args.do_train: if not args.train_file: raise ValueError( "If `do_train` is True, then `train_file` must be specified.") if args.do_predict: if not args.predict_file: raise ValueError( "If `do_predict` is True, then `predict_file` must be specified." 
) bert_config = BertConfig.from_json_file(args.bert_config_file) if args.max_seq_length > bert_config.max_position_embeddings: raise ValueError( "Cannot use sequence length %d because the BERT model " "was only trained up to sequence length %d" % (args.max_seq_length, bert_config.max_position_embeddings)) if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory () already exists and is not empty.") os.makedirs(args.output_dir, exist_ok=True) tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') train_examples = None num_train_steps = None if args.do_train: train_examples = read_label_examples(input_file=args.train_file, is_training=True) num_train_steps = int( len(train_examples) / args.train_batch_size * args.num_train_epochs) model = BertForLabelling(bert_config, args.train_batch_size) if not os.path.exists(args.output_dir): os.mkdir(args.output_dir) output_model_file = os.path.join(args.output_dir, "saved_model") if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel( model, device_ids=[args.local_rank], output_device=args.local_rank) elif n_gpu > 1: model = torch.nn.DataParallel(model) model.to(device) if args.do_train: no_decay = ['bias', 'gamma', 'beta'] t_total = len( train_examples ) // args.gradient_accumulation_steps * args.num_train_epochs optimizer_parameters = [{ 'params': [p for n, p in model.named_parameters() if n not in no_decay], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in model.named_parameters() if n in no_decay], 'weight_decay_rate': 0.0 }] logger.info("***** Preparing optimizer *****") optimizer = AdamW(optimizer_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." 
) model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) if args.do_train: train_features = convert_examples_to_features( examples=train_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=True) logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_seq = torch.tensor([f.label_seq for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_seq) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.to(device) model.train() global_step = 0 for i in trange(int(args.num_train_epochs), desc="Epoch"): for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_seq = batch loss = model(input_ids, segment_ids, input_mask, label_seq) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() # We have accumulated enought gradients model.zero_grad() global_step += 1 torch.save(model.state_dict(), output_model_file + ".{}".format(i)) if args.do_predict: if args.checkpoint: state_dict = torch.load(args.checkpoint) model.load_state_dict(state_dict) eval_examples = read_label_examples(input_file=args.predict_file, is_training=False) eval_features = convert_examples_to_features( examples=eval_examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=False) logger.info("***** Running predictions *****") logger.info(" Num orig examples = %d", len(eval_examples)) logger.info(" Num split examples = %d", len(eval_features)) logger.info(" Batch size = %d", args.predict_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) if args.local_rank == -1: eval_sampler = SequentialSampler(eval_data) else: eval_sampler = DistributedSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size) model.eval() all_results = [] logger.info("Start evaluating") nb_f1_scores, nb_nums = 0, 0 nb_p_scores, nb_r_scores = 0, 0 output_prediction_file = os.path.join(args.output_dir, "predictions.txt") output_prediction_file_writer = open(output_prediction_file, 'w', encoding='utf-8') for input_ids, 
input_mask, segment_ids, example_indices in tqdm( eval_dataloader, desc="Evaluating"): if len(all_results) % 1000 == 0: logger.info("Processing example: %d" % (len(all_results))) input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) with torch.no_grad(): batch_logits = model(input_ids, segment_ids, input_mask) for i, example_index in enumerate(example_indices): logits = batch_logits[i].detach().cpu().tolist() # print(len(logits)) eval_feature = eval_features[example_index.item()] # print(np.array(np.array(logits) > 0.5, dtype=int)) # print(np.array(eval_feature.label_seq)) # print(eval_feature) # nb_f1_score = f1_score(np.array(np.array(logits) > 0.5, dtype=int), np.array(eval_feature.label_seq)) # nb_f1_scores += nb_f1_score # nb_p_scores += precision_score(np.array(np.array(logits) > 0.5, dtype=int), np.array(eval_feature.label_seq)) # nb_r_scores += recall_score(np.array(np.array(logits) > 0.5, dtype=int), np.array(eval_feature.label_seq)) nb_nums += 1 words = eval_examples[example_index.item()].doc_tokens words_scores = [-1] * len(words) for token_id in range(len(logits)): if token_id not in eval_feature.token_to_orig_map: continue orig_id = eval_feature.token_to_orig_map[token_id] words_scores[orig_id] = max(words_scores[orig_id], logits[token_id]) words_num = max(eval_feature.token_to_orig_map.values()) + 1 output_prediction_file_writer.write(' '.join( words[:words_num])) output_prediction_file_writer.write('\t') output_prediction_file_writer.write(' '.join( [str(ws) for ws in words_scores[:words_num]])) output_prediction_file_writer.write('\n')
parser.add_argument("--config_file", default=None, type=str, help="The BERT model config") args = parser.parse_args() nf = nemo.core.NeuralModuleFactory(backend=nemo.core.Backend.PyTorch, local_rank=args.local_rank, optimization_level=args.amp_opt_level, log_dir=args.work_dir, create_tb_writer=True, files_to_copy=[__file__], add_time_to_log_dir=True) if args.config_file is not None: config = BertConfig.from_json_file(args.config_file).to_dict() args.vocab_size = config['vocab_size'] args.hidden_size = config['hidden_size'] args.num_hidden_layers = config['num_hidden_layers'] args.num_attention_heads = config['num_attention_heads'] args.intermediate_size = config['intermediate_size'] args.hidden_act = config['hidden_act'] args.max_seq_length = config['max_position_embeddings'] if not args.preprocessed_data: special_tokens = ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'] data_desc = BERTPretrainingDataDesc(args.dataset_name, args.data_dir, args.vocab_size, args.sample_size, special_tokens, 'train.txt') if args.tokenizer == "sentence-piece": nf.logger.info("To use SentencePieceTokenizer.")
def main(args): if not os.path.exists(args.save_dir): os.mkdir(args.save_dir) print("### mkdir {:}".format(args.save_dir)) def worker_init_fn(worker_id): np.random.seed(args.random_seed + worker_id) n_gpu = 0 if torch.cuda.is_available() and (not args.use_cpu): n_gpu = torch.cuda.device_count() device = torch.device('cuda') print("### Device: {:}".format(device)) else: print("### Use CPU (Debugging)") device = torch.device("cpu") if args.random_seed < 0: print("### Pick a random seed") args.random_seed = random.sample(list(range(1, 100000)), 1)[0] print("### Random Seed: {:}".format(args.random_seed)) np.random.seed(args.random_seed) random.seed(args.random_seed) rng = random.Random(args.random_seed) torch.manual_seed(args.random_seed) if n_gpu > 0: if args.random_seed >= 0: torch.cuda.manual_seed(args.random_seed) torch.cuda.manual_seed_all(args.random_seed) torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True if not os.path.exists(args.save_dir): os.mkdir(args.save_dir) ontology = json.load(open(args.ontology_data)) slot_meta, ontology = make_slot_meta(ontology) op2id = OP_SET[args.op_code] print(op2id) tokenizer = BertTokenizer.from_pretrained(args.bert_config) special_tokens = ['[SLOT]', '[NULL]'] special_tokens_dict = {'additional_special_tokens': special_tokens} tokenizer.add_special_tokens(special_tokens_dict) test_path = os.path.join(args.data_root_test, "test.pt") if not os.path.exists(test_path): test_data_raw = prepare_dataset_for_inference(data_path=args.test_data_path, data_list=None, tokenizer=tokenizer, slot_meta=slot_meta, n_history=args.n_history, max_seq_length=args.max_seq_length, op_code=args.op_code) torch.save(test_data_raw, test_path) else: test_data_raw = torch.load(test_path) print("# test examples %d" % len(test_data_raw)) model_config = BertConfig.from_json_file(args.bert_config_path) model_config.dropout = 0. model_config.attention_probs_dropout_prob = 0. model_config.hidden_dropout_prob = 0. 
type_vocab_size = 4 dec_config = args model = TransformerDST(model_config, dec_config, len(op2id), len(domain2id), op2id['update'], tokenizer.convert_tokens_to_ids(['[MASK]'])[0], tokenizer.convert_tokens_to_ids(['[SEP]'])[0], tokenizer.convert_tokens_to_ids(['[PAD]'])[0], tokenizer.convert_tokens_to_ids(['-'])[0], type_vocab_size, args.exclude_domain) state_dict = torch.load(args.bert_ckpt_path, map_location='cpu') _k = 'bert.embeddings.token_type_embeddings.weight' print("config.type_vocab_size != state_dict[bert.embeddings.token_type_embeddings.weight] ({0} != {1})".format( type_vocab_size, state_dict[_k].shape[0])) state_dict[_k] = state_dict[_k].repeat(int(type_vocab_size / state_dict[_k].shape[0]), 1) state_dict[_k].data[2, :].copy_(state_dict[_k].data[0, :]) state_dict[_k].data[3, :].copy_(state_dict[_k].data[0, :]) model.bert.load_state_dict(state_dict, strict=False) # re-initialize added special tokens ([SLOT], [NULL], [EOS]) model.bert.embeddings.word_embeddings.weight.data[1].normal_(mean=0.0, std=0.02) model.bert.embeddings.word_embeddings.weight.data[2].normal_(mean=0.0, std=0.02) model.bert.embeddings.word_embeddings.weight.data[3].normal_(mean=0.0, std=0.02) # re-initialize seg-2, seg-3 model.bert.embeddings.token_type_embeddings.weight.data[2].normal_(mean=0.0, std=0.02) model.bert.embeddings.token_type_embeddings.weight.data[3].normal_(mean=0.0, std=0.02) model.bert.resize_token_embeddings(len(tokenizer)) test_epochs = [int(e) for e in args.load_epoch.strip().lower().split('-')] for best_epoch in test_epochs: print("### Epoch {:}...".format(best_epoch)) sys.stdout.flush() ckpt_path = os.path.join('/opt/ml/code/transformer_dst', args.save_dir, 'model.e{:}.bin'.format(best_epoch)) ckpt = torch.load(ckpt_path, map_location='cpu') model.load_state_dict(ckpt) model.to(device) eval_res = model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code, use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only, no_dial=args.no_dial, n_gpu=n_gpu, is_gt_op=False, is_gt_p_state=False, is_gt_gen=False, submission=True) print("### Epoch {:} Test Score : ".format(best_epoch), eval_res) print('\n' * 2) sys.stdout.flush()
def main(): args = parser.get_args() device = torch.device("cuda" if torch.cuda.is_available() else "cpu") assert torch.cuda.is_available(), "No GPU/CUDA is detected!" # Training on CPU is almost infeasible, # but evaluation/inference can be done on CPU ''' Do initial argument checks ''' if args.id == 'dummy': args.id = str(uuid.uuid4()) # If no ID is specified, # then we will generate a radom ID as the folder name of this run if args.training_mode != 'supervised' and \ args.training_mode != 'semisupervised_phase1' and \ args.training_mode != 'semisupervised_phase2': raise Exception('You can do either supervised or semisupervised training') # 'semisupervised_phase1' is essentially unsupervised learning of the joint model # on chest radiographs and radiology reports # 'semisupervised_phase2' is supervised learning with the initialization # from the training results of semisupervised_phase1 if args.semisupervised_training_data != 'allCXR' and \ args.semisupervised_training_data != 'allCHF': raise Exception('You can train the model on all MIMIC-CXR (allCXR) or \ the congestive heart failure cohort (allCHF)') if args.training_mode == 'semisupervised_phase2': if not os.path.isdir(args.joint_semisupervised_pretrained_checkpoint): raise Exception('The joint_semisupervised_pretrained_checkpoint directory \ has to exist for the model initialization of semisupervised_phase2') if args.output_channel_encoding != 'multilabel' and \ args.output_channel_encoding != 'multiclass': raise Exception('You can select either multilabel or multiclass classification') if args.data_split_mode != 'cross_val' and args.data_split_mode != 'testing': raise Exception('You can do either cross-validation (cross_val) or testing (testing), \ which determine how the dataset is going to be split') if args.joint_loss_method != 'l2' and args.joint_loss_method != 'cosine' and \ args.joint_loss_method != 'dot' and args.joint_loss_method != 'ranking': raise Exception('You can have either l2, cosine, dot or ranking \ as the joint loss calculation between the img-txt embedding') if args.joint_loss_similarity_function != 'l2' and \ args.joint_loss_similarity_function != 'cosine' and \ args.joint_loss_similarity_function != 'dot': raise Exception('You can have either l2, cosine, or dot \ as the similarity function for the ranking loss in the img-txt embedding. 
\ You had %s' % args.joint_loss_similarity_function) if not args.do_train and not args.do_eval: raise Exception('Either do_train or do_eval flag must be set as true') ''' Select the right data split file based on the argument setting ''' # TODO: release the data split file (including our labels) if args.training_mode == 'supervised' or args.training_mode == 'semisupervised_phase2': data_split_file_postfix = '' # Supervised training does not need unlabeled data elif args.semisupervised_training_data == 'allCHF': data_split_file_postfix = '-allCHF' elif args.semisupervised_training_data == 'allCXR': data_split_file_postfix = '-allCXR' if not args.use_data_split_path: if args.data_split_mode == 'testing' and args.do_eval: args.data_split_path = os.path.join(args.data_split_path, 'mimic-cxr-sub-img-edema-split-manualtest.csv') # When evaluating in the testing mode, you should use the expert labels # that are included in the test set else: args.data_split_path = os.path.join( args.data_split_path, 'mimic-cxr-sub-img-edema-split{}.csv'.format(data_split_file_postfix)) ''' Set the output directory structure ''' # TODO: revisit the code related to masked text (may want to delete it) if args.use_masked_txt: args.text_data_dir = os.path.join(args.text_data_dir, 'masked') if not args.use_text_data_dir: args.text_data_dir = os.path.join(args.text_data_dir, args.output_channel_encoding) args.model = 'model' # TODO: consider deleting this if not args.use_text_data_dir: if args.training_mode == 'supervised' or args.training_mode == 'supervised_masking': args.text_data_dir = os.path.join(args.text_data_dir, 'supervised', 'full') elif 'semisupervised' in args.training_mode: args.text_data_dir = os.path.join(args.text_data_dir, 'semisupervised', args.semisupervised_training_data, 'full') if args.training_mode == 'supervised' or args.training_mode == 'supervised_masking': args.output_dir = os.path.join(args.output_dir, args.data_split_mode, args.model, args.training_mode, args.id) elif 'semisupervised' in args.training_mode: args.output_dir = os.path.join(args.output_dir, args.data_split_mode, args.model, args.training_mode, args.semisupervised_training_data, args.id) args.reports_dir = os.path.join(args.output_dir, 'eval_reports') args.tsbd_dir = os.path.join(args.output_dir, 'tsbd_dir') args.checkpoints_dir = os.path.join(args.output_dir, 'checkpoints') if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and \ args.do_train and not args.overwrite_output_dir: raise ValueError("Output directory ({}) already exists and is not empty." \ " Use".format(args.output_dir)+" --overwrite_output_dir to overcome.") ''' Create the necessary directories. Make sure no argument updating after this point. ''' directories = [args.output_dir, args.reports_dir, args.tsbd_dir, args.checkpoints_dir] for directory in directories: if not(os.path.exists(directory)): os.makedirs(directory) if not os.path.exists(args.data_split_path): raise Exception('The data split path %s does not exist! 
Please check' \ % args.data_split_path) # TODO: the name of "reports_dir" can be confusing; need to rename it if args.do_eval: args.reports_dir = os.path.join(args.reports_dir, 'eval_report_{}'.format(len(os.listdir(args.reports_dir)))) if not os.path.exists(args.reports_dir): os.makedirs(args.reports_dir) main_utils.to_json_file(vars(args), os.path.join(args.reports_dir, 'eval_args.json')) print('Location of the evaluation result directory: %s' % args.reports_dir) ''' Print some important arguments ''' print('Classification type: {}'.format(args.output_channel_encoding)) print('Loss method in the image-text embedding space: {}'.format(args.joint_loss_method)) if args.joint_loss_method == 'ranking': print('Similarity function for the ranking loss in the img-txt embedding:', args.joint_loss_similarity_function) print('Currently doing **{}**'.format(args.data_split_mode)) print('Training mode: {}'.format(args.training_mode)) print('Doing training: {}'.format(args.do_train)) print('Doing eval: {}'.format(args.do_eval)) print('Cuda is available: {}'.format(torch.cuda.is_available())) print('Device used: ', device) print('Scheduler used: ', args.scheduler) print('Initial learning Rate: ', args.learning_rate) print('Number of training epochs: ', args.num_train_epochs) print('Text data directory: ', args.text_data_dir) if 'semisupervised' in args.training_mode: print('Training data for semisupervised learning: ', args.semisupervised_training_data) print('Using all Sequences in BERT last layer rather than just [CLS]: ', args.bert_pool_last_hidden) if args.bert_pool_last_hidden: print('Using img embedding for computing attention scores: ', args.bert_pool_use_img) print('Pretrained BERT model directory: {}'.format(args.bert_pretrained_dir)) ''' Set logging and tensorboard directories ''' if args.do_train: args.tsbd_dir = os.path.join( args.tsbd_dir, 'tsbd_{}'.format(len(os.listdir(args.tsbd_dir)))) if not os.path.exists(args.tsbd_dir): os.makedirs(args.tsbd_dir) print('Location of the tensorboard directory: %s' % args.tsbd_dir) log_file = os.path.join(args.output_dir, 'training.log') if args.do_eval: log_file = os.path.join(args.reports_dir, 'evaluating.log') print('Logging in: {}'.format(log_file)) logging.basicConfig(filename=log_file, level=logging.INFO, filemode='w', format='%(asctime)s - %(name)s %(message)s', datefmt='%m-%d %H:%M') logger = logging.getLogger(__name__) logger.info("Current git commit sha: %s", sha) ''' Set text tokenizer ''' tokenizer = BertTokenizer.from_pretrained(args.bert_pretrained_dir) # tokenizer is not something that constantly needs to be saved # because only the pre-trained bert model determines this. 
''' Train the model ''' if args.do_train: start_time = time.time() logger = logging.getLogger('pytorch_transformers.modeling_utils').setLevel(logging.INFO) ''' Load a pretrained joint model or pretrained BERT model ''' config = BertConfig.from_json_file(os.path.join(args.bert_pretrained_dir, args.config_name)) config.num_labels = 3 if args.output_channel_encoding == 'multilabel' else 4 if args.training_mode == 'semisupervised_phase2': model = ImageTextModel.from_pretrained( args.joint_semisupervised_pretrained_checkpoint) print('Pretrained model:\t {}'.\ format(args.joint_semisupervised_pretrained_checkpoint)) elif args.use_pretrained_checkpoint: model = ImageTextModel.from_pretrained( args.joint_semisupervised_pretrained_checkpoint) print('Pretrained model:\t {}'.\ format(args.joint_semisupervised_pretrained_checkpoint)) else: model = ImageTextModel(config=config, pretrained_bert_dir=args.bert_pretrained_dir) print('No pretrained joint model, loading pretrained BERT model:\t {}'.\ format(args.bert_pretrained_dir)) ''' Perform model training ''' model.to(device) loss_info = main_utils.train(args, device, model, tokenizer) ''' Reset the logger now ''' logger = logging.getLogger(__name__) logger.info("Saving model checkpoint to %s", args.output_dir) ''' Take care of distributed/parallel training ''' model_to_save = model.module if hasattr(model, 'module') else model ''' Save model training results ''' model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) main_utils.to_json_file(loss_info, os.path.join(args.output_dir, 'loss_info.json')) end_time = time.time() ''' Evaluate the model ''' results_txt = {} results_img = {} losses_info = {} # eval should assume that the train ids already contain the necessary folders # will deal with this later. Just copy eval images here if args.do_eval: start_time = time.time() checkpoints = [args.output_dir] # The final checkpoint is in the args.output_dir if args.eval_all_checkpoints: checkpoints = list(os.path.dirname(c) for c in sorted( glob.glob(args.output_dir + '/**/' + args.weights_name, recursive=True))) logger = logging.getLogger(__name__) logger.info("Evaluate %d checkpoints ", len(checkpoints)) for checkpoint in checkpoints: epoch_number = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" logger = logging.getLogger('joint_img_txt.model.model').setLevel(logging.INFO) model = ImageTextModel.from_pretrained(checkpoint) model.to(device) dump_prediction_files = False if checkpoint == args.output_dir: dump_prediction_files = True epoch_number = 'final' print('*** Epoch {}'.format(epoch_number)) print('\t\t\t Checkpoint: {}'.format(checkpoint)) result_txt, result_img = main_utils.evaluate( args, device, model, tokenizer, dump_prediction_files, prefix=epoch_number) result_txt = dict((k + '_{}'.format(epoch_number), v) for k, v in result_txt.items()) result_img = dict((k + '_{}'.format(epoch_number), v) for k, v in result_img.items()) results_txt.update(result_txt) results_img.update(result_img) main_utils.to_json_file(results_txt, os.path.join(args.reports_dir, 'results_txt.json')) main_utils.to_json_file(results_img, os.path.join(args.reports_dir, 'results_img.json')) end_time = time.time() print("\n\nTotal time to run:", round((end_time-start_time)/3600.0, 2))
    segment_idx = token_idx * 0
    segment_idx[(sep_idx + 1):] = 1
    mask = (token_idx != 0)
    return token_idx.unsqueeze(0), segment_idx.unsqueeze(0), mask.unsqueeze(0)


if __name__ == '__main__':
    args = parser.parse_args()
    assert os.path.exists(args.bert_model), '{} does not exist'.format(args.bert_model)
    assert os.path.exists(args.bert_vocab), '{} does not exist'.format(args.bert_vocab)
    assert args.topk > 0, '{} should be positive'.format(args.topk)

    print('Initialize BERT vocabulary from {}...'.format(args.bert_vocab))
    bert_tokenizer = BertTokenizer(vocab_file=args.bert_vocab)
    print('Initialize BERT model from {}...'.format(args.bert_model))
    config = BertConfig.from_json_file('./bert-base-uncased/config.json')
    bert_model = BertForMaskedLM.from_pretrained('./bert-base-uncased/pytorch_model.bin',
                                                 config=config)

    while True:
        message = input('Enter your message: ').strip()
        tokens = bert_tokenizer.tokenize(message)
        if len(tokens) == 0:
            continue
        if tokens[0] != CLS:
            tokens = [CLS] + tokens
        if tokens[-1] != SEP:
            tokens.append(SEP)
        token_idx, segment_idx, mask = to_bert_input(tokens, bert_tokenizer)
        with torch.no_grad():
            # Pass the tensors by keyword so the segment and attention-mask
            # arguments cannot be swapped if the forward() signature orders
            # them differently across library versions.
            logits = bert_model(input_ids=token_idx,
                                token_type_ids=segment_idx,
                                attention_mask=mask,
                                masked_lm_labels=None)
        logits = np.squeeze(logits[0], axis=0)
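The loop above stops right after computing the masked-LM logits. A common next step, and presumably what args.topk is for, is to read out the top-k vocabulary predictions per position. A hedged sketch that continues inside the while-loop, reusing tokens, logits, CLS, SEP, and bert_tokenizer from above; topk here is a stand-in for args.topk.

        # logits now has shape (sequence_length, vocab_size) after the squeeze above.
        topk = 5  # stand-in for args.topk
        for position, token in enumerate(tokens):
            if token in (CLS, SEP):
                continue
            # Indices of the top-k scoring vocabulary ids, highest first.
            top_ids = np.argsort(logits[position])[-topk:][::-1].tolist()
            predictions = bert_tokenizer.convert_ids_to_tokens(top_ids)
            print('{:>12} -> {}'.format(token, predictions))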
    return vocab


vocab = load_vocab('./spm_model/wiki-ja.vocab')

mask_indx = 12
# pdb.set_trace()  # debugging breakpoint left in the original; disabled here
spmed[0] = '[CLS]'
spmed.append('[SEP]')
spmed[mask_indx] = '[MASK]'
indx_tokens = [vocab[s] if s in vocab else vocab['<unk>'] for s in spmed]
tokens_tensor = torch.tensor([indx_tokens])

config = BertConfig.from_json_file('../published_model/bert_spm/bert_config.json')
model = BertModel.from_pretrained('../published_model/bert_spm/pytorch_model.bin', config=config)
model2 = BertForMaskedLM.from_pretrained('../published_model/bert_spm/pytorch_model.bin', config=config)
model3 = BertModel.from_pretrained('../published_model/bert_spm/pytorch_model.bin', config=config)
model.eval()
model2.eval()
model3.eval()

tokens_tensor = tokens_tensor.to('cuda')
model.to('cuda')
model2.to('cuda')
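load_vocab is called above but its body is cut off. A SentencePiece .vocab file is a plain-text, tab-separated list of piece<TAB>score lines whose line number is the piece id, so a minimal sketch of such a loader could look like the following; this is an assumption about the helper's behavior, not the original implementation.

def load_vocab(vocab_path):
    """Map each SentencePiece piece to its id (its line number in the .vocab file)."""
    vocab = {}
    with open(vocab_path, encoding='utf-8') as f:
        for idx, line in enumerate(f):
            piece = line.rstrip('\n').split('\t')[0]
            vocab[piece] = idx
    return vocab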
def main(args):
    assert args.use_one_optim is True
    if args.recover_e > 0:
        raise NotImplementedError("This option is from my oldest code version. "
                                  "I have not checked it for this code version.")

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)
        print("### mkdir {:}".format(args.save_dir))

    def worker_init_fn(worker_id):
        np.random.seed(args.random_seed + worker_id)

    n_gpu = 0
    if torch.cuda.is_available() and (not args.use_cpu):
        n_gpu = torch.cuda.device_count()
        device = torch.device('cuda')
        print("### Device: {:}".format(device))
    else:
        print("### Use CPU (Debugging)")
        device = torch.device("cpu")

    if args.random_seed < 0:
        print("### Pick a random seed")
        args.random_seed = random.sample(list(range(1, 100000)), 1)[0]
    print("### Random Seed: {:}".format(args.random_seed))
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    rng = random.Random(args.random_seed)
    torch.manual_seed(args.random_seed)
    if n_gpu > 0:
        if args.random_seed >= 0:
            torch.cuda.manual_seed(args.random_seed)
            torch.cuda.manual_seed_all(args.random_seed)
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)

    ontology = json.load(open(args.ontology_data))
    slot_meta, ontology = make_slot_meta(ontology)
    op2id = OP_SET[args.op_code]
    print(op2id)
    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)

    train_path = os.path.join(args.data_root, "train.pt")
    train_data_raw = torch.load(train_path)[:5000]
    print("# train examples %d" % len(train_data_raw))

    test_path = os.path.join(args.data_root, "test.pt")
    test_data_raw = torch.load(test_path)
    print("# test examples %d" % len(test_data_raw))

    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = args.dropout
    model_config.attention_probs_dropout_prob = args.attention_probs_dropout_prob
    model_config.hidden_dropout_prob = args.hidden_dropout_prob
    type_vocab_size = 4
    dec_config = args
    model = TransformerDST(model_config, dec_config, len(op2id), len(domain2id),
                           op2id['update'],
                           tokenizer.convert_tokens_to_ids(['[MASK]'])[0],
                           tokenizer.convert_tokens_to_ids(['[SEP]'])[0],
                           tokenizer.convert_tokens_to_ids(['[PAD]'])[0],
                           tokenizer.convert_tokens_to_ids(['-'])[0],
                           type_vocab_size, args.exclude_domain)

    test_epochs = [int(e) for e in args.load_epoch.strip().lower().split('-')]
    for best_epoch in test_epochs:
        print("### Epoch {:}...".format(best_epoch))
        sys.stdout.flush()
        ckpt_path = os.path.join(args.save_dir, 'model.e{:}.bin'.format(best_epoch))
        ckpt = torch.load(ckpt_path, map_location='cpu')
        model.load_state_dict(ckpt)
        model.to(device)

        # eval_res = model_evaluation(model, train_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
        #                             use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only,
        #                             no_dial=args.no_dial, n_gpu=n_gpu,
        #                             is_gt_op=False, is_gt_p_state=False, is_gt_gen=False)
        # print("### Epoch {:} Train Score : ".format(best_epoch), eval_res)
        # print('\n' * 2)
        # sys.stdout.flush()

        eval_res = model_evaluation(model, test_data_raw, tokenizer, slot_meta, best_epoch, args.op_code,
                                    use_full_slot=args.use_full_slot, use_dt_only=args.use_dt_only,
                                    no_dial=args.no_dial, n_gpu=n_gpu,
                                    is_gt_op=False, is_gt_p_state=False, is_gt_gen=False)
        print("### Epoch {:} Test Score : ".format(best_epoch), eval_res)
        print('\n' * 2)
        sys.stdout.flush()
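The seeding block above touches numpy, random, torch, and CUDA separately, and the same calls recur in the next script. A minimal sketch that folds them into one reusable helper; it uses only the calls that already appear above, and the helper name is illustrative.

import random

import numpy as np
import torch


def set_seed(seed, n_gpu=0):
    """Seed every RNG the training scripts above rely on."""
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        # Trade speed for reproducibility, as the original scripts do.
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
    return random.Random(seed)  # per-script RNG, mirroring `rng` above


# Example (illustrative): rng = set_seed(args.random_seed, n_gpu)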
def main(args):
    def worker_init_fn(worker_id):
        np.random.seed(args.random_seed + worker_id)

    if args.dataset == 'sim-R':
        from BERTDST_utils.simR_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    if args.dataset == 'sim-M':
        from BERTDST_utils.simM_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    if args.dataset == 'DSTC2':
        from BERTDST_utils.DSTC2_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    if args.dataset == 'WOZ2.0':
        from BERTDST_utils.WOZ_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, SLOT, OP
    if args.dataset == 'MultiWOZ2.1':
        from BERTDST_utils.MultiWOZ_data_utils import prepare_dataset, MultiWozDataset, make_turn_label, postprocessing, state_equal, OP, make_slot_meta
        ontology = json.load(open(args.ontology_data_path))
        SLOT, ontology = make_slot_meta(ontology)

    n_gpu = 0
    if torch.cuda.is_available():
        n_gpu = torch.cuda.device_count()
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)
    rng = random.Random(args.random_seed)
    torch.manual_seed(args.random_seed)
    if n_gpu > 0:
        torch.cuda.manual_seed(args.random_seed)
        torch.cuda.manual_seed_all(args.random_seed)
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)

    slot_meta = SLOT
    op2id = OP
    print(op2id)
    tokenizer = BertTokenizer(args.vocab_path, do_lower_case=True)

    train_data_raw = prepare_dataset(data_scale=args.train_scale,
                                     data_path=args.train_data_path,
                                     tokenizer=tokenizer,
                                     slot_meta=slot_meta,
                                     size_window=args.train_size_window,
                                     max_seq_length=args.max_seq_length,
                                     multi_granularity=args.train_MG,
                                     data_type='train')
    train_data = MultiWozDataset(train_data_raw, tokenizer, slot_meta,
                                 args.max_seq_length, rng, args.word_dropout)
    print("# train examples %d" % len(train_data_raw))

    dev_data_raw = prepare_dataset(data_scale=1.0,
                                   data_path=args.dev_data_path,
                                   tokenizer=tokenizer,
                                   slot_meta=slot_meta,
                                   size_window=args.test_size_window,
                                   max_seq_length=args.max_seq_length,
                                   multi_granularity=args.test_MG,
                                   data_type='dev')
    print("# dev examples %d" % len(dev_data_raw))

    test_data_raw = prepare_dataset(data_scale=1.0,
                                    data_path=args.test_data_path,
                                    tokenizer=tokenizer,
                                    slot_meta=slot_meta,
                                    size_window=args.test_size_window,
                                    max_seq_length=args.max_seq_length,
                                    multi_granularity=args.test_MG,
                                    data_type='test')
    print("# test examples %d" % len(test_data_raw))

    model_config = BertConfig.from_json_file(args.bert_config_path)
    model_config.dropout = args.dropout
    model_config.attention_probs_dropout_prob = args.attention_probs_dropout_prob
    model_config.hidden_dropout_prob = args.hidden_dropout_prob
    model = MGDST(model_config, len(op2id), len(slot_meta))

    # Load the pre-trained BERT weights into the encoder, renaming the
    # pytorch-pretrained-bert style keys (gamma/beta -> weight/bias) and
    # dropping the masked-LM head ('cls.*').
    ckpt = torch.load(args.bert_ckpt_path, map_location='cpu')
    ckpt1 = {k.replace('bert.', '').replace('gamma', 'weight').replace('beta', 'bias'): v
             for k, v in ckpt.items() if 'cls.' not in k}
    model.encoder.bert.load_state_dict(ckpt1)
    # model.encoder.bert.from_pretrained(args.bert_ckpt_path)
    model.to(device)

    num_train_steps = int(len(train_data_raw) / args.batch_size * args.n_epochs)
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    enc_param_optimizer = list(model.encoder.named_parameters())
    enc_optimizer_grouped_parameters = [
        {'params': [p for n, p in enc_param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in enc_param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    enc_optimizer = AdamW(enc_optimizer_grouped_parameters, lr=args.enc_lr)
    enc_scheduler = WarmupLinearSchedule(enc_optimizer,
                                         int(num_train_steps * args.enc_warmup),
                                         t_total=num_train_steps)
    dec_param_optimizer = list(model.decoder.parameters())
    dec_optimizer = AdamW(dec_param_optimizer, lr=args.dec_lr)
    dec_scheduler = WarmupLinearSchedule(dec_optimizer,
                                         int(num_train_steps * args.dec_warmup),
                                         t_total=num_train_steps)

    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.batch_size,
                                  collate_fn=train_data.collate_fn,
                                  num_workers=args.num_workers,
                                  worker_init_fn=worker_init_fn)

    loss_fnc = nn.CrossEntropyLoss()
    best_score = {'epoch': 0, 'joint_acc': 0, 'op_acc': 0, 'final_slot_f1': 0}
    total_step = 0
    for epoch in range(args.n_epochs):
        batch_loss = []
        model.train()
        for step, batch in enumerate(train_dataloader):
            batch = [b.to(device) if not isinstance(b, int) else b for b in batch]
            input_ids, input_mask, segment_ids, op_ids, gen_ids = batch
            state_scores, span_scores = model(input_ids=input_ids,
                                              token_type_ids=segment_ids,
                                              attention_mask=input_mask)
            loss_state = loss_fnc(state_scores.contiguous().view(-1, len(op2id)),
                                  op_ids.contiguous().view(-1))
            try:
                loss_span = masked_cross_entropy_for_value(span_scores.contiguous(),
                                                           gen_ids.contiguous(),
                                                           tokenizer.vocab['[PAD]'])
            except Exception as e:
                # Fall back to a zero span loss so the combined loss below is
                # still defined when the span loss cannot be computed.
                print(e)
                loss_span = torch.tensor(0.0, device=input_ids.device)
            loss = loss_state * 0.8 + loss_span * 0.2
            batch_loss.append(loss.item())

            loss.backward()
            enc_optimizer.step()
            enc_scheduler.step()
            dec_optimizer.step()
            dec_scheduler.step()
            model.zero_grad()
            total_step += 1

            if step % 100 == 0:
                print("[%d/%d] [%d/%d] mean_loss : %.3f, state_loss : %.3f, span_loss : %.3f"
                      % (epoch + 1, args.n_epochs, step, len(train_dataloader),
                         np.mean(batch_loss), loss_state.item(), loss_span.item()))
                batch_loss = []

        if (epoch + 1) % args.eval_epoch == 0:
            print('total_step: ', total_step)
            eval_res = model_evaluation(make_turn_label, postprocessing, state_equal, OP,
                                        model, dev_data_raw, tokenizer, slot_meta,
                                        epoch + 1, args.test_size_window, args.test_MG)
            if eval_res['joint_acc'] > best_score['joint_acc']:
                best_score = eval_res
                model_to_save = model.module if hasattr(model, 'module') else model
                save_path = os.path.join(
                    args.save_dir,
                    'model_best_gran[%s]_scale[%s]_seed[%s].bin'
                    % (str(args.train_size_window), str(args.train_scale), args.random_seed))
                torch.save(model_to_save.state_dict(), save_path)
            print("Best Score : ", best_score)
            print("\n")
            if epoch > args.patience_start_epoch and best_score['epoch'] + args.patience < epoch:
                print("out of patience...")
                break

    print("Test using best model...")
    best_epoch = best_score['epoch']
    ckpt_path = os.path.join(
        args.save_dir,
        'model_best_gran[%s]_scale[%s]_seed[%s].bin'
        % (str(args.train_size_window), str(args.train_scale), args.random_seed))
    model = MGDST(model_config, len(op2id), len(slot_meta))
    ckpt = torch.load(ckpt_path, map_location='cpu')
    model.load_state_dict(ckpt)
    model.to(device)
    model_evaluation(make_turn_label, postprocessing, state_equal, OP, model,
                     test_data_raw, tokenizer, slot_meta, best_epoch,
                     args.test_size_window, args.test_MG)
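The encoder optimizer above excludes bias and LayerNorm parameters from weight decay by matching on parameter names. A standalone sketch of that grouping pattern; the helper name and the torch.optim.AdamW import are illustrative (the script itself uses the library's AdamW together with WarmupLinearSchedule), but the grouping logic is taken directly from the code above.

from torch.optim import AdamW  # stand-in; the original pairs the library AdamW with a warmup schedule


def grouped_decay_parameters(module, weight_decay=0.01,
                             no_decay=('bias', 'LayerNorm.bias', 'LayerNorm.weight')):
    """Split parameters so bias/LayerNorm weights are exempt from weight decay."""
    named = list(module.named_parameters())
    return [
        {'params': [p for n, p in named if not any(nd in n for nd in no_decay)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in named if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]


# Example (illustrative): enc_optimizer = AdamW(grouped_decay_parameters(model.encoder), lr=4e-5)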