args.vocab_size = len(tokenizer)
args.n_gate = len(processor.gating2id)  # number of gating classes: none, dontcare, ptr

# Extracting features
train_features = processor.convert_examples_to_features(train_examples)
dev_features = processor.convert_examples_to_features(dev_examples)

# Slot meta tokenizing for the decoder's initial inputs
tokenized_slot_meta = []
for slot in slot_meta:
    tokenized_slot_meta.append(
        tokenizer.encode(slot.replace("-", " "), add_special_tokens=False)
    )

# Declare the model
model = TRADE(args, tokenized_slot_meta)
model.set_subword_embedding(args.model_name_or_path)  # initialize subword embeddings
print(f"Subword embeddings are loaded from {args.model_name_or_path}")
model.to(device)
print("Model is initialized")

train_data = WOSDataset(train_features)
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(
    train_data,
    batch_size=args.train_batch_size,
    sampler=train_sampler,
    collate_fn=processor.collate_fn,
)
print("# train:", len(train_data))
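
# NOTE: WOSDataset is used above but its definition is not shown in these
# snippets. The sketch below is an assumption about what it presumably looks
# like - a thin torch Dataset wrapping the preprocessed feature list - and the
# project's actual implementation may differ.
from torch.utils.data import Dataset

class WOSDataset(Dataset):
    def __init__(self, features):
        # keep the already-converted features as-is; collate_fn does the batching work
        self.features = features

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx]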
eval_sampler = SequentialSampler(eval_data)
eval_loader = DataLoader(
    eval_data,
    batch_size=args.eval_batch_size,
    sampler=eval_sampler,
    collate_fn=processor.collate_fn,
)
print("# eval:", len(eval_data))

tokenized_slot_meta = []
for slot in slot_meta:
    tokenized_slot_meta.append(
        tokenizer.encode(slot.replace("-", " "), add_special_tokens=False)
    )

model = TRADE(config, tokenized_slot_meta)
ckpt = torch.load(args.model_dir, map_location="cpu")
model.load_state_dict(ckpt)
model.to(device)
print("Model is loaded")

predictions = inference(model, eval_loader, processor, device)

if not os.path.exists(args.output_dir):
    os.mkdir(args.output_dir)

json.dump(
    predictions,
    open(f"{args.output_dir}/predictions.csv", "w"),
    indent=2,
    ensure_ascii=False,
)
with open(feature_path + '/train_features.pickle', 'wb') as f:
    pickle.dump(train_features, f)
with open(feature_path + '/dev_features.pickle', 'wb') as f:
    pickle.dump(dev_features, f)
with open(feature_path + '/dev_labels.pickle', 'wb') as f:
    pickle.dump(dev_labels, f)
print("Pickles saved!")

# Slot meta tokenizing for the decoder's initial inputs
tokenized_slot_meta = []
for slot in slot_meta:
    tokenized_slot_meta.append(
        tokenizer.encode(slot.replace("-", " "), add_special_tokens=False))

# Declare the model
model = TRADE(args, tokenized_slot_meta)
# model.set_subword_embedding(args.model_name_or_path)  # initialize subword embeddings
# print(f"Subword embeddings are loaded from {args.model_name_or_path}")
model.to(device)
print("Model is initialized")

if args.use_wandb:
    # wandb: watch the model
    wandb.watch(model)

train_data = WOSDataset(train_features)
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(
    train_data,
    batch_size=args.train_batch_size,
    sampler=train_sampler,
    collate_fn=processor.collate_fn,
)
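
# NOTE: the train() function below builds its loaders via get_data_loader(),
# which is not shown in these snippets. A minimal sketch, assuming it simply
# wraps the WOSDataset / sampler / DataLoader pattern used above; the real
# helper may take extra options (e.g. an explicit shuffle flag is assumed here).
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

def get_data_loader(processor, features, batch_size, shuffle=True):
    data = WOSDataset(features)
    sampler = RandomSampler(data) if shuffle else SequentialSampler(data)
    return DataLoader(
        data,
        batch_size=batch_size,
        sampler=sampler,
        collate_fn=processor.collate_fn,
    )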
def train(args):
    # Define tokenizer
    tokenizer_module = getattr(import_module("transformers"),
                               f"{args.model_name}Tokenizer")
    tokenizer = tokenizer_module.from_pretrained(args.pretrained_name_or_path)

    slot_meta, train_examples, dev_examples, dev_labels = train_data_loading(
        args, isUserFirst=False, isDialogueLevel=False)

    # Define preprocessor
    processor = TRADEPreprocessor(slot_meta,
                                  tokenizer,
                                  max_seq_length=args.max_seq_length,
                                  use_n_gate=args.use_n_gate)

    train_features = processor.convert_examples_to_features(train_examples)
    dev_features = processor.convert_examples_to_features(dev_examples)

    train_loader = get_data_loader(processor, train_features,
                                   args.train_batch_size)
    dev_loader = get_data_loader(processor, dev_features, args.eval_batch_size)

    args.vocab_size = len(tokenizer)
    args.n_gate = len(
        processor.gating2id
    )  # number of gating classes: (none, dontcare, ptr) or (none, yes, no, dontcare, ptr)

    # Slot meta tokenizing for the decoder's initial inputs
    tokenized_slot_meta = []
    for slot in slot_meta:
        tokenized_slot_meta.append(
            tokenizer.encode(slot.replace("-", " "), add_special_tokens=False))

    # Declare the model
    model = TRADE(args, tokenized_slot_meta)
    # model.set_subword_embedding(args)  # initialize subword embeddings
    print(f"Subword embeddings are loaded from {args.pretrained_name_or_path}")
    model.to(device)
    print("Model is initialized")

    # Declare optimizer and scheduler
    n_epochs = args.epochs
    t_total = len(train_loader) * n_epochs
    # Changed so that get_optimizer computes warmup_steps automatically (original code below)
    # warmup_steps = int(t_total * args.warmup_ratio)

    optimizer = get_optimizer(model, args)  # get optimizer (Adam, SGD, AdamP, ...)
    scheduler = get_scheduler(optimizer, t_total,
                              args)  # get scheduler (custom, linear, cosine, ...)

    loss_fnc_1 = masked_cross_entropy_for_value  # generation loss - num classes: vocab_size
    loss_fnc_2 = nn.CrossEntropyLoss()
    # loss_fnc_2 = LabelSmoothingLoss(classes=model.decoder.n_gate, smoothing=args.smoothing_factor)

    json.dump(
        vars(args),
        open(f"{args.model_dir}/{args.model_fold}/exp_config.json", "w"),
        indent=2,
        ensure_ascii=False,
    )
    json.dump(
        slot_meta,
        open(f"{args.model_dir}/{args.model_fold}/slot_meta.json", "w"),
        indent=2,
        ensure_ascii=False,
    )

    best_score, best_checkpoint = 0, 0
    for epoch in range(n_epochs):
        model.train()
        for step, batch in enumerate(train_loader):
            input_ids, segment_ids, input_masks, gating_ids, target_ids, guids = [
                b.to(device) if not isinstance(b, list) else b for b in batch
            ]

            # teacher forcing
            if (args.teacher_forcing_ratio > 0.0
                    and random.random() < args.teacher_forcing_ratio):
                tf = target_ids
            else:
                tf = None

            all_point_outputs, all_gate_outputs = model(
                input_ids, segment_ids, input_masks, target_ids.size(-1), tf)

            # generation loss
            loss_1 = loss_fnc_1(
                all_point_outputs.contiguous(),
                target_ids.contiguous().view(-1),
                tokenizer.pad_token_id,
            )

            # gating loss
            loss_2 = loss_fnc_2(
                all_gate_outputs.contiguous().view(-1, args.n_gate),
                gating_ids.contiguous().view(-1),
            )

            loss = loss_1 + loss_2

            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            scheduler.step()
            for learning_rate in scheduler.get_lr():
                wandb.log({"learning_rate": learning_rate})
            optimizer.zero_grad()

            if step % 100 == 0:
                print(
                    f"[{epoch}/{n_epochs}] [{step}/{len(train_loader)}] loss: {loss.item()} gen: {loss_1.item()} gate: {loss_2.item()}"
                )
                wandb.log({
                    "epoch": epoch,
                    "Train epoch loss": loss.item(),
                    "Train epoch gen loss": loss_1.item(),
                    "Train epoch gate loss": loss_2.item(),
                })

        predictions = inference_TRADE(model, dev_loader, processor, device)
        eval_result = _evaluation(predictions, dev_labels, slot_meta)
        for k, v in eval_result.items():
            if k in ("joint_goal_accuracy", "turn_slot_accuracy", "turn_slot_f1"):
                print(f"{k}: {v}")

        if best_score < eval_result["joint_goal_accuracy"]:
            print("Update Best checkpoint!")
            best_score = eval_result["joint_goal_accuracy"]
            best_checkpoint = epoch

            wandb.log({
                "epoch": epoch,
                "Best joint goal accuracy": best_score,
                "Best turn slot accuracy": eval_result["turn_slot_accuracy"],
                "Best turn slot f1": eval_result["turn_slot_f1"],
            })

        if args.logging_accuracy_per_domain_slot:
            wandb.log({
                k: v
                for k, v in eval_result.items()
                if k not in ("joint_goal_accuracy", "turn_slot_accuracy",
                             "turn_slot_f1")
            })

        torch.save(model.state_dict(),
                   f"{args.model_dir}/{args.model_fold}/model-{epoch}.bin")

    print(f"Best checkpoint: {args.model_dir}/model-{best_checkpoint}.bin")
    wandb.log(
        {"Best checkpoint": f"{args.model_dir}/model-{best_checkpoint}.bin"})
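
# NOTE: _evaluation() used above is not shown in these snippets. For reference,
# this is a minimal sketch of how joint goal accuracy is commonly computed in
# DST: a turn counts as correct only if the predicted set of
# "domain-slot-value" strings exactly matches the gold set. The project's
# _evaluation() also reports turn_slot_accuracy / turn_slot_f1 and may differ
# in detail; this function name is illustrative only.
def joint_goal_accuracy(predictions, labels):
    correct = 0
    for guid, gold_state in labels.items():
        pred_state = predictions.get(guid, [])
        if set(pred_state) == set(gold_state):
            correct += 1
    return correct / len(labels) if labels else 0.0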
def main_inference(args, config):
    slot_meta = json.load(
        open(f"{args.model_dir}/{args.model_fold}/slot_meta.json", "r"))
    ontology = json.load(open(f"{CFG.TrainOntology}", "r"))
    if config.replace_word_data:
        slot_meta = [meta.replace('택시', '버스') for meta in slot_meta]
        ontology = {
            domain_slot_key.replace('택시', '버스'): domain_slot_value
            for domain_slot_key, domain_slot_value in ontology.items()
        }

    # Define tokenizer
    tokenizer_module = getattr(import_module("transformers"),
                               f"{config.model_name}Tokenizer")
    tokenizer = tokenizer_module.from_pretrained(
        config.pretrained_name_or_path)

    # Extracting features
    if config.dst == 'TRADE':
        eval_examples = test_data_loading(args,
                                          isUserFirst=False,
                                          isDialogueLevel=False)
        processor = TRADEPreprocessor(slot_meta, tokenizer)

        tokenized_slot_meta = []
        for slot in slot_meta:
            tokenized_slot_meta.append(
                tokenizer.encode(slot.replace("-", " "),
                                 add_special_tokens=False))

        # Declare the model
        model = TRADE(config, tokenized_slot_meta)
        model.set_subword_embedding(config)  # initialize subword embeddings

    elif config.dst == 'SUMBT':
        eval_examples = test_data_loading(args,
                                          isUserFirst=True,
                                          isDialogueLevel=True)
        max_turn = max([len(e) * 2 for e in eval_examples])
        processor = SUMBTPreprocessor(
            slot_meta,
            tokenizer,
            ontology=ontology,  # predefined ontology
            max_seq_length=config.max_seq_length,  # maximum length of each turn
            max_turn_length=max_turn)  # maximum number of turns per dialogue
        slot_type_ids, slot_values_ids = tokenize_ontology(
            ontology, tokenizer, config.max_label_length)

        # Declare the model
        num_labels = [len(s) for s in slot_values_ids]  # number of candidate values per slot
        model = SUMBT(config, num_labels, device)
        model.initialize_slot_value_lookup(
            slot_values_ids,
            slot_type_ids)  # pre-encode the tokenized ontology using BERT_SV

    eval_features = processor.convert_examples_to_features(eval_examples)
    eval_loader = get_data_loader(processor, eval_features,
                                  config.eval_batch_size)
    print("# eval:", len(eval_loader))

    ckpt = torch.load(
        f'{args.model_dir}/{args.model_fold}/model-{args.chkpt_idx}.bin',
        map_location="cpu")
    model.load_state_dict(ckpt)
    model.to(device)
    print("Model is loaded")

    inference_module = getattr(import_module("inference"),
                               f"inference_{config.dst}")
    predictions = inference_module(model, eval_loader, processor, device)

    os.makedirs(args.output_dir, exist_ok=True)

    json.dump(
        predictions,
        open(f"{args.output_dir}/{args.model_fold}-predictions.csv", "w"),
        indent=2,
        ensure_ascii=False,
    )
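
# NOTE: tokenize_ontology(), used in the SUMBT branch above, is not shown in
# these snippets. The sketch below is an assumption about its behavior: encode
# each slot name and each candidate value, pad the token ids to a fixed length,
# and return (slot_type_ids, slot_values_ids). The actual helper may differ.
import torch

def tokenize_ontology(ontology, tokenizer, max_seq_length=12):
    def encode_and_pad(text):
        tokens = tokenizer.encode(text)
        if len(tokens) < max_seq_length:
            tokens = tokens + [tokenizer.pad_token_id] * (max_seq_length - len(tokens))
        return tokens[:max_seq_length]

    slot_types, slot_values = [], []
    for slot_name, candidate_values in ontology.items():
        slot_types.append(encode_and_pad(slot_name))
        slot_values.append(
            torch.LongTensor([encode_and_pad(v) for v in candidate_values]))
    return torch.LongTensor(slot_types), slot_values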
class TRADEHandler(BaseHandler, ABC):
    """
    Transformers text classifier handler class. This handler takes a text (string)
    as input and returns the classification text based on the serialized
    transformers checkpoint.
    """
    def __init__(self):
        super(TRADEHandler, self).__init__()
        self.initialized = False
        self.config, self.slot_meta = self.load_json_data(
            "./exp_config.json", "./slot_meta.json")

    def load_json_data(self, exp_config_path, slot_meta_path):
        config = json.load(open(exp_config_path, "r"))
        config = argparse.Namespace(**config)
        slot_meta = json.load(open(slot_meta_path, "r"))
        return config, slot_meta

    def initialize(self, ctx):
        self.manifest = ctx.manifest
        properties = ctx.system_properties
        model_dir = properties.get("model_dir")
        serialized_file = self.manifest["model"]["serializedFile"]
        model_pt_path = os.path.join(model_dir, serialized_file)
        self.device = torch.device(
            "cuda:" + str(properties.get("gpu_id"))
            if torch.cuda.is_available() else "cpu")

        # Read the serialized model (.pt) file
        self.tokenizer = BertTokenizer.from_pretrained(
            self.config.model_name_or_path)
        self.processor = TRADEPreprocessor(self.slot_meta, self.tokenizer)

        tokenized_slot_meta = []
        for slot in self.slot_meta:
            tokenized_slot_meta.append(
                self.tokenizer.encode(slot.replace("-", " "),
                                      add_special_tokens=False))

        self.model = TRADE(self.config, tokenized_slot_meta)
        ckpt = torch.load(model_pt_path, map_location="cpu")
        self.model.load_state_dict(ckpt)
        self.model.to(self.device)
        print("Model is loaded")

        self.initialized = True

    def preprocess(self, requests):
        """
        Very basic preprocessing code - only tokenizes.
        Extend with your own preprocessing steps as needed.
        """
        input_batch = []
        for idx, data in enumerate(requests):
            input_text = data.get("data")
            if input_text is None:
                input_text = data.get("body")
            if isinstance(input_text, (bytes, bytearray)):
                input_text = input_text.decode('utf-8')
                input_text = json.loads(input_text)
            input_batch.extend(input_text)

        eval_examples = get_examples_from_dialogues(input_batch,
                                                    user_first=False,
                                                    dialogue_level=False)
        eval_features = self.processor.convert_examples_to_features(
            eval_examples)
        eval_data = WOSDataset(eval_features)
        eval_sampler = SequentialSampler(eval_data)
        eval_loader = DataLoader(
            eval_data,
            batch_size=1,
            sampler=eval_sampler,
            collate_fn=self.processor.collate_fn,
        )
        return eval_loader

    def postprocess_state(self, state):
        for i, s in enumerate(state):
            s = s.replace(" : ", ":")
            state[i] = s.replace(" , ", ", ")
        return state

    def inference(self, inputs):
        self.model.eval()
        output_lst = []
        predictions = {}
        for batch in inputs:
            input_ids, segment_ids, input_masks, gating_ids, target_ids, guids = [
                b.to(self.device) if not isinstance(b, list) else b
                for b in batch
            ]
            with torch.no_grad():
                o, g = self.model(input_ids, segment_ids, input_masks, 9)
                _, generated_ids = o.max(-1)
                _, gated_ids = g.max(-1)

            for guid, gate, gen in zip(guids, gated_ids.tolist(),
                                       generated_ids.tolist()):
                prediction = self.processor.recover_state(gate, gen)
                prediction = self.postprocess_state(prediction)
                predictions[guid] = prediction
        output_lst.append(predictions)
        return output_lst

    # def inference(self, inputs):
    #     """
    #     Predict the class of a text using a trained transformer model.
    #     """
    #     # NOTE: This makes the assumption that your model expects text to be tokenized
    #     # with "input_ids" and "token_type_ids" - which is true for some popular transformer models, e.g. bert.
    #     # If your transformer model expects different tokenization, adapt this code to suit
    #     # its expected input format.
    #     prediction = self.model(
    #         inputs['input_ids'].to(self.device),
    #         token_type_ids=inputs['token_type_ids'].to(self.device)
    #     )[0].argmax().item()
    #     logger.info("Model predicted: '%s'", prediction)
    #
    #     if self.mapping:
    #         prediction = self.mapping[str(prediction)]
    #
    #     return [prediction]

    def postprocess(self, inference_output):
        # TODO: Add any needed post-processing of the model predictions here
        return inference_output
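
# NOTE: a minimal client-side example for calling this handler once it has been
# archived and deployed with TorchServe. The model name ("trade"), the input
# file name, and the payload shape (a JSON list of WOS-style dialogues, which
# preprocess() forwards to get_examples_from_dialogues()) are assumptions -
# adjust them to however the model archive was actually registered.
import json
import requests

dialogues = json.load(open("eval_dialogues.json", "r"))  # hypothetical input file

response = requests.post(
    "http://localhost:8080/predictions/trade",  # default TorchServe inference port
    data=json.dumps(dialogues).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
print(response.json())  # expected form: {guid: ["domain-slot-value", ...], ...}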