def main(): parser = argparse.ArgumentParser() parser.add_argument("--test_data_path", default='./data/test_stt.pkl', type=str, help="test data path") args_ = parser.parse_args() pretrained = torch.load(pretrained_model_path) args = torch.load(args_path) args.test_data_path = args_.test_data_path args.eval_batch_size = 64 bert_config = BertConfig(config_path) bert_config.num_labels = 7 model = BertForEmotionClassification(bert_config).to(device) model.load_state_dict(pretrained, strict=False) args.n_gpu = 2 loss, acc, f1, total_y_hat, cm = test(model, args) print("loss : {} \nacc : {} \nf1 : {}".format(loss, acc, f1)) draw_cm(cm) tmp = pd.read_pickle(args.test_data_path) # remove duplicates tmp = tmp[['Sentence', 'Emotion']].drop_duplicates().reset_index(drop=True) tmp['Pred'] = [label_list[i] for i in total_y_hat] tmp.to_csv('./result/test_result.csv') print("results are saved to result folder")
def main(): parser = argparse.ArgumentParser() # parser.add_argument("--test_data_path", default='./data/korean_single_test.csv', type=str, # help="test data path") parser.add_argument("--test_data_path", default='./data/toon_test.csv', type=str, help="test data path") args_ = parser.parse_args() pretrained = torch.load(pretrained_model_path) args = torch.load(args_path) args.test_data_path = args_.test_data_path args.eval_batch_size = 64 bert_config = BertConfig(config_path) bert_config.num_labels = 7 model = BertForEmotionClassification(bert_config).to(device) model.load_state_dict(pretrained, strict=False) loss, acc, f1, total_y_hat, cm = test(model, args) print("loss : {} \nacc : {} \nf1 : {}".format(loss, acc, f1)) draw_cm(cm) tmp = pd.read_csv(args.test_data_path) tmp['Pred'] = [label_list[i] for i in total_y_hat] tmp.to_csv('./result/test_result_toon.csv') print("results are saved to result folder")
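Both test entry points call a draw_cm helper that is not shown in this section. A minimal sketch of such a confusion-matrix plot helper, assuming matplotlib/seaborn and the global label_list; the function body and the save path are assumptions, not the repository's actual implementation:

import matplotlib.pyplot as plt
import seaborn as sns

def draw_cm(cm, save_path='./result/confusion_matrix.png'):
    # Hypothetical sketch: render the confusion matrix returned by test()
    # as a heatmap and save it next to the CSV results.
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=label_list, yticklabels=label_list, ax=ax)
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    fig.tight_layout()
    fig.savefig(save_path)
    plt.close(fig)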
def train(args):
    set_seed(args)

    # Set device
    if args.device == 'cuda':
        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        logger.info('use cuda')
    else:
        device = torch.device('cpu')
        logger.info('use cpu')

    # Set label list for classification
    if args.num_label == 'multi':
        label_list = ['공포', '놀람', '분노', '슬픔', '중립', '행복', '혐오']
    elif args.num_label == 'binary':
        label_list = ['긍정', '부정']
    logger.info('use {} labels for training'.format(len(label_list)))

    # Load pretrained model and model configuration
    pretrained_path = os.path.join('./pretrained_model/', args.pretrained_type)
    if args.pretrained_model_path is None:
        # Use pretrained bert model (etri/skt)
        pretrained_model_path = os.path.join(pretrained_path, 'pytorch_model.bin')
    else:
        # Use further-pretrained bert model
        pretrained_model_path = args.pretrained_model_path
    logger.info('Pretrain Model : {}'.format(pretrained_model_path))
    pretrained = torch.load(pretrained_model_path)

    if args.pretrained_type == 'skt' and 'bert.' not in list(pretrained.keys())[0]:
        logger.info('modify parameter names')
        # Prefix parameter names with 'bert.' for consistency
        new_keys_ = ['bert.' + k for k in pretrained.keys()]
        old_values_ = pretrained.values()
        pretrained = {k: v for k, v in zip(new_keys_, old_values_)}

    bert_config = BertConfig(os.path.join(pretrained_path, 'bert_config.json'))
    bert_config.num_labels = len(label_list)
    model = BertForEmotionClassification(bert_config).to(device)
    model.load_state_dict(pretrained, strict=False)

    # Load datasets
    tr_set = Datasets(file_path=args.train_data_path,
                      label_list=label_list,
                      pretrained_type=args.pretrained_type,
                      max_len=args.max_len)
    # Use custom batch function
    collate_fn = ClassificationBatchFunction(args.max_len, tr_set.pad_idx, tr_set.cls_idx, tr_set.sep_idx)
    tr_loader = DataLoader(dataset=tr_set,
                           batch_size=args.train_batch_size,
                           shuffle=True,
                           num_workers=8,
                           pin_memory=True,
                           collate_fn=collate_fn)

    dev_set = Datasets(file_path=args.dev_data_path,
                       label_list=label_list,
                       pretrained_type=args.pretrained_type,
                       max_len=args.max_len)
    dev_loader = DataLoader(dataset=dev_set,
                            batch_size=args.eval_batch_size,
                            num_workers=8,
                            pin_memory=True,
                            drop_last=False,
                            collate_fn=collate_fn)

    # Optimizer with layer-wise learning-rate decay
    optimizer = layerwise_decay_optimizer(model=model, lr=args.learning_rate,
                                          layerwise_decay=args.layerwise_decay)

    # LR scheduler
    t_total = len(tr_loader) // args.gradient_accumulation_steps * args.epochs
    warmup_steps = int(t_total * args.warmup_percent)
    logger.info('total training steps : {}, lr warmup steps : {}'.format(t_total, warmup_steps))
    # Use gradual warmup and linear decay
    scheduler = optimization.WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps, t_total=t_total)

    # For low-precision training
    if args.fp16:
        try:
            from apex import amp
            logger.info('Use fp16')
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level, verbosity=0)

    # Tensorboard setting
    save_path = "./model_saved_finetuning/lr{},batch{},total{},warmup{},len{},{}".format(
        args.learning_rate, args.train_batch_size * args.gradient_accumulation_steps,
        t_total, args.warmup_percent, args.max_len, args.pretrained_type)
    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    writer = SummaryWriter(save_path)

    # Save best model results with resultwriter
    result_writer = utils.ResultWriter("./model_saved_finetuning/results.csv")

    model.zero_grad()
    best_val_loss = 1e+9
    global_step = 0
    train_loss, train_acc, train_f1 = 0, 0, 0
    logging_loss, logging_acc, logging_f1 = 0, 0, 0

    logger.info('***** Training starts *****')
    total_result = []
    for epoch in tqdm(range(args.epochs), desc='epochs'):
        for step, batch in tqdm(enumerate(tr_loader), desc='steps', total=len(tr_loader)):
            model.train()
            x_train, mask_train, y_train = map(lambda x: x.to(device), batch)
            inputs = {
                'input_ids': x_train,
                'attention_mask': mask_train,
                'classification_label': y_train,
            }
            output, loss = model(**inputs)
            y_max = output.max(dim=1)[1]

            cr = classification_report(y_train.tolist(),
                                       y_max.tolist(),
                                       labels=list(range(len(label_list))),
                                       target_names=label_list,
                                       output_dict=True)
            # Get accuracy (micro f1)
            if 'micro avg' not in cr.keys():
                batch_acc = list(cr.items())[len(label_list)][1]
            else:
                # If at least one of the labels does not exist in the mini-batch, use micro average instead
                batch_acc = cr['micro avg']['f1-score']
            # Macro f1
            batch_macro_f1 = cr['macro avg']['f1-score']

            # Accumulate measures
            grad_accu = args.gradient_accumulation_steps
            if grad_accu > 1:
                loss /= grad_accu
                batch_acc /= grad_accu
                batch_macro_f1 /= grad_accu

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            train_loss += loss.item()
            train_acc += batch_acc
            train_f1 += batch_macro_f1

            if (global_step + 1) % grad_accu == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.grad_clip_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                if global_step % args.logging_step == 0:
                    acc_ = (train_acc - logging_acc) / args.logging_step
                    f1_ = (train_f1 - logging_f1) / args.logging_step
                    loss_ = (train_loss - logging_loss) / args.logging_step
                    writer.add_scalars('loss', {'train': loss_}, global_step)
                    writer.add_scalars('acc', {'train': acc_}, global_step)
                    writer.add_scalars('macro_f1', {'train': f1_}, global_step)
                    logger.info('[{}/{}], trn loss : {:.3f}, trn acc : {:.3f}, macro f1 : {:.3f}'.format(
                        global_step, t_total, loss_, acc_, f1_))
                    logging_acc, logging_f1, logging_loss = train_acc, train_f1, train_loss

                    # Get f1 score for each label
                    f1_results = [(l, r['f1-score']) for i, (l, r) in enumerate(cr.items())
                                  if i < len(label_list)]
                    f1_log = "\n".join(["{} : {}".format(l, f) for l, f in f1_results])
                    logger.info("\n\n***f1-score***\n" + f1_log + "\n\n***confusion matrix***\n{}".format(
                        confusion_matrix(y_train.tolist(), y_max.tolist())))

        # Validation
        val_loss, val_acc, val_macro_f1, _ = evaluate(args, dev_loader, model, device)
        val_result = '[{}/{}] val loss : {:.3f}, val acc : {:.3f}, val macro f1 : {:.3f}'.format(
            global_step, t_total, val_loss, val_acc, val_macro_f1)
        writer.add_scalars('loss', {'val': val_loss}, global_step)
        writer.add_scalars('acc', {'val': val_acc}, global_step)
        writer.add_scalars('macro_f1', {'val': val_macro_f1}, global_step)
        logger.info(val_result)
        total_result.append(val_result)

        if val_loss < best_val_loss:
            # Save model checkpoints
            torch.save(model.state_dict(), os.path.join(save_path, 'best_model.bin'))
            torch.save(args, os.path.join(save_path, 'training_args.bin'))
            logger.info('Saving model checkpoint to %s', save_path)
            best_val_loss = val_loss
            best_val_acc = val_acc
            best_val_macro_f1 = val_macro_f1

    # Save results in 'model_saved_finetuning/results.csv'
    results = {
        'val_loss': best_val_loss,
        'val_acc': best_val_acc,
        'val_macro_f1': best_val_macro_f1,
        'save_dir': save_path,
        'pretrained_path': pretrained_path,
    }
    result_writer.update(args, **results)
    return global_step, loss_, acc_, best_val_loss, best_val_acc, total_result
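The layerwise_decay_optimizer used in train() is not defined in this section. A minimal sketch of layer-wise learning-rate decay built on torch.optim.AdamW, assuming the standard BERT parameter naming (bert.encoder.layer.N.*); the repository's actual optimizer and grouping may differ:

import re
import torch

def layerwise_decay_optimizer(model, lr, layerwise_decay, num_layers=12):
    # Hypothetical sketch: scale the learning rate of each encoder layer by
    # layerwise_decay ** (distance from the top layer), so lower layers move
    # more slowly than the classification head.
    groups = []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        match = re.search(r'encoder\.layer\.(\d+)\.', name)
        if match:
            layer_idx = int(match.group(1))
            scale = layerwise_decay ** (num_layers - 1 - layer_idx)
        elif 'embeddings' in name:
            scale = layerwise_decay ** num_layers  # embeddings decay the most
        else:
            scale = 1.0  # pooler / classifier parameters use the base lr
        groups.append({'params': [param], 'lr': lr * scale})
    return torch.optim.AdamW(groups, lr=lr)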
except Exception as e: logging.critical("Unexpected error : %s", e) sys.exit() # print(DIRNAME) pretrained_model_path = os.path.join(MODEL_ABS_PATH, constant.MODEL_BIN_NAME) # print(pretrained_model_path) config_path = os.path.join(DIRNAME, constant.BERT_CONFIG_NAME) # print(config_path) pretrained = torch.load(pretrained_model_path, map_location='cpu') bert_config = BertConfig(config_path) bert_config.num_labels = 7 model = BertForEmotionClassification(bert_config) model.load_state_dict(pretrained, strict=False) model.eval() softmax = torch.nn.Softmax(dim=1) tokenizer, vocab = get_pretrained_model('etri') # '공포', '놀람', '분노', '슬픔', '중립', '행복', '혐오' # 'angry', 'surprise', 'angry', 'sad', 'neutral', 'joy', 'disgust' obj = dict() emotion = ['scare', 'surprise', 'angry', 'sad', 'neutral', 'joy', 'disgust'] def get_prediction(sentence):
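    # NOTE: the body of get_prediction is not included in this section; what
    # follows is a hypothetical sketch, not the repository's implementation.
    # It assumes the ETRI tokenizer exposes `tokenize`, that `vocab` maps
    # tokens to ids, and that the model returns logits when no
    # classification_label is passed.
    tokens = ['[CLS]'] + tokenizer.tokenize(sentence) + ['[SEP]']
    input_ids = torch.tensor([[vocab[t] for t in tokens]])
    with torch.no_grad():
        logits = model(input_ids, attention_mask=torch.ones_like(input_ids))
    probs = softmax(logits).squeeze(0)
    # Map each probability to the English emotion name defined above
    return {emotion[i]: round(float(p), 4) for i, p in enumerate(probs)}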
def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path, classification_head):
    """
    Copy/paste/tweak roberta's weights to our BERT structure.
    """
    roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
    roberta.eval()  # disable dropout
    config = BertConfig(
        vocab_size_or_config_json_file=50265,
        hidden_size=roberta.args.encoder_embed_dim,
        num_hidden_layers=roberta.args.encoder_layers,
        num_attention_heads=roberta.args.encoder_attention_heads,
        intermediate_size=roberta.args.encoder_ffn_embed_dim,
        max_position_embeddings=514,
        type_vocab_size=1,
        layer_norm_eps=1e-5,  # PyTorch default used in fairseq
    )
    if classification_head:
        config.num_labels = roberta.args.num_classes
    print("Our BERT config:", config)

    model = RobertaForSequenceClassification(config) if classification_head else RobertaForMaskedLM(config)
    model.eval()

    # Now let's copy all the weights.
    # Embeddings
    roberta_sent_encoder = roberta.model.decoder.sentence_encoder
    model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
    model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
        model.roberta.embeddings.token_type_embeddings.weight
    )  # just zero them out b/c RoBERTa doesn't use them.
    model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
    model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias

    for i in range(config.num_hidden_layers):
        # Encoder: start of layer
        layer: BertLayer = model.roberta.encoder.layer[i]
        roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i]

        # self attention
        self_attn: BertSelfAttention = layer.attention.self
        assert roberta_layer.self_attn.in_proj_weight.shape == torch.Size(
            (3 * config.hidden_size, config.hidden_size))
        # we use three distinct linear layers so we split the source layer here.
        self_attn.query.weight.data = roberta_layer.self_attn.in_proj_weight[:config.hidden_size, :]
        self_attn.query.bias.data = roberta_layer.self_attn.in_proj_bias[:config.hidden_size]
        self_attn.key.weight.data = roberta_layer.self_attn.in_proj_weight[
            config.hidden_size:2 * config.hidden_size, :]
        self_attn.key.bias.data = roberta_layer.self_attn.in_proj_bias[
            config.hidden_size:2 * config.hidden_size]
        self_attn.value.weight.data = roberta_layer.self_attn.in_proj_weight[2 * config.hidden_size:, :]
        self_attn.value.bias.data = roberta_layer.self_attn.in_proj_bias[2 * config.hidden_size:]

        # self-attention output
        self_output: BertSelfOutput = layer.attention.output
        assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape
        self_output.dense.weight = roberta_layer.self_attn.out_proj.weight
        self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
        self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
        self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias

        # intermediate
        intermediate: BertIntermediate = layer.intermediate
        assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape
        intermediate.dense.weight = roberta_layer.fc1.weight
        intermediate.dense.bias = roberta_layer.fc1.bias

        # output
        bert_output: BertOutput = layer.output
        assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape
        bert_output.dense.weight = roberta_layer.fc2.weight
        bert_output.dense.bias = roberta_layer.fc2.bias
        bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
        bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
        # end of layer

    if classification_head:
        model.classifier.dense.weight = roberta.model.classification_heads['mnli'].dense.weight
        model.classifier.dense.bias = roberta.model.classification_heads['mnli'].dense.bias
        model.classifier.out_proj.weight = roberta.model.classification_heads['mnli'].out_proj.weight
        model.classifier.out_proj.bias = roberta.model.classification_heads['mnli'].out_proj.bias
    else:
        # LM Head
        model.lm_head.dense.weight = roberta.model.decoder.lm_head.dense.weight
        model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias
        model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight
        model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias
        model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight
        model.lm_head.bias = roberta.model.decoder.lm_head.bias

    # Let's check that we get the same results.
    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0)  # batch of size 1

    our_output = model(input_ids)[0]
    if classification_head:
        their_output = roberta.model.classification_heads['mnli'](roberta.extract_features(input_ids))
    else:
        their_output = roberta.model(input_ids)[0]
    print(our_output.shape, their_output.shape)
    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
    success = torch.allclose(our_output, their_output, atol=1e-3)
    print("Do both models output the same tensors?", "🔥" if success else "💩")
    if not success:
        raise Exception("Something went wRoNg")

    print(f"Saving model to {pytorch_dump_folder_path}")
    model.save_pretrained(pytorch_dump_folder_path)
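The conversion function is normally driven from the command line. A minimal sketch of an argparse entry point for it; the flag names here are illustrative and may not match the upstream script exactly:

if __name__ == "__main__":
    import argparse

    # Hypothetical CLI wrapper for convert_roberta_checkpoint_to_pytorch.
    parser = argparse.ArgumentParser()
    parser.add_argument("--roberta_checkpoint_path", type=str, required=True,
                        help="Path to the fairseq RoBERTa checkpoint directory.")
    parser.add_argument("--pytorch_dump_folder_path", type=str, required=True,
                        help="Where to save the converted PyTorch model.")
    parser.add_argument("--classification_head", action="store_true",
                        help="Also convert the MNLI classification head.")
    args = parser.parse_args()
    convert_roberta_checkpoint_to_pytorch(
        args.roberta_checkpoint_path, args.pytorch_dump_folder_path, args.classification_head
    )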