def bert_baseline(arg):
    from bert_config import bert_parameter_dict

    version = arg.model
    parameters = bert_parameter_dict[version]
    batch_size = parameters["batch_size"]
    epoch_num = parameters["epoch_num"]
    learning_rate = parameters["learning_rate"]
    device = parameters["device"]
    early_stop_epoch = parameters["early_stop_epoch"]

    dl_model_dir = os.path.join(model_dir, version)
    create_dir(dl_model_dir)

    data_cached_path = os.path.join(cache_dir, version + ".h5")
    if os.path.isfile(data_cached_path):
        x_train, y_train, x_test, y_test, x_dev, y_dev = h5_load(
            data_cached_path,
            ["x_train", "y_train", "x_test", "y_test", "x_dev", "y_dev"],
            dtype=np.int32,
            verbose=True)
    else:
        # load data
        x_train, y_train = load_data(phrase="train", verbose=True)
        x_test, y_test = load_data(phrase="test", verbose=True)
        x_dev, y_dev = load_data(phrase="dev", verbose=True)

        # turn text into ids
        if version == "bert":
            tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
        elif version == "sci-bert":
            tokenizer = AutoTokenizer.from_pretrained(
                'allenai/scibert_scivocab_uncased')
        tokenizer.save_pretrained(dl_model_dir)

        feature = Feature(tokenizer=tokenizer)
        x_train = feature.extract(x_train[:])
        x_test = feature.extract(x_test[:])
        x_dev = feature.extract(x_dev[:])

        # turn label into vector
        y_train = np.array([label_mapping[y] for y in y_train])
        y_test = np.array([label_mapping[y] for y in y_test])
        y_dev = np.array([label_mapping[y] for y in y_dev])

        # cache data
        with h5py.File(data_cached_path, 'w') as outfile:
            outfile.create_dataset("x_train", data=x_train)
            outfile.create_dataset("y_train", data=y_train)
            outfile.create_dataset("x_test", data=x_test)
            outfile.create_dataset("y_test", data=y_test)
            outfile.create_dataset("x_dev", data=x_dev)
            outfile.create_dataset("y_dev", data=y_dev)

    print("Train", x_train.shape, y_train.shape)
    print("Test", x_test.shape, y_test.shape)
    print("Valid", x_dev.shape, y_dev.shape)

    #subset_num = 1000
    #x_train, y_train = x_train[:subset_num], y_train[:subset_num]
    #x_dev, y_dev = x_dev[:subset_num], y_dev[:subset_num]
    #x_test, y_test = x_test[:subset_num], y_test[:subset_num]

    train_dataset = CovidDataset(x_train, y_train)
    test_dataset = CovidDataset(x_test, y_test)
    dev_dataset = CovidDataset(x_dev, y_dev)
    training = data.DataLoader(train_dataset, batch_size=batch_size,
                               shuffle=True, num_workers=4)
    testing = data.DataLoader(test_dataset, batch_size=batch_size,
                              shuffle=False, num_workers=4)
    dev = data.DataLoader(dev_dataset, batch_size=batch_size,
                          shuffle=False, num_workers=4)

    # model
    if version == "bert":
        print("Using Bert!!!")
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased', num_labels=5).to(device)
    elif version == "sci-bert":
        print("Using SCI-Bert!!!")
        #config = BertConfig(vocab_size=31090, num_labels=5)
        config = AutoConfig.from_pretrained('allenai/scibert_scivocab_uncased')
        config.num_labels = 5
        model = AutoModelForSequenceClassification.from_pretrained(
            'allenai/scibert_scivocab_uncased', config=config).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    acc, _, _ = evaluate(model, dev, device=device)
    best_model = None
    best_accuracy = 0.0
    best_epoch = 0
    stopper = EarlyStop(mode="max", history=early_stop_epoch)

    for epoch in range(1, epoch_num + 1):
        model.train()
        total_loss = 0
        total_acc = 0
        total_count = len(train_dataset) // batch_size
        for count, (x_batch, y_batch) in enumerate(training, 1):
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)
            optimizer.zero_grad()
            outputs = model(x_batch, labels=y_batch)
            loss, y_pred = outputs[0:2]
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

            # compute accuracy
            y_pred = torch.argmax(y_pred, dim=1)
            correct_num = torch.sum(y_pred == y_batch).double()
            total_acc += correct_num / y_pred.shape[0]

            print("\x1b[2K\rEpoch: {} / {} [{:.2f}%] Loss: {:.5f} Acc: {:.5f}".
                  format(epoch, epoch_num, 100.0 * count / total_count,
                         total_loss / count, total_acc / count),
                  end="")
        print()

        if epoch % 1 == 0:
            acc, _, _ = evaluate(model, dev, device=device)
            if acc > best_accuracy:
                best_model = copy.deepcopy(model.state_dict())
                best_accuracy = acc
                best_epoch = epoch
            # check early stopping
            if stopper.check(acc):
                print("Early Stopping at Epoch = ", epoch)
                break

    # load best model & test & save
    print("loading model from epoch {}".format(best_epoch))
    #torch.save(best_model, os.path.join(dl_model_dir, "best_model.pt"))
    model.load_state_dict(best_model)
    model.save_pretrained(dl_model_dir)

    acc, predict, true_label = evaluate(model, testing, device=device)
    score = precision_recall_fscore_support(true_label, predict)
    table = output_score(score)
    print(table)

    # output result
    with open(os.path.join(result_dir, "{}.result".format(version)),
              'w', encoding='utf-8') as outfile:
        outfile.write(table.to_csv(path_or_buf=None) + "\n")
        outfile.write("acc = {}\n".format(acc))
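# Hedged usage sketch (not part of the original source): bert_baseline() above expects an
# object whose .model attribute names a key of bert_parameter_dict ("bert" or "sci-bert"),
# plus the module-level helpers (load_data, Feature, CovidDataset, evaluate, model_dir, ...)
# defined elsewhere in the same project. A minimal illustrative invocation could look like:
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--model", choices=["bert", "sci-bert"], default="bert")
    bert_baseline(parser.parse_args())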
def train_process(config, train_load, valid_load, test_load, k, train_sampler):
    # load source bert weights
    # model_config = BertConfig.from_pretrained(pretrained_model_name_or_path="../user_data/bert_source/{}/config.json".format(config.model_name))
    model_config = BertConfig()
    model_config.vocab_size = len(
        pd.read_csv('../user_data/vocab', names=["score"]))

    model = BertForSequenceClassification(config=model_config)

    if os.path.isfile('save_model/{}_best_model_v1111.pth.tar'.format(
            config.model_name)):
        checkpoint = torch.load('save_model/{}_best_model_v1.pth.tar'.format(
            config.model_name), map_location=torch.device('cpu'))
        model.load_state_dict(checkpoint['status'], strict=False)
        best_dev_auc = 0
        print('***********load best model weight*************')
    else:
        checkpoint = torch.load(
            '../user_data/save_bert/{}_checkpoint.pth.tar'.format(
                config.model_name),
            map_location=torch.device('cpu'))
        model.load_state_dict(checkpoint['status'], strict=False)
        best_dev_auc = 0
        print('***********load pretrained mlm model weight*************')

    for param in model.parameters():
        param.requires_grad = True

    # 4) move the model to its GPU before wrapping it
    model = model.to(config.device)

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": config.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate)
    # t_total = len(train_load) * config.num_train_epochs
    # scheduler = get_linear_schedule_with_warmup(
    #     optimizer, num_warmup_steps=t_total * config.warmup_proportion, num_training_steps=t_total
    # )
    cudnn.benchmark = True

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
    # 5) wrap the model with DistributedDataParallel
    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[config.local_rank])
    model.train()

    if config.fgm:
        fgm = FGM(model)

    for epoch in range(config.num_train_epochs):
        train_sampler.set_epoch(epoch)
        is_best = False
        torch.cuda.empty_cache()
        for batch, (input_ids, token_type_ids, attention_mask,
                    label) in enumerate(train_load):
            input_ids = input_ids.cuda(config.local_rank, non_blocking=True)
            attention_mask = attention_mask.cuda(config.local_rank,
                                                 non_blocking=True)
            token_type_ids = token_type_ids.cuda(config.local_rank,
                                                 non_blocking=True)
            label = label.cuda(config.local_rank, non_blocking=True)
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            labels=label)
            loss = outputs.loss
            model.zero_grad()
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
            if config.fgm:
                fgm.attack()  # add adversarial perturbation on the embeddings
                loss_adv = model(input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 token_type_ids=token_type_ids,
                                 labels=label).loss
                # backprop, accumulating the adversarial gradients on top of the normal gradients
                loss_adv.backward()
                fgm.restore()  # restore the embedding parameters
            optimizer.step()
            # scheduler.step()

        dev_auc = model_evaluate(config, model, valid_load)

        # synchronize all processes and reduce the distributed metric
        torch.distributed.barrier()
        reduce_dev_auc = reduce_auc(dev_auc, config.nprocs).item()

        if reduce_dev_auc > best_dev_auc:
            best_dev_auc = reduce_dev_auc
            is_best = True

        now = strftime("%Y-%m-%d %H:%M:%S", localtime())
        msg = 'number {} fold,time:{},epoch:{}/{},reduce_dev_auc:{},best_dev_auc:{}'

        if config.local_rank in [0, -1]:
            print(
                msg.format(k, now, epoch + 1, config.num_train_epochs,
                           reduce_dev_auc, best_dev_auc))
            checkpoint = {
                "status": model.state_dict(),
                "epoch": epoch + 1,
                'reduce_dev_auc': reduce_dev_auc
            }
            if is_best:
                torch.save(
                    checkpoint, '../user_data/save_model' + os.sep +
                    '{}_best_model.pth.tar'.format(config.model_name))
            torch.save(
                checkpoint, '../user_data/save_model' + os.sep +
                '{}_checkpoint.pth.tar'.format(config.model_name))
            del checkpoint
        torch.distributed.barrier()
def test_24_hour_model(all_examples):
    input_ids, attention_masks, labels = tokenize_all_examples(all_examples)

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    # Combine the training inputs into a TensorDataset.
    dataset = TensorDataset(input_ids, attention_masks, labels)

    # The DataLoader needs to know our batch size for training, so we specify it here.
    # For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32.
    batch_size = 32

    # For validation the order doesn't matter, so we'll just read them sequentially.
    validation_dataloader = DataLoader(
        dataset,  # The validation samples.
        sampler=SequentialSampler(dataset),  # Pull out batches sequentially.
        batch_size=batch_size  # Evaluate with this batch size.
    )

    # Load BertForSequenceClassification, the pretrained BERT model with a single
    # linear classification layer on top.
    model = BertForSequenceClassification.from_pretrained(
        # Use the 12-layer BERT model, with an uncased vocab.
        "./bert_models/full_24_hour_model/",
        num_labels=24,  # 24 class model for us
        output_attentions=False,  # Whether the model returns attentions weights.
        output_hidden_states=False,  # Whether the model returns all hidden-states.
    )

    # Tell pytorch to run this model on the GPU.
    model.cuda()

    # Put model in evaluation mode
    model.eval()

    # Tracking variables
    predictions, true_labels = [], []

    # Predict
    for batch in validation_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)

        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients, saving memory and
        # speeding up prediction
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)

        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Store predictions and true labels
        predictions.append(logits)
        true_labels.append(label_ids)

    return predictions, true_labels
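# Hedged follow-up sketch (not in the original source): test_24_hour_model() returns
# per-batch logit arrays and label arrays, so a caller typically flattens them before
# computing accuracy. The wrapper name below is an assumption for illustration only.
def summarize_24_hour_predictions(all_examples):
    import numpy as np

    predictions, true_labels = test_24_hour_model(all_examples)
    flat_preds = np.argmax(np.concatenate(predictions, axis=0), axis=1)
    flat_labels = np.concatenate(true_labels, axis=0)
    accuracy = (flat_preds == flat_labels).mean()
    print("Accuracy on {} examples: {:.4f}".format(len(flat_labels), accuracy))
    return accuracy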
def predict(text):
    device = setup()

    preprocessed_text = text_cleansing(text)
    print('Preprocessed text:', preprocessed_text)

    print('Loading model...')
    model = BertForSequenceClassification.from_pretrained(
        app.config["MODEL_2_PATH"])
    tokenizer = BertTokenizer.from_pretrained(app.config["MODEL_2_PATH"])
    # Copy the model to the GPU.
    model.to(device)
    print('Model has loaded.')

    encoded_dict = tokenizer.encode_plus(
        preprocessed_text,  # Sentence to encode.
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        max_length=128,  # Pad & truncate all sentences.
        return_attention_mask=True,  # Construct attn. masks.
        return_tensors='pt',  # Return pytorch tensors.
        truncation=True,
        padding='max_length')
    # encoded_dict = encoded_dict.to(device)

    # Add the encoded sentence to the list.
    input_ids = encoded_dict['input_ids']
    input_ids = input_ids.to(device)
    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks = encoded_dict['attention_mask']
    attention_masks = attention_masks.to(device)

    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_masks)
    # outputs.to(device)
    print('Outputs:', outputs)

    logits = outputs[0]
    softmax = torch.nn.functional.softmax(logits, dim=1)
    logits = logits.detach().cpu().numpy()
    softmax = softmax.detach().cpu().numpy()
    print('Logits:', logits)
    print('Softmax:', softmax)

    label_id = np.argmax(logits, axis=1).flatten()
    percentage = np.max(softmax * 100)
    # Indonesian label strings: 'Kekerasan' = violent content, 'Non-Kekerasan' = non-violent.
    if label_id == 0:
        label_name = 'Non-Kekerasan'
    elif label_id == 1:
        label_name = 'Kekerasan'

    # "Konten ini adalah ..." = "This content is ..."
    prediction = 'Konten ini adalah {} ({:.0f}%)'.format(label_name, percentage)
    print(prediction)

    return prediction  # prediction
def main(args):
    """ """
    # Create output dir if none mentioned.
    if args.output_dir is None:
        model_name = os.path.splitext(
            os.path.basename(args.model_name_or_path))[0]
        args.output_dir = "./output/" + model_name + '/'
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    print("\n========================================")
    print('                 MODEL                  ')
    print("========================================")
    print("Loading BertForSequenceClassification model...")
    model = BertForSequenceClassification.from_pretrained(
        args.model_name_or_path,  # Use the 12-layer BERT model, with a cased vocab.
        num_labels=args.num_labels,  # The number of output labels
        output_attentions=False,  # Whether the model returns attentions weights.
        output_hidden_states=False,  # Whether the model returns all hidden-states.
        cache_dir=args.cache_dir,
    )
    print('Loading BertTokenizer...')
    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path,
                                              do_lower_case=False)

    print("Setting up CUDA & GPU...")
    if torch.cuda.is_available():
        if args.gpu_id is not None:
            torch.cuda.set_device(args.gpu_id)
            args.n_gpu = 1
            print(" - GPU {} {} will be used.".format(
                torch.cuda.get_device_name(args.gpu_id), args.gpu_id))
        else:
            args.n_gpu = torch.cuda.device_count()
            gpu_ids = list(range(0, args.n_gpu))
            if args.n_gpu > 1:
                model = torch.nn.DataParallel(model, device_ids=gpu_ids,
                                              output_device=gpu_ids[-1])
            print(" - GPU(s) {} will be used.".format(str(gpu_ids)))
        args.device = torch.device("cuda")
    else:
        args.device = torch.device("cpu")
        args.n_gpu = 0
        print(" - No GPU available, using the CPU instead.")
    model.to(args.device)

    # Set the seed value all over the place to make this reproducible.
    set_seed(args.seed)

    print("\n========================================")
    print('                 DATA                   ')
    print("========================================")
    print("Loading data...")
    classes_of_interest = [
        'Data Sheets', 'Configuration (Guides, Examples & TechNotes)',
        'Install & Upgrade Guides', 'Release Notes', 'End User Guides'
    ]
    df, categories = load_data(args, classes_of_interest)
    sentences = df.Sentence.values
    classes = df.Class.values
    class_ids = df.Class_id.values
    print(' - Number of sentences: {:,}'.format(df.shape[0]))
    print(' - Number of doc types: {:,}'.format(len(categories)))
    for i, cat in enumerate(categories):
        print("    * {} : {}".format(cat, i))

    print("Tokenizing sentences...")
    tokenized = tokenize_sentences(tokenizer, df)
    attention_masks = create_masks(tokenized)

    print("Splitting dataset...")
    dataset = (tokenized, class_ids, attention_masks, sentences)
    train_set, val_set, test_set = split_data(args, dataset)
    print(" - Samples in train set: {}".format(len(train_set[0])))
    train_ids = Counter(train_set[1]).keys()
    train_ids_freq = Counter(train_set[1]).values()
    for i, freq in zip(train_ids, train_ids_freq):
        print("    * {} : {}".format(i, freq))
    print(" - Samples in val set: {}".format(len(val_set[0])))
    val_ids = Counter(val_set[1]).keys()
    val_ids_freq = Counter(val_set[1]).values()
    for i, freq in zip(val_ids, val_ids_freq):
        print("    * {} : {}".format(i, freq))
    print(" - Samples in test set: {}".format(len(test_set[0])))
    test_ids = Counter(test_set[1]).keys()
    test_ids_freq = Counter(test_set[1]).values()
    for i, freq in zip(test_ids, test_ids_freq):
        print("    * {} : {}".format(i, freq))

    if args.do_train:
        print("\n========================================")
        print('               TRAINING                 ')
        print("========================================")
        model = train(args, model, tokenizer, categories, train_set, val_set)

    if args.do_test:
        print("\n========================================")
        print('               TESTING                  ')
        print("========================================")
        print("Evaluation on entire test set...")
        result, df_wrong, df_right = evaluate(args, model, categories, test_set)
        plot_confusion_matrix(result['conf_matrix'], categories, args.output_dir)
        df_wrong.to_csv(os.path.join(args.output_dir, 'preds_wrong.csv'))
        df_right.to_csv(os.path.join(args.output_dir, 'preds_right.csv'))
        with open(os.path.join(args.output_dir, 'test_set_scores.json'), 'w+') as f:
            json.dump(result, f)
        print("  * Accuracy: {0:.6f}".format(result['Accuracy']))
        print("  * MCC: {0:.6f}".format(result['MCC']))
        print("  Macro Average")
        print("  * Recall: {0:.6f}".format(result['Macro_Average']['Recall']))
        print("  * Precision: {0:.6f}".format(result['Macro_Average']['Precision']))
        print("  * F1 score: {0:.6f}".format(result['Macro_Average']['F1']))
        print("  Weighted Average")
        print("  * Recall: {0:.6f}".format(result['Weighted_Average']['Recall']))
        print("  * Precision: {0:.6f}".format(result['Weighted_Average']['Precision']))
        print("  * F1 score: {0:.6f}".format(result['Weighted_Average']['F1']))

        print("Evaluation on bootstrap samples from test set...")
        stats = bootstrap_evaluation(args, model, categories, test_set, 100)
        with open(os.path.join(args.output_dir, 'bootstrap_scores.json'), 'w+') as f:
            json.dump(stats, f)

    if args.do_compare:
        print("Evaluation on BERT predictions...")
        evaluate_bert_preds(args, model, tokenizer, categories)
# =============================================================================
# Define model
# =============================================================================
from transformers import BertForSequenceClassification, AdamW, BertConfig

# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",  # Use the 12-layer BERT model, with an uncased vocab.
    num_labels=len(set(labels.numpy())),  # The number of output labels--2 for binary
                                          # classification. You can increase this for
                                          # multi-class tasks.
    output_attentions=False,  # Whether the model returns attentions weights.
    output_hidden_states=False,  # Whether the model returns all hidden-states.
)

# Tell pytorch to run this model on the GPU.
model.cuda()

# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

# =============================================================================
# Optimizer & Learning Rate Scheduler
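# Hedged continuation sketch (not the original code, which is cut off at the header above):
# a common AdamW + linear-warmup setup for this kind of script, assuming `train_dataloader`
# is built elsewhere; the learning rate, epsilon, and epoch count here are illustrative only.
from transformers import get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)  # assumed hyperparameters
epochs = 4  # assumed value for illustration
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                            num_training_steps=total_steps)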
model.compile(optimizer=opt, loss=loss, metrics=[metric])

# Train and evaluate using tf.keras.Model.fit()
train_steps = num_train // 32
valid_steps = num_valid // 32
history = model.fit(Xtrain, epochs=2, steps_per_epoch=train_steps,
                    validation_data=Xvalid, validation_steps=valid_steps)
model.save_pretrained(FINE_TUNED_MODEL_DIR)

# load saved model
saved_model = BertForSequenceClassification.from_pretrained(
    FINE_TUNED_MODEL_DIR, from_tf=True)

# predict sentence paraphrase
sentence_0 = "At least 12 people were killed in the battle last week."
sentence_1 = "At least 12 people lost their lives in last weeks fighting."
sentence_2 = "The fires burnt down the houses on the street."

inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, return_tensors="pt")
inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, return_tensors="pt")

pred_1 = saved_model(**inputs_1)[0].argmax().item()
pred_2 = saved_model(**inputs_2)[0].argmax().item()


def print_result(id1, id2, pred):
    if pred == 1:
def train_classifier(model: BertForSequenceClassification,
                     dataset: TensorDataset,
                     validation_ratio: float,
                     batch_size: int,
                     freeze_embeddings_layer: bool,
                     freeze_encoder_layers: int,
                     epochs: int) -> (BertForSequenceClassification, list):
    device = select_device()

    train_size = int(validation_ratio * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    train_dataloader = DataLoader(train_dataset,
                                  sampler=RandomSampler(train_dataset),
                                  batch_size=batch_size)
    validation_dataloader = DataLoader(val_dataset,
                                       sampler=SequentialSampler(val_dataset),
                                       batch_size=batch_size)

    modules = []
    if freeze_embeddings_layer:
        modules.append(model.bert.embeddings)
    for i in range(freeze_encoder_layers):
        modules.append(model.bert.encoder.layer[i])
    for module in modules:
        for param in module.parameters():
            param.requires_grad = False

    model.to(device)

    optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-5)

    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                                num_training_steps=total_steps)

    training_stats = []
    total_t0 = time.time()

    for epoch_i in range(0, epochs):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        t0 = time.time()
        total_train_loss = 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            if step % 40 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(
                    step, len(train_dataloader), elapsed))

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad()
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            logits = outputs.logits
            total_train_loss += loss.item()
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(training_time))

        print("")
        print("Running Validation...")

        t0 = time.time()
        model.eval()

        total_eval_accuracy = 0
        total_eval_loss = 0
        nb_eval_steps = 0

        for batch in validation_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            with torch.no_grad():
                outputs = model(b_input_ids, token_type_ids=None,
                                attention_mask=b_input_mask, labels=b_labels)
                loss = outputs.loss
                logits = outputs.logits

            total_eval_loss += loss.item()
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.cpu().numpy()
            total_eval_accuracy += flat_accuracy(logits, label_ids)

        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

        avg_val_loss = total_eval_loss / len(validation_dataloader)
        validation_time = format_time(time.time() - t0)

        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

        training_stats.append({
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        })

    print("")
    print("Training complete!")
    print("Total training took {:} (h:mm:ss)".format(
        format_time(time.time() - total_t0)))

    return model, training_stats
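# Hedged usage sketch (not part of the original source): train_classifier() expects an
# already-tokenized TensorDataset of (input_ids, attention_mask, label) tensors. The wrapper
# name and hyperparameter values below are assumptions for illustration only.
def build_and_train(input_ids, attention_masks, labels):
    dataset = TensorDataset(input_ids, attention_masks, labels)
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
    # Freeze the embeddings and the first 6 encoder layers, keep 90% of the data for training.
    return train_classifier(model, dataset, validation_ratio=0.9, batch_size=32,
                            freeze_embeddings_layer=True, freeze_encoder_layers=6, epochs=2)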
def get_model(opt):
    #model = BertForSequenceClassification.from_pretrained('./bert-base-cased',num_labels=opt.num_labels)
    model = BertForSequenceClassification.from_pretrained(
        './bert-base-uncased', num_labels=opt.num_labels)
    return model
def main(args):
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
            device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    os.makedirs(args.output_dir, exist_ok=True)
    json.dump(args.__dict__,
              open(os.path.join(args.output_dir,
                                'opt_{}.json'.format(args.task_name)), 'w'),
              sort_keys=True, indent=2)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)

    amp_handle = None
    if args.fp16:
        from apex import amp
        amp_handle = amp.init(enable_caching=True)

    # Prepare model
    if (args.model_recover_path is None) or len(args.model_recover_path) == 0:
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, num_labels=num_labels)
    else:
        if not os.path.exists(args.model_recover_path):
            logger.info("Path does not exist: {0}".format(args.model_recover_path))
            sys.exit(0)
        logger.info("***** Recover model: {0} *****".format(args.model_recover_path))
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model,
            state_dict=torch.load(args.model_recover_path),
            num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    # note: args.train_batch_size has been changed to (/= args.gradient_accumulation_steps)
    if args.do_train:
        t_total = int(len(train_examples) / args.train_batch_size /
                      args.gradient_accumulation_steps * args.num_train_epochs)
    else:
        t_total = 1
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
                      correct_bias=False)
    if args.local_rank != -1:
        t_total = t_total // torch.distributed.get_world_size()
    if args.fp16:
        try:
            from apex.fp16_utils.fp16_optimizer import FP16_Optimizer
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)

    logger.info("***** CUDA.empty_cache() *****")
    torch.cuda.empty_cache()

    if args.task_name == 'sts-b':
        if args.fp16:
            lbl_type = torch.half
        else:
            lbl_type = torch.float
    else:
        lbl_type = torch.long

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", t_total)
        train_data = convert_features_to_dataset(train_features, lbl_type)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        best_result = 0.0
        for i_epoch in trange(1, args.num_train_epochs + 1, desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            model.train()
            iter_bar = tqdm(train_dataloader, desc='Iter (loss=X.XXX)')
            for step, batch in enumerate(iter_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                outputs = model(input_ids, attention_mask=input_mask,
                                token_type_ids=segment_ids, labels=label_ids)
                loss = outputs[0]
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                    if amp_handle:
                        amp_handle._clear_cache()
                else:
                    loss.backward()
                tr_loss += loss.item()
                iter_bar.set_description('Iter (loss=%5.3f)' % loss.item())
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            # Perform validation
            eval_examples = processor.get_dev_examples(args.data_dir)
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer)
            eval_data = convert_features_to_dataset(eval_features, lbl_type)
            eval_segment = processor.get_dev_segments()[0]
            logger.info("***** Running evaluation: {0}-{1} *****".format(
                eval_segment, i_epoch))
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)

            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)
            model.eval()
            eval_loss, eval_result = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            all_logits, all_label_ids = [], []
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    outputs = model(input_ids, attention_mask=input_mask,
                                    token_type_ids=segment_ids, labels=label_ids)
                    tmp_eval_loss = outputs[0]
                    logits = outputs[1]
                    if amp_handle:
                        amp_handle._clear_cache()

                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                all_logits.append(logits)
                all_label_ids.append(label_ids)
                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1

            eval_loss = eval_loss / nb_eval_steps

            # compute evaluation metric
            all_logits = np.concatenate(all_logits, axis=0)
            all_label_ids = np.concatenate(all_label_ids, axis=0)
            metric_func = processor.get_metric_func()
            eval_result = metric_func(all_logits, all_label_ids)

            # logging the results
            logger.info("***** Eval results for {0}: {1} *****".format(
                eval_segment, eval_result))

            if eval_result > best_result:
                best_result = eval_result
                # Save a trained model
                model_to_save = model.module if hasattr(
                    model, 'module') else model  # Only save the model it-self
                output_model_file = os.path.join(
                    args.output_dir, "{0}.pt".format(args.task_name))
                torch.save(model_to_save.state_dict(), output_model_file)
                logger.info("  Saved best model to {0}".format(output_model_file))

        # delete unused variables
        del optimizer
        del param_optimizer
        del optimizer_grouped_parameters

    # Load a trained model that you have fine-tuned
    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        logger.info("***** CUDA.empty_cache() *****")
        torch.cuda.empty_cache()
        del model

        output_model_file = os.path.join(args.output_dir,
                                         "{0}.pt".format(args.task_name))
        model_state_dict = torch.load(output_model_file)
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, state_dict=model_state_dict, num_labels=num_labels)
        model.to(device)
        if n_gpu > 1:
            model = torch.nn.DataParallel(model)

        eval_set_list = []
        for eval_segment in processor.get_dev_segments():
            eval_examples = processor.get_dev_examples(args.data_dir,
                                                       segment=eval_segment)
            eval_set_list.append((eval_segment, eval_examples))
            break

        for eval_segment, eval_examples in eval_set_list:
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer)
            eval_data = convert_features_to_dataset(eval_features, lbl_type)
            logger.info("***** Running evaluation: %s *****", eval_segment)
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)

            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)
            model.eval()
            eval_loss, eval_result = 0, 0
            nb_eval_steps, nb_eval_examples = 0, 0
            all_logits, all_label_ids = [], []
            for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    outputs = model(input_ids, attention_mask=input_mask,
                                    token_type_ids=segment_ids, labels=label_ids)
                    tmp_eval_loss = outputs[0]
                    logits = outputs[1]
                    if amp_handle:
                        amp_handle._clear_cache()

                logits = logits.detach().cpu().numpy()
                label_ids = label_ids.to('cpu').numpy()
                all_logits.append(logits)
                all_label_ids.append(label_ids)
                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1

            eval_loss = eval_loss / nb_eval_steps

            # compute evaluation metric
            all_logits = np.concatenate(all_logits, axis=0)
            all_label_ids = np.concatenate(all_label_ids, axis=0)
            metric_func = processor.get_metric_func()
            eval_result = metric_func(all_logits, all_label_ids)

            # logging the results
            logger.info("***** Eval results for {0}: {1} *****".format(
                eval_segment, eval_result))
def npoclass(inputs, gpu_core=True, model_path=None, ntee_type='bc',
             n_jobs=4, backend='multiprocessing', batch_size_dl=64, verbose=1):

    # Set the seed value all over the place to make this reproducible.
    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)

    # Check model files.
    if ntee_type == 'bc' and model_path == None:
        raise ValueError(
            "Make sure model files/path are correct. Please download from https://jima.me/open/npoclass_model_bc.zip, unzip, and specify model_path (default set to None)."
        )
    if ntee_type == 'mg' and model_path == None:
        raise ValueError(
            "Make sure model files/path are correct. Please download from https://jima.me/open/npoclass_model_mg.zip, unzip, and specify model_path (default set to None)."
        )

    # Check ntee type.
    if ntee_type == 'bc':
        le_file_name = 'le_broad_cat.pkl'
    elif ntee_type == 'mg':
        le_file_name = 'le_major_group.pkl'
    else:
        raise ValueError(
            "ntee_type must be 'bc' (broad category) or 'mg' (major group)")

    # Read model and label encoder, if not read.
    global model_loaded, tokenizer_loaded, label_encoder
    try:
        assert model_loaded
        assert tokenizer_loaded
        assert label_encoder
    except:
        # load a pretrained model and tokenizer.
        model_loaded = BertForSequenceClassification.from_pretrained(model_path)
        tokenizer_loaded = BertTokenizer.from_pretrained(model_path)
        # Read label encoder.
        with open(model_path + le_file_name, 'rb') as label_encoder_pkl:
            label_encoder = pickle.load(label_encoder_pkl)

    # Select acceleration method.
    if gpu_core == True and torch.cuda.is_available():
        print('There are %d GPU(s) available.' % torch.cuda.device_count(),
              'Using GPU:', torch.cuda.get_device_name(0))
        torch.cuda.manual_seed_all(seed_val)
        device = torch.device('cuda')
        model_loaded.cuda()
    else:
        print('No GPU acceleration available or gpu_core=False, using CPU.')
        device = torch.device('cpu')
        model_loaded.cpu()
    print('Encoding inputs ...')
    sleep(.5)  # Pause a second for better printing results.

    # Encode inputs.
    global func_encode_string, func_encode_string_batch  # Define as global, otherwise cannot pickle or very slow.

    def func_encode_string(text_string):
        encoded_dict = tokenizer_loaded.encode_plus(
            text_string,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            truncation='longest_first',
            padding='max_length',  # Max length accepted by model.
            return_attention_mask=True,  # Construct attn. masks.
            return_tensors='pt',  # Return pytorch tensors.
        )
        return encoded_dict

    def func_encode_string_batch(text_strings):
        encoded_dicts = []
        for text_string in text_strings:
            encoded_dicts += [func_encode_string(text_string)]
        return encoded_dicts

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids = []
    attention_masks = []
    # Encode input string(s).
    if type(inputs) == list:
        if backend == 'multiprocessing':
            # Multiprocessing is faster than loky in processing large objects.
            encoded_outputs = Parallel(
                n_jobs=n_jobs, backend="multiprocessing",
                batch_size='auto', verbose=verbose)(
                    delayed(func_encode_string)(text_string)
                    for text_string in inputs)
            for encoded_output in encoded_outputs:
                # Add the encoded sentence to the list.
                input_ids.append(encoded_output['input_ids'])
                # And its attention mask (simply differentiates padding from non-padding).
                attention_masks.append(encoded_output['attention_mask'])
        elif backend == 'sequential':
            for text_string in tqdm(inputs):
                encoded_output = func_encode_string(text_string)
                # Add the encoded sentence to the list.
                input_ids.append(encoded_output['input_ids'])
                # And its attention mask (simply differentiates padding from non-padding).
                attention_masks.append(encoded_output['attention_mask'])
        elif backend == 'dask':
            with joblib.parallel_backend('dask'):
                n_jobs = len(client.scheduler_info()['workers'])  # Get # of workers.
                string_chunks = partition_all(
                    math.ceil(len(inputs) / n_jobs),
                    inputs)  # Collect into groups of size by worker numbers.
                encoded_outputs = Parallel(
                    n_jobs=-1, batch_size='auto', verbose=verbose)(
                        delayed(func_encode_string_batch)(text_strings)
                        for text_strings in string_chunks)
                encoded_outputs = itertools.chain(*encoded_outputs)
            for encoded_output in encoded_outputs:
                # Add the encoded sentence to the list.
                input_ids.append(encoded_output['input_ids'])
                # And its attention mask (simply differentiates padding from non-padding).
                attention_masks.append(encoded_output['attention_mask'])
    if type(inputs) == str:
        encoded_output = func_encode_string(inputs)
        input_ids = [encoded_output['input_ids']]
        attention_masks = [encoded_output['attention_mask']]

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    # Prepare dataloader for efficient calculation.
    pred_data = TensorDataset(input_ids, attention_masks)
    pred_sampler = SequentialSampler(pred_data)
    pred_dataloader = DataLoader(pred_data, sampler=pred_sampler,
                                 batch_size=batch_size_dl)

    # Start prediction.
    model_loaded.eval()
    logits_all = []
    print('Predicting categories ...')
    sleep(.5)  # Pause a second for better printing results.
    for batch in tqdm(pred_dataloader, mininterval=10):
        # Add batch to the pre-chosen device
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask = batch
        with torch.no_grad():
            outputs = model_loaded(b_input_ids, token_type_ids=None,
                                   attention_mask=b_input_mask)
        logits_all += outputs[0].tolist()

    # Calculate probabilities from the logits.
    logits_prob = tf.nn.sigmoid(logits_all).numpy().tolist()

    # Find the positions of max values in logits.
    logits_max = np.argmax(logits_prob, axis=1)

    # Transfer to labels.
    logits_labels = label_encoder.inverse_transform(logits_max)

    # Compile results to be returned.
    result_list = []
    for list_index in range(0, len(logits_labels)):
        result_dict = {}
        result_dict['recommended'] = logits_labels[list_index]
        conf_prob = logits_prob[list_index][logits_max[list_index]]
        if conf_prob >= .99:
            result_dict['confidence'] = 'high (>=.99)'
        elif conf_prob >= .95:
            result_dict['confidence'] = 'medium (<.99|>=.95)'
        else:
            result_dict['confidence'] = 'low (<.95)'
        prob_dict = {}
        for label_index in range(0, len(label_encoder.classes_)):
            prob_dict[label_encoder.classes_[label_index]] = logits_prob[
                list_index][label_index]
        result_dict['probabilities'] = prob_dict
        result_list += [result_dict]

    return result_list
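# Hedged usage sketch (not part of the original source): npoclass() accepts either a single
# string or a list of strings and returns one result dict per input. The input text and the
# model_path below are assumptions for illustration; the path should point at the unzipped
# model directory (note the trailing slash, since it is concatenated with the pickle name).
if __name__ == '__main__':
    sample_results = npoclass(
        ["Provides food and shelter to families experiencing homelessness."],
        gpu_core=False,                      # run on CPU for the example
        model_path='./npoclass_model_bc/',   # assumed unzip location
        ntee_type='bc')
    print(sample_results[0]['recommended'], sample_results[0]['confidence'])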
def evaluate(args):
    # set up model and device (hopefully cuda)
    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    if args.cont:
        print(args.cont_path)
        model = BertForSequenceClassification.from_pretrained(args.cont_path)
        tokenizer = BertTokenizer.from_pretrained(args.cont_path)
    else:
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model.to(device)

    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    h = 0
    s = 0
    n = 0
    for f in os.listdir(args.datapath):
        if f.endswith("ham"):
            h += 1
        elif f.endswith("spam"):
            s += 1
        else:
            n += 1
    print(f"{h} {s} {n}")

    TP = 0
    TN = 0
    T1 = 0
    T2 = 0
    batch_list = getBatch(args.datapath, 1, tokenizer)
    for _ in trange(len(os.listdir(args.datapath))):
        batch, labels, masks = next(batch_list)
        inputs = torch.tensor(batch, dtype=torch.long, device=device)
        labels = torch.tensor(labels, dtype=torch.long, device=device)
        masks = torch.tensor(masks, dtype=torch.long, device=device)
        inputs = inputs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        with torch.no_grad():
            model.eval()
            outputs = model(inputs)
            loss = select(outputs[0])

        # check for errors
        if labels[0] == 0:  # expect ham
            if loss == 0:
                TP += 1
            else:
                T2 += 1
        if labels[0] == 1:  # expect spam
            if loss == 1:
                TN += 1
            else:
                T1 += 1
        # print(f"expected : produced -- {labels[0]} : {loss}")
        # print("message:\n" + tokenizer.decode(inputs[0].tolist()))

    print(f"TP: {TP}\tTN:{TN}\tT1: {T1}\tT2: {T2}")
def train(datapath, outpath, seed, batch_size, epochs, save_steps, args,
          use_cuda=True):
    # set up model and device (hopefully cuda)
    device = torch.device(
        "cuda" if torch.cuda.is_available() and use_cuda else "cpu")

    # if use_gpt:
    #     model = GPT2LMHeadModel.from_pretrained('gpt2')
    #     tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    # else:
    #     model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
    #     tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
    if args.cont:
        model = BertForSequenceClassification.from_pretrained(args.cont_path)
        tokenizer = BertTokenizer.from_pretrained(args.cont_path)
    else:
        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), betas=(.9, .999), lr=2e-05)

    # setup rng seeds on all devices to ensure repeatable results
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    num_batches = len(os.listdir(datapath)) / batch_size
    # batch_list = getBatch(datapath, batch_size, tokenizer)
    batch_list = spam_file_man(datapath, batch_size, tokenizer)
    next(batch_list)

    avg_losses = []
    avg_loss = 0
    model.zero_grad()
    timestamp = datetime.datetime.now().strftime('%y%m%d%H%M%S')
    for _ in trange(epochs, desc="Epochs"):
        for batch_num in tqdm(range(0 if not args.cont else args.cont_step,
                                    int(num_batches)), desc="Batches"):
            # setup this batch.
            batch, labels, masks = next(batch_list)
            inputs = torch.tensor(batch, dtype=torch.long, device=device)
            labels = torch.tensor(labels, dtype=torch.long, device=device)
            masks = torch.tensor(masks, dtype=torch.long, device=device)
            inputs = inputs.to(device)
            labels = labels.to(device)
            masks = masks.to(device)

            # feed input to model to train
            model.train()
            outputs = model(input_ids=inputs, labels=labels, attention_mask=masks)
            # if not use_gpt:
            #     # loss returned from transfoXL was broken
            #     first_pad = get_first_occ(inputs[0], -1)
            #     loss = outputs[0][0][:first_pad].mean()
            loss = outputs[0]
            avg_loss += loss

            # update parameters
            loss.backward()
            optimizer.step()
            model.zero_grad()

            if batch_num % save_steps == 0:
                print('CHECKPOINT')
                checkpoint_path = f"{fixpath(outpath)}{timestamp}/e{epochs}-num{batch_num}-size{batch_size}"
                if not os.path.exists(checkpoint_path):
                    os.makedirs(checkpoint_path)
                model_to_save = model.module if hasattr(
                    model, 'module') else model  # Take care of distributed/parallel training
                model_to_save.save_pretrained(checkpoint_path)
                tokenizer.save_pretrained(checkpoint_path)
                avg = avg_loss / save_steps
                print(f"average loss: {avg}")
                avg_losses += [avg]

    print('finished')
    print(avg_losses)
model_path = "E:\Projects\A_Idiom_detection_gihan\idiom_detection_nlp\models\\epie_models" model_path = "E:\Projects\A_Idiom_detection_gihan\idiom_detection_nlp\\building_emotional_embeddings\models\idiomatic_dataset_with_sentiments\checkpoint-500\\" # model_path = 'bert-base-uncased' import tensorflow as tf from transformers import BertTokenizer, TFBertModel, BertModel, BertForSequenceClassification import torch print('hi') tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # model = TFBertModel.from_pretrained(model_path) model = BertForSequenceClassification.from_pretrained(model_path) sentence = "Hello developmentation" tokens = tokenizer.tokenize(sentence) print(tokens) # input_ids = tf.constant(tokenizer.encode(sentence))[None, :] # Batch size 1 input_ids = torch.tensor(tokenizer.encode(sentence))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs # The last hidden-state is the first element of the output tuple print(last_hidden_states)
def main(config):
    # Get pretrained tokenizer.
    tokenizer = AutoTokenizer.from_pretrained(config.pretrained_model_name)
    # Get dataloaders using tokenizer from untokenized corpus.
    train_loader, valid_loader, index_to_label = get_loaders(config.train_fn, tokenizer)

    print(
        '|train| =', len(train_loader) * config.batch_size,
        '|valid| =', len(valid_loader) * config.batch_size,
    )

    # Get pretrained model with specified softmax layer.
    model = BertForSequenceClassification.from_pretrained(
        config.pretrained_model_name, num_labels=len(index_to_label))
    if config.use_radam:
        optimizer = custom_optim.RAdam(model.parameters(), lr=config.lr)
    else:
        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': 0.01
        }, {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0
        }]
        optimizer = optim.AdamW(optimizer_grouped_parameters, lr=config.lr,
                                eps=config.adam_epsilon)

    # By default, model returns a hidden representation before softmax func.
    # Thus, we need to use CrossEntropyLoss, which combines LogSoftmax and NLLLoss.
    crit = nn.CrossEntropyLoss()

    n_total_iterations = len(train_loader) * config.n_epochs
    n_warmup_steps = int(n_total_iterations * config.warmup_ratio)
    scheduler = get_linear_schedule_with_warmup(optimizer, n_warmup_steps,
                                                n_total_iterations)

    if config.gpu_id >= 0:
        model.cuda(config.gpu_id)
        crit.cuda(config.gpu_id)

    # Start train.
    trainer = Trainer(config)
    model = trainer.train(
        model,
        crit,
        optimizer,
        scheduler,
        train_loader,
        valid_loader,
    )

    torch.save(
        {
            'rnn': None,
            'cnn': None,
            'bert': model.state_dict(),
            'config': config,
            'vocab': None,
            'classes': index_to_label,
            'tokenizer': tokenizer,
        }, config.model_fn)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_dir",
        default="/home/jqu/Documents/data/XNLI/",
        type=str,
        required=False,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument("--model_type", type=str, required=True,
                        help="distilbert|bert")
    parser.add_argument("--model_dir", type=str, required=True,
                        help="where the trained model locates")
    args = parser.parse_args()

    # load test dataset
    processor = processors["xnli"](language="en", train_language="en")
    examples = processor.get_test_examples(args.data_dir)

    if args.model_type == "bert":
        # prepare tokenizer
        tokenizer = BertTokenizer.from_pretrained(args.model_dir,
                                                  do_lower_case=False)
        model = BertForSequenceClassification.from_pretrained(args.model_dir)
    elif args.model_type == "distilbert":
        tokenizer = DistilBertTokenizer.from_pretrained(args.model_dir,
                                                        do_lower_case=False)
        model = DistilBertForSequenceClassification.from_pretrained(args.model_dir)
    elif args.model_type == "albert":
        tokenizer = AlbertTokenizer.from_pretrained(args.model_dir,
                                                    do_lower_case=False)
        model = AlbertForSequenceClassification.from_pretrained(args.model_dir)

    model.to("cuda:0")
    model.eval()

    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=processor.get_labels(),
        max_length=128,
        output_mode="classification",
        pad_on_left=False,
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=0,
        mask_padding_with_zero=True)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features],
                                      dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features],
                                      dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_attention_mask,
                            all_token_type_ids, all_labels)
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=512)

    overall_preds = [[], []]
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        with torch.no_grad():
            batch = tuple(t.to("cuda:0") for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3]
            }
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2] if args.model_type in ["bert"] else None
                )  # XLM and DistilBERT don't use segment_ids
            outputs = model(**inputs)
            _, logits = outputs[:2]
            preds = logits.detach().cpu().numpy()
            preds = np.argmax(preds, axis=1)
            overall_preds[0] += preds.tolist()
            out_label_ids = inputs["labels"].detach().cpu().numpy()
            overall_preds[1] += out_label_ids.tolist()

    # compute scores
    result = accuracy_score(overall_preds[0], overall_preds[1])
    print(f"Overall accuracy: {result}")
    confusion_score = confusion_matrix(overall_preds[0], overall_preds[1])
    print("confusion matrix:\n")
    print(confusion_score)
        labels_test,
        num_labels,
    ) = get_data(config)

    ldm = PLDataModuleFromCorpus(
        raw_train,
        labels_train,
        val=raw_dev,
        val_labels=labels_dev,
        test=raw_test,
        test_labels=labels_test,
        collate_fn=collate_fn,
        **config.data,
    )

    model = BertForSequenceClassification.from_pretrained(
        config.hugging_face_model, num_labels=num_labels)
    logger.info(model)

    # Leave this hardcoded for now.
    optimizer = AdamW([p for p in model.parameters() if p.requires_grad], lr=1e-5)
    criterion = nn.CrossEntropyLoss()

    lm = BertPLModule(
        model,
        optimizer,
        criterion,
        metrics={"acc": FromLogits(pl.metrics.classification.Accuracy())},
    )
        data.self_train_prop) != 0 else [0] * len(data.self_train_prop)
    return data


if __name__ == '__main__':
    args = create_args()

    # load tokenizer
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case,
                                           piece=args.piece,
                                           piece_model=args.piece_model)

    # load bert model
    config = BertConfig.from_json_file(args.config_file)
    model = BertForSequenceClassification(config)
    model_state_dict = model.state_dict()
    print('Model parameter: {}'.format(
        sum(p.numel() for k, p in model_state_dict.items())))
    pre_state_dict = torch.load(args.pretrained_file)
    pre_state_dict = {
        k: v for k, v in pre_state_dict.items() if k in model_state_dict
    }
    model_state_dict.update(pre_state_dict)
    model.load_state_dict(model_state_dict)
    if args.cuda:
        model.cuda()

    # load data
    data = BERTCLDCDataReader(args, tokenizer)
def train(trainloader, valloader, model_name, num_label, epochs):
    model = BertForSequenceClassification.from_pretrained(model_name,
                                                          num_labels=num_label)
    clear_output()

    # Run the model on the GPU and get the classification accuracy on the training set
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("device:", device)
    model = model.to(device)
    pred, acc = get_predictions(model, trainloader, compute_acc=True)

    # Use the Adam optimizer to update all parameters of the classification model
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    for epoch in range(epochs):
        running_loss = 0.0
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
        print('Training...')

        # training mode
        model.train()
        for data in trainloader:  # trainloader is an iterator over each batch
            tokens_tensors, segments_tensors, \
                masks_tensors, labels = [t.to(device) for t in data]

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward pass
            outputs = model(input_ids=tokens_tensors,
                            token_type_ids=segments_tensors,
                            attention_mask=masks_tensors,
                            labels=labels)
            loss = outputs[0]

            # backward
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            # record the current batch loss
            running_loss += loss.item()

        # compute classification accuracy
        logit, acc = get_predictions(model, trainloader, compute_acc=True)
        print('loss: %.3f, acc: %.3f' % (running_loss, acc))

        print("")
        print("Running Validation...")
        # # Put the model in evaluation mode--the dropout layers behave differently
        # # during evaluation.
        # model.eval()
        # # Evaluate data for one epoch
        # for data in valloader:
        #     tokens_tensors, segments_tensors, \
        #         masks_tensors, labels = [t.to(device) for t in data]
        #     # Telling the model not to compute or store gradients, saving memory and
        #     # speeding up validation
        #     with torch.no_grad():
        #         # Forward pass, calculate logit predictions.
        #         # This will return the logits rather than the loss because we have
        #         # not provided labels.
        #         # token_type_ids is the same as the "segment ids", which
        #         # differentiates sentence 1 and 2 in 2-sentence tasks.
        #         outputs = model(input_ids=tokens_tensors,
        #                         token_type_ids=segments_tensors,
        #                         attention_mask=masks_tensors,
        #                         labels=labels)
        #         # Get the "logits" output by the model. The "logits" are the output
        #         # values prior to applying an activation function like the softmax.
        #         logits = outputs[0]
        _, acc = get_predictions(model, valloader, compute_acc=True)

        # Move logits and labels to CPU
        #logits = logits.detach().cpu().numpy()
        #label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences.
        # tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        # Accumulate the total accuracy.
        #eval_accuracy += tmp_eval_accuracy
        # Track the number of batches
        #nb_eval_steps += 1

        # Report the final accuracy for this validation run.
        print("  Accuracy: {0:.2f}".format(acc))

    return model
def train(args, states=None):
    config_obj = Config(args.config_file)
    config = config_obj.elements

    # make training runs deterministic
    set_seed(seed_value=config['random_seed'])

    logging.info("Loading datasets...")
    dataset, labels = load_tokens(
        input_id_path=config['input_id'],
        token_type_id_path=config['token_type_id'],
        attention_mask_path=config['attention_mask'],
        label_path=config['labels'],
    )
    train_loader, val_loader, test_loader = create_dataloaders(
        dataset,
        labels,
        batch_size=config['batch_size'],
        random_seed=config['random_seed'],
        balance=config['correct_imbalance'],
    )

    model = BertForSequenceClassification.from_pretrained(
        "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False,
    )
    if torch.cuda.is_available():
        model.cuda()

    loss_function = nn.CrossEntropyLoss()
    # optimizer = AdamW(model.parameters(), lr=config['lr'])
    optimizer = torch.optim.SGD(model.parameters(), lr=config['lr'])
    total_train_steps = config['num_epochs'] * len(train_loader)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_train_steps,
    )

    best_metric = 0
    # loop over the dataset multiple times
    for epoch in range(1, config['num_epochs'] + 1):
        logging.info(f"==================== Epoch: {epoch} ====================")
        running_losses = []
        for i, data in enumerate(train_loader, 0):
            # get the inputs; data is a list of [inputs, labels]
            input_ids, token_type_ids, attention_mask, labels = data
            if torch.cuda.is_available():
                input_ids = input_ids.cuda()
                token_type_ids = token_type_ids.cuda()
                attention_mask = attention_mask.cuda()
                labels = labels.cuda()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward
            _, logits = model(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            # probs = F.softmax(logits, dim=1)

            # backprop
            loss = loss_function(logits, labels)
            loss.backward()

            # clip gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # update/optimize
            optimizer.step()
            # update learning rate
            scheduler.step()

            # Log summary
            running_losses.append(loss.item())
            if i % args.log_interval == 0:
                interval_loss = sum(running_losses) / len(running_losses)
                logging.info(f"step = {i}, loss = {interval_loss}")
                running_losses = []

            if i % args.test_interval == 0:
                dev_metric = eval(
                    val_loader,
                    model,
                    loss_function,
                    args.eval_metric,
                )
                if dev_metric > best_metric:
                    best_metric = dev_metric
                    states = {
                        "epoch": epoch,
                        "step": i,
                        "model": model.state_dict(),
                        "optimizer": optimizer.state_dict()
                    }
                    save_model_state(save_dir=args.model_dir, step=i, states=states)

    print(f"Finished Training, best {args.eval_metric}: {best_metric}")
# Label ids: questioning / support / commenting / denying (q, s, c, d)
d_lab = dict()
d_lab["questioning"] = 0
d_lab["support"] = 1
d_lab["commenting"] = 2
d_lab["denying"] = 3

train = pd.read_csv("../Fine-Tuning/CSV_Stance/train_semeval_raw.csv")
val = pd.read_csv("../Fine-Tuning/CSV_Stance/dev_semeval_raw.csv")
test = pd.read_csv("../Fine-Tuning/CSV_Stance/test_semeval_raw.csv")

#dir_path = '../../model_save/Dos-Fases-all_Stance_4epochs/'
dir_path = '../../model_save/Dos-Fases-all_Stance/'
tokenizer_loaded = BertTokenizer.from_pretrained(dir_path)  # or 'bert-base-uncased'
model_loaded = BertForSequenceClassification.from_pretrained(
    dir_path, num_labels=4)  # or 'bert-base-uncased', num_labels=4

idx_2_token = tokenizer_loaded.ids_to_tokens
with open(dir_path + 'vocab.txt', 'r') as archivo:
    Word2Index = {word.strip(): i for i, word in enumerate(archivo.readlines())}

# Extract the (vocab_size, hidden_size) word-embedding matrix and L2-normalize its rows
M_BERT_space = model_loaded.bert.embeddings.word_embeddings.weight.detach().cpu().numpy()
transformer = Normalizer().fit(M_BERT_space)  # fit does nothing.
M_BERT_space = transformer.transform(M_BERT_space)


def my_normalize(v):
    norm = np.linalg.norm(v)
    if norm == 0:
        return v  # leave the zero vector unchanged to avoid division by zero
    return v / norm
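# Illustrative usage (not in the original script): with the rows of
# M_BERT_space L2-normalized, a dot product against a normalized query vector
# gives cosine similarity, so the nearest vocabulary tokens to any vector can
# be read off directly. `query_word` is a hypothetical example input.
query_word = 'virus'
query_vec = my_normalize(M_BERT_space[Word2Index[query_word]])
scores = M_BERT_space @ query_vec
nearest_ids = np.argsort(-scores)[:10]
print([idx_2_token[i] for i in nearest_ids])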
def main():
    # training settings
    def get_args():
        parser = ArgumentParser(description='QQPairs')
        parser.add_argument('--name', type=str, default='QQPairs', metavar='S',
                            help="Model name")
        parser.add_argument('--checkpoint', type=str, default='bert-base-uncased', metavar='S',
                            help="e.g., bert-base-uncased, etc")
        parser.add_argument('--model', type=str, default='bert-base-uncased', metavar='S',
                            help="e.g., bert-base-uncased, etc")
        parser.add_argument('--batch-size', type=int, default=32, metavar='N',
                            help='input batch size for training (default: 32)')
        parser.add_argument('--epochs', type=int, default=1, metavar='N',
                            help='number of epochs to train (default: 1)')
        parser.add_argument('--lr', type=float, default=1e-5, metavar='LR',
                            help='learning rate (default: 1e-5)')
        parser.add_argument('--seed', type=int, default=1, metavar='S',
                            help='random seed (default: 1)')
        parser.add_argument('--num-workers', type=int, default=0, metavar='N',
                            help='number of CPU cores (default: 0)')
        parser.add_argument('--num-labels', type=int, default=2, metavar='N',
                            help='number of labels to classify (default: 2)')
        parser.add_argument('--l2', type=float, default=0.01, metavar='LR',
                            help='l2 regularization weight (default: 0.01)')
        parser.add_argument('--max-seq-length', type=int, default=84, metavar='N',
                            help='max sequence length for encoding (default: 84)')
        parser.add_argument('--warmup-proportion', type=float, default=0.1, metavar='N',
                            help='warmup proportion (default: 0.1)')
        parser.add_argument('--embed-batch-size', type=int, default=1, metavar='N',
                            help='batch size for embedding emission (default: 1)')
        args = parser.parse_args()
        return args

    args = get_args()

    # set seeds and determinism
    torch.manual_seed(args.seed)
    torch.backends.cudnn.deterministic = True

    # set device
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # build datasets
    train_ds = QQP(type='train', transform=Tokenize_Transform(args, logger))
    dev_ds = QQP(type='dev', transform=Tokenize_Transform(args, logger))

    # create training dataloader
    train_dataloader = DataLoader(train_ds, batch_size=args.batch_size, shuffle=True,
                                  num_workers=args.num_workers, drop_last=False)

    # create embedding dataloaders
    train_embed_dataloader = DataLoader(train_ds, batch_size=args.embed_batch_size, shuffle=True,
                                        num_workers=args.num_workers, drop_last=False)
    dev_embed_dataloader = DataLoader(dev_ds, batch_size=args.embed_batch_size, shuffle=True,
                                      num_workers=args.num_workers, drop_last=False)

    # load the model
    model = BertForSequenceClassification.from_pretrained(args.checkpoint,
                                                          num_labels=args.num_labels).to(device)

    # create gradient scaler for mixed precision
    # (autocast is expected to wrap the forward pass inside the train step)
    scaler = GradScaler()

    # set optimizer
    param_optimizer = list(model.named_parameters())

    # exclude these from regularization
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

    # give l2 regularization to any parameter that is not named after the no_decay list;
    # give no l2 regularization to any bias parameter or layernorm bias/weight
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.l2},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}]

    # set optimizer
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr, correct_bias=False,
                      weight_decay=args.l2)

    num_train_optimization_steps = int(len(train_ds) / args.batch_size) * args.epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_training_steps=num_train_optimization_steps,
        num_warmup_steps=args.warmup_proportion * num_train_optimization_steps)

    # set epochs
    epochs = args.epochs

    # set checkpoint location and create it if necessary
    if args.checkpoint == 'bert-base-uncased':
        checkpoint_location = 'C:\\w266\\data\\embed_checkpoints\\'
    elif args.checkpoint == 'bert-large-uncased':
        checkpoint_location = 'C:\\w266\\data\\embed_checkpoints\\bert_large\\'
    os.makedirs(checkpoint_location, exist_ok=True)

    # train
    best_loss = np.inf
    for epoch in range(1, epochs + 1):
        train_log = train(model, train_dataloader, scaler, optimizer, scheduler, device, args)
        logs = dict(train_log)
        if logs['loss'] < best_loss:
            # save the model weights
            torch.save(model.state_dict(),
                       checkpoint_location + args.name + '_epoch_{}.pt'.format(epoch))
            best_loss = logs['loss']
        show_info = f'\nEpoch: {epoch} - ' + "-".join(
            [f' {key}: {value:.4f} ' for key, value in logs.items()])
        print(show_info)

    # now proceed to emit embeddings
    model = BertForSequenceClassification.from_pretrained(args.checkpoint,
                                                          num_labels=args.num_labels,
                                                          output_hidden_states=True).to(device)

    # load weights from epoch 1
    model.load_state_dict(torch.load(checkpoint_location + args.name + '_epoch_1.pt'))

    # export embeddings
    emit_train_embeddings(train_embed_dataloader, train_ds, model, device, args)
    emit_dev_embeddings(dev_embed_dataloader, dev_ds, model, device, args)
textCNN.to(device) config = BertConfig.from_json_file('./dataset/bert_config.json') config.output_hidden_states = True model = BertModel.from_pretrained( './model/bert_pre58_4/pytorch_model.bin', config=config) model.cuda() model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3]).cuda() model.to(device) save_offset = 12 supreme_config = BertConfig.from_json_file('./dataset/bert_config.json') supreme_config.num_labels = len(myDataset.cls_label_2_id) model_ = BertForSequenceClassification(config=supreme_config) model_.cuda() model_ = torch.nn.DataParallel(model_, device_ids=[0, 1, 2, 3]).cuda() model_.to(device) criterion = nn.CrossEntropyLoss() optimizer = optim.Adam([{'params': model.parameters(), 'lr': 5e-5}, {'params': textCNN.parameters(), 'lr': 1e-3}], lr=1e-3, weight_decay=0.) # %% losses = [] num_epochs = 30 for epoch in range(num_epochs): train_count = 0
dev_texts = [data[0] for data in dev_data]
dev_labels = [data[1] for data in dev_data]
dev_encodings = tokenizer(dev_texts, truncation=True, padding=True)
dev_dataset = Dataset(dev_encodings, dev_labels)

test_encodings = tokenizer(test_texts, truncation=True, padding=True)
test_dataset = Dataset(test_encodings, test_labels)

# We keep the labels of the unlabeled data so we can track the accuracy of pseudo-labeling
unlabeled_texts = [data[0] for data in unlabeled_data]
unlabeled_labels = [data[1] for data in unlabeled_data]
unlabeled_encodings = tokenizer(unlabeled_texts, truncation=True, padding=True)
unlabeled_dataset = Dataset(unlabeled_encodings, unlabeled_labels)

# Build model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=config.class_num)

# Criterion & optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)  # or AdamW

# Init Trainer
trainer = Trainer(config, model, loss_function, optimizer, args.save_path,
                  dev_dataset, test_dataset)

# Initial training (supervised learning)
trainer.initial_train(labeled_dataset)

# load checkpoint
checkpoint_path = trainer.sup_path + '/checkpoint.pt'
checkpoint = torch.load(checkpoint_path)
if ifLIMIT:
    X_train = X_train[:100]
    y_train = y_train[:100]
    X_test = X_test[:100]
    y_test = y_test[:100]

with open('./module/label_preprocess.pkl', 'rb') as infile:
    label_preprocessing = pickle.load(infile)

print("create data loader...")
train_loader = create_data_loader(X_train, y_train, batch_size_=BATCH_SIZE)
train_loader_1 = create_data_loader(X_train, y_train, batch_size_=1)
test_loader = create_data_loader(X_test, y_test, batch_size_=1)

print("create model...")
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME,
                                                      num_labels=len(y_train[0]))
model.to(DEVICE)
clear_output()

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.MSELoss(reduction='sum')

print("start training...")
state_of_the_art = 0
for epoch in range(EPOCHS):
    running_loss = 0.0
    for data in train_loader:
        input_ids, token_type_ids, attention_mask, labels = [t.to(DEVICE) for t in data]
attention_masks.append(seq_mask) # Convert all of our data into torch tensors, the required datatype for our model input_ids = torch.tensor(input_ids) labels = torch.tensor(labels) attention_masks = torch.tensor(attention_masks) batch_size = 128 #256 train_data = TensorDataset(input_ids, attention_masks, labels) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size, num_workers=4) model = BertForSequenceClassification.from_pretrained(bert_name, num_labels=len(i2l)) model = model.cuda() param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }]
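# The snippet above stops after building the parameter groups. A minimal
# sketch of the next step, assuming the pytorch-pretrained-bert BertAdam
# optimizer (as used elsewhere in this collection); the learning rate, warmup
# value and epoch count below are assumptions for illustration, and depending
# on the library version the group key may need to be 'weight_decay' instead
# of 'weight_decay_rate'.
from pytorch_pretrained_bert.optimization import BertAdam

num_epochs = 3  # assumed value for illustration
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=2e-5,
                     warmup=0.1,
                     t_total=num_epochs * len(train_dataloader))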
def crossvalidation_front_back(): parser = argparse.ArgumentParser() parser.add_argument("--train_data_path", required=True, type=str) parser.add_argument("--output_dir", required=True, type=str) parser.add_argument("--cro_test_data_path", type=str) parser.add_argument("--do_lower_case", action='store_true') parser.add_argument("--split_num", default=2, type=int) parser.add_argument("--config_file", type=str) parser.add_argument("--model_file", type=str) parser.add_argument("--eval_split", default=0.2, type=float) parser.add_argument("--test_split", default=0.1, type=float) parser.add_argument("--max_len", default=512, type=int) parser.add_argument("--batch_size", default=16, type=int) parser.add_argument("--num_epochs", default=3, type=int) parser.add_argument("--learning_rate", default=2e-5, type=float) parser.add_argument("--weight_decay", default=0.01, type=float) parser.add_argument("--warmup_proportion", default=0.1, type=float) parser.add_argument("--adam_epsilon", default=1e-8, type=float) args = parser.parse_args() if not os.path.exists(args.output_dir): os.mkdir(args.output_dir) print("Setting the random seed...") random.seed(42) np.random.seed(42) torch.manual_seed(42) torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) log_path = os.path.join(args.output_dir, "log") print("Reading data...") df_data = pd.read_csv(args.train_data_path, sep="\t") data = df_data['data'].tolist() label_set = sorted(list(set(df_data['label'].values))) labels = encode_labels(df_data['label'].tolist(), label_set) if args.cro_test_data_path is not None: print("Preparing the croatian test data...") cro_test_data, cro_test_labels = read_croatian_data( args.cro_test_data_path) cro_test_labels = encode_labels(cro_test_labels, label_set) print("Training model on the split number " + str(args.split_num) + "...") device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=args.do_lower_case) if args.config_file is not None and args.model_file is not None: config = BertConfig.from_pretrained(args.config_file, num_labels=len(label_set)) model = BertForSequenceClassification.from_pretrained(args.model_file, config=config) else: model = BertForSequenceClassification.from_pretrained( 'bert-base-multilingual-cased', num_labels=len(label_set)) test_data = data[(floor(len(data) * args.split_num * 0.1)):( floor(len(data) * (args.split_num + 1) * 0.1))] test_labels = labels[floor((len(labels) * args.split_num * 0.1)):floor((len(labels) * (args.split_num + 1) * 0.1))] train_data = data[:floor((len(data) * args.split_num * 0.1))] + data[floor( (len(data) * (args.split_num + 1) * 0.1)):] train_labels = labels[:floor((len(labels) * args.split_num * 0.1))] + labels[floor((len(labels) * (args.split_num + 1) * 0.1)):] train_data, eval_data, train_labels, eval_labels = train_test_split( train_data, train_labels, test_size=args.eval_split, random_state=42) print("Train label:") print(train_labels[0]) print("Train data:") print(train_data[0]) train_dataloader = cut_at_front_and_back(train_data, train_labels, tokenizer, args.max_len, args.batch_size) eval_dataloader = cut_at_front_and_back(eval_data, eval_labels, tokenizer, args.max_len, args.batch_size) test_dataloader = cut_at_front_and_back(test_data, test_labels, tokenizer, args.max_len, args.batch_size) if args.cro_test_data_path is not None: cro_test_dataloader = 
cut_at_front_and_back(cro_test_data, cro_test_labels, tokenizer, args.max_len, args.batch_size) _, __ = bert_train(model, device, train_dataloader, eval_dataloader, args.output_dir, args.num_epochs, args.warmup_proportion, args.weight_decay, args.learning_rate, args.adam_epsilon, save_best=True) print("Testing the trained model on the current test split...") metrics = bert_evaluate(model, test_dataloader, device) with open(log_path, 'a') as f: f.write("Results for split nr. " + str(args.split_num) + " on current slo test:\n") f.write("Acc: " + str(metrics['accuracy']) + "\n") f.write("Recall: " + str(metrics['recall']) + "\n") f.write("Precision: " + str(metrics['precision']) + "\n") f.write("F1: " + str(metrics['f1']) + "\n") f.write("\n") if args.cro_test_data_path is not None: print("Testing the trained model on the croatian test set...") cro_metrics = bert_evaluate(model, cro_test_dataloader, device) with open(log_path, 'a') as f: f.write("Results for split nr. " + str(args.split_num) + " on cro test set:\n") f.write("Acc: " + str(cro_metrics['accuracy']) + "\n") f.write("Recall: " + str(cro_metrics['recall']) + "\n") f.write("Precision: " + str(cro_metrics['precision']) + "\n") f.write("F1: " + str(cro_metrics['f1']) + "\n") f.write("\n") print("Done.")
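# `bert_evaluate` is assumed to return a dict with 'accuracy', 'recall',
# 'precision' and 'f1' keys, given how the results are logged above. A minimal
# sketch under that assumption (macro-averaged scores via scikit-learn); the
# batch layout (input_ids, attention_mask, labels) is also an assumption, and
# this is illustrative rather than the original implementation.
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def bert_evaluate(model, dataloader, device):
    model.eval()
    preds, golds = [], []
    with torch.no_grad():
        for input_ids, attention_mask, labels in dataloader:
            outputs = model(input_ids=input_ids.to(device),
                            attention_mask=attention_mask.to(device))
            logits = outputs[0]
            preds.extend(logits.argmax(dim=1).cpu().tolist())
            golds.extend(labels.tolist())
    precision, recall, f1, _ = precision_recall_fscore_support(golds, preds, average='macro')
    return {'accuracy': accuracy_score(golds, preds),
            'recall': recall,
            'precision': precision,
            'f1': f1}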
def train(model_dir, args):
    seed_everything(args.seed)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    print("This notebook uses [%s]." % (device))

    s_dir = args.model + str(args.num_hidden_layers) + '-' + args.preprocess + \
        '-epoch' + str(args.epochs) + '-' + args.scheduler + '-' + args.tokenize + \
        '-' + str(args.max_len) + '-' + str(args.seed)
    save_dir = increment_path(os.path.join(model_dir, s_dir))
    log_dir = increment_path(os.path.join('logs', s_dir))

    # load model and tokenizer
    MODEL_NAME = args.model
    if MODEL_NAME.startswith('xlm'):
        tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # set neptune
    set_neptune(save_dir, args)

    # load dataset
    dataset = load_data("/opt/ml/input/data/train/train.tsv")
    labels = dataset['label'].values

    # set model hyperparameters
    if MODEL_NAME.startswith('xlm'):
        bert_config = XLMRobertaConfig.from_pretrained(MODEL_NAME)
    else:
        bert_config = BertConfig.from_pretrained(MODEL_NAME)
    bert_config.num_labels = args.num_labels
    bert_config.num_hidden_layers = args.num_hidden_layers

    if MODEL_NAME.startswith('xlm'):
        model = XLMRobertaForSequenceClassification.from_pretrained(MODEL_NAME,
                                                                    config=bert_config)
    else:
        model = BertForSequenceClassification.from_pretrained(MODEL_NAME,
                                                              config=bert_config)
    if args.drop >= 0:
        model.dropout = nn.Dropout(p=args.drop)

    # preprocess dataset
    if args.preprocess != 'no':
        pre_module = getattr(import_module("preprocess"), args.preprocess)
        dataset = pre_module(dataset, model, tokenizer)

    # train / validation split
    train_dataset, val_dataset = train_test_split(dataset,
                                                  test_size=args.val_ratio,
                                                  random_state=args.seed)

    tok_module = getattr(import_module("load_data"), args.tokenize)
    train_tokenized = tok_module(train_dataset, tokenizer, max_len=args.max_len)
    val_tokenized = tok_module(val_dataset, tokenizer, max_len=args.max_len)

    # make datasets for pytorch
    RE_train_dataset = RE_Dataset(train_tokenized,
                                  train_dataset['label'].reset_index(drop='index'))
    RE_val_dataset = RE_Dataset(val_tokenized,
                                val_dataset['label'].reset_index(drop='index'))

    model.to(device)

    # Besides the options used here, TrainingArguments supports many more; see
    # https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments
    training_args = TrainingArguments(
        seed=args.seed,
        output_dir=save_dir,                # output directory
        save_total_limit=2,                 # number of saved models to keep
        save_steps=args.save_steps,         # model saving step
        num_train_epochs=args.epochs,       # total number of training epochs
        learning_rate=args.lr,              # learning rate
        per_device_train_batch_size=args.batch_size,  # batch size per device during training
        per_device_eval_batch_size=16,      # batch size for evaluation
        lr_scheduler_type=args.scheduler,
        warmup_steps=args.warmup_steps,     # number of warmup steps for learning rate scheduler
        weight_decay=args.weight_decay,     # strength of weight decay
        logging_dir=log_dir,                # directory for storing logs
        logging_steps=100,                  # log saving step
        evaluation_strategy='steps',        # evaluation strategy to adopt during training
                                            # `no`: No evaluation during training.
                                            # `steps`: Evaluate every `eval_steps`.
                                            # `epoch`: Evaluate every end of epoch.
        eval_steps=100,                     # evaluation step
        dataloader_num_workers=4,
        label_smoothing_factor=args.smoothing_factor,
        load_best_model_at_end=True,
        metric_for_best_model='accuracy')

    trainer = Trainer(
        model=model,                        # the instantiated 🤗 Transformers model to be trained
        args=training_args,                 # training arguments, defined above
        train_dataset=RE_train_dataset,     # training dataset
        eval_dataset=RE_val_dataset,        # evaluation dataset
        compute_metrics=compute_metrics     # metrics function, defined elsewhere
    )

    # train model
    trainer.train()
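    # A short usage note (not part of the original script): after training,
    # the best checkpoint selected via load_best_model_at_end can be persisted
    # with the standard Trainer and tokenizer APIs.
    trainer.save_model(save_dir)         # writes model weights and config to save_dir
    tokenizer.save_pretrained(save_dir)  # writes the tokenizer files alongside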
test_inputs = torch.tensor(input_ids_test)
test_labels = torch.tensor(labels_test)
test_masks = torch.tensor(attention_masks_test)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
#test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

# Model and Parameters
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
#model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=3)
model = nn.DataParallel(model)
model.to(device)

logging.info("Model Loaded!")

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [{
    'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
def train_bert_uncased(t_config, p_config, s_config): device = torch.device('cuda') seed_everything(s_config.seed) train = pd.read_csv('../input/train.csv').sample( t_config.num_to_load + t_config.valid_size, random_state=s_config.seed) train = prepare_train_text(train, p_config) train = train.fillna(0) tokenizer = BertTokenizer.from_pretrained('bert-base-cased') train_processed = get_tokenized_samples(t_config.MAX_SEQUENCE_LENGTH, tokenizer, train['text_proc']) sequences = train_processed lengths = np.argmax(sequences == 0, axis=1) lengths[lengths == 0] = sequences.shape[1] MyModel = BertForSequenceClassification.from_pretrained( 'bert-base-cased', num_labels=t_config.num_labels) MyModel.to(device) # Prepare target target_train = train['target'].values[:t_config.num_to_load] target_train_aux = train[[ 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat' ]].values[:t_config.num_to_load] target_train_identity = train[identity_columns].values[:t_config. num_to_load] # Prepare training data inputs_train = train_processed[:t_config.num_to_load] weight_train = train['weight'].values[:t_config.num_to_load] lengths_train = lengths[:t_config.num_to_load] inputs_train = torch.tensor(inputs_train, dtype=torch.int64) Target_train = torch.Tensor(target_train) Target_train_aux = torch.Tensor(target_train_aux) Target_train_identity = torch.Tensor(target_train_identity) weight_train = torch.Tensor(weight_train) Lengths_train = torch.tensor(lengths_train, dtype=torch.int64) # Prepare dataset train_dataset = data.TensorDataset(inputs_train, Target_train, Target_train_aux, Target_train_identity, weight_train, Lengths_train) ids_train = lengths_train.argsort(kind="stable") ids_train_new = resort_index(ids_train, t_config.num_of_bucket, s_config.seed) train_loader = torch.utils.data.DataLoader(data.Subset( train_dataset, ids_train_new), batch_size=t_config.batch_size, collate_fn=clip_to_max_len, shuffle=False) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in list(MyModel.named_parameters()) if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in list(MyModel.named_parameters()) if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = BertAdam(optimizer_grouped_parameters, lr=t_config.learning_rate, betas=[0.9, 0.999], warmup=t_config.warmup, t_total=t_config.num_epoch * len(train_loader) // t_config.accumulation_steps) i = 0 for n, p in list(MyModel.named_parameters()): if i < 10: p.requires_grad = False i += 1 p = train['target'].mean() likelihood = np.log(p / (1 - p)) model_bias = torch.tensor(likelihood).type(torch.float) MyModel.classifier.bias = nn.Parameter(model_bias.to(device)) MyModel, optimizer = amp.initialize(MyModel, optimizer, opt_level="O1", verbosity=0) for epoch in range(t_config.num_epoch): i = 0 print('Training start') optimizer.zero_grad() MyModel.train() for batch_idx, (input, target, target_aux, target_identity, sample_weight) in tqdm_notebook( enumerate(train_loader), total=len(train_loader)): y_pred = MyModel( input.to(device), attention_mask=(input > 0).to(device), ) loss = F.binary_cross_entropy_with_logits(y_pred[0][:, 0], target.to(device), reduction='none') loss = (loss * sample_weight.to(device)).sum() / ( sample_weight.sum().to(device)) loss_aux = F.binary_cross_entropy_with_logits( y_pred[0][:, 1:6], target_aux.to(device), reduction='none').mean(axis=1) loss_aux = (loss_aux * sample_weight.to(device)).sum() / ( sample_weight.sum().to(device)) 
loss += loss_aux if t_config.num_labels == 15: loss_identity = F.binary_cross_entropy_with_logits( y_pred[0][:, 6:], target_identity.to(device), reduction='none').mean(axis=1) loss_identity = (loss_identity * sample_weight.to(device) ).sum() / (sample_weight.sum().to(device)) loss += loss_identity with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() if (i + 1) % t_config.accumulation_steps == 0: optimizer.step() optimizer.zero_grad() i += 1 torch.save( { 'model_state_dict': MyModel.state_dict(), 'optimizer_state_dict': optimizer.state_dict(), }, f'{t_config.PATH}_{s_config.seed}')
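# Usage sketch (not part of the original): the checkpoint written above holds
# 'model_state_dict' and 'optimizer_state_dict' entries. Assuming a model and
# optimizer constructed the same way as in train_bert_uncased, it can be
# restored roughly like this; `checkpoint_path` is a placeholder for the
# f'{t_config.PATH}_{s_config.seed}' file written by torch.save.
checkpoint = torch.load(checkpoint_path, map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
model.eval()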