def test():
    # Config file.
    cf = Config('./config.yaml')
    # Use the GPU if one is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Test data (no shuffling needed for evaluation).
    test_data = NewsDataset("./data/cnews_final_test.txt", cf.max_seq_len)
    test_dataloader = DataLoader(test_data, batch_size=cf.batch_size, shuffle=False)
    # Model.
    config = BertConfig("./output/pytorch_bert_config.json")
    model = BertForSequenceClassification(config, num_labels=cf.num_labels)
    model.load_state_dict(torch.load("./output/pytorch_model.bin"))
    # Move the model to the target device.
    model.to(device)
    # Parallelize across GPUs if more than one is available.
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Evaluation.
    start_time = time.time()
    data_len = len(test_dataloader)
    model.eval()
    y_pred = np.array([])
    y_test = np.array([])
    # for step, batch in enumerate(tqdm(test_dataloader, "batch", total=len(test_dataloader))):
    for step, batch in enumerate(test_dataloader):
        label_id = batch['label_id'].squeeze(1).to(device)
        word_ids = batch['word_ids'].to(device)
        segment_ids = batch['segment_ids'].to(device)
        word_mask = batch['word_mask'].to(device)
        with torch.no_grad():
            pred = get_model_labels(model, word_ids, segment_ids, word_mask)
        y_pred = np.hstack((y_pred, pred))
        y_test = np.hstack((y_test, label_id.to("cpu").numpy()))

    # Evaluation report.
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test, y_pred, target_names=get_labels('./data/label')))
    # Confusion matrix.
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test, y_pred)
    print(cm)
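# test() above calls a get_model_labels helper that is not defined in this
# file. A minimal sketch of what it could look like, assuming the model returns
# logits when called without labels; the name and signature are taken from the
# call site, the body is an assumption.
def get_model_labels(model, word_ids, segment_ids, word_mask):
    # Forward pass without labels yields logits of shape (batch, num_labels).
    logits = model(word_ids, segment_ids, word_mask)
    # Predicted label = index of the largest logit, returned as a NumPy array.
    return logits.argmax(dim=1).to("cpu").numpy()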
def get_trained_model(fine_tuned="bert_pytorch.bin", device=torch.device('cuda')):
    model = None
    y_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    pretrain_data_folder = PRETRAIND_PICKLE_AND_MORE
    if not os.path.exists(pretrain_data_folder + "/" + fine_tuned):
        pretrain_data_folder = '/home/working'
    if os.path.exists(pretrain_data_folder + "/" + fine_tuned):
        output_model_file = pretrain_data_folder + "/" + fine_tuned
        bert_config = BertConfig.from_json_file(pretrain_data_folder + "/bert_config.json")
        # Rebuild the classification head and load the fine-tuned weights for prediction.
        model = BertForSequenceClassification(bert_config, num_labels=len(y_columns))
        model.load_state_dict(torch.load(output_model_file))
        model.to(device)
    return model
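# Example use of get_trained_model (paths and device choice are hypothetical):
# load the fine-tuned checkpoint once and reuse it for prediction. Note that
# the function returns None if the checkpoint file is not found.
model = get_trained_model(fine_tuned="bert_pytorch.bin",
                          device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
if model is not None:
    model.eval()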
def main():
    test_df = pd.read_csv(TEST_PATH)
    with timer('preprocessing text'):
        test_df['comment_text'] = test_df['comment_text'].astype(str)
        test_df = test_df.fillna(0)

    with timer('load embedding'):
        tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None, do_lower_case=True)
        X_text = convert_lines(test_df["comment_text"].fillna("DUMMY_VALUE"), max_len, tokenizer)

    with timer('inference'):
        model = BertForSequenceClassification(bert_config, num_labels=n_labels)
        model.load_state_dict(torch.load(model_path))
        model = model.to(device)
        test_dataset = torch.utils.data.TensorDataset(torch.tensor(X_text, dtype=torch.long))
        test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size * 2, shuffle=False)
        test_pred = inference(model, test_loader, device, n_labels)
        del model
        gc.collect()
        torch.cuda.empty_cache()

    submission = pd.DataFrame.from_dict({
        'id': test_df['id'],
        'prediction': test_pred.reshape(-1)
    })
    submission.to_csv('submission.csv', index=False)
    LOGGER.info(submission.head())
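# convert_lines is called above but defined elsewhere. A minimal sketch,
# assuming it tokenizes each comment, truncates it to max_seq_length - 2 word
# pieces, wraps it in [CLS]/[SEP], and zero-pads to a fixed length; the real
# implementation may differ in details.
def convert_lines(texts, max_seq_length, tokenizer):
    max_tokens = max_seq_length - 2  # reserve room for [CLS] and [SEP]
    all_ids = []
    for text in texts:
        tokens = tokenizer.tokenize(text)[:max_tokens]
        ids = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens + ["[SEP]"])
        # Zero-pad every row to the same fixed length.
        all_ids.append(ids + [0] * (max_seq_length - len(ids)))
    return np.array(all_ids)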
def main():
    train_df = pd.read_csv(TRAIN_PATH)
    train_df['male'] = np.load("../input/identity-column-data/male_labeled.npy")
    train_df['female'] = np.load("../input/identity-column-data/female_labeled.npy")
    train_df['homosexual_gay_or_lesbian'] = np.load(
        "../input/identity-column-data/homosexual_gay_or_lesbian_labeled.npy")
    train_df['christian'] = np.load("../input/identity-column-data/christian_labeled.npy")
    train_df['jewish'] = np.load("../input/identity-column-data/jewish_labeled.npy")
    train_df['muslim'] = np.load("../input/identity-column-data/muslim_labeled.npy")
    train_df['black'] = np.load("../input/identity-column-data/black_labeled.npy")
    train_df['white'] = np.load("../input/identity-column-data/white_labeled.npy")
    train_df['psychiatric_or_mental_illness'] = np.load(
        "../input/identity-column-data/psychiatric_or_mental_illness_labeled.npy")
    fold_df = pd.read_csv(FOLD_PATH)
    # y = np.where(train_df['target'] >= 0.5, 1, 0)
    y = train_df['target'].values
    y_aux = train_df[AUX_COLUMNS].values

    identity_columns_new = []
    for column in identity_columns + ['target']:
        train_df[column + "_bin"] = np.where(train_df[column] >= 0.5, True, False)
        if column != "target":
            identity_columns_new.append(column + "_bin")

    # Per-sample weights: each of the four terms below contributes 1/4.
    # Overall
    weights = np.ones((len(train_df),)) / 4
    # Subgroup
    weights += (train_df[identity_columns].fillna(0).values >= 0.5).sum(
        axis=1).astype(bool).astype(int) / 4
    # Background Positive, Subgroup Negative
    weights += (((train_df["target"].values >= 0.5).astype(bool).astype(int) +
                 (1 - (train_df[identity_columns].fillna(0).values >= 0.5).sum(
                     axis=1).astype(bool).astype(int))) > 1).astype(bool).astype(int) / 4
    # Background Negative, Subgroup Positive
    weights += (((train_df["target"].values < 0.5).astype(bool).astype(int) +
                 (train_df[identity_columns].fillna(0).values >= 0.5).sum(
                     axis=1).astype(bool).astype(int)) > 1).astype(bool).astype(int) / 4
    loss_weight = 0.5

    with timer('preprocessing text'):
        # df["comment_text"] = [analyzer_embed(text) for text in df["comment_text"]]
        train_df['comment_text'] = train_df['comment_text'].astype(str)
        train_df = train_df.fillna(0)

    with timer('load embedding'):
        tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None, do_lower_case=True)
        X_text = convert_lines_head_tail(train_df["comment_text"].fillna("DUMMY_VALUE"),
                                         max_len, head_len, tokenizer)
        del tokenizer
        gc.collect()
    LOGGER.info(f"X_text {X_text.shape}")

    with timer('train'):
        train_index = fold_df.fold_id != fold_id
        valid_index = fold_df.fold_id == fold_id
        X_train, y_train, y_aux_train, w_train = (X_text[train_index].astype("int32"),
                                                  y[train_index], y_aux[train_index],
                                                  weights[train_index])
        X_val, y_val, y_aux_val, w_val = (X_text[valid_index].astype("int32"),
                                          y[valid_index], y_aux[valid_index],
                                          weights[valid_index])
        test_df = train_df[valid_index]
        del X_text, y, y_aux, weights, train_index, valid_index, train_df
        gc.collect()

        model = BertForSequenceClassification(bert_config, num_labels=n_labels)
        model.load_state_dict(torch.load(model_path))
        model.zero_grad()
        model = model.to(device)

        # Pack (target, sample weight, aux targets) column-wise into one label matrix.
        y_train = np.concatenate((y_train.reshape(-1, 1), w_train.reshape(-1, 1), y_aux_train),
                                 axis=1).astype("float32")
        y_val = np.concatenate((y_val.reshape(-1, 1), w_val.reshape(-1, 1), y_aux_val),
                               axis=1).astype("float32")

        train_dataset = torch.utils.data.TensorDataset(torch.tensor(X_train, dtype=torch.long),
                                                       torch.tensor(y_train, dtype=torch.float32))
        valid = torch.utils.data.TensorDataset(torch.tensor(X_val, dtype=torch.long),
                                               torch.tensor(y_val, dtype=torch.float32))

        # Batch sequences of similar length together to reduce padding waste.
        ran_sampler = torch.utils.data.RandomSampler(train_dataset)
        len_sampler = LenMatchBatchSampler(ran_sampler, batch_size=batch_size, drop_last=False)
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_sampler=len_sampler)
        valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size * 2, shuffle=False)
        LOGGER.info("done data loader setup")

        # No weight decay for bias and LayerNorm parameters.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        num_train_optimization_steps = int(epochs * len(X_train) / batch_size / accumulation_steps)
        total_step = int(epochs * len(X_train) / batch_size)

        optimizer = BertAdam(optimizer_grouped_parameters, lr=base_lr, warmup=0.005,
                             t_total=num_train_optimization_steps)
        LOGGER.info("done optimizer setup")

        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
        # criterion = torch.nn.BCEWithLogitsLoss().to(device)
        criterion = CustomLoss(loss_weight).to(device)
        LOGGER.info("done amp setup")

        for epoch in range(1, epochs + 1):
            LOGGER.info(f"Starting {epoch} epoch...")
            LOGGER.info(f"length {len(X_train)} train, {len(X_val)} valid...")
            if epoch == 1:
                for param_group in optimizer.param_groups:
                    param_group['lr'] = base_lr * gammas[1]
            tr_loss, train_losses = train_one_epoch(model, train_loader, criterion, optimizer,
                                                    device, accumulation_steps, total_step,
                                                    n_labels, base_lr, gamma=gammas[2 * epoch])
            LOGGER.info(f'Mean train loss: {round(tr_loss, 5)}')
            torch.save(model.state_dict(), '{}_epoch{}_fold{}.pth'.format(exp, epoch, fold_id))

            valid_loss, oof_pred = validate(model, valid_loader, criterion, device, n_labels)
            LOGGER.info(f'Mean valid loss: {round(valid_loss, 5)}')

            if epochs > 1:
                test_df_cp = test_df.copy()
                test_df_cp["pred"] = oof_pred[:, 0]
                test_df_cp = convert_dataframe_to_bool(test_df_cp)
                bias_metrics_df = compute_bias_metrics_for_model(test_df_cp, identity_columns)
                LOGGER.info(bias_metrics_df)
                score = get_final_metric(bias_metrics_df, calculate_overall_auc(test_df_cp))
                LOGGER.info(f'score is {score}')

        del model
        gc.collect()
        torch.cuda.empty_cache()

    test_df["pred"] = oof_pred[:, 0]
    test_df = convert_dataframe_to_bool(test_df)
    bias_metrics_df = compute_bias_metrics_for_model(test_df, identity_columns)
    LOGGER.info(bias_metrics_df)
    score = get_final_metric(bias_metrics_df, calculate_overall_auc(test_df))
    LOGGER.info(f'final score is {score}')
    test_df.to_csv("oof.csv", index=False)

    xs = list(range(1, len(train_losses) + 1))
    plt.plot(xs, train_losses, label='Train loss')
    plt.legend()
    plt.xticks(xs)
    plt.xlabel('Iter')
    plt.savefig("loss.png")
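# CustomLoss is not defined in this file. Given that the label matrix above
# packs (target, sample weight, aux targets) column-wise, a plausible sketch is
# a per-sample-weighted BCE on the main target plus a plain BCE on the
# auxiliary targets, mixed by loss_weight; this is an assumption about the
# original implementation.
class CustomLoss(torch.nn.Module):
    def __init__(self, loss_weight):
        super().__init__()
        self.loss_weight = loss_weight

    def forward(self, logits, targets):
        # Column 0: main target, column 1: per-sample weight, rest: aux targets.
        main_loss = torch.nn.functional.binary_cross_entropy_with_logits(
            logits[:, :1], targets[:, :1], weight=targets[:, 1:2])
        aux_loss = torch.nn.functional.binary_cross_entropy_with_logits(
            logits[:, 1:], targets[:, 2:])
        return main_loss * self.loss_weight + aux_loss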
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # Path options.
    parser.add_argument("--output_model_path", default="./models/classifier_model.bin", type=str,
                        help="Path of the output model.")
    parser.add_argument("--output_lossfig_path", default="./models/loss.png", type=str,
                        help="Path of the output loss figure.")
    # Model options.
    parser.add_argument("--batch_size", type=int, default=32, help="Batch size.")
    parser.add_argument("--seq_length", type=int, default=128, help="Sequence length.")
    # Optimizer options.
    parser.add_argument("--learning_rate", type=float, default=2e-5, help="Learning rate.")
    parser.add_argument("--warmup", type=float, default=0.1, help="Warm-up proportion.")
    # Training options.
    parser.add_argument("--dropout", type=float, default=0.5, help="Dropout.")
    parser.add_argument("--epochs_num", type=int, default=5, help="Number of epochs.")
    parser.add_argument("--report_steps", type=int, default=100,
                        help="Report training status every this many steps.")
    parser.add_argument("--seed", type=int, default=7, help="Random seed.")
    parser.add_argument("--device", type=str, default='cpu', help="Device to use.")
    args = parser.parse_args()

    def set_seed(seed=7):
        random.seed(seed)
        os.environ['PYTHONHASHSEED'] = str(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True

    set_seed(args.seed)

    # Load the data.
    train = pd.read_csv('../data5k/train.tsv', encoding='utf-8', sep='\t')
    dev = pd.read_csv('../data5k/dev.tsv', encoding='utf-8', sep='\t')
    test = pd.read_csv('../data5k/test.tsv', encoding='utf-8', sep='\t')

    # Load BERT config, vocabulary, and tokenizer.
    bert_config = BertConfig('bert_model/bert_config.json')
    BERT_MODEL_PATH = 'bert_model'
    bert_tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None,
                                                   do_lower_case=False)

    # Build the model inputs.
    processor = DataPrecessForSingleSentence(bert_tokenizer=bert_tokenizer)
    # train dataset
    seqs, seq_masks, seq_segments = processor.get_input(sentences=train['text_a'].tolist(),
                                                        max_seq_len=args.seq_length)
    labels = train['label'].tolist()
    t_seqs = torch.tensor(seqs, dtype=torch.long)
    t_seq_masks = torch.tensor(seq_masks, dtype=torch.long)
    t_seq_segments = torch.tensor(seq_segments, dtype=torch.long)
    t_labels = torch.tensor(labels, dtype=torch.long)
    train_data = TensorDataset(t_seqs, t_seq_masks, t_seq_segments, t_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(dataset=train_data, sampler=train_sampler,
                                  batch_size=args.batch_size)
    # dev dataset
    seqs, seq_masks, seq_segments = processor.get_input(sentences=dev['text_a'].tolist(),
                                                        max_seq_len=args.seq_length)
    labels = dev['label'].tolist()
    t_seqs = torch.tensor(seqs, dtype=torch.long)
    t_seq_masks = torch.tensor(seq_masks, dtype=torch.long)
    t_seq_segments = torch.tensor(seq_segments, dtype=torch.long)
    t_labels = torch.tensor(labels, dtype=torch.long)
    dev_data = TensorDataset(t_seqs, t_seq_masks, t_seq_segments, t_labels)
    dev_sampler = RandomSampler(dev_data)
    dev_dataloader = DataLoader(dataset=dev_data, sampler=dev_sampler,
                                batch_size=args.batch_size)
    # test dataset
    seqs, seq_masks, seq_segments = processor.get_input(sentences=test['text_a'].tolist(),
                                                        max_seq_len=args.seq_length)
    labels = test['label'].tolist()
    t_seqs = torch.tensor(seqs, dtype=torch.long)
    t_seq_masks = torch.tensor(seq_masks, dtype=torch.long)
    t_seq_segments = torch.tensor(seq_segments, dtype=torch.long)
    t_labels = torch.tensor(labels, dtype=torch.long)
    test_data = TensorDataset(t_seqs, t_seq_masks, t_seq_segments, t_labels)
    test_sampler = RandomSampler(test_data)
    test_dataloader = DataLoader(dataset=test_data, sampler=test_sampler,
                                 batch_size=args.batch_size)

    # Build the classification model.
    model = BertForSequenceClassification(bert_config, 2)

    # For simplicity, we use the DataParallel wrapper to use multiple GPUs.
    if args.device == 'cpu':
        device = torch.device("cpu")
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        if torch.cuda.device_count() > 1:
            print("{} GPUs are available. Let's use them.".format(torch.cuda.device_count()))
            model = nn.DataParallel(model)
    model = model.to(device)

    # Evaluation function.
    def evaluate(args, is_test, metrics='Acc'):
        if is_test:
            dataset = test_dataloader
            instances_num = test.shape[0]
        else:
            dataset = dev_dataloader
            instances_num = dev.shape[0]
        print("The number of evaluation instances: ", instances_num)
        correct = 0
        model.eval()
        # Confusion matrix.
        confusion = torch.zeros(2, 2, dtype=torch.long)
        for i, batch_data in enumerate(dataset):
            batch_data = tuple(t.to(device) for t in batch_data)
            batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels = batch_data
            with torch.no_grad():
                logits = model(batch_seqs, batch_seq_masks, batch_seq_segments, labels=None)
            pred = logits.softmax(dim=1).argmax(dim=1)
            gold = batch_labels
            for j in range(pred.size()[0]):
                confusion[pred[j], gold[j]] += 1
            correct += torch.sum(pred == gold).item()
        if is_test:
            print("Confusion matrix:")
            print(confusion)
            print("Report precision, recall, and f1:")
            for i in range(confusion.size()[0]):
                p = confusion[i, i].item() / confusion[i, :].sum().item()
                r = confusion[i, i].item() / confusion[:, i].sum().item()
                f1 = 2 * p * r / (p + r)
                if i == 1:
                    label_1_f1 = f1
                print("Label {}: {:.3f}, {:.3f}, {:.3f}".format(i, p, r, f1))
        print("Acc. (Correct/Total): {:.4f} ({}/{})".format(correct / instances_num,
                                                            correct, instances_num))
        if metrics == 'Acc':
            return correct / instances_num
        elif metrics == 'f1':
            return label_1_f1
        else:
            return correct / instances_num

    # Training phase.
    print("Start training.")
    instances_num = train.shape[0]
    batch_size = args.batch_size
    train_steps = int(instances_num * args.epochs_num / batch_size) + 1
    print("Batch size: ", batch_size)
    print("The number of training instances:", instances_num)

    # Parameters to optimize, with weight decay disabled for bias and LayerNorm.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate,
                         warmup=args.warmup, t_total=train_steps)

    # Track the loss of each epoch and the dev accuracy.
    all_loss = []
    all_acc = []
    total_loss = 0.0
    result = 0.0
    best_result = 0.0

    for epoch in range(1, args.epochs_num + 1):
        model.train()
        for step, batch_data in enumerate(train_dataloader):
            batch_data = tuple(t.to(device) for t in batch_data)
            batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels = batch_data
            # One-hot encoding of the labels (kept for reference; the model
            # computes the cross-entropy loss internally when labels are passed in).
            '''
            one_hot = torch.zeros(batch_labels.size(0), 2).long()
            one_hot_batch_labels = one_hot.scatter_(
                dim=1,
                index=torch.unsqueeze(batch_labels, dim=1),
                src=torch.ones(batch_labels.size(0), 2).long())
            logits = model(batch_seqs, batch_seq_masks, batch_seq_segments, labels=None)
            logits = logits.softmax(dim=1)
            loss_function = CrossEntropyLoss()
            loss = loss_function(logits, batch_labels)
            '''
            loss = model(batch_seqs, batch_seq_masks, batch_seq_segments, batch_labels)
            loss.backward()
            total_loss += loss.item()
            if (step + 1) % 100 == 0:
                print("Epoch id: {}, Training steps: {}, Avg loss: {:.3f}".format(
                    epoch, step + 1, total_loss / 100))
                sys.stdout.flush()
                total_loss = 0.
            optimizer.step()
            optimizer.zero_grad()
        all_loss.append(total_loss)
        total_loss = 0.

        print("Start evaluation on dev dataset.")
        result = evaluate(args, False)
        all_acc.append(result)
        if result > best_result:
            best_result = result
            # Save only the state dict so it can be restored with load_state_dict below.
            torch.save(model.state_dict(), args.output_model_path)
            # save_model(model, args.output_model_path)

    print("Start evaluation on test dataset.")
    evaluate(args, True)
    print('all_loss:', all_loss)
    print('all_acc:', all_acc)

    # Evaluation phase.
    print("Final evaluation on the test dataset.")
    model.load_state_dict(torch.load(args.output_model_path))
    evaluate(args, True)
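# DataPrecessForSingleSentence is used above but defined in another module. A
# minimal sketch of its get_input method, assuming standard single-sentence
# BERT preprocessing ([CLS]/[SEP] wrapping, zero padding, all-zero segment
# ids); the class and method names come from the call site, the body is an
# assumption.
class DataPrecessForSingleSentence(object):
    def __init__(self, bert_tokenizer):
        self.bert_tokenizer = bert_tokenizer

    def get_input(self, sentences, max_seq_len=128):
        seqs, seq_masks, seq_segments = [], [], []
        for sentence in sentences:
            tokens = self.bert_tokenizer.tokenize(sentence)[:max_seq_len - 2]
            ids = self.bert_tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens + ["[SEP]"])
            padding = [0] * (max_seq_len - len(ids))
            seqs.append(ids + padding)                 # padded token ids
            seq_masks.append([1] * len(ids) + padding)  # 1 for real tokens, 0 for padding
            seq_segments.append([0] * max_seq_len)      # single sentence: all segment 0
        return seqs, seq_masks, seq_segments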
            # (fragment: the beginning of this training loop is truncated; the
            # residue below completed a running-accuracy update over train_loader)
            #   ... == (y_batch[:, 0] > 0.5).to(device)).to(torch.float)).item() / len(train_loader)
            tq.set_postfix(avg_loss=avg_loss, avg_accuracy=avg_accuracy)
        torch.save(model.state_dict(), output_model_file + '_epoch_' + str(epoch) + '.bin')

        # Validate.
        test_model = BertForSequenceClassification(bert_config, num_labels=len(y_columns))
        # Parallelism.
        test_model = nn.DataParallel(test_model)
        test_model.load_state_dict(torch.load(output_model_file + '_epoch_' + str(epoch) + '.bin'))
        test_model.to(device)
        # Freeze all weights for inference.
        for param in test_model.parameters():
            param.requires_grad = False
        test_model.eval()

        valid_preds = np.zeros((len(X_val)))
        print(valid_preds.size)
        valid = torch.utils.data.TensorDataset(torch.tensor(X_val, dtype=torch.long))
        valid_loader = torch.utils.data.DataLoader(valid, batch_size=256, shuffle=False)
        tk0 = tqdm(valid_loader)
        for i, (x_batch, ) in enumerate(tk0):
            pred = test_model(x_batch.to(device),
                              attention_mask=(x_batch > 0).to(device),
                              # (fragment: truncated here)
    # (fragment: preparation of train/valid data and a frozen fine-tuned model)
    train_df = train_df.drop(['comment_text'], axis=1)
    train_df['target'] = (train_df['target'] >= 0.5).astype(float)

    valid_df = valid_df.fillna(0)
    valid_df = valid_df.drop(['comment_text'], axis=1)
    # A comment counts as toxic if any of the six labels fires.
    valid_df['target'] = (valid_df['toxic'] == 1) | (valid_df['severe_toxic'] == 1)
    valid_df['target'] = valid_df['target'] | (valid_df['obscene'] == 1)
    valid_df['target'] = valid_df['target'] | (valid_df['threat'] == 1)
    valid_df['target'] = valid_df['target'] | (valid_df['insult'] == 1)
    valid_df['target'] = valid_df['target'] | (valid_df['identity_hate'] == 1)
    valid_df['target'] = valid_df['target'].astype(float)

    model = BertForSequenceClassification(bert_config, num_labels=1)
    model.load_state_dict(torch.load("./datas/bert_pytorch.bin"))
    model.to(device)
    # Freeze all BERT weights.
    for param in model.parameters():
        param.requires_grad = False

    X = train_seqs[:]
    y = train_df['target'].values[:]
    valid_X = valid_seqs[:]
    valid_y = valid_df['target'].values[:]
    # Stack train and validation examples along the sample axis.
    X = np.concatenate((X, valid_X), axis=0)
    y = np.concatenate((y, valid_y), axis=0)

    train_dataset = torch.utils.data.TensorDataset(torch.tensor(X, dtype=torch.long),
                                                   torch.tensor(y, dtype=torch.float))
    output_model_file = "./datas/mybert.bin"
    lr = 2e-5
class TransformersClassifierHandler(BaseHandler, ABC):
    """
    Transformers text classifier handler class. This handler takes a text
    (string) as input and returns the predicted class based on the serialized
    transformers checkpoint.
    """

    def __init__(self):
        super(TransformersClassifierHandler, self).__init__()
        self.initialized = False

    def initialize(self, ctx):
        properties = ctx.system_properties
        MODEL_DIR = properties.get("model_dir")
        self.device = torch.device("cuda:" + str(properties.get("gpu_id"))
                                   if torch.cuda.is_available() else "cpu")

        self.labelencoder = preprocessing.LabelEncoder()
        self.labelencoder.classes_ = np.load(os.path.join(MODEL_DIR, 'classes.npy'))

        config = BertConfig(os.path.join(MODEL_DIR, 'bert_config.json'))
        self.model = BertForSequenceClassification(config,
                                                   num_labels=len(self.labelencoder.classes_))
        self.model.load_state_dict(torch.load(os.path.join(MODEL_DIR, 'pytorch_model.bin'),
                                              map_location="cpu"))
        self.model.to(self.device)
        self.model.eval()

        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.softmax = torch.nn.Softmax(dim=-1)
        # Number of worker processes for the preprocessing DataLoader.
        self.dataloader_num_workers = 0
        # self.batch_size = batch_size

        logger.debug('Transformer model from path {0} loaded successfully'.format(MODEL_DIR))

        self.manifest = ctx.manifest
        self.initialized = True

    def preprocess(self, data):
        ids = []
        segment_ids = []
        input_masks = []
        MAX_LEN = 128
        for sen in data:
            # Truncate so that [CLS] + tokens + [SEP] fits within MAX_LEN.
            text_tokens = self.tokenizer.tokenize(sen)[:MAX_LEN - 2]
            tokens = ["[CLS]"] + text_tokens + ["[SEP]"]
            temp_ids = self.tokenizer.convert_tokens_to_ids(tokens)
            input_mask = [1] * len(temp_ids)
            segment_id = [0] * len(temp_ids)
            # Pad everything to MAX_LEN.
            padding = [0] * (MAX_LEN - len(temp_ids))
            temp_ids += padding
            input_mask += padding
            segment_id += padding
            ids.append(temp_ids)
            input_masks.append(input_mask)
            segment_ids.append(segment_id)

        # Convert the input lists to Torch tensors.
        ids = torch.tensor(ids)
        segment_ids = torch.tensor(segment_ids)
        input_masks = torch.tensor(input_masks)

        validation_data = TensorDataset(ids, input_masks, segment_ids)
        validation_sampler = SequentialSampler(validation_data)
        validation_dataloader = DataLoader(validation_data,
                                           sampler=validation_sampler,
                                           batch_size=len(data),
                                           num_workers=self.dataloader_num_workers)
        return validation_dataloader

    def inference(self, validation_dataloader):
        """
        Predict the class of a text using a trained transformer model.
        """
        # NOTE: This makes the assumption that your model expects text to be tokenized
        # with "input_ids" and "token_type_ids" - which is true for some popular
        # transformer models, e.g. bert. If your transformer model expects different
        # tokenization, adapt this code to suit its expected input format.
        responses = []
        for batch in validation_dataloader:
            # Move the batch to the target device.
            batch = tuple(t.to(self.device) for t in batch)
            # Unpack the inputs from our dataloader (ids, attention mask, segment ids).
            b_input_ids, b_input_mask, b_segment_ids = batch
            with torch.no_grad():
                # Forward pass: compute logit predictions.
                logits = self.model(b_input_ids, token_type_ids=None,
                                    attention_mask=b_input_mask)
            for i in range(logits.size(0)):
                label_idx = [self.softmax(logits[i]).detach().cpu().numpy().argmax()]
                label_str = self.labelencoder.inverse_transform(label_idx)[0]
                responses.append(label_str)
        return responses

    def postprocess(self, inference_output):
        # TODO: Add any needed post-processing of the model predictions here.
        return inference_output
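# A quick local smoke test of the handler outside TorchServe, using a minimal
# stand-in for the context object. MockContext and the model_dir path are
# hypothetical; in production, TorchServe constructs the handler and supplies
# the real context and request payloads.
class MockContext:
    system_properties = {"model_dir": "./model_store/my_model", "gpu_id": 0}
    manifest = {}

if __name__ == "__main__":
    handler = TransformersClassifierHandler()
    handler.initialize(MockContext())
    loader = handler.preprocess(["What is the weather today?"])
    print(handler.postprocess(handler.inference(loader)))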
def main():
    # train_df = pd.read_csv(TRAIN_PATH).sample(frac=1.0, random_state=seed)
    # train_size = int(len(train_df) * 0.9)
    train_df = pd.read_csv(TRAIN_PATH).sample(train_size + valid_size, random_state=seed)
    LOGGER.info(f'data_size is {len(train_df)}')
    LOGGER.info(f'train_size is {train_size}')

    y = np.where(train_df['target'] >= 0.5, 1, 0)
    y_aux = train_df[AUX_COLUMNS].values

    identity_columns_new = []
    for column in identity_columns + ['target']:
        train_df[column + "_bin"] = np.where(train_df[column] >= 0.5, True, False)
        if column != "target":
            identity_columns_new.append(column + "_bin")

    # Sample weights: up-weight identity-subgroup rows, and especially
    # non-toxic comments that mention an identity.
    sample_weights = np.ones(len(train_df), dtype=np.float32)
    sample_weights += train_df[identity_columns_new].sum(axis=1)
    sample_weights += train_df['target_bin'] * (~train_df[identity_columns_new]).sum(axis=1)
    sample_weights += (~train_df['target_bin']) * train_df[identity_columns_new].sum(axis=1) * 5
    sample_weights /= sample_weights.mean()

    with timer('preprocessing text'):
        # df["comment_text"] = [analyzer_embed(text) for text in df["comment_text"]]
        train_df['comment_text'] = train_df['comment_text'].astype(str)
        train_df = train_df.fillna(0)

    with timer('load embedding'):
        tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None,
                                                  do_lower_case=True)
        X_text = convert_lines(train_df["comment_text"].fillna("DUMMY_VALUE"), max_len, tokenizer)

    test_df = train_df[train_size:]

    with timer('train'):
        X_train, y_train, y_aux_train, w_train = (X_text[:train_size], y[:train_size],
                                                  y_aux[:train_size], sample_weights[:train_size])
        X_val, y_val, y_aux_val, w_val = (X_text[train_size:], y[train_size:],
                                          y_aux[train_size:], sample_weights[train_size:])

        model = BertForSequenceClassification(bert_config, num_labels=n_labels)
        model.load_state_dict(torch.load(model_path))
        model.zero_grad()
        model = model.to(device)

        train_dataset = torch.utils.data.TensorDataset(torch.tensor(X_train, dtype=torch.long),
                                                       torch.tensor(y_train, dtype=torch.float))
        valid = torch.utils.data.TensorDataset(torch.tensor(X_val, dtype=torch.long),
                                               torch.tensor(y_val, dtype=torch.float))
        train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,
                                                   shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size * 2,
                                                   shuffle=False)

        sample_weight_train = [w_train, np.ones_like(w_train)]
        sample_weight_val = [w_val, np.ones_like(w_val)]

        # No weight decay for bias and LayerNorm parameters.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        num_train_optimization_steps = int(epochs * train_size / batch_size / accumulation_steps)
        total_step = int(epochs * train_size / batch_size)

        optimizer = BertAdam(optimizer_grouped_parameters, lr=2e-5 * gamma, warmup=0.05,
                             t_total=num_train_optimization_steps)
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=0)
        criterion = torch.nn.BCEWithLogitsLoss().to(device)

        LOGGER.info("Starting 1 epoch...")
        tr_loss, train_losses = train_one_epoch(model, train_loader, criterion, optimizer,
                                                device, accumulation_steps, total_step, n_labels)
        LOGGER.info(f'Mean train loss: {round(tr_loss, 5)}')
        torch.save(model.state_dict(), '{}_dic'.format(exp))

        valid_loss, oof_pred = validate(model, valid_loader, criterion, device, n_labels)
        del model
        gc.collect()
        torch.cuda.empty_cache()

    test_df["pred"] = oof_pred.reshape(-1)
    test_df = convert_dataframe_to_bool(test_df)
    bias_metrics_df = compute_bias_metrics_for_model(test_df, identity_columns)
    LOGGER.info(bias_metrics_df)
    score = get_final_metric(bias_metrics_df, calculate_overall_auc(test_df))
    LOGGER.info(f'final score is {score}')
    test_df.to_csv("oof.csv", index=False)

    xs = list(range(1, len(train_losses) + 1))
    plt.plot(xs, train_losses, label='Train loss')
    plt.legend()
    plt.xticks(xs)
    plt.xlabel('Iter')
    plt.savefig("loss.png")
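# convert_dataframe_to_bool, calculate_overall_auc, and get_final_metric follow
# the Jigsaw bias-metric benchmark but are defined elsewhere. A sketch of
# plausible implementations, assuming a "pred" prediction column, binarization
# at 0.5, bias_metrics_df columns named subgroup_auc/bpsn_auc/bnsp_auc, and the
# module-level imports used throughout this file (numpy as np, sklearn's
# metrics); all of these details are assumptions.
def convert_dataframe_to_bool(df):
    bool_df = df.copy()
    for col in ['target'] + identity_columns:
        bool_df[col] = np.where(df[col] >= 0.5, True, False)
    return bool_df

def calculate_overall_auc(df):
    return metrics.roc_auc_score(df['target'], df['pred'])

def power_mean(series, p=-5):
    # Generalized mean; a negative p emphasizes the worst-performing subgroups.
    return np.power(np.mean(np.power(series, p)), 1 / p)

def get_final_metric(bias_df, overall_auc, overall_weight=0.25):
    bias_score = np.average([power_mean(bias_df['subgroup_auc']),
                             power_mean(bias_df['bpsn_auc']),
                             power_mean(bias_df['bnsp_auc'])])
    # Weighted combination of the overall AUC and the three bias AUC summaries.
    return overall_weight * overall_auc + (1 - overall_weight) * bias_score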
def train_unfixed():
    # Config file.
    cf = Config('./config.yaml')
    # Use the GPU if one is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Training data.
    train_data = NewsDataset("./data/cnews_final_train.txt", cf.max_seq_len)
    train_dataloader = DataLoader(train_data, batch_size=cf.batch_size, shuffle=True)
    # Test data.
    test_data = NewsDataset("./data/cnews_final_test.txt", cf.max_seq_len)
    test_dataloader = DataLoader(test_data, batch_size=cf.batch_size, shuffle=True)
    # Model.
    config = BertConfig("./output/pytorch_bert_config.json")
    model = BertForSequenceClassification(config, num_labels=cf.num_labels)
    model.load_state_dict(torch.load("./output/pytorch_model.bin"))

    # Unfreeze all parameters and optimize with BertAdam.
    for param in model.parameters():
        param.requires_grad = True
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    num_train_optimization_steps = int(len(train_data) / cf.batch_size) * cf.epoch
    optimizer = BertAdam(optimizer_grouped_parameters, lr=cf.lr,
                         t_total=num_train_optimization_steps)

    # Move the model to the target device.
    model.to(device)
    # Parallelize across GPUs if more than one is available.
    if torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # Training.
    start_time = time.time()
    total_batch = 0             # total number of batches seen
    best_acc_val = 0.0          # best validation accuracy so far
    last_improved = 0           # batch index of the last improvement
    require_improvement = 1500  # stop early after 1500 batches without improvement

    # Get the current validation accuracy.
    model.eval()
    _, best_acc_val = evaluate(model, test_dataloader, device)

    flag = False
    model.train()
    for epoch_id in range(cf.epoch):
        print("Epoch %d" % epoch_id)
        for step, batch in enumerate(tqdm(train_dataloader, desc="batch",
                                          total=len(train_dataloader))):
            label_id = batch['label_id'].squeeze(1).to(device)
            word_ids = batch['word_ids'].to(device)
            segment_ids = batch['segment_ids'].to(device)
            word_mask = batch['word_mask'].to(device)

            loss = model(word_ids, segment_ids, word_mask, label_id)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_batch += 1
            if total_batch % cf.print_per_batch == 0:
                model.eval()
                with torch.no_grad():
                    loss_train, acc_train = get_model_loss_acc(model, word_ids, segment_ids,
                                                               word_mask, label_id)
                loss_val, acc_val = evaluate(model, test_dataloader, device)

                if acc_val > best_acc_val:
                    # Save the best result so far.
                    best_acc_val = acc_val
                    last_improved = total_batch
                    torch.save(model.state_dict(), "./output/pytorch_model.bin")
                    # Unwrap DataParallel (if used) to reach the underlying config.
                    base_model = model.module if hasattr(model, "module") else model
                    with open("./output/pytorch_bert_config.json", 'w') as f:
                        f.write(base_model.config.to_json_string())
                    improved_str = "*"
                else:
                    improved_str = ""

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val,
                                 time_dif, improved_str))
                model.train()

            if total_batch - last_improved > require_improvement:
                print("No improvement for too long; stopping training early.")
                flag = True
                break
        if flag:
            break
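# evaluate(model, test_dataloader, device) is referenced above without a
# definition in this file. A minimal sketch, assuming batches shaped like the
# training batches and a model that returns the loss when labels are passed
# and logits when they are not; the two forward passes keep the sketch simple
# at the cost of some redundant computation.
def evaluate(model, dataloader, device):
    total_loss, total_correct, total_examples = 0.0, 0, 0
    with torch.no_grad():
        for batch in dataloader:
            label_id = batch['label_id'].squeeze(1).to(device)
            word_ids = batch['word_ids'].to(device)
            segment_ids = batch['segment_ids'].to(device)
            word_mask = batch['word_mask'].to(device)
            loss = model(word_ids, segment_ids, word_mask, label_id)
            logits = model(word_ids, segment_ids, word_mask)
            total_loss += loss.item() * label_id.size(0)
            total_correct += (logits.argmax(dim=1) == label_id).sum().item()
            total_examples += label_id.size(0)
    # Mean loss and accuracy over the whole evaluation set.
    return total_loss / total_examples, total_correct / total_examples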