# Imports assumed by this training script; RE_Dataset, BertClassifier and
# evaluation are project-local modules defined elsewhere in the repo.
import logging
import time

import torch
from torch.utils.data import DataLoader
from transformers import AdamW, BertModel, BertTokenizer


def main(paras):
    logger = logging.getLogger(__name__)
    if paras.save_log_file:
        logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                            datefmt='%m/%d/%Y %H:%M:%S',
                            level=paras.logging_level,
                            filename=f'{paras.log_save_path}/{paras.train_log_file}',
                            filemode='w')
    else:
        logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                            datefmt='%m/%d/%Y %H:%M:%S',
                            level=paras.logging_level)

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    logger.info(f'Loading model: {paras.model_name}')
    tokenizer = BertTokenizer.from_pretrained(paras.model_name)
    bert = BertModel.from_pretrained(paras.model_name)

    train_dataset = RE_Dataset(paras, 'train')
    train_dataloader = DataLoader(train_dataset, batch_size=paras.batch_size,
                                  shuffle=paras.shuffle, drop_last=paras.drop_last)
    label_to_index = train_dataset.label_to_index
    special_token_list = list(train_dataset.special_token_set)

    # Register the dataset's entity-marker tokens as special tokens, then resize
    # the embedding matrix so the new token ids have embeddings to look up.
    special_tokens_dict = {'additional_special_tokens': special_token_list}
    tokenizer.add_special_tokens(special_tokens_dict)
    bert.resize_token_embeddings(len(tokenizer))

    # Evaluation should see every example exactly once, so the test loader
    # neither shuffles nor drops the last partial batch.
    test_dataset = RE_Dataset(paras, 'test')
    test_dataloader = DataLoader(test_dataset, batch_size=paras.batch_size,
                                 shuffle=False, drop_last=False)

    bert_classifier = BertClassifier(bert, paras.hidden_size, paras.label_number,
                                     paras.dropout_prob).to(device)

    if paras.optimizer == 'adam':
        logger.info('Loading Adam optimizer.')
        optimizer = torch.optim.Adam(bert_classifier.parameters(), lr=paras.learning_rate)
    elif paras.optimizer == 'adamw':
        logger.info('Loading AdamW optimizer.')
        # Exclude biases and LayerNorm weights from weight decay.
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in bert_classifier.named_parameters()
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in bert_classifier.named_parameters()
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=paras.learning_rate,
                          eps=paras.adam_epsilon)
    else:
        logger.warning(f'optimizer must be "Adam" or "AdamW", but got {paras.optimizer}.')
        logger.info('Loading Adam optimizer.')
        optimizer = torch.optim.Adam(bert_classifier.parameters(), lr=paras.learning_rate)

    logger.info('Training Start.')
    best_eval = {'acc': 0, 'precision': 0, 'recall': 0, 'f1': 0, 'loss': 0}
    for epoch in range(paras.num_train_epochs):
        epoch_loss = 0
        bert_classifier.train()
        for step, batch in enumerate(train_dataloader):
            optimizer.zero_grad()
            batch_data, batch_label = batch
            encoded_data = tokenizer(batch_data, padding=True, truncation=True,
                                     return_tensors='pt',
                                     max_length=paras.max_sequence_length)
            # Move inputs to the same device as the model.
            encoded_data = {k: v.to(device) for k, v in encoded_data.items()}
            label_tensor = batch_label_to_idx(batch_label, label_to_index).to(device)

            loss = bert_classifier(encoded_data, label_tensor)
            epoch_loss += loss_to_int(loss)
            logger.info(f'epoch: {epoch}, step: {step}, loss: {loss:.4f}')

            loss.backward()
            optimizer.step()

        epoch_loss = epoch_loss / len(train_dataloader)
        acc, precision, recall, f1 = evaluation(bert_classifier, tokenizer, test_dataloader,
                                                paras.max_sequence_length, label_to_index)
        logger.info(f'Epoch: {epoch}, Epoch-Average Loss: {epoch_loss:.4f}')
        logger.info(f'Accuracy: {acc:.4f}, Precision: {precision:.4f}, '
                    f'Recall: {recall:.4f}, F1-score: {f1:.4f}')

        # Keep the checkpoint with the best F1 score seen so far.
        if best_eval['loss'] == 0 or f1 > best_eval['f1']:
            best_eval['loss'] = epoch_loss
            best_eval['acc'] = acc
            best_eval['precision'] = precision
            best_eval['recall'] = recall
            best_eval['f1'] = f1
            torch.save(bert_classifier, f'{paras.log_save_path}/{paras.model_save_name}')
            with open(f'{paras.log_save_path}/{paras.checkpoint_file}', 'w') as wf:
                wf.write(f'Save time: {time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}\n')
                wf.write(f'Best F1-score: {best_eval["f1"]:.4f}\n')
                wf.write(f'Precision: {best_eval["precision"]:.4f}\n')
                wf.write(f'Recall: {best_eval["recall"]:.4f}\n')
                wf.write(f'Accuracy: {best_eval["acc"]:.4f}\n')
                wf.write(f'Epoch-Average Loss: {best_eval["loss"]:.4f}\n')
            logger.info(f'Updated model, best F1-score: {best_eval["f1"]:.4f}\n')

    logger.info(f'Train complete, Best F1-score: {best_eval["f1"]:.4f}.')
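
# The helpers below are called by main() but defined elsewhere in the repo.
# These are minimal sketches of plausible implementations, not the repo's
# actual code; names and signatures are inferred from the call sites above,
# and sklearn is assumed available for the metric computation.
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


def batch_label_to_idx(batch_label, label_to_index):
    """Map a batch of string labels to a LongTensor of class indices."""
    return torch.tensor([label_to_index[label] for label in batch_label],
                        dtype=torch.long)


def loss_to_int(loss):
    """Detach a scalar loss tensor into a plain Python float for accumulation."""
    return loss.detach().item()


def evaluation(model, tokenizer, dataloader, max_sequence_length, label_to_index):
    """Compute accuracy and macro precision/recall/F1 over a dataloader."""
    device = next(model.parameters()).device
    model.eval()
    all_preds, all_golds = [], []
    with torch.no_grad():
        for batch_data, batch_label in dataloader:
            encoded = tokenizer(batch_data, padding=True, truncation=True,
                                return_tensors='pt', max_length=max_sequence_length)
            encoded = {k: v.to(device) for k, v in encoded.items()}
            # Assumes the classifier returns logits when no labels are passed.
            logits = model(encoded)
            all_preds.extend(logits.argmax(dim=-1).cpu().tolist())
            all_golds.extend(batch_label_to_idx(batch_label, label_to_index).tolist())
    model.train()
    acc = accuracy_score(all_golds, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_golds, all_preds,
                                                               average='macro')
    return acc, precision, recall, f1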
# Imports assumed by this script; config, engine, MentalHealthDataset and
# BertClassifier are project-local modules, and SEED, VALID_SIZE, LR and
# DEVICE are module-level constants.
from functools import partial
from typing import List, Tuple

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, RandomSampler
from tqdm import tqdm
from transformers import AdamW, BertModel, get_linear_schedule_with_warmup


def run():
    def collate_fn(batch: List[Tuple[torch.LongTensor, torch.LongTensor]],
                   device: torch.device) -> Tuple[torch.LongTensor, torch.LongTensor]:
        x, y = list(zip(*batch))
        x = pad_sequence(x, batch_first=True, padding_value=0)
        y = torch.stack(y)
        return x.to(device), y.to(device)

    df = pd.read_csv("../inputs/Train.csv")
    train_df, val_df = train_test_split(df, stratify=df.label,
                                        test_size=VALID_SIZE, random_state=SEED)
    # One-hot encode the label column in a fixed column order.
    labels = ["Depression", "Alcohol", "Suicide", "Drugs"]
    train = pd.concat([train_df["text"],
                       pd.get_dummies(train_df['label']).reindex(columns=labels)], axis=1)
    valid = pd.concat([val_df["text"],
                       pd.get_dummies(val_df['label']).reindex(columns=labels)], axis=1)

    if DEVICE == 'cpu':
        print('cpu')
    else:
        print(torch.cuda.get_device_name(0))

    train_dataset = MentalHealthDataset(config.TOKENIZER, train, lazy=True)
    valid_dataset = MentalHealthDataset(config.TOKENIZER, valid, lazy=True)
    collate_fn = partial(collate_fn, device=DEVICE)
    train_sampler = RandomSampler(train_dataset)
    valid_sampler = RandomSampler(valid_dataset)
    train_iterator = DataLoader(train_dataset, batch_size=config.TRAIN_BATCH_SIZE,
                                sampler=train_sampler, collate_fn=collate_fn)
    valid_iterator = DataLoader(valid_dataset, batch_size=config.VALID_BATCH_SIZE,
                                sampler=valid_sampler, collate_fn=collate_fn)

    model = BertClassifier(BertModel.from_pretrained(config.BERT_PATH), 4).to(DEVICE)

    # Exclude biases and LayerNorm weights from weight decay.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    # Triangular learning rate: linear warmup, then linear decay.
    warmup_steps = 10 ** 3
    total_steps = len(train_iterator) * config.EPOCHS - warmup_steps
    optimizer = AdamW(optimizer_grouped_parameters, lr=LR, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)

    for epoch in range(config.EPOCHS):
        print('=' * 5, f"EPOCH {epoch}", '=' * 5)
        engine.train_fn(train_iterator, model, optimizer, scheduler)
        engine.eval_fn(valid_iterator, model)

    # Inference on the test set, written out in submission order.
    model.eval()
    test_df = pd.read_csv("../inputs/Test.csv")
    submission = pd.read_csv('../inputs/SampleSubmission.csv')
    for i in tqdm(range(len(test_df) // config.TRAIN_BATCH_SIZE + 1)):
        batch_df = test_df.iloc[i * config.TRAIN_BATCH_SIZE:(i + 1) * config.TRAIN_BATCH_SIZE]
        assert (batch_df["ID"] == submission["ID"]
                [i * config.TRAIN_BATCH_SIZE:(i + 1) * config.TRAIN_BATCH_SIZE]).all(), \
            "Id mismatch"
        texts = []
        for text in batch_df["text"].tolist():
            text = config.TOKENIZER.encode(text, add_special_tokens=True)
            # Truncate over-long inputs but keep the trailing [SEP] token.
            if len(text) > config.MAX_LEN:
                text = text[:config.MAX_LEN - 1] + [config.TOKENIZER.sep_token_id]
            texts.append(torch.LongTensor(text))
        x = pad_sequence(texts, batch_first=True,
                         padding_value=config.TOKENIZER.pad_token_id).to(DEVICE)
        mask = (x != config.TOKENIZER.pad_token_id).float().to(DEVICE)
        with torch.no_grad():
            _, outputs = model(x, attention_mask=mask)
        outputs = outputs.cpu().numpy()
        submission.loc[i * config.TRAIN_BATCH_SIZE:
                       (i * config.TRAIN_BATCH_SIZE + len(outputs) - 1), labels] = outputs
    submission.to_csv("../subs/submission_2.csv", index=False)
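
# A minimal entry point is assumed here (not shown in the original script);
# seeding with the same SEED used for the train/validation split keeps runs
# reproducible, but the real repo may wire this up differently.
if __name__ == '__main__':
    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(SEED)
    run()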