def main(conf):
    # Prepare data
    train_dev = koco.load_dataset("korean-hate-speech", mode="train_dev")
    train, valid = train_dev["train"], train_dev["dev"]

    # Prepare tokenizer
    tokenizer = (
        get_tokenizer()
        if "kobert" in conf.pretrained_model
        else AutoTokenizer.from_pretrained(conf.pretrained_model)
    )
    if conf.tokenizer.register_names:
        names = pd.read_csv("entertainement_biographical_db.tsv", sep="\t")[
            "name_wo_parenthesis"
        ].tolist()
        tokenizer.add_tokens(names)

    # Map string labels to integer indices
    if conf.label.hate:
        train, label2idx = map_label2idx(train, "hate")
        valid, _ = map_label2idx(valid, "hate")
    elif conf.label.bias:
        train, label2idx = map_label2idx(train, "bias")
        valid, _ = map_label2idx(valid, "bias")

    # Use bias as an additional context for predicting hate
    if conf.label.hate and conf.label.bias:
        biases = ["gender", "others", "none"]
        tokenizer.add_tokens([f"<{label}>" for label in biases])

    # Prepare DataLoader
    train_dataset = KoreanHateSpeechDataset(train)
    valid_dataset = KoreanHateSpeechDataset(valid)
    collator = KoreanHateSpeechCollator(
        tokenizer, predict_hate_with_bias=(conf.label.hate and conf.label.bias)
    )
    train_loader = DataLoader(
        train_dataset,
        batch_size=conf.train_hparams.batch_size,
        shuffle=True,
        collate_fn=collator.collate,
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=conf.train_hparams.batch_size,
        shuffle=False,
        collate_fn=collator.collate,
    )

    # Prepare model
    set_seeds(conf.train_hparams.seed)
    model = BertForSequenceClassification.from_pretrained(
        conf.pretrained_model, num_labels=len(label2idx)
    )
    # Token embeddings must be resized whenever new tokens were registered
    if conf.tokenizer.register_names or (conf.label.hate and conf.label.bias):
        model.resize_token_embeddings(len(tokenizer))
    model = model.to(device)

    # Prepare optimizer and scheduler
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p
                for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.01,
        },
        {
            "params": [
                p
                for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = optim.AdamW(
        optimizer_grouped_parameters,
        lr=conf.train_hparams.lr,
        eps=conf.train_hparams.adam_epsilon,
    )
    n_total_iterations = len(train_loader) * conf.train_hparams.n_epochs
    n_warmup_steps = int(n_total_iterations * conf.train_hparams.warmup_ratio)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, n_warmup_steps, n_total_iterations
    )
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)

    # Train!
    trainer = BertTrainer(conf.train_hparams)
    model = trainer.train(
        model, criterion, optimizer, scheduler, train_loader, valid_loader
    )

    # Save checkpoint, config, classes, and tokenizer
    makedirs(conf.checkpoint_dir)
    makedirs(conf.log_dir)
    checkpoint_path = f"{conf.checkpoint_dir}/{conf.model_name}.pt"
    log_path = f"{conf.log_dir}/{conf.model_name}.log"
    torch.save({"model": model.state_dict()}, checkpoint_path)
    torch.save(
        {"config": conf, "classes": label2idx, "tokenizer": tokenizer}, log_path
    )
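# --- Usage sketch (not part of the original script) --------------------------
# A minimal example of how main(conf) above could be driven, assuming the
# config is an OmegaConf object exposing the fields the function reads
# (pretrained_model, tokenizer.register_names, label.hate / label.bias,
# train_hparams.*, checkpoint_dir, log_dir, model_name). The checkpoint name
# and hyperparameter values are illustrative assumptions; the repo's actual
# entry point may load these from a YAML file or command-line arguments.
from omegaconf import OmegaConf

if __name__ == "__main__":
    conf = OmegaConf.create(
        {
            "pretrained_model": "beomi/kcbert-base",  # any HF model; "kobert" switches to get_tokenizer()
            "model_name": "kcbert-hate",
            "checkpoint_dir": "checkpoints",
            "log_dir": "logs",
            "tokenizer": {"register_names": False},
            "label": {"hate": True, "bias": False},
            "train_hparams": {
                "batch_size": 32,
                "lr": 5e-5,
                "adam_epsilon": 1e-8,
                "n_epochs": 5,
                "warmup_ratio": 0.1,
                "seed": 42,
            },
        }
    )
    main(conf)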
            # (tail of the dataset's __getitem__: pad short inputs, truncate long
            # ones to max_seq_len while keeping a trailing EOS token, then return
            # the encoded example)
            attention_mask += [0]
        else:
            encoder_input_id = encoder_input_id[:self.max_seq_len - 1] + [
                self.tokenizer.eos_token_id
            ]
            attention_mask = attention_mask[:self.max_seq_len]
        return {
            'input_ids': np.array(encoder_input_id, dtype=np.int_),
            'attention_mask': np.array(attention_mask, dtype=np.float32),
            'labels': np.array(label, dtype=np.int_)
        }


import koco
from torch.utils.data import DataLoader

train_dev = koco.load_dataset('korean-hate-speech', mode='train_dev')

batch_size = 16
train_dataset = KGBDDataset(train_dev['train'])
valid_dataset = KGBDDataset(train_dev['dev'])
train_dataloader = DataLoader(train_dataset, batch_size=batch_size,
                              num_workers=4, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size,
                              num_workers=4, shuffle=False)

from transformers.optimization import AdamW, get_cosine_schedule_with_warmup
from transformers import BartForSequenceClassification
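# --- Sketch: wiring up the BART classifier (assumptions noted) ----------------
# The imports above suggest fine-tuning BartForSequenceClassification with AdamW
# and a cosine warmup schedule on the dataloaders just built. The checkpoint
# name, the 3-class label count (hate / offensive / none for the hate task), and
# all hyperparameters below are illustrative assumptions, not the original code.
import torch

pretrained = 'gogamza/kobart-base-v2'            # assumed KoBART checkpoint
model = BartForSequenceClassification.from_pretrained(pretrained, num_labels=3)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

n_epochs = 5                                     # assumed
lr = 5e-5                                        # assumed
warmup_ratio = 0.1                               # assumed
optimizer = AdamW(model.parameters(), lr=lr)

# Linear warmup followed by cosine decay over the full training run
n_total_steps = len(train_dataloader) * n_epochs
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(n_total_steps * warmup_ratio),
    num_training_steps=n_total_steps,
)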
def main(conf, testfile, save):
    # Load saved data
    checkpoint_path = f"{conf.checkpoint_dir}/{conf.model_name}.pt"
    log_path = f"{conf.log_dir}/{conf.model_name}.log"
    saved_model = torch.load(checkpoint_path, map_location=device)["model"]
    saved_data = torch.load(log_path, map_location=device)

    tokenizer = saved_data["tokenizer"]
    config = saved_data["config"]
    label2idx = saved_data["classes"]
    idx2label = {idx: label for label, idx in label2idx.items()}

    # Prepare test data
    if testfile == "koco-test":
        test = koco.load_dataset("korean-hate-speech", mode="test")
        if config.label.hate and config.label.bias:
            if os.path.exists(
                "korean-hate-speech-dataset/labeled/test.bias.ternary.tsv"
            ):
                df = pd.read_csv(
                    "korean-hate-speech-dataset/labeled/test.bias.ternary.tsv",
                    sep="\t",
                )
            else:
                raise NotImplementedError(
                    "Adding external bias information is not supported, yet"
                )
            test = []
            for i, row in df.iterrows():
                test.append({"comments": row["comments"], "bias": row["label"]})
    else:
        test = []
        for line in read_lines(testfile):
            test.append({"comments": line})

    # Prepend the bias token as extra context when predicting hate with bias
    test_texts = []
    for t in test:
        test_text = t["comments"]
        if config.label.hate and config.label.bias:
            bias_context = f'<{t["bias"]}>'
            test_text = f"{bias_context} {test_text}"
        test_texts.append(test_text)

    with torch.no_grad():
        # Declare model and load pre-trained weights
        model = BertForSequenceClassification.from_pretrained(
            config.pretrained_model, num_labels=len(label2idx)
        )
        # Match the resized vocabulary used at training time
        if config.tokenizer.register_names or (
            config.label.hate and config.label.bias
        ):
            model.resize_token_embeddings(len(tokenizer))
        model.load_state_dict(saved_model)
        model.to(device)

        # Predict!
        model.eval()
        y_hats, tokens = [], []
        for index in range(0, len(test_texts), config.train_hparams.batch_size):
            batch = test_texts[index:index + config.train_hparams.batch_size]
            batch_tokenized = tokenizer(
                batch, padding=True, truncation=True, return_tensors="pt"
            )
            x = batch_tokenized["input_ids"].to(device)
            mask = batch_tokenized["attention_mask"].to(device)
            y_hat = F.softmax(model(x, attention_mask=mask)[0], dim=-1)
            y_hats += [y_hat]
            tokens += [tokenizer.tokenize(t) for t in batch]
        y_hats = torch.cat(y_hats, dim=0)  # (len(test), n_classes)
        probs, indices = y_hats.cpu().topk(1)

    # Print!
    if not save:
        for test_text, index, token in zip(test_texts, indices, tokens):
            print(test_text)
            print(" ".join(token))
            print(idx2label[int(index[0])])
            print("======================================================")

    # Save!
    if save:
        # Save test comment + predicted label
        with open(
            f"{result_dir}/{os.path.basename(testfile)}.{conf.model_name}.predict",
            "w",
        ) as f:
            f.write("comments" + "\t" + "prediction" + "\n")
            for test_text, index in zip(test_texts, indices):
                f.write(test_text + "\t" + idx2label[int(index[0])] + "\n")

        # Save tokenized test comment + predicted label
        with open(
            f"{result_dir}/{os.path.basename(testfile)}.{conf.model_name}.tokens",
            "w",
        ) as f:
            f.write("tokens" + "\t" + "prediction" + "\n")
            for token, index in zip(tokens, indices):
                f.write(" ".join(token) + "\t" + idx2label[int(index[0])] + "\n")
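# --- Sketch: helper functions assumed by the two main() scripts ---------------
# Both scripts call small utilities (makedirs, set_seeds, read_lines,
# map_label2idx) and module-level globals (device, result_dir) defined elsewhere
# in the repo. The versions below are minimal, plausible implementations
# consistent with how they are called above; they are assumptions, not the
# original code.
import os
import random

import numpy as np
import torch


def makedirs(path):
    # Create the directory if it does not already exist.
    os.makedirs(path, exist_ok=True)


def set_seeds(seed):
    # Fix RNG seeds for reproducible training runs.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


def read_lines(path):
    # Yield non-empty, stripped lines from a plain-text file of comments.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                yield line


def map_label2idx(examples, label_field):
    # Replace each example's string label (e.g. "hate" / "offensive" / "none")
    # with an integer index and return the examples plus the mapping.
    labels = sorted({ex[label_field] for ex in examples})
    label2idx = {label: idx for idx, label in enumerate(labels)}
    for ex in examples:
        ex["label"] = label2idx[ex[label_field]]
    return examples, label2idx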