Example #1
def main(conf):
    # Prepare data
    train_dev = koco.load_dataset("korean-hate-speech", mode="train_dev")
    train, valid = train_dev["train"], train_dev["dev"]

    # Prepare tokenizer
    tokenizer = (
        get_tokenizer()
        if "kobert" in conf.pretrained_model
        else AutoTokenizer.from_pretrained(conf.pretrained_model)
    )
    if conf.tokenizer.register_names:
        names = pd.read_csv("entertainement_biographical_db.tsv", sep="\t")[
            "name_wo_parenthesis"
        ].tolist()
        tokenizer.add_tokens(names)

    # Mapping string y_label to integer label
    if conf.label.hate:
        train, label2idx = map_label2idx(train, "hate")
        valid, _ = map_label2idx(valid, "hate")
    elif conf.label.bias:
        train, label2idx = map_label2idx(train, "bias")
        valid, _ = map_label2idx(valid, "bias")

    # Use bias as an additional context for predicting hate
    if conf.label.hate and conf.label.bias:
        biases = ["gender", "others", "none"]
        tokenizer.add_tokens([f"<{label}>" for label in biases])

    # Prepare DataLoader
    train_dataset = KoreanHateSpeechDataset(train)
    valid_dataset = KoreanHateSpeechDataset(valid)
    collator = KoreanHateSpeechCollator(
        tokenizer, predict_hate_with_bias=(conf.label.hate and conf.label.bias)
    )
    train_loader = DataLoader(
        train_dataset,
        batch_size=conf.train_hparams.batch_size,
        shuffle=True,
        collate_fn=collator.collate,
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=conf.train_hparams.batch_size,
        shuffle=False,
        collate_fn=collator.collate,
    )

    # Prepare model
    set_seeds(conf.train_hparams.seed)
    model = BertForSequenceClassification.from_pretrained(
        conf.pretrained_model, num_labels=len(label2idx)
    )
    # Tokens added above (names or <bias> markers) require a larger embedding matrix
    if conf.tokenizer.register_names:
        model.resize_token_embeddings(len(tokenizer))
    elif conf.label.hate and conf.label.bias:
        model.resize_token_embeddings(len(tokenizer))
    model = model.to(device)

    # Prepare optimizer and scheduler
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p
                for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.01,
        },
        {
            "params": [
                p
                for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
    optimizer = optim.AdamW(
        optimizer_grouped_parameters,
        lr=conf.train_hparams.lr,
        eps=conf.train_hparams.adam_epsilon,
    )

    n_total_iterations = len(train_loader) * conf.train_hparams.n_epochs
    n_warmup_steps = int(n_total_iterations * conf.train_hparams.warmup_ratio)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, n_warmup_steps, n_total_iterations
    )

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(device)

    # Train!
    trainer = BertTrainer(conf.train_hparams)
    model = trainer.train(
        model, criterion, optimizer, scheduler, train_loader, valid_loader
    )

    makedirs(conf.checkpoint_dir)
    makedirs(conf.log_dir)
    checkpoint_path = f"{conf.checkpoint_dir}/{conf.model_name}.pt"
    log_path = f"{conf.log_dir}/{conf.model_name}.log"
    torch.save({"model": model.state_dict()}, checkpoint_path)
    torch.save({"config": conf, "classes": label2idx, "tokenizer": tokenizer}, log_path)
Example #2
                attention_mask += [0]
        else:
            encoder_input_id = encoder_input_id[:self.max_seq_len - 1] + [
                self.tokenizer.eos_token_id
            ]
            attention_mask = attention_mask[:self.max_seq_len]
        return {
            'input_ids': np.array(encoder_input_id, dtype=np.int_),
            'attention_mask': np.array(attention_mask, dtype=np.float32),  # np.float alias was removed in NumPy 1.24+; use an explicit dtype
            'labels': np.array(label, dtype=np.int_)
        }
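
Only the tail of the encoding step is shown above. One plausible reconstruction of the full padding/truncation branch, assuming the tokenizer and max_seq_len attributes visible in the excerpt; everything before the attention_mask += [0] line, including the helper name _encode, is an assumption rather than the original code:

    # Assumed shape of the step the excerpt is cut from: pad short sequences to
    # max_seq_len, truncate long ones and close them with the EOS token.
    def _encode(self, text):
        encoder_input_id = self.tokenizer.encode(text)
        attention_mask = [1.0] * len(encoder_input_id)
        if len(encoder_input_id) < self.max_seq_len:
            while len(encoder_input_id) < self.max_seq_len:
                encoder_input_id += [self.tokenizer.pad_token_id]
                attention_mask += [0]
        else:
            encoder_input_id = encoder_input_id[:self.max_seq_len - 1] + [
                self.tokenizer.eos_token_id
            ]
            attention_mask = attention_mask[:self.max_seq_len]
        return encoder_input_id, attention_mask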


import koco

train_dev = koco.load_dataset('korean-hate-speech', mode='train_dev')

batch_size = 16
train_dataset = KGBDDataset(train_dev['train'])
valid_dataset = KGBDDataset(train_dev['dev'])
train_dataloader = DataLoader(train_dataset,
                              batch_size=batch_size,
                              num_workers=4,
                              shuffle=True)
valid_dataloader = DataLoader(valid_dataset,
                              batch_size=batch_size,
                              num_workers=4,
                              shuffle=False)

from transformers.optimization import AdamW, get_cosine_schedule_with_warmup
from transformers import BartForSequenceClassification
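
The example stops right after these imports. A minimal sketch of how BartForSequenceClassification, AdamW, and get_cosine_schedule_with_warmup are typically wired together; the KoBART checkpoint name, learning rate, warmup ratio, and epoch count below are assumptions, not part of the original:

# Assumed continuation: checkpoint name and hyperparameters are illustrative.
n_epochs = 3
model = BartForSequenceClassification.from_pretrained('hyunwoongko/kobart', num_labels=3)
optimizer = AdamW(model.parameters(), lr=5e-5)

total_steps = len(train_dataloader) * n_epochs
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(total_steps * 0.1),
    num_training_steps=total_steps,
)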
Example #3
File: predict.py  Project: ohahohah/detox
def main(conf, testfile, save):
    # Load saved data
    checkpoint_path = f"{conf.checkpoint_dir}/{conf.model_name}.pt"
    log_path = f"{conf.log_dir}/{conf.model_name}.log"
    saved_model = torch.load(checkpoint_path, map_location=device)["model"]
    saved_data = torch.load(log_path, map_location=device)
    tokenizer = saved_data["tokenizer"]
    config = saved_data["config"]
    label2idx = saved_data["classes"]
    idx2label = {idx: label for label, idx in label2idx.items()}

    if testfile == "koco-test":
        test = koco.load_dataset("korean-hate-speech", mode="test")
        if config.label.hate and config.label.bias:
            if os.path.exists(
                    "korean-hate-speech-dataset/labeled/test.bias.ternary.tsv"
            ):
                df = pd.read_csv(
                    "korean-hate-speech-dataset/labeled/test.bias.ternary.tsv",
                    sep="\t")
            else:
                raise NotImplementedError(
                    "Adding external bias information is not supported, yet")

            test = []
            for i, row in df.iterrows():
                test.append({
                    "comments": row["comments"],
                    "bias": row["label"]
                })
    else:
        test = []
        for line in read_lines(testfile):
            test.append({"comments": line})

    test_texts = []
    for t in test:
        test_text = t["comments"]
        if config.label.hate and config.label.bias:
            bias_context = f'<{t["bias"]}>'
            test_text = f"{bias_context} {test_text}"
        test_texts.append(test_text)

    with torch.no_grad():
        # Declare model and load pre-trained weights
        model = BertForSequenceClassification.from_pretrained(
            config.pretrained_model, num_labels=len(label2idx))
        if config.tokenizer.register_names:
            model.resize_token_embeddings(len(tokenizer))
        elif config.label.hate and config.label.bias:
            model.resize_token_embeddings(len(tokenizer))
        model.load_state_dict(saved_model)
        model.to(device)

        # Predict!
        model.eval()
        y_hats, tokens = [], []
        for index in range(0, len(test_texts),
                           config.train_hparams.batch_size):
            batch = test_texts[index:index + config.train_hparams.batch_size]
            batch_tokenized = tokenizer(batch,
                                        padding=True,
                                        truncation=True,
                                        return_tensors="pt")
            x = batch_tokenized["input_ids"]
            mask = batch_tokenized["attention_mask"]
            x = x.to(device)
            mask = mask.to(device)

            y_hat = F.softmax(model(x, attention_mask=mask)[0], dim=-1)
            y_hats += [y_hat]

            batch_token_lists = [tokenizer.tokenize(t) for t in batch]
            tokens += batch_token_lists
        y_hats = torch.cat(y_hats, dim=0)  # (len(test), n_classes)
        probs, indices = y_hats.cpu().topk(1)

    # Print!
    if not save:
        for test_text, index, token in zip(test_texts, indices, tokens):
            print(test_text)
            print(" ".join(token))
            print(idx2label[int(index[0])])
            print("======================================================")

    # Save!
    if save:
        # Save test comment + predicted label
        with open(
                f"{result_dir}/{os.path.basename(testfile)}.{conf.model_name}.predict",
                "w") as f:
            f.write("comments" + "\t" + "prediction" + "\n")
            for test_text, index in zip(test_texts, indices):
                f.write(test_text + "\t" + idx2label[int(index[0])] + "\n")
        # Save tokenized test comment + predicted label
        with open(
                f"{result_dir}/{os.path.basename(testfile)}.{conf.model_name}.tokens",
                "w") as f:
            f.write("tokens" + "\t" + "prediction" + "\n")
            for token, index in zip(tokens, indices):
                f.write(" ".join(token) + "\t" + idx2label[int(index[0])] +
                        "\n")