Example #1
def loop(train, test, validation, percent):
    # sample the training dataset down to the requested percentage
    print('======================= training dataset size: ', percent, " %")
    train, dispose = train_test_split(train, train_size=percent / 100, stratify=train['label'])
    print('train dataset number', len(train))


    if (('RoBerta' in model_name) or ('roberta' in model_name)):
        from transformers import RobertaTokenizer, RobertaModel

        tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=False)
        from multi_label_fns import RoBerta_clf

        model = RoBerta_clf.from_pretrained(model_name,
                                            num_labels=NUM_LABELS,
                                            output_attentions=False,
                                            output_hidden_states=True)
        print('using RoBerta:', model_name)

    elif (('Bert' in model_name) or ('bert' in model_name)):
        from transformers import BertTokenizer

        tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
        from multi_label_fns import Bert_clf

        model = Bert_clf.from_pretrained(model_name,
                                         num_labels=NUM_LABELS,
                                         output_attentions=False,
                                         output_hidden_states=True)
        print('using Bert:', model_name)

    elif (('XLM' in model_name) or ('xlm' in model_name)):
        from transformers import XLMTokenizer

        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-enfr-1024', do_lower_case=False)
        from multi_label_fns import XLM_clf

        model = XLM_clf.from_pretrained(model_name,
                                        num_labels=NUM_LABELS,
                                        output_attentions=False,
                                        output_hidden_states=True)
        print('using XLM:', model_name)

    elif 'gpt2' in model_name:
        from transformers import GPT2Tokenizer, GPT2PreTrainedModel, GPT2DoubleHeadsModel

        tokenizer = GPT2Tokenizer.from_pretrained('gpt2', do_lower_case=True)
        tokenizer.cls_token = tokenizer.cls_token_id
        tokenizer.pad_token = tokenizer.eos_token
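        # GPT-2 has no dedicated padding token, so the EOS token is reused for padding.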
        from gpt2 import GPT2_multilabel_clf

        model = GPT2_multilabel_clf.from_pretrained(model_name,
                                                    num_labels=NUM_LABELS,
                                                    output_attentions=False,
                                                    output_hidden_states=False,
                                                    use_cache=False,
                                                    )
        print(' ')
        print('using GPT2:', model_name)

    print(f'The model has {count_parameters(model):,} trainable parameters')

    model.to(device)

    print('training dataset size:', len(train))
    sentences_train = train.comment.values
    labels_train = train.label.values

    sentences_test = test.comment.values
    labels_test = test.label.values

    sentences_validation = validation.comment.values
    labels_validation = validation.label.values


    # AG10K and tweet50k store string labels, which must be encoded as integers
    if args.data == 'AG10K':
        from sklearn import preprocessing

        le = preprocessing.LabelEncoder()
        le.fit(["NAG", "CAG", "OAG"])

        labels_train = le.transform(labels_train)
        labels_test = le.transform(labels_test)
        labels_validation = le.transform(labels_validation)

    elif args.data == 'tweet50k':
        from sklearn import preprocessing

        le = preprocessing.LabelEncoder()
        le.fit(['abusive', 'normal', 'hateful', 'spam'])

        labels_train = le.transform(labels_train)
        labels_test = le.transform(labels_test)
        labels_validation = le.transform(labels_validation)

    else:
        pass



    # convert the labels to tensors and move them to the device
    train_labels = torch.tensor(labels_train).to(device)
    test_labels = torch.tensor(labels_test).to(device)
    validation_labels = torch.tensor(labels_validation).to(device)


    train_inputs = torch.Tensor()
    train_masks = torch.Tensor()
    for sent in sentences_train:
        encoded_sent = tokenizer.encode_plus(sent,  # Sentence to encode.
                                             add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
                                             max_length=MAX_LEN,  # Truncate all sentences.
                                             pad_to_max_length=True,
                                             return_attention_mask=True,
                                             return_token_type_ids=False,
                                             truncation=True,
                                             return_tensors='pt')  # return PyTorch (not TensorFlow) tensors
        train_inputs = torch.cat((train_inputs, encoded_sent['input_ids'].float()), dim=0)
        train_masks = torch.cat((train_masks, encoded_sent['attention_mask'].float()), dim=0)
    train_inputs = train_inputs.to(device)
    train_masks = train_masks.to(device)
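    # Note: recent transformers versions can tokenize the whole list in one batched call,
    # e.g. (sketch, not part of the original script):
    #   encoded = tokenizer(list(sentences_train), add_special_tokens=True,
    #                       max_length=MAX_LEN, padding='max_length',
    #                       truncation=True, return_tensors='pt')
    #   train_inputs, train_masks = encoded['input_ids'].float(), encoded['attention_mask'].float()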

    validation_inputs = torch.Tensor()
    validation_masks = torch.Tensor()
    for sent in sentences_validation:
        encoded_sent = tokenizer.encode_plus(sent,  # Sentence to encode.
                                             add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
                                             max_length=MAX_LEN,  # Truncate all sentences.
                                             pad_to_max_length=True,
                                             return_attention_mask=True,
                                             return_token_type_ids=False,
                                             truncation=True,
                                             return_tensors='pt')  # return PyTorch (not TensorFlow) tensors
        validation_inputs = torch.cat((validation_inputs, encoded_sent['input_ids'].float()), dim=0)
        validation_masks = torch.cat((validation_masks, encoded_sent['attention_mask'].float()), dim=0)
    validation_inputs = validation_inputs.to(device)
    validation_masks = validation_masks.to(device)

    test_inputs = torch.Tensor()
    test_masks = torch.Tensor()
    for sent in sentences_test:
        encoded_sent = tokenizer.encode_plus(sent,  # Sentence to encode.
                                             add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
                                             max_length=MAX_LEN,  # Truncate all sentences.
                                             pad_to_max_length=True,
                                             return_attention_mask=True,
                                             return_token_type_ids=False,
                                             truncation=True,
                                             return_tensors='pt')  # return PyTorch (not TensorFlow) tensors
        test_inputs = torch.cat((test_inputs, encoded_sent['input_ids'].float()), dim=0)
        test_masks = torch.cat((test_masks, encoded_sent['attention_mask'].float()), dim=0)

    test_inputs = test_inputs.to(device)
    test_masks = test_masks.to(device)

    # for training data
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    # for validation set.
    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)
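    # RandomSampler reshuffles the training data every epoch; SequentialSampler keeps
    # the validation batches in a fixed order so the loss is comparable across epochs.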


    '''
    ================== Training Loop =======================
    '''
    optimizer = AdamW(model.parameters(), lr=0.0005, weight_decay=0.01, eps=1e-6)
    from transformers import get_linear_schedule_with_warmup
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps= int(total_steps*0.06),  # Default value in run_glue.py
                                                num_training_steps=total_steps)


    if args.FTModel is not None:
        resultname = str(args.FTModel)
    else:
        resultname = str(args.BertModel) + '_' + str(args.data)

    best_valid_loss = float('inf')
    loss_values = []

    # For each epoch...
    for epoch_i in range(0, epochs):
        print("")
        print('========== Epoch {:} / {:} =========='.format(epoch_i + 1, epochs))
        t0 = time.time()
        train_loss = train_multiclass(model, train_dataloader)
        print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        print("")
        print("Running Validation...")

        t0 = time.time()
        valid_loss = validate_multiclass(model, validation_dataloader)
        print("  Validation took: {:}".format(format_time(time.time() - t0)))

    #torch.save(model.state_dict(), str(args.resultpath) + resultname + '_model.pt')

    print("")
    print(" {:.2f} % data, Training complete!".format(percent))

    prediction_data = TensorDataset(test_inputs, test_masks, test_labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size, shuffle=False)

    model.eval()

    predictions = torch.Tensor().to(device)
    for batch in prediction_dataloader:
        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            # Forward pass to get logit predictions; no labels are passed in, so no loss is returned.
            outputs = model(b_input_ids.long(), token_type_ids=None,
                            attention_mask=b_input_mask)  # returns logits only (loss requires labels)
        softmax = torch.nn.functional.softmax(outputs, dim=1)
        prediction = softmax.argmax(dim=1)
        predictions = torch.cat((predictions, prediction.float()))
        # true_labels = torch.cat((true_labels, b_labels.float()))

    print('    DONE.')
    predictions_np = predictions.cpu().tolist()
    test['prediction'] = predictions_np
    test['label_encoded'] = labels_test
    f1_micro = f1_score(test['label_encoded'], test['prediction'], average='micro')
    f1_macro = f1_score(test['label_encoded'], test['prediction'], average='macro')
    print('RESULTS -----------')
    print(str(args.data))
    print('f1_micro:', f1_micro, 'f1_macro:', f1_macro)
    print(classification_report(test['label_encoded'], test['prediction'], zero_division=1, digits=4))
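For context, a minimal driver for this function might look like the sketch below; the dataframe names and the list of fractions are placeholders rather than part of the original script.

# Hypothetical driver (sketch): evaluate the same model on shrinking training fractions.
for percent in (75, 50, 25, 10):
    loop(train, test, validation, percent)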
Example #2
                                    num_labels=NUM_LABELS,
                                    output_attentions=False,
                                    output_hidden_states=True)
    print('using XLM:', model_name)

elif 'gpt2' in model_name:
    from transformers import GPT2Tokenizer, GPT2PreTrainedModel, GPT2DoubleHeadsModel
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2', do_lower_case=True)
    tokenizer.cls_token = tokenizer.cls_token_id
    tokenizer.pad_token = tokenizer.eos_token
    from gpt2 import GPT2_multilabel_clf

    model = GPT2_multilabel_clf.from_pretrained(
        model_name,
        num_labels=NUM_LABELS,
        output_attentions=False,
        output_hidden_states=False,
        use_cache=False,
    )
    print(' ')
    print('using GPT2:', model_name)
'''


if args.data == 'multi-label':
    from multi_label_fns import validate_multilable, train_multilabel

    if (('RoBerta' in model_name) or ('roberta' in model_name)):
        from transformers import RobertaTokenizer, RobertaModel
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=False)
        from multi_label_fns import RoBerta_clf
Example #3
def loop(train, test, validation, percent):
    # sample the training dataset down to the requested percentage
    print('======================= training dataset size: ', percent, " %")

    if percent < 100:
        train, dispose = train_test_split(train, train_size=percent / 100, stratify=train['identity_hate'])
    else:
        pass

    sentences_train = train.comment_text.values
    sentences_test = test.comment_text.values
    sentences_validation = validation.comment_text.values

    labels_train = train.iloc[:, -6:].copy()
    labels_test = test.iloc[:, -6:].copy()
    labels_validation = validation.iloc[:, -6:].copy()

    train_labels = torch.tensor([labels_train['toxic'].values,
                                 labels_train['severe_toxic'].values,
                                 labels_train['obscene'].values,
                                 labels_train['threat'].values,
                                 labels_train['insult'].values,
                                 labels_train['identity_hate'].values, ]).permute(1, 0).to(device)

    test_labels = torch.tensor([labels_test['toxic'].values,
                                labels_test['severe_toxic'].values,
                                labels_test['obscene'].values,
                                labels_test['threat'].values,
                                labels_test['insult'].values,
                                labels_test['identity_hate'].values, ]).permute(1, 0).to(device)

    validation_labels = torch.tensor([labels_validation['toxic'].values,
                                      labels_validation['severe_toxic'].values,
                                      labels_validation['obscene'].values,
                                      labels_validation['threat'].values,
                                      labels_validation['insult'].values,
                                      labels_validation['identity_hate'].values, ]).permute(1, 0).to(device)
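    # Each stack above has shape (6, n_samples); permute(1, 0) turns it into
    # (n_samples, 6), so every row holds the six binary labels for one comment.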



    if (('RoBerta' in model_name) or ('roberta' in model_name)):
        from transformers import RobertaTokenizer, RobertaModel
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=False)
        from multi_label_fns import RoBerta_clf
        model = RoBerta_clf.from_pretrained(model_name,
                                            num_labels=NUM_LABELS,
                                            output_attentions=False,
                                            output_hidden_states=True)
        #print('using RoBerta:', model_name)

    elif (('Bert' in model_name) or ('bert' in model_name)):
        from transformers import BertTokenizer
        tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
        from multi_label_fns import Bert_clf
        model = Bert_clf.from_pretrained(model_name,
                                         num_labels=NUM_LABELS,
                                         output_attentions=False,
                                         output_hidden_states=True)
        print('using Bert:', model_name)

    elif (('XLM' in model_name) or ('xlm' in model_name)):
        from transformers import XLMTokenizer
        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-enfr-1024', do_lower_case=False)
        from multi_label_fns import XLM_clf
        model = XLM_clf.from_pretrained(model_name,
                                        num_labels=NUM_LABELS,
                                        output_attentions=False,
                                        output_hidden_states=True)
        print('using XLM:', model_name)

    elif 'gpt2' in model_name:
        from transformers import GPT2Tokenizer, GPT2PreTrainedModel, GPT2DoubleHeadsModel
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2', do_lower_case=True)
        tokenizer.cls_token = tokenizer.cls_token_id
        tokenizer.pad_token = tokenizer.eos_token
        from gpt2 import GPT2_multilabel_clf

        model = GPT2_multilabel_clf.from_pretrained(model_name,
                                                    num_labels=NUM_LABELS,
                                                    output_attentions=False,
                                                    output_hidden_states=False,
                                                    use_cache=False,
                                                    )
        print(' ')
        print('using GPT2:', model_name)



    model.to(device)


    train_inputs = torch.Tensor()
    train_masks = torch.Tensor()
    for sent in sentences_train:
        encoded_sent = tokenizer.encode_plus(sent,  # Sentence to encode.
                                             add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
                                             max_length=MAX_LEN,  # Truncate all sentences.
                                             pad_to_max_length=True,
                                             return_attention_mask=True,
                                             return_token_type_ids=False,
                                             truncation=True,
                                             return_tensors='pt')  # return PyTorch (not TensorFlow) tensors
        train_inputs = torch.cat((train_inputs, encoded_sent['input_ids'].float()), dim=0)
        train_masks = torch.cat((train_masks, encoded_sent['attention_mask'].float()), dim=0)
    train_inputs = train_inputs.to(device)
    train_masks = train_masks.to(device)

    validation_inputs = torch.Tensor()
    validation_masks = torch.Tensor()
    for sent in sentences_validation:
        encoded_sent = tokenizer.encode_plus(sent,  # Sentence to encode.
                                             add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
                                             max_length=MAX_LEN,  # Truncate all sentences.
                                             pad_to_max_length=True,
                                             return_attention_mask=True,
                                             return_token_type_ids=False,
                                             truncation=True,
                                             return_tensors='pt')  # return PyTorch (not TensorFlow) tensors
        validation_inputs = torch.cat((validation_inputs, encoded_sent['input_ids'].float()), dim=0)
        validation_masks = torch.cat((validation_masks, encoded_sent['attention_mask'].float()), dim=0)
    validation_inputs = validation_inputs.to(device)
    validation_masks = validation_masks.to(device)

    test_inputs = torch.Tensor()
    test_masks = torch.Tensor()
    for sent in sentences_test:
        encoded_sent = tokenizer.encode_plus(sent,  # Sentence to encode.
                                             add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
                                             max_length=MAX_LEN,  # Truncate all sentences.
                                             pad_to_max_length=True,
                                             return_attention_mask=True,
                                             return_token_type_ids=False,
                                             truncation=True,
                                             return_tensors='pt')  # return PyTorch (not TensorFlow) tensors
        test_inputs = torch.cat((test_inputs, encoded_sent['input_ids'].float()), dim=0)
        test_masks = torch.cat((test_masks, encoded_sent['attention_mask'].float()), dim=0)

    test_inputs = test_inputs.to(device)
    test_masks = test_masks.to(device)

    # for training data
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    # for validation set.
    validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)



    '''
    ================== Training Loop =======================
    '''
    optimizer = AdamW(model.parameters(), lr=0.0005, weight_decay=0.01, eps=1e-6)
    from transformers import get_linear_schedule_with_warmup
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps= int(total_steps*0.06),  # Default value in run_glue.py
                                                num_training_steps=total_steps)


    best_valid_loss = float('inf')
    loss_values = []

    # For each epoch...
    for epoch_i in range(0, epochs):
        print("")
        print('========== Epoch {:} / {:} =========='.format(epoch_i + 1, epochs))
        t0 = time.time()

        train_loss = train_multilabel(model, train_dataloader)

        print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        print("")
        print("Running Validation...")

        t0 = time.time()
        valid_loss = validate_multilable(model, validation_dataloader)
        print("  Validation took: {:}".format(format_time(time.time() - t0)))

    #torch.save(model.state_dict(), str(args.resultpath) + resultname + '_model.pt')

    print("")
    print("Training complete!")

    prediction_data = TensorDataset(test_inputs, test_masks, test_labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size, shuffle=False)

    model.eval()

    predictions = torch.Tensor().to(device)
    labels = torch.Tensor().to(device)
    for batch in prediction_dataloader:
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)

        b_input_ids, b_input_mask, b_labels = batch

        with torch.no_grad():
            # Forward pass to get logit predictions; no labels are passed in, so no loss is returned.
            outputs = model(b_input_ids.long(), token_type_ids=None,
                            attention_mask=b_input_mask)  # return: loss(only if label is given), logit
        #logits = outputs
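        # Multi-label head: a sigmoid per label followed by rounding at 0.5 yields
        # six independent 0/1 predictions per comment.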
        rounded_preds = torch.round(torch.sigmoid(outputs))
        predictions = torch.cat((predictions, rounded_preds))  #rounded_preds.float()
        labels = torch.cat((labels, b_labels.float()))
    print('    prediction DONE.')

    pred_array = predictions.cpu().detach().numpy()
    label_array = labels.cpu().detach().numpy()


    micro_f1 = f1_score(label_array, pred_array, average='micro', zero_division=1)
    macro_f1 = f1_score(label_array, pred_array, average='macro', zero_division=1)

    print("micro is {}, macro is {}".format(micro_f1, macro_f1))

    #predictions_np = predictions.cpu().numpy()
    predictions_df = pd.DataFrame(pred_array,
                                  columns = ['pred_toxic', 'pred_severe_toxic', 'pred_obscene', 'pred_threat', 'pred_insult', 'pred_identity_hate'])

    #result = clean_dataset(test.join(predictions_df))

    f1_toxic = f1_score(test['toxic'], predictions_df['pred_toxic'], zero_division=1)
    f1_severe_toxic = f1_score(test['severe_toxic'], predictions_df['pred_severe_toxic'], zero_division=1)
    f1_obscene = f1_score(test['obscene'], predictions_df['pred_obscene'], zero_division=1)
    f1_threat = f1_score(test['threat'], predictions_df['pred_threat'], zero_division=1)
    f1_insult = f1_score(test['insult'], predictions_df['pred_insult'], zero_division=1)
    f1_identity_hate = f1_score(test['identity_hate'], predictions_df['pred_identity_hate'], zero_division=1)
    print("f1_toxic:", f1_toxic)
    print("f1_severe_toxic:", f1_severe_toxic)
    print("f1_threat:", f1_threat)
    print("f1_obscene:", f1_obscene)
    print("f1_insult:", f1_insult)
    print("f1_identity_hate:", f1_identity_hate)
    print("macro F1:", (f1_toxic + f1_severe_toxic + f1_obscene + f1_threat + f1_insult + f1_identity_hate)/6)