예제 #1
0
def main():
    """
    Main function for conducting Subtask D. Parameters are parsed with argparse.

    The language model should be suitable for German, e.g.:
        'bert-base-multilingual-uncased',
        'bert-base-multilingual-cased',
        'bert-base-german-cased',
        'bert-base-german-dbmdz-cased',
        'bert-base-german-dbmdz-uncased',
        'distilbert-base-german-cased',
        'distilbert-base-multilingual-cased'.
    """

    parser = argparse.ArgumentParser(description='Run Subtask D of GermEval 2017 Using Pre-Trained Language Model.')
    parser.add_argument('--seed', type=int, default=42, help='Random seed.')
    parser.add_argument('--lang_model', type=str, default='bert-base-german-dbmdz-uncased', help='The pre-trained language model.')
    parser.add_argument('--epochs', type=int, default=4, help='Number of epochs for training.')
    parser.add_argument('--lr', type=float, default=5e-5, help='The learning rate.')
    parser.add_argument('--max_len', type=int, default=256, help='The maximum sequence length of the input text.')
    parser.add_argument('--batch_size', type=int, default=32, help='Your train set batch size.')
    parser.add_argument('--df_path', type=str, default='./data/', help='The data directory.')
    parser.add_argument('--train_data', type=str, default='train_df_opinion.tsv', help='The filename of the input train data.')
    parser.add_argument('--dev_data', type=str, default='dev_df_opinion.tsv', help='The filename of the input development data.')
    parser.add_argument('--test_data1', type=str, default='test_syn_df_opinion.tsv', help='The filename of the first input test data (synchronic).')
    parser.add_argument('--test_data2', type=str, default='test_dia_df_opinion.tsv', help='The filename of the second input test data (diachronic).')
    parser.add_argument('--output_path', type=str, default='./output/subtaskD/', help='The output directory of the model and predictions.')
    # NOTE(review): default=True combined with action="store_true" makes this
    # flag effectively always True; kept as-is for backward compatibility.
    parser.add_argument("--train", default=True, action="store_true", help="Flag for training.")
    parser.add_argument("--use_crf", default=False, action="store_true", help="Flag for CRF usage.")
    parser.add_argument("--save_cr", default=False, action="store_true", help="Flag for saving classification report.")
    args = parser.parse_args()

    #############################################################################
    # Settings
    set_all_seeds(args.seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # Model identifier used in output filenames; suffixed when a CRF head is used.
    lm = args.lang_model
    if args.use_crf:
        lm = args.lang_model + "_crf"

    #############################################################################
    # Load and prepare data by adding BIO tags
    train_df = bio_tagging_df(pd.read_csv(args.df_path + args.train_data, delimiter='\t'))
    dev_df = bio_tagging_df(pd.read_csv(args.df_path + args.dev_data, delimiter='\t'))
    test_syn_df = bio_tagging_df(pd.read_csv(args.df_path + args.test_data1, delimiter='\t'))
    test_dia_df = bio_tagging_df(pd.read_csv(args.df_path + args.test_data2, delimiter='\t'))

    # 1. Create a tokenizer
    lower_case = args.lang_model.endswith("uncased")

    if args.lang_model.startswith("distilbert"):
        model_class = "DistilBERT"
        tokenizer = DistilBertTokenizer.from_pretrained(args.lang_model, do_lower_case=lower_case, max_length=args.max_len)
    elif args.lang_model.startswith("bert"):
        model_class = "BERT"
        tokenizer = BertTokenizer.from_pretrained(args.lang_model, do_lower_case=lower_case, max_length=args.max_len)
    else:
        # Fail fast instead of hitting a confusing NameError on `tokenizer`
        # further down (the original left it undefined for unknown models).
        raise ValueError("Unsupported language model: %s" % args.lang_model)

    # get tag values and dictionary (needed below for padding the tag ids)
    tag_values, tag2idx, entities = get_tags_list(args.df_path)

    def _features(frame):
        """Tokenize one dataframe and return (padded token ids, padded tag ids, attention masks)."""
        tokenized_texts, bio_labels = get_sentences_biotags(
            tokenizer, frame.text.values, frame.bio_tags.values, args.max_len)
        ids = pad_sequences(
            [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
            maxlen=args.max_len, value=0.0, padding="post",
            dtype="long", truncating="post")
        tag_ids = pad_sequences(
            [[tag2idx.get(l) for l in lab] for lab in bio_labels],
            maxlen=args.max_len, value=tag2idx["PAD"], padding="post",
            dtype="long", truncating="post")
        # Attention mask: 1 for real tokens, 0 for padding (token id 0).
        masks = [[int(token_id > 0) for token_id in sent] for sent in ids]
        return ids, tag_ids, masks

    # get training features (train + dev are featurized together and split
    # back into train/dev by split_train_dev below)
    input_ids, tags, attention_masks = _features(pd.concat([train_df, dev_df]))
    input_ids_syn, tags_syn, attention_masks_syn = _features(test_syn_df)
    input_ids_dia, tags_dia, attention_masks_dia = _features(test_dia_df)

    # split train, dev
    train_inputs, train_labels, dev_inputs, dev_labels, train_masks, dev_masks = split_train_dev(
        train_df, dev_df, attention_masks, input_ids, tags)

    # transform to torch tensors
    train_inputs = torch.tensor(train_inputs, dtype=torch.long)
    dev_inputs = torch.tensor(dev_inputs, dtype=torch.long)

    train_labels = torch.tensor(train_labels, dtype=torch.long)
    dev_labels = torch.tensor(dev_labels, dtype=torch.long)

    train_masks = torch.tensor(train_masks, dtype=torch.uint8)
    dev_masks = torch.tensor(dev_masks, dtype=torch.uint8)

    test_syn_inputs = torch.tensor(input_ids_syn, dtype=torch.long)
    test_syn_labels = torch.tensor(tags_syn, dtype=torch.long)
    test_syn_masks = torch.tensor(attention_masks_syn, dtype=torch.uint8)

    test_dia_inputs = torch.tensor(input_ids_dia, dtype=torch.long)
    test_dia_labels = torch.tensor(tags_dia, dtype=torch.long)
    test_dia_masks = torch.tensor(attention_masks_dia, dtype=torch.uint8)

    # create DataLoaders (shuffling only on the train set)
    train_dataloader = create_dataloader(train_inputs, train_masks, train_labels, args.batch_size, train=True)
    dev_dataloader = create_dataloader(dev_inputs, dev_masks, dev_labels, args.batch_size, train=False)

    test_syn_dataloader = create_dataloader(test_syn_inputs, test_syn_masks, test_syn_labels, args.batch_size, train=False)
    test_dia_dataloader = create_dataloader(test_dia_inputs, test_dia_masks, test_dia_labels, args.batch_size, train=False)

    #############################################################################
    # Training
    if args.train:
        # Load config and build the token-classification model
        if model_class == "BERT":
            config = BertConfig.from_pretrained(args.lang_model, num_labels=len(tag2idx))
            config.hidden_dropout_prob = 0.1  # dropout probability for all fully connected layers
                                              # in the embeddings, encoder, and pooler; default = 0.1
            model = TokenBERT(
                model_name=args.lang_model,
                num_labels=len(tag2idx),
                use_crf=args.use_crf)
        else:  # model_class == "DistilBERT" (guaranteed by the check above)
            config = DistilBertConfig.from_pretrained(args.lang_model, num_labels=len(tag2idx))
            config.hidden_dropout_prob = 0.1
            model = TokenDistilBERT(
                model_name=args.lang_model,
                num_labels=len(tag2idx),
                use_crf=args.use_crf)

        # Use the device selected above rather than hard-coding .cuda().
        model.to(device)

        # Create an optimizer: no weight decay for biases and layer norms.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.0}
        ]
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=args.lr,
            eps=1e-8
        )
        # Total number of training steps = number of batches * number of epochs
        total_steps = len(train_dataloader) * args.epochs
        # Create the learning rate scheduler (linear decay, no warmup)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )

        # Main Loop
        print("=================== Train ================")
        print("##### Language Model:", args.lang_model, ",", "use CRF:", args.use_crf, ",", "learning rate:", args.lr, ",", "DROPOUT:", config.hidden_dropout_prob)
        print()

        track_time = time.time()

        for epoch in trange(args.epochs, desc="Epoch"):
            print("Epoch: %4i" % epoch, dt.datetime.now())

            # TRAINING
            model, optimizer, scheduler, tr_loss = training(
                train_dataloader,
                model=model,
                device=device,
                optimizer=optimizer,
                scheduler=scheduler
                )

            # EVALUATION: TRAIN SET
            y_true_train, y_pred_train, f1s_train, f1s_overlap_train = evaluation(
                    train_dataloader, model=model, device=device, tag_values=tag_values)
            print("TRAIN: F1 Exact %.3f | F1 Overlap %.3f" % (f1s_train, f1s_overlap_train))

            # EVALUATION: DEV SET
            y_true_dev, y_pred_dev, f1s_dev, f1s_overlap_dev = evaluation(
                    dev_dataloader, model=model, device=device, tag_values=tag_values)
            print("EVAL: F1 Exact %.3f | F1 Overlap %.3f" % (f1s_dev, f1s_overlap_dev))

        print("  Training and validation took in total: {:}".format(format_time(time.time() - track_time)))

        # EVALUATION: TEST SYN SET
        y_true_test_syn, y_pred_test_syn, f1s_test_syn, f1s_overlap_test_syn = evaluation(
                test_syn_dataloader, model=model, device=device, tag_values=tag_values)
        print("TEST SYN: F1 Exact %.3f | F1 Overlap %.3f" % (f1s_test_syn, f1s_overlap_test_syn))

        # EVALUATION: TEST DIA SET
        y_true_test_dia, y_pred_test_dia, f1s_test_dia, f1s_overlap_test_dia = evaluation(
                test_dia_dataloader, model=model, device=device, tag_values=tag_values)
        print("TEST DIA: F1 Exact %.3f | F1 Overlap %.3f" % (f1s_test_dia, f1s_overlap_test_dia))

        # Classification reports (exact and overlap matching)
        cr_report_syn = seq_classification_report(y_true_test_syn, y_pred_test_syn, digits=4)
        cr_report_dia = seq_classification_report(y_true_test_dia, y_pred_test_dia, digits=4)
        cr_report_syn_overlap = seq_classification_report(y_true_test_syn, y_pred_test_syn, digits=4, overlap=True)
        cr_report_dia_overlap = seq_classification_report(y_true_test_dia, y_pred_test_dia, digits=4, overlap=True)

        # Fixed: the labels and report variables were mismatched here — the
        # "SYN (Overlap)" line printed the DIA exact report and the
        # "DIA (Exact)" line printed the SYN overlap report.
        print("Classification report for TEST SYN (Exact):", cr_report_syn)
        print("Classification report for TEST SYN (Overlap):", cr_report_syn_overlap)
        print("Classification report for TEST DIA (Exact):", cr_report_dia)
        print("Classification report for TEST DIA (Overlap):", cr_report_dia_overlap)

        if args.save_cr:
            # Fixed: `batch_size` was an undefined name here (NameError when
            # --save_cr was set); use args.batch_size. Files are now opened
            # via context managers so handles are always closed.
            reports = {
                'test_syn_exact': cr_report_syn,
                'test_dia_exact': cr_report_dia,
                'test_syn_overlap': cr_report_syn_overlap,
                'test_dia_overlap': cr_report_dia_overlap,
            }
            for suffix, report in reports.items():
                out_file = args.output_path + 'classification_report_' + lm + str(args.batch_size) + '_' + suffix + '.txt'
                with open(out_file, 'wb') as fh:
                    pickle.dump(report, fh)
예제 #2
0
def main():
    """
    Main function for conducting Subtask A (relevance) or Subtask B
    (sentiment). Parameters are parsed with argparse.

    The language model should be suitable for German, e.g.:
        'bert-base-multilingual-uncased',
        'bert-base-multilingual-cased',
        'bert-base-german-cased',
        'bert-base-german-dbmdz-cased',
        'bert-base-german-dbmdz-uncased',
        'distilbert-base-german-cased',
        'distilbert-base-multilingual-cased'.
    """

    ############################ variable settings #################################
    parser = argparse.ArgumentParser(description='Run Subtask A or B of GermEval 2017 Using Pre-Trained Language Model.')
    parser.add_argument('--task', type=str, default='A', help="The task you want to conduct ('A' or 'B').")
    parser.add_argument('--seed', type=int, default=42, help='Random seed.')
    parser.add_argument('--lang_model', type=str, default='bert-base-german-dbmdz-uncased', help='The pre-trained language model.')
    parser.add_argument('--epochs', type=int, default=4, help='Number of epochs for training.')
    parser.add_argument('--lr', type=float, default=5e-5, help='The learning rate.')
    parser.add_argument('--max_len', type=int, default=256, help='The maximum sequence length of the input text.')
    parser.add_argument('--batch_size', type=int, default=32, help='Your train set batch size.')
    parser.add_argument('--df_path', type=str, default='./data/', help='The data directory.')
    parser.add_argument('--train_data', type=str, default='train_df.tsv', help='The filename of the input train data.')
    parser.add_argument('--dev_data', type=str, default='dev_df.tsv', help='The filename of the input development data.')
    parser.add_argument('--test_data1', type=str, default='test_syn_df.tsv', help='The filename of the first input test data (synchronic).')
    parser.add_argument('--test_data2', type=str, default='test_dia_df.tsv', help='The filename of the second input test data (diachronic).')
    parser.add_argument('--output_path', type=str, default='./output/subtaskA/', help='The output directory of the model and predictions.')
    # NOTE(review): default=True with action="store_true" means these flags
    # are effectively always True; kept as-is for backward compatibility.
    parser.add_argument("--train", default=True, action="store_true", help="Flag for training.")
    parser.add_argument("--save_prediction", default=True, action="store_true", help="Flag for saving predictions.")
    args = parser.parse_args()

    ################################################################################
    set_all_seeds(args.seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # Load data
    train_df = pd.read_csv(args.df_path + args.train_data, delimiter='\t')
    dev_df = pd.read_csv(args.df_path + args.dev_data, delimiter='\t')
    test_syn_df = pd.read_csv(args.df_path + args.test_data1, delimiter='\t')
    test_syn_df = test_syn_df.dropna(subset=["text"])
    test_dia_df = pd.read_csv(args.df_path + args.test_data2, delimiter='\t')
    # Fixed: drop rows with missing text from the diachronic test set too,
    # mirroring the synchronic set — tokenizer.encode fails on NaN input.
    test_dia_df = test_dia_df.dropna(subset=["text"])

    # Create a tokenizer
    lower_case = args.lang_model.endswith("uncased")

    if args.lang_model.startswith("distilbert"):
        model_class = "DistilBERT"
        tokenizer = DistilBertTokenizer.from_pretrained(args.lang_model, do_lower_case=lower_case, max_length=args.max_len)
    elif args.lang_model.startswith("bert"):
        model_class = "BERT"
        tokenizer = BertTokenizer.from_pretrained(args.lang_model, do_lower_case=lower_case, max_length=args.max_len)
    else:
        # Fail fast instead of hitting a NameError on `tokenizer` below.
        raise ValueError("Unsupported language model: %s" % args.lang_model)

    # get training features
    df = pd.concat([train_df, dev_df])
    sentences = df.text.values
    sentences_syn = test_syn_df.text.values
    sentences_dia = test_dia_df.text.values

    # Map the task's gold column onto integer class indices.
    if args.task == 'A':
        class_list = [False, True]
        source_col, label_col = 'relevance', 'relevance_label'
    elif args.task == 'B':
        class_list = ["negative", "neutral", "positive"]
        source_col, label_col = 'sentiment', 'sentiment_label'
    else:
        # Fixed: an invalid --task previously caused a NameError on `labels`.
        raise ValueError("Unknown task '%s'; expected 'A' or 'B'." % args.task)

    for frame in (df, test_syn_df, test_dia_df):
        frame[label_col] = frame.apply(lambda x: class_list.index(x[source_col]), axis=1)
    labels = df[label_col].values
    labels_syn = test_syn_df[label_col].values
    labels_dia = test_dia_df[label_col].values

    num_labels = len(set(labels))

    def _encode(sents):
        """Tokenize sentences into padded id matrices plus attention masks."""
        ids = [tokenizer.encode(sent, add_special_tokens=True, truncation=True,
                                max_length=args.max_len) for sent in sents]
        ids = pad_sequences(ids, maxlen=args.max_len, dtype="long",
                            value=0.0, truncating="post", padding="post")
        # Attention mask: 1 for real tokens, 0 for padding (token id 0).
        masks = [[int(token_id > 0) for token_id in sent] for sent in ids]
        return ids, masks

    # Fixed: the test sets previously called encode() without max_length,
    # unlike the train set; all three splits now truncate identically.
    input_ids, attention_masks = _encode(sentences)
    input_ids_syn, attention_masks_syn = _encode(sentences_syn)
    input_ids_dia, attention_masks_dia = _encode(sentences_dia)

    # split train, dev
    train_inputs, train_labels, dev_inputs, dev_labels, train_masks, dev_masks = split_train_dev(
        train_df, dev_df, attention_masks, input_ids, labels)

    # transform to torch tensors
    train_inputs = torch.tensor(train_inputs)
    dev_inputs = torch.tensor(dev_inputs)

    train_labels = torch.tensor(train_labels)
    dev_labels = torch.tensor(dev_labels)

    train_masks = torch.tensor(train_masks)
    dev_masks = torch.tensor(dev_masks)

    test_syn_inputs = torch.tensor(input_ids_syn)
    test_syn_labels = torch.tensor(labels_syn)
    test_syn_masks = torch.tensor(attention_masks_syn)

    test_dia_inputs = torch.tensor(input_ids_dia)
    test_dia_labels = torch.tensor(labels_dia)
    test_dia_masks = torch.tensor(attention_masks_dia)

    # Create the DataLoaders (shuffling only on the train set)
    train_dataloader = create_dataloader(train_inputs, train_masks,
                                     train_labels, args.batch_size, train=True)

    dev_dataloader = create_dataloader(dev_inputs, dev_masks,
                                   dev_labels, args.batch_size, train=False)

    test_syn_dataloader = create_dataloader(test_syn_inputs, test_syn_masks,
                                        test_syn_labels, args.batch_size,
                                        train=False)

    test_dia_dataloader = create_dataloader(test_dia_inputs, test_dia_masks,
                                        test_dia_labels, args.batch_size,
                                        train=False)

    # Create and train model
    if args.train:
        if model_class == "BERT":
            config = BertConfig.from_pretrained(args.lang_model, num_labels=num_labels)
            config.hidden_dropout_prob = 0.1
            model = BertForSequenceClassification.from_pretrained(
                args.lang_model,
                num_labels=num_labels,
                output_attentions=False,
                output_hidden_states=False
            )
        else:  # model_class == "DistilBERT" (guaranteed by the check above)
            config = DistilBertConfig.from_pretrained(args.lang_model, num_labels=num_labels)
            config.hidden_dropout_prob = 0.1
            model = DistilBertForSequenceClassification.from_pretrained(
                args.lang_model,
                num_labels=num_labels,
                output_attentions=False,
                output_hidden_states=False
            )
        # Use the device selected above rather than hard-coding .cuda().
        model.to(device)

        # Create an optimizer: no weight decay for biases and layer norms.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.0}
        ]
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=args.lr,
            eps=1e-8
        )

        # Total number of training steps = number of batches * number of epochs
        total_steps = len(train_dataloader) * args.epochs
        # Create the learning rate scheduler (linear decay, no warmup)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )

        # Main Loop
        print("=================== Train ================")
        print("##### Language Model:", args.lang_model, ",", "learning rate:", args.lr)
        print()

        track_time = time.time()
        # trange is a tqdm wrapper around the normal python range
        for epoch in trange(args.epochs, desc="Epoch"):
            print("Epoch: %4i" % epoch, dt.datetime.now())

            model, optimizer, scheduler, tr_loss = train(
                train_dataloader,
                model=model,
                device=device,
                optimizer=optimizer,
                scheduler=scheduler
            )
            # EVALUATION: TRAIN SET
            # NOTE(review): `eval` here is the project's evaluation helper,
            # which shadows the builtin of the same name.
            true_bools_train, pred_bools_train, f1_train = eval(
                train_dataloader, model=model, device=device)
            print("TRAIN: micro F1 %.4f" % (f1_train))  # here: same as accuracy
            print(confusion_matrix(true_bools_train, pred_bools_train))

            # EVALUATION: DEV SET
            true_bools_dev, pred_bools_dev, f1_dev = eval(
                dev_dataloader, model=model, device=device)
            print("EVAL: micro F1 %.4f" % (f1_dev))
            print(confusion_matrix(true_bools_dev, pred_bools_dev))

        print("  Training and validation took in total: {:}".format(format_time(time.time() - track_time)))

        # EVALUATION: TEST SYN SET
        true_bools_syn, pred_bools_syn, f1_test_syn = eval(
            test_syn_dataloader, model=model, device=device)
        print("TEST SYN: micro F1 %.4f" % (f1_test_syn))
        print(confusion_matrix(true_bools_syn, pred_bools_syn))

        # EVALUATION: TEST DIA SET
        true_bools_dia, pred_bools_dia, f1_test_dia = eval(
            test_dia_dataloader, model=model, device=device)
        print("TEST DIA: micro F1 %.4f" % (f1_test_dia))
        print(confusion_matrix(true_bools_dia, pred_bools_dia))

        if args.save_prediction:
            # "relevance_pred" for task A, "sentiment_pred" for task B.
            pred_col = source_col + "_pred"
            test_syn_df[pred_col] = pred_bools_syn
            test_dia_df[pred_col] = pred_bools_dia

            test_syn_df.to_csv(args.output_path + args.lang_model + "_eval_test_syn.tsv", sep="\t", index=False,
                header=True, encoding="utf-8-sig")
            test_dia_df.to_csv(args.output_path + args.lang_model + "_eval_test_dia.tsv", sep="\t", index=False,
                header=True, encoding="utf-8-sig")
예제 #3
0
# Disable cudnn entirely to keep results reproducible. (Setting these flags
# to True would speed training up, but at the cost of reproducibility —
# the original comment stated the opposite of what the code does.)
if torch.cuda.is_available():
    import torch.backends.cudnn as cudnn
    cudnn.enabled = False
    cudnn.benchmark = False

# Loading and processing the data
print('Loading the data')
# feature_dim_last/standardize/one_khz control the EEG preprocessing —
# semantics live in load_eeg_data; presumably one_khz selects the 1 kHz
# sampling-rate variant of the dataset (TODO confirm in load_eeg_data).
train_input, train_target, test_input, test_target = load_eeg_data(
    feature_dim_last=True, standardize=True, one_khz=True)

# Augment the training set only; 0.1 and 15 are augment_dataset's
# parameters — verify their meaning (e.g. noise level / copies) in its docs.
train_input, train_target = augment_dataset(train_input, train_target, 0.1, 15)
dset_loaders, dset_sizes = create_dataloader(train_input,
                                             train_target,
                                             test_input,
                                             test_target,
                                             batch_size=64)

# Defining the model: input channel count comes from the data's second
# dimension (train_input.shape[1]).
model = CNN_Model(train_input.shape[1],
                  kernel_sizes=[3, 5, 7],
                  conv_channels=[28, 32, 16, 1],
                  dropout=0.1)
criterion = torch.nn.CrossEntropyLoss()
learning_rate = 1e-3
weight_decay = 1e-4  # L2 regularizer parameter
optimizer = torch.optim.Adam(model.parameters(),
                             lr=learning_rate,
                             weight_decay=weight_decay)
예제 #4
0
def main():
    """
    Main function for conducting Subtask C. Parameters are parsed with argparse.

    Trains a (Distil)BERT multi-label classifier on the GermEval 2017
    aspect-category data, evaluates it on the synchronic and diachronic test
    sets, and optionally saves classification reports and predictions.

    Language model should be suitable for German e.g.:
        'bert-base-multilingual-uncased',
        'bert-base-multilingual-cased',
        'bert-base-german-cased',
        'bert-base-german-dbmdz-cased',
        'bert-base-german-dbmdz-uncased',
        'distilbert-base-german-cased',
        'distilbert-base-multilingual-cased'.
    """

    ############################ variable settings #################################
    parser = argparse.ArgumentParser(
        description='Run Subtask C of GermEval 2017 Using Pre-Trained Language Model.')
    parser.add_argument('--seed', type=int, default=42, help='Random seed.')
    parser.add_argument('--lang_model', type=str,
                        default='bert-base-german-dbmdz-uncased',
                        help='The pre-trained language model.')
    parser.add_argument('--epochs', type=int, default=4,
                        help='Number of epochs for training.')
    parser.add_argument('--lr', type=float, default=5e-5,
                        help='The learning rate.')
    parser.add_argument('--max_len', type=int, default=256,
                        help='The maximum sequence length of the input text.')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Your train set batch size.')
    parser.add_argument('--df_path', type=str, default='./data/',
                        help='The data directory.')
    parser.add_argument('--train_data', type=str, default='train_df_cat.tsv',
                        help='The filename of the input train data.')
    parser.add_argument('--dev_data', type=str, default='dev_df_cat.tsv',
                        help='The filename of the input development data.')
    parser.add_argument('--test_data1', type=str, default='test_syn_df_cat.tsv',
                        help='The filename of the first input test data (synchronic).')
    parser.add_argument('--test_data2', type=str, default='test_dia_df_cat.tsv',
                        help='The filename of the second input test data (diachronic).')
    parser.add_argument('--output_path', type=str, default='./output/subtaskC/',
                        help='The output directory of the model and predictions.')
    # NOTE(review): default=True combined with action="store_true" means this
    # flag can never be switched off from the command line; the default is kept
    # unchanged for backward compatibility with existing invocations.
    parser.add_argument("--train", default=True, action="store_true",
                        help="Flag for training.")
    parser.add_argument("--save_prediction", default=False, action="store_true",
                        help="Flag for saving predictions.")
    parser.add_argument("--save_cr", default=False, action="store_true",
                        help="Flag for saving confusion matrix.")
    parser.add_argument("--exclude_general", default=False, action="store_true",
                        help="Flag for excluding category Allgemein.")
    parser.add_argument("--exclude_neutral", default=False, action="store_true",
                        help="Flag for excluding neutral polarity.")
    parser.add_argument("--exclude_general_neutral", default=False, action="store_true",
                        help="Flag for excluding category Allgemein:neutral.")
    args = parser.parse_args()
    ################################################################################
    set_all_seeds(args.seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # Load data
    train_df = pd.read_csv(args.df_path + args.train_data, delimiter='\t')
    dev_df = pd.read_csv(args.df_path + args.dev_data, delimiter='\t')
    test_syn_df = pd.read_csv(args.df_path + args.test_data1, delimiter='\t')
    test_dia_df = pd.read_csv(args.df_path + args.test_data2, delimiter='\t')

    # Create a tokenizer; "uncased" models require lower-casing the input.
    lower_case = args.lang_model.endswith("uncased")

    if args.lang_model.startswith("distilbert"):
        model_class = "DistilBERT"
        tokenizer = DistilBertTokenizer.from_pretrained(
            args.lang_model, do_lower_case=lower_case, max_length=args.max_len)
    elif args.lang_model.startswith("bert"):
        model_class = "BERT"
        tokenizer = BertTokenizer.from_pretrained(
            args.lang_model, do_lower_case=lower_case, max_length=args.max_len)
    else:
        # The original silently fell through and crashed later with a
        # NameError on `tokenizer`; fail early with a clear message instead.
        raise ValueError("Unsupported language model: %s" % args.lang_model)

    # Get training features: the category columns start at index 5.
    cats = train_df.columns[5:]
    end = "full"  # filename suffix describing which categories were excluded
    # exclude categories if required
    if args.exclude_general:
        cats = [i for i in list(cats) if "Allgemein" not in i]
        end = "excl_gen"
    if args.exclude_neutral:
        cats = [i for i in list(cats) if "neutral" not in i]
        end = "excl_neu"
    if args.exclude_general_neutral:
        cats = [i for i in list(cats) if "Allgemein:neutral" not in i]
        end = "excl_genneu"

    num_labels = len(list(cats))

    # Create one-hot labels: one binary indicator per remaining category.
    train_df['one_hot_labels'] = list(train_df[list(cats)].values)
    dev_df['one_hot_labels'] = list(dev_df[list(cats)].values)
    test_syn_df['one_hot_labels'] = list(test_syn_df[list(cats)].values)
    test_dia_df['one_hot_labels'] = list(test_dia_df[list(cats)].values)

    # Retrieve sentences and labels; train and dev are re-split further below.
    df = pd.concat([train_df, dev_df])
    sentences = df.text.values
    labels = list(df.one_hot_labels.values)

    sentences_syn = test_syn_df.text.values
    labels_syn = list(test_syn_df.one_hot_labels.values)

    sentences_dia = test_dia_df.text.values
    labels_dia = list(test_dia_df.one_hot_labels.values)

    print("number of categories:", len(list(cats)))

    def _encode(texts):
        """Tokenize texts, pad/truncate to args.max_len, build attention masks."""
        # pad_sequences truncates "post" to max_len anyway, so passing
        # max_length to the tokenizer for every split is behavior-equivalent
        # to the original (which only passed it for the train split).
        ids = [
            tokenizer.encode(sent, add_special_tokens=True, truncation=True,
                             max_length=args.max_len) for sent in texts
        ]
        ids = pad_sequences(ids, maxlen=args.max_len, dtype="long", value=0.0,
                            truncating="post", padding="post")
        # Mask is 1 for real tokens, 0 for padding.
        masks = [[int(token_id > 0) for token_id in sent] for sent in ids]
        return ids, masks

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids, attention_masks = _encode(sentences)
    input_ids_syn, attention_masks_syn = _encode(sentences_syn)  # synchronic test data
    input_ids_dia, attention_masks_dia = _encode(sentences_dia)  # diachronic test data

    # split train, dev
    train_inputs, train_labels, dev_inputs, dev_labels, train_masks, dev_masks = split_train_dev(
        train_df, dev_df, attention_masks, input_ids, labels)

    # transform to torch tensors
    train_inputs = torch.tensor(train_inputs)
    dev_inputs = torch.tensor(dev_inputs)

    train_labels = torch.tensor(train_labels)
    dev_labels = torch.tensor(dev_labels)

    train_masks = torch.tensor(train_masks)
    dev_masks = torch.tensor(dev_masks)

    test_syn_inputs = torch.tensor(input_ids_syn)
    test_syn_masks = torch.tensor(attention_masks_syn)
    test_syn_labels = torch.tensor(labels_syn)

    test_dia_inputs = torch.tensor(input_ids_dia)
    test_dia_masks = torch.tensor(attention_masks_dia)
    test_dia_labels = torch.tensor(labels_dia)

    # Create the DataLoaders (only the training loader shuffles).
    train_dataloader = create_dataloader(train_inputs, train_masks,
                                         train_labels, args.batch_size, train=True)
    dev_dataloader = create_dataloader(dev_inputs, dev_masks,
                                       dev_labels, args.batch_size, train=False)
    test_syn_dataloader = create_dataloader(test_syn_inputs, test_syn_masks,
                                            test_syn_labels, args.batch_size, train=False)
    test_dia_dataloader = create_dataloader(test_dia_inputs, test_dia_masks,
                                            test_dia_labels, args.batch_size, train=False)

    # Create model
    if args.train:
        # NOTE(review): the original also built a Bert/DistilBertConfig and set
        # hidden_dropout_prob = 0.1, but never passed the config to
        # from_pretrained; 0.1 is the library default, so that dead code
        # was removed without changing behavior.
        if model_class == "BERT":
            model = BertForSequenceClassification.from_pretrained(
                args.lang_model,
                num_labels=num_labels,
                output_attentions=False,
                output_hidden_states=False)
        else:  # model_class == "DistilBERT" (guaranteed by the check above)
            model = DistilBertForSequenceClassification.from_pretrained(
                args.lang_model,
                num_labels=num_labels,
                output_attentions=False,
                output_hidden_states=False)
        # Use the device selected by initialize_device_settings instead of the
        # original unconditional model.cuda(), which crashed on CPU-only hosts.
        model.to(device)

        # Create an optimizer: no weight decay for biases / LayerNorm weights.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in param_optimizer
                           if not any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.01,
            },
            {
                'params': [p for n, p in param_optimizer
                           if any(nd in n for nd in no_decay)],
                'weight_decay_rate': 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=1e-8)
        # Total number of training steps = number of batches * number of epochs
        total_steps = len(train_dataloader) * args.epochs
        # Linear learning-rate decay, no warmup.
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=total_steps)

        # Main training loop
        print("=================== Train ================")
        print("##### Language Model:", args.lang_model, ",", "learning rate:",
              args.lr)
        print()

        track_time = time.time()
        # trange is a tqdm wrapper around the normal python range
        for epoch in trange(args.epochs, desc="Epoch"):
            print("Epoch: %4i" % epoch, dt.datetime.now())

            model, optimizer, scheduler, tr_loss = train_multilabel(
                train_dataloader=train_dataloader,
                model=model,
                device=device,
                optimizer=optimizer,
                scheduler=scheduler,
                num_labels=num_labels)
            # EVALUATION: TRAIN SET
            pred_bools_train, true_bools_train, f1_train = eval_multilabel(
                train_dataloader, model=model, device=device)
            print("TRAIN: micro F1 %.3f" % (f1_train))

            # EVALUATION: DEV SET
            pred_bools_dev, true_bools_dev, f1_dev = eval_multilabel(
                dev_dataloader, model=model, device=device)
            print("EVAL: micro F1 %.3f" % (f1_dev))

        print("  Training and validation took in total: {:}".format(
            format_time(time.time() - track_time)))

        # EVALUATION: TEST SYN SET
        pred_bools_syn, true_bools_syn, f1_test_syn = eval_multilabel(
            test_syn_dataloader, model=model, device=device)
        print("TEST SYN: micro F1 %.4f" % (f1_test_syn))

        # classification report (a plain string)
        clf_report_syn = classification_report(true_bools_syn,
                                               pred_bools_syn,
                                               target_names=cats,
                                               digits=3)
        print(clf_report_syn)

        # EVALUATION: TEST DIA SET
        pred_bools_dia, true_bools_dia, f1_test_dia = eval_multilabel(
            test_dia_dataloader, model=model, device=device)
        print("TEST DIA: micro F1 %.4f" % (f1_test_dia))

        clf_report_dia = classification_report(true_bools_dia,
                                               pred_bools_dia,
                                               target_names=cats,
                                               digits=3)
        print(clf_report_dia)

        if args.save_cr:
            # Pickle the report strings. Context managers guarantee the file
            # handles are closed even on error — the original passed bare
            # open(...) calls to pickle.dump and leaked them.
            for report, split in ((clf_report_syn, 'syn'), (clf_report_dia, 'dia')):
                report_path = (args.output_path + 'clf_report_' + args.lang_model +
                               '_test_' + split + '_' + str(num_labels) + end + '.txt')
                with open(report_path, 'wb') as f:
                    pickle.dump(report, f)

        if args.save_prediction:
            test_syn_df["category_pred"] = pred_bools_syn
            test_dia_df["category_pred"] = pred_bools_dia
            test_syn_df.category_pred.to_csv(
                args.output_path + args.lang_model + '_test_syn_' +
                str(num_labels) + end + ".tsv",
                sep="\t", index=False, header=True, encoding="utf-8-sig")
            test_dia_df.category_pred.to_csv(
                args.output_path + args.lang_model + '_test_dia_' +
                str(num_labels) + end + ".tsv",
                sep="\t", index=False, header=True, encoding="utf-8-sig")