def main():
    """
    Conduct Subtask D of GermEval 2017 (opinion target extraction,
    framed as BIO sequence tagging) with a pre-trained language model.

    Parameters are parsed with argparse. The language model should be
    suitable for German, e.g.:
    'bert-base-multilingual-uncased', 'bert-base-multilingual-cased',
    'bert-base-german-cased', 'bert-base-german-dbmdz-cased',
    'bert-base-german-dbmdz-uncased', 'distilbert-base-german-cased',
    'distilbert-base-multilingual-cased'.
    """
    parser = argparse.ArgumentParser(description='Run Subtask D of GermEval 2017 Using Pre-Trained Language Model.')
    parser.add_argument('--seed', type=int, default=42, help='Random seed.')
    parser.add_argument('--lang_model', type=str, default='bert-base-german-dbmdz-uncased',
                        help='The pre-trained language model.')
    parser.add_argument('--epochs', type=int, default=4, help='Number of epochs for training.')
    parser.add_argument('--lr', type=float, default=5e-5, help='The learning rate.')
    parser.add_argument('--max_len', type=int, default=256,
                        help='The maximum sequence length of the input text.')
    parser.add_argument('--batch_size', type=int, default=32, help='Your train set batch size.')
    parser.add_argument('--df_path', type=str, default='./data/', help='The data directory.')
    parser.add_argument('--train_data', type=str, default='train_df_opinion.tsv',
                        help='The filename of the input train data.')
    parser.add_argument('--dev_data', type=str, default='dev_df_opinion.tsv',
                        help='The filename of the input development data.')
    parser.add_argument('--test_data1', type=str, default='test_syn_df_opinion.tsv',
                        help='The filename of the first input test data (synchronic).')
    parser.add_argument('--test_data2', type=str, default='test_dia_df_opinion.tsv',
                        help='The filename of the second input test data (diachronic).')
    parser.add_argument('--output_path', type=str, default='./output/subtaskD/',
                        help='The output directory of the model and predictions.')
    # NOTE(review): default=True combined with action="store_true" means this
    # flag can never be switched off from the CLI; kept for backward compatibility.
    parser.add_argument("--train", default=True, action="store_true", help="Flag for training.")
    parser.add_argument("--use_crf", default=False, action="store_true", help="Flag for CRF usage.")
    parser.add_argument("--save_cr", default=False, action="store_true",
                        help="Flag for saving classification report.")
    args = parser.parse_args()

    #############################################################################
    # Settings
    set_all_seeds(args.seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)
    lm = args.lang_model
    if args.use_crf:
        lm = args.lang_model + "_crf"  # distinguishes CRF runs in output filenames

    #############################################################################
    # Load and prepare data by adding BIO tags
    train_df = bio_tagging_df(pd.read_csv(args.df_path + args.train_data, delimiter='\t'))
    dev_df = bio_tagging_df(pd.read_csv(args.df_path + args.dev_data, delimiter='\t'))
    test_syn_df = bio_tagging_df(pd.read_csv(args.df_path + args.test_data1, delimiter='\t'))
    test_dia_df = bio_tagging_df(pd.read_csv(args.df_path + args.test_data2, delimiter='\t'))

    # 1. Create a tokenizer
    lower_case = False
    if args.lang_model[-7:] == "uncased":
        lower_case = True
    if args.lang_model[:4] == "bert":
        model_class = "BERT"
        tokenizer = BertTokenizer.from_pretrained(args.lang_model,
                                                  do_lower_case=lower_case,
                                                  max_length=args.max_len)
    if args.lang_model[:10] == "distilbert":
        model_class = "DistilBERT"
        tokenizer = DistilBertTokenizer.from_pretrained(args.lang_model,
                                                        do_lower_case=lower_case,
                                                        max_length=args.max_len)

    # get training features; train + dev are tokenized together and split later
    df = pd.concat([train_df, dev_df])
    sentences = df.text.values
    labels = df.bio_tags.values
    tokenized_texts, labels = get_sentences_biotags(tokenizer, sentences, labels, args.max_len)

    sentences_syn = test_syn_df.text.values
    labels_syn = test_syn_df.bio_tags
    tokenized_texts_syn, labels_syn = get_sentences_biotags(tokenizer, sentences_syn,
                                                            labels_syn, args.max_len)

    sentences_dia = test_dia_df.text.values
    labels_dia = test_dia_df.bio_tags
    tokenized_texts_dia, labels_dia = get_sentences_biotags(tokenizer, sentences_dia,
                                                            labels_dia, args.max_len)

    # get tag values and dictionary
    tag_values, tag2idx, entities = get_tags_list(args.df_path)

    # pad input_ids and tags to max_len (zero-padding for ids, "PAD" tag for labels)
    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                              maxlen=args.max_len, value=0.0, padding="post",
                              dtype="long", truncating="post")
    tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                         maxlen=args.max_len, value=tag2idx["PAD"], padding="post",
                         dtype="long", truncating="post")
    input_ids_syn = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts_syn],
                                  maxlen=args.max_len, value=0.0, padding="post",
                                  dtype="long", truncating="post")
    tags_syn = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels_syn],
                             maxlen=args.max_len, value=tag2idx["PAD"], padding="post",
                             dtype="long", truncating="post")
    input_ids_dia = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts_dia],
                                  maxlen=args.max_len, value=0.0, padding="post",
                                  dtype="long", truncating="post")
    tags_dia = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels_dia],
                             maxlen=args.max_len, value=tag2idx["PAD"], padding="post",
                             dtype="long", truncating="post")

    # create attention masks (1 for real tokens, 0 for padding)
    attention_masks = [[int(token_id > 0) for token_id in sent] for sent in input_ids]
    attention_masks_syn = [[int(token_id > 0) for token_id in sent] for sent in input_ids_syn]
    attention_masks_dia = [[int(token_id > 0) for token_id in sent] for sent in input_ids_dia]

    # split train, dev
    train_inputs, train_labels, dev_inputs, dev_labels, train_masks, dev_masks = split_train_dev(
        train_df, dev_df, attention_masks, input_ids, tags)

    # transform to torch tensors
    train_inputs = torch.tensor(train_inputs, dtype=torch.long)
    dev_inputs = torch.tensor(dev_inputs, dtype=torch.long)
    train_labels = torch.tensor(train_labels, dtype=torch.long)
    dev_labels = torch.tensor(dev_labels, dtype=torch.long)
    train_masks = torch.tensor(train_masks, dtype=torch.uint8)
    dev_masks = torch.tensor(dev_masks, dtype=torch.uint8)

    test_syn_inputs = torch.tensor(input_ids_syn, dtype=torch.long)
    test_syn_labels = torch.tensor(tags_syn, dtype=torch.long)
    test_syn_masks = torch.tensor(attention_masks_syn, dtype=torch.uint8)
    test_dia_inputs = torch.tensor(input_ids_dia, dtype=torch.long)
    test_dia_labels = torch.tensor(tags_dia, dtype=torch.long)
    test_dia_masks = torch.tensor(attention_masks_dia, dtype=torch.uint8)

    # create DataLoaders
    train_dataloader = create_dataloader(train_inputs, train_masks, train_labels,
                                         args.batch_size, train=True)
    dev_dataloader = create_dataloader(dev_inputs, dev_masks, dev_labels,
                                       args.batch_size, train=False)
    test_syn_dataloader = create_dataloader(test_syn_inputs, test_syn_masks, test_syn_labels,
                                            args.batch_size, train=False)
    test_dia_dataloader = create_dataloader(test_dia_inputs, test_dia_masks, test_dia_labels,
                                            args.batch_size, train=False)

    #############################################################################
    # Training
    if args.train:
        # Load config and instantiate the token-classification model
        if model_class == "BERT":
            config = BertConfig.from_pretrained(args.lang_model, num_labels=len(tag2idx))
            config.hidden_dropout_prob = 0.1  # dropout for all fully connected layers
                                              # in embeddings, encoder and pooler
            model = TokenBERT(
                model_name=args.lang_model,
                num_labels=len(tag2idx),
                use_crf=args.use_crf)
        if model_class == "DistilBERT":
            config = DistilBertConfig.from_pretrained(args.lang_model, num_labels=len(tag2idx))
            config.hidden_dropout_prob = 0.1
            model = TokenDistilBERT(
                model_name=args.lang_model,
                num_labels=len(tag2idx),
                use_crf=args.use_crf)
        model.cuda()

        # Create an optimizer: no weight decay for bias/LayerNorm parameters
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=1e-8)

        # Total number of training steps = number of batches * number of epochs
        total_steps = len(train_dataloader) * args.epochs
        # Create the learning rate scheduler
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                                    num_training_steps=total_steps)

        # Main Loop
        print("=================== Train ================")
        print("##### Language Model:", args.lang_model, ",", "use CRF:", args.use_crf, ",",
              "learning rate:", args.lr, ",", "DROPOUT:", config.hidden_dropout_prob)
        print()
        track_time = time.time()
        for epoch in trange(args.epochs, desc="Epoch"):
            print("Epoch: %4i" % epoch, dt.datetime.now())
            # TRAINING
            model, optimizer, scheduler, tr_loss = training(
                train_dataloader, model=model, device=device,
                optimizer=optimizer, scheduler=scheduler)
            # EVALUATION: TRAIN SET
            y_true_train, y_pred_train, f1s_train, f1s_overlap_train = evaluation(
                train_dataloader, model=model, device=device, tag_values=tag_values)
            print("TRAIN: F1 Exact %.3f | F1 Overlap %.3f" % (f1s_train, f1s_overlap_train))
            # EVALUATION: DEV SET
            y_true_dev, y_pred_dev, f1s_dev, f1s_overlap_dev = evaluation(
                dev_dataloader, model=model, device=device, tag_values=tag_values)
            print("EVAL: F1 Exact %.3f | F1 Overlap %.3f" % (f1s_dev, f1s_overlap_dev))
        print(" Training and validation took in total: {:}".format(format_time(time.time() - track_time)))

        # EVALUATION: TEST SYN SET
        y_true_test_syn, y_pred_test_syn, f1s_test_syn, f1s_overlap_test_syn = evaluation(
            test_syn_dataloader, model=model, device=device, tag_values=tag_values)
        print("TEST SYN: F1 Exact %.3f | F1 Overlap %.3f" % (f1s_test_syn, f1s_overlap_test_syn))
        # EVALUATION: TEST DIA SET
        y_true_test_dia, y_pred_test_dia, f1s_test_dia, f1s_overlap_test_dia = evaluation(
            test_dia_dataloader, model=model, device=device, tag_values=tag_values)
        print("TEST DIA: F1 Exact %.3f | F1 Overlap %.3f" % (f1s_test_dia, f1s_overlap_test_dia))

        # Classification reports (exact and overlap match) for both test sets
        cr_report_syn = seq_classification_report(y_true_test_syn, y_pred_test_syn, digits=4)
        cr_report_dia = seq_classification_report(y_true_test_dia, y_pred_test_dia, digits=4)
        cr_report_syn_overlap = seq_classification_report(y_true_test_syn, y_pred_test_syn,
                                                          digits=4, overlap=True)
        cr_report_dia_overlap = seq_classification_report(y_true_test_dia, y_pred_test_dia,
                                                          digits=4, overlap=True)
        # BUGFIX: the labels and reports were mispaired in the original prints
        # (SYN overlap printed the DIA exact report, and vice versa).
        print("Classification report for TEST SYN (Exact):", cr_report_syn)
        print("Classification report for TEST SYN (Overlap):", cr_report_syn_overlap)
        print("Classification report for TEST DIA (Exact):", cr_report_dia)
        print("Classification report for TEST DIA (Overlap):", cr_report_dia_overlap)

        if args.save_cr:
            # BUGFIX: the original used the undefined name `batch_size`
            # (NameError) — it must be args.batch_size. Files are now closed
            # deterministically via context managers.
            prefix = args.output_path + 'classification_report_' + lm + str(args.batch_size)
            with open(prefix + '_test_syn_exact.txt', 'wb') as f:
                pickle.dump(cr_report_syn, f)
            with open(prefix + '_test_dia_exact.txt', 'wb') as f:
                pickle.dump(cr_report_dia, f)
            with open(prefix + '_test_syn_overlap.txt', 'wb') as f:
                pickle.dump(cr_report_syn_overlap, f)
            with open(prefix + '_test_dia_overlap.txt', 'wb') as f:
                pickle.dump(cr_report_dia_overlap, f)
def main():
    """
    Conduct Subtask A (relevance) or Subtask B (document-level sentiment)
    of GermEval 2017 with a pre-trained language model.

    Parameters are parsed with argparse. The language model should be
    suitable for German, e.g.:
    'bert-base-multilingual-uncased', 'bert-base-multilingual-cased',
    'bert-base-german-cased', 'bert-base-german-dbmdz-cased',
    'bert-base-german-dbmdz-uncased', 'distilbert-base-german-cased',
    'distilbert-base-multilingual-cased'.
    """
    ############################ variable settings #################################
    parser = argparse.ArgumentParser(description='Run Subtask A or B of GermEval 2017 Using Pre-Trained Language Model.')
    # BUGFIX: restrict --task to the two supported values; previously any other
    # string was accepted and caused a NameError (undefined `labels`) later on.
    parser.add_argument('--task', type=str, default='A', choices=['A', 'B'],
                        help="The task you want to conduct ('A' or 'B').")
    parser.add_argument('--seed', type=int, default=42, help='Random seed.')
    parser.add_argument('--lang_model', type=str, default='bert-base-german-dbmdz-uncased',
                        help='The pre-trained language model.')
    parser.add_argument('--epochs', type=int, default=4, help='Number of epochs for training.')
    parser.add_argument('--lr', type=float, default=5e-5, help='The learning rate.')
    parser.add_argument('--max_len', type=int, default=256,
                        help='The maximum sequence length of the input text.')
    parser.add_argument('--batch_size', type=int, default=32, help='Your train set batch size.')
    parser.add_argument('--df_path', type=str, default='./data/', help='The data directory.')
    parser.add_argument('--train_data', type=str, default='train_df.tsv',
                        help='The filename of the input train data.')
    parser.add_argument('--dev_data', type=str, default='dev_df.tsv',
                        help='The filename of the input development data.')
    parser.add_argument('--test_data1', type=str, default='test_syn_df.tsv',
                        help='The filename of the first input test data (synchronic).')
    parser.add_argument('--test_data2', type=str, default='test_dia_df.tsv',
                        help='The filename of the second input test data (diachronic).')
    parser.add_argument('--output_path', type=str, default='./output/subtaskA/',
                        help='The output directory of the model and predictions.')
    # NOTE(review): default=True with action="store_true" means these flags can
    # never be switched off from the CLI; kept for backward compatibility.
    parser.add_argument("--train", default=True, action="store_true", help="Flag for training.")
    parser.add_argument("--save_prediction", default=True, action="store_true",
                        help="Flag for saving predictions.")
    args = parser.parse_args()
    ################################################################################

    set_all_seeds(args.seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # Load data
    train_df = pd.read_csv(args.df_path + args.train_data, delimiter='\t')
    dev_df = pd.read_csv(args.df_path + args.dev_data, delimiter='\t')
    test_syn_df = pd.read_csv(args.df_path + args.test_data1, delimiter='\t')
    test_syn_df = test_syn_df.dropna(subset=["text"])  # synchronic set has rows without text
    test_dia_df = pd.read_csv(args.df_path + args.test_data2, delimiter='\t')

    # Create a tokenizer
    lower_case = False
    if args.lang_model[-7:] == "uncased":
        lower_case = True
    if args.lang_model[:4] == "bert":
        model_class = "BERT"
        tokenizer = BertTokenizer.from_pretrained(args.lang_model,
                                                  do_lower_case=lower_case,
                                                  max_length=args.max_len)
    if args.lang_model[:10] == "distilbert":
        model_class = "DistilBERT"
        tokenizer = DistilBertTokenizer.from_pretrained(args.lang_model,
                                                        do_lower_case=lower_case,
                                                        max_length=args.max_len)

    # get training features; train + dev are encoded together and split later
    df = pd.concat([train_df, dev_df])
    sentences = df.text.values
    sentences_syn = test_syn_df.text.values
    sentences_dia = test_dia_df.text.values

    if args.task == 'A':
        # relevance: binary labels False/True mapped to 0/1
        class_list = [False, True]
        df['relevance_label'] = df.apply(lambda x: class_list.index(x['relevance']), axis=1)
        labels = df.relevance_label.values
        test_syn_df['relevance_label'] = test_syn_df.apply(
            lambda x: class_list.index(x['relevance']), axis=1)
        labels_syn = test_syn_df.relevance_label.values
        test_dia_df['relevance_label'] = test_dia_df.apply(
            lambda x: class_list.index(x['relevance']), axis=1)
        labels_dia = test_dia_df.relevance_label.values
    if args.task == 'B':
        # sentiment: three classes mapped to 0/1/2
        class_list = ["negative", "neutral", "positive"]
        df['sentiment_label'] = df.apply(lambda x: class_list.index(x['sentiment']), axis=1)
        labels = df.sentiment_label.values
        test_syn_df['sentiment_label'] = test_syn_df.apply(
            lambda x: class_list.index(x['sentiment']), axis=1)
        labels_syn = test_syn_df.sentiment_label.values
        test_dia_df['sentiment_label'] = test_dia_df.apply(
            lambda x: class_list.index(x['sentiment']), axis=1)
        labels_dia = test_dia_df.sentiment_label.values
    # NOTE(review): assumes every class occurs in train+dev; otherwise the
    # model would be created with too few output labels.
    num_labels = len(set(labels))

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids = [tokenizer.encode(sent, add_special_tokens=True, truncation=True,
                                  max_length=args.max_len) for sent in sentences]
    input_ids = pad_sequences(input_ids, maxlen=args.max_len, dtype="long",
                              value=0.0, truncating="post", padding="post")
    # Create attention masks (1 for real tokens, 0 for padding)
    attention_masks = [[int(token_id > 0) for token_id in sent] for sent in input_ids]

    # synchronic test data
    # CONSISTENCY FIX: pass max_length here as well (the train encode already
    # did), so over-long inputs are truncated identically for all splits.
    input_ids_syn = [tokenizer.encode(sent, add_special_tokens=True, truncation=True,
                                      max_length=args.max_len) for sent in sentences_syn]
    input_ids_syn = pad_sequences(input_ids_syn, maxlen=args.max_len, dtype="long",
                                  value=0.0, truncating="post", padding="post")
    attention_masks_syn = [[int(token_id > 0) for token_id in sent] for sent in input_ids_syn]

    # diachronic test data
    input_ids_dia = [tokenizer.encode(sent, add_special_tokens=True, truncation=True,
                                      max_length=args.max_len) for sent in sentences_dia]
    input_ids_dia = pad_sequences(input_ids_dia, maxlen=args.max_len, dtype="long",
                                  value=0.0, truncating="post", padding="post")
    attention_masks_dia = [[int(token_id > 0) for token_id in sent] for sent in input_ids_dia]

    # split train, dev
    train_inputs, train_labels, dev_inputs, dev_labels, train_masks, dev_masks = split_train_dev(
        train_df, dev_df, attention_masks, input_ids, labels)

    # transform to torch tensors
    train_inputs = torch.tensor(train_inputs)
    dev_inputs = torch.tensor(dev_inputs)
    train_labels = torch.tensor(train_labels)
    dev_labels = torch.tensor(dev_labels)
    train_masks = torch.tensor(train_masks)
    dev_masks = torch.tensor(dev_masks)
    test_syn_inputs = torch.tensor(input_ids_syn)
    test_syn_labels = torch.tensor(labels_syn)
    test_syn_masks = torch.tensor(attention_masks_syn)
    test_dia_inputs = torch.tensor(input_ids_dia)
    test_dia_labels = torch.tensor(labels_dia)
    test_dia_masks = torch.tensor(attention_masks_dia)

    # Create the DataLoaders
    train_dataloader = create_dataloader(train_inputs, train_masks, train_labels,
                                         args.batch_size, train=True)
    dev_dataloader = create_dataloader(dev_inputs, dev_masks, dev_labels,
                                       args.batch_size, train=False)
    test_syn_dataloader = create_dataloader(test_syn_inputs, test_syn_masks, test_syn_labels,
                                            args.batch_size, train=False)
    test_dia_dataloader = create_dataloader(test_dia_inputs, test_dia_masks, test_dia_labels,
                                            args.batch_size, train=False)

    # Create model
    if args.train:
        if model_class == "BERT":
            config = BertConfig.from_pretrained(args.lang_model, num_labels=num_labels)
            config.hidden_dropout_prob = 0.1
            model = BertForSequenceClassification.from_pretrained(
                args.lang_model,
                num_labels=num_labels,
                output_attentions=False,
                output_hidden_states=False
            )
        if model_class == "DistilBERT":
            config = DistilBertConfig.from_pretrained(args.lang_model, num_labels=num_labels)
            config.hidden_dropout_prob = 0.1
            model = DistilBertForSequenceClassification.from_pretrained(
                args.lang_model,
                num_labels=num_labels,
                output_attentions=False,
                output_hidden_states=False
            )
        model.cuda()

        # Create an optimizer: no weight decay for bias/LayerNorm parameters
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=1e-8)

        # Total number of training steps = number of batches * number of epochs
        total_steps = len(train_dataloader) * args.epochs
        # Create the learning rate scheduler
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                                    num_training_steps=total_steps)

        # train model
        # Main Loop
        print("=================== Train ================")
        print("##### Language Model:", args.lang_model, ",", "learning rate:", args.lr)
        print()
        track_time = time.time()
        # trange is a tqdm wrapper around the normal python range
        for epoch in trange(args.epochs, desc="Epoch"):
            print("Epoch: %4i" % epoch, dt.datetime.now())
            model, optimizer, scheduler, tr_loss = train(
                train_dataloader, model=model, device=device,
                optimizer=optimizer, scheduler=scheduler)
            # EVALUATION: TRAIN SET
            true_bools_train, pred_bools_train, f1_train = eval(
                train_dataloader, model=model, device=device)
            print("TRAIN: micro F1 %.4f" % (f1_train))  # here: same as accuracy
            print(confusion_matrix(true_bools_train, pred_bools_train))
            # EVALUATION: DEV SET
            true_bools_dev, pred_bools_dev, f1_dev = eval(
                dev_dataloader, model=model, device=device)
            print("EVAL: micro F1 %.4f" % (f1_dev))
            print(confusion_matrix(true_bools_dev, pred_bools_dev))
        print(" Training and validation took in total: {:}".format(format_time(time.time() - track_time)))

        # EVALUATION: TEST SYN SET
        true_bools_syn, pred_bools_syn, f1_test_syn = eval(
            test_syn_dataloader, model=model, device=device)
        print("TEST SYN: micro F1 %.4f" % (f1_test_syn))
        print(confusion_matrix(true_bools_syn, pred_bools_syn))
        # EVALUATION: TEST DIA SET
        true_bools_dia, pred_bools_dia, f1_test_dia = eval(
            test_dia_dataloader, model=model, device=device)
        print("TEST DIA: micro F1 %.4f" % (f1_test_dia))
        print(confusion_matrix(true_bools_dia, pred_bools_dia))

        if args.save_prediction:
            if args.task == 'A':
                test_syn_df["relevance_pred"] = pred_bools_syn
                test_dia_df["relevance_pred"] = pred_bools_dia
            if args.task == 'B':
                test_syn_df["sentiment_pred"] = pred_bools_syn
                test_dia_df["sentiment_pred"] = pred_bools_dia
            test_syn_df.to_csv(args.output_path + args.lang_model + "_eval_test_syn.tsv",
                               sep="\t", index=False, header=True, encoding="utf-8-sig")
            test_dia_df.to_csv(args.output_path + args.lang_model + "_eval_test_dia.tsv",
                               sep="\t", index=False, header=True, encoding="utf-8-sig")
# Set cudnn flags to True for speed increase, but lose reproducibility if torch.cuda.is_available(): import torch.backends.cudnn as cudnn cudnn.enabled = False cudnn.benchmark = False # Loading and processing the data print('Loading the data') train_input, train_target, test_input, test_target = load_eeg_data( feature_dim_last=True, standardize=True, one_khz=True) train_input, train_target = augment_dataset(train_input, train_target, 0.1, 15) dset_loaders, dset_sizes = create_dataloader(train_input, train_target, test_input, test_target, batch_size=64) # Defining the model model = CNN_Model(train_input.shape[1], kernel_sizes=[3, 5, 7], conv_channels=[28, 32, 16, 1], dropout=0.1) criterion = torch.nn.CrossEntropyLoss() learning_rate = 1e-3 weight_decay = 1e-4 # L2 regularizer parameter optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
def main():
    """
    Conduct Subtask C of GermEval 2017 (multilabel aspect-category
    classification) with a pre-trained language model.

    Parameters are parsed with argparse. The language model should be
    suitable for German, e.g.:
    'bert-base-multilingual-uncased', 'bert-base-multilingual-cased',
    'bert-base-german-cased', 'bert-base-german-dbmdz-cased',
    'bert-base-german-dbmdz-uncased', 'distilbert-base-german-cased',
    'distilbert-base-multilingual-cased'.
    """
    ############################ variable settings #################################
    parser = argparse.ArgumentParser(
        description='Run Subtask C of GermEval 2017 Using Pre-Trained Language Model.')
    parser.add_argument('--seed', type=int, default=42, help='Random seed.')
    parser.add_argument('--lang_model', type=str, default='bert-base-german-dbmdz-uncased',
                        help='The pre-trained language model.')
    parser.add_argument('--epochs', type=int, default=4, help='Number of epochs for training.')
    parser.add_argument('--lr', type=float, default=5e-5, help='The learning rate.')
    parser.add_argument('--max_len', type=int, default=256,
                        help='The maximum sequence length of the input text.')
    parser.add_argument('--batch_size', type=int, default=32, help='Your train set batch size.')
    parser.add_argument('--df_path', type=str, default='./data/', help='The data directory.')
    parser.add_argument('--train_data', type=str, default='train_df_cat.tsv',
                        help='The filename of the input train data.')
    parser.add_argument('--dev_data', type=str, default='dev_df_cat.tsv',
                        help='The filename of the input development data.')
    parser.add_argument('--test_data1', type=str, default='test_syn_df_cat.tsv',
                        help='The filename of the first input test data (synchronic).')
    parser.add_argument('--test_data2', type=str, default='test_dia_df_cat.tsv',
                        help='The filename of the second input test data (diachronic).')
    parser.add_argument('--output_path', type=str, default='./output/subtaskC/',
                        help='The output directory of the model and predictions.')
    # NOTE(review): default=True with action="store_true" means --train can
    # never be switched off from the CLI; kept for backward compatibility.
    parser.add_argument("--train", default=True, action="store_true", help="Flag for training.")
    parser.add_argument("--save_prediction", default=False, action="store_true",
                        help="Flag for saving predictions.")
    parser.add_argument("--save_cr", default=False, action="store_true",
                        help="Flag for saving confusion matrix.")
    parser.add_argument("--exclude_general", default=False, action="store_true",
                        help="Flag for excluding category Allgemein.")
    parser.add_argument("--exclude_neutral", default=False, action="store_true",
                        help="Flag for excluding neutral polarity.")
    parser.add_argument("--exclude_general_neutral", default=False, action="store_true",
                        help="Flag for excluding category Allgemein:neutral.")
    args = parser.parse_args()
    ################################################################################

    set_all_seeds(args.seed)
    device, n_gpu = initialize_device_settings(use_cuda=True)

    # Load data
    train_df = pd.read_csv(args.df_path + args.train_data, delimiter='\t')
    dev_df = pd.read_csv(args.df_path + args.dev_data, delimiter='\t')
    test_syn_df = pd.read_csv(args.df_path + args.test_data1, delimiter='\t')
    test_dia_df = pd.read_csv(args.df_path + args.test_data2, delimiter='\t')

    # Create a tokenizer
    lower_case = False
    if args.lang_model[-7:] == "uncased":
        lower_case = True
    if args.lang_model[:4] == "bert":
        model_class = "BERT"
        tokenizer = BertTokenizer.from_pretrained(args.lang_model,
                                                  do_lower_case=lower_case,
                                                  max_length=args.max_len)
    if args.lang_model[:10] == "distilbert":
        model_class = "DistilBERT"
        tokenizer = DistilBertTokenizer.from_pretrained(
            args.lang_model, do_lower_case=lower_case, max_length=args.max_len)

    # get training features: category columns start at column 5
    cats = train_df.columns[5:]
    end = "full"  # suffix for output filenames, reflects exclusion settings
    # exclude categories if required (the last matching flag wins for `end`)
    if (args.exclude_general):
        cats = [i for i in list(cats) if "Allgemein" not in i]
        end = "excl_gen"
    if (args.exclude_neutral):
        cats = [i for i in list(cats) if "neutral" not in i]
        end = "excl_neu"
    if (args.exclude_general_neutral):
        cats = [i for i in list(cats) if "Allgemein:neutral" not in i]
        end = "excl_genneu"
    num_labels = len(list(cats))

    # create one-hot label vectors over the selected category columns
    train_df['one_hot_labels'] = list(train_df[list(cats)].values)
    dev_df['one_hot_labels'] = list(dev_df[list(cats)].values)
    test_syn_df['one_hot_labels'] = list(test_syn_df[list(cats)].values)
    test_dia_df['one_hot_labels'] = list(test_dia_df[list(cats)].values)

    # retrieve sentences and labels; train + dev are encoded together, split later
    df = pd.concat([train_df, dev_df])
    sentences = df.text.values
    labels = list(df.one_hot_labels.values)
    sentences_syn = test_syn_df.text.values
    labels_syn = list(test_syn_df.one_hot_labels.values)
    sentences_dia = test_dia_df.text.values
    labels_dia = list(test_dia_df.one_hot_labels.values)
    print("number of categories:", len(list(cats)))

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids = [
        tokenizer.encode(sent, add_special_tokens=True, truncation=True,
                         max_length=args.max_len) for sent in sentences
    ]
    input_ids = pad_sequences(input_ids, maxlen=args.max_len, dtype="long",
                              value=0.0, truncating="post", padding="post")
    # Create attention masks (1 for real tokens, 0 for padding)
    attention_masks = [[int(token_id > 0) for token_id in sent] for sent in input_ids]

    # synchronic test data
    # CONSISTENCY FIX: pass max_length here as well (the train encode already
    # did), so over-long inputs are truncated identically for all splits.
    input_ids_syn = [
        tokenizer.encode(sent, add_special_tokens=True, truncation=True,
                         max_length=args.max_len) for sent in sentences_syn
    ]
    input_ids_syn = pad_sequences(input_ids_syn, maxlen=args.max_len, dtype="long",
                                  value=0.0, truncating="post", padding="post")
    attention_masks_syn = [[int(token_id > 0) for token_id in sent] for sent in input_ids_syn]

    # diachronic test data
    input_ids_dia = [
        tokenizer.encode(sent, add_special_tokens=True, truncation=True,
                         max_length=args.max_len) for sent in sentences_dia
    ]
    input_ids_dia = pad_sequences(input_ids_dia, maxlen=args.max_len, dtype="long",
                                  value=0.0, truncating="post", padding="post")
    attention_masks_dia = [[int(token_id > 0) for token_id in sent] for sent in input_ids_dia]

    # split train, dev
    train_inputs, train_labels, dev_inputs, dev_labels, train_masks, dev_masks = split_train_dev(
        train_df, dev_df, attention_masks, input_ids, labels)

    # transform to torch tensors
    train_inputs = torch.tensor(train_inputs)
    dev_inputs = torch.tensor(dev_inputs)
    train_labels = torch.tensor(train_labels)
    dev_labels = torch.tensor(dev_labels)
    train_masks = torch.tensor(train_masks)
    dev_masks = torch.tensor(dev_masks)
    test_syn_inputs = torch.tensor(input_ids_syn)
    test_syn_masks = torch.tensor(attention_masks_syn)
    test_syn_labels = torch.tensor(labels_syn)
    test_dia_inputs = torch.tensor(input_ids_dia)
    test_dia_masks = torch.tensor(attention_masks_dia)
    test_dia_labels = torch.tensor(labels_dia)

    # Create the DataLoaders
    train_dataloader = create_dataloader(train_inputs, train_masks, train_labels,
                                         args.batch_size, train=True)
    dev_dataloader = create_dataloader(dev_inputs, dev_masks, dev_labels,
                                       args.batch_size, train=False)
    test_syn_dataloader = create_dataloader(test_syn_inputs, test_syn_masks, test_syn_labels,
                                            args.batch_size, train=False)
    test_dia_dataloader = create_dataloader(test_dia_inputs, test_dia_masks, test_dia_labels,
                                            args.batch_size, train=False)

    # Create model
    if args.train:
        if model_class == "BERT":
            config = BertConfig.from_pretrained(args.lang_model, num_labels=num_labels)
            config.hidden_dropout_prob = 0.1
            model = BertForSequenceClassification.from_pretrained(
                args.lang_model,
                num_labels=num_labels,
                output_attentions=False,
                output_hidden_states=False)
        if model_class == "DistilBERT":
            config = DistilBertConfig.from_pretrained(args.lang_model, num_labels=num_labels)
            config.hidden_dropout_prob = 0.1
            model = DistilBertForSequenceClassification.from_pretrained(
                args.lang_model,
                num_labels=num_labels,
                output_attentions=False,
                output_hidden_states=False)
        model.cuda()

        # Create an optimizer: no weight decay for bias/LayerNorm parameters
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=1e-8)

        # Total number of training steps = number of batches * number of epochs
        total_steps = len(train_dataloader) * args.epochs
        # Create the learning rate scheduler
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=0, num_training_steps=total_steps)

        # train model
        # Main Loop
        print("=================== Train ================")
        print("##### Language Model:", args.lang_model, ",", "learning rate:", args.lr)
        print()
        track_time = time.time()
        # trange is a tqdm wrapper around the normal python range
        for epoch in trange(args.epochs, desc="Epoch"):
            print("Epoch: %4i" % epoch, dt.datetime.now())
            model, optimizer, scheduler, tr_loss = train_multilabel(
                train_dataloader=train_dataloader, model=model, device=device,
                optimizer=optimizer, scheduler=scheduler, num_labels=num_labels)
            # EVALUATION: TRAIN SET
            pred_bools_train, true_bools_train, f1_train = eval_multilabel(
                train_dataloader, model=model, device=device)
            print("TRAIN: micro F1 %.3f" % (f1_train))
            # EVALUATION: DEV SET
            pred_bools_dev, true_bools_dev, f1_dev = eval_multilabel(
                dev_dataloader, model=model, device=device)
            print("EVAL: micro F1 %.3f" % (f1_dev))
        print(" Training and validation took in total: {:}".format(
            format_time(time.time() - track_time)))

        # EVALUATION: TEST SYN SET
        pred_bools_syn, true_bools_syn, f1_test_syn = eval_multilabel(
            test_syn_dataloader, model=model, device=device)
        print("TEST SYN: micro F1 %.4f" % (f1_test_syn))
        # classification report
        clf_report_syn = classification_report(true_bools_syn, pred_bools_syn,
                                               target_names=cats, digits=3)
        print(clf_report_syn)

        # EVALUATION: TEST DIA SET
        pred_bools_dia, true_bools_dia, f1_test_dia = eval_multilabel(
            test_dia_dataloader, model=model, device=device)
        print("TEST DIA: micro F1 %.4f" % (f1_test_dia))
        # classification report
        clf_report_dia = classification_report(true_bools_dia, pred_bools_dia,
                                               target_names=cats, digits=3)
        print(clf_report_dia)

        if args.save_cr:
            # FIX: close report files deterministically (the original leaked
            # the file handles returned by the inline open() calls).
            with open(args.output_path + 'clf_report_' + args.lang_model +
                      '_test_syn_' + str(num_labels) + end + '.txt', 'wb') as f:
                pickle.dump(clf_report_syn, f)
            with open(args.output_path + 'clf_report_' + args.lang_model +
                      '_test_dia_' + str(num_labels) + end + '.txt', 'wb') as f:
                pickle.dump(clf_report_dia, f)

        if args.save_prediction:
            test_syn_df["category_pred"] = pred_bools_syn
            test_dia_df["category_pred"] = pred_bools_dia
            test_syn_df.category_pred.to_csv(args.output_path + args.lang_model +
                                             '_test_syn_' + str(num_labels) + end + ".tsv",
                                             sep="\t", index=False, header=True,
                                             encoding="utf-8-sig")
            test_dia_df.category_pred.to_csv(args.output_path + args.lang_model +
                                             '_test_dia_' + str(num_labels) + end + ".tsv",
                                             sep="\t", index=False, header=True,
                                             encoding="utf-8-sig")