## huggingface

```python
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForTokenClassification,
    DistilBertForSequenceClassification,
)
import torch

distil_bert = 'distilbert-base-cased'

tokenizer = DistilBertTokenizerFast.from_pretrained(
    distil_bert,
    do_lower_case=False,
    add_special_tokens=True,
    max_length=256,
    pad_to_max_length=True)
token_clf = DistilBertForTokenClassification.from_pretrained(distil_bert)
sequence_clf = DistilBertForSequenceClassification.from_pretrained(distil_bert)

sentence = 'Apple and Microsoft plan to form a joint venture for the development of cloud-based computing ' \
           'infrastructure.'

input_ids = torch.tensor(
    tokenizer.encode("Hello, my dog is cute",
                     add_special_tokens=True)).unsqueeze(0)  # Batch size 1
outputs = token_clf(input_ids)  # was `model(input_ids)`; `model` was never defined
last_hidden_states = outputs[0]

test = db.sample(n=10)  # `db` is assumed to be a pandas DataFrame with a `text` column
# encode_plus returns a dict, so it must be unpacked (and returned as tensors) for the model call
token_clf(**tokenizer.encode_plus(sentence, return_tensors='pt'))
tokenizer.batch_encode_plus(test.text.to_list())
```

## spacy

```python
def get_sequences_with_2_orgs(text, dist=150):
    '''
    Uses spacy NER to identify organisations. If two organizations are
    detected within dist tokens from each other, extracts the sequence
    '''
    # Apply the model
```
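The function body above is truncated. A hedged sketch of what the docstring describes, assuming a loaded spacy pipeline (`en_core_web_sm` is an assumption, as is the helper name):

```python
import spacy

nlp = spacy.load('en_core_web_sm')  # assumed pipeline with an NER component

def get_sequences_with_2_orgs_sketch(text, dist=150):
    # Apply the model
    doc = nlp(text)
    orgs = [ent for ent in doc.ents if ent.label_ == 'ORG']
    # If two ORG mentions lie within `dist` tokens of each other,
    # return the text span covering both of them
    for a, b in zip(orgs, orgs[1:]):
        if b.start - a.end < dist:
            return doc[a.start:b.end].text
    return None
```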
```python
class SentimentClassifier(nn.Module):  # wrapper class line reconstructed from the super() call
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        # NOTE: num_labels is hard-coded to 3 in the original; the n_classes argument is unused.
        self.bert = DistilBertForSequenceClassification.from_pretrained(
            "distilbert-base-uncased", num_labels=3)
```
```python
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast,
    Trainer,
    TrainingArguments,
)
from nlp import load_dataset
import torch
import numpy as np
import random

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')


def tokenize(batch):
    return tokenizer(batch['text'],
                     truncation=True,
                     max_length=256,
                     add_special_tokens=True,
                     padding='max_length',
                     return_attention_mask=True)


train_dataset = load_dataset(
    'json',
    data_files={'train': 'dataset_full_question/quanta_train.json'},
    field='questions')['train']
train_dataset = train_dataset.map(
    lambda example: {'label': [0 if example['difficulty'] == 'School' else 1]})
train_dataset = train_dataset.map(tokenize,
                                  batched=True,
                                  batch_size=len(train_dataset))
train_dataset.set_format('torch',
                         columns=['input_ids', 'attention_mask', 'label'])  # columns assumed; the source was truncated here
```
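The snippet imports `Trainer` and `TrainingArguments` but never uses them. A hedged sketch of how training might proceed from here (the output directory and hyperparameters are assumptions; argument names follow the stable `Trainer` API):

```python
training_args = TrainingArguments(
    output_dir='./results',            # assumed output directory
    num_train_epochs=3,                # assumed
    per_device_train_batch_size=16,    # assumed
    logging_steps=50,
)
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)
trainer.train()
```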
```python
alpha = 0.1  # smoothing parameter for true label
# /PARAMETERS

# create log file
data_folder = '../../data/from-figure-eight/balanced-test-data/tobert/'
res_path = '../../res/'
res_path += logfile_name
with open(res_path, 'w') as f:
    c = 'epoch, iter, loss_train, loss_val, pre_val, rec_val, f01_val, f1_val, f10_val, ece_val'
    f.write(c + '\n')

# configure DistilBERT model
config = DistilBertConfig.from_pretrained('distilbert-base-cased')
config.num_labels = num_labels
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertForSequenceClassification(config)

# load model to GPU if available
if torch.cuda.is_available():
    model = model.cuda()

# load datasets
train_dataset = pd.read_csv(data_folder + train_file)
val_dataset = pd.read_csv(data_folder + val_file)
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("VAL Dataset: {}".format(val_dataset.shape))
training_set = DataLoaderSmoothing(train_dataset, alpha)
validating_set = DataLoaderHard(val_dataset)

# initialize batch sampler
target = train_dataset.crowd_label.values
print('target train 0/1: {}/{}'.format(len(np.where(target == 0)[0]),
                                       len(np.where(target == 1)[0])))  # second count assumed; source truncated
```
```python
# If there's a GPU available...
if torch.cuda.is_available():
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

model = DistilBertForSequenceClassification.from_pretrained(
    '/app/incivility_project/models/distilbert_5000_03-06-20')
# config = BertConfig.from_json_file('../models/bert_classifier_2epoch_256size/config.json')
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model.to(device)  # was model.cuda(), which would fail on a CPU-only machine

# load comments and labels from the input tsv
comments, labels = load_data.get_data(sys.argv[1])

# encode inputs using the DistilBERT tokenizer
input_ids = []
for comment in comments:
    encoded_comment = tokenizer.encode(comment,
                                       add_special_tokens=True,
                                       max_length=256,
                                       pad_to_max_length=True)
    input_ids.append(encoded_comment)
```
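The padded ids still need attention masks before batching. A short continuation, following the same mask-building pattern used by the other scripts in this collection (pad id 0 masked out):

```python
# 0 is DistilBERT's [PAD] id, so mask out padded positions
attention_masks = [[int(tok_id > 0) for tok_id in ids] for ids in input_ids]
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)
```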
```python
from pathlib import Path  # needed for Path(__file__) below

from transformers import (
    DistilBertForSequenceClassification,
    AutoTokenizer,
)

from nlp_model.labels import technical_competency_labels

# TODO: Auto set these up if they don't already exist.
cache_dir = (str(Path(__file__).parent) + "/cache").replace("\\", "/")
saved_model_dir = (str(Path(__file__).parent) + "/results").replace("\\", "/")
max_tokens_count = 512

# DistilBert was chosen based on its speed and lightweight footprint.
# Future work could explore different models.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased",
    cache_dir=cache_dir,
    max_character_length=max_tokens_count,  # NOTE: not a standard tokenizer kwarg; kept from the original
)
# Change "distilbert-base-uncased" to saved_model_dir after running the training script once.
# Unfortunately the pre-trained model cannot be uploaded to git due to the file size.
# Will find a better workaround later.
# TODO: experiment with a linear-regression output (e.g. estimated years of experience) over ALL
# questions as input. Using SequenceClassification over ONE question as input for now because
# it's well supported out of the box.
technical_competency_classifier = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(technical_competency_labels),
    cache_dir=cache_dir,
)
```
```python
def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.tokenizer = DistilBertTokenizer.from_pretrained(
        'distilbert-base-uncased')
    self.model = DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased-finetuned-sst-2-english')
```
```python
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Export bert onnx model',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--input_dir', type=str,
                        help='input_dir of bert model, must contain config.json')
    parser.add_argument('--task_name', type=str, choices=["MRPC", "MNLI"],
                        help='task name of the bert model')
    parser.add_argument('--max_len', type=int, default=128,
                        help='Maximum length of the sentence pairs')
    # NOTE: type=bool is a known argparse pitfall (bool("False") is True),
    # so any non-empty value passed on the command line evaluates to True.
    parser.add_argument('--do_lower_case', type=bool, default=True,
                        help='whether to lowercase in the tokenizer')
    parser.add_argument('--output_model', type=str, default='bert.onnx',
                        help='path to exported model file')
    args = parser.parse_args()

    model = DistilBertForSequenceClassification.from_pretrained(args.input_dir)
    export_onnx_model(args, model, args.output_model)
```
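`export_onnx_model` is defined elsewhere in that project. A minimal sketch of what it plausibly does with `torch.onnx.export` (the dynamic axes, dummy inputs, and opset version are assumptions, not the project's actual implementation):

```python
import torch

def export_onnx_model(args, model, onnx_model_path):
    model.eval()
    # dummy token ids and attention mask of shape (1, max_len)
    dummy = torch.ones(1, args.max_len, dtype=torch.int64)
    torch.onnx.export(
        model, (dummy, dummy), onnx_model_path,
        input_names=['input_ids', 'attention_mask'],
        output_names=['logits'],
        dynamic_axes={'input_ids': {0: 'batch'},
                      'attention_mask': {0: 'batch'},
                      'logits': {0: 'batch'}},
        opset_version=11)  # assumed opset
```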
```python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse

from transformers import pipeline, DistilBertForSequenceClassification, DistilBertTokenizerFast

parser = argparse.ArgumentParser(description='Sentiment Analysis')
parser.add_argument('text', help='tweet text')
args = parser.parse_args()

# Sentiment classes
LABELS = {'LABEL_0': 'NEGATIVE', 'LABEL_1': 'POSITIVE'}

model = pipeline(
    'sentiment-analysis',
    model=DistilBertForSequenceClassification.from_pretrained("model"),
    tokenizer=DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased'))

result = model(args.text)
sentiment = LABELS[result[0].get('label')]
score = result[0].get('score')

if __name__ == "__main__":
    print(
        '\n' +
        f'The sentiment for the text `{args.text}` is {sentiment} with a probability of {round(score, 5)}.'
    )
```
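A sample invocation, assuming the script is saved as `sentiment.py` (the score value is illustrative, not a real output):

```
$ python sentiment.py "I love you"
The sentiment for the text `I love you` is POSITIVE with a probability of 0.99123.
```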
```python
from datetime import datetime

import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import AdamW, DistilBertForSequenceClassification, DistilBertTokenizer

# read_imdb, IMDbDataset and device are defined elsewhere in this project.


def main(proj_root_dir, epochs: int = 3):
    print("Begin fine-tune for IMDB sentiment")
    torch.manual_seed(1)
    np.random.seed(1)

    # Load raw IMDB train data into memory
    print("\nLoading IMDB train data subset into memory...")
    train_reviews, train_labels = read_imdb(f"{proj_root_dir}/train")
    # consider creating a validation set here

    # Tokenize the raw review text
    print("\nTokenizing training text...")
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    train_tokens = tokenizer(train_reviews, truncation=True, padding=True)  # token IDs and mask

    # Load tokenized text and labels into a PyTorch Dataset
    print("\nLoading tokenized text into PyTorch Dataset")
    train_dataset = IMDbDataset(train_tokens, train_labels)

    # Load (possibly cached) pretrained HF model
    print("\nLoading pre-trained DistilBERT model")
    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
    model.to(device)
    model.train()  # set to training mode

    # Fine-tune / train model using standard PyTorch
    print("Loading Dataset with batch_size: 10")
    train_loader = DataLoader(train_dataset, batch_size=10, shuffle=True)

    print(f"\nFine-tuning the model. It's now {datetime.now()}")
    optim = AdamW(model.parameters(), lr=5.0e-5)  # includes weight decay
    for epoch in range(epochs):
        epoch_loss = 0.0
        for (batch_idx, batch) in enumerate(train_loader):
            optim.zero_grad()
            # move the batch tensors to the model's device (the original left them on CPU)
            input_ids = batch["input_ids"].to(device)
            attn_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(input_ids, attention_mask=attn_mask, labels=labels)
            loss = outputs[0]
            epoch_loss += loss.item()  # accumulate batch loss
            loss.backward()
            optim.step()
            if batch_idx % 20 == 0:
                print("batch_idx: %5d, curr batch loss: %0.4f. It is now: %s" %
                      (batch_idx, loss.item(), datetime.now()))
        print("End of epoch no. %4d, epoch loss = %0.4f. Now is %s" %
              (epoch, epoch_loss, datetime.now()))
    print("Training is complete")

    # save trained model weights and biases
    print("\nSaving tuned model state")
    model.eval()
    torch.save(model.state_dict(), f"{proj_root_dir}/models/imdb_state.pt")  # state dict only
    print("\nEnd of demo")
```
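Since only the state dict is saved, reloading requires instantiating the architecture first. A hedged sketch of inference with the saved weights (the path mirrors the save above; the label order 0 = negative, 1 = positive is an assumption):

```python
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
model.load_state_dict(torch.load(f"{proj_root_dir}/models/imdb_state.pt"))
model.eval()

enc = tokenizer("A gripping, beautifully acted film.",
                truncation=True, padding=True, return_tensors="pt")
with torch.no_grad():
    pred = model(**enc)[0].argmax(dim=-1).item()  # assumed: 0 = negative, 1 = positive
```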
```python
val_sampler = DistributedSampler(  # assignment reconstructed; the source was truncated here
    val_data, num_replicas=hvd.size(), rank=hvd.rank())
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

gpu_available = torch.cuda.is_available()
if gpu_available:
    torch.cuda.set_device(hvd.local_rank())

num_labels = len(set(labels))
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_labels,
    output_attentions=False,
    output_hidden_states=False)

lr_scaler = hvd.size()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params': [
        p for n, p in model.named_parameters()
        if not any(nd in n for nd in no_decay)
    ],
    'weight_decay_rate': 0.01  # value and closing bracket assumed; the source was truncated here
}]
```
"distilBertBigram": lambda: DistilBertTokenizer.from_pretrained(DISTILBERT_BIGRAM_TOKENIZER, is_split_into_words=True), "distilBertPOS": lambda: DistilBertTokenizer.from_pretrained(DISTILBERT_POS_TOKENIZER, is_split_into_words=True), "distilBertEmbed": lambda: DistilBertTokenizer.from_pretrained(DISTILBERT_EMBED_TOKENIZER, is_split_into_words=True) } def getDistilbertEmbeds(tokenizerName): model = all_models["distilBert"](64) tokenizer = all_tokenizers[tokenizerName]() model.resize_token_embeddings(len(tokenizer)) return model all_models = { "T5": lambda _: T5ForConditionalGeneration.from_pretrained( "t5-small", num_labels=10 ), "distilBert": lambda _: DistilBertForSequenceClassification.from_pretrained( "distilbert-base-uncased", num_labels=10, ), "distilBertBig": lambda _: DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=10, n_layers=8), "distilBertSmall": lambda _: DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=10, n_layers=2, output_attentions=True, n_heads=4), "distilBertEmbed": lambda _: getDistilbertEmbeds("distilBertEmbed"), "distilBertBigram": lambda _: getDistilbertEmbeds("distilBertBigram"), "distilBertPOS": lambda _: getDistilbertEmbeds("distilBertPOS"), "lstm": lambda bs: newsLSTM(bs), "lstmAttention": lambda bs: lstmAttention(bs), "lstmAttentionBigram": lambda bs: lstmAttention(bs, useBigram=True), "lstmBigram": lambda bs: newsLSTM(bs, useBigram=True) }
```python
def run_model(pos_train_file, neg_train_file, pos_dev_file, neg_dev_file,
              nrows_train, nrows_dev, epochs, out_dir):
    batch_size = 16

    # x_train = _read_data('../data/train_bal.csv', nrows_train)
    # x_dev = _read_data('../data/dev_bal.csv', nrows_dev)
    # train_data = list(zip(x_train['comment_text'].values, x_train['target'].values))
    # train_dataloader = DataLoader(train_data, collate_fn=my_collate,
    #                               batch_size=batch_size, shuffle=True)
    # dev_data = list(zip(x_dev['comment_text'].values, x_dev['target'].values))
    # dev_dataloader = DataLoader(dev_data, collate_fn=my_collate,
    #                             batch_size=batch_size, shuffle=False)
    train_dataloader = get_data_loader_bal(pos_train_file, neg_train_file,
                                           batch_size=batch_size,
                                           nrows_pos=nrows_train,
                                           nrows_neg=nrows_train,
                                           mode='train')
    dev_dataloader = get_data_loader_bal(pos_dev_file, neg_dev_file,
                                         batch_size=batch_size,
                                         nrows_pos=nrows_dev,
                                         nrows_neg=nrows_dev,
                                         mode='dev')

    device = get_device()

    # NOTE: this config is never passed to from_pretrained below, so
    # output_hidden_states = True has no effect as written, and
    # bert_hidden_states is unused.
    bert_hidden_states = 4
    config = DistilBertConfig()
    config.output_hidden_states = True

    model = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",   # the 6-layer distilled model, with an uncased vocab
        num_labels=2,                # 2 output labels for binary classification;
                                     # increase this for multi-class tasks
        output_attentions=False,     # whether the model returns attention weights
        output_hidden_states=False,  # whether the model returns all hidden states
    )
    model = model.to(device)

    optimizer = AdamW(
        model.parameters(),
        lr=2e-5,   # args.learning_rate - default is 5e-5, our notebook had 2e-5
        eps=1e-8   # args.adam_epsilon - default is 1e-8
    )
    total_steps = len(train_dataloader) * epochs

    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # default value in run_glue.py
        num_training_steps=total_steps)

    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    best_score = -np.inf
    stats_vec = []
    for epoch in range(epochs):
        stats = train_epoch(model, train_dataloader, dev_dataloader,
                            optimizer, scheduler)
        print(stats)
        if stats['accuracy'] > best_score:
            best_score = stats['accuracy']
            f = out_dir + '/' + 'best_model_ch.pt'
            torch.save({'epoch': epoch, 'model': model, 'stats': stats}, f)
        stats_vec.append(stats)

    stats_vec = pd.DataFrame(stats_vec)
    f = out_dir + '/' + 'last_model_ch.pt'
    torch.save({'epoch': epoch, 'model': model, 'stats': stats}, f)
    print(stats_vec)
    stats_vec.to_csv(out_dir + '/' + 'stats.csv')
```
```python
# Load a PyTorch checkpoint in TensorFlow:
# model = TFAutoModel.from_pretrained(SAVE_DIR, from_pt=True)
# Load a TensorFlow checkpoint in PyTorch:
# model = AutoModel.from_pretrained(SAVE_DIR, from_tf=True)

# %%
# The model can also be asked to return all hidden states and attention weights:
outputs = model(**inputs, output_hidden_states=True, output_attentions=True)
hidden_states, attentions = outputs[-2:]

# %%
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

# If the architecture is known, instantiate the matching model class directly;
# this has the same effect as using `AutoModel` above.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name)

# %%
# Customizing the model
# Every architecture has a matching configuration class (e.g. DistilBertConfig);
# the model can be altered by changing config parameters such as the hidden
# dimension or the dropout rate.
from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification

# %%
# Core changes (e.g. to the hidden dimension) make the pretrained weights
# unusable, so the model must be trained from scratch.
config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4 * 512)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification(config)

# %%
```
```python
def __init__(self, config):
    # self.name, self.num_classes, epochs, batches
    self.Configs = config
    self.num_classes = len(config.label_list)
    self.train_logits = []
    self.validation_logits = []
    self.test_logits = []
    self.train_texts = []
    self.train_labels = []
    self.validation_texts = []
    self.validation_labels = []
    self.test_texts = []
    self.test_labels = []

    train = pd.read_csv(os.path.join(self.Configs.data_dir, 'train.csv'))
    try:
        dev = pd.read_csv(os.path.join(self.Configs.data_dir, 'dev.csv'))
    except FileNotFoundError:  # was a bare except
        print('Validation disabled.')
    test = pd.read_csv(os.path.join(self.Configs.data_dir, 'test.csv'))

    self.train_texts = train['text'].tolist()
    self.train_labels = train['label'].tolist()
    try:
        self.validation_texts = dev['text'].tolist()
        self.validation_labels = dev['label'].tolist()
    except NameError:  # dev.csv was missing; validation stays empty
        pass
    self.test_texts = test['text'].tolist()
    # test labels are unknown; fill with 0 as a placeholder
    for i in range(len(self.test_texts)):
        self.test_labels.append(0)

    if torch.cuda.is_available():
        self.device = torch.device("cuda")
    else:
        print('No GPU available, using the CPU instead.')
        self.device = torch.device("cpu")

    if self.Configs.model_name == 'bert':
        self.model = BertForSequenceClassification.from_pretrained(
            self.Configs.pretrained_model_dir, num_labels=self.num_classes)
        self.tokenizer = BertTokenizer.from_pretrained(
            self.Configs.pretrained_model_dir)
    elif self.Configs.model_name == 'albert':
        self.model = AlbertForSequenceClassification.from_pretrained(
            self.Configs.pretrained_model_dir, num_labels=self.num_classes)
        self.tokenizer = AlbertTokenizer.from_pretrained(
            self.Configs.pretrained_model_dir)
    elif self.Configs.model_name == 'distilbert':
        self.model = DistilBertForSequenceClassification.from_pretrained(
            self.Configs.pretrained_model_dir, num_labels=self.num_classes)
        self.tokenizer = DistilBertTokenizer.from_pretrained(
            self.Configs.pretrained_model_dir)
    elif self.Configs.model_name == 'roberta':
        self.model = RobertaForSequenceClassification.from_pretrained(
            self.Configs.pretrained_model_dir, num_labels=self.num_classes)
        self.tokenizer = RobertaTokenizer.from_pretrained(
            self.Configs.pretrained_model_dir)

    if torch.cuda.is_available():
        self.model.cuda()
```
```python
# -*- coding: utf-8 -*-
# @Time    : 10/12/19 5:44 PM
# @Author  : hujunchao
# @Email   : [email protected]
# @File    : text_classify_using_distil_bert.py
import torch
from transformers import (DistilBertModel, DistilBertTokenizer,
                          DistilBertConfig, DistilBertForSequenceClassification)

from download_save import save_path_model, save_path_tokenizer

tokenizer = DistilBertTokenizer.from_pretrained(save_path_tokenizer)
model = DistilBertForSequenceClassification.from_pretrained(save_path_model)

# encoded = tokenizer.encode('hello world, my name is Tom')
# encoded = torch.tensor(encoded).unsqueeze(dim=0)

# Dummy batch of 32 sequences of length 512 with one class id per example.
# (The original passed a (1, 1) label tensor for a batch of 32 and also built
# an unused (32, 2) labels tensor; neither matched the batch shape.)
encoded = torch.randint(5000, size=[32, 512])
labels = torch.randint(2, size=[32])
result = model(encoded, labels=labels)
print(result)
```
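The same call on a single real sentence, picking up the commented-out encode lines above (the class id 1 is arbitrary):

```python
encoded = tokenizer.encode('hello world, my name is Tom', return_tensors='pt')  # shape (1, seq_len)
label = torch.tensor([1])                    # one class id per example in the batch
loss, logits = model(encoded, labels=label)[:2]
```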
```python
def main():
    parser = setup_parser()
    args = parser.parse_args()

    # specifies the path where the biobert or clinical bert model is saved
    if args.bert_model == 'biobert' or args.bert_model == 'clinical_bert':
        args.bert_model = args.model_loc
    print(args.bert_model)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "mednli": MedNLIProcessor,
        "goc": GOCProcessor
    }
    num_labels_task = {"cola": 2, "mnli": 3, "mrpc": 2, "mednli": 3, "goc": 2}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which takes care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
            device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                args.gradient_accumulation_steps))
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    # if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
    #     raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    # if not os.path.exists(args.output_dir):
    #     os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = DistilBertTokenizer.from_pretrained(
        args.bert_model, do_lower_case=args.do_lower_case)

    print('TRAIN')
    train = processor.get_train_examples(args.data_dir)
    print([(train[i].text_a, train[i].text_b, train[i].label) for i in range(3)])
    print('DEV')
    dev = processor.get_dev_examples(args.data_dir)
    print([(dev[i].text_a, dev[i].text_b, dev[i].label) for i in range(3)])
    print('TEST')
    test = processor.get_test_examples(args.data_dir)
    print([(test[i].text_a, test[i].text_b, test[i].label) for i in range(3)])

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = (num_train_optimization_steps //
                                            torch.distributed.get_world_size())

    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format(args.local_rank))
    model = DistilBertForSequenceClassification.from_pretrained(
        args.bert_model, cache_dir=cache_dir, num_labels=num_labels)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': 0.01
    }, {
        'params': [p for n, p in param_optimizer
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
    else:
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=float(num_train_optimization_steps) * args.warmup_proportion,
        num_training_steps=num_train_optimization_steps)
    # optimizer = BertAdam(optimizer_grouped_parameters,
    #                      lr=args.learning_rate,
    #                      warmup=args.warmup_proportion,
    #                      t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(train_examples, label_list,
                                                      args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for epoch_num in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, logits, other = model(input_ids=input_ids,
                                            attention_mask=input_mask,
                                            labels=label_ids)
                # print(loss[0].shape)
                # print(loss[1].shape)
                # print(loss[2].shape)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with the special warm-up BERT uses;
                        # when args.fp16 is False, AdamW plus the scheduler handle this.
                        # warmup_linear is assumed to be defined elsewhere
                        # (a legacy pytorch-pretrained-bert helper).
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    global_step += 1
            # Saving checkpoint
            save_checkpoint(model, args.output_dir,
                            "epoch_%d_checkpoint.pth" % epoch_num)

    if args.do_train:
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())
        # Load a trained model and config that you have fine-tuned
        # config = DistilBertConfig(output_config_file)
        model = DistilBertForSequenceClassification.from_pretrained(
            args.output_dir)  # , num_labels=num_labels)
        # model.load_state_dict(torch.load(output_model_file))
    else:
        model = DistilBertForSequenceClassification.from_pretrained(
            args.bert_model)  # , num_labels=num_labels)
    model.to(device)

    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            with torch.no_grad():
                tmp_eval_loss, logits, other = model(input_ids=input_ids,
                                                     attention_mask=input_mask,
                                                     labels=label_ids)
            # logits = model(input_ids, segment_ids, input_mask)
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)
            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1
        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'global_step': global_step,
            'loss': loss
        }
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if args.do_test and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        test_examples = processor.get_test_examples(args.data_dir)
        test_features = convert_examples_to_features(test_examples, label_list,
                                                     args.max_seq_length, tokenizer)
        logger.info("***** Running testing *****")
        logger.info("  Num examples = %d", len(test_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in test_features], dtype=torch.long)
        test_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        # Run prediction for full data
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data, sampler=test_sampler,
                                     batch_size=args.eval_batch_size)

        model.eval()
        test_loss, test_accuracy = 0, 0
        nb_test_steps, nb_test_examples = 0, 0
        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                test_dataloader, desc="Testing"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            with torch.no_grad():
                # tmp_test_loss = model(input_ids, segment_ids, input_mask, label_ids)
                tmp_test_loss, logits, other = model(input_ids=input_ids,
                                                     attention_mask=input_mask,
                                                     labels=label_ids)
            # logits = model(input_ids, segment_ids, input_mask)
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_test_accuracy = accuracy(logits, label_ids)
            test_loss += tmp_test_loss.mean().item()
            test_accuracy += tmp_test_accuracy
            nb_test_examples += input_ids.size(0)
            nb_test_steps += 1
        test_loss = test_loss / nb_test_steps
        test_accuracy = test_accuracy / nb_test_examples
        loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'test_loss': test_loss,
            'test_accuracy': test_accuracy,
            'global_step': global_step,
            'loss': loss
        }
        output_test_file = os.path.join(args.output_dir, "test_results.txt")
        with open(output_test_file, "w") as writer:
            logger.info("***** Test results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
```
```python
def __init__(self):
    super(DistilBertModelTest, self).__init__()
    config = DistilBertConfig.from_pretrained('models/config.json')
    self.distilbert = DistilBertForSequenceClassification(config)  # /bert_pretrain/
    self.device = torch.device("cuda")
```
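Note that `DistilBertForSequenceClassification(config)` builds the architecture with randomly initialized weights. A hedged sketch of loading fine-tuned weights into this test wrapper, continuing the `__init__` above (the checkpoint path is an assumption):

```python
# continuing __init__: load fine-tuned weights into the freshly built architecture
state = torch.load('models/pytorch_model.bin', map_location='cpu')  # assumed checkpoint path
self.distilbert.load_state_dict(state)
self.distilbert.to(self.device).eval()
```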
```python
# (tail of a truncated parser.add_argument call)
                    default=512,
                    help='maximum length handled by the model')
args = parser.parse_args()

usecfg = False
if usecfg:
    from transformers import (
        DistilBertConfig,
        DistilBertForSequenceClassification,
        DistilBertTokenizer,
    )
    config = DistilBertConfig.from_pretrained(args.model_name,
                                              finetuning_task='sentiment3',
                                              num_labels=3)
    model = DistilBertForSequenceClassification.from_pretrained(
        args.model_name, config=config)
    tokenizer = DistilBertTokenizer.from_pretrained(
        args.model_name, do_lower_case=(not args.keep_case))
else:
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    tokenizer = AutoTokenizer.from_pretrained(
        args.model_name, do_lower_case=(not args.keep_case))
    model = AutoModelForSequenceClassification.from_pretrained(args.model_name)

model.to("cpu")
model.eval()

classes = ["0", "1", "2"]
texts = ["I hate you", "I love you", "Isomorphic protein matrices"]
```
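A hedged sketch of how the loaded model might score `texts` on CPU from here (batching and the use of the 512 default above are assumptions):

```python
import torch

with torch.no_grad():
    enc = tokenizer(texts, padding=True, truncation=True,
                    max_length=512,  # matches the truncated default above
                    return_tensors="pt")
    preds = model(**enc)[0].argmax(dim=-1)  # class index per input text

for text, p in zip(texts, preds):
    print(f"{classes[p]}\t{text}")
```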
def main(): """ main function for conducting Subtask C. Parameters are parsed with argparse. Language model should be suitable for German e.g.: 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'bert-base-german-cased', 'bert-base-german-dbmdz-cased', 'bert-base-german-dbmdz-uncased', 'distilbert-base-german-cased', 'distilbert-base-multilingual-cased'. """ ############################ variable settings ################################# parser = argparse.ArgumentParser( description= 'Run Subtask C of GermEval 2017 Using Pre-Trained Language Model.') parser.add_argument('--seed', type=int, default=42, help='Random seed.') parser.add_argument('--lang_model', type=str, default='bert-base-german-dbmdz-uncased', help='The pre-trained language model.') parser.add_argument('--epochs', type=int, default=4, help='Number of epochs for training.') parser.add_argument('--lr', type=float, default=5e-5, help='The learning rate.') parser.add_argument('--max_len', type=int, default=256, help='The maximum sequence length of the input text.') parser.add_argument('--batch_size', type=int, default=32, help='Your train set batch size.') parser.add_argument('--df_path', type=str, default='./data/', help='The data directory.') parser.add_argument('--train_data', type=str, default='train_df_cat.tsv', help='The filename of the input train data.') parser.add_argument('--dev_data', type=str, default='dev_df_cat.tsv', help='The filename of the input development data.') parser.add_argument( '--test_data1', type=str, default='test_syn_df_cat.tsv', help='The filename of the first input test data (synchronic).') parser.add_argument( '--test_data2', type=str, default='test_dia_df_cat.tsv', help='The filename of the second input test data (diachronic).') parser.add_argument( '--output_path', type=str, default='./output/subtaskC/', help='The output directory of the model and predictions.') parser.add_argument("--train", default=True, action="store_true", help="Flag for training.") parser.add_argument("--save_prediction", default=False, action="store_true", help="Flag for saving predictions.") parser.add_argument("--save_cr", default=False, action="store_true", help="Flag for saving confusion matrix.") parser.add_argument("--exclude_general", default=False, action="store_true", help="Flag for excluding category Allgemein.") parser.add_argument("--exclude_neutral", default=False, action="store_true", help="Flag for excluding neutral polarity.") parser.add_argument("--exclude_general_neutral", default=False, action="store_true", help="Flag for excluding category Allgemein:neutral.") args = parser.parse_args() ################################################################################ set_all_seeds(args.seed) device, n_gpu = initialize_device_settings(use_cuda=True) # Load data train_df = pd.read_csv(args.df_path + args.train_data, delimiter='\t') dev_df = pd.read_csv(args.df_path + args.dev_data, delimiter='\t') test_syn_df = pd.read_csv(args.df_path + args.test_data1, delimiter='\t') test_dia_df = pd.read_csv(args.df_path + args.test_data2, delimiter='\t') # Create a tokenizer lower_case = False if args.lang_model[-7:] == "uncased": lower_case = True if args.lang_model[:4] == "bert": model_class = "BERT" tokenizer = BertTokenizer.from_pretrained(args.lang_model, do_lower_case=lower_case, max_length=args.max_len) if args.lang_model[:10] == "distilbert": model_class = "DistilBERT" tokenizer = DistilBertTokenizer.from_pretrained( args.lang_model, do_lower_case=lower_case, max_length=args.max_len) # get 
training features cats = train_df.columns[5:] end = "full" # exclude categories if required if (args.exclude_general): cats = [i for i in list(cats) if "Allgemein" not in i] end = "excl_gen" if (args.exclude_neutral): cats = [i for i in list(cats) if "neutral" not in i] end = "excl_neu" if (args.exclude_general_neutral): cats = [i for i in list(cats) if "Allgemein:neutral" not in i] end = "excl_genneu" num_labels = len(list(cats)) # create one hot labels train_df['one_hot_labels'] = list(train_df[list(cats)].values) dev_df['one_hot_labels'] = list(dev_df[list(cats)].values) test_syn_df['one_hot_labels'] = list(test_syn_df[list(cats)].values) test_dia_df['one_hot_labels'] = list(test_dia_df[list(cats)].values) # retrieve sentences and labels df = pd.concat([train_df, dev_df]) sentences = df.text.values labels = list(df.one_hot_labels.values) sentences_syn = test_syn_df.text.values labels_syn = list(test_syn_df.one_hot_labels.values) sentences_dia = test_dia_df.text.values labels_dia = list(test_dia_df.one_hot_labels.values) print("number of categories:", len(list(cats))) # Tokenize all of the sentences and map the tokens to their word IDs. input_ids = [ tokenizer.encode(sent, add_special_tokens=True, truncation=True, max_length=args.max_len) for sent in sentences ] input_ids = pad_sequences(input_ids, maxlen=args.max_len, dtype="long", value=0.0, truncating="post", padding="post") # Create attention masks attention_masks = [[int(token_id > 0) for token_id in sent] for sent in input_ids] # synchronic test data input_ids_syn = [ tokenizer.encode(sent, add_special_tokens=True, truncation=True) for sent in sentences_syn ] input_ids_syn = pad_sequences(input_ids_syn, maxlen=args.max_len, dtype="long", value=0.0, truncating="post", padding="post") attention_masks_syn = [[int(token_id > 0) for token_id in sent] for sent in input_ids_syn] # diachronic test data input_ids_dia = [ tokenizer.encode(sent, add_special_tokens=True, truncation=True) for sent in sentences_dia ] input_ids_dia = pad_sequences(input_ids_dia, maxlen=args.max_len, dtype="long", value=0.0, truncating="post", padding="post") attention_masks_dia = [[int(token_id > 0) for token_id in sent] for sent in input_ids_dia] # split train, dev train_inputs, train_labels, dev_inputs, dev_labels, train_masks, dev_masks = split_train_dev( train_df, dev_df, attention_masks, input_ids, labels) # transform to torch tensor train_inputs = torch.tensor(train_inputs) dev_inputs = torch.tensor(dev_inputs) train_labels = torch.tensor(train_labels) dev_labels = torch.tensor(dev_labels) train_masks = torch.tensor(train_masks) dev_masks = torch.tensor(dev_masks) test_syn_inputs = torch.tensor(input_ids_syn) test_syn_masks = torch.tensor(attention_masks_syn) test_syn_labels = torch.tensor(labels_syn) test_dia_inputs = torch.tensor(input_ids_dia) test_dia_masks = torch.tensor(attention_masks_dia) test_dia_labels = torch.tensor(labels_dia) # Create the DataLoader train_dataloader = create_dataloader(train_inputs, train_masks, train_labels, args.batch_size, train=True) dev_dataloader = create_dataloader(dev_inputs, dev_masks, dev_labels, args.batch_size, train=False) test_syn_dataloader = create_dataloader(test_syn_inputs, test_syn_masks, test_syn_labels, args.batch_size, train=False) test_dia_dataloader = create_dataloader(test_dia_inputs, test_dia_masks, test_dia_labels, args.batch_size, train=False) # Create model if args.train: if model_class == "BERT": config = BertConfig.from_pretrained(args.lang_model, num_labels=num_labels) config.hidden_dropout_prob 
= 0.1 model = BertForSequenceClassification.from_pretrained( args.lang_model, num_labels=num_labels, output_attentions=False, output_hidden_states=False) if model_class == "DistilBERT": config = DistilBertConfig.from_pretrained(args.lang_model, num_labels=num_labels) config.hidden_dropout_prob = 0.1 model = DistilBertForSequenceClassification.from_pretrained( args.lang_model, num_labels=num_labels, output_attentions=False, output_hidden_states=False) model.cuda() # Create an optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay_rate': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.lr, eps=1e-8) # Total number of training steps = number of batches * number of epochs total_steps = len(train_dataloader) * args.epochs # Create the learning rate scheduler scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=total_steps) # train model # Main Loop print("=================== Train ================") print("##### Language Model:", args.lang_model, ",", "learning rate:", args.lr) print() track_time = time.time() # trange is a tqdm wrapper around the normal python range for epoch in trange(args.epochs, desc="Epoch"): print("Epoch: %4i" % epoch, dt.datetime.now()) model, optimizer, scheduler, tr_loss = train_multilabel( train_dataloader=train_dataloader, model=model, device=device, optimizer=optimizer, scheduler=scheduler, num_labels=num_labels) # EVALUATION: TRAIN SET pred_bools_train, true_bools_train, f1_train = eval_multilabel( train_dataloader, model=model, device=device) print("TRAIN: micro F1 %.3f" % (f1_train)) # EVALUATION: DEV SET pred_bools_dev, true_bools_dev, f1_dev = eval_multilabel( dev_dataloader, model=model, device=device) print("EVAL: micro F1 %.3f" % (f1_dev)) print(" Training and validation took in total: {:}".format( format_time(time.time() - track_time))) # EVALUATION: TEST SYN SET pred_bools_syn, true_bools_syn, f1_test_syn = eval_multilabel( test_syn_dataloader, model=model, device=device) print("TEST SYN: micro F1 %.4f" % (f1_test_syn)) # classification report clf_report_syn = classification_report(true_bools_syn, pred_bools_syn, target_names=cats, digits=3) print(clf_report_syn) # EVALUATION: TEST DIA SET pred_bools_dia, true_bools_dia, f1_test_dia = eval_multilabel( test_dia_dataloader, model=model, device=device) print("TEST DIA: micro F1 %.4f" % (f1_test_dia)) # classification report clf_report_dia = classification_report(true_bools_dia, pred_bools_dia, target_names=cats, digits=3) print(clf_report_dia) if args.save_cr: pickle.dump( clf_report_syn, open( args.output_path + 'clf_report_' + args.lang_model + '_test_syn_' + str(num_labels) + end + '.txt', 'wb')) pickle.dump( clf_report_dia, open( args.output_path + 'clf_report_' + args.lang_model + '_test_dia_' + str(num_labels) + end + '.txt', 'wb')) if args.save_prediction: test_syn_df["category_pred"] = pred_bools_syn test_dia_df["category_pred"] = pred_bools_dia test_syn_df.category_pred.to_csv(args.output_path + args.lang_model + '_test_syn_' + str(num_labels) + end + ".tsv", sep="\t", index=False, header=True, encoding="utf-8-sig") test_dia_df.category_pred.to_csv(args.output_path + args.lang_model + '_test_dia_' + str(num_labels) + end + ".tsv", sep="\t", index=False, header=True, 
encoding="utf-8-sig")
```python
def train_optim_2(model, epochs, log_frequency, device, learning_rate):
    # feature extractors: ResNet-18 for images, DistilBERT for questions
    resnet18 = models.resnet18(pretrained=True)
    resnet18.fc = Identity()
    resnet18.to(device)

    distilbert = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
    token = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    distilbert.classifier = Identity()
    distilbert.to(device)

    model.to(device)  # we make sure the model is on the proper device

    # Multiclass classification setting, we use cross-entropy.
    # Note that this implementation requires the logits as input
    # (logits: values prior to the softmax transformation).
    loss_fn = torch.nn.CrossEntropyLoss(reduction='mean')
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for t in range(epochs):
        model.train()  # we specify that we are training the model

        # At each epoch, the training set is processed as a set of batches
        for batch_id, batch in enumerate(train_set):
            images, question, labels = batch
            question = token(question, return_tensors="pt", truncation=True, padding=True)
            # we put the data on the same device
            images, question, labels = images.to(device), question.to(device), labels.to(device)

            representation_image = resnet18(images)          # 512-dim vector
            output_distil = distilbert(**question)
            representation_texte = output_distil.logits      # 768-dim vector (classifier replaced by Identity)
            X = torch.cat((representation_image, representation_texte), dim=1)
            y_pred = model(X)  # forward pass, output = logits
            loss = loss_fn(y_pred, labels)

            if batch_id % log_frequency == 0:
                print("epoch: {:03d}, batch: {:03d}, loss: {:.3f}".format(
                    t + 1, batch_id + 1, loss.item()))

            optimizer.zero_grad()  # clear the gradient before backward
            loss.backward()        # compute the gradient
            optimizer.step()       # update the model parameters using the gradient

        # Model evaluation after each epoch, computing the accuracy
        model.eval()
        total = 0
        correct = 0
        for batch_id, batch in enumerate(test_set):
            images, question, labels = batch
            question = token(question, return_tensors="pt", truncation=True, padding=True)
            images, question, labels = images.to(device), question.to(device), labels.to(device)
            representation_image = resnet18(images)          # 512-dim vector
            output_distil = distilbert(**question)
            representation_texte = output_distil.logits      # 768-dim vector
            X = torch.cat((representation_image, representation_texte), dim=1)
            y_pred = model(X)  # forward computes the logits
            sf_y_pred = torch.nn.Softmax(dim=1)(y_pred)  # softmax to obtain the probability distribution
            _, predicted = torch.max(sf_y_pred, 1)       # decision rule: select the max
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
        print("[validation] accuracy: {:.3f}%\n".format(100 * correct / total))
    return
```
```python
def __init__(self, args: dict, doLower: bool, train_batchSize: int, testval_batchSize: int,
             learningRate: float, doLearningRateScheduler: bool, target_columns: list,
             smartBatching: bool = True, mixedPrecision: bool = True,
             labelSentences: dict = None, max_label_len=None, model=None,
             optimizer=None, loss_fct=None, device="cpu"):
    self.args = args
    self.labelSentences = labelSentences
    self.tokenizer = None
    self.device = device
    self.train_batchSize = train_batchSize
    self.testval_batchSize = testval_batchSize
    self.learningRate = learningRate
    self.optimizer = optimizer
    self.doLearningRateScheduler = doLearningRateScheduler
    self.learningRateScheduler = None
    self.smartBatching = smartBatching
    self.mixedPrecision = mixedPrecision
    self.max_label_len = max_label_len
    self.target_columns = target_columns
    self.input_multiclass_as_one = False

    if self.args["model"] in ["distilbert", "bert", "xlnet", "lstm", "roberta", "distilroberta"]:
        # define loss function
        if loss_fct:
            self.loss_fct = loss_fct
        else:
            self.loss_fct = BCEWithLogitsLoss()
        # define how many labels need to be classified
        if self.args["binaryClassification"]:
            self.num_labels = 1
        else:
            self.num_labels = len(self.labelSentences.keys())

    # build model from the model string
    if self.args["model"] == "distilbert":
        if doLower:
            self.model = DistilBertForSequenceClassification.from_pretrained(
                'distilbert-base-uncased', num_labels=self.num_labels,
                output_attentions=False, output_hidden_states=False, torchscript=True)
            self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        else:
            self.model = DistilBertForSequenceClassification.from_pretrained(
                'distilbert-base-cased', num_labels=self.num_labels,
                output_attentions=False, output_hidden_states=False, torchscript=True)
            self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
    elif self.args["model"] == "bert":
        if doLower:
            self.model = BertForSequenceClassification.from_pretrained(
                'bert-base-uncased', num_labels=self.num_labels,
                output_attentions=False, output_hidden_states=False, torchscript=True)
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        else:
            self.model = BertForSequenceClassification.from_pretrained(
                'bert-base-cased', num_labels=self.num_labels,
                output_attentions=False, output_hidden_states=False, torchscript=True)
            self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
    elif self.args["model"] == "xlnet":
        # no lowercase version exists, therefore the cased version is used in both branches
        if doLower:
            self.model = XLNetForSequenceClassification.from_pretrained(
                'xlnet-base-cased', num_labels=self.num_labels,
                output_attentions=False, output_hidden_states=False, torchscript=True)
            self.tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
        else:
            self.model = XLNetForSequenceClassification.from_pretrained(
                'xlnet-base-cased', num_labels=self.num_labels,
                output_attentions=False, output_hidden_states=False, torchscript=True)
            self.tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
    elif self.args["model"] == "roberta":
        if doLower:
            self.model = RobertaForSequenceClassification.from_pretrained(
                'roberta-base', num_labels=self.num_labels,
                output_attentions=False, output_hidden_states=False, torchscript=True)
            self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
        else:
            self.model = RobertaForSequenceClassification.from_pretrained(
                'roberta-base', num_labels=self.num_labels,
                output_attentions=False, output_hidden_states=False, torchscript=True)
            self.tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    elif self.args["model"] == "distilroberta":
        if doLower:
            self.model = RobertaForSequenceClassification.from_pretrained(
                'distilroberta-base', num_labels=self.num_labels,
                output_attentions=False, output_hidden_states=False, torchscript=True)
            self.tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')
        else:
            self.model = RobertaForSequenceClassification.from_pretrained(
                'distilroberta-base', num_labels=self.num_labels,
                output_attentions=False, output_hidden_states=False, torchscript=True)
            self.tokenizer = RobertaTokenizer.from_pretrained('distilroberta-base')
    # elif self.args["model"] == "CNN":
    #     self.model = MyLSTM(num_labels=self.num_labels)
    elif self.args["model"] == "gradboost":
        self.model = GradientBoostingClassifier(learning_rate=self.learningRate,
                                                n_estimators=self.args["n_estimators"],
                                                max_depth=self.args["max_depth"],
                                                verbose=1)
        self.input_multiclass_as_one = True
    elif self.args["model"] == "randomforest":
        self.model = RandomForestClassifier(n_estimators=self.args["n_estimators"],
                                            max_depth=self.args["max_depth"],
                                            verbose=1, n_jobs=-1)
        self.input_multiclass_as_one = True
    elif self.args["model"] == "naivebayes":
        self.model = OneVsRestClassifier(MultinomialNB(alpha=self.learningRate))
    elif self.args["model"] == "naivebayes_norm":
        self.model = Pipeline([
            ("nb_norm", MinMaxScaler()),
            ("nb_clf", OneVsRestClassifier(MultinomialNB(alpha=self.learningRate)))
        ])
    elif self.args["model"] == "sgd":
        self.model = OneVsRestClassifier(
            SGDClassifier(alpha=self.learningRate, loss='hinge', penalty='l2'))
    else:
        logging.error("Define a model in the args dict.")
        sys.exit("Define a model in the args dict.")
```
```python
train_sizes = [int(len(dset) * args.train_pct) for j, dset in enumerate(all_dsets)]
val_sizes = [len(all_dsets[j]) - train_sizes[j] for j in range(len(train_sizes))]

for i in range(len(all_dsets)):
    domain = args.domains[i]
    test_dset = all_dsets[i]
    dataloader = DataLoader(test_dset,
                            batch_size=4,
                            shuffle=True,
                            collate_fn=collate_batch_transformer)
    bert = DistilBertForSequenceClassification.from_pretrained(
        bert_model, config=bert_config).to(device)

    # Create the model
    model = torch.nn.DataParallel(
        MultiViewTransformerNetworkAveragingIndividuals(
            bert_model, bert_config, len(all_dsets) - 1)).to(device)
    model.module.average = True

    # Load the best weights of the trained model
    for v in range(len(all_dsets) - 1):
        model.module.domain_experts[v].load_state_dict(
            torch.load(f'{args.pretrained_model}/model_{domain}_{v}.pth'))
    model.module.shared_bert.load_state_dict(
        torch.load(f'{args.pretrained_model}/model_{domain}_{len(all_dsets) - 1}.pth'))
```
```python
def __init__(self, pre_trained: str, class_count: int):
    super().__init__()
    self.bert = DistilBertForSequenceClassification.from_pretrained(
        pre_trained, num_labels=class_count)
```
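A hedged sketch of the matching `forward`, assuming the wrapper simply delegates to the underlying classification head (the actual project may add layers on top):

```python
def forward(self, input_ids, attention_mask=None, labels=None):
    # Delegate to the DistilBERT sequence-classification head;
    # when labels are given, the output starts with the loss.
    return self.bert(input_ids, attention_mask=attention_mask, labels=labels)
```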
```python
def retrain(filepath, epochs_per_item=2, min_to_train=10):
    '''Retrain a new model from scratch
    '''
    global label
    global current_model
    global currently_training

    if currently_training:
        print("skipping while model already training")  # was a bare string, i.e. a no-op
        return

    positives = load_headlines(filepath + "positive.csv")
    negatives = load_headlines(filepath + "negative.csv")

    if len(positives) < min_to_train or len(negatives) < min_to_train:
        print("too few annotations to train: less than " + str(min_to_train))
        return

    currently_training = True
    new_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

    # sample each item no more than `epochs_per_item` times for the least frequent label
    iterations = int(min(len(positives), len(negatives)) * epochs_per_item)
    for i in range(0, iterations):
        positive_headline = random.choice(positives)
        positive_inputs = tokenizer(positive_headline, return_tensors="pt")
        positive_labels = torch.tensor([1]).unsqueeze(0)
        train_item(new_model, positive_inputs, positive_labels)

        negative_headline = random.choice(negatives)
        negative_inputs = tokenizer(negative_headline, return_tensors="pt")
        negative_labels = torch.tensor([0]).unsqueeze(0)
        train_item(new_model, negative_inputs, negative_labels)

        eel.sleep(0.01)  # allow other processes through

    new_fscore = evaluate_model(new_model)
    current_fscore = evaluate_model(current_model)

    if new_fscore > current_fscore:
        print("replacing model!")
        current_model = new_model

        timestamp = re.sub(r'\.[0-9]*', '_', str(datetime.datetime.now())).replace(
            " ", "_").replace("-", "").replace(":", "")
        accuracy = str(round(new_fscore, 4))
        model_path = "data/" + label + "/" + timestamp + accuracy + ".model"
        current_model.save_pretrained(model_path)
        if verbose:
            print("saved model to " + model_path)

        clean_old_models()
        get_predictions()
    else:
        print("staying with old model")

    currently_training = False
```
def main(): """ main function for conducting Subtask A. Parameters are parsed with argparse. Language model should be suitable for German e.g.: 'bert-base-multilingual-uncased', 'bert-base-multilingual-cased', 'bert-base-german-cased', 'bert-base-german-dbmdz-cased', 'bert-base-german-dbmdz-uncased', 'distilbert-base-german-cased', 'distilbert-base-multilingual-cased'. """ ############################ variable settings ################################# parser = argparse.ArgumentParser(description='Run Subtask A or B of GermEval 2017 Using Pre-Trained Language Model.') parser.add_argument('--task', type=str, default='A', help="The task you want to conduct ('A' or 'B').") parser.add_argument('--seed', type=int, default=42, help='Random seed.') parser.add_argument('--lang_model', type=str, default='bert-base-german-dbmdz-uncased', help='The pre-trained language model.') parser.add_argument('--epochs', type=int, default=4, help='Number of epochs for training.') parser.add_argument('--lr', type=float, default=5e-5, help='The learning rate.') parser.add_argument('--max_len', type=int, default=256, help='The maximum sequence length of the input text.') parser.add_argument('--batch_size', type=int, default=32, help='Your train set batch size.') parser.add_argument('--df_path', type=str, default='./data/', help='The data directory.') parser.add_argument('--train_data', type=str, default='train_df.tsv', help='The filename of the input train data.') parser.add_argument('--dev_data', type=str, default='dev_df.tsv', help='The filename of the input development data.') parser.add_argument('--test_data1', type=str, default='test_syn_df.tsv', help='The filename of the first input test data (synchronic).') parser.add_argument('--test_data2', type=str, default='test_dia_df.tsv', help='The filename of the second input test data (diachronic).') parser.add_argument('--output_path', type=str, default='./output/subtaskA/', help='The output directory of the model and predictions.') parser.add_argument("--train", default=True, action="store_true", help="Flag for training.") parser.add_argument("--save_prediction", default=True, action="store_true", help="Flag for saving predictions.") args = parser.parse_args() ################################################################################ set_all_seeds(args.seed) device, n_gpu = initialize_device_settings(use_cuda=True) # Load data train_df = pd.read_csv(args.df_path + args.train_data, delimiter = '\t') dev_df = pd.read_csv(args.df_path + args.dev_data, delimiter = '\t') test_syn_df = pd.read_csv(args.df_path + args.test_data1, delimiter = '\t') test_syn_df = test_syn_df.dropna(subset = ["text"]) test_dia_df = pd.read_csv(args.df_path + args.test_data2, delimiter = '\t') # Create a tokenizer lower_case = False if args.lang_model[-7:] == "uncased": lower_case = True if args.lang_model[:4] == "bert": model_class = "BERT" tokenizer = BertTokenizer.from_pretrained(args.lang_model, do_lower_case=lower_case, max_length=args.max_len) if args.lang_model[:10] == "distilbert": model_class = "DistilBERT" tokenizer = DistilBertTokenizer.from_pretrained(args.lang_model, do_lower_case=lower_case, max_length=args.max_len) # get training features df = pd.concat([train_df, dev_df]) sentences = df.text.values sentences_syn = test_syn_df.text.values sentences_dia = test_dia_df.text.values if args.task == 'A': class_list = [False, True] df['relevance_label'] = df.apply(lambda x: class_list.index(x['relevance']), axis = 1) labels = df.relevance_label.values 
        test_syn_df['relevance_label'] = test_syn_df.apply(lambda x: class_list.index(x['relevance']), axis=1)
        labels_syn = test_syn_df.relevance_label.values
        test_dia_df['relevance_label'] = test_dia_df.apply(lambda x: class_list.index(x['relevance']), axis=1)
        labels_dia = test_dia_df.relevance_label.values

    if args.task == 'B':
        class_list = ["negative", "neutral", "positive"]
        df['sentiment_label'] = df.apply(lambda x: class_list.index(x['sentiment']), axis=1)
        labels = df.sentiment_label.values
        test_syn_df['sentiment_label'] = test_syn_df.apply(lambda x: class_list.index(x['sentiment']), axis=1)
        labels_syn = test_syn_df.sentiment_label.values
        test_dia_df['sentiment_label'] = test_dia_df.apply(lambda x: class_list.index(x['sentiment']), axis=1)
        labels_dia = test_dia_df.sentiment_label.values

    num_labels = len(set(labels))

    # Tokenize all of the sentences and map the tokens to their word IDs.
    input_ids = [tokenizer.encode(sent, add_special_tokens=True, truncation=True, max_length=args.max_len)
                 for sent in sentences]
    input_ids = pad_sequences(input_ids, maxlen=args.max_len, dtype="long",
                              value=0.0, truncating="post", padding="post")
    # Create attention masks
    attention_masks = [[int(token_id > 0) for token_id in sent] for sent in input_ids]

    # synchronic test data (max_length added here for consistency with the
    # training encode above; the original relied on pad_sequences to truncate)
    input_ids_syn = [tokenizer.encode(sent, add_special_tokens=True, truncation=True, max_length=args.max_len)
                     for sent in sentences_syn]
    input_ids_syn = pad_sequences(input_ids_syn, maxlen=args.max_len, dtype="long",
                                  value=0.0, truncating="post", padding="post")
    attention_masks_syn = [[int(token_id > 0) for token_id in sent] for sent in input_ids_syn]

    # diachronic test data
    input_ids_dia = [tokenizer.encode(sent, add_special_tokens=True, truncation=True, max_length=args.max_len)
                     for sent in sentences_dia]
    input_ids_dia = pad_sequences(input_ids_dia, maxlen=args.max_len, dtype="long",
                                  value=0.0, truncating="post", padding="post")
    attention_masks_dia = [[int(token_id > 0) for token_id in sent] for sent in input_ids_dia]

    # split train, dev
    train_inputs, train_labels, dev_inputs, dev_labels, train_masks, dev_masks = split_train_dev(
        train_df, dev_df, attention_masks, input_ids, labels)

    # transform to torch tensors
    train_inputs = torch.tensor(train_inputs)
    dev_inputs = torch.tensor(dev_inputs)
    train_labels = torch.tensor(train_labels)
    dev_labels = torch.tensor(dev_labels)
    train_masks = torch.tensor(train_masks)
    dev_masks = torch.tensor(dev_masks)
    test_syn_inputs = torch.tensor(input_ids_syn)
    test_syn_labels = torch.tensor(labels_syn)
    test_syn_masks = torch.tensor(attention_masks_syn)
    test_dia_inputs = torch.tensor(input_ids_dia)
    test_dia_labels = torch.tensor(labels_dia)
    test_dia_masks = torch.tensor(attention_masks_dia)

    # Create the DataLoaders
    train_dataloader = create_dataloader(train_inputs, train_masks, train_labels, args.batch_size, train=True)
    dev_dataloader = create_dataloader(dev_inputs, dev_masks, dev_labels, args.batch_size, train=False)
    test_syn_dataloader = create_dataloader(test_syn_inputs, test_syn_masks, test_syn_labels, args.batch_size, train=False)
    test_dia_dataloader = create_dataloader(test_dia_inputs, test_dia_masks, test_dia_labels, args.batch_size, train=False)

    # Create model
    if args.train:
        if model_class == "BERT":
            config = BertConfig.from_pretrained(args.lang_model, num_labels=num_labels)
            config.hidden_dropout_prob = 0.1
            model = BertForSequenceClassification.from_pretrained(
                args.lang_model,
                num_labels=num_labels,
                output_attentions=False,
                output_hidden_states=False,
            )
        if model_class == "DistilBERT":
            config = DistilBertConfig.from_pretrained(args.lang_model,
                                                      num_labels=num_labels)
            # note: `config` is never passed to from_pretrained below, so the
            # dropout setting has no effect (DistilBertConfig also names its
            # dropout parameters `dropout`/`attention_dropout`, not `hidden_dropout_prob`)
            config.hidden_dropout_prob = 0.1
            model = DistilBertForSequenceClassification.from_pretrained(
                args.lang_model,
                num_labels=num_labels,
                output_attentions=False,
                output_hidden_states=False,
            )
        model.cuda()

        # Create an optimizer
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay_rate': 0.0}
        ]
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=args.lr,
            eps=1e-8
        )
        # Total number of training steps = number of batches * number of epochs
        total_steps = len(train_dataloader) * args.epochs
        # Create the learning rate scheduler
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )

        # train model - main loop
        print("=================== Train ================")
        print("##### Language Model:", args.lang_model, ",", "learning rate:", args.lr)
        print()
        track_time = time.time()
        # trange is a tqdm wrapper around the normal python range
        for epoch in trange(args.epochs, desc="Epoch"):
            print("Epoch: %4i" % epoch, dt.datetime.now())
            model, optimizer, scheduler, tr_loss = train(
                train_dataloader,
                model=model,
                device=device,
                optimizer=optimizer,
                scheduler=scheduler
            )

            # EVALUATION: TRAIN SET
            true_bools_train, pred_bools_train, f1_train = eval(
                train_dataloader, model=model, device=device)
            print("TRAIN: micro F1 %.4f" % f1_train)  # here: same as accuracy
            print(confusion_matrix(true_bools_train, pred_bools_train))

            # EVALUATION: DEV SET
            true_bools_dev, pred_bools_dev, f1_dev = eval(
                dev_dataloader, model=model, device=device)
            print("EVAL: micro F1 %.4f" % f1_dev)
            print(confusion_matrix(true_bools_dev, pred_bools_dev))

        print(" Training and validation took in total: {:}".format(format_time(time.time() - track_time)))

        # EVALUATION: TEST SYN SET
        true_bools_syn, pred_bools_syn, f1_test_syn = eval(
            test_syn_dataloader, model=model, device=device)
        print("TEST SYN: micro F1 %.4f" % f1_test_syn)
        print(confusion_matrix(true_bools_syn, pred_bools_syn))

        # EVALUATION: TEST DIA SET
        true_bools_dia, pred_bools_dia, f1_test_dia = eval(
            test_dia_dataloader, model=model, device=device)
        print("TEST DIA: micro F1 %.4f" % f1_test_dia)
        print(confusion_matrix(true_bools_dia, pred_bools_dia))

        if args.save_prediction:
            if args.task == 'A':
                test_syn_df["relevance_pred"] = pred_bools_syn
                test_dia_df["relevance_pred"] = pred_bools_dia
            if args.task == 'B':
                test_syn_df["sentiment_pred"] = pred_bools_syn
                test_dia_df["sentiment_pred"] = pred_bools_dia
            test_syn_df.to_csv(args.output_path + args.lang_model + "_eval_test_syn.tsv",
                               sep="\t", index=False, header=True, encoding="utf-8-sig")
            test_dia_df.to_csv(args.output_path + args.lang_model + "_eval_test_dia.tsv",
                               sep="\t", index=False, header=True, encoding="utf-8-sig")
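The script above relies on create_dataloader and eval helpers defined elsewhere in its repository. The sketches below reconstruct them from the call sites, assuming each batch is (input_ids, attention_mask, labels); they are plausible reconstructions, not the original code (and note that naming a function `eval` shadows the Python builtin, as the original call sites do):

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import f1_score

def create_dataloader(inputs, masks, labels, batch_size, train=True):
    # shuffle for training, keep order for evaluation
    data = TensorDataset(inputs, masks, labels)
    sampler = RandomSampler(data) if train else SequentialSampler(data)
    return DataLoader(data, sampler=sampler, batch_size=batch_size)

def eval(dataloader, model, device):
    # collect gold and predicted labels, return them together with micro F1
    model.eval()
    true_labels, pred_labels = [], []
    for input_ids, masks, labels in dataloader:
        with torch.no_grad():
            logits = model(input_ids.to(device), attention_mask=masks.to(device))[0]
        pred_labels += logits.argmax(dim=1).cpu().tolist()
        true_labels += labels.tolist()
    return true_labels, pred_labels, f1_score(true_labels, pred_labels, average='micro')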
def main():
    parser = argparse.ArgumentParser(description='argument parsing for training')
    parser.add_argument('--data_dir', default='data', type=str,
                        help='path to data directory - default: \'data\'')
    parser.add_argument('--review_file', default='yelp_reviews_train5000.csv', type=str,
                        help='file name containing reviews')
    parser.add_argument('--batch_size', default=32, type=int,
                        help='batch size - default: 32')
    parser.add_argument('--train_ratio', default=0.85, type=float,
                        help='train size - default: 0.85')
    parser.add_argument('--epochs', default=4, type=int,
                        help='number of training epochs - default: 4')
    parser.add_argument('--distil', action='store_true',
                        help='use DistilBert instead of BERT')
    parser.add_argument('--model_save', default='./model_save', type=str,
                        help='directory to save model')
    parser.add_argument('--nolog', action='store_true',
                        help='disable logging')

    # parse input arguments
    clargs = parser.parse_args()

    # log to file and stdout
    if clargs.nolog:
        print("Not logging")
    else:
        sys.stdout = Logger('train')

    print("")
    print("==========================================")
    print("-------------Confirm Arguments------------")
    print("==========================================")
    print("")
    print("Data directory: {0:s}".format(clargs.data_dir))
    print("Reviews file: {0:s}".format(clargs.review_file))
    print("Batch size of {0:d}".format(clargs.batch_size))
    print("Train ratio of {0:0.2f}".format(clargs.train_ratio))
    print("Train for {0:d} epochs".format(clargs.epochs))
    print("Using DistilBert" if clargs.distil else "Using Bert")
    print("Will save model in: {0:s}".format(clargs.model_save))

    # Check to see if GPU is available
    CUDA_FLAG = False
    if torch.cuda.is_available():
        CUDA_FLAG = True
        device = torch.device("cuda")
        print('*We will use the GPU:', torch.cuda.get_device_name(0))
    else:
        CUDA_FLAG = False
        print('*No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    print("")
    print("==========================================")
    print("---------------Process Data---------------")
    print("==========================================")
    print("")

    path = clargs.data_dir
    fn = clargs.review_file
    filename = path + "/" + fn

    # read in data from review dataset
    t0 = time.perf_counter()
    print("Reading in training data from {0:s}".format(clargs.review_file))
    reviews_df = pd.read_csv(filename)
    reviews_df = reviews_df[['text', 'stars']]
    elapsed = time.perf_counter() - t0
    print("Finished reading {0:d} entries | Took {1:0.2f} seconds".format(
        len(reviews_df.index), elapsed))

    # create tokenizer and model from transformers
    if clargs.distil:
        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    # tokenize the data into something that BERT can use, then split
    print("Tokenizing and encoding data to be fed into BERT model")
    t1 = time.perf_counter()
    dataset = extract_features(reviews_df, tokenizer)
    elapsed = time.perf_counter() - t1
    print("Finished tokenizing | Took {0:0.2f} seconds".format(elapsed))

    # split the data into training and validation sets
    TRAIN_SIZE = int(len(reviews_df.index) * clargs.train_ratio)
    VAL_SIZE = len(reviews_df.index) - TRAIN_SIZE
    BATCH_SIZE = clargs.batch_size
    train_dataloader, validation_dataloader = train_val_split(
        dataset=dataset, batch_sz=BATCH_SIZE, lengths=[TRAIN_SIZE, VAL_SIZE])
    print("Training - Split {0:d} examples into {1:d} batches".format(
        TRAIN_SIZE, len(train_dataloader)))
    print("Validation - Split {0:d} examples into {1:d} batches".format(
        VAL_SIZE, len(validation_dataloader)))
print("Finished splitting") # load a pre-trained model if clargs.distil: model = DistilBertForSequenceClassification.from_pretrained( 'distilbert-base-uncased', num_labels=5, output_attentions=False, output_hidden_states=False) else: model = BertForSequenceClassification.from_pretrained( 'bert-base-uncased', num_labels=5, output_attentions=False, output_hidden_states=False) if CUDA_FLAG: model.cuda() optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8) # Total number of training steps is [number of batches] x [number of epochs]. # (Note that this is not the same as the number of training samples). epochs = clargs.epochs total_steps = len(train_dataloader) * epochs # Create the learning rate scheduler. scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps) # Training statistics: train_losses = [] val_losses = [] val_accs = [] print("") print("==========================================") print("-------------Starting training------------") print("==========================================") print("") # TRAINING LOOP: # - epoch: number of times through the entire dataset # - consists of a training portion: forward pass, then backward pass # - followed by a validation portion: evaluate model on a validation set start_train_time = time.perf_counter() for i in range(epochs): print("-----------------Epoch {0:d}-----------------".format(i + 1)) print("Epoch {0:d} Training Phase".format(i + 1)) # first train model.train() # only to put the model into train mode train_loss = train(model, device, train_dataloader, optimizer, scheduler) print(" Training Loss: {0:.2f}".format(train_loss)) train_losses.append(train_loss) print("") # then validate print("Epoch {0:d} Validation Phase".format(i + 1)) val_loss, val_acc, _, _ = evaluate(model, device, validation_dataloader, VAL_SIZE) print(" Validation Accuracy: {0:.2f}".format(val_acc)) print(" Validation Loss: {0:.2f}".format(val_loss)) val_losses.append(val_loss) val_accs.append(val_acc) print("") elapsed_time = time.perf_counter() - start_train_time m, s = divmod(elapsed_time, 60) print("End epoch {0:d} - Time so far - {1:02d}:{2:05.2f}".format( (i + 1), int(m), s)) print("") print("==========================================") print("------------Summary of Training-----------") print("==========================================") print("") total_elapsed_time = time.perf_counter() - start_train_time m, s = divmod(total_elapsed_time, 60) print("Total training time: {0:02d}:{1:05.2f}".format(int(m), s)) print("") print( tabulate(np.stack((train_losses, val_losses, val_accs), axis=-1), ["train_loss", "val_loss", "val_acc"])) print("") print("Data directory: {0:s}".format(clargs.data_dir)) print("Reviews file: {0:s}".format(clargs.review_file)) print("Batch size of {0:d}".format(clargs.batch_size)) print("Train ratio of {0:0.2f}".format(clargs.train_ratio)) print("Train for {0:d} epochs".format(clargs.epochs)) print("") # save model output_dir = clargs.model_save # Create output directory if needed if not os.path.exists(output_dir): os.makedirs(output_dir) model_type = "distil" if clargs.distil else "bert" # save hyperparameters for testing: hyper_json = { "dataDirectory": clargs.data_dir, "dataFile": clargs.review_file, "batchSize": str(clargs.batch_size), "trainRatio": str(clargs.train_ratio), "numEpochs": str(clargs.epochs), "model": model_type } json_outfile = output_dir + '/' + 'hyperparams.json' with open(json_outfile, 'w') as outfile: json.dump(hyper_json, outfile) print("Saving model to %s" % 
    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
    model_to_save.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print("Finished saving model")
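Because save_pretrained writes a standard transformers checkpoint, the saved directory can be reloaded directly for inference. A minimal sketch, assuming the --model_save default above and the --distil variant:

from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# reload the fine-tuned 5-class star-rating model saved by the script above
model = DistilBertForSequenceClassification.from_pretrained('./model_save')
tokenizer = DistilBertTokenizer.from_pretrained('./model_save')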
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2)
model.load_state_dict(torch.load('torch_weights', map_location='cpu'))

def classify(message):
    label = {0: 'Ham', 1: 'Spam'}
    X = tokenizer(message, max_length=200, padding=True, truncation=True, return_tensors="pt")
    device = torch.device("cpu")
    X = X.to(device)
    model.to(device)
    result = model(**X)
    logits = result[0][0]
    proba = nn.Softmax(dim=-1)(logits).cpu().detach().numpy()
    pred = int(proba.argmax())
    # the original snippet ends without a return statement; returning the
    # predicted class name and its probability is an assumed completion
    return label[pred], float(proba[pred])
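With the fine-tuned weights loaded, the classifier can be called directly on raw text; the message below is illustrative and the printed probability depends entirely on the trained weights:

print(classify("Congratulations! You have won a free prize, reply now to claim"))
# e.g. ('Spam', 0.98)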
def load_pretrained_model(self, input_path='./DistilBERT/pre_trained_model/'):
    self.model = DistilBertForSequenceClassification.from_pretrained(input_path, num_labels=9)
    self.tokenizer = DistilBertTokenizer.from_pretrained(input_path, do_lower_case=True)
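Only the method is shown, so the enclosing class is unknown; a hedged usage sketch, with the class name `Classifier` purely hypothetical:

clf = Classifier()                # hypothetical class exposing the method above
clf.load_pretrained_model()       # loads the 9-label model from ./DistilBERT/pre_trained_model/
inputs = clf.tokenizer("sample text", return_tensors="pt")
logits = clf.model(**inputs).logits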
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_dir",
        default="/home/jqu/Documents/data/XNLI/",
        type=str,
        required=False,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument("--model_type", type=str, required=True, help="distilbert|bert")
    parser.add_argument("--model_dir", type=str, required=True, help="where the trained model is located")
    args = parser.parse_args()

    # load test dataset
    processor = processors["xnli"](language="en", train_language="en")
    examples = processor.get_test_examples(args.data_dir)

    if args.model_type == "bert":
        # prepare tokenizer
        tokenizer = BertTokenizer.from_pretrained(args.model_dir, do_lower_case=False)
        model = BertForSequenceClassification.from_pretrained(args.model_dir)
    elif args.model_type == "distilbert":
        tokenizer = DistilBertTokenizer.from_pretrained(args.model_dir, do_lower_case=False)
        model = DistilBertForSequenceClassification.from_pretrained(args.model_dir)
    elif args.model_type == "albert":
        tokenizer = AlbertTokenizer.from_pretrained(args.model_dir, do_lower_case=False)
        model = AlbertForSequenceClassification.from_pretrained(args.model_dir)

    model.to("cuda:0")
    model.eval()

    features = convert_examples_to_features(
        examples,
        tokenizer,
        label_list=processor.get_labels(),
        max_length=128,
        output_mode="classification",
        pad_on_left=False,
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=0,
        mask_padding_with_zero=True)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
    all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels)

    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=512)

    overall_preds = [[], []]  # [predictions, gold labels]
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        with torch.no_grad():
            batch = tuple(t.to("cuda:0") for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3]
            }
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2] if args.model_type in ["bert"] else None
                )  # XLM and DistilBERT don't use segment_ids
            outputs = model(**inputs)
            _, logits = outputs[:2]
            preds = logits.detach().cpu().numpy()
            preds = np.argmax(preds, axis=1)
            overall_preds[0] += preds.tolist()
            out_label_ids = inputs["labels"].detach().cpu().numpy()
            overall_preds[1] += out_label_ids.tolist()

    # compute scores - sklearn expects (y_true, y_pred); the original passed
    # predictions first, which is harmless for accuracy but transposes the
    # confusion matrix
    result = accuracy_score(overall_preds[1], overall_preds[0])
    print(f"Overall accuracy: {result}")
    confusion_score = confusion_matrix(overall_preds[1], overall_preds[0])
    print("confusion matrix:\n")
    print(confusion_score)
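Assuming the script above is saved as xnli_eval.py (filename and model directory are hypothetical), it would be invoked from the command line like:

python xnli_eval.py --model_type distilbert --model_dir ./output/distilbert-xnli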