def __init__(self, model, num_steps, num_classes=2):
    super(GPT2Classifier, self).__init__()
    self.tokenizer = GPT2Tokenizer.from_pretrained(model)
    self.tokenizer.pad_token = self.tokenizer.eos_token
    self.encoder = GPT2ForSequenceClassification.from_pretrained(model)
    self.encoder.config.pad_token_id = self.tokenizer.eos_token_id
    self.num_steps = num_steps
def makeUnilabelModel(self, modelName, num_labels=10, root='', **kwargs):
    if modelName == 'distilbert-base-uncased':
        tokenizer = DistilBertTokenizerFast.from_pretrained(
            'distilbert-base-uncased')
        model = DistilBertForSequenceClassification.from_pretrained(
            root + "distilbert-base-uncased", num_labels=num_labels, **kwargs)
    if modelName == 'gpt2':
        tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        model = GPT2ForSequenceClassification.from_pretrained(
            root + "gpt2", num_labels=num_labels, **kwargs)
        model.resize_token_embeddings(len(tokenizer))  # add padding token
        model.config.pad_token_id = tokenizer('[PAD]').input_ids[0]
    if modelName == 'bertweet':
        tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base')
        model = AutoModelForSequenceClassification.from_pretrained(
            root + "vinai/bertweet-base", num_labels=num_labels, **kwargs)
    if modelName == 'distilroberta-base':
        tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
        model = AutoModelForSequenceClassification.from_pretrained(
            root + "distilroberta-base", num_labels=num_labels, **kwargs)
    if modelName == 'lstm':
        tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
        model = LSTMCclassifier(128, 64, 2, tokenizer.vocab_size, num_labels)
    return tokenizer, model
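# Hypothetical usage sketch (names below are assumptions, not from the original
# source): `factory` stands for an instance of the class that defines
# makeUnilabelModel above.
tok, clf = factory.makeUnilabelModel('gpt2', num_labels=10)
enc = tok("a sample text", return_tensors="pt", padding=True)
print(clf(**enc).logits.shape)  # expected: (1, num_labels)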
def create_and_check_gpt2_for_sequence_classification(
        self, config, input_ids, input_mask, head_mask, token_type_ids,
        mc_token_ids, sequence_labels, *args):
    config.num_labels = self.num_labels
    model = GPT2ForSequenceClassification(config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, attention_mask=input_mask,
                   token_type_ids=token_type_ids, labels=sequence_labels)
    self.parent.assertEqual(result.logits.shape,
                            (self.batch_size, self.num_labels))
def __init__(self, conf):
    super(GPT2Classifier, self).__init__()
    self.conf = conf
    self.padding = 0
    self.gpt_path = getattr(conf, 'bert_path', None)
    pretrain_name = 'gpt2'
    if self.gpt_path:
        pretrain_name = self.gpt_path
    print('GPT Model from {}'.format(pretrain_name))
    self.gpt = GPT2ForSequenceClassification.from_pretrained(
        pretrain_name, num_labels=conf.class_num)
    self.gpt.config.pad_token_id = self.gpt.config.eos_token_id
    self.drop = nn.Dropout(p=0.3)
def __init__(self):
    # Look for a GPU to use. Will use `cpu` by default if no GPU is found.
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # device = torch.device('cpu')
    print("Device: ", self.device)

    _ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    model_path = os.path.join(_ROOT, "Models", "GPT2_Model", "model")

    self.tokenizer = GPT2Tokenizer.from_pretrained(
        pretrained_model_name_or_path=model_path)
    self.gpt_model = GPT2ForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=model_path)
    self.gpt_model.eval()
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
import torch

tokenizer = GPT2Tokenizer.from_pretrained('microsoft/dialogrpt')
model = GPT2ForSequenceClassification.from_pretrained('microsoft/dialogrpt')

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
outputs = model(**inputs, labels=labels)
loss = outputs.loss
logits = outputs.logits
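# A minimal follow-up sketch (reusing `model`, `tokenizer` and `inputs` from
# above) showing how the logits can be turned into a predicted class at
# inference time:
with torch.no_grad():
    outputs = model(**inputs)
probs = torch.softmax(outputs.logits, dim=-1)  # (batch_size, num_labels)
print(probs, probs.argmax(dim=-1).item())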
    validation_dataloader = torch.load(val_loader_fn)
    train_dataloader = torch.load(train_loader_fn)
else:
    validation_dataloader, train_dataloader = create_train_val_loaders(
        config['dataset']['train'], val_loader_fn, train_loader_fn,
        tokenizer, batch_size, max_length)

if os.path.exists(test_loader_fn) and (not build_new_dataloaders):
    pass
else:
    create_test_loader(config['dataset']['test'], test_loader_fn,
                       tokenizer, batch_size, max_length)

# import pretrained model
if model_name == "gpt2":
    model = GPT2ForSequenceClassification.from_pretrained(
        "gpt2", num_labels=NUM_LABELS)
else:
    model = AutoModelForSequenceClassification.from_pretrained(
        config['model'][model_name], num_labels=NUM_LABELS)

if cpu:
    # Distributor = torch.nn.parallel.DistributedDataParallelCPU
    # import torch.distributed as dist
    # rank = 1
    # world_size = 12
    # dist.init_process_group("gloo", world_size=world_size, rank=-1, store=None)
    # parallel_model = Distributor(model)
    parallel_model = model
else:
    # Encapsulate the model
    parallel_model = torch.nn.DataParallel(model)
    parallel_model.cuda()
)

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
    val_dataset,  # The validation samples.
    sampler=SequentialSampler(val_dataset),  # Pull out batches sequentially.
    batch_size=batch_size  # Evaluate with this batch size.
)

# Load GPT2ForSequenceClassification, the pretrained GPT2 model with a single
# linear classification layer on top.
model_config = GPT2Config.from_pretrained('gpt2', num_labels=2)

# Get the actual model.
print('Loading model...')
model = GPT2ForSequenceClassification.from_pretrained('gpt2', config=model_config)

# resize model embedding to match new tokenizer
model.resize_token_embeddings(len(tokenizer))

# fix model padding token id
model.config.pad_token_id = model.config.eos_token_id

# Run this model on GPU.
model.cuda()

optimizer = AdamW(
    model.parameters(),
    lr=2e-5,  # args.learning_rate - default is 5e-5, paper recommends 5e-5/3e-5/2e-5
    eps=1e-8  # args.adam_epsilon - default is 1e-8.
import numpy as np

start_debugger_on_exception()

train_dataset = DataSetBert(data_file='./data/data_train/train.csv')
val_dataset = DataSetBert(data_file='./data/data_train/val.csv')
test_dataset = DataSetBert(data_file='./data/data_train/test.csv')

from torch.utils.data import DataLoader

device = torch.device('cuda:1')
train_dataloader = DataLoader(train_dataset, batch_size=11, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=11, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=11, shuffle=True)

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('uer/gpt2-chinese-cluecorpussmall')
tokenizer.add_special_tokens({"additional_special_tokens": ["[P]"]})

model_config = GPT2Config.from_pretrained(
    pretrained_model_name_or_path='uer/gpt2-chinese-cluecorpussmall',
    num_labels=2)
model_config.num_labels = 15

model = GPT2ForSequenceClassification.from_pretrained(
    'uer/gpt2-chinese-cluecorpussmall', config=model_config)
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = model.config.eos_token_id
model.config.n_positions = 1024
assert model.config.num_labels == 15

model.to(device)
model.train()

import pdb; pdb.set_trace()

from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=1e-5)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
# Get model configuration.
print('Loading configuration...')
model_config = GPT2Config.from_pretrained(
    pretrained_model_name_or_path=model_name_or_path, num_labels=n_labels)

# Get model's tokenizer.
print('Loading tokenizer...')
tokenizer = GPT2Tokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name_or_path)

# Default to left padding.
tokenizer.padding_side = "left"
# Define PAD Token = EOS Token = 50256.
tokenizer.pad_token = tokenizer.eos_token

# Get the actual model.
print('Loading model...')
model = GPT2ForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_name_or_path, config=model_config)

# Resize model embedding to match the new tokenizer.
model.resize_token_embeddings(len(tokenizer))

# Fix model padding token id.
model.config.pad_token_id = model.config.eos_token_id

# Load model to defined device.
model.to(device)
print('Model loaded to `%s`' % device)

"""# Getting and training model"""

train_dataset = MovieReviewsDataset(use_tokenizer=tokenizer,
                                    labels=label_train, texts=text_train)
valid_dataset = MovieReviewsDataset(use_tokenizer=tokenizer,
                                    labels=label_test, texts=text_test)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train', type=str,
                        default='data/disaster_response_messages_training.csv')
    parser.add_argument('--test', type=str,
                        default='data/disaster_response_messages_test.csv')
    parser.add_argument('--validation', type=str,
                        default='data/disaster_response_messages_validation.csv')
    parser.add_argument('--epoch', type=str, default='10')
    parser.add_argument('--model', type=str, default='bert',
                        choices=['bert', 'bart', 'gpt2', 'roberta', 'xlnet'])
    args = parser.parse_args()

    EPOCH = int(args.epoch)
    model_name = args.model

    # create data loader for training and validation
    if model_name == 'bert':
        train_set = BertDataset(args.train)
        val_set = BertDataset(args.validation)
        test_set = BertDataset(args.test)
    elif model_name == 'bart':
        train_set = BartDataset(args.train)
        val_set = BartDataset(args.validation)
        test_set = BartDataset(args.test)
    elif model_name == 'gpt2':
        train_set = GPT2Dataset(args.train)
        val_set = GPT2Dataset(args.validation)
        test_set = GPT2Dataset(args.test)
    elif model_name == 'roberta':
        train_set = RobertaDataset(args.train)
        val_set = RobertaDataset(args.validation)
        test_set = RobertaDataset(args.test)
    elif model_name == 'xlnet':
        train_set = XLNetDataset(args.train)
        val_set = XLNetDataset(args.validation)
        test_set = XLNetDataset(args.test)

    train_loader = DataLoader(train_set, batch_size=20, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=20, shuffle=False)
    test_loader = DataLoader(test_set, batch_size=20, shuffle=False)
    print('Data Loaded.')

    if model_name == 'bert':
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased', num_labels=2)
    elif model_name == 'gpt2':
        model = GPT2ForSequenceClassification.from_pretrained(
            'gpt2', num_labels=2)
        model.config.pad_token_id = model.config.eos_token_id
    elif model_name == 'bart':
        model = BartForSequenceClassification.from_pretrained(
            'facebook/bart-base', num_labels=2)
    elif model_name == 'roberta':
        model = RobertaForSequenceClassification.from_pretrained(
            'roberta-base', num_labels=2)
    elif model_name == 'xlnet':
        model = XLNetForSequenceClassification.from_pretrained(
            'xlnet-base-cased', num_labels=2)

    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
    total_steps = len(train_loader) * EPOCH
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    criterion = nn.CrossEntropyLoss()

    print('\nModel: ', model_name, '\tEpochs: ', EPOCH)

    epoch_loss = []
    epoch_val_acc = []
    for epoch in range(EPOCH):
        tqdm.write('Epoch: {}'.format(epoch + 1))
        loss = train(model, train_loader, criterion, optimizer, scheduler)
        epoch_loss.append(loss)
        val_acc = val(model, val_loader)
        epoch_val_acc.append(val_acc)
        torch.save(model, model_name + '/' + model_name + '_model.pt')

    # model = torch.load(model_name + '_model.pt')
    tqdm.write('\nFinal test...')
    test_result = test(model, test_loader)

    with open(model_name + '/' + model_name + '_loss.p', 'wb') as f:
        pickle.dump(epoch_loss, f)
    with open(model_name + '/' + model_name + '_val_accuracy.p', 'wb') as f:
        pickle.dump(epoch_val_acc, f)
    with open(model_name + '/' + model_name + '_test_result.p', 'wb') as f:
        pickle.dump(test_result, f)
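# The `train`, `val`, and `test` helpers called in main() are not shown in this
# snippet. Below is a minimal, hypothetical sketch of what `train` could look
# like, assuming each batch yielded by the DataLoader is a tuple of
# (input_ids, attention_mask, labels) tensors already on the model's device.
def train(model, loader, criterion, optimizer, scheduler):
    model.train()
    total_loss = 0.0
    for input_ids, attention_mask, labels in loader:
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)  # logits: (batch, num_labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    return total_loss / len(loader)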
                                           batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size)

# ## Starting from scratch.
#
# Let's try to train a transformer model from scratch on this task.

# In[25]:

config = GPT2Config.from_pretrained(
    "distilgpt2",  # distilgpt2 is a smaller version of the gpt2 model
    output_attentions=True,
    pad_token_id=tokenizer.eos_token_id,
    num_labels=8)
model_0 = GPT2ForSequenceClassification(config=config).to(
    device)  # GPT2 for text classification

# Prepare the optimizer and criterion:

# In[26]:

lr = 1e-5  # A starting learning rate. It may need to be larger or smaller :)
optimizer = torch.optim.Adam(model_0.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
# scheduler = get_cosine_schedule_with_warmup(optimizer, 0, 10)

# Take a look at what the model returns
# ([docs](https://huggingface.co/transformers/model_doc/gpt2.html#gpt2forsequenceclassification))
# when you feed data into it:

# In[22]:
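# A minimal sketch of such a check (the exact batch format of the dataloaders
# is not shown in this snippet, so a single hand-made example is used instead):
sample = tokenizer("example text", return_tensors="pt").to(device)
with torch.no_grad():
    out = model_0(**sample)
# The model returns logits of shape (batch_size, num_labels); attention maps
# are also included because output_attentions=True was set in the config.
print(out.logits.shape)     # expected: torch.Size([1, 8])
print(len(out.attentions))  # one attention tensor per transformer layer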
tokenizer = GPT2Tokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name_or_path)

# Creating the tokenizer is pretty standard when using the Transformers library.
# After creating the tokenizer it is critical for this tutorial to set padding
# to the left and initialize the padding token to tokenizer.eos_token, which is
# GPT2's original end-of-sequence token. This is the most essential part of this
# tutorial, since GPT2 uses the last token for prediction, so we need to pad to
# the left.

# Default to left padding.
tokenizer.padding_side = "left"
# Define PAD Token = EOS Token = 50256.
tokenizer.pad_token = tokenizer.eos_token

# Get the actual model.
print('Loading model...')
model = GPT2ForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_name_or_path,
    config=model_config,
    cache_dir="/home/jovyan/data-vol-1/gpt2/models")
print(f"number of parameters: "
      f"{sum(p.numel() for p in model.parameters() if p.requires_grad)}")

# Resize model embedding to match the new tokenizer.
model.resize_token_embeddings(len(tokenizer))

# Fix model padding token id.
model.config.pad_token_id = model.config.eos_token_id

# Load model to the defined device.
model.to(device)
print('Model loaded to `%s`' % device)
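# A small illustration of the left-padding point above (a sketch reusing the
# tokenizer and model defined in this snippet): with padding_side="left" the
# pad tokens are prepended, so the last position of every row is a real token,
# which is the position GPT2ForSequenceClassification classifies from.
batch = tokenizer(["a short example", "a somewhat longer example sentence"],
                  padding=True, return_tensors="pt").to(device)
with torch.no_grad():
    logits = model(**batch).logits  # shape: (batch_size, n_labels)
print(logits.argmax(dim=-1))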