def train():
    parser = argparse.ArgumentParser()
    # the original snippet never parsed args, so args.model_name below was undefined;
    # parse the model name here so the rest of the function runs as written
    parser.add_argument('--model_name', type=str, default="distilbert-base-multilingual-cased")
    args = parser.parse_args()

    # load model and tokenizer
    # MODEL_NAME = "bert-base-multilingual-cased"
    MODEL_NAME = args.model_name  # e.g. "distilbert-base-multilingual-cased"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # load dataset
    train_dataset = load_data("../input/data/train/train.tsv")
    # dev_dataset = load_data("./dataset/train/dev.tsv")
    train_label = train_dataset['label'].values
    # dev_label = dev_dataset['label'].values

    # tokenizing dataset
    tokenized_train = tokenized_dataset(train_dataset, tokenizer)
    # tokenized_dev = tokenized_dataset(dev_dataset, tokenizer)

    # make dataset for pytorch.
    RE_train_dataset = RE_Dataset(tokenized_train, train_label)
    # RE_dev_dataset = RE_Dataset(tokenized_dev, dev_label)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # setting model hyperparameters
    bert_config = BertConfig.from_pretrained(MODEL_NAME)
    bert_config.num_labels = 42
    model = BertForSequenceClassification(bert_config)
    model.to(device)

    # Besides the options used here, many other options are available.
    # See https://huggingface.co/transformers/main_classes/trainer.html#trainingarguments
    training_args = TrainingArguments(
        output_dir=f'./results/{MODEL_NAME}',  # output directory
        save_total_limit=3,                    # total number of saved models
        save_steps=500,                        # model saving step
        # num_train_epochs=4,                  # total number of training epochs
        num_train_epochs=5,                    # total number of training epochs
        learning_rate=5e-5,                    # learning rate
        per_device_train_batch_size=16,        # batch size per device during training
        # per_device_eval_batch_size=16,       # batch size for evaluation
        warmup_steps=500,                      # number of warmup steps for learning rate scheduler
        weight_decay=0.01,                     # strength of weight decay
        logging_dir='./logs',                  # directory for storing logs
        logging_steps=100,                     # log saving step
        # evaluation_strategy='steps',         # evaluation strategy to adopt during training
        #                                      # `no`: No evaluation during training.
        #                                      # `steps`: Evaluate every `eval_steps`.
        #                                      # `epoch`: Evaluate every end of epoch.
        # eval_steps=500,                      # evaluation step
        # load_best_model_at_end=True,         # When set to True, the parameters save_strategy and save_steps
        #                                      # will be ignored and the model will be saved after each evaluation.
    )
    trainer = Trainer(
        model=model,                      # the instantiated 🤗 Transformers model to be trained
        args=training_args,               # training arguments, defined above
        train_dataset=RE_train_dataset,   # training dataset
        # eval_dataset=RE_dev_dataset,    # evaluation dataset
        # compute_metrics=compute_metrics # define metrics function
    )

    # train model
    trainer.train()
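# The Trainer above leaves `compute_metrics` commented out. A minimal sketch of such a
# function, assuming scikit-learn is available and plain accuracy is the metric of
# interest (the metric choice is illustrative, not taken from the snippet above):
from sklearn.metrics import accuracy_score


def compute_metrics(pred):
    labels = pred.label_ids              # ground-truth class ids
    preds = pred.predictions.argmax(-1)  # highest-scoring logit per example
    return {"accuracy": accuracy_score(labels, preds)}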
def create_and_check_for_sequence_classification(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    config.num_labels = self.num_labels
    model = BertForSequenceClassification(config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
    self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
def create_and_check_bert_for_sequence_classification(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    config.num_labels = self.num_labels
    model = BertForSequenceClassification(config)
    model.to(torch_device)
    model.eval()
    result = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
    self.parent.assertListEqual(list(result["logits"].size()), [self.batch_size, self.num_labels])
    self.check_loss_output(result)
def main(device='lazy', full_size=False):
    """
    Load model to specified device. Ensure that any backends have been initialized by this point.

    :param device: name of device to load tensors to
    :param full_size: if true, use a full pretrained bert-base-cased model instead of a smaller variant
    """
    torch.manual_seed(0)

    tokenized_datasets = tokenize_dataset(load_dataset('imdb'))
    small_train_dataset = tokenized_datasets['train'].shuffle(seed=42).select(range(2))
    train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)

    if full_size:
        model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=2)
    else:
        configuration = BertConfig(
            vocab_size=28996,
            hidden_size=32,
            num_hidden_layers=1,
            num_attention_heads=2,
            intermediate_size=32,
            hidden_act='gelu',
            hidden_dropout_prob=0.0,
            attention_probs_dropout_prob=0.0,
            max_position_embeddings=512,
            layer_norm_eps=1.0e-05,
        )
        model = BertForSequenceClassification(configuration)
    model.to(device)

    num_epochs = 3
    num_training_steps = num_epochs * len(train_dataloader)
    losses = train(model, num_epochs, num_training_steps, train_dataloader, device)

    # Get debug information from LTC
    if 'torch_mlir.reference_lazy_backend._REFERENCE_LAZY_BACKEND' in sys.modules:
        computation = lazy_backend.get_latest_computation()
        if computation:
            print(computation.debug_string())

    print('Loss: ', losses)

    return model, losses
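# Usage sketch for the entry point above (the 'lazy' device string assumes the
# torch-mlir lazy tensor backend has already been registered; 'cpu' also works):
#
# model, losses = main(device='cpu', full_size=False)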
def load(args, checkpoint_dir):
    state_dict = torch.load(os.path.join(checkpoint_dir, 'checkpoint.pth'))

    from collections import OrderedDict
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        if 'module' in k:
            namekey = k[7:]  # remove `module.`
        else:
            namekey = k
        new_state_dict[namekey] = v

    if args.model_type == 'bert':
        config = BertConfig.from_json_file(os.path.join(checkpoint_dir, 'config.bin'))
        model = BertForSequenceClassification(config)
        model.load_state_dict(new_state_dict)
    elif args.model_type == 'cnn':
        model = CNNModel(n_vocab=args.vocab_size, embed_size=args.embed_size,
                         num_classes=args.num_labels, num_filters=args.num_filters,
                         filter_sizes=args.filter_sizes, device=args.device)
        model.load_state_dict(new_state_dict)
    elif args.model_type == 'lstm':
        model = LSTMModel(n_vocab=args.vocab_size, embed_size=args.embed_size,
                          num_classes=args.num_labels, hidden_size=args.hidden_size,
                          device=args.device)
        model.load_state_dict(new_state_dict)
    elif args.model_type == 'char-cnn':
        model = CharCNN(num_features=args.num_features, num_classes=args.num_labels)
        model.load_state_dict(new_state_dict)
    else:
        raise ValueError('model type is not found!')

    return model.to(args.device)
def __init__(
    self,
    model: BertForSequenceClassification,
    tokenizer: BertTokenizer,
    max_length: int
):
    """
    :param model: pre-trained `BertForSequenceClassification` model
    :param tokenizer: tokenizer for the model
    :param max_length: maximum tokens in a sequence for the model
    """
    self.model = model
    self.tokenizer = tokenizer
    self.max_length = max_length
    self._set_up_device()
    model.to(self.device)
def __init__(self, model: BertForSequenceClassification, tokenizer: BertTokenizer, seed: int = 100):
    """
    :param model: `BertForSequenceClassification` model to train, num_labels should be set to 3
    :param tokenizer: tokenizer for the model
    :param seed: seed for reproducible results
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    self._set_up_device()
    self.model = model
    # a single move to the selected device is enough; the extra model.cuda() call in the
    # original would fail on CPU-only machines and is redundant when CUDA is available
    model.to(self.device)
    self.tokenizer = tokenizer
def main(args):
    """
    Runs inference on any file that has the same format as the given dataset tsv file.
    """
    seed_everything(args.seed)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # load tokenizer
    TOK_NAME = args.token
    if TOK_NAME == "monologg/kobert":
        tokenizer = KoBertTokenizer.from_pretrained(TOK_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(TOK_NAME)

    # load my model
    bert_config = BertConfig.from_pretrained(TOK_NAME)
    bert_config.num_labels = args.num_labels
    bert_config.num_hidden_layers = args.num_hidden_layers
    model = BertForSequenceClassification(bert_config)

    model_dir = os.path.join(args.model_dir, args.name)
    model_path = os.path.join(model_dir, 'best.pth')

    # load test dataset
    test_dataset_dir = "/opt/ml/input/data/test/test.tsv"
    test_dataset, test_label = load_test_dataset(test_dataset_dir, model, tokenizer, args)
    test_dataset = RE_Dataset(test_dataset, test_label)

    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)

    # predict answer
    batch_size = args.batch_size
    print("Inference Start!!!")
    pred_answer = inference(model, test_dataset, device, batch_size)

    # make csv file with predicted answer
    # Please keep the directory layout and column names used below.
    output = pd.DataFrame(pred_answer, columns=['pred'])
    save_dir = os.path.join(args.output_dir, args.name)
    os.makedirs(save_dir, exist_ok=True)
    output.to_csv(os.path.join(save_dir, f'{args.name}.csv'), index=False)
def test_classifier(model: BertForSequenceClassification, dataset: TensorDataset, batch_size: int):
    device = select_device()

    prediction_dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)

    print("")
    print("Running Prediction...")

    model.to(device)
    model.eval()

    predictions, true_labels = [], []

    for batch in prediction_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2]

        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        logits = outputs.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.numpy()

        # predictions.append(logits)
        predictions.extend(list(np.argmax(logits, axis=1).flatten()))
        true_labels.extend(list(label_ids))

    print('DONE.')

    return predictions, true_labels
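# Usage sketch, assuming `model` and a TensorDataset named `test_data` already exist in
# the caller's scope (the names are illustrative only):
#
# predictions, true_labels = test_classifier(model, test_data, batch_size=32)
# accuracy = np.mean(np.array(predictions) == np.array(true_labels))
# print("Accuracy: {:.4f}".format(accuracy))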
def inference_no_args(
    data: TensorDataset,
    loader: DataLoader,
    logger: Logger,
    model: BertForSequenceClassification,
    batch_size: int,
) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
    # note: the original annotation (List[float]) did not match the (predictions, states)
    # return value; the corrected annotation assumes `Tuple` is imported from typing
    device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
    predictions = []
    states = []

    logger.info("***** Running inference {} *****".format(""))
    logger.info("  Num examples = %d", len(data))
    logger.info("  Batch size = %d", batch_size)

    model.to(device)
    model.eval()
    for batch in tqdm(loader, desc="Inference"):
        batch = tuple(t.to(device) for t in batch)
        logits, state = model.forward(input_ids=batch[0],
                                      attention_mask=batch[1],
                                      token_type_ids=batch[2],
                                      output_hidden_states=True)
        predictions.extend(logits.cpu())
        states.extend(state[-1][:, 0, :].cpu())

    return predictions, states
def load(args, checkpoint_dir):
    state_dict = torch.load(os.path.join(checkpoint_dir, 'checkpoint.pth'))

    from collections import OrderedDict
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        if 'module' in k:
            namekey = k[7:]  # remove `module.`
        else:
            namekey = k
        new_state_dict[namekey] = v

    if args.model_type == 'bert':
        config = BertConfig.from_json_file(os.path.join(checkpoint_dir, 'config.bin'))
        model = BertForSequenceClassification(config)
        model.load_state_dict(new_state_dict)
    elif args.model_type == 'bow':
        model = BOWModel(new_state_dict['embedding.weight'], n_vocab=args.vocab_size,
                         embed_size=args.embed_size, hidden_size=args.hidden_size,
                         num_classes=args.num_labels)
        model.load_state_dict(new_state_dict)
    elif args.model_type == 'decom_att':
        model = DecompAttentionModel(args.word_mat, n_vocab=args.vocab_size,
                                     embed_size=args.embed_size, hidden_size=args.hidden_size,
                                     num_classes=args.num_labels)
        model.load_state_dict(new_state_dict)
    elif args.model_type == 'esim':
        model = ESIM(vocab_size=args.vocab_size, embedding_dim=args.embed_size,
                     hidden_size=args.hidden_size, embeddings=None, padding_idx=0,
                     dropout=0.1, num_classes=args.num_labels, device=args.device)
        model.load_state_dict(new_state_dict)
    else:
        raise ValueError('model type is not found!')

    return model.to(args.device)
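# Usage sketch for the loader above; `args` is whatever argparse namespace the caller
# built (attribute names follow the branches above, the checkpoint path is illustrative):
#
# model = load(args, checkpoint_dir='./checkpoints/bert_run')
# model.eval()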
def model_infer(config, test_load, k):
    print("***********load model weight*****************")
    model_config = BertConfig()  # the original had a duplicated `model_config = model_config =` assignment
    model_config.vocab_size = len(pd.read_csv('../user_data/vocab', names=["score"]))
    model = BertForSequenceClassification(config=model_config)
    model.load_state_dict(
        torch.load('../user_data/save_model/{}_best_model.pth.tar'.format(config.model_name))['status'])
    model = model.to(config.device)

    print("***********make predict for test file*****************")
    model.eval()
    predict_all = []
    with torch.no_grad():
        for batch, (input_ids, token_type_ids, attention_mask, label) in enumerate(test_load):
            input_ids = input_ids.to(config.device)
            attention_mask = attention_mask.to(config.device)
            token_type_ids = token_type_ids.to(config.device)

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids)
            logits = outputs.logits
            pred_pob = torch.nn.functional.softmax(logits, dim=1)[:, 1]
            predict_all.extend(list(pred_pob.detach().cpu().numpy()))

    # submit_result(predict)
    # column name fixed from the original "_socre" typo, consistently for write and read
    if k == 0:
        df = pd.DataFrame(predict_all, columns=["{}_score".format(k + 1)])
        df.to_csv('./{}_result.csv'.format(config.model_name), index=False)
    else:
        df = pd.read_csv('./{}_result.csv'.format(config.model_name))
        df["{}_score".format(k + 1)] = predict_all
        df.to_csv('./{}_result.csv'.format(config.model_name), index=False)
    print("***********done*****************")
def __init__(
    self,
    args: argparse.ArgumentParser,
    model: BertForSequenceClassification = None,
    data_collator: Optional[DataCollator] = None,
    train_dataset: Optional[Dataset] = None,
    eval_dataset: Optional[Dataset] = None,
    compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
    prediction_loss_only=False,
):
    super(Trainer, self).__init__()
    self.args = args
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.model = model.to(self.device)
    self.train_dataset = train_dataset
    self.eval_dataset = eval_dataset
    self.compute_metrics = compute_metrics
    self.prediction_loss_only = prediction_loss_only
    if data_collator is not None:
        self.data_collator = data_collator
    else:
        self.data_collator = DefaultDataCollator()
# Define models
bert_config = BertConfig.from_json_file('bert_config/bert_config.json')
bert_config_T3 = BertConfig.from_json_file('bert_config/bert_config_T3.json')
bert_config.output_hidden_states = True
bert_config_T3.output_hidden_states = True

teacher_model = BertForSequenceClassification(bert_config)  # , num_labels = 2
# Teacher should be initialized with pre-trained weights and fine-tuned on the downstream task.
# For the demonstration purpose, we omit these steps here
student_model = BertForSequenceClassification(bert_config_T3)  # , num_labels = 2

teacher_model.to(device=device)
student_model.to(device=device)


# Define Dict Dataset
class DictDataset(Dataset):
    def __init__(self, all_input_ids, all_attention_mask, all_labels):
        assert len(all_input_ids) == len(all_attention_mask) == len(all_labels)
        self.all_input_ids = all_input_ids
        self.all_attention_mask = all_attention_mask
        self.all_labels = all_labels

    def __getitem__(self, index):
        return {
            'input_ids': self.all_input_ids[index],
            'attention_mask': self.all_attention_mask[index],
    label = torch.tensor(data=label).type(torch.LongTensor)
    return input_ids, token_type_ids, attention_mask, label


print("***********load test data*****************")
config = roBerta_Config()
vocab = Vocab()
train_data, valid_data, test_data = vocab.get_train_dev_test()
test_dataset = BuildDataSet(test_data)
test_load = DataLoader(dataset=test_dataset,
                       batch_size=config.batch_size,
                       shuffle=False,
                       collate_fn=collate_fn)

print("***********load model weight*****************")
model_config = BertConfig.from_pretrained(
    pretrained_model_name_or_path="bert_source/bert_config.json")
model = BertForSequenceClassification(config=model_config)
model.load_state_dict(torch.load('save_bert/best_model.pth.tar'))
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
config.device = device

print("***********make predict for test file*****************")
predict = model_infer(model, config, test_load)
submit_result(predict)
print("***********done*****************")
class AdapterCompositionTest(unittest.TestCase): def setUp(self): self.model = BertForSequenceClassification(BertConfig()) self.model.add_adapter("a") self.model.add_adapter("b") self.model.add_adapter("c") self.model.add_adapter("d") self.model.to(torch_device) self.model.train() def training_pass(self): inputs = {} inputs["input_ids"] = ids_tensor((1, 128), 1000) inputs["labels"] = torch.ones(1, dtype=torch.long) loss = self.model(**inputs).loss loss.backward() def batched_training_pass(self): inputs = {"input_ids": ids_tensor((4, 128), 1000), "labels": torch.ones(4, dtype=torch.long)} loss = self.model(**inputs).loss loss.backward() def test_simple_split(self): # pass over split setup self.model.set_active_adapters(Split("a", "b", 64)) self.training_pass() def test_stacked_split(self): # split into two stacks self.model.set_active_adapters(Split(Stack("a", "b"), Stack("c", "d"), split_index=64)) self.training_pass() def test_stacked_fusion(self): self.model.add_adapter_fusion(Fuse("b", "d")) # fuse two stacks self.model.set_active_adapters(Fuse(Stack("a", "b"), Stack("c", "d"))) self.training_pass() def test_mixed_stack(self): self.model.add_adapter_fusion(Fuse("a", "b")) self.model.set_active_adapters(Stack("a", Split("c", "d", split_index=64), Fuse("a", "b"))) self.training_pass() def test_nested_split(self): # split into two stacks self.model.set_active_adapters(Split(Split("a", "b", split_index=32), "c", split_index=64)) self.training_pass() def test_parallel(self): self.model.set_active_adapters(Parallel("a", "b", "c", "d")) inputs = {} inputs["input_ids"] = ids_tensor((1, 128), 1000) logits = self.model(**inputs).logits self.assertEqual(logits.shape, (4, 2)) def test_nested_parallel(self): self.model.set_active_adapters(Stack("a", Parallel(Stack("b", "c"), "d"))) inputs = {} inputs["input_ids"] = ids_tensor((1, 128), 1000) logits = self.model(**inputs).logits self.assertEqual(logits.shape, (2, 2)) def test_batch_split(self): self.model.set_active_adapters(BatchSplit("a", "b", "c", batch_sizes=[1, 1, 2])) self.batched_training_pass() def test_batch_split_int(self): self.model.set_active_adapters(BatchSplit("a", "b", batch_sizes=2)) self.batched_training_pass() def test_nested_batch_split(self): self.model.set_active_adapters(Stack("a", BatchSplit("b", "c", batch_sizes=[2, 2]))) self.batched_training_pass() def test_batch_split_invalid(self): self.model.set_active_adapters(BatchSplit("a", "b", batch_sizes=[3, 4])) with self.assertRaises(IndexError): self.batched_training_pass() def test_batch_split_equivalent(self): self.model.set_active_adapters("a") self.model.eval() input_ids = ids_tensor((2, 128), 1000) output_a = self.model(input_ids[:1]) self.model.set_active_adapters("b") output_b = self.model(input_ids[1:2]) self.model.set_active_adapters(BatchSplit("a", "b", batch_sizes=[1, 1])) output = self.model(input_ids) self.assertTrue(torch.allclose(output_a[0], output[0][0], atol=1e-6)) self.assertTrue(torch.allclose(output_b[0], output[0][1], atol=1e-6))
class Classifier: """The Classifier""" ############################################# def __init__(self, train_batch_size=16, eval_batch_size=8, max_length=128, lr=2e-5, eps=1e-6, n_epochs=11): """ :param train_batch_size: (int) Training batch size :param eval_batch_size: (int) Batch size while using the `predict` method. :param max_length: (int) Maximum length for padding :param lr: (float) Learning rate :param eps: (float) Adam optimizer epsilon parameter :param n_epochs: (int) Number of epochs to train """ # model parameters self.train_batch_size = train_batch_size self.eval_batch_size = eval_batch_size self.max_length = max_length self.lr = lr self.eps = eps self.n_epochs = n_epochs # Information to be set or updated later self.trainset = None self.categories = None self.labels = None self.model = None # Tokenizer self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # The model # # We first need to specify some configurations to the model configs = BertConfig.from_pretrained( 'bert-base-uncased', num_labels=3, type_vocab_size=8) # BERT configuration self.model = BertForSequenceClassification(configs) # We are changing the header classifier of the model (Which is initially a simple fully connect layer layer) clf = Net() self.model.classifier = clf self.model.to( device ) # putting the model on GPU if available otherwise device is CPU def preprocess(self, sentences): """ The preprocessing function :param sentences: List of all sentences to be given at once. :return: List of preprocessed sentences. """ preprocessed = [] for sentence in tqdm(sentences): assert isinstance(sentence, str) doc = nlp(str(sentence)) tokens = [] for token in doc: if (not token.is_punct) or (token.text not in [ ',', '-', '.', "'", '!' ]): # Some punctuations can be interesting for BERT tokens.append(token.text) tokens = (' '.join(tokens)).lower().replace(" '", "'") preprocessed.append(tokens) return preprocessed def question(self, category): """ Computes the questions corresponding to each category :param category: (str) The category/aspect :return: (str) computed question using the QA-M task """ assert category in self.categories if category == 'AMBIENCE#GENERAL': return "what do you think of the ambience of it ?" elif category == 'DRINKS#PRICES' or category == 'FOOD#PRICES' or category == 'RESTAURANT#PRICES': return "what do you think of the price of it ?" elif category == 'DRINKS#QUALITY' or category == 'FOOD#QUALITY': return "what do you think of the quality of it ?" elif category == 'DRINKS#STYLE_OPTIONS': return "what do you think of drinks ?" elif category == 'FOOD#STYLE_OPTIONS': return "what do you think of the food ?" elif category == 'LOCATION#GENERAL': return "what do you think of the location of it ?" elif category == 'RESTAURANT#GENERAL' or category == 'RESTAURANT#MISCELLANEOUS': return "what do you think of the restaurant ?" elif category == 'SERVICE#GENERAL': return "what do you think of the service of it ?" 
def train(self, trainfile): """Trains the classifier model on the training set stored in file trainfile""" # Loading the data and splitting up its information in lists print("\n Loading training data...") trainset = np.genfromtxt(trainfile, delimiter='\t', dtype=str, comments=None) self.trainset = trainset n = len(trainset) targets = trainset[:, 0] categories = trainset[:, 1] self.labels = list(Counter(targets).keys()) # label names self.categories = list(Counter(categories).keys()) # category names start_end = [[int(x) for x in w.split(':')] for w in trainset[:, 3]] # target words words_of_interest = [ trainset[:, 4][i][start_end[i][0]:start_end[i][1]] for i in range(n) ] # sentences to be classified sentences = [str(s) for s in trainset[:, 4]] # Preprocessing the text data print(" Preprocessing the text data...") sentences = self.preprocess(sentences) # Computing question sequences print(" Computing questions...") questions = [self.question(categories[i]) for i in tqdm(range(n))] # Tokenization attention_masks = [] input_ids = [] token_type_ids = [] labels = [] for word, question, answer in zip(words_of_interest, questions, sentences): encoded_dict = self.tokenizer.encode_plus( answer, question + ' ' + word.lower(), add_special_tokens=True, # Add '[CLS]' and '[SEP]' tokens max_length=self.max_length, # Pad & truncate all sequences pad_to_max_length=True, return_attention_mask=True, # Construct attention masks return_tensors='pt', # Return pytorch tensors. ) attention_masks.append(encoded_dict['attention_mask']) input_ids.append(encoded_dict['input_ids']) token_type_ids.append(encoded_dict['token_type_ids']) attention_masks = torch.cat(attention_masks, dim=0) input_ids = torch.cat(input_ids, dim=0) token_type_ids = torch.cat(token_type_ids, dim=0) # Converting polarities into integers (0: positive, 1: negative, 2: neutral) for target in targets: if target == 'positive': labels.append(0) elif target == 'negative': labels.append(1) elif target == 'neutral': labels.append(2) labels = torch.tensor(labels) # Pytorch data iterators train_data = TensorDataset(input_ids, attention_masks, token_type_ids, labels) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, batch_size=self.train_batch_size, sampler=train_sampler) # Optimizer and scheduler (we are using a linear scheduler without warm up) no_decay = ['bias', 'gamma', 'beta'] # These parameters are not going to be decreased optimizer_parameters = [{ 'params': [ p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [ p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_parameters, lr=self.lr, eps=self.eps) total_steps = len(train_dataloader) * self.n_epochs scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=0, num_training_steps=total_steps) # Training initial_t0 = time.time() for epoch in range(self.n_epochs): print('\n ======== Epoch %d / %d ========' % (epoch + 1, self.n_epochs)) print(' Training...\n') t0 = time.time() total_train_loss = 0 self.model.train() for step, batch in enumerate(train_dataloader): batch = tuple(t.to(device) for t in batch) input_ids_, input_mask_, segment_ids_, label_ids_ = batch self.model.zero_grad() loss, _ = self.model(input_ids_, token_type_ids=segment_ids_, attention_mask=input_mask_, labels=label_ids_) total_train_loss += loss.item() loss.backward() # clip gradient norm 
torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0) optimizer.step() scheduler.step() avg_train_loss = total_train_loss / len(train_dataloader) training_time = format_time(time.time() - t0) # print(" Average training loss: {0:.2f}".format(avg_train_loss)) print(" Training epoch duration: {:}".format(training_time)) print(" Total training time: {:}".format( format_time(time.time() - initial_t0))) def predict(self, datafile): """Predicts class labels for the input instances in file 'datafile' Returns the list of predicted labels """ # Loading the data and splitting up its information in lists evalset = np.genfromtxt(datafile, delimiter='\t', dtype=str, comments=None) m = len(evalset) categories = evalset[:, 1] start_end = [[int(x) for x in w.split(':')] for w in evalset[:, 3]] # target words words_of_interest = [ evalset[:, 4][i][start_end[i][0]:start_end[i][1]] for i in range(m) ] # sentences to be classified sentences = [str(s) for s in evalset[:, 4]] # Preprocessing the text data print("\n Preprocessing the text data...") sentences = self.preprocess(sentences) # Computing question sequences print(" Computing questions...") questions = [self.question(categories[i]) for i in tqdm(range(m))] # Tokenization attention_masks = [] input_ids = [] token_type_ids = [] for word, question, answer in zip(words_of_interest, questions, sentences): encoded_dict = self.tokenizer.encode_plus( answer, question + ' ' + word.lower(), add_special_tokens=True, # Add '[CLS]' and '[SEP]' max_length=self.max_length, # Pad & truncate all sequences pad_to_max_length=True, return_attention_mask=True, # Construct attention masks return_tensors='pt', # Return pytorch tensors. ) attention_masks.append(encoded_dict['attention_mask']) input_ids.append(encoded_dict['input_ids']) token_type_ids.append(encoded_dict['token_type_ids']) attention_masks = torch.cat(attention_masks, dim=0) input_ids = torch.cat(input_ids, dim=0) token_type_ids = torch.cat(token_type_ids, dim=0) # Pytorch data iterators eval_data = TensorDataset(input_ids, attention_masks, token_type_ids) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, batch_size=self.eval_batch_size, sampler=eval_sampler) # Prediction named_labels = [] self.model.eval() for batch in eval_dataloader: batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids = batch with torch.no_grad(): logits = self.model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)[0] logits = softmax(logits, dim=-1) logits = logits.detach().cpu().numpy() outputs = np.argmax(logits, axis=1) # converting integer labels into named labels for label in outputs: if label == 0: named_labels.append('positive') elif label == 1: named_labels.append('negative') elif label == 2: named_labels.append('neutral') return np.array(named_labels)
def train_process(config, train_load, train_sampler, model_name):
    # load source bert weights
    model_config = BertConfig.from_pretrained(
        pretrained_model_name_or_path="../user_data/bert_source/{}_config.json".format(model_name))
    # model_config = BertConfig()
    model_config.vocab_size = len(pd.read_csv('../user_data/vocab', names=["score"]))
    model = BertForSequenceClassification(config=model_config)

    checkpoint = torch.load(
        '../user_data/save_bert/{}_checkpoint.pth.tar'.format(model_name),
        map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint['status'], strict=False)
    print('***********load pretrained mlm {} weight*************'.format(model_name))

    for param in model.parameters():
        param.requires_grad = True

    # 4) move the model to its GPU before wrapping it with DistributedDataParallel
    model = model.to(config.device)

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": config.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate)
    # t_total = len(train_load) * config.num_train_epochs
    # scheduler = get_linear_schedule_with_warmup(
    #     optimizer, num_warmup_steps=t_total * config.warmup_proportion, num_training_steps=t_total
    # )

    cudnn.benchmark = True

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # 5) wrap the model for distributed training
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[config.local_rank])

    model.train()
    if config.fgm:
        fgm = FGM(model)

    for epoch in range(config.num_train_epochs):
        train_sampler.set_epoch(epoch)
        torch.cuda.empty_cache()

        for batch, (input_ids, token_type_ids, attention_mask, label) in enumerate(train_load):
            input_ids = input_ids.cuda(config.local_rank, non_blocking=True)
            attention_mask = attention_mask.cuda(config.local_rank, non_blocking=True)
            token_type_ids = token_type_ids.cuda(config.local_rank, non_blocking=True)
            label = label.cuda(config.local_rank, non_blocking=True)

            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            labels=label)
            loss = outputs.loss

            model.zero_grad()
            loss.backward()
            # torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)

            if config.fgm:
                fgm.attack()  # add adversarial perturbation to the embeddings
                loss_adv = model(input_ids=input_ids,
                                 attention_mask=attention_mask,
                                 token_type_ids=token_type_ids,
                                 labels=label).loss
                loss_adv.backward()  # backprop, accumulating adversarial gradients on top of the normal ones
                fgm.restore()  # restore the embedding parameters

            optimizer.step()
            # scheduler.step()

        # dev_auc = model_evaluate(config, model, valid_load)

        # synchronize all processes before aggregating the distributed loss/metrics
        torch.distributed.barrier()
        # reduce_dev_auc = reduce_auc(dev_auc, config.nprocs).item()

        # if reduce_dev_auc > best_dev_auc:
        #     best_dev_auc = reduce_dev_auc
        #     is_best = True

        now = strftime("%Y-%m-%d %H:%M:%S", localtime())
        msg = 'model_name:{},time:{},epoch:{}/{}'

        if config.local_rank in [0, -1]:
            print(msg.format(model_name, now, epoch + 1, config.num_train_epochs))
            checkpoint = {"status": model.module.state_dict()}
            torch.save(checkpoint,
                       '../user_data/save_model' + os.sep + '{}_checkpoint.pth.tar'.format(model_name))
            del checkpoint

        torch.distributed.barrier()
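# Launch sketch (assumption: train_process() is started once per GPU under torch.distributed,
# e.g. `python -m torch.distributed.launch --nproc_per_node=4 run.py`, so that
# config.local_rank and the NCCL process group are initialized before it is called).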
model = BertModel.from_pretrained('./model/bert_pre58_4/pytorch_model.bin', config=config)
model.cuda()
model = torch.nn.DataParallel(model, device_ids=[0, 1, 2, 3]).cuda()
model.to(device)

save_offset = 12

supreme_config = BertConfig.from_json_file('./dataset/bert_config.json')
supreme_config.num_labels = len(myDataset.cls_label_2_id)
model_ = BertForSequenceClassification(config=supreme_config)
model_.cuda()
model_ = torch.nn.DataParallel(model_, device_ids=[0, 1, 2, 3]).cuda()
model_.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam([{'params': model.parameters(), 'lr': 5e-5},
                        {'params': textCNN.parameters(), 'lr': 1e-3}],
                       lr=1e-3, weight_decay=0.)

# %%
losses = []
num_epochs = 30

for epoch in range(num_epochs):
    train_count = 0
    train_loss = 0
    train_acc = []
    train_iter = tqdm(dataiter)
    for sentences, attn_masks, std_ids, _, _ in train_iter:
def train_classifier(model: BertForSequenceClassification,
                     dataset: TensorDataset,
                     validation_ratio: float,
                     batch_size: int,
                     freeze_embeddings_layer: bool,
                     freeze_encoder_layers: int,
                     epochs: int) -> (BertForSequenceClassification, list):
    device = select_device()

    train_size = int(validation_ratio * len(dataset))
    val_size = len(dataset) - train_size

    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
    validation_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

    modules = []
    if freeze_embeddings_layer:
        modules.append(model.bert.embeddings)

    for i in range(freeze_encoder_layers):
        modules.append(model.bert.encoder.layer[i])

    for module in modules:
        for param in module.parameters():
            param.requires_grad = False

    model.to(device)

    optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=5e-5)

    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    training_stats = []
    total_t0 = time.time()

    for epoch_i in range(0, epochs):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        t0 = time.time()
        total_train_loss = 0

        model.train()

        for step, batch in enumerate(train_dataloader):
            if step % 40 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print(' Batch {:>5,} of {:>5,}. Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad()

            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            logits = outputs.logits

            total_train_loss += loss.item()
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)
        training_time = format_time(time.time() - t0)

        print("")
        print(" Average training loss: {0:.2f}".format(avg_train_loss))
        print(" Training epoch took: {:}".format(training_time))

        print("")
        print("Running Validation...")

        t0 = time.time()
        model.eval()

        total_eval_accuracy = 0
        total_eval_loss = 0
        nb_eval_steps = 0

        for batch in validation_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            with torch.no_grad():
                outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
                loss = outputs.loss
                logits = outputs.logits

            total_eval_loss += loss.item()

            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.cpu().numpy()

            total_eval_accuracy += flat_accuracy(logits, label_ids)

        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        print(" Accuracy: {0:.2f}".format(avg_val_accuracy))

        avg_val_loss = total_eval_loss / len(validation_dataloader)
        validation_time = format_time(time.time() - t0)

        print(" Validation Loss: {0:.2f}".format(avg_val_loss))
        print(" Validation took: {:}".format(validation_time))

        training_stats.append({
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        })

    print("")
    print("Training complete!")
    print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))

    return model, training_stats
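# A minimal end-to-end sketch tying train_classifier and test_classifier together.
# `train_data` and `test_data` are assumed to be TensorDatasets of
# (input_ids, attention_mask, labels); all names and hyperparameters are illustrative:
#
# model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
# model, stats = train_classifier(model, train_data, validation_ratio=0.9, batch_size=32,
#                                 freeze_embeddings_layer=True, freeze_encoder_layers=4,
#                                 epochs=3)
# predictions, true_labels = test_classifier(model, test_data, batch_size=32)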
class AdapterCompositionTest(unittest.TestCase):
    def setUp(self):
        self.model = BertForSequenceClassification(BertConfig())
        self.model.add_adapter("a")
        self.model.add_adapter("b")
        self.model.add_adapter("c")
        self.model.add_adapter("d")
        self.model.to(torch_device)
        self.model.train()

    def training_pass(self):
        inputs = {}
        inputs["input_ids"] = ids_tensor((1, 128), 1000)
        inputs["labels"] = torch.ones(1, dtype=torch.long)
        loss = self.model(**inputs).loss
        loss.backward()

    def test_simple_split(self):
        # pass over split setup
        self.model.set_active_adapters(Split("a", "b", 64))
        self.training_pass()

    def test_stacked_split(self):
        # split into two stacks
        self.model.set_active_adapters(Split(Stack("a", "b"), Stack("c", "d"), split_index=64))
        self.training_pass()

    def test_stacked_fusion(self):
        self.model.add_fusion(Fuse("b", "d"))
        # fuse two stacks
        self.model.set_active_adapters(Fuse(Stack("a", "b"), Stack("c", "d")))
        self.training_pass()

    def test_mixed_stack(self):
        self.model.add_fusion(Fuse("a", "b"))
        self.model.set_active_adapters(Stack("a", Split("c", "d", split_index=64), Fuse("a", "b")))
        self.training_pass()

    def test_nested_split(self):
        # split into two stacks
        self.model.set_active_adapters(Split(Split("a", "b", split_index=32), "c", split_index=64))
        self.training_pass()

    def test_parallel(self):
        self.model.set_active_adapters(Parallel("a", "b", "c", "d"))

        inputs = {}
        inputs["input_ids"] = ids_tensor((1, 128), 1000)
        logits = self.model(**inputs).logits
        self.assertEqual(logits.shape, (4, 2))

    def test_nested_parallel(self):
        self.model.set_active_adapters(Stack("a", Parallel(Stack("b", "c"), "d")))

        inputs = {}
        inputs["input_ids"] = ids_tensor((1, 128), 1000)
        logits = self.model(**inputs).logits
        self.assertEqual(logits.shape, (2, 2))
class TorchBertClassifierModel(TorchModel): """Bert-based model for text classification on PyTorch. It uses output from [CLS] token and predicts labels using linear transformation. Args: n_classes: number of classes pretrained_bert: pretrained Bert checkpoint path or key title (e.g. "bert-base-uncased") one_hot_labels: set True if one-hot encoding for labels is used multilabel: set True if it is multi-label classification return_probas: set True if return class probabilites instead of most probable label needed attention_probs_keep_prob: keep_prob for Bert self-attention layers hidden_keep_prob: keep_prob for Bert hidden layers optimizer: optimizer name from `torch.optim` optimizer_parameters: dictionary with optimizer's parameters, e.g. {'lr': 0.1, 'weight_decay': 0.001, 'momentum': 0.9} clip_norm: clip gradients by norm coefficient bert_config_file: path to Bert configuration file (not used if pretrained_bert is key title) """ def __init__(self, n_classes, pretrained_bert, one_hot_labels: bool = False, multilabel: bool = False, return_probas: bool = False, attention_probs_keep_prob: Optional[float] = None, hidden_keep_prob: Optional[float] = None, optimizer: str = "AdamW", optimizer_parameters: dict = { "lr": 1e-3, "weight_decay": 0.01, "betas": (0.9, 0.999), "eps": 1e-6 }, clip_norm: Optional[float] = None, bert_config_file: Optional[str] = None, **kwargs) -> None: self.return_probas = return_probas self.one_hot_labels = one_hot_labels self.multilabel = multilabel self.pretrained_bert = pretrained_bert self.bert_config_file = bert_config_file self.attention_probs_keep_prob = attention_probs_keep_prob self.hidden_keep_prob = hidden_keep_prob self.n_classes = n_classes self.clip_norm = clip_norm if self.multilabel and not self.one_hot_labels: raise RuntimeError( 'Use one-hot encoded labels for multilabel classification!') if self.multilabel and not self.return_probas: raise RuntimeError( 'Set return_probas to True for multilabel classification!') super().__init__(optimizer=optimizer, optimizer_parameters=optimizer_parameters, **kwargs) def train_on_batch(self, features: List[InputFeatures], y: Union[List[int], List[List[int]]]) -> Dict: """Train model on given batch. This method calls train_op using features and y (labels). Args: features: batch of InputFeatures y: batch of labels (class id or one-hot encoding) Returns: dict with loss and learning_rate values """ input_ids = [f.input_ids for f in features] input_masks = [f.attention_mask for f in features] b_input_ids = torch.cat(input_ids, dim=0).to(self.device) b_input_masks = torch.cat(input_masks, dim=0).to(self.device) b_labels = torch.from_numpy(np.array(y)).to(self.device) self.optimizer.zero_grad() loss, logits = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_masks, labels=b_labels) loss.backward() # Clip the norm of the gradients to 1.0. # This is to help prevent the "exploding gradients" problem. if self.clip_norm: torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip_norm) self.optimizer.step() if self.lr_scheduler is not None: self.lr_scheduler.step() return {'loss': loss.item()} def __call__( self, features: List[InputFeatures] ) -> Union[List[int], List[List[float]]]: """Make prediction for given features (texts). 
Args: features: batch of InputFeatures Returns: predicted classes or probabilities of each class """ input_ids = [f.input_ids for f in features] input_masks = [f.attention_mask for f in features] b_input_ids = torch.cat(input_ids, dim=0).to(self.device) b_input_masks = torch.cat(input_masks, dim=0).to(self.device) with torch.no_grad(): # Forward pass, calculate logit predictions logits = self.model(b_input_ids, token_type_ids=None, attention_mask=b_input_masks) logits = logits[0] if self.return_probas: if not self.multilabel: pred = torch.nn.functional.softmax(logits, dim=-1) else: pred = torch.nn.functional.sigmoid(logits) pred = pred.detach().cpu().numpy() else: logits = logits.detach().cpu().numpy() pred = np.argmax(logits, axis=1) return pred @overrides def load(self, fname=None): if fname is not None: self.load_path = fname if self.pretrained_bert and not Path(self.pretrained_bert).is_file(): self.model = BertForSequenceClassification.from_pretrained( self.pretrained_bert, num_labels=self.n_classes, output_attentions=False, output_hidden_states=False) elif self.bert_config_file and Path(self.bert_config_file).is_file(): self.bert_config = BertConfig.from_json_file( str(expand_path(self.bert_config_file))) if self.attention_probs_keep_prob is not None: self.bert_config.attention_probs_dropout_prob = 1.0 - self.attention_probs_keep_prob if self.hidden_keep_prob is not None: self.bert_config.hidden_dropout_prob = 1.0 - self.hidden_keep_prob self.model = BertForSequenceClassification(config=self.bert_config) else: raise ConfigError("No pre-trained BERT model is given.") self.model.to(self.device) self.optimizer = getattr(torch.optim, self.optimizer_name)( self.model.parameters(), **self.optimizer_parameters) if self.lr_scheduler_name is not None: self.lr_scheduler = getattr(torch.optim.lr_scheduler, self.lr_scheduler_name)( self.optimizer, **self.lr_scheduler_parameters) if self.load_path: log.info(f"Load path {self.load_path} is given.") if isinstance(self.load_path, Path) and not self.load_path.parent.is_dir(): raise ConfigError("Provided load path is incorrect!") weights_path = Path(self.load_path.resolve()) weights_path = weights_path.with_suffix(f".pth.tar") if weights_path.exists(): log.info(f"Load path {weights_path} exists.") log.info( f"Initializing `{self.__class__.__name__}` from saved.") # now load the weights, optimizer from saved log.info(f"Loading weights from {weights_path}.") checkpoint = torch.load(weights_path, map_location=self.device) self.model.load_state_dict(checkpoint["model_state_dict"]) self.optimizer.load_state_dict( checkpoint["optimizer_state_dict"]) self.epochs_done = checkpoint.get("epochs_done", 0) else: log.info( f"Init from scratch. Load path {weights_path} does not exist." )
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default='/hdd/lujunyu/dataset/multi_turn_corpus/ubuntu/',
                        type=str,
                        required=False,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--task_name",
                        default='ubuntu',
                        type=str,
                        required=False,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default='/hdd/lujunyu/model/chatbert/ubuntu_base_si/',
                        type=str,
                        required=False,
                        help="The output directory where the model checkpoints will be written.")
    parser.add_argument("--init_checkpoint",
                        default='/hdd/lujunyu/model/chatbert/ubuntu_base_si_aug/model.pt',
                        type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")

    ## Other parameters
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_lower_case",
                        default=False,
                        action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--max_seq_length",
                        default=256,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--eval_batch_size",
                        default=2000,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    bert_config = BertConfig.from_pretrained('bert-base-uncased')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=args.do_lower_case)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}"
            .format(args.max_seq_length, bert_config.max_position_embeddings))

    test_dataset = UbuntuDataset(file_path=os.path.join(args.data_dir, "test.txt"),
                                 max_seq_length=args.max_seq_length,
                                 tokenizer=tokenizer)
    test_dataloader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=args.eval_batch_size,
                                                  sampler=SequentialSampler(test_dataset),
                                                  num_workers=4)

    # load directly from the checkpoint; the original instantiated an unused model first
    # and then called from_pretrained on the instance
    model = BertForSequenceClassification.from_pretrained(args.init_checkpoint, config=bert_config)
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    logger.info("***** Running testing *****")
    logger.info("  Num examples = %d", len(test_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)

    f = open(os.path.join(args.output_dir, 'logits_test.txt'), 'w')

    model.eval()
    test_loss = 0
    nb_test_steps, nb_test_examples = 0, 0
    for input_ids, input_mask, segment_ids, label_ids in tqdm(test_dataloader, desc="Step"):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)  # labels must share the model's device for the loss computation

        with torch.no_grad():
            tmp_test_loss, logits = model(input_ids,
                                          token_type_ids=segment_ids,
                                          attention_mask=input_mask,
                                          labels=label_ids)

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()

        for logit, label in zip(logits, label_ids):
            logit = '{},{}'.format(logit[0], logit[1])
            f.write('_\t{}\t{}\n'.format(logit, label))

        test_loss += tmp_test_loss.mean().item()
        nb_test_examples += input_ids.size(0)
        nb_test_steps += 1

    f.close()
    test_loss = test_loss / nb_test_steps
    result = evaluate(os.path.join(args.output_dir, 'logits_test.txt'))
    result.update({'test_loss': test_loss})

    output_eval_file = os.path.join(args.output_dir, "results_test.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Test results *****")
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default=None, type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument("--model_name_or_path", default=None, type=str, required=True, help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) parser.add_argument("--task_name", default=None, type=str, required=True, help="The name of the task to train selected in the list: " + ", ".join(processors.keys())) parser.add_argument("--output_dir", default=None, type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.") ## Other parameters parser.add_argument("--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument("--tokenizer_name", default="", type=str, help="Pretrained tokenizer name or path if not the same as model_name") parser.add_argument("--cache_dir", default="", type=str, help="Where do you want to store the pre-trained models downloaded from s3") parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after tokenization. Sequences longer " "than this will be truncated, sequences shorter will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--evaluate_during_training", action='store_true', help="Rul evaluation during training at each logging step.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.") parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight deay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--max_steps", default=-1, type=int, help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument('--logging_steps', type=int, default=50, help="Log every X updates steps.") parser.add_argument('--save_steps', type=int, default=50, help="Save checkpoint every X updates steps.") parser.add_argument("--eval_all_checkpoints", action='store_true', help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA when available") parser.add_argument('--overwrite_output_dir', action='store_true', help="Overwrite the content of the output directory") parser.add_argument('--overwrite_cache', action='store_true', help="Overwrite the cached training and evaluation sets") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--tpu', action='store_true', help="Whether to run on the TPU defined in the environment variables") parser.add_argument('--tpu_ip_address', type=str, default='', help="TPU IP address if none are set in the environment variables") parser.add_argument('--tpu_name', type=str, default='', help="TPU name if none are set in the environment variables") parser.add_argument('--xrt_tpu_config', type=str, default='', help="XRT TPU config if none are set in the environment variables") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") parser.add_argument('--fp16_opt_level', type=str, default='O1', help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank") parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") parser.add_argument('--first_n_examples', type=int, default=10000) parser.add_argument('--add_cnn', type=int, default=0) parser.add_argument('--cnn_filter_width', type=int, default=1) parser.add_argument('--diagonal_mask', type=int, default=0) parser.add_argument('--context_width', type=int, default=5) args = parser.parse_args() if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: raise ValueError("Output directory ({}) already exists and is not empty. 
Use --overwrite_output_dir to overcome.".format(args.output_dir)) # Setup distant debugging if needed if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl') args.n_gpu = 1 args.device = device if args.tpu: if args.tpu_ip_address: os.environ["TPU_IP_ADDRESS"] = args.tpu_ip_address if args.tpu_name: os.environ["TPU_NAME"] = args.tpu_name if args.xrt_tpu_config: os.environ["XRT_TPU_CONFIG"] = args.xrt_tpu_config assert "TPU_IP_ADDRESS" in os.environ assert "TPU_NAME" in os.environ assert "XRT_TPU_CONFIG" in os.environ import torch_xla import torch_xla.core.xla_model as xm args.device = xm.xla_device() args.xla_model = xm # Setup logging logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt = '%m/%d/%Y %H:%M:%S', level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) # Set seed set_seed(args) # Prepare GLUE task args.task_name = args.task_name.lower() if args.task_name not in processors: raise ValueError("Task not found: %s" % (args.task_name)) processor = processors[args.task_name]() args.output_mode = output_modes[args.task_name] label_list = processor.get_labels() num_labels = len(label_list) # Load pretrained model and tokenizer if args.local_rank not in [-1, 0]: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab args.model_type = args.model_type.lower() config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name, cache_dir=args.cache_dir if args.cache_dir else None) config.add_cnn = bool(args.add_cnn) config.cnn_filter_width = args.cnn_filter_width config.max_seq_length = args.max_seq_length tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir if args.cache_dir else None) #model = model_class.from_pretrained(args.model_name_or_path, # from_tf=bool('.ckpt' in args.model_name_or_path), # config=config, # cache_dir=args.cache_dir if args.cache_dir else None) model = BertForSequenceClassification(config) if args.local_rank == 0: torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab model.to(args.device) logger.info("Training/evaluation parameters %s", args) # Training if args.do_train: train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False) global_step, tr_loss = train(args, train_dataset, model, tokenizer) logger.info(" global_step 
= %s, average loss = %s", global_step, tr_loss) # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0) and not args.tpu: # Create output directory if needed if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: os.makedirs(args.output_dir) logger.info("Saving model checkpoint to %s", args.output_dir) # Save a trained model, configuration and tokenizer using `save_pretrained()`. # They can then be reloaded using `from_pretrained()` model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) # Good practice: save your training arguments together with the trained model torch.save(args, os.path.join(args.output_dir, 'training_args.bin')) # Load a trained model and vocabulary that you have fine-tuned model = model_class.from_pretrained(args.output_dir) tokenizer = tokenizer_class.from_pretrained(args.output_dir) model.to(args.device) # Evaluation results = {} if args.do_eval and args.local_rank in [-1, 0]: tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case) checkpoints = [args.output_dir] if args.eval_all_checkpoints: checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging logger.info("Evaluate the following checkpoints: %s", checkpoints) for checkpoint in checkpoints: global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else "" prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else "" model = model_class.from_pretrained(checkpoint) model.to(args.device) result = evaluate(args, model, tokenizer, prefix=prefix) result = dict((k + '_{}'.format(global_step), v) for k, v in result.items()) results.update(result) return results
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default='/hdd/lujunyu/dataset/multi_turn_corpus/ubuntu/', type=str, required=False,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--task_name", default='ubuntu', type=str, required=False,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default='/hdd/lujunyu/model/chatbert/ubuntu_without_pretraining/', type=str, required=False,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--init_model_name", default='bert-base-uncased', type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--do_lower_case", default=True, action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--data_augmentation", default=False, action='store_true',
                        help="Whether to use augmentation")
    parser.add_argument("--max_seq_length", default=256, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", default=True, action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_test", default=True, action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--train_batch_size", default=500, type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=500, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=3e-3, type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=10.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_steps", default=0.0, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--weight_decay", default=1e-3, type=float,
                        help="weight_decay")
    parser.add_argument("--save_checkpoints_steps", default=8000, type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=20,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    bert_config = BertConfig.from_pretrained(args.init_model_name, num_labels=2)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length {} because the BERT model was only trained up to sequence length {}".format(
                args.max_seq_length, bert_config.max_position_embeddings))

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        if args.do_train:
            raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    else:
        os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.init_model_name, do_lower_case=args.do_lower_case)

    if args.data_augmentation:
        train_dataset = UbuntuDatasetForSP(
            file_path=os.path.join(args.data_dir, "train_augment_3.txt"),
            max_seq_length=args.max_seq_length,
            tokenizer=tokenizer
        )
    else:
        train_dataset = UbuntuDatasetForSP(
            file_path=os.path.join(args.data_dir, "train.txt"),
            max_seq_length=args.max_seq_length,
            tokenizer=tokenizer
        )
    eval_dataset = UbuntuDatasetForSP(
        file_path=os.path.join(args.data_dir, "valid.txt"),
        max_seq_length=args.max_seq_length,
        tokenizer=tokenizer
    )

    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=args.train_batch_size,
                                                   sampler=RandomSampler(train_dataset), num_workers=4)
    eval_dataloader = torch.utils.data.DataLoader(eval_dataset, batch_size=args.eval_batch_size,
                                                  sampler=SequentialSampler(eval_dataset), num_workers=4)

    model = BertForSequenceClassification(config=bert_config)
    model.to(device)

    num_train_steps = None
    if args.do_train:
        num_train_steps = int(
            len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs)

        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        # Remove the pooler, which is not used and would otherwise produce a None grad that breaks apex
        param_optimizer = [n for n in param_optimizer]
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_steps)
    else:
        optimizer = None
        scheduler = None

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    best_metric = 0.0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()  # We have accumulated enough gradients
                    scheduler.step()
                    model.zero_grad()
                    global_step += 1

                if step % args.save_checkpoints_steps == 0:
                    model.eval()
                    f = open(os.path.join(args.output_dir, 'logits_dev.txt'), 'w')
                    eval_loss = 0
                    nb_eval_steps, nb_eval_examples = 0, 0
                    logits_all = []
                    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                        input_ids = input_ids.to(device)
                        input_mask = input_mask.to(device)
                        segment_ids = segment_ids.to(device)
                        label_ids = label_ids.to(device)

                        with torch.no_grad():
                            tmp_eval_loss, logits = model(input_ids, token_type_ids=segment_ids,
                                                          attention_mask=input_mask, labels=label_ids)

                        logits = logits.detach().cpu().numpy()
                        logits_all.append(logits)
                        label_ids = label_ids.cpu().numpy()

                        for logit, label in zip(logits, label_ids):
                            logit = '{},{}'.format(logit[0], logit[1])
                            f.write('_\t{}\t{}\n'.format(logit, label))

                        eval_loss += tmp_eval_loss.mean().item()
                        nb_eval_examples += input_ids.size(0)
                        nb_eval_steps += 1

                    f.close()
                    logits_all = np.concatenate(logits_all, axis=0)
                    eval_loss = eval_loss / nb_eval_steps

                    result = evaluate(os.path.join(args.output_dir, 'logits_dev.txt'))
                    result.update({'eval_loss': eval_loss})

                    output_eval_file = os.path.join(args.output_dir, "eval_results_dev.txt")
                    with open(output_eval_file, "a") as writer:
                        logger.info("***** Eval results *****")
                        for key in sorted(result.keys()):
                            logger.info("  %s = %s", key, str(result[key]))
                            writer.write("%s = %s\n" % (key, str(result[key])))

                    ### Save the best checkpoint
                    if best_metric < result['R10@1'] + result['R10@2']:
                        try:
                            ### Remove 'module' prefix when using DataParallel
                            state_dict = model.module.state_dict()
                        except AttributeError:
                            state_dict = model.state_dict()
                        torch.save(state_dict, os.path.join(args.output_dir, "model.pt"))
                        best_metric = result['R10@1'] + result['R10@2']
                        logger.info('Saving the best model in {}'.format(os.path.join(args.output_dir, "model.pt")))

                        ### visualize bad cases of the best model
                        # logger.info('Saving Bad cases...')
                        # visualize_bad_cases(
                        #     logits=logits_all,
                        #     input_file_path=os.path.join(args.data_dir, 'valid.txt'),
                        #     output_file_path=os.path.join(args.output_dir, 'valid_bad_cases.txt')
                        # )

                    model.train()
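The loop above hands the written `logits_dev.txt` file to an `evaluate()` helper and expects a dict containing at least `R10@1` and `R10@2`; that helper is not part of this excerpt. A rough sketch, assuming each block of 10 consecutive rows holds the candidate responses for one dialogue context and a label of 1 marks the correct response, might look like this:

def evaluate(logits_file, n_candidates=10):
    # Each row has the form "_\t<logit_0>,<logit_1>\t<label>", as written by the training loop above.
    scores, labels = [], []
    with open(logits_file) as f:
        for line in f:
            _, logit_str, label = line.rstrip('\n').split('\t')
            scores.append(float(logit_str.split(',')[1]))  # positive-class logit
            labels.append(int(float(label)))

    hits_at_1, hits_at_2, n_contexts = 0, 0, 0
    for start in range(0, len(scores), n_candidates):
        cand_scores = scores[start:start + n_candidates]
        cand_labels = labels[start:start + n_candidates]
        if len(cand_scores) < n_candidates:
            break  # ignore a trailing partial group
        ranked = sorted(range(n_candidates), key=lambda i: cand_scores[i], reverse=True)
        if cand_labels[ranked[0]] == 1:
            hits_at_1 += 1
        if any(cand_labels[i] == 1 for i in ranked[:2]):
            hits_at_2 += 1
        n_contexts += 1

    return {'R10@1': hits_at_1 / max(n_contexts, 1),
            'R10@2': hits_at_2 / max(n_contexts, 1)}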
import emoji
from soynlp.normalizer import repeat_normalize

finetune_ckpt = './your_local_path/BaekBERT.ckpt'
test_path = '../data/testset/inferset.csv'

device = 'cuda' if torch.cuda.is_available() else 'cpu'

args = Arg()
ckp = torch.load(finetune_ckpt, map_location=torch.device('cpu'))

pretrained_model_config = BertConfig.from_pretrained(
    args.pretrained_model,
    num_labels=ckp['state_dict']['bert.classifier.bias'].shape.numel(),
)
model = BertForSequenceClassification(pretrained_model_config)
model.load_state_dict({k[5:]: v for k, v in ckp['state_dict'].items()})
model.to(device)
model.eval()


def read_data(path):
    if path.endswith('xlsx'):
        return pd.read_excel(path)
    elif path.endswith('csv'):
        return pd.read_csv(path)
    elif path.endswith('tsv') or path.endswith('txt'):
        return pd.read_csv(path, sep='\t')
    else:
        raise NotImplementedError('Only Excel(xlsx)/Csv/Tsv(txt) are Supported')
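The `emoji` and `repeat_normalize` imports above are normally paired with a text normalizer applied to each row before tokenization, but that function is not shown in this excerpt. A minimal, hypothetical `clean()` under those assumptions (the original's regexes and emoji handling may differ) might be:

import re

url_pattern = re.compile(r'https?://\S+')


def clean(text):
    text = url_pattern.sub(' ', text)              # drop URLs
    text = emoji.demojize(text)                    # one possible choice: spell emojis out as :name: tokens
    text = repeat_normalize(text, num_repeats=2)   # collapse long character repeats, e.g. 'ㅋㅋㅋㅋㅋ' -> 'ㅋㅋ'
    return text.strip()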
class bert_classifier(object):
    def __init__(self):
        self.config = Config()
        self.device_setup()
        self.model_setup()

    def device_setup(self):
        """
        Configure the device and load the BERT model.
        :return:
        """
        # Use the GPU when available; the model is moved there via model.to(device)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        model_save_path = self.config.get("result", "model_save_path")
        config_save_path = self.config.get("result", "config_save_path")
        vocab_save_path = self.config.get("result", "vocab_save_path")

        self.model_config = BertConfig.from_json_file(config_save_path)
        self.model = BertForSequenceClassification(self.model_config)
        self.state_dict = torch.load(model_save_path)
        self.model.load_state_dict(self.state_dict)
        self.tokenizer = transformers.BertTokenizer(vocab_save_path)
        self.model.to(self.device)
        self.model.eval()

    def model_setup(self):
        weight_decay = self.config.get("training_rule", "weight_decay")
        learning_rate = self.config.get("training_rule", "learning_rate")

        # Define the optimizer and the loss function
        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ['bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'weight_decay': weight_decay},
            {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        self.optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)
        self.criterion = nn.CrossEntropyLoss()

    def predict(self, sentence):
        input_ids, token_type_ids = convert_text_to_ids(self.tokenizer, sentence)
        input_ids = seq_padding(self.tokenizer, [input_ids])
        token_type_ids = seq_padding(self.tokenizer, [token_type_ids])
        # The model expects LongTensor inputs
        input_ids, token_type_ids = input_ids.long(), token_type_ids.long()
        # Clear any accumulated gradients
        self.optimizer.zero_grad()
        # Move the tensors to the GPU
        input_ids, token_type_ids = input_ids.to(self.device), token_type_ids.to(self.device)
        output = self.model(input_ids=input_ids, token_type_ids=token_type_ids)
        y_pred_prob = output[0]
        y_pred_label = y_pred_prob.argmax(dim=1)
        print(y_pred_label)
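A hypothetical way to use the class above, assuming `Config`, `convert_text_to_ids`, and `seq_padding` are provided by the surrounding project:

if __name__ == '__main__':
    classifier = bert_classifier()
    # predict() currently prints the predicted label tensor rather than returning it.
    classifier.predict("sample input sentence")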