def load_classification_model():
    """Load the fine-tuned BERT classifier and its tokenizer into globals.

    Reads the vocab, config and weights from the ``mtn_models/`` directory
    and wraps the model in a ``Trainer``.

    Side effects: assigns the module-level globals ``trainer`` and
    ``tokenizer``. Returns nothing.
    """
    global trainer
    global tokenizer
    weights_path = 'mtn_models/pytorch_model.bin'
    vocab_path = 'mtn_models/vocab.txt'
    config_path = 'mtn_models/config.json'
    tokenizer = BertTokenizer.from_pretrained(
        vocab_path,
        do_lower_case=False,
        do_basic_tokenize=True,
        never_split=never_split_tokens,
        truncation=True)
    # num_labels=6: the classification head distinguishes six classes.
    config = PretrainedConfig.from_pretrained(config_path, num_labels=6)
    model = BertForSequenceClassification.from_pretrained(weights_path,
                                                          config=config)
    # FIX: pass the hyperparameters through the constructor so that
    # TrainingArguments.__post_init__ validation runs (the previous code
    # mutated the fields after construction, bypassing it, and carried a
    # dead commented-out line referencing an undefined ``train_features``).
    training_args = TrainingArguments(
        "./train",
        do_train=True,
        evaluate_during_training=True,
        adam_epsilon=1e-8,
        learning_rate=2e-5,
        warmup_steps=0,
        per_gpu_train_batch_size=16,
        per_gpu_eval_batch_size=16,
        num_train_epochs=3,
        seed=42)
    # Checkpoint at the same cadence as logging (logging_steps keeps the
    # library default).
    training_args.save_steps = training_args.logging_steps
    trainer = Trainer(model=model, args=training_args)
def init_training_args(self, model_path: str) -> TrainingArguments:
    r"""Build the training arguments for fine-tuning.

    :param model_path: output directory where checkpoints are written.
    :return: a configured ``TrainingArguments`` instance.
    """
    # FIX: pass every hyperparameter through the constructor so that
    # TrainingArguments.__post_init__ validation/derivation runs
    # (mutating the fields after construction bypasses it).
    return TrainingArguments(
        output_dir=model_path,
        logging_steps=5000,
        save_steps=5000,
        learning_rate=2e-5,
        num_train_epochs=3,
        per_device_train_batch_size=32,
        fp16=self.fp16,
        fp16_opt_level="O1",
    )
def fit(self, train_df, dev_df):
    """
    fitting the model based on the train set. validation is done using
    the dev set

    Parameters
    ----------
    :param train_df: dataframe
        a pandas dataframe containing data to be trained on
    :param dev_df: dataframe
        a pandas dataframe containing data to validate on
    :return: None
        all relevant results are saved under the location provided to
        save the model in. Next a prediction can be done

    :raises IOError: if train and dev contain different numbers of labels.
    """
    train_labels = Counter(train_df[self.label_col_name]).keys()
    num_labels = len(train_labels)
    # BUG FIX: count the labels of the *dev* frame (the original counted
    # train_df twice, so the consistency check below could never fire).
    dev_labels = Counter(dev_df[self.label_col_name]).keys()
    if num_labels != len(dev_labels):
        raise IOError("train and dev datasets contain different number of labels")
    # creating a DF for train/test with relevant columns.
    # Not clear why the 'alpha' column is needed, but as written here
    # (https://towardsdatascience.com/https-medium-com-chaturangarajapakshe-text-classification-with-transformer-models-d370944b50ca) - it is required
    train_df = pd.DataFrame({
        'id': range(len(train_df)),
        'label': train_df[self.label_col_name],
        'alpha': ['a'] * train_df.shape[0],
        'text': train_df["text"].replace(r'\n', ' ', regex=True)
    })
    dev_df = pd.DataFrame({
        'id': range(len(dev_df)),
        'label': dev_df[self.label_col_name],
        'alpha': ['a'] * dev_df.shape[0],
        'text': dev_df["text"].replace(r'\n', ' ', regex=True)
    })
    # saving the DF to the new/old folder
    train_df.to_csv(os.path.join(self.saving_data_folder, "train.tsv"),
                    index=False, columns=train_df.columns, sep='\t',
                    header=False)
    dev_df.to_csv(os.path.join(self.saving_data_folder, "dev.tsv"),
                  index=False, columns=dev_df.columns, sep='\t',
                  header=False)
    config = AutoConfig.from_pretrained(
        self.model_name, num_labels=num_labels,
        output_attentions=True)  # needed for the visualizations
    # loading the actual model to memory
    model = BertForSequenceClassification.from_pretrained(self.model_name,
                                                          config=config)
    # Convert the examples in the dataset to features the model can
    # understand — a ready-made class provided by HuggingFace.
    train_dataset = SingleSentenceClassificationProcessor(
        mode='classification')
    dev_dataset = SingleSentenceClassificationProcessor(
        mode='classification')
    # now adding examples (from the DFs created above) to the processors.
    # NOTE(review): the rebuilt frames store labels under 'label'; this
    # lookup assumes self.label_col_name == 'label' — confirm upstream.
    _ = train_dataset.add_examples(
        texts_or_text_and_labels=train_df['text'],
        labels=train_df[self.label_col_name],
        overwrite_examples=True)
    _ = dev_dataset.add_examples(
        texts_or_text_and_labels=dev_df['text'],
        labels=dev_df[self.label_col_name],
        overwrite_examples=True)
    train_features = train_dataset.get_features(tokenizer=self.tokenizer,
                                                max_length=self.max_length)
    dev_features = dev_dataset.get_features(tokenizer=self.tokenizer,
                                            max_length=self.max_length)
    training_args = TrainingArguments("./train")
    training_args.do_train = True
    # setting the params of the BERT classifier
    for cur_param, cur_value in self.bert_model_params.items():
        # SECURITY: eval() on configuration strings executes arbitrary
        # code — only acceptable if the config source is fully trusted.
        # NOTE(review): __dict__ assignment bypasses any property/setter
        # machinery on TrainingArguments — kept as in the original.
        try:
            training_args.__dict__[cur_param] = eval(cur_value)
        except TypeError:
            training_args.__dict__[cur_param] = cur_value
    # log once per epoch: ceil(#examples / per-GPU batch size)
    training_args.logging_steps = \
        (len(train_features) - 1) // training_args.per_gpu_train_batch_size + 1
    training_args.save_steps = training_args.logging_steps
    training_args.output_dir = self.saving_model_folder
    training_args.eval_steps = 100
    trainer = Trainer(model=model, args=training_args,
                      train_dataset=train_features,
                      eval_dataset=dev_features,
                      compute_metrics=self.compute_metrics)
    trainer.train()
    # saving the model
    self.save_model(model=trainer.model, folder_name='bert_based_model')
def fit(self, train_df, dev_df):
    """
    fitting the model based on the train set. validation is done using
    the dev set

    Parameters
    ----------
    :param train_df: dataframe
        a pandas dataframe containing data to be trained on
    :param dev_df: dataframe
        a pandas dataframe containing data to validate on
    :return: None
        all relevant results are saved under the location provided to
        save the model in. Next a prediction can be done

    :raises IOError: if train and dev contain different numbers of labels.
    """
    train_labels = Counter(train_df[self.label_col_name]).keys()
    num_labels = len(train_labels)
    dev_labels = Counter(dev_df[self.label_col_name]).keys()
    if num_labels != len(dev_labels):
        raise IOError(
            "train and dev datasets contain different number of labels")
    # creating a DF for train/test with relevant columns.
    # Not clear why the 'alpha' column is needed, but as written here
    # (https://towardsdatascience.com/https-medium-com-chaturangarajapakshe-text-classification-with-transformer-models-d370944b50ca) - it is required
    train_df = pd.DataFrame({
        'id': range(len(train_df)),
        'label': train_df[self.label_col_name],
        'alpha': ['a'] * train_df.shape[0],
        'text': train_df["text"].replace(r'\n', ' ', regex=True)
    })
    dev_df = pd.DataFrame({
        'id': range(len(dev_df)),
        'label': dev_df[self.label_col_name],
        'alpha': ['a'] * dev_df.shape[0],
        'text': dev_df["text"].replace(r'\n', ' ', regex=True)
    })
    # saving the DF to the new/old folder
    train_df.to_csv(os.path.join(self.saving_data_folder, "train.tsv"),
                    index=False, columns=train_df.columns, sep='\t',
                    header=False)
    dev_df.to_csv(os.path.join(self.saving_data_folder, "dev.tsv"),
                  index=False, columns=dev_df.columns, sep='\t',
                  header=False)
    config = AutoConfig.from_pretrained(
        self.model_name, num_labels=num_labels,
        output_attentions=True)  # needed for the visualizations
    # loading the actual model to memory
    model = BertForSequenceClassification.from_pretrained(self.model_name,
                                                          config=config)
    # Convert the examples in the dataset to features the model can
    # understand — a ready-made class provided by HuggingFace.
    train_dataset = SingleSentenceClassificationProcessor(
        mode='classification')
    dev_dataset = SingleSentenceClassificationProcessor(
        mode='classification')
    # now adding examples (from the DFs created above) to the processors.
    # NOTE(review): the rebuilt frames store labels under 'label'; this
    # lookup assumes self.label_col_name == 'label' — confirm upstream.
    _ = train_dataset.add_examples(
        texts_or_text_and_labels=train_df['text'],
        labels=train_df[self.label_col_name],
        overwrite_examples=True)
    _ = dev_dataset.add_examples(
        texts_or_text_and_labels=dev_df['text'],
        labels=dev_df[self.label_col_name],
        overwrite_examples=True)
    train_features = train_dataset.get_features(tokenizer=self.tokenizer,
                                                max_length=self.max_length)
    dev_features = dev_dataset.get_features(tokenizer=self.tokenizer,
                                            max_length=self.max_length)

    # idea about a self-trainer is taken from here -
    # https://huggingface.co/transformers/main_classes/trainer.html
    class MyTrainer(Trainer):
        """Trainer with a pluggable loss function."""

        def __init__(self, loss_func=torch.nn.CrossEntropyLoss(), **kwargs):
            self.loss_func = loss_func
            super().__init__(**kwargs)

        def compute_loss(self, model, inputs, return_outputs=False):
            # FIX: accept ``return_outputs`` (newer Trainer versions pass
            # it); defaulted, so older call sites keep working.
            labels = inputs.pop("labels")
            outputs = model(**inputs)
            logits = outputs[0]
            loss = self.loss_func(logits, labels)
            return (loss, outputs) if return_outputs else loss

    class FocalLoss(nn.modules.loss._WeightedLoss):
        """Focal loss; ``weight`` acts as the alpha class-balance term."""

        def __init__(self, weight=None, gamma=2, reduction='mean'):
            super(FocalLoss, self).__init__(weight, reduction=reduction)
            self.gamma = gamma
            # weight parameter will act as the alpha parameter to balance
            # class weights
            self.weight = weight

        def forward(self, input, target):
            # BUG FIX: the focal modulation (1-pt)^gamma must be applied
            # per example; the previous code reduced (mean) the CE first,
            # so the factor was applied to one batch-level scalar instead.
            ce_loss = F.cross_entropy(input, target, reduction='none',
                                      weight=self.weight)
            pt = torch.exp(-ce_loss)
            focal_loss = ((1 - pt) ** self.gamma * ce_loss).mean()
            return focal_loss

    # balanced per-class weights counteract label imbalance in the loss
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(list(train_labels)),
        y=train_df['label'])
    my_loss_func = FocalLoss(
        weight=torch.tensor(class_weights, dtype=torch.float))
    # how to define a trainer and all its arguments is taken from here -
    # https://github.com/huggingface/notebooks/blob/master/examples/text_classification.ipynb
    args = TrainingArguments(
        "arabic_nlp_model",
        evaluation_strategy="epoch",
        learning_rate=1e-4,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=8,
        num_train_epochs=5,
        weight_decay=0.01,
        load_best_model_at_end=True,
    )
    # setting the params of the BERT classifier
    for cur_param, cur_value in self.bert_model_params.items():
        # SECURITY: eval() on configuration strings executes arbitrary
        # code — only acceptable if the config source is fully trusted.
        # NOTE(review): __dict__ assignment bypasses any property/setter
        # machinery on TrainingArguments — kept as in the original.
        try:
            args.__dict__[cur_param] = eval(cur_value)
        except TypeError:
            args.__dict__[cur_param] = cur_value
    # log/save once per epoch: ceil(#examples / per-device batch size)
    args.logging_steps = \
        (len(train_features) - 1) // args.per_device_train_batch_size + 1
    args.save_steps = args.logging_steps
    args.output_dir = self.saving_model_folder
    trainer = MyTrainer(loss_func=my_loss_func,
                        model=model,
                        args=args,
                        train_dataset=train_features,
                        eval_dataset=dev_features,
                        compute_metrics=self.compute_metrics)
    trainer.train()
    # saving the model
    self.save_model(model=trainer.model)
per_device_eval_batch_size=8, num_train_epochs=5, weight_decay=0.01, load_best_model_at_end=True, metric_for_best_model="macro_f1_PN", ) # setting the params of the BERT classifier bert_model_params = config_dict['bert_model_params'] bert_model_params['seed'] = config_dict['random_seed'] for cur_param in bert_model_params.keys(): try: args.__dict__[cur_param] = eval(bert_model_params[cur_param]) except TypeError: args.__dict__[cur_param] = bert_model_params[cur_param] args.save_steps = args.logging_steps trainer = MyTrainer( model=loaded_model, args=args, train_dataset=dev_features, eval_dataset=dev_features, compute_metrics=BertBasedSentimentAnalyser.compute_metrics) # splitting the prediction set to bulks of 1000 so we will not meet memory errors) pred_bulks = 1001 all_predictions = list() for cur_idx in range(int(len(dev_features) / pred_bulks) + 1): cur_bulk = dev_features[pred_bulks * cur_idx:pred_bulks * (cur_idx + 1)] trainer_predictions = trainer.predict(cur_bulk) all_predictions.append(trainer_predictions.predictions[0]) # creating the predictions (proba)