def load_model_and_evaluate(state_dict: str, data: m.Posts, label: str, positive_class_weight: float):
    BATCH_SIZE = 8
    MAX_LEN = 264
    #EPOCHS = 10

    # ## MLflow instantiation
    IS_DEVELOPMENT = False
    mlflow_logger = m.MLFlowLogger(
        uri=TRACKING_URI,
        experiment=EXPERIMENT_NAME,
        is_dev=IS_DEVELOPMENT,
    )

    # ## Data loading
    normalize = lambda x: cleaning.normalize(x, url_emoji_dummy=False, pure_words=False)

    X_train, y_train = data.get_X_y('train')
    X_train = X_train.apply(normalize)
    df_train = pd.concat([X_train, y_train], axis=1)
    df_train.columns = ['text', label]

    X_val, y_val = data.get_X_y('val', balance_method='translate')
    X_val = X_val.apply(normalize)
    df_val = pd.concat([X_val, y_val], axis=1)
    df_val.columns = ['text', label]

    X_test, y_test = data.get_X_y('test', balance_method=None)
    X_test = X_test.apply(normalize)
    df_test = pd.concat([X_test, y_test], axis=1)
    df_test.columns = ['text', label]

    tokenizer = BertTokenizer.from_pretrained("deepset/gbert-base")

    train_data_loader = create_data_loader(df_train, label, tokenizer, MAX_LEN, BATCH_SIZE)
    val_data_loader = create_data_loader(df_val, label, tokenizer, MAX_LEN, BATCH_SIZE)
    test_data_loader = create_data_loader(df_test, label, tokenizer, MAX_LEN, BATCH_SIZE)

    # ## Instantiation
    model = BinaryClassifier().to(device)

    #optimizer = AdamW(model.parameters(), lr=learning_rate, correct_bias=False)
    mlflow_logger.add_param('optimizer', 'AdamW')

    # https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html#torch.nn.BCEWithLogitsLoss
    # From the BCEWithLogitsLoss() documentation:
    # > For example, if a dataset contains 100 positive and 300 negative examples of a single class,
    # > then pos_weight for the class should be equal to 300/100=3.
    # > The loss would act as if the dataset contains 3*100=300 positive examples.
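    # Added sketch (not part of the original pipeline): log the class balance of the
    # training split and the pos_weight it implies per the documentation quoted above
    # (negatives / positives), so it can be compared with the value passed in.
    # With e.g. 300 negative and 100 positive rows the implied pos_weight is 300 / 100 = 3.0.
    n_pos = int((df_train[label] == 1).sum())
    n_neg = int((df_train[label] == 0).sum())
    implied_pos_weight = n_neg / max(n_pos, 1)
    logger.info(
        f'Train split for {label}: {n_pos} positive / {n_neg} negative '
        f'(implied pos_weight {implied_pos_weight:.2f}, using {positive_class_weight})'
    )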
    loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.FloatTensor([positive_class_weight])).to(device)

    # Loading the model
    model.load_state_dict(torch.load(f"./models/{state_dict}.bin"))
    model.eval()

    #history = defaultdict(list)

    train_fbeta, train_metrics, train_params, train_loss = eval_model(
        model,
        train_data_loader,
        loss_fn,
        device,
        'train',
    )
    logger.info(f'Train loss {train_loss} Fbeta {train_fbeta}')

    val_fbeta, val_metrics, val_params, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        'val',
    )
    logger.info(f'Val loss {val_loss} Fbeta {val_fbeta}')

    test_fbeta, test_metrics, test_params, test_loss = eval_model(
        model,
        test_data_loader,
        loss_fn,
        device,
        'test',
    )
    logger.info(f'Test loss {test_loss} Fbeta {test_fbeta}')
    print()

    #history['train_fbeta'].append(train_fbeta)
    #history['train_loss'].append(train_loss)
    #history['val_fbeta'].append(val_fbeta)
    #history['val_loss'].append(val_loss)

    for k, v in train_metrics.items():
        mlflow_logger.add_metric(k, v)
    for k, v in train_params.items():
        mlflow_logger.add_param(k, v)
    for k, v in val_metrics.items():
        mlflow_logger.add_metric(k, v)
    for k, v in val_params.items():
        mlflow_logger.add_param(k, v)
    for k, v in test_metrics.items():
        mlflow_logger.add_metric(k, v)
    for k, v in test_params.items():
        mlflow_logger.add_param(k, v)

    #############################################
    # MLflow logging
    #constant_params = {
    #    'epochs': EPOCHS,
    #    'batch_size': BATCH_SIZE,
    #    'max_len': MAX_LEN,
    #    }
    mlflow_logger.add_tag("cycle4", True)
    mlflow_logger.add_param("normalization", 'norm')
    mlflow_logger.add_param("vectorizer", 'deepset/gbert-base')
    mlflow_logger.add_param("model", "deepset/gbert-base")
    # I'm re-using the grid_search_params field in order to not open too many mlflow columns
    #mlflow_logger.add_param("grid_search_params", str(constant_params)[:249])
    #mlflow_logger.add_param("lr", learning_rate)
    mlflow_logger.add_param('pos_weight', positive_class_weight)
    mlflow_logger.add_param("saved_model", state_dict)
    mlflow_logger.add_param("label", data.current_label)
    mlflow_logger.add_param("balance_method", data.balance_method)
    if data.balance_method:
        mlflow_logger.add_param("sampling_strategy", data.sampling_strategy)
    mlflow_logger.add_model(None)

    with mlflow.start_run(run_name='deepset/gbert-base') as run:
        mlflow_logger.log()
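# Example invocation of load_model_and_evaluate (a hedged sketch: the `m.Posts`
# construction, the label, and the checkpoint name are illustrative assumptions,
# not taken from this script):
#
#   posts = m.Posts()
#   posts.set_label(label='label_sentimentnegative')
#   load_model_and_evaluate(
#       state_dict='model_gbertbase_label_sentimentnegative_230101_120000',
#       data=posts,
#       label='label_sentimentnegative',
#       positive_class_weight=3.0,
#   )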
TARGET_LABELS = [
    'label_argumentsused',
    'label_discriminating',
    'label_inappropriate',
    'label_offtopic',
    'label_personalstories',
    'label_possiblyfeedback',
    'label_sentimentnegative',
    'label_sentimentpositive',
]

IS_DEVELOPMENT = False
mlflow_logger = m.MLFlowLogger(
    uri=TRACKING_URI,
    experiment=EXPERIMENT_NAME,
    is_dev=IS_DEVELOPMENT,
    params=mlflow_params,
    tags=mlflow_tags,
)
training = m.Modeling(data, gs, mlflow_logger)

# Train and evaluate one model per balance method, sampling strategy, and target label
# (see the sketch of the assumed `trans_os` shape below this loop).
for method, strat in trans_os.items():
    for strategy in strat:
        print(method, strategy)
        for label in TARGET_LABELS:
            logger.info("-" * 20)
            logger.info(f"Target: {label}")
            data.set_label(label=label)
            data.set_balance_method(balance_method=method, sampling_strategy=strategy)
            training.train()
            training.evaluate(["train", "val"])
#if True:
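# Assumed shape of `trans_os` (defined outside this section): the loop above treats it
# as a mapping from balance method to the sampling strategies to try with it. The
# concrete keys and values below are hypothetical illustrations, not taken from the source:
#
#   trans_os = {
#       'translate': [None],
#       'oversample': [0.5, 1.0],
#   }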
def make_model(data: m.Posts, label: str, learning_rate: float, positive_class_weight: float):
    BATCH_SIZE = 8
    MAX_LEN = 264
    EPOCHS = 10

    # ## MLflow instantiation
    IS_DEVELOPMENT = False
    mlflow_logger = m.MLFlowLogger(
        uri=TRACKING_URI,
        experiment=EXPERIMENT_NAME,
        is_dev=IS_DEVELOPMENT,
    )

    # ## Data loading
    normalize = lambda x: cleaning.normalize(x, url_emoji_dummy=False, pure_words=False)

    X_train, y_train = data.get_X_y('train')
    X_train = X_train.apply(normalize)
    df_train = pd.concat([X_train, y_train], axis=1)
    df_train.columns = ['text', label]

    X_val, y_val = data.get_X_y('val', balance_method='translate')
    X_val = X_val.apply(normalize)
    df_val = pd.concat([X_val, y_val], axis=1)
    df_val.columns = ['text', label]

    #X_test, y_test = data.get_X_y('test')
    #X_test = X_test.apply(normalize)
    #df_test = pd.concat([X_test, y_test], axis=1)
    #df_test.columns = ['text', label]

    tokenizer = BertTokenizer.from_pretrained("deepset/gbert-base")

    train_data_loader = create_data_loader(df_train, label, tokenizer, MAX_LEN, BATCH_SIZE)
    val_data_loader = create_data_loader(df_val, label, tokenizer, MAX_LEN, BATCH_SIZE)
    #test_data_loader = create_data_loader(df_test, label, tokenizer, MAX_LEN, BATCH_SIZE)

    # ## Instantiation
    model = BinaryClassifier().to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate, correct_bias=False)
    mlflow_logger.add_param('optimizer', 'AdamW')

    total_steps = len(train_data_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps,
    )

    # https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html#torch.nn.BCEWithLogitsLoss
    # From the BCEWithLogitsLoss() documentation:
    # > For example, if a dataset contains 100 positive and 300 negative examples of a single class,
    # > then pos_weight for the class should be equal to 300/100=3.
    # > The loss would act as if the dataset contains 3*100=300 positive examples.
    loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.FloatTensor([positive_class_weight])).to(device)

    history = defaultdict(list)
    best_fbeta = 0
    t = datetime.now().strftime("%y%m%d_%H%M%S")

    for epoch in range(EPOCHS):
        logger.info(f'Epoch {epoch + 1}/{EPOCHS}')
        logger.info('-' * 10)

        train_fbeta, train_metrics, train_params, train_loss = train_epoch(
            model,
            train_data_loader,
            loss_fn,
            optimizer,
            device,
            scheduler,
            'train',
        )
        logger.info(f'Train loss {train_loss} Fbeta {train_fbeta}')

        val_fbeta, val_metrics, val_params, val_loss = eval_model(
            model,
            val_data_loader,
            loss_fn,
            device,
            'val',
        )
        logger.info(f'Val loss {val_loss} Fbeta {val_fbeta}')
        print()

        history['train_fbeta'].append(train_fbeta)
        history['train_loss'].append(train_loss)
        history['val_fbeta'].append(val_fbeta)
        history['val_loss'].append(val_loss)

        # Keep a checkpoint and the metrics of the best epoch (by validation Fbeta)
        if val_fbeta > best_fbeta:
            file_name = f"./models/model_gbertbase_{label}_{t}.bin"
            torch.save(model.state_dict(), file_name)
            for k, v in val_metrics.items():
                mlflow_logger.add_metric(k, v)
            for k, v in val_params.items():
                mlflow_logger.add_param(k, v)
            for k, v in train_metrics.items():
                mlflow_logger.add_metric(k, v)
            for k, v in train_params.items():
                mlflow_logger.add_param(k, v)
            best_fbeta = val_fbeta

    #############################################
    # MLflow logging
    constant_params = {
        'epochs': EPOCHS,
        'batch_size': BATCH_SIZE,
        'max_len': MAX_LEN,
    }
    mlflow_logger.add_tag("cycle4", True)
    mlflow_logger.add_param("normalization", 'norm')
    mlflow_logger.add_param("vectorizer", 'deepset/gbert-base')
    mlflow_logger.add_param("model", "deepset/gbert-base")
    # I'm re-using the grid_search_params field in order to not open too many mlflow columns
    mlflow_logger.add_param("grid_search_params", str(constant_params)[:249])
    mlflow_logger.add_param("lr", learning_rate)
    mlflow_logger.add_param('pos_weight', positive_class_weight)
    mlflow_logger.add_param("saved_model", f"model_gbertbase_{label}_{t}")
    mlflow_logger.add_param("label", data.current_label)
    mlflow_logger.add_param("balance_method", data.balance_method)
    if data.balance_method:
        mlflow_logger.add_param("sampling_strategy", data.sampling_strategy)
    mlflow_logger.add_model(None)

    with mlflow.start_run(run_name='deepset/gbert-base') as run:
        mlflow_logger.log()
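# Example invocation of make_model (a hedged sketch; the `m.Posts` setup and the
# hyperparameter values are illustrative assumptions, not taken from this script):
#
#   posts = m.Posts()
#   posts.set_label(label='label_inappropriate')
#   make_model(
#       data=posts,
#       label='label_inappropriate',
#       learning_rate=2e-5,
#       positive_class_weight=3.0,
#   )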