def validate_model():
    data = prepare_data()
    # build graph
    with tf.Graph().as_default():
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            'resnet_v1_50', is_training=False)
        processed_image, score = load_data(data['val_image_names'],
                                           data['val_image_scores'], 1,
                                           image_preprocessing_fn, 128, False)
        score = tf.reshape(score, [-1, 1])
        logits, _ = predict_model(processed_image, is_training=False)

        variables_to_use = slim.get_variables_to_restore()
        variables_restorer = tf.train.Saver(variables_to_use)

        # Loss
        with tf.name_scope('loss'):
            # RMSE loss
            loss = tf.sqrt(tf.reduce_mean(tf.square(logits - score)))

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            variables_restorer.restore(sess, SAVE_MODEL_PATH)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord)

            sum_ls = 0.0
            steps = 0
            try:
                while not coord.should_stop():
                    ls = sess.run(loss)
                    sum_ls += ls
                    steps += 1
            except tf.errors.OutOfRangeError:
                print("Validating: mean loss %f" % (sum_ls / steps))
            finally:
                coord.request_stop()
                coord.join(threads)

    return sum_ls / steps
def fit_model():
    # Static parameters for data pipeline and fitting model
    BATCH_SIZE = 500

    # preparing model configuration
    # TODO: model configuration could be moved to a more accessible option
    LR = 1e-5
    EPOCHS = 2
    MODEL_NAME = 'EfficientNet_model'

    # loading, processing data and saving preprocessed data
    # TODO: this could be reworked into a dialog choice using Tkinter,
    # but I didn't find such a need in my case
    copy_file_name = 'training_data_copy.npy'
    file_name = 'training_data.npy'
    processed_file_name = 'training_data_processed.npy'

    training_data = list(np.load(file_name, allow_pickle=True))
    training_data = data_processing.prepare_data(training_data)
    np.save(processed_file_name, training_data)

    # preparing data for model
    training_data = list(np.load(processed_file_name, allow_pickle=True))

    # setting train and test data for training model
    X = np.array([i[0] for i in training_data]).reshape(-1, 128, 128, 3)
    Y = [i[1] for i in training_data]

    # fitting model
    model.fit(x=[X],
              y=[Y],
              epochs=EPOCHS,
              validation_split=0.1,
              verbose=1,
              batch_size=BATCH_SIZE,
              shuffle=True)

    # saving model
    model.save(MODEL_NAME)
    print("Done")
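# `model` is assumed to be defined elsewhere in this module; its original
# definition is not shown here. Below is a minimal sketch of what it could
# look like, assuming tf.keras with an EfficientNetB0 backbone on the
# (128, 128, 3) inputs prepared in fit_model(). The head size, loss and
# optimizer are illustrative assumptions; adjust them to the actual labels.
import tensorflow as tf


def build_model(n_outputs, lr=1e-5):
    # EfficientNetB0 backbone with global average pooling
    base = tf.keras.applications.EfficientNetB0(include_top=False,
                                                input_shape=(128, 128, 3),
                                                pooling='avg')
    # Hypothetical classification head; size depends on the label format in Y
    outputs = tf.keras.layers.Dense(n_outputs,
                                    activation='softmax')(base.output)
    model = tf.keras.Model(inputs=base.input, outputs=outputs)
    model.compile(optimizer=tf.keras.optimizers.Adam(lr),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model


# model = build_model(n_outputs=3)  # e.g. 3 classes, purely illustrative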
def train_model():
    data = prepare_data()
    # build graph
    with tf.Graph().as_default():
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            args.model_name, is_training=True)
        processed_image, score = load_data(data['train_image_names'],
                                           data['train_image_scores'],
                                           args.epoch_num,
                                           image_preprocessing_fn,
                                           args.batch_size, True)
        score = tf.reshape(score, [-1, 1])
        print(score.shape)
        logits, _ = predict_model(processed_image, is_training=True)
        print(logits.shape)

        variables_to_restore = slim.get_variables_to_restore(
            exclude=['resnet_v1_50/logits'])
        variables_restorer = tf.train.Saver(variables_to_restore)

        # Loss
        with tf.name_scope('ls'):
            # RMSE loss
            loss = tf.sqrt(tf.reduce_mean(tf.square(logits - score)))
        tf.summary.scalar('loss', loss)

        current_epoch = tf.Variable(0, trainable=False)
        decay_step = EPOCHS_PER_LR_DECAY * len(
            data['train_image_names']) // args.batch_size
        learning_rate = tf.train.exponential_decay(args.lr,
                                                   current_epoch,
                                                   decay_step,
                                                   LR_DECAY_FACTORY,
                                                   staircase=True)
        opt = tf.train.MomentumOptimizer(learning_rate, 0.9)
        # opt = tf.train.AdamOptimizer(learning_rate)
        optimizer = slim.learning.create_train_op(loss,
                                                  opt,
                                                  global_step=current_epoch)
        saver = tf.train.Saver()
        summary_op = tf.summary.merge_all()

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        with tf.Session(config=config) as sess:
            summary_writer = tf.summary.FileWriter(TRAIN_LOG_DIR, sess.graph)
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            variables_restorer.restore(sess, RES_v1_50_MODEL_PATH)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord)

            sum_ls = 0.0
            batch_num = len(data['train_image_scores']) // args.batch_size
            val_step = 0
            best_val_ls = 100.0
            try:
                while not coord.should_stop():
                    _, ls, step, summary = sess.run(
                        [optimizer, loss, current_epoch, summary_op])
                    sum_ls += ls
                    if step % 50 == 0:
                        print("Epoch %d, loss %f" %
                              (step / batch_num + 1, ls))
                        summary_writer.add_summary(summary, step)
                    if step % batch_num == 0 and step != 0:
                        print("Epoch %d, mean loss %f" %
                              (step / batch_num + 1, sum_ls / batch_num))
                        sum_ls = 0.0
                        saver.save(sess, SAVE_MODEL_PATH)
                        val_ls = validate_model()
                        if val_ls < best_val_ls:
                            best_val_ls = val_ls
                            saver.save(sess, BEST_MODEL_PATH)
                        print('best val loss %f' % best_val_ls)
            except tf.errors.OutOfRangeError:
                saver.save(sess, SAVE_MODEL_PATH)
            finally:
                coord.request_stop()
                coord.join(threads)
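# train_model() and validate_model() read hyperparameters from a module-level
# `args` object that is not shown here. A minimal sketch of a command-line
# entry point that could provide it, assuming argparse and the flag names used
# above (the default values are illustrative, not the project's settings):
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', default='resnet_v1_50')
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--epoch_num', type=int, default=20)
    parser.add_argument('--lr', type=float, default=1e-4)
    args = parser.parse_args()

    train_model()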
from data_processing import prepare_data_brut, train_val_size, prepare_data
from ml_models_results import resmpling_data, results_model_dict, confusion_matrix, reglog_model_results
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cluster import KMeans
from sklearn.utils import resample
import pandas as pd
from prettytable import PrettyTable
from statistics import mean, median, stdev

# prepare data
data = prepare_data_brut(
    path_to="/home/user/Kaggle/data/application_train.csv")
dict_data_input = prepare_data(
    path_to="/home/user/Kaggle/data/application_train.csv")

# datasets :
data_val = dict_data_input['data_val']
data_test = dict_data_input['data_test']

# Get target variable from each dataset
y_val = data_val['TARGET']
y_test = data_test['TARGET']

# drop target variable from each dataset
data_val_model = data_val.drop(['TARGET'], axis=1)
data_test_model = data_test.drop(['TARGET'], axis=1)

# the model
model = GradientBoostingClassifier()

# get data train
# viz_sample_segmentation_augmentations(data["X_train"], data["Y_train"], colormap=data["colormap"], aug_func=aug_func, n_images=2, n_per_image=5, saveto="sample_augmentation_pairs.jpg")

# ##############################################################################
# MAIN
# ##############################################################################
if __name__ == '__main__':
    # SETTINGS
    n_valid = 128
    data_file = "data_256.pickle"
    # vgg16_snapshot = "/path/to/vgg16/vgg_16.ckpt"
    # vgg16_snapshot = "/home/ronny/TEMP/pretrained_models/tfslim/vgg/vgg16/vgg_16.ckpt"

    # PREPARE DATA
    DATA_LIMIT = None
    data = prepare_data(data_file,
                        valid_from_train=True,
                        n_valid=n_valid,
                        max_data=DATA_LIMIT)
    n_classes = len(data["id2label"])

    # MODEL - ERFNet, with Paszke class weighting
    model_name = "aug_erfnetC_03"
    model = SegmentationModel(model_name,
                              img_shape=[256, 256],
                              n_classes=len(data["id2label"]),
                              l2=2e-4)
    class_weights = calculate_class_weights(data["Y_train"],
                                            n_classes=n_classes,
                                            method="paszke",
                                            c=1.10)
    model.set_class_weights(class_weights)
    model.create_graph(erfnetB)
from sklearn.ensemble import GradientBoostingClassifier
from ml_models_results import reglog_model_results, resmpling_data
from data_processing import prepare_data
from sklearn.model_selection import RandomizedSearchCV
from time import time
import pandas as pd

# Import datasets
dict_data = prepare_data(path_to="data/application_train.csv")
data_train = dict_data['data_train']

# resample data
data_resampled = resmpling_data(data_train, 9, string="percentage")

# datasets :
data_val = dict_data['data_val']
data_test = dict_data['data_test']

# Get target variable from each dataset
y_train = data_resampled['TARGET']
y_val = data_val['TARGET']
y_test = data_test['TARGET']

# drop target variable from each dataset
data_train_model = data_resampled.drop(['TARGET'], axis=1)
data_val_model = data_val.drop(['TARGET'], axis=1)
data_test_model = data_test.drop(['TARGET'], axis=1)
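# The imports above suggest a hyperparameter search with RandomizedSearchCV.
# Below is a minimal, hedged sketch of how the resampled training data could
# feed such a search; the parameter grid and n_iter are illustrative
# assumptions, not the values actually used in this project.
param_distributions = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 4, 5],
}
search = RandomizedSearchCV(GradientBoostingClassifier(),
                            param_distributions=param_distributions,
                            n_iter=10,
                            scoring='roc_auc',
                            cv=3,
                            random_state=42)
start = time()
search.fit(data_train_model, y_train)
print("Search took %.1f s, best AUC: %.4f" %
      (time() - start, search.best_score_))
print("Best params:", search.best_params_)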
def main():
    x, y, x_test = data_processing.prepare_data()
    x_train, x_val, y_train, y_val = train_test_split(x,
                                                      y,
                                                      test_size=0.2,
                                                      shuffle=False)
    neg_to_pos_ratio = round(len(y.loc[y == 0]) / len(y.loc[y == 1]))

    models = {
        'XGBoost': XGBClassifier(random_state=42, n_jobs=-1),
        'XGBoost balanced': XGBClassifier(scale_pos_weight=neg_to_pos_ratio,
                                          random_state=42, n_jobs=-1),
        'LightGBM': LGBMClassifier(random_state=42),
        'LightGBM balanced': LGBMClassifier(scale_pos_weight=neg_to_pos_ratio,
                                            random_state=42),
        'CatBoost': CatBoostClassifier(random_state=42, verbose=0),
        'CatBoost balanced': CatBoostClassifier(
            scale_pos_weight=neg_to_pos_ratio, random_state=42, verbose=0),
        'AdaBoost': AdaBoostClassifier(random_state=42),
        'GradientBoosting': GradientBoostingClassifier(random_state=42),
        'RandomForest': RandomForestClassifier(random_state=42, n_jobs=-1),
        'BalancedRandomForest': BalancedRandomForestClassifier(
            random_state=42, n_jobs=-1),
        'ExtraTrees': ExtraTreesClassifier(random_state=42, n_jobs=-1),
        # 'MLP': MLPClassifier(random_state=42),
    }

    scores = {}
    results = []
    for name, model in models.items():
        start = time.time()
        model.fit(x_train, y_train)
        end = time.time()
        print(
            f"Elapsed: {int((end - start) // 60)}m {int((end - start) % 60)}s")
        results.append((name, y_val, model.predict_proba(x_val)[:, 1]))
        scores[name] = get_scores(model, x_val, y_val)

    scores_df = pd.DataFrame(scores).transpose()
    plot_factory.plot_roc_curve(results)

    # Feature importance
    important_features = [
        get_xgboost_important_features(models['XGBoost'], x.columns),
        get_xgboost_important_features(models['XGBoost balanced'], x.columns),
        get_lightgbm_important_features(models['LightGBM'], x.columns),
        get_lightgbm_important_features(models['LightGBM balanced'],
                                        x.columns),
        get_catboost_important_features(models['CatBoost']),
        get_catboost_important_features(models['CatBoost balanced']),
    ]
    unique_features = set(
        [f for sublist in important_features for f in sublist])
    print(f"{len(unique_features)} unique features were chosen")

    x_train_small = x_train.filter(unique_features)
    x_val_small = x_val.filter(unique_features)

    scores_fi = {}
    results_fi = []
    for name, model in models.items():
        model.fit(x_train_small, y_train)
        results_fi.append(
            (name, y_val, model.predict_proba(x_val_small)[:, 1]))
        scores_fi[name] = get_scores(model, x_val_small, y_val)

    scores_fi_df = pd.DataFrame(scores_fi).transpose()
    plot_factory.plot_roc_curve(results_fi)

    # Fine tuning
    scoring = {'Accuracy@10': make_scorer(accuracy_at_10, needs_proba=True)}
    kwargs = {
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
    }
    params = {
        'scale_pos_weight': [1, 7, 13],  # 13
        'max_depth': [3, 4, 5],  # 5
        'min_child_weight': [7],  # 7
        'learning_rate': [0.01],  # 0.01
        'n_estimators': [150, 200],  # 200
        'gamma': [0, 0.2],  # 0
        'subsample': [0.8, 1.0],  # 0.8
        'colsample_bytree': [0.8, 1.0],  # 0.8
    }
    grid_search = GridSearchCV(
        estimator=XGBClassifier(random_state=42, **kwargs),
        param_grid=params,
        scoring=scoring,
        refit='Accuracy@10',
        cv=3,
        # n_jobs=-1,
        verbose=10)

    start_time = time.time()
    grid_search.fit(x, y)
    end_time = time.time()
    print(
        f"Grid search finished in: {str(datetime.timedelta(seconds=end_time - start_time))}"
    )
    print(f"Best params: {grid_search.best_params_}")
    print(f"Best Accuracy@10: {grid_search.best_score_}")
    report_best_scores(grid_search.cv_results_, 1)

    best_params = []
    best_score = []
    best_params += [grid_search.best_params_]
    best_score += [grid_search.best_score_]

    best_model = grid_search.best_estimator_
    # best_model.fit(x_train, y_train)
    scores_train = get_scores(best_model, x_train, y_train)
    scores_val = get_scores(best_model, x_val, y_val)
    fine_tune_df = pd.DataFrame(
        data={
            "XGBoost search 1 - train": scores_train,
            "XGBoost search 1 - val": scores_val
        }).transpose()

    # Cross validation of the best model
    model = XGBClassifier(scale_pos_weight=13,
                          max_depth=5,
                          min_child_weight=7,
                          learning_rate=0.01,
                          n_estimators=200,
                          gamma=0,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          random_state=42)
    scores_cv = {}
    results_cv = []
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    for fold_, (train_index, test_index) in enumerate(kfold.split(x)):
        x_t, x_v = x.iloc[train_index], x.iloc[test_index]
        y_t, y_v = y.iloc[train_index], y.iloc[test_index]
        model.fit(x_t, y_t)
        results_cv.append(
            (f"Fold {fold_}", y_v, model.predict_proba(x_v)[:, 1]))
        scores_cv[f"Fold {fold_}"] = get_scores(model, x_v, y_v)

    scores_cv_df = pd.DataFrame(scores_cv).transpose()
    plot_factory.plot_roc_curve(results_cv)

    # Final predictions
    best_model.fit(x, y)
    y_test_predictions = best_model.predict_proba(x_test)
    repository.save_results(list(y_test_predictions[:, 1]),
                            best_model.__class__.__name__)
import numpy as np
import pandas as pd

import data_processing as data_processing
import plot_factory as plot_factory

x_train, y_train, x_test = data_processing.prepare_data()

# Let's present some of the features on scatter plots
features = list(x_train.transpose().index[0:16])
plot_factory.plot_feature_scatter(x_train[0:len(x_test)], x_test, features)
plot_factory.plot_feature_scatter(x_train[:1000], x_test[:1000], features)

features = list(x_train.transpose().index[16:32])
plot_factory.plot_feature_scatter(x_train[:1000], x_test[:1000], features)

# Density plots of features
# First, let's analyse the distribution of values with target value 0 and 1
t0 = x_train.loc[y_train == 0]
t1 = x_train.loc[y_train == 1]
features = x_train.select_dtypes(['float64', 'int64']).columns[:-1]
plot_factory.plot_feature_distribution(t0, t1, '0', '1', features[0:20])
plot_factory.plot_feature_distribution(t0, t1, '0', '1', features[20:40])

# We can observe that some of the features are clearly different depending on 'class'
# Those features are: Var38, Var73, Var126, Var153

# Let's now compare features from the train and test data sets
features = x_train.select_dtypes(['float64', 'int64']).columns[:-1]
plot_factory.plot_feature_distribution(x_train, x_test, 'train', 'test',
                                       features[0:20])
    noise=10)

# # Visualize samples of augmentations
# from viz import viz_sample_augmentations
# viz_sample_augmentations(data["X_train"], aug_func=aug_func, n_images=10, n_per_image=5, saveto=None)

# ##############################################################################
# MAIN
# ##############################################################################
if __name__ == '__main__':
    # SETTINGS
    n_valid = 1024
    data_file = "/path/to/data.pickle"

    data = prepare_data(data_file,
                        valid_from_train=True,
                        n_valid=n_valid,
                        max_data=None)

    model = MyModel("delete2", img_shape=[28, 28], n_channels=1, n_classes=10)
    model.create_graph()
    model.train(data,
                n_epochs=5,
                print_every=300,
                dropout=0.2,
                aug_func=aug_func)

    # # Pretrained Inception v3 Model
    # pretrained_snapshot = "/path/to/inception_v3.ckpt"
    # model = PretrainedInceptionClassifier("deleteIV3", pretrained_snapshot=pretrained_snapshot, img_shape=[299, 299], n_channels=3, n_classes=10, dynamic=True)
    # model.create_graph()
    # model.train(data, n_epochs=2, print_every=300, batch_size=4, aug_func=aug_func)