def validate_model():
    data = prepare_data()
    # build graph
    with tf.Graph().as_default():
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            'resnet_v1_50', is_training=False)
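        # Note: by analogy with the train_model() call further below, the
        # positional arguments to load_data() appear to be num_epochs=1,
        # batch_size=128 and is_training=False for validation.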
        processed_image, score = load_data(data['val_image_names'],
                                           data['val_image_scores'], 1,
                                           image_preprocessing_fn, 128, False)
        score = tf.reshape(score, [-1, 1])

        logits, _ = predict_model(processed_image, is_training=False)
        variables_to_use = slim.get_variables_to_restore()
        variables_restorer = tf.train.Saver(variables_to_use)
        # Loss
        with tf.name_scope('loss'):
            # RMSE: square root of the mean squared error between predictions and scores
            loss = tf.sqrt(tf.reduce_mean(tf.square(logits - score)))

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        with tf.Session(config=config) as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            variables_restorer.restore(sess, SAVE_MODEL_PATH)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord)
            sum_ls = 0.0
            steps = 0
            try:
                while not coord.should_stop():
                    ls = sess.run(loss)
                    sum_ls += ls
                    steps += 1

            except tf.errors.OutOfRangeError:
                print("Validating: mean loss %f" % (sum_ls / steps))
            finally:
                coord.request_stop()
            coord.join(threads)
    return sum_ls / steps
def fit_model():
    # Static parameters for data pipeline and fitting model
    BATCH_SIZE = 500
    # preparing model configuration
    # TODO: the model configuration could be exposed as a more accessible option
    LR = 1e-5
    EPOCHS = 2
    MODEL_NAME = 'EfficientNet_model'

    # loading, processing data and saving the preprocessed data
    # TODO: this could be reworked into a dialog choice using Tkinter, but I had no need for that in my case
    copy_file_name = 'training_data_copy.npy'
    file_name = 'training_data.npy'
    processed_file_name = 'training_data_processed.npy'
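    # The TODO above mentions a Tkinter dialog; a minimal sketch of that option
    # (hypothetical, not used by the original pipeline) could look like:
    # from tkinter import Tk, filedialog
    # root = Tk()
    # root.withdraw()
    # file_name = filedialog.askopenfilename(
    #     title='Select training data', filetypes=[('NumPy files', '*.npy')])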
    training_data = list(np.load(file_name, allow_pickle=True))
    training_data = data_processing.prepare_data(training_data)
    np.save(processed_file_name, training_data)

    # preparing data for model
    training_data = list(np.load(processed_file_name, allow_pickle=True))

    # setting train and test data for training model
    X = np.array([i[0] for i in training_data]).reshape(-1, 128, 128, 3)
    Y = np.array([i[1] for i in training_data])

    # fitting the model (`model` is expected to be defined at module level)
    model.fit(x=X,
              y=Y,
              epochs=EPOCHS,
              validation_split=0.1,
              verbose=1,
              batch_size=BATCH_SIZE,
              shuffle=True)
    # saving model
    model.save(MODEL_NAME)
    print("Done")
Example #3
def train_model():
    data = prepare_data()
    # build graph
    with tf.Graph().as_default():
        image_preprocessing_fn = preprocessing_factory.get_preprocessing(
            args.model_name, is_training=True)
        processed_image, score = load_data(data['train_image_names'],
                                           data['train_image_scores'],
                                           args.epoch_num,
                                           image_preprocessing_fn,
                                           args.batch_size, True)
        score = tf.reshape(score, [-1, 1])
        print(score.shape)
        logits, _ = predict_model(processed_image, is_training=True)
        print(logits.shape)
        variables_to_restore = slim.get_variables_to_restore(
            exclude=['resnet_v1_50/logits'])
        variables_restorer = tf.train.Saver(variables_to_restore)

        # Loss
        with tf.name_scope('ls'):
            # RMSE: square root of the mean squared error between predictions and scores
            loss = tf.sqrt(tf.reduce_mean(tf.square(logits - score)))
            tf.summary.scalar('loss', loss)

        current_epoch = tf.Variable(0, trainable=False)
        decay_step = EPOCHS_PER_LR_DECAY * len(
            data['train_image_names']) // args.batch_size
        learning_rate = tf.train.exponential_decay(args.lr,
                                                   current_epoch,
                                                   decay_step,
                                                   LR_DECAY_FACTORY,
                                                   staircase=True)
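        # exponential_decay with staircase=True multiplies the learning rate by
        # LR_DECAY_FACTORY once every decay_step global steps, i.e. roughly once
        # every EPOCHS_PER_LR_DECAY epochs over the training set.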

        opt = tf.train.MomentumOptimizer(learning_rate, 0.9)
        #opt = tf.train.AdamOptimizer(learning_rate)
        optimizer = slim.learning.create_train_op(loss,
                                                  opt,
                                                  global_step=current_epoch)

        saver = tf.train.Saver()
        summary_op = tf.summary.merge_all()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        with tf.Session(config=config) as sess:
            summary_writer = tf.summary.FileWriter(TRAIN_LOG_DIR, sess.graph)
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())
            variables_restorer.restore(sess, RES_v1_50_MODEL_PATH)

            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(coord=coord)
            sum_ls = 0.0
            batch_num = len(data['train_image_scores']) // args.batch_size
            val_step = 0
            best_val_ls = 100.0
            try:
                while not coord.should_stop():
                    _, ls, step, summary = sess.run(
                        [optimizer, loss, current_epoch, summary_op])
                    sum_ls += ls

                    if step % 50 == 0:
                        print("Epoch %d, loss %f" % (step / batch_num + 1, ls))
                        summary_writer.add_summary(summary, step)
                    if step % batch_num == 0 and step != 0:
                        print("Epoch %d, mean loss %f" %
                              (step / batch_num + 1, sum_ls / batch_num))
                        sum_ls = 0.0
                        saver.save(sess, SAVE_MODEL_PATH)
                        val_ls = validate_model()
                        if val_ls < best_val_ls:
                            best_val_ls = val_ls
                            saver.save(sess, BEST_MODEL_PATH)
                        print('best val loss %f' % (best_val_ls))
            except tf.errors.OutOfRangeError:
                saver.save(sess, SAVE_MODEL_PATH)
            finally:
                coord.request_stop()
            coord.join(threads)
Example #4
from data_processing import prepare_data_brut, train_val_size, prepare_data
from ml_models_results import resmpling_data, results_model_dict, confusion_matrix, reglog_model_results
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cluster import KMeans
from sklearn.utils import resample
import pandas as pd
from prettytable import PrettyTable
from statistics import mean, median, stdev

# prepare data
data = prepare_data_brut(
    path_to="/home/user/Kaggle/data/application_train.csv")
dict_data_input = prepare_data(
    path_to="/home/user/Kaggle/data/application_train.csv")

# datasets :
data_val = dict_data_input['data_val']
data_test = dict_data_input['data_test']

# Get target variable from each data
y_val = data_val['TARGET']
y_test = data_test['TARGET']

# drop target variable from each dataset
data_val_model = data_val.drop(['TARGET'], axis=1)
data_test_model = data_test.drop(['TARGET'], axis=1)

# the model
model = GradientBoostingClassifier()

# get train data

Example #5

# viz_sample_segmentation_augmentations(data["X_train"], data["Y_train"], colormap=data["colormap"], aug_func=aug_func, n_images=2, n_per_image=5, saveto="sample_augmentation_pairs.jpg")

# ##############################################################################
#                                                                           MAIN
# ##############################################################################
if __name__ == '__main__':
    # SETTINGS
    n_valid = 128
    data_file = "data_256.pickle"
    # vgg16_snapshot = "/path/to/vgg16/vgg_16.ckpt"
    # vgg16_snapshot = "/home/ronny/TEMP/pretrained_models/tfslim/vgg/vgg16/vgg_16.ckpt"

    # PREPARE DATA
    DATA_LIMIT = None
    data = prepare_data(data_file,
                        valid_from_train=True,
                        n_valid=n_valid,
                        max_data=DATA_LIMIT)
    n_classes = len(data["id2label"])

    # MODEL - ERFNet, with Paszke class weighting
    model_name = "aug_erfnetC_03"
    model = SegmentationModel(model_name,
                              img_shape=[256, 256],
                              n_classes=len(data["id2label"]),
                              l2=2e-4)
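    # The "paszke" method follows the ENet paper's class weighting, roughly
    # w_class = 1 / ln(c + p_class), so rare classes receive larger weights.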
    class_weights = calculate_class_weights(data["Y_train"],
                                            n_classes=n_classes,
                                            method="paszke",
                                            c=1.10)
    model.set_class_weights(class_weights)
    model.create_graph(erfnetB)
Example #6
from sklearn.ensemble import GradientBoostingClassifier
from ml_models_results import reglog_model_results, resmpling_data
from data_processing import prepare_data
from sklearn.model_selection import RandomizedSearchCV
from time import time
import pandas as pd


# Import datasets
dict_data = prepare_data(path_to="data/application_train.csv")
data_train = dict_data['data_train']

# resample data
data_resampled = resmpling_data(data_train, 9, string="percentage")


# datasets :
data_val = dict_data['data_val']
data_test = dict_data['data_test']

# Get target variable from each data
y_train = data_resampled['TARGET']
y_val = data_val['TARGET']
y_test = data_test['TARGET']

# drop target variable from each dataset
data_train_model = data_resampled.drop(['TARGET'], axis=1)
data_val_model = data_val.drop(['TARGET'], axis=1)
data_test_model = data_test.drop(['TARGET'], axis=1)

Example #7
def main():
    x, y, x_test = data_processing.prepare_data()
    x_train, x_val, y_train, y_val = train_test_split(x,
                                                      y,
                                                      test_size=0.2,
                                                      shuffle=False)
    neg_to_pos_ratio = round(len(y.loc[y == 0]) / len(y.loc[y == 1]))
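    # The negative/positive ratio is the usual value for scale_pos_weight in
    # XGBoost/LightGBM/CatBoost when re-balancing an imbalanced binary target.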

    models = {
        'XGBoost':
        XGBClassifier(random_state=42, n_jobs=-1),
        'XGBoost balanced':
        XGBClassifier(scale_pos_weight=neg_to_pos_ratio,
                      random_state=42,
                      n_jobs=-1),
        'LightGBM':
        LGBMClassifier(random_state=42),
        'LightGBM balanced':
        LGBMClassifier(scale_pos_weight=neg_to_pos_ratio, random_state=42),
        'CatBoost':
        CatBoostClassifier(random_state=42, verbose=0),
        'CatBoost balanced':
        CatBoostClassifier(scale_pos_weight=neg_to_pos_ratio,
                           random_state=42,
                           verbose=0),
        'AdaBoost':
        AdaBoostClassifier(random_state=42),
        'GradientBoosting':
        GradientBoostingClassifier(random_state=42),
        'RandomForest':
        RandomForestClassifier(random_state=42, n_jobs=-1),
        'BalancedRandomForest':
        BalancedRandomForestClassifier(random_state=42, n_jobs=-1),
        'ExtraTrees':
        ExtraTreesClassifier(random_state=42, n_jobs=-1),
        # 'MLP': MLPClassifier(random_state=42),
    }
    scores = {}
    results = []
    for name, model in models.items():
        start = time.time()
        model.fit(x_train, y_train)
        end = time.time()
        print(
            f"Elapsed: {int((end - start) // 60)}m {int((end - start) % 60)}s")
        results.append((name, y_val, model.predict_proba(x_val)[:, 1]))
        scores[name] = get_scores(model, x_val, y_val)
    scores_df = pd.DataFrame(scores).transpose()
    plot_factory.plot_roc_curve(results)

    # Feature importance
    important_features = [
        get_xgboost_important_features(models['XGBoost'], x.columns),
        get_xgboost_important_features(models['XGBoost balanced'], x.columns),
        get_lightgbm_important_features(models['LightGBM'], x.columns),
        get_lightgbm_important_features(models['LightGBM balanced'], x.columns),
        get_catboost_important_features(models['CatBoost']),
        get_catboost_important_features(models['CatBoost balanced']),
    ]
    unique_features = set(
        [f for sublist in important_features for f in sublist])
    print(f"{len(unique_features)} unique features were chosen")

    x_train_small = x_train.filter(unique_features)
    x_val_small = x_val.filter(unique_features)
    scores_fi = {}
    results_fi = []
    for name, model in models.items():
        model.fit(x_train_small, y_train)
        results_fi.append((name, y_val, model.predict_proba(x_val_small)[:, 1]))
        scores_fi[name] = get_scores(model, x_val_small, y_val)
    scores_fi_df = pd.DataFrame(scores_fi).transpose()
    plot_factory.plot_roc_curve(results_fi)

    # Fine tuning
    scoring = {'Accuracy@10': make_scorer(accuracy_at_10, needs_proba=True)}
    kwargs = {
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
    }
    params = {
        'scale_pos_weight': [1, 7, 13],  # 13
        'max_depth': [3, 4, 5],  # 5
        'min_child_weight': [7],  # 7
        'learning_rate': [0.01],  # 0.01
        'n_estimators': [150, 200],  # 200
        'gamma': [0, 0.2],  # 0
        'subsample': [0.8, 1.0],  # 0.8
        'colsample_bytree': [0.8, 1.0],  # 0.8
    }
    grid_search = GridSearchCV(
        estimator=XGBClassifier(random_state=42, **kwargs),
        param_grid=params,
        scoring=scoring,
        refit='Accuracy@10',
        cv=3,
        # n_jobs=-1,
        verbose=10)
    start_time = time.time()
    grid_search.fit(x, y)
    end_time = time.time()
    print(
        f"Grid search finished in: {str(datetime.timedelta(seconds=end_time - start_time))}"
    )
    print(f"Best params: {grid_search.best_params_}")
    print(f"Best Accuracy@10: {grid_search.best_score_}")
    report_best_scores(grid_search.cv_results_, 1)
    best_params = [grid_search.best_params_]
    best_score = [grid_search.best_score_]

    best_model = grid_search.best_estimator_
    # best_model.fit(x_train, y_train)
    scores_train = get_scores(best_model, x_train, y_train)
    scores_val = get_scores(best_model, x_val, y_val)
    fine_tune_df = pd.DataFrame(
        data={
            f"XGBoost search 1 - train": scores_train,
            f"XGBoost search 1 - val": scores_val
        }).transpose()

    # Cross validation of the best model
    model = XGBClassifier(scale_pos_weight=13,
                          max_depth=5,
                          min_child_weight=7,
                          learning_rate=0.01,
                          n_estimators=200,
                          gamma=0,
                          subsample=0.8,
                          colsample_bytree=0.8,
                          random_state=42)
    scores_cv = {}
    results_cv = []
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    for fold_, (train_index, test_index) in enumerate(kfold.split(x)):
        x_t, x_v = x.iloc[train_index], x.iloc[test_index]
        y_t, y_v = y.iloc[train_index], y.iloc[test_index]
        model.fit(x_t, y_t)
        results_cv.append(
            (f"Fold {fold_}", y_v, model.predict_proba(x_v)[:, 1]))
        scores_cv[f"Fold {fold_}"] = get_scores(model, x_v, y_v)
    scores_cv_df = pd.DataFrame(scores_cv).transpose()
    plot_factory.plot_roc_curve(results_cv)

    # Final predictions
    best_model.fit(x, y)
    y_test_predictions = best_model.predict_proba(x_test)
    repository.save_results(list(y_test_predictions[:, 1]),
                            best_model.__class__.__name__)
Example #8
import numpy as np
import pandas as pd

import data_processing as data_processing
import plot_factory as plot_factory

x_train, y_train, x_test = data_processing.prepare_data()

# Let's present some of the features on scatter plots
features = list(x_train.columns[0:16])
plot_factory.plot_feature_scatter(x_train[0:len(x_test)], x_test, features)

plot_factory.plot_feature_scatter(x_train[:1000], x_test[:1000], features)
features = list(x_train.columns[16:32])
plot_factory.plot_feature_scatter(x_train[:1000], x_test[:1000], features)

# Density plots of features

# First, let's analyse the feature distributions for rows with target value 0 and 1
t0 = x_train.loc[y_train == 0]
t1 = x_train.loc[y_train == 1]
features = x_train.select_dtypes(['float64', 'int64']).columns[:-1]
plot_factory.plot_feature_distribution(t0, t1, '0', '1', features[0:20])
plot_factory.plot_feature_distribution(t0, t1, '0', '1', features[20:40])

# We can observe that some of the features are clearly different depending on 'class'
# Those features are: Var38, Var73, Var126, Var153

# Let's now compare features from the train and test data sets
features = x_train.select_dtypes(['float64', 'int64']).columns[:-1]
plot_factory.plot_feature_distribution(x_train, x_test, 'train', 'test', features[0:20])
Example #9
                                    noise=10)

# # Visualize samples of augmentations
# from viz import viz_sample_augmentations
# viz_sample_augmentations(data["X_train"], aug_func=aug_func, n_images=10, n_per_image=5, saveto=None)

# ##############################################################################
#                                                                           MAIN
# ##############################################################################
if __name__ == '__main__':
    # SETTINGS
    n_valid = 1024
    data_file = "/path/to/data.pickle"

    data = prepare_data(data_file,
                        valid_from_train=True,
                        n_valid=n_valid,
                        max_data=None)

    model = MyModel("delete2", img_shape=[28, 28], n_channels=1, n_classes=10)
    model.create_graph()
    model.train(data,
                n_epochs=5,
                print_every=300,
                dropout=0.2,
                aug_func=aug_func)

    # # Pretrained Inception v3 Model
    # pretrained_snapshot = "/path/to/inception_v3.ckpt"
    # model = PretrainedInceptionClassifier("deleteIV3", pretrained_snapshot=pretrained_snapshot, img_shape=[299, 299], n_channels=3, n_classes=10, dynamic=True)
    # model.create_graph()
    # model.train(data, n_epochs=2, print_every=300, batch_size=4, aug_func=aug_func)