# Example No. 1 (score: 0) -- scraped snippet header, commented out so the file parses
def lightgbm_tp(train, prj_info, setting):
    """Tune lightgbm hyper-parameters by random search within a time budget.

    Candidate parameter sets are the cartesian product of the lists in
    setting['params'].  Candidates are drawn at random and trained until
    setting['naive_tp_time'] seconds have elapsed AND at least three
    experiments have completed.  Every trained model is saved under
    OUTPUT_PATH/shadow/<i>model.txt and the one with the best test metric
    is reloaded from disk.

    train -- pandas dataframe
    prj_info -- dictionary containing project information (response...)
    setting -- dictionary containing settings ('params', 'naive_tp_time')

    Returns (best_experiment, y_test, X_test, W_test, best_lightgbm,
    data_results).
    """
    #Split data on the fold-assignment column
    train_, test_ = split_data(train, prj_info['PRJ_COLUMN']['FOLD_ASSIGN'])

    #Build modelling matrices (response, features, weights, offsets,
    #monotonicity constraints)
    y_train, y_test, X_train, X_test, W_train, W_test, O_train, monotonicity_vec = build_data(
        train_, test_, prj_info)

    #Dataset lightgbm
    lgb_train = lgb.Dataset(X_train,
                            y_train,
                            weight=W_train,
                            init_score=O_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

    #Full grid of candidate parameter sets
    param_mono = {'monotone_constraints': monotonicity_vec}
    keys, values = zip(*setting['params'].items())
    experiments = [dict(zip(keys, v)) for v in itertools.product(*values)]

    #Ensure the shadow directory exists once, before the loop
    shadow_path = prj_info['OUTPUT_PATH'] + "shadow/"
    if not os.path.exists(shadow_path):
        os.makedirs(shadow_path)

    #Random-search loop bounded by a wall-clock budget
    data_results = pd.DataFrame([])
    timeout = time.time() + setting['naive_tp_time']
    loop_exp = 0
    exp_tested = []
    while True:
        #Break on timeout, but only after at least three experiments
        if time.time() > timeout and data_results.shape[0] > 2:
            break
        #Copy the drawn candidate before updating it: the original code
        #mutated the shared dict inside `experiments` in place, polluting
        #the grid (and aliasing the entries stored in exp_tested)
        exp = dict(random.choice(experiments))
        exp.update(param_mono)
        #Model, early-stopped on the test set
        lightgbm = lgb.train(exp,
                             train_set=lgb_train,
                             num_boost_round=5000,
                             early_stopping_rounds=20,
                             valid_sets=lgb_eval,
                             verbose_eval=False)
        #Predict at the best iteration found by early stopping
        pred_train = lightgbm.predict(X_train,
                                      num_iteration=lightgbm.best_iteration)
        pred_test = lightgbm.predict(X_test,
                                     num_iteration=lightgbm.best_iteration)
        #Persist the model so the best one can be reloaded after the loop;
        #the file index equals the loop index (row order of data_results)
        lightgbm.save_model(shadow_path + str(loop_exp) + "model.txt")

        #Metric
        metric_train = error_metric(y_train, pred_train, W_train,
                                    prj_info['METRIC'])
        metric_test = error_metric(y_test, pred_test, W_test,
                                   prj_info['METRIC'])
        #Save results: one row per experiment, in loop order
        data_results_ = pd.DataFrame.from_dict(exp, orient='index').transpose()
        data_results_["train"] = metric_train
        data_results_["test"] = metric_test
        #pd.concat replaces DataFrame.append (removed in pandas >= 2.0)
        data_results = pd.concat([data_results, data_results_])

        exp_tested.append(exp)
        loop_exp = loop_exp + 1

    #Find the experiment maximising the test metric; after reset_index the
    #row position equals the loop index used in the saved file name
    data_results = data_results.reset_index(drop=True)
    best_experiment_index = data_results["test"].idxmax()
    best_experiment = exp_tested[best_experiment_index]
    #Reload the best model from disk
    best_lightgbm = lgb.Booster(model_file=shadow_path +
                                str(best_experiment_index) + "model.txt")
    #loop_exp already equals the number of models built (the original
    #printed loop_exp + 1, overstating the count by one)
    print("        " + str(loop_exp) + " models built")

    return best_experiment, y_test, X_test, W_test, best_lightgbm, data_results
# Example No. 2 (score: 0) -- scraped snippet header, commented out so the file parses
def bp02_autoencoder(train,test,prj_info,encoding_dim = 100):
    """Compress the feature space with a dense autoencoder.

    Fits a symmetric 30-50-encoding_dim encoder (mirrored decoder) on the
    prepared training features, then returns new train/test dataframes whose
    columns are the encoding_dim encoded features ('autoencoder_0'...) plus
    the response, fold-assignment and index columns carried over from the
    originals.

    train -- pandas dataframe
    test -- pandas dataframe
    prj_info -- dictionary containing project information (response...)
    encoding_dim -- int, size of the bottleneck layer

    Returns (train_new, test_new) pandas dataframes.
    """
    #Copy data so the callers' frames are not mutated
    train_ = train.copy()
    test_ = test.copy()
    
    #Build modelling matrices (response, features, weights, offsets)
    y_train,y_test,X_train,X_test,W_train,W_test,O_train,monotonicity_vec = build_data(train_,test_,prj_info)
    #Prep data: fit encoders/scaler on train, then reuse the fitted
    #le_X/scale_X objects on test (statement order matters here)
    X_train,le_X,scale_X = keras_prep_data(X_train)
    X_test,le_X,scale_X = keras_prep_data(X_test,le_X,scale_X)
    
    #Validation split for early stopping: the highest-numbered fold of the
    #training set is held out, the remaining folds are used for fitting
    train_fold = train_[train_[prj_info['PRJ_COLUMN']['FOLD_ASSIGN']] < max(train_[prj_info['PRJ_COLUMN']['FOLD_ASSIGN']])]
    test_fold = train_[train_[prj_info['PRJ_COLUMN']['FOLD_ASSIGN']] == max(train_[prj_info['PRJ_COLUMN']['FOLD_ASSIGN']])]
    y_train_fold,y_valid_fold,X_train_fold,X_valid_fold,W_train_fold,W_test_fold,O_train_fold,monotonicity_vec = build_data(train_fold,test_fold,prj_info)
    #Prep data with the already-fitted encoders/scaler
    X_train_fold,le_X,scale_X = keras_prep_data(X_train_fold,le_X,scale_X)
    X_valid_fold,le_X,scale_X = keras_prep_data(X_valid_fold,le_X,scale_X)
    
    #Stop when validation loss has not improved for 20 epochs
    early_stop = EarlyStopping(monitor='val_loss', patience=20, mode='auto')
    #Encoder: input width is the number of prepared features
    input_dim = Input(shape=(X_train.shape[1],))

    encoded1 = Dense(30, activation = 'relu')(input_dim)#200
    encoded3 = Dense(50, activation = 'relu')(encoded1) #500
    encoded4 = Dense(encoding_dim, activation = 'relu')(encoded3)

    #Decoder mirrors the encoder; sigmoid output assumes the prepared
    #features are scaled into [0, 1] -- TODO confirm against keras_prep_data
    decoded1 = Dense(50, activation = 'relu')(encoded4)
    decoded3 = Dense(30, activation = 'relu')(decoded1)
    decoded4 = Dense(X_train.shape[1], activation = 'sigmoid')(decoded3)

    # Whole model (used only for training the reconstruction)
    autoencoder = Model(input_dim, decoded4)
    # Encoder model (shares the trained layers; used for the final transform)
    encoder = Model(input_dim, encoded4)

    #Compile and fit on the fold split; inputs == targets (reconstruction)
    adam = optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0, amsgrad=False)
    autoencoder.compile(optimizer=adam, loss='mean_squared_error')
    autoencoder.fit(X_train_fold,X_train_fold,
                    epochs=1500,
                    batch_size=20,
                    shuffle=True,
                    verbose=2,
                    validation_data=(X_valid_fold, X_valid_fold),
                    callbacks=[early_stop])
    
    #Encode the full (prepared) train and test feature matrices
    encoded_train = encoder.predict(X_train)
    encoded_test = encoder.predict(X_test)

    #Rebuild train/test frames: columns 'autoencoder_0'..'autoencoder_<d-1>'
    column = list(range(0,encoding_dim))
    column = [str(s) for s in column]
    column = ['autoencoder_' + s for s in column]
    train_new = pd.DataFrame(encoded_train, columns=column)
    test_new = pd.DataFrame(encoded_test, columns=column)

    #Carry over response, fold assignment and index from the original frames
    #(.values strips the original index so rows align positionally)
    train_new[prj_info['PRJ_COLUMN']['RESPONSE']] = y_train
    test_new[prj_info['PRJ_COLUMN']['RESPONSE']] = y_test
    train_new[prj_info['PRJ_COLUMN']['FOLD_ASSIGN']] = train_[prj_info['PRJ_COLUMN']['FOLD_ASSIGN']].values
    test_new[prj_info['PRJ_COLUMN']['FOLD_ASSIGN']] = test_[prj_info['PRJ_COLUMN']['FOLD_ASSIGN']].values
    train_new[prj_info['PRJ_COLUMN']['INDEX']] = train_[prj_info['PRJ_COLUMN']['INDEX']].values
    test_new[prj_info['PRJ_COLUMN']['INDEX']] = test_[prj_info['PRJ_COLUMN']['INDEX']].values
    
    return train_new, test_new
# Example No. 3 (score: 0) -- scraped snippet header, commented out so the file parses
def lightgbm_final_model(train, test, prj_info, best_experiment):
    """Lightgbm final model.

    Runs fold-wise cross-validation with the tuned parameters to estimate
    the metric and the mean best number of boosting rounds, then trains a
    single model on the full training set for that mean number of rounds
    and scores the held-out test set.

    train -- pandas dataframe
    test -- pandas dataframe
    prj_info -- dictionary containing project information (response...)
    best_experiment -- dictionary of lightgbm parameters (from tuning)

    Returns (lightgbm, pred_fold, pred_test) where pred_fold holds
    out-of-fold predictions and pred_test the test-set predictions, both
    keyed by the project index column.
    """
    #Param
    params = best_experiment
    #Build modelling matrices for the full train/test split
    y_train, y_test, X_train, X_test, W_train, W_test, O_train, monotonicity_vec = build_data(
        train, test, prj_info)

    #Cross-validated models, one per fold
    metric_cv = []
    pred_fold = pd.DataFrame([])
    best_it = []
    for fold in range(1,
                      max(train[prj_info['PRJ_COLUMN']['FOLD_ASSIGN']]) + 1):
        print('         Fold ' + str(fold))
        train_fold = train[
            train[prj_info['PRJ_COLUMN']['FOLD_ASSIGN']] != fold]
        test_fold = train[train[prj_info['PRJ_COLUMN']['FOLD_ASSIGN']] == fold]
        #Index values of the held-out rows (same filter as test_fold)
        test_fold_idx = test_fold[prj_info['PRJ_COLUMN']['INDEX']]
        y_train_fold, y_valid_fold, X_train_fold, X_valid_fold, W_train_fold, W_test_fold, O_train_fold, monotonicity_vec = build_data(
            train_fold, test_fold, prj_info)

        #Build dataset -- BUG FIX: use the fold offsets (O_train_fold), not
        #the full-train offsets (O_train), whose length does not match the
        #fold's rows
        lgb_train_fold = lgb.Dataset(X_train_fold,
                                     y_train_fold,
                                     weight=W_train_fold,
                                     init_score=O_train_fold)
        lgb_eval_fold = lgb.Dataset(X_valid_fold,
                                    y_valid_fold,
                                    reference=lgb_train_fold)

        #Model cv, early-stopped on the held-out fold
        lightgbm_cv = lgb.train(params,
                                num_boost_round=5000,
                                early_stopping_rounds=20,
                                train_set=lgb_train_fold,
                                valid_sets=lgb_eval_fold,
                                verbose_eval=False)

        #Out-of-fold predictions at the best iteration
        pred_valid_fold = lightgbm_cv.predict(
            X_valid_fold, num_iteration=lightgbm_cv.best_iteration)
        pred_fold_data = pd.DataFrame(
            data={
                prj_info['PRJ_COLUMN']['INDEX']: test_fold_idx,
                'Pred': pred_valid_fold
            })
        #pd.concat replaces DataFrame.append (removed in pandas >= 2.0)
        pred_fold = pd.concat([pred_fold, pred_fold_data])
        #Metric
        metric_test_cv = error_metric(y_valid_fold, pred_valid_fold,
                                      W_test_fold, prj_info['METRIC'])
        print(metric_test_cv)
        #Save results
        metric_cv.append(metric_test_cv)
        #Save best iteration so the full model can reuse the mean
        best_it.append(lightgbm_cv.best_iteration)

    metric_cv_mean = np.mean(metric_cv)
    best_it_mean = np.mean(best_it)
    #Full model on all training rows, no early stopping: train for the
    #mean best iteration found in CV
    lgb_train = lgb.Dataset(X_train,
                            y_train,
                            weight=W_train,
                            init_score=O_train)

    print('         Full model')
    lightgbm = lgb.train(params,
                         num_boost_round=int(round(best_it_mean)),
                         train_set=lgb_train,
                         verbose_eval=False)

    pred_test = lightgbm.predict(X_test)
    metric_test = error_metric(y_test, pred_test, W_test, prj_info['METRIC'])
    pred_test = pd.DataFrame(
        data={
            prj_info['PRJ_COLUMN']['INDEX']: test[prj_info['PRJ_COLUMN']
                                                  ['INDEX']],
            'Pred': pred_test
        })

    print("    Fold mean " + prj_info['METRIC'] + " : " + str(metric_cv_mean))
    print("    Test " + prj_info['METRIC'] + " : " + str(metric_test))

    return lightgbm, pred_fold, pred_test
# Example No. 4 (score: 0) -- scraped snippet header, commented out so the file parses
def keras_final_model(train,test,prj_info,settings):
    """Train the final Keras classifier.

    Runs fold-wise cross-validation with early stopping to estimate the
    metric and the mean best epoch count, then fits a single model on the
    full training set for that mean number of epochs and scores the test
    set.

    train -- pandas dataframe
    test -- pandas dataframe
    prj_info -- dictionary containing project information (response...)
    settings -- dictionary containing settings (settings['params'] holds
                'epochs', 'batch_size', 'verbose')

    Returns (clf, pred_fold, pred_test, variables_selected, le_X, scale_X)
    so callers can reuse the fitted encoders/scaler at scoring time.
    """
    #Build modelling matrices (response, features, weights, offsets)
    y_train,y_test,X_train,X_test,W_train,W_test,O_train,monotonicity_vec = build_data(train,test,prj_info)
    #Remember the raw feature names before keras_prep_data transforms them
    variables_selected = X_train.columns.values
    
    #Prep data: fit encoders/scaler on train, reuse them on test
    X_train,le_X,scale_X = keras_prep_data(X_train)
    X_test,le_X,scale_X = keras_prep_data(X_test,le_X,scale_X)

    #Stop when validation loss has not improved for 20 epochs
    early_stop = EarlyStopping(monitor='val_loss', patience=20, mode='auto') 
    
    #Model factory passed to KerasClassifier (rebuilt for every fit)
    def bp02_model(input_dim = None):
        adam = optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0, amsgrad=False)
        model = Sequential()
        #Three L2-regularised hidden layers with dropout; the trailing
        #numbers look like previously-tried layer sizes -- TODO confirm
        model.add(Dense(50, input_dim = input_dim, kernel_initializer='normal', activation='relu',kernel_regularizer=regularizers.l2(0.01)))
        model.add(Dropout(0.1)) #1500
        model.add(Dense(20, kernel_initializer='normal', activation='relu',kernel_regularizer=regularizers.l2(0.01)))
        model.add(Dropout(0.1)) #750
        model.add(Dense(20, kernel_initializer='normal', activation='relu',kernel_regularizer=regularizers.l2(0.01)))
        model.add(Dropout(0.1))#750
        #Sigmoid output + binary cross-entropy: binary classification
        model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer=adam)
        return model

    #Cross-validated models, one per fold
    metric_cv = []
    pred_fold = pd.DataFrame([])
    best_it = []
    for fold in range(1,max(train[prj_info['PRJ_COLUMN']['FOLD_ASSIGN']])+1):
        print('         Fold ' + str(fold))
        train_fold = train[train[prj_info['PRJ_COLUMN']['FOLD_ASSIGN']] != fold]
        test_fold = train[train[prj_info['PRJ_COLUMN']['FOLD_ASSIGN']] == fold]
        #Index values of the held-out rows
        test_fold_idx = train[train[prj_info['PRJ_COLUMN']['FOLD_ASSIGN']] == fold][prj_info['PRJ_COLUMN']['INDEX']]
        y_train_fold,y_valid_fold,X_train_fold,X_valid_fold,W_train_fold,W_test_fold,O_train_fold,monotonicity_vec = build_data(train_fold,test_fold,prj_info)

        #Prep data with the already-fitted encoders/scaler
        X_train_fold,le_X,scale_X = keras_prep_data(X_train_fold,le_X,scale_X)
        X_valid_fold,le_X,scale_X = keras_prep_data(X_valid_fold,le_X,scale_X)
        
        #Estimator -- NOTE(review): input_dim uses X_train.shape[1], i.e.
        #the full-train width; presumably identical to the fold width after
        #keras_prep_data -- confirm
        clf_fold = KerasClassifier(build_fn = bp02_model,
                                   input_dim = X_train.shape[1],
                                   epochs = settings['params']['epochs'],
                                   batch_size = settings['params']['batch_size'],
                                   verbose = settings['params']['verbose'],
                                   callbacks=[early_stop])
        
        #Model cv -- NOTE(review): assumes the wrapper's fit returns a
        #History-like object exposing .history['val_loss'] -- confirm for
        #the installed keras wrapper version
        history_fold =  clf_fold.fit(X_train_fold,y_train_fold, validation_data = (X_valid_fold,y_valid_fold))
        #Predict: keep the positive-class probability (column 1)
        pred_valid_fold = clf_fold.predict_proba(X_valid_fold)
        pred_valid_fold = [item[1] for item in pred_valid_fold]
        pred_fold_data = pd.DataFrame(data={prj_info['PRJ_COLUMN']['INDEX']: test_fold_idx, 'Pred' : pred_valid_fold})
        pred_fold = pred_fold.append(pred_fold_data)
        #Metric
        metric_test_cv = error_metric(y_valid_fold,pred_valid_fold,W_test_fold,prj_info['METRIC'])
        print(metric_test_cv)
        #Save results
        metric_cv.append(metric_test_cv)
        #Save best iteration: 1-based epoch of the minimum validation loss
        best_it_fold = history_fold.history['val_loss'].index(min(history_fold.history['val_loss']))+1
        best_it.append(best_it_fold)

    metric_cv_mean = np.mean(metric_cv)
    best_it_mean = np.mean(best_it)
    
    #Full model: no validation split, so train for the mean best epoch
    #count found in CV instead of early stopping
    print('         Full model')

    #Estimator
    clf = KerasClassifier(build_fn = bp02_model,
                          input_dim = X_train.shape[1],
                          epochs = int(round(best_it_mean)),
                          batch_size = settings['params']['batch_size'],
                          verbose = settings['params']['verbose'])
    
    clf.fit(X_train,y_train)

    #Positive-class probability on the test set
    pred_test = clf.predict_proba(X_test)
    pred_test = [item[1] for item in pred_test]
    metric_test = error_metric(y_test,pred_test,W_test,prj_info['METRIC'])
    pred_test = pd.DataFrame(data={prj_info['PRJ_COLUMN']['INDEX']: test[prj_info['PRJ_COLUMN']['INDEX']], 'Pred' : pred_test})
    
    print("    Fold mean " + prj_info['METRIC'] + " : " + str(metric_cv_mean))
    print("    Test " + prj_info['METRIC'] + " : " + str(metric_test))
    
    return clf,pred_fold,pred_test,variables_selected,le_X,scale_X