예제 #1
0
def evaluate_regression(combined,
                        model_name,
                        target_vars=None,
                        id_subset_score=None,
                        day_subset_score=None,
                        n_models=1):
    """Fit one linear model per target on all rows and score it on a subset.

    The design matrix is built from every ``f_*`` and ``e_*`` column of
    ``combined`` plus a one-hot encoding of ``combined.day``; it is robustly
    scaled and a constant column is prepended. Targets whose name contains
    ``"circadian"`` are fitted with an L1-regularised logit model and scored
    with ROC AUC; all other targets are fitted with OLS and scored with R^2.

    Args:
        combined: DataFrame providing ``bee_id``, ``day``, ``date``, the
            ``f_*`` / ``e_*`` feature columns and one column per target.
        model_name: Label stored in the ``"model"`` column of the result.
        target_vars: Iterable of target column names; a default list is used
            when ``None``.
        id_subset_score: Optional container; only rows whose ``bee_id`` is
            ``in`` it are used for scoring.
        day_subset_score: Optional container; only rows whose ``day`` is
            ``in`` it are used for scoring.
        n_models: Number of times each model is fitted and scored; a
            ``*_std`` entry is only reported when this is greater than 1.

    Returns:
        A single-row DataFrame with the model name, the mean (and optionally
        standard deviation) score per target, and the ``bee_id``/``date``
        columns of ``combined`` under ``"data_subset"``.
    """
    from sklearn.preprocessing import RobustScaler, OneHotEncoder
    from sklearn import metrics
    import statsmodels.api as sm

    if target_vars is None:
        target_vars = (
            "power",
            "amplitude",
            "age",
            "mean_movement_speed",
            "total_movement_distance",
            "proportion_active",
            "mean_angular_speed",
            "mean_exit_distance",
            "mean_turtosity",
            "is_circadian",
            "well_tested_circadianess",
        )

    results = {"model": model_name}

    # Boolean mask (one entry per row) selecting the rows used for scoring.
    if id_subset_score is None:
        use_rows = [True] * len(combined)
    else:
        use_rows = [bee_id in id_subset_score for bee_id in combined.bee_id]

    if day_subset_score is not None:
        use_rows = [(use and (day in day_subset_score))
                    for day, use in zip(combined.day, use_rows)]

    # The design matrix does not depend on the target, so build it once.
    # The encoder's default (sparse) output is densified explicitly, which
    # avoids the ``sparse`` / ``sparse_output`` keyword that differs between
    # scikit-learn versions.
    encoder = OneHotEncoder(categories="auto")
    day_dense = encoder.fit_transform(
        combined.day.values.reshape(-1, 1)).toarray()
    # Name the columns from the encoder's own category order: the encoder
    # sorts categories, whereas ``Series.unique()`` keeps appearance order.
    day_onehot = pd.DataFrame(
        day_dense,
        columns=[f"day_{d}" for d in encoder.categories_[0]],
    )

    factor_names = [c for c in combined.columns if c.startswith("f_")]
    emb_names = [c for c in combined.columns if c.startswith("e_")]

    combined_ = np.concatenate(
        (combined[factor_names], combined[emb_names], day_onehot), axis=1)

    feature_names = factor_names + emb_names + list(day_onehot.columns)

    features_scaled = pd.DataFrame(
        sm.add_constant(RobustScaler().fit_transform(combined_)),
        columns=["const"] + feature_names,
    )

    for target_var in tqdm(target_vars):
        # reset_index() aligns the target with the RangeIndex of the features.
        target = combined.reset_index()[target_var]
        observed = combined.loc[use_rows][target_var]
        is_binary = "circadian" in target_var

        scores = []
        for _ in range(n_models):
            if is_binary:
                model = sm.Logit(target, features_scaled).fit_regularized(
                    alpha=1, disp=False, trim_mode="off", qc_tol=1)
                score = metrics.roc_auc_score(
                    observed, model.predict(features_scaled[use_rows]))
            else:
                model = sm.OLS(target, features_scaled).fit()
                score = metrics.r2_score(
                    observed, model.predict(features_scaled[use_rows]))
            scores.append(score)

        metric_name = "roc_auc" if is_binary else "r2"
        results[target_var + "_" + metric_name + "_mean"] = np.mean(scores)
        if n_models > 1:
            results[target_var + "_" + metric_name + "_std"] = np.std(scores)

    # Wrapped in a list so the whole frame lands in a single cell of the row.
    results["data_subset"] = [combined[["bee_id", "date"]]]

    return pd.DataFrame(results)
예제 #2
0
    # Put three text labels on the ticks of the existing colorbar.
    cbar.ax.set_yticklabels(['< -1', '0',
                             '> 1'])  # vertically oriented colorbar

    # Save the current figure (the sample before preprocessing).
    filename = "./imagesEli/y_sample_before_prep.png"
    plt.savefig(filename)

    # Preprocessing: hold out 20% of the samples; the fixed seed makes the
    # split reproducible.
    test_size = 0.20
    seed = 46
    X_train, X_test, Y_train, Y_test = train_test_split(TTdata,
                                                        Vdata,
                                                        test_size=test_size,
                                                        random_state=seed)

    # ...and normalization: each scaler is fitted on the training split only
    # and then re-used, unchanged, on the test split.
    scaler_X = RobustScaler()
    X_train_scaled = scaler_X.fit_transform(X_train)
    X_test_scaled = scaler_X.transform(X_test)
    scaler_y = RobustScaler()
    y_train_scaled = scaler_y.fit_transform(Y_train)
    y_test_scaled = scaler_y.transform(Y_test)

    # View every scaled training target as a 50 x 80 grid.
    Vdata_rect = np.reshape(y_train_scaled, [-1, 50, 80])
    # Make plot with vertical (default) colorbar
    fig, ax = plt.subplots()
    # Plot the first training sample, transposed to 80 rows x 50 columns.
    data1 = Vdata_rect[0]
    data1 = np.reshape(data1, [50, 80]).transpose()
    cax = ax.imshow(data1, cmap=cm.coolwarm)
    ax.set_title('after prep')
    # Add colorbar, make sure to specify tick locations to match desired ticklabels
    cbar = fig.colorbar(cax, ticks=[-1, 0, 1])
예제 #3
0
파일: Kernel1.py 프로젝트: chfenix/kaggle
                 annot=True,
                 square=True,
                 fmt=".2f",
                 annot_kws={"size": 10},
                 yticklabels=valid_feature.values,
                 xticklabels=valid_feature.values,
                 cmap="YlGnBu")
# plt.show()
valid_feature = valid_feature.delete(0)  # drop SalePrice itself
print(valid_feature)
# The selection computed above is only printed; the hand-picked list below is
# what the rest of the script actually uses.
valid_feature = [
    'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath',
    'TotRmsAbvGrd', 'YearBuilt'
]
# Scaling: fit the scaler on the training data only and re-use the same
# centre/scale for the test data, so both sets live in the same feature space.
scaler = RobustScaler()
data_train_scaled = scaler.fit_transform(data_train[valid_feature])
data_test_scaled = scaler.transform(data_test[valid_feature])

# Dimensionality reduction
# NOTE(review): n_components=15 exceeds the 7 selected features; PCA would
# reject this if the commented-out lines below were re-enabled.
pca = PCA(n_components=15)
# data_train_pca = pca.fit_transform(data_train_scaled)    # reduced data
# data_test_pca = pca.transform(data_test_scaled)

# 模型评估
print("========= Modeling =========")
# 进行模型交叉验证
models = [
    LinearRegression(),
    Ridge(),
    Lasso(alpha=0.01, max_iter=10000),
예제 #4
0
                                                stratify=train_demographics['stratify'],
                                                random_state=random_seed)


print('Divided BANC dataset into test and training')
print('Check train test split sizes')
print('X_train: ' + str(Xtrain.shape))
print('X_test: '  + str(Xtest.shape))
print('Y_train: ' + str(Ytrain.shape))
print('Y_test: '  + str(Ytest.shape))
# The validation shapes get their own labels (they were previously printed
# under a misspelt X label and a second 'Y_test' label).
print('X_validate: ' + str(Xvalidate.shape))
print('Y_validate: ' + str(Yvalidate.shape))

# Fit the scaler on the training set only, then apply that same
# transformation to the train, test and validation sets.
robustscaler = RobustScaler().fit(Xtrain)
Xtrain_scaled = robustscaler.transform(Xtrain)
Xtest_scaled = robustscaler.transform(Xtest)
Xvalidate_scaled = robustscaler.transform(Xvalidate)

# Transform pandas into numpy arrays (no need to do it if you are scaling
# the results)
Ytrain = Ytrain.values
Ytest = Ytest.values
Yvalidate = Yvalidate.values

# Best pipeline recommended by TPOT
exported_pipeline = make_pipeline(
      StackingEstimator(estimator=LinearRegression()),
      StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=True,
                                                      max_features=0.9000000000000001,
예제 #5
0
def main():
    """Tune, evaluate and persist a random-forest pipeline for every symbol.

    For each entry of the 'all_merged' dataset index the feature CSV is read,
    a two-bin target is derived from the closing price, and an
    imputer -> scaler -> random-forest pipeline is grid-searched with 5-fold
    cross validation on the first 70% of the rows. The best pipeline is
    pickled, and the statistics collected so far are rewritten to a JSON file
    after every symbol.
    """
    dataset_index = load_dataset('all_merged', return_index=True)
    results_path = './data/datasets/all_merged/estimators/randomforest_hyperparameters.json'
    estimator_path = './data/datasets/all_merged/estimators/randomforest_{}.p'
    tuned = {}

    for _sym, data in dataset_index.items():
        frame = pd.read_csv(data['csv'],
                            sep=',',
                            encoding='utf-8',
                            index_col='Date',
                            parse_dates=True)
        # Turn +/- infinity into NaN so the imputer in the pipeline can
        # replace it with a finite value.
        frame = frame.replace([np.inf, -np.inf], np.nan)

        # Class labels are derived from the variation of the closing price.
        price_change = target_price_variation(frame['close'])
        labels = target_binned_price_variation(price_change, n_bins=2)

        # Chronological 70:30 split (no shuffling): the last 30% of the rows
        # form the blind test set.
        X_train, X_test, y_train, y_test = train_test_split(frame.values,
                                                            labels.values,
                                                            test_size=0.3,
                                                            shuffle=False)

        # Report size and class balance of both partitions.
        print("Training set: # Features {}, # Samples {}".format(
            X_train.shape[1], X_train.shape[0]))
        plot_class_distribution("Training set", _sym, y_train)
        print("Test set: # Features {}, # Samples {}".format(
            X_test.shape[1], X_test.shape[0]))
        plot_class_distribution("Test set", _sym, y_test)

        # Warn (but carry on) about any non-finite values.
        finiteness_checks = (
            (X_train, "Training x is not finite!"),
            (y_train, "Training y is not finite!"),
            (X_test, "Test x is not finite!"),
            (y_test, "Test y is not finite!"),
        )
        for values, message in finiteness_checks:
            if not np.isfinite(values).all():
                logger.warning(message)

        # Imputation and scaling live inside the pipeline so they are
        # re-fitted on the training part of every CV fold. The optional
        # SelectKBest / RandomUnderSampler steps stay disabled.
        steps = [
            ('i', SimpleImputer()),           # fill NaN (default strategy)
            ('s', RobustScaler()),            # centre and scale robustly
            ('c', RandomForestClassifier()),  # final classifier
        ]
        pipeline = Pipeline(steps)

        # 5-fold grid search over the random-forest parameter grid.
        logger.info("Start Grid search")
        search = GridSearchCV(estimator=pipeline,
                              param_grid=RANDOMFOREST_PARAM_GRID,
                              cv=5,
                              n_jobs=4,
                              scoring='neg_mean_squared_error',
                              verbose=1)
        search.fit(X_train, y_train)
        logger.info("End Grid search")

        # Best pipeline found by the search.
        best_pipeline = search.best_estimator_

        # Evaluate on both partitions.
        logger.info("Classification report on train set")
        train_pred = best_pipeline.predict(X_train)
        print(classification_report(y_train, train_pred))
        logger.info("Classification report on test set")
        test_pred = best_pipeline.predict(X_test)
        print(classification_report(y_test, test_pred))

        stats = {
            'score': accuracy_score(y_train, train_pred),
            'mse': mean_squared_error(y_train, train_pred),
            'test_score': accuracy_score(y_test, test_pred),
            'test_mse': mean_squared_error(y_test, test_pred),
            # The CV score is a negated MSE, so flip the sign back.
            'cv_best_mse': -1 * search.best_score_,
            'cv_bestparams': search.best_params_,
        }
        print(stats)

        # Persist the fitted pipeline and remember where it was stored.
        model_path = estimator_path.format(_sym)
        with open(model_path, 'wb') as model_file:
            pickle.dump(best_pipeline, model_file)
        tuned[_sym] = {'estimator': model_path, 'stats': stats}

        # Rewrite the summary after every symbol so partial results survive.
        with open(results_path, 'w') as summary_file:
            json.dump(tuned, summary_file, indent=4)
        print("--- end ---")
예제 #6
0
def get_predictions(train, target, test, mod, mod_type, rand_state, is_shuffle,
                    sample_weight, cat_feats):
    '''
    Run 5-fold stratified CV for ``mod``; return out-of-fold and test preds.

    train: pd.DataFrame of features (accessed with .iloc / .columns)
    target: labels aligned with ``train`` (accessed with .iloc)
    test: 0 if preds on test set aren't required
    test: pd.DataFrame if preds on test set are required
    mod: estimator exposing fit() and predict_proba()
    mod_type: one of 'rf', 'lr', 'et', 'nb', 'knn', 'xgb', 'lgb';
        'lr', 'nb' and 'knn' get a RobustScaler fitted per fold
    rand_state, is_shuffle: forwarded to StratifiedKFold
    sample_weight: object supporting .iloc, or anything else (e.g. None)
        when no weights are available; only 'rf', 'xgb', 'lgb' use it
    cat_feats: forwarded to the 'lgb' fit as ``categorical_feature``

    Returns (final_cv_preds, test_df_preds): the positive-class out-of-fold
    predictions ordered by the original row label, and the fold-averaged test
    predictions (0 when ``test`` is not a DataFrame).

    Raises ValueError for an unknown ``mod_type``.
    '''
    sklearn_types = ('rf', 'lr', 'et', 'nb', 'knn')
    scaled_types = ('lr', 'nb', 'knn')
    if mod_type not in sklearn_types + ('xgb', 'lgb'):
        # Previously this surfaced later as a NameError on ``score``.
        raise ValueError('unsupported mod_type: %r' % (mod_type,))
    needs_scaling = mod_type in scaled_types
    predict_on_test = isinstance(test, pd.DataFrame)

    folds = StratifiedKFold(n_splits=5,
                            random_state=rand_state,
                            shuffle=is_shuffle)
    fold_score = []
    indexes = []

    temp_train = copy.deepcopy(train)
    y = target

    cv_preds = []
    preds = []

    for i, (train_index, test_index) in enumerate(folds.split(temp_train, y)):

        xtrain = temp_train.iloc[train_index]
        xval = temp_train.iloc[test_index]
        ytrain, yval = y.iloc[train_index], y.iloc[test_index]
        # Record the original row labels before scaling resets the index.
        indexes.append(xval.index)

        scaler = None
        if needs_scaling:
            scaler = RobustScaler()
            xtrain = pd.DataFrame(scaler.fit_transform(xtrain),
                                  columns=temp_train.columns)
            xval = pd.DataFrame(scaler.transform(xval),
                                columns=temp_train.columns)

        # Weights are optional: anything that cannot be row-indexed with
        # .iloc is treated as "no weights".
        try:
            train_sample_wt = sample_weight.iloc[train_index].values
        except (AttributeError, TypeError):
            train_sample_wt = None

        if mod_type in sklearn_types:
            if mod_type == 'rf':
                mod.fit(xtrain, ytrain, sample_weight=train_sample_wt)
            else:
                mod.fit(xtrain, ytrain)
            preds_val = mod.predict_proba(xval)[:, 1]
            score = roc_auc_score(yval, preds_val)
        else:
            watchlist = [(xtrain, ytrain), (xval, yval)]
            if mod_type == 'xgb':
                mod.fit(X=xtrain,
                        y=ytrain,
                        eval_set=watchlist,
                        eval_metric=custom_f1,
                        early_stopping_rounds=100,
                        verbose=False,
                        sample_weight=train_sample_wt)
                score = -mod.best_score
            else:  # 'lgb'
                mod.fit(X=xtrain,
                        y=ytrain,
                        eval_set=watchlist,
                        eval_metric='auc',
                        early_stopping_rounds=100,
                        verbose=False,
                        sample_weight=train_sample_wt,
                        categorical_feature=cat_feats)
                # NOTE(review): the metric requested above is 'auc' but the
                # score is read under 'f_1' - confirm the key is present.
                score = mod.best_score_['valid_1']['f_1']
            preds_val = mod.predict_proba(xval)[:, 1]

        # A single formatted string prints identically on Python 2 and 3.
        print('fold %s score: %s' % (i + 1, score))
        fold_score.append(score)

        if predict_on_test:
            # Scale a per-fold copy: overwriting ``test`` itself would
            # re-scale already scaled data on every later fold.
            fold_test = test
            if needs_scaling:
                fold_test = pd.DataFrame(scaler.transform(test),
                                         columns=temp_train.columns)
            preds.append(mod.predict_proba(fold_test)[:, 1])

        cv_preds.append(preds_val)

    print('mean cv score: %s' % (np.mean(fold_score),))
    if predict_on_test:
        test_df_preds = np.mean(preds, axis=0)
    else:
        test_df_preds = 0

    # Flatten the per-fold results and restore the original row order.
    cv_mod_preds = [p for fold in cv_preds for p in fold]
    cv_mod_index = [idx for fold in indexes for idx in fold]

    final_cv_preds = pd.DataFrame({
        'cv_preds': cv_mod_preds,
        'cv_index': cv_mod_index
    }).sort_values('cv_index').cv_preds.values
    return final_cv_preds, test_df_preds
예제 #7
0
],
                       axis=1)

# Class labels (the Histology column) of the two data sets.
public_labels = df_train.Histology
PA_labels = df_test.Histology

# Stack both data sets row-wise into one feature table and one label vector.
tot_data = pd.concat([public_data, PA_data], axis=0)
tot_label = pd.concat([public_labels, PA_labels], axis=0)

encoder = LabelEncoder()

# Scalers to compare.
# NOTE(review): the trailing None presumably stands for "no scaling" -
# confirm where scalers_to_test is consumed.

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler

scalers_to_test = [StandardScaler(), RobustScaler(), MinMaxScaler(), None]

# Starts empty; not filled within this fragment.
df = pd.DataFrame()

# Designate distributions to sample hyperparameters from
# (powers of two from 2**-10 to 2**10, and feature counts 4..9).
C_range = np.power(2, np.arange(-10, 11, dtype=float))
gamma_range = np.power(2, np.arange(-10, 11, dtype=float))
n_features_to_test = np.arange(4, 10)

for i in range(1, 21):

    #Train test split
    X_train, X_test, y_train, y_test = train_test_split(tot_data,
                                                        tot_label,
                                                        test_size=0.3,
                                                        stratify=tot_label,
예제 #8
0
    "dropout_keep_prob": 0.1,
    "hidden_unit": 500,
    "validation_split": 0.1,
    "input_size": 3,
    "num_outputs": 10
}

# In[5]:

# Features are everything except the four dropped columns; the target is the
# 'delay' column, kept two-dimensional via the double brackets.
X_test = test_df.drop(['period', 'powerSetPoint', 'sigma', 'delay'], axis=1)
y_test = test_df[['delay']]

X_train = train_df.drop(['period', 'powerSetPoint', 'sigma', 'delay'], axis=1)
y_train = train_df[['delay']]

scaler_X = RobustScaler()
scaler_y = MinMaxScaler()

# NOTE(review): both scalers are fitted on test and train data together, so
# test-set statistics leak into the preprocessing - confirm this is intended.
scaler_X.fit(np.concatenate([X_test, X_train], axis=0))
scaler_y.fit(np.concatenate([y_test, y_train], axis=0))

# Persist the fitted feature scaler (the target scaler is not saved here).
scaler_filename = "old_scaler.save"
joblib.dump(scaler_X, scaler_filename)

# Build the test set from the scaled data; X_test / y_test are rebound to
# whatever DataLoader.dataset() returns.
data_loader = DataLoader(scaler_X.transform(X_test),
                         scaler_y.transform(y_test), params["batch_size"],
                         params["seq_length"], params["input_size"])
X_test, y_test = data_loader.dataset()

data_loader = DataLoader(scaler_X.transform(X_train),
                         scaler_y.transform(y_train), params["batch_size"],
예제 #9
0
import matplotlib.pyplot as plt
import pandas as pd
from pandas.tools.plotting import parallel_coordinates
from sklearn.cluster import KMeans
from sklearn.preprocessing import RobustScaler
from sklearn import metrics

# Eye state
eye_train_file = "data\\eye_state.csv"
scalar_x = RobustScaler()
eye_label_pos = 14
eye_label_name = 'eyeDetection'
eyeDF = pd.read_csv(eye_train_file, sep=',')
# Scale for plot: every column but the last one is scaled, then the label
# column is attached again so the plot can colour by class.
scaleDF = eyeDF.iloc[:, :-1]
scaleDF = pd.DataFrame(scalar_x.fit_transform(scaleDF),
                       columns=scaleDF.columns)
scaleDF[eye_label_name] = eyeDF[eye_label_name]

# 1. parallel plot
parallel_coordinates(scaleDF, eye_label_name)
plt.legend(loc='lower left', frameon=False)
plt.savefig('image/km/eye/eye_det_parallel.png')
plt.clf()
plt.cla()

# Features for clustering: all columns except the label, robustly scaled.
eyeX = eyeDF.drop(eye_label_name, axis=1)
eyeX = pd.DataFrame(scalar_x.fit_transform(eyeX), columns=eyeX.columns).values
# Ground-truth labels come from the label column; column 0 is a feature.
eyeY = eyeDF[eye_label_name].values
print("Rows,Cols in the eye Detection dataset:{}".format(eyeX.shape))
eyekm = KMeans(n_clusters=2)
예제 #10
0
],
                            axis=1)
# Feature table of the second data set: drop the label and outcome columns.
PA_data = df_test.drop([
    'Histology', 'Surv_time_months', 'OS', 'deadstatus.event', 'Overall_Stage'
],
                       axis=1)

# Class labels (the Histology column) of the two data sets.
public_labels = df_train.Histology
PA_labels = df_test.Histology

encoder = LabelEncoder()

# Scalers to compare (MinMaxScaler is imported but not in the list below).

from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer, MinMaxScaler
scalers_to_test = [StandardScaler(), RobustScaler(), QuantileTransformer()]

# Designate distributions to sample hyperparameters from
# (powers of two from 2**-10 to 2**10, and feature counts 4..9).
C_range = np.power(2, np.arange(-10, 11, dtype=float))
n_features_to_test = np.arange(4, 10)

for i in range(1, 21):

    #Train test split
    X_train, X_test, y_train, y_test = train_test_split(public_data,
                                                        public_labels,
                                                        test_size=0.3,
                                                        stratify=public_labels,
                                                        random_state=i * 500)

    # Vectorize (encode) the labels
예제 #11
0
 def fit(self, X, y=None):
     """Fit a RobustScaler on X and expose its statistics per column.

     The fitted scaler is stored as ``self.rs``; its ``center_`` and
     ``scale_`` arrays are re-published as pandas Series indexed by
     ``X.columns``. ``y`` is accepted but not used. Returns ``self``.
     """
     scaler = RobustScaler()
     self.rs = scaler
     scaler.fit(X)

     column_labels = X.columns
     self.center_ = pd.Series(scaler.center_, index=column_labels)
     self.scale_ = pd.Series(scaler.scale_, index=column_labels)
     return self
예제 #12
0
# NOTE(review): despite its name this is a default RandomForestRegressor, and
# it is not referenced again within this fragment.
final_rf_classifier = RandomForestRegressor()


# Random-forest regressor with fixed hyperparameters and a fixed seed.
final_rf_regressor = RandomForestRegressor\
    (
    bootstrap=False,
    criterion='mse',
    max_depth=80,
    min_samples_split=2,
    max_features='sqrt',
    min_samples_leaf=2,
    n_estimators=2000,
    random_state=42
    )

# Robust scaling followed by the regressor; verbose=True makes the pipeline
# report on each step while fitting.
final_rf_pipeline = Pipeline([('scaler', RobustScaler()),
                              ('kc', final_rf_regressor)],
                             verbose=True)

# Fit the final model (Pipeline.fit returns the pipeline itself)
rf_regressor = final_rf_pipeline.fit(X_classical, unencoded_y)

# Dump the fitted model to disk
joblib.dump(rf_regressor,
            '/home/ubuntu/model_test/saved_models/rf_regressor.mod')

# Load the saved model back
load_regressor = joblib.load(
    '/home/ubuntu/model_test/saved_models/rf_regressor.mod')

# NOTE(review): this scores the in-memory pipeline, not the reloaded
# load_regressor - confirm which one was meant.
get_score(rf_regressor)
### Credit card fraud vs. non-fraud class ratio
target_count = df_train['Class'].value_counts()
print('Class 0:', target_count[0])
print('Class 1:', target_count[1])
print('Proportion:', round(target_count[0] / target_count[1], 2), ': 1')

#target_count.plot(kind='bar', title='Count (target)');

## End of class ratio

# Feature scaling with RobustScaler:
# scale "Time" and "Amount", each with its own independently fitted scaler.



df_train['scaled_amount'] = RobustScaler().fit_transform(df_train['Amount'].values.reshape(-1,1))
df_train['scaled_time'] = RobustScaler().fit_transform(df_train['Time'].values.reshape(-1,1))

# Make a new dataset named "df_scaled" without the original "Time" and
# "Amount" columns (df_train itself keeps them, since inplace=False).
df_scaled = df_train.drop(['Time','Amount'],axis = 1,inplace=False)
df_scaled.head()

# Define the prep_data function to extract features and labels
def prep_data(df):
    """Split a frame into a float feature matrix and a float label column.

    Args:
        df: DataFrame containing a ``Class`` column; every other column is
            treated as a feature. The frame is not modified.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(n_rows, n_features)`` and ``y``
        has shape ``(n_rows, 1)``, both as float NumPy arrays.
    """
    X = df.drop(['Class'], axis=1, inplace=False)
    # The builtin float replaces the np.float alias removed in NumPy 1.24.
    X = np.array(X).astype(float)
    y = df[['Class']]
    y = np.array(y).astype(float)
    return X, y

# Create X and y from the prep_data function 
예제 #14
0
def pd_robustscale(Data):
    """Robust-scale a DataFrame, keeping its index and column labels.

    Returns a ``(scaled_frame, fitted_scaler)`` pair; the scaler is a
    RobustScaler fitted on the values of ``Data``.
    """
    raw_values = Data.values
    fitted = RobustScaler()
    scaled_values = fitted.fit_transform(raw_values)
    scaled_frame = pd.DataFrame(scaled_values,
                                index=Data.index,
                                columns=Data.columns)
    return scaled_frame, fitted
def iqr_robust_scaler(train, test):
    """Fit an IQR-based RobustScaler on ``train`` and apply it to both frames.

    Returns ``(scaler, train_scaled, test_scaled)``; each scaled frame keeps
    the column labels and index values of the frame it was derived from.
    """
    scaler = RobustScaler(with_centering=True,
                          with_scaling=True,
                          quantile_range=(25.0, 75.0),
                          copy=True)
    scaler.fit(train)

    def _rescale(frame):
        # Transform the frame, then restore its column labels and index values.
        scaled = pd.DataFrame(scaler.transform(frame),
                              columns=frame.columns.values)
        return scaled.set_index([frame.index.values])

    train_scaled = _rescale(train)
    test_scaled = _rescale(test)
    return scaler, train_scaled, test_scaled
예제 #16
0
 def LassoPipelineModel(self):
     """Return an unfitted pipeline: RobustScaler followed by a seeded Lasso."""
     scaling_step = RobustScaler()
     lasso_step = Lasso(alpha = 0.00035, random_state = 1)
     return make_pipeline(scaling_step, lasso_step)
예제 #17
0
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
# import xgboost as xgb
# import lightgbm as lgb


#Validation function
n_folds = 5

def rmsle_cv(model):
    """Return the per-fold cross-validated RMSE of ``model``.

    Uses the module-level ``train`` / ``y_train`` data and ``n_folds``; the
    result is an array with one RMSE per fold.
    """
    # Hand the splitter itself to cross_val_score. KFold.get_n_splits() only
    # returns the number of folds, which would discard shuffle/random_state
    # and silently fall back to an unshuffled split.
    kf = KFold(n_folds, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, train.values, y_train,
                                    scoring="neg_mean_squared_error", cv=kf))
    return rmse
    
# Linear models are wrapped in a pipeline that robust-scales the features
# first; each estimator gets a fixed random_state.
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
# Kernel ridge regression with a degree-2 polynomial kernel (no scaling step).
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
# Gradient boosting with the huber loss.
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

'''
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
예제 #18
0
print(x1.shape)
print(x2.shape)
print(x3.shape)

# y_data: the last four columns of `train` (presumably the targets - confirm)
y = train.iloc[:, -4:]
y = y.values

# scaler: each scaler is fitted on the training input and re-used on the
# matching prediction input
scaler1 = MinMaxScaler()
scaler1.fit(x1)
train_rho2 = scaler1.transform(x1)
test_rho2 = scaler1.transform(x1_pred)

scaler2 = RobustScaler()
scaler2.fit(x2)
# transform() yields the scaled data; fit() would only return the scaler
# object (and re-fitting on x2_pred would discard the training statistics).
train_ratio = scaler2.transform(x2)
test_ratio = scaler2.transform(x2_pred)

# scaler2 is deliberately re-fitted here, after the x2 data was transformed.
scaler2.fit(x3)
train_fft = scaler2.transform(x3)
test_fft = scaler2.transform(x3_pred)

# train_test_split
# NOTE(review): the split uses the raw x1/x2/x3 arrays, not the scaled
# train_rho2/train_ratio/train_fft computed above - confirm this is intended.
x1_train, x1_test, x2_train, x2_test, x3_train, x3_test, y_train, y_test = train_test_split(
    x1, x2, x3, y, random_state=66, train_size=0.8)

act = 'relu'

#2. model
예제 #19
0
                'SCALER', 'PCA__n_components', 'CLF__n_estimators', 'CLF__criterion', 'CLF__bootstrap']


    ## write the data to the specified output path: "output"/+file_name
    ## without adding the index of the dataframe to the output 
    ## and without adding a header to the output. 
    ## => these parameters are added to be fit the desired output. 
    #df.to_csv(f'/home/users/ubaldi/TESI_PA/result_score/Public/score_{name}.csv', index=False, header=fieldnames)



    # Create the output folder (if needed) and save the scores there.

    import os

    outname = f'score_{name}_{str(abbr_scaler)}_YES_NO.csv'

    outdir = f'/home/users/ubaldi/TESI_PA/result_score/Public/{folder}/'
    # exist_ok avoids the race between checking for the directory and
    # creating it.
    os.makedirs(outdir, exist_ok=True)

    fullname = os.path.join(outdir, outname)

    # `fieldnames` supplies the header row; the index is not written.
    df.to_csv(fullname, index=False, header=fieldnames)



# Run the score export once per scaler; the second argument is a short tag
# for the scaler (presumably used in the output file name - confirm).
create_csv_score_YES_NO(MinMaxScaler(), 'MMS')
create_csv_score_YES_NO(StandardScaler(), 'STDS')
create_csv_score_YES_NO(RobustScaler(), 'RBT')
예제 #20
0
         'age_nan', 'tax_nan', 'ptratio_nan', 'b_nan', 'dis_nan'
     ],
     'main_imputer':
     ModelBasedFullImputer(columns=[
         'crim', 'zn', 'nox', 'indus', 'rm', 'age', 'tax', 'ptratio',
         'b', 'dis'
     ],
                           model=DecisionTreeRegressor(max_depth=None)),
     'poly':
     None,
     'predictor':
     Lasso(alpha=0.01),
     'reduce_dim':
     None,
     'scaler':
     RobustScaler(),
     'simple_imputer':
     FillNaTransformer(from_dict={},
                       mean=[],
                       median=[
                           'crim', 'zn', 'nox', 'indus', 'rm', 'age',
                           'tax', 'ptratio', 'b', 'dis'
                       ],
                       nan_flag=[
                           'crim', 'zn', 'nox', 'indus', 'rm', 'age',
                           'tax', 'ptratio', 'b', 'dis'
                       ],
                       zero=[])
 },
 'score': 3.993797473454735,
 'std': 0.5808956921355953
예제 #21
0
# LightGBM regressor with fixed hyperparameters and fixed bagging/feature seeds
model_lgb = lgb.LGBMRegressor(objective='regression',
                              num_leaves=5,
                              learning_rate=0.05,
                              n_estimators=720,
                              max_bin=55,
                              bagging_fraction=0.8,
                              bagging_freq=5,
                              feature_fraction=0.2319,
                              feature_fraction_seed=9,
                              bagging_seed=9,
                              min_data_in_leaf=6,
                              min_sum_hessian_in_leaf=11)

# Elastic Net, preceded by robust scaling of the features
ENet = make_pipeline(RobustScaler(),
                     ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
"""
Lasso:

This model may be very sensitive to outliers. So we need to made it more robust on them. For that we use the sklearn's 
Robustscaler() method on pipeline
"""
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))

# Kernel Ridge Regression
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

# Let's score our models ...
# Prints the mean and the standard deviation of the scores from rmsle_cv.
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
예제 #22
0
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.metrics import r2_score, mean_squared_error
from MLP_Class_FromScratch import MultiP

listind=np.linspace(start=0,stop=20427,num=3000).astype(int) # To use only a subset of the dataset, put listind in place of the first colon on the line below.
print(listind)
df = pd.read_pickle("df.pkl").iloc[:,:]
y = df.median_house_value.to_numpy()
X = df.drop(["median_house_value"],axis=1).to_numpy()

# Three candidate scalers; only scaler2 is used in the visible loop body.
scaler1 = MinMaxScaler()
scaler2 = StandardScaler()
scaler3 = RobustScaler()



# NOTE(review): `iter` shadows the Python builtin of the same name.
iter = 1 # The number of times the network is run is set here.
mse, r2 = [],[] # The evaluation metrics from each run of the network are collected in these lists.
for i in range(iter):
        mlp = MultiP(inputDim=X.shape[1], firstLayer=32, secondLayer=16, outputDim=1)

        # NOTE(review): X is re-scaled in place on every iteration and the
        # scaler is fitted before the train/test split - confirm intended.
        X = scaler2.fit_transform(X)

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

        mlp.train(X_train,y_train,epochs=300,lr=0.1,dropout1=0.0,dropout2=0.0)  # Dropout rates must be given for both training and testing, and they must match.

        y_pred = np.zeros(len(y_test))
예제 #23
0
from sklearn.datasets import fetch_openml
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import RobustScaler

import warnings

warnings.simplefilter(action="ignore", category=FutureWarning)

# Fetch the "diabetes" dataset from OpenML as plain arrays (not a DataFrame).
X, y = fetch_openml(name="diabetes", as_frame=False, return_X_y=True)

# replace the label names with 0 or 1 ("tested_positive" becomes 1)
y = (y == "tested_positive").astype(int)

# Scale the dataset with sklearn RobustScaler (important for this algorithm)
# The scaler is fitted on the full dataset, before any cross-validation split.
X = RobustScaler().fit_transform(X)

# Using GridSearchCV, to tune the parameters alpha, eta0, learning_rate, loss
# and average of SGDClassifier, we get the following parameters.

clf_not_rob = SGDClassifier(average=10, learning_rate="optimal", loss="hinge")

# Then, we use this estimator as base_estimator of RobustWeightedEstimator.
# Using GridSearchCV, we tuned the parameters c and eta0, with the
# choice of "huber" weighting because the sample_size is not very large.

clf_rob = RobustWeightedClassifier(
    weighting="huber",
    loss="hinge",
    c=1.35,
    eta0=1e-3,
예제 #24
0
# Axis labels for the current figure, then display it.
plt.xlabel('Время (сек.)')
plt.ylabel('Число транзакций')
plt.show()

# Two stacked histograms of the transaction amount sharing the x axis:
# Class == 1 on top, Class == 0 below.
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(15, 5))
ax1.hist(original_df.Amount[original_df.Class == 1], bins=5)
ax1.set_title('Мошеннические транзакции')
ax2.hist(original_df.Amount[original_df.Class == 0], bins=5)
ax2.set_title('Честные транзакции')
plt.xlabel('Сумма')
plt.ylabel('Число транзакций')
plt.yscale('log')
plt.show()

# Scaling
# NOTE(review): rob_scaler is not used below; each column gets its own
# freshly constructed RobustScaler instead.
rob_scaler = RobustScaler()

original_df['rubust_time'] = RobustScaler().fit_transform(
    original_df['Time'].values.reshape(-1, 1))
original_df['robust_amount'] = RobustScaler().fit_transform(
    original_df['Amount'].values.reshape(-1, 1))

# The raw columns are replaced by their scaled versions.
original_df.drop(['Time', 'Amount'], axis=1, inplace=True)

# Move the two scaled columns to the front of the frame: keep copies, drop
# them, then re-insert them at positions 0 and 1.
rubust_time = original_df['rubust_time']
robust_amount = original_df['robust_amount']

original_df.drop(['robust_amount', 'rubust_time'], axis=1, inplace=True)

original_df.insert(0, 'robust_amount', robust_amount)
original_df.insert(1, 'rubust_time', rubust_time)
예제 #25
0
    # Validation features: every column of `val` except 'target'.
    x_val = val.drop('target', axis=1)

    # train, predict, evaluate
    # Baseline on the untransformed features.

    auc, ll = train_and_evaluate(y_train, x_train, y_val, x_val)

    print("No transformation")
    print("AUC: {:.2%}, log loss: {:.2%} \n".format(auc, ll))

    # try different transformations for X
    # X is already scaled to (0,1) so these won't make much difference

    transformers = [
        MaxAbsScaler(),
        MinMaxScaler(),
        RobustScaler(),
        StandardScaler(),
        Normalizer(norm='l1'),
        Normalizer(norm='l2'),
        Normalizer(norm='max')
    ]

    #poly_scaled = Pipeline([ ( 'poly', PolynomialFeatures()), ( 'scaler', MinMaxScaler()) ])
    #transformers.append( PolynomialFeatures(), poly_scaled )

    # Evaluate each transformer in turn, printing it followed by its scores.
    for transformer in transformers:

        print(transformer)
        auc, ll = transform_train_and_evaluate(transformer)
        print("AUC: {:.2%}, log loss: {:.2%} \n".format(auc, ll))
예제 #26
0
    'Histology', 'Surv_time_months', 'OS', 'deadstatus.event', 'Overall_Stage'
],
                            axis=1)
# Feature table of the second data set: drop the label and outcome columns.
PA_data = df_test.drop([
    'Histology', 'Surv_time_months', 'OS', 'deadstatus.event', 'Overall_Stage'
],
                       axis=1)

# Class labels (the Histology column) of the two data sets.
public_labels = df_train.Histology
PA_labels = df_test.Histology

encoder = LabelEncoder()

# Scalers to compare (StandardScaler is imported but not in the list below).
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
scalers_to_test = [RobustScaler(), MinMaxScaler()]

# Starts empty; not filled within this fragment.
df = pd.DataFrame()

# Designate distributions to sample hyperparameters from
# (powers of two from 2**-10 to 2**10, and feature counts 4..9).
C_range = np.power(2, np.arange(-10, 11, dtype=float))
n_features_to_test = np.arange(4, 10)

# 20 repetitions (i = 1..20); each one draws a different stratified 70/30
# split because the seed is derived from the loop index.
for i in range(1, 21):

    #Train test split
    X_train, X_test, y_train, y_test = train_test_split(public_data,
                                                        public_labels,
                                                        test_size=0.3,
                                                        stratify=public_labels,
                                                        random_state=i * 500)
예제 #27
0
from sklearn.svm import NuSVC
from sklearn.linear_model import Perceptron
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.base import BaseEstimator, TransformerMixin

#=================Scaler
# Candidate preprocessing transformers (a list, despite the singular name).
scaler = [
    StandardScaler(),
    MinMaxScaler(),
    MaxAbsScaler(),
    RobustScaler(quantile_range=(25, 75)),
    PowerTransformer(method='yeo-johnson'),
    # NOTE(review): presumably disabled because Box-Cox only accepts strictly
    # positive input -- confirm before re-enabling.
    # PowerTransformer(method='box-cox'),
    QuantileTransformer(output_distribution='normal'),
    QuantileTransformer(output_distribution='uniform'),
    Normalizer()
]

#=================Classifier
classifier_test = [
    OneVsRestClassifier(SVC()),
    DecisionTreeClassifier(max_depth=5),
    SVC(),
    SVC(kernel="linear", C=0.025),
    LogisticRegressionCV(cv=5, random_state=0),
    GradientBoostingClassifier(random_state=0),
예제 #28
0
  # Clip unorm_train[i] to its 0.99 quantile from above, then to its 0.01
  # quantile from below. clip_upper()/clip_lower() were removed in
  # pandas 1.0; clip(upper=...) / clip(lower=...) are the direct equivalents.
  unorm_train[i] = unorm_train[i].clip(upper=unorm_train[i].quantile(0.99))
  unorm_train[i] = unorm_train[i].clip(lower=unorm_train[i].quantile(0.01))
# for i in clip_lower_feature:


# remove outlier
# outlier_index = unorm_train[(unorm_train['TotalSF'] > 100000) | (unorm_train['BsmtValue']>60000) | (unorm_train['TotalBsmtSF'] > 6000) | (unorm_train['LotFrontage'] > 200) | (unorm_train['LotArea'] > 100000) | (unorm_train['MasVnrArea'] > 1500)].index
# outlier_index = unorm_train[(unorm_train['GrLivArea'] > 4000)].index
# unorm_train.drop(outlier_index, inplace=True)

# print(outlier_index)
# Training features: everything except the SalePrice target.
train_data2 = unorm_train.drop(['SalePrice'], axis=1)


# Scaling
# Fit the scaler on the training features only and apply the *same* fitted
# centre/scale to the test features. Fitting a second scaler on the test set
# would put train and test on different scales and hand the model inputs it
# was not trained on.
# NOTE(review): assumes test_data has the same columns in the same order as
# train_data2 (both are labelled with combined_df.columns below) -- confirm.
feature_scaler = RobustScaler()
trans_train = feature_scaler.fit_transform(train_data2.values)
# np.asarray keeps the input a plain array, matching what the scaler was
# fitted on.
trans_test = feature_scaler.transform(np.asarray(test_data))

output_train = pd.DataFrame(trans_train, columns=combined_df.columns)
# The target is stored on a log1p scale.
output_train['SalePrice'] = pd.Series(np.log1p(SalePrice.values))

output_test = pd.DataFrame(trans_test , columns=combined_df.columns)
output_test['Id'] = pd.Series(Test_id)


# selected most important features

important_features = ['OverallQual', 'ExterQual', 'GarageValue',
  'GrLivArea', 'GarageAge', 'KitchenQual', 'FullBath', 'BsmtQual',
  'BathValue', 'HouseAge', 'GarageCars', 'Oldness',
  'GarageFinish', 'GarageArea', 'KitchenValue', 'ExterValue',
예제 #29
0
def main():
    """Tune and compare four classifiers under three feature scalings.

    Reads ``data/resampled_data.csv`` (target column ``signal``), holds out
    20% of the rows, and scales the features with MinMax, Standard and Robust
    scalers. For each scaling it tunes a KNN (grid search), an SGD classifier
    and an SVC (randomized search) and an XGBoost classifier (Bayesian
    search), predicts on the held-out rows and collects the output of
    ``gather_performance_metrics`` for every model.

    Side effects: writes ``reports/metrics_results.csv``, calls
    ``accuracy_heatmap`` and saves the current figure to
    ``images/accuracy_heatmap.png``; the Bayesian search prints progress.
    """
    # load data
    train_df = pd.read_csv("data/resampled_data.csv")
    y = train_df["signal"]
    X = train_df.drop("signal", axis=1)

    # hold out 20% of the rows for evaluation
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)

    # scaling is required for some of the following models
    # try different scalings
    # Every scaler is fitted on the training split only and then applied
    # unchanged to the test split: re-fitting on the test split would scale
    # the two splits differently and leak test statistics into evaluation.
    minmax_scaler = MinMaxScaler()
    X_train_minmax_scaled = minmax_scaler.fit_transform(X_train)
    X_test_minmax_scaled = minmax_scaler.transform(X_test)

    std_scaler = StandardScaler()
    X_train_std_scaled = std_scaler.fit_transform(X_train)
    X_test_std_scaled = std_scaler.transform(X_test)

    # Robust Scaler: centres on the median and scales by the interquartile
    # range, so outliers have little influence on the resulting scale.
    rob_scaler = RobustScaler()
    X_train_rob_scaled = rob_scaler.fit_transform(X_train)
    X_test_rob_scaled = rob_scaler.transform(X_test)

    # NOTE(review): DataFrame.append (used below) was removed in pandas 2.0.
    # Switching to pd.concat needs the return type of
    # gather_performance_metrics, which is not visible here.
    metrics_df = pd.DataFrame()
    for X_train_scaled, X_test_scaled, scale in [
        (X_train_minmax_scaled, X_test_minmax_scaled, "minmax"),
        (X_train_std_scaled, X_test_std_scaled, "std"),
        (X_train_rob_scaled, X_test_rob_scaled, "rob"),
    ]:

        # # model 1 - KNN
        knn_clf = KNeighborsClassifier()
        # make use of grid search as is relatively fast for knn
        knn_rs_cv = GridSearchCV(
            estimator=knn_clf,
            param_grid={"n_neighbors": range(1, 60)},
            scoring="accuracy",
        )
        knn_rs_cv.fit(X=X_train_scaled, y=y_train)
        knn_prediction = knn_rs_cv.predict(X_test_scaled)
        metrics_df = metrics_df.append(
            gather_performance_metrics(
                y_true=y_test,
                y_pred=knn_prediction,
                model_name=f"knn_{scale}",
                best_params=knn_rs_cv.best_params_,
            ))

        # model 2 - SGD: stochastic gradient descent
        # minimizes the loss function plus
        # the regularization term (penalty, against overfitting)
        sgd_clf = SGDClassifier()
        sgd_rs_cv = RandomizedSearchCV(
            estimator=sgd_clf,
            param_distributions={
                "loss": [
                    "log",
                    "hinge",
                    "modified_huber",
                    "squared_hinge",
                    "perceptron",
                    "squared_loss",
                    "huber",
                    "epsilon_insensitive",
                    "squared_epsilon_insensitive",
                ],
                "penalty": ["l2", "l1"],
                "alpha":
                list(range_inc(0.001, 50, 0.01, 1.5, 3)),
                "early_stopping": [True],
                "random_state": [42],
            },
            scoring="accuracy",
        )
        sgd_rs_cv.fit(X_train_scaled, y_train)
        sgd_prediction = sgd_rs_cv.predict(X_test_scaled)
        metrics_df = metrics_df.append(
            gather_performance_metrics(
                y_true=y_test,
                y_pred=sgd_prediction,
                model_name=f"sgd_{scale}",
                best_params=sgd_rs_cv.best_params_,
            ))

        # model 3 - support vector machines
        # The search clones and fits the estimator itself, so no separate
        # up-front fit of the untuned SVC is needed.
        svm_clf = svm.SVC()
        svm_rs_cv = RandomizedSearchCV(
            estimator=svm_clf,
            param_distributions={
                "C": list(range_inc(0.5, 100, 0.9, 1.2, 1)),
                "break_ties": [False],
                "decision_function_shape": ["ovo", "ovr"],
                "kernel": ["poly", "rbf", "sigmoid"],
                "random_state": [42],
            },
            scoring="accuracy",
        )
        svm_rs_cv.fit(X_train_scaled, y_train)
        svm_prediction = svm_rs_cv.predict(X_test_scaled)
        metrics_df = metrics_df.append(
            gather_performance_metrics(
                y_true=y_test,
                y_pred=svm_prediction,
                model_name=f"svm_{scale}",
                best_params=svm_rs_cv.best_params_,
            ))

        # model 4 - bayesian optimization + cv
        bayes_cv_tuner = BayesSearchCV(
            estimator=XGBClassifier(
                n_jobs=1,
                objective="binary:logistic",
                eval_metric="auc",
                # fraction of the training rows xgboost randomly samples
                # before growing each tree, to limit overfitting
                subsample=0.8,
                use_label_encoder=False,
                random_state=42,
            ),
            search_spaces={
                "learning_rate": (0.01, 1.0),
                "max_depth": (3, 7),
                "min_child_weight": (1, 10),
                "gamma": (1e-9, 0.5),
                "colsample_bytree": (0.01, 1.0),
                "colsample_bylevel": (0.01, 1.0),
                "reg_lambda": (1, 1000),
                "reg_alpha": (1e-9, 1.0),
                "n_estimators": (50, 100),
            },
            cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
            n_iter=10,
            verbose=0,
            refit=True,
            random_state=42,
        )

        def status_print(optim_result):
            """Status callback during bayesian hyperparameter search"""

            # Get all the models tested so far in DataFrame format
            all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)

            # Report how many candidates were tried and the best one so far
            print("Model #{}\nBest Accuracy: {}\nBest params: {}\n".format(
                len(all_models),
                np.round(bayes_cv_tuner.best_score_, 4),
                bayes_cv_tuner.best_params_,
            ))

        result = bayes_cv_tuner.fit(X_train_scaled,
                                    y_train,
                                    callback=status_print)
        xgb_prediction = result.predict(X_test_scaled)
        metrics_df = metrics_df.append(
            gather_performance_metrics(
                y_true=y_test,
                y_pred=xgb_prediction,
                model_name=f"xgb_bayes_opt_{scale}",
                best_params=bayes_cv_tuner.best_params_,
            ))
    metrics_df.to_csv("reports/metrics_results.csv")
    accuracy_heatmap(df=metrics_df)
    plt.savefig("images/accuracy_heatmap.png")
                                                    public_labels,
                                                    test_size=0.3,
                                                    stratify=public_labels,
                                                    random_state=1)

# Encode the class labels as integers. The encoder is fitted on the training
# labels only and then reused for the test labels.

from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
train_labels_encoded = encoder.fit_transform(y_train)
test_labels_encoded = encoder.transform(y_test)

# Scalers to compare

from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer
scalers_to_test = [StandardScaler(), RobustScaler()]

# Designate distributions to sample hyperparameters from
C_range = np.array([
    9.78736006e+00, 2.23814334e+01, 1.00000000e-04, 1.00000000e-04,
    1.74371223e+01, 1.00000000e-04, 2.96832303e-01, 1.06931597e+01,
    8.90706391e+00, 1.75488618e+01, 1.49564414e+01, 1.06939267e+01,
    1.00000000e-04, 7.94862668e+00, 3.14271995e+00, 1.00000000e-04,
    1.41729905e+01, 8.07236535e+00, 4.54900806e-01, 1.00000000e-04,
    1.00000000e-04, 1.99524074e+00, 4.68439119e+00, 1.00000000e-04,
    1.16220405e+01, 1.00000000e-04, 1.00000000e-04, 1.03972709e+01,
    1.00000000e-04, 1.00000000e-04, 1.00000000e-04, 1.00000000e-04,
    1.25523737e+01, 1.00000000e-04, 1.66095249e+01, 8.07308186e+00,
    1.00000000e-04, 1.00000000e-04, 1.00000000e-04, 1.00000000e-04,
    2.08711336e+01, 1.64441230e+00, 1.15020554e+01, 1.00000000e-04,
    1.81035130e+00, 1.17786194e+01, 1.00000000e-04, 1.03111446e+01,