def evaluate_regression(combined, model_name, target_vars=None, id_subset_score=None,
                        day_subset_score=None, n_models=1):
    """Fit one linear or logistic model per target variable and score it.

    Features are every column of ``combined`` starting with ``f_`` or ``e_``
    plus a one-hot encoding of ``combined.day``; they are robust-scaled and
    get a constant column. Models are always fitted on all rows; the
    ``*_subset_score`` arguments only restrict the rows used for scoring.

    Args:
        combined: DataFrame with ``bee_id``, ``day``, ``date``, the feature
            columns and one column per entry of ``target_vars``.
        model_name: Label stored in the ``model`` column of the result.
        target_vars: Names of the target columns. Targets whose name contains
            ``"circadian"`` are fitted with a regularized Logit and scored
            with ROC AUC, all others with OLS and R^2.
        id_subset_score: Optional container of bee ids to score on.
        day_subset_score: Optional container of days to score on.
        n_models: Number of fits per target; a ``*_std`` entry is only
            reported when this is greater than 1.

    Returns:
        A one-row DataFrame with the model name, the per-target scores and a
        ``data_subset`` cell holding ``combined[["bee_id", "date"]]``.
    """
    from sklearn.preprocessing import RobustScaler, OneHotEncoder
    from sklearn import metrics
    import statsmodels.api as sm

    if target_vars is None:
        target_vars = (
            "power",
            "amplitude",
            "age",
            "mean_movement_speed",
            "total_movement_distance",
            "proportion_active",
            "mean_angular_speed",
            "mean_exit_distance",
            "mean_turtosity",
            "is_circadian",
            "well_tested_circadianess",
        )

    results = {}
    results["model"] = model_name

    # Boolean mask of the rows that take part in scoring.
    if id_subset_score is None:
        use_rows = np.ones(len(combined), dtype=bool)
    else:
        use_rows = np.array(
            [bee_id in id_subset_score for bee_id in combined.bee_id], dtype=bool)
    if day_subset_score is not None:
        use_rows &= np.array(
            [day in day_subset_score for day in combined.day], dtype=bool)

    # The design matrix does not depend on the target, so build it only once.
    # The encoder's default output is sparse in every scikit-learn release,
    # which avoids the renamed ``sparse`` / ``sparse_output`` keyword.
    encoder = OneHotEncoder(categories="auto")
    day_matrix = encoder.fit_transform(combined.day.values.reshape(-1, 1))
    if hasattr(day_matrix, "toarray"):
        day_matrix = day_matrix.toarray()
    # Label the columns from the encoder's own (sorted) categories;
    # ``unique()`` returns appearance order and could mislabel the columns.
    day_onehot = pd.DataFrame(
        day_matrix,
        columns=[f"day_{d}" for d in encoder.categories_[0]],
    )

    factor_names = [c for c in combined.columns if c.startswith("f_")]
    emb_names = [c for c in combined.columns if c.startswith("e_")]
    feature_names = factor_names + emb_names + list(day_onehot.columns)
    raw_features = np.concatenate(
        (combined[factor_names], combined[emb_names], day_onehot), axis=1)
    features_scaled = pd.DataFrame(
        sm.add_constant(RobustScaler().fit_transform(raw_features)),
        columns=["const"] + feature_names,
    )

    # Positional index so the targets line up with ``features_scaled``.
    targets = combined.reset_index()

    for target_var in tqdm(target_vars):
        is_classification = "circadian" in target_var
        scores = []
        for _ in range(n_models):
            if is_classification:
                model = sm.Logit(targets[target_var], features_scaled).fit_regularized(
                    alpha=1, disp=False, trim_mode="off", qc_tol=1)
                score_fn = metrics.roc_auc_score
            else:
                model = sm.OLS(targets[target_var], features_scaled).fit()
                score_fn = metrics.r2_score
            scores.append(
                score_fn(
                    combined.loc[use_rows][target_var],
                    model.predict(features_scaled[use_rows])))

        metric_name = "_roc_auc" if is_classification else "_r2"
        results[target_var + metric_name + "_mean"] = np.mean(scores)
        if n_models > 1:
            results[target_var + metric_name + "_std"] = np.std(scores)

    # Wrapped in a list so the whole frame ends up in a single cell.
    results["data_subset"] = [combined[["bee_id", "date"]]]
    return pd.DataFrame(results)
# Label the ticks of the (vertically oriented) colorbar created earlier and
# save the "before preprocessing" figure.
cbar.ax.set_yticklabels(['< -1', '0', '> 1'])
filename = "./imagesEli/y_sample_before_prep.png"
plt.savefig(filename)

# Preprocessing, step 1: hold out 20 % of the samples with a fixed seed.
test_size = 0.20
seed = 46
X_train, X_test, Y_train, Y_test = train_test_split(
    TTdata, Vdata, random_state=seed, test_size=test_size)

# Preprocessing, step 2: robust scaling. Each scaler is fitted on the
# training split only and then applied to both splits.
scaler_X = RobustScaler().fit(X_train)
X_train_scaled = scaler_X.transform(X_train)
X_test_scaled = scaler_X.transform(X_test)

scaler_y = RobustScaler().fit(Y_train)
y_train_scaled = scaler_y.transform(Y_train)
y_test_scaled = scaler_y.transform(Y_test)

# View every scaled training target as a 50 x 80 grid.
Vdata_rect = y_train_scaled.reshape(-1, 50, 80)

# Plot the first scaled training target (transposed) with a default,
# vertical colorbar.
fig, ax = plt.subplots()
data1 = Vdata_rect[0].reshape(50, 80).T
cax = ax.imshow(data1, cmap=cm.coolwarm)
ax.set_title('after prep')

# Tick locations are given explicitly so they can be relabelled afterwards.
cbar = fig.colorbar(cax, ticks=[-1, 0, 1])
annot=True, square=True, fmt=".2f", annot_kws={"size": 10}, yticklabels=valid_feature.values, xticklabels=valid_feature.values, cmap="YlGnBu") # plt.show() valid_feature = valid_feature.delete(0) # 删除SalePrice本身 print(valid_feature) valid_feature = [ 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'TotRmsAbvGrd', 'YearBuilt' ] # 标准化 scaler = RobustScaler() data_train_scaled = scaler.fit_transform(data_train[valid_feature]) data_test_scaled = scaler.fit_transform(data_test[valid_feature]) # 降维 pca = PCA(n_components=15) # data_train_pca = pca.fit_transform(data_train_scaled) # 降维后数据 # data_test_pca = pca.fit_transform(data_test_scaled) # 模型评估 print("========= Modeling =========") # 进行模型交叉验证 models = [ LinearRegression(), Ridge(), Lasso(alpha=0.01, max_iter=10000),
stratify=train_demographics['stratify'], random_state=random_seed) print('Divided BANC dataset into test and training') print('Check train test split sizes') print('X_train: ' + str(Xtrain.shape)) print('X_test: ' + str(Xtest.shape)) print('Y_train: ' + str(Ytrain.shape)) print('Y_test: ' + str(Ytest.shape)) print('X_valitation ' + str(Xvalidate.shape)) print('Y_test: ' + str(Yvalidate.shape)) # Normalise the test dataset and apply the transformation to the train # dataset robustscaler = RobustScaler().fit(Xtrain) Xtrain_scaled = robustscaler.transform(Xtrain) Xtest_scaled = robustscaler.transform(Xtest) Xvalidate_scaled = robustscaler.transform(Xvalidate) # Transform pandas into numpy arrays (no nneed to do it if you are scaling # the results) Ytrain = Ytrain.values Ytest = Ytest.values Yvalidate = Yvalidate.values # Best pipeline recommended by TPOT exported_pipeline = make_pipeline( StackingEstimator(estimator=LinearRegression()), StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=True, max_features=0.9000000000000001,
def main():
    """Tune, evaluate and persist one random-forest pipeline per symbol.

    For every entry of the 'all_merged' dataset index this reads the feature
    CSV, derives a two-bin target from the closing price, grid-searches an
    imputer -> scaler -> random-forest pipeline with 5-fold cross validation,
    prints classification reports, pickles the best estimator and rewrites
    the JSON summary of all symbols processed so far.
    """
    resultFile = './data/datasets/all_merged/estimators/randomforest_hyperparameters.json'
    estFile = './data/datasets/all_merged/estimators/randomforest_{}.p'
    index = load_dataset('all_merged', return_index=True)

    hyperparameters = {}
    for _sym, data in index.items():
        frame = pd.read_csv(data['csv'],
                            sep=',',
                            encoding='utf-8',
                            index_col='Date',
                            parse_dates=True)
        # Turn +/-inf into NaN so the pipeline's imputer can later replace
        # them with a finite value.
        frame = frame.replace([np.inf, -np.inf], np.nan)

        # Derive target classes from the closing price.
        pct_variation = target_price_variation(frame['close'])
        target = target_binned_price_variation(pct_variation, n_bins=2)
        # target = target_discrete_price_variation(target_pct)

        # 70:30 split into train and blind test set. The rows are kept in
        # their original order (no shuffling) because the data is sequential.
        X_train, X_test, y_train, y_test = train_test_split(
            frame.values, target.values, shuffle=False, test_size=0.3)

        # Summarize distribution
        print("Training set: # Features {}, # Samples {}".format(
            X_train.shape[1], X_train.shape[0]))
        plot_class_distribution("Training set", _sym, y_train)
        print("Test set: # Features {}, # Samples {}".format(
            X_test.shape[1], X_test.shape[0]))
        plot_class_distribution("Test set", _sym, y_test)

        # Warn (but carry on) when a split still contains NaN or inf values.
        finiteness_checks = (
            (X_train, "Training x is not finite!"),
            (y_train, "Training y is not finite!"),
            (X_test, "Test x is not finite!"),
            (y_test, "Test y is not finite!"),
        )
        for values, message in finiteness_checks:
            if not np.isfinite(values).all():
                logger.warning(message)

        # Imputation and scaling live inside the pipeline so that they are
        # re-fitted on the training part of every cross-validation fold.
        steps = [
            ('i', SimpleImputer()),  # default strategy: column mean
            ('s', RobustScaler()),  # center/scale with outlier-robust statistics
            #('k', SelectKBest()), # Select top 10 best features
            #('u', RandomUnderSampler()),
            ('c', RandomForestClassifier()),
        ]
        pipeline = Pipeline(steps)

        # Hyperparameter tuning with 5-fold cross validation.
        logger.info("Start Grid search")
        search = GridSearchCV(estimator=pipeline,
                              param_grid=RANDOMFOREST_PARAM_GRID,
                              cv=5,
                              n_jobs=4,
                              scoring='neg_mean_squared_error',
                              verbose=1)
        search.fit(X_train, y_train)
        logger.info("End Grid search")

        # Best pipeline found by the search.
        best_pipeline = search.best_estimator_

        # Report performance on the training and the blind test set.
        logger.info("Classification report on train set")
        train_predictions = best_pipeline.predict(X_train)
        print(classification_report(y_train, train_predictions))
        logger.info("Classification report on test set")
        test_predictions = best_pipeline.predict(X_test)
        print(classification_report(y_test, test_predictions))

        stats = {
            'score': accuracy_score(y_train, train_predictions),
            'mse': mean_squared_error(y_train, train_predictions),
            'test_score': accuracy_score(y_test, test_predictions),
            'test_mse': mean_squared_error(y_test, test_predictions),
            'cv_best_mse': -1 * search.best_score_,  # CV score is negated MSE
            # 'cv_results': CV_rfc.cv_results_,
            'cv_bestparams': search.best_params_,
        }
        print(stats)

        model_path = estFile.format(_sym)
        with open(model_path, 'wb') as f:
            pickle.dump(best_pipeline, f)
        hyperparameters[_sym] = {'estimator': model_path, 'stats': stats}

        # feature_importances = np.mean([
        #     p.named_steps.c.feature_importances_ for p in clf.estimators_
        # ], axis=0)
        # importances = {X.columns[i]: v for i, v in enumerate(feature_importances)}
        # labeled = {str(k): v for k, v in sorted(importances.items(), key=lambda item: -item[1])}
        # print({
        #     # 'features':sel_features
        #     'feature_importances': labeled,
        #     # 'rank': {l: i + 1 for i, l in enumerate(labeled.keys())},
        # })

        with open(resultFile, 'w') as f:  # Save results at every update
            json.dump(hyperparameters, f, indent=4)
    print("--- end ---")
def get_predictions(train, target, test, mod, mod_type, rand_state, is_shuffle, sample_weight,
                    cat_feats):
    '''
    Run 5-fold stratified CV for ``mod`` and collect out-of-fold predictions.

    train: pd.DataFrame of features
    target: pd.Series of binary labels
    test: 0 if preds on test set aren't required
    test: pd.DataFrame if preds on test set are required
    mod: estimator exposing ``fit`` and ``predict_proba``
    mod_type: one of 'rf', 'lr', 'et', 'nb', 'knn', 'xgb', 'lgb';
        'lr', 'nb' and 'knn' get a per-fold RobustScaler
    rand_state: seed for the fold split (only used when ``is_shuffle``)
    is_shuffle: whether the folds are shuffled
    sample_weight: pd.Series of sample weights, or anything without ``.iloc``
        (e.g. None) to train unweighted
    cat_feats: categorical features forwarded to LightGBM

    returns: (out-of-fold positive-class probabilities ordered by the index
        of ``train``, fold-averaged test probabilities or 0)
    '''
    scaled_types = ('lr', 'nb', 'knn')
    sklearn_types = ('rf', 'lr', 'et', 'nb', 'knn')
    if mod_type not in sklearn_types + ('xgb', 'lgb'):
        # Previously this surfaced as a NameError on ``score`` after the first fold.
        raise ValueError('unsupported mod_type: %r' % (mod_type,))

    # Recent scikit-learn rejects a random_state when shuffle is False.
    folds = StratifiedKFold(n_splits=5,
                            random_state=rand_state if is_shuffle else None,
                            shuffle=is_shuffle)
    fold_score = []
    indexes = []
    temp_train = copy.deepcopy(train)
    y = target
    cv_preds = []
    preds = []
    want_test_preds = isinstance(test, pd.DataFrame)

    for i, (train_index, test_index) in enumerate(folds.split(temp_train, y)):
        xtrain = temp_train.iloc[train_index]
        xval = temp_train.iloc[test_index]
        ytrain, yval = y.iloc[train_index], y.iloc[test_index]
        # Remember the original labels before scaling resets the index.
        indexes.append(xval.index)

        scaler = None
        if mod_type in scaled_types:
            scaler = RobustScaler()
            xtrain = pd.DataFrame(scaler.fit_transform(xtrain), columns=temp_train.columns)
            xval = pd.DataFrame(scaler.transform(xval), columns=temp_train.columns)
        watchlist = [(xtrain, ytrain), (xval, yval)]

        # Best effort: fall back to unweighted training when the weights
        # cannot be sliced like a pandas object.
        try:
            train_sample_wt = sample_weight.iloc[train_index].values
        except (AttributeError, TypeError, IndexError):
            train_sample_wt = None

        if mod_type in sklearn_types:
            if mod_type == 'rf':
                mod.fit(xtrain, ytrain, sample_weight=train_sample_wt)
            else:
                mod.fit(xtrain, ytrain)
            score = roc_auc_score(yval, mod.predict_proba(xval)[:, 1])
        elif mod_type == 'xgb':
            mod.fit(X=xtrain,
                    y=ytrain,
                    eval_set=watchlist,
                    eval_metric=custom_f1,
                    early_stopping_rounds=100,
                    verbose=False,
                    sample_weight=train_sample_wt)
            score = -mod.best_score
        else:  # 'lgb'
            mod.fit(X=xtrain,
                    y=ytrain,
                    eval_set=watchlist,
                    eval_metric='auc',
                    early_stopping_rounds=100,
                    verbose=False,
                    sample_weight=train_sample_wt,
                    categorical_feature=cat_feats)
            # The metric is stored under the name passed as eval_metric.
            score = mod.best_score_['valid_1']['auc']

        # Single-string form prints identically on Python 2 and Python 3.
        print('fold %s score: %s' % (i + 1, score))
        fold_score.append(score)

        if want_test_preds:
            # Scale a per-fold copy; overwriting ``test`` would re-scale
            # already scaled data on every later fold.
            if scaler is not None:
                fold_test = pd.DataFrame(scaler.transform(test), columns=temp_train.columns)
            else:
                fold_test = test
            preds.append(mod.predict_proba(fold_test)[:, 1])

        cv_preds.append(mod.predict_proba(xval)[:, 1])

    print('mean cv score: %s' % np.mean(fold_score))

    if want_test_preds:
        test_df_preds = np.mean(preds, axis=0)
    else:
        test_df_preds = 0

    # Flatten the per-fold results and put them back into index order.
    cv_mod_preds = [p for fold in cv_preds for p in fold]
    cv_mod_index = [ix for fold in indexes for ix in fold]
    final_cv_preds = pd.DataFrame({
        'cv_preds': cv_mod_preds,
        'cv_index': cv_mod_index
    }).sort_values('cv_index').cv_preds.values
    return final_cv_preds, test_df_preds
], axis=1) public_labels = df_train.Histology PA_labels = df_test.Histology tot_data = pd.concat([public_data, PA_data], axis=0) tot_label = pd.concat([public_labels, PA_labels], axis=0) encoder = LabelEncoder() #Scalers from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler scalers_to_test = [StandardScaler(), RobustScaler(), MinMaxScaler(), None] df = pd.DataFrame() # Designate distributions to sample hyperparameters from C_range = np.power(2, np.arange(-10, 11, dtype=float)) gamma_range = np.power(2, np.arange(-10, 11, dtype=float)) n_features_to_test = np.arange(4, 10) for i in range(1, 21): #Train test split X_train, X_test, y_train, y_test = train_test_split(tot_data, tot_label, test_size=0.3, stratify=tot_label,
"dropout_keep_prob": 0.1, "hidden_unit": 500, "validation_split": 0.1, "input_size": 3, "num_outputs": 10 } # In[5]: X_test = test_df.drop(['period', 'powerSetPoint', 'sigma', 'delay'], axis=1) y_test = test_df[['delay']] X_train = train_df.drop(['period', 'powerSetPoint', 'sigma', 'delay'], axis=1) y_train = train_df[['delay']] scaler_X = RobustScaler() scaler_y = MinMaxScaler() scaler_X.fit(np.concatenate([X_test, X_train], axis=0)) scaler_y.fit(np.concatenate([y_test, y_train], axis=0)) scaler_filename = "old_scaler.save" joblib.dump(scaler_X, scaler_filename) data_loader = DataLoader(scaler_X.transform(X_test), scaler_y.transform(y_test), params["batch_size"], params["seq_length"], params["input_size"]) X_test, y_test = data_loader.dataset() data_loader = DataLoader(scaler_X.transform(X_train), scaler_y.transform(y_train), params["batch_size"],
import matplotlib.pyplot as plt
import pandas as pd
try:
    from pandas.plotting import parallel_coordinates
except ImportError:  # very old pandas only ships the pre-0.20 location
    from pandas.tools.plotting import parallel_coordinates
from sklearn.cluster import KMeans
from sklearn.preprocessing import RobustScaler
from sklearn import metrics

# Eye state
eye_train_file = "data\\eye_state.csv"
scalar_x = RobustScaler()
eye_label_pos = 14
eye_label_name = 'eyeDetection'
eyeDF = pd.read_csv(eye_train_file, sep=',')

# Scale for plot: every column except the last one, then re-attach the
# unscaled label column.
scaleDF = eyeDF.iloc[:, :-1]
scaleDF = pd.DataFrame(scalar_x.fit_transform(scaleDF), columns=scaleDF.columns)
scaleDF[eye_label_name] = eyeDF[eye_label_name]

# 1. parallel plot
parallel_coordinates(scaleDF, eye_label_name)
plt.legend(loc='lower left', frameon=False)
plt.savefig('image/km/eye/eye_det_parallel.png')
plt.clf()
plt.cla()

# Feature matrix (robust-scaled) and label vector for clustering.
eyeX = eyeDF.drop(eye_label_name, axis=1)
eyeX = pd.DataFrame(scalar_x.fit_transform(eyeX), columns=eyeX.columns).values
# The labels live in the 'eyeDetection' column; column 0 is a feature.
eyeY = eyeDF[eye_label_name].values

print("Rows,Cols in the eye Detection dataset:{}".format(eyeX.shape))
eyekm = KMeans(n_clusters=2)
], axis=1) PA_data = df_test.drop([ 'Histology', 'Surv_time_months', 'OS', 'deadstatus.event', 'Overall_Stage' ], axis=1) public_labels = df_train.Histology PA_labels = df_test.Histology encoder = LabelEncoder() #Scalers from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer, MinMaxScaler scalers_to_test = [StandardScaler(), RobustScaler(), QuantileTransformer()] # Designate distributions to sample hyperparameters from C_range = np.power(2, np.arange(-10, 11, dtype=float)) n_features_to_test = np.arange(4, 10) for i in range(1, 21): #Train test split X_train, X_test, y_train, y_test = train_test_split(public_data, public_labels, test_size=0.3, stratify=public_labels, random_state=i * 500) #Vettorizzare i label
def fit(self, X, y=None):
    """Fit a RobustScaler on ``X`` and keep its statistics labelled by column.

    ``y`` is accepted for API compatibility and ignored. The fitted scaler is
    stored as ``self.rs``; ``center_`` and ``scale_`` are pandas Series
    indexed by ``X.columns``. Returns ``self``.
    """
    self.rs = RobustScaler()
    fitted = self.rs.fit(X)
    column_labels = X.columns
    for stat_name in ("center_", "scale_"):
        setattr(self, stat_name,
                pd.Series(getattr(fitted, stat_name), index=column_labels))
    return self
# NOTE(review): despite its name this is a default RandomForestRegressor and
# it is not used anywhere in this section - confirm whether it is needed.
final_rf_classifier = RandomForestRegressor()

# Tuned regressor used as the final model.
# NOTE(review): criterion='mse' is spelled 'squared_error' in recent
# scikit-learn releases - confirm against the installed version.
final_rf_regressor = RandomForestRegressor(
    bootstrap=False,
    criterion='mse',
    max_depth=80,
    min_samples_split=2,
    max_features='sqrt',
    min_samples_leaf=2,
    n_estimators=2000,
    random_state=42,
)
final_rf_pipeline = Pipeline([('scaler', RobustScaler()),
                              ('kc', final_rf_regressor)],
                             verbose=True)

# Fit the final model
rf_regressor = final_rf_pipeline.fit(X_classical, unencoded_y)

# Dump the saved model
joblib.dump(rf_regressor,
            '/home/ubuntu/model_test/saved_models/rf_regressor.mod')

# Load the saved model and score the reloaded copy, so that the
# serialization round trip is what actually gets verified.
load_regressor = joblib.load(
    '/home/ubuntu/model_test/saved_models/rf_regressor.mod')
get_score(load_regressor)
### Credit card fraud vs. non-fraud class ratio
target_count = df_train['Class'].value_counts()
print('Class 0:', target_count[0])
print('Class 1:', target_count[1])
print('Proportion:', round(target_count[0] / target_count[1], 2), ': 1')
#target_count.plot(kind='bar', title='Count (target)');
## End of class ratio

# Feature scaling with RobustScaler: "Time" and "Amount" are each scaled by
# their own, independently fitted scaler.
df_train['scaled_amount'] = RobustScaler().fit_transform(df_train['Amount'].values.reshape(-1, 1))
df_train['scaled_time'] = RobustScaler().fit_transform(df_train['Time'].values.reshape(-1, 1))

# New dataset "df_scaled" without the original "Time" and "Amount" columns
df_scaled = df_train.drop(['Time', 'Amount'], axis=1, inplace=False)
df_scaled.head()


# Define the prep_data function to extract features
def prep_data(df):
    """Split ``df`` into a float feature matrix and a float label column.

    Returns ``(X, y)`` where ``X`` holds every column except 'Class' and
    ``y`` is the 'Class' column with shape ``(n_samples, 1)``.
    """
    X = df.drop(['Class'], axis=1, inplace=False)
    # ``np.float`` was only an alias of the builtin and is gone in NumPy >= 1.24.
    X = np.array(X).astype(float)
    y = df[['Class']]
    y = np.array(y).astype(float)
    return X, y

# Create X and y from the prep_data function
def pd_robustscale(Data):
    """Robust-scale all columns of ``Data``, keeping its index and columns.

    Returns the scaled DataFrame together with the fitted RobustScaler.
    """
    scaler = RobustScaler()
    scaled_values = scaler.fit_transform(Data.values)
    return pd.DataFrame(scaled_values, index=Data.index, columns=Data.columns), scaler
def iqr_robust_scaler(train, test):
    """Scale ``train`` and ``test`` with a RobustScaler fitted on ``train``.

    The scaler centers and scales using the 25-75 quantile range. Returns
    ``(scaler, train_scaled, test_scaled)``; both frames carry the column
    labels and index values of their input.
    """
    scaler = RobustScaler(quantile_range=(25.0, 75.0),
                          copy=True,
                          with_centering=True,
                          with_scaling=True)
    scaler.fit(train)

    def _scaled_like(frame):
        # Rebuild a DataFrame around the scaled array with the labels of ``frame``.
        scaled = pd.DataFrame(scaler.transform(frame), columns=frame.columns.values)
        return scaled.set_index([frame.index.values])

    train_scaled = _scaled_like(train)
    test_scaled = _scaled_like(test)
    return scaler, train_scaled, test_scaled
def LassoPipelineModel(self):
    """Return a new pipeline: RobustScaler followed by a fixed-alpha Lasso."""
    scaling_step = RobustScaler()
    lasso_step = Lasso(alpha=0.00035, random_state=1)
    return make_pipeline(scaling_step, lasso_step)
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone from sklearn.model_selection import KFold, cross_val_score, train_test_split from sklearn.metrics import mean_squared_error # import xgboost as xgb # import lightgbm as lgb #Validation function n_folds = 5 def rmsle_cv(model): kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values) rmse= np.sqrt(-cross_val_score(model, train.values, y_train, scoring="neg_mean_squared_error", cv = kf)) return(rmse) lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1)) ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3)) KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5) GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', random_state =5) ''' model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, learning_rate=0.05, max_depth=3, min_child_weight=1.7817, n_estimators=2200, reg_alpha=0.4640, reg_lambda=0.8571, subsample=0.5213, silent=1, random_state =7, nthread = -1) model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
print(x1.shape)
print(x2.shape)
print(x3.shape)

# y_data: the last four columns of the training frame
y = train.iloc[:, -4:]
y = y.values

# scaler
# Each scaler is fitted on the training block and then applied to both the
# training block and the matching prediction block.
scaler1 = MinMaxScaler()
scaler1.fit(x1)
train_rho2 = scaler1.transform(x1)
test_rho2 = scaler1.transform(x1_pred)

scaler2 = RobustScaler()
scaler2.fit(x2)
# ``fit`` returns the scaler itself, so the scaled data must come from
# ``transform``; the prediction block reuses the training statistics.
train_ratio = scaler2.transform(x2)
test_ratio = scaler2.transform(x2_pred)

# scaler2 is deliberately re-fitted for the third block, as before.
scaler2.fit(x3)
train_fft = scaler2.transform(x3)
test_fft = scaler2.transform(x3_pred)

# train_test_split
# NOTE(review): this splits the unscaled x1/x2/x3, not the scaled arrays
# computed above - confirm that this is intended.
x1_train, x1_test, x2_train, x2_test, x3_train, x3_test, y_train, y_test = train_test_split(
    x1, x2, x3, y, random_state=66, train_size=0.8)

act = 'relu'

#2. model
'SCALER', 'PCA__n_components', 'CLF__n_estimators', 'CLF__criterion', 'CLF__bootstrap'] ## write the data to the specified output path: "output"/+file_name ## without adding the index of the dataframe to the output ## and without adding a header to the output. ## => these parameters are added to be fit the desired output. #df.to_csv(f'/home/users/ubaldi/TESI_PA/result_score/Public/score_{name}.csv', index=False, header=fieldnames) #create folder and save import os outname = f'score_{name}_{str(abbr_scaler)}_YES_NO.csv' outdir = f'/home/users/ubaldi/TESI_PA/result_score/Public/{folder}/' if not os.path.exists(outdir): os.makedirs(outdir) fullname = os.path.join(outdir, outname) df.to_csv(fullname, index=False, header=fieldnames) create_csv_score_YES_NO(MinMaxScaler(), 'MMS') create_csv_score_YES_NO(StandardScaler(), 'STDS') create_csv_score_YES_NO(RobustScaler(), 'RBT')
'age_nan', 'tax_nan', 'ptratio_nan', 'b_nan', 'dis_nan' ], 'main_imputer': ModelBasedFullImputer(columns=[ 'crim', 'zn', 'nox', 'indus', 'rm', 'age', 'tax', 'ptratio', 'b', 'dis' ], model=DecisionTreeRegressor(max_depth=None)), 'poly': None, 'predictor': Lasso(alpha=0.01), 'reduce_dim': None, 'scaler': RobustScaler(), 'simple_imputer': FillNaTransformer(from_dict={}, mean=[], median=[ 'crim', 'zn', 'nox', 'indus', 'rm', 'age', 'tax', 'ptratio', 'b', 'dis' ], nan_flag=[ 'crim', 'zn', 'nox', 'indus', 'rm', 'age', 'tax', 'ptratio', 'b', 'dis' ], zero=[]) }, 'score': 3.993797473454735, 'std': 0.5808956921355953
# LightGBM regressor; the hyperparameters are collected in one mapping.
_lgb_params = dict(
    objective='regression',
    num_leaves=5,
    learning_rate=0.05,
    n_estimators=720,
    max_bin=55,
    bagging_fraction=0.8,
    bagging_freq=5,
    feature_fraction=0.2319,
    feature_fraction_seed=9,
    bagging_seed=9,
    min_data_in_leaf=6,
    min_sum_hessian_in_leaf=11,
)
model_lgb = lgb.LGBMRegressor(**_lgb_params)

# Elastic Net behind a RobustScaler
ENet = make_pipeline(
    RobustScaler(),
    ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3),
)

# Lasso: this model may be very sensitive to outliers, so it is placed
# behind sklearn's RobustScaler in a pipeline to make it more robust to them.
lasso = make_pipeline(
    RobustScaler(),
    Lasso(alpha=0.0005, random_state=1),
)

# Kernel Ridge Regression
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

# Score the models: cross-validated RMSE, printed as mean and std.
score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))
import numpy as np import matplotlib.pyplot as plt from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler from sklearn.metrics import r2_score, mean_squared_error from MLP_Class_FromScratch import MultiP listind=np.linspace(start=0,stop=20427,num=3000).astype(int) #Verisetinin tamamını kullanmadığımız durumda alt satırdaki ilk iki nokta yerine listind yazılır. print(listind) df = pd.read_pickle("df.pkl").iloc[:,:] y = df.median_house_value.to_numpy() X = df.drop(["median_house_value"],axis=1).to_numpy() scaler1 = MinMaxScaler() scaler2 = StandardScaler() scaler3 = RobustScaler() iter = 1 # Ağ'ın çalışma sayısı burada belirlenir. mse, r2 = [],[] # Ağın her çalışmasında elde edilen değerlendirme metrikleri bu boş listelerin içine aktarılacak. for i in range(iter): mlp = MultiP(inputDim=X.shape[1], firstLayer=32, secondLayer=16, outputDim=1) X = scaler2.fit_transform(X) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) mlp.train(X_train,y_train,epochs=300,lr=0.1,dropout1=0.0,dropout2=0.0) # Dropout oranları hem eğitimde hem testte verilmelidir ve birbiriyle uyuşmalıdır. y_pred = np.zeros(len(y_test))
from sklearn.datasets import fetch_openml from sklearn.metrics import roc_auc_score, make_scorer from sklearn.model_selection import cross_val_score from sklearn.preprocessing import RobustScaler import warnings warnings.simplefilter(action="ignore", category=FutureWarning) X, y = fetch_openml(name="diabetes", as_frame=False, return_X_y=True) # replace the label names with 0 or 1 y = (y == "tested_positive").astype(int) # Scale the dataset with sklearn RobustScaler (important for this algorithm) X = RobustScaler().fit_transform(X) # Using GridSearchCV, to tune the parameters alpha, eta0, learning_rate, loss # and average of SGDClassifier, we get the following parameters. clf_not_rob = SGDClassifier(average=10, learning_rate="optimal", loss="hinge") # Then, we use this estimator as base_estimator of RobustWeightedEstimator. # Using GridSearchCV, we tuned the parameters c and eta0, with the # choice of "huber" weighting because the sample_size is not very large. clf_rob = RobustWeightedClassifier( weighting="huber", loss="hinge", c=1.35, eta0=1e-3,
# Axis labels for the figure started above, then display it.
plt.xlabel('Время (сек.)')
plt.ylabel('Число транзакций')
plt.show()

# Amount histograms, one panel per class, on a shared x axis.
f, (ax1, ax2) = plt.subplots(2, 1, sharex=True, figsize=(15, 5))
ax1.hist(original_df.loc[original_df.Class == 1, 'Amount'], bins=5)
ax1.set_title('Мошеннические транзакции')
ax2.hist(original_df.loc[original_df.Class == 0, 'Amount'], bins=5)
ax2.set_title('Честные транзакции')
plt.xlabel('Сумма')
plt.ylabel('Число транзакций')
plt.yscale('log')
plt.show()

# Scaling: 'Time' and 'Amount' are each scaled by their own freshly fitted
# RobustScaler; the raw columns are then removed.
rob_scaler = RobustScaler()
original_df['rubust_time'] = RobustScaler().fit_transform(
    original_df['Time'].values.reshape(-1, 1))
original_df['robust_amount'] = RobustScaler().fit_transform(
    original_df['Amount'].values.reshape(-1, 1))
original_df.drop(['Time', 'Amount'], axis=1, inplace=True)

# Take the two scaled columns out of the frame and re-insert them at the front.
rubust_time = original_df.pop('rubust_time')
robust_amount = original_df.pop('robust_amount')
original_df.insert(0, 'robust_amount', robust_amount)
original_df.insert(1, 'rubust_time', rubust_time)
x_val = val.drop('target', axis='columns')

# Baseline: train, predict and evaluate on the untransformed features.
auc, ll = train_and_evaluate(y_train, x_train, y_val, x_val)
print("No transformation")
print("AUC: {:.2%}, log loss: {:.2%} \n".format(auc, ll))

# try different transformations for X
# X is already scaled to (0,1) so these won't make much difference
transformers = [scaler_cls() for scaler_cls in (MaxAbsScaler, MinMaxScaler, RobustScaler, StandardScaler)]
transformers += [Normalizer(norm=norm_name) for norm_name in ('l1', 'l2', 'max')]

#poly_scaled = Pipeline([ ( 'poly', PolynomialFeatures()), ( 'scaler', MinMaxScaler()) ])
#transformers.append( PolynomialFeatures(), poly_scaled )

# Evaluate every transformer in turn and report the same two metrics.
for transformer in transformers:
    print(transformer)
    auc, ll = transform_train_and_evaluate(transformer)
    print("AUC: {:.2%}, log loss: {:.2%} \n".format(auc, ll))
'Histology', 'Surv_time_months', 'OS', 'deadstatus.event', 'Overall_Stage' ], axis=1) PA_data = df_test.drop([ 'Histology', 'Surv_time_months', 'OS', 'deadstatus.event', 'Overall_Stage' ], axis=1) public_labels = df_train.Histology PA_labels = df_test.Histology encoder = LabelEncoder() #Scalers from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler scalers_to_test = [RobustScaler(), MinMaxScaler()] df = pd.DataFrame() #Designate distributions to sample hyperparameters from C_range = np.power(2, np.arange(-10, 11, dtype=float)) n_features_to_test = np.arange(4, 10) for i in range(1, 21): #Train test split X_train, X_test, y_train, y_test = train_test_split(public_data, public_labels, test_size=0.3, stratify=public_labels, random_state=i * 500)
from sklearn.svm import NuSVC from sklearn.linear_model import Perceptron from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis from sklearn.svm import SVC from sklearn.ensemble import RandomForestClassifier, VotingClassifier from sklearn.tree import DecisionTreeClassifier from sklearn.experimental import enable_hist_gradient_boosting from sklearn.ensemble import HistGradientBoostingClassifier from sklearn.base import BaseEstimator, TransformerMixin #=================Scaler scaler = [ StandardScaler(), MinMaxScaler(), MaxAbsScaler(), RobustScaler(quantile_range=(25, 75)), PowerTransformer(method='yeo-johnson'), # PowerTransformer(method='box-cox'), QuantileTransformer(output_distribution='normal'), QuantileTransformer(output_distribution='uniform'), Normalizer() ] #=================Classifier classifier_test = [ OneVsRestClassifier(SVC()), DecisionTreeClassifier(max_depth=5), SVC(), SVC(kernel="linear", C=0.025), LogisticRegressionCV(cv=5, random_state=0), GradientBoostingClassifier(random_state=0),
unorm_train[i] = unorm_train[i].clip_upper(unorm_train[i].quantile(0.99)) unorm_train[i] = unorm_train[i].clip_lower(unorm_train[i].quantile(0.01)) # for i in clip_lower_feature: # remove outlier # outlier_index = unorm_train[(unorm_train['TotalSF'] > 100000) | (unorm_train['BsmtValue']>60000) | (unorm_train['TotalBsmtSF'] > 6000) | (unorm_train['LotFrontage'] > 200) | (unorm_train['LotArea'] > 100000) | (unorm_train['MasVnrArea'] > 1500)].index # outlier_index = unorm_train[(unorm_train['GrLivArea'] > 4000)].index # unorm_train.drop(outlier_index, inplace=True) # print(outlier_index) train_data2 = unorm_train.drop(['SalePrice'], axis=1) # Scaling trans_train = RobustScaler().fit_transform(train_data2.values) trans_test = RobustScaler().fit_transform(test_data) output_train = pd.DataFrame(trans_train, columns=combined_df.columns) output_train['SalePrice'] = pd.Series(np.log1p(SalePrice.values)) output_test = pd.DataFrame(trans_test , columns=combined_df.columns) output_test['Id'] = pd.Series(Test_id) # selected most important features important_features = ['OverallQual', 'ExterQual', 'GarageValue', 'GrLivArea', 'GarageAge', 'KitchenQual', 'FullBath', 'BsmtQual', 'BathValue', 'HouseAge', 'GarageCars', 'Oldness', 'GarageFinish', 'GarageArea', 'KitchenValue', 'ExterValue',
def main():
    """Compare KNN, SGD, SVM and XGBoost classifiers under three scalings.

    Reads ``data/resampled_data.csv``, holds out 20 % of the rows, scales the
    features with MinMax, Standard and Robust scalers (each fitted on the
    training split only), tunes the four models per scaling, and writes the
    collected metrics to ``reports/metrics_results.csv`` plus an accuracy
    heatmap to ``images/accuracy_heatmap.png``.
    """
    # load data and separate the label column from the features
    train_df = pd.read_csv("data/resampled_data.csv")
    y = train_df["signal"]
    X = train_df.drop("signal", axis=1)

    # split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=42)

    # Scaling is required for some of the following models; try different
    # scalings. Every scaler is fitted on the training split and only
    # *applied* to the test split, so both splits share the same statistics.
    minmax_scaler = MinMaxScaler()
    X_train_minmax_scaled = minmax_scaler.fit_transform(X_train)
    X_test_minmax_scaled = minmax_scaler.transform(X_test)

    std_scaler = StandardScaler()
    X_train_std_scaled = std_scaler.fit_transform(X_train)
    X_test_std_scaled = std_scaler.transform(X_test)

    # Robust scaler: centers on the median and scales by the interquartile
    # range, so it is driven by the bulk of the data rather than by outliers.
    rob_scaler = RobustScaler()
    X_train_rob_scaled = rob_scaler.fit_transform(X_train)
    X_test_rob_scaled = rob_scaler.transform(X_test)

    # NOTE(review): DataFrame.append no longer exists in pandas >= 2.0. It is
    # kept because the return type of gather_performance_metrics is not
    # visible here - confirm before switching to pd.concat.
    metrics_df = pd.DataFrame()

    scaled_splits = [
        (X_train_minmax_scaled, X_test_minmax_scaled, "minmax"),
        (X_train_std_scaled, X_test_std_scaled, "std"),
        (X_train_rob_scaled, X_test_rob_scaled, "rob"),
    ]
    # Dedicated loop names keep the raw X_train / X_test from being shadowed.
    for X_fit, X_eval, scale in scaled_splits:
        # model 1 - KNN
        knn_clf = KNeighborsClassifier()
        # make use of grid search as it is relatively fast for knn
        knn_rs_cv = GridSearchCV(
            estimator=knn_clf,
            param_grid={"n_neighbors": range(1, 60)},
            scoring="accuracy",
        )
        knn_rs_cv.fit(X=X_fit, y=y_train)
        knn_prediction = knn_rs_cv.predict(X_eval)
        metrics_df = metrics_df.append(
            gather_performance_metrics(
                y_true=y_test,
                y_pred=knn_prediction,
                model_name=f"knn_{scale}",
                best_params=knn_rs_cv.best_params_,
            ))

        # model 2 - SGD: stochastic gradient descent, randomized search over
        # the loss function, the regularization penalty and its strength
        sgd_clf = SGDClassifier()
        sgd_rs_cv = RandomizedSearchCV(
            estimator=sgd_clf,
            param_distributions={
                "loss": [
                    "log",
                    "hinge",
                    "modified_huber",
                    "squared_hinge",
                    "perceptron",
                    "squared_loss",
                    "huber",
                    "epsilon_insensitive",
                    "squared_epsilon_insensitive",
                ],
                "penalty": ["l2", "l1"],
                "alpha": list(range_inc(0.001, 50, 0.01, 1.5, 3)),
                "early_stopping": [True],
                "random_state": [42],
            },
            scoring="accuracy",
        )
        sgd_rs_cv.fit(X_fit, y_train)
        sgd_prediction = sgd_rs_cv.predict(X_eval)
        metrics_df = metrics_df.append(
            gather_performance_metrics(
                y_true=y_test,
                y_pred=sgd_prediction,
                model_name=f"sgd_{scale}",
                best_params=sgd_rs_cv.best_params_,
            ))

        # model 3 - support vector machines
        # The search clones and refits the estimator itself, so no separate
        # fit of svm_clf is needed beforehand.
        svm_clf = svm.SVC()
        svm_rs_cv = RandomizedSearchCV(
            estimator=svm_clf,
            param_distributions={
                "C": list(range_inc(0.5, 100, 0.9, 1.2, 1)),
                "break_ties": [False],
                "decision_function_shape": ["ovo", "ovr"],
                "kernel": ["poly", "rbf", "sigmoid"],
                "random_state": [42],
            },
            scoring="accuracy",
        )
        svm_rs_cv.fit(X_fit, y_train)
        svm_prediction = svm_rs_cv.predict(X_eval)
        metrics_df = metrics_df.append(
            gather_performance_metrics(
                y_true=y_test,
                y_pred=svm_prediction,
                model_name=f"svm_{scale}",
                best_params=svm_rs_cv.best_params_,
            ))

        # model 4 - bayesian optimization + cv
        bayes_cv_tuner = BayesSearchCV(
            estimator=XGBClassifier(
                n_jobs=1,
                objective="binary:logistic",
                eval_metric="auc",
                # fraction of the samples xgboost randomly draws before
                # growing each tree, to limit overfitting
                subsample=0.8,
                use_label_encoder=False,
                random_state=42,
            ),
            search_spaces={
                "learning_rate": (0.01, 1.0),
                "max_depth": (3, 7),
                "min_child_weight": (1, 10),
                "gamma": (1e-9, 0.5),
                "colsample_bytree": (0.01, 1.0),
                "colsample_bylevel": (0.01, 1.0),
                "reg_lambda": (1, 1000),
                "reg_alpha": (1e-9, 1.0),
                "n_estimators": (50, 100),
            },
            cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
            n_iter=10,
            verbose=0,
            refit=True,
            random_state=42,
        )

        def status_print(optim_result):
            """Status callback during bayesian hyperparameter search"""
            # All models tested so far, in DataFrame format
            all_models = pd.DataFrame(bayes_cv_tuner.cv_results_)
            print("Model #{}\nBest Accuracy: {}\nBest params: {}\n".format(
                len(all_models),
                np.round(bayes_cv_tuner.best_score_, 4),
                bayes_cv_tuner.best_params_,
            ))

        result = bayes_cv_tuner.fit(X_fit, y_train, callback=status_print)
        xgb_prediction = result.predict(X_eval)
        metrics_df = metrics_df.append(
            gather_performance_metrics(
                y_true=y_test,
                y_pred=xgb_prediction,
                model_name=f"xgb_bayes_opt_{scale}",
                best_params=bayes_cv_tuner.best_params_,
            ))

    metrics_df.to_csv("reports/metrics_results.csv")
    accuracy_heatmap(df=metrics_df)
    plt.savefig("images/accuracy_heatmap.png")
public_labels, test_size=0.3, stratify=public_labels, random_state=1) #Vettorizzare i label from sklearn.preprocessing import LabelEncoder encoder = LabelEncoder() train_labels_encoded = encoder.fit_transform(y_train) test_labels_encoded = encoder.transform(y_test) #Scalers from sklearn.preprocessing import StandardScaler, RobustScaler, QuantileTransformer scalers_to_test = [StandardScaler(), RobustScaler()] # Designate distributions to sample hyperparameters from C_range = np.array([ 9.78736006e+00, 2.23814334e+01, 1.00000000e-04, 1.00000000e-04, 1.74371223e+01, 1.00000000e-04, 2.96832303e-01, 1.06931597e+01, 8.90706391e+00, 1.75488618e+01, 1.49564414e+01, 1.06939267e+01, 1.00000000e-04, 7.94862668e+00, 3.14271995e+00, 1.00000000e-04, 1.41729905e+01, 8.07236535e+00, 4.54900806e-01, 1.00000000e-04, 1.00000000e-04, 1.99524074e+00, 4.68439119e+00, 1.00000000e-04, 1.16220405e+01, 1.00000000e-04, 1.00000000e-04, 1.03972709e+01, 1.00000000e-04, 1.00000000e-04, 1.00000000e-04, 1.00000000e-04, 1.25523737e+01, 1.00000000e-04, 1.66095249e+01, 8.07308186e+00, 1.00000000e-04, 1.00000000e-04, 1.00000000e-04, 1.00000000e-04, 2.08711336e+01, 1.64441230e+00, 1.15020554e+01, 1.00000000e-04, 1.81035130e+00, 1.17786194e+01, 1.00000000e-04, 1.03111446e+01,