def test_regressor(self):
    """A default RGFRegressor fit on the train split must reach MSE < 6.0 on the test split."""
    model = RGFRegressor()
    model.fit(self.X_train, self.y_train)
    predictions = model.predict(self.X_test)
    error = mean_squared_error(self.y_test, predictions)
    print("MSE: {0:.5f}".format(error))
    self.assertLess(error, 6.0)
def __fit(self):
    """Train the internal RGF regressor on ``self.TrainData``.

    Drops the ``Item_Identifier`` one-hot columns together with
    ``self._l_drop_cols``, fits ``self._model`` against the
    'Item_Outlet_Sales' target, and remembers the training columns
    (needed later at predict time). Hyper-parameters come from
    ``self.parameters``.
    """
    start = time.time()
    ## drop identifier one-hot columns plus the externally configured drop list
    id_cols = [ col for col in self.TrainData.columns if (col.startswith('Item_Identifier')) ]
    self._l_drop_cols.extend(id_cols)
    X = self.TrainData.drop(self._l_drop_cols, axis=1)
    Y = self.TrainData['Item_Outlet_Sales']
    ## remember the exact feature columns used for training
    self._l_train_columns = X.columns
    print('Size of feature space: %s' % len(self._l_train_columns))
    ## build the model from the supplied parameter dict
    self._model = RGFRegressor(
        algorithm=self.parameters['algorithm'],
        loss=self.parameters['loss'],
        learning_rate=self.parameters['learning_rate'],
        n_iter=self.parameters['n_iter'],
        reg_depth=self.parameters['reg_depth'],
        l2=self.parameters['l2'],
        sl2=self.parameters['sl2'],
        #min_samples_leaf= self.parameters['min_samples_leaf'],
        max_leaf=self.parameters['max_leaf'],
        verbose=True)
    self._model.fit(X, Y)
    end = time.time()
    print('\nTraining is done. Time elapsed %ds' % (end - start))
    return
def test_attributes(self):
    """Fitted attributes raise NotFittedError before fit and hold documented values after."""
    reg = RGFRegressor()
    attributes = ('n_features_', 'fitted_', 'sl2_', 'min_samples_leaf_', 'n_iter_')
    # every trailing-underscore attribute must be unavailable before fitting
    for attr in attributes:
        self.assertRaises(NotFittedError, getattr, reg, attr)
    reg.fit(self.X_train, self.y_train)
    self.assertEqual(reg.n_features_, self.X_train.shape[-1])
    self.assertTrue(reg.fitted_)
    # sl2_ falls back to l2 when sl2 was left unset
    if reg.sl2 is None:
        self.assertEqual(reg.sl2_, reg.l2)
    else:
        self.assertEqual(reg.sl2_, reg.sl2)
    # a fractional min_samples_leaf is resolved to an absolute sample count
    if reg.min_samples_leaf < 1:
        self.assertLessEqual(reg.min_samples_leaf_, 0.5 * self.X_train.shape[0])
    else:
        self.assertEqual(reg.min_samples_leaf_, reg.min_samples_leaf)
    # default n_iter_ depends on the loss: 10 for LS, otherwise 5
    if reg.n_iter is None:
        if reg.loss == "LS":
            self.assertEqual(reg.n_iter_, 10)
        else:
            self.assertEqual(reg.n_iter_, 5)
    else:
        self.assertEqual(reg.n_iter_, reg.n_iter)
def test_regressor_sparse_input(self):
    """CSR, CSC and COO sparse inputs must train and predict as well as dense ones."""
    model = RGFRegressor(prefix='reg')
    for to_sparse in (csr_matrix, csc_matrix, coo_matrix):
        sparse_X = to_sparse(self.X)
        model.fit(sparse_X, self.y)
        predictions = model.predict(sparse_X)
        self.assertLess(mean_squared_error(self.y, predictions), 6.0)
def test_regressor_sparse_input(self):
    """Every scipy sparse matrix format must be accepted for both fit and predict."""
    model = RGFRegressor()
    sparse_formats = (sparse.bsr_matrix, sparse.coo_matrix, sparse.csc_matrix,
                      sparse.csr_matrix, sparse.dia_matrix, sparse.dok_matrix,
                      sparse.lil_matrix)
    for to_sparse in sparse_formats:
        sparse_X = to_sparse(self.X)
        model.fit(sparse_X, self.y)
        predictions = model.predict(sparse_X)
        self.assertLess(mean_squared_error(self.y, predictions), 6.0)
def test_joblib_pickle(self):
    """A joblib round-trip must reproduce predictions even after temp model files are removed."""
    model = RGFRegressor()
    model.fit(self.X_train, self.y_train)
    expected = model.predict(self.X_test)
    joblib.dump(model, 'test_reg.pkl')
    # wipe the on-disk model files so the reloaded estimator must be self-contained
    _cleanup()
    restored = joblib.load('test_reg.pkl')
    np.testing.assert_allclose(expected, restored.predict(self.X_test))
def test_pickle(self):
    """A pickle round-trip must reproduce predictions even after temp model files are removed."""
    model = RGFRegressor()
    model.fit(self.X_train, self.y_train)
    expected = model.predict(self.X_test)
    payload = pickle.dumps(model)
    # wipe the on-disk model files so the unpickled estimator must be self-contained
    _cleanup()
    restored = pickle.loads(payload)
    np.testing.assert_allclose(expected, restored.predict(self.X_test))
def test_cleanup(self):
    """cleanup() removes one estimator's temp files exactly once, leaving others untouched."""
    reg1 = RGFRegressor()
    reg1.fit(self.X_train, self.y_train)
    reg2 = RGFRegressor()
    reg2.fit(self.X_train, self.y_train)
    # first cleanup removes files; the second has nothing left to remove
    self.assertNotEqual(reg1.cleanup(), 0)
    self.assertEqual(reg1.cleanup(), 0)
    # no file with reg1's prefix may remain in the temp directory
    glob_file = os.path.join(_get_temp_path(), reg1._file_prefix + "*")
    self.assertFalse(glob.glob(glob_file))
    # reg1 lost its model files so predicting must fail; reg2 is unaffected
    self.assertRaises(NotFittedError, reg1.predict, self.X_test)
    reg2.predict(self.X_test)
def test_parallel_gridsearch(self):
    """GridSearchCV with n_jobs=-1 must run, and the refit best model must score well."""
    search = GridSearchCV(RGFRegressor(),
                          param_grid=dict(max_leaf=[100, 300]),
                          refit=True,
                          cv=2,
                          verbose=0,
                          n_jobs=-1)
    search.fit(self.X_train, self.y_train)
    predictions = search.best_estimator_.predict(self.X_test)
    self.assertLess(mean_squared_error(self.y_test, predictions), 6.0)
def test_params(self):
    """A full set of valid params fits fine; each invalid value alone raises ValueError at fit."""
    reg = RGFRegressor()
    valid_params = dict(max_leaf=300, test_interval=100, algorithm='RGF_Sib',
                        loss='Log', reg_depth=1.1, l2=0.1, sl2=None,
                        normalize=False, min_samples_leaf=9, n_iter=None,
                        n_tree_search=2, opt_interval=100, learning_rate=0.4,
                        verbose=True, prefix='rgf_regressor', inc_prefix=True,
                        clean=True)
    reg.set_params(**valid_params)
    reg.fit(self.X_train, self.y_train)
    # one deliberately invalid value per parameter
    non_valid_params = dict(max_leaf=0, test_interval=0, algorithm='RGF_Test',
                            loss=True, reg_depth=0.1, l2=11, sl2=-1.1,
                            normalize='False', min_samples_leaf=0.7,
                            n_iter=11.1, n_tree_search=0, opt_interval=100.1,
                            learning_rate=-0.5, verbose=-1, prefix='',
                            inc_prefix=1, clean=0)
    for key in non_valid_params:
        reg.set_params(**valid_params)  # Reset to valid params
        reg.set_params(**{key: non_valid_params[key]})  # Pick and set one non-valid parameter
        self.assertRaises(ValueError, reg.fit, self.X_train, self.y_train)
def test_input_arrays_shape(self):
    """Mismatched target or sample-weight lengths/shapes must raise ValueError."""
    reg = RGFRegressor()
    n = self.y_train.shape[0]
    bad_inputs = [
        (self.X_train, self.y_train[:(n - 1)]),          # target shorter than X
        (self.X_train, self.y_train, np.ones(n - 1)),    # weights shorter than X
        (self.X_train, self.y_train, np.ones((n, 2))),   # weights not 1-D
    ]
    for args in bad_inputs:
        self.assertRaises(ValueError, reg.fit, *args)
def test(self):
    """Fit a 7-model StackingRegressor (and an LGBM baseline) on the
    MorganMACCS toxicity table and print train/test RMSE and correlation."""
    df =pd.read_csv('MorganMACCS.csv')
    baseDf = df
    extractDf = df['CAS'].isin(ejectCAS)
    # drop the rejected CAS entries
    df = df[~df['CAS'].isin(ejectCAS)]
    y = df['logTox']
    dropList = ['CAS','toxValue','logTox','HDonor', 'HAcceptors', 'AromaticHeterocycles', 'AromaticCarbocycles', 'FractionCSP3']
    #dropList = ['CAS','toxValue','logTox']
    X = df.drop(columns=dropList)
    #Normalize
    # z-score the numeric-named (fingerprint) columns, dropping all-zero ones.
    # NOTE(review): X is reassigned while iterating X.columns — works because the
    # iteration snapshot is taken once, but confirm no column is skipped.
    for name in X.columns:
        if str.isdecimal(name)==True:
            if X[str(name)].sum() == 0:
                print(name)
                X = X.drop(columns=name)
            else:
                std =X[name].std()
                mean = X[name].mean()
                X[name] = X[name].apply(lambda x: ((x - mean) * 1 / std + 0))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=2)
    # NOTE(review): cols is overwritten twice — only [1, 2, 3] is effective
    cols = np.arange(1,550,1).tolist()
    cols = X.columns.tolist()
    cols = [1,2,3]
    # Initializing Classifiers
    reg1 = Ridge(random_state=1)
    #reg2 = ExtraTreesRegressor()
    reg2 = ExtraTreesRegressor(n_estimators=50,max_features= 50,min_samples_split= 5,max_depth= 50, min_samples_leaf= 5)
    reg3 = SVR(gamma='auto',kernel='linear')
    reg4 = LGBMRegressor(boosting_type='gbdt', num_leaves= 60,learning_rate=0.06)
    pls = PLSRegression(n_components=3)
    pipe1 = make_pipeline(ColumnSelector(cols=cols), ExtraTreesRegressor(n_estimators=50))
    #linear =SGDRegressor(max_iter=1000)
    rgf = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
    nbrs = KNeighborsRegressor(2)
    pipe2 = make_pipeline(ColumnSelector(cols=cols), KNeighborsRegressor(31))
    meta = ExtraTreesRegressor(n_estimators=50,max_features= 7,min_samples_split= 5,max_depth= 50, min_samples_leaf= 5)
    stackReg = StackingRegressor(regressors=[reg1,reg2, reg3,pipe1,pls,nbrs,rgf],
                                 meta_regressor=meta,verbose=1)
    stackReg.fit(X_train, y_train)
    y_pred = stackReg.predict(X_train)
    y_val = stackReg.predict(X_test)
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred,y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val,y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred,y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val,y_test))
    # LGBM baseline for comparison
    reg4.fit(X_train, y_train)
    y_pred = reg4.predict(X_train)
    y_val = reg4.predict(X_test)
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred,y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val,y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred,y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val,y_test))
def test_sample_weight(self):
    """All-ones weights match the unweighted fit; near-zero weights neutralize outliers."""
    reg = RGFRegressor()
    y_pred = reg.fit(self.X_train, self.y_train).predict(self.X_test)
    y_pred_weighted = reg.fit(self.X_train,
                              self.y_train,
                              np.ones(self.y_train.shape[0])
                              ).predict(self.X_test)
    # uniform weights must be equivalent to no weights at all
    np.testing.assert_allclose(y_pred, y_pred_weighted)
    np.random.seed(42)
    idx = np.random.choice(400, 80, replace=False)
    self.X_train[idx] = -99999  # Add some outliers
    y_pred_corrupt = reg.fit(self.X_train, self.y_train).predict(self.X_test)
    mse_corrupt = mean_squared_error(self.y_test, y_pred_corrupt)
    weights = np.ones(self.y_train.shape[0])
    # smallest positive float32 — effectively removes the corrupted rows
    weights[idx] = np.nextafter(np.float32(0), np.float32(1))  # Eliminate outliers
    y_pred_weighted = reg.fit(self.X_train, self.y_train, weights).predict(self.X_test)
    mse_fixed = mean_squared_error(self.y_test, y_pred_weighted)
    # down-weighting the outliers should recover accuracy
    self.assertLess(mse_fixed, mse_corrupt)
def test_params(self):
    """Valid params (incl. memory_policy) fit fine; each invalid value raises ValueError."""
    reg = RGFRegressor()
    valid_params = dict(max_leaf=300, test_interval=100, algorithm='RGF_Sib',
                        loss='Log', reg_depth=1.1, l2=0.1, sl2=None,
                        normalize=False, min_samples_leaf=9, n_iter=None,
                        n_tree_search=2, opt_interval=100, learning_rate=0.4,
                        memory_policy='conservative', verbose=True)
    reg.set_params(**valid_params)
    reg.fit(self.X_train, self.y_train)
    # one deliberately invalid value per parameter
    non_valid_params = dict(max_leaf=0, test_interval=0, algorithm='RGF_Test',
                            loss=True, reg_depth=0.1, l2=11, sl2=-1.1,
                            normalize='False', min_samples_leaf=0.7,
                            n_iter=11.1, n_tree_search=0, opt_interval=100.1,
                            learning_rate=-0.5, memory_policy='Generos',
                            verbose=-1)
    for key in non_valid_params:
        reg.set_params(**valid_params)  # Reset to valid params
        reg.set_params(**{key: non_valid_params[key]})  # Pick and set one non-valid parameter
        self.assertRaises(ValueError, reg.fit, self.X_train, self.y_train)
def RGF_cv(max_leaf, l2, min_samples_leaf):
    """Mean cross-validated negative MSE of an RGF regressor with the given hyper-parameters.

    Intended as a Bayesian-optimization objective: max_leaf and
    min_samples_leaf arrive as floats and are truncated to ints.
    """
    estimator = RGFRegressor(max_leaf=int(max_leaf),
                             algorithm="RGF",
                             test_interval=100,
                             loss="LS",
                             verbose=False,
                             l2=l2,
                             min_samples_leaf=int(min_samples_leaf))
    cv_results = cross_validate(estimator, X, y,
                                scoring='neg_mean_squared_error',
                                cv=cv, n_jobs=-1)
    return cv_results['test_score'].mean()
def select_model(model_type, seed=1208):
    """Return a configured regressor for *model_type*: 'ridge', 'rmf', 'ext' or 'rgf'.

    NOTE(review): ``params`` and ``num_threads`` are not defined in this
    function, so they are presumably module-level globals — confirm. Since
    ``params`` is shared and mutated in place, settings leak between
    successive calls with different model types. An unrecognised
    ``model_type`` leaves ``model`` unbound and the return line raises.
    """
    if model_type == 'ridge':
        params['solver'] = 'auto'
        params['fit_intercept'] = True
        params['alpha'] = 0.4
        params['max_iter'] = 1000
        params['normalize'] = False
        params['tol'] = 0.01
        model = Ridge(**params)
    elif model_type == 'rmf':
        params['max_depth'] = 10
        params['n_estimators'] = 3000
        params['criterion'] = 'mse'
        params['max_features'] = 0.3
        params['min_samples_leaf'] = 30
        params['min_samples_split'] = 30
        params['n_jobs'] = num_threads
        params['random_state'] = seed
        model = RandomForestRegressor(**params)
    elif model_type == 'ext':
        params['max_depth'] = 10
        params['n_estimators'] = 3000
        params['max_features'] = 'auto'
        params['min_samples_leaf'] = 30
        params['min_samples_split'] = 30
        params['n_jobs'] = num_threads
        params['random_state'] = seed
        model = ExtraTreesRegressor(**params)
    elif model_type == 'rgf':
        # params['reg_depth'] = 10
        params['max_leaf'] = 2000
        params['loss'] = "LS"
        params['n_tree_search'] = 3000
        params['min_samples_leaf'] = 30
        params['learning_rate'] = 0.01
        params['verbose'] = True
        params['algorithm'] = "RGF"
        params['test_interval'] = 100
        model = RGFRegressor(**params)
    return model
def __init__(self, task, fast=False):
    """Choose the scoring metric and RGF model family for the given task.

    task -- 'classification' selects ROC-AUC + an RGF classifier;
            anything else selects negative MSE + an RGF regressor.
    fast -- pick the FastRGF variant instead of plain RGF.
    """
    if task == 'classification':
        self.metric = 'roc_auc'
        self.task = "classification"
        self.model = FastRGFClassifier() if fast else RGFClassifier(loss="Log")
    else:
        self.metric = 'neg_mean_squared_error'
        self.task = "regression"
        self.model = FastRGFRegressor() if fast else RGFRegressor(loss="LS", normalize=True)
    # placeholders populated later by the split / search / evaluate steps
    self.X_test = None
    self.X_train = None
    self.y_test = None
    self.y_train = None
    self.grid_search = None
    self.y_predict = None
    self.test_score = None
# Convert "HH:MM" travel_time strings into minutes since midnight.
df_train_set["travel_time"] = df_train_set["travel_time"].str.split(':').apply(
    lambda x: int(x[0]) * 60 + int(x[1]))
# NOTE(review): weekend flag derived from travel_date >= 5 — presumably a 0-6
# day-of-week code; confirm travel_date is not a calendar day here.
df_train_set['is_weekend'] = np.where(df_train_set['travel_date'] >= 5, 1, 0)
#print(df_train_set.head(5))
# ------ model
X = df_train_set.drop(["number_of_tickets"], axis=1)
y = df_train_set.number_of_tickets
# NOTE(review): this split is created but unused below — CV runs on full X, y
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.25, shuffle=True)
model = RGFRegressor(max_leaf=4500,
                     algorithm="RGF_Sib",
                     test_interval=50,
                     loss="LS",
                     verbose=False)
# 5-fold CV mean of negative MAE
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=5)
print(sum(scores) / len(scores))
#print(X.head(5))
#model.fit(X, y)
#preds_train_set = model.predict(X_test)
#print(mean_absolute_error(preds_train_set, y_test))
sys.exit()
# ----------------------
def run(seed):
    """End-to-end domain2_var1 pipeline for one seed.

    Loads raw + engineered features, applies site-bias corrections to the
    test rows, builds five per-feature-group "score" meta-features with
    small sklearn model packs, then stacks everything with a final model
    pack and saves all out-of-fold scores, predictions and models.
    """
    # create folders for scores models and preds
    folder_models = './models/domain2_var1/scores/'
    if not os.path.exists(folder_models):
        os.makedirs(folder_models)
    folder_preds = './predicts/domain2_var1/scores/'
    if not os.path.exists(folder_preds):
        os.makedirs(folder_preds)
    print('Loading data...')
    # load biases
    ic_bias = read_pickle('./data/biases/ic_biases.pickle')
    ic_bias_site = read_pickle('./data/biases/ic_biases_site.pickle')
    fnc_bias = read_pickle('./data/biases/fnc_biases.pickle')
    fnc_bias_site = read_pickle('./data/biases/fnc_biases_site.pickle')
    pca_bias = read_pickle('./data/biases/200pca_biases.pickle')
    pca_bias_site = read_pickle('./data/biases/200pca_biases_site.pickle')
    # load classifier and add extra sites2
    extra_site = pd.DataFrame()
    extra_site['Id'] = np.load('./predicts/classifier/site2_test_new_9735.npy')
    # load competiton data
    ids_df = pd.read_csv('./data/raw/reveal_ID_site2.csv')
    fnc_df = pd.read_csv('./data/raw/fnc.csv')
    loading_df = pd.read_csv('./data/raw/loading.csv')
    labels_df = pd.read_csv('./data/raw/train_scores.csv')
    ids_df = ids_df.append(extra_site)
    print('Detected Site2 ids count: ', ids_df['Id'].nunique())
    # load created features
    agg_df = pd.read_csv('./data/features/agg_feats.csv')
    im_df = pd.read_csv('./data/features/im_feats.csv')
    dl_df = pd.read_csv('./data/features/dl_feats.csv')
    pca_df = pd.read_csv('./data/features/200pca_feats/200pca_3d_k0.csv')
    for i in range(1, 6):
        part = pd.read_csv(
            './data/features/200pca_feats/200pca_3d_k{}.csv'.format(i))
        del part['Id']
        pca_df = pd.concat((pca_df, part), axis=1)
    # merge data
    ic_cols = list(loading_df.columns[1:])
    fnc_cols = list(fnc_df.columns[1:])
    agg_cols = list(agg_df.columns[1:])
    im_cols = list(im_df.columns[1:])
    pca_cols = list(pca_df.columns[1:])
    dl_cols = list(dl_df.columns[1:])
    pca0_cols = [c for c in pca_cols if 'k0' in c]
    df = fnc_df.merge(loading_df, on='Id')
    df = df.merge(agg_df, how='left', on='Id')
    df = df.merge(im_df, how='left', on='Id')
    df = df.merge(pca_df, how='left', on='Id')
    df = df.merge(dl_df, how='left', on='Id')
    df = df.merge(labels_df, how='left', on='Id')
    del loading_df, fnc_df, agg_df, im_df, pca_df
    gc.collect()
    # split train and test: rows with labels are train
    df.loc[df['Id'].isin(labels_df['Id']), 'is_test'] = 0
    df.loc[~df['Id'].isin(labels_df['Id']), 'is_test'] = 1
    train = df.query('is_test==0')
    del train['is_test']
    test = df.query('is_test==1')
    del test['is_test']
    y = train['domain2_var1'].copy().reset_index(drop=True)
    d21_index = list(train['domain2_var1'].dropna().index)
    # apply biases (site-specific corrections for detected Site2 rows)
    for c in ic_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += ic_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += ic_bias_site[c]
    for c in fnc_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += fnc_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += fnc_bias_site[c]
    for c in pca_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += pca_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += pca_bias_site[c]
    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)
    # I. Create fnc score
    print('Creating FNC score...')
    # prepare datasets for fnc score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, fnc_cols)
    # define models
    names = ['ENet', 'BRidge']
    names = [name + '_fnc_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.05, l1_ratio=0.5, random_state=0),
        BayesianRidge()
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 2, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 2, names)
    # save oof, pred, models
    np.save(folder_preds + 'fnc_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'fnc_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)
    # II. Create agg score
    print('Creating AGG score...')
    # prepare datasets for agg score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, agg_cols)
    # define models
    names = ['RGF', 'ENet', 'Huber']
    names = [name + '_agg_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000,
                     reg_depth=5,
                     min_samples_leaf=100,
                     normalize=True),
        ElasticNet(alpha=0.05, l1_ratio=0.3, random_state=0),
        HuberRegressor(epsilon=2.5, alpha=1)
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)
    # save oof, pred, models
    np.save(folder_preds + 'agg_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'agg_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)
    # III. Create pca score
    print('Creating PCA score...')
    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, pca_cols)
    # define models
    names = ['ENet', 'BRidge', 'OMP']
    names = [name + '_pca_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge(),
        OrthogonalMatchingPursuit()
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)
    # save oof, pred, models
    np.save(folder_preds + 'pca_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'pca_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)
    # IV. Create im score
    print('Creating IM score...')
    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, im_cols)
    # define models
    names = ['ENet', 'BRidge', 'OMP']
    names = [name + '_im_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge(),
        OrthogonalMatchingPursuit()
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)
    # save oof, pred, models
    np.save(folder_preds + 'im_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'im_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)
    # V. Create dl score
    print('Creating DL score...')
    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, dl_cols)
    # define models
    names = ['ENet', 'BRidge', 'OMP']
    names = [name + '_dl_seed{}'.format(seed) for name in names]
    pack = [
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge(),
        OrthogonalMatchingPursuit()
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)
    # save oof, pred, models
    np.save(folder_preds + 'dl_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'dl_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)
    # VI. Training and predicting procedure
    print('Training has started...')
    # add scores as extra columns on train/test
    for prefix in ['fnc', 'agg', 'im', 'pca', 'dl']:
        train.loc[d21_index, prefix + '_score'] = np.load(
            folder_preds + '{}_score_seed{}.npy'.format(prefix, seed))
        test.loc[:, prefix + '_score'] = np.load(
            folder_preds + '{}_score_test_seed{}.npy'.format(prefix, seed))
    score_cols = [c for c in train.columns if c.endswith('_score')]
    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)
    # create differents datasets
    # linear
    linear_cols = sorted(
        list(set(ic_cols + fnc_cols + pca0_cols) - set(['IC_20'])))
    train_linear, test_linear = scale_select_data(train, test, df_scale,
                                                  linear_cols)
    # kernel
    kernel_cols = sorted(list(set(ic_cols + pca0_cols) - set(['IC_20'])))
    train_kernel, test_kernel = scale_select_data(train=train,
                                                  test=test,
                                                  df_scale=df_scale,
                                                  cols=kernel_cols,
                                                  scale_factor=0.2,
                                                  scale_cols=pca0_cols,
                                                  sc=StandardScaler())
    # score
    sc_cols = sorted(list(set(ic_cols + score_cols) - set(['IC_20'])))
    train_sc, test_sc = scale_select_data(train, test, df_scale, sc_cols)
    # learning process on different datasets
    names = ['GP', 'SVM1', 'SVM2', 'Lasso', 'BgR']
    names = [name + '_seed{}'.format(seed) for name in names]
    pack = [
        GaussianProcessRegressor(DotProduct(), random_state=0),
        NuSVR(C=3, kernel='rbf'),
        NuSVR(C=3, kernel='rbf'),
        Lasso(alpha=0.1, random_state=0),
        BaggingRegressor(Ridge(alpha=1),
                         n_estimators=100,
                         max_samples=0.2,
                         max_features=0.2,
                         random_state=0)
    ]
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_sc] * 2 + [train_kernel] + [train_linear] * 2, y)
    de_blend = zoo.blend_oof()
    preds = zoo.predict([test_sc] * 2 + [test_kernel] + [test_linear] * 2,
                        names,
                        is_blend=True)
    # rewrite folders for models and preds
    folder_models = './models/domain2_var1/stack/'
    if not os.path.exists(folder_models):
        os.makedirs(folder_models)
    folder_preds = './predicts/domain2_var1/stack/'
    if not os.path.exists(folder_preds):
        os.makedirs(folder_preds)
    print('Saving models to', folder_models)
    print('Saving predictions to', folder_preds)
    # save oofs and models
    zoo.save_oofs(names, folder=folder_preds)
    zoo.save_models(names, folder=folder_models)
    # stacking predictions
    print('Stacking predictions...')
    d21_prediction = pd.DataFrame()
    d21_prediction['Id'] = test['Id'].values
    d21_prediction['pred'] = preds
    d21_prediction.to_csv(folder_preds +
                          'domain2_var1_stack_seed{}.csv'.format(seed),
                          index=False)
    print('domain2_var1 seed pred is saved as',
          folder_preds + 'domain2_var1_stack_seed{}.csv'.format(seed))
def rgf_state_prediction(state, lookback, horizon, predictors):
    """Train per-city RGF forecast models for every cluster of *state*.

    For each city in each cluster, one RGF model is fitted per horizon step
    (1..horizon) on lagged features; predictions and evaluation metrics are
    pickled under ``saved_models/rgf/<state>/`` and a plot is produced.

    :param state: state abbreviation used to locate the cluster pickle
    :param lookback: number of lagged time steps used as features
    :param horizon: steps ahead to predict (one model per step)
    :param predictors: column subset passed to ``get_cluster_data``
    """
    clusters = pd.read_pickle('../analysis/clusters_{}.pkl'.format(state))
    for cluster in clusters:
        data_full, group = get_cluster_data(geocode=cluster[0],
                                            clusters=clusters,
                                            data_types=DATA_TYPES,
                                            cols=predictors)
        for city in cluster:
            # skip cities whose metrics were already computed
            if os.path.isfile('saved_models/rgf/{}/rgf_metrics_{}.pkl'.format(
                    state, city)):
                print(city, 'done')
                continue
            target = 'casos_est_{}'.format(city)
            casos_est_columns = ['casos_est_{}'.format(i) for i in group]
            casos_columns = ['casos_{}'.format(i) for i in group]
            data = data_full.drop(casos_columns, axis=1)
            data_lag = build_lagged_features(data, lookback)
            # BUG FIX: dropna() returns a new frame; the original call
            # discarded the result, leaving the NaN rows created by the
            # lagging step in the training data. Reassign instead.
            data_lag = data_lag.dropna()
            # one shifted target series per forecast step
            targets = {}
            for d in range(1, horizon + 1):
                if d == 1:
                    targets[d] = data_lag[target].shift(-(d - 1))
                else:
                    targets[d] = data_lag[target].shift(-(d - 1))[:-(d - 1)]
            X_data = data_lag.drop(casos_est_columns, axis=1)
            # chronological split: no shuffling for time series
            X_train, X_test, y_train, y_test = train_test_split(
                X_data,
                data_lag[target],
                train_size=0.7,
                test_size=0.3,
                shuffle=False)
            city_name = get_city_names([city, 0])[0][1]
            preds = np.empty((len(data_lag), horizon))
            metrics = pd.DataFrame(index=('mean_absolute_error',
                                          'explained_variance_score',
                                          'mean_squared_error',
                                          'mean_squared_log_error',
                                          'median_absolute_error',
                                          'r2_score'))
            for d in range(1, horizon + 1):
                model = RGFRegressor(max_leaf=300,
                                     algorithm="RGF_Sib",
                                     test_interval=100,
                                     loss="LS",
                                     verbose=False)
                tgt = targets[d][:len(X_train)]
                tgtt = targets[d][len(X_train):]
                try:
                    model.fit(X_train, tgt)
                except ValueError:
                    # best-effort: skip this city if a target slice is unusable
                    print(
                        '-----------------------------------------------------'
                    )
                    print(city, 'ERRO')
                    print(
                        '-----------------------------------------------------'
                    )
                    break
                pred = model.predict(X_data[:len(targets[d])])
                # pad the shorter horizons so every column has full length
                dif = len(data_lag) - len(pred)
                if dif > 0:
                    pred = list(pred) + ([np.nan] * dif)
                preds[:, (d - 1)] = pred
                pred_m = model.predict(X_test[:(len(tgtt))])
                metrics[d] = calculate_metrics(pred_m, tgtt)
            metrics.to_pickle('{}/{}/rgf_metrics_{}.pkl'.format(
                'saved_models/rgf', state, city))
            plot_prediction(preds, targets[1], city_name, len(X_train))
            # plt.show()
    return None
# K-fold RGF training: accumulate out-of-fold predictions for the train set
# and averaged predictions for the test set.
# NOTE(review): shuffle=False together with random_state — recent sklearn
# versions raise ValueError for this combination; confirm the sklearn pin.
kf = model_selection.KFold(n_splits=nfold, shuffle=False, random_state=seed)
for dev_index, val_index in kf.split(
        y):  # explain for regression convert y to bins and use that for split
    dev_X, val_X = train.iloc[dev_index, :], train.iloc[val_index, :]
    dev_y, val_y = y[dev_index], y[val_index]
    # keep only training targets inside (lbound, ubound)
    dev_X = dev_X[(dev_y > lbound) & (dev_y < ubound)]
    dev_y = dev_y[(dev_y > lbound) & (dev_y < ubound)]
    # NOTE(review): val_X2/val_y2 are built but never used below — confirm intent
    val_X2 = val_X[(val_y > lbound) & (val_y < ubound)]
    val_y2 = val_y[(val_y > lbound) & (val_y < ubound)]
    print(dev_X.shape)
    rgf = RGFRegressor(max_leaf=1000,
                       algorithm="RGF_Sib",
                       test_interval=100,
                       loss="LS",
                       learning_rate=0.01,
                       verbose=False)
    model = rgf.fit(dev_X, dev_y)
    print("predicting..")
    preds = model.predict(val_X)
    oobval[val_index] += preds.reshape(-1, 1)
    valerr.append(mean_absolute_error(val_y, preds))
    print(valerr, "mean:", np.mean(valerr), "std:", np.std(valerr))
    # accumulate fold predictions on the held-out test and validation sets
    oobtest += model.predict(test.values).reshape(-1, 1)
    val_scores.append(mean_absolute_error(model.predict(valid), yval))
    del (rgf, model)
    gc.collect()
print(val_scores, np.mean(val_scores), "---", np.std(val_scores))
# XGBoost if ml_algorithm == 'XGBoost': num_rounds = 10 params = {'eta': 0.01, 'max_depth': 18, 'colsample_bytree': 0.2, 'subsample': 0.8, 'colsample_bylevel':0.3, 'alpha':2, 'objective': 'reg:linear', 'eval_metric': 'rmse', 'seed': 99, 'silent': True} # 'objective': 'binary:logistic', 'eval_metric': 'auc' # LightGBM elif ml_algorithm == 'LightGBM': num_rounds = 10 params = {'learning_rate': 0.01, 'max_depth': 13, 'colsample_bytree': 0.2, 'num_leaves' : 580, 'application': 'regression', 'metric': 'rmse', 'seed': 99, 'silent': True} # RGFRegressor elif ml_algorithm == 'RGF': model = RGFRegressor(max_leaf=3500, algorithm='RGF_Opt', loss="LS", l2=0.01) # FastRGFRegressor elif ml_algorithm == 'FastRGF': model = FastRGFRegressor(n_estimators=1200, sparse_max_features=1500, max_depth=5, max_bin=150, min_samples_leaf=12, sparse_min_occurences=1, opt_algorithm='epsilon-greedy', l2=1.0, min_child_weight=210.0, learning_rate=0.2) # Ridge Regression elif ml_algorithm == 'Ridge': model = Ridge(alpha=.6, copy_X=True, fit_intercept=True, max_iter=100, normalize=False, random_state=101, solver='auto', tol=0.01) # Lasso Regression elif ml_algorithm == 'Lasso': model = Lasso(alpha=.6, copy_X=True, fit_intercept=True, max_iter=100, normalize=False, random_state=101, tol=0.01)
# Benchmark script: compare fit+score wall time and R^2 of RGF vs FastRGF on a
# shuffled Boston housing split (first 300 rows train, rest test).
# NOTE(review): load_boston was removed from scikit-learn 1.2 — confirm pin.
from sklearn.ensemble import RandomForestRegressor
from rgf.sklearn import FastRGFRegressor, RGFRegressor

boston = load_boston()
rng = check_random_state(42)
perm = rng.permutation(boston.target.size)
boston.data = boston.data[perm]
boston.target = boston.target[perm]
train_x = boston.data[:300]
test_x = boston.data[300:]
train_y = boston.target[:300]
test_y = boston.target[300:]
start = time.time()
reg = RGFRegressor()
reg.fit(train_x, train_y)
score = reg.score(test_x, test_y)
end = time.time()
print("RGF: {} sec".format(end - start))
print("score: {}".format(score))
start = time.time()
reg = FastRGFRegressor()
reg.fit(train_x, train_y)
score = reg.score(test_x, test_y)
end = time.time()
print("FastRGF: {} sec".format(end - start))
print("score: {}".format(score))
# a third timed section (presumably RandomForestRegressor, imported above)
# appears to continue beyond this excerpt
start = time.time()
def stacklearning(self):
    """Two-level stacking experiment over fingerprint/descriptor feature blocks.

    Defines column-selecting transformers (Morgan / MACCS / descriptor / PCA
    views via ``sepTables``), builds RGF pipelines per view, stacks them twice
    with mlxtend's StackingRegressor, and prints CV scores plus train/test
    RMSE and correlation for several of the fitted models.
    """
    class extAll(BaseEstimator, TransformerMixin):
        # identity placeholder; NOTE(review): transform/predict return self,
        # not X — confirm this is intentional
        def __init__(self):
            pass
        def fit(self, X, y=None):
            return self
        def transform(self, X):
            return self
        def predict(self, X):
            return self
    class extMorgan(BaseEstimator, TransformerMixin):
        # select only the Morgan fingerprint block
        def __init__(self):
            pass
        def fit(self, X, y=None):
            return self
        def transform(self, X):
            _,morgan,_=sepTables(X)
            return morgan
    class extMACCS(BaseEstimator, TransformerMixin):
        # Morgan + MACCS blocks
        def __init__(self):
            pass
        def fit(self, X, y=None):
            return self
        def transform(self, X):
            maccs,morgan,_=sepTables(X)
            maccs = pd.concat([morgan,maccs],axis=1)
            return maccs
    class extDescriptor(BaseEstimator, TransformerMixin):
        # Morgan + MACCS + descriptor blocks
        def __init__(self):
            pass
        def fit(self, X, y=None):
            return self
        def transform(self, X):
            maccs,morgan,descriptor=sepTables(X)
            descriptor = pd.concat([morgan,descriptor],axis=1)
            descriptor = pd.concat([maccs,descriptor],axis=1)
            return descriptor
    class extPCA(BaseEstimator, TransformerMixin):
        # 64-component PCA of the whole table, concatenated with Morgan block
        def __init__(self):
            pass
        def fit(self, X, y=None):
            return self
        def transform(self, X):
            model = PCA(n_components=64)
            _,morgan,_=sepTables(X)
            morgan = morgan.reset_index().drop('index', axis=1)
            W = pd.DataFrame(model.fit_transform(X))
            W = pd.concat([morgan,W],axis=1)
            return W
    # base learners: one RGF instance per pipeline to avoid shared state
    lgbm = LGBMRegressor(boosting_type='gbdt', num_leaves= 60,learning_rate=0.06)
    rgf = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
    rgf1 = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
    rgf2 = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
    rgf3 = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
    rgf4 = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
    pipe1 = make_pipeline(extMACCS(), rgf)
    pipe2 = make_pipeline(extMorgan(), rgf1)
    pipe3 = make_pipeline(extDescriptor(), rgf2)
    pipe4 = make_pipeline(extPCA(), rgf3)
    pipe7 =make_pipeline(extDescriptor(), rgf4)
    pipe8 =make_pipeline(extDescriptor(), rgf4)
    xgb = xgboost.XGBRegressor()
    nbrs = KNeighborsRegressor(2)
    svr = SVR(gamma='auto',kernel='linear')
    sgd = SGDRegressor(max_iter=1000)
    pls = PLSRegression(n_components=3)
    ext = ExtraTreesRegressor(n_estimators=30,max_features= 20,min_samples_split= 5,max_depth= 50, min_samples_leaf= 5)
    pipe5 = make_pipeline(extMorgan(), nbrs)
    pipe6 = make_pipeline(extMACCS(), rgf)
    alldata = make_pipeline(extAll())
    meta = RandomForestRegressor(max_depth=20, random_state=0, n_estimators=400)
    # level-1 stack over three feature views, then level-2 stack reusing it
    stack1 = StackingRegressor(regressors=[pipe1, pipe2, pipe3], meta_regressor=rgf, verbose=1)
    #stack2 = StackingRegressor(regressors=[stack1,nbrs, svr,pls,rgf], meta_regressor=lgbm, verbose=1)
    stack2 = StackingRegressor(regressors=[stack1,pipe5,pipe7,pipe1], meta_regressor=rgf,verbose=1)
    scores = cross_val_score(stack2, X, y, cv=10)
    print("R^2 Score: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), 'stacking'))
    stack1_score = cross_val_score(stack1,X,y, cv=10)
    rgf_score = cross_val_score(rgf,X,y,cv=10)
    stack2.fit(X_train, y_train)
    y_pred = stack2.predict(X_train)
    y_val = stack2.predict(X_test)
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))
    rgf.fit(X_train, y_train)
    y_pred = rgf.predict(X_train)
    y_val = rgf.predict(X_test)
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))
    pipe1.fit(X_train, y_train)
    y_pred = pipe1.predict(X_train)
    y_val = pipe1.predict(X_test)
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))
    # NOTE(review): cols is overwritten twice — only [1, 2, 3] takes effect
    cols = np.arange(1,550,1).tolist()
    cols = X.columns.tolist()
    cols = [1,2,3]
    # Initializing Classifiers
    reg1 = Ridge(random_state=1)
    #reg2 = ExtraTreesRegressor()
    reg2 = ExtraTreesRegressor(n_estimators=50,max_features= 50,min_samples_split= 5,max_depth= 50, min_samples_leaf= 5)
    reg3 = SVR(gamma='auto',kernel='linear')
    reg4 = LGBMRegressor(boosting_type='gbdt', num_leaves= 60,learning_rate=0.06)
    pls = PLSRegression(n_components=3)
    pipe1 = make_pipeline(ColumnSelector(cols=cols), ExtraTreesRegressor(n_estimators=50))
    #linear =SGDRegressor(max_iter=1000)
    rgf = RGFRegressor(max_leaf=1000, algorithm="RGF",test_interval=100, loss="LS",verbose=False,l2=1.0)
    nbrs = KNeighborsRegressor(2)
    pipe2 = make_pipeline(ColumnSelector(cols=cols), KNeighborsRegressor(31))
    meta = ExtraTreesRegressor(n_estimators=50,max_features= 7,min_samples_split= 5,max_depth= 50, min_samples_leaf= 5)
    stackReg = StackingRegressor(regressors=[reg1,reg2, reg3,pipe1,pls,nbrs,rgf],
                                 meta_regressor=meta,verbose=1)
    stackReg.fit(X_train, y_train)
    y_pred = stackReg.predict(X_train)
    y_val = stackReg.predict(X_test)
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred,y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val,y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred,y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val,y_test))
    # NOTE(review): rgf is re-fitted here but predictions come from reg4,
    # which is never fitted in this function — likely a copy/paste slip.
    rgf.fit(X_train, y_train)
    y_pred = reg4.predict(X_train)
    y_val = reg4.predict(X_test)
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred,y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val,y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred,y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val,y_test))
# Continuation of an ml_algorithm dispatch chain whose opening `if` lies
# above this excerpt (binary-classification LightGBM variant of the chain).
elif ml_algorithm == 'LightGBM':
    num_rounds = 10
    params = {
        'learning_rate': 0.01,
        'max_depth': 13,
        'colsample_bytree': 0.2,
        'num_leaves': 580,
        'objective': 'binary',
        'metric': 'auc',
        'seed': 99,
        'silent': True
    }
# RGFRegressor
elif ml_algorithm == 'RGF':
    model = RGFRegressor(max_leaf=3500, algorithm='RGF_Opt', loss="LS", l2=0.01)
# FastRGFRegressor
elif ml_algorithm == 'FastRGF':
    model = FastRGFRegressor(n_estimators=1200,
                             sparse_max_features=1500,
                             max_depth=5,
                             max_bin=150,
                             min_samples_leaf=12,
                             sparse_min_occurences=1,
                             opt_algorithm='epsilon-greedy',
                             l2=1.0,
                             min_child_weight=210.0,
                             learning_rate=0.2)
# Ridge Regression (branch body continues beyond this excerpt)
elif ml_algorithm == 'Ridge':
def run(seed):
    """Train and stack age-prediction models for one random seed.

    Pipeline: load bias pickles and competition CSVs, merge all feature
    tables on 'Id', split train/test by presence in the label file, apply
    site-specific biases to the test rows, build five intermediate "score"
    features (fnc/agg/pca/im/dl) with small model zoos, then train a final
    model zoo on several feature views and stack its out-of-fold
    predictions with RGF + BayesianRidge.  All artifacts (scores, models,
    final CSV) are written under seed-suffixed paths.
    """
    # create folders for scores models and preds
    folder_models = './models/age/scores/'
    if not os.path.exists(folder_models):
        os.makedirs(folder_models)
    folder_preds = './predicts/age/scores/'
    if not os.path.exists(folder_preds):
        os.makedirs(folder_preds)
    print('Loading data...')
    # load biases (per-column additive corrections; *_site variants apply
    # only to rows detected as Site2)
    ic_bias = read_pickle('./data/biases/ic_biases.pickle')
    ic_bias_site = read_pickle('./data/biases/ic_biases_site.pickle')
    fnc_bias = read_pickle('./data/biases/fnc_biases.pickle')
    fnc_bias_site = read_pickle('./data/biases/fnc_biases_site.pickle')
    pca_bias = read_pickle('./data/biases/200pca_biases.pickle')
    pca_bias_site = read_pickle('./data/biases/200pca_biases_site.pickle')
    # load classifier and add extra sites2
    extra_site = pd.DataFrame()
    extra_site['Id'] = np.load('./predicts/classifier/site2_test_new_9735.npy')
    # load competiton data
    ids_df = pd.read_csv('./data/raw/reveal_ID_site2.csv')
    fnc_df = pd.read_csv('./data/raw/fnc.csv')
    loading_df = pd.read_csv('./data/raw/loading.csv')
    labels_df = pd.read_csv('./data/raw/train_scores.csv')
    # NOTE(review): DataFrame.append was removed in pandas 2.0 — this code
    # assumes an older pandas; pd.concat would be the modern equivalent.
    ids_df = ids_df.append(extra_site)
    print('Detected Site2 ids count: ', ids_df['Id'].nunique())
    # load created features
    agg_df = pd.read_csv('./data/features/agg_feats.csv')
    im_df = pd.read_csv('./data/features/im_feats.csv')
    dl_df = pd.read_csv('./data/features/dl_feats.csv')
    pca_df = pd.read_csv('./data/features/200pca_feats/200pca_3d_k0.csv')
    # concatenate the 6 PCA chunks (k0..k5) column-wise, keeping one 'Id'
    for i in range(1, 6):
        part = pd.read_csv(
            './data/features/200pca_feats/200pca_3d_k{}.csv'.format(i))
        del part['Id']
        pca_df = pd.concat((pca_df, part), axis=1)
    # merge data — remember each table's feature columns for later selection
    ic_cols = list(loading_df.columns[1:])
    fnc_cols = list(fnc_df.columns[1:])
    agg_cols = list(agg_df.columns[1:])
    im_cols = list(im_df.columns[1:])
    pca_cols = list(pca_df.columns[1:])
    dl_cols = list(dl_df.columns[1:])
    df = fnc_df.merge(loading_df, on='Id')
    df = df.merge(agg_df, how='left', on='Id')
    df = df.merge(im_df, how='left', on='Id')
    df = df.merge(pca_df, how='left', on='Id')
    df = df.merge(dl_df, how='left', on='Id')
    df = df.merge(labels_df, how='left', on='Id')
    del loading_df, fnc_df, agg_df, im_df, pca_df
    gc.collect()
    # split train and test: rows with labels are train, the rest are test
    df.loc[df['Id'].isin(labels_df['Id']), 'is_test'] = 0
    df.loc[~df['Id'].isin(labels_df['Id']), 'is_test'] = 1
    train = df.query('is_test==0')
    del train['is_test']
    test = df.query('is_test==1')
    del test['is_test']
    y = train['age'].copy().reset_index(drop=True)
    # apply biases (site-specific correction for detected Site2 rows,
    # generic correction otherwise)
    for c in ic_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += ic_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += ic_bias_site[c]
    for c in fnc_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += fnc_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += fnc_bias_site[c]
    for c in pca_bias_site.keys():
        test.loc[~test['Id'].isin(ids_df['Id']), c] += pca_bias[c]
        test.loc[test['Id'].isin(ids_df['Id']), c] += pca_bias_site[c]
    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)
    # I. Create fnc score
    print('Creating FNC score...')
    # prepare datasets for fnc score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, fnc_cols)
    # define models (one dataset copy per model in the zoo below)
    names = ['RGF', 'ENet', 'BRidge', 'Huber', 'OMP']
    names = [name + '_fnc_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000, reg_depth=5, normalize=True),
        ElasticNet(alpha=0.05, l1_ratio=0.5, random_state=0),
        BayesianRidge(),
        HuberRegressor(epsilon=2.5, alpha=1),
        OrthogonalMatchingPursuit(n_nonzero_coefs=300)
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 5, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 5, names)
    # save oof, pred, models
    np.save(folder_preds + 'fnc_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'fnc_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)
    # II. Create agg score
    print('Creating AGG score...')
    # prepare datasets for agg score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, agg_cols)
    # define models
    names = ['RGF', 'ENet', 'Huber']
    names = [name + '_agg_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000,
                     reg_depth=5,
                     min_samples_leaf=100,
                     normalize=True),
        ElasticNet(alpha=0.05, l1_ratio=0.3, random_state=0),
        HuberRegressor(epsilon=2.5, alpha=1)
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)
    # save oof, pred, models
    np.save(folder_preds + 'agg_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'agg_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)
    # III. Create pca score
    print('Creating PCA score...')
    # prepare datasets for pca score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, pca_cols)
    # define models
    names = ['RGF', 'ENet', 'BRidge', 'OMP']
    names = [name + '_pca_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000,
                     reg_depth=5,
                     min_samples_leaf=100,
                     normalize=True),
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge(),
        OrthogonalMatchingPursuit()
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 4, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 4, names)
    # save oof, pred, models
    np.save(folder_preds + 'pca_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'pca_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)
    # IV. Create im score
    print('Creating IM score...')
    # prepare datasets for im score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, im_cols)
    # define models
    names = ['RGF', 'ENet', 'BRidge', 'OMP']
    names = [name + '_im_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000,
                     reg_depth=5,
                     min_samples_leaf=100,
                     normalize=True),
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge(),
        OrthogonalMatchingPursuit()
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 4, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 4, names)
    # save oof, pred, models
    np.save(folder_preds + 'im_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'im_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)
    # V. Create dl score
    print('Creating DL score...')
    # prepare datasets for dl score
    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, dl_cols)
    # define models
    names = ['RGF', 'ENet', 'BRidge']
    names = [name + '_dl_seed{}'.format(seed) for name in names]
    pack = [
        RGFRegressor(max_leaf=1000,
                     reg_depth=5,
                     min_samples_leaf=100,
                     normalize=True),
        ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
        BayesianRidge()
    ]
    # train models
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * 3, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * 3, names)
    # save oof, pred, models
    np.save(folder_preds + 'dl_score_seed{}.npy'.format(seed), score_blend)
    np.save(folder_preds + 'dl_score_test_seed{}.npy'.format(seed), pred)
    zoo.save_models(names, folder=folder_models)
    # VI. Training and predicting procedure
    print('Training has started...')
    print('Reading scores from ', folder_preds)
    # add the five freshly created score columns as features
    for prefix in ['fnc', 'agg', 'im', 'pca', 'dl']:
        train[prefix + '_score'] = np.load(folder_preds +
                                           '{}_score_seed{}.npy'.format(prefix, seed))
        test[prefix + '_score'] = np.load(
            folder_preds + '{}_score_test_seed{}.npy'.format(prefix, seed))
    score_cols = [c for c in train.columns if c.endswith('_score')]
    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)
    # create differents datasets ('IC_20' is always excluded)
    # linear
    linear_cols = sorted(
        list(
            set(ic_cols + fnc_cols + pca_cols + agg_cols + im_cols) -
            set(['IC_20'])))
    train_linear, test_linear = scale_select_data(train, test, df_scale,
                                                  linear_cols)
    # kernel
    kernel_cols = sorted(list(set(ic_cols + pca_cols) - set(['IC_20'])))
    train_kernel, test_kernel = scale_select_data(train=train,
                                                  test=test,
                                                  df_scale=df_scale,
                                                  cols=kernel_cols,
                                                  scale_cols=pca_cols)
    # score
    sc_cols = sorted(list(set(ic_cols + score_cols) - set(['IC_20'])))
    train_sc, test_sc = scale_select_data(train, test, df_scale, sc_cols)
    # dl
    dict_cols = sorted(
        list(
            set(ic_cols + fnc_cols + dl_cols + im_cols + agg_cols) -
            set(['IC_20'])))
    train_dl, test_dl = scale_select_data(train, test, df_scale, dict_cols)
    # learning process on different datasets; dataset list below pairs each
    # model with its feature view: MLP/RGF->sc, SVM->kernel, BR/OMP->linear,
    # EN/KR->dl
    names = ['MLP', 'RGF', 'SVM', 'BR', 'OMP', 'EN', 'KR']
    names = [name + '_seed{}'.format(seed) for name in names]
    pack = [
        MLPRegressor(activation='tanh', random_state=0),
        RGFRegressor(max_leaf=1500, loss='Abs'),
        NuSVR(C=10, nu=0.4, kernel='rbf'),
        BayesianRidge(),
        OrthogonalMatchingPursuitCV(),
        ElasticNet(alpha=0.5, l1_ratio=0.7, random_state=0),
        KernelRidge(kernel='poly', alpha=0.5)
    ]
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_sc] * 2 + [train_kernel] + [train_linear] * 2 +
            [train_dl] * 2, y)
    de_blend = zoo.blend_oof()
    preds = zoo.predict([test_sc] * 2 + [test_kernel] + [test_linear] * 2 +
                        [test_dl] * 2,
                        names,
                        is_blend=False)
    # rewrite folders for models and preds
    folder_models = './models/age/stack/'
    if not os.path.exists(folder_models):
        os.makedirs(folder_models)
    folder_preds = './predicts/age/stack/'
    if not os.path.exists(folder_preds):
        os.makedirs(folder_preds)
    print('Saving models to', folder_models)
    print('Saving predictions to', folder_preds)
    # save oofs and models
    zoo.save_oofs(names, folder=folder_preds)
    zoo.save_models(names, folder=folder_models)
    # stacking predictions: two level-2 stackers over the OOF matrix
    print('Stacking predictions...')
    folds = KFold(n_splits=10, shuffle=True, random_state=0)
    stack = pd.DataFrame(zoo.oof_preds).T
    stack.columns = names
    model_stacker_rgf = RGFRegressor(max_leaf=1000, reg_depth=25, verbose=False)
    rgf_pred = cross_val_predict(model_stacker_rgf,
                                 stack,
                                 y.dropna(),
                                 cv=folds,
                                 n_jobs=-1)
    model_stacker_br = BayesianRidge()
    br_pred = cross_val_predict(model_stacker_br,
                                stack,
                                y.dropna(),
                                cv=folds,
                                n_jobs=-1)
    model_stacker_rgf.fit(stack, y.dropna())
    model_stacker_br.fit(stack, y.dropna())
    # save models
    save_pickle(model_stacker_br,
                folder_models + 'BRidge_stack_seed{}'.format(seed))
    save_pickle(model_stacker_rgf,
                folder_models + 'RGF_stack_seed{}'.format(seed))
    # final blend: 75% BayesianRidge + 25% RGF (same weights reused below)
    print('Final age NMAE: {:.5f}'.format(
        NMAE(y, 0.75 * br_pred + 0.25 * rgf_pred)))
    test_preds = pd.DataFrame(preds).T
    test_preds.columns = names
    age_prediction = pd.DataFrame()
    age_prediction['Id'] = test['Id'].values
    age_prediction['pred'] = 0.25 * model_stacker_rgf.predict(
        test_preds) + 0.75 * model_stacker_br.predict(test_preds)
    age_prediction.to_csv(folder_preds + 'age_stack_seed{}.csv'.format(seed),
                          index=False)
    print('age seed pred is saved as',
          folder_preds + 'age_stack_seed{}.csv'.format(seed))
class RGF(ModelBase):
    """K-fold training/inference wrapper around RGFRegressor for the
    'Item_Outlet_Sales' target.

    Expects the attributes provided by ModelBase: kfold, InputDir,
    OutputDir, data_format, parameters.
    """
    # columns excluded from the feature matrix before fitting.
    # NOTE(review): this is a *class-level* list and __fit() extend()s it
    # in place, so repeated fits on one process accumulate dropped columns.
    _l_drop_cols = ['Item_Outlet_Sales', 'index']

    ## training, parameter tuning for single L1
    def train(self, importance=False):
        """Fit one model per fold and return {fold: validation RMSE}."""
        print('\n parameters %s \n' % self.parameters)
        d_fold_val = {}
        for fold in range(self.kfold):
            print('\n---- fold %s begins.\n' % fold)
            ## load data
            TrainFile = '%s/kfold/%s/train.%s' % (self.InputDir, fold,
                                                  self.data_format)
            TestFile = '%s/kfold/%s/test.%s' % (self.InputDir, fold,
                                                self.data_format)
            self.TrainData = DataUtil.load(TrainFile, format=self.data_format)
            self.TestData = DataUtil.load(TestFile, format=self.data_format)
            ## train and predict on valid
            self.__fit()
            # NOTE(review): 'eval' shadows the builtin of the same name
            eval = self.__predict()
            d_fold_val[fold] = eval
            ## save
            OutputDir = '%s/kfold/%s' % (self.OutputDir, fold)
            if (os.path.exists(OutputDir) == False):
                os.makedirs(OutputDir)
            DataUtil.save(self.TrainData,
                          '%s/train.%s' % (OutputDir, self.data_format),
                          format=self.data_format)
            DataUtil.save(self.TestData,
                          '%s/test.%s' % (OutputDir, self.data_format),
                          format=self.data_format)
            print('\n---- Fold %d done. ----\n' % fold)
        return d_fold_val

    ## inferring for fold data and holdout data
    def infer(self, head, HoldoutData, SubmitData, metric_pk=False):
        """Refit per fold; write OOF, holdout and submit predictions.

        head: column name used for the aggregated prediction.
        metric_pk: if True, print per-column holdout RMSE vs the ensemble.
        """
        ##
        l_pred_fold = []
        PredHoldout = pd.DataFrame(index=HoldoutData.index)
        PredHoldout['index'] = HoldoutData['index']
        PredHoldout['Item_Outlet_Sales'] = HoldoutData['Item_Outlet_Sales']
        PredSubmit = pd.DataFrame(index=SubmitData.index)
        for fold in range(self.kfold):
            ## load
            TrainFile = '%s/kfold/%s/train.%s' % (self.InputDir, fold,
                                                  self.data_format)
            TestFile = '%s/kfold/%s/test.%s' % (self.InputDir, fold,
                                                self.data_format)
            self.TrainData = DataUtil.load(TrainFile, format=self.data_format)
            self.TestData = DataUtil.load(TestFile, format=self.data_format)
            ## fit
            PredFold = pd.DataFrame(index=self.TestData.index)
            PredFold['index'] = self.TestData['index']
            PredFold['Item_Outlet_Sales'] = self.TestData['Item_Outlet_Sales']
            PredFold['fold'] = fold
            self.__fit()
            ## inferring: out-of-fold, holdout and submit predictions
            PredFold[head] = self._model.predict(
                self.TestData[self._l_train_columns])
            PredHoldout['fold%s' % (fold)] = self._model.predict(
                HoldoutData[self._l_train_columns])
            PredSubmit['fold%s' % fold] = self._model.predict(
                SubmitData[self._l_train_columns])
            l_pred_fold.append(PredFold)
        ## aggregate folds data
        PredKFold = pd.concat(l_pred_fold, axis=0, ignore_index=True)
        ## save for folds data
        for fold in range(self.kfold):
            FoldOutputDir = '%s/kfold/%s' % (self.OutputDir, fold)
            if (os.path.exists(FoldOutputDir) == False):
                os.makedirs(FoldOutputDir)
            TrainFile = '%s/train.%s' % (FoldOutputDir, self.data_format)
            TestFile = '%s/test.%s' % (FoldOutputDir, self.data_format)
            TrainData = PredKFold[PredKFold['fold'] != fold]
            TestData = PredKFold[PredKFold['fold'] == fold]
            DataUtil.save(TrainData, TrainFile, format=self.data_format)
            DataUtil.save(TestData, TestFile, format=self.data_format)
        HoldCols = [
            col for col in PredHoldout.columns if col.startswith('fold')
        ]
        ## save for holdout data — ensemble = mean over the fold models
        PredHoldout[head] = PredHoldout[HoldCols].mean(axis=1)
        HoldoutOutputDir = '%s/holdout' % self.OutputDir
        if (os.path.exists(HoldoutOutputDir) == False):
            os.makedirs(HoldoutOutputDir)
        DataUtil.save(PredHoldout,
                      '%s/test.%s' % (HoldoutOutputDir, self.data_format),
                      format=self.data_format)
        ## save for submit data
        PredSubmit[head] = PredSubmit[HoldCols].mean(axis=1)
        SubmitOutputDir = '%s/submit' % self.OutputDir
        if (os.path.exists(SubmitOutputDir) == False):
            os.makedirs(SubmitOutputDir)
        DataUtil.save(PredSubmit,
                      '%s/test.%s' % (SubmitOutputDir, self.data_format),
                      format=self.data_format)
        ## metric PK: compare every single input column against the ensemble
        if (metric_pk):
            d_metric = {}
            for col in self._l_train_columns:
                diff = (HoldoutData[col] - HoldoutData['Item_Outlet_Sales'])
                rmse = np.sqrt(np.sum(diff * diff) / len(diff))
                d_metric[col] = rmse
            diff = PredHoldout[head] - PredHoldout['Item_Outlet_Sales']
            ensemble_metric = np.sqrt(np.sum(diff * diff) / len(diff))
            print('\n===== metric pk result ====\n')
            print('single model: %s, ensemble model %s: %s' %
                  (d_metric, head, ensemble_metric))
            print('\n===== metric pk result ====\n')
        return

    ## L1 fitting
    def __fit(self):
        """Fit an RGFRegressor on TrainData minus the drop/id columns."""
        start = time.time()
        ## drop identifier one-hot columns alongside target/index
        id_cols = [
            col for col in self.TrainData.columns
            if (col.startswith('Item_Identifier'))
        ]
        self._l_drop_cols.extend(id_cols)
        X = self.TrainData.drop(self._l_drop_cols, axis=1)
        Y = self.TrainData['Item_Outlet_Sales']
        ## remember the feature columns for predict-time selection
        self._l_train_columns = X.columns
        print('Size of feature space: %s' % len(self._l_train_columns))
        ##
        self._model = RGFRegressor(
            algorithm=self.parameters['algorithm'],
            loss=self.parameters['loss'],
            learning_rate=self.parameters['learning_rate'],
            n_iter=self.parameters['n_iter'],
            reg_depth=self.parameters['reg_depth'],
            l2=self.parameters['l2'],
            sl2=self.parameters['sl2'],
            #min_samples_leaf= self.parameters['min_samples_leaf'],
            max_leaf=self.parameters['max_leaf'],
            verbose=True)
        self._model.fit(X, Y)
        end = time.time()
        print('\nTraining is done. Time elapsed %ds' % (end - start))
        return

    ## predict
    def __predict(self):
        """Predict on TestData and return the RMSE against the target."""
        start = time.time()
        ##
        x_test = self.TestData[self._l_train_columns]
        pred_test = self._model.predict(x_test)
        truth_test = self.TestData['Item_Outlet_Sales']
        ## RMSE
        diff = (pred_test - truth_test)
        rmse = np.sqrt(np.sum(diff * diff) / len(diff))
        ##
        end = time.time()
        print('\n Prediction done. Time consumed %ds' % (end - start))
        return rmse
def stacklearning(self):
    """Experiment log of stacking configurations for toxicity regression.

    This method is a scratchpad: many estimators and StackingRegressor
    combinations are built and immediately rebound (only the last binding
    of each name survives), with measured scores kept as inline comments.
    Statement order matters — later assignments intentionally overwrite
    earlier ones.
    """
    class sparseNorm(BaseEstimator, TransformerMixin):
        """Transformer: L2-normalize the input as a sparse CSC matrix."""
        def __init__(self):
            pass

        def fit(self, X, y=None):
            return self

        def transform(self, X):
            from sklearn import preprocessing
            Y = preprocessing.normalize(sp.sparse.csc_matrix(X.values))
            return Y

    # two FM configs tried; the second rebinding wins
    fm = sgd.FMRegression(
        n_iter=4743,
        init_stdev=0.1,
        rank=100,
        l2_reg_w=0,
        l2_reg_V=0,
        step_size=0.1,
    )
    fm = sgd.FMRegression(
        n_iter=9943,
        init_stdev=0.1,
        rank=219,
        l2_reg_w=0,
        l2_reg_V=0.06454,
        step_size=0.1,
    )
    pipe = make_pipeline(sparseNorm(), fm)
    calcACC(pipe, X=X2)
    xgb = xgboost.XGBRegressor(
        n_estimators=100, max_depth=7, gamma=0, colsample_bytree=0.1
    )
    lgbm = LGBMRegressor(
        boosting_type='gbdt', num_leaves=367,
        learning_rate=0.06, feature_fraction=0.14,
        max_depth=28, min_data_in_leaf=8
    )
    rgf = RGFRegressor(
        max_leaf=1211, algorithm="RGF", test_interval=100,
        loss="LS", verbose=False, l2=0.93, min_samples_leaf=2
    )
    # tuned RF immediately replaced by a default one (rebinding wins)
    rf = RandomForestRegressor(
        max_depth=20, random_state=0, n_estimators=56, min_samples_split=2,
        max_features=0.21
    )
    rf = RandomForestRegressor()
    ext = ExtraTreesRegressor(
        n_estimators=384, max_features=2228,
        min_samples_split=0.01, max_depth=856,
        min_samples_leaf=1
    )
    svr = SVR(
        gamma=9.5367431640625e-07, epsilon=0.0009765625, C=2048.0
    )
    #test combination — each pipeline = feature extractor + default RF
    desNew = make_pipeline(extdescriptorNew(), rf)
    morNew = make_pipeline(extMorganNew(), rf)
    kotNew = make_pipeline(extklekotaTothNew(), rf)
    macNew = make_pipeline(extMACCSNew(), rf)
    desMac = make_pipeline(extDescriptorMACCS(), rf)
    morMac = make_pipeline(extMorganMACCS(), rf)
    kotMac = make_pipeline(extKlekotaTothMACCS(), rf)
    morKotNew = make_pipeline(extMorganKlekotaTothNew(), rf)
    des = make_pipeline(extOnlyDescriptor(), rf)
    mor = make_pipeline(extOnlyMorgan(), rf)
    kot = make_pipeline(extOnlyklekotaToth(), rf)
    mac = make_pipeline(extOnlyMACCS(), rf)
    # NOTE(review): 'all' shadows the builtin
    all = make_pipeline(extAll(), rf)
    allwithoutNew = make_pipeline(extAllwithoutNew(), rf)
    allwithoutMaccs = make_pipeline(extAllwithoutMaccs(), rf)
    allwithoutDes = make_pipeline(extAllwithoutDescriptor(), rf)
    testDic = {"Desc+New":desNew,"Mor+New":morNew,"kot+New":kotNew,
               "MACCS+New":macNew,"Des+MAC":desMac,"Morgan+Maccs":morMac,
               "Kot+MACCS":kotMac,"mor+kot+New":morKotNew,
               "descriptor":des,"morgan":mor,"kot":kot,"MACCS":mac,
               "All":all,"All without " "new":allwithoutNew,
               "All without MACCS":allwithoutMaccs,
               "All without Des":allwithoutDes}
    #10fold
    cv = KFold(n_splits=10, shuffle=True, random_state=0)
    #Fingerprinttest — CV each feature-set pipeline, collect RMSE/corr
    resultDic = {}
    resultDic2 = {}
    for name, model in testDic.items():
        #model = StackingRegressor(regressors=[name], meta_regressor=rf,verbose=1)
        #calcACC(model,X=X,y=y2,name=name)
        Scores = cross_validate(model, X2, y2, cv=cv, scoring=myScoreFunc)
        RMSETmp = Scores['test_RMSE'].mean()
        CORRTmP = Scores['test_Correlation coefficient'].mean()
        resultDic.update({name: [RMSETmp, CORRTmP]})
        print(name, RMSETmp, CORRTmP)
    #stacking
    alldata = make_pipeline(extAll())
    # random forest
    #1.1546 0.70905
    stack = StackingRegressor(regressors=[alldata], meta_regressor=rf, verbose=1)
    # Light Gradient boosting
    # 1.160732 0.703776
    testmodel = StackingRegressor(regressors=[alldata], meta_regressor=lgbm, verbose=1)
    # XGboost
    # 1.1839805 0.689571
    testmodel = StackingRegressor(regressors=[alldata], meta_regressor=xgb, verbose=1)
    # Regularized greedily forest
    # 1.17050 0.6992
    testmodel = StackingRegressor(regressors=[alldata], meta_regressor=rgf, verbose=1)
    #pls 22.808047774809697 0.6410026452910016
    i = 4
    for i in np.arange(3, 11, 1):
        pls = PLSRegression(n_components=i)
        testmodel = StackingRegressor(regressors=[alldata], meta_regressor=pls, verbose=0)
        calcACC(testmodel)
    pls = PLSRegression(n_components=4)
    #SVR — second binding wins
    svr = SVR(gamma=9.5367431640625/10000000, C=1559.4918100725592,
              epsilon=0.0009765625,)
    svr = SVR(kernel='rbf', gamma=9.5367431640625e-07,
              epsilon=0.0009765625, C=2048.0)
    testmodel = StackingRegressor(regressors=[alldata], meta_regressor=svr, verbose=1)
    calcACC(svr)
    #Extratree 1.157420824123527 0.7061010221224269
    testmodel = StackingRegressor(regressors=[alldata], meta_regressor=ext, verbose=1)
    calcACC(testmodel)
    #k-NN
    nbrs = KNeighborsRegressor(3)
    ##Linear regressions
    #Stochastic Gradient Descenta
    # NOTE(review): rebinding 'sgd' shadows the module used for
    # sgd.FMRegression above
    sgd = SGDRegressor(max_iter=1000)
    # Ridge
    for i in [1, 10, 100, 1000]:
        ridge = Ridge(alpha=i)
        calcACC(ridge)
    ridge = Ridge(alpha=45.50940042350705)
    calcACC(ridge)
    # multiple linear
    lin = make_pipeline(forlinear(), LinearRegression(n_jobs=-1))
    calcACC(lin)
    #stacking
    #0.69
    testmodel = StackingRegressor(regressors=[alldata, nbrs, all],
                                  meta_regressor=rf, verbose=1)
    #1.1532 0.70926
    testmodel = StackingRegressor(regressors=[alldata, nbrs, all, xgb, lgbm, rgf],
                                  meta_regressor=rf, verbose=1)
    #1.16420 0.7041
    testmodel = StackingRegressor(regressors=[alldata, alldata, all],
                                  meta_regressor=rf, verbose=1)
    #1.16379 0.7044
    stack1 = StackingRegressor(regressors=[alldata, nbrs, all, xgb, lgbm, rgf],
                               meta_regressor=rf, verbose=1)
    testmodel = StackingRegressor(regressors=[alldata, stack1, stack1],
                                  meta_regressor=rf, verbose=1)
    #1.1535496740699531 0.7108839199109559
    pcaFeature = make_pipeline(extPCA())
    testmodel = StackingRegressor(regressors=[pcaFeature, alldata, nbrs, rf, xgb, lgbm, rgf],
                                  meta_regressor=rf, verbose=1)
    #1.181801005432221 0.6889745579620922
    testmodel = StackingRegressor(regressors=[pcaFeature, alldata, nbrs, rf, xgb, lgbm, rgf],
                                  meta_regressor=lgbm, verbose=1)
    #0.70613
    testmodel = StackingRegressor(regressors=[pcaFeature, alldata, nbrs, rf, xgb, lgbm, rgf, ext],
                                  meta_regressor=xgb, verbose=1)
    #0.71641717
    testmodel = StackingRegressor(regressors=[pcaFeature, alldata, nbrs, rf, xgb, lgbm, rgf, ext],
                                  meta_regressor=rf, verbose=1)
    #0.7146922
    testmodel = StackingRegressor(regressors=[pcaFeature, alldata, nbrs, ridge, rf, xgb, lgbm, rgf, ext],
                                  meta_regressor=rf, verbose=1)
    #new features
    pcaFeature = make_pipeline(extPCA())
    #old
    pipe1 = make_pipeline(extMACCS(), rf)
    pipe2 = make_pipeline(extMorgan(), rf)
    pipe3 = make_pipeline(extDescriptor(), rf)
    pipe4 = make_pipeline(extPCA(), rgf)
    pipe7 = make_pipeline(extDescriptor(), rgf)
    pipe8 = make_pipeline(extDescriptor(), rgf)
    xgb = xgboost.XGBRegressor()
    nbrs = KNeighborsRegressor(2)
    svr = SVR(gamma='auto', kernel='linear')
    pls = PLSRegression(n_components=4)
    extMACCSdata = make_pipeline(extMACCS())
    nbrsPipe = make_pipeline(extMorgan(), nbrs)
    pipe6 = make_pipeline(extMACCS(), rgf)
    alldata = make_pipeline(extAll())
    ave = extAverage()
    withoutdesc = make_pipeline(extMACCS())
    meta = RandomForestRegressor(max_depth=20, random_state=0, n_estimators=400)
    #stack1 = StackingRegressor(regressors=[rgf, nbrs, alldata], meta_regressor=rgf, verbose=1)
    #0.70
    stack = StackingRegressor(regressors=[pipe1, pipe2, pipe3, xgb, lgbm, rgf, rf],
                              meta_regressor=ave, verbose=1)
    #stack2 = StackingRegressor(regressors=[stack1,nbrs, svr,pls,rgf], meta_regressor=lgbm, verbose=1)
    #0.69######################
    stack1 = StackingRegressor(regressors=[pipe1, pipe2, pipe3],
                               meta_regressor=rf, verbose=1)
    #0.70
    stack2 = StackingRegressor(regressors=[stack1, alldata, rgf, lgbm, xgb],
                               meta_regressor=rf, verbose=1)
    #0.71
    stack3 = StackingRegressor(regressors=[stack2, pipe1],
                               meta_regressor=ave, verbose=1)
    ###########################
    ###########################
    stack1 = StackingRegressor(regressors=[pipe1, pipe2, pipe3],
                               meta_regressor=rf, verbose=1)
    stack2 = StackingRegressor(regressors=[stack1, withoutdesc, lgbm, rgf],
                               meta_regressor=rf, verbose=1)
    stack3 = StackingRegressor(regressors=[stack2, pipe1, xgb],
                               meta_regressor=ave, verbose=1)
    ###########################
    #stackingwithknn
    stack1 = StackingRegressor(regressors=[pipe1, pipe2, pipe3],
                               meta_regressor=rf, verbose=1)
    stack2 = StackingRegressor(regressors=[stack1, nbrs, pipe1],
                               meta_regressor=rf, verbose=1)
    #stack3 = StackingRegressor(regressors=[rgf, nbrs, alldata], meta_regressor=ave, verbose=1)
    # second cv binding (KFold) wins
    cv = ShuffleSplit(n_splits=10, test_size=0.1, random_state=0)
    cv = KFold(n_splits=10, shuffle=True, random_state=0)
    # the bare .mean()**(1/2) expressions below compute-and-discard RMSE
    St1Scores = cross_validate(stack1, X, y, cv=cv)
    St1Scores['test_score'].mean()**(1 / 2)
    St2Scores = cross_validate(stack2, X, y, cv=cv)
    St2Scores['test_score'].mean()**(1 / 2)
    St3Scores = cross_validate(stack3, X, y, cv=cv)
    St3Scores['test_score'].mean()**(1 / 2)
    stackScore = cross_validate(stack, X, y, cv=cv)
    stackScore['test_score'].mean()**(1 / 2)
    lgbmScores = cross_validate(lgbm, X, y, cv=cv)
    lgbmScores['test_score'].mean()**(1 / 2)
    rgfScores = cross_validate(rgf, X, y, cv=cv)
    rgfScores['test_score'].mean()**(1 / 2)
    RFScores = cross_validate(rf, X, y, cv=cv)
    RFScores['test_score'].mean()**(1 / 2)
    scores = cross_validate(stack2, X, y, cv=cv)
    scores['test_score'].mean()**(1 / 2)
    print("R^2 Score: %0.2f (+/- %0.2f) [%s]" %
          (scores['test_score'].mean(), scores['test_score'].std(), 'stacking'))
    # final fits + train/test reporting; exX is an external prediction set
    stack3.fit(X, y)
    y_pred = stack3.predict(X_train)
    y_val = stack3.predict(X_test)
    #stack3.score(X_train, y_train)
    exX = preprocess(extractDf, changeList)
    # predictions are in log10 space — 10** maps back to original units
    valy = (10 ** (stack3.predict(exX))).tolist()
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))
    stack1.fit(X, y)
    valy = (10 ** (stack1.predict(exX))).tolist()
    sgd.fit(X, y)
    valy = (10 ** (sgd.predict(exX))).tolist()
    rgfpipe = make_pipeline(extMACCS(), rf)
    rgf.fit(X, y)
    valy = (10 ** (rgf.predict(exX))).tolist()
    nbrs.fit(X, y)
    valy = (10 ** (nbrs.predict(exX))).tolist()
    pipe = make_pipeline(extMACCS(), rf)
    pipe.fit(X, y)
    valy = (10 ** (pipe.predict(exX))).tolist()
    rf.fit(X, y)
    y_pred = rf.predict(X_train)
    y_val = rf.predict(X_test)
    exX = preprocess(extractDf, changeList)
    valy = (10 ** (rf.predict(exX))).tolist()
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))
    lgbm.fit(X, y)
    # NOTE(review): the prints below reuse y_pred/y_val from the rf block —
    # they do not reflect lgbm's train/test fit
    #y_pred = pipe1.predict(X_train)
    #y_val = pipe1.predict(X_test)
    exX = preprocess(extractDf, changeList)
    valy = (10 ** (lgbm.predict(exX))).tolist()
    print("Root Mean Squared Error train: %.4f" % calcRMSE(y_pred, y_train))
    print("Root Mean Squared Error test: %.4f" % calcRMSE(y_val, y_test))
    print('Correlation Coefficient train: %.4f' % calcCorr(y_pred, y_train))
    print('Correlation Coefficient test: %.4f' % calcCorr(y_val, y_test))
def metalPredictotherML(self):
    """Evaluate several regressors on the metal/MACCS fish-toxicity data.

    Builds RF/XGB/LGBM/RGF/linear models, loads 'metalMACCS.csv', targets
    log10 of 'fish_tox', and runs calcACC for each model.  Measured scores
    are kept as inline comments.
    """
    rf = RandomForestRegressor(
        max_depth=20, random_state=0, n_estimators=134, min_samples_split=9,
        max_features=0.33
    )
    # two XGB configs tried; the second rebinding wins
    xg = xgboost.XGBRegressor(
        n_estimators=9196, max_depth=497, gamma=0.0, colsample_bytree=1
    )
    xg = xgboost.XGBRegressor(
        n_estimators=7534, max_depth=1000, gamma=2.0, colsample_bytree=0.1
    )
    lgbm = LGBMRegressor(
        boosting_type='gbdt', num_leaves=662,
        feature_fraction=0.36875625810601625,
        bagging_fraction=0.39668072810414723,
        learning_rate=0.06, min_data_in_leaf=35,
        max_depth=27
    )
    rgf = RGFRegressor(
        max_leaf=1998, algorithm="RGF", test_interval=100,
        loss="LS", verbose=False, l2=0.187, min_samples_leaf=1
    )
    from sklearn import linear_model
    # NOTE(review): clf is created but never used below
    clf = linear_model.LogisticRegression(random_state=0)
    ml = LinearRegression(n_jobs=-1)
    # no-op expression; the same list is rebuilt in the loop below
    [rf, xg, lgbm, rgf, ml]
    #test
    #RMSE 10**1.048481123342563
    #Coor 0.6585531363438192
    #train
    #RMSE 10**0.7063184454700483
    #Coor 0.8793733707424819
    # tuned RF replaced by a default one (rebinding wins)
    rf = RandomForestRegressor()
    #test
    #RMSE 10**1.109618572731106
    #Coor 0.6198791156669657
    #train
    #0.5647634395040055
    #0.9179571547567322
    os.chdir(r'G:\マイドライブ\Data\tox_predict')
    df = pd.read_csv('metalMACCS.csv').set_index('CAS')
    # keep only the fish_tox target; drop the other two endpoints
    dftemp = df.drop(['daphnia_tox', 'Algae_tox'], axis=1)
    dftemp = dftemp.dropna()
    y = np.log10(dftemp['fish_tox'])
    X = dftemp.iloc[:, 0:-2]
    for i in [rf, xg, lgbm, rgf, ml]:
        calcACC(i, X=X, name=i, y=y)
# Build the registry of candidate regression models, keyed by display name.
# A dict literal keeps the same insertion order as per-key assignment would.
models_reg = {
    "LinearR": LinearRegression(),
    "Lasso": Lasso(random_state=seed_reg),
    "LassoLarsIC": LassoLarsIC(),
    "Ridge": Ridge(random_state=seed_reg),
    "KernelRidge": KernelRidge(),
    "BayesianRidge": BayesianRidge(),
    "ElasticNet": ElasticNet(random_state=seed_reg),
    "KNN": KNeighborsRegressor(),
    "SVR": SVR(),
    "DecisionTree": DecisionTreeRegressor(random_state=seed_reg),
    "ExtraTrees": ExtraTreesRegressor(random_state=seed_reg),
    "Earth": Earth(),
    "RGFRegressor": RGFRegressor(),
    # "FastRGFRegressor": FastRGFRegressor(),
    "RandomForest": RandomForestRegressor(random_state=seed_reg),
    "AdaBoost": AdaBoostRegressor(random_state=seed_reg),
    "GradientBoost": GradientBoostingRegressor(random_state=seed_reg),
    "XGBoost": XGBRegressor(random_state=seed_reg),
    "LightGBM": LGBMRegressor(random_state=seed_reg),
    "CatBoost": CatBoostRegressor(random_state=seed_reg),
    "MLPRegressor": MLPRegressor(random_state=seed_reg),
}

# TEST03-1: train/evaluate the models with cross_val_score + 10-fold
# KFold (shuffle=False) + R2 score on X_reg, Y_reg.
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
if __name__ == '__main__':
    # Two-level StackNet configuration: level 1 is a list of base
    # regressors, level 2 the meta learner(s).  Commented-out entries are
    # alternatives that were tried and disabled.
    models = [
        ######## First level ########
        [
            #AdaBoostRegressor(RandomForestRegressor(n_estimators=100,criterion='mse'),n_estimators=8, learning_rate= 0.05),
            #LinearRegression(fit_intercept=True, normalize= True),
            #Ridge(alpha=0.1, fit_intercept=True, normalize=False),
            #BaggingRegressor(RandomForestRegressor(n_estimators=100, criterion="mae"), oob_score= False, n_estimators= 20, max_samples= 0.5, max_features= 0.7),
            #SVR(max_iter=-1, degree=5, kernel='rbf'),
            #KNeighborsRegressor(n_neighbors=5),
            #MLPRegressor(early_stopping=True, hidden_layer_sizes=(2), learning_rate_init= 0.01, max_iter= 1000),
            XGBRegressor(n_estimators=100, criterion="mae", max_depth=12,
                         subsample=0.5, learning_rate=0.05,
                         colsample_bytree=0.9),
            #ExtraTreesRegressor(n_estimators=120, criterion="mae", max_depth=10, max_features=0.5, random_state=1),
            GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
                                      max_depth=50, max_features=0.5,
                                      random_state=1),
            RandomForestRegressor(n_estimators=100, criterion="mae",
                                  random_state=1),
            RGFRegressor(max_leaf=4500, algorithm="RGF_Sib",
                         test_interval=50, loss="LS", verbose=False),
        ],
        ######## Second level ########
        [
            # RandomForestRegressor(n_estimators=100, criterion="mae", random_state=1),
            RandomForestRegressor(n_estimators=100, criterion="mse",
                                  max_depth=10, min_samples_split=9,
                                  min_samples_leaf=1,
                                  min_weight_fraction_leaf=0,
                                  max_leaf_nodes=None,
                                  min_impurity_decrease=0.0005,
                                  oob_score=True)
            # RGFRegressor(max_leaf=4500, algorithm="RGF_Sib", test_interval=50, loss="LS", verbose=False),
            # ExtraTreesRegressor(n_estimators=100, criterion="mae", random_state=1)
        ],
        #[
        #    RandomForestRegressor(n_estimators=100, criterion="mse",max_depth=10,min_samples_split=9,min_samples_leaf=1,min_weight_fraction_leaf=0,max_leaf_nodes=None,min_impurity_decrease=0.0005,oob_score=True),
        #]
    ]
    # restacking=True feeds level-1 inputs onward; retraining disabled
    model = StackNetRegressor(models,
                              metric="mae",
                              folds=5,
                              restacking=True,
                              use_retraining=False,
                              random_state=12345,
                              verbose=1)
def metalPredict(self):
    """Evaluate RF on three toxicity endpoints for MACCS and ECFP4 data.

    Loads 'metalMACCS.csv' and 'metalECFP4.csv', and for each of
    fish/daphnia/algae toxicity targets runs calcACC on log10-transformed
    labels.  NOTE(review): although xg/lgbm/rgf are configured below, only
    `rf` is actually passed to calcACC in the loop.
    """
    rf = RandomForestRegressor(
        max_depth=20, random_state=0, n_estimators=134, min_samples_split=9,
        max_features=0.33
    )
    xg = xgboost.XGBRegressor(
        n_estimators=100, max_depth=7, gamma=0.0, colsample_bytree=1,
    )
    lgbm = LGBMRegressor(
        boosting_type='gbdt', num_leaves=662,
        feature_fraction=0.36875625810601625,
        bagging_fraction=0.39668072810414723,
        learning_rate=0.06, min_data_in_leaf=35,
        max_depth=27
    )
    rgf = RGFRegressor(
        max_leaf=1998, algorithm="RGF", test_interval=100,
        loss="LS", verbose=False, l2=0.187, min_samples_leaf=1
    )
    os.chdir(r'G:\マイドライブ\Data\tox_predict')
    #test
    #RMSE 10**1.048481123342563
    #Coor 0.6585531363438192
    #train
    #RMSE 10**0.7063184454700483
    #Coor 0.8793733707424819
    # tuned RF replaced by a default one (rebinding wins)
    rf = RandomForestRegressor()
    #test
    #RMSE 10**1.109618572731106
    #Coor 0.6198791156669657
    #train
    #0.5647634395040055
    #0.9179571547567322
    df1 = pd.read_csv('metalMACCS.csv').set_index('CAS')
    df2 = pd.read_csv('metalECFP4.csv').set_index('CAS')
    for i, df in enumerate([df1, df2]):
        names = ['fish_tox', 'daphnia_tox', 'Algae_tox']
        if i == 0:
            print('MACCS')
        else:
            print('ECFP4')
        for name in names:
            # keep only the current endpoint; drop the other two targets
            if name == 'fish_tox':
                dftemp = df.drop(['daphnia_tox', 'Algae_tox'], axis=1)
            elif name == 'daphnia_tox':
                dftemp = df.drop(['fish_tox', 'Algae_tox'], axis=1)
            elif name == 'Algae_tox':
                dftemp = df.drop(['fish_tox', 'daphnia_tox'], axis=1)
            print(name)
            dftemp = dftemp.dropna()
            y = np.log10(dftemp[name])
            # NOTE(review): uses 0:-1 here vs 0:-2 in the fish-only
            # variant above — confirm which column slice is intended
            X = dftemp.iloc[:, 0:-1]
            calcACC(rf, X=X, name=None, y=y)