# ML areas = df.pop('Area') y = df['Dry_Yield'] X = df.drop(['Dry_Yield'], axis=1) X_train, X_validation, y_train, y_validation = \ train_test_split(X, y, test_size=.2, random_state=7) scaler = StandardScaler() scaler.fit(X_train) X_train_scaled = scaler.transform(X_train) nn = MLPRegressor(random_state=7, verbose=99, max_iter=5000) nn.fit(X_train_scaled, y_train) scr = score_util.score(nn, scaler, X_validation, y_validation) print(scr) # grid = GridSearchCV( # estimator=nn, # param_grid={ # 'activation': ['relu', 'tanh', 'logistic'], # 'solver': ['adam'], # 'alpha': [.0000001, .000001, .00001, .0001], # 'learning_rate_init': [.0000001, .000001, .00001, .0001], # 'hidden_layer_sizes': [ # (2048, 1024, 512), # (512, 256, 128, 32, 16), # (256, 128, 32, 16, 8, 4), # ]}, # verbose=100,
# Compile model model.compile(loss='mean_squared_error', optimizer='adam') return model # fix random seed for reproducibility seed = 7 numpy.random.seed(seed) # evaluate model estimator = KerasRegressor(build_fn=baseline_model, epochs=10000, batch_size=17000, verbose=1) kfold = KFold(n_splits=10, random_state=seed) estimator.fit( scaler.transform(X_train), y_train.values, callbacks=[ ModelCheckpoint( './results/20170824_tf_dnn_reg/keras/val_acc_best.chkpnt', monitor="loss", save_best_only=True, save_weights_only=False, verbose=5), EarlyStopping(monitor='loss', min_delta=.0001, patience=25, verbose=5) ]) estimator.model.load_weights( './results/20170824_tf_dnn_reg/keras/val_acc_best.chkpnt') scr = score_util.score(estimator, scaler, X_validation, y_validation) print(scr)
train_split_idx] X_test_split, y_test_split = X.iloc[test_split_idx], y.iloc[test_split_idx] log.info("fitting transforms") transform_pipe.fit(X_train_split) log.info("transforming") X_train_transformed = transform_pipe.transform(X_train_split) logging.info("Running on input data shape: %s", X_train_transformed.shape) # model = ExtraTreesRegressor(verbose=99, n_jobs=4) # model = MLPRegressor(verbose=99, max_iter=150, tol=.01, learning_rate='constant', alpha=.1) model = GradientBoostingRegressor(verbose=99, n_estimators=100, max_depth=8) model.fit(X_train_transformed, y_train_split) logging.info("Scoring") scr = score_util.score(model, transform_pipe, X_test_split, y_test_split) joblib.dump(scr, os.path.join(env.result_path, f"score_{i}.pickle")) joblib.dump(scr, os.path.join(env.result_path, f"model_{i}.pickle")) scores.append(scr) # for score in scores: # logging.info(score) combined_score = score_util.combine(scores) logging.info("kfold scores combined: %s", combined_score) joblib.dump(combined_score, os.path.join(env.result_path, f"combined_score.pickle"))
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from data_scripts import pcs_data_loader as dl
from modeling import score_util

result_path = './results/20170828_et_optimizer'

df = dl.shape_pps_data(dl.load_corn_rows_mssql())

# ML
areas = df.pop('Area')          # Area is held out of the feature matrix
y = df['Dry_Yield'].values      # regression target
X = df.drop(['Dry_Yield'], axis=1).values

# BUG FIX: random_state has no effect on KFold unless shuffle=True — without
# it the folds are contiguous row ranges and the seed was silently ignored
# (recent scikit-learn versions raise a ValueError for this combination).
kcv = KFold(n_splits=5, shuffle=True, random_state=971)

# scaling + extra-trees in one pipeline so the scaler is re-fit per fold,
# using that fold's training rows only
pipeline = make_pipeline(
    StandardScaler(),
    ExtraTreesRegressor(n_jobs=2, verbose=99, n_estimators=10))

scores = []
for train_split_idx, test_split_idx in kcv.split(X):
    X_train_split, y_train_split = X[train_split_idx], y[train_split_idx]
    X_test_split, y_test_split = X[test_split_idx], y[test_split_idx]
    pipeline.fit(X_train_split, y_train_split)
    # scaler arg is None: the pipeline already applies scaling internally
    scr = score_util.score(pipeline, None, X_test_split, y_test_split)
    scores.append(scr)
    print(scr)
df = dl.load_corn_data_frame()

# ML
areas = df.pop('Area')  # Area excluded from the feature matrix
y = df['Dry_Yield']     # regression target
X = df.drop(['Dry_Yield'], axis=1)

# 80/20 hold-out split, fixed seed for reproducibility
X_train, X_validation, y_train, y_validation = \
    train_test_split(X, y, test_size=.2, random_state=7)

# standardize using training-set statistics only
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

# baseline extra-trees fit, scored before the grid search for comparison
extra_trees = ExtraTreesRegressor(n_jobs=-1, verbose=True)
extra_trees.fit(X_train_scaled, y_train)
scr = score_util.score(extra_trees, scaler, X_validation, y_validation)
# BUG FIX: the baseline score was computed but never reported (the variable
# was immediately overwritten below), so the baseline-vs-tuned comparison
# was impossible to make from the output.
print(scr)

# tune the forest size; error_score=0 keeps the search alive if a fit fails
grid_search_cv = GridSearchCV(
    estimator=extra_trees,
    param_grid={"n_estimators": [5, 10, 15, 20, 25, 30, 35, 40]},
    error_score=0,
    n_jobs=2,
    verbose=99)
grid_search_cv.fit(X_train_scaled, y_train)
print(grid_search_cv.best_params_)
scr = score_util.score(grid_search_cv.best_estimator_, scaler,
                       X_validation, y_validation)
print(scr)
# ML areas = df.pop('Area') y = df['Dry_Yield'] X = df.drop(['Dry_Yield'], axis=1) X_train, X_validation, y_train, y_validation = \ train_test_split(X, y, test_size=.3, random_state=7) scaler = StandardScaler() scaler.fit(X_train) X_train_scaled = scaler.transform(X_train) logger.info("Fitting extra trees") extra_trees = ExtraTreesRegressor(n_jobs=-1, verbose=True) extra_trees.fit(X_train_scaled, y_train) scr = score_util.score(extra_trees, scaler, X_validation, y_validation) # explore # seed = 11 # num_folds = 3 # model = Pipeline([('Scaler', StandardScaler()), ('ET', ExtraTreesRegressor())]) # kfold = KFold(n_splits=num_folds, random_state=seed) # # scoring = 'neg_mean_squared_error' # cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring, n_jobs=-1) def _score(m): scaler.fit(X_validation) predictions = m.predict(scaler.transform(X_validation)) mean_sq = mean_squared_error(y_validation, predictions)
# NOTE(review): `model_path_`, `model`, `scaler`, `df`, `run_id` and
# `result_base_path` are defined above this fragment.
# ROBUSTNESS FIX: ensure the output directory exists before the first write —
# the original only created it right before the CSV dump, after both pickles
# had already been written (which fails if the directory is missing).
os.makedirs(result_base_path, exist_ok=True)

with open(model_path_, 'wb') as f:
    pickle.dump(model, f)
print(f'model saved: {model_path_}')

scaler_path_ = f'{result_base_path}/et_scaler_{run_id}.pickle'
with open(scaler_path_, 'wb') as f:
    pickle.dump(scaler, f)
# BUG FIX: the original message said 'model saved' for the scaler file.
print(f'scaler saved: {scaler_path_}')

# score the trained model against every cached ELB year
results = []
for idx, elb_data in enumerate(
        sql_to_scikit_converter.load_cached_elbs(df.columns)):
    year_id, elb_X, elb_y, extra_cols = elb_data
    print(f'comparing elb year id: {year_id}, index: {idx}')
    elb_score = score_util.score(model, scaler, elb_X, elb_y)
    print(elb_score)
    results.append((year_id, elb_score, extra_cols))

# one row per ELB year: year id, the score metrics, and any columns the
# training data was unaware of
rdf: pandas.DataFrame = pandas.concat([
    pandas.DataFrame([_id for (_id, _, _) in results], columns=['year_id']),
    score_util.create_data_frame([scr for (_, scr, _) in results]),
    pandas.DataFrame(
        pandas.Series([c for (_, _, c) in results], name='extra_cols'))
], axis=1)
os.makedirs(result_base_path, exist_ok=True)
rdf.to_csv(f'{result_base_path}/elb_harvest_predictions_results_{run_id}.csv')
], axis=1) # remove any extra enum dummy columns in elb (that training isn't aware of) elb_extra_cols = set(pps_elb_cells.columns) - train_cols if any(elb_extra_cols): print( f"WARNING: ELB has unknown training enum (dummy) cols: {','.join(elb_extra_cols)}" ) pps_elb_cells.drop(elb_extra_cols, axis=1, inplace=True) elb_y = pps_elb_cells['Dry_Yield'] elb_X = pps_elb_cells.drop(['Dry_Yield', 'Area'], axis=1) # order columns to match training elb_X = elb_X[X.columns] elb_score = score_util.score(extra_trees, scaler, elb_X, elb_y) print(elb_score) results.append((year_id, elb_score, elb_extra_cols)) rdf: pandas.DataFrame = pandas.concat([ pandas.DataFrame([_id for (_id, _, _) in results], columns=['year_id']), score_util.create_data_frame([scr for (_, scr, _) in results]), pandas.DataFrame( pandas.Series([c for (_, _, c) in results], name='extra_cols')) ], axis=1) rdf.to_csv( './results/20170823_elb_predictions/elb_harvest_predictions_results_{:%Y%m%d%H%m}.csv' .format(datetime.now()))