import logging
import sys

import coloredlogs
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

from data_scripts import pcs_data_loader as dl

# replace any existing root handlers with a colored stdout handler
root_log = logging.getLogger()
for h in list(root_log.handlers):
    root_log.removeHandler(h)
handler = logging.StreamHandler(stream=sys.stdout)
handler.setFormatter(coloredlogs.ColoredFormatter())
handler.addFilter(coloredlogs.HostNameFilter())
root_log.addHandler(handler)

logger = logging.getLogger(__name__)
logger.info("hello!")
logger.warning("warn!")

df = dl.shape_pps_data(dl.load_corn_rows_mssql())

# ML: hold out 20% for validation, scale features, fit a neural-net regressor
areas = df.pop('Area')
y = df['Dry_Yield']
X = df.drop(['Dry_Yield'], axis=1)
X_train, X_validation, y_train, y_validation = \
    train_test_split(X, y, test_size=.2, random_state=7)

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

nn = MLPRegressor(random_state=7, verbose=99, max_iter=5000)
nn.fit(X_train_scaled, y_train)
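# Possible next step (not in the original snippet, sketched as an assumption): score the
# fitted MLPRegressor on the held-out validation split, reusing the scaler fit on training
# data only. Uses only names defined above and the standard sklearn score() API.
X_validation_scaled = scaler.transform(X_validation)
print(f'validation R^2: {nn.score(X_validation_scaled, y_validation):.4f}')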
import numpy
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler

from data_scripts import pcs_data_loader
from modeling import score_util

# load dataset
df = pcs_data_loader.shape_pps_data(pcs_data_loader.load_corn_rows_mssql())
df.drop(['Area'], axis=1, inplace=True)
y = df['Dry_Yield']
X = df.drop(['Dry_Yield'], axis=1)
X_train, X_validation, y_train, y_validation = \
    train_test_split(X, y, test_size=.3, random_state=7)

scaler = StandardScaler()
scaler.fit(X_train)


# define base model
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dropout(.1, input_shape=(len(X.columns),), seed=91))
    model.add(Dense(1024, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(.1, seed=71))
    model.add(Dense(256, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(.1, seed=51))
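    # --- assumed completion: the snippet above is truncated mid-function, so the output
    # layer, compile call, and the KerasRegressor/KFold wiring below are a sketch suggested
    # by the otherwise-unused imports, not the author's exact code; epochs, batch_size, and
    # the 5-fold split are placeholder guesses ---
    model.add(Dense(1, kernel_initializer='normal'))  # single output: Dry_Yield
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model


from sklearn.model_selection import cross_val_score

estimator = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=256, verbose=1)
kfold = KFold(n_splits=5, shuffle=True, random_state=7)
results = cross_val_score(estimator, scaler.transform(X_train), y_train,
                          cv=kfold, scoring='neg_mean_squared_error')
print(f'MSE: {-results.mean():.2f} ({results.std():.2f})')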
pps = gis_repo.processed_layer_shapes_by_year_id(year_id)

# get pps cells that have an elb
pps_elb_cells = pandas.DataFrame(pps.loc[pps['geometry'].apply(
    lambda x: any(x.intersects(c) for c in elb_centroids))])
pps_elb_cells.drop(['geometry'], inplace=True, axis=1)

# load weather record and repeat it for every selected cell
wx = gis_repo.weather_by_year_id(year_id)
pps_elb_cells = pandas.concat([
    pps_elb_cells,
    pandas.DataFrame([wx.values] * len(pps_elb_cells),
                     index=pps_elb_cells.index, columns=wx.keys())
], axis=1)

pps_elb_cells = pcs_data_loader.shape_pps_data(pps_elb_cells)
pps_elb_cols = set(pps_elb_cells.columns.tolist())

# add missing elb columns needed for the model
missing_cols = train_cols - pps_elb_cols
pps_elb_cells: pandas.DataFrame = pandas.concat([
    pps_elb_cells,
    pandas.DataFrame(0, index=pps_elb_cells.index, columns=missing_cols)
], axis=1)

# remove any extra enum dummy columns in elb (that training isn't aware of)
elb_extra_cols = set(pps_elb_cells.columns) - train_cols
if elb_extra_cols:
    print(f"WARNING: ELB has unknown training enum (dummy) cols: "
          f"{','.join(elb_extra_cols)}")
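    # assumed continuation (the original snippet ends at the warning above): drop the dummy
    # columns the trained model has never seen; column order still has to match the
    # training columns before the frame is handed to the model
    pps_elb_cells = pps_elb_cells.drop(columns=list(elb_extra_cols))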
import datetime as dt
import zipfile

from data_scripts import pcs_data_loader as dl

df_raw = dl.load_corn_rows_mssql()

dump_base_path_ = f'./_temp/out/raw_{dt.datetime.now():%Y%m%d_%H%M}'

# dump the raw rows as gzip-compressed csv, then wrap that file in a zip archive
df_raw.to_csv(dump_base_path_ + '.csv', compression='gzip')
with zipfile.ZipFile(f'{dump_base_path_}.csv.zip', mode="w",
                     compression=zipfile.ZIP_DEFLATED) as z:
    z.write(dump_base_path_ + ".csv",
            arcname=f'pcs_ml_raw_dump_{dt.datetime.now():%Y-%m-%d}.csv')

# dump the raw rows as pickle and hdf5 as well
df_raw.to_pickle(f'{dump_base_path_}.pickle')
df_raw.to_hdf(f'{dump_base_path_}.hdf5', key='_')

df = dl.shape_pps_data(df_raw)

# zip the raw pickle, then overwrite it with a gzip pickle of the shaped frame
with zipfile.ZipFile(dump_base_path_ + "_pickle.zip", mode="w",
                     compression=zipfile.ZIP_DEFLATED) as z:
    z.write(dump_base_path_ + ".pickle")
df.to_pickle(dump_base_path_ + ".pickle", compression='gzip')

##
# Reading from gzip pickle ~11 sec, dumping ~1 min, filesize ~50MB
# Reading from pickle ~3 sec, dumping ~14 sec, filesize ~2.5GB
#
# shaping df_raw ~1 min
##
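# Sketch of the read side the timing notes above refer to; presumably the
# pcs_data_loader.load_corn_rows_pickle_gz() used elsewhere wraps a read like this.
import pandas

df = pandas.read_pickle(dump_base_path_ + '.pickle', compression='gzip')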
import pickle
from datetime import datetime

import pandas
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler

from data_scripts import pcs_data_loader

# args_: command-line arguments parsed earlier in the script
use_full_df = args_.use_full_dataframe
result_base_path: str = args_.result_base_path
n_jobs: int = args_.n_jobs
n_estimators = args_.n_estimators

run_id = f'{datetime.now():%Y%m%d%H%M}'
print(f'running....: {run_id}')
print(args_)
print(f'use_full_dataframe: {use_full_df}')

# load training data and train et model
train_df: pandas.DataFrame
if not use_full_df:
    train_df = pcs_data_loader.load_corn_rows_sample_shaped_pickle_gz()
else:
    train_df = pcs_data_loader.shape_pps_data(pcs_data_loader.load_corn_rows_pickle_gz())

y = train_df['Dry_Yield']
X = train_df.drop(['Dry_Yield', 'Area'], axis=1)
scaler = StandardScaler()
scaler.fit(X)

print('fitting model')
model = ExtraTreesRegressor(n_jobs=n_jobs, n_estimators=n_estimators, verbose=99)
model.fit(scaler.transform(X), y)

model_path_ = f'{result_base_path}/et_model_{run_id}.pickle'
with open(model_path_, 'wb') as f:
    pickle.dump(model, f)
print(f'model saved: {model_path_}')
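# Hedged sketch of loading the pickled model again later; note the StandardScaler fit above
# is not persisted by this script, so anything scoring new data would need to refit it or
# save it alongside the model.
with open(model_path_, 'rb') as f:
    loaded_model = pickle.load(f)
predictions = loaded_model.predict(scaler.transform(X))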