Example #1
import logging
import sys

import coloredlogs
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

from data_scripts import pcs_data_loader as dl

# replace any existing root handlers with a single colored stdout handler
root_log = logging.getLogger()
for h in list(root_log.handlers):  # iterate over a copy: removing while iterating skips entries
    root_log.removeHandler(h)
handler = logging.StreamHandler(stream=sys.stdout)

handler.setFormatter(coloredlogs.ColoredFormatter())
handler.addFilter(coloredlogs.HostNameFilter())
root_log.addHandler(handler)

logger = logging.getLogger(__name__)

logger.info("hello!")
logger.warning("warn!")

df = dl.shape_pps_data(dl.load_corn_rows_mssql())

# ML
areas = df.pop('Area')
y = df['Dry_Yield']
X = df.drop(['Dry_Yield'], axis=1)
X_train, X_validation, y_train, y_validation = \
    train_test_split(X, y, test_size=.2, random_state=7)

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

nn = MLPRegressor(random_state=7, verbose=99, max_iter=5000)
nn.fit(X_train_scaled, y_train)
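A natural follow-up, not shown in the original example, is to score the fitted network on the held-out 20% split; the validation features must pass through the same fitted scaler:

X_validation_scaled = scaler.transform(X_validation)
print(f'validation R^2: {nn.score(X_validation_scaled, y_validation):.3f}')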
Example #2
import numpy
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler

from data_scripts import pcs_data_loader
from modeling import score_util

# load dataset
df = pcs_data_loader.shape_pps_data(pcs_data_loader.load_corn_rows_mssql())
df.drop(['Area'], axis=1, inplace=True)
y = df['Dry_Yield']
X = df.drop(['Dry_Yield'], axis=1)
X_train, X_validation, y_train, y_validation = \
    train_test_split(X, y, test_size=.3, random_state=7)
scaler = StandardScaler()
scaler.fit(X_train)


# define base model
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dropout(.1, input_shape=(len(X.columns), ), seed=91))
    model.add(Dense(1024, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(.1, seed=71))
    model.add(Dense(256, kernel_initializer='normal', activation='relu'))
    model.add(Dropout(.1, seed=51))
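The snippet is cut off inside baseline_model(). A plausible completion, assuming a single linear output unit and the usual MSE/Adam compile step (none of which appear in the original excerpt), plus the KerasRegressor wrapper the imports suggest:

    model.add(Dense(1, kernel_initializer='normal'))
    # compile with mean squared error, as is typical for regression
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model


# wrap the builder for the scikit-learn utilities imported above;
# epoch/batch values are illustrative, not from the original
estimator = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=256, verbose=1)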
Example #3
    # excerpt from inside a function: year_id, elb_centroids and train_cols
    # come from the enclosing scope (not shown here)
    pps = gis_repo.processed_layer_shapes_by_year_id(year_id)
    # get pps cells that have an elb
    pps_elb_cells = pandas.DataFrame(pps.loc[pps['geometry'].apply(
        lambda x: any(x.intersects(c) for c in elb_centroids))])
    pps_elb_cells.drop(['geometry'], inplace=True, axis=1)

    # load weather record
    wx = gis_repo.weather_by_year_id(year_id)
    pps_elb_cells = pandas.concat(
        [pps_elb_cells,
         pandas.DataFrame([wx.values], index=pps_elb_cells.index, columns=wx.keys())],
        axis=1)

    pps_elb_cells = pcs_data_loader.shape_pps_data(pps_elb_cells)
    pps_elb_cols = set(pps_elb_cells.columns.tolist())

    # add missing elb columns needed for the model
    missing_cols = train_cols - pps_elb_cols
    # newer pandas rejects a raw set for `columns`, so pass a list
    pps_elb_cells: pandas.DataFrame = pandas.concat(
        [pps_elb_cells,
         pandas.DataFrame(0, index=pps_elb_cells.index, columns=list(missing_cols))],
        axis=1)

    # remove any extra enum dummy columns in elb (that training isn't aware of)
    elb_extra_cols = set(pps_elb_cells.columns) - train_cols
    if any(elb_extra_cols):
        print(f"WARNING: ELB has unknown training enum (dummy) cols: {','.join(elb_extra_cols)}")
Example #4
import datetime as dt
import zipfile

from data_scripts import pcs_data_loader as dl

df_raw = dl.load_corn_rows_mssql()
dump_base_path_ = f'./_temp/out/raw_{dt.datetime.now():%Y%m%d_%H%M}'

df_raw.to_csv(dump_base_path_ + '.csv', compression='gzip')
with zipfile.ZipFile(f'{dump_base_path_}.csv.zip',
                     mode="w",
                     compression=zipfile.ZIP_DEFLATED) as z:
    z.write(dump_base_path_ + ".csv",
            arcname=f'pcs_ml_raw_dump_{dt.datetime.now():%Y-%m-%d}.csv')

df_raw.to_pickle(f'{dump_base_path_}.pickle')
df_raw.to_hdf(f'{dump_base_path_}.hdf5', key='_')

df = dl.shape_pps_data(df_raw)
with zipfile.ZipFile(dump_base_path_ + "_pickle.zip",
                     mode="w",
                     compression=zipfile.ZIP_DEFLATED) as z:
    z.write(dump_base_path_ + ".pickle")

df.to_pickle(dump_base_path_ + ".pickle", compression='gzip')

##
# Reading from gzip pickle ~11 sec, dumping ~1 min, filesize ~50MB
# Reading from pickle ~3 sec, dumping ~14 sec, filesize ~2.5GB
#
# shaping df_raw ~1 min
##
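Reading the shaped frame back (the ~11 s path noted above) is a single pandas call; compression must be passed explicitly because the file name does not end in .gz:

import pandas

df = pandas.read_pickle(dump_base_path_ + '.pickle', compression='gzip')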
Example #5
import pickle
from datetime import datetime

import pandas
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import StandardScaler

from data_scripts import pcs_data_loader

# args_ is the parsed command-line namespace (argument parsing not shown here)
use_full_df = args_.use_full_dataframe
result_base_path: str = args_.result_base_path
n_jobs: int = args_.n_jobs
n_estimators = args_.n_estimators

run_id = f'{datetime.now():%Y%m%d%H%M}'

print(f'running....: {run_id}')
print(args_)
print(f'use_full_dataframe: {use_full_df}')

train_df: pandas.DataFrame
if not use_full_df:
    train_df = pcs_data_loader.load_corn_rows_sample_shaped_pickle_gz()
else:
    train_df = pcs_data_loader.shape_pps_data(pcs_data_loader.load_corn_rows_pickle_gz())

# load training data and train et model
y = train_df['Dry_Yield']
X = train_df.drop(['Dry_Yield', 'Area'], axis=1)
scaler = StandardScaler()
scaler.fit(X)

print('fitting model')
model = ExtraTreesRegressor(n_jobs=n_jobs, n_estimators=n_estimators, verbose=99)
model.fit(scaler.transform(X), y)

model_path_ = f'{result_base_path}/et_model_{run_id}.pickle'
with open(model_path_, 'wb') as f:
    pickle.dump(model, f)
    print(f'model saved: {model_path_}')
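The script pickles the model but not the fitted scaler, even though any later prediction needs the same transform; a small companion dump (not in the original) could look like:

scaler_path_ = f'{result_base_path}/et_scaler_{run_id}.pickle'
with open(scaler_path_, 'wb') as f:
    pickle.dump(scaler, f)
    print(f'scaler saved: {scaler_path_}')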