def test_one_hot_encode(self):
    df = loader.load(date(2020, 1, 1), date(2020, 3, 1), geo,
                     [country_code, population])
    df = imputer.impute(df)
    df = encoders.OneHotEncoder(COUNTRY_NAME).fit_transform(df)
    self.assertIn(f"{COUNTRY_NAME}_ohe_Germany", df.columns)
    df.info()

def test_hash_encode(self):
    df = loader.load(date(2020, 1, 1), date(2020, 3, 1), geo,
                     [country_code, population])
    df = imputer.impute(df)
    df = encoders.HashEncoder(COUNTRY_NAME, 8).fit_transform(df)
    self.assertIn(f"{COUNTRY_NAME}_hash_0", df.columns)
    df.info()

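# Hedged sketch of what a hash encoder typically does, for reference: a
# single stable hash of the category value is bucketed into n_components
# indicator columns, so categories unseen at fit time still map to a bucket.
# This helper is illustrative only -- it is not this project's
# encoders.HashEncoder, and the "8" above is assumed to be the bucket count.
def hash_encode_sketch(df, column, n_components=8):
    import hashlib
    df = df.copy()
    # md5 gives the same digest across runs, unlike the builtin hash()
    buckets = df[column].astype(str).map(
        lambda v: int(hashlib.md5(v.encode()).hexdigest(), 16) % n_components)
    for i in range(n_components):
        df[f"{column}_hash_{i}"] = (buckets == i).astype(int)
    return df
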
def test_cyclical_encode(self):
    df = loader.load(date(2020, 1, 1), date(2020, 3, 1), geo,
                     [country_code, population])
    df = imputer.impute(df)
    # normalise day-of-week to [0, 1) before the sin/cos transform
    df[DAY_OF_WEEK] = df[DAY_OF_WEEK] / 7.0
    df = encoders.CyclicalEncoder(DAY_OF_WEEK).fit_transform(df)
    self.assertIn(f"{DAY_OF_WEEK}_sin", df.columns)
    self.assertIn(f"{DAY_OF_WEEK}_cos", df.columns)

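# Hedged sketch of the cyclical (sin/cos) encoding the test above exercises:
# the column is assumed to be pre-normalised to [0, 1) -- which is why the
# test divides DAY_OF_WEEK by 7.0 first -- and each value is projected onto
# the unit circle, so that the last and first day of the week end up adjacent
# in feature space. Illustrative only; not the project's CyclicalEncoder.
def cyclical_encode_sketch(df, column):
    import numpy as np
    df = df.copy()
    df[f"{column}_sin"] = np.sin(2.0 * np.pi * df[column])
    df[f"{column}_cos"] = np.cos(2.0 * np.pi * df[column])
    return df
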
def test_end_date_after_expansion_window(self):
    start_date = date(2020, 1, 10)
    end_date = date(2020, 1, 11)
    imputation_window_start_date = date(2020, 1, 1)
    imputation_window_end_date = date(2020, 1, 8)
    df = loader.load(start_date, end_date, imputation_window_start_date,
                     imputation_window_end_date, geo.module,
                     [country_code.module, working_day.module])
    df.info()
    print(df[[COUNTRY_CODE, DATE]].head(10))
    # verify that the date merging worked properly for the given date range
    self.assertEqual(df[DATE].min(), start_date)
    self.assertEqual(df[DATE].max(), end_date)

def test_loader(self):
    start_date = date(2020, 1, 1)
    end_date = date(2020, 12, 31)
    imputation_window_start_date = date(2020, 1, 1)
    imputation_window_end_date = date(2020, 12, 31)
    df = loader.load(
        start_date, end_date, imputation_window_start_date,
        imputation_window_end_date, geo.module, [
            country_code.module, continent.module, population.module,
            age_dist.module, temperatures.module, oxford.module,
            working_day.module
        ])
    # verify that the date merging worked properly for the given date range
    self.assertEqual(df[DATE].min(), start_date)
    self.assertEqual(df[DATE].max(), end_date)
    df.info()

def test_bloom_filter_encode(self):
    df = loader.load(date(2020, 1, 1), date(2020, 3, 1), geo,
                     [country_code, population])
    df = imputer.impute(df)
    df = encoders.BloomFilterEncoder(COUNTRY_NAME, 3, 31).fit_transform(df)
    self.assertIn(f"{COUNTRY_NAME}_bloom_0", df.columns)

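# Hedged sketch of the bloom-filter encoding idea: each category value is
# hashed with k independent hash functions into an m-bit array, and one 0/1
# column is emitted per bit. The (3, 31) arguments above presumably map to k
# and m, but that is an assumption; this helper is illustrative, not the
# project's encoders.BloomFilterEncoder.
def bloom_encode_sketch(df, column, num_hashes=3, num_bits=31):
    import hashlib
    df = df.copy()

    def bits(value):
        out = [0] * num_bits
        for seed in range(num_hashes):
            # seed the digest so the hash functions are independent
            digest = hashlib.md5(f"{seed}:{value}".encode()).hexdigest()
            out[int(digest, 16) % num_bits] = 1
        return out

    encoded = df[column].astype(str).map(bits)
    for i in range(num_bits):
        df[f"{column}_bloom_{i}"] = encoded.map(lambda b, i=i: b[i])
    return df
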
def test_polynomial_encode(self):
    df = loader.load(date(2020, 1, 1), date(2020, 3, 1), geo,
                     [country_code, population])
    df = imputer.impute(df)
    df = encoders.PolynomialEncoder(COUNTRY_NAME).fit_transform(df)
    self.assertIn(f"{COUNTRY_NAME}_poly_0", df.columns)

def test_helmert_contrast_encode(self):
    df = loader.load(date(2020, 1, 1), date(2020, 3, 1), geo,
                     [country_code, population])
    df = imputer.impute(df)
    df = encoders.HelmertContrastEncoder(COUNTRY_NAME).fit_transform(df)
    self.assertIn(f"{COUNTRY_NAME}_helmert_0", df.columns)

def test_backward_difference_encode(self):
    df = loader.load(date(2020, 1, 1), date(2020, 3, 1), geo,
                     [country_code, population])
    df = imputer.impute(df)
    df = encoders.BackwardDifferenceEncoder(COUNTRY_NAME).fit_transform(df)
    self.assertIn(f"{COUNTRY_NAME}_bde_0", df.columns)

def test_sum_encoder(self):
    df = loader.load(date(2020, 1, 1), date(2020, 12, 31), geo,
                     [country_code, population])
    df = imputer.impute(df)
    df = encoders.SumEncoder(COUNTRY_NAME).fit_transform(df)
    self.assertIn(f"{COUNTRY_NAME}_sum_0", df.columns)

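# The four tests above (polynomial, Helmert, backward difference, sum) all
# exercise classic contrast-coding schemes, which represent a k-level
# category as k-1 real-valued contrast columns rather than k indicators. For
# reference only: the open-source category_encoders package ships equivalent
# encoders; the sketch below shows that library's API, not this project's.
def contrast_coding_reference():
    import category_encoders as ce
    import pandas as pd
    frame = pd.DataFrame({'country': ['DE', 'FR', 'DE', 'IT']})
    # swap in ce.SumEncoder, ce.BackwardDifferenceEncoder, or
    # ce.PolynomialEncoder for the other schemes
    return ce.HelmertEncoder(cols=['country']).fit_transform(frame)
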
def test_sklearn_pipeline(self):
    # load the dataset
    start_date = date(2020, 1, 1)
    end_date = date(2021, 1, 3)
    imputation_window_start_date = date(2019, 1, 1)
    imputation_window_end_date = end_date
    df = loader.load(
        start_date, end_date, imputation_window_start_date,
        imputation_window_end_date, geo.module, [
            country_code.module, continent.module, population.module,
            age_distribution.module, temperatures.module,
            oxford_data.module, working_day.module
        ])

    # derive the label column and the moving-average features
    info('calculating label')
    df = df.groupby(GEO_CODE).apply(
        self.determine_new_cases).reset_index(drop=True)
    df = df.groupby(GEO_CODE).apply(lambda group: group_add_ma_a(
        group, PREDICTED_NEW_CASES)).reset_index(drop=True)
    df = df.groupby(GEO_CODE).apply(lambda group: group_add_ma_b(
        group, PREDICTED_NEW_CASES)).reset_index(drop=True)
    df = df.groupby(GEO_CODE).apply(lambda group: group_add_ma_c(
        group, PREDICTED_NEW_CASES)).reset_index(drop=True)
    df = df.groupby(GEO_CODE).apply(lambda group: group_add_ma_a(
        group, CONFIRMED_CASES)).reset_index(drop=True)
    df = df.groupby(GEO_CODE).apply(lambda group: group_add_ma_b(
        group, CONFIRMED_CASES)).reset_index(drop=True)
    df = df.groupby(GEO_CODE).apply(lambda group: group_add_ma_c(
        group, CONFIRMED_CASES)).reset_index(drop=True)
    df = df.groupby(GEO_CODE).apply(
        add_working_day_tomorrow).reset_index(drop=True)
    df = df.groupby(GEO_CODE).apply(
        add_working_day_yesterday).reset_index(drop=True)

    # move the label to the front
    df = transform_column_order(df)

    # create the pipeline
    info('creating pipeline')
    pipeline = Pipeline([
        ('features', FeatureUnion([
            # geographic location
            nominal(CONTINENT),
            nominal(COUNTRY_CODE),
            # It is important to include the geo code to help distinguish
            # between countries that have no regions; otherwise, if we only
            # used a region code, all such countries would share the same 0
            # value. This way we make it explicit, rather than relying on
            # the algorithm to derive it from the country + region combo.
            nominal(GEO_CODE),
            # case information
            numeric(PREDICTED_NEW_CASES + SUFFIX_MA_A),
            numeric(PREDICTED_NEW_CASES + SUFFIX_MA_B),
            numeric(PREDICTED_NEW_CASES + SUFFIX_MA_C),
            numeric(CONFIRMED_CASES),
            numeric(CONFIRMED_CASES + SUFFIX_MA_A),
            numeric(CONFIRMED_CASES + SUFFIX_MA_B),
            numeric(CONFIRMED_CASES + SUFFIX_MA_C),
            # non-pharmaceutical interventions
            numeric(C1),
            numeric(C2),
            numeric(C3),
            numeric(C4),
            numeric(C5),
            numeric(C6),
            numeric(C7),
            numeric(C8),
            numeric(H1),
            numeric(H2),
            numeric(H3),
            numeric(H6),
            # country and regional information
            numeric(AGE_DISTRIBUTION_1),
            numeric(AGE_DISTRIBUTION_2),
            numeric(AGE_DISTRIBUTION_3),
            numeric(AGE_DISTRIBUTION_4),
            numeric(AGE_DISTRIBUTION_5),
            numeric(GDP_PER_CAPITA),
            numeric(OBESITY_RATE),
            numeric(POPULATION),
            numeric(POPULATION_DENSITY),
            numeric(POPULATION_PERCENT_URBAN),
            numeric(PNEUMONIA_DEATHS_PER_100K),
            numeric(SPECIFIC_HUMIDITY),
            numeric(TEMPERATURE),
            numeric(WORKING_DAY),
            numeric(WORKING_DAY + '_tomorrow'),
            numeric(WORKING_DAY + '_yesterday'),
            # date/time fields
            # numeric(DATE),
            # numeric(DAY_OF_MONTH),
            nominal(DAY_OF_WEEK),
            numeric(DAY_OF_YEAR),
            # numeric(WEEK),
            # numeric(MONTH),
            # numeric(QUARTER),
            # numeric(YEAR)
        ])),
        ('estimator', SGDRegressor(max_iter=10000,
                                   early_stopping=True,
                                   n_iter_no_change=2000,
                                   shuffle=True))
    ])

    # split our dataset
    info('getting train/val/test split')
    train, val, test = split(df, 30, 1)
    train_x, train_y, validation_x, validation_y, test_x, test_y = (
        train.iloc[:, 1:], train.iloc[:, :1], val.iloc[:, 1:],
        val.iloc[:, :1], test.iloc[:, 1:], test.iloc[:, :1])

    """
    Best so far: -1009.1139305621175
    {'estimator__alpha': 0.0004, 'estimator__epsilon': 0.0075,
     'estimator__learning_rate': 'adaptive',
     'estimator__loss': 'squared_epsilon_insensitive'}
    """
    parameters = {
        'estimator__alpha': [0.0003, 0.0004, 0.0006],
        'estimator__epsilon': [0.004, 0.008, 0.017],
        # full set: ['squared_loss', 'huber', 'epsilon_insensitive',
        # 'squared_epsilon_insensitive']
        'estimator__loss': ['huber', 'squared_epsilon_insensitive'],
        # full set: ['invscaling', 'adaptive', 'optimal', 'constant']
        'estimator__learning_rate': ['invscaling', 'adaptive']
    }
    grid = GridSearchCV(pipeline,
                        param_grid=parameters,
                        cv=5,
                        scoring='neg_root_mean_squared_error',
                        n_jobs=10,
                        verbose=10)
    grid.fit(train_x, train_y.values.ravel())

    print("score A = %3.2f" %
          grid.score(validation_x, validation_y.values.ravel()))
    print("score C = %3.2f" %
          grid.best_estimator_.score(validation_x,
                                     validation_y.values.ravel()))
    print(grid.best_score_)
    print(grid.best_params_)

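# The nominal()/numeric() helpers used in the FeatureUnion above presumably
# return (name, transformer) tuples that select a single column and then
# one-hot encode it (nominal) or scale it (numeric). Below is a minimal
# sketch of that pattern built from standard scikit-learn parts; the names
# and the exact preprocessing are assumptions, not this project's code.
def nominal_sketch(column):
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
    return (f"{column}_nominal", Pipeline([
        # double brackets keep the 2-D shape downstream transformers expect
        ('select', FunctionTransformer(lambda df: df[[column]])),
        ('encode', OneHotEncoder(handle_unknown='ignore')),
    ]))


def numeric_sketch(column):
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import FunctionTransformer, StandardScaler
    return (f"{column}_numeric", Pipeline([
        ('select', FunctionTransformer(lambda df: df[[column]])),
        ('scale', StandardScaler()),
    ]))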