Example #1
 def test_one_hot_encode(self):
     df = loader.load(date(2020, 1, 1), date(2020, 3, 1), geo,
                      [country_code, population])
     df = imputer.impute(df)
     df = encoders.OneHotEncoder(COUNTRY_NAME).fit_transform(df)
     self.assertIn(f"{COUNTRY_NAME}_ohe_Germany", df.columns)
     df.info()
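
The assertion above pins down a naming convention rather than the encoder's internals. A minimal sketch of that convention, assuming the project's OneHotEncoder (not shown here) behaves like pandas.get_dummies with an "_ohe_" infix:

    import pandas as pd

    def one_hot_encode(df: pd.DataFrame, column: str) -> pd.DataFrame:
        # emit one indicator column per category, named "<column>_ohe_<value>"
        dummies = pd.get_dummies(df[column], prefix=f"{column}_ohe",
                                 prefix_sep="_")
        return pd.concat([df.drop(columns=[column]), dummies], axis=1)

    df = pd.DataFrame({"country_name": ["Germany", "France", "Germany"]})
    print(sorted(one_hot_encode(df, "country_name").columns))
    # ['country_name_ohe_France', 'country_name_ohe_Germany']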
Example #2
 def test_hash_encode(self):
     df = loader.load(date(2020, 1, 1), date(2020, 3, 1), geo,
                      [country_code, population])
     df = imputer.impute(df)
     df = encoders.HashEncoder(COUNTRY_NAME, 8).fit_transform(df)
     self.assertIn(f"{COUNTRY_NAME}_hash_0", df.columns)
     df.info()
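
HashEncoder(COUNTRY_NAME, 8) apparently maps each category into one of 8 buckets, producing "<column>_hash_0" through "<column>_hash_7". A hedged sketch, assuming a stable digest (hashlib rather than Python's per-process hash()) and one indicator column per bucket:

    import hashlib
    import pandas as pd

    def hash_encode(df: pd.DataFrame, column: str,
                    n_buckets: int) -> pd.DataFrame:
        def bucket(value) -> int:
            # stable across processes, unlike the built-in hash()
            digest = hashlib.md5(str(value).encode("utf-8")).hexdigest()
            return int(digest, 16) % n_buckets

        buckets = df[column].map(bucket)
        for i in range(n_buckets):
            df[f"{column}_hash_{i}"] = (buckets == i).astype(int)
        return df

Collisions are the price of the fixed width: unlike one-hot encoding, the column count stays at n_buckets no matter how many categories appear.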
Example #3
 def test_cyclical_encode(self):
     df = loader.load(date(2020, 1, 1), date(2020, 3, 1), geo,
                      [country_code, population])
     df = imputer.impute(df)
     df[DAY_OF_WEEK] = df[DAY_OF_WEEK] / 7.0
     df = encoders.CyclicalEncoder(DAY_OF_WEEK).fit_transform(df)
     self.assertIn(f"{DAY_OF_WEEK}_sin", df.columns)
     self.assertIn(f"{DAY_OF_WEEK}_cos", df.columns)
Example #4
 def test_end_date_after_expansion_window(self):
     start_date = date(2020, 1, 10)
     end_date = date(2020, 1, 11)
     imputation_window_start_date = date(2020, 1, 1)
     imputation_window_end_date = date(2020, 1, 8)
     df = loader.load(start_date, end_date, imputation_window_start_date,
                      imputation_window_end_date, geo.module,
                      [country_code.module, working_day.module])
     df.info()
     print(df[[COUNTRY_CODE, DATE]].head(10))
     # verify the returned frame covers exactly the requested date range
     self.assertEqual(df[DATE].min(), start_date)
     self.assertEqual(df[DATE].max(), end_date)
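
The point of this test is that the requested range ([2020-01-10, 2020-01-11]) extends past the imputation window ([2020-01-01, 2020-01-08]), yet the loader must still emit one row per date up to end_date. The loader's internals are not shown; a sketch of the geo-by-date scaffold the assertions imply, as an assumption:

    import pandas as pd
    from datetime import date

    def date_scaffold(geos, start: date, end: date) -> pd.DataFrame:
        # cross-join every geo with every day in the requested range; feature
        # frames would then be left-joined onto this scaffold, so DATE spans
        # exactly [start, end] regardless of the imputation window
        dates = pd.date_range(start, end, freq="D")
        index = pd.MultiIndex.from_product([geos, dates],
                                           names=["geo_code", "date"])
        return index.to_frame(index=False)

    print(date_scaffold(["DE", "FR"], date(2020, 1, 10), date(2020, 1, 11)))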
Example #5
 def test_loader(self):
     start_date = date(2020, 1, 1)
     end_date = date(2020, 12, 31)
     imputation_window_start_date = date(2020, 1, 1)
     imputation_window_end_date = date(2020, 12, 31)
     df = loader.load(
         start_date, end_date, imputation_window_start_date,
         imputation_window_end_date, geo.module, [
             country_code.module, continent.module, population.module,
             age_dist.module, temperatures.module, oxford.module,
             working_day.module
         ])
     # verify the returned frame covers exactly the requested date range
     self.assertEqual(df[DATE].min(), start_date)
     self.assertEqual(df[DATE].max(), end_date)
     df.info()
Example #6
 def test_bloom_filter_encode(self):
     df = loader.load(date(2020, 1, 1), date(2020, 3, 1), geo,
                      [country_code, population])
     df = imputer.impute(df)
     df = encoders.BloomFilterEncoder(COUNTRY_NAME, 3, 31).fit_transform(df)
     self.assertIn(f"{COUNTRY_NAME}_bloom_0", df.columns)
Example #7
 def test_polynomial_encode(self):
     df = loader.load(date(2020, 1, 1), date(2020, 3, 1), geo,
                      [country_code, population])
     df = imputer.impute(df)
     df = encoders.PolynomialEncoder(COUNTRY_NAME).fit_transform(df)
     self.assertIn(f"{COUNTRY_NAME}_poly_0", df.columns)
Example #8
 def test_helmert_contrast_encode(self):
     df = loader.load(date(2020, 1, 1), date(2020, 3, 1), geo,
                      [country_code, population])
     df = imputer.impute(df)
     df = encoders.HelmertContrastEncoder(COUNTRY_NAME).fit_transform(df)
     self.assertIn(f"{COUNTRY_NAME}_helmert_0", df.columns)
Example #9
 def test_backward_difference_encode(self):
     df = loader.load(date(2020, 1, 1), date(2020, 3, 1), geo,
                      [country_code, population])
     df = imputer.impute(df)
     df = encoders.BackwardDifferenceEncoder(COUNTRY_NAME).fit_transform(df)
     self.assertIn(f"{COUNTRY_NAME}_bde_0", df.columns)
Example #10
 def test_sum_encoder(self):
     df = loader.load(date(2020, 1, 1), date(2020, 12, 31), geo,
                      [country_code, population])
     df = imputer.impute(df)
     df = encoders.SumEncoder(COUNTRY_NAME).fit_transform(df)
     self.assertIn(f"{COUNTRY_NAME}_sum_0", df.columns)
Example #11
    def test_sklearn_pipeline(self):
        # load the dataset
        start_date = date(2020, 1, 1)
        end_date = date(2021, 1, 3)
        imputation_window_start_date = date(2019, 1, 1)
        imputation_window_end_date = end_date
        df = loader.load(
            start_date, end_date, imputation_window_start_date,
            imputation_window_end_date, geo.module, [
                country_code.module, continent.module, population.module,
                age_distribution.module, temperatures.module,
                oxford_data.module, working_day.module
            ])

        # derive the label column, then add moving-average features for the
        # label and for confirmed cases
        info('calculating label')
        df = df.groupby(GEO_CODE).apply(
            self.determine_new_cases).reset_index(drop=True)
        for column in (PREDICTED_NEW_CASES, CONFIRMED_CASES):
            for add_ma in (group_add_ma_a, group_add_ma_b, group_add_ma_c):
                df = df.groupby(GEO_CODE).apply(
                    lambda group: add_ma(group, column)).reset_index(drop=True)
        df = df.groupby(GEO_CODE).apply(
            add_working_day_tomorrow).reset_index(drop=True)
        df = df.groupby(GEO_CODE).apply(
            add_working_day_yesterday).reset_index(drop=True)

        # move the label to the front
        df = transform_column_order(df)

        # create the pipeline
        info('creating pipeline')
        pipeline = Pipeline([
            (
                'features',
                FeatureUnion([
                    # geographic location
                    nominal(CONTINENT),
                    nominal(COUNTRY_CODE),
                    # the geo code matters for distinguishing countries that
                    # have no regions: with only a region code, every such
                    # country would share the same zero value, so we make the
                    # distinction explicit rather than relying on the model to
                    # infer it from the country + region combination
                    nominal(GEO_CODE),
                    # case information
                    numeric(PREDICTED_NEW_CASES + SUFFIX_MA_A),
                    numeric(PREDICTED_NEW_CASES + SUFFIX_MA_B),
                    numeric(PREDICTED_NEW_CASES + SUFFIX_MA_C),
                    numeric(CONFIRMED_CASES),
                    numeric(CONFIRMED_CASES + SUFFIX_MA_A),
                    numeric(CONFIRMED_CASES + SUFFIX_MA_B),
                    numeric(CONFIRMED_CASES + SUFFIX_MA_C),
                    # non-pharmaceutical interventions
                    numeric(C1),
                    numeric(C2),
                    numeric(C3),
                    numeric(C4),
                    numeric(C5),
                    numeric(C6),
                    numeric(C7),
                    numeric(C8),
                    numeric(H1),
                    numeric(H2),
                    numeric(H3),
                    numeric(H6),
                    # country and regional information
                    numeric(AGE_DISTRIBUTION_1),
                    numeric(AGE_DISTRIBUTION_2),
                    numeric(AGE_DISTRIBUTION_3),
                    numeric(AGE_DISTRIBUTION_4),
                    numeric(AGE_DISTRIBUTION_5),
                    numeric(GDP_PER_CAPITA),
                    numeric(OBESITY_RATE),
                    numeric(POPULATION),
                    numeric(POPULATION_DENSITY),
                    numeric(POPULATION_PERCENT_URBAN),
                    numeric(PNEUMONIA_DEATHS_PER_100K),
                    numeric(SPECIFIC_HUMIDITY),
                    numeric(TEMPERATURE),
                    numeric(WORKING_DAY),
                    numeric(WORKING_DAY + '_tomorrow'),
                    numeric(WORKING_DAY + '_yesterday'),
                    # date/time fields
                    # numeric(DATE),
                    # numeric(DAY_OF_MONTH),
                    nominal(DAY_OF_WEEK),
                    numeric(DAY_OF_YEAR),
                    # numeric(WEEK),
                    # numeric(MONTH),
                    # numeric(QUARTER),
                    # numeric(YEAR)
                ])),
            ('estimator',
             SGDRegressor(max_iter=10000,
                          early_stopping=True,
                          n_iter_no_change=2000,
                          shuffle=True))
        ])

        info('getting train/val/test split')
        train, val, test = split(df, 30, 1)
        train_x, train_y, validation_x, validation_y, test_x, test_y = (
            train.iloc[:, 1:], train.iloc[:, :1], val.iloc[:, 1:],
            val.iloc[:, :1], test.iloc[:, 1:], test.iloc[:, :1])

        # best grid-search result so far:
        #     -1009.1139305621175
        #     {'estimator__alpha': 0.0004, 'estimator__epsilon': 0.0075,
        #      'estimator__learning_rate': 'adaptive',
        #      'estimator__loss': 'squared_epsilon_insensitive'}
        parameters = {
            'estimator__alpha': [0.0003, 0.0004, 0.0006],
            'estimator__epsilon': [0.004, 0.008, 0.017],
            'estimator__loss': ['huber', 'squared_epsilon_insensitive'],
            # ['squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
            'estimator__learning_rate':
            ['invscaling',
             'adaptive']  # ['invscaling', 'adaptive', 'optimal', 'constant']
        }
        grid = GridSearchCV(pipeline,
                            param_grid=parameters,
                            cv=5,
                            scoring='neg_root_mean_squared_error',
                            n_jobs=10,
                            verbose=10)
        grid.fit(train_x, train_y.values.ravel())
        print("score A = %3.2f" %
              (grid.score(validation_x, validation_y.values.ravel())))
        print("score C = %3.2f" % (grid.best_estimator_.score(
            validation_x, validation_y.values.ravel())))
        print(grid.best_score_)
        print(grid.best_params_)
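
The nominal() and numeric() helpers used to build the FeatureUnion are not shown in this example. A plausible reconstruction, and only an assumption: each returns a named (name, transformer) pair that selects a single column, one-hot encoding nominal columns and scaling numeric ones:

    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import OneHotEncoder, StandardScaler

    def nominal(column: str):
        # (name, transformer) pair for FeatureUnion: one-hot one column
        return (column, ColumnTransformer(
            [(column, OneHotEncoder(handle_unknown="ignore"), [column])]))

    def numeric(column: str):
        # (name, transformer) pair for FeatureUnion: scale one column
        return (column, ColumnTransformer(
            [(column, StandardScaler(), [column])]))

One detail worth knowing when reading the printed scores: with refit=True (the GridSearchCV default), grid.score and grid.best_estimator_.score evaluate the same refitted model, so the two validation scores should match.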