예제 #1
0
def test_hpo_numeric_best_pick(test_dir, data_frame):
    feature_col, label_col = "feature", "label"

    df = data_frame(feature_col=feature_col, label_col=label_col)

    df.loc[:, label_col] = np.random.randn(df.shape[0])

    imputer = SimpleImputer(input_columns=[feature_col],
                            output_column=label_col,
                            output_path=test_dir,
                            is_explainable=True)

    hps = {feature_col: {'max_tokens': [1, 2, 3]}}
    hps[feature_col]['tokens'] = [['chars']]

    imputer.fit_hpo(df, hps=hps)

    results = imputer.hpo.results

    max_tokens_of_encoder = imputer.imputer.data_encoders[
        0].vectorizer.max_features

    # model with minimal MSE
    best_hpo_run = imputer.hpo.results['mse'].astype('float').idxmin()
    loaded_hpo_run = results.loc[results[feature_col + ':max_tokens'] ==
                                 max_tokens_of_encoder].index[0]

    assert best_hpo_run == loaded_hpo_run
예제 #2
0
def test_hpo_many_columns(test_dir, data_frame):
    feature_col, label_col = "feature", "label"

    n_samples = 300
    num_labels = 3
    ncols = 10
    seq_len = 4

    # generate some random data
    df = data_frame(feature_col=feature_col,
                    label_col=label_col,
                    num_labels=num_labels,
                    num_words=seq_len,
                    n_samples=n_samples)

    for col in range(ncols):
        df[feature_col + '_' + str(col)] = df[feature_col]

    imputer = SimpleImputer(
        input_columns=[col for col in df.columns if not col in ['label']],
        output_column=label_col,
        output_path=test_dir)

    imputer.fit_hpo(df, num_evals=2, num_epochs=10)

    assert imputer.hpo.results.precision_weighted.max() > .75
예제 #3
0
def test_hpo_runs(test_dir, data_frame):
    feature_col, label_col = "feature", "label"

    df = data_frame(feature_col=feature_col, label_col=label_col)

    imputer = SimpleImputer(
        input_columns=[col for col in df.columns if col != label_col],
        output_column=label_col,
        output_path=test_dir)

    hps = dict()
    max_tokens = [1024, 2048]
    hps[feature_col] = {'max_tokens': max_tokens}
    hps['global'] = {}
    hps['global']['concat_columns'] = [False]
    hps['global']['num_epochs'] = [10]
    hps['global']['num_epochs'] = [10]
    hps['global']['num_epochs'] = [10]

    imputer.fit_hpo(df,
                    hps=hps,
                    num_hash_bucket_candidates=[2**15],
                    tokens_candidates=['words'])

    # only search over specified parameter ranges
    assert set(
        imputer.hpo.results[feature_col + ':' +
                            'max_tokens'].unique().tolist()) == set(max_tokens)
    assert imputer.hpo.results.shape[0] == 2
예제 #4
0
def test_hpo_defaults(test_dir, data_frame):
    label_col = "label"

    n_samples = 500
    num_labels = 3
    seq_len = 10

    # generate some random data
    df = data_frame(feature_col="string_feature",
                    label_col=label_col,
                    num_labels=num_labels,
                    num_words=seq_len,
                    n_samples=n_samples)

    # add categorical feature
    df['categorical_feature'] = [
        'foo' if r > .5 else 'bar' for r in np.random.rand(n_samples)
    ]

    # add numerical feature
    df['numeric_feature'] = np.random.rand(n_samples)

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(test_dir, "tmp",
                               "real_data_experiment_text_hpo")

    imputer = SimpleImputer(input_columns=[
        'string_feature', 'categorical_feature', 'numeric_feature'
    ],
                            output_column='label',
                            output_path=output_path)

    imputer.fit_hpo(df_train, num_evals=10, num_epochs=5)

    assert imputer.hpo.results.precision_weighted.max() > .9
예제 #5
0
def test_hpo_many_columns(test_dir, data_frame):
    """

    """
    label_col = "label"

    n_samples = 300
    num_labels = 3
    ncols = 10
    seq_len = 4

    # generate some random data
    df = data_frame(feature_col="string_feature",
                    label_col=label_col,
                    num_labels=num_labels,
                    num_words=seq_len,
                    n_samples=n_samples)

    for col in range(ncols):
        df['string_featur_' + str(col)] = df['string_feature']

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(test_dir, "tmp",
                               "real_data_experiment_text_hpo")

    imputer = SimpleImputer(
        input_columns=[col for col in df.columns if not col in ['label']],
        output_column='label',
        output_path=output_path)

    imputer.fit_hpo(df_train, num_evals=2)

    assert imputer.hpo.results.precision_weighted.max() > .8
예제 #6
0
def test_hpo_mixed_hps_and_kwargs(test_dir, data_frame):
    feature_col, label_col = "feature", "label"

    df = data_frame(feature_col=feature_col, label_col=label_col)

    imputer = SimpleImputer(input_columns=[feature_col],
                            output_column=label_col,
                            output_path=test_dir)

    hps = {feature_col: {'max_tokens': [1024]}}

    imputer.fit_hpo(df, hps=hps, learning_rate_candidates=[0.1])

    assert imputer.hpo.results['global:learning_rate'].values[0] == 0.1
예제 #7
0
def test_hpo_num_evals_empty_hps(test_dir, data_frame):
    feature_col, label_col = "feature", "label"

    # generate some random data
    df = data_frame(feature_col=feature_col, label_col=label_col)

    imputer = SimpleImputer(
        input_columns=[col for col in df.columns if col != label_col],
        output_column=label_col,
        output_path=test_dir)

    num_evals = 2
    imputer.fit_hpo(df, num_evals=num_evals, num_epochs=10)

    assert imputer.hpo.results.shape[0] == 2
예제 #8
0
def test_hpo_num_evals_given_hps(test_dir, data_frame):
    feature_col, label_col = "feature", "label"

    # generate some random data
    df = data_frame(feature_col=feature_col, label_col=label_col)

    # assert that num_evals is an upper bound on the number of hpo runs
    for num_evals in range(1, 3):
        imputer = SimpleImputer(
            input_columns=[col for col in df.columns if col != label_col],
            output_column=label_col,
            output_path=test_dir)

        imputer.fit_hpo(df, num_evals=num_evals, num_epochs=5)

        assert imputer.hpo.results.shape[0] == num_evals
예제 #9
0
def test_hpo_similar_input_col_mixed_types(test_dir, data_frame):
    feature_col, label_col = "feature", "label"
    numeric_col = "numeric_feature"
    categorical_col = "categorical_col"

    df = data_frame(feature_col=feature_col, label_col=label_col)

    df.loc[:, numeric_col] = np.random.randn(df.shape[0])
    df.loc[:, categorical_col] = np.random.randint(df.shape[0])

    imputer = SimpleImputer(
        input_columns=[feature_col, numeric_col, categorical_col],
        output_column=label_col,
        output_path=test_dir)

    imputer.fit_hpo(df, num_epochs=10)
예제 #10
0
def test_hpo_kwargs_only_support(test_dir, data_frame):
    feature_col, label_col = "feature", "label"
    numeric_col = "numeric_feature"

    df = data_frame(feature_col=feature_col,
                    label_col=label_col)

    df.loc[:, numeric_col] = np.random.randn(df.shape[0])

    imputer = SimpleImputer(
        input_columns=[feature_col, numeric_col],
        output_column=label_col,
        output_path=test_dir
    )

    imputer.fit_hpo(
        df,
        num_epochs=1,
        patience=1,
        weight_decay=[0.001],
        batch_size=320,
        num_hash_bucket_candidates=[3],
        tokens_candidates=['words'],
        numeric_latent_dim_candidates=[1],
        numeric_hidden_layers_candidates=[1],
        final_fc_hidden_units=[[1]],
        learning_rate_candidates=[0.1],
        normalize_numeric=False
    )

    def assert_val(col, value):
        assert imputer.hpo.results[col].values[0] == value

    assert_val('global:num_epochs', 1)
    assert_val('global:patience', 1)
    assert_val('global:weight_decay', 0.001)

    assert_val('global:batch_size', 320)
    assert_val(feature_col + ':max_tokens', 3)
    assert_val(feature_col + ':tokens', ['words'])

    assert_val(numeric_col + ':numeric_latent_dim', 1)
    assert_val(numeric_col + ':numeric_hidden_layers', 1)

    assert_val('global:final_fc_hidden_units', [1])
    assert_val('global:learning_rate', 0.1)
예제 #11
0
def test_imputer_hpo_text(test_dir, data_frame):
    """

    Tests SimpleImputer HPO with text data and categorical imputations

    """
    feature_col = "string_feature"
    label_col = "label"

    n_samples = 1000
    num_labels = 3
    seq_len = 20

    # generate some random data
    df = data_frame(feature_col=feature_col,
                    label_col=label_col,
                    num_labels=num_labels,
                    num_words=seq_len,
                    n_samples=n_samples)

    df_train, df_test = random_split(df, [.8, .2])

    output_path = os.path.join(test_dir, "tmp", "experiment_text_hpo")

    imputer_string = SimpleImputer(
        input_columns=[feature_col],
        output_column=label_col,
        output_path=output_path
    )

    hps = dict()
    hps[feature_col] = {}
    hps[feature_col]['type'] = ['string']
    hps[feature_col]['tokens'] = [['words'], ['chars']]

    hps['global'] = {}
    hps['global']['final_fc_hidden_units'] = [[]]
    hps['global']['learning_rate'] = [1e-3]
    hps['global']['weight_decay'] = [0]
    hps['global']['num_epochs'] = [30]

    imputer_string.fit_hpo(df_train, hps=hps, num_epochs=10, num_evals=3)

    assert max(imputer_string.hpo.results['f1_micro']) > 0.7
예제 #12
0
def test_hpo_num_evals_given_hps(test_dir, data_frame):
    feature_col, label_col = "feature", "label"

    # generate some random data
    df = data_frame(feature_col=feature_col, label_col=label_col)

    num_evals = 2
    # assert that num_evals is an upper bound on the number of hpo runs
    for n_max_tokens_to_try in range(1, 5):
        imputer = SimpleImputer(
            input_columns=[col for col in df.columns if col != label_col],
            output_column=label_col,
            output_path=test_dir)

        hps = {feature_col: {'max_tokens': n_max_tokens_to_try * [10]}}
        imputer.fit_hpo(df, hps=hps, num_evals=num_evals)

        assert imputer.hpo.results.shape[0] == min(num_evals,
                                                   n_max_tokens_to_try)
예제 #13
0
def test_hpo_single_column_encoder_parameter(test_dir, data_frame):
    feature_col, label_col = "feature", "label"

    df = data_frame(feature_col=feature_col, label_col=label_col)

    imputer = SimpleImputer(
        input_columns=[col for col in df.columns if col != label_col],
        output_column=label_col,
        output_path=test_dir,
        is_explainable=True)

    hps = dict()
    hps[feature_col] = {'max_tokens': [1024]}
    hps['global'] = {}
    hps['global']['num_epochs'] = [10]

    imputer.fit_hpo(df, hps=hps)

    assert imputer.hpo.results.shape[0] == 2
    assert imputer.imputer.data_encoders[0].vectorizer.max_features == 1024
예제 #14
0
def test_hpo_multiple_columns_only_one_used(test_dir, data_frame):
    feature_col, label_col = "feature", "label"

    df = data_frame(feature_col=feature_col, label_col=label_col)
    df.loc[:, feature_col + '_2'] = df.loc[:, feature_col]

    imputer = SimpleImputer(input_columns=[feature_col],
                            output_column=label_col,
                            output_path=test_dir,
                            is_explainable=True)

    hps = dict()
    hps[feature_col] = {'max_tokens': [1024]}
    hps['global'] = {}
    hps['global']['num_epochs'] = [10]

    imputer.fit_hpo(df, hps=hps)

    assert imputer.hpo.results.shape[0] == 1
    assert imputer.imputer.data_encoders[0].vectorizer.max_features == 1024
예제 #15
0
def test_imputer_hpo_numeric(test_dir):
    """

    Tests SimpleImputer HPO for numeric data/imputation

    """

    N = 200
    numeric_data = np.random.uniform(-np.pi, np.pi, (N, ))
    df = pd.DataFrame({
        'x': numeric_data,
        '**2': numeric_data**2 + np.random.normal(0, .1, (N, )),
    })

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(test_dir, "tmp", "experiment_numeric_hpo")

    imputer_numeric = SimpleImputer(input_columns=['x'],
                                    output_column="**2",
                                    output_path=output_path)

    feature_col = 'x'

    hps = {}
    hps[feature_col] = {}
    hps[feature_col]['type'] = ['numeric']
    hps[feature_col]['numeric_latent_dim'] = [30]
    hps[feature_col]['numeric_hidden_layers'] = [1]

    hps['global'] = {}
    hps['global']['final_fc_hidden_units'] = [[]]
    hps['global']['learning_rate'] = [1e-3, 1e-4]
    hps['global']['weight_decay'] = [0]
    hps['global']['num_epochs'] = [200]
    hps['global']['patience'] = [100]
    hps['global']['concat_columns'] = [False]

    imputer_numeric.fit_hpo(df_train, hps=hps)
    results = imputer_numeric.hpo.results

    assert results[results['mse'] == min(results['mse'])]['mse'].iloc[0] < .3
예제 #16
0
def test_imputer_hpo_text(test_dir, data_frame):
    """

    Tests SimpleImputer HPO with text data and categorical imputations

    """
    feature_col = "string_feature"
    label_col = "label"

    n_samples = 1000
    num_labels = 3
    seq_len = 20

    # generate some random data
    df = data_frame(feature_col=feature_col,
                                    label_col=label_col,
                                    num_labels=num_labels,
                                    num_words=seq_len,
                                    n_samples=n_samples)

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(test_dir, "tmp", "real_data_experiment_text_hpo")

    imputer_string = SimpleImputer(
        input_columns=[feature_col],
        output_column=label_col,
        output_path=output_path
    )
    imputer_string.fit_hpo(
        train_df=df_train,
        num_epochs=100,
        patience=3,
        num_hash_bucket_candidates=[2 ** 10, 2 ** 15],
        tokens_candidates=['words'],
        numeric_latent_dim_candidates=[10],
        hpo_max_train_samples=1000
    )

    imputer_string.predict(df_test, inplace=True)

    assert f1_score(df_test[label_col], df_test[label_col + '_imputed'], average="weighted") > .7
예제 #17
0
def test_imputer_hpo_numeric(test_dir):
    """

    Tests SimpleImputer HPO for numeric data/imputation

    """

    N = 200
    numeric_data = np.random.uniform(-np.pi, np.pi, (N,))
    df = pd.DataFrame({
        'x': numeric_data,
        '**2': numeric_data ** 2 + np.random.normal(0, .1, (N,)),
    })

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(test_dir, "tmp", "real_data_experiment_numeric_hpo")

    imputer_numeric = SimpleImputer(
        input_columns=['x'],
        output_column="**2",
        output_path=output_path
    )
    imputer_numeric.fit_hpo(
        train_df=df_train,
        learning_rate=1e-3,
        num_epochs=100,
        patience=10,
        num_hash_bucket_candidates=[2 ** 10],
        tokens_candidates=['words'],
        numeric_latent_dim_candidates=[10, 50, 100],
        numeric_hidden_layers_candidates=[1, 2]
    )

    imputer_numeric.predict(df_test, inplace=True)

    assert mean_squared_error(df_test['**2'], df_test['**2_imputed']) < 1.0
예제 #18
0
def test_hpo_all_input_types(test_dir, data_frame):
    """

    Using sklearn advantages: parallelism, distributions of parameters, multiple cross-validation

    """
    label_col = "label"

    n_samples = 1000
    num_labels = 3
    seq_len = 12

    # generate some random data
    df = data_frame(feature_col="string_feature",
                    label_col=label_col,
                    num_labels=num_labels,
                    num_words=seq_len,
                    n_samples=n_samples)

    # add categorical feature
    df['categorical_feature'] = [
        'foo' if r > .5 else 'bar' for r in np.random.rand(n_samples)
    ]

    # add numerical feature
    df['numeric_feature'] = np.random.rand(n_samples)

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(test_dir, "tmp",
                               "real_data_experiment_text_hpo")

    imputer = SimpleImputer(input_columns=[
        'string_feature', 'categorical_feature', 'numeric_feature'
    ],
                            output_column='label',
                            output_path=output_path)

    # Define default hyperparameter choices for each column type (string, categorical, numeric)
    hps = dict()
    hps['global'] = {}
    hps['global']['learning_rate'] = [3e-4]
    hps['global']['weight_decay'] = [1e-8]
    hps['global']['num_epochs'] = [5, 50]
    hps['global']['patience'] = [5]
    hps['global']['batch_size'] = [16]
    hps['global']['final_fc_hidden_units'] = [[]]
    hps['global']['concat_columns'] = [True, False]

    hps['string_feature'] = {}
    hps['string_feature']['max_tokens'] = [2**15]
    hps['string_feature']['tokens'] = [['words', 'chars']]
    hps['string_feature']['ngram_range'] = {}
    hps['string_feature']['ngram_range']['words'] = [(1, 4), (2, 5)]
    hps['string_feature']['ngram_range']['chars'] = [(2, 4), (3, 5)]

    hps['categorical_feature'] = {}
    hps['categorical_feature']['type'] = ['categorical']
    hps['categorical_feature']['max_tokens'] = [2**15]
    hps['categorical_feature']['embed_dim'] = [10]

    hps['numeric_feature'] = {}
    hps['numeric_feature']['normalize'] = [True]
    hps['numeric_feature']['numeric_latent_dim'] = [10]
    hps['numeric_feature']['numeric_hidden_layers'] = [1]

    # user defined score function for hyperparameters
    def calibration_check(true, predicted, confidence):
        """
        expect kwargs: true, predicted, confidence
        here we compute a calibration sanity check
        """
        return (np.mean(true[confidence > .9] == predicted[confidence > .9]),
                np.mean(true[confidence > .5] == predicted[confidence > .5]))

    def coverage_check(true, predicted, confidence):
        return np.mean(confidence > .9)

    uds = [(calibration_check, 'calibration check'),
           (coverage_check, 'coverage at 90')]

    imputer.fit_hpo(df_train,
                    hps=hps,
                    user_defined_scores=uds,
                    num_evals=5,
                    hpo_run_name='test1_')

    imputer.fit_hpo(df_train,
                    hps=hps,
                    user_defined_scores=uds,
                    num_evals=5,
                    hpo_run_name='test2_',
                    max_running_hours=1 / 3600)

    results = imputer.hpo.results

    assert results[results['global:num_epochs'] == 50]['f1_micro'].iloc[0] > \
           results[results['global:num_epochs'] == 5]['f1_micro'].iloc[0]