"""
Tests for SimpleImputer hyperparameter optimization (fit_hpo).
"""
import os

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, mean_squared_error

from datawig import SimpleImputer
from datawig.utils import random_split


def test_hpo_numeric_best_pick(test_dir, data_frame):
    feature_col, label_col = "feature", "label"

    df = data_frame(feature_col=feature_col, label_col=label_col)
    df.loc[:, label_col] = np.random.randn(df.shape[0])

    imputer = SimpleImputer(input_columns=[feature_col],
                            output_column=label_col,
                            output_path=test_dir,
                            is_explainable=True)

    hps = {feature_col: {'max_tokens': [1, 2, 3]}}
    hps[feature_col]['tokens'] = [['chars']]

    imputer.fit_hpo(df, hps=hps)
    results = imputer.hpo.results

    max_tokens_of_encoder = imputer.imputer.data_encoders[0].vectorizer.max_features

    # the loaded model should be the hpo run with minimal MSE
    best_hpo_run = results['mse'].astype('float').idxmin()
    loaded_hpo_run = results.loc[
        results[feature_col + ':max_tokens'] == max_tokens_of_encoder].index[0]

    assert best_hpo_run == loaded_hpo_run
def test_hpo_many_columns(test_dir, data_frame):
    feature_col, label_col = "feature", "label"

    n_samples = 300
    num_labels = 3
    ncols = 10
    seq_len = 4

    # generate some random data
    df = data_frame(feature_col=feature_col,
                    label_col=label_col,
                    num_labels=num_labels,
                    num_words=seq_len,
                    n_samples=n_samples)

    # duplicate the feature column several times
    for col in range(ncols):
        df[feature_col + '_' + str(col)] = df[feature_col]

    imputer = SimpleImputer(
        input_columns=[col for col in df.columns if col != label_col],
        output_column=label_col,
        output_path=test_dir)

    imputer.fit_hpo(df, num_evals=2, num_epochs=10)

    assert imputer.hpo.results.precision_weighted.max() > .75
def test_hpo_runs(test_dir, data_frame):
    feature_col, label_col = "feature", "label"

    df = data_frame(feature_col=feature_col, label_col=label_col)

    imputer = SimpleImputer(
        input_columns=[col for col in df.columns if col != label_col],
        output_column=label_col,
        output_path=test_dir)

    hps = dict()
    max_tokens = [1024, 2048]
    hps[feature_col] = {'max_tokens': max_tokens}
    hps['global'] = {}
    hps['global']['concat_columns'] = [False]
    hps['global']['num_epochs'] = [10]

    imputer.fit_hpo(df,
                    hps=hps,
                    num_hash_bucket_candidates=[2 ** 15],
                    tokens_candidates=['words'])

    # only search over the specified parameter ranges
    assert set(imputer.hpo.results[feature_col + ':max_tokens'].unique().tolist()) == set(max_tokens)
    assert imputer.hpo.results.shape[0] == 2
def test_hpo_defaults(test_dir, data_frame):
    label_col = "label"

    n_samples = 500
    num_labels = 3
    seq_len = 10

    # generate some random data
    df = data_frame(feature_col="string_feature",
                    label_col=label_col,
                    num_labels=num_labels,
                    num_words=seq_len,
                    n_samples=n_samples)

    # add categorical feature
    df['categorical_feature'] = ['foo' if r > .5 else 'bar' for r in np.random.rand(n_samples)]

    # add numerical feature
    df['numeric_feature'] = np.random.rand(n_samples)

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(test_dir, "tmp", "real_data_experiment_text_hpo")

    imputer = SimpleImputer(
        input_columns=['string_feature', 'categorical_feature', 'numeric_feature'],
        output_column='label',
        output_path=output_path)

    imputer.fit_hpo(df_train, num_evals=10, num_epochs=5)

    assert imputer.hpo.results.precision_weighted.max() > .9
def test_hpo_many_string_columns(test_dir, data_frame):
    """
    Tests SimpleImputer HPO with many (duplicated) string input columns.
    Renamed from test_hpo_many_columns to avoid shadowing the test above.
    """
    label_col = "label"

    n_samples = 300
    num_labels = 3
    ncols = 10
    seq_len = 4

    # generate some random data
    df = data_frame(feature_col="string_feature",
                    label_col=label_col,
                    num_labels=num_labels,
                    num_words=seq_len,
                    n_samples=n_samples)

    for col in range(ncols):
        df['string_feature_' + str(col)] = df['string_feature']

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(test_dir, "tmp", "real_data_experiment_text_hpo")

    imputer = SimpleImputer(
        input_columns=[col for col in df.columns if col != label_col],
        output_column='label',
        output_path=output_path)

    imputer.fit_hpo(df_train, num_evals=2)

    assert imputer.hpo.results.precision_weighted.max() > .8
def test_hpo_mixed_hps_and_kwargs(test_dir, data_frame):
    feature_col, label_col = "feature", "label"

    df = data_frame(feature_col=feature_col, label_col=label_col)

    imputer = SimpleImputer(input_columns=[feature_col],
                            output_column=label_col,
                            output_path=test_dir)

    hps = {feature_col: {'max_tokens': [1024]}}

    imputer.fit_hpo(df, hps=hps, learning_rate_candidates=[0.1])

    assert imputer.hpo.results['global:learning_rate'].values[0] == 0.1
def test_hpo_num_evals_empty_hps(test_dir, data_frame):
    feature_col, label_col = "feature", "label"

    # generate some random data
    df = data_frame(feature_col=feature_col, label_col=label_col)

    imputer = SimpleImputer(
        input_columns=[col for col in df.columns if col != label_col],
        output_column=label_col,
        output_path=test_dir)

    num_evals = 2
    imputer.fit_hpo(df, num_evals=num_evals, num_epochs=10)

    assert imputer.hpo.results.shape[0] == num_evals
def test_hpo_num_evals_default_hps(test_dir, data_frame):
    """
    Renamed from test_hpo_num_evals_given_hps to avoid shadowing by the
    later test of that name; this variant passes no hps at all.
    """
    feature_col, label_col = "feature", "label"

    # generate some random data
    df = data_frame(feature_col=feature_col, label_col=label_col)

    # assert that num_evals is an upper bound on the number of hpo runs
    for num_evals in range(1, 3):
        imputer = SimpleImputer(
            input_columns=[col for col in df.columns if col != label_col],
            output_column=label_col,
            output_path=test_dir)

        imputer.fit_hpo(df, num_evals=num_evals, num_epochs=5)

        assert imputer.hpo.results.shape[0] == num_evals
def test_hpo_similar_input_col_mixed_types(test_dir, data_frame):
    feature_col, label_col = "feature", "label"
    numeric_col = "numeric_feature"
    categorical_col = "categorical_col"

    df = data_frame(feature_col=feature_col, label_col=label_col)
    df.loc[:, numeric_col] = np.random.randn(df.shape[0])
    # one random integer category per row; np.random.randint(df.shape[0])
    # would broadcast a single constant value to the whole column
    df.loc[:, categorical_col] = np.random.randint(0, 10, df.shape[0])

    imputer = SimpleImputer(
        input_columns=[feature_col, numeric_col, categorical_col],
        output_column=label_col,
        output_path=test_dir)

    imputer.fit_hpo(df, num_epochs=10)
def test_hpo_kwargs_only_support(test_dir, data_frame):
    feature_col, label_col = "feature", "label"
    numeric_col = "numeric_feature"

    df = data_frame(feature_col=feature_col, label_col=label_col)
    df.loc[:, numeric_col] = np.random.randn(df.shape[0])

    imputer = SimpleImputer(
        input_columns=[feature_col, numeric_col],
        output_column=label_col,
        output_path=test_dir)

    imputer.fit_hpo(
        df,
        num_epochs=1,
        patience=1,
        weight_decay=[0.001],
        batch_size=320,
        num_hash_bucket_candidates=[3],
        tokens_candidates=['words'],
        numeric_latent_dim_candidates=[1],
        numeric_hidden_layers_candidates=[1],
        final_fc_hidden_units=[[1]],
        learning_rate_candidates=[0.1],
        normalize_numeric=False)

    def assert_val(col, value):
        assert imputer.hpo.results[col].values[0] == value

    assert_val('global:num_epochs', 1)
    assert_val('global:patience', 1)
    assert_val('global:weight_decay', 0.001)
    assert_val('global:batch_size', 320)

    assert_val(feature_col + ':max_tokens', 3)
    assert_val(feature_col + ':tokens', ['words'])

    assert_val(numeric_col + ':numeric_latent_dim', 1)
    assert_val(numeric_col + ':numeric_hidden_layers', 1)

    assert_val('global:final_fc_hidden_units', [1])
    assert_val('global:learning_rate', 0.1)
def test_imputer_hpo_text(test_dir, data_frame):
    """
    Tests SimpleImputer HPO with text data and categorical imputations
    """
    feature_col = "string_feature"
    label_col = "label"

    n_samples = 1000
    num_labels = 3
    seq_len = 20

    # generate some random data
    df = data_frame(feature_col=feature_col,
                    label_col=label_col,
                    num_labels=num_labels,
                    num_words=seq_len,
                    n_samples=n_samples)

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(test_dir, "tmp", "experiment_text_hpo")

    imputer_string = SimpleImputer(
        input_columns=[feature_col],
        output_column=label_col,
        output_path=output_path)

    hps = dict()
    hps[feature_col] = {}
    hps[feature_col]['type'] = ['string']
    hps[feature_col]['tokens'] = [['words'], ['chars']]

    hps['global'] = {}
    hps['global']['final_fc_hidden_units'] = [[]]
    hps['global']['learning_rate'] = [1e-3]
    hps['global']['weight_decay'] = [0]
    hps['global']['num_epochs'] = [30]

    imputer_string.fit_hpo(df_train, hps=hps, num_epochs=10, num_evals=3)

    assert max(imputer_string.hpo.results['f1_micro']) > 0.7
def test_hpo_num_evals_given_hps(test_dir, data_frame):
    feature_col, label_col = "feature", "label"

    # generate some random data
    df = data_frame(feature_col=feature_col, label_col=label_col)

    num_evals = 2
    # assert that num_evals is an upper bound on the number of hpo runs
    for n_max_tokens_to_try in range(1, 5):
        imputer = SimpleImputer(
            input_columns=[col for col in df.columns if col != label_col],
            output_column=label_col,
            output_path=test_dir)

        hps = {feature_col: {'max_tokens': n_max_tokens_to_try * [10]}}
        imputer.fit_hpo(df, hps=hps, num_evals=num_evals)

        assert imputer.hpo.results.shape[0] == min(num_evals, n_max_tokens_to_try)
def test_hpo_single_column_encoder_parameter(test_dir, data_frame):
    feature_col, label_col = "feature", "label"

    df = data_frame(feature_col=feature_col, label_col=label_col)

    imputer = SimpleImputer(
        input_columns=[col for col in df.columns if col != label_col],
        output_column=label_col,
        output_path=test_dir,
        is_explainable=True)

    hps = dict()
    hps[feature_col] = {'max_tokens': [1024]}
    hps['global'] = {}
    hps['global']['num_epochs'] = [10]

    imputer.fit_hpo(df, hps=hps)

    assert imputer.hpo.results.shape[0] == 2
    assert imputer.imputer.data_encoders[0].vectorizer.max_features == 1024
def test_hpo_multiple_columns_only_one_used(test_dir, data_frame):
    feature_col, label_col = "feature", "label"

    df = data_frame(feature_col=feature_col, label_col=label_col)
    df.loc[:, feature_col + '_2'] = df.loc[:, feature_col]

    imputer = SimpleImputer(input_columns=[feature_col],
                            output_column=label_col,
                            output_path=test_dir,
                            is_explainable=True)

    hps = dict()
    hps[feature_col] = {'max_tokens': [1024]}
    hps['global'] = {}
    hps['global']['num_epochs'] = [10]

    imputer.fit_hpo(df, hps=hps)

    assert imputer.hpo.results.shape[0] == 1
    assert imputer.imputer.data_encoders[0].vectorizer.max_features == 1024
def test_imputer_hpo_numeric(test_dir):
    """
    Tests SimpleImputer HPO for numeric data/imputation
    """
    N = 200
    numeric_data = np.random.uniform(-np.pi, np.pi, (N,))
    df = pd.DataFrame({
        'x': numeric_data,
        '**2': numeric_data ** 2 + np.random.normal(0, .1, (N,)),
    })

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(test_dir, "tmp", "experiment_numeric_hpo")

    imputer_numeric = SimpleImputer(input_columns=['x'],
                                    output_column="**2",
                                    output_path=output_path)

    feature_col = 'x'

    hps = {}
    hps[feature_col] = {}
    hps[feature_col]['type'] = ['numeric']
    hps[feature_col]['numeric_latent_dim'] = [30]
    hps[feature_col]['numeric_hidden_layers'] = [1]

    hps['global'] = {}
    hps['global']['final_fc_hidden_units'] = [[]]
    hps['global']['learning_rate'] = [1e-3, 1e-4]
    hps['global']['weight_decay'] = [0]
    hps['global']['num_epochs'] = [200]
    hps['global']['patience'] = [100]
    hps['global']['concat_columns'] = [False]

    imputer_numeric.fit_hpo(df_train, hps=hps)
    results = imputer_numeric.hpo.results

    # the best hpo run should achieve a low mean squared error
    assert min(results['mse']) < .3
def test_imputer_hpo_text_kwargs(test_dir, data_frame):
    """
    Tests SimpleImputer HPO with text data and categorical imputations,
    passing candidate hyperparameters as keyword arguments.
    Renamed from test_imputer_hpo_text to avoid shadowing the hps-based test.
    """
    feature_col = "string_feature"
    label_col = "label"

    n_samples = 1000
    num_labels = 3
    seq_len = 20

    # generate some random data
    df = data_frame(feature_col=feature_col,
                    label_col=label_col,
                    num_labels=num_labels,
                    num_words=seq_len,
                    n_samples=n_samples)

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(test_dir, "tmp", "real_data_experiment_text_hpo")

    imputer_string = SimpleImputer(
        input_columns=[feature_col],
        output_column=label_col,
        output_path=output_path)

    imputer_string.fit_hpo(
        train_df=df_train,
        num_epochs=100,
        patience=3,
        num_hash_bucket_candidates=[2 ** 10, 2 ** 15],
        tokens_candidates=['words'],
        numeric_latent_dim_candidates=[10],
        hpo_max_train_samples=1000)

    imputer_string.predict(df_test, inplace=True)

    assert f1_score(df_test[label_col],
                    df_test[label_col + '_imputed'],
                    average="weighted") > .7
def test_imputer_hpo_numeric_kwargs(test_dir):
    """
    Tests SimpleImputer HPO for numeric data/imputation,
    passing candidate hyperparameters as keyword arguments.
    Renamed from test_imputer_hpo_numeric to avoid shadowing the hps-based test.
    """
    N = 200
    numeric_data = np.random.uniform(-np.pi, np.pi, (N,))
    df = pd.DataFrame({
        'x': numeric_data,
        '**2': numeric_data ** 2 + np.random.normal(0, .1, (N,)),
    })

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(test_dir, "tmp", "real_data_experiment_numeric_hpo")

    imputer_numeric = SimpleImputer(
        input_columns=['x'],
        output_column="**2",
        output_path=output_path)

    imputer_numeric.fit_hpo(
        train_df=df_train,
        learning_rate=1e-3,
        num_epochs=100,
        patience=10,
        num_hash_bucket_candidates=[2 ** 10],
        tokens_candidates=['words'],
        numeric_latent_dim_candidates=[10, 50, 100],
        numeric_hidden_layers_candidates=[1, 2])

    imputer_numeric.predict(df_test, inplace=True)

    assert mean_squared_error(df_test['**2'], df_test['**2_imputed']) < 1.0
def test_hpo_all_input_types(test_dir, data_frame):
    """
    Tests SimpleImputer HPO over string, categorical, and numeric input
    columns, with user-defined score functions and a run-time budget
    """
    label_col = "label"

    n_samples = 1000
    num_labels = 3
    seq_len = 12

    # generate some random data
    df = data_frame(feature_col="string_feature",
                    label_col=label_col,
                    num_labels=num_labels,
                    num_words=seq_len,
                    n_samples=n_samples)

    # add categorical feature
    df['categorical_feature'] = ['foo' if r > .5 else 'bar' for r in np.random.rand(n_samples)]

    # add numerical feature
    df['numeric_feature'] = np.random.rand(n_samples)

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(test_dir, "tmp", "real_data_experiment_text_hpo")

    imputer = SimpleImputer(
        input_columns=['string_feature', 'categorical_feature', 'numeric_feature'],
        output_column='label',
        output_path=output_path)

    # define hyperparameter choices for each column type (string, categorical, numeric)
    hps = dict()
    hps['global'] = {}
    hps['global']['learning_rate'] = [3e-4]
    hps['global']['weight_decay'] = [1e-8]
    hps['global']['num_epochs'] = [5, 50]
    hps['global']['patience'] = [5]
    hps['global']['batch_size'] = [16]
    hps['global']['final_fc_hidden_units'] = [[]]
    hps['global']['concat_columns'] = [True, False]

    hps['string_feature'] = {}
    hps['string_feature']['max_tokens'] = [2 ** 15]
    hps['string_feature']['tokens'] = [['words', 'chars']]
    hps['string_feature']['ngram_range'] = {}
    hps['string_feature']['ngram_range']['words'] = [(1, 4), (2, 5)]
    hps['string_feature']['ngram_range']['chars'] = [(2, 4), (3, 5)]

    hps['categorical_feature'] = {}
    hps['categorical_feature']['type'] = ['categorical']
    hps['categorical_feature']['max_tokens'] = [2 ** 15]
    hps['categorical_feature']['embed_dim'] = [10]

    hps['numeric_feature'] = {}
    hps['numeric_feature']['normalize'] = [True]
    hps['numeric_feature']['numeric_latent_dim'] = [10]
    hps['numeric_feature']['numeric_hidden_layers'] = [1]

    # user-defined score functions for hyperparameter evaluation
    def calibration_check(true, predicted, confidence):
        """
        Expects kwargs: true, predicted, confidence.
        Computes a calibration sanity check: accuracy among
        high-confidence predictions.
        """
        return (np.mean(true[confidence > .9] == predicted[confidence > .9]),
                np.mean(true[confidence > .5] == predicted[confidence > .5]))

    def coverage_check(true, predicted, confidence):
        return np.mean(confidence > .9)

    uds = [(calibration_check, 'calibration check'),
           (coverage_check, 'coverage at 90')]

    imputer.fit_hpo(df_train,
                    hps=hps,
                    user_defined_scores=uds,
                    num_evals=5,
                    hpo_run_name='test1_')

    # a second run with a very tight time budget should stop almost immediately
    imputer.fit_hpo(df_train,
                    hps=hps,
                    user_defined_scores=uds,
                    num_evals=5,
                    hpo_run_name='test2_',
                    max_running_hours=1 / 3600)

    results = imputer.hpo.results

    # training for more epochs should yield a better model
    assert results[results['global:num_epochs'] == 50]['f1_micro'].iloc[0] > \
           results[results['global:num_epochs'] == 5]['f1_micro'].iloc[0]
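# ---------------------------------------------------------------------------
# The tests above rely on two pytest fixtures, `test_dir` and `data_frame`,
# that normally live in the suite's conftest.py. The definitions below are a
# minimal sketch of what they could look like, not datawig's actual fixtures:
# the signature of `_data_frame` matches its usage above, but the word
# generation scheme is an illustrative assumption.
# ---------------------------------------------------------------------------
import random

import pytest


@pytest.fixture
def test_dir(tmpdir):
    # per-test output directory, backed by pytest's built-in tmpdir fixture
    return str(tmpdir.mkdir("output"))


@pytest.fixture
def data_frame():
    def _data_frame(feature_col="feature",
                    label_col="label",
                    num_labels=2,
                    num_words=5,
                    n_samples=200):
        # draw a label for each row and compose the feature string from a
        # label-specific vocabulary, so features carry signal about labels
        labels = ['label_{}'.format(i) for i in range(num_labels)]
        vocab = {label: ['{}_{}'.format(label, w) for w in range(num_words)]
                 for label in labels}
        rows = []
        for _ in range(n_samples):
            label = random.choice(labels)
            feature = ' '.join(random.choices(vocab[label], k=num_words))
            rows.append({feature_col: feature, label_col: label})
        return pd.DataFrame(rows)

    return _data_frame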