def test_hpo_many_columns(test_dir, data_frame):
    """HPO should still reach reasonable precision when the feature column
    is replicated many times.

    NOTE(review): a later definition in this file re-uses the name
    ``test_hpo_many_columns`` and shadows this one at collection time —
    confirm that both variants are intended to run.
    """
    feature_col, label_col = "feature", "label"
    n_samples, num_labels, ncols, seq_len = 300, 3, 10, 4

    # generate some random data
    df = data_frame(feature_col=feature_col,
                    label_col=label_col,
                    num_labels=num_labels,
                    num_words=seq_len,
                    n_samples=n_samples)

    # replicate the feature column ncols times
    for idx in range(ncols):
        df[feature_col + '_' + str(idx)] = df[feature_col]

    imputer = SimpleImputer(
        input_columns=[c for c in df.columns if not c in ['label']],
        output_column=label_col,
        output_path=test_dir)

    imputer.fit_hpo(df, num_evals=2, num_epochs=10)

    assert imputer.hpo.results.precision_weighted.max() > .75
def test_label_shift_weight_computation():
    """
    Tests that label shift detection can determine the label marginals
    of validation data.
    """
    train_proportion = [.7, .3]
    target_proportion = [.3, .7]

    data = synthetic_label_shift_simple(N=2000,
                                        label_proportions=train_proportion,
                                        error_proba=.1,
                                        covariates=['foo', 'bar'])
    # original train test splits
    tr, te = random_split(data, [.5, .5])

    # train domain classifier
    imputer = SimpleImputer(
        input_columns=['covariate'],
        output_column='label',
        output_path='/tmp/imputer_model')

    # Fit an imputer model on the train data (coo_imputed_proba, coo_imputed)
    imputer.fit(tr, te, num_epochs=15, learning_rate=3e-4, weight_decay=0)

    target_data = synthetic_label_shift_simple(1000,
                                               target_proportion,
                                               error_proba=.1,
                                               covariates=['foo', 'bar'])

    weights = imputer.check_for_label_shift(target_data)

    # compare the product of weights and training marginals
    # (i.e. estimated target marginals) with the true target marginals.
    # BUG FIX: the original asserted `weight*train_p - target_p < .1`, which is
    # vacuously true whenever the difference is negative; the deviation must
    # be bounded in absolute value.
    for weight, train_p, target_p in zip(list(weights.values()),
                                         train_proportion,
                                         target_proportion):
        assert abs(weight * train_p - target_p) < .1
def test_imputer_hpo_numeric():
    """
    Tests SimpleImputer HPO for numeric data/imputation

    NOTE(review): a later definition in this file re-uses the name
    ``test_imputer_hpo_numeric`` and shadows this one — confirm intent.
    """
    N = 200
    numeric_data = np.random.uniform(-np.pi, np.pi, (N, ))
    df = pd.DataFrame({
        'x': numeric_data,
        '**2': numeric_data**2 + np.random.normal(0, .1, (N, )),
    })

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(dir_path, "resources", "tmp",
                               "real_data_experiment_numeric_hpo")

    imputer_numeric = SimpleImputer(input_columns=['x'],
                                    output_column="**2",
                                    output_path=output_path).fit_hpo(
        train_df=df_train,
        learning_rate=1e-3,
        num_epochs=100,
        patience=10,
        num_hash_bucket_candidates=[2**10],
        tokens_candidates=['words'],
        latent_dim_candidates=[10, 50, 100],
        hidden_layers_candidates=[1, 2])

    # BUG FIX: predict() returns the imputed frame; the original discarded the
    # return value and then read '**2_imputed' from df_test, which only works
    # if predict mutates its argument in place.
    df_test = imputer_numeric.predict(df_test)

    assert mean_squared_error(df_test['**2'], df_test['**2_imputed']) < 1.0

    shutil.rmtree(output_path)
def test_hpo_runs(test_dir, data_frame):
    """HPO should only explore the user-specified parameter ranges and run
    one evaluation per candidate."""
    feature_col, label_col = "feature", "label"

    df = data_frame(feature_col=feature_col, label_col=label_col)

    imputer = SimpleImputer(
        input_columns=[col for col in df.columns if col != label_col],
        output_column=label_col,
        output_path=test_dir)

    max_tokens = [1024, 2048]
    # FIX: the original assigned hps['global']['num_epochs'] = [10] three
    # times in a row; a single assignment is equivalent.
    hps = {
        feature_col: {'max_tokens': max_tokens},
        'global': {
            'concat_columns': [False],
            'num_epochs': [10],
        },
    }

    imputer.fit_hpo(df,
                    hps=hps,
                    num_hash_bucket_candidates=[2**15],
                    tokens_candidates=['words'])

    # only search over specified parameter ranges
    assert set(
        imputer.hpo.results[feature_col + ':' + 'max_tokens'].unique().tolist()
    ) == set(max_tokens)
    # one run per max_tokens candidate
    assert imputer.hpo.results.shape[0] == 2
def test_hpo_many_columns(test_dir, data_frame):
    """HPO smoke test with many duplicated string feature columns."""
    label_col = "label"
    n_samples, num_labels, ncols, seq_len = 300, 3, 10, 4

    # generate some random data
    df = data_frame(feature_col="string_feature",
                    label_col=label_col,
                    num_labels=num_labels,
                    num_words=seq_len,
                    n_samples=n_samples)

    # duplicate the string feature a number of times
    for idx in range(ncols):
        df['string_featur_' + str(idx)] = df['string_feature']

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(test_dir, "tmp",
                               "real_data_experiment_text_hpo")

    imputer = SimpleImputer(
        input_columns=[c for c in df.columns if not c in ['label']],
        output_column='label',
        output_path=output_path)

    imputer.fit_hpo(df_train, num_evals=2)

    assert imputer.hpo.results.precision_weighted.max() > .8
def test_hpo_defaults(test_dir, data_frame):
    """fit_hpo with the default search space should reach high precision on
    mixed string/categorical/numeric inputs."""
    label_col = "label"
    n_samples, num_labels, seq_len = 500, 3, 10

    # generate some random data
    df = data_frame(feature_col="string_feature",
                    label_col=label_col,
                    num_labels=num_labels,
                    num_words=seq_len,
                    n_samples=n_samples)

    # add categorical feature
    df['categorical_feature'] = [
        'foo' if r > .5 else 'bar' for r in np.random.rand(n_samples)
    ]
    # add numerical feature
    df['numeric_feature'] = np.random.rand(n_samples)

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(test_dir, "tmp",
                               "real_data_experiment_text_hpo")

    imputer = SimpleImputer(
        input_columns=['string_feature', 'categorical_feature',
                       'numeric_feature'],
        output_column='label',
        output_path=output_path)

    imputer.fit_hpo(df_train, num_evals=10, num_epochs=5)

    assert imputer.hpo.results.precision_weighted.max() > .9
def test_hpo_numeric_best_pick(test_dir, data_frame):
    """After HPO the deployed model must correspond to the run with the
    minimal MSE."""
    feature_col, label_col = "feature", "label"

    df = data_frame(feature_col=feature_col, label_col=label_col)
    # turn the label into a continuous target
    df.loc[:, label_col] = np.random.randn(df.shape[0])

    imputer = SimpleImputer(input_columns=[feature_col],
                            output_column=label_col,
                            output_path=test_dir,
                            is_explainable=True)

    hps = {feature_col: {'max_tokens': [1, 2, 3],
                         'tokens': [['chars']]}}
    imputer.fit_hpo(df, hps=hps)

    results = imputer.hpo.results
    max_tokens_of_encoder = \
        imputer.imputer.data_encoders[0].vectorizer.max_features

    # run with minimal MSE
    best_hpo_run = results['mse'].astype('float').idxmin()
    # run whose max_tokens matches the encoder that was actually deployed
    loaded_hpo_run = results.loc[
        results[feature_col + ':max_tokens'] == max_tokens_of_encoder].index[0]

    assert best_hpo_run == loaded_hpo_run
def test_imputer_complete():
    """
    Tests the complete functionality of SimpleImputer

    :param data_frame:
    """
    feature_col = "string_feature"
    label_col = "label"
    feature_col_numeric = "numeric_feature"
    label_col_numeric = "numeric_label"

    num_samples = 1000
    num_labels = 3
    seq_len = 20
    missing_ratio = .1

    df_string = generate_df_string(num_labels=num_labels,
                                   num_words=seq_len,
                                   num_samples=num_samples,
                                   label_column_name=label_col,
                                   data_column_name=feature_col)

    df_numeric = generate_df_numeric(num_samples=num_samples,
                                     label_column_name=label_col_numeric,
                                     data_column_name=feature_col_numeric)

    df = pd.concat([
        df_string[[feature_col, label_col]],
        df_numeric[[feature_col_numeric, label_col_numeric]]
    ], ignore_index=True, axis=1)
    df.columns = [
        feature_col, label_col, feature_col_numeric, label_col_numeric
    ]

    # delete some entries
    # FIX: the original used chained indexing (df[col].iloc[missing] = nan),
    # which can assign to a temporary copy (SettingWithCopyWarning) and leave
    # df unchanged; .loc with a boolean mask writes through reliably.
    for col in df.columns:
        missing = np.random.random(len(df)) < missing_ratio
        df.loc[missing, col] = np.nan

    feature_col_missing = df[feature_col].isnull()
    label_col_missing = df[label_col].isnull()

    df = SimpleImputer.complete(data_frame=df)

    # input-only columns are left untouched by complete()
    assert all(df[feature_col].isnull() == feature_col_missing)
    # the label column should have fewer missing values after imputation
    assert df[label_col].isnull().sum() < label_col_missing.sum()
    # numeric columns are imputed completely
    assert df[feature_col_numeric].isnull().sum() == 0
    assert df[label_col_numeric].isnull().sum() == 0

    # completing with an explicit output path must also work
    df = SimpleImputer.complete(data_frame=df, output_path='some_path')
def test_hpo_mixed_hps_and_kwargs(test_dir, data_frame):
    """An hps dict and legacy *_candidates kwargs can be combined in a
    single fit_hpo call."""
    feature_col, label_col = "feature", "label"
    df = data_frame(feature_col=feature_col, label_col=label_col)

    imputer = SimpleImputer(input_columns=[feature_col],
                            output_column=label_col,
                            output_path=test_dir)

    imputer.fit_hpo(df,
                    hps={feature_col: {'max_tokens': [1024]}},
                    learning_rate_candidates=[0.1])

    # the kwarg-supplied learning rate must appear in the results table
    assert imputer.hpo.results['global:learning_rate'].values[0] == 0.1
def test_hpo_num_evals_empty_hps(test_dir, data_frame):
    """With no hps given, fit_hpo should run exactly num_evals evaluations."""
    feature_col, label_col = "feature", "label"

    # generate some random data
    df = data_frame(feature_col=feature_col, label_col=label_col)

    imputer = SimpleImputer(
        input_columns=[col for col in df.columns if col != label_col],
        output_column=label_col,
        output_path=test_dir)

    num_evals = 2
    imputer.fit_hpo(df, num_evals=num_evals, num_epochs=10)

    # FIX: assert against num_evals instead of repeating the literal 2 so the
    # assertion tracks the variable if it is ever changed.
    assert imputer.hpo.results.shape[0] == num_evals
def test_hpo_num_evals_given_hps(test_dir, data_frame):
    """num_evals bounds the number of HPO runs.

    NOTE(review): a later definition in this file re-uses the name
    ``test_hpo_num_evals_given_hps`` and shadows this one — confirm intent.
    """
    feature_col, label_col = "feature", "label"

    # generate some random data
    df = data_frame(feature_col=feature_col, label_col=label_col)

    # assert that num_evals is an upper bound on the number of hpo runs
    for num_evals in range(1, 3):
        imputer = SimpleImputer(
            input_columns=[c for c in df.columns if c != label_col],
            output_column=label_col,
            output_path=test_dir)
        imputer.fit_hpo(df, num_evals=num_evals, num_epochs=5)
        assert imputer.hpo.results.shape[0] == num_evals
def test_hpo_similar_input_col_mixed_types(test_dir, data_frame):
    """fit_hpo runs end-to-end with string, numeric and categorical inputs."""
    feature_col, label_col = "feature", "label"
    numeric_col = "numeric_feature"
    categorical_col = "categorical_col"

    df = data_frame(feature_col=feature_col, label_col=label_col)
    df.loc[:, numeric_col] = np.random.randn(df.shape[0])
    # NOTE(review): np.random.randint(df.shape[0]) returns a single scalar,
    # so this "categorical" column is constant — confirm whether a vector of
    # random integers was intended.
    df.loc[:, categorical_col] = np.random.randint(df.shape[0])

    imputer = SimpleImputer(
        input_columns=[feature_col, numeric_col, categorical_col],
        output_column=label_col,
        output_path=test_dir)

    # smoke test: must not raise
    imputer.fit_hpo(df, num_epochs=10)
def test_hpo_kwargs_only_support(test_dir, data_frame):
    """Every legacy keyword argument must land in the HPO results table."""
    feature_col, label_col = "feature", "label"
    numeric_col = "numeric_feature"

    df = data_frame(feature_col=feature_col, label_col=label_col)
    df.loc[:, numeric_col] = np.random.randn(df.shape[0])

    imputer = SimpleImputer(input_columns=[feature_col, numeric_col],
                            output_column=label_col,
                            output_path=test_dir)

    imputer.fit_hpo(df,
                    num_epochs=1,
                    patience=1,
                    weight_decay=[0.001],
                    batch_size=320,
                    num_hash_bucket_candidates=[3],
                    tokens_candidates=['words'],
                    numeric_latent_dim_candidates=[1],
                    numeric_hidden_layers_candidates=[1],
                    final_fc_hidden_units=[[1]],
                    learning_rate_candidates=[0.1],
                    normalize_numeric=False)

    # expected result-table entries, one per kwarg passed above
    expected = {
        'global:num_epochs': 1,
        'global:patience': 1,
        'global:weight_decay': 0.001,
        'global:batch_size': 320,
        feature_col + ':max_tokens': 3,
        feature_col + ':tokens': ['words'],
        numeric_col + ':numeric_latent_dim': 1,
        numeric_col + ':numeric_hidden_layers': 1,
        'global:final_fc_hidden_units': [1],
        'global:learning_rate': 0.1,
    }
    for column, value in expected.items():
        assert imputer.hpo.results[column].values[0] == value
def test_imputer_hpo_text(test_dir, data_frame):
    """
    Tests SimpleImputer HPO with text data and categorical imputations

    NOTE(review): a later definition in this file re-uses the name
    ``test_imputer_hpo_text`` and shadows this one — confirm intent.
    """
    feature_col = "string_feature"
    label_col = "label"
    n_samples, num_labels, seq_len = 1000, 3, 20

    # generate some random data
    df = data_frame(feature_col=feature_col,
                    label_col=label_col,
                    num_labels=num_labels,
                    num_words=seq_len,
                    n_samples=n_samples)

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(test_dir, "tmp", "experiment_text_hpo")

    imputer_string = SimpleImputer(input_columns=[feature_col],
                                   output_column=label_col,
                                   output_path=output_path)

    hps = {
        feature_col: {
            'type': ['string'],
            'tokens': [['words'], ['chars']],
        },
        'global': {
            'final_fc_hidden_units': [[]],
            'learning_rate': [1e-3],
            'weight_decay': [0],
            'num_epochs': [30],
        },
    }

    imputer_string.fit_hpo(df_train, hps=hps, num_epochs=10, num_evals=3)

    assert max(imputer_string.hpo.results['f1_micro']) > 0.7
def test_hpo_num_evals_given_hps(test_dir, data_frame):
    """num_evals caps the number of runs even when hps enumerate more
    candidates."""
    feature_col, label_col = "feature", "label"

    # generate some random data
    df = data_frame(feature_col=feature_col, label_col=label_col)

    num_evals = 2
    # assert that num_evals is an upper bound on the number of hpo runs
    for n_max_tokens_to_try in range(1, 5):
        imputer = SimpleImputer(
            input_columns=[c for c in df.columns if c != label_col],
            output_column=label_col,
            output_path=test_dir)
        hps = {feature_col: {'max_tokens': n_max_tokens_to_try * [10]}}
        imputer.fit_hpo(df, hps=hps, num_evals=num_evals)
        assert imputer.hpo.results.shape[0] == min(num_evals,
                                                   n_max_tokens_to_try)
def test_imputer_hpo_text():
    """
    Tests SimpleImputer HPO with text data and categorical imputations
    """
    feature_col = "string_feature"
    label_col = "label"

    n_samples = 1000
    num_labels = 3
    seq_len = 20

    # generate some random data
    df = generate_string_data_frame(feature_col=feature_col,
                                    label_col=label_col,
                                    num_labels=num_labels,
                                    num_words=seq_len,
                                    n_samples=n_samples)

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(dir_path, "resources", "tmp",
                               "real_data_experiment_text_hpo")

    imputer_string = SimpleImputer(
        input_columns=[feature_col],
        output_column=label_col,
        output_path=output_path).fit_hpo(
            train_df=df_train,
            num_epochs=100,
            patience=3,
            num_hash_bucket_candidates=[2**10, 2**15],
            tokens_candidates=['words'],
            latent_dim_candidates=[10],
            hpo_max_train_samples=1000)

    # BUG FIX: predict() returns the imputed frame; the original discarded the
    # return value and then read the '_imputed' column off df_test, which only
    # works if predict mutates its argument in place.
    df_test = imputer_string.predict(df_test)

    assert f1_score(df_test[label_col],
                    df_test[label_col + '_imputed'],
                    average="weighted") > .7

    shutil.rmtree(output_path)
def test_hpo_multiple_columns_only_one_used(test_dir, data_frame):
    """Columns outside input_columns must not influence the HPO search."""
    feature_col, label_col = "feature", "label"

    df = data_frame(feature_col=feature_col, label_col=label_col)
    # duplicate column that is deliberately NOT part of input_columns
    df.loc[:, feature_col + '_2'] = df.loc[:, feature_col]

    imputer = SimpleImputer(input_columns=[feature_col],
                            output_column=label_col,
                            output_path=test_dir,
                            is_explainable=True)

    hps = {feature_col: {'max_tokens': [1024]},
           'global': {'num_epochs': [10]}}

    imputer.fit_hpo(df, hps=hps)

    # a single candidate yields a single run
    assert imputer.hpo.results.shape[0] == 1
    # the hp value was propagated to the fitted encoder
    assert imputer.imputer.data_encoders[0].vectorizer.max_features == 1024
def test_hpo_single_column_encoder_parameter(test_dir, data_frame):
    """A per-column encoder parameter in hps must reach the fitted encoder."""
    feature_col, label_col = "feature", "label"

    df = data_frame(feature_col=feature_col, label_col=label_col)

    imputer = SimpleImputer(
        input_columns=[c for c in df.columns if c != label_col],
        output_column=label_col,
        output_path=test_dir,
        is_explainable=True)

    hps = {feature_col: {'max_tokens': [1024]},
           'global': {'num_epochs': [10]}}

    imputer.fit_hpo(df, hps=hps)

    assert imputer.hpo.results.shape[0] == 2
    assert imputer.imputer.data_encoders[0].vectorizer.max_features == 1024
def test_imputer_hpo_numeric(test_dir):
    """
    Tests SimpleImputer HPO for numeric data/imputation
    """
    N = 200
    numeric_data = np.random.uniform(-np.pi, np.pi, (N, ))
    df = pd.DataFrame({
        'x': numeric_data,
        '**2': numeric_data**2 + np.random.normal(0, .1, (N, )),
    })

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(test_dir, "tmp", "experiment_numeric_hpo")

    imputer_numeric = SimpleImputer(input_columns=['x'],
                                    output_column="**2",
                                    output_path=output_path)

    feature_col = 'x'
    hps = {
        feature_col: {
            'type': ['numeric'],
            'numeric_latent_dim': [30],
            'numeric_hidden_layers': [1],
        },
        'global': {
            'final_fc_hidden_units': [[]],
            'learning_rate': [1e-3, 1e-4],
            'weight_decay': [0],
            'num_epochs': [200],
            'patience': [100],
            'concat_columns': [False],
        },
    }

    imputer_numeric.fit_hpo(df_train, hps=hps)
    results = imputer_numeric.hpo.results

    # the best run must reach a low mean squared error
    assert results[results['mse'] == min(results['mse'])]['mse'].iloc[0] < .3
def test_imputer_categorical_heuristic(data_frame):
    """
    Tests the heuristic used for checking whether a column is categorical

    :param data_frame:
    """
    feature_col = "string_feature"
    label_col = "label"

    n_samples = 1000
    num_labels = 3
    seq_len = 20

    # generate some random data
    df = data_frame(feature_col=feature_col,
                    label_col=label_col,
                    num_labels=num_labels,
                    num_words=seq_len,
                    n_samples=n_samples)

    # FIX: avoid `== False` / `== True` comparisons (PEP 8 / flake8 E712);
    # assert truthiness directly.
    assert not SimpleImputer._is_categorical(df[feature_col])
    assert SimpleImputer._is_categorical(df[label_col])
def test_explainable_simple_imputer(test_dir, data_frame):
    """Explainable imputer uses a TfIdf encoder and its explain APIs work."""
    label_col = 'label'
    df = data_frame(n_samples=100, label_col=label_col)

    output_path = os.path.join(test_dir, "tmp")
    imputer = SimpleImputer(['features'],
                            label_col,
                            output_path=output_path,
                            is_explainable=True).fit(df)

    assert imputer.is_explainable
    assert isinstance(imputer.imputer.data_encoders[0],
                      column_encoders.TfIdfEncoder)

    # explain should not raise an exception
    _ = imputer.explain(df[label_col].unique()[0])

    # explain_instance should not raise an exception
    instance = pd.Series({'features': 'some feature text'})
    _ = imputer.explain_instance(instance)

    assert True
def test_hpo_explainable(test_dir, data_frame):
    """is_explainable selects the vectorizer class used for string columns."""
    from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer

    feature_col, label_col = "feature", "label"
    df = data_frame(feature_col=feature_col, label_col=label_col)

    cases = [(False, HashingVectorizer), (True, TfidfVectorizer)]
    for explainable, vectorizer in cases:
        imputer = SimpleImputer(input_columns=[feature_col],
                                output_column=label_col,
                                output_path=test_dir,
                                is_explainable=explainable).fit_hpo(df)
        assert isinstance(imputer.imputer.data_encoders[0].vectorizer,
                          vectorizer)
def test_imputer_image_hpo():
    """
    Tests SimpleImputer HPO with image data imputing a text column
    """
    img_path = os.path.join(dir_path, "resources", "test_images")
    os.makedirs(img_path, exist_ok=True)

    # create one solid-color test image per color
    colors = ['red', 'green', 'blue']
    for color in colors:
        create_test_image(os.path.join(img_path, color + ".png"), color)

    # labels are the colors themselves; image paths point at the test images
    n_samples = 32
    color_labels = [random.choice(colors) for _ in range(n_samples)]
    df = pd.DataFrame({"image_files": color_labels, "label": color_labels})
    df['image_files'] = img_path + "/" + df['image_files'] + ".png"

    output_path = os.path.join(dir_path, "resources", "tmp",
                               "experiment_images_hpo")

    # smoke test: HPO over image features must run without raising
    imputer_string = SimpleImputer(input_columns=['image_files'],
                                   output_column="label",
                                   output_path=output_path).fit_hpo(
        train_df=df,
        learning_rate=1e-3,
        num_epochs=10,
        patience=10,
        test_split=.3,
        weight_decay=.0,
        num_hash_bucket_candidates=[2**10],
        tokens_candidates=['words'],
        latent_dim_candidates=[10, 100],
        hpo_max_train_samples=1000)
def test_fit_resumes(test_dir, data_frame):
    """Calling fit a second time must reuse (resume) the underlying imputer
    rather than build a new one."""
    feature_col, label_col = "feature", "label"

    df = data_frame(feature_col=feature_col, label_col=label_col)

    imputer = SimpleImputer(input_columns=[feature_col],
                            output_column=label_col,
                            output_path=test_dir)

    # no underlying model exists before the first fit
    assert imputer.imputer is None

    imputer.fit(df)
    first_fit_imputer = imputer.imputer

    imputer.fit(df)
    second_fit_imputer = imputer.imputer

    assert first_fit_imputer == second_fit_imputer
def test_single_hpo(test_dir, data_frame):
    """_HPO.tune with a single candidate yields exactly one result row
    carrying the specified hyperparameters."""
    feature_col, label_col = "feature", "label"

    df = data_frame(feature_col=feature_col, label_col=label_col)

    imputer = SimpleImputer(
        input_columns=[c for c in df.columns if c != label_col],
        output_column=label_col,
        output_path=test_dir)

    hps = {
        feature_col: {'max_tokens': [1024]},
        'global': {'num_epochs': [10]},
        'string': {},
        'categorical': {},
        'numeric': {},
    }

    hpo = _HPO()
    hpo.tune(train_df=df, hps=hps, simple_imputer=imputer)

    assert hpo.results.shape[0] == 1
    assert hpo.results[feature_col + ':max_tokens'].values[0] == 1024
    assert hpo.results['global:num_epochs'].values[0] == 10
def test_explainable_simple_imputer_unfitted():
    """An unfitted explainable imputer must refuse to explain with a clear
    error message."""
    label_col = 'label'
    imputer = SimpleImputer(['features'], label_col, is_explainable=True)

    assert imputer.is_explainable

    # FIX: the original used `raise pytest.fail(...)` inside try/except —
    # pytest.fail already raises, so the `raise` was dead code; the idiomatic
    # pytest.raises context manager expresses the intent directly.
    with pytest.raises(ValueError) as excinfo:
        imputer.explain('some class')
    assert excinfo.value.args[0] == 'Need to call .fit() before'

    instance = pd.Series({'features': 'some feature text'})
    with pytest.raises(ValueError) as excinfo:
        imputer.explain_instance(instance)
    assert excinfo.value.args[0] == 'Need to call .fit() before'
def test_default_no_explainable_simple_imputer():
    """Explainability is off unless explicitly requested."""
    assert not SimpleImputer(['features'], 'label').is_explainable
def test_simple_imputer_real_data_default_args(test_dir, data_frame):
    """
    Tests SimpleImputer with default options
    """
    feature_col = "string_feature"
    label_col = "label"

    n_samples = 2000
    num_labels = 3
    seq_len = 100
    vocab_size = int(2**15)

    # generate some random data
    random_data = data_frame(feature_col=feature_col,
                             label_col=label_col,
                             vocab_size=vocab_size,
                             num_labels=num_labels,
                             num_words=seq_len,
                             n_samples=n_samples)

    df_train, df_test, df_val = random_split(random_data, [.8, .1, .1])

    output_path = os.path.join(test_dir, "tmp", "real_data_experiment_simple")

    # snapshot the column order so we can verify fit does not mutate it
    df_train_cols_before = df_train.columns.tolist()

    input_columns = [feature_col]

    imputer = SimpleImputer(input_columns=input_columns,
                            output_column=label_col,
                            output_path=output_path).fit(train_df=df_train)

    # fitting must produce a non-empty log file in the output directory
    logfile = os.path.join(imputer.output_path, 'imputer.log')
    assert os.path.exists(logfile)
    assert os.path.getsize(logfile) > 0

    # default featurizer/encoder for a string column is bag-of-words
    assert imputer.output_path == output_path
    assert imputer.imputer.data_featurizers[0].__class__ == BowFeaturizer
    assert imputer.imputer.data_encoders[0].__class__ == BowEncoder
    assert set(
        imputer.imputer.data_encoders[0].input_columns) == set(input_columns)
    assert set(imputer.imputer.label_encoders[0].input_columns) == set(
        [label_col])

    # fit must not reorder or rename the training columns
    assert all([
        after == before
        for after, before in zip(df_train.columns, df_train_cols_before)
    ])

    # predict on a frame without the label column
    df_no_label_column = df_test.copy()
    true_labels = df_test[label_col]
    del (df_no_label_column[label_col])
    df_test_cols_before = df_no_label_column.columns.tolist()
    df_test_imputed = imputer.predict(df_no_label_column, inplace=True)
    # inplace predict must keep existing columns intact ...
    assert all([
        after == before for after, before in zip(df_no_label_column.columns,
                                                 df_test_cols_before)
    ])
    # ... and append exactly the '<label>_imputed' / '<label>_imputed_proba' columns
    imputed_columns = df_test_cols_before + [
        label_col + "_imputed", label_col + "_imputed_proba"
    ]
    assert all([
        after == before
        for after, before in zip(df_test_imputed, imputed_columns)
    ])

    f1 = f1_score(true_labels,
                  df_test_imputed[label_col + '_imputed'],
                  average="weighted")
    assert f1 > .9

    # move the model directory and check that a deserialized imputer still predicts well
    new_path = imputer.output_path + "-" + rand_string()
    os.rename(imputer.output_path, new_path)

    deserialized = SimpleImputer.load(new_path)
    df_test = deserialized.predict(df_test,
                                   imputation_suffix="_deserialized_imputed")
    f1 = f1_score(df_test[label_col],
                  df_test[label_col + '_deserialized_imputed'],
                  average="weighted")
    assert f1 > .9

    # retraining a deserialized imputer must also work
    retrained_simple_imputer = deserialized.fit(df_train, df_train)

    df_train_imputed = retrained_simple_imputer.predict(df_train.copy(),
                                                        inplace=True)
    f1 = f1_score(df_train[label_col],
                  df_train_imputed[label_col + '_imputed'],
                  average="weighted")
    assert f1 > .9

    # persisted metrics must match the score computed here
    metrics = retrained_simple_imputer.load_metrics()
    assert f1 == metrics['weighted_f1']
def test_simple_imputer_no_string_column_name():
    """Constructor must reject non-string column names."""
    for input_cols, output_col in (([0], '1'), (['0'], 1)):
        with pytest.raises(ValueError):
            SimpleImputer(input_cols, output_col)
def test_hpo_all_input_types(test_dir, data_frame):
    """
    Using sklearn advantages: parallelism, distributions of parameters, multiple cross-validation
    """
    label_col = "label"

    n_samples = 1000
    num_labels = 3
    seq_len = 12

    # generate some random data
    df = data_frame(feature_col="string_feature",
                    label_col=label_col,
                    num_labels=num_labels,
                    num_words=seq_len,
                    n_samples=n_samples)

    # add categorical feature
    df['categorical_feature'] = [
        'foo' if r > .5 else 'bar' for r in np.random.rand(n_samples)
    ]

    # add numerical feature
    df['numeric_feature'] = np.random.rand(n_samples)

    df_train, df_test = random_split(df, [.8, .2])
    output_path = os.path.join(test_dir, "tmp",
                               "real_data_experiment_text_hpo")
    imputer = SimpleImputer(input_columns=[
        'string_feature', 'categorical_feature', 'numeric_feature'
    ],
                            output_column='label',
                            output_path=output_path)

    # Define default hyperparameter choices for each column type (string, categorical, numeric)
    hps = dict()
    # global hyperparameters shared by all columns
    hps['global'] = {}
    hps['global']['learning_rate'] = [3e-4]
    hps['global']['weight_decay'] = [1e-8]
    hps['global']['num_epochs'] = [5, 50]
    hps['global']['patience'] = [5]
    hps['global']['batch_size'] = [16]
    hps['global']['final_fc_hidden_units'] = [[]]
    hps['global']['concat_columns'] = [True, False]

    # per-column hyperparameters for the string feature
    hps['string_feature'] = {}
    hps['string_feature']['max_tokens'] = [2**15]
    hps['string_feature']['tokens'] = [['words', 'chars']]
    hps['string_feature']['ngram_range'] = {}
    hps['string_feature']['ngram_range']['words'] = [(1, 4), (2, 5)]
    hps['string_feature']['ngram_range']['chars'] = [(2, 4), (3, 5)]

    # per-column hyperparameters for the categorical feature
    hps['categorical_feature'] = {}
    hps['categorical_feature']['type'] = ['categorical']
    hps['categorical_feature']['max_tokens'] = [2**15]
    hps['categorical_feature']['embed_dim'] = [10]

    # per-column hyperparameters for the numeric feature
    hps['numeric_feature'] = {}
    hps['numeric_feature']['normalize'] = [True]
    hps['numeric_feature']['numeric_latent_dim'] = [10]
    hps['numeric_feature']['numeric_hidden_layers'] = [1]

    # user defined score function for hyperparameters
    def calibration_check(true, predicted, confidence):
        """
        expect kwargs: true, predicted, confidence
        here we compute a calibration sanity check
        """
        return (np.mean(true[confidence > .9] == predicted[confidence > .9]),
                np.mean(true[confidence > .5] == predicted[confidence > .5]))

    def coverage_check(true, predicted, confidence):
        # fraction of predictions made with confidence above .9
        return np.mean(confidence > .9)

    uds = [(calibration_check, 'calibration check'),
           (coverage_check, 'coverage at 90')]

    # first HPO run without a time budget
    imputer.fit_hpo(df_train,
                    hps=hps,
                    user_defined_scores=uds,
                    num_evals=5,
                    hpo_run_name='test1_')

    # second HPO run with a wall-clock budget of one second
    imputer.fit_hpo(df_train,
                    hps=hps,
                    user_defined_scores=uds,
                    num_evals=5,
                    hpo_run_name='test2_',
                    max_running_hours=1 / 3600)

    results = imputer.hpo.results

    # training longer (50 epochs vs 5) should yield a better f1 score
    assert results[results['global:num_epochs'] == 50]['f1_micro'].iloc[0] > \
        results[results['global:num_epochs'] == 5]['f1_micro'].iloc[0]