Python Imputer.load примеры использования

Язык программирования: Python

Пространство имен/Пакет: datawig.imputer

Класс/Тип: Imputer

Метод/Функция: load

Примеров на hotexamples.com: 5

Python Imputer.load - 5 примеров найдено. Это лучшие примеры Python кода для datawig.imputer.Imputer.load, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

Imputer(19)

fit(8)

load(5)

transform_and_compute_metrics(3)

explain_instance(2)

predict(2)

_Imputer__drop_missing_labels(1)

ctx(1)

explain(1)

predict_above_precision(1)

predict_proba_top_k(1)

save(1)

transform(1)

Пример #1

Показать файл

Файл: test_imputer.py Проект: andrey-tpt/datawig

def test_imputer_load_read_exec_only_dir(tmpdir, data_frame):
    """Check that ``Imputer.load`` works on a read/exec-only directory.

    An imputer is fitted for one epoch with ``tmpdir`` as its output path.
    The directory is then restricted to read/execute permissions for
    owner, group and others, and the imputer is loaded from it. An
    ``AssertionError`` raised by ``Imputer.load`` fails the test.

    The directory's original permission bits are restored in a ``finally``
    clause so the directory is not left read-only once the test is done.
    """
    import stat

    # work with a plain string path (original note: this conversion is
    # related to test failures on the shared build fleet)
    tmpdir = str(tmpdir)
    feature = 'feature'
    label = 'label'

    df = data_frame(feature, label, n_samples=100)
    # fit and output model + metrics to tmpdir

    imputer = Imputer(data_featurizers=[BowFeaturizer(feature)],
                      label_encoders=[CategoricalEncoder(label)],
                      data_encoders=[BowEncoder(feature)],
                      output_path=tmpdir)
    imputer.fit(train_df=df, num_epochs=1)

    # remember the current permission bits so they can be put back later
    original_mode = stat.S_IMODE(os.stat(tmpdir).st_mode)

    # make tmpdir read/exec-only by owner/group/others
    os.chmod(
        tmpdir, stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH | stat.S_IREAD
        | stat.S_IRGRP | stat.S_IROTH)

    try:
        Imputer.load(tmpdir)
    except AssertionError as e:
        print(e)
        pytest.fail(
            'Loading imputer from read-only directory should not fail.')
    finally:
        # restore permissions regardless of the outcome
        os.chmod(tmpdir, original_mode)

Пример #2

Показать файл

Файл: test_imputer.py Проект: stjordanis/datawig

def test_non_writable_output_path(test_dir, data_frame):
    """Check that a saved imputer can be loaded and used from a read-only path.

    The test fits and saves an imputer, then runs two scenarios:

    1. output directory and ``imputer.log`` are both read-only;
    2. ``imputer.log`` has been removed and the output directory is
       read-only.

    In each scenario the imputer is loaded, ``predict`` is called and a
    warning is emitted through the datawig logger. Any exception fails the
    test. Write permissions are restored in a ``finally`` clause so that a
    failure does not leave read-only files behind.
    """
    label_col = 'label'
    df = data_frame(n_samples=100, label_col=label_col)

    data_encoder_cols = [TfIdfEncoder('features')]
    label_encoder_cols = [CategoricalEncoder(label_col)]
    data_cols = [BowFeaturizer('features')]

    output_path = os.path.join(test_dir, 'non_writable')
    log_path = os.path.join(output_path, "imputer.log")

    Imputer(data_featurizers=data_cols,
            label_encoders=label_encoder_cols,
            data_encoders=data_encoder_cols,
            output_path=output_path).fit(train_df=df, num_epochs=1).save()

    from datawig.utils import logger

    read_only = S_IREAD | S_IXUSR
    writable = S_IREAD | S_IXUSR | S_IWUSR

    try:
        # make output dir of imputer read-only
        os.chmod(output_path, read_only)

        # make log file read only
        os.chmod(log_path, S_IREAD)
        imputer = Imputer.load(output_path)
        _ = imputer.predict(df)
        logger.warning("this should not fail")

        # remove log file (needs write access to the file and its directory)
        os.chmod(log_path, writable)
        os.chmod(output_path, writable)
        os.remove(log_path)

        # make output dir of imputer read-only
        os.chmod(output_path, read_only)

        imputer = Imputer.load(output_path)
        _ = imputer.predict(df)
        logger.warning("this should not fail")
    except Exception as e:
        print(e)
        pytest.fail("This invocation should not raise any Exception")
    finally:
        # always give write access back, also when the test failed half-way;
        # the existence checks keep the cleanup itself from raising
        if os.path.isdir(output_path):
            os.chmod(output_path, writable)
        if os.path.exists(log_path):
            os.chmod(log_path, writable)

Пример #3

Показать файл

Файл: test_imputer.py Проект: felixbiessmann/datawig

def test_imputer_load_with_invalid_context(tmpdir, data_frame):
    """Save an imputer whose ``ctx`` is ``None`` and use it after loading.

    The imputer is fitted for one epoch, its ``ctx`` attribute is set to
    ``None`` and it is saved. The test then loads it from the same
    directory and calls ``predict`` on the training frame; it passes when
    none of these steps raises.
    """
    text_column, target_column = 'feature', 'label'

    # use a plain string as output path instead of the fixture object
    model_dir = str(tmpdir)

    frame = data_frame(text_column, target_column, n_samples=100)

    # build the components first, then fit and write the model to model_dir
    featurizers = [BowFeaturizer(text_column)]
    target_encoders = [CategoricalEncoder(target_column)]
    input_encoders = [BowEncoder(text_column)]
    fitted = Imputer(data_featurizers=featurizers,
                     label_encoders=target_encoders,
                     data_encoders=input_encoders,
                     output_path=model_dir)
    fitted.fit(train_df=frame, num_epochs=1)

    # persist the model with its context cleared
    fitted.ctx = None
    fitted.save()

    restored = Imputer.load(model_dir)
    _ = restored.predict(frame)

Пример #4

Показать файл

Файл: test_imputer.py Проект: andrey-tpt/datawig

def test_imputer_real_data_all_featurizers(test_dir, data_frame):
    """
    Tests Imputer with sequential, bag-of-words and categorical variables as inputs;
    this could be run as part of integration test suite.

    Steps:

    * generate random data and derive a categorical column from the label
      prefixes,
    * fit an imputer with bag-of-words, LSTM and embedding featurizers,
    * check ``transform``, ``predict_proba_top_k`` and
      ``transform_and_compute_metrics`` on the test split,
    * load the imputer from its output path and check the metrics again,
    * fit a second imputer on only 50 rows and check that ``predict``
      adds the ``_imputed`` and ``_imputed_proba`` columns.
    """

    feature_col = "string_feature"
    categorical_col = "categorical_feature"
    label_col = "label"

    n_samples = 5000
    num_labels = 3
    seq_len = 20
    vocab_size = int(2**10)

    latent_dim = 30
    embed_dim = 30

    # generate some random data
    random_data = data_frame(feature_col=feature_col,
                             label_col=label_col,
                             vocab_size=vocab_size,
                             num_labels=num_labels,
                             num_words=seq_len,
                             n_samples=n_samples)

    # we use the label prefixes as a dummy categorical input variable
    random_data[categorical_col] = random_data[label_col].apply(
        lambda x: x[:2])

    df_train, df_test, df_val = random_split(random_data, [.8, .1, .1])

    data_encoder_cols = [
        BowEncoder(feature_col, feature_col + "_bow", max_tokens=vocab_size),
        SequentialEncoder(feature_col,
                          feature_col + "_lstm",
                          max_tokens=vocab_size,
                          seq_len=seq_len),
        CategoricalEncoder(categorical_col, max_tokens=num_labels)
    ]
    label_encoder_cols = [CategoricalEncoder(label_col, max_tokens=num_labels)]

    data_cols = [
        BowFeaturizer(feature_col + "_bow", vocab_size=vocab_size),
        LSTMFeaturizer(field_name=feature_col + "_lstm",
                       seq_len=seq_len,
                       latent_dim=latent_dim,
                       num_hidden=30,
                       embed_dim=embed_dim,
                       num_layers=2,
                       vocab_size=num_labels),
        EmbeddingFeaturizer(field_name=categorical_col,
                            embed_dim=embed_dim,
                            vocab_size=num_labels)
    ]

    output_path = os.path.join(test_dir, "tmp",
                               "imputer_experiment_synthetic_data")

    num_epochs = 10
    batch_size = 32
    learning_rate = 1e-2

    imputer = Imputer(data_featurizers=data_cols,
                      label_encoders=label_encoder_cols,
                      data_encoders=data_encoder_cols,
                      output_path=output_path).fit(train_df=df_train,
                                                   test_df=df_val,
                                                   learning_rate=learning_rate,
                                                   num_epochs=num_epochs,
                                                   batch_size=batch_size,
                                                   calibrate=False)

    len_df_before_predict = len(df_test)
    pred = imputer.transform(df_test)

    # transform must return one prediction per input row
    assert len(pred[label_col]) == len_df_before_predict

    # every prediction has to match the true label
    assert sum(df_test[label_col].values == pred[label_col]) == len(df_test)

    _ = imputer.predict_proba_top_k(df_test, top_k=2)

    _, metrics = imputer.transform_and_compute_metrics(df_test)

    assert metrics[label_col]['avg_f1'] > 0.9

    # the imputer loaded from disk must reach the same quality threshold
    deserialized = Imputer.load(imputer.output_path)

    _, metrics_deserialized = deserialized.transform_and_compute_metrics(
        df_test)

    assert metrics_deserialized[label_col]['avg_f1'] > 0.9

    # training on a small data set to get an imputer with low precision
    not_so_precise_imputer = Imputer(data_featurizers=data_cols,
                                     label_encoders=label_encoder_cols,
                                     data_encoders=data_encoder_cols,
                                     output_path=output_path).fit(
                                         train_df=df_train[:50],
                                         test_df=df_test,
                                         learning_rate=learning_rate,
                                         num_epochs=num_epochs,
                                         batch_size=batch_size,
                                         calibrate=False)

    df_test = df_test.reset_index()
    predictions_df = not_so_precise_imputer.predict(
        df_test, precision_threshold=.5, imputation_suffix="_imputed")

    # use the ``in`` operator: ``Index.contains`` was deprecated in
    # pandas 0.25 and removed in pandas 1.0
    assert label_col + "_imputed" in predictions_df.columns
    assert label_col + "_imputed_proba" in predictions_df.columns

Пример #5

Показать файл

Файл: test_imputer.py Проект: jbdatascience/datawig

def test_explain_method_synthetic(test_dir):
    """Exercise ``explain_instance`` and ``explain`` on simulated data.

    The frame has a categorical input in ['foo', 'dummy'], a text input in
    ['fff', 'ddd'], a constant text input 'h' and an output column that is
    'foo' whenever the character 'f' occurs in the categorical or the text
    input, and 'bar' otherwise. After training, the test checks the
    prediction precision, the per-instance explanations, the class
    explanations, and the class explanations of the imputer after saving
    and loading it.
    """
    N = 100
    half = 1 / 2

    # draw the categorical column first, then the text column
    cat_draws = np.random.rand(N)
    categorical_values = ['foo' if draw > half else 'dummy'
                          for draw in cat_draws]
    text_draws = np.random.rand(N)
    text_values = ['fff' if draw > half else 'ddd' for draw in text_draws]
    constant_values = ['h'] * N
    # label is 'foo' as soon as an 'f' shows up in either input
    target_values = [
        'foo' if 'f' in cat_val + text_val else 'bar'
        for cat_val, text_val in zip(categorical_values, text_values)
    ]

    df = pd.DataFrame({
        'in_cat': categorical_values,
        'in_text': text_values,
        'in_text_hash': constant_values,
        'out_cat': target_values,
    })

    # Encoders for the input columns
    text_encoder = datawig.column_encoders.TfIdfEncoder('in_text',
                                                        tokens="chars")
    cat_encoder = datawig.column_encoders.CategoricalEncoder('in_cat',
                                                             max_tokens=10)
    hash_encoder = datawig.column_encoders.BowEncoder('in_text_hash',
                                                      tokens="chars")
    input_encoders = [text_encoder, cat_encoder, hash_encoder]

    # Featurizers for the input columns
    input_featurizers = [
        datawig.mxnet_input_symbols.BowFeaturizer('in_text'),
        datawig.mxnet_input_symbols.EmbeddingFeaturizer('in_cat'),
        datawig.mxnet_input_symbols.BowFeaturizer('in_text_hash')
    ]

    # Encoder for the output column
    target_encoders = [datawig.column_encoders.CategoricalEncoder('out_cat')]

    # Model definition
    model_dir = os.path.join(test_dir, "tmp", "explanation_tests")
    imputer = datawig.Imputer(data_featurizers=input_featurizers,
                              label_encoders=target_encoders,
                              data_encoders=input_encoders,
                              output_path=model_dir)

    # Training on a 90-row sample, split 80/20
    train_part, test_part = random_split(df.sample(90), [.8, .2])
    imputer.fit(train_df=train_part,
                test_df=test_part,
                num_epochs=20,
                learning_rate=1e-2)
    predictions = imputer.predict(test_part)

    # Evaluation
    weighted_precision = precision_score(predictions.out_cat,
                                         predictions.out_cat_imputed,
                                         average='weighted')
    assert weighted_precision > .99

    # instance explanations for ten randomly chosen rows
    for row_idx in np.random.choice(N, 10):
        explanation = imputer.explain_instance(df.iloc[row_idx])
        top_label = explanation['explained_label']

        if top_label == 'bar':
            first_text_token = explanation['in_text'][0][0]
            assert (first_text_token == 'd'
                    and explanation['in_cat'][0][0] == 'dummy')
        elif top_label == 'foo':
            first_text_token = explanation['in_text'][0][0]
            assert (first_text_token == 'f'
                    or explanation['in_cat'][0][0] == 'foo')

    # class explanations: the first tokens listed for 'foo' contain an 'f'
    text_hits = [
        'f' in token for token, _ in imputer.explain('foo')['in_text']
    ]
    assert np.all(text_hits[:3])
    cat_hits = [
        'f' in token for token, _ in imputer.explain('foo')['in_cat']
    ]
    assert cat_hits[0]

    # the same class explanation must hold after a save/load round trip
    imputer.save()
    imputer_from_disk = Imputer.load(imputer.output_path)
    reloaded_text_hits = [
        'f' in token
        for token, _ in imputer_from_disk.explain('foo')['in_text']
    ]
    assert np.all(reloaded_text_hits[:3])