Пример #1
0
def test_multi_column_tfidf_vectorizer_one_column_zero_output_tokens(kwargs, output_shape):
    """Check that a TF-IDF document-term matrix is still produced when only one column breaks.

    The first column of the corpus differs on every row, whereas the second
    column repeats one sentence.  The vectorizer is configured from ``kwargs``
    and the shape of its output is compared against ``output_shape``.
    """
    varying_sentences = (
        "Cats eat rats.",
        "Dogs chase cats.",
        "People like dogs.",
        "People hate rats.",
    )
    repeated_sentence = "Rats are mammals."
    corpus = np.array([[sentence, repeated_sentence] for sentence in varying_sentences])

    transformed = MultiColumnTfidfVectorizer(**kwargs).fit_transform(corpus)
    assert transformed.shape == output_shape
Пример #2
0
def test_multi_column_tfidf_vectorizer():
    """Check that the multi-column output equals the per-column TF-IDF matrices stacked horizontally.

    The vectorizer is fitted on ``corpus`` with default settings; the result
    must be a COO sparse matrix whose dense form matches a plain
    ``TfidfVectorizer`` run on column 0 and column 1 independently.
    """
    vec = MultiColumnTfidfVectorizer()
    output = vec.fit_transform(corpus)

    # Reference the class through its public name: ``scipy.sparse.coo`` is a
    # deprecated private namespace, while ``scipy.sparse.coo_matrix`` is the
    # same class exposed by the supported API.
    assert isinstance(output, sp.coo_matrix)

    observed = output.todense()
    expected = np.hstack(
        [
            TfidfVectorizer().fit_transform(corpus[:, 0]).todense(),
            TfidfVectorizer().fit_transform(corpus[:, 1]).todense(),
        ]
    )

    np.testing.assert_array_equal(observed, expected)
Пример #3
0
def build_feature_transform():
    """Build and return the pipeline that performs feature processing.

    The pipeline applies a ColumnTransformer (a ThresholdOneHotEncoder branch
    for the categorical column and a MultiColumnTfidfVectorizer branch for the
    text columns), followed by RobustPCA and RobustStandardScaler.
    """
    # Feature with a relatively small number of unique items.
    categorical = HEADER.as_feature_indices(['dev_platform_vec'])

    # Features that can be parsed as natural language.
    text_feature_names = [
        'ifa',
        'bundle_vec',
        'persona_segment_vec',
        'persona_L1_vec',
        'persona_L2_vec',
        'persona_L3_vec',
        'device_vendor_vec',
        'device_name_vec',
        'device_manufacturer_vec',
        'device_model_vec',
        'device_year_of_release_vec',
        'major_os_vec',
    ]
    text = HEADER.as_feature_indices(text_feature_names)

    one_hot_encoder = ThresholdOneHotEncoder(threshold=5)
    categorical_processors = Pipeline(
        steps=[('thresholdonehotencoder', one_hot_encoder)]
    )

    tfidf_vectorizer = MultiColumnTfidfVectorizer(
        max_df=0.9365,
        min_df=0.011235955056179775,
        analyzer='word',
        max_features=10000,
    )
    text_processors = Pipeline(
        steps=[('multicolumntfidfvectorizer', tfidf_vectorizer)]
    )

    column_transformer = ColumnTransformer(
        transformers=[
            ('categorical_processing', categorical_processors, categorical),
            ('text_processing', text_processors, text),
        ]
    )

    pipeline_steps = [
        ('column_transformer', column_transformer),
        ('robustpca', RobustPCA(n_components=53)),
        ('robuststandardscaler', RobustStandardScaler()),
    ]
    return Pipeline(steps=pipeline_steps)
Пример #4
0
def build_feature_transform():
    """Build and return the pipeline that performs feature processing.

    The single text column is TF-IDF vectorized inside a ColumnTransformer and
    the result is passed through RobustStandardScaler.
    """
    # Feature that can be parsed as natural language.
    text = HEADER.as_feature_indices(['features'])

    tfidf_vectorizer = MultiColumnTfidfVectorizer(
        max_df=0.9684,
        min_df=0.013108614232209739,
        analyzer='word',
        max_features=10000,
    )
    text_processors = Pipeline(
        steps=[('multicolumntfidfvectorizer', tfidf_vectorizer)]
    )

    text_branch = ('text_processing', text_processors, text)
    column_transformer = ColumnTransformer(transformers=[text_branch])

    pipeline_steps = [
        ('column_transformer', column_transformer),
        ('robuststandardscaler', RobustStandardScaler()),
    ]
    return Pipeline(steps=pipeline_steps)
Пример #5
0
def build_feature_transform():
    """Build and return the pipeline that performs feature processing.

    The ``review_body`` column is TF-IDF vectorized with the ``char_wb``
    analyzer inside a ColumnTransformer; the result is then passed through
    RobustPCA and RobustStandardScaler.
    """
    # Feature that can be parsed as natural language.
    text = HEADER.as_feature_indices(['review_body'])

    tfidf_vectorizer = MultiColumnTfidfVectorizer(
        max_df=0.99,
        min_df=0.0021,
        analyzer='char_wb',
        max_features=10000,
    )
    text_processors = Pipeline(
        steps=[('multicolumntfidfvectorizer', tfidf_vectorizer)]
    )

    text_branch = ('text_processing', text_processors, text)
    column_transformer = ColumnTransformer(transformers=[text_branch])

    pipeline_steps = [
        ('column_transformer', column_transformer),
        ('robustpca', RobustPCA(n_components=5)),
        ('robuststandardscaler', RobustStandardScaler()),
    ]
    return Pipeline(steps=pipeline_steps)
Пример #6
0
def build_feature_transform():
    """Build and return the pipeline that performs feature processing.

    The ``review_body`` column is TF-IDF vectorized with the ``word`` analyzer
    inside a ColumnTransformer and the result is passed through
    RobustStandardScaler.
    """
    # Feature that can be parsed as natural language.
    text = HEADER.as_feature_indices(["review_body"])

    tfidf_vectorizer = MultiColumnTfidfVectorizer(
        max_df=0.9941,
        min_df=0.0007,
        analyzer="word",
        max_features=10000,
    )
    text_processors = Pipeline(
        steps=[("multicolumntfidfvectorizer", tfidf_vectorizer)]
    )

    text_branch = ("text_processing", text_processors, text)
    column_transformer = ColumnTransformer(transformers=[text_branch])

    pipeline_steps = [
        ("column_transformer", column_transformer),
        ("robuststandardscaler", RobustStandardScaler()),
    ]
    return Pipeline(steps=pipeline_steps)
# NOTE(review): this numbered list starts at 2 in the visible excerpt; item 1
# presumably describes the "bool_to_string" step below — confirm against the full file.
# 2. Impute missing values with the string "missing"
# 3. One hot encode the data (ignoring new categorical levels at prediction time)
# You can set `handle_unknown='error'` to make your model raise an error at prediction time if
# it encounters a new categorical level
categorical_pipeline = Pipeline(steps=[
    # Applies the `to_string` callable to the incoming data before imputation.
    ("bool_to_string", FunctionTransformer(to_string)),
    # Replaces missing entries with the constant string "missing".
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    # handle_unknown="ignore": categories unseen during fit do not raise at transform time.
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# For text, we:
# 1. Impute missing values with the string "missing"
# 2. Tfidf encode the text, using 1-grams and 2-grams.
text_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    # ngram_range=(1, 2) selects both unigrams and bigrams as TF-IDF terms.
    ("tfidf", MultiColumnTfidfVectorizer(ngram_range=(1, 2))),
])

# Sparse preprocessing pipeline, for models such as Ridge that handle sparse input well.
# Each entry is (name, pipeline, column selector): the pipeline is applied to the
# columns picked by its selector.
sparse_preprocessing_pipeline = ColumnTransformer(transformers=[
    ("num", numeric_pipeline, numeric_selector),
    ("cat", categorical_pipeline, categorical_selector),
    ("txt", text_pipeline, text_selector),
])


# Modified TruncatedSVD that doesn't fail if n_components > ncols
class MyTruncatedSVD(TruncatedSVD):
    """TruncatedSVD subclass that lowers ``n_components`` when the input has too few columns."""

    def fit_transform(self, X, y=None):
        # When X has no more columns than the requested number of components,
        # reduce the request to one fewer than the column count.  This
        # overwrites ``self.n_components`` on the instance.
        # NOTE(review): the visible excerpt ends after this adjustment; the call
        # into the parent ``fit_transform`` is presumably cut off — confirm.
        if X.shape[1] <= self.n_components:
            self.n_components = X.shape[1] - 1
from sagemaker_sklearn_extension.impute import RobustMissingIndicator
from sagemaker_sklearn_extension.preprocessing import LogExtremeValuesTransformer
from sagemaker_sklearn_extension.preprocessing import NALabelEncoder
from sagemaker_sklearn_extension.preprocessing import QuadraticFeatures
from sagemaker_sklearn_extension.preprocessing import QuantileExtremeValuesTransformer
from sagemaker_sklearn_extension.preprocessing import RemoveConstantColumnsTransformer
from sagemaker_sklearn_extension.preprocessing import RobustLabelEncoder
from sagemaker_sklearn_extension.preprocessing import RobustStandardScaler
from sagemaker_sklearn_extension.preprocessing import ThresholdOneHotEncoder


@pytest.mark.parametrize(
    "Estimator",
    [
        DateTimeVectorizer(),
        LogExtremeValuesTransformer(),
        MultiColumnTfidfVectorizer(),
        NALabelEncoder(),
        QuadraticFeatures(),
        QuantileExtremeValuesTransformer(),
        RobustImputer(),
        RemoveConstantColumnsTransformer(),
        RobustLabelEncoder(),
        RobustMissingIndicator(),
        RobustStandardScaler(),
        ThresholdOneHotEncoder(),
    ],
)
def test_all_estimators(Estimator):
    """Run ``check_estimator`` against each default-constructed estimator instance.

    ``Estimator`` is one of the instances listed in the parametrization above;
    any failure inside ``check_estimator`` propagates and fails the test.
    """
    # Deliberately not returned: pytest expects test functions to return None
    # and emits PytestReturnNotNoneWarning when they return a value.
    check_estimator(Estimator)
Пример #9
0
def test_multi_column_tfidf_vectorizer_zero_output_tokens_ignore_zero_vocab_on(kwargs, data, shape):
    """Check for an empty matrix of the expected ``shape`` when no terms remain after pruning."""
    transformed = MultiColumnTfidfVectorizer(**kwargs).fit_transform(data)
    assert transformed.shape == shape
Пример #10
0
def test_multi_column_tfidf_vectorizer_vocabulary_sizes_error():
    """Check that a ValueError is raised when creating and fitting with ``vocabulary_sizes=[1]`` on ``corpus``."""
    with pytest.raises(ValueError):
        MultiColumnTfidfVectorizer(vocabulary_sizes=[1]).fit(corpus)
Пример #11
0
def test_multi_column_tfidf_vectorizer_vocabulary_sizes_small():
    """Check the output width when each column's vocabulary is capped one below its full size.

    For every column of ``corpus`` the cap is the width produced by a plain
    ``TfidfVectorizer`` minus one; the fitted output must be sparse and have
    as many columns as the caps add up to.
    """
    vocabulary_sizes = []
    for column_index in range(corpus.shape[1]):
        full_width = TfidfVectorizer().fit_transform(corpus[:, column_index]).shape[1]
        vocabulary_sizes.append(full_width - 1)

    observed = MultiColumnTfidfVectorizer(vocabulary_sizes=vocabulary_sizes).fit_transform(corpus)
    assert observed.shape[1] == sum(vocabulary_sizes)
    assert sp.issparse(observed)
Пример #12
0
def test_multi_column_tfidf_vectorizer_transform_dim_error():
    """Check that ``transform`` raises ValueError when given a single row of ``corpus`` (1-D input)."""
    # Build and fit outside the ``raises`` block: a ValueError from either
    # setup step must fail the test rather than satisfy the expectation.
    vec = MultiColumnTfidfVectorizer()
    vec.fit(corpus)

    # Only the call under test may raise.
    with pytest.raises(ValueError):
        vec.transform(corpus[0])
Пример #13
0
def test_multi_column_tfidf_vectorizer_zero_output_tokens_ignore_zero_vocab_off(kwargs, data):
    """Check for a ValueError when no terms remain after pruning and `ignore_overpruned_columns=False`."""
    with pytest.raises(ValueError):
        MultiColumnTfidfVectorizer(**kwargs).fit_transform(data)