def test_inverse_transform():
    """Round-trip check: fit_transform then inverse_transform recovers the
    (normalized) input texts."""
    input_df = pd.DataFrame(
        {
            "text_column": [
                "life is like a box of chocolates",
                "You never know what you're going to get",
            ]
        }
    )

    preprocessor = TextPreprocessor(
        text_col="text_column", max_vocab=25, min_freq=1, maxlen=10, verbose=False
    )
    sequences = preprocessor.fit_transform(input_df)
    recovered_df = preprocessor.inverse_transform(sequences)

    recovered_texts = recovered_df.text_column.values

    # Tokenization lowercases and splits contractions, so the recovered text
    # differs slightly from the raw input.
    assert all(
        [
            "life is like box of chocolates" in recovered_texts[0],
            "you never know what you re going to get" in recovered_texts[1],
        ]
    )
# Example #2
    # NOTE(review): this fragment starts mid-function — `df`, `target`,
    # `wide_cols`, `crossed_cols`, `cat_embed_cols`, `continuous_cols`,
    # `already_standard`, `word_vectors_path`, `text_col`, `img_col` and
    # `img_path` are all defined above this excerpt; confirm against the
    # original source.
    target = df[target].values

    # Fit the wide (linear) component's preprocessor on the raw dataframe.
    wide_preprocessor = WidePreprocessor(wide_cols=wide_cols,
                                         crossed_cols=crossed_cols)
    X_wide = wide_preprocessor.fit_transform(df)

    # Tabular preprocessor: categorical embeddings + continuous columns.
    tab_preprocessor = TabPreprocessor(
        embed_cols=cat_embed_cols,  # type: ignore[arg-type]
        continuous_cols=continuous_cols,
        already_standard=already_standard,
    )
    X_tab = tab_preprocessor.fit_transform(df)

    # Text preprocessor backed by pretrained word vectors.
    text_processor = TextPreprocessor(word_vectors_path=word_vectors_path,
                                      text_col=text_col)
    X_text = text_processor.fit_transform(df)

    # Image preprocessor: loads and normalizes images from `img_path`.
    image_processor = ImagePreprocessor(img_col=img_col, img_path=img_path)
    X_images = image_processor.fit_transform(df)

    # Model components: wide linear part plus an MLP over the tabular data.
    wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1)
    deepdense = TabMlp(
        mlp_hidden_dims=[64, 32],
        mlp_dropout=[0.2, 0.2],
        column_idx=tab_preprocessor.column_idx,
        embed_input=tab_preprocessor.embeddings_input,
        continuous_cols=continuous_cols,
    )
    # # To use TabResnet as the deepdense component simply:
    # deepdense = TabResnet(
    #     blocks_dims=[64, 32],
# Example #3
import numpy as np
import pandas as pd
import pytest
import warnings

from sklearn.datasets import fetch_20newsgroups
from pytorch_widedeep.preprocessing import TextPreprocessor

# Module-level fixture: sample 10 random documents from the 20 newsgroups
# corpus (fetch_20newsgroups downloads the dataset on first use) and fit the
# text preprocessor on them. min_freq=0 keeps every token in the vocabulary.
texts = np.random.choice(fetch_20newsgroups().data, 10)
df = pd.DataFrame({'texts': texts})
processor = TextPreprocessor(min_freq=0)
X_text = processor.fit_transform(df, 'texts')


###############################################################################
# There is not much to test here. I will simply test that the tokenization
# and padding processes went well
###############################################################################
def test_text_processor():
    # Pick one random document index out of the 10 processed at module level.
    idx = int(np.random.choice(np.arange(10), 1))

    original_tokens = processor.tokens[idx]
    # The padded sequence keeps only the LAST `maxlen` tokens, so truncate the
    # original token list the same way before comparing.
    if len(original_tokens) > processor.maxlen:
        original_tokens = original_tokens[-processor.maxlen:]

    padded_sequence = X_text[idx]

    # Map indices back to tokens via the fitted vocab, dropping the padding
    # token ('xxpad').
    recovered_tokens = []
    for t in padded_sequence:
        if processor.vocab.itos[t] != 'xxpad':
            recovered_tokens.append(processor.vocab.itos[t])
    # NOTE(review): the function continues beyond this excerpt — the final
    # assertion comparing original_tokens and recovered_tokens is not visible.