def test_inverse_transform():
    """Round-trip check: fit_transform then inverse_transform recovers the
    (normalized) input texts."""
    input_df = pd.DataFrame(
        {
            "text_column": [
                "life is like a box of chocolates",
                "You never know what you're going to get",
            ]
        }
    )

    preprocessor = TextPreprocessor(
        text_col="text_column", max_vocab=25, min_freq=1, maxlen=10, verbose=False
    )
    sequences = preprocessor.fit_transform(input_df)
    recovered_df = preprocessor.inverse_transform(sequences)

    recovered_texts = recovered_df.text_column.values

    # Tokenization lowercases and splits contractions, so the recovered text
    # differs slightly from the raw input.
    assert all(
        [
            "life is like box of chocolates" in recovered_texts[0],
            "you never know what you re going to get" in recovered_texts[1],
        ]
    )
# Example #2
    # NOTE(review): this fragment starts mid-function — `df`, `target`,
    # `wide_cols`, `crossed_cols`, `cat_embed_cols`, `continuous_cols`,
    # `already_standard`, `word_vectors_path`, `text_col`, `img_col` and
    # `img_path` are all defined above this excerpt; confirm against the
    # original source.
    target = df[target].values

    # Fit the wide (linear) component's preprocessor on the raw dataframe.
    wide_preprocessor = WidePreprocessor(wide_cols=wide_cols,
                                         crossed_cols=crossed_cols)
    X_wide = wide_preprocessor.fit_transform(df)

    # Tabular preprocessor: categorical embeddings + continuous columns.
    tab_preprocessor = TabPreprocessor(
        embed_cols=cat_embed_cols,  # type: ignore[arg-type]
        continuous_cols=continuous_cols,
        already_standard=already_standard,
    )
    X_tab = tab_preprocessor.fit_transform(df)

    # Text preprocessor backed by pretrained word vectors.
    text_processor = TextPreprocessor(word_vectors_path=word_vectors_path,
                                      text_col=text_col)
    X_text = text_processor.fit_transform(df)

    # Image preprocessor: loads and normalizes images from `img_path`.
    image_processor = ImagePreprocessor(img_col=img_col, img_path=img_path)
    X_images = image_processor.fit_transform(df)

    # Model components: wide linear part plus an MLP over the tabular data.
    wide = Wide(wide_dim=np.unique(X_wide).shape[0], pred_dim=1)
    deepdense = TabMlp(
        mlp_hidden_dims=[64, 32],
        mlp_dropout=[0.2, 0.2],
        column_idx=tab_preprocessor.column_idx,
        embed_input=tab_preprocessor.embeddings_input,
        continuous_cols=continuous_cols,
    )
    # # To use TabResnet as the deepdense component simply:
    # deepdense = TabResnet(
    #     blocks_dims=[64, 32],
# Example #3
import numpy as np
import pandas as pd
import pytest
import warnings

from sklearn.datasets import fetch_20newsgroups
from pytorch_widedeep.preprocessing import TextPreprocessor

# Module-level fixture: sample 10 random documents from the 20 newsgroups
# corpus (fetch_20newsgroups downloads the dataset on first use) and fit the
# text preprocessor on them. min_freq=0 keeps every token in the vocabulary.
texts = np.random.choice(fetch_20newsgroups().data, 10)
df = pd.DataFrame({'texts': texts})
processor = TextPreprocessor(min_freq=0)
X_text = processor.fit_transform(df, 'texts')


###############################################################################
# There is not much to test here. I will simply test that the tokenization
# and padding processes went well
###############################################################################
def test_text_processor():
    # Pick one random document index out of the 10 processed at module level.
    idx = int(np.random.choice(np.arange(10), 1))

    original_tokens = processor.tokens[idx]
    # The padded sequence keeps only the LAST `maxlen` tokens, so truncate the
    # original token list the same way before comparing.
    if len(original_tokens) > processor.maxlen:
        original_tokens = original_tokens[-processor.maxlen:]

    padded_sequence = X_text[idx]

    # Map indices back to tokens via the fitted vocab, dropping the padding
    # token ('xxpad').
    recovered_tokens = []
    for t in padded_sequence:
        if processor.vocab.itos[t] != 'xxpad':
            recovered_tokens.append(processor.vocab.itos[t])
    # NOTE(review): the function continues beyond this excerpt — the final
    # assertion comparing original_tokens and recovered_tokens is not visible.