Python Workflow.fit_schema 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: nvtabular

클래스/타입: Workflow

메소드/함수: fit_schema

hotexamples.com에서의 예제들: 8

Python Workflow.fit_schema - 예제 8개를 찾았습니다. 아래는 오픈소스 프로젝트에서 추출한 Python nvtabular.Workflow.fit_schema의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질을 개선하는 데 도움이 됩니다.

자주 사용되는 메소드들

보기 숨기기

Workflow(30)

transform(16)

fit(14)

fit_transform(13)

apply(11)

add_preprocess(10)

finalize(9)

fit_schema(8)

get_ddf(6)

load(6)

add_feature(4)

save(4)

add_cat_feature(1)

add_cat_preprocess(1)

add_cont_feature(1)

update_stats(1)

예제 #1

0

파일 보기

파일: test_workflow_schemas.py 프로젝트: thibaultcharrin/NVTabular

def test_fit_schema_works_with_raw_column_dependencies():
    """Fitting a schema resolves a target column that is given by raw name.

    ``TargetEncoding("cost")`` refers to ``cost`` only by its name; the test
    checks that ``fit_schema`` produces one ``TE_<col>_cost`` output column
    for each selected input column.
    """
    input_schema = Schema(["x", "y", "cost"])

    selector = ColumnSelector(["x", "y"])
    target_encoded = selector >> ops.TargetEncoding("cost")

    wf = Workflow(target_encoded)
    wf.fit_schema(input_schema)

    expected = ["TE_x_cost", "TE_y_cost"]
    assert wf.output_schema.column_names == expected

예제 #2

0

파일 보기

파일: test_workflow_schemas.py 프로젝트: thibaultcharrin/NVTabular

def test_fit_schema_works_when_subtracting_column_names():
    """Subtracting an output column name removes it from the fitted schema.

    A pipeline over ``x`` and ``y`` ends in a rename with the ``_renamed``
    postfix; subtracting ``"y_renamed"`` from it is expected to leave only
    ``x_renamed`` in the workflow's output schema.
    """
    input_schema = Schema(["x", "y", "id"])

    # Build the chain one step at a time; the operators are applied in the
    # same left-to-right order as a single chained ``>>`` expression.
    pipeline = ColumnSelector(["x", "y"])
    pipeline = pipeline >> ops.FillMissing()
    pipeline = pipeline >> ops.Clip(min_value=0)
    pipeline = pipeline >> ops.LogOp
    pipeline = pipeline >> ops.Normalize()
    pipeline = pipeline >> ops.Rename(postfix="_renamed")

    without_y = pipeline - "y_renamed"
    wf = Workflow(without_y)
    wf.fit_schema(input_schema)

    assert wf.output_schema.column_names == ["x_renamed"]

예제 #3

0

파일 보기

파일: test_workflow_schemas.py 프로젝트: thibaultcharrin/NVTabular

def test_fit_schema_works_with_grouped_node_inputs():
    """Fitting a schema handles a selector that contains a column group.

    The selector holds ``x``, ``y`` and the grouped pair ``("x", "y")``; after
    ``TargetEncoding("cost")`` the output schema is expected to carry one
    ``TE_*_cost`` column per entry. Both sides are sorted before comparing,
    so column order is not asserted.
    """
    input_schema = Schema(["x", "y", "cost"])

    selector = ColumnSelector(["x", "y", ("x", "y")])
    target_encoded = selector >> ops.TargetEncoding("cost")

    wf = Workflow(target_encoded)
    wf.fit_schema(input_schema)

    expected = ["TE_x_cost", "TE_y_cost", "TE_x_y_cost"]
    assert sorted(wf.output_schema.column_names) == sorted(expected)

예제 #4

0

파일 보기

파일: test_workflow_schemas.py 프로젝트: thibaultcharrin/NVTabular

def test_workflow_select_by_tags(op):
    """Selecting columns by tag feeds the tagged columns into ``op``.

    Only ``col1`` and ``col2`` carry the tag ``"c"``, so the fitted workflow
    is expected to emit as many columns as ``op`` reports for those two
    inputs.
    """
    tagged_columns = [
        ColumnSchema("col1", tags=["b", "c", "d"]),
        ColumnSchema("col2", tags=["c", "d"]),
        ColumnSchema("col3", tags=["d"]),
    ]
    input_schema = Schema(tagged_columns)

    tagged_features = ColumnSelector(tags=["c"]) >> op
    wf = Workflow(tagged_features)
    wf.fit_schema(input_schema)

    expected = op.output_column_names(ColumnSelector(["col1", "col2"]))
    assert len(wf.output_schema.column_names) == len(expected.names)

예제 #5

0

파일 보기

파일: test_workflow_schemas.py 프로젝트: thibaultcharrin/NVTabular

def test_fit_schema():
    """``fit_schema`` propagates column names through a chain of operators.

    Every input column passes through the pipeline and ends with the
    ``_renamed`` postfix, in the same order as the input schema.
    """
    input_schema = Schema(["x", "y", "id"])

    # Build the chain one step at a time; the operators are applied in the
    # same left-to-right order as a single chained ``>>`` expression.
    pipeline = ColumnSelector(input_schema.column_names)
    pipeline = pipeline >> ops.FillMissing()
    pipeline = pipeline >> ops.Clip(min_value=0)
    pipeline = pipeline >> ops.LogOp
    pipeline = pipeline >> ops.Normalize()
    pipeline = pipeline >> ops.Rename(postfix="_renamed")

    wf = Workflow(pipeline)
    wf.fit_schema(input_schema)

    expected = ["x_renamed", "y_renamed", "id_renamed"]
    assert wf.output_schema.column_names == expected

예제 #6

0

파일 보기

파일: test_workflow_schemas.py 프로젝트: thibaultcharrin/NVTabular

def test_fit_schema_works_with_node_dependencies():
    """Fitting a schema resolves a target that is itself a workflow node.

    The target handed to ``TargetEncoding`` is the renamed ``cost`` node, so
    the expected output names embed ``cost_renamed`` rather than ``cost``.
    """
    input_schema = Schema(["x", "y", "cost"])

    renamed_cost = ColumnSelector(["cost"]) >> ops.Rename(postfix="_renamed")
    selector = ColumnSelector(["x", "y"])
    target_encoded = selector >> ops.TargetEncoding(renamed_cost)

    wf = Workflow(target_encoded)
    wf.fit_schema(input_schema)

    expected = ["TE_x_cost_renamed", "TE_y_cost_renamed"]
    assert wf.output_schema.column_names == expected

예제 #7

0

파일 보기

파일: test_workflow_schemas.py 프로젝트: thibaultcharrin/NVTabular

def test_fit_schema_works_with_addition_nodes():
    """``fit_schema`` handles nodes combined with ``+``.

    Covers two cases: a node plus a raw column name, and a node plus another
    node. In both, the output columns are expected in left-to-right order.
    """
    input_schema = Schema(["x", "y", "id"])

    # Case 1: node + raw column name.
    renamed_x = ColumnSelector(["x"]) >> ops.Rename(postfix="_renamed")
    node_plus_name = Workflow(renamed_x + "y")
    node_plus_name.fit_schema(input_schema)

    assert node_plus_name.output_schema.column_names == ["x_renamed", "y"]

    # Case 2: node + node, both built fresh for this case.
    left = ColumnSelector(["x"]) >> ops.Rename(postfix="_renamed")
    right = ColumnSelector(["y"]) >> ops.Rename(postfix="_renamed")
    node_plus_node = Workflow(left + right)
    node_plus_node.fit_schema(input_schema)

    expected = ["x_renamed", "y_renamed"]
    assert node_plus_node.output_schema.column_names == expected

예제 #8

0

파일 보기

def test_nested_workflow_node():
    """Categorify with ``encode_type="combo"`` works on a nested column group.

    Builds a ``geo_country`` column from the first two characters of ``geo``,
    then categorifies ``geo_country``, ``user`` and the crossed
    ``geo_country``/``user`` group, and checks that equal inputs receive equal
    codes. Finally verifies that nesting column groups one level deeper
    raises ``ValueError``.
    """
    df = dispatch._make_df({
        "geo": ["US>CA", "US>NY", "CA>BC", "CA>ON"],
        "user": ["User_A", "User_A", "User_A", "User_B"],
    })
    dataset = Dataset(df)

    geo_selector = ColumnSelector(["geo"])
    country = (geo_selector >> LambdaOp(lambda col: col.str.slice(0, 2)) >>
               Rename(postfix="_country"))
    # Must name the "user" column of the frame above: the assertions below
    # read the "user" and "geo_country_user" output columns.
    user = "user"

    # make sure we can do a 'combo' categorify (cross based) of country+user
    # as well as categorifying the country and user columns on their own
    cats = country + user + [country + user] >> Categorify(encode_type="combo")

    workflow = Workflow(cats)
    workflow.fit_schema(dataset.infer_schema())

    df_out = workflow.fit_transform(dataset).to_ddf().compute(
        scheduler="synchronous")

    geo_country = df_out["geo_country"]
    assert geo_country[0] == geo_country[1]  # rows 0,1 are both 'US'
    assert geo_country[2] == geo_country[3]  # rows 2,3 are both 'CA'

    # Separate name so the encoded series does not shadow the column name.
    user_codes = df_out["user"]
    assert user_codes[0] == user_codes[1] == user_codes[2]
    assert user_codes[3] != user_codes[2]

    geo_country_user = df_out["geo_country_user"]
    assert geo_country_user[0] == geo_country_user[1]  # US / userA
    assert geo_country_user[2] != geo_country_user[0]  # same user but in canada

    # make sure we get an exception if we nest too deeply (can't handle arbitrarily deep
    # nested column groups - and the exceptions we would get in operators like Categorify
    # are super confusing for users)
    with pytest.raises(ValueError):
        cats = [[country + "user"] + country + "user"
                ] >> Categorify(encode_type="combo")