Python align_categories 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: kartothek.io_components.utils

메소드/함수: align_categories

hotexamples.com의 예제 수: 4

Python align_categories - 4개의 예제를 찾았습니다. 오픈소스 프로젝트에서 추출한 Python kartothek.io_components.utils.align_categories의 실제 사용 예제 중 평가가 가장 높은 것들입니다. 예제를 평가해 주시면 예제 품질 향상에 도움이 됩니다.

예제 #1

파일 보기

파일: test_utils.py 프로젝트: stephan-hesselmann-by/kartothek

def test_align_categories():
    """Check ``align_categories`` on three frames with two categorical columns.

    Each input frame carries the columns ``col_A`` and ``col_B`` with partly
    overlapping values.  After alignment every output column must keep its
    original values while using the category list ``<X>1, <X>3, <X>2, <X>4,
    <X>5`` (``<X>`` being the column letter), in exactly that order.
    """
    letters = ("A", "B")
    # Value suffixes per input frame; both columns use the same pattern.
    suffixes_per_frame = [
        ("1", "3", "3"),
        ("2", "3", "4"),
        ("4", "5", "1"),
    ]
    in_dfs = [
        pd.DataFrame(
            {
                "col_" + letter: pd.Categorical([letter + s for s in suffixes])
                for letter in letters
            }
        )
        for suffixes in suffixes_per_frame
    ]

    out_dfs = align_categories(in_dfs, categoricals=["col_A", "col_B"])

    # Category order the aligned columns are expected to share.
    category_suffixes = ("1", "3", "2", "4", "5")
    for letter in letters:
        col_name = "col_" + letter
        expected_categories = [letter + s for s in category_suffixes]
        for position, suffixes in enumerate(suffixes_per_frame):
            expected = pd.Series(
                pd.Categorical(
                    [letter + s for s in suffixes],
                    categories=expected_categories,
                ),
                name=col_name,
            )
            # Index explicitly so a too-short result fails loudly.
            pdt.assert_series_equal(out_dfs[position][col_name], expected)

예제 #2

파일 보기

파일: test_utils.py 프로젝트: stephan-hesselmann-by/kartothek

def test_align_categories_with_missings():
    """Check ``align_categories`` when a column contains a missing value.

    The first frame holds ``NaN`` next to the strings ``"a"`` and ``"b"``; the
    second only holds ``"a"``.  Both outputs are expected to be categorical
    with the categories ``["a", "b"]``, the ``NaN`` staying a missing entry
    rather than becoming a category.
    """
    values_per_frame = [
        ["a", "a", "b", np.nan],
        ["a", "a"],
    ]
    inputs = [pd.DataFrame({"letters": list(values)}) for values in values_per_frame]

    out = align_categories(inputs, ["letters"])

    for position, values in enumerate(values_per_frame):
        expected = pd.DataFrame(
            {"letters": pd.Categorical(values, categories=["a", "b"])}
        )
        pdt.assert_frame_equal(out[position], expected)

예제 #3

파일 보기

    def concat_metapartitions(metapartitions, label_merger=None):
        """Concatenate several metapartitions into a single ``MetaPartition``.

        The ``data`` of all inputs is concatenated with ``pd.concat``; columns
        that are categorical in the first input are aligned with
        ``align_categories`` beforehand.  The schemas are combined through
        ``validate_compatible`` and the labels through
        ``MetaPartition._merge_labels``.

        Parameters
        ----------
        metapartitions
            Iterable of objects exposing ``metadata_version``, ``data``,
            ``schema`` and ``partition_keys``.  It is iterated once here and
            then handed to ``MetaPartition._merge_labels``.
        label_merger
            Passed through unchanged to ``MetaPartition._merge_labels``.

        Returns
        -------
        MetaPartition
            New metapartition carrying the concatenated data, the highest
            ``metadata_version`` seen and the ``partition_keys`` of the last
            input.
        """
        LOGGER.debug("Concatenating metapartitions")

        highest_version = -1
        frames = []
        schemas = []
        for metapartition in metapartitions:
            highest_version = max(highest_version, metapartition.metadata_version)
            frames.append(metapartition.data)
            schemas.append(metapartition.schema)
            # The partition keys of the last input win on purpose: merging
            # MetaPartitions that are not aligned would already fail on
            # mismatching schemas.
            partition_keys = metapartition.partition_keys

        # Only the first frame decides which columns count as categorical.
        categorical_columns = []
        for column_name, column in frames[0].items():
            if pd.api.types.is_categorical_dtype(column):
                categorical_columns.append(column_name)
        if categorical_columns:
            frames = align_categories(frames, categorical_columns)
        combined_df = pd.concat(frames)

        combined_schema = validate_compatible(schemas)

        combined_label = MetaPartition._merge_labels(metapartitions, label_merger)

        return MetaPartition(
            label=combined_label,
            data=combined_df,
            metadata_version=highest_version,
            schema=combined_schema,
            partition_keys=partition_keys,
        )

예제 #4

파일 보기

파일: eager.py 프로젝트: trucnguyenlam/kartothek

def read_table(
    dataset_uuid=None,
    store=None,
    table=SINGLE_TABLE,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    predicates=None,
    factory=None,
):
    """
    A utility function to load a single table with multiple partitions as a single dataframe in one go.
    Mostly useful for smaller tables or datasets where all partitions fit into memory.

    The order of partitions is not guaranteed to be stable in the resulting dataframe.

    Parameters
    ----------
    dataset_uuid
        Identifier of the dataset; forwarded to ``_ensure_factory``.
    store
        Store holding the dataset; wrapped with ``_make_callable`` and
        forwarded to ``_ensure_factory``.
    table : str
        Name of the table to load. Defaults to ``SINGLE_TABLE``.
    columns
        Columns to load. Normalized with ``_check_compatible_list``; the
        normalized value is indexed by ``table`` when it is not ``None``.
    concat_partitions_on_primary_index
        Deprecated. Any value other than ``False`` emits a
        ``DeprecationWarning``; the value is still forwarded to
        ``read_dataset_as_dataframes``.
    predicate_pushdown_to_io
        Forwarded unchanged to ``read_dataset_as_dataframes``.
    categoricals
        Columns to treat as categoricals. Normalized with
        ``_check_compatible_list``; when non-empty, the entry for ``table`` is
        used to align the categories of all partitions before concatenation.
    label_filter
        Forwarded unchanged to ``read_dataset_as_dataframes``.
    dates_as_object
        Forwarded unchanged to ``read_dataset_as_dataframes``.
    predicates
        Forwarded unchanged to ``read_dataset_as_dataframes``.
    factory
        Existing dataset factory; forwarded to ``_ensure_factory``.

    Returns
    -------
    pandas.DataFrame
        Returns a pandas.DataFrame holding the data of the requested columns

    Raises
    ------
    TypeError
        If ``table`` is not a string.

    Examples
    --------
    Dataset in store contains two partitions with two files each

    .. code ::

        >>> import storefact
        >>> from kartothek.io.eager import read_table

        >>> store = storefact.get_store_from_url('s3://bucket_with_dataset')

        >>> df = read_table('dataset_uuid', store, table='core')

    """
    if concat_partitions_on_primary_index is not False:
        warnings.warn(
            "The keyword `concat_partitions_on_primary_index` is deprecated and will be removed in the next major release.",
            DeprecationWarning,
            # Attribute the warning to the caller's line, not to this module.
            stacklevel=2,
        )

    if not isinstance(table, str):
        raise TypeError("Argument `table` needs to be a string")

    columns = _check_compatible_list(table, columns, "columns")
    categoricals = _check_compatible_list(table, categoricals, "categoricals")

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=_make_callable(store),
        factory=factory,
        load_dataset_metadata=False,
    )
    partitions = read_dataset_as_dataframes(
        tables=[table],
        columns=columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals=categoricals,
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
        factory=ds_factory,
    )

    # An empty frame built from the stored schema is appended to the partition
    # frames; its columns also define the final column order below.
    # NOTE(review): presumably this keeps the schema's columns/dtypes even when
    # no partition is returned -- confirm.
    empty_df = empty_dataframe_from_schema(
        schema=ds_factory.table_meta[table],
        columns=columns[table] if columns is not None else None,
    )
    dfs = [partition_data[table] for partition_data in partitions] + [empty_df]
    # require meta 4 otherwise, can't construct types/columns
    if categoricals:
        dfs = align_categories(dfs, categoricals[table])
    df = pd.concat(dfs, ignore_index=True, sort=False)

    # ensure column order
    if len(empty_df.columns) > 0:
        df = df.reindex(empty_df.columns, copy=False, axis=1)

    return df