예제 #1
0
def test_align_categories():
    """Check that ``align_categories`` gives all frames one shared category list.

    Three frames with two categorical columns each (``col_A`` and ``col_B``)
    are aligned. Every output column must keep its original values while
    carrying the categories ``<prefix>1, <prefix>3, <prefix>2, <prefix>4,
    <prefix>5`` in exactly that order.
    """
    prefixes = ("A", "B")
    # Value suffixes per input frame, in row order. The expected output values
    # are identical to the input values; only the categories change.
    suffixes_per_frame = [
        ("1", "3", "3"),
        ("2", "3", "4"),
        ("4", "5", "1"),
    ]
    category_suffixes = ("1", "3", "2", "4", "5")

    def build_frame(suffixes):
        # One categorical column per prefix, e.g. col_A -> ["A1", "A3", "A3"].
        return pd.DataFrame(
            {
                "col_" + prefix: pd.Categorical(
                    [prefix + suffix for suffix in suffixes]
                )
                for prefix in prefixes
            }
        )

    in_dfs = [build_frame(suffixes) for suffixes in suffixes_per_frame]

    out_dfs = align_categories(in_dfs, categoricals=["col_A", "col_B"])

    for prefix in prefixes:
        col_name = "col_" + prefix
        expected_categories = [prefix + suffix for suffix in category_suffixes]
        for position, suffixes in enumerate(suffixes_per_frame):
            expected = pd.Series(
                pd.Categorical(
                    [prefix + suffix for suffix in suffixes],
                    categories=expected_categories,
                ),
                name=col_name,
            )
            pdt.assert_series_equal(out_dfs[position][col_name], expected)
예제 #2
0
def test_align_categories_with_missings():
    """Check ``align_categories`` on a column that contains a missing value.

    The first frame holds ``"a"``, ``"b"`` and a NaN, the second only ``"a"``.
    Both outputs are expected to be categorical with categories
    ``["a", "b"]``; the NaN stays a missing value and is not a category.
    """
    frames = [
        pd.DataFrame({"letters": ["a", "a", "b", np.nan]}),
        pd.DataFrame({"letters": ["a", "a"]}),
    ]

    out = align_categories(frames, ["letters"])

    shared_categories = ["a", "b"]
    expected_values = [["a", "a", "b", np.nan], ["a", "a"]]
    expected_frames = [
        pd.DataFrame(
            {"letters": pd.Categorical(values, categories=shared_categories)}
        )
        for values in expected_values
    ]
    for position, expected in enumerate(expected_frames):
        pdt.assert_frame_equal(out[position], expected)
예제 #3
0
    def concat_metapartitions(metapartitions, label_merger=None):
        """
        Concatenate several metapartitions into one new ``MetaPartition``.

        The ``data`` of all inputs is concatenated with ``pd.concat``. Columns
        that are categorical in the first input are passed through
        ``align_categories`` beforehand. The schemas are combined with
        ``validate_compatible`` and the labels with
        ``MetaPartition._merge_labels``.

        Parameters
        ----------
        metapartitions
            Iterable of objects exposing ``metadata_version``, ``data``,
            ``schema`` and ``partition_keys``. At least one element is
            required; an empty input fails when the first data object is
            accessed.
        label_merger
            Forwarded unchanged to ``MetaPartition._merge_labels``.

        Returns
        -------
        MetaPartition
            Carries the merged label, the concatenated data, the highest
            metadata version seen, the combined schema and the partition keys
            of the last input.
        """
        LOGGER.debug("Concatenating metapartitions")

        frames = []
        schemas = []
        highest_version = -1
        for metapartition in metapartitions:
            highest_version = max(highest_version, metapartition.metadata_version)
            frames.append(metapartition.data)
            schemas.append(metapartition.schema)
            # The partition keys are simply taken from the last input. Merging
            # MetaPartitions that are not aligned fails anyway because their
            # schemas won't match.
            partition_keys = metapartition.partition_keys

        # Only the first data object decides which columns count as
        # categorical.
        categorical_columns = []
        for column, values in frames[0].items():
            if pd.api.types.is_categorical_dtype(values):
                categorical_columns.append(column)

        if categorical_columns:
            frames = align_categories(frames, categorical_columns)
        combined_df = pd.concat(frames)

        combined_schema = validate_compatible(schemas)
        merged_label = MetaPartition._merge_labels(metapartitions, label_merger)

        return MetaPartition(
            label=merged_label,
            data=combined_df,
            metadata_version=highest_version,
            schema=combined_schema,
            partition_keys=partition_keys,
        )
예제 #4
0
def read_table(
    dataset_uuid=None,
    store=None,
    table=SINGLE_TABLE,
    columns=None,
    concat_partitions_on_primary_index=False,
    predicate_pushdown_to_io=True,
    categoricals=None,
    label_filter=None,
    dates_as_object=False,
    predicates=None,
    factory=None,
):
    """
    A utility function to load a single table with multiple partitions as a single dataframe in one go.
    Mostly useful for smaller tables or datasets where all partitions fit into memory.

    The order of partitions is not guaranteed to be stable in the resulting dataframe.

    Parameters
    ----------

    Returns
    -------
    pandas.DataFrame
        Returns a pandas.DataFrame holding the data of the requested columns

    Raises
    ------
    TypeError
        If ``table`` is not a string.

    Warns
    -----
    DeprecationWarning
        If ``concat_partitions_on_primary_index`` is anything but ``False``.

    Notes
    -----
    ``dataset_uuid``, ``store`` and ``factory`` are used to obtain the dataset
    factory. ``columns`` and ``categoricals`` are normalized per table before
    use. All remaining arguments are forwarded unchanged to
    ``read_dataset_as_dataframes``.

    An empty dataframe built from the table schema is appended to the loaded
    partitions before concatenation. The result gets a fresh index and, if
    that empty dataframe has any columns, follows its column order.

    Examples
    --------
    Dataset in store contains two partitions with two files each

    .. code ::

        >>> import storefact
        >>> from kartothek.io.eager import read_table

        >>> store = storefact.get_store_from_url('s3://bucket_with_dataset')

        >>> df = read_table(dataset_uuid='dataset_uuid', store=store, table='core')

    """
    if concat_partitions_on_primary_index is not False:
        # stacklevel=2 attributes the warning to the caller's line, which is
        # the code that has to change.
        warnings.warn(
            "The keyword `concat_partitions_on_primary_index` is deprecated and will be removed in the next major release.",
            DeprecationWarning,
            stacklevel=2,
        )

    if not isinstance(table, str):
        raise TypeError("Argument `table` needs to be a string")

    columns = _check_compatible_list(table, columns, "columns")
    categoricals = _check_compatible_list(table, categoricals, "categoricals")

    ds_factory = _ensure_factory(
        dataset_uuid=dataset_uuid,
        store=_make_callable(store),
        factory=factory,
        load_dataset_metadata=False,
    )
    partitions = read_dataset_as_dataframes(
        tables=[table],
        columns=columns,
        concat_partitions_on_primary_index=concat_partitions_on_primary_index,
        predicate_pushdown_to_io=predicate_pushdown_to_io,
        categoricals=categoricals,
        label_filter=label_filter,
        dates_as_object=dates_as_object,
        predicates=predicates,
        factory=ds_factory,
    )

    # Schema-derived empty frame: guarantees pd.concat always receives at
    # least one frame and serves as the reference for the column order below.
    empty_df = empty_dataframe_from_schema(
        schema=ds_factory.table_meta[table],
        columns=columns[table] if columns is not None else None,
    )
    dfs = [partition_data[table] for partition_data in partitions] + [empty_df]
    # require meta 4 otherwise, can't construct types/columns
    if categoricals:
        dfs = align_categories(dfs, categoricals[table])
    df = pd.concat(dfs, ignore_index=True, sort=False)

    # ensure column order
    if len(empty_df.columns) > 0:
        df = df.reindex(empty_df.columns, copy=False, axis=1)

    return df