示例#1
0
def quick_concat(dfs, dimension_columns, partition_columns):
    """
    Concatenate DataFrames and sort the rows by dimension and partition columns.

    For presorted inputs this is a fast replacement for::

        pd.concat(
            dfs,
            ignore_index=True,
            sort=False,
        ).sort_values(dimension_columns + partition_columns).reset_index(drop=True)

    Parameters
    ----------
    dfs: Iterable[pandas.DataFrame]
        DataFrames to concat.
    dimension_columns: Iterable[str]
        Dimension columns in correct order; they form the leading part of
        the sort key.
    partition_columns: Iterable[str]
        Partition columns in correct order; they are appended to the sort
        key after the dimension columns.

    Returns
    -------
    df: pandas.DataFrame
        Concatenated result.
    """
    # Concatenate before touching the column arguments so the evaluation
    # order is the same as in a single nested call.
    combined = concat_dataframes(dfs)

    sort_key = list(dimension_columns)
    sort_key.extend(partition_columns)

    return sort_dataframe(df=combined, columns=sort_key)
示例#2
0
    def test_many(self, dummy_default, maybe_iter):
        """Concatenate three frames and compare against one default-indexed frame."""
        # Each entry: (values of "a", scalar broadcast into "b", index).
        # ``None`` as index is the constructor default.
        frame_specs = [
            ([0, 1], 1.0, [10, 11]),
            ([2, 3], 2.0, [10, 11]),
            ([4, 5], 3.0, None),
        ]
        dfs = [
            pd.DataFrame(
                data={"a": a_values, "b": b_value},
                columns=["a", "b"],
                index=index,
            )
            for a_values, b_value, index in frame_specs
        ]

        # The input indices (10, 11) must not show up in the result: the
        # expected frame carries the default index.
        expected = pd.DataFrame(
            {
                "a": [0, 1, 2, 3, 4, 5],
                "b": [1.0, 1.0, 2.0, 2.0, 3.0, 3.0],
            },
            columns=["a", "b"],
        )

        actual = concat_dataframes(maybe_iter(dfs), dummy_default)
        pdt.assert_frame_equal(actual, expected)
示例#3
0
    def test_default(self, maybe_iter):
        """With no input frames, the result must equal the supplied default frame."""
        fallback = pd.DataFrame(
            data={"a": [0, 1], "b": 1.0},
            columns=["a", "b"],
            index=[10, 11],
        )

        result = concat_dataframes(maybe_iter([]), fallback)
        pdt.assert_frame_equal(result, fallback)
示例#4
0
def _normalize_user_input(data, cube):
    """
    Bring user-provided data into a ``{key: DataFrame}`` mapping.

    Parameters
    ----------
    data
        Either a single ``dict`` / ``pandas.DataFrame`` or an iterable of
        parts. Every part is passed through ``multiplex_user_input``.
    cube
        Forwarded unchanged to ``multiplex_user_input``.

    Returns
    -------
    dict
        For each key produced by ``multiplex_user_input``, the result of
        ``concat_dataframes`` over all non-``None`` values collected for
        that key, in the order the parts were given.
    """
    # A lone dict/DataFrame is treated like a one-element collection; any
    # other iterable is materialized up front.
    parts = [data] if isinstance(data, (dict, pd.DataFrame)) else list(data)

    grouped = {}
    for raw_part in parts:
        multiplexed = multiplex_user_input(raw_part, cube)
        for key, frame in multiplexed.items():
            grouped.setdefault(key, []).append(frame)

    normalized = {}
    for key, frames in grouped.items():
        # ``None`` entries are dropped before concatenation.
        present = [frame for frame in frames if frame is not None]
        normalized[key] = concat_dataframes(present)
    return normalized
示例#5
0
def _load_all_mps(mps, store, load_columns, predicates, empty):
    """
    Load the ``SINGLE_TABLE`` data of every given MetaPartition and concatenate it.

    Parameters
    ----------
    mps: Iterable[MetaPartition]
        MetaPartitions to load.
    store: simplekv.KeyValueStore
        Store to load data from.
    load_columns: List[str]
        Columns to load; they are requested in sorted order.
    predicates: Optional[List[List[Tuple[str, str, Any]]]]
        Predicates to apply during load.
    empty: pandas.DataFrame
        Empty Dataframe dummy, handed to ``concat_dataframes`` as its second
        argument.

    Returns
    -------
    df: pandas.DataFrame
        Concatenated data.
    """
    frames = []
    for metapartition in mps:
        loaded = metapartition.load_dataframes(
            store=store,
            predicate_pushdown_to_io=True,
            tables=[SINGLE_TABLE],
            columns={SINGLE_TABLE: sorted(load_columns)},
            predicates=predicates,
        )
        table_df = loaded.data[SINGLE_TABLE]
        # Column labels are passed through ``converter_str`` before the
        # frame is collected.
        table_df.columns = table_df.columns.map(converter_str)
        frames.append(table_df)

    return concat_dataframes(frames, empty)
示例#6
0
 def test_fail_no_default(self, maybe_iter):
     """Concatenating nothing with ``None`` as default must raise ``ValueError``."""
     with pytest.raises(ValueError) as excinfo:
         concat_dataframes(maybe_iter([]), None)

     message = str(excinfo.value)
     assert message == "Cannot concatenate 0 dataframes."
示例#7
0
 def test_fail_different_colsets(self, maybe_iter):
     """Frames with differing column sets must be rejected with ``ValueError``."""
     only_a = pd.DataFrame({"a": [1]})
     a_and_b = pd.DataFrame({"a": [1], "b": [2]})
     dfs = [only_a, a_and_b]

     expected_message = "Not all DataFrames have the same set of columns!"
     with pytest.raises(ValueError, match=expected_message):
         concat_dataframes(maybe_iter(dfs))
示例#8
0
 def test_no_columns(self, dfs, expected):
     """Concatenating ``dfs`` must produce a frame equal to ``expected``.

     NOTE(review): both arguments are supplied from outside this method;
     judging by the test name they are presumably column-less frames --
     confirm against the parametrization.
     """
     pdt.assert_frame_equal(concat_dataframes(dfs), expected)
示例#9
0
 def test_whipe_list(self, dfs):
     """After ``concat_dataframes`` ran, the list passed in must be empty."""
     # The return value is irrelevant here; only the effect on ``dfs`` is checked.
     concat_dataframes(dfs)

     leftover = dfs
     assert leftover == []