Пример #1
0
    def test_pivot_table_dropna(self):
        df = DataFrame({'amount': {0: 60000, 1: 100000, 2: 50000, 3: 30000},
                        'customer': {0: 'A', 1: 'A', 2: 'B', 3: 'C'},
                        'month': {0: 201307, 1: 201309, 2: 201308, 3: 201310},
                        'product': {0: 'a', 1: 'b', 2: 'c', 3: 'd'},
                        'quantity': {0: 2000000, 1: 500000,
                                     2: 1000000, 3: 1000000}})
        pv_col = df.pivot_table('quantity', 'month', [
                                'customer', 'product'], dropna=False)
        pv_ind = df.pivot_table(
            'quantity', ['customer', 'product'], 'month', dropna=False)

        m = MultiIndex.from_tuples([(u('A'), u('a')),
                                    (u('A'), u('b')),
                                    (u('A'), u('c')),
                                    (u('A'), u('d')),
                                    (u('B'), u('a')),
                                    (u('B'), u('b')),
                                    (u('B'), u('c')),
                                    (u('B'), u('d')),
                                    (u('C'), u('a')),
                                    (u('C'), u('b')),
                                    (u('C'), u('c')),
                                    (u('C'), u('d'))])

        assert_equal(pv_col.columns.values, m.values)
        assert_equal(pv_ind.index.values, m.values)
Пример #2
0
    def test_to_html_regression_GH6098(self):
        df = DataFrame({
            u('clé1'): [u('a'), u('a'), u('b'), u('b'), u('a')],
            u('clé2'): [u('1er'), u('2ème'), u('1er'), u('2ème'), u('1er')],
            'données1': np.random.randn(5),
            'données2': np.random.randn(5)})

        # it works
        df.pivot_table(index=[u('clé1')], columns=[u('clé2')])._repr_html_()
Пример #3
0
    def test_pivot_table_nocols(self):
        df = DataFrame({"rows": ["a", "b", "c"], "cols": ["x", "y", "z"], "values": [1, 2, 3]})
        rs = df.pivot_table(columns="cols", aggfunc=np.sum)
        xp = df.pivot_table(index="cols", aggfunc=np.sum).T
        tm.assert_frame_equal(rs, xp)

        rs = df.pivot_table(columns="cols", aggfunc={"values": "mean"})
        xp = df.pivot_table(index="cols", aggfunc={"values": "mean"}).T
        tm.assert_frame_equal(rs, xp)
Пример #4
0
def test_to_html_regression_GH6098():
    df = DataFrame({
        'clé1': ['a', 'a', 'b', 'b', 'a'],
        'clé2': ['1er', '2ème', '1er', '2ème', '1er'],
        'données1': np.random.randn(5),
        'données2': np.random.randn(5)})

    # it works
    df.pivot_table(index=['clé1'], columns=['clé2'])._repr_html_()
Пример #5
0
    def test_pivot_table_nocols(self):
        df = DataFrame({'rows': ['a', 'b', 'c'],
                        'cols': ['x', 'y', 'z'],
                        'values': [1,2,3]})
        rs = df.pivot_table(columns='cols', aggfunc=np.sum)
        xp = df.pivot_table(index='cols', aggfunc=np.sum).T
        tm.assert_frame_equal(rs, xp)

        rs = df.pivot_table(columns='cols', aggfunc={'values': 'mean'})
        xp = df.pivot_table(index='cols', aggfunc={'values': 'mean'}).T
        tm.assert_frame_equal(rs, xp)
Пример #6
0
def get_heatmap_data(ans: pd.DataFrame):
    """Sorts the values for the heatmap to make it more intuitive and readable
    Expects a dataframe `ans` containing processed (clean) barcode read counts data
    Expects a dataframe `flag`"""
    # generate heatmap matrix of (logged) read counts per sample per paired read
    hmap = (ans.pivot_table(index=["sample"],
                            columns=["paired_read"],
                            values="paired_read_count").replace(
                                [np.inf, -np.inf], np.nan).fillna(0))
    # drop unknown-unknown reads only
    hmap = hmap.drop(columns='unknown-unknown')
    # sort values to make plot more intuitive
    hmap['max_idx'] = hmap.apply(
        lambda x: hmap.columns.tolist().index(x.idxmax()), axis=1)
    hmap = hmap.sort_values('max_idx').drop(columns=['max_idx'])
    # grab read counts
    counts = hmap.values
    # prepare data for identifying potential contaminants
    flag = (ans.groupby('sample').agg(
        uniq_forward_bcodes=('forward_barcode', get_unique_barcodes),
        uniq_reverse_bcodes=('reverse_barcode', get_unique_barcodes)))

    # create boolean column that identifies potential contamination
    flag['contamination'] = flag.apply(is_contaminant, axis=1)
    # merge with original data to include the contamination flags
    contaminants_flag = (hmap.join(flag, how='inner')['contamination'].apply(
        lambda x: np.where(x == True, 1.0, 0.0)))
    # add contaminant flag column to the read counts
    data = np.hstack((counts, contaminants_flag[:, np.newaxis]))
    # list of all sample IDs
    x = hmap.index.values
    # list of all paired reads and an extra flag column for contamination
    y = hmap.columns.tolist()
    return hmap, data, x, y
Пример #7
0
def pivot_party_votes_df(df: pd.DataFrame):
    return df.pivot_table(
        index=["Fraktion/Gruppe", "date", "title"],
        columns="vote",
        values="fraction",
        fill_value=0,
    ).reset_index(level="Fraktion/Gruppe")
def barplot(df: pd.DataFrame, column: str, show: bool, save_location: str):
    plt.figure(figsize=(20, 20))
    df = df.pivot_table(index=[column], aggfunc="size").sort_values()
    sns.barplot(x=df.index, y=df.values)
    plt.savefig(save_location)
    if show:
        plt.show()
    def upload_data_long_format_as_single_data_set(
        self,
        data: pd.DataFrame,
        name: str,
        cross_section_column_name: str,
        date_column_name: str,
        replace_missing_values: bool = True,
        forward_fill_missing_values: bool = False,
    ) -> IndividualDataset:
        """
        Uploads long format data into Horizon. The data frame should have a date column, with a numeric index.

        :param data: The dataset in a pandas data frame. Must have a valid date column.
        :param name: Name of the data set to be uploaded
        :param cross_section_column_name: The identifier column that groups the records
        :param date_column_name: The column name of the date index.
        :param forward_fill_missing_values: Forward-fill missing values
        :param replace_missing_values: Replace missing values
        :return: A summary of the uploaded data set.
        :param encode_categorical_data: Categorically encode data that is non-numeric
        :param max_categories: Maximum number of categories per series.
        """

        df = data.pivot_table(columns=cross_section_column_name,
                              index=date_column_name)
        df.reset_index(inplace=True)
        df.columns = ["/".join(column) for column in df.columns]

        return self.upload_data(
            data=df,
            name=name,
            forward_fill_missing_values=forward_fill_missing_values,
            replace_missing_values=replace_missing_values,
        )
Пример #10
0
    def __pivot_table(self, data: pd.DataFrame) -> pd.DataFrame:
        """
        Pivot the thc result by session/topic in rows and team in columns.
        """
        try:
            pivot_table = data.pivot_table(
                index=["session", "topic"],
                columns=["team_name"],
                values="result",
                aggfunc=lambda x: x,
            )
        except AttributeError as e:
            logging.error(
                f"Error creating pivot table for THC result. The error was {e}."
            )
            pivot_table = pd.DataFrame()
        except ValueError as e:
            logging.error(
                "Error creating pivot table for THC result."
                f"Possibly the database is inconsistent. Error was {e}.")
            # TODO: This raises a ValueError: Function does not reduce in case
            # the database has non-unique entries. Can we recover from this?
            pivot_table = pd.DataFrame()
        else:
            pivot_table.fillna(THCResult.NoResult, inplace=True)

        return pivot_table
Пример #11
0
def getPivotMedian(dataFrame:pandas.DataFrame,valueColumns,indexColumns):
    """
    Return pivot meadian dataframe of source dataframe
    """
    if dataFrame is None:
        return None
    return dataFrame.pivot_table(values=valueColumns,index=indexColumns,aggfunc=numpy.median)
Пример #12
0
    def test_type_error_multiindex(self):
        # See gh-12218
        df = DataFrame(
            columns=["i", "c", "x", "y"],
            data=[[0, 0, 1, 2], [1, 0, 3, 4], [0, 1, 1, 2], [1, 1, 3, 4]],
        )
        dg = df.pivot_table(index="i", columns="c", values=["x", "y"])
        # TODO: Is this test for pivot_table?
        with pytest.raises(TypeError, match="unhashable type"):
            dg[:, 0]

        index = Index(range(2), name="i")
        columns = MultiIndex(
            levels=[["x", "y"], [0, 1]], codes=[[0, 1], [0, 0]], names=[None, "c"]
        )
        expected = DataFrame([[1, 2], [3, 4]], columns=columns, index=index)

        result = dg.loc[:, (slice(None), 0)]
        tm.assert_frame_equal(result, expected)

        name = ("x", 0)
        index = Index(range(2), name="i")
        expected = Series([1, 3], index=index, name=name)

        result = dg["x", 0]
        tm.assert_series_equal(result, expected)
Пример #13
0
def _reshape_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Reshape a DataFrame from long to wide form, adhering to the following
    rules:

    - If the `meta_date` column exists, replace `variable` column with
      {variable}_{meta_date} and then drop `meta_date`
    - Construct a pivot_table where the columns come from the `variable`
      column, values come from the `value` column, and all other columns are
      used as an index
    """
    if df.shape[0] == 0:
        # empty dataframe
        return df

    cols = list(df)
    for c in ["variable", "value"]:
        if c not in cols:
            gh_issues = "https://github.com/valorumdata/cmdc.py/issues/new"
            msg = (f"Column {c} not found in DataFrame. "
                   f"Please report a bug at {gh_issues}")
            raise ValueError(msg)
    if "meta_date" in cols:
        if "variable" in cols:
            df["variable"] = (df["variable"].astype(str) + "_" +
                              df["meta_date"].astype(str))
            df.drop("meta_date", axis="columns")

    idx = list(set(cols) - {"variable", "value"})
    return df.pivot_table(index=idx, columns="variable",
                          values="value").reset_index()
Пример #14
0
    def test_drop_multiindex_not_lexsorted(self):
        # GH#11640

        # define the lexsorted version
        lexsorted_mi = MultiIndex.from_tuples([("a", ""), ("b1", "c1"),
                                               ("b2", "c2")],
                                              names=["b", "c"])
        lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
        assert lexsorted_df.columns.is_lexsorted()

        # define the non-lexsorted version
        not_lexsorted_df = DataFrame(columns=["a", "b", "c", "d"],
                                     data=[[1, "b1", "c1", 3],
                                           [1, "b2", "c2", 4]])
        not_lexsorted_df = not_lexsorted_df.pivot_table(index="a",
                                                        columns=["b", "c"],
                                                        values="d")
        not_lexsorted_df = not_lexsorted_df.reset_index()
        assert not not_lexsorted_df.columns.is_lexsorted()

        # compare the results
        tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

        expected = lexsorted_df.drop("a", axis=1)
        with tm.assert_produces_warning(PerformanceWarning):
            result = not_lexsorted_df.drop("a", axis=1)

        tm.assert_frame_equal(result, expected)
Пример #15
0
    def _aggregate_proj_points_data(self,
                                    df_score: pd.DataFrame) -> pd.DataFrame:
        """ Returns the Projected Points data aggregated to the Season/League/Week level """

        df_score = df_score.copy()

        df_score['starter_ind'] = np.where(df_score['Pos'] == 'Bench', 0, 1)

        # Including the starter indiciator in order to include starter and bench points
        groupby_vars = [
            'season_id', 'league_id', 'Week', 'Team', 'starter_ind'
        ]
        sum_vars = ['Proj', 'Actual']
        df_score = df_score.groupby(groupby_vars,
                                    as_index=False)[sum_vars].sum()

        id_vars = ['season_id', 'league_id', 'Week', 'Team']
        df_score = df_score.pivot_table(index=id_vars,
                                        columns='starter_ind',
                                        aggfunc=sum,
                                        fill_value=0)

        # Pivot table causes multi-dimensional column names and id_vars become the index
        df_score.columns = [
            '{}_{}'.format(x[0], 'Starter') if x[1] == 1 else '{}_{}'.format(
                x[0], 'Bench') for x in df_score.columns
        ]
        df_score = df_score.reset_index().rename_axis(None, axis=1)

        final_df = df_score

        return final_df
Пример #16
0
    def test_drop_multiindex_not_lexsorted(self):
        # GH 11640

        # define the lexsorted version
        lexsorted_mi = MultiIndex.from_tuples([('a', ''), ('b1', 'c1'),
                                               ('b2', 'c2')],
                                              names=['b', 'c'])
        lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
        self.assertTrue(lexsorted_df.columns.is_lexsorted())

        # define the non-lexsorted version
        not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'],
                                     data=[[1, 'b1', 'c1', 3],
                                           [1, 'b2', 'c2', 4]])
        not_lexsorted_df = not_lexsorted_df.pivot_table(index='a',
                                                        columns=['b', 'c'],
                                                        values='d')
        not_lexsorted_df = not_lexsorted_df.reset_index()
        self.assertFalse(not_lexsorted_df.columns.is_lexsorted())

        # compare the results
        tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

        expected = lexsorted_df.drop('a', axis=1)
        with tm.assert_produces_warning(PerformanceWarning):
            result = not_lexsorted_df.drop('a', axis=1)

        tm.assert_frame_equal(result, expected)
Пример #17
0
def _plot_and_save(
    df: pd.DataFrame,
    index: str,
    column: str,
    index_label: str,
    column_label: str,
    values: str = 'ret',
    add_sep_colorbar: bool = True,
    norm: colors.Normalize = None,
    save_figure: bool = False,
    save_dir: str = None
):
    if index in df.columns and column in df.columns:
        # Pivot table (by default averages over identical index / columns cells)
        df_pivot = df.pivot_table(index=index, columns=column, values=values)

        # Generate the plot
        fig_hm, fig_cb = render_heatmap(df_pivot, add_sep_colorbar=add_sep_colorbar, norm=norm,
                                        y_label=index_label, x_label=column_label)

        # Save heat map and color bar if desired
        if save_figure:
            name = '-'.join([index, column])
            fig_hm.savefig(osp.join(save_dir, f'hm-{name}.pdf'))
            if fig_cb is not None:
                fig_cb.savefig(osp.join(save_dir, f'cb-{name}.pdf'))
Пример #18
0
def Process(n_states, filepath):
    rdr = csv.reader(open(filepath), delimiter=',')
    datacols = defaultdict(list)

    for ag, ev, ob, tr in rdr:
        datacols['agents'].append(int(ag))
        datacols['events'].append(int(ev))
        datacols['observations'].append(int(ob))
        datacols['truths'].append(int(tr))

    df = DataFrame(datacols)
    MV_result = []
    for i in range(0, len(np.unique(df['events']))):
        mcount = df[df['events'] == i][['agents', 'observations'
                                        ]].groupby('observations').count()
        MV_result.append(mcount['agents'].idxmax())
    df2 = df.pivot_table(index='agents',
                         columns='events',
                         values='observations')
    #df3 = df2.replace(0,n_states).fillna(0) #(0,6)
    #MV_result=[x if x!=0 else n_states for x in MV_result] # x if x!=0 else 6

    GroundTruth = df[['events', 'truths']].drop_duplicates().sort_values(
        ['events'])  #.replace(0,n_states)
    return df2, MV_result, GroundTruth
Пример #19
0
    def test_pivot_table_dropna(self):
        df = DataFrame(
            {
                "amount": {0: 60000, 1: 100000, 2: 50000, 3: 30000},
                "customer": {0: "A", 1: "A", 2: "B", 3: "C"},
                "month": {0: 201307, 1: 201309, 2: 201308, 3: 201310},
                "product": {0: "a", 1: "b", 2: "c", 3: "d"},
                "quantity": {0: 2000000, 1: 500000, 2: 1000000, 3: 1000000},
            }
        )
        pv_col = df.pivot_table("quantity", "month", ["customer", "product"], dropna=False)
        pv_ind = df.pivot_table("quantity", ["customer", "product"], "month", dropna=False)

        m = MultiIndex.from_tuples(
            [
                ("A", "a"),
                ("A", "b"),
                ("A", "c"),
                ("A", "d"),
                ("B", "a"),
                ("B", "b"),
                ("B", "c"),
                ("B", "d"),
                ("C", "a"),
                ("C", "b"),
                ("C", "c"),
                ("C", "d"),
            ],
            names=["customer", "product"],
        )
        tm.assert_index_equal(pv_col.columns, m)
        tm.assert_index_equal(pv_ind.index, m)
Пример #20
0
def prepareBreakagebreakageSummary(breakageData, stlSalesSamePeriod, kcSalesSamePeriod, reportYear, lastYear):
    '''
    Takes in clean data and gets it ready for consumption
    '''
    aggFuncs = { 'Breakage|Dollars' : np.sum,
                'Breakage|Cases' : np.sum }
    groupCols = ['Warehouse','ReasonCode','Year']
    breakageSummary = DataFrame(breakageData.groupby(groupCols).agg(aggFuncs).reset_index(drop=False))
    breakageSummary = pd.DataFrame(breakageSummary.pivot_table(values=['Breakage|Cases','Breakage|Dollars'], index=['Warehouse','ReasonCode'], columns=['Year']))
    breakageSummary.columns = ['%s%s' % (a, '|%s' % b if b else '') for a, b in breakageSummary.columns]  
    breakageSummary.sort_index(inplace=True, ascending=False)
    
    breakageSummary['Breakage|% Sales'] = breakageSummary.index.get_level_values(0)
    breakageSummary['Breakage|% Sales'] = breakageSummary['Breakage|% Sales'].map({'Kansas City':kcSalesSamePeriod, 'Saint Louis':stlSalesSamePeriod})
    breakageSummary['Breakage|% Sales'] = np.divide(breakageSummary['Breakage|Dollars|2016'], breakageSummary['Breakage|% Sales'])
    
    def yoy_delta(now, then): return np.divide(np.subtract(now,then), then)
    
    breakageSummary['Breakage|Dollars|% Change'] = round(yoy_delta(breakageSummary['Breakage|Dollars|'+str(reportYear)], breakageSummary['Breakage|Dollars|'+str(lastYear)]),4)
    breakageSummary['Breakage|Cases|% Change'] = round(yoy_delta(breakageSummary['Breakage|Cases|'+str(reportYear)], breakageSummary['Breakage|Cases|'+str(lastYear)]),4)
    breakageSummary = breakageSummary.reindex(columns=['Breakage|Dollars|'+str(lastYear), 'Breakage|Dollars|'+str(reportYear), 'Breakage|Dollars|% Change', 'Breakage|% Sales',
                                        'Breakage|Cases|'+str(lastYear), 'Breakage|Cases|'+str(reportYear), 'Breakage|Cases|% Change'])
    breakageSummary = breakageSummary.reindex(index=['Warehouse Breakage','Cross-Dock Breakage','Driver Breakage','Supplier Breakage','Sales Breakage & Unsaleables'], level='ReasonCode')

    return breakageSummary
Пример #21
0
    def test_drop_multiindex_not_lexsorted(self):
        # GH 11640

        # define the lexsorted version
        lexsorted_mi = MultiIndex.from_tuples(
            [('a', ''), ('b1', 'c1'), ('b2', 'c2')], names=['b', 'c'])
        lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
        self.assertTrue(lexsorted_df.columns.is_lexsorted())

        # define the non-lexsorted version
        not_lexsorted_df = DataFrame(columns=['a', 'b', 'c', 'd'],
                                     data=[[1, 'b1', 'c1', 3],
                                           [1, 'b2', 'c2', 4]])
        not_lexsorted_df = not_lexsorted_df.pivot_table(
            index='a', columns=['b', 'c'], values='d')
        not_lexsorted_df = not_lexsorted_df.reset_index()
        self.assertFalse(not_lexsorted_df.columns.is_lexsorted())

        # compare the results
        tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

        expected = lexsorted_df.drop('a', axis=1)
        with tm.assert_produces_warning(PerformanceWarning):
            result = not_lexsorted_df.drop('a', axis=1)

        tm.assert_frame_equal(result, expected)
Пример #22
0
def check_count(df: pd.DataFrame):
    df_group_by = df.pivot_table(values='SITE',
                                 index='Time',
                                 aggfunc=pd.Series.nunique)
    df_count_check = df_group_by[df_group_by['SITE'] <= 10900]

    return df_count_check
Пример #23
0
    def test_pivot_table_dropna(self):
        df = DataFrame({
            'amount': {
                0: 60000,
                1: 100000,
                2: 50000,
                3: 30000
            },
            'customer': {
                0: 'A',
                1: 'A',
                2: 'B',
                3: 'C'
            },
            'month': {
                0: 201307,
                1: 201309,
                2: 201308,
                3: 201310
            },
            'product': {
                0: 'a',
                1: 'b',
                2: 'c',
                3: 'd'
            },
            'quantity': {
                0: 2000000,
                1: 500000,
                2: 1000000,
                3: 1000000
            }
        })
        pv_col = df.pivot_table('quantity',
                                'month', ['customer', 'product'],
                                dropna=False)
        pv_ind = df.pivot_table('quantity', ['customer', 'product'],
                                'month',
                                dropna=False)

        m = MultiIndex.from_tuples([('A', 'a'), ('A', 'b'), ('A', 'c'),
                                    ('A', 'd'), ('B', 'a'), ('B', 'b'),
                                    ('B', 'c'), ('B', 'd'), ('C', 'a'),
                                    ('C', 'b'), ('C', 'c'), ('C', 'd')])

        assert_equal(pv_col.columns.values, m.values)
        assert_equal(pv_ind.index.values, m.values)
def plot_hyper_comparison(all_dfs: pd.DataFrame, hyper_param: str, ys: List[str], x: str='iteration', *args, **kwargs):
    compare_df = all_dfs.pivot_table(index=x, columns=[hyper_param], values=ys, aggfunc='first').reset_index()

    compare_df.columns = compare_df.columns.to_flat_index()

    hyper_values = all_dfs[hyper_param].unique()
    return plot_fields(compare_df, x=(x, ''),
                       ys=[(field, hyper_value) for hyper_value in hyper_values for field in ys], *args, **kwargs)
Пример #25
0
    def test_to_html_regression_GH6098(self):
        df = DataFrame({
            u('clé1'): [u('a'), u('a'), u('b'),
                        u('b'), u('a')],
            u('clé2'): [u('1er'),
                        u('2ème'),
                        u('1er'),
                        u('2ème'),
                        u('1er')],
            'données1':
            np.random.randn(5),
            'données2':
            np.random.randn(5)
        })

        # it works
        df.pivot_table(index=[u('clé1')], columns=[u('clé2')])._repr_html_()
Пример #26
0
class PivotTable(object):

    def setup(self):
        N = 100000
        fac1 = np.array(['A', 'B', 'C'], dtype='O')
        fac2 = np.array(['one', 'two'], dtype='O')
        ind1 = np.random.randint(0, 3, size=N)
        ind2 = np.random.randint(0, 2, size=N)
        self.df = DataFrame({'key1': fac1.take(ind1),
                             'key2': fac2.take(ind2),
                             'key3': fac2.take(ind2),
                             'value1': np.random.randn(N),
                             'value2': np.random.randn(N),
                             'value3': np.random.randn(N)})

    def time_pivot_table(self):
        self.df.pivot_table(index='key1', columns=['key2', 'key3'])
Пример #27
0
def get_runs_counts_by_match():
    new_data = DataFrame(ipl_df[['match_code',
                                 'runs']].groupby(['match_code',
                                                   'runs'])['runs'].count())
    new_data.columns = ['Total_runs']
    return new_data.pivot_table(index='match_code',
                                columns='runs',
                                values='Total_runs')
Пример #28
0
def create_data_table(df: pd.DataFrame) -> pd.DataFrame:
    """Creates a data table containing all of the input dataframe's data in a format
    appropriate for sharing with others

    Return the data in the input dataframe, dropping some columns only used internally
    and pivoting to a wide format w.r.t. case types

    :param df: Dataframe containing all locations' data in long format
    :type df: pd.DataFrame
    :return: Dataframe containing all locations' data in wide format, with some
    auxiliary columns dropped
    :rtype: pd.DataFrame
    """

    df = df.copy()

    # Normalize times by labeling all of today's data with its future label, 00:00
    # tomorrow (as that's the timestamp marking the end of the 24-hour data collection
    # period). No need to adjust data not from today; it's already been adjusted and is
    # labeled with the date whose 00:00 marked the end of data collection (i.e., data
    # generated on Mar 20 is labeled Mar 21).
    normalized_dates = df[Columns.DATE].dt.normalize()
    is_at_midnight = df[Columns.DATE] == normalized_dates
    df.loc[~is_at_midnight,
           Columns.DATE] = normalized_dates[~is_at_midnight] + pd.Timedelta(
               days=1)
    df[Columns.DATE] = df[Columns.DATE].dt.strftime(r"%Y-%m-%d")

    df = df.drop(columns=[
        Columns.IS_STATE,
        Columns.LOCATION_NAME,
        Columns.OUTBREAK_START_DATE_COL,
        Columns.DAYS_SINCE_OUTBREAK,
        Columns.POPULATION,
        Columns.STAGE,
        Columns.COUNT_TYPE,
    ])

    df = (df.pivot_table(
        index=[
            c for c in df.columns
            if c not in [Columns.CASE_TYPE, Columns.CASE_COUNT]
        ],
        columns=Columns.CASE_TYPE,
        values=Columns.CASE_COUNT,
        aggfunc="first",
    ).reset_index().sort_values([Columns.COUNTRY, Columns.STATE,
                                 Columns.DATE]))

    for col in CaseInfo.get_info_items_for(InfoField.CASE_TYPE,
                                           count=Counting.TOTAL_CASES):
        df[col] = pd.to_numeric(df[col], downcast="integer")

    # save_path = Paths.DATA / "data_table.csv"
    # df.to_csv(save_path, index=False)
    # print(f"Saved data to {save_path.relative_to(Paths.ROOT)}")

    return df
Пример #29
0
def pivot_categorical(data: pd.DataFrame) -> pd.DataFrame:
    """Pivots data that is long on categories to be wide."""
    key_cols = ['sex', 'age_start', 'age_end', 'year_start', 'year_end']
    key_cols = [k for k in key_cols if k in data.columns]
    data = data.pivot_table(index=key_cols,
                            columns='parameter',
                            values='value').reset_index()
    data.columns.name = None
    return data
Пример #30
0
def arruma_votos():
    votos = read_csv("voto_secao_partido.csv")
    partidos = [13,45,15,55]
    votos = votos[votos.partido.isin(partidos)]
    votos["id"] = votos.apply(lambda t:str(t["num_zona"])+"--"+str(t["num_secao"]),axis=1)
    votos = DataFrame(votos.pivot_table(index=votos["id"], columns="partido",values="sum(votos)",aggfunc=np.sum))
    votos.columns = ["PT","PMDB","PSDB","PSD"]
    votos.to_csv("voto_secao_partido_trabalhada.csv")
    return votos
Пример #31
0
 def execute(self, df: DataFrame, domain_retriever=None, execute_pipeline=None) -> DataFrame:
     pivoted_df = df.pivot_table(
         values=self.value_column,
         index=self.index,
         columns=self.column_to_pivot,
         aggfunc='mean' if self.agg_function == 'avg' else self.agg_function,
     ).reset_index()
     pivoted_df.columns.name = None
     return pivoted_df
Пример #32
0
def curr_test_ts_states(daily: pd.DataFrame) -> pd.DataFrame:
    """
    This function is to read the current testing from https://covidtracking.com/api/
    and return the pandas dataframe
    """
    states_total_by_dates = daily.pivot_table(index="date",
                                              columns="state",
                                              values="totalTestResults")
    return states_total_by_dates
Пример #33
0
class PivotTable(object):
    def setup(self):
        N = 100000
        fac1 = np.array(['A', 'B', 'C'], dtype='O')
        fac2 = np.array(['one', 'two'], dtype='O')
        ind1 = np.random.randint(0, 3, size=N)
        ind2 = np.random.randint(0, 2, size=N)
        self.df = DataFrame({
            'key1': fac1.take(ind1),
            'key2': fac2.take(ind2),
            'key3': fac2.take(ind2),
            'value1': np.random.randn(N),
            'value2': np.random.randn(N),
            'value3': np.random.randn(N)
        })

    def time_pivot_table(self):
        self.df.pivot_table(index='key1', columns=['key2', 'key3'])
Пример #34
0
def get_pivoted_data(data, population_data):
    columns = ["value", "date", "areaCode", "areaType", "areaName", "category"]

    dt_final = DataFrame(columns=columns)

    # Because of the hierarchical nature of the original data, there is
    # no easy way to automate this process using a generic solution
    # without prolonging the execution time. The iterative method appears
    # to produce the optimal time.
    for area_type in data:
        if area_type not in CATEGORY_LABELS:
            continue

        logging.info(f"\t\tArea type: {area_type}")
        dt_label = extract_category_data(data, columns, area_type, population_data)
        dt_final = dt_final.append(dt_label)

    # Reset index to appear incrementally.
    dt_final.reset_index(inplace=True)
    dt_final = dt_final.loc[:, columns]
    logging.info(">> Data was processed and converted into a categorical table")

    # Convert date strings to timestamp objects (needed for sorting).
    dt_final[DATE_COLUMN] = to_datetime(dt_final[DATE_COLUMN])
    logging.info(">> Dates were converted to datetime object")

    dt_pivot = dt_final.pivot_table(
        values='value',
        index=["areaType", "date", "areaName", "areaCode"],
        columns=['category'],
        aggfunc=lambda x: x.max()
    )

    logging.info(">> Pivot table created")

    dt_pivot.sort_values(
        ["date", "areaName"],
        ascending=[False, True],
        inplace=True
    )
    dt_pivot.reset_index(inplace=True)

    logging.info(">> Data table was sorted by date and areaName")

    # Change column names.
    dt_pivot.columns = [
        "areaType",
        "date",
        "areaName",
        "areaCode",
        *dt_pivot.columns[4:]
    ]

    logging.info(">> New column names were set")

    return dt_pivot
Пример #35
0
def make_pivot_table(data: pd.DataFrame) -> pd.DataFrame:
    values = get_names(STATISTICS)

    report = data.pivot_table(index=get_names(PARAMETERS),
                              values=values,
                              margins=True,
                              margins_name='Avg',
                              aggfunc=np.mean)

    return report.reindex(values, axis=1)
def plot_innings_runs_histogram():
    ipl_df = pd.read_csv('data/ipl_dataset.csv', index_col=None)
    df = DataFrame(ipl_df[['batting_team', 'inning',
                           'runs']].groupby(['batting_team',
                                             'inning'])['runs'].sum())
    piv_tab = df.pivot_table(index='batting_team',
                             columns='inning',
                             values='runs')
    piv_tab.plot(kind='bar', stacked=True)
    plt.show()
Пример #37
0
    def pivot_categories(df_extract_categories: pd.DataFrame) -> pd.DataFrame:
        """

        :param df_extract_categories: DataFrame with categories from function extract_categories()
        :return: DataFrame where if business_id has the categorie the value will be 1
        """
        return df_extract_categories.pivot_table(index='business_id',
                                                 columns='category',
                                                 aggfunc='size',
                                                 fill_value=0)
def plot_innings_runs_histogram():
    df = DataFrame(ipl_df[['inning', 'runs',
                           'batting_team']].groupby(['batting_team',
                                                     'inning'])['runs'].sum())
    table = df.pivot_table(index='batting_team',
                           columns='inning',
                           values='runs')
    table1 = table.sort_index()
    table1.plot(kind='bar', stacked=True)
    plt.show()
 def _get_lof_features(self, ratings: pd.DataFrame):
     rating_mx = ratings.pivot_table(index=self.USER_ID_COL,
                                     columns=self.ITEM_ID_COL,
                                     values=self.RATING_COL).fillna(0)
     clf = LocalOutlierFactor(n_neighbors=30, metric='euclidean')
     clf.fit(rating_mx)
     user_lof = pd.DataFrame(clf.negative_outlier_factor_)
     user_lof.columns = ['LOF']
     user_lof[self.USER_ID_COL] = rating_mx.index
     return user_lof
Пример #40
0
    def test_crosstab_pass_values(self):
        a = np.random.randint(0, 7, size=100)
        b = np.random.randint(0, 3, size=100)
        c = np.random.randint(0, 5, size=100)
        values = np.random.randn(100)

        table = crosstab([a, b], c, values, aggfunc=np.sum, rownames=["foo", "bar"], colnames=["baz"])

        df = DataFrame({"foo": a, "bar": b, "baz": c, "values": values})

        expected = df.pivot_table("values", index=["foo", "bar"], columns="baz", aggfunc=np.sum)
        tm.assert_frame_equal(table, expected)
Пример #41
0
    def test_pivot_integer_columns(self):
        # caused by upstream bug in unstack

        d = datetime.date.min
        data = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'],
                            [d + datetime.timedelta(i) for i in range(20)], [1.0]))
        df = DataFrame(data)
        table = df.pivot_table(values=4, index=[0, 1, 3], columns=[2])

        df2 = df.rename(columns=str)
        table2 = df2.pivot_table(values='4', index=['0', '1', '3'], columns=['2'])

        tm.assert_frame_equal(table, table2, check_names=False)
Пример #42
0
    def test_pivot_no_level_overlap(self):
        # GH #1181

        data = DataFrame({'a': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'] * 2,
                          'b': [0, 0, 0, 0, 1, 1, 1, 1] * 2,
                          'c': (['foo'] * 4 + ['bar'] * 4) * 2,
                          'value': np.random.randn(16)})

        table = data.pivot_table('value', index='a', columns=['b', 'c'])

        grouped = data.groupby(['a', 'b', 'c'])['value'].mean()
        expected = grouped.unstack('b').unstack('c').dropna(axis=1, how='all')
        tm.assert_frame_equal(table, expected)
Пример #43
0
    def test_pivot_integer_columns(self):
        # caused by upstream bug in unstack

        d = date.min
        data = list(
            product(["foo", "bar"], ["A", "B", "C"], ["x1", "x2"], [d + timedelta(i) for i in range(20)], [1.0])
        )
        df = DataFrame(data)
        table = df.pivot_table(values=4, index=[0, 1, 3], columns=[2])

        df2 = df.rename(columns=str)
        table2 = df2.pivot_table(values="4", index=["0", "1", "3"], columns=["2"])

        tm.assert_frame_equal(table, table2, check_names=False)
Пример #44
0
    def test_crosstab_pass_values(self):
        a = np.random.randint(0, 7, size=100)
        b = np.random.randint(0, 3, size=100)
        c = np.random.randint(0, 5, size=100)
        values = np.random.randn(100)

        table = crosstab([a, b], c, values, aggfunc=np.sum,
                         rownames=['foo', 'bar'], colnames=['baz'])

        df = DataFrame({'foo': a, 'bar': b, 'baz': c, 'values': values})

        expected = df.pivot_table('values', index=['foo', 'bar'], columns='baz',
                                  aggfunc=np.sum)
        tm.assert_frame_equal(table, expected)
Пример #45
0
class PivotTable:

    def setup(self):
        N = 100000
        fac1 = np.array(['A', 'B', 'C'], dtype='O')
        fac2 = np.array(['one', 'two'], dtype='O')
        ind1 = np.random.randint(0, 3, size=N)
        ind2 = np.random.randint(0, 2, size=N)
        self.df = DataFrame({'key1': fac1.take(ind1),
                             'key2': fac2.take(ind2),
                             'key3': fac2.take(ind2),
                             'value1': np.random.randn(N),
                             'value2': np.random.randn(N),
                             'value3': np.random.randn(N)})
        self.df2 = DataFrame({'col1': list('abcde'), 'col2': list('fghij'),
                              'col3': [1, 2, 3, 4, 5]})
        self.df2.col1 = self.df2.col1.astype('category')
        self.df2.col2 = self.df2.col2.astype('category')

    def time_pivot_table(self):
        self.df.pivot_table(index='key1', columns=['key2', 'key3'])

    def time_pivot_table_agg(self):
        self.df.pivot_table(index='key1', columns=['key2', 'key3'],
                            aggfunc=['sum', 'mean'])

    def time_pivot_table_margins(self):
        self.df.pivot_table(index='key1', columns=['key2', 'key3'],
                            margins=True)

    def time_pivot_table_categorical(self):
        self.df2.pivot_table(index='col1', values='col3', columns='col2',
                             aggfunc=np.sum, fill_value=0)

    def time_pivot_table_categorical_observed(self):
        self.df2.pivot_table(index='col1', values='col3', columns='col2',
                             aggfunc=np.sum, fill_value=0, observed=True)
Пример #46
0
    def test_pivot_columns_lexsorted(self):
        import datetime
        import numpy as np
        import pandas

        n = 10000

        dtype = np.dtype(
            [
                ("Index", object),
                ("Symbol", object),
                ("Year", int),
                ("Month", int),
                ("Day", int),
                ("Quantity", int),
                ("Price", float),
            ]
        )

        products = np.array(
            [
                ("SP500", "ADBE"),
                ("SP500", "NVDA"),
                ("SP500", "ORCL"),
                ("NDQ100", "AAPL"),
                ("NDQ100", "MSFT"),
                ("NDQ100", "GOOG"),
                ("FTSE", "DGE.L"),
                ("FTSE", "TSCO.L"),
                ("FTSE", "GSK.L"),
            ],
            dtype=[("Index", object), ("Symbol", object)],
        )
        items = np.empty(n, dtype=dtype)
        iproduct = np.random.randint(0, len(products), n)
        items["Index"] = products["Index"][iproduct]
        items["Symbol"] = products["Symbol"][iproduct]
        dr = pandas.date_range(datetime.date(2000, 1, 1), datetime.date(2010, 12, 31))
        dates = dr[np.random.randint(0, len(dr), n)]
        items["Year"] = dates.year
        items["Month"] = dates.month
        items["Day"] = dates.day
        items["Price"] = np.random.lognormal(4.0, 2.0, n)

        df = DataFrame(items)

        pivoted = df.pivot_table("Price", rows=["Month", "Day"], cols=["Index", "Symbol", "Year"], aggfunc="mean")

        self.assert_(pivoted.columns.is_monotonic)
Пример #47
0
    def test_pivot_columns_lexsorted(self):
        import datetime
        import numpy as np
        import pandas

        n = 10000

        dtype = np.dtype([
            ("Index", object),
            ("Symbol", object),
            ("Year", int),
            ("Month", int),
            ("Day", int),
            ("Quantity", int),
            ("Price", float),
        ])

        products = np.array([
            ('SP500', 'ADBE'),
            ('SP500', 'NVDA'),
            ('SP500', 'ORCL'),
            ('NDQ100', 'AAPL'),
            ('NDQ100', 'MSFT'),
            ('NDQ100', 'GOOG'),
            ('FTSE', 'DGE.L'),
            ('FTSE', 'TSCO.L'),
            ('FTSE', 'GSK.L'),
        ], dtype=[('Index', object), ('Symbol', object)])
        items = np.empty(n, dtype=dtype)
        iproduct = np.random.randint(0, len(products), n)
        items['Index'] = products['Index'][iproduct]
        items['Symbol'] = products['Symbol'][iproduct]
        dr = pandas.date_range(datetime.date(2000, 1, 1),
                               datetime.date(2010, 12, 31))
        dates = dr[np.random.randint(0, len(dr), n)]
        items['Year'] = dates.year
        items['Month'] = dates.month
        items['Day'] = dates.day
        items['Price'] = np.random.lognormal(4.0, 2.0, n)

        df = DataFrame(items)

        pivoted = df.pivot_table('Price', rows=['Month', 'Day'],
                                 cols=['Index', 'Symbol', 'Year'],
                                 aggfunc='mean')

        self.assert_(pivoted.columns.is_monotonic)
Пример #48
0
    def test_pivot_no_level_overlap(self):
        # GH #1181

        data = DataFrame(
            {
                "a": ["a", "a", "a", "a", "b", "b", "b", "b"] * 2,
                "b": [0, 0, 0, 0, 1, 1, 1, 1] * 2,
                "c": (["foo"] * 4 + ["bar"] * 4) * 2,
                "value": np.random.randn(16),
            }
        )

        table = data.pivot_table("value", index="a", columns=["b", "c"])

        grouped = data.groupby(["a", "b", "c"])["value"].mean()
        expected = grouped.unstack("b").unstack("c").dropna(axis=1, how="all")
        tm.assert_frame_equal(table, expected)
    def test_drop_multiindex_not_lexsorted(self):
        # GH 11640

        # define the lexsorted version
        lexsorted_mi = MultiIndex.from_tuples([("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"])
        lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
        self.assertTrue(lexsorted_df.columns.is_lexsorted())

        # define the non-lexsorted version
        not_lexsorted_df = DataFrame(columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]])
        not_lexsorted_df = not_lexsorted_df.pivot_table(index="a", columns=["b", "c"], values="d")
        not_lexsorted_df = not_lexsorted_df.reset_index()
        self.assertFalse(not_lexsorted_df.columns.is_lexsorted())

        # compare the results
        tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

        expected = lexsorted_df.drop("a", axis=1)
        with tm.assert_produces_warning(PerformanceWarning):
            result = not_lexsorted_df.drop("a", axis=1)

        tm.assert_frame_equal(result, expected)
Пример #50
0
 4.5: [9, 20, 31, 42, 53, 64, 75],
 5.0: [10, 21, 32, 43, 54, 65, 76]}

grouped_MaxResponse_f.groups.keys()
->
dict_keys([0.5, 1.0, 2.0, 3.0, 4.0, 5.0, 3.5, 1.5, 4.5, 2.5])

grouped_MaxResponse_f.groups.values()
->

#列の値をもとに演算を施し、その演算結果を列として追加する->applyを用いる
AAFremoved["p_peak_appeared_repaired"] = AAFremoved["p_remark"].apply(lambda data: isinstance(data, str)) | AAFremoved["p_peak_appeared"]
AAFremoved["Max Response (%)"] = AAFremoved["p_max"].apply(lambda F: F if F>0 else 0)

#ピボットテーブル
data.pivot_table()

#クロス集計
data.crosstab()

#縦に連結
pd.concat([A,B])
A.append(B)

#横に連結
pd.concat([A, B], axis=1)
pd.merge(left, right, left_index=True, right_index=True)
A.join(B)

#結合
pd.merge(left, right, on='key', how="outer")
Пример #51
0
class TestPivotTable(tm.TestCase):

    _multiprocess_can_split_ = True

    def setUp(self):
        self.data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo',
                                     'bar', 'bar', 'bar', 'bar',
                                     'foo', 'foo', 'foo'],
                               'B': ['one', 'one', 'one', 'two',
                                     'one', 'one', 'one', 'two',
                                     'two', 'two', 'one'],
                               'C': ['dull', 'dull', 'shiny', 'dull',
                                     'dull', 'shiny', 'shiny', 'dull',
                                     'shiny', 'shiny', 'shiny'],
                               'D': np.random.randn(11),
                               'E': np.random.randn(11),
                               'F': np.random.randn(11)})

    def test_pivot_table(self):
        index = ['A', 'B']
        columns = 'C'
        table = pivot_table(self.data, values='D', index=index, columns=columns)

        table2 = self.data.pivot_table(values='D', index=index, columns=columns)
        tm.assert_frame_equal(table, table2)

        # this works
        pivot_table(self.data, values='D', index=index)

        if len(index) > 1:
            self.assertEqual(table.index.names, tuple(index))
        else:
            self.assertEqual(table.index.name, index[0])

        if len(columns) > 1:
            self.assertEqual(table.columns.names, columns)
        else:
            self.assertEqual(table.columns.name, columns[0])

        expected = self.data.groupby(index + [columns])['D'].agg(np.mean).unstack()
        tm.assert_frame_equal(table, expected)

    def test_pivot_table_warnings(self):
        index = ['A', 'B']
        columns = 'C'
        with tm.assert_produces_warning(FutureWarning):
            table = pivot_table(self.data, values='D', rows=index,
                                cols=columns)

        with tm.assert_produces_warning(False):
            table2 = pivot_table(self.data, values='D', index=index,
                                 columns=columns)

        tm.assert_frame_equal(table, table2)

    def test_pivot_table_nocols(self):
        df = DataFrame({'rows': ['a', 'b', 'c'],
                        'cols': ['x', 'y', 'z'],
                        'values': [1,2,3]})
        rs = df.pivot_table(columns='cols', aggfunc=np.sum)
        xp = df.pivot_table(index='cols', aggfunc=np.sum).T
        tm.assert_frame_equal(rs, xp)

        rs = df.pivot_table(columns='cols', aggfunc={'values': 'mean'})
        xp = df.pivot_table(index='cols', aggfunc={'values': 'mean'}).T
        tm.assert_frame_equal(rs, xp)

    def test_pivot_table_dropna(self):
        df = DataFrame({'amount': {0: 60000, 1: 100000, 2: 50000, 3: 30000},
                        'customer': {0: 'A', 1: 'A', 2: 'B', 3: 'C'},
                        'month': {0: 201307, 1: 201309, 2: 201308, 3: 201310},
                        'product': {0: 'a', 1: 'b', 2: 'c', 3: 'd'},
                        'quantity': {0: 2000000, 1: 500000, 2: 1000000, 3: 1000000}})
        pv_col = df.pivot_table('quantity', 'month', ['customer', 'product'], dropna=False)
        pv_ind = df.pivot_table('quantity', ['customer', 'product'], 'month', dropna=False)

        m = MultiIndex.from_tuples([(u('A'), u('a')),
                                    (u('A'), u('b')),
                                    (u('A'), u('c')),
                                    (u('A'), u('d')),
                                    (u('B'), u('a')),
                                    (u('B'), u('b')),
                                    (u('B'), u('c')),
                                    (u('B'), u('d')),
                                    (u('C'), u('a')),
                                    (u('C'), u('b')),
                                    (u('C'), u('c')),
                                    (u('C'), u('d'))])

        assert_equal(pv_col.columns.values, m.values)
        assert_equal(pv_ind.index.values, m.values)


    def test_pass_array(self):
        result = self.data.pivot_table('D', index=self.data.A, columns=self.data.C)
        expected = self.data.pivot_table('D', index='A', columns='C')
        tm.assert_frame_equal(result, expected)

    def test_pass_function(self):
        result = self.data.pivot_table('D', index=lambda x: x // 5,
                                       columns=self.data.C)
        expected = self.data.pivot_table('D', index=self.data.index // 5,
                                         columns='C')
        tm.assert_frame_equal(result, expected)

    def test_pivot_table_multiple(self):
        index = ['A', 'B']
        columns = 'C'
        table = pivot_table(self.data, index=index, columns=columns)
        expected = self.data.groupby(index + [columns]).agg(np.mean).unstack()
        tm.assert_frame_equal(table, expected)

    def test_pivot_dtypes(self):

        # can convert dtypes
        f = DataFrame({'a' : ['cat', 'bat', 'cat', 'bat'], 'v' : [1,2,3,4], 'i' : ['a','b','a','b']})
        self.assertEqual(f.dtypes['v'], 'int64')

        z = pivot_table(f, values='v', index=['a'], columns=['i'], fill_value=0, aggfunc=np.sum)
        result = z.get_dtype_counts()
        expected = Series(dict(int64 = 2))
        tm.assert_series_equal(result, expected)

        # cannot convert dtypes
        f = DataFrame({'a' : ['cat', 'bat', 'cat', 'bat'], 'v' : [1.5,2.5,3.5,4.5], 'i' : ['a','b','a','b']})
        self.assertEqual(f.dtypes['v'], 'float64')

        z = pivot_table(f, values='v', index=['a'], columns=['i'], fill_value=0, aggfunc=np.mean)
        result = z.get_dtype_counts()
        expected = Series(dict(float64 = 2))
        tm.assert_series_equal(result, expected)

    def test_pivot_multi_values(self):
        result = pivot_table(self.data, values=['D', 'E'],
                             index='A', columns=['B', 'C'], fill_value=0)
        expected = pivot_table(self.data.drop(['F'], axis=1),
                               index='A', columns=['B', 'C'], fill_value=0)
        tm.assert_frame_equal(result, expected)

    def test_pivot_multi_functions(self):
        f = lambda func: pivot_table(self.data, values=['D', 'E'],
                                     index=['A', 'B'], columns='C',
                                     aggfunc=func)
        result = f([np.mean, np.std])
        means = f(np.mean)
        stds = f(np.std)
        expected = concat([means, stds], keys=['mean', 'std'], axis=1)
        tm.assert_frame_equal(result, expected)

        # margins not supported??
        f = lambda func: pivot_table(self.data, values=['D', 'E'],
                                     index=['A', 'B'], columns='C',
                                     aggfunc=func, margins=True)
        result = f([np.mean, np.std])
        means = f(np.mean)
        stds = f(np.std)
        expected = concat([means, stds], keys=['mean', 'std'], axis=1)
        tm.assert_frame_equal(result, expected)

    def test_pivot_index_with_nan(self):
        # GH 3588
        nan = np.nan
        df = DataFrame({"a":['R1', 'R2', nan, 'R4'], 'b':["C1", "C2", "C3" , "C4"], "c":[10, 15, nan , 20]})
        result = df.pivot('a','b','c')
        expected = DataFrame([[nan,nan,nan,nan],[nan,10,nan,nan],
                              [nan,nan,nan,nan],[nan,nan,15,20]],
                             index = Index(['R1','R2',nan,'R4'],name='a'),
                             columns = Index(['C1','C2','C3','C4'],name='b'))
        tm.assert_frame_equal(result, expected)

    def test_pivot_with_tz(self):
        # GH 5878
        df = DataFrame({'dt1': [datetime.datetime(2013, 1, 1, 9, 0),
                                datetime.datetime(2013, 1, 2, 9, 0),
                                datetime.datetime(2013, 1, 1, 9, 0),
                                datetime.datetime(2013, 1, 2, 9, 0)],
                        'dt2': [datetime.datetime(2014, 1, 1, 9, 0),
                                datetime.datetime(2014, 1, 1, 9, 0),
                                datetime.datetime(2014, 1, 2, 9, 0),
                                datetime.datetime(2014, 1, 2, 9, 0)],
                        'data1': np.arange(4,dtype='int64'),
                        'data2': np.arange(4,dtype='int64')})

        df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific'))
        df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo'))

        exp_col1 = Index(['data1', 'data1', 'data2', 'data2'])
        exp_col2 = pd.DatetimeIndex(['2014/01/01 09:00', '2014/01/02 09:00'] * 2,
                                    name='dt2', tz='Asia/Tokyo')
        exp_col = pd.MultiIndex.from_arrays([exp_col1, exp_col2])
        expected = DataFrame([[0, 2, 0, 2], [1, 3, 1, 3]],
                             index=pd.DatetimeIndex(['2013/01/01 09:00', '2013/01/02 09:00'],
                                                    name='dt1', tz='US/Pacific'),
                             columns=exp_col)

        pv =  df.pivot(index='dt1', columns='dt2')
        tm.assert_frame_equal(pv, expected)

        expected = DataFrame([[0, 2], [1, 3]],
                     index=pd.DatetimeIndex(['2013/01/01 09:00', '2013/01/02 09:00'],
                                            name='dt1', tz='US/Pacific'),
                     columns=pd.DatetimeIndex(['2014/01/01 09:00', '2014/01/02 09:00'],
                                            name='dt2', tz='Asia/Tokyo'))

        pv =  df.pivot(index='dt1', columns='dt2', values='data1')
        tm.assert_frame_equal(pv, expected)

    def test_margins(self):
        def _check_output(res, col, index=['A', 'B'], columns=['C']):
            cmarg = res['All'][:-1]
            exp = self.data.groupby(index)[col].mean()
            tm.assert_series_equal(cmarg, exp)

            res = res.sortlevel()
            rmarg = res.xs(('All', ''))[:-1]
            exp = self.data.groupby(columns)[col].mean()
            tm.assert_series_equal(rmarg, exp)

            gmarg = res['All']['All', '']
            exp = self.data[col].mean()
            self.assertEqual(gmarg, exp)

        # column specified
        table = self.data.pivot_table('D', index=['A', 'B'], columns='C',
                                      margins=True, aggfunc=np.mean)
        _check_output(table, 'D')

        # no column specified
        table = self.data.pivot_table(index=['A', 'B'], columns='C',
                                      margins=True, aggfunc=np.mean)
        for valcol in table.columns.levels[0]:
            _check_output(table[valcol], valcol)

        # no col

        # to help with a buglet
        self.data.columns = [k * 2 for k in self.data.columns]
        table = self.data.pivot_table(index=['AA', 'BB'], margins=True,
                                      aggfunc=np.mean)
        for valcol in table.columns:
            gmarg = table[valcol]['All', '']
            self.assertEqual(gmarg, self.data[valcol].mean())

        # this is OK
        table = self.data.pivot_table(index=['AA', 'BB'], margins=True,
                                      aggfunc='mean')

        # no rows
        rtable = self.data.pivot_table(columns=['AA', 'BB'], margins=True,
                                       aggfunc=np.mean)
        tm.assert_isinstance(rtable, Series)
        for item in ['DD', 'EE', 'FF']:
            gmarg = table[item]['All', '']
            self.assertEqual(gmarg, self.data[item].mean())

    def test_pivot_integer_columns(self):
        # caused by upstream bug in unstack

        d = datetime.date.min
        data = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'],
                            [d + datetime.timedelta(i) for i in range(20)], [1.0]))
        df = DataFrame(data)
        table = df.pivot_table(values=4, index=[0, 1, 3], columns=[2])

        df2 = df.rename(columns=str)
        table2 = df2.pivot_table(values='4', index=['0', '1', '3'], columns=['2'])

        tm.assert_frame_equal(table, table2, check_names=False)

    def test_pivot_no_level_overlap(self):
        # GH #1181

        data = DataFrame({'a': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'] * 2,
                          'b': [0, 0, 0, 0, 1, 1, 1, 1] * 2,
                          'c': (['foo'] * 4 + ['bar'] * 4) * 2,
                          'value': np.random.randn(16)})

        table = data.pivot_table('value', index='a', columns=['b', 'c'])

        grouped = data.groupby(['a', 'b', 'c'])['value'].mean()
        expected = grouped.unstack('b').unstack('c').dropna(axis=1, how='all')
        tm.assert_frame_equal(table, expected)

    def test_pivot_columns_lexsorted(self):

        n = 10000

        dtype = np.dtype([
            ("Index", object),
            ("Symbol", object),
            ("Year", int),
            ("Month", int),
            ("Day", int),
            ("Quantity", int),
            ("Price", float),
        ])

        products = np.array([
            ('SP500', 'ADBE'),
            ('SP500', 'NVDA'),
            ('SP500', 'ORCL'),
            ('NDQ100', 'AAPL'),
            ('NDQ100', 'MSFT'),
            ('NDQ100', 'GOOG'),
            ('FTSE', 'DGE.L'),
            ('FTSE', 'TSCO.L'),
            ('FTSE', 'GSK.L'),
        ], dtype=[('Index', object), ('Symbol', object)])
        items = np.empty(n, dtype=dtype)
        iproduct = np.random.randint(0, len(products), n)
        items['Index'] = products['Index'][iproduct]
        items['Symbol'] = products['Symbol'][iproduct]
        dr = pd.date_range(datetime.date(2000, 1, 1), datetime.date(2010, 12, 31))
        dates = dr[np.random.randint(0, len(dr), n)]
        items['Year'] = dates.year
        items['Month'] = dates.month
        items['Day'] = dates.day
        items['Price'] = np.random.lognormal(4.0, 2.0, n)

        df = DataFrame(items)

        pivoted = df.pivot_table('Price', index=['Month', 'Day'],
                                 columns=['Index', 'Symbol', 'Year'],
                                 aggfunc='mean')

        self.assertTrue(pivoted.columns.is_monotonic)

    def test_pivot_complex_aggfunc(self):
        f = {'D': ['std'], 'E': ['sum']}
        expected = self.data.groupby(['A', 'B']).agg(f).unstack('B')
        result = self.data.pivot_table(index='A', columns='B', aggfunc=f)

        tm.assert_frame_equal(result, expected)

    def test_margins_no_values_no_cols(self):
        # Regression test on pivot table: no values or cols passed.
        result = self.data[['A', 'B']].pivot_table(index=['A', 'B'], aggfunc=len, margins=True)
        result_list = result.tolist()
        self.assertEqual(sum(result_list[:-1]), result_list[-1])

    def test_margins_no_values_two_rows(self):
        # Regression test on pivot table: no values passed but rows are a multi-index
        result = self.data[['A', 'B', 'C']].pivot_table(index=['A', 'B'], columns='C', aggfunc=len, margins=True)
        self.assertEqual(result.All.tolist(), [3.0, 1.0, 4.0, 3.0, 11.0])

    def test_margins_no_values_one_row_one_col(self):
        # Regression test on pivot table: no values passed but row and col defined
        result = self.data[['A', 'B']].pivot_table(index='A', columns='B', aggfunc=len, margins=True)
        self.assertEqual(result.All.tolist(), [4.0, 7.0, 11.0])

    def test_margins_no_values_two_row_two_cols(self):
        # Regression test on pivot table: no values passed but rows and cols are multi-indexed
        self.data['D'] = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']
        result = self.data[['A', 'B', 'C', 'D']].pivot_table(index=['A', 'B'], columns=['C', 'D'], aggfunc=len, margins=True)
        self.assertEqual(result.All.tolist(), [3.0, 1.0, 4.0, 3.0, 11.0])

    def test_pivot_timegrouper(self):
        df = DataFrame({
            'Branch' : 'A A A A A A A B'.split(),
            'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
            'Date' : [datetime.datetime(2013, 1, 1), datetime.datetime(2013, 1, 1),
                      datetime.datetime(2013, 10, 1), datetime.datetime(2013, 10, 2),
                      datetime.datetime(2013, 10, 1), datetime.datetime(2013, 10, 2),
                      datetime.datetime(2013, 12, 2), datetime.datetime(2013, 12, 2),]}).set_index('Date')

        expected = DataFrame(np.array([10, 18, 3],dtype='int64').reshape(1, 3),
                             index=[datetime.datetime(2013, 12, 31)],
                             columns='Carl Joe Mark'.split())
        expected.index.name = 'Date'
        expected.columns.name = 'Buyer'

        result = pivot_table(df, index=Grouper(freq='A'), columns='Buyer',
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result,expected)

        result = pivot_table(df, index='Buyer', columns=Grouper(freq='A'),
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result,expected.T)

        expected = DataFrame(np.array([1, np.nan, 3, 9, 18, np.nan]).reshape(2, 3),
                             index=[datetime.datetime(2013, 1, 1), datetime.datetime(2013, 7, 1)],
                             columns='Carl Joe Mark'.split())
        expected.index.name = 'Date'
        expected.columns.name = 'Buyer'

        result = pivot_table(df, index=Grouper(freq='6MS'), columns='Buyer',
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS'),
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        # passing the name
        df = df.reset_index()
        result = pivot_table(df, index=Grouper(freq='6MS', key='Date'), columns='Buyer',
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS', key='Date'),
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        self.assertRaises(KeyError, lambda : pivot_table(df, index=Grouper(freq='6MS', key='foo'),
                          columns='Buyer', values='Quantity', aggfunc=np.sum))
        self.assertRaises(KeyError, lambda : pivot_table(df, index='Buyer',
                          columns=Grouper(freq='6MS', key='foo'), values='Quantity', aggfunc=np.sum))

        # passing the level
        df = df.set_index('Date')
        result = pivot_table(df, index=Grouper(freq='6MS', level='Date'), columns='Buyer',
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df, index='Buyer', columns=Grouper(freq='6MS', level='Date'),
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        self.assertRaises(ValueError, lambda : pivot_table(df, index=Grouper(freq='6MS', level='foo'),
                          columns='Buyer', values='Quantity', aggfunc=np.sum))
        self.assertRaises(ValueError, lambda : pivot_table(df, index='Buyer',
                          columns=Grouper(freq='6MS', level='foo'), values='Quantity', aggfunc=np.sum))

        # double grouper
        df = DataFrame({
            'Branch' : 'A A A A A A A B'.split(),
            'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1,3,5,1,8,1,9,3],
            'Date' : [datetime.datetime(2013,11,1,13,0), datetime.datetime(2013,9,1,13,5),
                      datetime.datetime(2013,10,1,20,0), datetime.datetime(2013,10,2,10,0),
                      datetime.datetime(2013,11,1,20,0), datetime.datetime(2013,10,2,10,0),
                      datetime.datetime(2013,10,2,12,0), datetime.datetime(2013,12,5,14,0)],
            'PayDay' : [datetime.datetime(2013,10,4,0,0), datetime.datetime(2013,10,15,13,5),
                        datetime.datetime(2013,9,5,20,0), datetime.datetime(2013,11,2,10,0),
                        datetime.datetime(2013,10,7,20,0), datetime.datetime(2013,9,5,10,0),
                        datetime.datetime(2013,12,30,12,0), datetime.datetime(2013,11,20,14,0),]})

        result = pivot_table(df, index=Grouper(freq='M', key='Date'),
                             columns=Grouper(freq='M', key='PayDay'),
                             values='Quantity', aggfunc=np.sum)
        expected = DataFrame(np.array([np.nan, 3, np.nan, np.nan, 6, np.nan, 1, 9,
                                       np.nan, 9, np.nan, np.nan, np.nan, np.nan, 3, np.nan]).reshape(4, 4),
                             index=[datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31),
                                    datetime.datetime(2013, 11, 30), datetime.datetime(2013, 12, 31)],
                             columns=[datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31),
                                    datetime.datetime(2013, 11, 30), datetime.datetime(2013, 12, 31)])
        expected.index.name = 'Date'
        expected.columns.name = 'PayDay'

        tm.assert_frame_equal(result, expected)

        result = pivot_table(df, index=Grouper(freq='M', key='PayDay'),
                             columns=Grouper(freq='M', key='Date'),
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

        tuples = [(datetime.datetime(2013, 9, 30), datetime.datetime(2013, 10, 31)),
                  (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 9, 30)),
                  (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 11, 30)),
                  (datetime.datetime(2013, 10, 31), datetime.datetime(2013, 12, 31)),
                  (datetime.datetime(2013, 11, 30), datetime.datetime(2013, 10, 31)),
                  (datetime.datetime(2013, 12, 31), datetime.datetime(2013, 11, 30)),]
        idx = MultiIndex.from_tuples(tuples, names=['Date', 'PayDay'])
        expected = DataFrame(np.array([3, np.nan, 6, np.nan, 1, np.nan,
                                       9, np.nan, 9, np.nan, np.nan, 3]).reshape(6, 2),
                             index=idx, columns=['A', 'B'])
        expected.columns.name = 'Branch'

        result = pivot_table(df, index=[Grouper(freq='M', key='Date'),
                             Grouper(freq='M', key='PayDay')], columns=['Branch'],
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected)

        result = pivot_table(df, index=['Branch'], columns=[Grouper(freq='M', key='Date'),
                             Grouper(freq='M', key='PayDay')],
                             values='Quantity', aggfunc=np.sum)
        tm.assert_frame_equal(result, expected.T)

    def test_pivot_datetime_tz(self):
        dates1 = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00',
                  '2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00']
        dates2 = ['2013-01-01 15:00:00', '2013-01-01 15:00:00', '2013-01-01 15:00:00',
                  '2013-02-01 15:00:00', '2013-02-01 15:00:00', '2013-02-01 15:00:00']
        df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
                        'dt1': dates1, 'dt2': dates2,
                        'value1': range(6), 'value2': [1, 2] * 3})
        df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific'))
        df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo'))

        exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00',
                                    '2011-07-19 09:00:00'], tz='US/Pacific', name='dt1')
        exp_col1 = Index(['value1', 'value1'])
        exp_col2 = Index(['a', 'b'], name='label')
        exp_col = MultiIndex.from_arrays([exp_col1, exp_col2])
        expected = DataFrame([[0, 3], [1, 4], [2, 5]],
                             index=exp_idx, columns=exp_col)
        result = pivot_table(df, index=['dt1'], columns=['label'], values=['value1'])
        tm.assert_frame_equal(result, expected)


        exp_col1 = Index(['sum', 'sum', 'sum', 'sum', 'mean', 'mean', 'mean', 'mean'])
        exp_col2 = Index(['value1', 'value1', 'value2', 'value2'] * 2)
        exp_col3 = pd.DatetimeIndex(['2013-01-01 15:00:00', '2013-02-01 15:00:00'] * 4,
                                    tz='Asia/Tokyo', name='dt2')
        exp_col = MultiIndex.from_arrays([exp_col1, exp_col2, exp_col3])
        expected = DataFrame(np.array([[0, 3, 1, 2, 0, 3, 1, 2], [1, 4, 2, 1, 1, 4, 2, 1],
                              [2, 5, 1, 2, 2, 5, 1, 2]]), index=exp_idx, columns=exp_col)

        result = pivot_table(df, index=['dt1'], columns=['dt2'], values=['value1', 'value2'],
                             aggfunc=[np.sum, np.mean])
        tm.assert_frame_equal(result, expected)
Пример #52
0
    def test_margins(self):
        def _check_output(res, col, index=['A', 'B'], columns=['C']):
            cmarg = res['All'][:-1]
            exp = self.data.groupby(index)[col].mean()
            tm.assert_series_equal(cmarg, exp)

            res = res.sortlevel()
            rmarg = res.xs(('All', ''))[:-1]
            exp = self.data.groupby(columns)[col].mean()
            tm.assert_series_equal(rmarg, exp)

            gmarg = res['All']['All', '']
            exp = self.data[col].mean()
            self.assertEqual(gmarg, exp)

        # column specified
        table = self.data.pivot_table('D', index=['A', 'B'], columns='C',
                                      margins=True, aggfunc=np.mean)
        _check_output(table, 'D')

        # no column specified
        table = self.data.pivot_table(index=['A', 'B'], columns='C',
                                      margins=True, aggfunc=np.mean)
        for valcol in table.columns.levels[0]:
            _check_output(table[valcol], valcol)

        # no col

        # to help with a buglet
        self.data.columns = [k * 2 for k in self.data.columns]
        table = self.data.pivot_table(index=['AA', 'BB'], margins=True,
                                      aggfunc=np.mean)
        for valcol in table.columns:
            gmarg = table[valcol]['All', '']
            self.assertEqual(gmarg, self.data[valcol].mean())

        # this is OK
        table = self.data.pivot_table(index=['AA', 'BB'], margins=True,
                                      aggfunc='mean')

        # no rows
        rtable = self.data.pivot_table(columns=['AA', 'BB'], margins=True,
                                       aggfunc=np.mean)
        tm.assert_isinstance(rtable, Series)
        for item in ['DD', 'EE', 'FF']:
            gmarg = table[item]['All', '']
            self.assertEqual(gmarg, self.data[item].mean())

        # issue number #8349: pivot_table with margins and dictionary aggfunc

        df=DataFrame([  {'JOB':'Worker','NAME':'Bob' ,'YEAR':2013,'MONTH':12,'DAYS': 3,'SALARY': 17}, 
                        {'JOB':'Employ','NAME':'Mary','YEAR':2013,'MONTH':12,'DAYS': 5,'SALARY': 23}, 
                        {'JOB':'Worker','NAME':'Bob' ,'YEAR':2014,'MONTH': 1,'DAYS':10,'SALARY':100}, 
                        {'JOB':'Worker','NAME':'Bob' ,'YEAR':2014,'MONTH': 1,'DAYS':11,'SALARY':110}, 
                        {'JOB':'Employ','NAME':'Mary','YEAR':2014,'MONTH': 1,'DAYS':15,'SALARY':200}, 
                        {'JOB':'Worker','NAME':'Bob' ,'YEAR':2014,'MONTH': 2,'DAYS': 8,'SALARY': 80}, 
                        {'JOB':'Employ','NAME':'Mary','YEAR':2014,'MONTH': 2,'DAYS': 5,'SALARY':190} ])

        df=df.set_index(['JOB','NAME','YEAR','MONTH'],drop=False,append=False)

        rs=df.pivot_table(  index=['JOB','NAME'],
                            columns=['YEAR','MONTH'],
                            values=['DAYS','SALARY'],
                            aggfunc={'DAYS':'mean','SALARY':'sum'},
                            margins=True)

        ex=df.pivot_table(index=['JOB','NAME'],columns=['YEAR','MONTH'],values=['DAYS'],aggfunc='mean',margins=True)

        tm.assert_frame_equal(rs['DAYS'], ex['DAYS'])

        ex=df.pivot_table(index=['JOB','NAME'],columns=['YEAR','MONTH'],values=['SALARY'],aggfunc='sum',margins=True)

        tm.assert_frame_equal(rs['SALARY'], ex['SALARY'])
Пример #53
0
class TestPivotTable(unittest.TestCase):

    def setUp(self):
        self.data = DataFrame({'A' : ['foo', 'foo', 'foo', 'foo',
                                      'bar', 'bar', 'bar', 'bar',
                                      'foo', 'foo', 'foo'],
                               'B' : ['one', 'one', 'one', 'two',
                                      'one', 'one', 'one', 'two',
                                      'two', 'two', 'one'],
                               'C' : ['dull', 'dull', 'shiny', 'dull',
                                      'dull', 'shiny', 'shiny', 'dull',
                                      'shiny', 'shiny', 'shiny'],
                               'D' : np.random.randn(11),
                               'E' : np.random.randn(11),
                               'F' : np.random.randn(11)})

    def test_pivot_table(self):
        rows = ['A', 'B']
        cols=  'C'
        table = pivot_table(self.data, values='D', rows=rows, cols=cols)

        table2 = self.data.pivot_table(values='D', rows=rows, cols=cols)
        tm.assert_frame_equal(table, table2)

        # this works
        pivot_table(self.data, values='D', rows=rows)

        if len(rows) > 1:
            self.assertEqual(table.index.names, rows)
        else:
            self.assertEqual(table.index.name, rows[0])

        if len(cols) > 1:
            self.assertEqual(table.columns.names, cols)
        else:
            self.assertEqual(table.columns.name, cols[0])

        expected = self.data.groupby(rows + [cols])['D'].agg(np.mean).unstack()
        tm.assert_frame_equal(table, expected)

    def test_pass_array(self):
        result = self.data.pivot_table('D', rows=self.data.A, cols=self.data.C)
        expected = self.data.pivot_table('D', rows='A', cols='C')
        tm.assert_frame_equal(result, expected)

    def test_pass_function(self):
        result = self.data.pivot_table('D', rows=lambda x: x // 5,
                                       cols=self.data.C)
        expected = self.data.pivot_table('D', rows=self.data.index // 5,
                                         cols='C')
        tm.assert_frame_equal(result, expected)

    def test_pivot_table_multiple(self):
        rows = ['A', 'B']
        cols=  'C'
        table = pivot_table(self.data, rows=rows, cols=cols)
        expected = self.data.groupby(rows + [cols]).agg(np.mean).unstack()
        tm.assert_frame_equal(table, expected)

    def test_pivot_multi_values(self):
        result = pivot_table(self.data, values=['D', 'E'],
                             rows='A', cols=['B', 'C'], fill_value=0)
        expected = pivot_table(self.data.drop(['F'], axis=1),
                               rows='A', cols=['B', 'C'], fill_value=0)
        tm.assert_frame_equal(result, expected)

    def test_pivot_multi_functions(self):
        f = lambda func: pivot_table(self.data, values=['D', 'E'],
                                     rows=['A', 'B'], cols='C',
                                     aggfunc=func)
        result = f([np.mean, np.std])
        means = f(np.mean)
        stds = f(np.std)
        expected = concat([means, stds], keys=['mean', 'std'], axis=1)
        tm.assert_frame_equal(result, expected)

        # margins not supported??
        f = lambda func: pivot_table(self.data, values=['D', 'E'],
                                     rows=['A', 'B'], cols='C',
                                     aggfunc=func, margins=True)
        result = f([np.mean, np.std])
        means = f(np.mean)
        stds = f(np.std)
        expected = concat([means, stds], keys=['mean', 'std'], axis=1)
        tm.assert_frame_equal(result, expected)

    def test_margins(self):
        def _check_output(res, col, rows=['A', 'B'], cols=['C']):
            cmarg = res['All'][:-1]
            exp = self.data.groupby(rows)[col].mean()
            tm.assert_series_equal(cmarg, exp)

            rmarg = res.xs(('All', ''))[:-1]
            exp = self.data.groupby(cols)[col].mean()
            tm.assert_series_equal(rmarg, exp)

            gmarg = res['All']['All', '']
            exp = self.data[col].mean()
            self.assertEqual(gmarg, exp)

        # column specified
        table = self.data.pivot_table('D', rows=['A', 'B'], cols='C',
                                      margins=True, aggfunc=np.mean)
        _check_output(table, 'D')

        # no column specified
        table = self.data.pivot_table(rows=['A', 'B'], cols='C',
                                      margins=True, aggfunc=np.mean)
        for valcol in table.columns.levels[0]:
            _check_output(table[valcol], valcol)

        # no col

        # to help with a buglet
        self.data.columns = [k * 2 for k in self.data.columns]
        table = self.data.pivot_table(rows=['AA', 'BB'], margins=True,
                                      aggfunc=np.mean)
        for valcol in table.columns:
            gmarg = table[valcol]['All', '']
            self.assertEqual(gmarg, self.data[valcol].mean())

        # this is OK
        table = self.data.pivot_table(rows=['AA', 'BB'], margins=True,
                                      aggfunc='mean')

        # no rows
        rtable = self.data.pivot_table(cols=['AA', 'BB'], margins=True,
                                      aggfunc=np.mean)
        self.assert_(isinstance(rtable, Series))
        for item in ['DD', 'EE', 'FF']:
            gmarg = table[item]['All', '']
            self.assertEqual(gmarg, self.data[item].mean())

    def test_pivot_integer_columns(self):
        # caused by upstream bug in unstack
        from pandas.util.compat import product
        import datetime
        import pandas

        d = datetime.date.min
        data = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'],
                [d + datetime.timedelta(i) for i in xrange(20)], [1.0]))
        df = pandas.DataFrame(data)
        table = df.pivot_table(values=4, rows=[0,1,3],cols=[2])

        df2 = df.rename(columns=str)
        table2 = df2.pivot_table(values='4', rows=['0','1','3'], cols=['2'])

        tm.assert_frame_equal(table, table2)

    def test_pivot_no_level_overlap(self):
        # GH #1181

        data = DataFrame({'a': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'] * 2,
                          'b': [0, 0, 0, 0, 1, 1, 1, 1] * 2,
                          'c': (['foo'] * 4 + ['bar'] * 4) * 2,
                          'value': np.random.randn(16)})

        table = data.pivot_table('value', rows='a', cols=['b', 'c'])

        grouped = data.groupby(['a', 'b', 'c'])['value'].mean()
        expected = grouped.unstack('b').unstack('c').dropna(axis=1, how='all')
        tm.assert_frame_equal(table, expected)

    def test_pivot_columns_lexsorted(self):
        import datetime
        import numpy as np
        import pandas

        n = 10000

        dtype = np.dtype([
            ("Index", object),
            ("Symbol", object),
            ("Year", int),
            ("Month", int),
            ("Day", int),
            ("Quantity", int),
            ("Price", float),
        ])

        products = np.array([
            ('SP500', 'ADBE'),
            ('SP500', 'NVDA'),
            ('SP500', 'ORCL'),
            ('NDQ100', 'AAPL'),
            ('NDQ100', 'MSFT'),
            ('NDQ100', 'GOOG'),
            ('FTSE', 'DGE.L'),
            ('FTSE', 'TSCO.L'),
            ('FTSE', 'GSK.L'),
        ], dtype=[('Index', object), ('Symbol', object)])
        items = np.empty(n, dtype=dtype)
        iproduct = np.random.randint(0, len(products), n)
        items['Index'] = products['Index'][iproduct]
        items['Symbol'] = products['Symbol'][iproduct]
        dr = pandas.date_range(datetime.date(2000, 1, 1),
                               datetime.date(2010, 12, 31))
        dates = dr[np.random.randint(0, len(dr), n)]
        items['Year'] = dates.year
        items['Month'] = dates.month
        items['Day'] = dates.day
        items['Price'] = np.random.lognormal(4.0, 2.0, n)

        df = DataFrame(items)

        pivoted = df.pivot_table('Price', rows=['Month', 'Day'],
                                 cols=['Index', 'Symbol', 'Year'],
                                 aggfunc='mean')

        self.assert_(pivoted.columns.is_monotonic)
Пример #54
0
class TestPivotTable(unittest.TestCase):

    def setUp(self):
        self.data = DataFrame({'A' : ['foo', 'foo', 'foo', 'foo',
                                      'bar', 'bar', 'bar', 'bar',
                                      'foo', 'foo', 'foo'],
                               'B' : ['one', 'one', 'one', 'two',
                                      'one', 'one', 'one', 'two',
                                      'two', 'two', 'one'],
                               'C' : ['dull', 'dull', 'shiny', 'dull',
                                      'dull', 'shiny', 'shiny', 'dull',
                                      'shiny', 'shiny', 'shiny'],
                               'D' : np.random.randn(11),
                               'E' : np.random.randn(11),
                               'F' : np.random.randn(11)})

    def test_pivot_table(self):
        rows = ['A', 'B']
        cols=  'C'
        table = pivot_table(self.data, values='D', rows=rows, cols=cols)

        table2 = self.data.pivot_table(values='D', rows=rows, cols=cols)
        tm.assert_frame_equal(table, table2)

        # this works
        pivot_table(self.data, values='D', rows=rows)

        if len(rows) > 1:
            self.assertEqual(table.index.names, rows)
        else:
            self.assertEqual(table.index.name, rows[0])

        if len(cols) > 1:
            self.assertEqual(table.columns.names, cols)
        else:
            self.assertEqual(table.columns.name, cols[0])

        expected = self.data.groupby(rows + [cols])['D'].agg(np.mean).unstack()
        tm.assert_frame_equal(table, expected)

    def test_pivot_table_multiple(self):
        rows = ['A', 'B']
        cols=  'C'
        table = pivot_table(self.data, rows=rows, cols=cols)
        expected = self.data.groupby(rows + [cols]).agg(np.mean).unstack()
        tm.assert_frame_equal(table, expected)

    def test_pivot_multi_values(self):
        result = pivot_table(self.data, values=['D', 'E'],
                             rows='A', cols=['B', 'C'], fill_value=0)
        expected = pivot_table(self.data.drop(['F'], axis=1),
                               rows='A', cols=['B', 'C'], fill_value=0)
        tm.assert_frame_equal(result, expected)

    def test_pivot_multi_functions(self):
        f = lambda func: pivot_table(self.data, values=['D', 'E'],
                                     rows=['A', 'B'], cols='C',
                                     aggfunc=func)
        result = f([np.mean, np.std])
        means = f(np.mean)
        stds = f(np.std)
        expected = concat([means, stds], keys=['mean', 'std'], axis=1)
        tm.assert_frame_equal(result, expected)

        # margins not supported??
        f = lambda func: pivot_table(self.data, values=['D', 'E'],
                                     rows=['A', 'B'], cols='C',
                                     aggfunc=func, margins=True)
        result = f([np.mean, np.std])
        means = f(np.mean)
        stds = f(np.std)
        expected = concat([means, stds], keys=['mean', 'std'], axis=1)
        tm.assert_frame_equal(result, expected)

    def test_margins(self):
        def _check_output(res, col, rows=['A', 'B'], cols=['C']):
            cmarg = res['All'][:-1]
            exp = self.data.groupby(rows)[col].mean()
            tm.assert_series_equal(cmarg, exp)

            rmarg = res.xs(('All', ''))[:-1]
            exp = self.data.groupby(cols)[col].mean()
            tm.assert_series_equal(rmarg, exp)

            gmarg = res['All']['All', '']
            exp = self.data[col].mean()
            self.assertEqual(gmarg, exp)

        # column specified
        table = self.data.pivot_table('D', rows=['A', 'B'], cols='C',
                                      margins=True, aggfunc=np.mean)
        _check_output(table, 'D')

        # no column specified
        table = self.data.pivot_table(rows=['A', 'B'], cols='C',
                                      margins=True, aggfunc=np.mean)
        for valcol in table.columns.levels[0]:
            _check_output(table[valcol], valcol)

        # no col

        # to help with a buglet
        self.data.columns = [k * 2 for k in self.data.columns]
        table = self.data.pivot_table(rows=['AA', 'BB'], margins=True,
                                      aggfunc=np.mean)
        for valcol in table.columns:
            gmarg = table[valcol]['All', '']
            self.assertEqual(gmarg, self.data[valcol].mean())

        # this is OK
        table = self.data.pivot_table(rows=['AA', 'BB'], margins=True,
                                      aggfunc='mean')

        # no rows
        rtable = self.data.pivot_table(cols=['AA', 'BB'], margins=True,
                                      aggfunc=np.mean)
        self.assert_(isinstance(rtable, Series))
        for item in ['DD', 'EE', 'FF']:
            gmarg = table[item]['All', '']
            self.assertEqual(gmarg, self.data[item].mean())

    def test_pivot_integer_columns(self):
        # caused by upstream bug in unstack
        from pandas.util.compat import product
        import datetime
        import pandas

        d = datetime.date.min
        data = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'],
                [d + datetime.timedelta(i) for i in xrange(20)], [1.0]))
        df = pandas.DataFrame(data)
        table = df.pivot_table(values=4, rows=[0,1,3],cols=[2])

        df2 = df.rename(columns=str)
        table2 = df2.pivot_table(values='4', rows=['0','1','3'], cols=['2'])

        tm.assert_frame_equal(table, table2)
Пример #55
0
class TestPivotTable(unittest.TestCase):

    _multiprocess_can_split_ = True

    def setUp(self):
        self.data = DataFrame({'A': ['foo', 'foo', 'foo', 'foo',
                                     'bar', 'bar', 'bar', 'bar',
                                     'foo', 'foo', 'foo'],
                               'B': ['one', 'one', 'one', 'two',
                                     'one', 'one', 'one', 'two',
                                     'two', 'two', 'one'],
                               'C': ['dull', 'dull', 'shiny', 'dull',
                                     'dull', 'shiny', 'shiny', 'dull',
                                     'shiny', 'shiny', 'shiny'],
                               'D': np.random.randn(11),
                               'E': np.random.randn(11),
                               'F': np.random.randn(11)})

    def test_pivot_table(self):
        rows = ['A', 'B']
        cols = 'C'
        table = pivot_table(self.data, values='D', rows=rows, cols=cols)

        table2 = self.data.pivot_table(values='D', rows=rows, cols=cols)
        tm.assert_frame_equal(table, table2)

        # this works
        pivot_table(self.data, values='D', rows=rows)

        if len(rows) > 1:
            self.assertEqual(table.index.names, tuple(rows))
        else:
            self.assertEqual(table.index.name, rows[0])

        if len(cols) > 1:
            self.assertEqual(table.columns.names, cols)
        else:
            self.assertEqual(table.columns.name, cols[0])

        expected = self.data.groupby(rows + [cols])['D'].agg(np.mean).unstack()
        tm.assert_frame_equal(table, expected)

    def test_pivot_table_nocols(self):
        df = DataFrame({'rows': ['a', 'b', 'c'],
                        'cols': ['x', 'y', 'z'],
                        'values': [1,2,3]})
        rs = df.pivot_table(cols='cols', aggfunc=np.sum)
        xp = df.pivot_table(rows='cols', aggfunc=np.sum).T
        tm.assert_frame_equal(rs, xp)

        rs = df.pivot_table(cols='cols', aggfunc={'values': 'mean'})
        xp = df.pivot_table(rows='cols', aggfunc={'values': 'mean'}).T
        tm.assert_frame_equal(rs, xp)

    def test_pivot_table_dropna(self):
        df = DataFrame({'amount': {0: 60000, 1: 100000, 2: 50000, 3: 30000},
                        'customer': {0: 'A', 1: 'A', 2: 'B', 3: 'C'},
                        'month': {0: 201307, 1: 201309, 2: 201308, 3: 201310},
                        'product': {0: 'a', 1: 'b', 2: 'c', 3: 'd'},
                        'quantity': {0: 2000000, 1: 500000, 2: 1000000, 3: 1000000}})
        pv_col = df.pivot_table('quantity', 'month', ['customer', 'product'], dropna=False)
        pv_ind = df.pivot_table('quantity', ['customer', 'product'], 'month', dropna=False)

        m = MultiIndex.from_tuples([(u('A'), u('a')),
                                    (u('A'), u('b')),
                                    (u('A'), u('c')),
                                    (u('A'), u('d')),
                                    (u('B'), u('a')),
                                    (u('B'), u('b')),
                                    (u('B'), u('c')),
                                    (u('B'), u('d')),
                                    (u('C'), u('a')),
                                    (u('C'), u('b')),
                                    (u('C'), u('c')),
                                    (u('C'), u('d'))])

        assert_equal(pv_col.columns.values, m.values)
        assert_equal(pv_ind.index.values, m.values)


    def test_pass_array(self):
        result = self.data.pivot_table('D', rows=self.data.A, cols=self.data.C)
        expected = self.data.pivot_table('D', rows='A', cols='C')
        tm.assert_frame_equal(result, expected)

    def test_pass_function(self):
        result = self.data.pivot_table('D', rows=lambda x: x // 5,
                                       cols=self.data.C)
        expected = self.data.pivot_table('D', rows=self.data.index // 5,
                                         cols='C')
        tm.assert_frame_equal(result, expected)

    def test_pivot_table_multiple(self):
        rows = ['A', 'B']
        cols = 'C'
        table = pivot_table(self.data, rows=rows, cols=cols)
        expected = self.data.groupby(rows + [cols]).agg(np.mean).unstack()
        tm.assert_frame_equal(table, expected)

    def test_pivot_dtypes(self):

        # can convert dtypes
        f = DataFrame({'a' : ['cat', 'bat', 'cat', 'bat'], 'v' : [1,2,3,4], 'i' : ['a','b','a','b']})
        self.assert_(f.dtypes['v'] == 'int64')

        z = pivot_table(f, values='v', rows=['a'], cols=['i'], fill_value=0, aggfunc=np.sum)
        result = z.get_dtype_counts()
        expected = Series(dict(int64 = 2))
        tm.assert_series_equal(result, expected)

        # cannot convert dtypes
        f = DataFrame({'a' : ['cat', 'bat', 'cat', 'bat'], 'v' : [1.5,2.5,3.5,4.5], 'i' : ['a','b','a','b']})
        self.assert_(f.dtypes['v'] == 'float64')

        z = pivot_table(f, values='v', rows=['a'], cols=['i'], fill_value=0, aggfunc=np.mean)
        result = z.get_dtype_counts()
        expected = Series(dict(float64 = 2))
        tm.assert_series_equal(result, expected)

    def test_pivot_multi_values(self):
        result = pivot_table(self.data, values=['D', 'E'],
                             rows='A', cols=['B', 'C'], fill_value=0)
        expected = pivot_table(self.data.drop(['F'], axis=1),
                               rows='A', cols=['B', 'C'], fill_value=0)
        tm.assert_frame_equal(result, expected)

    def test_pivot_multi_functions(self):
        f = lambda func: pivot_table(self.data, values=['D', 'E'],
                                     rows=['A', 'B'], cols='C',
                                     aggfunc=func)
        result = f([np.mean, np.std])
        means = f(np.mean)
        stds = f(np.std)
        expected = concat([means, stds], keys=['mean', 'std'], axis=1)
        tm.assert_frame_equal(result, expected)

        # margins not supported??
        f = lambda func: pivot_table(self.data, values=['D', 'E'],
                                     rows=['A', 'B'], cols='C',
                                     aggfunc=func, margins=True)
        result = f([np.mean, np.std])
        means = f(np.mean)
        stds = f(np.std)
        expected = concat([means, stds], keys=['mean', 'std'], axis=1)
        tm.assert_frame_equal(result, expected)

    def test_pivot_index_with_nan(self):
        # GH 3588
        nan = np.nan
        df = DataFrame({"a":['R1', 'R2', nan, 'R4'], 'b':["C1", "C2", "C3" , "C4"], "c":[10, 15, nan , 20]})
        result = df.pivot('a','b','c')
        expected = DataFrame([[nan,nan,nan,nan],[nan,10,nan,nan],
                              [nan,nan,nan,nan],[nan,nan,15,20]],
                             index = Index(['R1','R2',nan,'R4'],name='a'),
                             columns = Index(['C1','C2','C3','C4'],name='b'))
        tm.assert_frame_equal(result, expected)

    def test_margins(self):
        def _check_output(res, col, rows=['A', 'B'], cols=['C']):
            cmarg = res['All'][:-1]
            exp = self.data.groupby(rows)[col].mean()
            tm.assert_series_equal(cmarg, exp)

            res.sortlevel(inplace=True)
            rmarg = res.xs(('All', ''))[:-1]
            exp = self.data.groupby(cols)[col].mean()
            tm.assert_series_equal(rmarg, exp)

            gmarg = res['All']['All', '']
            exp = self.data[col].mean()
            self.assertEqual(gmarg, exp)

        # column specified
        table = self.data.pivot_table('D', rows=['A', 'B'], cols='C',
                                      margins=True, aggfunc=np.mean)
        _check_output(table, 'D')

        # no column specified
        table = self.data.pivot_table(rows=['A', 'B'], cols='C',
                                      margins=True, aggfunc=np.mean)
        for valcol in table.columns.levels[0]:
            _check_output(table[valcol], valcol)

        # no col

        # to help with a buglet
        self.data.columns = [k * 2 for k in self.data.columns]
        table = self.data.pivot_table(rows=['AA', 'BB'], margins=True,
                                      aggfunc=np.mean)
        for valcol in table.columns:
            gmarg = table[valcol]['All', '']
            self.assertEqual(gmarg, self.data[valcol].mean())

        # this is OK
        table = self.data.pivot_table(rows=['AA', 'BB'], margins=True,
                                      aggfunc='mean')

        # no rows
        rtable = self.data.pivot_table(cols=['AA', 'BB'], margins=True,
                                       aggfunc=np.mean)
        tm.assert_isinstance(rtable, Series)
        for item in ['DD', 'EE', 'FF']:
            gmarg = table[item]['All', '']
            self.assertEqual(gmarg, self.data[item].mean())

    def test_pivot_integer_columns(self):
        # caused by upstream bug in unstack

        d = datetime.date.min
        data = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'],
                            [d + datetime.timedelta(i) for i in range(20)], [1.0]))
        df = pandas.DataFrame(data)
        table = df.pivot_table(values=4, rows=[0, 1, 3], cols=[2])

        df2 = df.rename(columns=str)
        table2 = df2.pivot_table(values='4', rows=['0', '1', '3'], cols=['2'])

        tm.assert_frame_equal(table, table2, check_names=False)

    def test_pivot_no_level_overlap(self):
        # GH #1181

        data = DataFrame({'a': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'] * 2,
                          'b': [0, 0, 0, 0, 1, 1, 1, 1] * 2,
                          'c': (['foo'] * 4 + ['bar'] * 4) * 2,
                          'value': np.random.randn(16)})

        table = data.pivot_table('value', rows='a', cols=['b', 'c'])

        grouped = data.groupby(['a', 'b', 'c'])['value'].mean()
        expected = grouped.unstack('b').unstack('c').dropna(axis=1, how='all')
        tm.assert_frame_equal(table, expected)

    def test_pivot_columns_lexsorted(self):

        n = 10000

        dtype = np.dtype([
            ("Index", object),
            ("Symbol", object),
            ("Year", int),
            ("Month", int),
            ("Day", int),
            ("Quantity", int),
            ("Price", float),
        ])

        products = np.array([
            ('SP500', 'ADBE'),
            ('SP500', 'NVDA'),
            ('SP500', 'ORCL'),
            ('NDQ100', 'AAPL'),
            ('NDQ100', 'MSFT'),
            ('NDQ100', 'GOOG'),
            ('FTSE', 'DGE.L'),
            ('FTSE', 'TSCO.L'),
            ('FTSE', 'GSK.L'),
        ], dtype=[('Index', object), ('Symbol', object)])
        items = np.empty(n, dtype=dtype)
        iproduct = np.random.randint(0, len(products), n)
        items['Index'] = products['Index'][iproduct]
        items['Symbol'] = products['Symbol'][iproduct]
        dr = pandas.date_range(datetime.date(2000, 1, 1),
                               datetime.date(2010, 12, 31))
        dates = dr[np.random.randint(0, len(dr), n)]
        items['Year'] = dates.year
        items['Month'] = dates.month
        items['Day'] = dates.day
        items['Price'] = np.random.lognormal(4.0, 2.0, n)

        df = DataFrame(items)

        pivoted = df.pivot_table('Price', rows=['Month', 'Day'],
                                 cols=['Index', 'Symbol', 'Year'],
                                 aggfunc='mean')

        self.assert_(pivoted.columns.is_monotonic)

    def test_pivot_complex_aggfunc(self):
        f = {'D': ['std'], 'E': ['sum']}
        expected = self.data.groupby(['A', 'B']).agg(f).unstack('B')
        result = self.data.pivot_table(rows='A', cols='B', aggfunc=f)

        tm.assert_frame_equal(result, expected)

    def test_margins_no_values_no_cols(self):
        # Regression test on pivot table: no values or cols passed.
        result = self.data[['A', 'B']].pivot_table(rows=['A', 'B'], aggfunc=len, margins=True)
        result_list = result.tolist()
        self.assertEqual(sum(result_list[:-1]), result_list[-1])

    def test_margins_no_values_two_rows(self):
        # Regression test on pivot table: no values passed but rows are a multi-index
        result = self.data[['A', 'B', 'C']].pivot_table(rows=['A', 'B'], cols='C', aggfunc=len, margins=True)
        self.assertEqual(result.All.tolist(), [3.0, 1.0, 4.0, 3.0, 11.0])

    def test_margins_no_values_one_row_one_col(self):
        # Regression test on pivot table: no values passed but row and col defined
        result = self.data[['A', 'B']].pivot_table(rows='A', cols='B', aggfunc=len, margins=True)
        self.assertEqual(result.All.tolist(), [4.0, 7.0, 11.0])

    def test_margins_no_values_two_row_two_cols(self):
        # Regression test on pivot table: no values passed but rows and cols are multi-indexed
        self.data['D'] = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k']
        result = self.data[['A', 'B', 'C', 'D']].pivot_table(rows=['A', 'B'], cols=['C', 'D'], aggfunc=len, margins=True)
        self.assertEqual(result.All.tolist(), [3.0, 1.0, 4.0, 3.0, 11.0])
Пример #56
0
def crosstab(index, columns, values=None, rownames=None, colnames=None,
             aggfunc=None, margins=False, margins_name='All', dropna=True,
             normalize=False):
    """
    Compute a simple cross-tabulation of two (or more) factors. By default
    computes a frequency table of the factors unless an array of values and an
    aggregation function are passed

    Parameters
    ----------
    index : array-like, Series, or list of arrays/Series
        Values to group by in the rows
    columns : array-like, Series, or list of arrays/Series
        Values to group by in the columns
    values : array-like, optional
        Array of values to aggregate according to the factors.
        Requires `aggfunc` be specified.
    aggfunc : function, optional
        If specified, requires `values` be specified as well
    rownames : sequence, default None
        If passed, must match number of row arrays passed
    colnames : sequence, default None
        If passed, must match number of column arrays passed
    margins : boolean, default False
        Add row/column margins (subtotals)
    margins_name : string, default 'All'
        Name of the row / column that will contain the totals
        when margins is True.

        .. versionadded:: 0.21.0

    dropna : boolean, default True
        Do not include columns whose entries are all NaN
    normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False
        Normalize by dividing all values by the sum of values.

        - If passed 'all' or `True`, will normalize over all values.
        - If passed 'index' will normalize over each row.
        - If passed 'columns' will normalize over each column.
        - If margins is `True`, will also normalize margin values.

        .. versionadded:: 0.18.1


    Notes
    -----
    Any Series passed will have their name attributes used unless row or column
    names for the cross-tabulation are specified.

    Any input passed containing Categorical data will have **all** of its
    categories included in the cross-tabulation, even if the actual data does
    not contain any instances of a particular category.

    In the event that there aren't overlapping indexes an empty DataFrame will
    be returned.

    Examples
    --------
    >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar",
    ...               "bar", "bar", "foo", "foo", "foo"], dtype=object)
    >>> b = np.array(["one", "one", "one", "two", "one", "one",
    ...               "one", "two", "two", "two", "one"], dtype=object)
    >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny",
    ...               "shiny", "dull", "shiny", "shiny", "shiny"],
    ...               dtype=object)

    >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
    ... # doctest: +NORMALIZE_WHITESPACE
    b   one        two
    c   dull shiny dull shiny
    a
    bar    1     2    1     0
    foo    2     2    1     2

    >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])
    >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f'])
    >>> crosstab(foo, bar)  # 'c' and 'f' are not represented in the data,
                            # and will not be shown in the output because
                            # dropna is True by default. Set 'dropna=False'
                            # to preserve categories with no data
    ... # doctest: +SKIP
    col_0  d  e
    row_0
    a      1  0
    b      0  1

    >>> crosstab(foo, bar, dropna=False)  # 'c' and 'f' are not represented
                            # in the data, but they still will be counted
                            # and shown in the output
    ... # doctest: +SKIP
    col_0  d  e  f
    row_0
    a      1  0  0
    b      0  1  0
    c      0  0  0

    Returns
    -------
    crosstab : DataFrame
    """

    index = com.maybe_make_list(index)
    columns = com.maybe_make_list(columns)

    rownames = _get_names(index, rownames, prefix='row')
    colnames = _get_names(columns, colnames, prefix='col')

    common_idx = _get_objs_combined_axis(index + columns, intersect=True,
                                         sort=False)

    data = {}
    data.update(zip(rownames, index))
    data.update(zip(colnames, columns))

    if values is None and aggfunc is not None:
        raise ValueError("aggfunc cannot be used without values.")

    if values is not None and aggfunc is None:
        raise ValueError("values cannot be used without an aggfunc.")

    from pandas import DataFrame
    df = DataFrame(data, index=common_idx)
    if values is None:
        df['__dummy__'] = 0
        kwargs = {'aggfunc': len, 'fill_value': 0}
    else:
        df['__dummy__'] = values
        kwargs = {'aggfunc': aggfunc}

    table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
                           margins=margins, margins_name=margins_name,
                           dropna=dropna, **kwargs)

    # Post-process
    if normalize is not False:
        table = _normalize(table, normalize=normalize, margins=margins,
                           margins_name=margins_name)

    return table
Пример #57
0
def crosstab(index, columns, values=None, rownames=None, colnames=None,
             aggfunc=None, margins=False, dropna=True, normalize=False):
    """
    Compute a simple cross-tabulation of two (or more) factors. By default
    computes a frequency table of the factors unless an array of values and an
    aggregation function are passed

    Parameters
    ----------
    index : array-like, Series, or list of arrays/Series
        Values to group by in the rows
    columns : array-like, Series, or list of arrays/Series
        Values to group by in the columns
    values : array-like, optional
        Array of values to aggregate according to the factors.
        Requires `aggfunc` be specified.
    aggfunc : function, optional
        If specified, requires `values` be specified as well
    rownames : sequence, default None
        If passed, must match number of row arrays passed
    colnames : sequence, default None
        If passed, must match number of column arrays passed
    margins : boolean, default False
        Add row/column margins (subtotals)
    dropna : boolean, default True
        Do not include columns whose entries are all NaN
    normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False
        Normalize by dividing all values by the sum of values.

        - If passed 'all' or `True`, will normalize over all values.
        - If passed 'index' will normalize over each row.
        - If passed 'columns' will normalize over each column.
        - If margins is `True`, will also normalize margin values.

        .. versionadded:: 0.18.1


    Notes
    -----
    Any Series passed will have their name attributes used unless row or column
    names for the cross-tabulation are specified.

    Any input passed containing Categorical data will have **all** of its
    categories included in the cross-tabulation, even if the actual data does
    not contain any instances of a particular category.

    In the event that there aren't overlapping indexes an empty DataFrame will
    be returned.

    Examples
    --------
    >>> a
    array([foo, foo, foo, foo, bar, bar,
           bar, bar, foo, foo, foo], dtype=object)
    >>> b
    array([one, one, one, two, one, one,
           one, two, two, two, one], dtype=object)
    >>> c
    array([dull, dull, shiny, dull, dull, shiny,
           shiny, dull, shiny, shiny, shiny], dtype=object)

    >>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
    b    one          two
    c    dull  shiny  dull  shiny
    a
    bar  1     2      1     0
    foo  2     2      1     2

    >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])
    >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f'])
    >>> crosstab(foo, bar)  # 'c' and 'f' are not represented in the data,
                            # but they still will be counted in the output
    col_0  d  e  f
    row_0
    a      1  0  0
    b      0  1  0
    c      0  0  0

    Returns
    -------
    crosstab : DataFrame
    """

    index = com._maybe_make_list(index)
    columns = com._maybe_make_list(columns)

    rownames = _get_names(index, rownames, prefix='row')
    colnames = _get_names(columns, colnames, prefix='col')

    data = {}
    data.update(zip(rownames, index))
    data.update(zip(colnames, columns))

    if values is None and aggfunc is not None:
        raise ValueError("aggfunc cannot be used without values.")

    if values is not None and aggfunc is None:
        raise ValueError("values cannot be used without an aggfunc.")

    if values is None:
        df = DataFrame(data)
        df['__dummy__'] = 0
        table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
                               aggfunc=len, margins=margins, dropna=dropna)
        table = table.fillna(0).astype(np.int64)

    else:
        data['__dummy__'] = values
        df = DataFrame(data)
        table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
                               aggfunc=aggfunc, margins=margins, dropna=dropna)

    # Post-process
    if normalize is not False:
        table = _normalize(table, normalize=normalize, margins=margins)

    return table
Пример #58
0
def crosstab(index, columns, values=None, rownames=None, colnames=None,
             aggfunc=None, margins=False, dropna=True, **kwarg):
    """
    Compute a simple cross-tabulation of two (or more) factors. By default
    computes a frequency table of the factors unless an array of values and an
    aggregation function are passed

    Parameters
    ----------
    index : array-like, Series, or list of arrays/Series
        Values to group by in the rows
    columns : array-like, Series, or list of arrays/Series
        Values to group by in the columns
    values : array-like, optional
        Array of values to aggregate according to the factors
    aggfunc : function, optional
        If no values array is passed, computes a frequency table
    rownames : sequence, default None
        If passed, must match number of row arrays passed
    colnames : sequence, default None
        If passed, must match number of column arrays passed
    margins : boolean, default False
        Add row/column margins (subtotals)
    dropna : boolean, default True
        Do not include columns whose entries are all NaN
    rows : kwarg only alias of index [deprecated]
    cols : kwarg only alias of columns [deprecated]

    Notes
    -----
    Any Series passed will have their name attributes used unless row or column
    names for the cross-tabulation are specified

    Examples
    --------
    >>> a
    array([foo, foo, foo, foo, bar, bar,
           bar, bar, foo, foo, foo], dtype=object)
    >>> b
    array([one, one, one, two, one, one,
           one, two, two, two, one], dtype=object)
    >>> c
    array([dull, dull, shiny, dull, dull, shiny,
           shiny, dull, shiny, shiny, shiny], dtype=object)

    >>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
    b    one          two
    c    dull  shiny  dull  shiny
    a
    bar  1     2      1     0
    foo  2     2      1     2

    Returns
    -------
    crosstab : DataFrame
    """
    # Parse old-style keyword arguments
    rows = kwarg.pop('rows', None)
    if rows is not None:
        warnings.warn("rows is deprecated, use index", FutureWarning)
        if index is None:
            index = rows
        else:
            msg = "Can only specify either 'rows' or 'index'"
            raise TypeError(msg)

    cols = kwarg.pop('cols', None)
    if cols is not None:
        warnings.warn("cols is deprecated, use columns", FutureWarning)
        if columns is None:
            columns = cols
        else:
            msg = "Can only specify either 'cols' or 'columns'"
            raise TypeError(msg)

    if kwarg:
        raise TypeError("Unexpected argument(s): %s" % kwarg.keys())

    index = com._maybe_make_list(index)
    columns = com._maybe_make_list(columns)

    rownames = _get_names(index, rownames, prefix='row')
    colnames = _get_names(columns, colnames, prefix='col')

    data = {}
    data.update(zip(rownames, index))
    data.update(zip(colnames, columns))

    if values is None:
        df = DataFrame(data)
        df['__dummy__'] = 0
        table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
                               aggfunc=len, margins=margins, dropna=dropna)
        return table.fillna(0).astype(np.int64)
    else:
        data['__dummy__'] = values
        df = DataFrame(data)
        table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
                               aggfunc=aggfunc, margins=margins, dropna=dropna)
        return table
Пример #59
0
def crosstab(index, columns, values=None, rownames=None, colnames=None,
             aggfunc=None, margins=False, dropna=True):
    """
    Compute a simple cross-tabulation of two (or more) factors. By default
    computes a frequency table of the factors unless an array of values and an
    aggregation function are passed

    Parameters
    ----------
    index : array-like, Series, or list of arrays/Series
        Values to group by in the rows
    columns : array-like, Series, or list of arrays/Series
        Values to group by in the columns
    values : array-like, optional
        Array of values to aggregate according to the factors
    aggfunc : function, optional
        If no values array is passed, computes a frequency table
    rownames : sequence, default None
        If passed, must match number of row arrays passed
    colnames : sequence, default None
        If passed, must match number of column arrays passed
    margins : boolean, default False
        Add row/column margins (subtotals)
    dropna : boolean, default True
        Do not include columns whose entries are all NaN

    Notes
    -----
    Any Series passed will have their name attributes used unless row or column
    names for the cross-tabulation are specified

    Examples
    --------
    >>> a
    array([foo, foo, foo, foo, bar, bar,
           bar, bar, foo, foo, foo], dtype=object)
    >>> b
    array([one, one, one, two, one, one,
           one, two, two, two, one], dtype=object)
    >>> c
    array([dull, dull, shiny, dull, dull, shiny,
           shiny, dull, shiny, shiny, shiny], dtype=object)

    >>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
    b    one          two
    c    dull  shiny  dull  shiny
    a
    bar  1     2      1     0
    foo  2     2      1     2

    Returns
    -------
    crosstab : DataFrame
    """

    index = com._maybe_make_list(index)
    columns = com._maybe_make_list(columns)

    rownames = _get_names(index, rownames, prefix='row')
    colnames = _get_names(columns, colnames, prefix='col')

    data = {}
    data.update(zip(rownames, index))
    data.update(zip(colnames, columns))

    if values is None:
        df = DataFrame(data)
        df['__dummy__'] = 0
        table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
                               aggfunc=len, margins=margins, dropna=dropna)
        return table.fillna(0).astype(np.int64)
    else:
        data['__dummy__'] = values
        df = DataFrame(data)
        table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
                               aggfunc=aggfunc, margins=margins, dropna=dropna)
        return table