예제 #1
0
    def __init__(self, left, right, how='inner', on=None,
                 left_on=None, right_on=None, axis=1,
                 left_index=False, right_index=False, sort=True,
                 suffixes=('_x', '_y'), copy=True):
        """Build a join object layered over the provided RDD operands.

        Every argument is stored on the instance under its own name.
        ``how`` is first translated by ``self._map_how``; ``on``,
        ``left_on`` and ``right_on`` are each passed through
        ``com._maybe_make_list`` before being stored.
        """
        as_list = com._maybe_make_list

        # The two operands of the join.
        self.left = left
        self.right = right

        # Join type (translated by ``_map_how``) and the axis option.
        self.how = self._map_how(how)
        self.axis = axis

        # Key specifications, each run through ``com._maybe_make_list``.
        self.on = as_list(on)
        self.left_on = as_list(left_on)
        self.right_on = as_list(right_on)

        # Index flags, stored exactly as given.
        self.left_index = left_index
        self.right_index = right_index

        # Remaining options, stored exactly as given.
        self.sort = sort
        self.suffixes = suffixes
        self.copy = copy
예제 #2
0
    def __init__(self, left, right, how='inner', on=None,
                 left_on=None, right_on=None, axis=1,
                 left_index=False, right_index=False, sort=True,
                 suffixes=('_x', '_y'), copy=True):
        """Store the merge operands and options, then compute merge keys.

        ``left`` and ``right`` are each stored twice, once as the working
        attribute and once as ``orig_left`` / ``orig_right``.  ``on``,
        ``left_on`` and ``right_on`` are passed through
        ``com._maybe_make_list``.  Once all options are stored,
        ``self._get_merge_keys()`` is called and its three results are
        kept as ``left_join_keys``, ``right_join_keys`` and ``join_names``.
        """
        # Working operands plus a second reference to the same objects.
        self.left = left
        self.orig_left = left
        self.right = right
        self.orig_right = right

        self.how = how
        self.axis = axis

        # Key specifications, each run through ``com._maybe_make_list``.
        as_list = com._maybe_make_list
        self.on = as_list(on)
        self.left_on = as_list(left_on)
        self.right_on = as_list(right_on)

        self.left_index = left_index
        self.right_index = right_index

        self.sort = sort
        self.suffixes = suffixes
        self.copy = copy

        # Must run last: the original author notes that this call has side
        # effects, and it runs only after every option above is stored.
        merge_keys = self._get_merge_keys()
        (self.left_join_keys,
         self.right_join_keys,
         self.join_names) = merge_keys
예제 #3
0
파일: merge.py 프로젝트: ogrisel/pandas
    def __init__(self, left, right, how='inner', on=None, left_on=None,
                 right_on=None, axis=1, left_index=False,
                 right_index=False, sort=True, suffixes=('.x', '.y'),
                 copy=True):
        """Store the merge operands and options, then compute merge keys.

        ``left`` / ``right`` are also kept as ``orig_left`` /
        ``orig_right``.  ``on``, ``left_on`` and ``right_on`` are passed
        through ``com._maybe_make_list``.  ``drop_keys`` starts out as
        ``False``.  Finally ``self._get_merge_keys()`` is called and its
        three results are stored as ``left_join_keys``,
        ``right_join_keys`` and ``join_names``.
        """
        self.left = left
        self.orig_left = left
        self.right = right
        self.orig_right = right

        self.how = how
        self.axis = axis
        self.sort = sort
        self.copy = copy
        self.suffixes = suffixes

        self.left_index = left_index
        self.right_index = right_index

        # Key specifications, each run through ``com._maybe_make_list``.
        self.on = com._maybe_make_list(on)
        self.left_on = com._maybe_make_list(left_on)
        self.right_on = com._maybe_make_list(right_on)

        # Placeholder value; the original author calls this a kludge that
        # is "set later".
        self.drop_keys = False

        # Must run last: the original author notes that this call has side
        # effects.
        merge_keys = self._get_merge_keys()
        self.left_join_keys, self.right_join_keys, self.join_names = merge_keys
예제 #4
0
파일: merge.py 프로젝트: brianhuey/pandas
    def __init__(self, left, right, how='inner', on=None, left_on=None,
                 right_on=None, axis=1, left_index=False,
                 right_index=False, sort=True, suffixes=('_x', '_y'),
                 copy=True, indicator=False):
        """Store the merge operands and options, then compute merge keys.

        ``left`` / ``right`` are also kept as ``orig_left`` /
        ``orig_right``; ``on``, ``left_on`` and ``right_on`` are passed
        through ``com._maybe_make_list``.

        ``indicator`` determines ``indicator_name``: a string is used as
        the name itself, ``True`` gives ``'_merge'`` and ``False`` gives
        ``None``.

        Raises
        ------
        ValueError
            If ``indicator`` is neither a string nor a bool.
        """
        self.left = left
        self.orig_left = left
        self.right = right
        self.orig_right = right

        self.how = how
        self.axis = axis

        # Key specifications, each run through ``com._maybe_make_list``.
        as_list = com._maybe_make_list
        self.on = as_list(on)
        self.left_on = as_list(left_on)
        self.right_on = as_list(right_on)

        self.copy = copy
        self.suffixes = suffixes
        self.sort = sort

        self.left_index = left_index
        self.right_index = right_index

        self.indicator = indicator

        # Work out the indicator name from the type of ``indicator``.
        if isinstance(indicator, compat.string_types):
            indicator_name = indicator
        elif isinstance(indicator, bool):
            indicator_name = '_merge' if indicator else None
        else:
            raise ValueError('indicator option can only accept boolean '
                             'or string arguments')
        self.indicator_name = indicator_name

        # Must run last: the original author notes that this call has side
        # effects.
        merge_keys = self._get_merge_keys()
        self.left_join_keys, self.right_join_keys, self.join_names = merge_keys
예제 #5
0
파일: merge.py 프로젝트: nitfer/pandas
    def __init__(
        self,
        left,
        right,
        how="inner",
        on=None,
        left_on=None,
        right_on=None,
        axis=1,
        left_index=False,
        right_index=False,
        sort=True,
        suffixes=("_x", "_y"),
        copy=True,
    ):
        """Store the merge operands and options, then compute merge keys.

        ``left`` / ``right`` are also kept as ``orig_left`` /
        ``orig_right``; ``on``, ``left_on`` and ``right_on`` are passed
        through ``com._maybe_make_list``.  The last step calls
        ``self._get_merge_keys()`` and stores its three results as
        ``left_join_keys``, ``right_join_keys`` and ``join_names``.
        """
        # Working operands plus a second reference to the same objects.
        self.left = left
        self.orig_left = left
        self.right = right
        self.orig_right = right

        # Options stored exactly as given.
        self.how = how
        self.axis = axis
        self.copy = copy
        self.suffixes = suffixes
        self.sort = sort
        self.left_index = left_index
        self.right_index = right_index

        # Key specifications, each run through ``com._maybe_make_list``.
        to_list = com._maybe_make_list
        self.on = to_list(on)
        self.left_on = to_list(left_on)
        self.right_on = to_list(right_on)

        # Must run last: the original author notes that this call has side
        # effects.
        left_keys, right_keys, names = self._get_merge_keys()
        self.left_join_keys = left_keys
        self.right_join_keys = right_keys
        self.join_names = names
예제 #6
0
파일: merge.py 프로젝트: tdhock/pandas
    def __init__(self, left, right, how='inner', on=None,
                 left_on=None, right_on=None, axis=1,
                 left_index=False, right_index=False,
                 sort=True, suffixes=('_x', '_y'),
                 copy=True, indicator=False):
        """Set up a merge operation from its operands and options.

        The operands are stored as ``left`` / ``right`` and again as
        ``orig_left`` / ``orig_right``.  The three key arguments go
        through ``com._maybe_make_list``; the rest are stored unchanged.

        ``indicator_name`` is derived from ``indicator``: the string
        itself when a string is given, ``'_merge'`` for ``True`` and
        ``None`` for ``False``.

        Raises
        ------
        ValueError
            If ``indicator`` is neither a string nor a bool.
        """
        self.left = left
        self.orig_left = left
        self.right = right
        self.orig_right = right
        self.how = how
        self.axis = axis

        self.on = com._maybe_make_list(on)
        self.left_on = com._maybe_make_list(left_on)
        self.right_on = com._maybe_make_list(right_on)

        self.sort = sort
        self.suffixes = suffixes
        self.copy = copy

        self.left_index = left_index
        self.right_index = right_index

        self.indicator = indicator

        # A string names the indicator directly; a bool switches the
        # default name on or off; any other type is rejected.
        if isinstance(indicator, compat.string_types):
            self.indicator_name = indicator
        elif isinstance(indicator, bool):
            if indicator:
                self.indicator_name = '_merge'
            else:
                self.indicator_name = None
        else:
            message = ('indicator option can only accept boolean '
                       'or string arguments')
            raise ValueError(message)

        # Must run last: the original author notes that this call has side
        # effects.
        left_keys, right_keys, names = self._get_merge_keys()
        self.left_join_keys = left_keys
        self.right_join_keys = right_keys
        self.join_names = names
예제 #7
0
파일: merge.py 프로젝트: AbnerZheng/pandas
    def __init__(self, left, right, how='inner', on=None, left_on=None,
                 right_on=None, axis=1, left_index=False,
                 right_index=False, sort=True, suffixes=('_x', '_y'),
                 copy=True, indicator=False):
        """Store, validate and prepare the inputs of a merge.

        The operands are stored as ``left`` / ``right`` and again as
        ``orig_left`` / ``orig_right``; ``on``, ``left_on`` and
        ``right_on`` are passed through ``com._maybe_make_list``.
        ``indicator_name`` is derived from ``indicator`` (a string is
        used as is, ``True`` gives ``'_merge'``, ``False`` gives
        ``None``).

        A ``UserWarning`` is issued when the two operands' columns have
        a different number of levels.  The last step calls
        ``self._get_merge_keys()`` and stores its three results.

        Raises
        ------
        ValueError
            If ``indicator`` is neither a string nor a bool, or if
            ``left`` or ``right`` is not a ``DataFrame``.
        """
        self.left = left
        self.orig_left = left
        self.right = right
        self.orig_right = right

        self.how = how
        self.axis = axis

        # Key specifications, each run through ``com._maybe_make_list``.
        as_list = com._maybe_make_list
        self.on = as_list(on)
        self.left_on = as_list(left_on)
        self.right_on = as_list(right_on)

        self.copy = copy
        self.suffixes = suffixes
        self.sort = sort

        self.left_index = left_index
        self.right_index = right_index

        self.indicator = indicator

        # Work out the indicator name from the type of ``indicator``.
        if isinstance(indicator, compat.string_types):
            indicator_name = indicator
        elif isinstance(indicator, bool):
            indicator_name = '_merge' if indicator else None
        else:
            raise ValueError(
                'indicator option can only accept boolean or string arguments')
        self.indicator_name = indicator_name

        # Both operands must be DataFrames; the left one is checked first.
        for operand in (left, right):
            if not isinstance(operand, DataFrame):
                raise ValueError(
                    'can not merge DataFrame with instance of '
                    'type {0}'.format(type(operand)))

        # Warn the user when merging between different column levels.
        left_levels = left.columns.nlevels
        right_levels = right.columns.nlevels
        if left_levels != right_levels:
            warnings.warn(
                ('merging between different levels can give an unintended '
                 'result ({0} levels on the left, {1} on the right)'
                 ).format(left_levels, right_levels),
                UserWarning)

        # Must run last: the original author notes that this call has side
        # effects.
        merge_keys = self._get_merge_keys()
        self.left_join_keys, self.right_join_keys, self.join_names = merge_keys
예제 #8
0
    def __init__(self, left, right, how='inner', on=None,
                 left_on=None, right_on=None, axis=1,
                 left_index=False, right_index=False,
                 sort=True, suffixes=('_x', '_y'),
                 copy=True, indicator=False):
        """Set up a merge: store options, validate operands, find keys.

        Steps, in order:

        1. Store the operands (also as ``orig_left`` / ``orig_right``)
           and every option; ``on``, ``left_on`` and ``right_on`` are
           passed through ``com._maybe_make_list``.
        2. Derive ``indicator_name`` from ``indicator``: the string
           itself, ``'_merge'`` for ``True``, ``None`` for ``False``.
        3. Require both operands to be ``DataFrame`` instances.
        4. Issue a ``UserWarning`` if the operands' columns differ in
           number of levels.
        5. Call ``self._get_merge_keys()`` and store its three results.

        Raises
        ------
        ValueError
            If ``indicator`` is neither a string nor a bool, or if
            ``left`` or ``right`` is not a ``DataFrame``.
        """
        bad_indicator = (
            'indicator option can only accept boolean or string arguments')
        bad_operand = 'can not merge DataFrame with instance of type {0}'
        level_mismatch = (
            'merging between different levels can give an unintended '
            'result ({0} levels on the left, {1} on the right)')

        self.left = left
        self.orig_left = left
        self.right = right
        self.orig_right = right
        self.how = how
        self.axis = axis

        self.on = com._maybe_make_list(on)
        self.left_on = com._maybe_make_list(left_on)
        self.right_on = com._maybe_make_list(right_on)

        self.sort = sort
        self.suffixes = suffixes
        self.copy = copy

        self.left_index = left_index
        self.right_index = right_index

        self.indicator = indicator

        # A string names the indicator directly; a bool switches the
        # default name on or off; any other type is rejected.
        if isinstance(indicator, compat.string_types):
            self.indicator_name = indicator
        elif isinstance(indicator, bool):
            if indicator:
                self.indicator_name = '_merge'
            else:
                self.indicator_name = None
        else:
            raise ValueError(bad_indicator)

        # Both operands must be DataFrames; the left one is checked first.
        if not isinstance(left, DataFrame):
            raise ValueError(bad_operand.format(type(left)))
        if not isinstance(right, DataFrame):
            raise ValueError(bad_operand.format(type(right)))

        # Warn the user when merging between different column levels.
        n_left = left.columns.nlevels
        n_right = right.columns.nlevels
        if n_left != n_right:
            warnings.warn(level_mismatch.format(n_left, n_right), UserWarning)

        # Must run last: the original author notes that this call has side
        # effects.
        left_keys, right_keys, names = self._get_merge_keys()
        self.left_join_keys = left_keys
        self.right_join_keys = right_keys
        self.join_names = names
예제 #9
0
def crosstab(index, columns, values=None, rownames=None, colnames=None,
             aggfunc=None, margins=False, dropna=True, normalize=False):
    """
    Cross-tabulate two (or more) factors.

    With no `values`, the result is a frequency table of the factors.
    When `values` and `aggfunc` are both given, the values are aggregated
    within each combination of factors instead.

    Parameters
    ----------
    index : array-like, Series, or list of arrays/Series
        Values to group by in the rows
    columns : array-like, Series, or list of arrays/Series
        Values to group by in the columns
    values : array-like, optional
        Array of values to aggregate according to the factors.
        Must be passed together with `aggfunc`.
    rownames : sequence, default None
        If passed, must match number of row arrays passed
    colnames : sequence, default None
        If passed, must match number of column arrays passed
    aggfunc : function, optional
        Aggregation applied to `values`.  Must be passed together with
        `values`.
    margins : boolean, default False
        Add row/column margins (subtotals)
    dropna : boolean, default True
        Do not include columns whose entries are all NaN
    normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False
        Normalize by dividing all values by the sum of values.

        - If passed 'all' or `True`, will normalize over all values.
        - If passed 'index' will normalize over each row.
        - If passed 'columns' will normalize over each column.
        - If margins is `True`, will also normalize margin values.

        .. versionadded:: 0.18.1

    Returns
    -------
    crosstab : DataFrame

    Raises
    ------
    ValueError
        If only one of `values` and `aggfunc` is passed.

    Notes
    -----
    Any Series passed will have their name attributes used unless row or column
    names for the cross-tabulation are specified.

    Any input passed containing Categorical data will have **all** of its
    categories included in the cross-tabulation, even if the actual data does
    not contain any instances of a particular category.

    In the event that there aren't overlapping indexes an empty DataFrame will
    be returned.

    Examples
    --------
    >>> a
    array([foo, foo, foo, foo, bar, bar,
           bar, bar, foo, foo, foo], dtype=object)
    >>> b
    array([one, one, one, two, one, one,
           one, two, two, two, one], dtype=object)
    >>> c
    array([dull, dull, shiny, dull, dull, shiny,
           shiny, dull, shiny, shiny, shiny], dtype=object)

    >>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
    b    one          two
    c    dull  shiny  dull  shiny
    a
    bar  1     2      1     0
    foo  2     2      1     2

    >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])
    >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f'])
    >>> crosstab(foo, bar)  # 'c' and 'f' are not represented in the data,
                            # but they still will be counted in the output
    col_0  d  e  f
    row_0
    a      1  0  0
    b      0  1  0
    c      0  0  0
    """
    index = com._maybe_make_list(index)
    columns = com._maybe_make_list(columns)

    rownames = _get_names(index, rownames, prefix='row')
    colnames = _get_names(columns, colnames, prefix='col')

    # Map each factor name to its values: row factors first, then columns.
    data = dict(zip(rownames, index))
    data.update(zip(colnames, columns))

    # ``values`` and ``aggfunc`` must be passed together or not at all.
    counting = values is None
    if counting and aggfunc is not None:
        raise ValueError("aggfunc cannot be used without values.")
    if not counting and aggfunc is None:
        raise ValueError("values cannot be used without an aggfunc.")

    if counting:
        # Frequency table: a constant dummy column is added after the
        # frame is built, then counted with ``len``.
        frame = DataFrame(data)
        frame['__dummy__'] = 0
        reducer = len
    else:
        # Aggregation: the values go into the dict before the frame is
        # built.
        data['__dummy__'] = values
        frame = DataFrame(data)
        reducer = aggfunc

    table = frame.pivot_table('__dummy__',
                              index=rownames,
                              columns=colnames,
                              aggfunc=reducer,
                              margins=margins,
                              dropna=dropna)

    if counting:
        # Missing combinations count as zero, stored as int64.
        table = table.fillna(0).astype(np.int64)

    # Post-process
    if normalize is not False:
        table = _normalize(table, normalize=normalize, margins=margins)

    return table
예제 #10
0
파일: pivot.py 프로젝트: zhlijia/pandas
def crosstab(index, columns, values=None, rownames=None, colnames=None,
             aggfunc=None, margins=False, margins_name='All', dropna=True,
             normalize=False):
    """
    Cross-tabulate two (or more) factors.

    With no `values`, the result is a frequency table of the factors.
    When `values` and `aggfunc` are both given, the values are aggregated
    within each combination of factors instead.

    Parameters
    ----------
    index : array-like, Series, or list of arrays/Series
        Values to group by in the rows
    columns : array-like, Series, or list of arrays/Series
        Values to group by in the columns
    values : array-like, optional
        Array of values to aggregate according to the factors.
        Must be passed together with `aggfunc`.
    rownames : sequence, default None
        If passed, must match number of row arrays passed
    colnames : sequence, default None
        If passed, must match number of column arrays passed
    aggfunc : function, optional
        Aggregation applied to `values`.  Must be passed together with
        `values`.
    margins : boolean, default False
        Add row/column margins (subtotals)
    margins_name : string, default 'All'
        Name of the row / column that will contain the totals
        when margins is True.

        .. versionadded:: 0.21.0

    dropna : boolean, default True
        Do not include columns whose entries are all NaN
    normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False
        Normalize by dividing all values by the sum of values.

        - If passed 'all' or `True`, will normalize over all values.
        - If passed 'index' will normalize over each row.
        - If passed 'columns' will normalize over each column.
        - If margins is `True`, will also normalize margin values.

        .. versionadded:: 0.18.1

    Returns
    -------
    crosstab : DataFrame

    Raises
    ------
    ValueError
        If only one of `values` and `aggfunc` is passed.

    Notes
    -----
    Any Series passed will have their name attributes used unless row or column
    names for the cross-tabulation are specified.

    Any input passed containing Categorical data will have **all** of its
    categories included in the cross-tabulation, even if the actual data does
    not contain any instances of a particular category.

    In the event that there aren't overlapping indexes an empty DataFrame will
    be returned.

    Examples
    --------
    >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar",
    ...               "bar", "bar", "foo", "foo", "foo"], dtype=object)
    >>> b = np.array(["one", "one", "one", "two", "one", "one",
    ...               "one", "two", "two", "two", "one"], dtype=object)
    >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny",
    ...               "shiny", "dull", "shiny", "shiny", "shiny"],
    ...               dtype=object)

    >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
    ... # doctest: +NORMALIZE_WHITESPACE
    b   one        two
    c   dull shiny dull shiny
    a
    bar    1     2    1     0
    foo    2     2    1     2

    >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])
    >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f'])
    >>> crosstab(foo, bar)  # 'c' and 'f' are not represented in the data,
    ...                     # but they still will be counted in the output
    ... # doctest: +SKIP
    col_0  d  e  f
    row_0
    a      1  0  0
    b      0  1  0
    c      0  0  0
    """
    index = com._maybe_make_list(index)
    columns = com._maybe_make_list(columns)

    rownames = _get_names(index, rownames, prefix='row')
    colnames = _get_names(columns, colnames, prefix='col')

    # Combined axis of all factor objects, requested with intersect=True;
    # it becomes the index of the working frame below.
    common_idx = _get_objs_combined_axis(index + columns, intersect=True)

    # Map each factor name to its values: row factors first, then columns.
    data = dict(zip(rownames, index))
    data.update(zip(colnames, columns))

    # ``values`` and ``aggfunc`` must be passed together or not at all.
    counting = values is None
    if counting:
        if aggfunc is not None:
            raise ValueError("aggfunc cannot be used without values.")
    elif aggfunc is None:
        raise ValueError("values cannot be used without an aggfunc.")

    from pandas import DataFrame
    frame = DataFrame(data, index=common_idx)

    # Frequency table: count a constant dummy column, filling gaps with 0.
    # Otherwise: aggregate the supplied values with ``aggfunc``.
    frame['__dummy__'] = 0 if counting else values
    if counting:
        pivot_options = {'aggfunc': len, 'fill_value': 0}
    else:
        pivot_options = {'aggfunc': aggfunc}

    table = frame.pivot_table('__dummy__', index=rownames, columns=colnames,
                              margins=margins, margins_name=margins_name,
                              dropna=dropna, **pivot_options)

    # Post-process
    if normalize is not False:
        table = _normalize(table, normalize=normalize, margins=margins,
                           margins_name=margins_name)

    return table
예제 #11
0
def crosstab(rows, cols, values=None, rownames=None, colnames=None,
             aggfunc=None, margins=False):
    """
    Cross-tabulate two (or more) factors.

    With no `values`, the result is a frequency table of the factors.
    When `values` is given, it is aggregated with `aggfunc` within each
    combination of factors instead.

    Parameters
    ----------
    rows : array-like, Series, or list of arrays/Series
        Values to group by in the rows
    cols : array-like, Series, or list of arrays/Series
        Values to group by in the columns
    values : array-like, optional
        Array of values to aggregate according to the factors
    rownames : sequence, default None
        If passed, must match number of row arrays passed
    colnames : sequence, default None
        If passed, must match number of column arrays passed
    aggfunc : function, optional
        Aggregation applied to `values`.  Not used when `values` is None;
        a frequency table is computed in that case.
    margins : boolean, default False
        Add row/column margins (subtotals)

    Returns
    -------
    crosstab : DataFrame

    Notes
    -----
    Any Series passed will have their name attributes used unless row or column
    names for the cross-tabulation are specified

    Examples
    --------
    >>> a
    array([foo, foo, foo, foo, bar, bar,
           bar, bar, foo, foo, foo], dtype=object)
    >>> b
    array([one, one, one, two, one, one,
           one, two, two, two, one], dtype=object)
    >>> c
    array([dull, dull, shiny, dull, dull, shiny,
           shiny, dull, shiny, shiny, shiny], dtype=object)

    >>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
    b    one          two
    c    dull  shiny  dull  shiny
    a
    bar  1     2      1     0
    foo  2     2      1     2
    """
    rows = com._maybe_make_list(rows)
    cols = com._maybe_make_list(cols)

    rownames = _get_names(rows, rownames, prefix='row')
    colnames = _get_names(cols, colnames, prefix='col')

    # Map each factor name to its values: row factors first, then columns.
    data = dict(zip(rownames, rows))
    data.update(zip(colnames, cols))

    counting = values is None
    if counting:
        # Frequency table: a constant dummy column is added after the
        # frame is built, then counted with ``len``.
        frame = DataFrame(data)
        frame['__dummy__'] = 0
        reducer = len
    else:
        # Aggregation: the values go into the dict before the frame is
        # built.
        data['__dummy__'] = values
        frame = DataFrame(data)
        reducer = aggfunc

    table = frame.pivot_table('__dummy__', rows=rownames, cols=colnames,
                              aggfunc=reducer, margins=margins)

    if counting:
        # Missing combinations count as zero, stored as int64.
        table = table.fillna(0).astype(np.int64)
    return table
예제 #12
0
파일: pivot.py 프로젝트: cscanlin/pandas
def crosstab(index,
             columns,
             values=None,
             rownames=None,
             colnames=None,
             aggfunc=None,
             margins=False,
             dropna=True):
    """
    Cross-tabulate two (or more) factors.

    With no `values`, the result is a frequency table of the factors.
    When `values` is given, it is aggregated with `aggfunc` within each
    combination of factors instead.

    Parameters
    ----------
    index : array-like, Series, or list of arrays/Series
        Values to group by in the rows
    columns : array-like, Series, or list of arrays/Series
        Values to group by in the columns
    values : array-like, optional
        Array of values to aggregate according to the factors
    rownames : sequence, default None
        If passed, must match number of row arrays passed
    colnames : sequence, default None
        If passed, must match number of column arrays passed
    aggfunc : function, optional
        Aggregation applied to `values`.  Not used when `values` is None;
        a frequency table is computed in that case.
    margins : boolean, default False
        Add row/column margins (subtotals)
    dropna : boolean, default True
        Do not include columns whose entries are all NaN

    Returns
    -------
    crosstab : DataFrame

    Notes
    -----
    Any Series passed will have their name attributes used unless row or column
    names for the cross-tabulation are specified

    Examples
    --------
    >>> a
    array([foo, foo, foo, foo, bar, bar,
           bar, bar, foo, foo, foo], dtype=object)
    >>> b
    array([one, one, one, two, one, one,
           one, two, two, two, one], dtype=object)
    >>> c
    array([dull, dull, shiny, dull, dull, shiny,
           shiny, dull, shiny, shiny, shiny], dtype=object)

    >>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
    b    one          two
    c    dull  shiny  dull  shiny
    a
    bar  1     2      1     0
    foo  2     2      1     2
    """
    index = com._maybe_make_list(index)
    columns = com._maybe_make_list(columns)

    rownames = _get_names(index, rownames, prefix='row')
    colnames = _get_names(columns, colnames, prefix='col')

    # Map each factor name to its values: row factors first, then columns.
    data = dict(zip(rownames, index))
    data.update(zip(colnames, columns))

    # Pivot options shared by both code paths.
    shared = dict(index=rownames, columns=colnames,
                  margins=margins, dropna=dropna)

    if values is not None:
        # Aggregation: the values go into the dict before the frame is
        # built.
        data['__dummy__'] = values
        frame = DataFrame(data)
        return frame.pivot_table('__dummy__', aggfunc=aggfunc, **shared)

    # Frequency table: count a constant dummy column; missing
    # combinations count as zero, stored as int64.
    frame = DataFrame(data)
    frame['__dummy__'] = 0
    counts = frame.pivot_table('__dummy__', aggfunc=len, **shared)
    return counts.fillna(0).astype(np.int64)
예제 #13
0
파일: pivot.py 프로젝트: Alias4bb/pandas
def crosstab(index,
             columns,
             values=None,
             rownames=None,
             colnames=None,
             aggfunc=None,
             margins=False,
             dropna=True,
             normalize=False):
    """
    Cross-tabulate two (or more) factors.

    With no `values`, the result is a frequency table of the factors.
    When `values` and `aggfunc` are both given, the values are aggregated
    within each combination of factors instead.

    Parameters
    ----------
    index : array-like, Series, or list of arrays/Series
        Values to group by in the rows
    columns : array-like, Series, or list of arrays/Series
        Values to group by in the columns
    values : array-like, optional
        Array of values to aggregate according to the factors.
        Must be passed together with `aggfunc`.
    rownames : sequence, default None
        If passed, must match number of row arrays passed
    colnames : sequence, default None
        If passed, must match number of column arrays passed
    aggfunc : function, optional
        Aggregation applied to `values`.  Must be passed together with
        `values`.
    margins : boolean, default False
        Add row/column margins (subtotals)
    dropna : boolean, default True
        Do not include columns whose entries are all NaN
    normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False
        Normalize by dividing all values by the sum of values.

        - If passed 'all' or `True`, will normalize over all values.
        - If passed 'index' will normalize over each row.
        - If passed 'columns' will normalize over each column.
        - If margins is `True`, will also normalize margin values.

        .. versionadded:: 0.18.1

    Returns
    -------
    crosstab : DataFrame

    Raises
    ------
    ValueError
        If only one of `values` and `aggfunc` is passed.

    Notes
    -----
    Any Series passed will have their name attributes used unless row or column
    names for the cross-tabulation are specified.

    Any input passed containing Categorical data will have **all** of its
    categories included in the cross-tabulation, even if the actual data does
    not contain any instances of a particular category.

    In the event that there aren't overlapping indexes an empty DataFrame will
    be returned.

    Examples
    --------
    >>> a
    array([foo, foo, foo, foo, bar, bar,
           bar, bar, foo, foo, foo], dtype=object)
    >>> b
    array([one, one, one, two, one, one,
           one, two, two, two, one], dtype=object)
    >>> c
    array([dull, dull, shiny, dull, dull, shiny,
           shiny, dull, shiny, shiny, shiny], dtype=object)

    >>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
    b    one          two
    c    dull  shiny  dull  shiny
    a
    bar  1     2      1     0
    foo  2     2      1     2

    >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])
    >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f'])
    >>> crosstab(foo, bar)  # 'c' and 'f' are not represented in the data,
                            # but they still will be counted in the output
    col_0  d  e  f
    row_0
    a      1  0  0
    b      0  1  0
    c      0  0  0
    """
    index = com._maybe_make_list(index)
    columns = com._maybe_make_list(columns)

    rownames = _get_names(index, rownames, prefix='row')
    colnames = _get_names(columns, colnames, prefix='col')

    # Map each factor name to its values: row factors first, then columns.
    data = dict(zip(rownames, index))
    data.update(zip(colnames, columns))

    # ``values`` and ``aggfunc`` must be passed together or not at all.
    if values is None:
        if aggfunc is not None:
            raise ValueError("aggfunc cannot be used without values.")
    elif aggfunc is None:
        raise ValueError("values cannot be used without an aggfunc.")

    # Pivot options shared by both code paths.
    shared = dict(index=rownames, columns=colnames,
                  margins=margins, dropna=dropna)

    if values is None:
        # Frequency table: count a constant dummy column; missing
        # combinations count as zero, stored as int64.
        frame = DataFrame(data)
        frame['__dummy__'] = 0
        counts = frame.pivot_table('__dummy__', aggfunc=len, **shared)
        table = counts.fillna(0).astype(np.int64)
    else:
        # Aggregation: the values go into the dict before the frame is
        # built.
        data['__dummy__'] = values
        frame = DataFrame(data)
        table = frame.pivot_table('__dummy__', aggfunc=aggfunc, **shared)

    # Post-process
    if normalize is False:
        return table
    return _normalize(table, normalize=normalize, margins=margins)
예제 #14
0
def crosstab(index, columns, values=None, rownames=None, colnames=None,
             aggfunc=None, margins=False, margins_name='All', dropna=True,
             normalize=False):
    """
    Compute a simple cross-tabulation of two (or more) factors. By default
    computes a frequency table of the factors unless an array of values and an
    aggregation function are passed

    Parameters
    ----------
    index : array-like, Series, or list of arrays/Series
        Values to group by in the rows
    columns : array-like, Series, or list of arrays/Series
        Values to group by in the columns
    values : array-like, optional
        Array of values to aggregate according to the factors.
        Requires `aggfunc` be specified.
    aggfunc : function, optional
        If specified, requires `values` be specified as well
    rownames : sequence, default None
        If passed, must match number of row arrays passed
    colnames : sequence, default None
        If passed, must match number of column arrays passed
    margins : boolean, default False
        Add row/column margins (subtotals)
    margins_name : string, default 'All'
        Name of the row / column that will contain the totals
        when margins is True.

        .. versionadded:: 0.21.0

    dropna : boolean, default True
        Do not include columns whose entries are all NaN
    normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False
        Normalize by dividing all values by the sum of values.

        - If passed 'all' or `True`, will normalize over all values.
        - If passed 'index' will normalize over each row.
        - If passed 'columns' will normalize over each column.
        - If margins is `True`, will also normalize margin values.

        .. versionadded:: 0.18.1

    Returns
    -------
    crosstab : DataFrame

    Raises
    ------
    ValueError
        If `aggfunc` is given without `values`, or `values` is given
        without `aggfunc`.

    Notes
    -----
    Any Series passed will have their name attributes used unless row or column
    names for the cross-tabulation are specified.

    Any input passed containing Categorical data will have **all** of its
    categories included in the cross-tabulation, even if the actual data does
    not contain any instances of a particular category.

    In the event that there aren't overlapping indexes an empty DataFrame will
    be returned.

    Examples
    --------
    >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar",
    ...               "bar", "bar", "foo", "foo", "foo"], dtype=object)
    >>> b = np.array(["one", "one", "one", "two", "one", "one",
    ...               "one", "two", "two", "two", "one"], dtype=object)
    >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny",
    ...               "shiny", "dull", "shiny", "shiny", "shiny"],
    ...               dtype=object)

    >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
    ... # doctest: +NORMALIZE_WHITESPACE
    b   one        two
    c   dull shiny dull shiny
    a
    bar    1     2    1     0
    foo    2     2    1     2

    >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c'])
    >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f'])
    >>> pd.crosstab(foo, bar)  # 'c' and 'f' are not represented in the data,
    ...                        # but they still will be counted in the output
    ... # doctest: +SKIP
    col_0  d  e  f
    row_0
    a      1  0  0
    b      0  1  0
    c      0  0  0
    """

    index = com._maybe_make_list(index)
    columns = com._maybe_make_list(columns)

    rownames = _get_names(index, rownames, prefix='row')
    colnames = _get_names(columns, colnames, prefix='col')

    # Collect the ``index`` attribute of every labelled input so the frame
    # can be built on their combined (intersect=True) index.
    # Plain lists, tuples and strings also expose an ``index`` attribute --
    # the bound ``.index()`` search method -- so a bare ``hasattr`` check
    # would mistake them for labelled data. A callable ``index`` is skipped.
    obs_idxes = []
    for objs in (index, columns):
        for obj in objs:
            if hasattr(obj, 'index') and not callable(obj.index):
                obs_idxes.append(obj.index)

    if obs_idxes:
        common_idx = _get_combined_index(obs_idxes, intersect=True)
    else:
        # No labelled inputs: let DataFrame build its default index.
        common_idx = None

    data = {}
    data.update(zip(rownames, index))
    data.update(zip(colnames, columns))

    if values is None and aggfunc is not None:
        raise ValueError("aggfunc cannot be used without values.")

    if values is not None and aggfunc is None:
        raise ValueError("values cannot be used without an aggfunc.")

    df = DataFrame(data, index=common_idx)
    if values is None:
        # Frequency table: count the rows of a constant placeholder column
        # and report empty cells as 0.
        df['__dummy__'] = 0
        kwargs = {'aggfunc': len, 'fill_value': 0}
    else:
        # Aggregate the caller's values with the caller's function.
        df['__dummy__'] = values
        kwargs = {'aggfunc': aggfunc}

    table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
                           margins=margins, margins_name=margins_name,
                           dropna=dropna, **kwargs)

    # Post-process
    if normalize is not False:
        table = _normalize(table, normalize=normalize, margins=margins,
                           margins_name=margins_name)

    return table
예제 #15
0
파일: pivot.py 프로젝트: IcyLeez/pandas
def crosstab(index, columns, values=None, rownames=None, colnames=None,
             aggfunc=None, margins=False, dropna=True, **kwarg):
    """
    Compute a simple cross-tabulation of two (or more) factors. By default
    computes a frequency table of the factors unless an array of values and an
    aggregation function are passed

    Parameters
    ----------
    index : array-like, Series, or list of arrays/Series
        Values to group by in the rows
    columns : array-like, Series, or list of arrays/Series
        Values to group by in the columns
    values : array-like, optional
        Array of values to aggregate according to the factors
    aggfunc : function, optional
        Aggregation applied to `values`. Not used when `values` is None,
        in which case a frequency table is computed
    rownames : sequence, default None
        If passed, must match number of row arrays passed
    colnames : sequence, default None
        If passed, must match number of column arrays passed
    margins : boolean, default False
        Add row/column margins (subtotals)
    dropna : boolean, default True
        Do not include columns whose entries are all NaN
    rows : kwarg only alias of index [deprecated]
    cols : kwarg only alias of columns [deprecated]

    Returns
    -------
    crosstab : DataFrame

    Raises
    ------
    TypeError
        If both `rows` and `index`, or both `cols` and `columns`, are
        given, or if any other unexpected keyword argument is passed.

    Notes
    -----
    Any Series passed will have their name attributes used unless row or column
    names for the cross-tabulation are specified

    Examples
    --------
    >>> a
    array([foo, foo, foo, foo, bar, bar,
           bar, bar, foo, foo, foo], dtype=object)
    >>> b
    array([one, one, one, two, one, one,
           one, two, two, two, one], dtype=object)
    >>> c
    array([dull, dull, shiny, dull, dull, shiny,
           shiny, dull, shiny, shiny, shiny], dtype=object)

    >>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
    b    one          two
    c    dull  shiny  dull  shiny
    a
    bar  1     2      1     0
    foo  2     2      1     2
    """
    # Parse old-style keyword arguments. stacklevel=2 attributes the
    # deprecation warning to the caller's line rather than to this function.
    rows = kwarg.pop('rows', None)
    if rows is not None:
        warnings.warn("rows is deprecated, use index", FutureWarning,
                      stacklevel=2)
        if index is not None:
            raise TypeError("Can only specify either 'rows' or 'index'")
        index = rows

    cols = kwarg.pop('cols', None)
    if cols is not None:
        warnings.warn("cols is deprecated, use columns", FutureWarning,
                      stacklevel=2)
        if columns is not None:
            raise TypeError("Can only specify either 'cols' or 'columns'")
        columns = cols

    if kwarg:
        # Format a sorted list of names: formatting ``kwarg.keys()`` directly
        # renders as ``dict_keys([...])`` on Python 3.
        raise TypeError("Unexpected argument(s): %s" % (sorted(kwarg),))

    index = com._maybe_make_list(index)
    columns = com._maybe_make_list(columns)

    rownames = _get_names(index, rownames, prefix='row')
    colnames = _get_names(columns, colnames, prefix='col')

    data = {}
    data.update(zip(rownames, index))
    data.update(zip(colnames, columns))

    if values is None:
        # Frequency table: count the rows of a constant placeholder column,
        # then turn empty cells into integer zeros.
        df = DataFrame(data)
        df['__dummy__'] = 0
        table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
                               aggfunc=len, margins=margins, dropna=dropna)
        return table.fillna(0).astype(np.int64)

    # Aggregate the caller's values with the caller's function.
    data['__dummy__'] = values
    df = DataFrame(data)
    return df.pivot_table('__dummy__', index=rownames, columns=colnames,
                          aggfunc=aggfunc, margins=margins, dropna=dropna)