def __init__(self, left, right, how='inner', on=None, left_on=None, right_on=None, axis=1, left_index=False, right_index=False, sort=True, suffixes=('_x', '_y'), copy=True): """Construct a join object providing the functions on top of the provided RDD. """ self.left = left self.right = right self.how = self._map_how(how) self.axis = axis self.on = com._maybe_make_list(on) self.left_on = com._maybe_make_list(left_on) self.right_on = com._maybe_make_list(right_on) self.copy = copy self.suffixes = suffixes self.sort = sort self.left_index = left_index self.right_index = right_index
def __init__(self, left, right, how='inner', on=None, left_on=None, right_on=None, axis=1, left_index=False, right_index=False, sort=True, suffixes=('_x', '_y'), copy=True): self.left = self.orig_left = left self.right = self.orig_right = right self.how = how self.axis = axis self.on = com._maybe_make_list(on) self.left_on = com._maybe_make_list(left_on) self.right_on = com._maybe_make_list(right_on) self.copy = copy self.suffixes = suffixes self.sort = sort self.left_index = left_index self.right_index = right_index # note this function has side effects (self.left_join_keys, self.right_join_keys, self.join_names) = self._get_merge_keys()
def __init__(self, left, right, how='inner', on=None, left_on=None, right_on=None, axis=1, left_index=False, right_index=False, sort=True, suffixes=('.x', '.y'), copy=True): self.left = self.orig_left = left self.right = self.orig_right = right self.how = how self.axis = axis self.on = com._maybe_make_list(on) self.left_on = com._maybe_make_list(left_on) self.right_on = com._maybe_make_list(right_on) self.drop_keys = False # set this later...kludge self.copy = copy self.suffixes = suffixes self.sort = sort self.left_index = left_index self.right_index = right_index # note this function has side effects (self.left_join_keys, self.right_join_keys, self.join_names) = self._get_merge_keys()
def __init__(self, left, right, how='inner', on=None, left_on=None, right_on=None, axis=1, left_index=False, right_index=False, sort=True, suffixes=('_x', '_y'), copy=True, indicator=False): self.left = self.orig_left = left self.right = self.orig_right = right self.how = how self.axis = axis self.on = com._maybe_make_list(on) self.left_on = com._maybe_make_list(left_on) self.right_on = com._maybe_make_list(right_on) self.copy = copy self.suffixes = suffixes self.sort = sort self.left_index = left_index self.right_index = right_index self.indicator = indicator if isinstance(self.indicator, compat.string_types): self.indicator_name = self.indicator elif isinstance(self.indicator, bool): self.indicator_name = '_merge' if self.indicator else None else: raise ValueError('indicator option can only accept boolean or string arguments') # note this function has side effects (self.left_join_keys, self.right_join_keys, self.join_names) = self._get_merge_keys()
def __init__( self, left, right, how="inner", on=None, left_on=None, right_on=None, axis=1, left_index=False, right_index=False, sort=True, suffixes=("_x", "_y"), copy=True, ): self.left = self.orig_left = left self.right = self.orig_right = right self.how = how self.axis = axis self.on = com._maybe_make_list(on) self.left_on = com._maybe_make_list(left_on) self.right_on = com._maybe_make_list(right_on) self.copy = copy self.suffixes = suffixes self.sort = sort self.left_index = left_index self.right_index = right_index # note this function has side effects (self.left_join_keys, self.right_join_keys, self.join_names) = self._get_merge_keys()
def __init__(self, left, right, how='inner', on=None, left_on=None, right_on=None, axis=1, left_index=False, right_index=False, sort=True, suffixes=('_x', '_y'), copy=True, indicator=False): self.left = self.orig_left = left self.right = self.orig_right = right self.how = how self.axis = axis self.on = com._maybe_make_list(on) self.left_on = com._maybe_make_list(left_on) self.right_on = com._maybe_make_list(right_on) self.copy = copy self.suffixes = suffixes self.sort = sort self.left_index = left_index self.right_index = right_index self.indicator = indicator if isinstance(self.indicator, compat.string_types): self.indicator_name = self.indicator elif isinstance(self.indicator, bool): self.indicator_name = '_merge' if self.indicator else None else: raise ValueError( 'indicator option can only accept boolean or string arguments') if not isinstance(left, DataFrame): raise ValueError( 'can not merge DataFrame with instance of ' 'type {0}'.format(type(left))) if not isinstance(right, DataFrame): raise ValueError( 'can not merge DataFrame with instance of ' 'type {0}'.format(type(right))) # warn user when merging between different levels if left.columns.nlevels != right.columns.nlevels: msg = ('merging between different levels can give an unintended ' 'result ({0} levels on the left, {1} on the right)') msg = msg.format(left.columns.nlevels, right.columns.nlevels) warnings.warn(msg, UserWarning) # note this function has side effects (self.left_join_keys, self.right_join_keys, self.join_names) = self._get_merge_keys()
def crosstab(index, columns, values=None, rownames=None, colnames=None, aggfunc=None, margins=False, dropna=True, normalize=False): """ Compute a simple cross-tabulation of two (or more) factors. By default computes a frequency table of the factors unless an array of values and an aggregation function are passed Parameters ---------- index : array-like, Series, or list of arrays/Series Values to group by in the rows columns : array-like, Series, or list of arrays/Series Values to group by in the columns values : array-like, optional Array of values to aggregate according to the factors. Requires `aggfunc` be specified. aggfunc : function, optional If specified, requires `values` be specified as well rownames : sequence, default None If passed, must match number of row arrays passed colnames : sequence, default None If passed, must match number of column arrays passed margins : boolean, default False Add row/column margins (subtotals) dropna : boolean, default True Do not include columns whose entries are all NaN normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False Normalize by dividing all values by the sum of values. - If passed 'all' or `True`, will normalize over all values. - If passed 'index' will normalize over each row. - If passed 'columns' will normalize over each column. - If margins is `True`, will also normalize margin values. .. versionadded:: 0.18.1 Notes ----- Any Series passed will have their name attributes used unless row or column names for the cross-tabulation are specified. Any input passed containing Categorical data will have **all** of its categories included in the cross-tabulation, even if the actual data does not contain any instances of a particular category. In the event that there aren't overlapping indexes an empty DataFrame will be returned. Examples -------- >>> a array([foo, foo, foo, foo, bar, bar, bar, bar, foo, foo, foo], dtype=object) >>> b array([one, one, one, two, one, one, one, two, two, two, one], dtype=object) >>> c array([dull, dull, shiny, dull, dull, shiny, shiny, dull, shiny, shiny, shiny], dtype=object) >>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) b one two c dull shiny dull shiny a bar 1 2 1 0 foo 2 2 1 2 >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f']) >>> crosstab(foo, bar) # 'c' and 'f' are not represented in the data, # but they still will be counted in the output col_0 d e f row_0 a 1 0 0 b 0 1 0 c 0 0 0 Returns ------- crosstab : DataFrame """ index = com._maybe_make_list(index) columns = com._maybe_make_list(columns) rownames = _get_names(index, rownames, prefix='row') colnames = _get_names(columns, colnames, prefix='col') data = {} data.update(zip(rownames, index)) data.update(zip(colnames, columns)) if values is None and aggfunc is not None: raise ValueError("aggfunc cannot be used without values.") if values is not None and aggfunc is None: raise ValueError("values cannot be used without an aggfunc.") if values is None: df = DataFrame(data) df['__dummy__'] = 0 table = df.pivot_table('__dummy__', index=rownames, columns=colnames, aggfunc=len, margins=margins, dropna=dropna) table = table.fillna(0).astype(np.int64) else: data['__dummy__'] = values df = DataFrame(data) table = df.pivot_table('__dummy__', index=rownames, columns=colnames, aggfunc=aggfunc, margins=margins, dropna=dropna) # Post-process if normalize is not False: table = _normalize(table, normalize=normalize, margins=margins) return table
def crosstab(index, columns, values=None, rownames=None, colnames=None, aggfunc=None, margins=False, margins_name='All', dropna=True, normalize=False): """ Compute a simple cross-tabulation of two (or more) factors. By default computes a frequency table of the factors unless an array of values and an aggregation function are passed Parameters ---------- index : array-like, Series, or list of arrays/Series Values to group by in the rows columns : array-like, Series, or list of arrays/Series Values to group by in the columns values : array-like, optional Array of values to aggregate according to the factors. Requires `aggfunc` be specified. aggfunc : function, optional If specified, requires `values` be specified as well rownames : sequence, default None If passed, must match number of row arrays passed colnames : sequence, default None If passed, must match number of column arrays passed margins : boolean, default False Add row/column margins (subtotals) margins_name : string, default 'All' Name of the row / column that will contain the totals when margins is True. .. versionadded:: 0.21.0 dropna : boolean, default True Do not include columns whose entries are all NaN normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False Normalize by dividing all values by the sum of values. - If passed 'all' or `True`, will normalize over all values. - If passed 'index' will normalize over each row. - If passed 'columns' will normalize over each column. - If margins is `True`, will also normalize margin values. .. versionadded:: 0.18.1 Notes ----- Any Series passed will have their name attributes used unless row or column names for the cross-tabulation are specified. Any input passed containing Categorical data will have **all** of its categories included in the cross-tabulation, even if the actual data does not contain any instances of a particular category. In the event that there aren't overlapping indexes an empty DataFrame will be returned. Examples -------- >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar", ... "bar", "bar", "foo", "foo", "foo"], dtype=object) >>> b = np.array(["one", "one", "one", "two", "one", "one", ... "one", "two", "two", "two", "one"], dtype=object) >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny", ... "shiny", "dull", "shiny", "shiny", "shiny"], ... dtype=object) >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) ... # doctest: +NORMALIZE_WHITESPACE b one two c dull shiny dull shiny a bar 1 2 1 0 foo 2 2 1 2 >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f']) >>> crosstab(foo, bar) # 'c' and 'f' are not represented in the data, ... # but they still will be counted in the output ... # doctest: +SKIP col_0 d e f row_0 a 1 0 0 b 0 1 0 c 0 0 0 Returns ------- crosstab : DataFrame """ index = com._maybe_make_list(index) columns = com._maybe_make_list(columns) rownames = _get_names(index, rownames, prefix='row') colnames = _get_names(columns, colnames, prefix='col') common_idx = _get_objs_combined_axis(index + columns, intersect=True) data = {} data.update(zip(rownames, index)) data.update(zip(colnames, columns)) if values is None and aggfunc is not None: raise ValueError("aggfunc cannot be used without values.") if values is not None and aggfunc is None: raise ValueError("values cannot be used without an aggfunc.") from pandas import DataFrame df = DataFrame(data, index=common_idx) if values is None: df['__dummy__'] = 0 kwargs = {'aggfunc': len, 'fill_value': 0} else: df['__dummy__'] = values kwargs = {'aggfunc': aggfunc} table = df.pivot_table('__dummy__', index=rownames, columns=colnames, margins=margins, margins_name=margins_name, dropna=dropna, **kwargs) # Post-process if normalize is not False: table = _normalize(table, normalize=normalize, margins=margins, margins_name=margins_name) return table
def crosstab(rows, cols, values=None, rownames=None, colnames=None, aggfunc=None, margins=False): """ Compute a simple cross-tabulation of two (or more) factors. By default computes a frequency table of the factors unless an array of values and an aggregation function are passed Parameters ---------- rows : array-like, Series, or list of arrays/Series Values to group by in the rows cols : array-like, Series, or list of arrays/Series Values to group by in the columns values : array-like, optional Array of values to aggregate according to the factors aggfunc : function, optional If no values array is passed, computes a frequency table rownames : sequence, default None If passed, must match number of row arrays passed colnames : sequence, default None If passed, must match number of column arrays passed margins : boolean, default False Add row/column margins (subtotals) Notes ----- Any Series passed will have their name attributes used unless row or column names for the cross-tabulation are specified Examples -------- >>> a array([foo, foo, foo, foo, bar, bar, bar, bar, foo, foo, foo], dtype=object) >>> b array([one, one, one, two, one, one, one, two, two, two, one], dtype=object) >>> c array([dull, dull, shiny, dull, dull, shiny, shiny, dull, shiny, shiny, shiny], dtype=object) >>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) b one two c dull shiny dull shiny a bar 1 2 1 0 foo 2 2 1 2 Returns ------- crosstab : DataFrame """ rows = com._maybe_make_list(rows) cols = com._maybe_make_list(cols) rownames = _get_names(rows, rownames, prefix='row') colnames = _get_names(cols, colnames, prefix='col') data = {} data.update(list(zip(rownames, rows))) data.update(list(zip(colnames, cols))) if values is None: df = DataFrame(data) df['__dummy__'] = 0 table = df.pivot_table('__dummy__', rows=rownames, cols=colnames, aggfunc=len, margins=margins) return table.fillna(0).astype(np.int64) else: data['__dummy__'] = values df = DataFrame(data) table = df.pivot_table('__dummy__', rows=rownames, cols=colnames, aggfunc=aggfunc, margins=margins) return table
def crosstab(index, columns, values=None, rownames=None, colnames=None, aggfunc=None, margins=False, dropna=True): """ Compute a simple cross-tabulation of two (or more) factors. By default computes a frequency table of the factors unless an array of values and an aggregation function are passed Parameters ---------- index : array-like, Series, or list of arrays/Series Values to group by in the rows columns : array-like, Series, or list of arrays/Series Values to group by in the columns values : array-like, optional Array of values to aggregate according to the factors aggfunc : function, optional If no values array is passed, computes a frequency table rownames : sequence, default None If passed, must match number of row arrays passed colnames : sequence, default None If passed, must match number of column arrays passed margins : boolean, default False Add row/column margins (subtotals) dropna : boolean, default True Do not include columns whose entries are all NaN Notes ----- Any Series passed will have their name attributes used unless row or column names for the cross-tabulation are specified Examples -------- >>> a array([foo, foo, foo, foo, bar, bar, bar, bar, foo, foo, foo], dtype=object) >>> b array([one, one, one, two, one, one, one, two, two, two, one], dtype=object) >>> c array([dull, dull, shiny, dull, dull, shiny, shiny, dull, shiny, shiny, shiny], dtype=object) >>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) b one two c dull shiny dull shiny a bar 1 2 1 0 foo 2 2 1 2 Returns ------- crosstab : DataFrame """ index = com._maybe_make_list(index) columns = com._maybe_make_list(columns) rownames = _get_names(index, rownames, prefix='row') colnames = _get_names(columns, colnames, prefix='col') data = {} data.update(zip(rownames, index)) data.update(zip(colnames, columns)) if values is None: df = DataFrame(data) df['__dummy__'] = 0 table = df.pivot_table('__dummy__', index=rownames, columns=colnames, aggfunc=len, margins=margins, dropna=dropna) return table.fillna(0).astype(np.int64) else: data['__dummy__'] = values df = DataFrame(data) table = df.pivot_table('__dummy__', index=rownames, columns=colnames, aggfunc=aggfunc, margins=margins, dropna=dropna) return table
def crosstab(index, columns, values=None, rownames=None, colnames=None, aggfunc=None, margins=False, margins_name='All', dropna=True, normalize=False): """ Compute a simple cross-tabulation of two (or more) factors. By default computes a frequency table of the factors unless an array of values and an aggregation function are passed Parameters ---------- index : array-like, Series, or list of arrays/Series Values to group by in the rows columns : array-like, Series, or list of arrays/Series Values to group by in the columns values : array-like, optional Array of values to aggregate according to the factors. Requires `aggfunc` be specified. aggfunc : function, optional If specified, requires `values` be specified as well rownames : sequence, default None If passed, must match number of row arrays passed colnames : sequence, default None If passed, must match number of column arrays passed margins : boolean, default False Add row/column margins (subtotals) margins_name : string, default 'All' Name of the row / column that will contain the totals when margins is True. .. versionadded:: 0.21.0 dropna : boolean, default True Do not include columns whose entries are all NaN normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False Normalize by dividing all values by the sum of values. - If passed 'all' or `True`, will normalize over all values. - If passed 'index' will normalize over each row. - If passed 'columns' will normalize over each column. - If margins is `True`, will also normalize margin values. .. versionadded:: 0.18.1 Notes ----- Any Series passed will have their name attributes used unless row or column names for the cross-tabulation are specified. Any input passed containing Categorical data will have **all** of its categories included in the cross-tabulation, even if the actual data does not contain any instances of a particular category. In the event that there aren't overlapping indexes an empty DataFrame will be returned. Examples -------- >>> a = np.array(["foo", "foo", "foo", "foo", "bar", "bar", ... "bar", "bar", "foo", "foo", "foo"], dtype=object) >>> b = np.array(["one", "one", "one", "two", "one", "one", ... "one", "two", "two", "two", "one"], dtype=object) >>> c = np.array(["dull", "dull", "shiny", "dull", "dull", "shiny", ... "shiny", "dull", "shiny", "shiny", "shiny"], ... dtype=object) >>> pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) ... # doctest: +NORMALIZE_WHITESPACE b one two c dull shiny dull shiny a bar 1 2 1 0 foo 2 2 1 2 >>> foo = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) >>> bar = pd.Categorical(['d', 'e'], categories=['d', 'e', 'f']) >>> crosstab(foo, bar) # 'c' and 'f' are not represented in the data, ... # but they still will be counted in the output ... # doctest: +SKIP col_0 d e f row_0 a 1 0 0 b 0 1 0 c 0 0 0 Returns ------- crosstab : DataFrame """ index = com._maybe_make_list(index) columns = com._maybe_make_list(columns) rownames = _get_names(index, rownames, prefix='row') colnames = _get_names(columns, colnames, prefix='col') obs_idxes = [obj.index for objs in (index, columns) for obj in objs if hasattr(obj, 'index')] if obs_idxes: common_idx = _get_combined_index(obs_idxes, intersect=True) else: common_idx = None data = {} data.update(zip(rownames, index)) data.update(zip(colnames, columns)) if values is None and aggfunc is not None: raise ValueError("aggfunc cannot be used without values.") if values is not None and aggfunc is None: raise ValueError("values cannot be used without an aggfunc.") df = DataFrame(data, index=common_idx) if values is None: df['__dummy__'] = 0 kwargs = {'aggfunc': len, 'fill_value': 0} else: df['__dummy__'] = values kwargs = {'aggfunc': aggfunc} table = df.pivot_table('__dummy__', index=rownames, columns=colnames, margins=margins, margins_name=margins_name, dropna=dropna, **kwargs) # Post-process if normalize is not False: table = _normalize(table, normalize=normalize, margins=margins, margins_name=margins_name) return table
def crosstab(index, columns, values=None, rownames=None, colnames=None, aggfunc=None, margins=False, dropna=True, **kwarg): """ Compute a simple cross-tabulation of two (or more) factors. By default computes a frequency table of the factors unless an array of values and an aggregation function are passed Parameters ---------- index : array-like, Series, or list of arrays/Series Values to group by in the rows columns : array-like, Series, or list of arrays/Series Values to group by in the columns values : array-like, optional Array of values to aggregate according to the factors aggfunc : function, optional If no values array is passed, computes a frequency table rownames : sequence, default None If passed, must match number of row arrays passed colnames : sequence, default None If passed, must match number of column arrays passed margins : boolean, default False Add row/column margins (subtotals) dropna : boolean, default True Do not include columns whose entries are all NaN rows : kwarg only alias of index [deprecated] cols : kwarg only alias of columns [deprecated] Notes ----- Any Series passed will have their name attributes used unless row or column names for the cross-tabulation are specified Examples -------- >>> a array([foo, foo, foo, foo, bar, bar, bar, bar, foo, foo, foo], dtype=object) >>> b array([one, one, one, two, one, one, one, two, two, two, one], dtype=object) >>> c array([dull, dull, shiny, dull, dull, shiny, shiny, dull, shiny, shiny, shiny], dtype=object) >>> crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) b one two c dull shiny dull shiny a bar 1 2 1 0 foo 2 2 1 2 Returns ------- crosstab : DataFrame """ # Parse old-style keyword arguments rows = kwarg.pop('rows', None) if rows is not None: warnings.warn("rows is deprecated, use index", FutureWarning) if index is None: index = rows else: msg = "Can only specify either 'rows' or 'index'" raise TypeError(msg) cols = kwarg.pop('cols', None) if cols is not None: warnings.warn("cols is deprecated, use columns", FutureWarning) if columns is None: columns = cols else: msg = "Can only specify either 'cols' or 'columns'" raise TypeError(msg) if kwarg: raise TypeError("Unexpected argument(s): %s" % kwarg.keys()) index = com._maybe_make_list(index) columns = com._maybe_make_list(columns) rownames = _get_names(index, rownames, prefix='row') colnames = _get_names(columns, colnames, prefix='col') data = {} data.update(zip(rownames, index)) data.update(zip(colnames, columns)) if values is None: df = DataFrame(data) df['__dummy__'] = 0 table = df.pivot_table('__dummy__', index=rownames, columns=colnames, aggfunc=len, margins=margins, dropna=dropna) return table.fillna(0).astype(np.int64) else: data['__dummy__'] = values df = DataFrame(data) table = df.pivot_table('__dummy__', index=rownames, columns=colnames, aggfunc=aggfunc, margins=margins, dropna=dropna) return table