Example #1
 def _wrap_aggregated_output(self, output, mask):
     # sort of a kludge
     output = output[self.name]
     index = self._get_multi_index(mask)
     return Series(output, index=index)
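
This helper is internal, but the behaviour it supports is visible from the public API. A minimal sketch (current pandas): a grouped aggregation comes back as a Series indexed by the group keys, which is the structure the helper above assembles by hand.

import pandas as pd

df = pd.DataFrame({"g": ["a", "a", "b"], "v": [1, 2, 3]})

# The aggregated output is a Series whose index is built from the groups.
print(df.groupby("g")["v"].sum())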
Example #2
def _generate_marginal_results(table,
                               data,
                               values,
                               rows,
                               cols,
                               aggfunc,
                               observed,
                               grand_margin,
                               margins_name="All"):
    if len(cols) > 0:
        # need to "interleave" the margins
        table_pieces = []
        margin_keys = []

        def _all_key(key):
            return (key, margins_name) + ("", ) * (len(cols) - 1)

        if len(rows) > 0:
            margin = data[rows + values].groupby(
                rows, observed=observed).agg(aggfunc)
            cat_axis = 1

            for key, piece in table.groupby(level=0,
                                            axis=cat_axis,
                                            observed=observed):
                all_key = _all_key(key)

                # we are going to mutate this, so need to copy!
                piece = piece.copy()
                try:
                    piece[all_key] = margin[key]
                except TypeError:

                    # we cannot reshape, so coerce the axis
                    piece.set_axis(
                        piece._get_axis(cat_axis)._to_safe_for_reshape(),
                        axis=cat_axis,
                        inplace=True,
                    )
                    piece[all_key] = margin[key]

                table_pieces.append(piece)
                margin_keys.append(all_key)
        else:
            margin = grand_margin
            cat_axis = 0
            for key, piece in table.groupby(level=0,
                                            axis=cat_axis,
                                            observed=observed):
                all_key = _all_key(key)
                table_pieces.append(piece)
                table_pieces.append(Series(margin[key], index=[all_key]))
                margin_keys.append(all_key)

        result = concat(table_pieces, axis=cat_axis)

        if len(rows) == 0:
            return result
    else:
        result = table
        margin_keys = table.columns

    if len(cols) > 0:
        row_margin = data[cols + values].groupby(
            cols, observed=observed).agg(aggfunc)
        row_margin = row_margin.stack()

        # slight hack
        new_order = [len(cols)] + list(range(len(cols)))
        row_margin.index = row_margin.index.reorder_levels(new_order)
    else:
        row_margin = Series(np.nan, index=result.columns)

    return result, margin_keys, row_margin
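
The function above is internal to pandas; the public call that exercises it is pivot_table with margins=True. A minimal sketch, assuming current pandas:

import pandas as pd

df = pd.DataFrame({
    "row": ["a", "a", "b", "b"],
    "col": ["x", "y", "x", "y"],
    "val": [1.0, 2.0, 3.0, 4.0],
})

# margins=True triggers the margin "interleaving" implemented above;
# "All" is the default margins_name.
table = pd.pivot_table(df, values="val", index="row", columns="col",
                       aggfunc="sum", margins=True)
print(table)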
Example #3
    def _get_empty_meta(self,
                        columns,
                        index_col,
                        index_names,
                        dtype: DtypeArg | None = None):
        columns = list(columns)

        # Convert `dtype` to a defaultdict of some kind.
        # This will enable us to write `dtype[col_name]`
        # without worrying about KeyError issues later on.
        if not is_dict_like(dtype):
            # if dtype == None, default will be object.
            default_dtype = dtype or object
            # error: Argument 1 to "defaultdict" has incompatible type "Callable[[],
            # Union[ExtensionDtype, str, dtype[Any], Type[object], Dict[Hashable,
            # Union[ExtensionDtype, Union[str, dtype[Any]], Type[str], Type[float],
            # Type[int], Type[complex], Type[bool], Type[object]]]]]"; expected
            # "Optional[Callable[[], Union[ExtensionDtype, str, dtype[Any],
            # Type[object]]]]"
            # error: Incompatible return value type (got "Union[ExtensionDtype, str,
            # dtype[Any], Type[object], Dict[Hashable, Union[ExtensionDtype, Union[str,
            # dtype[Any]], Type[str], Type[float], Type[int], Type[complex], Type[bool],
            # Type[object]]]]", expected "Union[ExtensionDtype, str, dtype[Any],
            # Type[object]]")
            dtype = defaultdict(
                lambda: default_dtype  # type: ignore[arg-type, return-value]
            )
        else:
            dtype = cast(dict, dtype)
            dtype = defaultdict(
                lambda: object,
                {
                    columns[k] if is_integer(k) else k: v
                    for k, v in dtype.items()
                },
            )

        # Even though we have no data, the "index" of the empty DataFrame
        # could for example still be an empty MultiIndex. Thus, we need to
        # check whether we have any index columns specified, via either:
        #
        # 1) index_col (column indices)
        # 2) index_names (column names)
        #
        # Both must be non-null to ensure a successful construction. Otherwise,
        # we have to create a generic empty Index.
        if (index_col is None or index_col is False) or index_names is None:
            index = Index([])
        else:
            data = [Series([], dtype=dtype[name]) for name in index_names]
            index = ensure_index_from_sequences(data, names=index_names)
            index_col.sort()

            for i, n in enumerate(index_col):
                columns.pop(n - i)

        col_dict = {
            col_name: Series([], dtype=dtype[col_name])
            for col_name in columns
        }

        return index, columns, col_dict
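
The defaultdict conversion above can be illustrated in isolation. A standalone sketch of the trick: wrapping the user-supplied mapping in a defaultdict means dtype[col_name] never raises KeyError.

from collections import defaultdict

requested = {"a": "int64"}  # dtype requested for a subset of columns
dtype = defaultdict(lambda: object, requested)

print(dtype["a"])  # 'int64' -- explicitly requested
print(dtype["b"])  # <class 'object'> -- the fallback default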
Example #4
    def __new__(cls,
                data,
                index=None,
                sparse_index=None,
                kind='block',
                fill_value=None,
                name=None,
                copy=False):

        is_sparse_array = isinstance(data, SparseArray)
        if fill_value is None:
            if is_sparse_array:
                fill_value = data.fill_value
            else:
                fill_value = nan

        if is_sparse_array:
            if isinstance(data, SparseSeries) and index is None:
                index = data.index
            elif index is not None:
                assert (len(index) == len(data))

            sparse_index = data.sp_index
            values = np.asarray(data)
        elif isinstance(data, (Series, dict)):
            if index is None:
                index = data.index

            data = Series(data)
            values, sparse_index = make_sparse(data,
                                               kind=kind,
                                               fill_value=fill_value)
        elif np.isscalar(data):  # pragma: no cover
            if index is None:
                raise Exception('must pass index!')

            values = np.empty(len(index))
            values.fill(data)

            # TODO: more efficient

            values, sparse_index = make_sparse(values,
                                               kind=kind,
                                               fill_value=fill_value)

        else:
            # array-like
            if sparse_index is None:
                values, sparse_index = make_sparse(data,
                                                   kind=kind,
                                                   fill_value=fill_value)
            else:
                values = data
                assert (len(values) == sparse_index.npoints)

        if index is None:
            index = Index(np.arange(sparse_index.length))
        index = _ensure_index(index)

        # Create array, do *not* copy data by default
        if copy:
            subarr = np.array(values, dtype=np.float64, copy=True)
        else:
            subarr = np.asarray(values, dtype=np.float64)

        if index.is_all_dates():
            cls = SparseTimeSeries

        # Change the class of the array to be the subclass type.
        output = subarr.view(cls)
        output._sp_values = subarr
        output.sp_index = sparse_index
        output.fill_value = np.float64(fill_value)
        output.index = index
        output.name = name
        return output
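
Note that SparseSeries was deprecated in pandas 0.25 and removed in 1.0. A sketch of the modern equivalent of what this constructor builds, a Series backed by a SparseArray:

import numpy as np
import pandas as pd

arr = pd.arrays.SparseArray([1.0, np.nan, np.nan, 2.0])
s = pd.Series(arr)

# Fraction of stored (non-fill) values; here 2 of 4 entries are stored.
print(s.sparse.density)  # 0.5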
Example #5
    def _initDict(self, data, index, columns, objects, dtype):
        """
        Segregate Series based on type and coerce into matrices.

        Needs to handle a lot of exceptional cases.

        Somehow this got outrageously complicated
        """
        # pre-filter out columns if we passed it
        if columns is not None:
            colset = set(columns)
            data = dict((k, v) for k, v in data.iteritems() if k in colset)

        index = _extract_index(data, index)

        objectDict = {}
        if objects is not None and isinstance(objects, dict):
            objectDict.update(objects)

        valueDict = {}
        for k, v in data.iteritems():
            if isinstance(v, Series):
                if v.index is not index:
                    # Forces alignment. No need to copy data since we
                    # are putting it into an ndarray later
                    v = v.reindex(index)
            else:
                if isinstance(v, dict):
                    v = [v.get(i, NaN) for i in index]
                else:
                    assert (len(v) == len(index))

                try:
                    v = Series(v, dtype=dtype, index=index)
                except Exception:
                    v = Series(v, index=index)

            if issubclass(v.dtype.type, (np.bool_, float, int)):
                valueDict[k] = v
            else:
                objectDict[k] = v

        if columns is None:
            columns = Index(_try_sort(valueDict))
            objectColumns = Index(_try_sort(objectDict))
        else:
            objectColumns = Index([c for c in columns if c in objectDict])
            columns = Index([c for c in columns if c not in objectDict])

        if len(valueDict) == 0:
            dtype = np.object_
            valueDict = objectDict
            columns = objectColumns
        else:
            dtypes = set(v.dtype for v in valueDict.values())

            if len(dtypes) > 1:
                dtype = np.float_
            else:
                dtype = list(dtypes)[0]

            if len(objectDict) > 0:
                new_objects = DataMatrix(objectDict,
                                         dtype=np.object_,
                                         index=index,
                                         columns=objectColumns)
                if isinstance(objects, DataMatrix):
                    objects = objects.join(new_objects, how='left')
                else:
                    objects = new_objects

        values = np.empty((len(index), len(columns)), dtype=dtype)

        for i, col in enumerate(columns):
            if col in valueDict:
                values[:, i] = valueDict[col]
            else:
                values[:, i] = np.NaN

        return index, columns, values, objects
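
The modern DataFrame constructor performs the same segregation this old DataMatrix code does by hand, grouping numeric columns together and holding everything else at object dtype. A quick sketch:

import pandas as pd

df = pd.DataFrame({"x": [1.0, 2.0], "y": [3, 4], "z": ["a", "b"]})
print(df.dtypes)  # x: float64, y: int64, z: object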
Example #6
File: plm.py Project: vickyvishal/pandas
 def beta(self):
     return Series(self._beta_raw, index=self._x.columns)
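
A sketch of the labelling idea with hypothetical stand-in names (the original _beta_raw and _x are attributes of the panel regression class, not shown here):

import numpy as np
import pandas as pd

beta_raw = np.array([0.5, -1.2])            # raw coefficient estimates
x_columns = pd.Index(["intercept", "gdp"])  # regressor names

# Labelling the raw estimates by regressor, as the property above does.
print(pd.Series(beta_raw, index=x_columns))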
Example #7
def _generate_marginal_results(
    table,
    data,
    values,
    rows,
    cols,
    aggfunc,
    observed,
    margins_name: str = "All",
):
    if len(cols) > 0:
        # need to "interleave" the margins
        table_pieces = []
        margin_keys = []

        def _all_key(key):
            return (key, margins_name) + ("", ) * (len(cols) - 1)

        if len(rows) > 0:
            margin = data[rows + values].groupby(
                rows, observed=observed).agg(aggfunc)
            cat_axis = 1

            for key, piece in table.groupby(level=0,
                                            axis=cat_axis,
                                            observed=observed):
                all_key = _all_key(key)

                # we are going to mutate this, so need to copy!
                piece = piece.copy()
                try:
                    piece[all_key] = margin[key]
                except TypeError:

                    # we cannot reshape, so coerce the axis
                    piece.set_axis(
                        piece._get_axis(cat_axis)._to_safe_for_reshape(),
                        axis=cat_axis,
                        inplace=True,
                    )
                    piece[all_key] = margin[key]

                table_pieces.append(piece)
                margin_keys.append(all_key)
        else:
            from pandas import DataFrame

            cat_axis = 0
            for key, piece in table.groupby(level=0,
                                            axis=cat_axis,
                                            observed=observed):
                if len(cols) > 1:
                    all_key = _all_key(key)
                else:
                    all_key = margins_name
                table_pieces.append(piece)
                # GH31016 this is to calculate margin for each group, and assign
                # corresponded key as index
                transformed_piece = DataFrame(piece.apply(aggfunc)).T
                transformed_piece.index = Index([all_key],
                                                name=piece.index.name)

                # append piece for margin into table_piece
                table_pieces.append(transformed_piece)
                margin_keys.append(all_key)

        result = concat(table_pieces, axis=cat_axis)

        if len(rows) == 0:
            return result
    else:
        result = table
        margin_keys = table.columns

    if len(cols) > 0:
        row_margin = data[cols + values].groupby(
            cols, observed=observed).agg(aggfunc)
        row_margin = row_margin.stack()

        # slight hack
        new_order = [len(cols)] + list(range(len(cols)))
        row_margin.index = row_margin.index.reorder_levels(new_order)
    else:
        row_margin = Series(np.nan, index=result.columns)

    return result, margin_keys, row_margin
Example #8
    def __new__(cls,
                data,
                index=None,
                sparse_index=None,
                kind='block',
                fill_value=None,
                name=None,
                copy=False):

        is_sparse_array = isinstance(data, SparseArray)
        if fill_value is None:
            if is_sparse_array:
                fill_value = data.fill_value
            else:
                fill_value = nan

        if is_sparse_array:
            if isinstance(data, SparseSeries) and index is None:
                index = data.index
            elif index is not None:
                assert (len(index) == len(data))

            sparse_index = data.sp_index
            values = np.asarray(data)
        elif isinstance(data, (Series, dict)):
            if index is None:
                index = data.index

            data = Series(data)
            values, sparse_index = make_sparse(data,
                                               kind=kind,
                                               fill_value=fill_value)
        elif isinstance(data, (tuple, list, np.ndarray)):
            # array-like
            if sparse_index is None:
                values, sparse_index = make_sparse(data,
                                                   kind=kind,
                                                   fill_value=fill_value)
            else:
                values = data
                assert (len(values) == sparse_index.npoints)
        else:
            if index is None:
                raise Exception('must pass index!')

            length = len(index)

            if data == fill_value or (isnull(data) and isnull(fill_value)):
                if kind == 'block':
                    sparse_index = BlockIndex(length, [], [])
                else:
                    sparse_index = IntIndex(length, [])
                values = np.array([])
            else:
                if kind == 'block':
                    locs, lens = ([0], [length]) if length else ([], [])
                    sparse_index = BlockIndex(length, locs, lens)
                else:
                    sparse_index = IntIndex(length, index)
                values = np.empty(length)
                values.fill(data)

        if index is None:
            index = com._default_index(sparse_index.length)
        index = _ensure_index(index)

        # Create array, do *not* copy data by default
        if copy:
            subarr = np.array(values, dtype=np.float64, copy=True)
        else:
            subarr = np.asarray(values, dtype=np.float64)

        if index.is_all_dates:
            cls = SparseTimeSeries

        # Change the class of the array to be the subclass type.
        output = subarr.view(cls)
        output.sp_index = sparse_index
        output.fill_value = np.float64(fill_value)
        output.index = index
        output.name = name
        return output
Example #9
 def selectorOfRouletteWheelRatedItem(votesOfCandidatesDict: dict):
     votesOfCandidatesSer: Series = Series(
         votesOfCandidatesDict, index=votesOfCandidatesDict.keys())
     return RouletteWheelSelector.run(votesOfCandidatesSer)
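
RouletteWheelSelector is project-specific and not shown here. A self-contained sketch of what its run method presumably does, selecting a label with probability proportional to its votes:

import numpy as np
import pandas as pd

def roulette_wheel_select(votes: pd.Series):
    # Normalize votes to probabilities and draw one label.
    probs = votes / votes.sum()
    return np.random.choice(probs.index, p=probs.to_numpy())

print(roulette_wheel_select(pd.Series({"a": 1, "b": 3, "c": 6})))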
Example #10
    def __init__(self,
                 data=None,
                 index=None,
                 sparse_index=None,
                 kind='block',
                 fill_value=None,
                 name=None,
                 dtype=None,
                 copy=False,
                 fastpath=False):

        # we are called internally, so short-circuit
        if fastpath:

            # data is an ndarray, index is defined

            if not isinstance(data, SingleBlockManager):
                data = SingleBlockManager(data, index, fastpath=True)
            if copy:
                data = data.copy()

        else:

            if data is None:
                data = []

            if isinstance(data, Series) and name is None:
                name = data.name

            if isinstance(data, SparseArray):
                if index is not None:
                    assert (len(index) == len(data))
                sparse_index = data.sp_index
                if fill_value is None:
                    fill_value = data.fill_value

                data = np.asarray(data)

            elif isinstance(data, SparseSeries):
                if index is None:
                    index = data.index.view()
                if fill_value is None:
                    fill_value = data.fill_value
                # extract the SingleBlockManager
                data = data._data

            elif isinstance(data, (Series, dict)):
                data = Series(data, index=index)
                index = data.index.view()

                res = make_sparse(data, kind=kind, fill_value=fill_value)
                data, sparse_index, fill_value = res

            elif isinstance(data, (tuple, list, np.ndarray)):
                # array-like
                if sparse_index is None:
                    res = make_sparse(data, kind=kind, fill_value=fill_value)
                    data, sparse_index, fill_value = res
                else:
                    assert (len(data) == sparse_index.npoints)

            elif isinstance(data, SingleBlockManager):
                if dtype is not None:
                    data = data.astype(dtype)
                if index is None:
                    index = data.index.view()
                elif not data.index.equals(index) or copy:  # pragma: no cover
                    # GH#19275 SingleBlockManager input should only be called
                    # internally
                    raise AssertionError('Cannot pass both SingleBlockManager '
                                         '`data` argument and a different '
                                         '`index` argument.  `copy` must '
                                         'be False.')

            else:
                length = len(index)

                if data == fill_value or (isna(data) and isna(fill_value)):
                    if kind == 'block':
                        sparse_index = BlockIndex(length, [], [])
                    else:
                        sparse_index = IntIndex(length, [])
                    data = np.array([])

                else:
                    if kind == 'block':
                        locs, lens = ([0], [length]) if length else ([], [])
                        sparse_index = BlockIndex(length, locs, lens)
                    else:
                        sparse_index = IntIndex(length, index)
                    v = data
                    data = np.empty(length)
                    data.fill(v)

            if index is None:
                index = com._default_index(sparse_index.length)
            index = _ensure_index(index)

            # create/copy the manager
            if isinstance(data, SingleBlockManager):

                if copy:
                    data = data.copy()
            else:

                # create a sparse array
                if not isinstance(data, SparseArray):
                    data = SparseArray(data,
                                       sparse_index=sparse_index,
                                       fill_value=fill_value,
                                       dtype=dtype,
                                       copy=copy)

                data = SingleBlockManager(data, index)

        generic.NDFrame.__init__(self, data)

        self.index = index
        self.name = name
Example #11
def value_counts(values,
                 sort=True,
                 ascending=False,
                 normalize=False,
                 bins=None):
    """
    Compute a histogram of the counts of non-null values

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize: boolean, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data

    Returns
    -------
    value_counts : Series

    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut

    values = Series(values).values

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        values = cat.labels

    if com.is_integer_dtype(values.dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_int64(values)

    elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
        dtype = values.dtype
        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values)

        # convert the keys back to the dtype we came in
        keys = Series(keys, dtype=dtype)

    else:
        mask = com.isnull(values)
        values = com._ensure_object(values)
        keys, counts = htable.value_count_object(values, mask)

    result = Series(counts, index=com._values_from_object(keys))

    if bins is not None:
        # TODO: This next line should be more efficient
        result = result.reindex(np.arange(len(cat.levels)), fill_value=0)
        result.index = bins[:-1]

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    if normalize:
        result = result / float(values.size)

    return result
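
In current pandas this logic lives behind the public Series.value_counts. A brief usage sketch:

import pandas as pd

s = pd.Series(["a", "b", "a", "a"])
print(s.value_counts())                # counts, sorted descending
print(s.value_counts(normalize=True))  # relative frequencies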
Example #12
 def setup_method(self) -> None:
     self.data_backend = PandasBackend(
         DataFrame({"a": ["a", "b", "c"], "b": [1, 2, 3], "c": [True, False, True]})
     )
     self.test_series0 = PandasBackend(Series({"a": "a", "b": 1, "c": True}), index=[0])
     self.test_series2 = PandasBackend(Series({"a": "c", "b": 3, "c": True}), index=[2])
Example #13
    def setUp(self):
        """
        Setup the dataframes used for the groupby tests derived from pandas
        """
        self.dateRange = bdate_range('1/1/2005', periods=250)
        self.stringIndex = Index([rands(8).upper() for x in range(250)])

        self.groupId = Series([x[0] for x in self.stringIndex],
                              index=self.stringIndex)
        self.groupDict = dict((k, v)
                              for k, v in compat.iteritems(self.groupId))

        self.columnIndex = Index(['A', 'B', 'C', 'D', 'E'])

        randMat = np.random.randn(250, 5)
        self.stringMatrix = DataFrame(randMat, columns=self.columnIndex,
                                      index=self.stringIndex)

        self.timeMatrix = DataFrame(randMat, columns=self.columnIndex,
                                    index=self.dateRange)
        self.ts = tm.makeTimeSeries()

        self.seriesd = tm.getSeriesData()
        self.tsd = tm.getTimeSeriesData()
        self.frame = DataFrame(self.seriesd)
        self.tsframe = DataFrame(self.tsd)

        self.df = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                                   'foo', 'bar', 'foo', 'foo'],
                             'B': ['one', 'one', 'two', 'three',
                                   'two', 'two', 'one', 'three'],
                             'C': np.random.randn(8),
                             'D': np.random.randn(8)})

        self.df_mixed_floats = DataFrame({'A': ['foo', 'bar', 'foo', 'bar',
                                                'foo', 'bar', 'foo', 'foo'],
                                          'B': ['one', 'one', 'two', 'three',
                                                'two', 'two', 'one', 'three'],
                                          'C': np.random.randn(8),
                                          'D': np.array(np.random.randn(8),
                                                        dtype='float32')})

        index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
                                   ['one', 'two', 'three']],
                           labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
                                   [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
                           names=['first', 'second'])
        self.mframe = DataFrame(np.random.randn(10, 3), index=index,
                                columns=['A', 'B', 'C'])

        self.three_group = DataFrame({'A': ['foo', 'foo', 'foo', 'foo',
                                            'bar', 'bar', 'bar', 'bar',
                                            'foo', 'foo', 'foo'],
                                      'B': ['one', 'one', 'one', 'two',
                                            'one', 'one', 'one', 'two',
                                            'two', 'two', 'one'],
                                      'C': ['dull', 'dull', 'shiny', 'dull',
                                            'dull', 'shiny', 'shiny', 'dull',
                                            'shiny', 'shiny', 'shiny'],
                                      'D': np.random.randn(11),
                                      'E': np.random.randn(11),
                                      'F': np.random.randn(11)})
        super(self.__class__, self).setUp()
Example #14
File: tools.py Project: while/pandas
                result = DatetimeIndex(result, tz='utc' if utc else None)
            return result
        except ValueError as e:
            try:
                values, tz = lib.datetime_to_datetime64(arg)
                return DatetimeIndex._simple_new(values, None, tz=tz)
            except (ValueError, TypeError):
                raise e

    if arg is None:
        return arg
    elif isinstance(arg, datetime):
        return arg
    elif isinstance(arg, Series):
        values = _convert_f(arg.values)
        return Series(values, index=arg.index, name=arg.name)
    elif isinstance(arg, (np.ndarray, list)):
        if isinstance(arg, list):
            arg = np.array(arg, dtype='O')
        result = _convert_f(arg)
        return result
    try:
        if not arg:
            return arg
        return parse(arg, dayfirst=dayfirst)
    except Exception:
        if errors == 'raise':
            raise
        return arg

Example #15
def _unstack_multiple(data, clocs):
    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index

    clocs = [index._get_level_number(i) for i in clocs]

    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    clevels = [index.levels[i] for i in clocs]
    clabels = [index.labels[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rlabels = [index.labels[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    shape = [len(x) for x in clevels]
    group_index = get_group_index(clabels, shape)

    comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
    recons_labels = decons_group_index(obs_ids, shape)

    dummy_index = MultiIndex(levels=rlevels + [obs_ids],
                             labels=rlabels + [comp_ids],
                             names=rnames + ['__placeholder__'])

    if isinstance(data, Series):
        dummy = Series(data.values, index=dummy_index)
        unstacked = dummy.unstack('__placeholder__')
        new_levels = clevels
        new_names = cnames
        new_labels = recons_labels
    else:
        if isinstance(data.columns, MultiIndex):
            result = data
            for i in range(len(clocs)):
                val = clocs[i]
                result = result.unstack(val)
                clocs = [val if i > val else val - 1 for val in clocs]

            return result

        dummy = DataFrame(data.values, index=dummy_index, columns=data.columns)

        unstacked = dummy.unstack('__placeholder__')
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        new_labels = [unstcols.labels[0]]
        for rec in recons_labels:
            new_labels.append(rec.take(unstcols.labels[-1]))

    new_columns = MultiIndex(levels=new_levels,
                             labels=new_labels,
                             names=new_names)

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked
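
The public entry point for this helper is unstacking several index levels at once. A minimal sketch, assuming current pandas:

import pandas as pd

idx = pd.MultiIndex.from_product([["a", "b"], ["x", "y"], [1, 2]],
                                 names=["r", "c1", "c2"])
s = pd.Series(range(8), index=idx)

# Both named levels move from the row index into the columns.
print(s.unstack(["c1", "c2"]))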
Example #16
    def test_groupby_multi_categorical_as_index(self):
        # GH13204
        df = DataFrame({
            'cat': Categorical([1, 2, 2], [1, 2, 3]),
            'A': [10, 11, 11],
            'B': [101, 102, 103]
        })
        result = df.groupby(['cat', 'A'], as_index=False).sum()
        expected = DataFrame(
            {
                'cat': Categorical([1, 1, 2, 2, 3, 3]),
                'A': [10, 11, 10, 11, 10, 11],
                'B': [101.0, nan, nan, 205.0, nan, nan]
            },
            columns=['cat', 'A', 'B'])
        tm.assert_frame_equal(result, expected)

        # function grouper
        f = lambda r: df.loc[r, 'A']
        result = df.groupby(['cat', f], as_index=False).sum()
        expected = DataFrame(
            {
                'cat': Categorical([1, 1, 2, 2, 3, 3]),
                'A': [10.0, nan, nan, 22.0, nan, nan],
                'B': [101.0, nan, nan, 205.0, nan, nan]
            },
            columns=['cat', 'A', 'B'])
        tm.assert_frame_equal(result, expected)

        # another not in-axis grouper (conflicting names in index)
        s = Series(['a', 'b', 'b'], name='cat')
        result = df.groupby(['cat', s], as_index=False).sum()
        expected = DataFrame(
            {
                'cat': Categorical([1, 1, 2, 2, 3, 3]),
                'A': [10.0, nan, nan, 22.0, nan, nan],
                'B': [101.0, nan, nan, 205.0, nan, nan]
            },
            columns=['cat', 'A', 'B'])
        tm.assert_frame_equal(result, expected)

        # is original index dropped?
        expected = DataFrame(
            {
                'cat': Categorical([1, 1, 2, 2, 3, 3]),
                'A': [10, 11, 10, 11, 10, 11],
                'B': [101.0, nan, nan, 205.0, nan, nan]
            },
            columns=['cat', 'A', 'B'])

        group_columns = ['cat', 'A']

        for name in [None, 'X', 'B', 'cat']:
            df.index = Index(list("abc"), name=name)

            if name in group_columns and name in df.index.names:
                with tm.assert_produces_warning(FutureWarning,
                                                check_stacklevel=False):
                    result = df.groupby(group_columns, as_index=False).sum()

            else:
                result = df.groupby(group_columns, as_index=False).sum()

            tm.assert_frame_equal(result, expected, check_index_type=True)
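
The behaviour being pinned down is how groupby treats unobserved categories. A small sketch of the underlying rule in current pandas:

import pandas as pd

df = pd.DataFrame({"cat": pd.Categorical([1, 2, 2], categories=[1, 2, 3]),
                   "B": [101.0, 102.0, 103.0]})

# observed=False keeps the unused category 3 in the result (NaN for mean).
print(df.groupby("cat", observed=False)["B"].mean())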
Example #17
def value_counts(values,
                 sort=True,
                 ascending=False,
                 normalize=False,
                 bins=None,
                 dropna=True):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize: boolean, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series

    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut
    from pandas import Index, PeriodIndex, DatetimeIndex

    name = getattr(values, 'name', None)
    values = Series(values).values

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        values = cat.codes

    if com.is_categorical_dtype(values.dtype):
        result = values.value_counts(dropna)

    else:

        dtype = values.dtype
        is_period = com.is_period_arraylike(values)
        is_datetimetz = com.is_datetimetz(values)

        if com.is_datetime_or_timedelta_dtype(dtype) or is_period or \
                is_datetimetz:

            if is_period:
                values = PeriodIndex(values)
            elif is_datetimetz:
                tz = getattr(values, 'tz', None)
                values = DatetimeIndex(values).tz_localize(None)

            values = values.view(np.int64)
            keys, counts = htable.value_count_scalar64(values, dropna)

            if dropna:
                from pandas.tslib import iNaT
                msk = keys != iNaT
                keys, counts = keys[msk], counts[msk]

            # localize to the original tz if necessary
            if is_datetimetz:
                keys = DatetimeIndex(keys).tz_localize(tz)

            # convert the keys back to the dtype we came in
            else:
                keys = keys.astype(dtype)

        elif com.is_integer_dtype(dtype):
            values = com._ensure_int64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)
        elif com.is_float_dtype(dtype):
            values = com._ensure_float64(values)
            keys, counts = htable.value_count_scalar64(values, dropna)

        else:
            values = com._ensure_object(values)
            mask = com.isnull(values)
            keys, counts = htable.value_count_object(values, mask)
            if not dropna and mask.any():
                keys = np.insert(keys, 0, np.NaN)
                counts = np.insert(counts, 0, mask.sum())

        if not isinstance(keys, Index):
            keys = Index(keys)
        result = Series(counts, index=keys, name=name)

        if bins is not None:
            # TODO: This next line should be more efficient
            result = result.reindex(np.arange(len(cat.categories)),
                                    fill_value=0)
            result.index = bins[:-1]

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        result = result / float(values.size)

    return result
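
A usage sketch of the two extra knobs this later version handles, dropna and bins, via the public API:

import numpy as np
import pandas as pd

s = pd.Series([1.0, 1.0, np.nan])
print(s.value_counts(dropna=False))  # includes a NaN bucket
print(s.value_counts(bins=2))        # half-open bins, via pd.cut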
Example #18
    def data(self, convert_dates=True, convert_categoricals=True, index=None):
        """
        Reads observations from Stata file, converting them into a dataframe

        Parameters
        ----------
        convert_dates : boolean, defaults to True
            Convert date variables to DataFrame time values
        convert_categoricals : boolean, defaults to True
            Read value labels and convert columns to Categorical/Factor
            variables
        index : identifier of index column
            identifier of column that should be used as index of the DataFrame

        Returns
        -------
        y : DataFrame instance
        """
        if self._data_read:
            raise Exception("Data has already been read.")
        self._data_read = True

        if self.format_version >= 117:
            self._read_strls()

        stata_dta = self._dataset()

        data = []
        for rownum, line in enumerate(stata_dta):
            # doesn't handle missing value objects, just casts
            # None will only work without missing value object.
            for i, val in enumerate(line):
                # NOTE: This will only be scalar types because missing strings
                # are empty not None in Stata
                if val is None:
                    line[i] = np.nan
            data.append(tuple(line))

        if convert_categoricals:
            self._read_value_labels()

        data = DataFrame(data, columns=self.varlist, index=index)

        cols_ = np.where(self.dtyplist)[0]
        for i in cols_:
            if self.dtyplist[i] is not None:
                col = data.columns[i]
                if data[col].dtype is not np.dtype(object):
                    data[col] = Series(data[col], data[col].index,
                                       self.dtyplist[i])

        if convert_dates:
            cols = np.where(lmap(lambda x: x in _date_formats,
                                 self.fmtlist))[0]
            for i in cols:
                col = data.columns[i]
                data[col] = data[col].apply(_stata_elapsed_date_to_datetime,
                                            args=(self.fmtlist[i],))

        if convert_categoricals:
            cols = np.where(
                lmap(lambda x: x in compat.iterkeys(self.value_label_dict),
                     self.lbllist)
            )[0]
            for i in cols:
                col = data.columns[i]
                labeled_data = np.copy(data[col])
                labeled_data = labeled_data.astype(object)
                for k, v in compat.iteritems(
                        self.value_label_dict[self.lbllist[i]]):
                    labeled_data[(data[col] == k).values] = v
                data[col] = Categorical.from_array(labeled_data)

        return data
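
The public wrapper over this reader is pd.read_stata. A sketch, where "data.dta" is a placeholder path and the old index= argument is index_col= in current pandas:

import pandas as pd

df = pd.read_stata("data.dta",
                   convert_dates=True,
                   convert_categoricals=True)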
Example #19
def _add_margins(
    table: Union["Series", "DataFrame"],
    data,
    values,
    rows,
    cols,
    aggfunc,
    observed=None,
    margins_name: str = "All",
    fill_value=None,
):
    if not isinstance(margins_name, str):
        raise ValueError("margins_name argument must be a string")

    msg = 'Conflicting name "{name}" in margins'.format(name=margins_name)
    for level in table.index.names:
        if margins_name in table.index.get_level_values(level):
            raise ValueError(msg)

    grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name)

    if table.ndim == 2:
        # i.e. DataFrame
        for level in table.columns.names[1:]:
            if margins_name in table.columns.get_level_values(level):
                raise ValueError(msg)

    key: Union[str, Tuple[str, ...]]
    if len(rows) > 1:
        key = (margins_name, ) + ("", ) * (len(rows) - 1)
    else:
        key = margins_name

    if not values and isinstance(table, ABCSeries):
        # If there are no values and the table is a series, then there is only
        # one column in the data. Compute grand margin and return it.
        return table.append(Series({key: grand_margin[margins_name]}))

    elif values:
        marginal_result_set = _generate_marginal_results(
            table,
            data,
            values,
            rows,
            cols,
            aggfunc,
            observed,
            margins_name,
        )
        if not isinstance(marginal_result_set, tuple):
            return marginal_result_set
        result, margin_keys, row_margin = marginal_result_set
    else:
        # no values, and table is a DataFrame
        assert isinstance(table, ABCDataFrame)
        marginal_result_set = _generate_marginal_results_without_values(
            table, data, rows, cols, aggfunc, observed, margins_name)
        if not isinstance(marginal_result_set, tuple):
            return marginal_result_set
        result, margin_keys, row_margin = marginal_result_set

    row_margin = row_margin.reindex(result.columns, fill_value=fill_value)
    # populate grand margin
    for k in margin_keys:
        if isinstance(k, str):
            row_margin[k] = grand_margin[k]
        else:
            row_margin[k] = grand_margin[k[0]]

    from pandas import DataFrame

    margin_dummy = DataFrame(row_margin, columns=[key]).T

    row_names = result.index.names
    try:
        # check the result column and leave floats
        for dtype in set(result.dtypes):
            cols = result.select_dtypes([dtype]).columns
            margin_dummy[cols] = margin_dummy[cols].apply(
                maybe_downcast_to_dtype, args=(dtype, ))
        result = result.append(margin_dummy)
    except TypeError:

        # we cannot reshape, so coerce the axis
        result.index = result.index._to_safe_for_reshape()
        result = result.append(margin_dummy)
    result.index.names = row_names

    return result
Example #20
File: tools.py Project: nhanitvn/pandas
def to_datetime(arg,
                errors='ignore',
                dayfirst=False,
                utc=None,
                box=True,
                format=None,
                coerce=False,
                unit='ns'):
    """
    Convert argument to datetime

    Parameters
    ----------
    arg : string, datetime, array of strings (with possible NAs)
    errors : {'ignore', 'raise'}, default 'ignore'
        Errors are ignored by default (values left untouched)
    dayfirst : boolean, default False
        If True parses dates with the day first, eg 20/01/2005
        Warning: dayfirst=True is not strict, but will prefer to parse
        with day first (this is a known bug).
    utc : boolean, default None
        Return UTC DatetimeIndex if True (converting any tz-aware
        datetime.datetime objects as well)
    box : boolean, default True
        If True returns a DatetimeIndex, if False returns ndarray of values
    format : string, default None
        strftime to parse time, eg "%d/%m/%Y"
    coerce : force errors to NaT (False by default)
    unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch
        (e.g. a unix timestamp), which is an integer/float number

    Returns
    -------
    ret : datetime if parsing succeeded
    """
    from pandas import Timestamp
    from pandas.core.series import Series
    from pandas.tseries.index import DatetimeIndex

    def _convert_listlike(arg, box):

        if isinstance(arg, (list, tuple)):
            arg = np.array(arg, dtype='O')

        if com.is_datetime64_ns_dtype(arg):
            if box and not isinstance(arg, DatetimeIndex):
                try:
                    return DatetimeIndex(arg, tz='utc' if utc else None)
                except ValueError:
                    pass

            return arg

        arg = com._ensure_object(arg)
        try:
            if format is not None:
                result = None

                # shortcut formatting here
                if format == '%Y%m%d':
                    try:
                        result = _attempt_YYYYMMDD(arg)
                    except:
                        raise ValueError(
                            "cannot convert the input to '%Y%m%d' date format")

                # fallback
                if result is None:
                    result = tslib.array_strptime(arg, format, coerce=coerce)
            else:
                result = tslib.array_to_datetime(arg,
                                                 raise_=errors == 'raise',
                                                 utc=utc,
                                                 dayfirst=dayfirst,
                                                 coerce=coerce,
                                                 unit=unit)
            if com.is_datetime64_dtype(result) and box:
                result = DatetimeIndex(result, tz='utc' if utc else None)
            return result

        except ValueError as e:
            try:
                values, tz = tslib.datetime_to_datetime64(arg)
                return DatetimeIndex._simple_new(values, None, tz=tz)
            except (ValueError, TypeError):
                raise e

    if arg is None:
        return arg
    elif isinstance(arg, Timestamp):
        return arg
    elif isinstance(arg, Series):
        values = _convert_listlike(arg.values, box=False)
        return Series(values, index=arg.index, name=arg.name)
    elif com.is_list_like(arg):
        return _convert_listlike(arg, box=box)

    return _convert_listlike(np.array([arg]), box=box)[0]
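
A usage sketch of the same entry point in current pandas, where errors="coerce" has replaced the old coerce=True keyword:

import pandas as pd

# Unparseable values become NaT instead of raising.
print(pd.to_datetime(["2005-01-20", "not a date"], errors="coerce"))

# The %Y%m%d fast path shown above is reached via format=.
print(pd.to_datetime(pd.Series(["20050120"]), format="%Y%m%d"))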
Example #21
def _get_dummies_1d(data,
                    prefix,
                    prefix_sep='_',
                    dummy_na=False,
                    sparse=False,
                    drop_first=False):
    # Series avoids inconsistent NaN handling
    codes, levels = _factorize_from_iterable(Series(data))

    def get_empty_Frame(data, sparse):
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        if not sparse:
            return DataFrame(index=index)
        else:
            return SparseDataFrame(index=index, default_fill_value=0)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_Frame(data, sparse)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_Frame(data, sparse)

    number_of_cols = len(levels)

    if prefix is not None:
        dummy_strs = [
            u'{prefix}{sep}{level}'
            if isinstance(v, text_type) else '{prefix}{sep}{level}'
            for v in levels
        ]
        dummy_cols = [
            dummy_str.format(prefix=prefix, sep=prefix_sep, level=v)
            for dummy_str, v in zip(dummy_strs, levels)
        ]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        sparse_series = {}
        N = len(data)
        sp_indices = [[] for _ in range(len(dummy_cols))]
        for ndx, code in enumerate(codes):
            if code == -1:
                # Blank entries if not dummy_na and code == -1, #GH4446
                continue
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(np.ones(len(ixs), dtype=np.uint8),
                               sparse_index=IntIndex(N, ixs),
                               fill_value=0,
                               dtype=np.uint8)
            sparse_series[col] = SparseSeries(data=sarr, index=index)

        out = SparseDataFrame(sparse_series,
                              index=index,
                              columns=dummy_cols,
                              default_fill_value=0,
                              dtype=np.uint8)
        return out

    else:
        dummy_mat = np.eye(number_of_cols, dtype=np.uint8).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
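
The public entry point is pd.get_dummies, which forwards to this helper one column at a time. A brief usage sketch:

import numpy as np
import pandas as pd

s = pd.Series(["a", "b", np.nan])
print(pd.get_dummies(s))                   # NaN row is all zeros
print(pd.get_dummies(s, dummy_na=True))    # adds a NaN indicator column
print(pd.get_dummies(s, drop_first=True))  # drops the first level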
Example #22
    if not args.base_category:
        print(
            "Warning: you did not specify a base category. This can take *very* long time to complete. See matcher -h for help."
        )

    # load product csv file
    print("Parsing input file: %s" % product_file)
    product_data = pd.read_csv(product_file,
                               sep=',',
                               usecols=product_columns +
                               [google_category_column])
    print("Processing %d rows ..." % product_data.shape[0])

    # if the target google category column doesn't exist in the file, add it
    if google_category_column not in product_data.columns:
        product_data[google_category_column] = Series()

    # iterate through data row by row and match category
    index = 1
    replacements = 0
    for row_index, row in product_data.iterrows():
        index += 1
        if index % 10 == 0:
            print("Progress: %d rows finished" % index)
        p = {}
        for col in product_columns:
            value = safe_get(row, col)
            if value:
                p[col] = row.get(col)
        gcat = safe_get(row, google_category_column)
Example #23
def str_extract(arr, pat, flags=0):
    """
    Find groups in each string using passed regular expression

    Parameters
    ----------
    pat : string
        Pattern or regular expression
    flags : int, default 0 (no flags)
        re module flags, e.g. re.IGNORECASE

    Returns
    -------
    extracted groups : Series (one group) or DataFrame (multiple groups)
        Note that dtype of the result is always object, even when no match is
        found and the result is a Series or DataFrame containing only NaN
        values.

    Examples
    --------
    A pattern with one group will return a Series. Non-matches will be NaN.

    >>> Series(['a1', 'b2', 'c3']).str.extract('[ab](\d)')
    0      1
    1      2
    2    NaN
    dtype: object

    A pattern with more than one group will return a DataFrame.

    >>> Series(['a1', 'b2', 'c3']).str.extract('([ab])(\d)')
         0    1
    0    a    1
    1    b    2
    2  NaN  NaN

    A pattern may contain optional groups.

    >>> Series(['a1', 'b2', 'c3']).str.extract('([ab])?(\d)')
         0  1
    0    a  1
    1    b  2
    2  NaN  3

    Named groups will become column names in the result.

    >>> Series(['a1', 'b2', 'c3']).str.extract('(?P<letter>[ab])(?P<digit>\d)')
      letter digit
    0      a     1
    1      b     2
    2    NaN   NaN

    """
    from pandas.core.series import Series
    from pandas.core.frame import DataFrame

    regex = re.compile(pat, flags=flags)
    # just to be safe, check this
    if regex.groups == 0:
        raise ValueError("This pattern contains no groups to capture.")
    empty_row = [np.nan] * regex.groups

    def f(x):
        if not isinstance(x, compat.string_types):
            return empty_row
        m = regex.search(x)
        if m:
            return [np.nan if item is None else item for item in m.groups()]
        else:
            return empty_row

    if regex.groups == 1:
        result = Series([f(val)[0] for val in arr],
                        name=_get_single_group_name(regex),
                        index=arr.index,
                        dtype=object)
    else:
        names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
        columns = [names.get(1 + i, i) for i in range(regex.groups)]
        if arr.empty:
            result = DataFrame(columns=columns, dtype=object)
        else:
            result = DataFrame([f(val) for val in arr],
                               columns=columns,
                               index=arr.index,
                               dtype=object)
    return result
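
Usage sketch via the public string accessor in current pandas (expand=False keeps the one-group result as a Series):

import pandas as pd

s = pd.Series(["a1", "b2", "c3"])
print(s.str.extract(r"[ab](\d)", expand=False))         # Series
print(s.str.extract(r"(?P<letter>[ab])(?P<digit>\d)"))  # DataFrame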
Example #24
File: reshape.py Project: yazici/pandas
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False):
    """
    Convert categorical variable into dummy/indicator variables

    Parameters
    ----------
    data : array-like or Series
    prefix : string, default None
        String to append DataFrame column names
    prefix_sep : string, default '_'
        If appending prefix, separator/delimiter to use
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.

    Returns
    -------
    dummies : DataFrame

    Examples
    --------
    >>> import pandas as pd
    >>> s = pd.Series(list('abca'))

    >>> get_dummies(s)
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> s1 = ['a', 'b', np.nan]

    >>> get_dummies(s1)
       a  b
    0  1  0
    1  0  1
    2  0  0

    >>> get_dummies(s1, dummy_na=True)
       a  b  NaN
    0  1  0    0
    1  0  1    0
    2  0  0    1

    See also ``Series.str.get_dummies``.

    """
    # Series avoids inconsistent NaN handling
    cat = Categorical.from_array(Series(data))
    levels = cat.levels

    # if all NaN
    if not dummy_na and len(levels) == 0:
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    number_of_cols = len(levels)
    if dummy_na:
        number_of_cols += 1

    dummy_mat = np.eye(number_of_cols).take(cat.codes, axis=0)

    if dummy_na:
        levels = np.append(cat.levels, np.nan)
    else:
        # reset NaN GH4446
        dummy_mat[cat.codes == -1] = 0

    if prefix is not None:
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v)
                      for v in levels]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    return DataFrame(dummy_mat, index=index, columns=dummy_cols)
Example #25
def _add_margins(
    table,
    data,
    values,
    rows,
    cols,
    aggfunc,
    observed=None,
    margins_name="All",
    fill_value=None,
):
    if not isinstance(margins_name, str):
        raise ValueError("margins_name argument must be a string")

    msg = 'Conflicting name "{name}" in margins'.format(name=margins_name)
    for level in table.index.names:
        if margins_name in table.index.get_level_values(level):
            raise ValueError(msg)

    grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name)

    # could be passed a Series object with no 'columns'
    if hasattr(table, "columns"):
        for level in table.columns.names[1:]:
            if margins_name in table.columns.get_level_values(level):
                raise ValueError(msg)

    if len(rows) > 1:
        key = (margins_name, ) + ("", ) * (len(rows) - 1)
    else:
        key = margins_name

    if not values and isinstance(table, ABCSeries):
        # If there are no values and the table is a series, then there is only
        # one column in the data. Compute grand margin and return it.
        return table.append(Series({key: grand_margin[margins_name]}))

    if values:
        marginal_result_set = _generate_marginal_results(
            table,
            data,
            values,
            rows,
            cols,
            aggfunc,
            observed,
            grand_margin,
            margins_name,
        )
        if not isinstance(marginal_result_set, tuple):
            return marginal_result_set
        result, margin_keys, row_margin = marginal_result_set
    else:
        marginal_result_set = _generate_marginal_results_without_values(
            table, data, rows, cols, aggfunc, observed, margins_name)
        if not isinstance(marginal_result_set, tuple):
            return marginal_result_set
        result, margin_keys, row_margin = marginal_result_set
    row_margin = row_margin.reindex(result.columns, fill_value=fill_value)
    # populate grand margin
    for k in margin_keys:
        if isinstance(k, str):
            row_margin[k] = grand_margin[k]
        else:
            row_margin[k] = grand_margin[k[0]]

    from pandas import DataFrame

    margin_dummy = DataFrame(row_margin, columns=[key]).T

    row_names = result.index.names
    try:
        for dtype in set(result.dtypes):
            cols = result.select_dtypes([dtype]).columns
            margin_dummy[cols] = margin_dummy[cols].astype(dtype)
        result = result.append(margin_dummy)
    except TypeError:

        # we cannot reshape, so coerce the axis
        result.index = result.index._to_safe_for_reshape()
        result = result.append(margin_dummy)
    result.index.names = row_names

    return result
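
A hedged usage sketch of the path that reaches `_add_margins`: `pivot_table` with `margins=True`. The DataFrame and column names below are illustrative only:

import pandas as pd

df = pd.DataFrame({
    'row': ['x', 'x', 'y'],
    'col': ['a', 'b', 'a'],
    'val': [1.0, 2.0, 3.0],
})
# margins=True routes through _add_margins; margins_name labels the totals row/column.
print(pd.pivot_table(df, values='val', index='row', columns='col',
                     aggfunc='sum', margins=True, margins_name='All'))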
Example #26
    def __init__(self, data, index=None, sparse_index=None, kind='block',
                 fill_value=None, name=None, dtype=None, copy=False,
                 fastpath=False):

        # we are called internally, so short-circuit
        if fastpath:

            # data is an ndarray, index is defined
            data = SingleBlockManager(data, index, fastpath=True)
            if copy:
                data = data.copy()
        else:

            is_sparse_array = isinstance(data, SparseArray)
            if fill_value is None:
                if is_sparse_array:
                    fill_value = data.fill_value
                else:
                    fill_value = nan

            if is_sparse_array:
                if isinstance(data, SparseSeries) and index is None:
                    index = data.index.view()
                elif index is not None:
                    assert len(index) == len(data)

                sparse_index = data.sp_index
                data = np.asarray(data)

            elif isinstance(data, SparseSeries):
                if index is None:
                    index = data.index.view()

                # extract the SingleBlockManager
                data = data._data

            elif isinstance(data, (Series, dict)):
                if index is None:
                    index = data.index.view()

                data = Series(data)
                data, sparse_index = make_sparse(data, kind=kind,
                                                 fill_value=fill_value)

            elif isinstance(data, (tuple, list, np.ndarray)):
                # array-like
                if sparse_index is None:
                    data, sparse_index = make_sparse(data, kind=kind,
                                                     fill_value=fill_value)
                else:
                    assert len(data) == sparse_index.npoints

            elif isinstance(data, SingleBlockManager):
                if dtype is not None:
                    data = data.astype(dtype)
                if index is None:
                    index = data.index.view()
                else:
                    data = data.reindex(index, copy=False)

            else:

                length = len(index)

                if data == fill_value or (isnull(data)
                                          and isnull(fill_value)):
                    if kind == 'block':
                        sparse_index = BlockIndex(length, [], [])
                    else:
                        sparse_index = IntIndex(length, [])
                    data = np.array([])

                else:
                    if kind == 'block':
                        locs, lens = ([0], [length]) if length else ([], [])
                        sparse_index = BlockIndex(length, locs, lens)
                    else:
                        sparse_index = IntIndex(length, index)
                    v = data
                    data = np.empty(length)
                    data.fill(v)

            if index is None:
                index = com._default_index(sparse_index.length)
            index = _ensure_index(index)

            # create/copy the manager
            if isinstance(data, SingleBlockManager):

                if copy:
                    data = data.copy()
            else:

                # create a sparse array
                if not isinstance(data, SparseArray):
                    data = SparseArray(data,
                                       sparse_index=sparse_index,
                                       fill_value=fill_value,
                                       dtype=dtype,
                                       copy=copy)

                data = SingleBlockManager(data, index)

        generic.NDFrame.__init__(self, data)

        self.index = index
        self.name = name
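
SparseSeries was deprecated in pandas 0.25 and removed in 1.0; a hedged modern equivalent is a plain Series backed by a SparseArray (the variable names below are illustrative):

import pandas as pd

arr = pd.arrays.SparseArray([0.0, 0.0, 1.0, 0.0], fill_value=0.0)
s = pd.Series(arr, index=list('abcd'), name='sparse_demo')
print(s.dtype)            # Sparse[float64, 0.0]
print(s.sparse.density)   # fraction of values that are not the fill value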
Example #27
def dict_to_mgr(
    data: dict,
    index,
    columns,
    *,
    dtype: DtypeObj | None = None,
    typ: str = "block",
    copy: bool = True,
) -> Manager:
    """
    Segregate Series based on type and coerce into matrices.
    Needs to handle a lot of exceptional cases.

    Used in DataFrame.__init__
    """
    arrays: Sequence[Any] | Series

    if columns is not None:
        from pandas.core.series import Series

        arrays = Series(data, index=columns, dtype=object)
        data_names = arrays.index
        missing = arrays.isna()
        if index is None:
            # GH10856
            # raise ValueError if only scalars in dict
            index = extract_index(arrays[~missing])
        else:
            index = ensure_index(index)

        # no obvious "empty" int column
        if missing.any() and not is_integer_dtype(dtype):
            nan_dtype: DtypeObj

            if dtype is None or (isinstance(dtype, np.dtype)
                                 and np.issubdtype(dtype, np.flexible)):
                # GH#1783
                nan_dtype = np.dtype("object")
            else:
                nan_dtype = dtype
            val = construct_1d_arraylike_from_scalar(np.nan, len(index),
                                                     nan_dtype)
            arrays.loc[missing] = [val] * missing.sum()

        arrays = list(arrays)

    else:
        keys = list(data.keys())
        columns = data_names = Index(keys)
        arrays = [com.maybe_iterable_to_list(data[k]) for k in keys]
        # GH#24096 need copy to be deep for datetime64tz case
        # TODO: See if we can avoid these copies
        arrays = [
            arr if not isinstance(arr, ABCIndex) else arr._data
            for arr in arrays
        ]
        arrays = [
            arr if not is_datetime64tz_dtype(arr) else arr.copy()
            for arr in arrays
        ]

    if copy:
        # arrays_to_mgr (via form_blocks) won't make copies for EAs
        # dtype attr check to exclude EADtype-castable strs
        arrays = [
            x if not hasattr(x, "dtype")
            or not isinstance(x.dtype, ExtensionDtype) else x.copy()
            for x in arrays
        ]
        # TODO: can we get rid of the dt64tz special case above?

    return arrays_to_mgr(arrays,
                         data_names,
                         index,
                         columns,
                         dtype=dtype,
                         typ=typ,
                         consolidate=copy)
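
A short sketch of the behavior `dict_to_mgr` implements for `DataFrame.__init__`: dict data plus an explicit `columns` list. The missing-column fill described above is observable from the public API:

import pandas as pd

data = {'a': [1, 2], 'b': [3.0, 4.0]}
# A requested column absent from the dict becomes a NaN-filled object
# column, matching the `missing` handling above (GH#1783).
df = pd.DataFrame(data, columns=['a', 'b', 'c'])
print(df.dtypes)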
Example #28
def _get_dummies_1d(
    data,
    prefix,
    prefix_sep="_",
    dummy_na=False,
    sparse=False,
    drop_first=False,
    dtype=None,
):
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = _factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.uint8
    dtype = np.dtype(dtype)

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data):
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:

        # PY2 embedded unicode, gh-22084
        def _make_col_name(prefix, prefix_sep, level):
            fstr = "{prefix}{prefix_sep}{level}"
            return fstr.format(prefix=prefix,
                               prefix_sep=prefix_sep,
                               level=level)

        dummy_cols = [
            _make_col_name(prefix, prefix_sep, level) for level in levels
        ]

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:

        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == bool:
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        sp_indices = [[] for _ in range(len(dummy_cols))]
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col))

        out = concat(sparse_series, axis=1, copy=False)
        return out

    else:
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
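
A hedged usage sketch of the sparse path above (the exact sparse column dtype varies across pandas versions):

import pandas as pd

s = pd.Series(list('abca'))
out = pd.get_dummies(s, sparse=True)
print(out.dtypes)  # each column is a Sparse dtype backed by a SparseArray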
Example #29
def _get_dummies_1d(
    data,
    prefix,
    prefix_sep="_",
    dummy_na: bool = False,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: Dtype | None = None,
) -> DataFrame:
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.dtype(np.uint8)
    # error: Argument 1 to "dtype" has incompatible type "Union[ExtensionDtype, str,
    # dtype[Any], Type[object]]"; expected "Type[Any]"
    dtype = np.dtype(dtype)  # type: ignore[arg-type]

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data) -> DataFrame:
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = Index(
            [f"{prefix}{prefix_sep}{level}" for level in levels])

    index: Index | None
    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:

        fill_value: bool | float | int
        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == np.dtype(bool):
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        sp_indices: list[list] = [[] for _ in range(len(dummy_cols))]
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col))

        return concat(sparse_series, axis=1, copy=False)

    else:
        # take on axis=1 + transpose to ensure ndarray layout is column-major
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=1).T

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
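
A minimal sketch of the `drop_first` branch via the public API (output noted for illustration only):

import pandas as pd

s = pd.Series(list('abca'))
# drop_first removes the first level to avoid perfect collinearity (GH12042)
print(pd.get_dummies(s, drop_first=True))  # columns: b, c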
Example #30
    def aggregate(self, func_or_funcs, *args, **kwargs):
        """
        Apply an aggregation function, or a list/dict of functions, to each
        group. Usually yields a Series, but can yield a DataFrame depending
        on the output of the aggregation function

        Parameters
        ----------
        func_or_funcs : function or list / dict of functions
            List/dict of functions will produce DataFrame with column names
            determined by the function names themselves (list) or the keys in
            the dict

        Notes
        -----
        agg is an alias for aggregate. Use it.

        Example
        -------
        >>> series
        bar    1.0
        baz    2.0
        qot    3.0
        qux    4.0

        >>> mapper = lambda x: x[0] # first letter
        >>> grouped = series.groupby(mapper)

        >>> grouped.aggregate(np.sum)
        b    3.0
        q    7.0

        >>> grouped.aggregate([np.sum, np.mean, np.std])
           mean  std  sum
        b  1.5   0.5  3
        q  3.5   0.5  7

        >>> grouped.agg({'result' : lambda x: x.mean() / x.std(),
        ...              'total' : np.sum})
           result  total
        b  2.121   3
        q  4.95    7

        See also
        --------
        apply, transform

        Returns
        -------
        Series or DataFrame
        """
        if isinstance(func_or_funcs, str):
            return getattr(self, func_or_funcs)(*args, **kwargs)

        if hasattr(func_or_funcs, '__iter__'):
            ret = self._aggregate_multiple_funcs(func_or_funcs)
        else:
            if len(self.groupings) > 1:
                return self._python_agg_general(func_or_funcs, *args, **kwargs)

            try:
                return self._python_agg_general(func_or_funcs, *args, **kwargs)
            except Exception:
                result = self._aggregate_named(func_or_funcs, *args, **kwargs)

            index = Index(sorted(result), name=self.groupings[0].name)
            ret = Series(result, index=index)

        if not self.as_index:  # pragma: no cover
            print('Warning, ignoring as_index=False')

        return ret
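
A hedged sketch of the aggregate/agg behavior documented above, in modern pandas spelling (dict-based renaming on a SeriesGroupBy has since been removed, so the list form is used instead):

import pandas as pd

series = pd.Series([1.0, 2.0, 3.0, 4.0], index=['bar', 'baz', 'qot', 'qux'])
grouped = series.groupby(lambda x: x[0])  # group by first letter of the index label

print(grouped.agg('sum'))            # one scalar per group -> Series
print(grouped.agg(['sum', 'mean']))  # list of funcs -> DataFrame, one column per func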