Example #1
    def from_pandas(cls,
            value: 'pandas.Index',
            ) -> 'IndexBase':
        '''
        Given a Pandas index, return an instance of the appropriate IndexBase-derived class.
        '''
        import pandas
        from static_frame.core.index_datetime import IndexDatetime
        from static_frame import Index
        from static_frame import IndexGO
        from static_frame import IndexHierarchy
        from static_frame import IndexHierarchyGO

        if isinstance(value, pandas.MultiIndex):
            # iterating over a hierarchical index will iterate over labels
            name = tuple(value.names)
            if not cls.STATIC:
                return IndexHierarchyGO.from_labels(value, name=name)
            return IndexHierarchy.from_labels(value, name=name)
        elif isinstance(value, pandas.DatetimeIndex):
            # coming from a Pandas datetime index, in the absence of other information, the best match is a Nanosecond index
            if not issubclass(cls, IndexDatetime):
                raise ErrorInitIndex(f'cannot create a datetime Index from {cls}')
            # the static and grow-only cases are identical here: construct directly from cls
            return cls(value, name=value.name)

        if not cls.STATIC:
            return IndexGO(value, name=value.name)
        return Index(value, name=value.name)
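
A brief usage sketch for the version above, assuming the public pandas and static_frame APIs; the labels here are illustrative only.

    import pandas as pd
    import static_frame as sf

    # a flat Pandas Index becomes a static_frame Index (or IndexGO for a grow-only class)
    idx = sf.Index.from_pandas(pd.Index(['a', 'b', 'c']))

    # a MultiIndex becomes an IndexHierarchy, preserving the level names
    mi = pd.MultiIndex.from_product([('x', 'y'), (1, 2)], names=('outer', 'inner'))
    ih = sf.Index.from_pandas(mi)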
Example #2
    def _extract_labels(mapping: tp.Optional[tp.Dict[tp.Hashable, int]],
                        labels: tp.Iterable[tp.Hashable],
                        dtype: tp.Optional[np.dtype] = None) -> np.ndarray:
        '''Derive labels, a cache of the mapping keys in a sequence type (either an ndarray or a list).

        If the labels passed at instantiation are an ndarray, they are used after immutable filtering. Otherwise, the mapping keys are used to create an ndarray.

        This method is overridden in the derived class.

        Args:
            mapping: Can be None if loc_is_iloc.
            labels: might be an expired Generator, but if it is an immutable ndarray, we can use it without a copy.
        '''
        # pre-fetching labels for faster get_item construction
        if isinstance(labels, np.ndarray):
            if dtype is not None and dtype != labels.dtype:
                raise ErrorInitIndex('invalid label dtype for this Index')
            return immutable_filter(labels)

        if hasattr(labels, '__len__'):  # not a generator, not an array
            # resolving the dtype is expensive, pass if possible
            if len(labels) == 0:  #type: ignore
                labels = EMPTY_ARRAY
            else:
                labels, _ = iterable_to_array_1d(labels, dtype=dtype)
        else:  # labels may be an expired generator, must use the mapping
            if len(mapping) == 0:  #type: ignore
                labels = EMPTY_ARRAY
            else:
                labels, _ = iterable_to_array_1d(mapping,
                                                 dtype=dtype)  #type: ignore
        # all arrays are immutable
        # assert labels.flags.writeable == False
        return labels
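
The method above dispatches on three input shapes: an ndarray (reused after immutable filtering), a sized iterable, and a possibly exhausted generator (in which case the mapping keys are used). The standalone numpy sketch below restates that dispatch with illustrative names; it is not the library code.

    import numpy as np

    def extract_labels_sketch(mapping, labels, dtype=None):
        # ndarray path: reuse the array, made read-only (cf. immutable_filter)
        if isinstance(labels, np.ndarray):
            if labels.flags.writeable:
                labels = labels.copy()
                labels.flags.writeable = False
            return labels
        if hasattr(labels, '__len__'):  # sized-iterable path: list, tuple, etc.
            array = np.array(labels, dtype=dtype)
        else:  # generator path: it may be exhausted, so rebuild from the mapping keys
            array = np.array(list(mapping), dtype=dtype)
        array.flags.writeable = False
        return array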
Example #3
    def from_pandas(
        cls,
        value: 'pandas.Index',
    ) -> 'IndexBase':
        '''
        Given a Pandas index, return an instance of the appropriate IndexBase-derived class.
        '''
        import pandas
        if not isinstance(value, pandas.Index):
            raise ErrorInitIndex(
                f'from_pandas must be called with a Pandas Index object, not: {type(value)}'
            )

        from static_frame import Index
        from static_frame import IndexGO
        from static_frame import IndexHierarchy
        from static_frame import IndexHierarchyGO
        from static_frame import IndexNanosecond
        from static_frame import IndexNanosecondGO
        from static_frame.core.index_datetime import IndexDatetime

        if isinstance(value, pandas.MultiIndex):
            # iterating over a hierarchical index will iterate over labels
            name: tp.Optional[tp.Tuple[tp.Hashable, ...]] = tuple(value.names)
            # if not assigned, Pandas returns None for all components, which would cause an issue when trying to unset this index.
            if all(n is None for n in name):  #type: ignore
                name = None
            depth = value.nlevels

            if not cls.STATIC:
                return IndexHierarchyGO.from_labels(value,
                                                    name=name,
                                                    depth_reference=depth)
            return IndexHierarchy.from_labels(value,
                                              name=name,
                                              depth_reference=depth)
        elif isinstance(value, pandas.DatetimeIndex):
            # if IndexDatetime, use cls, else use IndexNanosecond
            if issubclass(cls, IndexDatetime):
                return cls(value, name=value.name)
            else:
                if not cls.STATIC:
                    return IndexNanosecondGO(value, name=value.name)
                return IndexNanosecond(value, name=value.name)

        if not cls.STATIC:
            return IndexGO(value, name=value.name)
        return Index(value, name=value.name)
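
A sketch of the datetime dispatch introduced in this version, assuming the static_frame names imported above plus sf.IndexDate (an IndexDatetime subclass); the dates are illustrative.

    import pandas as pd
    import static_frame as sf

    dti = pd.date_range('2021-01-01', periods=3)

    # called on the base Index class, a DatetimeIndex maps to IndexNanosecond
    idx_ns = sf.Index.from_pandas(dti)

    # called on an IndexDatetime subclass, that subclass is used directly
    idx_date = sf.IndexDate.from_pandas(dti)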
Example #4
    def from_pandas(cls,
            value: 'pandas.Index',
            ) -> 'IndexBase':
        '''
        Given a Pandas index, return an instance of the appropriate IndexBase-derived class.
        '''
        import pandas
        if not isinstance(value, pandas.Index):
            raise ErrorInitIndex(f'from_pandas must be called with a Pandas Index object, not: {type(value)}')

        from static_frame import Index
        from static_frame import IndexGO
        from static_frame import IndexHierarchy
        from static_frame import IndexHierarchyGO
        from static_frame import IndexNanosecond
        from static_frame import IndexNanosecondGO
        from static_frame.core.index_datetime import IndexDatetime

        if isinstance(value, pandas.MultiIndex):
            if value.has_duplicates:
                raise ErrorInitIndex(f'cannot create IndexHierarchy from a MultiIndex with duplicates: {value}')

            # iterating over a hierarchical index will iterate over labels
            name: tp.Optional[tp.Tuple[tp.Hashable, ...]] = tuple(value.names)

            # if not assigned, Pandas returns None for all components, which would cause an issue when trying to unset this index.
            if all(n is None for n in name): #type: ignore
                name = None

            hierarchy_constructor = IndexHierarchy if cls.STATIC else IndexHierarchyGO

            def build_index(pd_idx: pandas.Index) -> Index:
                # NOTE: Newer versions of pandas will not require Python date objects to live inside
                # a DatetimeIndex. Instead, it will be a regular Index with dtype=object.
                # Only numpy datetime objects are put into a DatetimeIndex.
                if isinstance(pd_idx, pandas.DatetimeIndex):
                    constructor: tp.Type[Index] = IndexNanosecond
                else:
                    constructor = Index

                if cls.STATIC:
                    return constructor(pd_idx, name=pd_idx.name)
                return tp.cast(Index, constructor._MUTABLE_CONSTRUCTOR(pd_idx))

            indices: tp.List[Index] = []
            indexers: np.ndarray = np.empty((value.nlevels, len(value)), dtype=DTYPE_INT_DEFAULT)

            for i, (levels, codes) in enumerate(zip(value.levels, value.codes)):
                indexers[i] = codes
                indices.append(build_index(levels))

            indexers.flags.writeable = False

            return hierarchy_constructor(
                    indices=indices,
                    indexers=indexers,
                    name=name,
                    )

        elif isinstance(value, pandas.DatetimeIndex):
            # if IndexDatetime, use cls, else use IndexNanosecond
            if issubclass(cls, IndexDatetime):
                return cls(value, name=value.name)
            else:
                if not cls.STATIC:
                    return IndexNanosecondGO(value, name=value.name)
                return IndexNanosecond(value, name=value.name)

        if not cls.STATIC:
            return IndexGO(value, name=value.name)
        return Index(value, name=value.name)
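
This version builds the hierarchy from the MultiIndex internals rather than from its labels; the pandas-only sketch below shows the levels/codes decomposition that the loop above consumes.

    import numpy as np
    import pandas as pd

    mi = pd.MultiIndex.from_product([('x', 'y'), (1, 2)], names=('outer', 'inner'))

    # each level holds the unique labels at that depth; codes give per-row positions into it
    for level, codes in zip(mi.levels, mi.codes):
        print(level.tolist(), np.asarray(codes))
    # ['x', 'y'] [0 0 1 1]
    # [1, 2] [0 1 0 1]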
Example #5
    def from_labels(
        cls: tp.Type[IH],
        labels: tp.Iterable[tp.Sequence[tp.Hashable]],
        *,
        name: tp.Hashable = None,
        index_constructors: tp.Optional[IndexConstructors] = None,
        continuation_token: tp.Union[tp.Hashable,
                                     None] = CONTINUATION_TOKEN_INACTIVE
    ) -> IH:
        '''
        Construct an ``IndexHierarchy`` from an iterable of labels, where each label is a tuple defining the component labels for all hierarchies.

        Args:
            labels: an iterator or generator of tuples.
            continuation_token: a Hashable that will be used as a token to identify when a value in a label should use the previously encountered value at the same depth.

        Returns:
            :obj:`static_frame.IndexHierarchy`
        '''
        labels_iter = iter(labels)
        try:
            first = next(labels_iter)
        except StopIteration:
            # if iterable is empty, return empty index
            return cls(levels=cls._LEVEL_CONSTRUCTOR(cls._INDEX_CONSTRUCTOR(
                ())),
                       name=name)

        depth = len(first)
        # minimum permitted depth is 2
        if depth < 2:
            raise ErrorInitIndex(
                'cannot create an IndexHierarchy from only one level.')
        if index_constructors and len(index_constructors) != depth:
            raise ErrorInitIndex(
                'if providing index constructors, number of index constructors must equal depth of IndexHierarchy.'
            )

        depth_max = depth - 1
        depth_pre_max = depth - 2

        token = object()
        observed_last = [token for _ in range(depth)]

        tree = dict()  # order assumed and necessary
        # put first back in front
        for label in chain((first, ), labels_iter):
            current = tree  # NOTE: over the life of this loop, current can be a dict or a list
            # each label is an iterable
            for d, v in enumerate(label):
                # print('d', d, 'v', v, 'depth_pre_max', depth_pre_max, 'depth_max', depth_max)
                if continuation_token is not CONTINUATION_TOKEN_INACTIVE:
                    if v == continuation_token:
                        # might check that observed_last[d] != token
                        v = observed_last[d]
                if d < depth_pre_max:
                    if v not in current:
                        current[v] = dict()  # order necessary
                    else:
                        # can only fetch this node (and not create a new node) if this is the sequential predecessor
                        if v != observed_last[d]:
                            raise ErrorInitIndex(
                                'invalid tree-form for IndexHierarchy: {} in {} cannot follow {} when {} has already been defined'
                                .format(v, label, observed_last[d], v))
                    current = current[v]
                    observed_last[d] = v
                elif d < depth_max:
                    if v not in current:
                        current[v] = list()
                    else:
                        # cannot just fetch this list if it is not the predecessor
                        if v != observed_last[d]:
                            raise ErrorInitIndex(
                                'invalid tree-form for IndexHierarchy: {} in {} cannot follow {} when {} has already been defined.'
                                .format(v, label, observed_last[d], v))
                    current = current[v]
                    observed_last[d] = v
                elif d == depth_max:  # at depth max
                    # if there are redundancies here they will be caught in index creation
                    current.append(v)
                else:
                    raise ErrorInitIndex('label exceeded expected depth',
                                         label)

        return cls(levels=cls._tree_to_index_level(
            tree, index_constructors=index_constructors),
                   name=name)
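
A brief usage sketch for the classmethod above, relying on the continuation_token behavior described in its docstring; the labels are illustrative.

    import static_frame as sf

    # full labels: each tuple supplies a value for every depth
    ih = sf.IndexHierarchy.from_labels([('a', 1), ('a', 2), ('b', 1)])

    # with a continuation token, a repeated outer value may be elided
    ih2 = sf.IndexHierarchy.from_labels(
            [('a', 1), ('', 2), ('b', 1)],
            continuation_token='',
            )
    # ih and ih2 describe the same labels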
Example #6
    def __init__(self,
                 labels: IndexInitializer,
                 *,
                 loc_is_iloc: bool = False,
                 name: tp.Hashable = None,
                 dtype: DtypeSpecifier = None) -> None:

        self._recache: bool = False
        self._map: tp.Optional[tp.Dict[tp.Hashable, int]] = None

        positions = None

        # resolve the targeted labels dtype by looking at the class attr _DTYPE and/or the passed dtype argument
        if dtype is None:
            dtype_extract = self._DTYPE  # set in some specialized Index classes
        else:  # passed dtype is not None
            if self._DTYPE is not None and dtype != self._DTYPE:
                raise ErrorInitIndex('invalid dtype argument for this Index',
                                     dtype, self._DTYPE)
            # self._DTYPE is None (or equal to the passed dtype); use the passed dtype
            dtype_extract = dtype

        # handle all Index subclasses
        # check isinstance(labels, IndexBase)
        if issubclass(labels.__class__, IndexBase):
            if labels._recache:
                labels._update_array_cache()
            if name is None and labels.name is not None:
                name = labels.name  # immutable, so no copy necessary
            if labels.depth == 1:  # not an IndexHierarchy
                if labels.STATIC:  # can take the map
                    self._map = labels._map
                # get a reference to the immutable arrays, even if this is an IndexGO index, we can take the cached arrays, assuming they are up to date
                positions = labels._positions
                loc_is_iloc = labels._loc_is_iloc
                labels = labels._labels
            else:  # IndexHierarchy
                # will be a generator of tuples; already updated caches
                labels = array2d_to_tuples(labels._labels)
        elif isinstance(labels, ContainerOperand):
            # it is a Series or similar
            array = labels.values
            if array.ndim == 1:
                labels = array
            else:
                labels = array2d_to_tuples(array)
        # else: assume an iterable suitable for labels usage

        if self._DTYPE is not None:
            # no need to check arrays here, as they will be checked against dtype_extract in _extract_labels
            if not isinstance(labels, np.ndarray):
                # for now, assume that if _DTYPE is defined, we have a date
                labels = (to_datetime64(v, dtype_extract) for v in labels)
            else:  # coerce to target type
                labels = labels.astype(dtype_extract)

        self._name = name if name is None else name_filter(name)

        if self._map is None:
            self._map = self._get_map(labels, positions)

        # this might be an NP array or a list, depending on whether this is static or grow-only; if an array, its dtype will be compared with the passed dtype_extract
        self._labels = self._extract_labels(self._map, labels, dtype_extract)
        self._positions = self._extract_positions(self._map, positions)

        if self._DTYPE and self._labels.dtype != self._DTYPE:
            raise ErrorInitIndex('invalid label dtype for this Index',
                                 self._labels.dtype, self._DTYPE)
        if len(self._map) != len(self._labels):
            raise ErrorInitIndex(
                f'labels ({len(self._labels)}) have non-unique values ({len(self._map)})'
            )

        # NOTE: automatic discovery is possible, but not yet implemented
        self._loc_is_iloc = loc_is_iloc
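
A small usage sketch of the constructor paths above (another Index, a Series-like container, or a plain iterable), assuming the static_frame public API; the values are illustrative.

    import static_frame as sf

    idx = sf.Index(('a', 'b', 'c'), name='letters')

    # constructing from another static Index can reuse its internal map and arrays
    idx2 = sf.Index(idx)

    # constructing from a Series takes its values (not its index) as labels
    s = sf.Series((10, 20), index=('x', 'y'))
    idx3 = sf.Index(s)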
Example #7
    def __init__(self,
                 labels: IndexInitializer,
                 *,
                 loc_is_iloc: bool = False,
                 name: NameType = NAME_DEFAULT,
                 dtype: DtypeSpecifier = None) -> None:

        self._recache: bool = False
        self._map: tp.Optional[FrozenAutoMap] = None

        positions = None
        is_typed = self._DTYPE is not None

        # resolve the targeted labels dtype by looking at the class attr _DTYPE and/or the passed dtype argument
        if dtype is None:
            dtype_extract = self._DTYPE  # set in some specialized Index classes
        else:  # passed dtype is not None
            if is_typed and dtype != self._DTYPE:
                # NOTE: should never get to this branch, as derived Index classes that set _DTYPE remove dtype from __init__
                raise ErrorInitIndex('invalid dtype argument for this Index',
                                     dtype, self._DTYPE)  #pragma: no cover
            # self._DTYPE is None (or equal to the passed dtype); use the passed dtype
            dtype_extract = dtype

        #-----------------------------------------------------------------------
        # handle all Index subclasses
        if isinstance(labels, IndexBase):
            if labels._recache:
                labels._update_array_cache()
            if name is NAME_DEFAULT:
                name = labels.name  # immutable, so no copy necessary
            if isinstance(labels, Index):  # not an IndexHierarchy
                if (labels.STATIC and self.STATIC and dtype is None):
                    if not is_typed or (is_typed
                                        and self._DTYPE == labels.dtype):
                        # can take the map if static and if types in the dict are the same as those in the labels (or to become the labels after conversion)
                        self._map = labels._map
                # get a reference to the immutable arrays, even if this is an IndexGO index, we can take the cached arrays, assuming they are up to date; for datetime64 indices, we might need to translate to a different type
                positions = labels._positions
                loc_is_iloc = labels._map is None
                labels = labels._labels
            else:  # IndexHierarchy
                # will be a generator of tuples; already updated caches
                labels = array2d_to_tuples(labels.__iter__())
        elif isinstance(labels, ContainerOperand):
            # it is a Series or similar
            array = labels.values
            if array.ndim == 1:
                labels = array
            else:
                labels = array2d_to_tuples(array)
        # else: assume an iterable suitable for labels usage

        #-----------------------------------------------------------------------
        if is_typed:
            # no need to check arrays here, as they will be checked against dtype_extract in _extract_labels
            if not isinstance(labels, np.ndarray):
                # for now, assume that if _DTYPE is defined, we have a date
                labels = (to_datetime64(v, dtype_extract) for v in labels)
            # coerce to target type
            elif labels.dtype != dtype_extract:
                labels = labels.astype(dtype_extract)
                labels.flags.writeable = False  #type: ignore

        self._name = None if name is NAME_DEFAULT else name_filter(name)

        if self._map is None:  # if _map not shared from another Index
            if not loc_is_iloc:
                try:
                    self._map = FrozenAutoMap(
                        labels) if self.STATIC else AutoMap(labels)
                except ValueError:  # AutoMap will raise ValueError if non-unique values are encountered
                    pass
                if self._map is None:
                    raise ErrorInitIndex(
                        f'labels ({len(tuple(labels))}) have non-unique values ({len(set(labels))})'
                    )
                size = len(self._map)
            else:  # must assume labels are unique
                # labels must not be a generator, but we assume that internal clients that provided loc_is_iloc will not give a generator
                size = len(labels)  #type: ignore
                if positions is None:
                    positions = PositionsAllocator.get(size)
        else:  # map shared from another Index
            size = len(self._map)

        # this might be an NP array or a list, depending on whether this is static or grow-only; if an array, its dtype will be compared with the passed dtype_extract
        self._labels = self._extract_labels(self._map, labels, dtype_extract)
        self._positions = self._extract_positions(size, positions)

        if self._DTYPE and self._labels.dtype != self._DTYPE:
            raise ErrorInitIndex(
                'invalid label dtype for this Index',  #pragma: no cover
                self._labels.dtype,
                self._DTYPE)
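
A sketch of the two validation paths in this version, assuming ErrorInitIndex is importable from static_frame.core.exception: duplicate labels fail when the AutoMap is built, while loc_is_iloc skips the map and assumes unique, contiguous integer labels.

    import static_frame as sf
    from static_frame.core.exception import ErrorInitIndex

    # duplicate labels fail when the AutoMap is constructed
    try:
        sf.Index(('a', 'a'))
    except ErrorInitIndex as e:
        print(e)  # labels (2) have non-unique values (1)

    # loc_is_iloc bypasses the map; labels are assumed to be unique contiguous integers
    idx = sf.Index(range(4), loc_is_iloc=True)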