Exemplo n.º 1
0
    def test_get_loc_raises_bad_label(self, method):
        """A list passed as the key to ``get_loc`` must raise ``TypeError``.

        The expected message differs depending on whether a fill ``method``
        was supplied.
        """
        idx = Index([0, 1, 2])
        expected_msg = "not supported between" if method else "invalid key"

        with pytest.raises(TypeError, match=expected_msg):
            idx.get_loc([1, 2], method=method)
Exemplo n.º 2
0
    def test_get_loc(self, method):
        """``get_loc`` finds an exact label, warning only when ``method`` is given."""
        idx = Index([0, 1, 2])
        # Passing any fill method is expected to emit a deprecation warning.
        expected_warning = FutureWarning if method is not None else None

        with tm.assert_produces_warning(expected_warning, match="deprecated"):
            assert idx.get_loc(1, method=method) == 1

        if not method:
            return

        # A tolerance is only exercised together with a fill method.
        with tm.assert_produces_warning(expected_warning, match="deprecated"):
            assert idx.get_loc(1, method=method, tolerance=0) == 1
Exemplo n.º 3
0
    def test_get_loc_overflows(self):
        """A key one past the int64 maximum raises ``KeyError`` naming the key."""
        # unique but non-monotonic goes through IndexEngine.mapping.get_item
        idx = Index([0, 2, 1])

        too_big = np.iinfo(np.int64).max + 1
        pattern = str(too_big)

        # Both the public lookup and the underlying engine must report the key.
        with pytest.raises(KeyError, match=pattern):
            idx.get_loc(too_big)
        with pytest.raises(KeyError, match=pattern):
            idx._engine.get_loc(too_big)
Exemplo n.º 4
0
def test_get_loc_duplicates():
    """Check ``get_loc`` results on indexes that contain repeated labels."""
    # Every entry equals the key: the whole range comes back as a slice.
    all_twos = Index([2, 2, 2, 2])
    assert all_twos.get_loc(2) == slice(0, 4)

    # A label that occurs once among duplicates yields its integer position.
    letters = Index(["c", "a", "a", "b", "b"])
    assert letters.get_loc("c") == 0
Exemplo n.º 5
0
def test_get_loc_duplicates():
    """Check ``get_loc`` results on indexes that contain repeated labels."""
    # Every entry equals the key: the whole range comes back as a slice.
    all_twos = Index([2, 2, 2, 2])
    assert all_twos.get_loc(2) == slice(0, 4)

    # A label that occurs once among duplicates yields its integer position.
    letters = Index(['c', 'a', 'a', 'b', 'b'])
    assert letters.get_loc('c') == 0
def test_get_loc_duplicates():
    """Check ``get_loc`` results on indexes that contain repeated labels."""
    # Every entry equals the key: the whole range comes back as a slice.
    all_twos = Index([2, 2, 2, 2])
    assert all_twos.get_loc(2) == slice(0, 4)

    # A label that occurs once among duplicates yields its integer position.
    letters = Index(['c', 'a', 'a', 'b', 'b'])
    assert letters.get_loc('c') == 0
Exemplo n.º 7
0
    def test_get_loc_raises_bad_label(self, method):
        """A list key is rejected by ``get_loc``.

        With a fill ``method`` the failure is a ``TypeError``; without one it
        is an ``InvalidIndexError`` whose message contains the offending list.
        """
        idx = Index([0, 1, 2])
        if method:
            expected_exc, expected_msg = TypeError, "not supported between"
        else:
            expected_exc, expected_msg = InvalidIndexError, r"\[1, 2\]"

        with pytest.raises(expected_exc, match=expected_msg):
            idx.get_loc([1, 2], method=method)
Exemplo n.º 8
0
    def test_get_loc_duplicates(self):
        """Check ``get_loc`` results on indexes that contain repeated labels."""
        # Every entry equals the key: the whole range comes back as a slice.
        all_twos = Index([2, 2, 2, 2])
        assert all_twos.get_loc(2) == slice(0, 4)

        # A label that occurs once among duplicates yields its integer position.
        letters = Index(["c", "a", "a", "b", "b"])
        assert letters.get_loc("c") == 0
Exemplo n.º 9
0
    def test_get_loc_duplicates(self):
        """Check ``get_loc`` on duplicated labels and on a label that is absent."""
        # Every entry equals the key: the whole range comes back as a slice.
        all_twos = Index([2, 2, 2, 2])
        assert all_twos.get_loc(2) == slice(0, 4)

        # A label that occurs once among duplicates yields its integer position.
        letters = Index(["c", "a", "a", "b", "b"])
        assert letters.get_loc("c") == 0

        # The integer 2 is not one of the string labels.
        with pytest.raises(KeyError):
            letters.get_loc(2)
Exemplo n.º 10
0
def _get_end(y_index: pd.Index, fh: ForecastingHorizon) -> int:
    """Return the integer position of the last point of the training window.

    For a series ``y`` indexed by ``y_index``, ``y_index[end]`` is the label
    of the last training observation and ``y.iloc[end]`` (equivalently
    ``y.loc[y_index[end]]``) is that observation.

    Parameters
    ----------
    y_index : pd.Index
        Index of the time series.
    fh : ForecastingHorizon
        Forecasting horizon, holding either integer or timedelta steps.

    Returns
    -------
    end : int
        0-indexed integer end of the training window.
    """
    # `fh` is assumed to be ordered and checked by `_check_fh` and `window_length` by
    # `check_window_length`.
    n_timepoints = y_index.shape[0]
    assert isinstance(y_index, pd.Index)

    # A purely in-sample horizon needs no room after the training data, so the
    # offset is zero; otherwise the furthest horizon step must still fall
    # inside the data.
    if array_is_int(fh):
        steps_back = 0 if fh.is_all_in_sample() else fh[-1]
        return n_timepoints - steps_back - 1

    time_back = pd.Timedelta(0) if fh.is_all_in_sample() else fh[-1]
    return y_index.get_loc(y_index[-1] - time_back)
Exemplo n.º 11
0
 def StudentDetailInfo(self):
     """Open a window showing the details of the student with the entered seat number.

     A warning dialog is shown instead when no file is loaded (``self.file``
     is empty) or the seat number is not in ``self.SeatData``.
     """
     if self.file == '' or self.SeatNum.get().upper() not in self.SeatData:
         messagebox.showwarning('Error 404', 'File not found')
         return

     seat = self.SeatNum.get().upper()
     sheet = self.data
     row = Index(self.SeatData).get_loc(seat)

     # NOTE(review): the +1 / +2 row offsets presumably follow the layout of
     # the loaded sheet (several rows per student) - confirm against the file.
     marks = array(sheet['Total [ 20 ]'])[row + 2]
     student_name = array(sheet['Name'])[row]
     grade_column = array(sheet['Grade'])
     grade = grade_column[row + 2]
     remark = grade_column[row + 1].replace('..', 'l')
     remark_color = ['green' if remark == 'Successful' else 'red']
     cgpa = grade_column[row]

     window = Toplevel()
     window.geometry('700x350')

     # (label text, grid options) for the themed labels, top to bottom.
     themed_rows = [
         ('Seat  No. : {}'.format(seat),
          dict(padx=20, pady=20, sticky='W')),
         ('Name : {}'.format(student_name),
          dict(row=1, column=0, padx=20, sticky='W')),
         ('Marks : {}'.format(marks),
          dict(row=2, column=0, padx=20, pady=20, sticky='W')),
         ('Grade : {}'.format(grade),
          dict(row=3, column=0, padx=20, sticky='W')),
         ('CGPA : {}'.format(cgpa),
          dict(row=4, column=0, padx=20, pady=20, sticky='W')),
     ]
     for text, grid_options in themed_rows:
         ttk.Label(master=window, text=text).grid(**grid_options)

     # The remark uses a plain Label so its foreground colour can be set.
     remark_label = Label(master=window,
                          text='Remark : {}'.format(remark),
                          fg=remark_color)
     remark_label.grid(row=5, column=0, padx=20, sticky='W')
Exemplo n.º 12
0
    def test_get_loc_nan_object_dtype_nonmonotonic_nonunique(self):
        """NA-like keys in an object index only match entries of the same kind."""
        # case that goes through _maybe_get_bool_indexer
        idx = Index(["foo", np.nan, None, "foo", 1.0, None], dtype=object)

        # np.nan is found at its single position instead of raising KeyError
        assert idx.get_loc(np.nan) == 1

        # None matches the None entries only, never the np.nan entry
        none_mask = idx.get_loc(None)
        tm.assert_numpy_array_equal(
            none_mask, np.array([False, False, True, False, False, True])
        )

        # an NA value of a kind not present in the index is a missing key
        with pytest.raises(KeyError, match="NaT"):
            idx.get_loc(NaT)
Exemplo n.º 13
0
def _normalize_index(
    indexer: Union[slice, np.integer, int, str,
                   Sequence[Union[int, np.integer]], np.ndarray, pd.Index, ],
    index: pd.Index,
) -> Union[slice, int, np.ndarray]:  # ndarray of int
    if not isinstance(index, pd.RangeIndex):
        assert (
            index.dtype != float and index.dtype != int
        ), "Don’t call _normalize_index with non-categorical/string names"

    # the following is insanely slow for sequences,
    # we replaced it using pandas below
    def name_idx(i):
        if isinstance(i, str):
            i = index.get_loc(i)
        return i

    if isinstance(indexer, slice):
        start = name_idx(indexer.start)
        stop = name_idx(indexer.stop)
        # string slices can only be inclusive, so +1 in that case
        if isinstance(indexer.stop, str):
            stop = None if stop is None else stop + 1
        step = indexer.step
        return slice(start, stop, step)
    elif isinstance(indexer, (np.integer, int)):
        return indexer
    elif isinstance(indexer, str):
        return index.get_loc(indexer)  # int
    elif isinstance(indexer,
                    (Sequence, np.ndarray, pd.Index, spmatrix, np.matrix)):
        if hasattr(indexer,
                   "shape") and ((indexer.shape == (index.shape[0], 1)) or
                                 (indexer.shape == (1, index.shape[0]))):
            if isinstance(indexer, spmatrix):
                indexer = indexer.toarray()
            indexer = np.ravel(indexer)
        if not isinstance(indexer, (np.ndarray, pd.Index)):
            indexer = np.array(indexer)
        if issubclass(indexer.dtype.type, (np.integer, np.floating)):
            return indexer  # Might not work for range indexes
        elif issubclass(indexer.dtype.type, np.bool_):
            if indexer.shape != index.shape:
                raise IndexError(
                    f"Boolean index does not match AnnData’s shape along this "
                    f"dimension. Boolean index has shape {indexer.shape} while "
                    f"AnnData index has shape {index.shape}.")
            positions = np.where(indexer)[0]
            return positions  # np.ndarray[int]
        else:  # indexer should be string array
            positions = index.get_indexer(indexer)
            if np.any(positions < 0):
                not_found = indexer[positions < 0]
                raise KeyError(
                    f"Values {list(not_found)}, from {list(indexer)}, "
                    "are not valid obs/ var names or indices.")
            return positions  # np.ndarray[int]
    else:
        raise IndexError(
            f"Unknown indexer {indexer!r} of type {type(indexer)}")
Exemplo n.º 14
0
 def all_sem_performance(self):
     """Plot the selected student's marks next to the class maximum and average.

     Requires a loaded file (``self.file`` non-empty) and the entered seat
     number to be present in ``self.SeatData``; otherwise a warning dialog is
     shown and nothing is plotted.  When ``self.cb[1]`` is set, the figure is
     also saved as a PDF inside the ``Saved Images`` folder.
     """
     if self.file != '' and self.SeatNum.get().upper() in self.SeatData:
         data = self.data
         col = data.columns
         # Columns 3..9 are used as subjects; the last six characters of
         # each column name are dropped to form the axis label.
         Subjects = [''.join(list(c)[:-6]) for c in col][3:10]
         SubjectData = []
         Student = []
         # The lookup index does not change inside the loop, so build it once.
         idx = Index(self.SeatData)
         # NOTE(review): the stride of 6 and the "+ 2" row offset presumably
         # mirror the sheet layout (six rows per student) - confirm.
         for i in range(0, len(self.SeatData), 6):
             locData = idx.get_loc(self.SeatData[i])
             StudentData = data.loc[locData + 2, :]
             OneData = array(StudentData[3:10]).astype(int)
             if self.SeatData[i] == self.SeatNum.get().upper():
                 Student = OneData
             SubjectData.append(OneData)
         # One row per subject, one column per student.
         SubjectData = transpose(SubjectData)
         MaxOfAll = [max(i) for i in SubjectData]
         AvgOfAll = [average(i) for i in SubjectData]
         fig = figure()
         ax = subplot(111)
         title('Your Performance vs Class')
         xlabel('Subjects')
         ylabel('Marks Range')
         # Three bars per subject, shifted by a quarter unit each.
         ax.bar(arange(7) + 0.00, MaxOfAll, color='#004c6d', width=0.25)
         ax.bar(arange(7) + 0.25, AvgOfAll, color='#286d8a', width=0.25)
         ax.bar(arange(7) + 0.50, Student, color='#008cc9', width=0.25)
         xticks(arange(7), Subjects)
         ax.legend(labels=['Max', 'Average', self.SeatNum.get().upper()])
         if self.cb[1].get():
             # Raw string: "\Y" is not a valid escape sequence, so the plain
             # literal triggers a warning (and is slated to become an error).
             # The resulting path is unchanged.
             fig.savefig(r'Saved Images\Your Performance VS Class.pdf')
         fig.show()
     else:
         messagebox.showwarning('Error 404', 'File not found')
Exemplo n.º 15
0
    def test_get_loc(self):
        """``CategoricalIndex.get_loc`` must agree with a plain ``Index``.

        Covers unique labels, non-unique labels (boolean mask result),
        contiguous duplicates (slice result) and missing labels.
        """
        # GH 12531

        def assert_missing(indexes, key, pattern):
            # Every index must report the absent key in its KeyError.
            for candidate in indexes:
                with pytest.raises(KeyError, match=pattern):
                    candidate.get_loc(key)

        # unique
        unique_cat = CategoricalIndex(list("abcde"), categories=list("edabc"))
        unique_plain = Index(list("abcde"))
        for label in ("a", "e"):
            assert unique_cat.get_loc(label) == unique_plain.get_loc(label)

        assert_missing([unique_cat, unique_plain], "NOT-EXIST", "'NOT-EXIST'")

        # non-unique
        dup_cat = CategoricalIndex(list("aacded"), categories=list("edabc"))
        dup_plain = Index(list("aacded"))

        # results in bool array
        mask = dup_cat.get_loc("d")
        tm.assert_numpy_array_equal(mask, dup_plain.get_loc("d"))
        tm.assert_numpy_array_equal(
            mask, np.array([False, False, False, True, False, True])
        )
        # unique element results in scalar
        position = dup_cat.get_loc("e")
        assert position == dup_plain.get_loc("e")
        assert position == 4

        assert_missing([dup_cat, dup_plain], "NOT-EXIST", "'NOT-EXIST'")

        # non-unique, sliceable
        run_cat = CategoricalIndex(list("aabbb"), categories=list("abc"))
        run_plain = Index(list("aabbb"))

        # results in slice
        for label, expected in (("a", slice(0, 2, None)), ("b", slice(2, 5, None))):
            found = run_cat.get_loc(label)
            assert found == run_plain.get_loc(label)
            assert found == expected

        # "c" is a declared category but never occurs as a value
        assert_missing([run_cat, run_plain], "c", "'c'")
Exemplo n.º 16
0
    def _insert_dict_val(key: tuple, choice_index: pd.Index, val, new_array):
        max_levels = choice_index.nlevels

        if max_levels == 1:
            # Dotted dict keys are not supported for multinomial models
            assert len(key) == 1

            loc = choice_index.get_loc(key[0])
            new_array[0, loc] = val
        else:
            '''
            Dotted dict literals require care to ensure explicit reindexing with nested logit models. For example,
            specifying the key "A.B" in a 3-level model (where "A.B.C" exists and is meaningful) there is need to
            disambiguate between the node with the unique ID of ('A', 'B', '.') (i.e. the parent node of "A.B.C"),
            versus applying a value to ALL children of that parent node. Pandas during reindexing will apply value
            to ALL nodes with the "A.B" pattern e.g. the parent and all of its children.

            To avoid this, some special naming conventions are enforced here (assuming max_level = 3):
                "A.B" refers to the parent node ONLY. The output new key is ('A', 'B', '.')
                "A.B._" refers to ALL children of parent node A.B. The output new key is ('A', 'B')

            The edge case "A._.B" (where an underscore occurs in the middle of a name) is technically meaningless,
            so the code below assumes that "A._" was meant instead
            '''
            new_key = []
            partial_key = False
            for sub_key in key:
                if sub_key == '_':
                    partial_key = True
                    break
                new_key.append(str(sub_key))

            delta = ['.'] * (max_levels - len(new_key))

            if partial_key:
                locs = choice_index.get_loc(tuple(new_key))
                parent_loc = choice_index.get_loc(tuple(new_key + delta))

                new_array[0, locs] = val
                new_array[0, parent_loc] = 0
            else:
                loc = choice_index.get_loc(tuple(new_key + delta))
                new_array[0, loc] = val
    def test_parse_combinations(self):
        """A "B > A" combination string is parsed into the abilities' positions."""
        ability_index = Index(Series(["Ability 1", "Ability 2", "Ability 3"]))

        print(ability_index.get_loc("Ability 1"))

        print(ability_index)

        parsed = SpreadsheetReader.parse_combination_string(
            ability_index, "Ability 2 > Ability 1")
        self.assertTrue(numpy.array_equal(parsed, [1, 0]))
Exemplo n.º 18
0
 def _splitter(index: pd.Index, *args, **kwargs):
     """Apply ``splitter`` to ``index``, group by group for a ``MultiIndex``.

     For a ``pd.MultiIndex`` the wrapped ``splitter`` is called once per group
     yielded by ``unique_level_rows(index)``; the first and second elements of
     the per-group results are concatenated separately with
     ``concat_indices`` and returned as a pair.  For any other index the
     call is forwarded unchanged and its result is returned.
     """
     if not isinstance(index, pd.MultiIndex):
         # The result must be returned: without it, callers passing a plain
         # index would receive None instead of the split.
         return splitter(index, *args, **kwargs)

     group_indexes = [
         splitter(index[index.get_loc(group)], *args, **kwargs)
         for group in unique_level_rows(index)
     ]
     first_parts = concat_indices([gi[0] for gi in group_indexes])
     second_parts = concat_indices([gi[1] for gi in group_indexes])
     return first_parts, second_parts
Exemplo n.º 19
0
 def _get_train_start(
     start, window_length: ACCEPTED_WINDOW_LENGTH_TYPES, y: pd.Index
 ) -> int:
     """Return the integer position where the training window begins.

     ``start`` is the integer split position, ``window_length`` is either an
     integer number of points or a timedelta / date offset, and ``y`` is the
     index of the series.
     """
     if not is_timedelta_or_date_offset(x=window_length):
         # Integer windows: plain positional arithmetic.
         return start - window_length

     # Time-based windows: step back from the label at `start` (clamped to the
     # last label) but never before the earliest label in `y`.
     anchor = y[min(start, len(y) - 1)]
     train_start = y.get_loc(max(anchor - window_length, min(y)))
     if start >= len(y):
         # `start` was clamped above; shift by one position in that case.
         train_start += 1
     return train_start
Exemplo n.º 20
0
    def _split_for_initial_window(self, y: pd.Index) -> SPLIT_ARRAY_TYPE:
        """Build the train/test split that covers the initial window.

        Parameters
        ----------
        y : pd.Index
            Index of the time series to split

        Returns
        -------
        (np.ndarray, np.ndarray)
            Integer indices of the train/test windows

        Raises
        ------
        ValueError
            If ``start_with_window`` is falsy, or ``initial_window`` is not
            larger than ``window_length``.
        """
        fh = _check_fh(self.fh)
        if not self.start_with_window:
            raise ValueError(
                "`start_with_window` must be True if `initial_window` is given"
            )
        if self.initial_window <= self.window_length:
            raise ValueError("`initial_window` must greater than `window_length`")

        # Time-based windows are turned into positions through label lookups.
        time_based = is_timedelta_or_date_offset(x=self.initial_window)

        if time_based:
            threshold = y.get_loc(y[0] + self.initial_window)
        else:
            threshold = self.initial_window

        # For in-sample forecasting horizons, the first split must ensure that
        # in-sample test set is still within the data.
        needs_shift = not fh.is_all_out_of_sample() and abs(fh[0]) >= threshold
        initial_start = abs(fh[0]) - self.initial_window + 1 if needs_shift else 0

        if time_based:
            initial_end = y.get_loc(y[initial_start] + self.initial_window)
        else:
            initial_end = initial_start + self.initial_window

        train = self._get_train_window(
            y=y, train_start=initial_start, split_point=initial_end
        )
        test = initial_end + fh.to_numpy() - 1
        return train, test
Exemplo n.º 21
0
    def parse_combination_string(ability_names: Index,
                                 combination: str) -> List[int]:
        """
        Parses a combination string such as "Ability 2 > Ability 1"
        :param ability_names: Index holding the known ability names
        :param combination: Ability names separated by ">"; whitespace around
            each name is ignored
        :return: Position of each named ability in ability_names, in the
            order the names appear in the string
        """
        positions = []
        for entry in combination.split(">"):
            positions.append(ability_names.get_loc(entry.strip()))
        return positions
Exemplo n.º 22
0
    def _split(self, y: pd.Index) -> SPLIT_GENERATOR_TYPE:
        """Yield one ``(train, test)`` pair of integer windows per cutoff.

        Cutoffs and window length must be both integer-based or both
        time-based (datetime cutoff with timedelta / offset window); any other
        combination raises ``TypeError``.
        """
        n_timepoints = y.shape[0]
        cutoffs = check_cutoffs(cutoffs=self.cutoffs)
        fh = _check_fh(fh=self.fh)
        window_length = check_window_length(
            window_length=self.window_length, n_timepoints=n_timepoints
        )
        _check_cutoffs_and_y(cutoffs=cutoffs, y=y)
        _check_cutoffs_fh_y(cutoffs=cutoffs, fh=fh, y=y)
        max_fh = fh.max()
        max_cutoff = np.max(cutoffs)

        for cutoff in cutoffs:
            # Position just before the first training observation.
            if is_int(x=window_length) and is_int(x=cutoff):
                window_start = cutoff - window_length
            elif is_timedelta_or_date_offset(x=window_length) and is_datetime(x=cutoff):
                window_start = y.get_loc(max(y[0], cutoff - window_length))
            else:
                raise TypeError(
                    f"Unsupported combination of types: "
                    f"`window_length`: {type(window_length)}, "
                    f"`cutoff`: {type(cutoff)}"
                )

            # Position of the cutoff itself; for time-based cutoffs this is
            # the last label that is not after the cutoff.
            if is_int(x=cutoff):
                cutoff_position = cutoff
            else:
                cutoff_position = y.get_loc(y[y <= cutoff][-1])

            train = self._get_train_window(
                y=y, train_start=window_start + 1, split_point=cutoff_position + 1
            )

            test = cutoff + fh.to_numpy()
            if is_datetime(x=max_cutoff) and is_timedelta(x=max_fh):
                # Drop timestamps before the data starts, then map the
                # remaining ones to integer positions.
                test = test[test >= y.min()]
                test = np.array([y.get_loc(timestamp) for timestamp in test])
            yield train, test
Exemplo n.º 23
0
    def _get_start(self, y: pd.Index, fh: ForecastingHorizon) -> int:
        """Get the first split point."""
        # Default: the first split point is position zero, i.e. the first
        # observation in the data.
        start = 0

        # When starting with a full window, the first split point depends on
        # the (initial) window length.
        if getattr(self, "start_with_window", False):
            if getattr(self, "initial_window", None) is not None:
                if hasattr(self, "step_length"):
                    step_length = self._get_step_length(x=self.step_length)
                else:
                    step_length = 1

                if is_timedelta_or_date_offset(x=self.initial_window):
                    start = y.get_loc(y[start] + self.initial_window) + step_length
                else:
                    start += self.initial_window + step_length
            elif is_timedelta_or_date_offset(x=self.window_length):
                start = y.get_loc(y[start] + self.window_length)
            else:
                start += self.window_length

        # For in-sample forecasting horizons, the first split must ensure that
        # in-sample test set is still within the data.
        if fh.is_all_out_of_sample():
            return start

        fh_min = abs(fh[0])
        if fh_min >= start:
            start = fh_min + 1
        return start
Exemplo n.º 24
0
class Graph:
    """Graph with named nodes whose edge weights live in a dense square matrix.

    The original author describes the graph as weighted and undirected; note
    that nothing in this class enforces a symmetric ``adjacency`` matrix.
    """

    def __init__(self, size: int, names: List[str]):
        """
        Constructs a graph object (weighted, undirected)
        :param size: Number of nodes in the graph
        :param names: Names of the nodes
        """
        self.adjacency = numpy.zeros((size, size))  # Weights between nodes
        self.size = size
        self.names = names
        # Lookup structure used to translate node names into positions.
        self.name_index = Index(names)

    def minimum_distance(self, nodes_from: Union[int, List[int]],
                         nodes_to: Union[int, List[int]]) -> float:
        """
        Gets the minimum distance
        :param nodes_from: Index of node or nodes from which to compute the distance
        :param nodes_to: Index of node or nodes to which to compute the distance
        :return: Minimum distance between nodes
        """
        if isinstance(nodes_from, Iterable):
            return numpy.min(
                [numpy.min(self.adjacency[i, nodes_to]) for i in nodes_from])
        return numpy.min(self.adjacency[nodes_from, nodes_to])

    def path_length(self, nodes: List[int]) -> float:
        """
        Sums the weights of the consecutive edges along a path
        :param nodes: Indices of the nodes visited, in order
        :return: Length of path (0 for a path with fewer than two nodes)
        """
        # Compare by value: `len(nodes) is 1` tests object identity, which is
        # a SyntaxWarning and only works through CPython's small-int caching.
        if len(nodes) <= 1:
            return 0
        return sum(self.adjacency[start, end]
                   for start, end in zip(nodes, nodes[1:]))

    def get_node_index(self, name: str) -> int:
        """
        Gets index of node
        :param name: Name of node
        :return: index of node
        """
        return self.name_index.get_loc(name)

    def get_node_indices(self, names: List[str]) -> List[int]:
        """
        Gets indices of several nodes
        :param names: Names of nodes
        :return: index of each node, in the order given
        """
        return [self.get_node_index(name) for name in names]
Exemplo n.º 25
0
 def ClassGrowthInfo(self):
     """Plot the per-semester class average against the selected student's values.

     A warning dialog is shown instead when no file is loaded (``self.file``
     is empty) or the entered seat number is not in ``self.SeatData``.
     """
     if self.file == '' or self.SeatNum.get().upper() not in self.SeatData:
         messagebox.showwarning('Error 404', 'File not found.')
         return

     # The first column and the last two columns are left out of the averages.
     last_column = len(self.data.columns) - 2
     class_average = [
         mean(self.data.iloc[:, column]) for column in range(1, last_column)
     ]

     row = Index(self.SeatData).get_loc(self.SeatNum.get().upper())
     student_values = array(self.data.loc[row, :])[1:-2]

     plot(class_average)
     plot(student_values)
     ylim(5, 10)
     legend(labels=['Average', self.SeatNum.get().upper()])
     xticks(arange(6), ['I', 'II', 'III', 'IV', 'V', 'VI'])
     show()
Exemplo n.º 26
0
 def IndStud(self):
     """Show a bar chart of the selected student's marks per subject.

     A warning dialog is shown instead when no file is loaded (``self.file``
     is empty) or the entered seat number is not in ``self.SeatData``.
     """
     if self.file == '' or self.SeatNum.get().upper() not in self.SeatData:
         messagebox.showwarning('Error 404', 'File not found')
         return

     sheet = self.data
     columns = sheet.columns
     row = Index(self.SeatData).get_loc(self.SeatNum.get().upper())

     # NOTE(review): the marks are read two rows below the seat row and from
     # columns 3..9 - presumably the layout of the loaded sheet; confirm.
     student_row = sheet.loc[row + 2, :]
     marks = array(student_row[3:10]).astype(int)
     # Subject labels are the column names without their last six characters.
     subject_names = [''.join(list(c)[:-6]) for c in columns][3:10]

     fig = figure()
     ax = subplot(111)
     title('Marks of ' + self.SeatNum.get().upper())
     xlabel('Subjects')
     ylabel('Marks Range')
     ylim(0, 75)
     ax.bar(subject_names, marks)
     fig.show()
Exemplo n.º 27
0
    def _split(self, y: pd.Index) -> SPLIT_GENERATOR_TYPE:
        """Yield the single ``(train, test)`` pair of integer windows for ``y``."""
        n_timepoints = y.shape[0]
        window_length = check_window_length(self.window_length, n_timepoints)
        fh = _check_fh(self.fh)
        last_train = _get_end(y_index=y, fh=fh)

        # First training position: everything when no window length is set,
        # otherwise a window counted in points or measured in time.
        if window_length is None:
            first_train = 0
        elif is_int(window_length):
            first_train = last_train - window_length + 1
        else:
            inside_window = y > y[last_train] - window_length
            first_train = np.argwhere(inside_window).flatten()[0]

        train = self._get_train_window(
            y=y, train_start=first_train, split_point=last_train + 1
        )

        if array_is_int(fh):
            test = last_train + fh.to_numpy()
        else:
            # Time-based horizons are resolved to positions by label lookup.
            test = np.array(
                [y.get_loc(y[last_train] + step) for step in fh.to_pandas()]
            )

        yield train, test
class Grouping(object):
    """Bookkeeping helper around a pandas ``Index``/``MultiIndex`` of groups.

    ``__init__`` sets ``index`` (the pandas index), ``nobs`` (its length),
    ``nlevels`` (the number of index names) and ``slices`` (``None`` until
    :meth:`get_slices` fills it).
    """

    def __init__(self, index, names=None):
        """
        Parameters
        ----------
        index : index-like
            Can be pandas MultiIndex or Index or array-like. If array-like
            and is a MultipleIndex (more than one grouping variable),
            groups are expected to be in each row. E.g., [('red', 1),
            ('red', 2), ('green', 1), ('green', 2)]
        names : list or str, optional
            The names to use for the groups. Should be a str if only
            one grouping variable is used.

        Notes
        -----
        If index is already a pandas Index then there is no copy; when
        ``names`` is given, that index is renamed in place.
        """
        if isinstance(index, (Index, MultiIndex)):
            if names is not None:
                if hasattr(index, 'set_names'):  # newer pandas
                    index.set_names(names, inplace=True)
                else:
                    index.names = names
            self.index = index
        else:  # array_like
            if _is_hierarchical(index):
                self.index = _make_hierarchical_index(index, names)
            else:
                self.index = Index(index, name=names)
            if names is None:
                names = _make_generic_names(self.index)
                if hasattr(self.index, 'set_names'):
                    self.index.set_names(names, inplace=True)
                else:
                    self.index.names = names

        self.nobs = len(self.index)
        self.nlevels = len(self.index.names)
        self.slices = None

    @property
    def index_shape(self):
        """``levshape`` of a hierarchical index, else the index ``shape``."""
        if hasattr(self.index, 'levshape'):
            return self.index.levshape
        else:
            return self.index.shape

    @property
    def levels(self):
        """Levels of a hierarchical index, else the categories of the index."""
        if hasattr(self.index, 'levels'):
            return self.index.levels
        categorical = pd.Categorical(self.index)
        # ``Categorical.levels`` was renamed to ``categories`` and later
        # removed from pandas; only fall back to the old name when needed.
        if hasattr(categorical, 'categories'):
            return categorical.categories
        return categorical.levels

    @property
    def labels(self):
        """Integer codes per level; indexable as ``labels[level]``."""
        # this was index_int, but that's not a very good name...
        codes = getattr(self.index, 'codes', None)
        if codes is None:
            if hasattr(self.index, 'labels'):
                codes = self.index.labels
            else:
                # flat index: add a leading axis so labels[0] is the codes
                codes = pd.Categorical(self.index).codes[None]
        return codes

    @property
    def group_names(self):
        """Names of the index levels."""
        return self.index.names

    def reindex(self, index=None, names=None):
        """
        Resets the index in-place.

        Parameters
        ----------
        index : index-like
            New index, interpreted exactly as in ``__init__``.
        names : list or str, optional
            New names; defaults to the current ``group_names``.
        """
        # NOTE: this is not of much use if the rest of the data does not change
        if names is None:
            names = self.group_names
        # Re-run the initializer on this instance. Assigning a new Grouping
        # to the local name ``self`` would leave the object unchanged.
        # This also resets ``slices``; ``counts``/``_dummies`` computed
        # earlier are not refreshed.
        self.__init__(index, names)

    def get_slices(self, level=0):
        """
        Sets the slices attribute to be a list of indices of the sorted
        groups for the first index level. I.e., self.slices[0] is the
        index where each observation is in the first (sorted) group.
        """
        # TODO: refactor this
        groups = self.index.get_level_values(level).unique()
        groups = np.array(groups)
        groups.sort()
        if isinstance(self.index, MultiIndex):
            self.slices = [
                self.index.get_loc_level(x, level=level)[0] for x in groups
            ]
        else:
            self.slices = [self.index.get_loc(x) for x in groups]

    def count_categories(self, level=0):
        """
        Sets the attribute counts to equal the bincount of the (integer-valued)
        labels.
        """
        # TODO: refactor this not to set an attribute. Why would we do this?
        self.counts = np.bincount(self.labels[level])

    def check_index(self, is_sorted=True, unique=True, index=None):
        """Sanity checks

        Parameters
        ----------
        is_sorted : bool
            If True, raise when sorting the index would change its order.
        unique : bool
            If True, raise when the index contains duplicate entries.
        index : pandas Index, optional
            Index to check; defaults to ``self.index``.

        Raises
        ------
        Exception
            If a requested check fails.
        """
        # ``not index`` raises for pandas indexes (ambiguous truth value),
        # so test explicitly for the "not supplied" sentinel.
        if index is None:
            index = self.index
        if is_sorted:
            # Only the index of this frame matters; the data are a dummy.
            test = pd.DataFrame(np.arange(len(index)), index=index)
            test_sorted = test.sort_index()
            if not test.index.equals(test_sorted.index):
                raise Exception('Data is not be sorted')
        if unique:
            if len(index) != len(index.unique()):
                raise Exception('Duplicate index entries')

    def sort(self, data, index=None):
        """Applies a (potentially hierarchical) sort operation on a numpy array
        or pandas series/dataframe based on the grouping index or a
        user-supplied index.  Returns an object of the same type as the
        original data as well as the matching (sorted) Pandas index.

        Raises
        ------
        ValueError
            If ``data`` is neither array-like nor a pandas object.
        """

        if index is None:
            index = self.index
        if data_util._is_using_ndarray_type(data, None):
            if data.ndim == 1:
                out = pd.Series(data, index=index, copy=True)
                out = out.sort_index()
            else:
                out = pd.DataFrame(data, index=index)
                out = out.sort_index(inplace=False)  # copies
            return np.array(out), out.index
        elif data_util._is_using_pandas(data, None):
            out = data
            out = out.reindex(index)  # copies?
            out = out.sort_index()
            return out, out.index
        else:
            msg = 'data must be a Numpy array or a Pandas Series/DataFrame'
            raise ValueError(msg)

    def transform_dataframe(self, dataframe, function, level=0, **kwargs):
        """Apply function to each column, by group
        Assumes that the dataframe already has a proper index

        Returns a flattened array when the result has a dimension of
        length 1, otherwise the result converted to an array.
        """
        if dataframe.shape[0] != self.nobs:
            raise Exception('dataframe does not have the same shape as index')
        out = dataframe.groupby(level=level).apply(function, **kwargs)
        if 1 in out.shape:
            return np.ravel(out)
        else:
            return np.array(out)

    def transform_array(self, array, function, level=0, **kwargs):
        """Apply function to each column, by group

        The array is wrapped in a DataFrame indexed by ``self.index`` and
        handed to :meth:`transform_dataframe`.
        """
        if array.shape[0] != self.nobs:
            raise Exception('array does not have the same shape as index')
        dataframe = pd.DataFrame(array, index=self.index)
        return self.transform_dataframe(dataframe,
                                        function,
                                        level=level,
                                        **kwargs)

    def transform_slices(self, array, function, level=0, **kwargs):
        """Apply function to each group. Similar to transform_array but does
        not coerce array to a DataFrame and back and works directly on a
        numpy array whose first axis has length ``nobs``. function is called
        function(group, group_idx, **kwargs).

        Returns the per-group results stacked into a 2-d array.
        """
        array = np.asarray(array)
        if array.shape[0] != self.nobs:
            raise Exception('array does not have the same shape as index')
        # always reset because level is given. need to refactor this.
        self.get_slices(level=level)
        processed = []
        for s in self.slices:
            # Selecting along the first axis covers the 1-d and 2-d cases
            # alike and avoids an unbound ``subset`` for higher dimensions.
            subset = array[s]
            processed.append(function(subset, s, **kwargs))
        processed = np.array(processed)
        return processed.reshape(-1, processed.shape[-1])

    # TODO: this is not general needs to be a PanelGrouping object
    def dummies_time(self):
        """Return the indicator built from index level 1."""
        self.dummy_sparse(level=1)
        return self._dummies

    def dummies_groups(self, level=0):
        """Return the indicator built from index level ``level``."""
        self.dummy_sparse(level=level)
        return self._dummies

    def dummy_sparse(self, level=0):
        """create a sparse indicator from a group array with integer labels

        The integer labels of index level ``level`` are passed to the
        module-level ``dummy_sparse`` function and the result is stored in
        ``self._dummies``; nothing is returned.

        Parameters
        ----------
        level : int
            Index level whose integer labels are used. Group levels
            are assumed to be defined as consecutive integers, i.e.
            range(n_groups) where n_groups is the number of group levels.
            A group level with no observations for it will still produce a
            column of zeros.

        Examples
        --------
        The examples call the module-level function directly on an array
        ``g`` of integer labels.

        >>> g = np.array([0, 0, 2, 1, 1, 2, 0])
        >>> indi = dummy_sparse(g)
        >>> indi
        <7x3 sparse matrix of type '<type 'numpy.int8'>'
            with 7 stored elements in Compressed Sparse Row format>
        >>> indi.todense()
        matrix([[1, 0, 0],
                [1, 0, 0],
                [0, 0, 1],
                [0, 1, 0],
                [0, 1, 0],
                [0, 0, 1],
                [1, 0, 0]], dtype=int8)


        current behavior with missing groups
        >>> g = np.array([0, 0, 2, 0, 2, 0])
        >>> indi = dummy_sparse(g)
        >>> indi.todense()
        matrix([[1, 0, 0],
                [1, 0, 0],
                [0, 0, 1],
                [1, 0, 0],
                [0, 0, 1],
                [1, 0, 0]], dtype=int8)
        """
        indi = dummy_sparse(self.labels[level])
        self._dummies = indi
Exemplo n.º 29
0
 def test_get_loc_raises_missized_tolerance(self):
     """A list-valued tolerance for a scalar key raises ValueError.

     The call is also expected to emit the FutureWarning about deprecation.
     """
     idx = Index([0, 1, 2])
     with pytest.raises(ValueError, match="tolerance size must match"), \
             tm.assert_produces_warning(FutureWarning, match="deprecated"):
         idx.get_loc(1.1, "nearest", tolerance=[1, 1])
Exemplo n.º 30
0
 def test_get_loc_tolerance_no_method_raises(self):
     """Passing ``tolerance`` without a ``method`` raises ValueError."""
     idx = Index([0, 1, 2])
     expected_message = "tolerance .* valid if"
     with pytest.raises(ValueError, match=expected_message):
         idx.get_loc(1.1, tolerance=1)
Exemplo n.º 31
0
    def _parse_one_index(base_idx: pd.Index,
                         index_1d: Union[INDEX1D, None],
                         index_name: str,
                         view_index: CINDEX = None) -> CINDEX:
        """Resolve ``index_1d`` into a positional indexer for ``base_idx``.

        index_name: 'row' or 'column'; it is only used in error messages.

        When ``view_index`` is given, ``index_1d`` is interpreted relative to
        ``base_idx[view_index]`` and the resulting indexer is mapped back
        through ``view_index`` before it is returned.

        Supported selectors:
          * ``slice`` -- string bounds are looked up as labels (a string stop
            is inclusive); integer bounds are range-checked.
          * boolean ndarray -- must have the same size as the index.
          * integer ndarray -- must be non-negative, in range and unique.
          * anything else -- a ``pd.Index`` or 1-d ndarray of labels, handed
            to ``_process_pd_index``.

        Raises ``ValueError`` when a bound or value fails validation.
        """
        if view_index is not None:
            base_idx = base_idx[view_index]

        if isinstance(index_1d, slice):
            stride = index_1d.step if index_1d.step is not None else 1

            start = index_1d.start
            if start is None:
                start = 0
            elif isinstance(start, str):
                if start not in base_idx:
                    raise ValueError(
                        f"Cannot locate slice.start '{start}' in {index_name} index!"
                    )
                start = base_idx.get_loc(start)
            elif start < 0 or start >= base_idx.size:
                raise ValueError(
                    f"slice.start '{start}' is out of the boundary [0, {base_idx.size}) for {index_name} index!"
                )

            stop = index_1d.stop
            if stop is None:
                stop = base_idx.size
            elif isinstance(stop, str):
                if stop not in base_idx:
                    raise ValueError(
                        f"Cannot locate slice.stop '{stop}' in {index_name} index!"
                    )
                # if str , use [] instead of [): move one position past the
                # label in the direction of travel
                stop = base_idx.get_loc(stop) + np.sign(stride)
            else:
                last_pos = stop - stride
                if last_pos < 0 or last_pos >= base_idx.size:
                    raise ValueError(
                        f"slice.stop '{stop}' is out of the boundary [0, {base_idx.size}) for {index_name} index!"
                    )

            indexer = slice(start, stop, stride)
        elif isinstance(index_1d, np.ndarray) and index_1d.dtype.kind in ('b', 'i', 'u'):
            assert index_1d.ndim == 1

            if index_1d.dtype.kind == 'b':
                if index_1d.size != base_idx.size:
                    raise ValueError(
                        f"{index_name} index size does not match: actual size {base_idx.size}, input size {index_1d.size}!"
                    )
                # boolean mask -> positions of the True entries
                indexer = np.where(index_1d)[0]
            else:
                # signed or unsigned integer positions
                if np.any(index_1d < 0):
                    raise ValueError(
                        f"Detect negative values in {index_name} index!")
                if np.any(index_1d >= base_idx.size):
                    raise ValueError(
                        f"Detect values exceeding the largest valid position {base_idx.size - 1} in {index_name} index!"
                    )
                if np.unique(index_1d).size < index_1d.size:
                    raise ValueError(
                        f"{index_name} index values are not unique!")
                indexer = index_1d
        else:
            if not isinstance(index_1d, pd.Index):
                assert isinstance(index_1d, np.ndarray) and index_1d.ndim == 1
                index_1d = pd.Index(index_1d)
            indexer = _process_pd_index(base_idx, index_1d)

        # No view: the indexer already refers to the original index.
        if view_index is None:
            return indexer

        # Array-like view: pick the selected entries of the view.
        if not isinstance(view_index, slice):
            return view_index[indexer]

        if isinstance(indexer, slice):
            # Compose the two slices arithmetically.
            offset = view_index.start
            scale = view_index.step
            return slice(offset + (scale * indexer.start),
                         offset + (scale * indexer.stop),
                         scale * indexer.step)

        # Slice view with an array indexer: expand the view to positions.
        view_positions = np.array(
            range(view_index.start, view_index.stop, view_index.step))
        return view_positions[indexer]
Exemplo n.º 32
0
class Grouping(object):
    """Wrapper around a pandas ``Index``/``MultiIndex`` describing groups.

    After construction the instance exposes ``index`` (the pandas index),
    ``nobs`` (number of entries), ``nlevels`` (number of index names) and
    ``slices`` (``None`` until :meth:`get_slices` is called).
    """

    def __init__(self, index, names=None):
        """
        Parameters
        ----------
        index : index-like
            Can be pandas MultiIndex or Index or array-like. If array-like
            and is a MultipleIndex (more than one grouping variable),
            groups are expected to be in each row. E.g., [('red', 1),
            ('red', 2), ('green', 1), ('green', 2)]
        names : list or str, optional
            The names to use for the groups. Should be a str if only
            one grouping variable is used.

        Notes
        -----
        If index is already a pandas Index then there is no copy; when
        ``names`` is given, that index is renamed in place.
        """
        if isinstance(index, (Index, MultiIndex)):
            if names is not None:
                if hasattr(index, 'set_names'):  # newer pandas
                    index.set_names(names, inplace=True)
                else:
                    index.names = names
            self.index = index
        else:  # array-like
            if _is_hierarchical(index):
                self.index = _make_hierarchical_index(index, names)
            else:
                self.index = Index(index, name=names)
            if names is None:
                names = _make_generic_names(self.index)
                if hasattr(self.index, 'set_names'):
                    self.index.set_names(names, inplace=True)
                else:
                    self.index.names = names

        self.nobs = len(self.index)
        self.nlevels = len(self.index.names)
        self.slices = None

    @property
    def index_shape(self):
        """``levshape`` of a hierarchical index, else the index ``shape``."""
        if hasattr(self.index, 'levshape'):
            return self.index.levshape
        else:
            return self.index.shape

    @property
    def levels(self):
        """Levels of a hierarchical index, else the categories of the index."""
        if hasattr(self.index, 'levels'):
            return self.index.levels
        categorical = pd.Categorical(self.index)
        # ``Categorical.levels`` became ``categories`` in pandas 0.15 and the
        # old name was removed later; prefer the new one when it exists.
        if hasattr(categorical, 'categories'):
            return categorical.categories
        return categorical.levels

    @property
    def labels(self):
        """Integer codes per level; indexable as ``labels[level]``."""
        # this was index_int, but that's not a very good name...
        # Newer pandas exposes the per-level integer codes as ``codes``;
        # ``labels`` is the older spelling. Check the new name first so a
        # MultiIndex is not pushed through the flat-index fallback below.
        codes = getattr(self.index, 'codes', None)
        if codes is not None:
            return codes
        if hasattr(self.index, 'labels'):
            return self.index.labels
        # Flat index: derive codes from a Categorical.
        # Compat code for the labels -> codes change in pandas 0.15
        tmp = pd.Categorical(self.index)
        try:
            labl = tmp.codes
        except AttributeError:
            labl = tmp.labels  # old pandas

        # add a leading axis so that labels[0] is the array of codes
        return labl[None]

    @property
    def group_names(self):
        """Names of the index levels."""
        return self.index.names

    def reindex(self, index=None, names=None):
        """
        Resets the index in-place.

        Parameters
        ----------
        index : index-like
            New index, interpreted exactly as in ``__init__``.
        names : list or str, optional
            New names; defaults to the current ``group_names``.
        """
        # NOTE: this isn't of much use if the rest of the data doesn't change
        if names is None:
            names = self.group_names
        # Re-run the initializer on this very object; rebinding the local
        # name ``self`` to a new Grouping would change nothing for callers.
        # ``slices`` is reset as part of this; previously computed
        # ``counts``/``_dummies`` are not refreshed.
        self.__init__(index, names)

    def get_slices(self, level=0):
        """
        Sets the slices attribute to be a list of indices of the sorted
        groups for the first index level. I.e., self.slices[0] is the
        index where each observation is in the first (sorted) group.
        """
        # TODO: refactor this
        groups = self.index.get_level_values(level).unique()
        groups = np.array(groups)
        groups.sort()
        if isinstance(self.index, MultiIndex):
            self.slices = [self.index.get_loc_level(x, level=level)[0]
                           for x in groups]
        else:
            self.slices = [self.index.get_loc(x) for x in groups]

    def count_categories(self, level=0):
        """
        Sets the attribute counts to equal the bincount of the (integer-valued)
        labels.
        """
        # TODO: refactor this not to set an attribute. Why would we do this?
        self.counts = np.bincount(self.labels[level])

    def check_index(self, is_sorted=True, unique=True, index=None):
        """Sanity checks

        Parameters
        ----------
        is_sorted : bool
            If True, raise when sorting the index would change its order.
        unique : bool
            If True, raise when the index contains duplicate entries.
        index : pandas Index, optional
            Index to check; defaults to ``self.index``.

        Raises
        ------
        Exception
            If a requested check fails.
        """
        # A pandas index has no unambiguous truth value, so ``not index``
        # raises; compare against the "not supplied" sentinel instead.
        if index is None:
            index = self.index
        if is_sorted:
            # The frame's data are a placeholder; only its index is compared.
            test = pd.DataFrame(np.arange(len(index)), index=index)
            test_sorted = test.sort_index()
            if not test.index.equals(test_sorted.index):
                raise Exception('Data is not be sorted')
        if unique:
            if len(index) != len(index.unique()):
                raise Exception('Duplicate index entries')

    def sort(self, data, index=None):
        """Applies a (potentially hierarchical) sort operation on a numpy array
        or pandas series/dataframe based on the grouping index or a
        user-supplied index.  Returns an object of the same type as the
        original data as well as the matching (sorted) Pandas index.

        Raises
        ------
        ValueError
            If ``data`` is neither array-like nor a pandas object.
        """

        if index is None:
            index = self.index
        if data_util._is_using_ndarray_type(data, None):
            if data.ndim == 1:
                out = pd.Series(data, index=index, copy=True)
                out = out.sort_index()
            else:
                out = pd.DataFrame(data, index=index)
                out = out.sort_index(inplace=False)  # copies
            return np.array(out), out.index
        elif data_util._is_using_pandas(data, None):
            out = data
            out = out.reindex(index)  # copies?
            out = out.sort_index()
            return out, out.index
        else:
            msg = 'data must be a Numpy array or a Pandas Series/DataFrame'
            raise ValueError(msg)

    def transform_dataframe(self, dataframe, function, level=0, **kwargs):
        """Apply function to each column, by group
        Assumes that the dataframe already has a proper index

        Returns a flattened array when the result has a dimension of
        length 1, otherwise the result converted to an array.
        """
        if dataframe.shape[0] != self.nobs:
            raise Exception('dataframe does not have the same shape as index')
        out = dataframe.groupby(level=level).apply(function, **kwargs)
        if 1 in out.shape:
            return np.ravel(out)
        else:
            return np.array(out)

    def transform_array(self, array, function, level=0, **kwargs):
        """Apply function to each column, by group

        The array is wrapped in a DataFrame indexed by ``self.index`` and
        handed to :meth:`transform_dataframe`.
        """
        if array.shape[0] != self.nobs:
            raise Exception('array does not have the same shape as index')
        dataframe = pd.DataFrame(array, index=self.index)
        return self.transform_dataframe(dataframe, function, level=level,
                                        **kwargs)

    def transform_slices(self, array, function, level=0, **kwargs):
        """Apply function to each group. Similar to transform_array but does
        not coerce array to a DataFrame and back and works directly on a
        numpy array whose first axis has length ``nobs``. function is called
        function(group, group_idx, **kwargs).

        Returns the per-group results stacked into a 2-d array.
        """
        array = np.asarray(array)
        if array.shape[0] != self.nobs:
            raise Exception('array does not have the same shape as index')
        # always reset because level is given. need to refactor this.
        self.get_slices(level=level)
        processed = []
        for s in self.slices:
            # First-axis selection is the same as ``array[s, :]`` for 2-d
            # input and keeps ``subset`` defined for any dimensionality.
            subset = array[s]
            processed.append(function(subset, s, **kwargs))
        processed = np.array(processed)
        return processed.reshape(-1, processed.shape[-1])

    # TODO: this isn't general needs to be a PanelGrouping object
    def dummies_time(self):
        """Return the sparse indicator built from index level 1."""
        self.dummy_sparse(level=1)
        return self._dummies

    def dummies_groups(self, level=0):
        """Return the sparse indicator built from index level ``level``."""
        self.dummy_sparse(level=level)
        return self._dummies

    def dummy_sparse(self, level=0):
        """create a sparse indicator from a group array with integer labels

        Builds a CSR matrix with one row per observation and an int8 ``1``
        in the column given by that observation's integer label, and stores
        it in ``self._dummies``; nothing is returned.

        Parameters
        ----------
        level : int
            Index level whose integer labels are used. Group levels are
            assumed to be defined as consecutive integers, i.e.
            range(n_groups) where n_groups is the number of group levels.
            A group level with no observations for it will still produce a
            column of zeros.

        Examples
        --------
        The examples show the resulting layout for an array ``g`` of integer
        labels, written as a standalone ``dummy_sparse(g)`` call.

        >>> g = np.array([0, 0, 2, 1, 1, 2, 0])
        >>> indi = dummy_sparse(g)
        >>> indi
        <7x3 sparse matrix of type '<type 'numpy.int8'>'
            with 7 stored elements in Compressed Sparse Row format>
        >>> indi.todense()
        matrix([[1, 0, 0],
                [1, 0, 0],
                [0, 0, 1],
                [0, 1, 0],
                [0, 1, 0],
                [0, 0, 1],
                [1, 0, 0]], dtype=int8)


        current behavior with missing groups
        >>> g = np.array([0, 0, 2, 0, 2, 0])
        >>> indi = dummy_sparse(g)
        >>> indi.todense()
        matrix([[1, 0, 0],
                [1, 0, 0],
                [0, 0, 1],
                [1, 0, 0],
                [0, 0, 1],
                [1, 0, 0]], dtype=int8)
        """
        from scipy import sparse
        groups = self.labels[level]
        # one stored element per row: row i holds a 1 in column groups[i]
        indptr = np.arange(len(groups)+1)
        data = np.ones(len(groups), dtype=np.int8)
        self._dummies = sparse.csr_matrix((data, groups, indptr))