Example #1
    def test_datetime64_dtype_array_returned(self):
        # GH 9431
        expected = np.array(
            ["2015-01-03T00:00:00.000000000+0000", "2015-01-01T00:00:00.000000000+0000"], dtype="M8[ns]"
        )

        dt_index = pd.to_datetime(
            [
                "2015-01-03T00:00:00.000000000+0000",
                "2015-01-01T00:00:00.000000000+0000",
                "2015-01-01T00:00:00.000000000+0000",
            ]
        )
        result = algos.unique(dt_index)
        tm.assert_numpy_array_equal(result, expected)
        self.assertEqual(result.dtype, expected.dtype)

        s = pd.Series(dt_index)
        result = algos.unique(s)
        tm.assert_numpy_array_equal(result, expected)
        self.assertEqual(result.dtype, expected.dtype)

        arr = s.values
        result = algos.unique(arr)
        tm.assert_numpy_array_equal(result, expected)
        self.assertEqual(result.dtype, expected.dtype)
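For reference, the public counterpart of algos.unique behaves the same way: order of first appearance is kept and the datetime64[ns] values survive the round trip (depending on the pandas version the result is a datetime64[ns] ndarray, a DatetimeArray, or an Index). A minimal sketch:

import pandas as pd

dt_index = pd.to_datetime(["2015-01-03", "2015-01-01", "2015-01-01"])
uniques = pd.unique(dt_index)
# 2015-01-03 first, then 2015-01-01 -- first appearance, not sorted order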
Example #2
    def _get_wom_rule(self):
        wdiffs = unique(np.diff(self.index.week))
        if not lib.ismember(wdiffs, set([4, 5])).all():
            return None

        weekdays = unique(self.index.weekday)
        if len(weekdays) > 1:
            return None

        # get which week
        week = (self.index[0].day - 1) // 7 + 1
        wd = _weekday_rule_aliases[weekdays[0]]

        return "WOM-%d%s" % (week, wd)
Example #3
def _maybe_cache(arg, format, cache, tz, convert_listlike):
    """
    Create a cache of unique dates from an array of dates

    Parameters
    ----------
    arg : integer, float, string, datetime, list, tuple, 1-d array, Series
    format : string
        Strftime format to parse time
    cache : boolean
        True attempts to create a cache of converted values
    tz : string
        Timezone of the dates
    convert_listlike : function
        Conversion function to apply on dates

    Returns
    -------
    cache_array : Series
        Cache of converted, unique dates. Can be empty
    """
    from pandas import Series
    cache_array = Series()
    if cache:
        # Perform a quicker unique check
        from pandas import Index
        if not Index(arg).is_unique:
            unique_dates = algorithms.unique(arg)
            cache_dates = convert_listlike(unique_dates, True, format, tz=tz)
            cache_array = Series(cache_dates, index=unique_dates)
    return cache_array
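The payoff of this cache is visible through the public API: with heavily duplicated input, to_datetime parses only the unique strings and maps the results back. A minimal sketch:

import pandas as pd

dates = ["2015-01-03", "2015-01-01"] * 100000  # heavily duplicated input
result = pd.to_datetime(dates, format="%Y-%m-%d", cache=True)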
Example #4
def _infer_precision(base_precision, bins):
    """Infer an appropriate precision for _round_frac
    """
    for precision in range(base_precision, 20):
        levels = [_round_frac(b, precision) for b in bins]
        if algos.unique(levels).size == bins.size:
            return precision
    return base_precision  # default
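The idea as a standalone sketch, where round_frac is a hypothetical stand-in for pandas' private _round_frac: keep raising the precision until the rounded bin edges are all distinct.

import numpy as np

def round_frac(x, precision):
    # hypothetical stand-in for pandas' private _round_frac
    return np.around(x, precision)

bins = np.array([0.1234, 0.1236, 0.9])
for precision in range(1, 20):
    if np.unique([round_frac(b, precision) for b in bins]).size == bins.size:
        break
print(precision)  # 3 -- the first precision at which all edges stay distinct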
Example #5
def _bins_to_cuts(x, bins, right=True, labels=None,
                  precision=3, include_lowest=False,
                  dtype=None, duplicates='raise'):

    if duplicates not in ['raise', 'drop']:
        raise ValueError("invalid value for 'duplicates' parameter, "
                         "valid options are: raise, drop")

    unique_bins = algos.unique(bins)
    if len(unique_bins) < len(bins):
        if duplicates == 'raise':
            raise ValueError("Bin edges must be unique: {}.\nYou "
                             "can drop duplicate edges by setting "
                             "the 'duplicates' kwarg".format(repr(bins)))
        else:
            bins = unique_bins

    side = 'left' if right else 'right'
    ids = bins.searchsorted(x, side=side)

    if include_lowest:
        ids[x == bins[0]] = 1

    na_mask = isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            increases = 0
            while True:
                try:
                    levels = _format_levels(bins, precision, right=right,
                                            include_lowest=include_lowest,
                                            dtype=dtype)
                except ValueError:
                    increases += 1
                    precision += 1
                    if increases >= 20:
                        raise
                else:
                    break

        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
            levels = labels

        levels = np.asarray(levels, dtype=object)
        np.putmask(ids, na_mask, 0)
        fac = Categorical(ids - 1, levels, ordered=True, fastpath=True)
    else:
        fac = ids - 1
        if has_nas:
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)

    return fac, bins
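This duplicate handling surfaces directly in pd.cut; a short usage note:

import pandas as pd

# pd.cut([1, 2, 3], bins=[0, 1, 1, 3])  # raises: Bin edges must be unique
out = pd.cut([1, 2, 3], bins=[0, 1, 1, 3], duplicates="drop")  # edges [0, 1, 3]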
Example #6
def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False, precision=3, name=None, include_lowest=False):
    x_is_series = isinstance(x, Series)
    series_index = None

    if x_is_series:
        series_index = x.index
        if name is None:
            name = x.name

    x = np.asarray(x)

    side = "left" if right else "right"
    ids = bins.searchsorted(x, side=side)

    if len(algos.unique(bins)) < len(bins):
        raise ValueError("Bin edges must be unique: %s" % repr(bins))

    if include_lowest:
        ids[x == bins[0]] = 1

    na_mask = isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            increases = 0
            while True:
                try:
                    levels = _format_levels(bins, precision, right=right, include_lowest=include_lowest)
                except ValueError:
                    increases += 1
                    precision += 1
                    if increases >= 20:
                        raise
                else:
                    break

        else:
            if len(labels) != len(bins) - 1:
                raise ValueError("Bin labels must be one fewer than " "the number of bin edges")
            levels = labels

        levels = np.asarray(levels, dtype=object)
        np.putmask(ids, na_mask, 0)
        fac = Categorical(ids - 1, levels, ordered=True, fastpath=True)
    else:
        fac = ids - 1
        if has_nas:
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)

    if x_is_series:
        fac = Series(fac, index=series_index, name=name)

    if not retbins:
        return fac

    return fac, bins
Example #7
    def _get_annual_rule(self):
        if len(self.ydiffs) > 1:
            return None

        if len(algos.unique(self.fields["M"])) > 1:
            return None

        pos_check = self.month_position_check()
        return {"cs": "AS", "bs": "BAS", "ce": "A", "be": "BA"}.get(pos_check)
Example #8
    def test_timedelta64_dtype_array_returned(self):
        # GH 9431
        expected = np.array([31200, 45678, 10000], dtype='m8[ns]')

        td_index = pd.to_timedelta([31200, 45678, 31200, 10000, 45678])
        result = algos.unique(td_index)
        tm.assert_numpy_array_equal(result, expected)
        self.assertEqual(result.dtype, expected.dtype)

        s = pd.Series(td_index)
        result = algos.unique(s)
        tm.assert_numpy_array_equal(result, expected)
        self.assertEqual(result.dtype, expected.dtype)

        arr = s.values
        result = algos.unique(arr)
        tm.assert_numpy_array_equal(result, expected)
        self.assertEqual(result.dtype, expected.dtype)
Example #9
def _bins_to_cuts(x, bins, right=True, labels=None,
                  precision=3, include_lowest=False,
                  dtype=None, duplicates='raise'):

    if duplicates not in ['raise', 'drop']:
        raise ValueError("invalid value for 'duplicates' parameter, "
                         "valid options are: raise, drop")

    if isinstance(bins, IntervalIndex):
        # we have a fast-path here
        ids = bins.get_indexer(x)
        result = algos.take_nd(bins, ids)
        result = Categorical(result, categories=bins, ordered=True)
        return result, bins

    unique_bins = algos.unique(bins)
    if len(unique_bins) < len(bins) and len(bins) != 2:
        if duplicates == 'raise':
            raise ValueError("Bin edges must be unique: {bins!r}.\nYou "
                             "can drop duplicate edges by setting "
                             "the 'duplicates' kwarg".format(bins=bins))
        else:
            bins = unique_bins

    side = 'left' if right else 'right'
    ids = _ensure_int64(bins.searchsorted(x, side=side))

    if include_lowest:
        # Numpy 1.9 support: ensure this mask is a Numpy array
        ids[np.asarray(x == bins[0])] = 1

    na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            labels = _format_labels(bins, precision, right=right,
                                    include_lowest=include_lowest,
                                    dtype=dtype)
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
        if not is_categorical_dtype(labels):
            labels = Categorical(labels, categories=labels, ordered=True)

        np.putmask(ids, na_mask, 0)
        result = algos.take_nd(labels, ids - 1)

    else:
        result = ids - 1
        if has_nas:
            result = result.astype(np.float64)
            np.putmask(result, na_mask, np.nan)

    return result, bins
Example #10
    def _get_annual_rule(self):
        if len(self.ydiffs) > 1:
            return None

        if len(algos.unique(self.fields['M'])) > 1:
            return None

        pos_check = self.month_position_check()
        return {'cs': 'AS', 'bs': 'BAS',
                'ce': 'A', 'be': 'BA'}.get(pos_check)
Example #11
    def _get_wom_rule(self):
        # wdiffs = unique(np.diff(self.index.week))
        # We also need -47, -49, -48 to catch index spanning year boundary
        # if not lib.ismember(wdiffs, set([4, 5, -47, -49, -48])).all():
        #     return None

        weekdays = unique(self.index.weekday)
        if len(weekdays) > 1:
            return None

        week_of_months = unique((self.index.day - 1) // 7)
        if len(week_of_months) > 1:
            return None

        # get which week
        week = week_of_months[0] + 1
        wd = _weekday_rule_aliases[weekdays[0]]

        return 'WOM-%d%s' % (week, wd)
Example #12
    def _get_wom_rule(self):
        # wdiffs = unique(np.diff(self.index.week))
        # We also need -47, -49, -48 to catch index spanning year boundary
        # if not lib.ismember(wdiffs, set([4, 5, -47, -49, -48])).all():
        #     return None

        weekdays = unique(self.index.weekday)
        if len(weekdays) > 1:
            return None

        week_of_months = unique((self.index.day - 1) // 7)
        # Only attempt to infer up to WOM-4. See #9425
        week_of_months = week_of_months[week_of_months < 4]
        if len(week_of_months) == 0 or len(week_of_months) > 1:
            return None

        # get which week
        week = week_of_months[0] + 1
        wd = _weekday_rule_aliases[weekdays[0]]

        return 'WOM-{week}{weekday}'.format(week=week, weekday=wd)
Example #13
def _bins_to_cuts(x, bins, right=True, labels=None,
                  precision=3, include_lowest=False,
                  dtype=None):

    side = 'left' if right else 'right'
    ids = bins.searchsorted(x, side=side)

    if len(algos.unique(bins)) < len(bins):
        raise ValueError('Bin edges must be unique: %s' % repr(bins))

    if include_lowest:
        ids[x == bins[0]] = 1

    na_mask = isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            increases = 0
            while True:
                try:
                    levels = _format_levels(bins, precision, right=right,
                                            include_lowest=include_lowest,
                                            dtype=dtype)
                except ValueError:
                    increases += 1
                    precision += 1
                    if increases >= 20:
                        raise
                else:
                    break

        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
            levels = labels

        levels = np.asarray(levels, dtype=object)
        np.putmask(ids, na_mask, 0)
        fac = Categorical(ids - 1, levels, ordered=True, fastpath=True)
    else:
        fac = ids - 1
        if has_nas:
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)

    return fac, bins
Example #14
    def _read_table_native(self, group, where, name=None):
        if name is None:
            name = getattr(group._v_attrs, 'tables')[0]
        table = getattr(group, name)

        # if not found then some sort of default behaviour maybe?
        info = table.attrs._pandas_info

        # no selection implemented

        # need to deal with tz info etc.

        indices = []
        index_names = []
        column_names = []
        data = []
        for col in table.colnames:
            if info[col].get('isIndex', False):
                indices.append(_maybe_convert(table.read(field=col),
                                              info[col]['kind']))
                index_names.append(info[col]['name_data'])
            else:
                data.append(_maybe_convert(table.read(field=col),
                                           info[col]['kind']))
                column_names.append(info[col]['name_data'])

        index = MultiIndex.from_arrays(indices, names=index_names)

        kind = info['pandas_type']
        if kind == 'series' and len(column_names) == 1:
            return Series(data=data[0], index=index, name=column_names[0])
        elif kind == 'frame':
            if len(unique(column_names)) == len(column_names):
                return DataFrame(dict(zip(column_names, data)), index=index)
            else:
                raise NotImplementedError("No support for duplicate "
                                          "column names")
        else:
            raise NotImplementedError("Only series and frame are "
                                      "supported at this time")
Example #15
    def _read_panel_table(self, group, where=None):
        table = getattr(group, 'table')
        fields = table._v_attrs.fields

        # create the selection
        sel = Selection(table, where, table._v_attrs.index_kind)
        sel.select()
        fields = table._v_attrs.fields

        columns = _maybe_convert(sel.values['column'],
                                 table._v_attrs.columns_kind)
        index = _maybe_convert(sel.values['index'],
                               table._v_attrs.index_kind)
        values = sel.values['values']

        major = Factor(index)
        minor = Factor(columns)

        J, K = len(major.levels), len(minor.levels)
        key = major.labels * K + minor.labels

        if len(unique(key)) == len(key):
            sorter, _ = lib.groupsort_indexer(key, J * K)

            # the data need to be sorted
            sorted_values = values.take(sorter, axis=0)
            major_labels = major.labels.take(sorter)
            minor_labels = minor.labels.take(sorter)

            block = block2d_to_block3d(sorted_values, fields, (J, K),
                                       major_labels, minor_labels)

            mgr = BlockManager([block], [block.items,
                                         major.levels, minor.levels])
            wp = Panel(mgr)
        else:
            if not self._quiet:  # pragma: no cover
                print ('Duplicate entries in table, taking most recently '
                       'appended')

            # reconstruct
            long_index = MultiIndex.from_arrays([index, columns])
            lp = DataFrame(values, index=long_index, columns=fields)

            # need a better algorithm
            tuple_index = long_index.get_tuple_index()

            unique_tuples = lib.fast_unique(tuple_index)
            unique_tuples = _asarray_tuplesafe(unique_tuples)

            indexer = match(unique_tuples, tuple_index)

            new_index = long_index.take(indexer)
            new_values = lp.values.take(indexer, axis=0)

            lp = DataFrame(new_values, index=new_index, columns=lp.columns)
            wp = lp.to_panel()

        if sel.column_filter:
            new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
            wp = wp.reindex(minor=new_minor)
        return wp
Example #16
File: tile.py Project: xuvw/pandas
def _bins_to_cuts(x,
                  bins,
                  right=True,
                  labels=None,
                  precision=3,
                  include_lowest=False,
                  dtype=None,
                  duplicates='raise'):

    if duplicates not in ['raise', 'drop']:
        raise ValueError("invalid value for 'duplicates' parameter, "
                         "valid options are: raise, drop")

    if isinstance(bins, IntervalIndex):
        # we have a fast-path here
        ids = bins.get_indexer(x)
        result = algos.take_nd(bins, ids)
        result = Categorical(result, categories=bins, ordered=True)
        return result, bins

    unique_bins = algos.unique(bins)
    if len(unique_bins) < len(bins) and len(bins) != 2:
        if duplicates == 'raise':
            raise ValueError("Bin edges must be unique: {bins!r}.\nYou "
                             "can drop duplicate edges by setting "
                             "the 'duplicates' kwarg".format(bins=bins))
        else:
            bins = unique_bins

    side = 'left' if right else 'right'
    ids = ensure_int64(bins.searchsorted(x, side=side))

    if include_lowest:
        ids[x == bins[0]] = 1

    na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            labels = _format_labels(bins,
                                    precision,
                                    right=right,
                                    include_lowest=include_lowest,
                                    dtype=dtype)
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
        if not is_categorical_dtype(labels):
            labels = Categorical(labels, categories=labels, ordered=True)

        np.putmask(ids, na_mask, 0)
        result = algos.take_nd(labels, ids - 1)

    else:
        result = ids - 1
        if has_nas:
            result = result.astype(np.float64)
            np.putmask(result, na_mask, np.nan)

    return result, bins
Example #17
def _bins_to_cuts(
    x,
    bins,
    right: bool = True,
    labels=None,
    precision: int = 3,
    include_lowest: bool = False,
    dtype=None,
    duplicates: str = "raise",
):

    if duplicates not in ["raise", "drop"]:
        raise ValueError(
            "invalid value for 'duplicates' parameter, valid options are: raise, drop"
        )

    if isinstance(bins, IntervalIndex):
        # we have a fast-path here
        ids = bins.get_indexer(x)
        result = Categorical.from_codes(ids, categories=bins, ordered=True)
        return result, bins

    unique_bins = algos.unique(bins)
    if len(unique_bins) < len(bins) and len(bins) != 2:
        if duplicates == "raise":
            raise ValueError(
                f"Bin edges must be unique: {repr(bins)}.\n"
                f"You can drop duplicate edges by setting the 'duplicates' kwarg"
            )
        else:
            bins = unique_bins

    side = "left" if right else "right"
    ids = ensure_int64(bins.searchsorted(x, side=side))

    if include_lowest:
        ids[x == bins[0]] = 1

    na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if not (labels is None or is_list_like(labels)):
            raise ValueError(
                "Bin labels must either be False, None or passed in as a "
                "list-like argument")

        elif labels is None:
            labels = _format_labels(bins,
                                    precision,
                                    right=right,
                                    include_lowest=include_lowest,
                                    dtype=dtype)

        else:
            if len(labels) != len(bins) - 1:
                raise ValueError(
                    "Bin labels must be one fewer than the number of bin edges"
                )

        if not is_categorical_dtype(labels):
            labels = Categorical(labels, categories=labels, ordered=True)

        np.putmask(ids, na_mask, 0)
        result = algos.take_nd(labels, ids - 1)

    else:
        result = ids - 1
        if has_nas:
            result = result.astype(np.float64)
            np.putmask(result, na_mask, np.nan)

    return result, bins
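The IntervalIndex fast path above can be exercised directly through pd.cut; a minimal sketch:

import pandas as pd

bins = pd.interval_range(start=0, end=3)  # (0, 1], (1, 2], (2, 3]
out = pd.cut([0.5, 1.5, 2.5], bins=bins)  # skips edge validation entirely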
Example #18
 def test_object_refcount_bug(self):
     lst = ["A", "B", "C", "D", "E"]
     for i in range(1000):
         len(algos.unique(lst))
Example #19
 def test_object_refcount_bug(self):
     lst = ['A', 'B', 'C', 'D', 'E']
     for i in range(1000):
         len(algos.unique(lst))
Example #20
 def test_object_refcount_bug(self):
     lst = ['A', 'B', 'C', 'D', 'E']
     for i in range(1000):
         len(algos.unique(lst))
Example #21
 def unique(
         self: NDArrayBackedExtensionArrayT
 ) -> NDArrayBackedExtensionArrayT:
     new_data = unique(self._ndarray)
     return self._from_backing_data(new_data)
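All of these thin wrappers delegate to the same hash-table-based unique, whose contract, unlike np.unique, is to preserve order of first appearance:

import numpy as np
import pandas as pd

arr = np.array([3, 1, 3, 2])
pd.unique(arr)  # array([3, 1, 2]) -- order of first appearance
np.unique(arr)  # array([1, 2, 3]) -- sorted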
Example #22
def precalculate_probability_table_dynamic_programming(
        D: pd.DataFrame, G: nx.DiGraph, ci: Hashable) -> np.ndarray:
    n = D.shape[0]
    H = np.zeros((n, n))
    P = [p for p in G.predecessors(ci)]
    C = [c for c in G.successors(ci)]
    S = [None] * len(C)

    J_P = 1
    S_c = [None] * len(C)
    J_C = [1] * len(C)
    J_S = [1] * len(C)
    for p in P:
        J_P *= len(np.unique(D[p]))
    for i, c in enumerate(C):
        S[i] = [s for s in G.predecessors(c)]
        S[i].remove(ci)
        S_c[i] = [s for s in S[i]]
        S_c[i].append(c)
        J_C[i] = len(np.unique(D[c]))
        for spouse in S[i]:
            J_S[i] *= len(np.unique(D[spouse]))

    for v in range(n):
        for u in range(v + 1):
            H[u, v] = math.log(sc.special.comb(v - u + J_P, J_P - 1))

    # Parent table
    for p in P:
        p_dist = pd.get_dummies(D[p]).to_numpy()
        dist_table = np.zeros((n, n, len(p_dist[0, :])), dtype=int)
        for v in range(n):
            for u in range(v + 1):
                # fill dist_table
                if v == u:
                    dist_table[u, v] = p_dist[v, :]
                else:
                    dist_table[u, v] = dist_table[u, v - 1] + p_dist[v, :]

                # calculate probability
                h = math.log(math.factorial(v + 1 - u))
                #h -= sum(np.log(sc.special.factorial(dist_table[u, v])))
                h -= sum(sc.special.gammaln(dist_table[u, v] + 1))
                H[u, v] += h

    # Child-Spouse table
    for i, c in enumerate(C):
        c_dist = pd.get_dummies(D[c]).to_numpy()
        n_c = len(c_dist[0, :])

        s_class: pd.Series
        if len(S[i]) > 0:
            s_class = D[S[i]].groupby(S[i]).ngroup()
        else:
            s_class = pd.Series(np.zeros(n))
        n_s_class = len(unique(s_class))

        dist_table = np.zeros((n, n, n_s_class, n_c), dtype=int)
        for v in range(n):
            for u in range(v + 1):
                h = 0

                for i_s_class in range(n_s_class):
                    z = np.zeros(n_c)
                    if i_s_class == s_class[v]:
                        z = c_dist[v, :]

                    # fill dist_table
                    if v == u:
                        dist_table[u, v, i_s_class] = z
                    else:
                        dist_table[u, v, i_s_class] = dist_table[u, v - 1,
                                                                 i_s_class] + z

                    # calculate probability
                    c_over_s_dist = dist_table[u, v, i_s_class]
                    n_c_over_s = sum(c_over_s_dist)
                    #h += math.log(sc.special.comb(n_c_over_s + J_C[i] - 1, J_C[i] - 1))
                    #h += math.log(math.factorial(n_c_over_s))
                    #h -= sum(np.log(sc.special.factorial(c_over_s_dist)))

                    # Vectors for faster gammaln calculation
                    add = np.asarray([n_c_over_s + J_C[i], n_c_over_s + 1])
                    sub = np.append([J_C[i], n_c_over_s + 1],
                                    c_over_s_dist + 1)
                    h += sum(sc.special.gammaln(add))
                    h -= sum(sc.special.gammaln(sub))
                H[u, v] += h
    return H
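A small aside on the grouping trick used above: ngroup() assigns one integer id per distinct spouse configuration, and the s_class dimension of dist_table is indexed with exactly these ids.

import pandas as pd

df = pd.DataFrame({"a": [0, 0, 1], "b": [1, 0, 1]})
print(df.groupby(["a", "b"]).ngroup().tolist())  # [1, 0, 2]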
Example #23
def precalculate_probability_table_split_up_numpy(D: pd.DataFrame,
                                                  G: nx.DiGraph,
                                                  ci: Hashable) -> np.ndarray:
    n = D.shape[0]
    H = np.zeros((n, n))
    P = [p for p in G.predecessors(ci)]
    C = [c for c in G.successors(ci)]
    S = [None] * len(C)

    J_P = 1
    S_c = [None] * len(C)
    J_C = [1] * len(C)
    J_S = [1] * len(C)
    for p in P:
        J_P *= len(np.unique(D[p]))
    for i, c in enumerate(C):
        S[i] = [s for s in G.predecessors(c)]
        S[i].remove(ci)
        S_c[i] = [s for s in S[i]]
        S_c[i].append(c)
        J_C[i] = len(np.unique(D[c]))
        for spouse in S[i]:
            J_S[i] *= len(np.unique(D[spouse]))

    vSu = np.zeros((n, n))
    for v in range(n):
        for u in range(v + 1):
            vSu[u, v] = v - u

    H = sc.special.gammaln(vSu + J_P + 1)
    H -= sc.special.gammaln(vSu + 2) + math.log(math.factorial(J_P - 1))

    # Parent table
    for p in P:
        p_dist = pd.get_dummies(D[p]).to_numpy()

        J_p = p_dist.shape[1]
        dist_table = np.reshape(np.tile(p_dist, (n, 1)), (n, n, J_p))
        tril_index = np.tril_indices(n, k=-1)
        dist_table[tril_index] = np.zeros(J_p)
        dist_table = np.cumsum(dist_table, axis=1)

        H += sc.special.gammaln(vSu + 2)
        H -= np.sum(sc.special.gammaln(dist_table + 1), axis=-1)

    # Child-Spouse table

    for i, c in enumerate(C):
        c_dist = pd.get_dummies(D[c]).to_numpy()
        n_c = len(c_dist[0, :])

        s_class: pd.Series
        if len(S[i]) > 0:
            s_class = D[S[i]].groupby(S[i]).ngroup()
        else:
            s_class = pd.Series(np.zeros(n))
        n_s_class = len(unique(s_class))

        dist_table = np.zeros((n, n_s_class, n_c), dtype=int)
        for v in range(n):
            for i_s_class in range(n_s_class):
                z = np.zeros(n_c)
                if i_s_class == s_class[v]:
                    z = c_dist[v, :]
                dist_table[v, i_s_class] = z

        intval_table = np.reshape(np.tile(dist_table, (n, 1, 1)),
                                  (n, n, n_s_class, n_c))
        tril_index = np.tril_indices(n, k=-1)
        intval_table[tril_index] = np.zeros((n_s_class, n_c))

        intval_table = np.cumsum(intval_table, axis=1)
        n_c_over_s_table = np.sum(intval_table, axis=-1)

        H += np.sum(sc.special.gammaln(n_c_over_s_table + J_C[i]), axis=-1)
        H -= math.log(math.factorial(J_C[i] - 1)) * n_s_class
        H -= np.sum(sc.special.gammaln(intval_table + 1), axis=(-1, -2))

    H = np.triu(H)
    return H
Example #24
def remove_unused_levels(self):
    """
    create a new MultiIndex from the current that removing
    unused levels, meaning that they are not expressed in the labels
    The resulting MultiIndex will have the same outward
    appearance, meaning the same .values and ordering. It will also
    be .equals() to the original.
    .. versionadded:: 0.20.0
    Returns
    -------
    MultiIndex
    Examples
    --------
    >>> i = pd.MultiIndex.from_product([range(2), list('ab')])
    MultiIndex(levels=[[0, 1], ['a', 'b']],
               codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
    >>> i[2:]
    MultiIndex(levels=[[0, 1], ['a', 'b']],
               codes=[[1, 1], [0, 1]])
    The 0 from the first level is not represented
    and can be removed
    >>> i[2:].remove_unused_levels()
    MultiIndex(levels=[[1], ['a', 'b']],
               codes=[[0, 0], [0, 1]])
    """
    import pandas.core.algorithms as algos

    new_levels = []
    new_labels = []

    changed = False
    for lev, lab in zip(self.levels, self.labels):

        # Since few levels are typically unused, bincount() is more
        # efficient than unique() - however it only accepts positive values
        # (and drops order):
        uniques = np.where(np.bincount(lab + 1) > 0)[0] - 1
        has_na = int(len(uniques) and (uniques[0] == -1))

        if len(uniques) != len(lev) + has_na:
            # We have unused levels
            changed = True

            # Recalculate uniques, now preserving order.
            # Can easily be cythonized by exploiting the already existing
            # "uniques" and stop parsing "lab" when all items are found:
            uniques = algos.unique(lab)
            if has_na:
                na_idx = np.where(uniques == -1)[0]
                # Just ensure that -1 is in first position:
                uniques[[0, na_idx[0]]] = uniques[[na_idx[0], 0]]

            # labels get mapped from uniques to 0:len(uniques)
            # -1 (if present) is mapped to last position
            label_mapping = np.zeros(len(lev) + has_na)
            # ... and reassigned value -1:
            label_mapping[uniques] = np.arange(len(uniques)) - has_na

            lab = label_mapping[lab]

            # new levels are simple
            lev = lev.take(uniques[has_na:])

        new_levels.append(lev)
        new_labels.append(lab)

    result = self._shallow_copy()

    if changed:
        result._reset_identity()
        result._set_levels(new_levels, validate=False)
        result._set_labels(new_labels, validate=False)

    return result
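Usage, matching the docstring's doctest (a minimal sketch):

import pandas as pd

mi = pd.MultiIndex.from_product([range(2), list("ab")])
trimmed = mi[2:].remove_unused_levels()
print(trimmed.levels[0].tolist())  # [1] -- the unused 0 is gone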
Example #25
    def test_objects(self):
        arr = np.random.randint(0, 100, size=50).astype('O')

        result = algos.unique(arr)
        self.assertTrue(isinstance(result, np.ndarray))
Example #26
 def unique(self):
     return type(self)(unique(self._ndarray))
Example #27
def _bins_to_cuts(x,
                  bins,
                  right=True,
                  labels=None,
                  retbins=False,
                  precision=3,
                  name=None,
                  include_lowest=False):
    x_is_series = isinstance(x, Series)
    series_index = None

    if x_is_series:
        series_index = x.index
        if name is None:
            name = x.name

    x = np.asarray(x)

    side = 'left' if right else 'right'
    ids = bins.searchsorted(x, side=side)

    if len(algos.unique(bins)) < len(bins):
        raise ValueError('Bin edges must be unique: %s' % repr(bins))

    if include_lowest:
        ids[x == bins[0]] = 1

    na_mask = com.isnull(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if labels is None:
            increases = 0
            while True:
                try:
                    levels = _format_levels(bins,
                                            precision,
                                            right=right,
                                            include_lowest=include_lowest)
                except ValueError:
                    increases += 1
                    precision += 1
                    if increases >= 20:
                        raise
                else:
                    break

        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
            levels = labels

        levels = np.asarray(levels, dtype=object)
        np.putmask(ids, na_mask, 0)
        fac = Categorical(ids - 1,
                          levels,
                          ordered=True,
                          name=name,
                          fastpath=True)
    else:
        fac = ids - 1
        if has_nas:
            fac = fac.astype(np.float64)
            np.putmask(fac, na_mask, np.nan)

    if x_is_series:
        fac = Series(fac, index=series_index)

    if not retbins:
        return fac

    return fac, bins
Example #28
    def _read_panel_table(self, group, where=None):
        table = getattr(group, 'table')
        fields = table._v_attrs.fields

        # create the selection
        sel = Selection(table, where, table._v_attrs.index_kind)
        sel.select()
        fields = table._v_attrs.fields

        columns = _maybe_convert(sel.values['column'],
                                 table._v_attrs.columns_kind)
        index = _maybe_convert(sel.values['index'], table._v_attrs.index_kind)
        values = sel.values['values']

        major = Factor.from_array(index)
        minor = Factor.from_array(columns)

        J, K = len(major.levels), len(minor.levels)
        key = major.labels * K + minor.labels

        if len(unique(key)) == len(key):
            sorter, _ = lib.groupsort_indexer(com._ensure_int64(key), J * K)
            sorter = com._ensure_platform_int(sorter)

            # the data need to be sorted
            sorted_values = values.take(sorter, axis=0)
            major_labels = major.labels.take(sorter)
            minor_labels = minor.labels.take(sorter)

            block = block2d_to_block3d(sorted_values, fields, (J, K),
                                       major_labels, minor_labels)

            mgr = BlockManager([block],
                               [block.ref_items, major.levels, minor.levels])
            wp = Panel(mgr)
        else:
            if not self._quiet:  # pragma: no cover
                print(
                    'Duplicate entries in table, taking most recently '
                    'appended')

            # reconstruct
            long_index = MultiIndex.from_arrays([index, columns])
            lp = DataFrame(values, index=long_index, columns=fields)

            # need a better algorithm
            tuple_index = long_index._tuple_index

            unique_tuples = lib.fast_unique(tuple_index)
            unique_tuples = _asarray_tuplesafe(unique_tuples)

            indexer = match(unique_tuples, tuple_index)
            indexer = com._ensure_platform_int(indexer)

            new_index = long_index.take(indexer)
            new_values = lp.values.take(indexer, axis=0)

            lp = DataFrame(new_values, index=new_index, columns=lp.columns)
            wp = lp.to_panel()

        if sel.column_filter:
            new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
            wp = wp.reindex(minor=new_minor)
        return wp
Example #29
 def unique(self) -> "PandasArray":
     return type(self)(unique(self._ndarray))
Example #30
def remove_unused_levels(self):
    """
    create a new MultiIndex from the current that removing
    unused levels, meaning that they are not expressed in the labels
    The resulting MultiIndex will have the same outward
    appearance, meaning the same .values and ordering. It will also
    be .equals() to the original.
    .. versionadded:: 0.20.0
    Returns
    -------
    MultiIndex
    Examples
    --------
    >>> i = pd.MultiIndex.from_product([range(2), list('ab')])
    MultiIndex(levels=[[0, 1], ['a', 'b']],
               labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
    >>> i[2:]
    MultiIndex(levels=[[0, 1], ['a', 'b']],
               labels=[[1, 1], [0, 1]])
    The 0 from the first level is not represented
    and can be removed
    >>> i[2:].remove_unused_levels()
    MultiIndex(levels=[[1], ['a', 'b']],
               labels=[[0, 0], [0, 1]])
    """
    import pandas.core.algorithms as algos

    new_levels = []
    new_labels = []

    changed = False
    for lev, lab in zip(self.levels, self.labels):

        # Since few levels are typically unused, bincount() is more
        # efficient than unique() - however it only accepts positive values
        # (and drops order):
        uniques = np.where(np.bincount(lab + 1) > 0)[0] - 1
        has_na = int(len(uniques) and (uniques[0] == -1))

        if len(uniques) != len(lev) + has_na:
            # We have unused levels
            changed = True

            # Recalculate uniques, now preserving order.
            # Can easily be cythonized by exploiting the already existing
            # "uniques" and stop parsing "lab" when all items are found:
            uniques = algos.unique(lab)
            if has_na:
                na_idx = np.where(uniques == -1)[0]
                # Just ensure that -1 is in first position:
                uniques[[0, na_idx[0]]] = uniques[[na_idx[0], 0]]

            # labels get mapped from uniques to 0:len(uniques)
            # -1 (if present) is mapped to last position
            label_mapping = np.zeros(len(lev) + has_na)
            # ... and reassigned value -1:
            label_mapping[uniques] = np.arange(len(uniques)) - has_na

            lab = label_mapping[lab]

            # new levels are simple
            lev = lev.take(uniques[has_na:])

        new_levels.append(lev)
        new_labels.append(lab)

    result = self._shallow_copy()

    if changed:
        result._reset_identity()
        result._set_levels(new_levels, validate=False)
        result._set_labels(new_labels, validate=False)

    return result
Example #31
def melt(
    frame: DataFrame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name="value",
    col_level=None,
    ignore_index: bool = True,
) -> DataFrame:
    # If multiindex, gather names of columns on all level for checking presence
    # of `id_vars` and `value_vars`
    if isinstance(frame.columns, MultiIndex):
        cols = [x for c in frame.columns for x in c]
    else:
        cols = list(frame.columns)

    if value_name in frame.columns:
        warnings.warn(
            "This dataframe has a column name that matches the 'value_name' column "
            "name of the resulting Dataframe. "
            "In the future this will raise an error, please set the 'value_name' "
            "parameter of DataFrame.melt to a unique name.",
            FutureWarning,
            stacklevel=3,
        )

    if id_vars is not None:
        if not is_list_like(id_vars):
            id_vars = [id_vars]
        elif isinstance(frame.columns,
                        MultiIndex) and not isinstance(id_vars, list):
            raise ValueError(
                "id_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            # Check that `id_vars` are in frame
            id_vars = list(id_vars)
            missing = Index(com.flatten(id_vars)).difference(cols)
            if not missing.empty:
                raise KeyError("The following 'id_vars' are not present "
                               f"in the DataFrame: {list(missing)}")
    else:
        id_vars = []

    if value_vars is not None:
        if not is_list_like(value_vars):
            value_vars = [value_vars]
        elif isinstance(frame.columns,
                        MultiIndex) and not isinstance(value_vars, list):
            raise ValueError(
                "value_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            value_vars = list(value_vars)
            # Check that `value_vars` are in frame
            missing = Index(com.flatten(value_vars)).difference(cols)
            if not missing.empty:
                raise KeyError("The following 'value_vars' are not present in "
                               f"the DataFrame: {list(missing)}")
        if col_level is not None:
            idx = frame.columns.get_level_values(col_level).get_indexer(
                id_vars + value_vars)
        else:
            idx = algos.unique(
                frame.columns.get_indexer_for(id_vars + value_vars))
        frame = frame.iloc[:, idx]
    else:
        frame = frame.copy()

    if col_level is not None:  # allow list or other?
        # frame is a copy
        frame.columns = frame.columns.get_level_values(col_level)

    if var_name is None:
        if isinstance(frame.columns, MultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                var_name = frame.columns.names
            else:
                var_name = [
                    f"variable_{i}" for i in range(len(frame.columns.names))
                ]
        else:
            var_name = [
                frame.columns.name
                if frame.columns.name is not None else "variable"
            ]
    if isinstance(var_name, str):
        var_name = [var_name]

    N, K = frame.shape
    K -= len(id_vars)

    mdata = {}
    for col in id_vars:
        id_data = frame.pop(col)
        if is_extension_array_dtype(id_data):
            id_data = cast("Series", concat([id_data] * K, ignore_index=True))
        else:
            id_data = np.tile(id_data._values, K)
        mdata[col] = id_data

    mcolumns = id_vars + var_name + [value_name]

    # error: Incompatible types in assignment (expression has type "ndarray",
    # target has type "Series")
    mdata[value_name] = frame._values.ravel("F")  # type: ignore[assignment]
    for i, col in enumerate(var_name):
        # asanyarray will keep the columns as an Index

        # error: Incompatible types in assignment (expression has type "ndarray", target
        # has type "Series")
        mdata[col] = np.asanyarray(  # type: ignore[assignment]
            frame.columns._get_level_values(i)).repeat(N)

    result = frame._constructor(mdata, columns=mcolumns)

    if not ignore_index:
        result.index = tile_compat(frame.index, K)

    return result
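A short usage note for the code above (algos.unique is what deduplicates the id_vars/value_vars column positions):

import pandas as pd

df = pd.DataFrame({"id": [1, 2], "x": [10, 20], "y": [30, 40]})
long_df = df.melt(id_vars="id", value_vars=["x", "y"])
# columns: id, variable, value -- one row per (id, variable) pair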
Example #32
    def test_ints(self):
        arr = np.random.randint(0, 100, size=50)

        result = algos.unique(arr)
        tm.assertIsInstance(result, np.ndarray)
Example #33
 def unique(self):
     uniques = list(algos.unique(self.sp_values))
     fill_loc = self._first_fill_value_loc()
     if fill_loc >= 0:
         uniques.insert(fill_loc, self.fill_value)
     return type(self)._from_sequence(uniques, dtype=self.dtype)
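A minimal usage sketch (assuming pandas >= 0.24, where pd.arrays.SparseArray is public): the fill value is spliced back into its first-appearance position rather than sorted in.

import pandas as pd

sp = pd.arrays.SparseArray([0, 0, 1, 2, 0])
print(sp.unique())  # [0, 1, 2] -- fill value 0 stays in first position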
Example #34
    def test_objects(self):
        arr = np.random.randint(0, 100, size=50).astype('O')

        result = algos.unique(arr)
        tm.assert_isinstance(result, np.ndarray)
Example #35
 def test_uint64_overflow(self):
     s = pd.Series([1, 2, 2**63, 2**63], dtype=np.uint64)
     exp = np.array([1, 2, 2**63], dtype=np.uint64)
     tm.assert_numpy_array_equal(algos.unique(s), exp)
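Why this test exists: 2**63 does not fit in int64, so unique needs a dedicated uint64 hash table to avoid overflow. A minimal sketch via the public API:

import numpy as np
import pandas as pd

s = pd.Series([1, 2, 2**63, 2**63], dtype=np.uint64)
print(pd.unique(s))  # [1 2 9223372036854775808], dtype uint64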
Example #36
 def unique(self: _T) -> _T:
     new_data = unique(self._ndarray)
     return self._from_backing_data(new_data)
Example #37
    def test_ints(self):
        arr = np.random.randint(0, 100, size=50)

        result = algos.unique(arr)
        tm.assertIsInstance(result, np.ndarray)