Example #1
def pivot_simple(index, columns, values):
    """
    Produce 'pivot' table based on 3 columns of this DataFrame.
    Uses unique values from index / columns and fills with values.

    Parameters
    ----------
    index : ndarray
        Labels to use to make new frame's index
    columns : ndarray
        Labels to use to make new frame's columns
    values : ndarray
        Values to use for populating new frame's values

    Note
    ----
    All 3 of the input arguments must have the same length.

    Returns
    -------
    DataFrame
    """
    if (len(index) != len(columns)) or (len(columns) != len(values)):
        raise AssertionError('Length of index, columns, and values must be the'
                             ' same')

    if len(index) == 0:
        return DataFrame(index=[])

    hindex = MultiIndex.from_arrays([index, columns])
    series = Series(values.ravel(), index=hindex)
    series = series.sortlevel(0)
    return series.unstack()
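A minimal usage sketch of pivot_simple (an editor's illustration, not from the source project; on current pandas, sortlevel has been renamed sort_index, but the MultiIndex/unstack path is unchanged):

import numpy as np
from pandas import MultiIndex, Series

index = np.array(['r1', 'r1', 'r2', 'r2'])
columns = np.array(['c1', 'c2', 'c1', 'c2'])
values = np.array([1.0, 2.0, 3.0, 4.0])

# Same steps as pivot_simple, with sortlevel(0) spelled sort_index(level=0)
hindex = MultiIndex.from_arrays([index, columns])
frame = Series(values.ravel(), index=hindex).sort_index(level=0).unstack()
#      c1   c2
# r1  1.0  2.0
# r2  3.0  4.0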
Example #2
def value_counts(values, sort=True, ascending=False):
    """
    Compute a histogram of the counts of non-null values

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series
    from collections import defaultdict
    if com.is_integer_dtype(values.dtype):
        values = com._ensure_int64(values)
        keys, counts = lib.value_count_int64(values)
        result = Series(counts, index=keys)
    else:
        counter = defaultdict(lambda: 0)
        values = values[com.notnull(values)]
        for value in values:
            counter[value] += 1
        result = Series(counter)

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    return result
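For comparison, a short sketch of the equivalent call through the public API (assuming any reasonably recent pandas; Series.value_counts is backed by helpers like the one above):

import numpy as np
from pandas import Series

s = Series([2, 1, 2, 2, np.nan])
s.value_counts()  # counts of non-null values, sorted descending by count
# 2.0    3
# 1.0    1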
Example #3
def get_sysprice_list(start_time, end_time, frequency='hourly'):
    '''Wrapper function for creating pandas Series object from data 
    received from the database.
    Returns: pandas Series object with Elspot daily system prices and 
    corresponding dates for predefined time period.
    Parameters:
    start_time - string representing the start of the time period, 
                format must be 'yyyy-mm-dd'.
    end_time - string representing the end of the time period, 
                format must be 'yyyy-mm-dd'.
    frequency - string representing the frequency of the output pandas 
                Series object. Currently must be one of ['hourly', 'daily']
    '''
    # Retrieve hourly system prices and timestamps from database as lists
    _, sys_prices, times = get_system_price_volume(start_time, end_time)

    ts = Series(sys_prices, index=times)

    if frequency == 'hourly':
        # Resampling is not necessary
        return ts
    elif frequency == 'daily':
        return ts.resample('D', how='mean', kind='timestamp')
    else:
        raise ValueError("frequency must be one of ['hourly', 'daily']")
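The resampling step in isolation, as a hedged sketch (get_system_price_volume is project-specific and unavailable here; on current pandas the how= keyword is gone and the aggregation is called explicitly):

import pandas as pd

times = pd.date_range('2013-01-01', periods=48, freq='H')
ts = pd.Series(range(48), index=times)
daily = ts.resample('D').mean()  # modern spelling of resample('D', how='mean')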
Example #4
File: matrix.py Project: pedrot/pandas
    def xs(self, key):
        """
        Returns a row from the DataMatrix as a Series object.

        Parameters
        ----------
        key : some index contained in the index

        Returns
        -------
        Series
        """
        if key not in self.index:
            raise Exception('No cross-section for %s' % key)

        loc = self.index.indexMap[key]
        theSlice = self.values[loc, :].copy()
        xsIndex = self.columns

        result = Series(theSlice, index=xsIndex)

        if self.objects is not None and len(self.objects.columns) > 0:
            result = result.append(self.objects.getXS(key))

        return result
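The same row cross-section through today's public API, as a small sketch (DataMatrix was an early pandas class later folded into DataFrame):

import pandas as pd

df = pd.DataFrame({'x': [1, 2], 'y': [3.0, 4.0]}, index=['r1', 'r2'])
row = df.xs('r1')  # returns the row as a Series indexed by the columns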
Example #5
File: frame.py Project: sechilds/pandas
    def _init_spmatrix(self, data, index, columns, dtype=None,
                       fill_value=None):
        """ Init self from scipy.sparse matrix """
        index, columns = self._prep_index(data, index, columns)
        data = data.tocoo()
        N = len(index)

        # Construct a dict of SparseSeries
        sdict = {}
        values = Series(data.data, index=data.row, copy=False)
        for col, rowvals in values.groupby(data.col):
            # get_blocks expects int32 row indices in sorted order
            rowvals = rowvals.sort_index()
            rows = rowvals.index.values.astype(np.int32)
            blocs, blens = get_blocks(rows)

            sdict[columns[col]] = SparseSeries(
                rowvals.values, index=index,
                fill_value=fill_value,
                sparse_index=BlockIndex(N, blocs, blens))

        # Add any columns that were empty and thus not grouped on above
        sdict.update({column: SparseSeries(index=index,
                                           fill_value=fill_value,
                                           sparse_index=BlockIndex(N, [], []))
                      for column in columns
                      if column not in sdict})

        return self._init_dict(sdict, index, columns, dtype)
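A small sketch of what get_blocks computes for the BlockIndex above: run starts and run lengths over a sorted integer row index (get_blocks is internal to pandas; blocs and blens are just illustrative names):

import numpy as np

rows = np.array([0, 1, 2, 5, 6, 9], dtype=np.int32)
breaks = np.flatnonzero(np.diff(rows) != 1) + 1  # positions where a new run begins
runs = np.split(rows, breaks)
blocs = [int(r[0]) for r in runs]  # [0, 5, 9]  block start locations
blens = [len(r) for r in runs]     # [3, 2, 1]  block lengths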
Example #6
def value_counts(values, sort=True, ascending=False):
    """
    Compute a histogram of the counts of non-null values

    Returns
    -------
    value_counts : Series
    """
    from collections import defaultdict
    if com.is_integer_dtype(values.dtype):
        values = com._ensure_int64(values)
        keys, counts = lib.value_count_int64(values)
        result = Series(counts, index=keys)
    else:
        counter = defaultdict(lambda: 0)
        values = values[com.notnull(values)]
        for value in values:
            counter[value] += 1
        result = Series(counter)

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    return result
Example #7
File: panel.py Project: GunioRobot/pandas
def pivot(index, columns, values):
    """
    Produce 'pivot' table based on 3 columns of this DataFrame.
    Uses unique values from index / columns and fills with values.

    Parameters
    ----------
    index : ndarray
        Labels to use to make new frame's index
    columns : ndarray
        Labels to use to make new frame's columns
    values : ndarray
        Values to use for populating new frame's values

    Note
    ----
    All 3 of the input arguments must have the same length.

    Returns
    -------
    DataFrame
    """
    assert(len(index) == len(columns) == len(values))

    if len(index) == 0:
        return DataFrame(index=[])

    hindex = _make_long_index(index, columns)

    series = Series(values.ravel(), index=hindex)
    series = series.sortlevel(0)
    return series.unstack()
Example #8
def match(to_match, values, na_sentinel=-1):
    """
    Compute locations of to_match into values

    Parameters
    ----------
    to_match : array-like
        values to find positions of
    values : array-like
        Unique set of values
    na_sentinel : int, default -1
        Value to mark "not found"

    Returns
    -------
    match : ndarray of integers
    """
    values = com._asarray_tuplesafe(values)
    if issubclass(values.dtype.type, string_types):
        values = np.array(values, dtype='O')

    f = lambda htype, caster: _match_generic(to_match, values, htype, caster)
    result = _hashtable_algo(f, values.dtype)

    if na_sentinel != -1:

        # replace but return a numpy array
        # use a Series because it handles dtype conversions properly
        from pandas.core.series import Series
        result = Series(result.ravel()).replace(
            -1, na_sentinel).values.reshape(result.shape)

    return result
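The same "positions of to_match within values" lookup is available on the public Index API; a minimal sketch:

import pandas as pd

values = pd.Index(['a', 'b', 'c'])    # unique set of values
values.get_indexer(['b', 'x', 'a'])   # array([ 1, -1,  0]); -1 marks "not found"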
Example #9
    def test_groupby_categorical_unequal_len(self):
        # GH3011
        series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4])
        # The error is only raised with a plain Categorical, not with a
        # Series of dtype category
        bins = pd.cut(series.dropna().values, 4)

        # len(bins) != len(series) here
        self.assertRaises(ValueError, lambda: series.groupby(bins).mean())
Example #10
def _value_counts_arraylike(values, dropna=True):
    is_datetimetz_type = is_datetimetz(values)
    is_period_type = (is_period_dtype(values) or
                      is_period_arraylike(values))

    orig = values

    from pandas.core.series import Series
    values = Series(values).values
    dtype = values.dtype

    if needs_i8_conversion(dtype) or is_period_type:

        from pandas.tseries.index import DatetimeIndex
        from pandas.tseries.period import PeriodIndex

        if is_period_type:
            # values may be an object
            values = PeriodIndex(values)
            freq = values.freq

        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values, dropna)

        if dropna:
            msk = keys != iNaT
            keys, counts = keys[msk], counts[msk]

        # convert the keys back to the dtype we came in
        keys = keys.astype(dtype)

        # dtype handling
        if is_datetimetz_type:
            keys = DatetimeIndex._simple_new(keys, tz=orig.dtype.tz)
        if is_period_type:
            keys = PeriodIndex._simple_new(keys, freq=freq)

    elif is_signed_integer_dtype(dtype):
        values = _ensure_int64(values)
        keys, counts = htable.value_count_int64(values, dropna)
    elif is_unsigned_integer_dtype(dtype):
        values = _ensure_uint64(values)
        keys, counts = htable.value_count_uint64(values, dropna)
    elif is_float_dtype(dtype):
        values = _ensure_float64(values)
        keys, counts = htable.value_count_float64(values, dropna)
    else:
        values = _ensure_object(values)
        keys, counts = htable.value_count_object(values, dropna)

        mask = isnull(values)
        if not dropna and mask.any():
            keys = np.insert(keys, 0, np.NaN)
            counts = np.insert(counts, 0, mask.sum())

    return keys, counts
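A sketch of the same keys/counts pairing seen from the public API, including the dropna handling (assuming a recent pandas):

import numpy as np
from pandas import Series

s = Series([1.0, 1.0, np.nan])
s.value_counts(dropna=False)
# 1.0    2
# NaN    1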
Example #11
File: reshape.py Project: tlperkins/pandas
def pivot(self, index=None, columns=None, values=None):
    """
    See DataFrame.pivot
    """
    if values is None:
        indexed = self.set_index([index, columns])
        return indexed.unstack(columns)
    else:
        indexed = Series(self[values], index=[self[index], self[columns]])
        return indexed.unstack(columns)
Example #12
File: groupby.py Project: cournape/pandas
 def size(self):
     """
     Compute group sizes
     """
     # TODO: better impl
     labels, _, ngroups = self.group_info
     bin_counts = Series(labels).value_counts()
     bin_counts = bin_counts.reindex(np.arange(ngroups))
     bin_counts.index = self.result_index
     return bin_counts
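What size computes, seen from the user side (a sketch):

import pandas as pd

s = pd.Series([10, 20, 30], index=['a', 'a', 'b'])
s.groupby(level=0).size()
# a    2
# b    1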
Example #13
File: algorithms.py Project: jcfr/pandas
def _value_counts_arraylike(values, dropna=True):
    is_datetimetz = com.is_datetimetz(values)
    is_period = (isinstance(values, gt.ABCPeriodIndex) or
                 com.is_period_arraylike(values))

    orig = values

    from pandas.core.series import Series
    values = Series(values).values
    dtype = values.dtype

    if com.is_datetime_or_timedelta_dtype(dtype) or is_period:
        from pandas.tseries.index import DatetimeIndex
        from pandas.tseries.period import PeriodIndex

        if is_period:
            values = PeriodIndex(values)
            freq = values.freq

        values = values.view(np.int64)
        keys, counts = htable.value_count_scalar64(values, dropna)

        if dropna:
            msk = keys != iNaT
            keys, counts = keys[msk], counts[msk]

        # convert the keys back to the dtype we came in
        keys = keys.astype(dtype)

        # dtype handling
        if is_datetimetz:
            if isinstance(orig, gt.ABCDatetimeIndex):
                tz = orig.tz
            else:
                tz = orig.dt.tz
            keys = DatetimeIndex._simple_new(keys, tz=tz)
        if is_period:
            keys = PeriodIndex._simple_new(keys, freq=freq)

    elif com.is_integer_dtype(dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_scalar64(values, dropna)
    elif com.is_float_dtype(dtype):
        values = com._ensure_float64(values)
        keys, counts = htable.value_count_scalar64(values, dropna)
    else:
        values = com._ensure_object(values)
        mask = com.isnull(values)
        keys, counts = htable.value_count_object(values, mask)
        if not dropna and mask.any():
            keys = np.insert(keys, 0, np.NaN)
            counts = np.insert(counts, 0, mask.sum())

    return keys, counts
Example #14
def _coo_to_sparse_series(A, dense_index=False):
    """ Convert a scipy.sparse.coo_matrix to a SparseSeries.
    Use the defaults given in the SparseSeries constructor. """
    s = Series(A.data, MultiIndex.from_arrays((A.row, A.col)))
    s = s.sort_index()
    s = s.to_sparse()  # TODO: specify kind?
    if dense_index:
        # is there a better constructor method to use here?
        i = range(A.shape[0])
        j = range(A.shape[1])
        ind = MultiIndex.from_product([i, j])
        s = s.reindex_axis(ind)
    return s
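The MultiIndex construction in isolation, as a sketch (assumes scipy is installed; this is not the library's own test code):

from scipy import sparse
from pandas import MultiIndex, Series

A = sparse.coo_matrix(([3.0, 1.0, 2.0], ([1, 0, 0], [0, 2, 3])), shape=(3, 4))
s = Series(A.data, index=MultiIndex.from_arrays((A.row, A.col))).sort_index()
# (0, 2)    1.0
# (0, 3)    2.0
# (1, 0)    3.0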
Example #15
    def test_filter_against_workaround(self):
        np.random.seed(0)
        # Series of ints
        s = Series(np.random.randint(0, 100, 1000))
        grouper = s.apply(lambda x: np.round(x, -1))
        grouped = s.groupby(grouper)
        f = lambda x: x.mean() > 10

        old_way = s[grouped.transform(f).astype('bool')]
        new_way = grouped.filter(f)
        assert_series_equal(new_way.sort_values(), old_way.sort_values())

        # Series of floats
        s = 100 * Series(np.random.random(1000))
        grouper = s.apply(lambda x: np.round(x, -1))
        grouped = s.groupby(grouper)
        f = lambda x: x.mean() > 10
        old_way = s[grouped.transform(f).astype('bool')]
        new_way = grouped.filter(f)
        assert_series_equal(new_way.sort_values(), old_way.sort_values())

        # Set up DataFrame of ints, floats, strings.
        from string import ascii_lowercase
        letters = np.array(list(ascii_lowercase))
        N = 1000
        random_letters = letters.take(np.random.randint(0, 26, N))
        df = DataFrame({'ints': Series(np.random.randint(0, 100, N)),
                        'floats': N / 10 * Series(np.random.random(N)),
                        'letters': Series(random_letters)})

        # Group by ints; filter on floats.
        grouped = df.groupby('ints')
        old_way = df[grouped.floats.
                     transform(lambda x: x.mean() > N / 20).astype('bool')]
        new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20)
        assert_frame_equal(new_way, old_way)

        # Group by floats (rounded); filter on strings.
        grouper = df.floats.apply(lambda x: np.round(x, -1))
        grouped = df.groupby(grouper)
        old_way = df[grouped.letters.
                     transform(lambda x: len(x) < N / 10).astype('bool')]
        new_way = grouped.filter(lambda x: len(x.letters) < N / 10)
        assert_frame_equal(new_way, old_way)

        # Group by strings; filter on ints.
        grouped = df.groupby('letters')
        old_way = df[grouped.ints.
                     transform(lambda x: x.mean() > N / 20).astype('bool')]
        new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20)
        assert_frame_equal(new_way, old_way)
Example #16
        def _get_index_subset_to_coord_dict(index, subset, sort_labels=False):
            ilabels = list(zip(*[index._get_level_values(i) for i in subset]))
            labels_to_i = _get_label_to_i_dict(ilabels,
                                               sort_labels=sort_labels)
            labels_to_i = Series(labels_to_i)
            if len(subset) > 1:
                labels_to_i.index = MultiIndex.from_tuples(labels_to_i.index)
                labels_to_i.index.names = [index.names[i] for i in subset]
            else:
                labels_to_i.index = Index(x[0] for x in labels_to_i.index)
                labels_to_i.index.name = index.names[subset[0]]

            labels_to_i.name = 'value'
            return (labels_to_i)
Example #17
def value_counts(values, sort=True, ascending=False, normalize=False):
    """
    Compute a histogram of the counts of non-null values

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize: boolean, default False
        If True then compute a relative histogram

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series

    values = np.asarray(values)

    if com.is_integer_dtype(values.dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_int64(values)
    elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):

        dtype = values.dtype
        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values)

        # convert the keys back to the dtype we came in
        keys = Series(keys, dtype=dtype)
    else:
        mask = com.isnull(values)
        values = com._ensure_object(values)
        keys, counts = htable.value_count_object(values, mask)

    result = Series(counts, index=keys)

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    if normalize:
        result = result / float(values.size)

    return result
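normalize=True in the public API, as a quick sketch:

from pandas import Series

Series(['a', 'a', 'b']).value_counts(normalize=True)
# a    0.666667
# b    0.333333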
Example #18
 def __repr__(self):
     with warnings.catch_warnings():
         warnings.filterwarnings("ignore", "Sparse")
         series_rep = Series.__repr__(self)
         rep = '{series}\n{index!r}'.format(series=series_rep,
                                            index=self.sp_index)
         return rep
Example #19
File: groupby.py Project: hammer/pandas
def sort_group_labels(ids, labels, counts):
    n = len(ids)
    rng = np.arange(n)
    values = Series(ids, index=rng, dtype=object).values
    indexer = values.argsort()

    reverse_indexer = np.empty(n, dtype=np.int32)
    reverse_indexer.put(indexer, np.arange(n))

    new_labels = reverse_indexer.take(labels)
    np.putmask(new_labels, labels == -1, -1)

    new_ids = dict(izip(rng, values.take(indexer)))
    new_counts = counts.take(indexer)

    return new_ids, new_labels, new_counts
Example #20
File: reshape.py Project: aechase/pandas
def pivot(self, index=None, columns=None, values=None):
    """
    See DataFrame.pivot
    """
    if values is None:
        cols = [columns] if index is None else [index, columns]
        append = index is None
        indexed = self.set_index(cols, append=append)
        return indexed.unstack(columns)
    else:
        if index is None:
            index = self.index
        else:
            index = self[index]
        indexed = Series(self[values].values,
                         index=MultiIndex.from_arrays([index, self[columns]]))
        return indexed.unstack(columns)
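The user-facing call this helper backs, as a minimal sketch:

import pandas as pd

df = pd.DataFrame({'row': ['r1', 'r1', 'r2'],
                   'col': ['c1', 'c2', 'c1'],
                   'val': [1, 2, 3]})
df.pivot(index='row', columns='col', values='val')
# col   c1   c2
# row
# r1   1.0  2.0
# r2   3.0  NaN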
Example #21
 def _get_index_subset_to_coord_dict(index, subset, sort_labels=False):
     def robust_get_level_values(i):
         # if index has labels (that are not None) use those,
         # else use the level location
         try:
             return(index.get_level_values(index.names[i]))
         except KeyError:
             return(index.get_level_values(i))
     ilabels = list(
         zip(*[robust_get_level_values(i) for i in subset]))
     labels_to_i = _get_label_to_i_dict(
         ilabels, sort_labels=sort_labels)
     labels_to_i = Series(labels_to_i)
     labels_to_i.index = MultiIndex.from_tuples(labels_to_i.index)
     labels_to_i.index.names = [index.names[i] for i in subset]
     labels_to_i.name = 'value'
     return(labels_to_i)
Example #22
    def test_groupby_categorical_no_compress(self):
        data = Series(np.random.randn(9))

        codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
        cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True)

        result = data.groupby(cats).mean()
        exp = data.groupby(codes).mean()

        exp.index = CategoricalIndex(exp.index, categories=cats.categories,
                                     ordered=cats.ordered)
        assert_series_equal(result, exp)

        codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
        cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True)

        result = data.groupby(cats).mean()
        exp = data.groupby(codes).mean().reindex(cats.categories)
        exp.index = CategoricalIndex(exp.index, categories=cats.categories,
                                     ordered=cats.ordered)
        assert_series_equal(result, exp)

        cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"],
                           categories=["a", "b", "c", "d"], ordered=True)
        data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

        result = data.groupby("b").mean()
        result = result["a"].values
        exp = np.array([1, 2, 4, np.nan])
        self.assert_numpy_array_equal(result, exp)
Example #23
def init_dict(data, index, columns, dtype=None):
    """
    Segregate Series based on type and coerce into matrices.
    Needs to handle a lot of exceptional cases.
    """
    if columns is not None:
        from pandas.core.series import Series
        arrays = Series(data, index=columns, dtype=object)
        data_names = arrays.index

        missing = arrays.isnull()
        if index is None:
            # GH10856
            # raise ValueError if only scalars in dict
            index = extract_index(arrays[~missing])
        else:
            index = ensure_index(index)

        # no obvious "empty" int column
        if missing.any() and not is_integer_dtype(dtype):
            if dtype is None or np.issubdtype(dtype, np.flexible):
                # GH#1783
                nan_dtype = object
            else:
                nan_dtype = dtype
            val = construct_1d_arraylike_from_scalar(np.nan, len(index),
                                                     nan_dtype)
            arrays.loc[missing] = [val] * missing.sum()

    else:

        for key in data:
            if (isinstance(data[key], ABCDatetimeIndex) and
                    data[key].tz is not None):
                # GH#24096 need copy to be deep for datetime64tz case
                # TODO: See if we can avoid these copies
                data[key] = data[key].copy(deep=True)

        keys = com.dict_keys_to_ordered_list(data)
        columns = data_names = Index(keys)
        arrays = [data[k] for k in keys]

    return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
Example #24
def _coo_to_sparse_series(A, dense_index: bool = False,
                          sparse_series: bool = True):
    """
    Convert a scipy.sparse.coo_matrix to a SparseSeries.

    Parameters
    ----------
    A : scipy.sparse.coo.coo_matrix
    dense_index : bool, default False
    sparse_series : bool, default True

    Returns
    -------
    Series or SparseSeries

    Raises
    ------
    TypeError if A is not a coo_matrix

    """
    from pandas import SparseDtype

    try:
        s = Series(A.data, MultiIndex.from_arrays((A.row, A.col)))
    except AttributeError:
        raise TypeError('Expected coo_matrix. Got {} instead.'
                        .format(type(A).__name__))
    s = s.sort_index()
    if sparse_series:
        # TODO(SparseSeries): remove this and the sparse_series keyword.
        # This is just here to avoid a DeprecationWarning when
        # _coo_to_sparse_series is called via Series.sparse.from_coo
        s = s.to_sparse()  # TODO: specify kind?
    else:
        s = s.astype(SparseDtype(s.dtype))
    if dense_index:
        # is there a better constructor method to use here?
        i = range(A.shape[0])
        j = range(A.shape[1])
        ind = MultiIndex.from_product([i, j])
        s = s.reindex(ind)
    return s
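On builds where the Series.sparse accessor exists, the public entry point that routes through this helper can be sketched as follows (assumes scipy is installed):

import pandas as pd
from scipy import sparse

A = sparse.coo_matrix(([1.0, 2.0], ([0, 1], [1, 0])), shape=(2, 2))
ss = pd.Series.sparse.from_coo(A)  # sparse-dtype Series indexed by (row, col)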
Example #25
File: models.py Project: OspreyX/pytrader
    def plot(self):
        """
            Plots 2 graphs. One for N-period moving average, lower and upper bands.
            One for P/N and position.
        """

        columns = {"Upper Bands": self.upper_bands,
                   "Lower Bands": self.lower_bands,
                   "Moving Means": self.moving_means,
                   "Opening Prices": self.prices}
        df = DataFrame(columns, index=self.dates)
        df.plot()

        fig = plt.figure(num=None, figsize=(18, 10), dpi=80, facecolor='w', edgecolor='k')
        fig.add_subplot(121)
        trans_dates = [tran.date for tran in self.transactions]
        # we negate the value here to show profit/loss
        trans = Series([-tran.value() for tran in self.transactions], index=trans_dates)
        position = Series([tran.units for tran in self.transactions], index=trans_dates)

        position.cumsum().plot(label="Position")
        plt.xlabel("Date")
        plt.ylabel("Position")
        plt.title("Position over Time")
        plt.legend(loc="best")

        fig.add_subplot(122)
        trans.cumsum().plot(label="P/L")
        plt.xlabel("Date")
        plt.ylabel("Profit/Loss")
        plt.title("Profit and Loss over Time")
        plt.legend(loc="best")

        plt.show()
Example #26
File: util.py Project: martin1/thesis
def get_sysprice_list(start_time, end_time, frequency='hourly'):
    '''Wrapper function for creating pandas Series object from data 
    received from the database.
    Returns: pandas Series object with Elspot daily system prices and 
    corresponding dates for predefined time period.
    Parameters:
    start_time - string representing the start of the time period, 
                format must be 'yyyy-mm-dd'.
    end_time - string representing the end of the time period, 
                format must be 'yyyy-mm-dd'.
    frequency - string representing the frequency of the output pandas 
                Series object. Currently must be one of ['hourly', 'daily']
    '''
    # Retrieve hourly system prices and timestamps from database as lists
    _, sys_prices, times = get_system_price_volume(start_time, end_time)

    ts = Series(sys_prices, index=times)

    if frequency == 'hourly':
        # Resampling is not necessary
        return ts
    elif frequency == 'daily':
        resampling_frequency = 'D'
    # Weekly functionality not needed for now:
    # elif frequency == 'weekly':
    #     start_time = datetime.datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S')
    #     end_time = datetime.datetime.strptime(end_time, '%Y-%m-%d %H:%M:%S')
    #     if start_time.date().weekday() != 0:
    #         raise ValueError(str(start_time.date()) + " is a " +
    #                          start_time.date().strftime('%A') +
    #                          ". start_date must be a Monday.")
    #     if end_time.date().weekday() != 6:
    #         raise ValueError(str(end_time.date()) + " is a " +
    #                          end_time.date().strftime('%A') +
    #                          ". end_date must be a Sunday.")
    #     resampling_frequency = 'W'
    elif frequency == 'monthly':
        resampling_frequency = 'M'
    else:
        raise ValueError("frequency must be one of "
                         "['hourly', 'daily', 'monthly']")
    return ts.resample(resampling_frequency, how='mean', kind='timestamp')
Example #27
File: algorithms.py Project: mrorii/pandas
def value_counts(values, sort=True, ascending=False):
    """
    Compute a histogram of the counts of non-null values

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series

    values = np.asarray(values)

    if com.is_integer_dtype(values.dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_int64(values)
    else:
        mask = com.isnull(values)
        values = com._ensure_object(values)
        keys, counts = htable.value_count_object(values, mask)

    result = Series(counts, index=keys)

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    return result
Example #28
File: series.py Project: Axik/pandas
 def __unicode__(self):
     # currently, unicode is same as repr...fixes infinite loop
     series_rep = Series.__unicode__(self)
     rep = '{series}\n{index!r}'.format(series=series_rep,
                                        index=self.sp_index)
     return rep
Example #29
def value_counts(values,
                 sort=True,
                 ascending=False,
                 normalize=False,
                 bins=None,
                 dropna=True):
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize: boolean, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : boolean, default True
        Don't include counts of NaN

    Returns
    -------
    value_counts : Series

    """
    from pandas.core.series import Series
    name = getattr(values, 'name', None)

    if bins is not None:
        try:
            from pandas.tools.tile import cut
            values = Series(values).values
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        values = cat.codes

    if is_extension_type(values) and not is_datetimetz(values):
        # handle Categorical and sparse,
        # datetime tz can be handled in the ndarray path
        result = Series(values).values.value_counts(dropna=dropna)
        result.name = name
        counts = result.values
    else:
        # ndarray path. pass original to handle DatetimeTzBlock
        keys, counts = _value_counts_arraylike(values, dropna=dropna)

        from pandas import Index, Series
        if not isinstance(keys, Index):
            keys = Index(keys)
        result = Series(counts, index=keys, name=name)

    if bins is not None:
        # TODO: This next line should be more efficient
        result = result.reindex(np.arange(len(cat.categories)), fill_value=0)
        result.index = bins[:-1]

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        result = result / float(counts.sum())

    return result
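bins= groups numeric values into half-open intervals before counting; a short sketch of the public call:

from pandas import Series

Series([1, 2, 2, 9]).value_counts(bins=2)
# (0.992, 5.0]    3
# (5.0, 9.0]      1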
Example #30
    def aggregate(self, func_or_funcs, *args, **kwargs):
        """
        Apply an aggregation function or functions to groups, most likely
        yielding a Series, but in some cases a DataFrame, depending on the
        output of the aggregation function

        Parameters
        ----------
        func_or_funcs : function or list / dict of functions
            List/dict of functions will produce DataFrame with column names
            determined by the function names themselves (list) or the keys in
            the dict

        Notes
        -----
        agg is an alias for aggregate. Use it.

        Example
        -------
        >>> series
        bar    1.0
        baz    2.0
        qot    3.0
        qux    4.0

        >>> mapper = lambda x: x[0] # first letter
        >>> grouped = series.groupby(mapper)

        >>> grouped.aggregate(np.sum)
        b    3.0
        q    7.0

        >>> grouped.aggregate([np.sum, np.mean, np.std])
           mean  std  sum
        b  1.5   0.5  3
        q  3.5   0.5  7

        >>> grouped.agg({'result' : lambda x: x.mean() / x.std(),
        ...              'total' : np.sum})
           result  total
        b  2.121   3
        q  4.95    7

        See also
        --------
        apply, transform

        Returns
        -------
        Series or DataFrame
        """
        if isinstance(func_or_funcs, basestring):
            return getattr(self, func_or_funcs)(*args, **kwargs)

        if hasattr(func_or_funcs, '__iter__'):
            ret = self._aggregate_multiple_funcs(func_or_funcs)
        else:
            if len(self.grouper.groupings) > 1:
                return self._python_agg_general(func_or_funcs, *args, **kwargs)

            try:
                return self._python_agg_general(func_or_funcs, *args, **kwargs)
            except Exception:
                result = self._aggregate_named(func_or_funcs, *args, **kwargs)

            index = Index(sorted(result), name=self.grouper.names[0])
            ret = Series(result, index=index)

        if not self.as_index:  # pragma: no cover
            print 'Warning, ignoring as_index=False'

        return ret
Example #31
File: series.py Project: weilinear/pandas
    def __init__(self, data, index=None, sparse_index=None, kind='block',
                 fill_value=None, name=None, dtype=None, copy=False,
                 fastpath=False):

        # we are called internally, so short-circuit
        if fastpath:

            # data is an ndarray, index is defined
            data = SingleBlockManager(data, index, fastpath=True)
            if copy:
                data = data.copy()
        else:

            is_sparse_array = isinstance(data, SparseArray)
            if fill_value is None:
                if is_sparse_array:
                    fill_value = data.fill_value
                else:
                    fill_value = nan

            if is_sparse_array:
                if isinstance(data, SparseSeries) and index is None:
                    index = data.index
                elif index is not None:
                    assert(len(index) == len(data))

                sparse_index = data.sp_index
                data = np.asarray(data)

            elif isinstance(data, SparseSeries):
                if index is None:
                    index = data.index

                # extract the SingleBlockManager
                data = data._data

            elif isinstance(data, (Series, dict)):
                if index is None:
                    index = data.index

                data = Series(data)
                data, sparse_index = make_sparse(data, kind=kind,
                                                 fill_value=fill_value)

            elif isinstance(data, (tuple, list, np.ndarray)):
                # array-like
                if sparse_index is None:
                    data, sparse_index = make_sparse(data, kind=kind,
                                                     fill_value=fill_value)
                else:
                    assert(len(data) == sparse_index.npoints)

            elif isinstance(data, SingleBlockManager):
                if dtype is not None:
                    data = data.astype(dtype)
                if index is None:
                    index = data.index
                else:
                    data = data.reindex(index, copy=False)

            else:

                length = len(index)

                if data == fill_value or (isnull(data)
                                          and isnull(fill_value)):
                    if kind == 'block':
                        sparse_index = BlockIndex(length, [], [])
                    else:
                        sparse_index = IntIndex(length, [])
                    data = np.array([])

                else:
                    if kind == 'block':
                        locs, lens = ([0], [length]) if length else ([], [])
                        sparse_index = BlockIndex(length, locs, lens)
                    else:
                        sparse_index = IntIndex(length, index)
                    v = data
                    data = np.empty(length)
                    data.fill(v)

            if index is None:
                index = com._default_index(sparse_index.length)
            index = _ensure_index(index)

            # create/copy the manager
            if isinstance(data, SingleBlockManager):

                if copy:
                    data = data.copy()
            else:

                # create a sparse array
                if not isinstance(data, SparseArray):
                    data = SparseArray(data, sparse_index=sparse_index,
                                       fill_value=fill_value, dtype=dtype,
                                       copy=copy)

                data = SingleBlockManager(data, index)

        generic.NDFrame.__init__(self, data)

        self.index = index
        self.name = name
Example #32
    def _get_empty_meta(self,
                        columns,
                        index_col,
                        index_names,
                        dtype: DtypeArg | None = None):
        columns = list(columns)

        # Convert `dtype` to a defaultdict of some kind.
        # This will enable us to write `dtype[col_name]`
        # without worrying about KeyError issues later on.
        if not is_dict_like(dtype):
            # if dtype == None, default will be object.
            default_dtype = dtype or object
            # error: Argument 1 to "defaultdict" has incompatible type "Callable[[],
            # Union[ExtensionDtype, str, dtype[Any], Type[object], Dict[Hashable,
            # Union[ExtensionDtype, Union[str, dtype[Any]], Type[str], Type[float],
            # Type[int], Type[complex], Type[bool], Type[object]]]]]"; expected
            # "Optional[Callable[[], Union[ExtensionDtype, str, dtype[Any],
            # Type[object]]]]"
            # error: Incompatible return value type (got "Union[ExtensionDtype, str,
            # dtype[Any], Type[object], Dict[Hashable, Union[ExtensionDtype, Union[str,
            # dtype[Any]], Type[str], Type[float], Type[int], Type[complex], Type[bool],
            # Type[object]]]]", expected "Union[ExtensionDtype, str, dtype[Any],
            # Type[object]]")
            dtype = defaultdict(
                lambda: default_dtype  # type: ignore[arg-type, return-value]
            )
        else:
            dtype = cast(dict, dtype)
            dtype = defaultdict(
                lambda: object,
                {
                    columns[k] if is_integer(k) else k: v
                    for k, v in dtype.items()
                },
            )

        # Even though we have no data, the "index" of the empty DataFrame
        # could for example still be an empty MultiIndex. Thus, we need to
        # check whether we have any index columns specified, via either:
        #
        # 1) index_col (column indices)
        # 2) index_names (column names)
        #
        # Both must be non-null to ensure a successful construction. Otherwise,
        # we have to create a generic empty Index.
        if (index_col is None or index_col is False) or index_names is None:
            index = Index([])
        else:
            data = [Series([], dtype=dtype[name]) for name in index_names]
            index = ensure_index_from_sequences(data, names=index_names)
            index_col.sort()

            for i, n in enumerate(index_col):
                columns.pop(n - i)

        col_dict = {
            col_name: Series([], dtype=dtype[col_name])
            for col_name in columns
        }

        return index, columns, col_dict
Example #33
File: algorithms.py Project: t1c1/pandas
def value_counts(values,
                 sort=True,
                 ascending=False,
                 normalize=False,
                 bins=None):
    """
    Compute a histogram of the counts of non-null values

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order
    normalize: boolean, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data

    Returns
    -------
    value_counts : Series

    """
    from pandas.core.series import Series
    from pandas.tools.tile import cut

    values = Series(values).values

    if bins is not None:
        try:
            cat, bins = cut(values, bins, retbins=True)
        except TypeError:
            raise TypeError("bins argument only works with numeric data.")
        values = cat.labels

    if com.is_integer_dtype(values.dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_int64(values)

    elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)):
        dtype = values.dtype
        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values)

        # convert the keys back to the dtype we came in
        keys = Series(keys, dtype=dtype)

    else:
        mask = com.isnull(values)
        values = com._ensure_object(values)
        keys, counts = htable.value_count_object(values, mask)

    result = Series(counts, index=com._values_from_object(keys))

    if bins is not None:
        # TODO: This next line should be more efficient
        result = result.reindex(np.arange(len(cat.levels)), fill_value=0)
        result.index = bins[:-1]

    if sort:
        result.sort()
        if not ascending:
            result = result[::-1]

    if normalize:
        result = result / float(values.size)

    return result
Example #34
def str_extract(arr, pat, flags=0):
    """
    Find groups in each string using passed regular expression

    Parameters
    ----------
    pat : string
        Pattern or regular expression
    flags : int, default 0 (no flags)
        re module flags, e.g. re.IGNORECASE

    Returns
    -------
    extracted groups : Series (one group) or DataFrame (multiple groups)
        Note that dtype of the result is always object, even when no match is
        found and the result is a Series or DataFrame containing only NaN
        values.

    Examples
    --------
    A pattern with one group will return a Series. Non-matches will be NaN.

    >>> Series(['a1', 'b2', 'c3']).str.extract('[ab](\d)')
    0      1
    1      2
    2    NaN
    dtype: object

    A pattern with more than one group will return a DataFrame.

    >>> Series(['a1', 'b2', 'c3']).str.extract('([ab])(\d)')
         0    1
    0    a    1
    1    b    2
    2  NaN  NaN

    A pattern may contain optional groups.

    >>> Series(['a1', 'b2', 'c3']).str.extract('([ab])?(\d)')
         0  1
    0    a  1
    1    b  2
    2  NaN  3

    Named groups will become column names in the result.

    >>> Series(['a1', 'b2', 'c3']).str.extract('(?P<letter>[ab])(?P<digit>\d)')
      letter digit
    0      a     1
    1      b     2
    2    NaN   NaN

    """
    from pandas.core.series import Series
    from pandas.core.frame import DataFrame

    regex = re.compile(pat, flags=flags)
    # just to be safe, check this
    if regex.groups == 0:
        raise ValueError("This pattern contains no groups to capture.")
    empty_row = [np.nan] * regex.groups

    def f(x):
        if not isinstance(x, compat.string_types):
            return empty_row
        m = regex.search(x)
        if m:
            return [np.nan if item is None else item for item in m.groups()]
        else:
            return empty_row
    if regex.groups == 1:
        result = Series([f(val)[0] for val in arr],
                        name=_get_single_group_name(regex),
                        index=arr.index, dtype=object)
    else:
        names = dict(zip(regex.groupindex.values(), regex.groupindex.keys()))
        columns = [names.get(1 + i, i) for i in range(regex.groups)]
        if arr.empty:
            result = DataFrame(columns=columns, dtype=object)
        else:
            result = DataFrame([f(val) for val in arr],
                               columns=columns,
                               index=arr.index,
                               dtype=object)
    return result
Example #35
File: Model.py Project: A-Amani/gfkTasks
 def evaluate_model(self, x_test: pdSeries.Series, y_test: pdSeries.Series, pipe:Pipeline):
     predicted = pipe.predict(x_test)
     logger.info("model accuracy is: %.3f \n" % pipe.score(x_test, y_test) )
     logger.info(metrics.classification_report(y_test, predicted, target_names=y_test.unique()))
Example #36
def _get_dummies_1d(
    data,
    prefix,
    prefix_sep="_",
    dummy_na=False,
    sparse=False,
    drop_first=False,
    dtype=None,
):
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = _factorize_from_iterable(Series(data))

    if dtype is None:
        dtype = np.uint8
    dtype = np.dtype(dtype)

    if is_object_dtype(dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data):
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    codes = codes.copy()
    if dummy_na:
        codes[codes == -1] = len(levels)
        levels = np.append(levels, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:

        # PY2 embedded unicode, gh-22084
        def _make_col_name(prefix, prefix_sep, level):
            fstr = "{prefix}{prefix_sep}{level}"
            return fstr.format(prefix=prefix,
                               prefix_sep=prefix_sep,
                               level=level)

        dummy_cols = [
            _make_col_name(prefix, prefix_sep, level) for level in levels
        ]

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:

        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == bool:
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        sp_indices = [[] for _ in range(len(dummy_cols))]
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col))

        out = concat(sparse_series, axis=1, copy=False)
        return out

    else:
        dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
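The public wrapper around this helper, shown as a minimal sketch:

import pandas as pd

pd.get_dummies(pd.Series(['a', 'b', 'a']), prefix='col')
# one indicator column per level, col_a and col_b
# (uint8 on older builds, boolean on pandas >= 2.0)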
Example #37
 def beta(self):
     return Series(self._beta_raw, index=self._x.columns)
Example #38
def _unstack_multiple(data, clocs):
    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index

    clocs = [index._get_level_number(i) for i in clocs]

    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    clevels = [index.levels[i] for i in clocs]
    clabels = [index.labels[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rlabels = [index.labels[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    shape = [len(x) for x in clevels]
    group_index = get_group_index(clabels, shape)

    comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
    recons_labels = decons_group_index(obs_ids, shape)

    dummy_index = MultiIndex(levels=rlevels + [obs_ids],
                             labels=rlabels + [comp_ids],
                             names=rnames + ['__placeholder__'])

    if isinstance(data, Series):
        dummy = Series(data.values, index=dummy_index)
        unstacked = dummy.unstack('__placeholder__')
        new_levels = clevels
        new_names = cnames
        new_labels = recons_labels
    else:
        if isinstance(data.columns, MultiIndex):
            result = data
            for i in range(len(clocs)):
                val = clocs[i]
                result = result.unstack(val)
                clocs = [val if i > val else val - 1 for val in clocs]

            return result

        dummy = DataFrame(data.values, index=dummy_index,
                          columns=data.columns)

        unstacked = dummy.unstack('__placeholder__')
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        new_labels = [unstcols.labels[0]]
        for rec in recons_labels:
            new_labels.append(rec.take(unstcols.labels[-1]))

    new_columns = MultiIndex(levels=new_levels, labels=new_labels,
                             names=new_names)

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked
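The multi-level unstack this function implements, from the caller's perspective (a sketch):

import pandas as pd

idx = pd.MultiIndex.from_product([['x', 'y'], ['a', 'b'], [1, 2]],
                                 names=['l0', 'l1', 'l2'])
s = pd.Series(range(8), index=idx)
wide = s.unstack(['l1', 'l2'])  # both levels pivot into the columns at once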
Example #39
File: ops.py Project: zitorelova/pandas
 def _chop(self, sdata: Series, slice_obj: slice) -> Series:
     # fastpath equivalent to `sdata.iloc[slice_obj]`
     mgr = sdata._mgr.get_slice(slice_obj)
     # __finalize__ not called here, must be applied by caller if applicable
     return sdata._constructor(mgr, name=sdata.name, fastpath=True)
Example #40
File: pivot.py Project: pathcl/pandas
def _generate_marginal_results(table,
                               data,
                               values,
                               rows,
                               cols,
                               aggfunc,
                               grand_margin,
                               margins_name='All'):
    if len(cols) > 0:
        # need to "interleave" the margins
        table_pieces = []
        margin_keys = []

        def _all_key(key):
            return (key, margins_name) + ('', ) * (len(cols) - 1)

        if len(rows) > 0:
            margin = data[rows + values].groupby(rows).agg(aggfunc)
            cat_axis = 1

            for key, piece in table.groupby(level=0, axis=cat_axis):
                all_key = _all_key(key)

                # we are going to mutate this, so need to copy!
                piece = piece.copy()
                try:
                    piece[all_key] = margin[key]
                except TypeError:

                    # we cannot reshape, so coerce the axis
                    piece.set_axis(
                        piece._get_axis(cat_axis)._to_safe_for_reshape(),
                        axis=cat_axis,
                        inplace=True)
                    piece[all_key] = margin[key]

                table_pieces.append(piece)
                margin_keys.append(all_key)
        else:
            margin = grand_margin
            cat_axis = 0
            for key, piece in table.groupby(level=0, axis=cat_axis):
                all_key = _all_key(key)
                table_pieces.append(piece)
                table_pieces.append(Series(margin[key], index=[all_key]))
                margin_keys.append(all_key)

        result = concat(table_pieces, axis=cat_axis)

        if len(rows) == 0:
            return result
    else:
        result = table
        margin_keys = table.columns

    if len(cols) > 0:
        row_margin = data[cols + values].groupby(cols).agg(aggfunc)
        row_margin = row_margin.stack()

        # slight hack
        new_order = [len(cols)] + lrange(len(cols))
        row_margin.index = row_margin.index.reorder_levels(new_order)
    else:
        row_margin = Series(np.nan, index=result.columns)

    return result, margin_keys, row_margin
Example #41
File: pivot.py Project: pathcl/pandas
def _add_margins(table,
                 data,
                 values,
                 rows,
                 cols,
                 aggfunc,
                 margins_name='All',
                 fill_value=None):
    if not isinstance(margins_name, compat.string_types):
        raise ValueError('margins_name argument must be a string')

    msg = u'Conflicting name "{name}" in margins'.format(name=margins_name)
    for level in table.index.names:
        if margins_name in table.index.get_level_values(level):
            raise ValueError(msg)

    grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name)

    # could be passed a Series object with no 'columns'
    if hasattr(table, 'columns'):
        for level in table.columns.names[1:]:
            if margins_name in table.columns.get_level_values(level):
                raise ValueError(msg)

    if len(rows) > 1:
        key = (margins_name, ) + ('', ) * (len(rows) - 1)
    else:
        key = margins_name

    if not values and isinstance(table, ABCSeries):
        # If there are no values and the table is a series, then there is only
        # one column in the data. Compute grand margin and return it.
        return table.append(Series({key: grand_margin[margins_name]}))

    if values:
        marginal_result_set = _generate_marginal_results(
            table, data, values, rows, cols, aggfunc, grand_margin,
            margins_name)
        if not isinstance(marginal_result_set, tuple):
            return marginal_result_set
        result, margin_keys, row_margin = marginal_result_set
    else:
        marginal_result_set = _generate_marginal_results_without_values(
            table, data, rows, cols, aggfunc, margins_name)
        if not isinstance(marginal_result_set, tuple):
            return marginal_result_set
        result, margin_keys, row_margin = marginal_result_set
    row_margin = row_margin.reindex(result.columns, fill_value=fill_value)
    # populate grand margin
    for k in margin_keys:
        if isinstance(k, compat.string_types):
            row_margin[k] = grand_margin[k]
        else:
            row_margin[k] = grand_margin[k[0]]

    from pandas import DataFrame
    margin_dummy = DataFrame(row_margin, columns=[key]).T

    row_names = result.index.names
    try:
        for dtype in set(result.dtypes):
            cols = result.select_dtypes([dtype]).columns
            margin_dummy[cols] = margin_dummy[cols].astype(dtype)
        result = result.append(margin_dummy)
    except TypeError:

        # we cannot reshape, so coerce the axis
        result.index = result.index._to_safe_for_reshape()
        result = result.append(margin_dummy)
    result.index.names = row_names

    return result
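The user-visible behavior these margin helpers implement, via the public API (a sketch):

import pandas as pd

df = pd.DataFrame({'a': ['x', 'x', 'y'], 'b': [1, 2, 3]})
pd.pivot_table(df, index='a', values='b', aggfunc='sum', margins=True)
#        b
# a
# x      3
# y      3
# All    6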
Example #42
def _add_margins(
    table: DataFrame | Series,
    data: DataFrame,
    values,
    rows,
    cols,
    aggfunc,
    observed=None,
    margins_name: str = "All",
    fill_value=None,
):
    if not isinstance(margins_name, str):
        raise ValueError("margins_name argument must be a string")

    msg = f'Conflicting name "{margins_name}" in margins'
    for level in table.index.names:
        if margins_name in table.index.get_level_values(level):
            raise ValueError(msg)

    grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name)

    if table.ndim == 2:
        # i.e. DataFrame
        for level in table.columns.names[1:]:
            if margins_name in table.columns.get_level_values(level):
                raise ValueError(msg)

    key: str | tuple[str, ...]
    if len(rows) > 1:
        key = (margins_name,) + ("",) * (len(rows) - 1)
    else:
        key = margins_name

    if not values and isinstance(table, ABCSeries):
        # If there are no values and the table is a series, then there is only
        # one column in the data. Compute grand margin and return it.
        return table._append(Series({key: grand_margin[margins_name]}))

    elif values:
        marginal_result_set = _generate_marginal_results(
            table, data, values, rows, cols, aggfunc, observed, margins_name
        )
        if not isinstance(marginal_result_set, tuple):
            return marginal_result_set
        result, margin_keys, row_margin = marginal_result_set
    else:
        # no values, and table is a DataFrame
        assert isinstance(table, ABCDataFrame)
        marginal_result_set = _generate_marginal_results_without_values(
            table, data, rows, cols, aggfunc, observed, margins_name
        )
        if not isinstance(marginal_result_set, tuple):
            return marginal_result_set
        result, margin_keys, row_margin = marginal_result_set

    row_margin = row_margin.reindex(result.columns, fill_value=fill_value)
    # populate grand margin
    for k in margin_keys:
        if isinstance(k, str):
            row_margin[k] = grand_margin[k]
        else:
            row_margin[k] = grand_margin[k[0]]

    from pandas import DataFrame

    margin_dummy = DataFrame(row_margin, columns=[key]).T

    row_names = result.index.names
    # check the result column and leave floats
    for dtype in set(result.dtypes):
        cols = result.select_dtypes([dtype]).columns
        margin_dummy[cols] = margin_dummy[cols].apply(
            maybe_downcast_to_dtype, args=(dtype,)
        )
    result = result._append(margin_dummy)
    result.index.names = row_names

    return result
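This newer version threads an observed flag into the margin groupbys. A
small sketch of what that flag changes for Categorical keys (assuming a
pandas version where groupby accepts observed; the data is illustrative):

import pandas as pd

df = pd.DataFrame({
    "cat": pd.Categorical(["a", "a"], categories=["a", "b"]),
    "val": [1, 2],
})

# observed=True drops the unused category "b" from the group keys, which
# is what the data[...].groupby(rows, observed=observed) calls rely on
print(df.groupby("cat", observed=True)["val"].sum())
print(df.groupby("cat", observed=False)["val"].sum())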
Example #43
def dict_to_mgr(
    data: dict,
    index,
    columns,
    *,
    dtype: DtypeObj | None = None,
    typ: str = "block",
    copy: bool = True,
) -> Manager:
    """
    Segregate Series based on type and coerce into matrices.
    Needs to handle a lot of exceptional cases.

    Used in DataFrame.__init__
    """
    arrays: Sequence[Any] | Series

    if columns is not None:
        from pandas.core.series import Series

        arrays = Series(data, index=columns, dtype=object)
        data_names = arrays.index
        missing = arrays.isna()
        if index is None:
            # GH10856
            # raise ValueError if only scalars in dict
            index = extract_index(arrays[~missing])
        else:
            index = ensure_index(index)

        # no obvious "empty" int column
        if missing.any() and not is_integer_dtype(dtype):
            nan_dtype: DtypeObj

            if dtype is None or (isinstance(dtype, np.dtype)
                                 and np.issubdtype(dtype, np.flexible)):
                # GH#1783
                nan_dtype = np.dtype("object")
            else:
                nan_dtype = dtype
            val = construct_1d_arraylike_from_scalar(np.nan, len(index),
                                                     nan_dtype)
            arrays.loc[missing] = [val] * missing.sum()

        arrays = list(arrays)

    else:
        keys = list(data.keys())
        columns = data_names = Index(keys)
        arrays = [com.maybe_iterable_to_list(data[k]) for k in keys]
        # GH#24096 need copy to be deep for datetime64tz case
        # TODO: See if we can avoid these copies
        arrays = [
            arr if not isinstance(arr, ABCIndex) else arr._data
            for arr in arrays
        ]
        arrays = [
            arr if not is_datetime64tz_dtype(arr) else arr.copy()
            for arr in arrays
        ]

    if copy:
        # arrays_to_mgr (via form_blocks) won't make copies for EAs
        # dtype attr check to exclude EADtype-castable strs
        arrays = [
            x if not hasattr(x, "dtype")
            or not isinstance(x.dtype, ExtensionDtype) else x.copy()
            for x in arrays
        ]
        # TODO: can we get rid of the dt64tz special case above?

    return arrays_to_mgr(arrays,
                         data_names,
                         index,
                         columns,
                         dtype=dtype,
                         typ=typ,
                         consolidate=copy)
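dict_to_mgr backs DataFrame.__init__ for dict input. A quick sketch of the
`missing` path above, where a requested column absent from the dict gets
the NaN fill (column names are illustrative):

import pandas as pd

data = {"a": [1, 2, 3], "b": [4.0, 5.0, 6.0]}

# "c" is not in the dict, so it receives the all-NaN column produced by
# construct_1d_arraylike_from_scalar (object dtype, per GH#1783)
df = pd.DataFrame(data, columns=["a", "b", "c"])
print(df.dtypes)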
Example #44
def _generate_marginal_results(
    table, data, values, rows, cols, aggfunc, observed, margins_name: str = "All"
):
    if len(cols) > 0:
        # need to "interleave" the margins
        table_pieces = []
        margin_keys = []

        def _all_key(key):
            return (key, margins_name) + ("",) * (len(cols) - 1)

        if len(rows) > 0:
            margin = data[rows + values].groupby(rows, observed=observed).agg(aggfunc)
            cat_axis = 1

            for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed):
                all_key = _all_key(key)

                # we are going to mutate this, so need to copy!
                piece = piece.copy()
                piece[all_key] = margin[key]

                table_pieces.append(piece)
                margin_keys.append(all_key)
        else:
            from pandas import DataFrame

            cat_axis = 0
            for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed):
                if len(cols) > 1:
                    all_key = _all_key(key)
                else:
                    all_key = margins_name
                table_pieces.append(piece)
                # GH31016 this is to calculate margin for each group, and assign
                # corresponded key as index
                transformed_piece = DataFrame(piece.apply(aggfunc)).T
                transformed_piece.index = Index([all_key], name=piece.index.name)

                # append piece for margin into table_piece
                table_pieces.append(transformed_piece)
                margin_keys.append(all_key)

        result = concat(table_pieces, axis=cat_axis)

        if len(rows) == 0:
            return result
    else:
        result = table
        margin_keys = table.columns

    if len(cols) > 0:
        row_margin = data[cols + values].groupby(cols, observed=observed).agg(aggfunc)
        row_margin = row_margin.stack()

        # slight hack
        new_order = [len(cols)] + list(range(len(cols)))
        row_margin.index = row_margin.index.reorder_levels(new_order)
    else:
        row_margin = Series(np.nan, index=result.columns)

    return result, margin_keys, row_margin
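When both row and column keys are present, the margins are "interleaved"
into the column axis; roughly, a two-level column key gains an ('All', '')
margin column. A sketch (labels are illustrative):

import pandas as pd

df = pd.DataFrame({
    "A": ["foo", "foo", "bar", "bar"],
    "B": ["one", "two", "one", "two"],
    "C": ["x", "y", "x", "y"],
    "D": [1, 2, 3, 4],
})

table = pd.pivot_table(df, values="D", index="A", columns=["B", "C"],
                       aggfunc="sum", margins=True)
print(table.columns.tolist())  # ends with the ('All', '') margin column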
Example #45
def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True,
                format=None, coerce=False, unit='ns',
                infer_datetime_format=False):
    """
    Convert argument to datetime

    Parameters
    ----------
    arg : string, datetime, array of strings (with possible NAs)
    errors : {'ignore', 'raise'}, default 'ignore'
        Errors are ignored by default (values left untouched)
    dayfirst : boolean, default False
        If True, parse dates with the day first, e.g. 20/01/2005.
        Warning: dayfirst=True is not strict; it will only prefer to
        parse with day first (this is a known bug).
    utc : boolean, default None
        Return UTC DatetimeIndex if True (converting any tz-aware
        datetime.datetime objects as well)
    box : boolean, default True
        If True returns a DatetimeIndex, if False returns ndarray of values
    format : string, default None
        strftime format to parse the time, e.g. "%d/%m/%Y"
    coerce : boolean, default False
        If True, force parsing errors to NaT
    unit : string, default 'ns'
        Unit of the arg (D, s, ms, us, ns): the epoch unit when arg is
        an integer/float (e.g. a unix timestamp uses unit='s')
    infer_datetime_format : boolean, default False
        If no `format` is given, try to infer the format based on the first
        datetime string. Provides a large speed-up in many cases.

    Returns
    -------
    ret : datetime if parsing succeeded

    Examples
    --------
    Take separate series and convert to datetime

    >>> import pandas as pd
    >>> i = pd.date_range('20000101',periods=100)
    >>> df = pd.DataFrame(dict(year = i.year, month = i.month, day = i.day))
    >>> pd.to_datetime(df.year*10000 + df.month*100 + df.day, format='%Y%m%d')

    Or from strings

    >>> df = df.astype(str)
    >>> pd.to_datetime(df.day + df.month + df.year, format="%d%m%Y")
    """
    from pandas import Timestamp
    from pandas.core.series import Series
    from pandas.tseries.index import DatetimeIndex

    def _convert_listlike(arg, box, format):

        if isinstance(arg, (list,tuple)):
            arg = np.array(arg, dtype='O')

        if com.is_datetime64_ns_dtype(arg):
            if box and not isinstance(arg, DatetimeIndex):
                try:
                    return DatetimeIndex(arg, tz='utc' if utc else None)
                except ValueError:
                    pass

            return arg

        arg = com._ensure_object(arg)

        if infer_datetime_format and format is None:
            format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)

            if format is not None:
                # There is a special fast-path for iso8601 formatted
                # datetime strings, so in those cases don't use the inferred
                # format because this path makes process slower in this
                # special case
                format_is_iso8601 = (
                    '%Y-%m-%dT%H:%M:%S.%f'.startswith(format) or
                    '%Y-%m-%d %H:%M:%S.%f'.startswith(format)
                )
                if format_is_iso8601:
                    format = None

        try:
            result = None

            if format is not None:
                # shortcut formatting here
                if format == '%Y%m%d':
                    try:
                        result = _attempt_YYYYMMDD(arg)
                    except Exception:
                        raise ValueError("cannot convert the input to "
                                         "'%Y%m%d' date format")

                # fallback
                if result is None:
                    try:
                        result = tslib.array_strptime(
                            arg, format, coerce=coerce
                        )
                    except (tslib.OutOfBoundsDatetime):
                        if errors == 'raise':
                            raise
                        result = arg
                    except ValueError:
                        # Only raise this error if the user provided the
                        # datetime format, and not when it was inferred
                        if not infer_datetime_format:
                            raise

            if result is None and (format is None or infer_datetime_format):
                result = tslib.array_to_datetime(arg, raise_=errors == 'raise',
                                                 utc=utc, dayfirst=dayfirst,
                                                 coerce=coerce, unit=unit)

            if com.is_datetime64_dtype(result) and box:
                result = DatetimeIndex(result, tz='utc' if utc else None)
            return result

        except ValueError as e:
            try:
                values, tz = tslib.datetime_to_datetime64(arg)
                return DatetimeIndex._simple_new(values, None, tz=tz)
            except (ValueError, TypeError):
                raise e

    if arg is None:
        return arg
    elif isinstance(arg, Timestamp):
        return arg
    elif isinstance(arg, Series):
        values = _convert_listlike(arg.values, False, format)
        return Series(values, index=arg.index, name=arg.name)
    elif com.is_list_like(arg):
        return _convert_listlike(arg, box, format)

    return _convert_listlike(np.array([ arg ]), box, format)[0]
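The format == '%Y%m%d' branch above is a fast path. A quick check that it
accepts integer-like date strings (illustrative data):

import pandas as pd

s = pd.Series(["20000101", "20000102", "20000103"])
# exercises the '%Y%m%d' shortcut rather than generic strptime parsing
print(pd.to_datetime(s, format="%Y%m%d"))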
Example #46
File: var.py Project: zonca/pandas
    def granger_causality(self):
        """Returns the f-stats and p-values from the Granger Causality Test.

        If the data consists of columns x1, x2, x3, then we perform the
        following regressions:

        x1 ~ L(x2, x3)
        x1 ~ L(x1, x3)
        x1 ~ L(x1, x2)

        The f-stats of these results are placed in the 'x1' column of the
        returned DataFrame.  We then repeat for x2, x3.

        Returns
        -------
        Dict, where 'f-stat' returns the DataFrame containing the f-stats,
        and 'p-value' returns the DataFrame containing the corresponding
        p-values of the f-stats.
        """
        from pandas.stats.api import ols
        from scipy.stats import f

        d = {}
        for col in self._columns:
            d[col] = {}
            for i in xrange(1, 1 + self._p):
                lagged_data = self._lagged_data[i].filter(self._columns -
                                                          [col])

                for key, value in lagged_data.iteritems():
                    d[col][_make_param_name(i, key)] = value

        f_stat_dict = {}
        p_value_dict = {}

        for col, y in self._data.iteritems():
            ssr_full = (self.resid[col]**2).sum()

            f_stats = []
            p_values = []

            for col2 in self._columns:
                result = ols(y=y, x=d[col2])

                resid = result.resid
                ssr_reduced = (resid**2).sum()

                M = self._p
                N = self._nobs
                K = self._k * self._p + 1
                f_stat = ((ssr_reduced - ssr_full) / M) / (ssr_full / (N - K))
                f_stats.append(f_stat)

                p_value = f.sf(f_stat, M, N - K)
                p_values.append(p_value)

            f_stat_dict[col] = Series(f_stats, self._columns)
            p_value_dict[col] = Series(p_values, self._columns)

        f_stat_mat = DataFrame(f_stat_dict)
        p_value_mat = DataFrame(p_value_dict)

        return {
            'f-stat': f_stat_mat,
            'p-value': p_value_mat,
        }
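The F-statistic is the standard restricted-versus-full comparison,
F = ((SSR_r - SSR_f) / M) / (SSR_f / (N - K)). A worked sketch with
hypothetical numbers (not taken from the source):

from scipy.stats import f

ssr_full, ssr_reduced = 10.0, 12.5  # hypothetical regression SSRs
M = 2          # number of restrictions (the lag order p)
N, K = 100, 5  # observations and regressors in the full model

f_stat = ((ssr_reduced - ssr_full) / M) / (ssr_full / (N - K))
p_value = f.sf(f_stat, M, N - K)  # survival function, i.e. 1 - cdf
print(f_stat, p_value)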
Example #47
 def _wrap_aggregated_output(self, output):
     # sort of a kludge
     output = output[self.name]
     index = self.grouper.result_index
     return Series(output, index=index, name=self.name)
Example #48
def agg_dict_like(
    obj: AggObjType,
    arg: AggFuncTypeDict,
    _axis: int,
) -> FrameOrSeriesUnion:
    """
    Compute aggregation in the case of a dict-like argument.

    Parameters
    ----------
    obj : Pandas object to compute aggregation on.
    arg : dict
        label-aggregation pairs to compute.
    _axis : int, 0 or 1
        Axis to compute aggregation on.

    Returns
    -------
    Result of aggregation.
    """
    is_aggregator = lambda x: isinstance(x, (list, tuple, dict))

    if _axis != 0:  # pragma: no cover
        raise ValueError("Can only pass dict with axis=0")

    selected_obj = obj._selected_obj

    # if we have a dict of any non-scalars
    # eg. {'A' : ['mean']}, normalize all to
    # be list-likes
    # Cannot use arg.values() because arg may be a Series
    if any(is_aggregator(x) for _, x in arg.items()):
        new_arg: AggFuncTypeDict = {}
        for k, v in arg.items():
            if not isinstance(v, (tuple, list, dict)):
                new_arg[k] = [v]
            else:
                new_arg[k] = v

            # the keys must be in the columns
            # for ndim=2, or renamers for ndim=1

            # ok for now, but deprecated
            # {'A': { 'ra': 'mean' }}
            # {'A': { 'ra': ['mean'] }}
            # {'ra': ['mean']}

            # not ok
            # {'ra' : { 'A' : 'mean' }}
            if isinstance(v, dict):
                raise SpecificationError("nested renamer is not supported")
            elif isinstance(selected_obj, ABCSeries):
                raise SpecificationError("nested renamer is not supported")
            elif (isinstance(selected_obj, ABCDataFrame)
                  and k not in selected_obj.columns):
                raise KeyError(f"Column '{k}' does not exist!")

        arg = new_arg

    else:
        # deprecation of renaming keys
        # GH 15931
        keys = list(arg.keys())
        if isinstance(selected_obj, ABCDataFrame) and len(
                selected_obj.columns.intersection(keys)) != len(keys):
            cols = list(
                safe_sort(
                    list(
                        set(keys) -
                        set(selected_obj.columns.intersection(keys))), ))
            raise SpecificationError(f"Column(s) {cols} do not exist")

    from pandas.core.reshape.concat import concat

    if selected_obj.ndim == 1:
        # key only used for output
        colg = obj._gotitem(obj._selection, ndim=1)
        results = {key: colg.agg(how) for key, how in arg.items()}
    else:
        # key used for column selection and output
        results = {
            key: obj._gotitem(key, ndim=1).agg(how)
            for key, how in arg.items()
        }

    # set the final keys
    keys = list(arg.keys())

    # Avoid making two isinstance calls in all and any below
    is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()]

    # combine results
    if all(is_ndframe):
        keys_to_use = [k for k in keys if not results[k].empty]
        # fall back to all keys if every result is empty
        keys_to_use = keys_to_use if keys_to_use != [] else keys
        axis = 0 if isinstance(obj, ABCSeries) else 1
        result = concat({k: results[k] for k in keys_to_use}, axis=axis)
    elif any(is_ndframe):
        # There is a mix of NDFrames and scalars
        raise ValueError("cannot perform both aggregation "
                         "and transformation operations "
                         "simultaneously")
    else:
        from pandas import Series

        # we have a dict of scalars
        # GH 36212 use name only if obj is a series
        if obj.ndim == 1:
            obj = cast("Series", obj)
            name = obj.name
        else:
            name = None

        result = Series(results, name=name)

    return result
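agg_dict_like sits behind DataFrame.agg with a dict argument; non-scalar
entries are normalized to lists as above, and nested renamers raise
SpecificationError. Minimal usage (illustrative frame):

import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

# mixed scalar/list values are allowed; lists are normalized internally
print(df.agg({"A": ["mean", "max"], "B": "sum"}))

# a nested renamer such as df.agg({"A": {"ra": "mean"}}) would raise
# pandas.errors.SpecificationError, per the guard above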
Example #49
def _count_generic(values, table_type, type_caster):
    values = type_caster(values)
    table = table_type(len(values))
    uniques, labels, counts = table.factorize(values)

    return Series(counts, index=uniques)
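This is the hashtable-based counting core; its public counterpart is
Series.value_counts. For example:

import pandas as pd

values = pd.Series([2, 2, 3, 1, 2])
print(values.value_counts())  # counts indexed by the unique values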
Example #50
def agg_list_like(
    obj: AggObjType,
    arg: List[AggFuncTypeBase],
    _axis: int,
) -> FrameOrSeriesUnion:
    """
    Compute aggregation in the case of a list-like argument.

    Parameters
    ----------
    obj : Pandas object to compute aggregation on.
    arg : list
        Aggregations to compute.
    _axis : int, 0 or 1
        Axis to compute aggregation on.

    Returns
    -------
    Result of aggregation.
    """
    from pandas.core.reshape.concat import concat

    if _axis != 0:
        raise NotImplementedError("axis other than 0 is not supported")

    if obj._selected_obj.ndim == 1:
        selected_obj = obj._selected_obj
    else:
        selected_obj = obj._obj_with_exclusions

    results = []
    keys = []

    # degenerate case
    if selected_obj.ndim == 1:
        for a in arg:
            colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj)
            try:
                new_res = colg.aggregate(a)

            except TypeError:
                pass
            else:
                results.append(new_res)

                # make sure we find a good name
                name = com.get_callable_name(a) or a
                keys.append(name)

    # multiples
    else:
        for index, col in enumerate(selected_obj):
            colg = obj._gotitem(col,
                                ndim=1,
                                subset=selected_obj.iloc[:, index])
            try:
                new_res = colg.aggregate(arg)
            except (TypeError, DataError):
                pass
            except ValueError as err:
                # cannot aggregate
                if "Must produce aggregated value" in str(err):
                    # raised directly in _aggregate_named
                    pass
                elif "no results" in str(err):
                    # raised directly in _aggregate_multiple_funcs
                    pass
                else:
                    raise
            else:
                results.append(new_res)
                keys.append(col)

    # if we are empty
    if not len(results):
        raise ValueError("no results")

    try:
        return concat(results, keys=keys, axis=1, sort=False)
    except TypeError as err:

        # we are concatting non-NDFrame objects,
        # e.g. a list of scalars

        from pandas import Series

        result = Series(results, index=keys, name=obj.name)
        if is_nested_object(result):
            raise ValueError(
                "cannot combine transform and aggregation operations") from err
        return result
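The list-like path aggregates each function separately and concatenates
the results, one row label per function. Minimal usage (illustrative):

import pandas as pd

df = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [4.0, 5.0, 6.0]})
print(df.agg(["min", "max", "mean"]))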
Example #51
File: series.py Project: weilinear/pandas
 def __unicode__(self):
     # currently, unicode is same as repr...fixes infinite loop
     series_rep = Series.__unicode__(self)
     rep = '%s\n%s' % (series_rep, repr(self.sp_index))
     return rep
Example #52
    def __init__(self,
                 data=None,
                 index=None,
                 sparse_index=None,
                 kind='block',
                 fill_value=None,
                 name=None,
                 dtype=None,
                 copy=False,
                 fastpath=False):

        # we are called internally, so short-circuit
        if fastpath:

            # data is an ndarray, index is defined

            if not isinstance(data, SingleBlockManager):
                data = SingleBlockManager(data, index, fastpath=True)
            if copy:
                data = data.copy()

        else:

            if data is None:
                data = []

            if isinstance(data, Series) and name is None:
                name = data.name

            if isinstance(data, SparseArray):
                if index is not None:
                    assert (len(index) == len(data))
                sparse_index = data.sp_index
                if fill_value is None:
                    fill_value = data.fill_value

                data = np.asarray(data)

            elif isinstance(data, SparseSeries):
                if index is None:
                    index = data.index.view()
                if fill_value is None:
                    fill_value = data.fill_value
                # extract the SingleBlockManager
                data = data._data

            elif isinstance(data, (Series, dict)):
                data = Series(data, index=index)
                index = data.index.view()

                res = make_sparse(data, kind=kind, fill_value=fill_value)
                data, sparse_index, fill_value = res

            elif isinstance(data, (tuple, list, np.ndarray)):
                # array-like
                if sparse_index is None:
                    res = make_sparse(data, kind=kind, fill_value=fill_value)
                    data, sparse_index, fill_value = res
                else:
                    assert (len(data) == sparse_index.npoints)

            elif isinstance(data, SingleBlockManager):
                if dtype is not None:
                    data = data.astype(dtype)
                if index is None:
                    index = data.index.view()
                elif not data.index.equals(index) or copy:  # pragma: no cover
                    # GH#19275 SingleBlockManager input should only be called
                    # internally
                    raise AssertionError('Cannot pass both SingleBlockManager '
                                         '`data` argument and a different '
                                         '`index` argument.  `copy` must '
                                         'be False.')

            else:
                length = len(index)

                if data == fill_value or (isna(data) and isna(fill_value)):
                    if kind == 'block':
                        sparse_index = BlockIndex(length, [], [])
                    else:
                        sparse_index = IntIndex(length, [])
                    data = np.array([])

                else:
                    if kind == 'block':
                        locs, lens = ([0], [length]) if length else ([], [])
                        sparse_index = BlockIndex(length, locs, lens)
                    else:
                        sparse_index = IntIndex(length, index)
                    v = data
                    data = np.empty(length)
                    data.fill(v)

            if index is None:
                index = com._default_index(sparse_index.length)
            index = _ensure_index(index)

            # create/copy the manager
            if isinstance(data, SingleBlockManager):

                if copy:
                    data = data.copy()
            else:

                # create a sparse array
                if not isinstance(data, SparseArray):
                    data = SparseArray(data,
                                       sparse_index=sparse_index,
                                       fill_value=fill_value,
                                       dtype=dtype,
                                       copy=copy)

                data = SingleBlockManager(data, index)

        generic.NDFrame.__init__(self, data)

        self.index = index
        self.name = name
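A note for readers on current pandas: SparseSeries was removed in pandas
1.0; the equivalent today is a regular Series with a SparseDtype:

import pandas as pd

# only the two non-fill points are stored; fill_value=0 elsewhere
s = pd.Series([0, 0, 1, 0, 2], dtype=pd.SparseDtype("int64", fill_value=0))
print(s.sparse.npoints, s.sparse.density)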
Example #53
def test01():
    print("Test 01")

    userID: int = 1

    history: AHistory = HistoryHierDF("database1")

    # userID, itemID, position, observation, clicked
    history.insertRecommendation(userID, 45, 1, False)
    history.insertRecommendation(userID, 45, 2, False)
    history.insertRecommendation(userID, 78, 3, False)
    # insert the same block of recommendations 14 times
    for _ in range(14):
        history.insertRecommendation(userID, 68, 4, False)
        for position in range(5, 10):
            history.insertRecommendation(userID, 50, position, False)
    history.insertRecommendation(userID, 50, 10, False)
    history.insertRecommendation(userID, 100, 1, False)
    history.insertRecommendation(userID, 6, 2, True)
    history.insertRecommendation(userID, 100, 5, True)
    history.insertRecommendation(userID, 100, 15, True)

    recommendationDict: dict = {
        100: 0.35,
        125: 0.25,
        95: 0.15,
        45: 0.1,
        78: 0.05,
        68: 0.05,
        32: 0.02,
        6: 0.01,
        18: 0.01,
        47: 0.01
    }
    recommendationSrs: Series = Series(recommendationDict)

    penalty: APenalization = PenalUsingReduceRelevance(penaltyLinear,
                                                       [0.8, 0.2, 100],
                                                       penaltyLinear,
                                                       [1.0, 0.2, 100], 100)
    pRecommendationSrs: Series = penalty.runOneMethodPenalization(
        userID, recommendationSrs, history)

    print(pRecommendationSrs)
Example #54
 def __unicode__(self):
     # currently, unicode is same as repr...fixes infinite loop
     series_rep = Series.__unicode__(self)
     rep = '{series}\n{index!r}'.format(series=series_rep,
                                        index=self.sp_index)
     return rep
Example #55
 def get_dtype_counts(self):
     from collections import defaultdict
     d = defaultdict(int)
     for k, v in self.iteritems():
         d[v.dtype.name] += 1
     return Series(d)
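get_dtype_counts was removed in pandas 1.0; the same information is
available from the public API today:

import pandas as pd

df = pd.DataFrame({"a": [1], "b": [1.0], "c": ["x"]})
print(df.dtypes.value_counts())  # one count per dtype, as a Series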
Example #56
 def _chop(self, sdata: Series, slice_obj: slice) -> Series:
     return sdata._get_values(slice_obj)
Example #57
 def __unicode__(self):
     # currently, unicode is same as repr...fixes infinite loop
     series_rep = Series.__unicode__(self)
     rep = '%s\n%s' % (series_rep, repr(self.sp_index))
     return rep
Example #58
File: tools.py Project: sjdenny/pandas
def _to_datetime(arg,
                 errors='raise',
                 dayfirst=False,
                 yearfirst=False,
                 utc=None,
                 box=True,
                 format=None,
                 exact=True,
                 unit='ns',
                 freq=None,
                 infer_datetime_format=False):
    """
    Same as to_datetime, but accept freq for
    DatetimeIndex internal construction
    """
    from pandas.core.series import Series
    from pandas.tseries.index import DatetimeIndex

    def _convert_listlike(arg, box, format, name=None):

        if isinstance(arg, (list, tuple)):
            arg = np.array(arg, dtype='O')

        # these are shortcutable
        if com.is_datetime64_ns_dtype(arg):
            if box and not isinstance(arg, DatetimeIndex):
                try:
                    return DatetimeIndex(arg,
                                         tz='utc' if utc else None,
                                         name=name)
                except ValueError:
                    pass

            return arg

        elif com.is_datetime64tz_dtype(arg):
            if not isinstance(arg, DatetimeIndex):
                return DatetimeIndex(arg, tz='utc' if utc else None)
            if utc:
                arg = arg.tz_convert(None)
            return arg

        elif format is None and com.is_integer_dtype(arg) and unit == 'ns':
            result = arg.astype('datetime64[ns]')
            if box:
                return DatetimeIndex(result,
                                     tz='utc' if utc else None,
                                     name=name)

            return result

        arg = com._ensure_object(arg)
        require_iso8601 = False

        if infer_datetime_format and format is None:
            format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)

        if format is not None:
            # There is a special fast-path for iso8601 formatted
            # datetime strings, so in those cases don't use the inferred
            # format because this path makes process slower in this
            # special case
            format_is_iso8601 = (('%Y-%m-%dT%H:%M:%S.%f'.startswith(format)
                                  or '%Y-%m-%d %H:%M:%S.%f'.startswith(format))
                                 and format != '%Y')
            if format_is_iso8601:
                require_iso8601 = not infer_datetime_format
                format = None

        try:
            result = None

            if format is not None:
                # shortcut formatting here
                if format == '%Y%m%d':
                    try:
                        result = _attempt_YYYYMMDD(arg, errors=errors)
                    except Exception:
                        raise ValueError(
                            "cannot convert the input to '%Y%m%d' date format")

                # fallback
                if result is None:
                    try:
                        result = tslib.array_strptime(arg,
                                                      format,
                                                      exact=exact,
                                                      errors=errors)
                    except (tslib.OutOfBoundsDatetime):
                        if errors == 'raise':
                            raise
                        result = arg
                    except ValueError:
                        # if format was inferred, try falling back
                        # to array_to_datetime - terminate here
                        # for specified formats
                        if not infer_datetime_format:
                            if errors == 'raise':
                                raise
                            result = arg

            if result is None and (format is None or infer_datetime_format):
                result = tslib.array_to_datetime(
                    arg,
                    errors=errors,
                    utc=utc,
                    dayfirst=dayfirst,
                    yearfirst=yearfirst,
                    freq=freq,
                    unit=unit,
                    require_iso8601=require_iso8601)

            if com.is_datetime64_dtype(result) and box:
                result = DatetimeIndex(result,
                                       tz='utc' if utc else None,
                                       name=name)
            return result

        except ValueError as e:
            try:
                values, tz = tslib.datetime_to_datetime64(arg)
                return DatetimeIndex._simple_new(values, name=name, tz=tz)
            except (ValueError, TypeError):
                raise e

    if arg is None:
        return arg
    elif isinstance(arg, tslib.Timestamp):
        return arg
    elif isinstance(arg, Series):
        values = _convert_listlike(arg._values, False, format)
        return Series(values, index=arg.index, name=arg.name)
    elif isinstance(arg, ABCIndexClass):
        return _convert_listlike(arg, box, format, name=arg.name)
    elif com.is_list_like(arg):
        return _convert_listlike(arg, box, format)

    return _convert_listlike(np.array([arg]), box, format)[0]
Example #59
File: tools.py Project: luispedraza/gasole
        except ValueError as e:
            try:
                values, tz = tslib.datetime_to_datetime64(arg)
                return DatetimeIndex._simple_new(values, None, tz=tz)
            except (ValueError, TypeError):
                raise e

    if arg is None:
        return arg
    elif isinstance(arg, datetime):
        return arg
    elif isinstance(arg, Series):
        values = arg.values
        if not com.is_datetime64_dtype(values):
            values = _convert_f(values)
        return Series(values, index=arg.index, name=arg.name)
    elif isinstance(arg, (np.ndarray, list)):
        if isinstance(arg, list):
            arg = np.array(arg, dtype='O')

        if com.is_datetime64_dtype(arg):
            if box and not isinstance(arg, DatetimeIndex):
                try:
                    return DatetimeIndex(arg, tz='utc' if utc else None)
                except ValueError as e:
                    try:
                        values, tz = tslib.datetime_to_datetime64(arg)
                        return DatetimeIndex._simple_new(values, None, tz=tz)
                    except (ValueError, TypeError):
                        raise e
            return arg
Example #60
 def __repr__(self):
     series_rep = Series.__repr__(self)
     rep = '{series}\n{index!r}'.format(series=series_rep,
                                        index=self.sp_index)
     return rep