def pivot_simple(index, columns, values): """ Produce 'pivot' table based on 3 columns of this DataFrame. Uses unique values from index / columns and fills with values. Parameters ---------- index : ndarray Labels to use to make new frame's index columns : ndarray Labels to use to make new frame's columns values : ndarray Values to use for populating new frame's values Note ---- Obviously, all 3 of the input arguments must have the same length Returns ------- DataFrame """ if (len(index) != len(columns)) or (len(columns) != len(values)): raise AssertionError('Length of index, columns, and values must be the' ' same') if len(index) == 0: return DataFrame(index=[]) hindex = MultiIndex.from_arrays([index, columns]) series = Series(values.ravel(), index=hindex) series = series.sortlevel(0) return series.unstack()
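# Usage sketch (added; not part of the original source). pivot_simple is an
# internal pandas helper, so this reproduces the same reshape with the public
# API (MultiIndex + unstack) under the assumption that the behaviour matches.
import numpy as np
import pandas as pd

# Three aligned arrays: row labels, column labels, and cell values.
index = np.array(["r1", "r1", "r2", "r2"])
columns = np.array(["a", "b", "a", "b"])
values = np.array([1.0, 2.0, 3.0, 4.0])

# Same idea as pivot_simple: build a MultiIndex from (index, columns),
# then unstack the inner level into DataFrame columns.
hindex = pd.MultiIndex.from_arrays([index, columns])
result = pd.Series(values, index=hindex).sort_index().unstack()
print(result)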
def value_counts(values, sort=True, ascending=False): """ Compute a histogram of the counts of non-null values Parameters ---------- values : ndarray (1-d) sort : boolean, default True Sort by values ascending : boolean, default False Sort in ascending order Returns ------- value_counts : Series """ from pandas.core.series import Series from collections import defaultdict if com.is_integer_dtype(values.dtype): values = com._ensure_int64(values) keys, counts = lib.value_count_int64(values) result = Series(counts, index=keys) else: counter = defaultdict(lambda: 0) values = values[com.notnull(values)] for value in values: counter[value] += 1 result = Series(counter) if sort: result.sort() if not ascending: result = result[::-1] return result
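# Usage sketch (added): the public Series.value_counts method exposes the same
# histogram-of-non-null-values behaviour as the module-level helper above.
import numpy as np
import pandas as pd

s = pd.Series([2, 2, 5, np.nan, 5, 5, 7])

# Counts of non-null values, most frequent first; NaN is excluded.
print(s.value_counts())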
def get_sysprice_list(start_time, end_time, frequency='hourly'):
    '''Wrapper function for creating a pandas Series object from data received
    from the database.

    Returns: pandas Series object with Elspot system prices and corresponding
             dates for the requested time period and frequency.
    Parameters:
        start_time - string representing the start of the time period,
                     format must be 'yyyy-mm-dd'.
        end_time   - string representing the end of the time period,
                     format must be 'yyyy-mm-dd'.
        frequency  - string representing the frequency of the output pandas
                     Series object. Currently must be one of
                     ['hourly', 'daily'].
    '''
    # Retrieve hourly system prices and timestamps from the database as lists
    _, sys_prices, times = get_system_price_volume(start_time, end_time)
    ts = Series(sys_prices, index=times)

    if frequency == 'hourly':
        # Resampling is not necessary
        return ts
    if frequency == 'daily':
        resampling_frequency = 'D'
    else:
        # Previously an unknown frequency fell through to an undefined
        # resampling_frequency and raised a NameError; fail explicitly instead.
        raise ValueError("frequency must be one of ['hourly', 'daily'], "
                         "got %r" % frequency)
    return ts.resample(resampling_frequency, how='mean', kind='timestamp')
def xs(self, key):
    """
    Returns a row from the DataMatrix as a Series object.

    Parameters
    ----------
    key : some index contained in the index

    Returns
    -------
    Series
    """
    if key not in self.index:
        raise Exception('No cross-section for %s' % key)

    loc = self.index.indexMap[key]
    theSlice = self.values[loc, :].copy()
    xsIndex = self.columns

    result = Series(theSlice, index=xsIndex)

    if self.objects is not None and len(self.objects.columns) > 0:
        result = result.append(self.objects.getXS(key))

    return result
def _init_spmatrix(self, data, index, columns, dtype=None, fill_value=None): """ Init self from scipy.sparse matrix """ index, columns = self._prep_index(data, index, columns) data = data.tocoo() N = len(index) # Construct a dict of SparseSeries sdict = {} values = Series(data.data, index=data.row, copy=False) for col, rowvals in values.groupby(data.col): # get_blocks expects int32 row indices in sorted order rowvals = rowvals.sort_index() rows = rowvals.index.values.astype(np.int32) blocs, blens = get_blocks(rows) sdict[columns[col]] = SparseSeries( rowvals.values, index=index, fill_value=fill_value, sparse_index=BlockIndex(N, blocs, blens)) # Add any columns that were empty and thus not grouped on above sdict.update({column: SparseSeries(index=index, fill_value=fill_value, sparse_index=BlockIndex(N, [], [])) for column in columns if column not in sdict}) return self._init_dict(sdict, index, columns, dtype)
def value_counts(values, sort=True, ascending=False): """ Compute a histogram of the counts of non-null values Returns ------- value_counts : Series """ from collections import defaultdict if com.is_integer_dtype(values.dtype): values = com._ensure_int64(values) keys, counts = lib.value_count_int64(values) result = Series(counts, index=keys) else: counter = defaultdict(lambda: 0) values = values[com.notnull(values)] for value in values: counter[value] += 1 result = Series(counter) if sort: result.sort() if not ascending: result = result[::-1] return result
def pivot(index, columns, values): """ Produce 'pivot' table based on 3 columns of this DataFrame. Uses unique values from index / columns and fills with values. Parameters ---------- index : ndarray Labels to use to make new frame's index columns : ndarray Labels to use to make new frame's columns values : ndarray Values to use for populating new frame's values Note ---- Obviously, all 3 of the input arguments must have the same length Returns ------- DataFrame """ assert(len(index) == len(columns) == len(values)) if len(index) == 0: return DataFrame(index=[]) hindex = _make_long_index(index, columns) series = Series(values.ravel(), index=hindex) series = series.sortlevel(0) return series.unstack()
def match(to_match, values, na_sentinel=-1): """ Compute locations of to_match into values Parameters ---------- to_match : array-like values to find positions of values : array-like Unique set of values na_sentinel : int, default -1 Value to mark "not found" Examples -------- Returns ------- match : ndarray of integers """ values = com._asarray_tuplesafe(values) if issubclass(values.dtype.type, string_types): values = np.array(values, dtype='O') f = lambda htype, caster: _match_generic(to_match, values, htype, caster) result = _hashtable_algo(f, values.dtype) if na_sentinel != -1: # replace but return a numpy array # use a Series because it handles dtype conversions properly from pandas.core.series import Series result = Series(result.ravel()).replace(-1,na_sentinel).values.reshape(result.shape) return result
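# Example for the empty "Examples" section above (added). Index.get_indexer
# from the public API computes the same "positions of to_match within values"
# mapping, so it is used here as a stand-in; treating it as equivalent to
# match() is an assumption, not a call into the helper itself.
import pandas as pd

values = pd.Index(["a", "b", "c"])       # unique set of values
to_match = ["b", "x", "a", "c", "x"]     # values to locate

# -1 marks entries of to_match that do not appear in values,
# mirroring the na_sentinel behaviour documented above.
positions = values.get_indexer(to_match)
print(positions)  # [ 1 -1  0  2 -1]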
def test_groupby_categorical_unequal_len(self):
    # GH3011
    series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4])
    # The error is only raised for a categorical grouper, not for a series
    # of dtype 'category'.
    bins = pd.cut(series.dropna().values, 4)

    # len(bins) != len(series) here
    self.assertRaises(ValueError, lambda: series.groupby(bins).mean())
def _value_counts_arraylike(values, dropna=True): is_datetimetz_type = is_datetimetz(values) is_period_type = (is_period_dtype(values) or is_period_arraylike(values)) orig = values from pandas.core.series import Series values = Series(values).values dtype = values.dtype if needs_i8_conversion(dtype) or is_period_type: from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex if is_period_type: # values may be an object values = PeriodIndex(values) freq = values.freq values = values.view(np.int64) keys, counts = htable.value_count_int64(values, dropna) if dropna: msk = keys != iNaT keys, counts = keys[msk], counts[msk] # convert the keys back to the dtype we came in keys = keys.astype(dtype) # dtype handling if is_datetimetz_type: keys = DatetimeIndex._simple_new(keys, tz=orig.dtype.tz) if is_period_type: keys = PeriodIndex._simple_new(keys, freq=freq) elif is_signed_integer_dtype(dtype): values = _ensure_int64(values) keys, counts = htable.value_count_int64(values, dropna) elif is_unsigned_integer_dtype(dtype): values = _ensure_uint64(values) keys, counts = htable.value_count_uint64(values, dropna) elif is_float_dtype(dtype): values = _ensure_float64(values) keys, counts = htable.value_count_float64(values, dropna) else: values = _ensure_object(values) keys, counts = htable.value_count_object(values, dropna) mask = isnull(values) if not dropna and mask.any(): keys = np.insert(keys, 0, np.NaN) counts = np.insert(counts, 0, mask.sum()) return keys, counts
def pivot(self, index=None, columns=None, values=None): """ See DataFrame.pivot """ if values is None: indexed = self.set_index([index, columns]) return indexed.unstack(columns) else: indexed = Series(self[values], index=[self[index], self[columns]]) return indexed.unstack(columns)
def size(self):
    """
    Compute group sizes
    """
    # TODO: better impl
    labels, _, ngroups = self.group_info
    bin_counts = Series(labels).value_counts()
    bin_counts = bin_counts.reindex(np.arange(ngroups))
    bin_counts.index = self.result_index
    return bin_counts
def _value_counts_arraylike(values, dropna=True): is_datetimetz = com.is_datetimetz(values) is_period = (isinstance(values, gt.ABCPeriodIndex) or com.is_period_arraylike(values)) orig = values from pandas.core.series import Series values = Series(values).values dtype = values.dtype if com.is_datetime_or_timedelta_dtype(dtype) or is_period: from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex if is_period: values = PeriodIndex(values) freq = values.freq values = values.view(np.int64) keys, counts = htable.value_count_scalar64(values, dropna) if dropna: msk = keys != iNaT keys, counts = keys[msk], counts[msk] # convert the keys back to the dtype we came in keys = keys.astype(dtype) # dtype handling if is_datetimetz: if isinstance(orig, gt.ABCDatetimeIndex): tz = orig.tz else: tz = orig.dt.tz keys = DatetimeIndex._simple_new(keys, tz=tz) if is_period: keys = PeriodIndex._simple_new(keys, freq=freq) elif com.is_integer_dtype(dtype): values = com._ensure_int64(values) keys, counts = htable.value_count_scalar64(values, dropna) elif com.is_float_dtype(dtype): values = com._ensure_float64(values) keys, counts = htable.value_count_scalar64(values, dropna) else: values = com._ensure_object(values) mask = com.isnull(values) keys, counts = htable.value_count_object(values, mask) if not dropna and mask.any(): keys = np.insert(keys, 0, np.NaN) counts = np.insert(counts, 0, mask.sum()) return keys, counts
def _coo_to_sparse_series(A, dense_index=False): """ Convert a scipy.sparse.coo_matrix to a SparseSeries. Use the defaults given in the SparseSeries constructor. """ s = Series(A.data, MultiIndex.from_arrays((A.row, A.col))) s = s.sort_index() s = s.to_sparse() # TODO: specify kind? if dense_index: # is there a better constructor method to use here? i = range(A.shape[0]) j = range(A.shape[1]) ind = MultiIndex.from_product([i, j]) s = s.reindex_axis(ind) return s
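# Usage sketch (added): in recent pandas versions the public entry point for
# this COO -> Series conversion is Series.sparse.from_coo; requires scipy.
import pandas as pd
from scipy import sparse

# A small COO matrix with two stored values.
A = sparse.coo_matrix(([3.0, 1.0], ([1, 0], [0, 2])), shape=(3, 4))

# Public wrapper around the conversion sketched above; the result is a
# sparse Series indexed by a (row, col) MultiIndex.
s = pd.Series.sparse.from_coo(A)
print(s)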
def test_filter_against_workaround(self): np.random.seed(0) # Series of ints s = Series(np.random.randint(0, 100, 1000)) grouper = s.apply(lambda x: np.round(x, -1)) grouped = s.groupby(grouper) f = lambda x: x.mean() > 10 old_way = s[grouped.transform(f).astype('bool')] new_way = grouped.filter(f) assert_series_equal(new_way.sort_values(), old_way.sort_values()) # Series of floats s = 100 * Series(np.random.random(1000)) grouper = s.apply(lambda x: np.round(x, -1)) grouped = s.groupby(grouper) f = lambda x: x.mean() > 10 old_way = s[grouped.transform(f).astype('bool')] new_way = grouped.filter(f) assert_series_equal(new_way.sort_values(), old_way.sort_values()) # Set up DataFrame of ints, floats, strings. from string import ascii_lowercase letters = np.array(list(ascii_lowercase)) N = 1000 random_letters = letters.take(np.random.randint(0, 26, N)) df = DataFrame({'ints': Series(np.random.randint(0, 100, N)), 'floats': N / 10 * Series(np.random.random(N)), 'letters': Series(random_letters)}) # Group by ints; filter on floats. grouped = df.groupby('ints') old_way = df[grouped.floats. transform(lambda x: x.mean() > N / 20).astype('bool')] new_way = grouped.filter(lambda x: x['floats'].mean() > N / 20) assert_frame_equal(new_way, old_way) # Group by floats (rounded); filter on strings. grouper = df.floats.apply(lambda x: np.round(x, -1)) grouped = df.groupby(grouper) old_way = df[grouped.letters. transform(lambda x: len(x) < N / 10).astype('bool')] new_way = grouped.filter(lambda x: len(x.letters) < N / 10) assert_frame_equal(new_way, old_way) # Group by strings; filter on ints. grouped = df.groupby('letters') old_way = df[grouped.ints. transform(lambda x: x.mean() > N / 20).astype('bool')] new_way = grouped.filter(lambda x: x['ints'].mean() > N / 20) assert_frame_equal(new_way, old_way)
def _get_index_subset_to_coord_dict(index, subset, sort_labels=False): ilabels = list(zip(*[index._get_level_values(i) for i in subset])) labels_to_i = _get_label_to_i_dict(ilabels, sort_labels=sort_labels) labels_to_i = Series(labels_to_i) if len(subset) > 1: labels_to_i.index = MultiIndex.from_tuples(labels_to_i.index) labels_to_i.index.names = [index.names[i] for i in subset] else: labels_to_i.index = Index(x[0] for x in labels_to_i.index) labels_to_i.index.name = index.names[subset[0]] labels_to_i.name = 'value' return (labels_to_i)
def value_counts(values, sort=True, ascending=False, normalize=False): """ Compute a histogram of the counts of non-null values Parameters ---------- values : ndarray (1-d) sort : boolean, default True Sort by values ascending : boolean, default False Sort in ascending order normalize: boolean, default False If True then compute a relative histogram Returns ------- value_counts : Series """ from pandas.core.series import Series values = np.asarray(values) if com.is_integer_dtype(values.dtype): values = com._ensure_int64(values) keys, counts = htable.value_count_int64(values) elif issubclass(values.dtype.type, (np.datetime64,np.timedelta64)): dtype = values.dtype values = values.view(np.int64) keys, counts = htable.value_count_int64(values) # convert the keys back to the dtype we came in keys = Series(keys,dtype=dtype) else: mask = com.isnull(values) values = com._ensure_object(values) keys, counts = htable.value_count_object(values, mask) result = Series(counts, index=keys) if sort: result.sort() if not ascending: result = result[::-1] if normalize: result = result / float(values.size) return result
def __repr__(self):
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", "Sparse")
        series_rep = Series.__repr__(self)
    rep = '{series}\n{index!r}'.format(series=series_rep,
                                       index=self.sp_index)
    return rep
def sort_group_labels(ids, labels, counts):
    n = len(ids)
    rng = np.arange(n)
    values = Series(ids, index=rng, dtype=object).values
    indexer = values.argsort()

    reverse_indexer = np.empty(n, dtype=np.int32)
    reverse_indexer.put(indexer, np.arange(n))

    new_labels = reverse_indexer.take(labels)
    np.putmask(new_labels, labels == -1, -1)

    new_ids = dict(izip(rng, values.take(indexer)))
    new_counts = counts.take(indexer)

    return new_ids, new_labels, new_counts
def pivot(self, index=None, columns=None, values=None): """ See DataFrame.pivot """ if values is None: cols = [columns] if index is None else [index, columns] append = index is None indexed = self.set_index(cols, append=append) return indexed.unstack(columns) else: if index is None: index = self.index else: index = self[index] indexed = Series(self[values].values, index=MultiIndex.from_arrays([index, self[columns]])) return indexed.unstack(columns)
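# Usage sketch (added): the method above backs the public DataFrame.pivot;
# with index, columns and values all given it takes the else-branch, building
# a MultiIndex Series and unstacking the column level.
import pandas as pd

df = pd.DataFrame({
    "date": ["d1", "d1", "d2", "d2"],
    "ticker": ["AAA", "BBB", "AAA", "BBB"],
    "price": [10.0, 20.0, 11.0, 21.0],
})

print(df.pivot(index="date", columns="ticker", values="price"))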
def _get_index_subset_to_coord_dict(index, subset, sort_labels=False): def robust_get_level_values(i): # if index has labels (that are not None) use those, # else use the level location try: return(index.get_level_values(index.names[i])) except KeyError: return(index.get_level_values(i)) ilabels = list( zip(*[robust_get_level_values(i) for i in subset])) labels_to_i = _get_label_to_i_dict( ilabels, sort_labels=sort_labels) labels_to_i = Series(labels_to_i) labels_to_i.index = MultiIndex.from_tuples(labels_to_i.index) labels_to_i.index.names = [index.names[i] for i in subset] labels_to_i.name = 'value' return(labels_to_i)
def test_groupby_categorical_no_compress(self): data = Series(np.random.randn(9)) codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True) result = data.groupby(cats).mean() exp = data.groupby(codes).mean() exp.index = CategoricalIndex(exp.index, categories=cats.categories, ordered=cats.ordered) assert_series_equal(result, exp) codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3]) cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True) result = data.groupby(cats).mean() exp = data.groupby(codes).mean().reindex(cats.categories) exp.index = CategoricalIndex(exp.index, categories=cats.categories, ordered=cats.ordered) assert_series_equal(result, exp) cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c", "d"], ordered=True) data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats}) result = data.groupby("b").mean() result = result["a"].values exp = np.array([1, 2, 4, np.nan]) self.assert_numpy_array_equal(result, exp)
def init_dict(data, index, columns, dtype=None): """ Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. """ if columns is not None: from pandas.core.series import Series arrays = Series(data, index=columns, dtype=object) data_names = arrays.index missing = arrays.isnull() if index is None: # GH10856 # raise ValueError if only scalars in dict index = extract_index(arrays[~missing]) else: index = ensure_index(index) # no obvious "empty" int column if missing.any() and not is_integer_dtype(dtype): if dtype is None or np.issubdtype(dtype, np.flexible): # GH#1783 nan_dtype = object else: nan_dtype = dtype val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype) arrays.loc[missing] = [val] * missing.sum() else: for key in data: if (isinstance(data[key], ABCDatetimeIndex) and data[key].tz is not None): # GH#24096 need copy to be deep for datetime64tz case # TODO: See if we can avoid these copies data[key] = data[key].copy(deep=True) keys = com.dict_keys_to_ordered_list(data) columns = data_names = Index(keys) arrays = [data[k] for k in keys] return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
def _coo_to_sparse_series(A, dense_index: bool = False, sparse_series: bool = True): """ Convert a scipy.sparse.coo_matrix to a SparseSeries. Parameters ---------- A : scipy.sparse.coo.coo_matrix dense_index : bool, default False sparse_series : bool, default True Returns ------- Series or SparseSeries Raises ------ TypeError if A is not a coo_matrix """ from pandas import SparseDtype try: s = Series(A.data, MultiIndex.from_arrays((A.row, A.col))) except AttributeError: raise TypeError('Expected coo_matrix. Got {} instead.' .format(type(A).__name__)) s = s.sort_index() if sparse_series: # TODO(SparseSeries): remove this and the sparse_series keyword. # This is just here to avoid a DeprecationWarning when # _coo_to_sparse_series is called via Series.sparse.from_coo s = s.to_sparse() # TODO: specify kind? else: s = s.astype(SparseDtype(s.dtype)) if dense_index: # is there a better constructor method to use here? i = range(A.shape[0]) j = range(A.shape[1]) ind = MultiIndex.from_product([i, j]) s = s.reindex(ind) return s
def plot(self): """ Plots 2 graphs. One for N-period moving average, lower and upper bands. One for P/N and position. """ columns = {"Upper Bands": self.upper_bands, "Lower Bands": self.lower_bands, "Moving Means": self.moving_means, "Opening Prices": self.prices} df = DataFrame(columns, index=self.dates) df.plot() fig = plt.figure(num=None, figsize=(18, 10), dpi=80, facecolor='w', edgecolor='k') fig.add_subplot(121) trans_dates = [tran.date for tran in self.transactions] # we negate the value here to show profit/loss trans = Series([-tran.value() for tran in self.transactions], index=trans_dates) position = Series([tran.units for tran in self.transactions], index=trans_dates) position.cumsum().plot(label="Position") plt.xlabel("Date") plt.ylabel("Position") plt.title("Position over Time") plt.legend(loc="best") fig.add_subplot(122) trans.cumsum().plot(label="P/L") plt.xlabel("Date") plt.ylabel("Profit/Loss") plt.title("Profit and Loss over Time") plt.legend(loc="best") plt.show()
def get_sysprice_list(start_time, end_time, frequency='hourly'):
    '''Wrapper function for creating a pandas Series object from data received
    from the database.

    Returns: pandas Series object with Elspot system prices and corresponding
             dates for the requested time period and frequency.
    Parameters:
        start_time - string representing the start of the time period,
                     format must be 'yyyy-mm-dd'.
        end_time   - string representing the end of the time period,
                     format must be 'yyyy-mm-dd'.
        frequency  - string representing the frequency of the output pandas
                     Series object. Currently must be one of
                     ['hourly', 'daily', 'monthly'].
    '''
    # Retrieve hourly system prices and timestamps from the database as lists
    _, sys_prices, times = get_system_price_volume(start_time, end_time)
    ts = Series(sys_prices, index=times)

    if frequency == 'hourly':
        # Resampling is not necessary
        return ts

    if frequency == 'daily':
        resampling_frequency = 'D'
    # Weekly functionality not needed for now:
    # elif frequency == 'weekly':
    #     start_time = datetime.datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S')
    #     end_time = datetime.datetime.strptime(end_time, '%Y-%m-%d %H:%M:%S')
    #     if start_time.date().weekday() != 0:
    #         raise ValueError(str(start_time.date()) + " is a " +
    #                          start_time.date().strftime('%A') +
    #                          ". start_date must be a Monday.")
    #     if end_time.date().weekday() != 6:
    #         raise ValueError(str(end_time.date()) + " is a " +
    #                          end_time.date().strftime('%A') +
    #                          ". end_date must be a Sunday.")
    #     resampling_frequency = 'W'
    elif frequency == 'monthly':
        resampling_frequency = 'M'
    else:
        # Previously an unknown frequency left resampling_frequency undefined
        # and raised a NameError; fail explicitly instead.
        raise ValueError("frequency must be one of "
                         "['hourly', 'daily', 'monthly'], got %r" % frequency)

    return ts.resample(resampling_frequency, how='mean', kind='timestamp')
def value_counts(values, sort=True, ascending=False): """ Compute a histogram of the counts of non-null values Parameters ---------- values : ndarray (1-d) sort : boolean, default True Sort by values ascending : boolean, default False Sort in ascending order Returns ------- value_counts : Series """ from pandas.core.series import Series values = np.asarray(values) if com.is_integer_dtype(values.dtype): values = com._ensure_int64(values) keys, counts = htable.value_count_int64(values) else: mask = com.isnull(values) values = com._ensure_object(values) keys, counts = htable.value_count_object(values, mask) result = Series(counts, index=keys) if sort: result.sort() if not ascending: result = result[::-1] return result
def __unicode__(self):
    # currently, unicode is same as repr...fixes infinite loop
    series_rep = Series.__unicode__(self)
    rep = '{series}\n{index!r}'.format(series=series_rep,
                                       index=self.sp_index)
    return rep
def value_counts(values, sort=True, ascending=False, normalize=False, bins=None, dropna=True): """ Compute a histogram of the counts of non-null values. Parameters ---------- values : ndarray (1-d) sort : boolean, default True Sort by values ascending : boolean, default False Sort in ascending order normalize: boolean, default False If True then compute a relative histogram bins : integer, optional Rather than count values, group them into half-open bins, convenience for pd.cut, only works with numeric data dropna : boolean, default True Don't include counts of NaN Returns ------- value_counts : Series """ from pandas.core.series import Series name = getattr(values, 'name', None) if bins is not None: try: from pandas.tools.tile import cut values = Series(values).values cat, bins = cut(values, bins, retbins=True) except TypeError: raise TypeError("bins argument only works with numeric data.") values = cat.codes if is_extension_type(values) and not is_datetimetz(values): # handle Categorical and sparse, # datetime tz can be handeled in ndarray path result = Series(values).values.value_counts(dropna=dropna) result.name = name counts = result.values else: # ndarray path. pass original to handle DatetimeTzBlock keys, counts = _value_counts_arraylike(values, dropna=dropna) from pandas import Index, Series if not isinstance(keys, Index): keys = Index(keys) result = Series(counts, index=keys, name=name) if bins is not None: # TODO: This next line should be more efficient result = result.reindex(np.arange(len(cat.categories)), fill_value=0) result.index = bins[:-1] if sort: result = result.sort_values(ascending=ascending) if normalize: result = result / float(counts.sum()) return result
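# Usage sketch (added): the normalize and bins options described above are
# reachable through the public Series.value_counts method.
import pandas as pd

s = pd.Series([1, 2, 2, 3, 3, 3, 10])

# Relative frequencies instead of raw counts.
print(s.value_counts(normalize=True))

# Group numeric values into two half-open bins (convenience for pd.cut),
# as described for the `bins` argument above.
print(s.value_counts(bins=2))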
def aggregate(self, func_or_funcs, *args, **kwargs): """ Apply aggregation function or functions to groups, yielding most likely Series but in some cases DataFrame depending on the output of the aggregation function Parameters ---------- func_or_funcs : function or list / dict of functions List/dict of functions will produce DataFrame with column names determined by the function names themselves (list) or the keys in the dict Notes ----- agg is an alias for aggregate. Use it. Example ------- >>> series bar 1.0 baz 2.0 qot 3.0 qux 4.0 >>> mapper = lambda x: x[0] # first letter >>> grouped = series.groupby(mapper) >>> grouped.aggregate(np.sum) b 3.0 q 7.0 >>> grouped.aggregate([np.sum, np.mean, np.std]) mean std sum b 1.5 0.5 3 q 3.5 0.5 7 >>> grouped.agg({'result' : lambda x: x.mean() / x.std(), ... 'total' : np.sum}) result total b 2.121 3 q 4.95 7 See also -------- apply, transform Returns ------- Series or DataFrame """ if isinstance(func_or_funcs, basestring): return getattr(self, func_or_funcs)(*args, **kwargs) if hasattr(func_or_funcs, '__iter__'): ret = self._aggregate_multiple_funcs(func_or_funcs) else: if len(self.grouper.groupings) > 1: return self._python_agg_general(func_or_funcs, *args, **kwargs) try: return self._python_agg_general(func_or_funcs, *args, **kwargs) except Exception: result = self._aggregate_named(func_or_funcs, *args, **kwargs) index = Index(sorted(result), name=self.grouper.names[0]) ret = Series(result, index=index) if not self.as_index: # pragma: no cover print 'Warning, ignoring as_index=True' return ret
def __init__(self, data, index=None, sparse_index=None, kind='block', fill_value=None, name=None, dtype=None, copy=False, fastpath=False): # we are called internally, so short-circuit if fastpath: # data is an ndarray, index is defined data = SingleBlockManager(data, index, fastpath=True) if copy: data = data.copy() else: is_sparse_array = isinstance(data, SparseArray) if fill_value is None: if is_sparse_array: fill_value = data.fill_value else: fill_value = nan if is_sparse_array: if isinstance(data, SparseSeries) and index is None: index = data.index elif index is not None: assert(len(index) == len(data)) sparse_index = data.sp_index data = np.asarray(data) elif isinstance(data, SparseSeries): if index is None: index = data.index # extract the SingleBlockManager data = data._data elif isinstance(data, (Series, dict)): if index is None: index = data.index data = Series(data) data, sparse_index = make_sparse(data, kind=kind, fill_value=fill_value) elif isinstance(data, (tuple, list, np.ndarray)): # array-like if sparse_index is None: data, sparse_index = make_sparse(data, kind=kind, fill_value=fill_value) else: assert(len(data) == sparse_index.npoints) elif isinstance(data, SingleBlockManager): if dtype is not None: data = data.astype(dtype) if index is None: index = data.index else: data = data.reindex(index, copy=False) else: length = len(index) if data == fill_value or (isnull(data) and isnull(fill_value)): if kind == 'block': sparse_index = BlockIndex(length, [], []) else: sparse_index = IntIndex(length, []) data = np.array([]) else: if kind == 'block': locs, lens = ([0], [length]) if length else ([], []) sparse_index = BlockIndex(length, locs, lens) else: sparse_index = IntIndex(length, index) v = data data = np.empty(length) data.fill(v) if index is None: index = com._default_index(sparse_index.length) index = _ensure_index(index) # create/copy the manager if isinstance(data, SingleBlockManager): if copy: data = data.copy() else: # create a sparse array if not isinstance(data, SparseArray): data = SparseArray( data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype, copy=copy) data = SingleBlockManager(data, index) generic.NDFrame.__init__(self, data) self.index = index self.name = name
def _get_empty_meta(self, columns, index_col, index_names, dtype: DtypeArg | None = None): columns = list(columns) # Convert `dtype` to a defaultdict of some kind. # This will enable us to write `dtype[col_name]` # without worrying about KeyError issues later on. if not is_dict_like(dtype): # if dtype == None, default will be object. default_dtype = dtype or object # error: Argument 1 to "defaultdict" has incompatible type "Callable[[], # Union[ExtensionDtype, str, dtype[Any], Type[object], Dict[Hashable, # Union[ExtensionDtype, Union[str, dtype[Any]], Type[str], Type[float], # Type[int], Type[complex], Type[bool], Type[object]]]]]"; expected # "Optional[Callable[[], Union[ExtensionDtype, str, dtype[Any], # Type[object]]]]" # error: Incompatible return value type (got "Union[ExtensionDtype, str, # dtype[Any], Type[object], Dict[Hashable, Union[ExtensionDtype, Union[str, # dtype[Any]], Type[str], Type[float], Type[int], Type[complex], Type[bool], # Type[object]]]]", expected "Union[ExtensionDtype, str, dtype[Any], # Type[object]]") dtype = defaultdict( lambda: default_dtype # type: ignore[arg-type, return-value] ) else: dtype = cast(dict, dtype) dtype = defaultdict( lambda: object, { columns[k] if is_integer(k) else k: v for k, v in dtype.items() }, ) # Even though we have no data, the "index" of the empty DataFrame # could for example still be an empty MultiIndex. Thus, we need to # check whether we have any index columns specified, via either: # # 1) index_col (column indices) # 2) index_names (column names) # # Both must be non-null to ensure a successful construction. Otherwise, # we have to create a generic empty Index. if (index_col is None or index_col is False) or index_names is None: index = Index([]) else: data = [Series([], dtype=dtype[name]) for name in index_names] index = ensure_index_from_sequences(data, names=index_names) index_col.sort() for i, n in enumerate(index_col): columns.pop(n - i) col_dict = { col_name: Series([], dtype=dtype[col_name]) for col_name in columns } return index, columns, col_dict
def value_counts(values, sort=True, ascending=False, normalize=False, bins=None): """ Compute a histogram of the counts of non-null values Parameters ---------- values : ndarray (1-d) sort : boolean, default True Sort by values ascending : boolean, default False Sort in ascending order normalize: boolean, default False If True then compute a relative histogram bins : integer, optional Rather than count values, group them into half-open bins, convenience for pd.cut, only works with numeric data Returns ------- value_counts : Series """ from pandas.core.series import Series from pandas.tools.tile import cut values = Series(values).values if bins is not None: try: cat, bins = cut(values, bins, retbins=True) except TypeError: raise TypeError("bins argument only works with numeric data.") values = cat.labels if com.is_integer_dtype(values.dtype): values = com._ensure_int64(values) keys, counts = htable.value_count_int64(values) elif issubclass(values.dtype.type, (np.datetime64, np.timedelta64)): dtype = values.dtype values = values.view(np.int64) keys, counts = htable.value_count_int64(values) # convert the keys back to the dtype we came in keys = Series(keys, dtype=dtype) else: mask = com.isnull(values) values = com._ensure_object(values) keys, counts = htable.value_count_object(values, mask) result = Series(counts, index=com._values_from_object(keys)) if bins is not None: # TODO: This next line should be more efficient result = result.reindex(np.arange(len(cat.levels)), fill_value=0) result.index = bins[:-1] if sort: result.sort() if not ascending: result = result[::-1] if normalize: result = result / float(values.size) return result
def str_extract(arr, pat, flags=0): """ Find groups in each string using passed regular expression Parameters ---------- pat : string Pattern or regular expression flags : int, default 0 (no flags) re module flags, e.g. re.IGNORECASE Returns ------- extracted groups : Series (one group) or DataFrame (multiple groups) Note that dtype of the result is always object, even when no match is found and the result is a Series or DataFrame containing only NaN values. Examples -------- A pattern with one group will return a Series. Non-matches will be NaN. >>> Series(['a1', 'b2', 'c3']).str.extract('[ab](\d)') 0 1 1 2 2 NaN dtype: object A pattern with more than one group will return a DataFrame. >>> Series(['a1', 'b2', 'c3']).str.extract('([ab])(\d)') 0 1 0 a 1 1 b 2 2 NaN NaN A pattern may contain optional groups. >>> Series(['a1', 'b2', 'c3']).str.extract('([ab])?(\d)') 0 1 0 a 1 1 b 2 2 NaN 3 Named groups will become column names in the result. >>> Series(['a1', 'b2', 'c3']).str.extract('(?P<letter>[ab])(?P<digit>\d)') letter digit 0 a 1 1 b 2 2 NaN NaN """ from pandas.core.series import Series from pandas.core.frame import DataFrame regex = re.compile(pat, flags=flags) # just to be safe, check this if regex.groups == 0: raise ValueError("This pattern contains no groups to capture.") empty_row = [np.nan]*regex.groups def f(x): if not isinstance(x, compat.string_types): return empty_row m = regex.search(x) if m: return [np.nan if item is None else item for item in m.groups()] else: return empty_row if regex.groups == 1: result = Series([f(val)[0] for val in arr], name=_get_single_group_name(regex), index=arr.index, dtype=object) else: names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) columns = [names.get(1 + i, i) for i in range(regex.groups)] if arr.empty: result = DataFrame(columns=columns, dtype=object) else: result = DataFrame([f(val) for val in arr], columns=columns, index=arr.index, dtype=object) return result
def evaluate_model(self, x_test: pdSeries.Series, y_test: pdSeries.Series,
                   pipe: Pipeline):
    predicted = pipe.predict(x_test)
    logger.info("model accuracy is: %.3f \n" % pipe.score(x_test, y_test))
    logger.info(metrics.classification_report(y_test, predicted,
                                              target_names=y_test.unique()))
def _get_dummies_1d( data, prefix, prefix_sep="_", dummy_na=False, sparse=False, drop_first=False, dtype=None, ): from pandas.core.reshape.concat import concat # Series avoids inconsistent NaN handling codes, levels = _factorize_from_iterable(Series(data)) if dtype is None: dtype = np.uint8 dtype = np.dtype(dtype) if is_object_dtype(dtype): raise ValueError("dtype=object is not a valid dtype for get_dummies") def get_empty_frame(data): if isinstance(data, Series): index = data.index else: index = np.arange(len(data)) return DataFrame(index=index) # if all NaN if not dummy_na and len(levels) == 0: return get_empty_frame(data) codes = codes.copy() if dummy_na: codes[codes == -1] = len(levels) levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again if drop_first and len(levels) == 1: return get_empty_frame(data) number_of_cols = len(levels) if prefix is None: dummy_cols = levels else: # PY2 embedded unicode, gh-22084 def _make_col_name(prefix, prefix_sep, level): fstr = "{prefix}{prefix_sep}{level}" return fstr.format(prefix=prefix, prefix_sep=prefix_sep, level=level) dummy_cols = [ _make_col_name(prefix, prefix_sep, level) for level in levels ] if isinstance(data, Series): index = data.index else: index = None if sparse: if is_integer_dtype(dtype): fill_value = 0 elif dtype == bool: fill_value = False else: fill_value = 0.0 sparse_series = [] N = len(data) sp_indices = [[] for _ in range(len(dummy_cols))] mask = codes != -1 codes = codes[mask] n_idx = np.arange(N)[mask] for ndx, code in zip(n_idx, codes): sp_indices[code].append(ndx) if drop_first: # remove first categorical level to avoid perfect collinearity # GH12042 sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): sarr = SparseArray( np.ones(len(ixs), dtype=dtype), sparse_index=IntIndex(N, ixs), fill_value=fill_value, dtype=dtype, ) sparse_series.append(Series(data=sarr, index=index, name=col)) out = concat(sparse_series, axis=1, copy=False) return out else: dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0) if not dummy_na: # reset NaN GH4446 dummy_mat[codes == -1] = 0 if drop_first: # remove first GH12042 dummy_mat = dummy_mat[:, 1:] dummy_cols = dummy_cols[1:] return DataFrame(dummy_mat, index=index, columns=dummy_cols)
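# Usage sketch (added): _get_dummies_1d is driven by the public pd.get_dummies;
# the dummy_na and drop_first branches above correspond to these keywords.
import pandas as pd

s = pd.Series(["a", "b", None, "a"])

# One indicator column per level; NaN rows are all-zero unless dummy_na=True.
print(pd.get_dummies(s))

# dummy_na adds an explicit NaN indicator column; drop_first removes the first
# level to avoid perfect collinearity (GH12042).
print(pd.get_dummies(s, dummy_na=True, drop_first=True))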
def beta(self): return Series(self._beta_raw, index=self._x.columns)
def _unstack_multiple(data, clocs): if len(clocs) == 0: return data # NOTE: This doesn't deal with hierarchical columns yet index = data.index clocs = [index._get_level_number(i) for i in clocs] rlocs = [i for i in range(index.nlevels) if i not in clocs] clevels = [index.levels[i] for i in clocs] clabels = [index.labels[i] for i in clocs] cnames = [index.names[i] for i in clocs] rlevels = [index.levels[i] for i in rlocs] rlabels = [index.labels[i] for i in rlocs] rnames = [index.names[i] for i in rlocs] shape = [len(x) for x in clevels] group_index = get_group_index(clabels, shape) comp_ids, obs_ids = _compress_group_index(group_index, sort=False) recons_labels = decons_group_index(obs_ids, shape) dummy_index = MultiIndex(levels=rlevels + [obs_ids], labels=rlabels + [comp_ids], names=rnames + ['__placeholder__']) if isinstance(data, Series): dummy = Series(data.values, index=dummy_index) unstacked = dummy.unstack('__placeholder__') new_levels = clevels new_names = cnames new_labels = recons_labels else: if isinstance(data.columns, MultiIndex): result = data for i in range(len(clocs)): val = clocs[i] result = result.unstack(val) clocs = [val if i > val else val - 1 for val in clocs] return result dummy = DataFrame(data.values, index=dummy_index, columns=data.columns) unstacked = dummy.unstack('__placeholder__') if isinstance(unstacked, Series): unstcols = unstacked.index else: unstcols = unstacked.columns new_levels = [unstcols.levels[0]] + clevels new_names = [data.columns.name] + cnames new_labels = [unstcols.labels[0]] for rec in recons_labels: new_labels.append(rec.take(unstcols.labels[-1])) new_columns = MultiIndex(levels=new_levels, labels=new_labels, names=new_names) if isinstance(unstacked, Series): unstacked.index = new_columns else: unstacked.columns = new_columns return unstacked
def _chop(self, sdata: Series, slice_obj: slice) -> Series:
    # fastpath equivalent to `sdata.iloc[slice_obj]`
    mgr = sdata._mgr.get_slice(slice_obj)
    # __finalize__ not called here, must be applied by caller if applicable
    return sdata._constructor(mgr, name=sdata.name, fastpath=True)
def _generate_marginal_results(table, data, values, rows, cols, aggfunc, grand_margin, margins_name='All'): if len(cols) > 0: # need to "interleave" the margins table_pieces = [] margin_keys = [] def _all_key(key): return (key, margins_name) + ('', ) * (len(cols) - 1) if len(rows) > 0: margin = data[rows + values].groupby(rows).agg(aggfunc) cat_axis = 1 for key, piece in table.groupby(level=0, axis=cat_axis): all_key = _all_key(key) # we are going to mutate this, so need to copy! piece = piece.copy() try: piece[all_key] = margin[key] except TypeError: # we cannot reshape, so coerce the axis piece.set_axis( piece._get_axis(cat_axis)._to_safe_for_reshape(), axis=cat_axis, inplace=True) piece[all_key] = margin[key] table_pieces.append(piece) margin_keys.append(all_key) else: margin = grand_margin cat_axis = 0 for key, piece in table.groupby(level=0, axis=cat_axis): all_key = _all_key(key) table_pieces.append(piece) table_pieces.append(Series(margin[key], index=[all_key])) margin_keys.append(all_key) result = concat(table_pieces, axis=cat_axis) if len(rows) == 0: return result else: result = table margin_keys = table.columns if len(cols) > 0: row_margin = data[cols + values].groupby(cols).agg(aggfunc) row_margin = row_margin.stack() # slight hack new_order = [len(cols)] + lrange(len(cols)) row_margin.index = row_margin.index.reorder_levels(new_order) else: row_margin = Series(np.nan, index=result.columns) return result, margin_keys, row_margin
def _add_margins(table, data, values, rows, cols, aggfunc, margins_name='All', fill_value=None): if not isinstance(margins_name, compat.string_types): raise ValueError('margins_name argument must be a string') msg = u'Conflicting name "{name}" in margins'.format(name=margins_name) for level in table.index.names: if margins_name in table.index.get_level_values(level): raise ValueError(msg) grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name) # could be passed a Series object with no 'columns' if hasattr(table, 'columns'): for level in table.columns.names[1:]: if margins_name in table.columns.get_level_values(level): raise ValueError(msg) if len(rows) > 1: key = (margins_name, ) + ('', ) * (len(rows) - 1) else: key = margins_name if not values and isinstance(table, ABCSeries): # If there are no values and the table is a series, then there is only # one column in the data. Compute grand margin and return it. return table.append(Series({key: grand_margin[margins_name]})) if values: marginal_result_set = _generate_marginal_results( table, data, values, rows, cols, aggfunc, grand_margin, margins_name) if not isinstance(marginal_result_set, tuple): return marginal_result_set result, margin_keys, row_margin = marginal_result_set else: marginal_result_set = _generate_marginal_results_without_values( table, data, rows, cols, aggfunc, margins_name) if not isinstance(marginal_result_set, tuple): return marginal_result_set result, margin_keys, row_margin = marginal_result_set row_margin = row_margin.reindex(result.columns, fill_value=fill_value) # populate grand margin for k in margin_keys: if isinstance(k, compat.string_types): row_margin[k] = grand_margin[k] else: row_margin[k] = grand_margin[k[0]] from pandas import DataFrame margin_dummy = DataFrame(row_margin, columns=[key]).T row_names = result.index.names try: for dtype in set(result.dtypes): cols = result.select_dtypes([dtype]).columns margin_dummy[cols] = margin_dummy[cols].astype(dtype) result = result.append(margin_dummy) except TypeError: # we cannot reshape, so coerce the axis result.index = result.index._to_safe_for_reshape() result = result.append(margin_dummy) result.index.names = row_names return result
def _add_margins( table: DataFrame | Series, data: DataFrame, values, rows, cols, aggfunc, observed=None, margins_name: str = "All", fill_value=None, ): if not isinstance(margins_name, str): raise ValueError("margins_name argument must be a string") msg = f'Conflicting name "{margins_name}" in margins' for level in table.index.names: if margins_name in table.index.get_level_values(level): raise ValueError(msg) grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name) if table.ndim == 2: # i.e. DataFrame for level in table.columns.names[1:]: if margins_name in table.columns.get_level_values(level): raise ValueError(msg) key: str | tuple[str, ...] if len(rows) > 1: key = (margins_name,) + ("",) * (len(rows) - 1) else: key = margins_name if not values and isinstance(table, ABCSeries): # If there are no values and the table is a series, then there is only # one column in the data. Compute grand margin and return it. return table._append(Series({key: grand_margin[margins_name]})) elif values: marginal_result_set = _generate_marginal_results( table, data, values, rows, cols, aggfunc, observed, margins_name ) if not isinstance(marginal_result_set, tuple): return marginal_result_set result, margin_keys, row_margin = marginal_result_set else: # no values, and table is a DataFrame assert isinstance(table, ABCDataFrame) marginal_result_set = _generate_marginal_results_without_values( table, data, rows, cols, aggfunc, observed, margins_name ) if not isinstance(marginal_result_set, tuple): return marginal_result_set result, margin_keys, row_margin = marginal_result_set row_margin = row_margin.reindex(result.columns, fill_value=fill_value) # populate grand margin for k in margin_keys: if isinstance(k, str): row_margin[k] = grand_margin[k] else: row_margin[k] = grand_margin[k[0]] from pandas import DataFrame margin_dummy = DataFrame(row_margin, columns=[key]).T row_names = result.index.names # check the result column and leave floats for dtype in set(result.dtypes): cols = result.select_dtypes([dtype]).columns margin_dummy[cols] = margin_dummy[cols].apply( maybe_downcast_to_dtype, args=(dtype,) ) result = result._append(margin_dummy) result.index.names = row_names return result
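# Usage sketch (added): _add_margins is reached through the public
# pd.pivot_table when margins=True; the "All" row/column it appends uses the
# same aggfunc as the body of the table.
import pandas as pd

df = pd.DataFrame({
    "A": ["x", "x", "y", "y"],
    "B": ["p", "q", "p", "q"],
    "val": [1.0, 2.0, 3.0, 4.0],
})

print(pd.pivot_table(df, values="val", index="A", columns="B",
                     aggfunc="mean", margins=True, margins_name="All"))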
def dict_to_mgr( data: dict, index, columns, *, dtype: DtypeObj | None = None, typ: str = "block", copy: bool = True, ) -> Manager: """ Segregate Series based on type and coerce into matrices. Needs to handle a lot of exceptional cases. Used in DataFrame.__init__ """ arrays: Sequence[Any] | Series if columns is not None: from pandas.core.series import Series arrays = Series(data, index=columns, dtype=object) data_names = arrays.index missing = arrays.isna() if index is None: # GH10856 # raise ValueError if only scalars in dict index = extract_index(arrays[~missing]) else: index = ensure_index(index) # no obvious "empty" int column if missing.any() and not is_integer_dtype(dtype): nan_dtype: DtypeObj if dtype is None or (isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.flexible)): # GH#1783 nan_dtype = np.dtype("object") else: nan_dtype = dtype val = construct_1d_arraylike_from_scalar(np.nan, len(index), nan_dtype) arrays.loc[missing] = [val] * missing.sum() arrays = list(arrays) else: keys = list(data.keys()) columns = data_names = Index(keys) arrays = [com.maybe_iterable_to_list(data[k]) for k in keys] # GH#24096 need copy to be deep for datetime64tz case # TODO: See if we can avoid these copies arrays = [ arr if not isinstance(arr, ABCIndex) else arr._data for arr in arrays ] arrays = [ arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays ] if copy: # arrays_to_mgr (via form_blocks) won't make copies for EAs # dtype attr check to exclude EADtype-castable strs arrays = [ x if not hasattr(x, "dtype") or not isinstance(x.dtype, ExtensionDtype) else x.copy() for x in arrays ] # TODO: can we get rid of the dt64tz special case above? return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype, typ=typ, consolidate=copy)
def _generate_marginal_results( table, data, values, rows, cols, aggfunc, observed, margins_name: str = "All" ): if len(cols) > 0: # need to "interleave" the margins table_pieces = [] margin_keys = [] def _all_key(key): return (key, margins_name) + ("",) * (len(cols) - 1) if len(rows) > 0: margin = data[rows + values].groupby(rows, observed=observed).agg(aggfunc) cat_axis = 1 for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed): all_key = _all_key(key) # we are going to mutate this, so need to copy! piece = piece.copy() piece[all_key] = margin[key] table_pieces.append(piece) margin_keys.append(all_key) else: from pandas import DataFrame cat_axis = 0 for key, piece in table.groupby(level=0, axis=cat_axis, observed=observed): if len(cols) > 1: all_key = _all_key(key) else: all_key = margins_name table_pieces.append(piece) # GH31016 this is to calculate margin for each group, and assign # corresponded key as index transformed_piece = DataFrame(piece.apply(aggfunc)).T transformed_piece.index = Index([all_key], name=piece.index.name) # append piece for margin into table_piece table_pieces.append(transformed_piece) margin_keys.append(all_key) result = concat(table_pieces, axis=cat_axis) if len(rows) == 0: return result else: result = table margin_keys = table.columns if len(cols) > 0: row_margin = data[cols + values].groupby(cols, observed=observed).agg(aggfunc) row_margin = row_margin.stack() # slight hack new_order = [len(cols)] + list(range(len(cols))) row_margin.index = row_margin.index.reorder_levels(new_order) else: row_margin = Series(np.nan, index=result.columns) return result, margin_keys, row_margin
def to_datetime(arg, errors='ignore', dayfirst=False, utc=None, box=True,
                format=None, coerce=False, unit='ns',
                infer_datetime_format=False):
    """
    Convert argument to datetime

    Parameters
    ----------
    arg : string, datetime, array of strings (with possible NAs)
    errors : {'ignore', 'raise'}, default 'ignore'
        Errors are ignored by default (values left untouched)
    dayfirst : boolean, default False
        If True parses dates with the day first, eg 20/01/2005
        Warning: dayfirst=True is not strict, but will prefer to parse
        with day first (this is a known bug).
    utc : boolean, default None
        Return UTC DatetimeIndex if True (converting any tz-aware
        datetime.datetime objects as well)
    box : boolean, default True
        If True returns a DatetimeIndex, if False returns ndarray of values
    format : string, default None
        strftime to parse time, eg "%d/%m/%Y"
    coerce : force errors to NaT (False by default)
    unit : unit of the arg (D,s,ms,us,ns) denote the unit in epoch
        (e.g. a unix timestamp), which is an integer/float number
    infer_datetime_format: boolean, default False
        If no `format` is given, try to infer the format based on the first
        datetime string. Provides a large speed-up in many cases.

    Returns
    -------
    ret : datetime if parsing succeeded

    Examples
    --------
    Take separate series and convert to datetime

    >>> import pandas as pd
    >>> i = pd.date_range('20000101', periods=100)
    >>> df = pd.DataFrame(dict(year=i.year, month=i.month, day=i.day))
    >>> pd.to_datetime(df.year*10000 + df.month*100 + df.day, format='%Y%m%d')

    Or from strings

    >>> df = df.astype(str)
    >>> pd.to_datetime(df.day + df.month + df.year, format="%d%m%Y")
    """
    from pandas import Timestamp
    from pandas.core.series import Series
    from pandas.tseries.index import DatetimeIndex

    def _convert_listlike(arg, box, format):
        if isinstance(arg, (list, tuple)):
            arg = np.array(arg, dtype='O')

        if com.is_datetime64_ns_dtype(arg):
            if box and not isinstance(arg, DatetimeIndex):
                try:
                    return DatetimeIndex(arg, tz='utc' if utc else None)
                except ValueError:
                    pass
            return arg

        arg = com._ensure_object(arg)

        if infer_datetime_format and format is None:
            format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst)

        if format is not None:
            # There is a special fast-path for iso8601 formatted
            # datetime strings, so in those cases don't use the inferred
            # format because this path makes process slower in this
            # special case
            format_is_iso8601 = (
                '%Y-%m-%dT%H:%M:%S.%f'.startswith(format) or
                '%Y-%m-%d %H:%M:%S.%f'.startswith(format)
            )
            if format_is_iso8601:
                format = None

        try:
            result = None

            if format is not None:
                # shortcut formatting here
                if format == '%Y%m%d':
                    try:
                        result = _attempt_YYYYMMDD(arg)
                    except:
                        raise ValueError("cannot convert the input to "
                                         "'%Y%m%d' date format")

                # fallback
                if result is None:
                    try:
                        result = tslib.array_strptime(
                            arg, format, coerce=coerce
                        )
                    except (tslib.OutOfBoundsDatetime):
                        if errors == 'raise':
                            raise
                        result = arg
                    except ValueError:
                        # Only raise this error if the user provided the
                        # datetime format, and not when it was inferred
                        if not infer_datetime_format:
                            raise

            if result is None and (format is None or infer_datetime_format):
                result = tslib.array_to_datetime(arg,
                                                 raise_=errors == 'raise',
                                                 utc=utc, dayfirst=dayfirst,
                                                 coerce=coerce, unit=unit)

            if com.is_datetime64_dtype(result) and box:
                result = DatetimeIndex(result, tz='utc' if utc else None)
            return result

        except ValueError as e:
            try:
                values, tz = tslib.datetime_to_datetime64(arg)
                return DatetimeIndex._simple_new(values, None, tz=tz)
            except (ValueError, TypeError):
                raise e

    if arg is None:
        return arg
    elif isinstance(arg, Timestamp):
        return arg
    elif isinstance(arg, Series):
        values = _convert_listlike(arg.values, False, format)
        return Series(values, index=arg.index, name=arg.name)
    elif com.is_list_like(arg):
        return _convert_listlike(arg, box, format)

    return _convert_listlike(np.array([arg]), box, format)[0]
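# Usage sketch (added): the public pd.to_datetime exercises the paths above —
# an explicit format string takes the strptime shortcut, and the docstring's
# year/month/day example goes through the '%Y%m%d' fast path.
import pandas as pd

print(pd.to_datetime(["2000-01-02", "2000-01-03"], format="%Y-%m-%d"))

df = pd.DataFrame({"year": [2000, 2000], "month": [1, 2], "day": [2, 3]})
print(pd.to_datetime(df.year * 10000 + df.month * 100 + df.day,
                     format="%Y%m%d"))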
def granger_causality(self): """Returns the f-stats and p-values from the Granger Causality Test. If the data consists of columns x1, x2, x3, then we perform the following regressions: x1 ~ L(x2, x3) x1 ~ L(x1, x3) x1 ~ L(x1, x2) The f-stats of these results are placed in the 'x1' column of the returned DataFrame. We then repeat for x2, x3. Returns ------- Dict, where 'f-stat' returns the DataFrame containing the f-stats, and 'p-value' returns the DataFrame containing the corresponding p-values of the f-stats. """ from pandas.stats.api import ols from scipy.stats import f d = {} for col in self._columns: d[col] = {} for i in xrange(1, 1 + self._p): lagged_data = self._lagged_data[i].filter(self._columns - [col]) for key, value in lagged_data.iteritems(): d[col][_make_param_name(i, key)] = value f_stat_dict = {} p_value_dict = {} for col, y in self._data.iteritems(): ssr_full = (self.resid[col]**2).sum() f_stats = [] p_values = [] for col2 in self._columns: result = ols(y=y, x=d[col2]) resid = result.resid ssr_reduced = (resid**2).sum() M = self._p N = self._nobs K = self._k * self._p + 1 f_stat = ((ssr_reduced - ssr_full) / M) / (ssr_full / (N - K)) f_stats.append(f_stat) p_value = f.sf(f_stat, M, N - K) p_values.append(p_value) f_stat_dict[col] = Series(f_stats, self._columns) p_value_dict[col] = Series(p_values, self._columns) f_stat_mat = DataFrame(f_stat_dict) p_value_mat = DataFrame(p_value_dict) return { 'f-stat': f_stat_mat, 'p-value': p_value_mat, }
def _wrap_aggregated_output(self, output):
    # sort of a kludge
    output = output[self.name]
    index = self.grouper.result_index
    return Series(output, index=index, name=self.name)
def agg_dict_like( obj: AggObjType, arg: AggFuncTypeDict, _axis: int, ) -> FrameOrSeriesUnion: """ Compute aggregation in the case of a dict-like argument. Parameters ---------- obj : Pandas object to compute aggregation on. arg : dict label-aggregation pairs to compute. _axis : int, 0 or 1 Axis to compute aggregation on. Returns ------- Result of aggregation. """ is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) if _axis != 0: # pragma: no cover raise ValueError("Can only pass dict with axis=0") selected_obj = obj._selected_obj # if we have a dict of any non-scalars # eg. {'A' : ['mean']}, normalize all to # be list-likes # Cannot use arg.values() because arg may be a Series if any(is_aggregator(x) for _, x in arg.items()): new_arg: AggFuncTypeDict = {} for k, v in arg.items(): if not isinstance(v, (tuple, list, dict)): new_arg[k] = [v] else: new_arg[k] = v # the keys must be in the columns # for ndim=2, or renamers for ndim=1 # ok for now, but deprecated # {'A': { 'ra': 'mean' }} # {'A': { 'ra': ['mean'] }} # {'ra': ['mean']} # not ok # {'ra' : { 'A' : 'mean' }} if isinstance(v, dict): raise SpecificationError("nested renamer is not supported") elif isinstance(selected_obj, ABCSeries): raise SpecificationError("nested renamer is not supported") elif (isinstance(selected_obj, ABCDataFrame) and k not in selected_obj.columns): raise KeyError(f"Column '{k}' does not exist!") arg = new_arg else: # deprecation of renaming keys # GH 15931 keys = list(arg.keys()) if isinstance(selected_obj, ABCDataFrame) and len( selected_obj.columns.intersection(keys)) != len(keys): cols = list( safe_sort( list( set(keys) - set(selected_obj.columns.intersection(keys))), )) raise SpecificationError(f"Column(s) {cols} do not exist") from pandas.core.reshape.concat import concat if selected_obj.ndim == 1: # key only used for output colg = obj._gotitem(obj._selection, ndim=1) results = {key: colg.agg(how) for key, how in arg.items()} else: # key used for column selection and output results = { key: obj._gotitem(key, ndim=1).agg(how) for key, how in arg.items() } # set the final keys keys = list(arg.keys()) # Avoid making two isinstance calls in all and any below is_ndframe = [isinstance(r, ABCNDFrame) for r in results.values()] # combine results if all(is_ndframe): keys_to_use = [k for k in keys if not results[k].empty] # Have to check, if at least one DataFrame is not empty. keys_to_use = keys_to_use if keys_to_use != [] else keys axis = 0 if isinstance(obj, ABCSeries) else 1 result = concat({k: results[k] for k in keys_to_use}, axis=axis) elif any(is_ndframe): # There is a mix of NDFrames and scalars raise ValueError("cannot perform both aggregation " "and transformation operations " "simultaneously") else: from pandas import Series # we have a dict of scalars # GH 36212 use name only if obj is a series if obj.ndim == 1: obj = cast("Series", obj) name = obj.name else: name = None result = Series(results, name=name) return result
def _count_generic(values, table_type, type_caster):
    values = type_caster(values)
    table = table_type(len(values))
    uniques, labels, counts = table.factorize(values)
    return Series(counts, index=uniques)
def agg_list_like( obj: AggObjType, arg: List[AggFuncTypeBase], _axis: int, ) -> FrameOrSeriesUnion: """ Compute aggregation in the case of a list-like argument. Parameters ---------- obj : Pandas object to compute aggregation on. arg : list Aggregations to compute. _axis : int, 0 or 1 Axis to compute aggregation on. Returns ------- Result of aggregation. """ from pandas.core.reshape.concat import concat if _axis != 0: raise NotImplementedError("axis other than 0 is not supported") if obj._selected_obj.ndim == 1: selected_obj = obj._selected_obj else: selected_obj = obj._obj_with_exclusions results = [] keys = [] # degenerate case if selected_obj.ndim == 1: for a in arg: colg = obj._gotitem(selected_obj.name, ndim=1, subset=selected_obj) try: new_res = colg.aggregate(a) except TypeError: pass else: results.append(new_res) # make sure we find a good name name = com.get_callable_name(a) or a keys.append(name) # multiples else: for index, col in enumerate(selected_obj): colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index]) try: new_res = colg.aggregate(arg) except (TypeError, DataError): pass except ValueError as err: # cannot aggregate if "Must produce aggregated value" in str(err): # raised directly in _aggregate_named pass elif "no results" in str(err): # raised directly in _aggregate_multiple_funcs pass else: raise else: results.append(new_res) keys.append(col) # if we are empty if not len(results): raise ValueError("no results") try: return concat(results, keys=keys, axis=1, sort=False) except TypeError as err: # we are concatting non-NDFrame objects, # e.g. a list of scalars from pandas import Series result = Series(results, index=keys, name=obj.name) if is_nested_object(result): raise ValueError( "cannot combine transform and aggregation operations") from err return result
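# Usage sketch (added): list-like and dict-like arguments to the public .agg
# dispatch to agg_list_like and agg_dict_like respectively.
import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3], "B": [4.0, 5.0, 6.0]})

# List-like argument -> agg_list_like: one row per aggregation.
print(df.agg(["min", "max"]))

# Dict-like argument -> agg_dict_like: per-column aggregations.
print(df.agg({"A": "sum", "B": ["mean", "max"]}))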
def __unicode__(self): # currently, unicode is same as repr...fixes infinite loop series_rep = Series.__unicode__(self) rep = '%s\n%s' % (series_rep, repr(self.sp_index)) return rep
def __init__(self, data=None, index=None, sparse_index=None, kind='block', fill_value=None, name=None, dtype=None, copy=False, fastpath=False): # we are called internally, so short-circuit if fastpath: # data is an ndarray, index is defined if not isinstance(data, SingleBlockManager): data = SingleBlockManager(data, index, fastpath=True) if copy: data = data.copy() else: if data is None: data = [] if isinstance(data, Series) and name is None: name = data.name if isinstance(data, SparseArray): if index is not None: assert (len(index) == len(data)) sparse_index = data.sp_index if fill_value is None: fill_value = data.fill_value data = np.asarray(data) elif isinstance(data, SparseSeries): if index is None: index = data.index.view() if fill_value is None: fill_value = data.fill_value # extract the SingleBlockManager data = data._data elif isinstance(data, (Series, dict)): data = Series(data, index=index) index = data.index.view() res = make_sparse(data, kind=kind, fill_value=fill_value) data, sparse_index, fill_value = res elif isinstance(data, (tuple, list, np.ndarray)): # array-like if sparse_index is None: res = make_sparse(data, kind=kind, fill_value=fill_value) data, sparse_index, fill_value = res else: assert (len(data) == sparse_index.npoints) elif isinstance(data, SingleBlockManager): if dtype is not None: data = data.astype(dtype) if index is None: index = data.index.view() elif not data.index.equals(index) or copy: # pragma: no cover # GH#19275 SingleBlockManager input should only be called # internally raise AssertionError('Cannot pass both SingleBlockManager ' '`data` argument and a different ' '`index` argument. `copy` must ' 'be False.') else: length = len(index) if data == fill_value or (isna(data) and isna(fill_value)): if kind == 'block': sparse_index = BlockIndex(length, [], []) else: sparse_index = IntIndex(length, []) data = np.array([]) else: if kind == 'block': locs, lens = ([0], [length]) if length else ([], []) sparse_index = BlockIndex(length, locs, lens) else: sparse_index = IntIndex(length, index) v = data data = np.empty(length) data.fill(v) if index is None: index = com._default_index(sparse_index.length) index = _ensure_index(index) # create/copy the manager if isinstance(data, SingleBlockManager): if copy: data = data.copy() else: # create a sparse array if not isinstance(data, SparseArray): data = SparseArray(data, sparse_index=sparse_index, fill_value=fill_value, dtype=dtype, copy=copy) data = SingleBlockManager(data, index) generic.NDFrame.__init__(self, data) self.index = index self.name = name
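A small construction sketch for this constructor; note that SparseSeries was removed in pandas 1.0, so this assumes one of the older releases the code above comes from:

import numpy as np
import pandas as pd

arr = np.array([0.0, 0.0, 1.5, 0.0, 2.0])
ss = pd.SparseSeries(arr, fill_value=0.0, kind='block')
print(ss.sp_index)    # BlockIndex describing the runs of non-fill values
print(ss.to_dense())  # back to an ordinary dense Series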
def test01(): print("Test 01") userID: int = 1 history: AHistory = HistoryHierDF("databse1") # userID, itemID, position, observation, clicked # userID, itemID, position, observation, clicked history.insertRecommendation(userID, 45, 1, False) history.insertRecommendation(userID, 45, 2, False) history.insertRecommendation(userID, 78, 3, False) history.insertRecommendation(userID, 68, 4, False) history.insertRecommendation(userID, 50, 5, False) history.insertRecommendation(userID, 50, 6, False) history.insertRecommendation(userID, 50, 7, False) history.insertRecommendation(userID, 50, 8, False) history.insertRecommendation(userID, 50, 9, False) history.insertRecommendation(userID, 68, 4, False) history.insertRecommendation(userID, 50, 5, False) history.insertRecommendation(userID, 50, 6, False) history.insertRecommendation(userID, 50, 7, False) history.insertRecommendation(userID, 50, 8, False) history.insertRecommendation(userID, 50, 9, False) history.insertRecommendation(userID, 68, 4, False) history.insertRecommendation(userID, 50, 5, False) history.insertRecommendation(userID, 50, 6, False) history.insertRecommendation(userID, 50, 7, False) history.insertRecommendation(userID, 50, 8, False) history.insertRecommendation(userID, 50, 9, False) history.insertRecommendation(userID, 68, 4, False) history.insertRecommendation(userID, 50, 5, False) history.insertRecommendation(userID, 50, 6, False) history.insertRecommendation(userID, 50, 7, False) history.insertRecommendation(userID, 50, 8, False) history.insertRecommendation(userID, 50, 9, False) history.insertRecommendation(userID, 68, 4, False) history.insertRecommendation(userID, 50, 5, False) history.insertRecommendation(userID, 50, 6, False) history.insertRecommendation(userID, 50, 7, False) history.insertRecommendation(userID, 50, 8, False) history.insertRecommendation(userID, 50, 9, False) history.insertRecommendation(userID, 68, 4, False) history.insertRecommendation(userID, 50, 5, False) history.insertRecommendation(userID, 50, 6, False) history.insertRecommendation(userID, 50, 7, False) history.insertRecommendation(userID, 50, 8, False) history.insertRecommendation(userID, 50, 9, False) history.insertRecommendation(userID, 68, 4, False) history.insertRecommendation(userID, 50, 5, False) history.insertRecommendation(userID, 50, 6, False) history.insertRecommendation(userID, 50, 7, False) history.insertRecommendation(userID, 50, 8, False) history.insertRecommendation(userID, 50, 9, False) history.insertRecommendation(userID, 68, 4, False) history.insertRecommendation(userID, 50, 5, False) history.insertRecommendation(userID, 50, 6, False) history.insertRecommendation(userID, 50, 7, False) history.insertRecommendation(userID, 50, 8, False) history.insertRecommendation(userID, 50, 9, False) history.insertRecommendation(userID, 68, 4, False) history.insertRecommendation(userID, 50, 5, False) history.insertRecommendation(userID, 50, 6, False) history.insertRecommendation(userID, 50, 7, False) history.insertRecommendation(userID, 50, 8, False) history.insertRecommendation(userID, 50, 9, False) history.insertRecommendation(userID, 68, 4, False) history.insertRecommendation(userID, 50, 5, False) history.insertRecommendation(userID, 50, 6, False) history.insertRecommendation(userID, 50, 7, False) history.insertRecommendation(userID, 50, 8, False) history.insertRecommendation(userID, 50, 9, False) history.insertRecommendation(userID, 68, 4, False) history.insertRecommendation(userID, 50, 5, False) history.insertRecommendation(userID, 50, 6, 
False) history.insertRecommendation(userID, 50, 7, False) history.insertRecommendation(userID, 50, 8, False) history.insertRecommendation(userID, 50, 9, False) history.insertRecommendation(userID, 68, 4, False) history.insertRecommendation(userID, 50, 5, False) history.insertRecommendation(userID, 50, 6, False) history.insertRecommendation(userID, 50, 7, False) history.insertRecommendation(userID, 50, 8, False) history.insertRecommendation(userID, 50, 9, False) history.insertRecommendation(userID, 68, 4, False) history.insertRecommendation(userID, 50, 5, False) history.insertRecommendation(userID, 50, 6, False) history.insertRecommendation(userID, 50, 7, False) history.insertRecommendation(userID, 50, 8, False) history.insertRecommendation(userID, 50, 9, False) history.insertRecommendation(userID, 68, 4, False) history.insertRecommendation(userID, 50, 5, False) history.insertRecommendation(userID, 50, 6, False) history.insertRecommendation(userID, 50, 7, False) history.insertRecommendation(userID, 50, 8, False) history.insertRecommendation(userID, 50, 9, False) history.insertRecommendation(userID, 50, 10, False) history.insertRecommendation(userID, 100, 1, False) history.insertRecommendation(userID, 6, 2, True) history.insertRecommendation(userID, 100, 5, True) history.insertRecommendation(userID, 100, 15, True) recommendationDict: dict = { 100: 0.35, 125: 0.25, 95: 0.15, 45: 0.1, 78: 0.05, 68: 0.05, 32: 0.02, 6: 0.01, 18: 0.01, 47: 0.01 } recommendationSrs: Series = Series(recommendationDict) penalty: APenalization = PenalUsingReduceRelevance(penaltyLinear, [0.8, 0.2, 100], penaltyLinear, [1.0, 0.2, 100], 100) pRecommendationSrs: Series = penalty.runOneMethodPenalization( userID, recommendationSrs, history) print(pRecommendationSrs)
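The penalization classes used above are project-specific; as a purely hypothetical sketch (penalize_seen below is not the PenalUsingReduceRelevance API), a linear penalty can be expressed with plain pandas by scaling down the relevance of items the user has already been shown:

import pandas as pd

def penalize_seen(recommendations: pd.Series, seen_item_ids, factor: float = 0.2) -> pd.Series:
    # Hypothetical helper: damp the relevance of already-shown items by
    # `factor`, then re-rank by the penalized relevance.
    penalized = recommendations.copy()
    seen = penalized.index.isin(set(seen_item_ids))
    penalized[seen] *= factor
    return penalized.sort_values(ascending=False)

recs = pd.Series({100: 0.35, 125: 0.25, 95: 0.15, 45: 0.10})
print(penalize_seen(recs, seen_item_ids=[100, 45]))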
def get_dtype_counts(self): from collections import defaultdict d = defaultdict(int) for k, v in self.iteritems(): d[v.dtype.name] += 1 return Series(d)
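get_dtype_counts was removed in pandas 1.0; on current versions the same summary is available from the dtypes attribute. A minimal sketch:

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [1.5, 2.5], "c": ["x", "y"]})
# Number of columns per dtype, analogous to the helper above.
print(df.dtypes.value_counts())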
def _chop(self, sdata: Series, slice_obj: slice) -> Series: return sdata._get_values(slice_obj)
def _to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, box=True, format=None, exact=True, unit='ns', freq=None, infer_datetime_format=False): """ Same as to_datetime, but accept freq for DatetimeIndex internal construction """ from pandas.core.series import Series from pandas.tseries.index import DatetimeIndex def _convert_listlike(arg, box, format, name=None): if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') # these are shortcutable if com.is_datetime64_ns_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz='utc' if utc else None, name=name) except ValueError: pass return arg elif com.is_datetime64tz_dtype(arg): if not isinstance(arg, DatetimeIndex): return DatetimeIndex(arg, tz='utc' if utc else None) if utc: arg = arg.tz_convert(None) return arg elif format is None and com.is_integer_dtype(arg) and unit == 'ns': result = arg.astype('datetime64[ns]') if box: return DatetimeIndex(result, tz='utc' if utc else None, name=name) return result arg = com._ensure_object(arg) require_iso8601 = False if infer_datetime_format and format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) if format is not None: # There is a special fast-path for iso8601 formatted # datetime strings, so in those cases don't use the inferred # format because this path makes process slower in this # special case format_is_iso8601 = (('%Y-%m-%dT%H:%M:%S.%f'.startswith(format) or '%Y-%m-%d %H:%M:%S.%f'.startswith(format)) and format != '%Y') if format_is_iso8601: require_iso8601 = not infer_datetime_format format = None try: result = None if format is not None: # shortcut formatting here if format == '%Y%m%d': try: result = _attempt_YYYYMMDD(arg, errors=errors) except: raise ValueError( "cannot convert the input to '%Y%m%d' date format") # fallback if result is None: try: result = tslib.array_strptime(arg, format, exact=exact, errors=errors) except (tslib.OutOfBoundsDatetime): if errors == 'raise': raise result = arg except ValueError: # if format was inferred, try falling back # to array_to_datetime - terminate here # for specified formats if not infer_datetime_format: if errors == 'raise': raise result = arg if result is None and (format is None or infer_datetime_format): result = tslib.array_to_datetime( arg, errors=errors, utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, freq=freq, unit=unit, require_iso8601=require_iso8601) if com.is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz='utc' if utc else None, name=name) return result except ValueError as e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, name=name, tz=tz) except (ValueError, TypeError): raise e if arg is None: return arg elif isinstance(arg, tslib.Timestamp): return arg elif isinstance(arg, Series): values = _convert_listlike(arg._values, False, format) return Series(values, index=arg.index, name=arg.name) elif isinstance(arg, ABCIndexClass): return _convert_listlike(arg, box, format, name=arg.name) elif com.is_list_like(arg): return _convert_listlike(arg, box, format) return _convert_listlike(np.array([arg]), box, format)[0]
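A few public to_datetime calls exercising the branches handled above (the ISO-8601 fast path, an explicit %Y%m%d format, and Series input that keeps its index and name); the values are illustrative:

import pandas as pd

print(pd.to_datetime(["2013-01-01 09:00:00", "2013-01-02 09:30:00"]))
print(pd.to_datetime("20130101", format="%Y%m%d"))

s = pd.Series(["2013-01-01", "2013-06-15"], name="when")
print(pd.to_datetime(s))  # datetime64[ns] Series with the same index and name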
except ValueError as e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, None, tz=tz) except (ValueError, TypeError): raise e if arg is None: return arg elif isinstance(arg, datetime): return arg elif isinstance(arg, Series): values = arg.values if not com.is_datetime64_dtype(values): values = _convert_f(values) return Series(values, index=arg.index, name=arg.name) elif isinstance(arg, (np.ndarray, list)): if isinstance(arg, list): arg = np.array(arg, dtype='O') if com.is_datetime64_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz='utc' if utc else None) except ValueError as e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, None, tz=tz) except (ValueError, TypeError): raise e return arg
def __repr__(self): series_rep = Series.__repr__(self) rep = '{series}\n{index!r}'.format(series=series_rep, index=self.sp_index) return rep