def _postprocess_for_cut(fac, bins, retbins, x_is_series, series_index, name): """ handles post processing for the cut method where we combine the index information if the originally passed datatype was a series """ if x_is_series: fac = Series(fac, index=series_index, name=name) if not retbins: return fac return fac, bins
def _take_new_index(obj, indexer, new_index, axis=0):
    """
    Take the rows of ``obj`` given by ``indexer`` and attach ``new_index``
    as the resulting row index.

    Raises NotImplementedError for ``axis == 1`` on a DataFrame and
    ValueError for any other input type.
    """
    from pandas.core.api import Series, DataFrame

    if isinstance(obj, Series):
        taken = algos.take_1d(obj.values, indexer)
        return Series(taken, index=new_index, name=obj.name)

    if isinstance(obj, DataFrame):
        if axis == 1:
            raise NotImplementedError("axis 1 is not supported")
        # reindex the underlying block manager directly; axis=1 in the
        # internal layout corresponds to the frame's rows
        new_mgr = obj._data.reindex_indexer(new_axis=new_index,
                                            indexer=indexer, axis=1)
        return DataFrame(new_mgr)

    raise ValueError("'obj' should be either a Series or a DataFrame")
def test_operators_corner(self):
    # Adding an empty Series aligns on the union index, so every
    # position in the result is NaN.
    series = self.ts
    empty = Series([], index=Index([]))

    result = series + empty
    self.assert_(np.isnan(result).all())

    # empty + empty stays length zero
    result = empty + Series([], index=Index([]))
    self.assert_(len(result) == 0)

    # timedelta arithmetic: scaling and adding a partially-overlapping
    # (every-other-element) slice must not raise
    deltas = Series([timedelta(1)] * 5, index=np.arange(5))
    sub_deltas = deltas[::2]
    deltas5 = deltas * 5
    deltas = deltas + sub_deltas

    # float + int
    int_ts = self.ts.astype(int)[:-5]
    added = self.ts + int_ts
    expected = self.ts.values[:-5] + int_ts.values
    self.assert_(np.array_equal(added[:-5], expected))
def test_iloc_getitem_array(self):
    # an Index object used as an array-like positional indexer
    indexer = Series(index=range(1, 4)).index
    expected = {0: [2, 4, 6], 1: [3, 6, 9], 2: [4, 8, 12]}
    self.check_result('array like', 'iloc', indexer, 'ix', expected,
                      typs=['ints'])
def _take_new_index(obj, indexer, new_index, axis=0):
    """
    Take elements of ``obj`` at the positions in ``indexer`` and attach
    ``new_index`` as the resulting (row) index.

    Parameters
    ----------
    obj : Series or DataFrame
    indexer : ndarray of int
        Positions to take from ``obj``.
    new_index : Index
        Index for the result; assumed to align with ``indexer``.
    axis : int, default 0
        Only axis 0 is implemented for DataFrame input.

    Returns
    -------
    Series or DataFrame

    Raises
    ------
    NotImplementedError
        If ``axis == 1`` or ``obj`` is neither a Series nor a DataFrame.
    """
    # NOTE: the previous version also imported BlockManager here but
    # never used it; the unused import has been removed.
    from pandas.core.api import Series, DataFrame

    if isinstance(obj, Series):
        new_values = com.take_1d(obj.values, indexer)
        return Series(new_values, index=new_index, name=obj.name)
    elif isinstance(obj, DataFrame):
        if axis == 1:
            raise NotImplementedError("axis 1 is not supported")
        return DataFrame(obj._data.take(indexer, new_index=new_index,
                                        axis=1))
    else:
        raise NotImplementedError("'obj' must be a Series or a DataFrame")
def percentileRank(frame, column=None, kind='mean'):
    """
    Return score at percentile for each point in time (cross-section)

    Parameters
    ----------
    frame: DataFrame
    column: string or Series, optional
       Column name or specific Series to compute percentiles for.
       If not provided, percentiles are computed for all values at each
       point in time. Note that this can take a LONG time.
    kind: {'rank', 'weak', 'strict', 'mean'}, optional
       This optional parameter specifies the interpretation of the
       resulting score:

       - "rank": Average percentage ranking of score. In case of
                 multiple matches, average the percentage rankings of
                 all matching scores.
       - "weak": This kind corresponds to the definition of a cumulative
                 distribution function. A percentileofscore of 80%
                 means that 80% of values are less than or equal
                 to the provided score.
       - "strict": Similar to "weak", except that only values that are
                   strictly less than the given score are counted.
       - "mean": The average of the "weak" and "strict" scores, often
                 used in testing.  See
                 http://en.wikipedia.org/wiki/Percentile_rank

    Returns
    -------
    TimeSeries or DataFrame, depending on input
    """
    def score_at(xs, score):
        # percentile of `score` among the non-NA values of `xs`
        return percentileofscore(remove_na(xs), score, kind=kind)

    results = {}
    # transpose once up front; previously two branches recomputed frame.T
    # on every call even though `framet` had already been built
    framet = frame.T
    if column is not None:
        if isinstance(column, Series):
            # score each date's cross-section against the supplied series
            for date, xs in compat.iteritems(framet):
                results[date] = score_at(xs, column.get(date, NaN))
        else:
            # `column` is a label within each cross-section
            for date, xs in compat.iteritems(framet):
                results[date] = score_at(xs, xs[column])
        results = Series(results)
    else:
        # rank every column within its own cross-section; use a distinct
        # loop variable instead of shadowing the `column` parameter
        for col in frame.columns:
            for date, xs in compat.iteritems(framet):
                results.setdefault(date, {})[col] = score_at(xs, xs[col])
        results = DataFrame(results).T

    return results
def test_constructor_dict(self):
    frame = self.klass({'col1': self.ts1, 'col2': self.ts2})

    common.assert_dict_equal(self.ts1, frame['col1'], compare_keys=False)
    common.assert_dict_equal(self.ts2, frame['col2'], compare_keys=False)

    frame = self.klass({
        'col1': self.ts1,
        'col2': self.ts2
    }, columns=['col2', 'col3', 'col4'])

    # columns not present in the dict come out as all-NaN
    self.assertEqual(len(frame), len(self.ts2))
    self.assert_('col1' not in frame)
    self.assert_(np.isnan(frame['col3']).all())

    # Corner cases
    self.assertEqual(len(self.klass({})), 0)

    # BUG FIX: this was previously ``assertRaises(Exception,
    # lambda x: self.klass(...))`` — assertRaises calls the callable with
    # no arguments, so the lambda's own "missing argument" TypeError
    # satisfied the assertion and the constructor was never exercised.
    self.assertRaises(Exception, self.klass, [self.ts1, self.ts2])

    # mixing a dict column and an array column with mismatched implied
    # indexes must raise
    self.assertRaises(Exception, self.klass, {
        'A': {
            'a': 'a',
            'b': 'b'
        },
        'B': ['a', 'b']
    })

    # can I rely on the order?
    self.assertRaises(Exception, self.klass, {
        'A': ['a', 'b'],
        'B': {
            'a': 'a',
            'b': 'b'
        }
    })
    self.assertRaises(Exception, self.klass, {
        'A': ['a', 'b'],
        'B': Series(['a', 'b'], index=['a', 'b'])
    })

    # Length-one dict micro-optimization
    frame = self.klass({'A': {'1': 1, '2': 2}})
    self.assert_(np.array_equal(frame.index, ['1', '2']))

    # empty dict plus index: the passed index object is used as-is
    idx = Index([0, 1, 2])
    frame = self.klass({}, index=idx)
    self.assert_(frame.index is idx)
def predict(self, beta=None, x=None, fill_value=None,
            fill_method=None, axis=0):
    """
    Compute predicted values from the fitted regression.

    Parameters
    ----------
    beta : Series
    x : Series or DataFrame
    fill_value : scalar or dict, default None
    fill_method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
    axis : {0, 1}, default 0
        See DataFrame.fillna for more details

    Notes
    -----
    1. If both fill_value and fill_method are None then NaNs are dropped
       (this is the default behavior)
    2. An intercept will be automatically added to the new_y_values if
       the model was fitted using an intercept

    Returns
    -------
    Series of predicted values
    """
    # No overrides supplied: return the in-sample predictions.
    if beta is None and x is None:
        return self.y_predict

    if beta is None:
        beta = self.beta
    else:
        # Align the user-supplied betas with the fitted variables; any
        # variable left unmatched becomes NaN and is rejected below.
        beta = beta.reindex(self.beta.index)
        if isnull(beta).any():
            raise ValueError('Must supply betas for same variables')

    if x is None:
        x = self._x
        orig_x = x
    else:
        # Remember the caller's index so dropped rows can reappear as
        # NaN in the final result.
        orig_x = x
        # Default (no fill options): drop incomplete rows; otherwise
        # fill them with the caller's fillna-style options.
        if fill_value is None and fill_method is None:
            x = x.dropna(how='any')
        else:
            x = x.fillna(value=fill_value, method=fill_method, axis=axis)
        if isinstance(x, Series):
            x = DataFrame({'x': x})
        if self._intercept:
            x['intercept'] = 1.
        # Match the column order used when the model was fit.
        x = x.reindex(columns=self._x.columns)

    rs = np.dot(x.values, beta.values)
    # Reindex back onto the original input index.
    return Series(rs, x.index).reindex(orig_x.index)
def test_value_counts(self):
    np.random.seed(1234)
    from pandas.tools.tile import cut

    arr = np.random.randn(4)
    # cut 4 values into 4 equal-width bins -> exactly one value per bin
    factor = cut(arr, 4)

    tm.assertIsInstance(factor, Categorical)

    result = algos.value_counts(factor)
    # the interval labels below are determined by the seeded random
    # draw above
    cats = ['(-1.194, -0.535]', '(-0.535, 0.121]', '(0.121, 0.777]',
            '(0.777, 1.433]']
    expected_index = CategoricalIndex(cats, cats, ordered=True)
    expected = Series([1, 1, 1, 1], index=expected_index)
    tm.assert_series_equal(result.sort_index(), expected.sort_index())
def test_firstValid(self):
    ts = self.ts.copy()
    ts[:5] = np.NaN

    # first valid entry is the one just past the NaN prefix
    index = ts._firstTimeWithValue()
    self.assertEqual(index, ts.index[5])

    ts[-5:] = np.NaN
    # last valid entry is the one just before the NaN suffix
    index = ts._lastTimeWithValue()
    self.assertEqual(index, ts.index[-6])

    # an empty series has neither a first nor a last valid time
    ser = Series([], index=[])
    self.assert_(ser._lastTimeWithValue() is None)
    self.assert_(ser._firstTimeWithValue() is None)
def test_repr(self):
    # smoke test: rendering various Series flavors must not raise
    for obj in (self.ts,
                self.series,
                self.series.astype(int),
                self.objSeries,
                Series(common.randn(1000), index=np.arange(1000))):
        str(obj)

    # empty
    str(self.empty)

    # with NaNs
    self.series[5:7] = np.NaN
    str(self.series)
def test_loc_getitem_label_list(self):
    # list of labels

    # matching label lists on each axis of the int-labelled objects
    self.check_result('list lbl', 'loc', [0,2,4], 'ix', [0,2,4], typs = ['ints'], axes=0)
    self.check_result('list lbl', 'loc', [3,6,9], 'ix', [3,6,9], typs = ['ints'], axes=1)
    self.check_result('list lbl', 'loc', [4,8,12], 'ix', [4,8,12], typs = ['ints'], axes=2)

    # string labels
    self.check_result('list lbl', 'loc', ['a','b','d'], 'ix', ['a','b','d'], typs = ['labels'], axes=0)
    self.check_result('list lbl', 'loc', ['A','B','C'], 'ix', ['A','B','C'], typs = ['labels'], axes=1)
    self.check_result('list lbl', 'loc', ['Z','Y','W'], 'ix', ['Z','Y','W'], typs = ['labels'], axes=2)

    # mixed-type and timestamp labels
    self.check_result('list lbl', 'loc', [2,8,'null'], 'ix', [2,8,'null'], typs = ['mixed'], axes=0)
    self.check_result('list lbl', 'loc',
                      [Timestamp('20130102'),Timestamp('20130103')], 'ix',
                      [Timestamp('20130102'),Timestamp('20130103')], typs = ['ts'], axes=0)

    # fails: labels absent from the axis must raise KeyError
    self.check_result('list lbl', 'loc', [0,1,2], 'indexer', [0,1,2], typs = ['empty'], fails = KeyError)
    self.check_result('list lbl', 'loc', [0,2,3], 'ix', [0,2,3], typs = ['ints'], axes=0, fails = KeyError)
    self.check_result('list lbl', 'loc', [3,6,7], 'ix', [3,6,9], typs = ['ints'], axes=1, fails = KeyError)
    self.check_result('list lbl', 'loc', [4,8,10], 'ix', [4,8,12], typs = ['ints'], axes=2, fails = KeyError)

    # array like: an Index behaves the same as a plain list of labels
    self.check_result('array like', 'loc', Series(index=[0,2,4]).index, 'ix', [0,2,4], typs = ['ints'], axes=0)
    self.check_result('array like', 'loc', Series(index=[3,6,9]).index, 'ix', [3,6,9], typs = ['ints'], axes=1)
    self.check_result('array like', 'loc', Series(index=[4,8,12]).index, 'ix', [4,8,12], typs = ['ints'], axes=2)
def test_groupby_transform(self):
    # BUG FIX: use floor division so the values are the integers
    # 0,0,0,1,1,1,2,2,2 on any Python version.  With ``/`` Python 3
    # performs true division, the values become floats, and the exact
    # ``transformed[7] == 12`` assertion below fails.
    data = Series(np.arange(9) // 3, index=np.arange(9))

    index = np.arange(9)
    np.random.shuffle(index)
    data = data.reindex(index)

    # group labels 0..8 into triples by label // 3
    grouped = data.groupby(lambda x: x // 3)

    transformed = grouped.transform(lambda x: x * x.sum())
    # label 7 is in group {6,7,8}: value 2 * group sum 6 == 12
    self.assertEqual(transformed[7], 12)

    # corner cases
    self.assertRaises(Exception, grouped.transform, lambda x: x.mean())
def test_setitem_ambig(self):
    # difficulties with mixed-type data
    from decimal import Decimal

    # created as float type
    dm = DataMatrix(index=range(3), columns=range(3))

    # Decimals can be coerced to float; strings cannot
    coercable_series = Series([Decimal(1) for _ in range(3)], index=range(3))
    uncoercable_series = Series(['foo', 'bzr', 'baz'], index=range(3))

    dm[0] = np.ones(3)
    self.assertEqual(len(dm.cols()), 3)
    self.assert_(dm.objects is None)

    dm[1] = coercable_series
    self.assertEqual(len(dm.cols()), 3)
    self.assert_(dm.objects is None)

    # an uncoercable column is diverted into the separate `objects`
    # store instead of the main (float) column set
    dm[2] = uncoercable_series
    self.assertEqual(len(dm.cols()), 3)
    self.assert_(dm.objects is not None)
    self.assert_(2 in dm.objects)
    self.assert_(2 not in dm.columns)
def test_setitem(self):
    # assignment by label, by a list of positions, and by a single
    # position must all write NaN through
    self.ts[self.ts.index[5]] = np.NaN
    self.ts[[1, 2, 17]] = np.NaN
    self.ts[6] = np.NaN
    self.assert_(np.isnan(self.ts[6]))
    self.assert_(np.isnan(self.ts[2]))

    # boolean-mask assignment overwrites the NaNs just written
    self.ts[np.isnan(self.ts)] = 5
    self.assert_(not np.isnan(self.ts[2]))

    # caught this bug when writing tests: stepped slice assignment
    stepped = Series(common.makeIntIndex(20).astype(float),
                     index=common.makeIntIndex(20))
    stepped[::2] = 0
    self.assert_((stepped[::2] == 0).all())
def setUp(self):
    import warnings
    # the deprecated indexers exercised by this test case emit
    # FutureWarnings; silence them for the whole run
    warnings.filterwarnings(action='ignore', category=FutureWarning)

    # integer-labelled fixtures with non-contiguous labels (0, 2, 4, ...)
    self.series_ints = Series(np.random.rand(4), index=range(0,8,2))
    self.frame_ints = DataFrame(np.random.randn(4, 4), index=range(0, 8, 2), columns=range(0,12,3))
    self.panel_ints = Panel(np.random.rand(4,4,4), items=range(0,8,2),major_axis=range(0,12,3),minor_axis=range(0,16,4))

    # string-labelled fixtures
    self.series_labels = Series(np.random.randn(4), index=list('abcd'))
    self.frame_labels = DataFrame(np.random.randn(4, 4), index=list('abcd'), columns=list('ABCD'))
    self.panel_labels = Panel(np.random.randn(4,4,4), items=list('abcd'), major_axis=list('ABCD'), minor_axis=list('ZYXW'))

    # mixed int/string labels
    self.series_mixed = Series(np.random.randn(4), index=[2, 4, 'null', 8])
    self.frame_mixed = DataFrame(np.random.randn(4, 4), index=[2, 4, 'null', 8])
    self.panel_mixed = Panel(np.random.randn(4,4,4), items=[2,4,'null',8])

    # timestamp-indexed fixtures
    self.series_ts = Series(np.random.randn(4), index=date_range('20130101', periods=4))
    self.frame_ts = DataFrame(np.random.randn(4, 4), index=date_range('20130101', periods=4))
    self.panel_ts = Panel(np.random.randn(4, 4, 4), items=date_range('20130101', periods=4))

    #self.series_floats = Series(np.random.randn(4), index=[1.00, 2.00, 3.00, 4.00])
    #self.frame_floats = DataFrame(np.random.randn(4, 4), columns=[1.00, 2.00, 3.00, 4.00])
    #self.panel_floats = Panel(np.random.rand(4,4,4), items = [1.00,2.00,3.00,4.00])

    # empty fixtures
    self.frame_empty = DataFrame({})
    self.series_empty = Series({})
    self.panel_empty = Panel({})

    # form agglomerates: each self.<obj> becomes a dict mapping a label
    # type name to the fixture built above (None when absent)
    for o in self._objs:

        d = dict()
        for t in self._typs:
            d[t] = getattr(self,'%s_%s' % (o,t),None)

        setattr(self,o,d)
def test_reindex_bool(self):
    # A series other than float, int, string, or object
    ts = self.ts[::2]
    bool_ts = Series(np.zeros(len(ts), dtype=bool), index=ts.index)

    # reindexing onto a superset introduces NaNs, which forces the
    # dtype to widen to object
    result = bool_ts.reindex(self.ts.index)
    self.assert_(result.dtype == np.object_)

    # reindexing onto a subset introduces no NaNs, so bool is preserved
    result = bool_ts.reindex(bool_ts.index[::2])
    self.assert_(result.dtype == np.bool_)
def _filter_data(lhs, rhs, weights=None):
    """
    Clean the inputs for a single OLS regression.

    Parameters
    ----------
    lhs : Series
        Dependent variable in the regression.
    rhs : dict, whose values are Series, DataFrame, or dict
        Explanatory variables of the regression.
    weights : array-like, optional
        1d array of weights. If None, equivalent to an unweighted OLS.

    Returns
    -------
    Series, DataFrame
        Cleaned lhs and rhs
    """
    if not isinstance(lhs, Series):
        if len(lhs) != len(rhs):
            raise AssertionError("length of lhs must equal length of rhs")
        lhs = Series(lhs, index=rhs.index)

    rhs = _combine_rhs(rhs)
    lhs = DataFrame({'__y__': lhs}, dtype=float)
    pre_filt_rhs = rhs.dropna(how='any')

    merged = rhs.join(lhs, how='outer')
    if weights is not None:
        merged['__weights__'] = weights

    # a row is usable only if every column (including weights) is non-NA
    valid = (merged.count(1) == len(merged.columns)).values
    index = merged.index
    merged = merged[valid]

    filt_weights = merged.pop('__weights__') if weights is not None else None
    filt_lhs = merged.pop('__y__')
    filt_rhs = merged

    # densify a sparse weights column before returning
    if hasattr(filt_weights, 'to_dense'):
        filt_weights = filt_weights.to_dense()

    return (filt_lhs.to_dense(), filt_rhs.to_dense(), filt_weights,
            pre_filt_rhs.to_dense(), index, valid)
def test_mixed(self):
    # doc example reshaping.rst
    x = Series(['A', 'A', np.nan, 'B', 3.14, np.inf])
    labels, uniques = algos.factorize(x)

    # NaN maps to the -1 sentinel; other values are numbered in order
    # of first appearance
    exp = np.array([0, 0, -1, 1, 2, 3], dtype=np.int_)
    self.assert_numpy_array_equal(labels, exp)
    exp = pd.Index(['A', 'B', 3.14, np.inf])
    tm.assert_index_equal(uniques, exp)

    # with sort=True the uniques come out sorted and the labels are
    # renumbered to match
    labels, uniques = algos.factorize(x, sort=True)
    exp = np.array([2, 2, -1, 3, 0, 1], dtype=np.int_)
    self.assert_numpy_array_equal(labels, exp)
    exp = pd.Index([3.14, np.inf, 'A', 'B'])
    tm.assert_index_equal(uniques, exp)
def setUp(self):
    arr = randn(N)
    # punch NaN holes at fixed positions so the tests exercise
    # missing-data handling
    arr[self._nan_locs] = np.NaN

    self.arr = arr
    self.rng = DateRange(datetime(2009, 1, 1), periods=N)

    self.series = Series(arr.copy(), index=self.rng)

    self.frame = DataFrame(randn(N, K), index=self.rng,
                           columns=np.arange(K))

    self.matrix = DataMatrix(randn(N, K), index=self.rng,
                             columns=np.arange(K))
def test_mixed(self):
    # doc example reshaping.rst
    x = Series(['A', 'A', np.nan, 'B', 3.14, np.inf])
    labels, uniques = algos.factorize(x)

    # NaN maps to the -1 sentinel; other values are numbered in order
    # of first appearance
    self.assert_numpy_array_equal(labels, np.array(
        [0, 0, -1, 1, 2, 3], dtype=np.int64))
    self.assert_numpy_array_equal(uniques, np.array(
        ['A', 'B', 3.14, np.inf], dtype=object))

    # with sort=True the uniques come out sorted and the labels are
    # renumbered to match
    labels, uniques = algos.factorize(x, sort=True)
    self.assert_numpy_array_equal(labels, np.array(
        [2, 2, -1, 3, 0, 1], dtype=np.int64))
    self.assert_numpy_array_equal(uniques, np.array(
        [3.14, np.inf, 'A', 'B'], dtype=object))
def checkDataSet(self, dataset, start=None, end=None, skip_moving=False):
    # slice the raw arrays, then wrap them in labelled containers with
    # positional indexes
    exog = dataset.exog[start:end]
    endog = dataset.endog[start:end]
    x = DataMatrix(exog, index=np.arange(exog.shape[0]),
                   columns=np.arange(exog.shape[1]))
    y = Series(endog, index=np.arange(len(endog)))

    self.checkOLS(exog, endog, x, y)

    if not skip_moving:
        # exercise every moving-window configuration
        self.checkMovingOLS('rolling', x, y)
        for window_type, kwargs in (('rolling', dict(nw_lags=0)),
                                    ('expanding', dict(nw_lags=0)),
                                    ('rolling', dict(nw_lags=1)),
                                    ('expanding', dict(nw_lags=1)),
                                    ('expanding', dict(nw_lags=1,
                                                       nw_overlap=True))):
            self.checkMovingOLS(window_type, x, y, **kwargs)
def test_groupby_transform(self):
    # BUG FIX: floor division keeps the values as the integers
    # 0,0,0,1,1,1,2,2,2 on any Python version.  With ``/`` Python 3
    # performs true division, the values become floats, and the exact
    # ``transformed[7] == 12`` assertion below fails.
    data = Series(np.arange(9) // 3, index=np.arange(9))

    index = np.arange(9)
    np.random.shuffle(index)
    data = data.reindex(index)

    # group labels 0..8 into triples by label // 3
    grouped = data.groupby(lambda x: x // 3)

    transformed = grouped.transform(lambda x: x * x.sum())
    # label 7 is in group {6,7,8}: value 2 * group sum 6 == 12
    self.assertEqual(transformed[7], 12)

    # transform with np.mean broadcasts each group's mean back onto
    # every member of that group
    transformed = grouped.transform(np.mean)
    for name, group in grouped:
        mean = group.mean()
        for idx in group.index:
            self.assertEqual(transformed[idx], mean)
def __init__(self, data_set, indicator=None):
    """
    Args:
        data_set: dict(symbol=DataFrame) or DataFrame.  The data set to
            analyse: either a mapping from instrument symbol to a
            DataFrame, or a single DataFrame.
        indicator: Series.  The indicator sequence; defaults to an empty
            Series.  It can be set later by assigning the ``indicator``
            attribute directly, or computed by code inside the class.

    Notes:
        The data set should have the same length as the indicator,
        otherwise an error is raised.
    """
    self.__identify = None   # label-identification callable (currently _group_identify or _rolling_identify)
    self.__indicator = None  # indicator currently being processed
    self.__data = None       # data set currently being processed
    self.__group = None      # grouping currently being processed
    self.__symbol = None     # symbol of the instrument currently being processed
    self.__profit = None     # profit/loss series of the current instrument
    self._data_set = data_set.copy()  # full data set (copied so the caller's object is untouched)
    self._indicator = Series() if indicator is None else indicator
    self._ind_len = 0        # number of indicator rows currently being processed
    self._group = None
def test_order(self):
    ts = self.ts.copy()
    ts[:5] = np.NaN
    vals = ts.values

    # default: NaNs are pushed to the end, the rest sorted ascending
    result = ts.order()
    self.assert_(np.isnan(result[-5:]).all())
    self.assert_(np.array_equal(result[:-5], np.sort(vals[5:])))

    # missingAtEnd=False puts the NaNs at the front instead
    result = ts.order(missingAtEnd=False)
    self.assert_(np.isnan(result[:5]).all())
    self.assert_(np.array_equal(result[5:], np.sort(vals[5:])))

    # something object-type
    ser = Series(['A', 'B'], [1, 2])
    # no failure
    ser.order()
def test_combineSeries(self):
    # Series: frame + series broadcasts the series across rows,
    # matching on column labels
    series = self.frame.getXS(self.frame.index[0])

    added = self.frame + series
    for key, s in added.iteritems():
        assert_series_equal(s, self.frame[key] + series[key])

    # a series with an extra label produces an all-NaN column for it
    larger_series = series.toDict()
    larger_series['E'] = 1
    larger_series = Series(larger_series)
    larger_added = self.frame + larger_series

    for key, s in self.frame.iteritems():
        assert_series_equal(larger_added[key], s + series[key])
    self.assert_('E' in larger_added)
    self.assert_(np.isnan(larger_added['E']).all())

    # TimeSeries: frame + time series matches on the index instead
    ts = self.tsframe['A']
    added = self.tsframe + ts

    for key, col in self.tsframe.iteritems():
        assert_series_equal(added[key], col + ts)

    # a shorter frame still yields the union index, i.e. the full
    # tsframe index
    smaller_frame = self.tsframe[:-5]
    smaller_added = smaller_frame + ts
    self.assert_(smaller_added.index.equals(self.tsframe.index))

    # length 0 series: only checking that this does not raise
    result = self.tsframe + ts[:0]

    # Frame is length 0
    result = self.tsframe[:0] + ts
    self.assertEqual(len(result), 0)

    # empty but with non-empty index
    frame = self.tsframe[:1].reindex(columns=[])
    result = frame * ts
    self.assertEqual(len(result), len(ts))
def _take_new_index(obj, indexer, new_index, axis=0):
    """
    Take elements of ``obj`` at the positions in ``indexer`` and attach
    ``new_index`` as the resulting (row) index.

    Parameters
    ----------
    obj : Series or DataFrame
    indexer : ndarray of int
        Positions to take from ``obj``.
    new_index : Index
        Index for the result; assumed to align with ``indexer``.
    axis : int, default 0
        Only axis 0 is implemented for DataFrame input.
    """
    from pandas.core.api import Series, DataFrame
    from pandas.core.internals import BlockManager

    if isinstance(obj, Series):
        new_values = com.take_1d(obj.values, indexer)
        return Series(new_values, index=new_index, name=obj.name)
    elif isinstance(obj, DataFrame):
        if axis == 1:
            raise NotImplementedError
        # take along each block's second axis (which holds the frame's
        # rows), then rebuild the BlockManager with the new row index
        data = obj._data
        new_blocks = [b.take(indexer, axis=1) for b in data.blocks]
        new_axes = list(data.axes)
        new_axes[1] = new_index
        new_data = BlockManager(new_blocks, new_axes)
        return DataFrame(new_data)
    else:
        raise NotImplementedError
def test_asfreq(self):
    ts = Series([0., 1., 2.], index=[
        datetime(2009, 10, 30),
        datetime(2009, 11, 30),
        datetime(2009, 12, 31)
    ])

    # round trip: upsample to business-daily, then downsample back to
    # month-end; the original values must be recovered
    daily_ts = ts.asfreq('WEEKDAY')
    monthly_ts = daily_ts.asfreq('EOM')
    self.assert_(np.array_equal(monthly_ts, ts))

    # same round trip with forward-fill on the upsample
    daily_ts = ts.asfreq('WEEKDAY', fillMethod='pad')
    monthly_ts = daily_ts.asfreq('EOM')
    self.assert_(np.array_equal(monthly_ts, ts))

    # the frequency may also be given as a DateOffset object instead
    # of a rule string
    daily_ts = ts.asfreq(datetools.bday)
    monthly_ts = daily_ts.asfreq(datetools.bmonthEnd)
    self.assert_(np.array_equal(monthly_ts, ts))
def _filter_data(lhs, rhs, weights=None):
    """
    Cleans the input for single OLS.

    Parameters
    ----------
    lhs : Series
        Dependent variable in the regression.
    rhs : dict, whose values are Series, DataFrame, or dict
        Explanatory variables of the regression.
    weights : array-like, optional
        1d array of weights.  If None, equivalent to an unweighted OLS.

    Returns
    -------
    Series, DataFrame
        Cleaned lhs and rhs
    """
    if not isinstance(lhs, Series):
        # BUG FIX: this validation used a bare ``assert``, which is
        # stripped when Python runs with -O; raise explicitly instead.
        if len(lhs) != len(rhs):
            raise AssertionError("length of lhs must equal length of rhs")
        lhs = Series(lhs, index=rhs.index)

    rhs = _combine_rhs(rhs)

    lhs = DataFrame({'__y__': lhs}, dtype=float)
    pre_filt_rhs = rhs.dropna(how='any')

    combined = rhs.join(lhs, how='outer')
    if weights is not None:
        combined['__weights__'] = weights

    # a row is usable only if every column (including weights) is non-NA
    valid = (combined.count(1) == len(combined.columns)).values
    index = combined.index
    combined = combined[valid]

    if weights is not None:
        filt_weights = combined.pop('__weights__')
    else:
        filt_weights = None

    filt_lhs = combined.pop('__y__')
    filt_rhs = combined

    return (filt_lhs, filt_rhs, filt_weights, pre_filt_rhs, index, valid)
def _process_data_structure(arg, kill_inf=True): if isinstance(arg, DataFrame): return_hook = lambda v: type(arg)( v, index=arg.index, columns=arg.columns) values = arg.values elif isinstance(arg, Series): values = arg.values return_hook = lambda v: Series(v, arg.index) else: return_hook = lambda v: v values = arg if not issubclass(values.dtype.type, float): values = values.astype(float) if kill_inf: values = values.copy() values[np.isinf(values)] = np.NaN return return_hook, values