def test_setitem_corner(self):
    # corner case: drop a column, then assign it again
    frame = self.klass({'B' : [1., 2., 3.], 'C' : ['a', 'b', 'c']},
                       index=np.arange(3))
    del frame['B']
    frame['B'] = [1., 2., 3.]
    self.assert_('B' in frame)
    self.assertEqual(len(frame.columns), 1)

    # scalar assignment under string keys, then under datetime keys
    for key, value in (('A', 'beginning'), ('E', 'foo'), ('D', 'bar')):
        frame[key] = value
    frame[datetime.now()] = 'date'
    frame[datetime.now()] = 5.

    # what to do when empty frame with index
    matrix = DataMatrix(index=self.frame.index)
    matrix['A'] = 'foo'
    matrix['B'] = 'bar'
    self.assertEqual(len(matrix.objects.columns), 2)

    matrix['C'] = 1
    self.assertEqual(len(matrix.columns), 1)

    # set existing column
    matrix['A'] = 'bar'
    self.assertEqual('bar', matrix['A'][0])

    # delete a string column and re-add it next to a numeric one
    matrix = DataMatrix(index=np.arange(3))
    matrix['A'] = 1
    matrix['foo'] = 'bar'
    del matrix['foo']
    matrix['foo'] = 'bar'
    self.assertEqual(len(matrix.objects.columns), 1)
def createData2(self):
    """Build the ``panel_y2`` / ``panel_x2`` fixtures (unaligned, with NaNs)."""

    def first_days(count):
        # consecutive daily dates starting 2000-01-01
        return [datetime(2000, 1, day) for day in range(1, count + 1)]

    self.panel_y2 = DataMatrix(np.array([[1, np.NaN], [2, 3], [4, 5]]),
                               index=first_days(3),
                               columns=['A', 'B'])

    x1 = DataMatrix(np.array([[6, np.NaN], [7, 8], [9, 30], [11, 12]]),
                    index=first_days(4),
                    columns=['A', 'B'])

    x2 = DataMatrix(np.array([[13, 14, np.NaN],
                              [15, np.NaN, np.NaN],
                              [16, 17, 48],
                              [19, 20, 21],
                              [22, 23, 24]]),
                    index=first_days(5),
                    columns=['C', 'A', 'B'])

    self.panel_x2 = {'x1': x1, 'x2': x2}
def createData3(self):
    """Build the ``panel_y3`` / ``panel_x3`` fixtures (string-valued x data)."""
    dates = [datetime(2000, 1, 1), datetime(2000, 1, 2)]
    labels = ['A', 'B']

    def make(rows):
        # every matrix here shares the same two dates and two column labels
        return DataMatrix(np.array(rows), index=list(dates),
                          columns=list(labels))

    self.panel_y3 = make([[1, 2], [3, 4]])

    x1 = make([['A', 'B'], ['C', 'A']])
    x2 = make([['3.14', '1.59'], ['2.65', '3.14']])

    self.panel_x3 = {'x1' : x1, 'x2' : x2}
def test_combineFirst_mixed(self):
    # smoke test: nothing is asserted about the combined result, the call
    # just has to complete on frames holding a string and an integer column

    def mixed_matrix(start, stop):
        strings = Series(['a','b'], index=range(start, stop))
        numbers = Series(range(2), index=range(start, stop))
        return DataMatrix({'A' : strings, 'B' : numbers})

    f = mixed_matrix(0, 2)
    g = mixed_matrix(5, 7)

    combined = f.combineFirst(g)
def test_reindex_bool(self):
    bool_frame = DataMatrix(np.ones((10, 2), dtype=bool),
                            index=np.arange(0, 20, 2),
                            columns=[0, 2])

    # new row labels: values become float and label 1 (absent) is NaN
    by_rows = bool_frame.reindex(np.arange(10))
    self.assert_(by_rows.values.dtype == np.float_)
    self.assert_(np.isnan(by_rows[0][1]))

    # new column labels: values become float and column 1 (absent) is null
    by_columns = bool_frame.reindex(columns=range(3))
    self.assert_(by_columns.values.dtype == np.float_)
    self.assert_(isnull(by_columns[1]).all())
def test_constructor_objects_corner(self):
    raw_objects = {'A' : {1 : '1', 2 : '2'}}
    wrapped_objects = DataMatrix(raw_objects)

    values = np.zeros((3, 3), dtype=float)
    rows = [1, 2, 3]
    cols = ['B', 'C', 'D']

    # objects given as a DataMatrix alongside a float block
    dm = DataMatrix(values, index=rows, columns=cols,
                    objects=wrapped_objects)
    assert dm.index is not wrapped_objects.index

    # the remaining combinations only need to construct without raising
    dm = DataMatrix(values, index=rows, columns=cols, objects=raw_objects)
    dm = DataMatrix(index=rows, objects=wrapped_objects)
    dm = DataMatrix(index=rows, objects=raw_objects)
def frame_query(sql, con, indexField='Time', asDataMatrix=False):
    """
    Returns a DataFrame (or DataMatrix) corresponding to the result set of
    the query string.

    Parameters
    ----------
    sql : string
        SQL query to be executed
    con : DB connection object
        Handed to array_query together with `sql`
    indexField : string or None, default 'Time'
        Name of the result column to use as the index; that column is
        removed from the returned data. If None, the index is
        0 to len(results) - 1.
    asDataMatrix : bool, default False
        If True return a DataMatrix, otherwise a DataFrame

    Returns
    -------
    DataFrame or DataMatrix

    Raises
    ------
    KeyError
        If `indexField` is not one of the result columns
    """
    data = array_query(sql, con)

    if indexField is not None:
        # only the pop can legitimately signal a missing column
        try:
            index_values = data.pop(indexField)
        except KeyError:
            raise KeyError('indexField %s not found! %s' % (indexField, sql))
        idx = Index(index_values)
    else:
        # list(...) works whether values() is a list or a view, and a
        # result with no columns yields an empty index instead of IndexError
        columns = list(data.values())
        nrows = len(columns[0]) if columns else 0
        idx = Index(np.arange(nrows))

    if asDataMatrix:
        return DataMatrix(data, index=idx)
    else:
        return DataFrame(data=data, index=idx)
def _combine_rhs(rhs):
    """
    Merge the supplied X variables into a single frame, delegating
    duplicate checking to _safe_update

    Accepts a Series (stored under the name 'x'), a DataFrame (copied and
    returned as is) or a dict whose values are Series, dicts or DataFrames.
    Anything that is not already a DataFrame is wrapped in a DataMatrix.
    """
    if isinstance(rhs, Series):
        combined = {'x': rhs}
    elif isinstance(rhs, DataFrame):
        combined = rhs.copy()
    elif isinstance(rhs, dict):
        combined = {}
        for name, value in rhs.iteritems():
            if isinstance(value, Series):
                # a lone Series is merged in under its dict key
                value = {name: value}
            elif not isinstance(value, (dict, DataFrame)):
                raise Exception('Invalid RHS data type: %s' % type(value))
            _safe_update(combined, value)
    else:
        raise Exception('Invalid RHS type: %s' % type(rhs))

    if isinstance(combined, DataFrame):
        return combined
    return DataMatrix(combined)
def bucketcat(series, cats):
    """
    Produce DataMatrix splitting a Series into one column per category

    Parameters
    ----------
    series : Series
        Any other input is wrapped in a Series indexed 0 to len(series) - 1
    cats : Series or same-length array
        Category label of each element of `series`

    Returns
    -------
    DataMatrix
        One column per unique label in `cats` that passes notnull; each
        column holds the elements of `series` carrying that label
    """
    if not isinstance(series, Series):
        series = Series(series, index=np.arange(len(series)))

    cats = np.asarray(cats)

    unique_labels = np.unique(cats)
    unique_labels = unique_labels[notnull(unique_labels)]

    # group by
    data = {}
    for label in unique_labels:
        data[label] = series[cats == label]

    return DataMatrix(data, columns=unique_labels)
def bucket(series, k, by=None):
    """
    Produce DataMatrix representing quantiles of a Series

    Parameters
    ----------
    series : Series
    k : int
        number of quantiles
    by : Series or same-length array, optional
        bucket by value; defaults to `series` itself. An object with a
        `reindex` method is first conformed to `series.index`; anything
        else is used positionally as given.

    Returns
    -------
    DataMatrix
        Indexed like `series`, with columns labelled 1 to k. Column i + 1
        holds the values of `series` at the positions in the i-th group
        returned by _split_quantile; all other cells are NaN.
    """
    if by is None:
        by = series
    elif hasattr(by, 'reindex'):
        # plain arrays have no reindex; the docstring allows them
        by = by.reindex(series.index)

    split = _split_quantile(by, k)

    # start from an explicit all-NaN block
    mat = np.empty((len(series), k), dtype=float)
    mat.fill(np.nan)

    for i, v in enumerate(split):
        mat[:, i][v] = series.take(v)

    return DataMatrix(mat, index=series.index, columns=np.arange(k) + 1)
def testWithWeights(self):
    # weights 0..9 laid out as 5 daily rows x 2 columns
    dates = [datetime(2000, 1, day) for day in range(1, 6)]
    weights = DataMatrix(np.arange(10).reshape((5, 2)),
                         index=dates, columns=['A', 'B'])

    result = ols(y=self.panel_y2, x=self.panel_x2, weights=weights)

    assert_almost_equal(result._y_trans.values.flat, [0, 16, 25])

    assert_almost_equal(result._x_trans.values,
                        [[0, 0, 0], [36, 68, 4], [150, 240, 5]])

    # An earlier, weight-scaled expectation was left commented out here:
    #     [[0, 0, 0], [36, 68, 4], [150, 240, 5], [66, 120, 6], [84, 147, 7]]
    exp_x_filtered = [[6, 14, 1],
                      [9, 17, 1],
                      [30, 48, 1],
                      [11, 20, 1],
                      [12, 21, 1]]
    assert_almost_equal(result._x_filtered.values, exp_x_filtered)
def var_beta(self):
    """
    Returns the covariance of beta.

    Each entry of self._var_beta_raw becomes a DataMatrix labelled on both
    axes by the beta columns and is keyed by the matching entry of
    self._result_index; the dict is handed to WidePanel.fromDict with
    intersect=False.
    """
    result_index = self._result_index

    # the labels do not depend on the loop variable, so look them up once
    cols = self.beta.cols()

    result = {}
    for i, raw in enumerate(self._var_beta_raw):
        result[result_index[i]] = DataMatrix(raw, columns=cols, index=cols)

    return WidePanel.fromDict(result, intersect=False)
def _cat_labels(labels):
    """
    Group values by category label and return them as a DataMatrix.

    NOTE(review): `series` and `stringified` are not defined inside this
    function; unless an enclosing or module scope supplies them, calling
    it raises NameError -- confirm against the surrounding code.
    NOTE(review): every iteration stores under the same key `stringified`,
    so only the last label's selection is kept -- presumably a
    per-label key was intended; verify before relying on the result.
    """
    # group by
    present_labels = np.unique(labels)
    present_labels = present_labels[notnull(present_labels)]

    data = {}
    for label in present_labels:
        data[stringified] = series[labels == label]

    return DataMatrix(data, index=series.index)
def test_constructor_with_objects(self):
    mixed = self.mixed_frame

    def check_index_shared(index, objects):
        # the index handed in must be reused by the frame and its objects
        dm = DataMatrix(data=None, index=index, objects=objects)
        self.assert_(dm.index is index)
        self.assert_(dm.objects.index is index)

    head = mixed.index[:5]
    check_index_shared(head, mixed.objects)
    check_index_shared(head, mixed.objects._series)
    check_index_shared(mixed.index, mixed.objects)

    # no index given: it is picked up from the objects
    dm = DataMatrix(objects=mixed.objects)
    self.assert_(dm.index is mixed.index)

    # take dict of objects
    dm = DataMatrix(data={}, objects=mixed.objects._series)
    self.assert_(isinstance(dm.objects, DataMatrix))
    self.assert_(dm.index is dm.objects.index)

    dm = DataMatrix(objects=mixed.objects._series)
    self.assert_(isinstance(dm.objects, DataMatrix))
    self.assert_(dm.index is dm.objects.index)

    dm = DataMatrix(data=self.frame._series,
                    objects=mixed.objects._series)
    self.assert_(isinstance(dm.objects, DataMatrix))
    self.assert_(dm.objects.columns.equals(mixed.objects.columns))

    # explicit objects supplied next to data holding its own 'foo' column
    extra_objects = DataMatrix({'bar' : ['bar'] * len(mixed)})
    dm = DataMatrix(mixed._series, objects=extra_objects)
    self.assert_('foo' in dm.objects)
def test_more_constructor(self):
    # 1-d input ends up as a 2-d values block
    one_dim = randn(10)
    dm = self.klass(one_dim, columns=['A'], index=np.arange(10))
    self.assertEqual(dm.values.ndim, 2)

    zero_length = randn(0)
    dm = self.klass(zero_length)
    self.assertEqual(dm.values.ndim, 2)

    # no data specified
    no_data_cases = [
        (dict(columns=['A', 'B'], index=np.arange(10)), (10, 2)),
        (dict(columns=['A', 'B']), (0, 2)),
        (dict(index=np.arange(10)), (10, 0)),
    ]
    for kwargs, expected_shape in no_data_cases:
        dm = self.klass(**kwargs)
        self.assertEqual(dm.values.shape, expected_shape)

    # corner, silly
    self.assertRaises(Exception, self.klass, (1, 2, 3))

    # can't cast
    strings = np.array(['foo', 'bar'], dtype=object).reshape(2, 1)
    df = DataMatrix(strings, index=[0, 1], columns=[0], dtype=float)
    self.assert_(df.values.dtype == np.object_)

    # construction from another frame
    dm = self.klass(DataFrame(self.frame._series))
    common.assert_frame_equal(dm, self.frame)

    # int cast
    columns = {'A' : np.ones(10, dtype=int),
               'B' : np.ones(10, dtype=float)}
    dm = DataMatrix(columns, index=np.arange(10))
    self.assertEqual(len(dm.columns), 2)
    self.assert_(dm.values.dtype == np.float_)
def test_setitem_ambig(self):
    # difficulties with mixed-type data
    from decimal import Decimal

    labels = range(3)

    # created as float type
    dm = DataMatrix(index=range(3), columns=range(3))

    decimals = Series([Decimal(1) for _ in range(3)], index=range(3))
    strings = Series(['foo', 'bzr', 'baz'], index=range(3))

    def assign_and_count(column, value):
        # whatever is assigned, the overall column count stays at three
        dm[column] = value
        self.assertEqual(len(dm.cols()), 3)

    assign_and_count(labels[0], np.ones(3))
    self.assert_(dm.objects is None)

    # Decimal values do not create an objects block
    assign_and_count(labels[1], decimals)
    self.assert_(dm.objects is None)

    # strings do: column 2 moves from the float columns to the objects
    assign_and_count(labels[2], strings)
    self.assert_(dm.objects is not None)
    self.assert_(2 in dm.objects)
    self.assert_(2 not in dm.columns)
def setUp(self):
    # random 1-d data with NaNs planted at the class-level locations
    values = randn(N)
    values[self._nan_locs] = np.NaN
    self.arr = values

    dates = DateRange(datetime(2009, 1, 1), periods=N)
    self.rng = dates

    # the series gets its own copy of the raw array
    self.series = Series(values.copy(), index=dates)

    # independent random N x K blocks for the frame and for the matrix
    self.frame = DataFrame(randn(N, K), index=dates,
                           columns=np.arange(K))
    self.matrix = DataMatrix(randn(N, K), index=dates,
                             columns=np.arange(K))
def checkDataSet(self, dataset, start=None, end=None, skip_moving=False):
    """
    Run the OLS checks on dataset.exog / dataset.endog sliced to
    [start:end]; the moving-window checks are skipped when `skip_moving`
    is true.
    """
    exog = dataset.exog[start:end]
    endog = dataset.endog[start:end]

    nrows, ncols = exog.shape[0], exog.shape[1]
    x = DataMatrix(exog, index=np.arange(nrows), columns=np.arange(ncols))
    y = Series(endog, index=np.arange(len(endog)))

    self.checkOLS(exog, endog, x, y)

    if skip_moving:
        return

    # (window type, extra keyword arguments), in the order they are run
    moving_cases = [
        ('rolling', {}),
        ('rolling', {'nw_lags': 0}),
        ('expanding', {'nw_lags': 0}),
        ('rolling', {'nw_lags': 1}),
        ('expanding', {'nw_lags': 1}),
        ('expanding', {'nw_lags': 1, 'nw_overlap': True}),
    ]
    for window_type, options in moving_cases:
        self.checkMovingOLS(window_type, x, y, **options)
def _rollingMoment(arg, window, func, minp, time_rule=None):
    """
    Rolling statistical measure using supplied function. Designed to be
    used with passed-in Cython array-based functions.

    Infinite values are treated as NaN. The replacement is made on a copy,
    so the caller's data is left untouched.

    Parameters
    ----------
    arg : DataFrame, DataMatrix, Series or numpy ndarray-like
    window : Number of observations used for calculating statistic
    func : Cython function to compute rolling statistic on raw series
        Called as func(values, window, minp=minp)
    minp : int
        Minimum number of observations required to have a value
    time_rule : optional
        If given and `arg` is a DataFrame, DataMatrix or Series, `arg` is
        first converted with arg.asfreq(time_rule)

    Returns
    -------
    Same kind of object as `arg` for DataMatrix, DataFrame and Series
    input; otherwise whatever `func` returns

    Raises
    ------
    AssertionError
        If `arg` is none of the above and has no __iter__
    """
    types = (DataFrame, DataMatrix, Series)
    if time_rule is not None and isinstance(arg, types):
        # Conform to whatever frequency needed.
        arg = arg.asfreq(time_rule)

    # DataMatrix is tested before DataFrame, as in the original ordering
    if isinstance(arg, DataMatrix):
        values = arg.values.copy()
        values[np.isinf(values)] = NaN

        T, N = values.shape
        resultMatrix = np.empty((T, N), dtype=values.dtype)
        for i in range(N):
            resultMatrix[:, i] = func(values[:, i], window, minp=minp)

        output = DataMatrix(resultMatrix, index=arg.index,
                            columns=arg.columns)

    elif isinstance(arg, DataFrame):
        output = DataFrame(index=arg.index)
        for col, series in arg.iteritems():
            series = series.copy()
            series[np.isinf(series)] = NaN
            output[col] = Series(func(series, window, minp=minp),
                                 index=series.index)

    elif isinstance(arg, Series):
        arg = arg.copy()
        arg[np.isinf(arg)] = NaN
        output = Series(func(arg, window, minp=minp), index=arg.index)

    else:
        # explicit check rather than `assert`, which is stripped under -O
        if not hasattr(arg, '__iter__'):
            raise AssertionError('Expected DataFrame or array-like argument')

        # copies the data; subok keeps ndarray subclasses as they are
        arg = np.array(arg, subok=True)
        arg[np.isinf(arg)] = NaN
        output = func(arg, window, minp=minp)

    return output
def _forecast_mean_raw(self):
    """
    Returns the raw forecast mean as an ndarray.

    For each valid index the cumulative product of y with a column of
    ones (from self._cum_xy) is taken, reduced to the current window when
    rolling, and divided by the matching number of observations.
    """
    nobs = self._nobs
    window = self._window

    # x should be ones
    ones = DataMatrix(index=self._y.index)
    ones['y'] = 1

    cum_xy = self._cum_xy(ones, self._y)

    def windowed_sum(i):
        total = cum_xy[i]
        if self._is_rolling and i >= window:
            # drop what accumulated before the window started
            total = total - cum_xy[i - window]
        return total

    return np.array([windowed_sum(i)[0] / nobs[n]
                     for n, i in enumerate(self._valid_indices)])
def _process_data_structure(arg, kill_inf=True): if isinstance(arg, DataFrame): if isinstance(arg, DataMatrix): return_hook = lambda v: DataMatrix( v, index=arg.index, columns=arg.columns, objects=arg.objects) else: return_hook = lambda v: DataFrame( v, index=arg.index, columns=arg.columns) values = arg.values elif isinstance(arg, Series): values = arg.values return_hook = lambda v: Series(v, arg.index) else: return_hook = lambda v: v values = arg if not issubclass(values.dtype.type, float): values = values.astype(float) if kill_inf: values = values.copy() values[np.isinf(values)] = np.NaN return return_hook, values
def test_count_objects(self):
    # counts over mixed-type data must agree between the two frame classes
    source = self.mixed_frame._series
    dm = DataMatrix(source)
    df = DataFrame(source)

    # default axis
    common.assert_series_equal(dm.count(), df.count())
    # axis 1
    common.assert_series_equal(dm.count(1), df.count(1))
def makeTimeDataMatrix():
    """Return a DataMatrix built from getTimeSeriesData()."""
    return DataMatrix(getTimeSeriesData())
def makeDataMatrix():
    """Return a DataMatrix built from getSeriesData()."""
    return DataMatrix(getSeriesData())
def test_cumsum_corner(self):
    # smoke test: cumsum on integer-valued data only has to run
    values = np.arange(20).reshape(4, 5)
    dm = DataMatrix(values, index=range(4), columns=range(5))
    result = dm.cumsum()
def t_stat(self):
    """Returns the t-stat values as a DataMatrix (result index x beta columns)."""
    raw = self._t_stat_raw
    labels = self.beta.cols()
    return DataMatrix(raw, columns=labels, index=self._result_index)
def std_err(self):
    """Returns the standard errors as a DataMatrix (result index x beta columns)."""
    raw = self._std_err_raw
    labels = self.beta.cols()
    return DataMatrix(raw, columns=labels, index=self._result_index)
def p_value(self):
    """Returns the p values as a DataMatrix (result index x beta columns)."""
    labels = self.beta.cols()
    raw = self._p_value_raw
    return DataMatrix(raw, columns=labels, index=self._result_index)
def beta(self):
    """Returns the betas as a DataMatrix (result index x columns of x)."""
    raw = self._beta_raw
    rows = self._result_index
    return DataMatrix(raw, index=rows, columns=self._x.cols())
def var_beta(self):
    """
    Returns the variance-covariance matrix of beta, labelled on both axes
    by the index of beta.
    """
    raw = self._var_beta_raw
    labels = self.beta.index
    return DataMatrix(raw, index=labels, columns=labels)