def test_default_na_values(all_parsers): _NA_VALUES = {"-1.#IND", "1.#QNAN", "1.#IND", "-1.#QNAN", "#N/A", "N/A", "n/a", "NA", "#NA", "NULL", "null", "NaN", "nan", "-NaN", "-nan", "#N/A N/A", ""} assert _NA_VALUES == com._NA_VALUES parser = all_parsers nv = len(_NA_VALUES) def f(i, v): if i == 0: buf = "" elif i > 0: buf = "".join([","] * i) buf = "{0}{1}".format(buf, v) if i < nv - 1: buf = "{0}{1}".format(buf, "".join([","] * (nv - i - 1))) return buf data = StringIO("\n".join(f(i, v) for i, v in enumerate(_NA_VALUES))) expected = DataFrame(np.nan, columns=range(nv), index=range(nv)) result = parser.read_csv(data, header=None) tm.assert_frame_equal(result, expected)
def setup(self): self.n = (10 ** 3) self.to_ts = dict(((i, pd.Timestamp(i)) for i in range(self.n))) self.to_td = dict(((i, pd.Timedelta(i)) for i in range(self.n))) self.s = Series(np.random.randint(self.n, size=(10 ** 3))) self.df = DataFrame({'A': np.random.randint(self.n, size=(10 ** 3)), 'B': np.random.randint(self.n, size=(10 ** 3))})
def test_get_duplicates(): # GH5873 for a in [101, 102]: mi = MultiIndex.from_arrays([[101, a], [3.5, np.nan]]) assert not mi.has_duplicates with tm.assert_produces_warning(FutureWarning): # Deprecated - see GH20239 assert mi.get_duplicates().equals(MultiIndex.from_arrays([[], []])) tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(2, dtype='bool')) for n in range(1, 6): # 1st level shape for m in range(1, 5): # 2nd level shape # all possible unique combinations, including nan codes = product(range(-1, n), range(-1, m)) mi = MultiIndex(levels=[list('abcde')[:n], list('WXYZ')[:m]], codes=np.random.permutation(list(codes)).T) assert len(mi) == (n + 1) * (m + 1) assert not mi.has_duplicates with tm.assert_produces_warning(FutureWarning): # Deprecated - see GH20239 assert mi.get_duplicates().equals(MultiIndex.from_arrays( [[], []])) tm.assert_numpy_array_equal(mi.duplicated(), np.zeros(len(mi), dtype='bool'))
def test_is_(): mi = MultiIndex.from_tuples(lzip(range(10), range(10))) assert mi.is_(mi) assert mi.is_(mi.view()) assert mi.is_(mi.view().view().view().view()) mi2 = mi.view() # names are metadata, they don't change id mi2.names = ["A", "B"] assert mi2.is_(mi) assert mi.is_(mi2) assert mi.is_(mi.set_names(["C", "D"])) mi2 = mi.view() mi2.set_names(["E", "F"], inplace=True) assert mi.is_(mi2) # levels are inherent properties, they change identity mi3 = mi2.set_levels([lrange(10), lrange(10)]) assert not mi3.is_(mi2) # shouldn't change assert mi2.is_(mi) mi4 = mi3.view() # GH 17464 - Remove duplicate MultiIndex levels mi4.set_levels([lrange(10), lrange(10)], inplace=True) assert not mi4.is_(mi3) mi5 = mi.view() mi5.set_levels(mi5.levels, inplace=True) assert not mi5.is_(mi)
def test_constructor_coverage(self): rng = timedelta_range("1 days", periods=10.5) exp = timedelta_range("1 days", periods=10) self.assertTrue(rng.equals(exp)) self.assertRaises(ValueError, TimedeltaIndex, start="1 days", periods="foo", freq="D") self.assertRaises(ValueError, TimedeltaIndex, start="1 days", end="10 days") self.assertRaises(ValueError, TimedeltaIndex, "1 days") # generator expression gen = (timedelta(i) for i in range(10)) result = TimedeltaIndex(gen) expected = TimedeltaIndex([timedelta(i) for i in range(10)]) self.assertTrue(result.equals(expected)) # NumPy string array strings = np.array(["1 days", "2 days", "3 days"]) result = TimedeltaIndex(strings) expected = to_timedelta([1, 2, 3], unit="d") self.assertTrue(result.equals(expected)) from_ints = TimedeltaIndex(expected.asi8) self.assertTrue(from_ints.equals(expected)) # non-conforming freq self.assertRaises(ValueError, TimedeltaIndex, ["1 days", "2 days", "4 days"], freq="D") self.assertRaises(ValueError, TimedeltaIndex, periods=10, freq="D")
def insert(self, value): # find first node on each level where node.next[levels].value > value chain = [None] * self.maxlevels steps_at_level = [0] * self.maxlevels node = self.head for level in reversed(range(self.maxlevels)): while node.next[level].value <= value: steps_at_level[level] += node.width[level] node = node.next[level] chain[level] = node # insert a link to the newnode at each level d = min(self.maxlevels, 1 - int(log(random(), 2.0))) newnode = Node(value, [None] * d, [None] * d) steps = 0 for level in range(d): prevnode = chain[level] newnode.next[level] = prevnode.next[level] prevnode.next[level] = newnode newnode.width[level] = prevnode.width[level] - steps prevnode.width[level] = steps + 1 steps += steps_at_level[level] for level in range(d, self.maxlevels): chain[level].width[level] += 1 self.size += 1
def test_basic(self): labels, uniques = algos.factorize(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) # self.assert_numpy_array_equal(labels, np.array([ 0, 1, 1, 0, 0, 2, 2, 2],dtype=np.int64)) self.assert_numpy_array_equal(uniques, np.array(['a','b','c'], dtype=object)) labels, uniques = algos.factorize(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'], sort=True) self.assert_numpy_array_equal(labels, np.array([ 0, 1, 1, 0, 0, 2, 2, 2],dtype=np.int64)) self.assert_numpy_array_equal(uniques, np.array(['a','b','c'], dtype=object)) labels, uniques = algos.factorize(list(reversed(range(5)))) self.assert_numpy_array_equal(labels, np.array([0, 1, 2, 3, 4], dtype=np.int64)) self.assert_numpy_array_equal(uniques, np.array([ 4, 3, 2, 1, 0],dtype=np.int64)) labels, uniques = algos.factorize(list(reversed(range(5))), sort=True) self.assert_numpy_array_equal(labels, np.array([ 4, 3, 2, 1, 0],dtype=np.int64)) self.assert_numpy_array_equal(uniques, np.array([0, 1, 2, 3, 4], dtype=np.int64)) labels, uniques = algos.factorize(list(reversed(np.arange(5.)))) self.assert_numpy_array_equal(labels, np.array([0., 1., 2., 3., 4.], dtype=np.float64)) self.assert_numpy_array_equal(uniques, np.array([ 4, 3, 2, 1, 0],dtype=np.int64)) labels, uniques = algos.factorize(list(reversed(np.arange(5.))), sort=True) self.assert_numpy_array_equal(labels, np.array([ 4, 3, 2, 1, 0],dtype=np.int64)) self.assert_numpy_array_equal(uniques, np.array([0., 1., 2., 3., 4.], dtype=np.float64))
def check(nlevels, with_nulls): labels = np.tile(np.arange(500), 2) level = np.arange(500) if with_nulls: # inject some null values labels[500] = -1 # common nan value labels = [labels.copy() for i in range(nlevels)] for i in range(nlevels): labels[i][500 + i - nlevels // 2] = -1 labels += [np.array([-1, 1]).repeat(500)] else: labels = [labels] * nlevels + [np.arange(2).repeat(500)] levels = [level] * nlevels + [[0, 1]] # no dups index = MultiIndex(levels=levels, labels=labels) assert not index.has_duplicates # with a dup if with_nulls: def f(a): return np.insert(a, 1000, a[0]) labels = list(map(f, labels)) index = MultiIndex(levels=levels, labels=labels) else: values = index.values.tolist() index = MultiIndex.from_tuples(values + [values[0]]) assert index.has_duplicates
def test_split_ranges(): def _bin(x, width): "return int(x) as a base2 string of given width" return ''.join(str((x >> i) & 1) for i in range(width - 1, -1, -1)) def test_locs(mask): nfalse = sum(np.array(mask) == 0) remaining = 0 for s, e in com.split_ranges(mask): remaining += e - s assert 0 not in mask[s:e] # make sure the total items covered by the ranges are a complete cover assert remaining + nfalse == len(mask) # exhaustively test all possible mask sequences of length 8 ncols = 8 for i in range(2 ** ncols): cols = lmap(int, list(_bin(i, ncols))) # count up in base2 mask = [cols[i] == 1 for i in range(len(cols))] test_locs(mask) # base cases test_locs([]) test_locs([0]) test_locs([1])
def test_rolling_max_how_resample(self): indices = [datetime(1975, 1, i) for i in range(1, 6)] # So that we can have 3 datapoints on last day (4, 10, and 20) indices.append(datetime(1975, 1, 5, 1)) indices.append(datetime(1975, 1, 5, 2)) series = Series(list(range(0, 5)) + [10, 20], index=indices) # Use floats instead of ints as values series = series.map(lambda x: float(x)) # Sort chronologically series = series.sort_index() # Default how should be max expected = Series([0.0, 1.0, 2.0, 3.0, 20.0], index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) x = mom.rolling_max(series, window=1, freq='D') assert_series_equal(expected, x) # Now specify median (10.0) expected = Series([0.0, 1.0, 2.0, 3.0, 10.0], index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) x = mom.rolling_max(series, window=1, freq='D', how='median') assert_series_equal(expected, x) # Now specify mean (4+10+20)/3 v = (4.0+10.0+20.0)/3.0 expected = Series([0.0, 1.0, 2.0, 3.0, v], index=[datetime(1975, 1, i, 0) for i in range(1, 6)]) x = mom.rolling_max(series, window=1, freq='D', how='mean') assert_series_equal(expected, x)
def _get_join_keys(llab, rlab, shape, sort): from pandas.core.groupby import _int64_overflow_possible # how many levels can be done without overflow pred = lambda i: not _int64_overflow_possible(shape[:i]) nlev = next(filter(pred, range(len(shape), 0, -1))) # get keys for the first `nlev` levels stride = np.prod(shape[1:nlev], dtype='i8') lkey = stride * llab[0].astype('i8', subok=False, copy=False) rkey = stride * rlab[0].astype('i8', subok=False, copy=False) for i in range(1, nlev): stride //= shape[i] lkey += llab[i] * stride rkey += rlab[i] * stride if nlev == len(shape): # all done! return lkey, rkey # densify current keys to avoid overflow lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort) llab = [lkey] + llab[nlev:] rlab = [rkey] + rlab[nlev:] shape = [count] + shape[nlev:] return _get_join_keys(llab, rlab, shape, sort)
def test_constructor_coverage(self): rng = timedelta_range('1 days', periods=10.5) exp = timedelta_range('1 days', periods=10) self.assertTrue(rng.equals(exp)) self.assertRaises(ValueError, TimedeltaIndex, start='1 days', periods='foo', freq='D') self.assertRaises(ValueError, TimedeltaIndex, start='1 days', end='10 days') self.assertRaises(ValueError, TimedeltaIndex, '1 days') # generator expression gen = (timedelta(i) for i in range(10)) result = TimedeltaIndex(gen) expected = TimedeltaIndex([timedelta(i) for i in range(10)]) self.assertTrue(result.equals(expected)) # NumPy string array strings = np.array(['1 days', '2 days', '3 days']) result = TimedeltaIndex(strings) expected = to_timedelta([1,2,3],unit='d') self.assertTrue(result.equals(expected)) from_ints = TimedeltaIndex(expected.asi8) self.assertTrue(from_ints.equals(expected)) # non-conforming freq self.assertRaises(ValueError, TimedeltaIndex, ['1 days', '2 days', '4 days'], freq='D') self.assertRaises(ValueError, TimedeltaIndex, periods=10, freq='D')
def test_basic(self): labels, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"]) # self.assert_numpy_array_equal(labels, np.array([ 0, 1, 1, 0, 0, 2, 2, 2],dtype=np.int64)) self.assert_numpy_array_equal(uniques, np.array(["a", "b", "c"], dtype=object)) labels, uniques = algos.factorize(["a", "b", "b", "a", "a", "c", "c", "c"], sort=True) self.assert_numpy_array_equal(labels, np.array([0, 1, 1, 0, 0, 2, 2, 2], dtype=np.int64)) self.assert_numpy_array_equal(uniques, np.array(["a", "b", "c"], dtype=object)) labels, uniques = algos.factorize(list(reversed(range(5)))) self.assert_numpy_array_equal(labels, np.array([0, 1, 2, 3, 4], dtype=np.int64)) self.assert_numpy_array_equal(uniques, np.array([4, 3, 2, 1, 0], dtype=np.int64)) labels, uniques = algos.factorize(list(reversed(range(5))), sort=True) self.assert_numpy_array_equal(labels, np.array([4, 3, 2, 1, 0], dtype=np.int64)) self.assert_numpy_array_equal(uniques, np.array([0, 1, 2, 3, 4], dtype=np.int64)) labels, uniques = algos.factorize(list(reversed(np.arange(5.0)))) self.assert_numpy_array_equal(labels, np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=np.float64)) self.assert_numpy_array_equal(uniques, np.array([4, 3, 2, 1, 0], dtype=np.int64)) labels, uniques = algos.factorize(list(reversed(np.arange(5.0))), sort=True) self.assert_numpy_array_equal(labels, np.array([4, 3, 2, 1, 0], dtype=np.int64)) self.assert_numpy_array_equal(uniques, np.array([0.0, 1.0, 2.0, 3.0, 4.0], dtype=np.float64))
def _check_stat_op(self, name, alternative, obj=None, has_skipna=True): if obj is None: obj = self.panel4d # # set some NAs # obj.ix[5:10] = np.nan # obj.ix[15:20, -2:] = np.nan f = getattr(obj, name) if has_skipna: def skipna_wrapper(x): nona = remove_na(x) if len(nona) == 0: return np.nan return alternative(nona) def wrapper(x): return alternative(np.asarray(x)) for i in range(obj.ndim): result = f(axis=i, skipna=False) assert_panel_equal(result, obj.apply(wrapper, axis=i)) else: skipna_wrapper = alternative wrapper = alternative for i in range(obj.ndim): result = f(axis=i) assert_panel_equal(result, obj.apply(skipna_wrapper, axis=i)) self.assertRaises(Exception, f, axis=obj.ndim)
def test_index_unique(dups): uniques = dups.index.unique() expected = DatetimeIndex([datetime(2000, 1, 2), datetime(2000, 1, 3), datetime(2000, 1, 4), datetime(2000, 1, 5)]) assert uniques.dtype == 'M8[ns]' # sanity tm.assert_index_equal(uniques, expected) assert dups.index.nunique() == 4 # #2563 assert isinstance(uniques, DatetimeIndex) dups_local = dups.index.tz_localize('US/Eastern') dups_local.name = 'foo' result = dups_local.unique() expected = DatetimeIndex(expected, name='foo') expected = expected.tz_localize('US/Eastern') assert result.tz is not None assert result.name == 'foo' tm.assert_index_equal(result, expected) # NaT, note this is excluded arr = [1370745748 + t for t in range(20)] + [tslib.iNaT] idx = DatetimeIndex(arr * 3) tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) assert idx.nunique() == 20 assert idx.nunique(dropna=False) == 21 arr = [Timestamp('2013-06-09 02:42:28') + timedelta(seconds=t) for t in range(20)] + [NaT] idx = DatetimeIndex(arr * 3) tm.assert_index_equal(idx.unique(), DatetimeIndex(arr)) assert idx.nunique() == 20 assert idx.nunique(dropna=False) == 21
def test_constructor_empty(self, input_class): empty = Series() empty2 = Series(input_class()) # these are Index() and RangeIndex() which don't compare type equal # but are just .equals assert_series_equal(empty, empty2, check_index_type=False) # With explicit dtype: empty = Series(dtype='float64') empty2 = Series(input_class(), dtype='float64') assert_series_equal(empty, empty2, check_index_type=False) # GH 18515 : with dtype=category: empty = Series(dtype='category') empty2 = Series(input_class(), dtype='category') assert_series_equal(empty, empty2, check_index_type=False) if input_class is not list: # With index: empty = Series(index=lrange(10)) empty2 = Series(input_class(), index=lrange(10)) assert_series_equal(empty, empty2) # With index and dtype float64: empty = Series(np.nan, index=lrange(10)) empty2 = Series(input_class(), index=lrange(10), dtype='float64') assert_series_equal(empty, empty2) # GH 19853 : with empty string, index and dtype str empty = Series('', dtype=str, index=range(3)) empty2 = Series('', index=range(3)) assert_series_equal(empty, empty2)
def _maybe_promote_shape(values, naxes): # test to see if we have an array else leave since must be a number if not isinstance(values, np.ndarray): return values ndims = values.ndim if ndims > naxes: raise AssertionError('cannot have more dims than axes, ' '{0} > {1}'.format(ndims, naxes)) if ndims == naxes: return values ndim, nax = range(ndims), range(naxes) axes_slice = [slice(None)] * naxes # set difference of numaxes and ndims slices = list(set(nax) - set(ndim)) if ndims == naxes: if slices: raise AssertionError('slices should be empty if ndims == naxes ' '{0}'.format(slices)) else: if not slices: raise AssertionError('slices should NOT be empty if ndim != naxes ' '{0}'.format(slices)) for sl in slices: axes_slice[sl] = np.newaxis return values[tuple(axes_slice)]
def _write_regular_rows(self, fmt_values, indent): truncate_h = self.fmt.truncate_h truncate_v = self.fmt.truncate_v ncols = len(self.fmt.tr_frame.columns) nrows = len(self.fmt.tr_frame) fmt = self.fmt._get_formatter('__index__') if fmt is not None: index_values = self.fmt.tr_frame.index.map(fmt) else: index_values = self.fmt.tr_frame.index.format() row = [] for i in range(nrows): if truncate_v and i == (self.fmt.tr_row_num): str_sep_row = ['...'] * len(row) self.write_tr(str_sep_row, indent, self.indent_delta, tags=None, nindex_levels=1) row = [] row.append(index_values[i]) row.extend(fmt_values[j][i] for j in range(ncols)) if truncate_h: dot_col_ix = self.fmt.tr_col_num + 1 row.insert(dot_col_ix, '...') self.write_tr(row, indent, self.indent_delta, tags=None, nindex_levels=1)
def _apply_1d(self, func, axis): axis_name = self._get_axis_name(axis) ax = self._get_axis(axis) ndim = self.ndim values = self.values # iter thru the axes slice_axis = self._get_axis(axis) slice_indexer = [0]*(ndim-1) indexer = np.zeros(ndim, 'O') indlist = list(range(ndim)) indlist.remove(axis) indexer[axis] = slice(None, None) indexer.put(indlist, slice_indexer) planes = [ self._get_axis(axi) for axi in indlist ] shape = np.array(self.shape).take(indlist) # all the iteration points points = cartesian_product(planes) results = [] for i in range(np.prod(shape)): # construct the object pts = tuple([ p[i] for p in points ]) indexer.put(indlist, slice_indexer) obj = Series(values[tuple(indexer)],index=slice_axis,name=pts) result = func(obj) results.append(result) # increment the indexer slice_indexer[-1] += 1 n = -1 while (slice_indexer[n] >= shape[n]) and (n > (1-ndim)): slice_indexer[n-1] += 1 slice_indexer[n] = 0 n -= 1 # empty object if not len(results): return self._constructor(**self._construct_axes_dict()) # same ndim as current if isinstance(results[0],Series): arr = np.vstack([ r.values for r in results ]) arr = arr.T.reshape(tuple([len(slice_axis)] + list(shape))) tranp = np.array([axis]+indlist).argsort() arr = arr.transpose(tuple(list(tranp))) return self._constructor(arr,**self._construct_axes_dict()) # ndim-1 shape results = np.array(results).reshape(shape) if results.ndim == 2 and axis_name != self._info_axis_name: results = results.T planes = planes[::-1] return self._construct_return_type(results,planes)
def test_astype_categorical_to_other(self): df = DataFrame({'value': np.random.randint(0, 10000, 100)}) labels = ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)] cat_labels = Categorical(labels, labels) df = df.sort_values(by=['value'], ascending=True) df['value_group'] = pd.cut(df.value, range(0, 10500, 500), right=False, labels=cat_labels) s = df['value_group'] expected = s tm.assert_series_equal(s.astype('category'), expected) tm.assert_series_equal(s.astype(CategoricalDtype()), expected) msg = (r"could not convert string to float: '(0 - 499|9500 - 9999)'|" r"invalid literal for float\(\): (0 - 499|9500 - 9999)") with pytest.raises(ValueError, match=msg): s.astype('float64') cat = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])) exp = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']) tm.assert_series_equal(cat.astype('str'), exp) s2 = Series(Categorical(['1', '2', '3', '4'])) exp2 = Series([1, 2, 3, 4]).astype(int) tm.assert_series_equal(s2.astype('int'), exp2) # object don't sort correctly, so just compare that we have the same # values def cmp(a, b): tm.assert_almost_equal( np.sort(np.unique(a)), np.sort(np.unique(b))) expected = Series(np.array(s.values), name='value_group') cmp(s.astype('object'), expected) cmp(s.astype(np.object_), expected) # array conversion tm.assert_almost_equal(np.array(s), np.array(s.values)) # valid conversion for valid in [lambda x: x.astype('category'), lambda x: x.astype(CategoricalDtype()), lambda x: x.astype('object').astype('category'), lambda x: x.astype('object').astype( CategoricalDtype()) ]: result = valid(s) # compare series values # internal .categories can't be compared because it is sorted tm.assert_series_equal(result, s, check_categorical=False) # invalid conversion (these are NOT a dtype) msg = (r"invalid type <class 'pandas\.core\.arrays\.categorical\." "Categorical'> for astype") for invalid in [lambda x: x.astype(Categorical), lambda x: x.astype('object').astype(Categorical)]: with pytest.raises(TypeError, match=msg): invalid(s)
def get_result(self): # series only if self._is_series: # stack blocks if self.axis == 0: new_data = com._concat_compat([x._values for x in self.objs]) name = com._consensus_name_attr(self.objs) return (Series(new_data, index=self.new_axes[0], name=name, dtype=new_data.dtype) .__finalize__(self, method='concat')) # combine as columns in a frame else: data = dict(zip(range(len(self.objs)), self.objs)) index, columns = self.new_axes tmpdf = DataFrame(data, index=index) # checks if the column variable already stores valid column # names (because set via the 'key' argument in the 'concat' # function call. If that's not the case, use the series names # as column names if (columns.equals(Index(np.arange(len(self.objs)))) and not self.ignore_index): columns = np.array([data[i].name for i in range(len(data))], dtype='object') indexer = isnull(columns) if indexer.any(): columns[indexer] = np.arange(len(indexer[indexer])) tmpdf.columns = columns return tmpdf.__finalize__(self, method='concat') # combine block managers else: mgrs_indexers = [] for obj in self.objs: mgr = obj._data indexers = {} for ax, new_labels in enumerate(self.new_axes): if ax == self.axis: # Suppress reindexing on concat axis continue obj_labels = mgr.axes[ax] if not new_labels.equals(obj_labels): indexers[ax] = obj_labels.reindex(new_labels)[1] mgrs_indexers.append((obj._data, indexers)) new_data = concatenate_block_managers( mgrs_indexers, self.new_axes, concat_axis=self.axis, copy=self.copy) if not self.copy: new_data._consolidate_inplace() return (self.objs[0]._from_axes(new_data, self.new_axes) .__finalize__(self, method='concat'))
def test_rename(): # GH 17407 s = Series(range(1, 6), index=pd.Index(range(2, 7), name='IntIndex')) result = s.rename(str) expected = s.rename(lambda i: str(i)) assert_series_equal(result, expected) assert result.name == expected.name
def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): # Write the frame cells using openpyxl. from openpyxl.cell import get_column_letter sheet_name = self._get_sheet_name(sheet_name) if sheet_name in self.sheets: wks = self.sheets[sheet_name] else: wks = self.book.create_sheet() wks.title = sheet_name self.sheets[sheet_name] = wks for cell in cells: colletter = get_column_letter(startcol + cell.col + 1) xcell = wks.cell("%s%s" % (colletter, startrow + cell.row + 1)) xcell.value = _conv_value(cell.val) style_kwargs = {} # Apply format codes before cell.style to allow override if isinstance(cell.val, datetime.datetime): style_kwargs.update(self._convert_to_style_kwargs({ 'number_format':{'format_code': self.datetime_format}})) elif isinstance(cell.val, datetime.date): style_kwargs.update(self._convert_to_style_kwargs({ 'number_format':{'format_code': self.date_format}})) if cell.style: style_kwargs.update(self._convert_to_style_kwargs(cell.style)) if style_kwargs: xcell.style = xcell.style.copy(**style_kwargs) if cell.mergestart is not None and cell.mergeend is not None: cletterstart = get_column_letter(startcol + cell.col + 1) cletterend = get_column_letter(startcol + cell.mergeend + 1) wks.merge_cells('%s%s:%s%s' % (cletterstart, startrow + cell.row + 1, cletterend, startrow + cell.mergestart + 1)) # Excel requires that the format of the first cell in a merged # range is repeated in the rest of the merged range. if style_kwargs: first_row = startrow + cell.row + 1 last_row = startrow + cell.mergestart + 1 first_col = startcol + cell.col + 1 last_col = startcol + cell.mergeend + 1 for row in range(first_row, last_row + 1): for col in range(first_col, last_col + 1): if row == first_row and col == first_col: # Ignore first cell. It is already handled. continue colletter = get_column_letter(col) xcell = wks.cell("%s%s" % (colletter, row)) xcell.style = xcell.style.copy(**style_kwargs)
def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): # Write the frame cells using openpyxl. from openpyxl.cell import get_column_letter sheet_name = self._get_sheet_name(sheet_name) if sheet_name in self.sheets: wks = self.sheets[sheet_name] else: wks = self.book.create_sheet() wks.title = sheet_name self.sheets[sheet_name] = wks for cell in cells: colletter = get_column_letter(startcol + cell.col + 1) xcell = wks.cell("%s%s" % (colletter, startrow + cell.row + 1)) xcell.value = _conv_value(cell.val) style = None if cell.style: style = self._convert_to_style(cell.style) for field in style.__fields__: xcell.style.__setattr__(field, style.__getattribute__(field)) if isinstance(cell.val, datetime.datetime): xcell.style.number_format.format_code = self.datetime_format elif isinstance(cell.val, datetime.date): xcell.style.number_format.format_code = self.date_format elif isinstance(cell.val, datetime.time): xcell.style.number_format.format_code = self.time_format if cell.mergestart is not None and cell.mergeend is not None: cletterstart = get_column_letter(startcol + cell.col + 1) cletterend = get_column_letter(startcol + cell.mergeend + 1) wks.merge_cells('%s%s:%s%s' % (cletterstart, startrow + cell.row + 1, cletterend, startrow + cell.mergestart + 1)) # Excel requires that the format of the first cell in a merged # range is repeated in the rest of the merged range. if style: first_row = startrow + cell.row + 1 last_row = startrow + cell.mergestart + 1 first_col = startcol + cell.col + 1 last_col = startcol + cell.mergeend + 1 for row in range(first_row, last_row + 1): for col in range(first_col, last_col + 1): if row == first_row and col == first_col: # Ignore first cell. It is already handled. continue colletter = get_column_letter(col) xcell = wks.cell("%s%s" % (colletter, row)) for field in style.__fields__: xcell.style.__setattr__( field, style.__getattribute__(field))
def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): # Write the frame cells using openpyxl. from openpyxl import styles sheet_name = self._get_sheet_name(sheet_name) _style_cache = {} if sheet_name in self.sheets: wks = self.sheets[sheet_name] else: wks = self.book.create_sheet() wks.title = sheet_name self.sheets[sheet_name] = wks for cell in cells: xcell = wks.cell(row=startrow + cell.row + 1, column=startcol + cell.col + 1) xcell.value = _conv_value(cell.val) style_kwargs = {} if cell.style: key = str(cell.style) style_kwargs = _style_cache.get(key) if style_kwargs is None: style_kwargs = self._convert_to_style_kwargs(cell.style) _style_cache[key] = style_kwargs if style_kwargs: for k, v in style_kwargs.items(): setattr(xcell, k, v) if cell.mergestart is not None and cell.mergeend is not None: wks.merge_cells( start_row=startrow + cell.row + 1, start_column=startcol + cell.col + 1, end_column=startcol + cell.mergeend + 1, end_row=startrow + cell.mergeend + 1, ) # When cells are merged only the top-left cell is preserved # The behaviour of the other cells in a merged range is undefined if style_kwargs: first_row = startrow + cell.row + 1 last_row = startrow + cell.mergestart + 1 first_col = startcol + cell.col + 1 last_col = startcol + cell.mergeend + 1 for row in range(first_row, last_row + 1): for col in range(first_col, last_col + 1): if row == first_row and col == first_col: # Ignore first cell. It is already handled. continue xcell = wks.cell(column=col, row=row) for k, v in style_kwargs.items(): setattr(xcell, k, v)
def test_where_unsafe(): # see gh-9731 s = Series(np.arange(10), dtype="int64") values = [2.5, 3.5, 4.5, 5.5] mask = s > 5 expected = Series(lrange(6) + values, dtype="float64") s[mask] = values assert_series_equal(s, expected) # see gh-3235 s = Series(np.arange(10), dtype='int64') mask = s < 5 s[mask] = lrange(2, 7) expected = Series(lrange(2, 7) + lrange(5, 10), dtype='int64') assert_series_equal(s, expected) assert s.dtype == expected.dtype s = Series(np.arange(10), dtype='int64') mask = s > 5 s[mask] = [0] * 4 expected = Series([0, 1, 2, 3, 4, 5] + [0] * 4, dtype='int64') assert_series_equal(s, expected) s = Series(np.arange(10)) mask = s > 5 def f(): s[mask] = [5, 4, 3, 2, 1] pytest.raises(ValueError, f) def f(): s[mask] = [0] * 5 pytest.raises(ValueError, f) # dtype changes s = Series([1, 2, 3, 4]) result = s.where(s > 2, np.nan) expected = Series([np.nan, np.nan, 3, 4]) assert_series_equal(result, expected) # GH 4667 # setting with None changes dtype s = Series(range(10)).astype(float) s[8] = None result = s[8] assert isna(result) s = Series(range(10)).astype(float) s[s > 8] = None result = s[isna(s)] expected = Series(np.nan, index=[9]) assert_series_equal(result, expected)
def test_index_type_coercion(self): with catch_warnings(record=True): simplefilter("ignore") # GH 11836 # if we have an index type and set it with something that looks # to numpy like the same, but is actually, not # (e.g. setting with a float or string '0') # then we need to coerce to object # integer indexes for s in [Series(range(5)), Series(range(5), index=range(1, 6))]: assert s.index.is_integer() for indexer in [lambda x: x.ix, lambda x: x.loc, lambda x: x]: s2 = s.copy() indexer(s2)[0.1] = 0 assert s2.index.is_floating() assert indexer(s2)[0.1] == 0 s2 = s.copy() indexer(s2)[0.0] = 0 exp = s.index if 0 not in s: exp = Index(s.index.tolist() + [0]) tm.assert_index_equal(s2.index, exp) s2 = s.copy() indexer(s2)['0'] = 0 assert s2.index.is_object() for s in [Series(range(5), index=np.arange(5.))]: assert s.index.is_floating() for idxr in [lambda x: x.ix, lambda x: x.loc, lambda x: x]: s2 = s.copy() idxr(s2)[0.1] = 0 assert s2.index.is_floating() assert idxr(s2)[0.1] == 0 s2 = s.copy() idxr(s2)[0.0] = 0 tm.assert_index_equal(s2.index, s.index) s2 = s.copy() idxr(s2)['0'] = 0 assert s2.index.is_object()
def test_range_in_series_indexing(self): # range can cause an indexing error # GH 11652 for x in [5, 999999, 1000000]: s = Series(index=range(x)) s.loc[range(1)] = 42 tm.assert_series_equal(s.loc[range(1)], Series(42.0, index=[0])) s.loc[range(2)] = 43 tm.assert_series_equal(s.loc[range(2)], Series(43.0, index=[0, 1]))
def test_metadata_propagation_indiv(self): # check that the metadata matches up on the resulting ops o = Series(range(3),range(3)) o.name = 'foo' o2 = Series(range(3),range(3)) o2.name = 'bar' result = o.T self.check_metadata(o,result)
def trellis(self, layers): """ Create a trellis structure for a list of layers. Each layer will be cloned with different data in to a two dimensional grid. Parameters: ----------- layers: a list of Layer objects Returns: -------- trellised_layers: Clones of each layer in the list arranged in a trellised latice """ trellised_layers = [] for layer in layers: data = layer.data if self.by[0] == '.': grouped = data.groupby(self.by[1]) elif self.by[1] == '.': grouped = data.groupby(self.by[0]) else: grouped = data.groupby(self.by) groups = list(grouped.groups.keys()) if self.by[0] == '.' or self.by[1] == '.': shingle1 = set([g for g in groups]) else: shingle1 = set([g[0] for g in groups]) shingle2 = set([g[1] for g in groups]) if self.by[0] == '.': self.rows = 1 self.cols = len(shingle1) elif self.by[1] == '.': self.rows = len(shingle1) self.cols = 1 else: self.rows = len(shingle1) self.cols = len(shingle2) trellised = [[None for _ in range(self.cols)] for _ in range(self.rows)] self.group_grid = [[None for _ in range( self.cols)] for _ in range(self.rows)] row = 0 col = 0 for group, data in grouped: new_layer = deepcopy(layer) new_layer.data = data trellised[row][col] = new_layer self.group_grid[row][col] = group col += 1 if col >= self.cols: col = 0 row += 1 trellised_layers.append(trellised) return trellised_layers
def test_operators_bitwise(self): # GH 9016: support bitwise op for integer types index = list('bca') s_tft = Series([True, False, True], index=index) s_fff = Series([False, False, False], index=index) s_tff = Series([True, False, False], index=index) s_empty = Series([]) # TODO: unused # s_0101 = Series([0, 1, 0, 1]) s_0123 = Series(range(4), dtype='int64') s_3333 = Series([3] * 4) s_4444 = Series([4] * 4) res = s_tft & s_empty expected = s_fff assert_series_equal(res, expected) res = s_tft | s_empty expected = s_tft assert_series_equal(res, expected) res = s_0123 & s_3333 expected = Series(range(4), dtype='int64') assert_series_equal(res, expected) res = s_0123 | s_4444 expected = Series(range(4, 8), dtype='int64') assert_series_equal(res, expected) s_a0b1c0 = Series([1], list('b')) res = s_tft & s_a0b1c0 expected = s_tff.reindex(list('abc')) assert_series_equal(res, expected) res = s_tft | s_a0b1c0 expected = s_tft.reindex(list('abc')) assert_series_equal(res, expected) n0 = 0 res = s_tft & n0 expected = s_fff assert_series_equal(res, expected) res = s_0123 & n0 expected = Series([0] * 4) assert_series_equal(res, expected) n1 = 1 res = s_tft & n1 expected = s_tft assert_series_equal(res, expected) res = s_0123 & n1 expected = Series([0, 1, 0, 1]) assert_series_equal(res, expected) s_1111 = Series([1] * 4, dtype='int8') res = s_0123 & s_1111 expected = Series([0, 1, 0, 1], dtype='int64') assert_series_equal(res, expected) res = s_0123.astype(np.int16) | s_1111.astype(np.int32) expected = Series([1, 1, 3, 3], dtype='int32') assert_series_equal(res, expected) with pytest.raises(TypeError): s_1111 & 'a' with pytest.raises(TypeError): s_1111 & ['a', 'b', 'c', 'd'] with pytest.raises(TypeError): s_0123 & np.NaN with pytest.raises(TypeError): s_0123 & 3.14 with pytest.raises(TypeError): s_0123 & [0.1, 4, 3.14, 2] # s_0123 will be all false now because of reindexing like s_tft if compat.PY3: # unable to sort incompatible object via .union. exp = Series([False] * 7, index=['b', 'c', 'a', 0, 1, 2, 3]) with tm.assert_produces_warning(RuntimeWarning): assert_series_equal(s_tft & s_0123, exp) else: exp = Series([False] * 7, index=[0, 1, 2, 3, 'a', 'b', 'c']) assert_series_equal(s_tft & s_0123, exp) # s_tft will be all false now because of reindexing like s_0123 if compat.PY3: # unable to sort incompatible object via .union. exp = Series([False] * 7, index=[0, 1, 2, 3, 'b', 'c', 'a']) with tm.assert_produces_warning(RuntimeWarning): assert_series_equal(s_0123 & s_tft, exp) else: exp = Series([False] * 7, index=[0, 1, 2, 3, 'a', 'b', 'c']) assert_series_equal(s_0123 & s_tft, exp) assert_series_equal(s_0123 & False, Series([False] * 4)) assert_series_equal(s_0123 ^ False, Series([False, True, True, True])) assert_series_equal(s_0123 & [False], Series([False] * 4)) assert_series_equal(s_0123 & (False), Series([False] * 4)) assert_series_equal(s_0123 & Series([False, np.NaN, False, False]), Series([False] * 4)) s_ftft = Series([False, True, False, True]) assert_series_equal(s_0123 & Series([0.1, 4, -3.14, 2]), s_ftft) s_abNd = Series(['a', 'b', np.NaN, 'd']) res = s_0123 & s_abNd expected = s_ftft assert_series_equal(res, expected)
def _take_multi(data, indexer, out): if not data.flags.c_contiguous: data = data.copy() for i in range(data.shape[0]): data[i].take(indexer, out=out[i])
def test_pairs(self): data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', '21dec2008', '11jan2009'], 'birthwt': [1766, 3301, 1454, 3139, 4133], 'id': [101, 102, 103, 104, 105], 'sex': ['Male', 'Female', 'Female', 'Female', 'Female'], 'visitdt1': ['11jan2009', '22dec2008', '04jan2009', '29dec2008', '20jan2009'], 'visitdt2': ['21jan2009', nan, '22jan2009', '31dec2008', '03feb2009'], 'visitdt3': ['05feb2009', nan, nan, '02jan2009', '15feb2009'], 'wt1': [1823, 3338, 1549, 3298, 4306], 'wt2': [2011.0, nan, 1892.0, 3338.0, 4575.0], 'wt3': [2293.0, nan, nan, 3377.0, 4805.0]} df = DataFrame(data) spec = {'visitdt': ['visitdt%d' % i for i in range(1, 4)], 'wt': ['wt%d' % i for i in range(1, 4)]} result = lreshape(df, spec) exp_data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', '21dec2008', '11jan2009', '08jan2009', '30dec2008', '21dec2008', '11jan2009', '08jan2009', '21dec2008', '11jan2009'], 'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 1454, 3139, 4133, 1766, 3139, 4133], 'id': [101, 102, 103, 104, 105, 101, 103, 104, 105, 101, 104, 105], 'sex': ['Male', 'Female', 'Female', 'Female', 'Female', 'Male', 'Female', 'Female', 'Female', 'Male', 'Female', 'Female'], 'visitdt': ['11jan2009', '22dec2008', '04jan2009', '29dec2008', '20jan2009', '21jan2009', '22jan2009', '31dec2008', '03feb2009', '05feb2009', '02jan2009', '15feb2009'], 'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, 1892.0, 3338.0, 4575.0, 2293.0, 3377.0, 4805.0]} exp = DataFrame(exp_data, columns=result.columns) tm.assert_frame_equal(result, exp) result = lreshape(df, spec, dropna=False) exp_data = {'birthdt': ['08jan2009', '20dec2008', '30dec2008', '21dec2008', '11jan2009', '08jan2009', '20dec2008', '30dec2008', '21dec2008', '11jan2009', '08jan2009', '20dec2008', '30dec2008', '21dec2008', '11jan2009'], 'birthwt': [1766, 3301, 1454, 3139, 4133, 1766, 3301, 1454, 3139, 4133, 1766, 3301, 1454, 3139, 4133], 'id': [101, 102, 103, 104, 105, 101, 102, 103, 104, 105, 101, 102, 103, 104, 105], 'sex': ['Male', 'Female', 'Female', 'Female', 'Female', 'Male', 'Female', 'Female', 'Female', 'Female', 'Male', 'Female', 'Female', 'Female', 'Female'], 'visitdt': ['11jan2009', '22dec2008', '04jan2009', '29dec2008', '20jan2009', '21jan2009', nan, '22jan2009', '31dec2008', '03feb2009', '05feb2009', nan, nan, '02jan2009', '15feb2009'], 'wt': [1823.0, 3338.0, 1549.0, 3298.0, 4306.0, 2011.0, nan, 1892.0, 3338.0, 4575.0, 2293.0, nan, nan, 3377.0, 4805.0]} exp = DataFrame(exp_data, columns=result.columns) tm.assert_frame_equal(result, exp) spec = {'visitdt': ['visitdt%d' % i for i in range(1, 3)], 'wt': ['wt%d' % i for i in range(1, 4)]} self.assertRaises(ValueError, lreshape, df, spec)
def test_date(self): import datetime dates = [datetime.date(2012, 1, x) for x in range(1, 20)] index = Index(dates) self.assertEqual(index.inferred_type, 'date')
def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All'): index = _convert_by(index) columns = _convert_by(columns) if isinstance(aggfunc, list): pieces = [] keys = [] for func in aggfunc: table = pivot_table(data, values=values, index=index, columns=columns, fill_value=fill_value, aggfunc=func, margins=margins, margins_name=margins_name) pieces.append(table) keys.append(func.__name__) return concat(pieces, keys=keys, axis=1) keys = index + columns values_passed = values is not None if values_passed: if is_list_like(values): values_multi = True values = list(values) else: values_multi = False values = [values] # GH14938 Make sure value labels are in data for i in values: if i not in data: raise KeyError(i) to_filter = [] for x in keys + values: if isinstance(x, Grouper): x = x.key try: if x in data: to_filter.append(x) except TypeError: pass if len(to_filter) < len(data.columns): data = data[to_filter] else: values = data.columns for key in keys: try: values = values.drop(key) except (TypeError, ValueError): pass values = list(values) grouped = data.groupby(keys) agged = grouped.agg(aggfunc) table = agged if table.index.nlevels > 1: # Related GH #17123 # If index_names are integers, determine whether the integers refer # to the level position or name. index_names = agged.index.names[:len(index)] to_unstack = [] for i in range(len(index), len(keys)): name = agged.index.names[i] if name is None or name in index_names: to_unstack.append(i) else: to_unstack.append(name) table = agged.unstack(to_unstack) if not dropna: from pandas import MultiIndex try: m = MultiIndex.from_arrays(cartesian_product(table.index.levels), names=table.index.names) table = table.reindex_axis(m, axis=0) except AttributeError: pass # it's a single level try: m = MultiIndex.from_arrays(cartesian_product(table.columns.levels), names=table.columns.names) table = table.reindex_axis(m, axis=1) except AttributeError: pass # it's a single level or a series if isinstance(table, ABCDataFrame): table = table.sort_index(axis=1) if fill_value is not None: table = table.fillna(value=fill_value, downcast='infer') if margins: if dropna: data = data[data.notna().all(axis=1)] table = _add_margins(table, data, values, rows=index, cols=columns, aggfunc=aggfunc, margins_name=margins_name, fill_value=fill_value) # discard the top level if values_passed and not values_multi and not table.empty and \ (table.columns.nlevels > 1): table = table[values[0]] if len(index) == 0 and len(columns) > 0: table = table.T # GH 15193 Make sure empty columns are removed if dropna=True if isinstance(table, ABCDataFrame) and dropna: table = table.dropna(how='all', axis=1) return table
def format(self, formatter, subset=None): """ Format the text display value of cells. .. versionadded:: 0.18.0 Parameters ---------- formatter: str, callable, or dict subset: IndexSlice An argument to ``DataFrame.loc`` that restricts which elements ``formatter`` is applied to. Returns ------- self : Styler Notes ----- ``formatter`` is either an ``a`` or a dict ``{column name: a}`` where ``a`` is one of - str: this will be wrapped in: ``a.format(x)`` - callable: called with the value of an individual cell The default display value for numeric values is the "general" (``g``) format with ``pd.options.display.precision`` precision. Examples -------- >>> df = pd.DataFrame(np.random.randn(4, 2), columns=['a', 'b']) >>> df.style.format("{:.2%}") >>> df['c'] = ['a', 'b', 'c', 'd'] >>> df.style.format({'C': str.upper}) """ if subset is None: row_locs = range(len(self.data)) col_locs = range(len(self.data.columns)) else: subset = _non_reducing_slice(subset) if len(subset) == 1: subset = subset, self.data.columns sub_df = self.data.loc[subset] row_locs = self.data.index.get_indexer_for(sub_df.index) col_locs = self.data.columns.get_indexer_for(sub_df.columns) if isinstance(formatter, MutableMapping): for col, col_formatter in formatter.items(): # formatter must be callable, so '{}' are converted to lambdas col_formatter = _maybe_wrap_formatter(col_formatter) col_num = self.data.columns.get_indexer_for([col])[0] for row_num in row_locs: self._display_funcs[(row_num, col_num)] = col_formatter else: # single scalar to format all cells with locs = product(*(row_locs, col_locs)) for i, j in locs: formatter = _maybe_wrap_formatter(formatter) self._display_funcs[(i, j)] = formatter return self
def _subplots(naxes=None, sharex=False, sharey=False, squeeze=True, subplot_kw=None, ax=None, layout=None, layout_type='box', **fig_kw): """Create a figure with a set of subplots already made. This utility wrapper makes it convenient to create common layouts of subplots, including the enclosing figure object, in a single call. Keyword arguments: naxes : int Number of required axes. Exceeded axes are set invisible. Default is nrows * ncols. sharex : bool If True, the X axis will be shared amongst all subplots. sharey : bool If True, the Y axis will be shared amongst all subplots. squeeze : bool If True, extra dimensions are squeezed out from the returned axis object: - if only one subplot is constructed (nrows=ncols=1), the resulting single Axis object is returned as a scalar. - for Nx1 or 1xN subplots, the returned object is a 1-d numpy object array of Axis objects are returned as numpy 1-d arrays. - for NxM subplots with N>1 and M>1 are returned as a 2d array. If False, no squeezing at all is done: the returned axis object is always a 2-d array containing Axis instances, even if it ends up being 1x1. subplot_kw : dict Dict with keywords passed to the add_subplot() call used to create each subplots. ax : Matplotlib axis object, optional layout : tuple Number of rows and columns of the subplot grid. If not specified, calculated from naxes and layout_type layout_type : {'box', 'horziontal', 'vertical'}, default 'box' Specify how to layout the subplot grid. fig_kw : Other keyword arguments to be passed to the figure() call. Note that all keywords not recognized above will be automatically included here. Returns: fig, ax : tuple - fig is the Matplotlib Figure object - ax can be either a single axis object or an array of axis objects if more than one subplot was created. The dimensions of the resulting array can be controlled with the squeeze keyword, see above. **Examples:** x = np.linspace(0, 2*np.pi, 400) y = np.sin(x**2) # Just a figure and one subplot f, ax = plt.subplots() ax.plot(x, y) ax.set_title('Simple plot') # Two subplots, unpack the output array immediately f, (ax1, ax2) = plt.subplots(1, 2, sharey=True) ax1.plot(x, y) ax1.set_title('Sharing Y axis') ax2.scatter(x, y) # Four polar axes plt.subplots(2, 2, subplot_kw=dict(polar=True)) """ import matplotlib.pyplot as plt if subplot_kw is None: subplot_kw = {} if ax is None: fig = plt.figure(**fig_kw) else: if is_list_like(ax): ax = _flatten(ax) if layout is not None: warnings.warn( "When passing multiple axes, layout keyword is " "ignored", UserWarning) if sharex or sharey: warnings.warn( "When passing multiple axes, sharex and sharey " "are ignored. These settings must be specified " "when creating axes", UserWarning, stacklevel=4) if len(ax) == naxes: fig = ax[0].get_figure() return fig, ax else: raise ValueError("The number of passed axes must be {0}, the " "same as the output plot".format(naxes)) fig = ax.get_figure() # if ax is passed and a number of subplots is 1, return ax as it is if naxes == 1: if squeeze: return fig, ax else: return fig, _flatten(ax) else: warnings.warn( "To output multiple subplots, the figure containing " "the passed axes is being cleared", UserWarning, stacklevel=4) fig.clear() nrows, ncols = _get_layout(naxes, layout=layout, layout_type=layout_type) nplots = nrows * ncols # Create empty object array to hold all axes. It's easiest to make it 1-d # so we can just append subplots upon creation, and then axarr = np.empty(nplots, dtype=object) # Create first subplot separately, so we can share it if requested ax0 = fig.add_subplot(nrows, ncols, 1, **subplot_kw) if sharex: subplot_kw['sharex'] = ax0 if sharey: subplot_kw['sharey'] = ax0 axarr[0] = ax0 # Note off-by-one counting because add_subplot uses the MATLAB 1-based # convention. for i in range(1, nplots): kwds = subplot_kw.copy() # Set sharex and sharey to None for blank/dummy axes, these can # interfere with proper axis limits on the visible axes if # they share axes e.g. issue #7528 if i >= naxes: kwds['sharex'] = None kwds['sharey'] = None ax = fig.add_subplot(nrows, ncols, i + 1, **kwds) axarr[i] = ax if naxes != nplots: for ax in axarr[naxes:]: ax.set_visible(False) _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey) if squeeze: # Reshape the array to have the final desired dimension (nrow,ncol), # though discarding unneeded dimensions that equal 1. If we only have # one subplot, just return it instead of a 1-element array. if nplots == 1: axes = axarr[0] else: axes = axarr.reshape(nrows, ncols).squeeze() else: # returned axis array will be always 2-d, even if nrows=ncols=1 axes = axarr.reshape(nrows, ncols) return fig, axes
def test_frame_from_json_to_json(self): def _check_orient(df, orient, dtype=None, numpy=False, convert_axes=True, check_dtype=True, raise_ok=None, sort=None, check_index_type=True, check_column_type=True, check_numpy_dtype=False): if sort is not None: df = df.sort_values(sort) else: df = df.sort_index() # if we are not unique, then check that we are raising ValueError # for the appropriate orients if not df.index.is_unique and orient in ['index', 'columns']: self.assertRaises(ValueError, lambda: df.to_json(orient=orient)) return if (not df.columns.is_unique and orient in ['index', 'columns', 'records']): self.assertRaises(ValueError, lambda: df.to_json(orient=orient)) return dfjson = df.to_json(orient=orient) try: unser = read_json(dfjson, orient=orient, dtype=dtype, numpy=numpy, convert_axes=convert_axes) except Exception as detail: if raise_ok is not None: if isinstance(detail, raise_ok): return raise if sort is not None and sort in unser.columns: unser = unser.sort_values(sort) else: unser = unser.sort_index() if dtype is False: check_dtype = False if not convert_axes and df.index.dtype.type == np.datetime64: unser.index = DatetimeIndex( unser.index.values.astype('i8') * 1e6) if orient == "records": # index is not captured in this orientation assert_almost_equal(df.values, unser.values, check_dtype=check_numpy_dtype) self.assert_index_equal(df.columns, unser.columns, exact=check_column_type) elif orient == "values": # index and cols are not captured in this orientation if numpy is True and df.shape == (0, 0): assert unser.shape[0] == 0 else: assert_almost_equal(df.values, unser.values, check_dtype=check_numpy_dtype) elif orient == "split": # index and col labels might not be strings unser.index = [str(i) for i in unser.index] unser.columns = [str(i) for i in unser.columns] if sort is None: unser = unser.sort_index() assert_almost_equal(df.values, unser.values, check_dtype=check_numpy_dtype) else: if convert_axes: assert_frame_equal(df, unser, check_dtype=check_dtype, check_index_type=check_index_type, check_column_type=check_column_type) else: assert_frame_equal(df, unser, check_less_precise=False, check_dtype=check_dtype) def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None, sort=None, check_index_type=True, check_column_type=True): # numpy=False if convert_axes: _check_orient(df, "columns", dtype=dtype, sort=sort, check_index_type=False, check_column_type=False) _check_orient(df, "records", dtype=dtype, sort=sort, check_index_type=False, check_column_type=False) _check_orient(df, "split", dtype=dtype, sort=sort, check_index_type=False, check_column_type=False) _check_orient(df, "index", dtype=dtype, sort=sort, check_index_type=False, check_column_type=False) _check_orient(df, "values", dtype=dtype, sort=sort, check_index_type=False, check_column_type=False) _check_orient(df, "columns", dtype=dtype, convert_axes=False, sort=sort) _check_orient(df, "records", dtype=dtype, convert_axes=False, sort=sort) _check_orient(df, "split", dtype=dtype, convert_axes=False, sort=sort) _check_orient(df, "index", dtype=dtype, convert_axes=False, sort=sort) _check_orient(df, "values", dtype=dtype, convert_axes=False, sort=sort) # numpy=True and raise_ok might be not None, so ignore the error if convert_axes: _check_orient(df, "columns", dtype=dtype, numpy=True, raise_ok=raise_ok, sort=sort, check_index_type=False, check_column_type=False) _check_orient(df, "records", dtype=dtype, numpy=True, raise_ok=raise_ok, sort=sort, check_index_type=False, check_column_type=False) _check_orient(df, "split", dtype=dtype, numpy=True, raise_ok=raise_ok, sort=sort, check_index_type=False, check_column_type=False) _check_orient(df, "index", dtype=dtype, numpy=True, raise_ok=raise_ok, sort=sort, check_index_type=False, check_column_type=False) _check_orient(df, "values", dtype=dtype, numpy=True, raise_ok=raise_ok, sort=sort, check_index_type=False, check_column_type=False) _check_orient(df, "columns", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok, sort=sort) _check_orient(df, "records", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok, sort=sort) _check_orient(df, "split", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok, sort=sort) _check_orient(df, "index", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok, sort=sort) _check_orient(df, "values", dtype=dtype, numpy=True, convert_axes=False, raise_ok=raise_ok, sort=sort) # basic _check_all_orients(self.frame) self.assertEqual(self.frame.to_json(), self.frame.to_json(orient="columns")) _check_all_orients(self.intframe, dtype=self.intframe.values.dtype) _check_all_orients(self.intframe, dtype=False) # big one # index and columns are strings as all unserialised JSON object keys # are assumed to be strings biggie = DataFrame(np.zeros((200, 4)), columns=[str(i) for i in range(4)], index=[str(i) for i in range(200)]) _check_all_orients(biggie, dtype=False, convert_axes=False) # dtypes _check_all_orients(DataFrame(biggie, dtype=np.float64), dtype=np.float64, convert_axes=False) _check_all_orients(DataFrame(biggie, dtype=np.int), dtype=np.int, convert_axes=False) _check_all_orients(DataFrame(biggie, dtype='U3'), dtype='U3', convert_axes=False, raise_ok=ValueError) # categorical _check_all_orients(self.categorical, sort='sort', raise_ok=ValueError) # empty _check_all_orients(self.empty_frame, check_index_type=False, check_column_type=False) # time series data _check_all_orients(self.tsframe) # mixed data index = pd.Index(['a', 'b', 'c', 'd', 'e']) data = { 'A': [0., 1., 2., 3., 4.], 'B': [0., 1., 0., 1., 0.], 'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'], 'D': [True, False, True, False, True] } df = DataFrame(data=data, index=index) _check_orient(df, "split", check_dtype=False) _check_orient(df, "records", check_dtype=False) _check_orient(df, "values", check_dtype=False) _check_orient(df, "columns", check_dtype=False) # index oriented is problematic as it is read back in in a transposed # state, so the columns are interpreted as having mixed data and # given object dtypes. # force everything to have object dtype beforehand _check_orient(df.transpose().transpose(), "index", dtype=False)
def test_float_index_at_iat(self): s = Series([1, 2, 3], index=[0.1, 0.2, 0.3]) for el, item in s.iteritems(): assert s.at[el] == item for i in range(len(s)): assert s.iat[i] == i + 1
def test_xticklabels(self): # GH11529 s = Series(np.arange(10), index=['P%02d' % i for i in range(10)]) ax = s.plot(xticks=[0, 3, 5, 9]) exp = ['P%02d' % i for i in [0, 3, 5, 9]] self._check_text_labels(ax.get_xticklabels(), exp)
def melt(frame, id_vars=None, value_vars=None, var_name=None, value_name='value', col_level=None): """ "Unpivots" a DataFrame from wide format to long format, optionally leaving identifier variables set. This function is useful to massage a DataFrame into a format where one or more columns are identifier variables (`id_vars`), while all other columns, considered measured variables (`value_vars`), are "unpivoted" to the row axis, leaving just two non-identifier columns, 'variable' and 'value'. Parameters ---------- frame : DataFrame id_vars : tuple, list, or ndarray, optional Column(s) to use as identifier variables. value_vars : tuple, list, or ndarray, optional Column(s) to unpivot. If not specified, uses all columns that are not set as `id_vars`. var_name : scalar Name to use for the 'variable' column. If None it uses ``frame.columns.name`` or 'variable'. value_name : scalar, default 'value' Name to use for the 'value' column. col_level : int or string, optional If columns are a MultiIndex then use this level to melt. See also -------- pivot_table DataFrame.pivot Examples -------- >>> import pandas as pd >>> df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'}, ... 'B': {0: 1, 1: 3, 2: 5}, ... 'C': {0: 2, 1: 4, 2: 6}}) >>> df A B C 0 a 1 2 1 b 3 4 2 c 5 6 >>> pd.melt(df, id_vars=['A'], value_vars=['B']) A variable value 0 a B 1 1 b B 3 2 c B 5 >>> pd.melt(df, id_vars=['A'], value_vars=['B', 'C']) A variable value 0 a B 1 1 b B 3 2 c B 5 3 a C 2 4 b C 4 5 c C 6 The names of 'variable' and 'value' columns can be customized: >>> pd.melt(df, id_vars=['A'], value_vars=['B'], ... var_name='myVarname', value_name='myValname') A myVarname myValname 0 a B 1 1 b B 3 2 c B 5 If you have multi-index columns: >>> df.columns = [list('ABC'), list('DEF')] >>> df A B C D E F 0 a 1 2 1 b 3 4 2 c 5 6 >>> pd.melt(df, col_level=0, id_vars=['A'], value_vars=['B']) A variable value 0 a B 1 1 b B 3 2 c B 5 >>> pd.melt(df, id_vars=[('A', 'D')], value_vars=[('B', 'E')]) (A, D) variable_0 variable_1 value 0 a B E 1 1 b B E 3 2 c B E 5 """ # TODO: what about the existing index? if id_vars is not None: if not isinstance(id_vars, (tuple, list, np.ndarray)): id_vars = [id_vars] else: id_vars = list(id_vars) else: id_vars = [] if value_vars is not None: if not isinstance(value_vars, (tuple, list, np.ndarray)): value_vars = [value_vars] frame = frame.ix[:, id_vars + value_vars] else: frame = frame.copy() if col_level is not None: # allow list or other? # frame is a copy frame.columns = frame.columns.get_level_values(col_level) if var_name is None: if isinstance(frame.columns, MultiIndex): if len(frame.columns.names) == len(set(frame.columns.names)): var_name = frame.columns.names else: var_name = ['variable_%s' % i for i in range(len(frame.columns.names))] else: var_name = [frame.columns.name if frame.columns.name is not None else 'variable'] if isinstance(var_name, compat.string_types): var_name = [var_name] N, K = frame.shape K -= len(id_vars) mdata = {} for col in id_vars: mdata[col] = np.tile(frame.pop(col).values, K) mcolumns = id_vars + var_name + [value_name] mdata[value_name] = frame.values.ravel('F') for i, col in enumerate(var_name): # asanyarray will keep the columns as an Index mdata[col] = np.asanyarray(frame.columns.get_level_values(i)).repeat(N) return DataFrame(mdata, columns=mcolumns)
def _stack_multi_columns(frame, level_num=-1, dropna=True): def _convert_level_number(level_num, columns): """ Logic for converting the level number to something we can safely pass to swaplevel: We generally want to convert the level number into a level name, except when columns do not have names, in which case we must leave as a level number """ if level_num in columns.names: return columns.names[level_num] else: if columns.names[level_num] is None: return level_num else: return columns.names[level_num] this = frame.copy() # this makes life much simpler if level_num != frame.columns.nlevels - 1: # roll levels to put selected level at end roll_columns = this.columns for i in range(level_num, frame.columns.nlevels - 1): # Need to check if the ints conflict with level names lev1 = _convert_level_number(i, roll_columns) lev2 = _convert_level_number(i + 1, roll_columns) roll_columns = roll_columns.swaplevel(lev1, lev2) this.columns = roll_columns if not this.columns.is_lexsorted(): # Workaround the edge case where 0 is one of the column names, # which interferes with trying to sort based on the first # level level_to_sort = _convert_level_number(0, this.columns) this = this.sortlevel(level_to_sort, axis=1) # tuple list excluding level for grouping columns if len(frame.columns.levels) > 2: tuples = list(zip(*[ lev.take(lab) for lev, lab in zip(this.columns.levels[:-1], this.columns.labels[:-1]) ])) unique_groups = [key for key, _ in itertools.groupby(tuples)] new_names = this.columns.names[:-1] new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) else: new_columns = unique_groups = this.columns.levels[0] # time to ravel the values new_data = {} level_vals = this.columns.levels[-1] level_labels = sorted(set(this.columns.labels[-1])) level_vals_used = level_vals[level_labels] levsize = len(level_labels) drop_cols = [] for key in unique_groups: loc = this.columns.get_loc(key) slice_len = loc.stop - loc.start # can make more efficient? if slice_len == 0: drop_cols.append(key) continue elif slice_len != levsize: chunk = this.ix[:, this.columns[loc]] chunk.columns = level_vals.take(chunk.columns.labels[-1]) value_slice = chunk.reindex(columns=level_vals_used).values else: if frame._is_mixed_type: value_slice = this.ix[:, this.columns[loc]].values else: value_slice = this.values[:, loc] new_data[key] = value_slice.ravel() if len(drop_cols) > 0: new_columns = new_columns.difference(drop_cols) N = len(this) if isinstance(this.index, MultiIndex): new_levels = list(this.index.levels) new_names = list(this.index.names) new_labels = [lab.repeat(levsize) for lab in this.index.labels] else: new_levels = [this.index] new_labels = [np.arange(N).repeat(levsize)] new_names = [this.index.name] # something better? new_levels.append(frame.columns.levels[level_num]) new_labels.append(np.tile(level_labels, N)) new_names.append(frame.columns.names[level_num]) new_index = MultiIndex(levels=new_levels, labels=new_labels, names=new_names, verify_integrity=False) result = DataFrame(new_data, index=new_index, columns=new_columns) # more efficient way to go about this? can do the whole masking biz but # will only save a small amount of time... if dropna: result = result.dropna(axis=0, how='all') return result
def test_week_of_month(self): days = ['MON', 'TUE', 'WED', 'THU', 'FRI', 'SAT', 'SUN'] for day in days: for i in range(1, 5): self._check_generated_range('1/1/2000', 'WOM-%d%s' % (i, day))
def create_cols(name): return ["%s%03d" % (name, i) for i in range(5)]
def _in_chunks(seq, size): """ Return sequence in 'chunks' of size defined by size """ return (seq[pos:pos + size] for pos in range(0, len(seq), size))
def _write_hierarchical_rows(self, fmt_values, indent): template = 'rowspan="{span}" valign="top"' truncate_h = self.fmt.truncate_h truncate_v = self.fmt.truncate_v frame = self.fmt.tr_frame ncols = len(frame.columns) nrows = len(frame) row_levels = self.frame.index.nlevels idx_values = frame.index.format(sparsify=False, adjoin=False, names=False) idx_values = lzip(*idx_values) if self.fmt.sparsify: # GH3547 sentinel = com.sentinel_factory() levels = frame.index.format(sparsify=sentinel, adjoin=False, names=False) level_lengths = get_level_lengths(levels, sentinel) inner_lvl = len(level_lengths) - 1 if truncate_v: # Insert ... row and adjust idx_values and # level_lengths to take this into account. ins_row = self.fmt.tr_row_num inserted = False for lnum, records in enumerate(level_lengths): rec_new = {} for tag, span in list(records.items()): if tag >= ins_row: rec_new[tag + 1] = span elif tag + span > ins_row: rec_new[tag] = span + 1 # GH 14882 - Make sure insertion done once if not inserted: dot_row = list(idx_values[ins_row - 1]) dot_row[-1] = u('...') idx_values.insert(ins_row, tuple(dot_row)) inserted = True else: dot_row = list(idx_values[ins_row]) dot_row[inner_lvl - lnum] = u('...') idx_values[ins_row] = tuple(dot_row) else: rec_new[tag] = span # If ins_row lies between tags, all cols idx cols # receive ... if tag + span == ins_row: rec_new[ins_row] = 1 if lnum == 0: idx_values.insert( ins_row, tuple([u('...')] * len(level_lengths))) # GH 14882 - Place ... in correct level elif inserted: dot_row = list(idx_values[ins_row]) dot_row[inner_lvl - lnum] = u('...') idx_values[ins_row] = tuple(dot_row) level_lengths[lnum] = rec_new level_lengths[inner_lvl][ins_row] = 1 for ix_col in range(len(fmt_values)): fmt_values[ix_col].insert(ins_row, '...') nrows += 1 for i in range(nrows): row = [] tags = {} sparse_offset = 0 j = 0 for records, v in zip(level_lengths, idx_values[i]): if i in records: if records[i] > 1: tags[j] = template.format(span=records[i]) else: sparse_offset += 1 continue j += 1 row.append(v) row.extend(fmt_values[j][i] for j in range(ncols)) if truncate_h: row.insert( row_levels - sparse_offset + self.fmt.tr_col_num, '...') self.write_tr(row, indent, self.indent_delta, tags=tags, nindex_levels=len(levels) - sparse_offset) else: for i in range(len(frame)): idx_values = list( zip(*frame.index.format( sparsify=False, adjoin=False, names=False))) row = [] row.extend(idx_values[i]) row.extend(fmt_values[j][i] for j in range(ncols)) if truncate_h: row.insert(row_levels + self.fmt.tr_col_num, '...') self.write_tr(row, indent, self.indent_delta, tags=None, nindex_levels=frame.index.nlevels)
def test_getitem_duplicates_multiindex(self): # GH 5725 the 'A' happens to be a valid Timestamp so the doesn't raise # the appropriate error, only in PY3 of course! index = MultiIndex(levels=[['D', 'B', 'C'], [0, 26, 27, 37, 57, 67, 75, 82]], codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], names=['tag', 'day']) arr = np.random.randn(len(index), 1) df = DataFrame(arr, index=index, columns=['val']) result = df.val['D'] expected = Series(arr.ravel()[0:3], name='val', index=Index([26, 37, 57], name='day')) tm.assert_series_equal(result, expected) def f(): df.val['A'] pytest.raises(KeyError, f) def f(): df.val['X'] pytest.raises(KeyError, f) # A is treated as a special Timestamp index = MultiIndex(levels=[['A', 'B', 'C'], [0, 26, 27, 37, 57, 67, 75, 82]], codes=[[0, 0, 0, 1, 2, 2, 2, 2, 2, 2], [1, 3, 4, 6, 0, 2, 2, 3, 5, 7]], names=['tag', 'day']) df = DataFrame(arr, index=index, columns=['val']) result = df.val['A'] expected = Series(arr.ravel()[0:3], name='val', index=Index([26, 37, 57], name='day')) tm.assert_series_equal(result, expected) def f(): df.val['X'] pytest.raises(KeyError, f) # GH 7866 # multi-index slicing with missing indexers idx = MultiIndex.from_product([['A', 'B', 'C'], ['foo', 'bar', 'baz']], names=['one', 'two']) s = Series(np.arange(9, dtype='int64'), index=idx).sort_index() exp_idx = MultiIndex.from_product([['A'], ['foo', 'bar', 'baz']], names=['one', 'two']) expected = Series(np.arange(3, dtype='int64'), index=exp_idx).sort_index() result = s.loc[['A']] tm.assert_series_equal(result, expected) result = s.loc[['A', 'D']] tm.assert_series_equal(result, expected) # not any values found pytest.raises(KeyError, lambda: s.loc[['D']]) # empty ok result = s.loc[[]] expected = s.iloc[[]] tm.assert_series_equal(result, expected) idx = pd.IndexSlice expected = Series( [0, 3, 6], index=MultiIndex.from_product([['A', 'B', 'C'], ['foo']], names=['one', 'two'])).sort_index() result = s.loc[idx[:, ['foo']]] tm.assert_series_equal(result, expected) result = s.loc[idx[:, ['foo', 'bah']]] tm.assert_series_equal(result, expected) # GH 8737 # empty indexer multi_index = MultiIndex.from_product((['foo', 'bar', 'baz'], ['alpha', 'beta'])) df = DataFrame(np.random.randn(5, 6), index=range(5), columns=multi_index) df = df.sort_index(level=0, axis=1) expected = DataFrame(index=range(5), columns=multi_index.reindex([])[0]) result1 = df.loc[:, ([], slice(None))] result2 = df.loc[:, (['foo'], [])] tm.assert_frame_equal(result1, expected) tm.assert_frame_equal(result2, expected) # regression from < 0.14.0 # GH 7914 df = DataFrame([[np.mean, np.median], ['mean', 'median']], columns=MultiIndex.from_tuples([('functs', 'mean'), ('functs', 'median')]), index=['function', 'name']) result = df.loc['function', ('functs', 'mean')] assert result == np.mean
def test_object_refcount_bug(self): lst = ['A', 'B', 'C', 'D', 'E'] for i in range(1000): len(algos.unique(lst))
def _translate(self): """ Convert the DataFrame in `self.data` and the attrs from `_build_styles` into a dictionary of {head, body, uuid, cellstyle} """ table_styles = self.table_styles or [] caption = self.caption ctx = self.ctx precision = self.precision uuid = self.uuid or str(uuid1()).replace("-", "_") ROW_HEADING_CLASS = "row_heading" COL_HEADING_CLASS = "col_heading" INDEX_NAME_CLASS = "index_name" DATA_CLASS = "data" BLANK_CLASS = "blank" BLANK_VALUE = "" def format_attr(pair): return "{key}={value}".format(**pair) # for sparsifying a MultiIndex idx_lengths = _get_level_lengths(self.index) col_lengths = _get_level_lengths(self.columns) cell_context = dict() n_rlvls = self.data.index.nlevels n_clvls = self.data.columns.nlevels rlabels = self.data.index.tolist() clabels = self.data.columns.tolist() if n_rlvls == 1: rlabels = [[x] for x in rlabels] if n_clvls == 1: clabels = [[x] for x in clabels] clabels = list(zip(*clabels)) cellstyle = [] head = [] for r in range(n_clvls): # Blank for Index columns... row_es = [{"type": "th", "value": BLANK_VALUE, "display_value": BLANK_VALUE, "is_visible": True, "class": " ".join([BLANK_CLASS])}] * (n_rlvls - 1) # ... except maybe the last for columns.names name = self.data.columns.names[r] cs = [BLANK_CLASS if name is None else INDEX_NAME_CLASS, "level%s" % r] name = BLANK_VALUE if name is None else name row_es.append({"type": "th", "value": name, "display_value": name, "class": " ".join(cs), "is_visible": True}) for c, value in enumerate(clabels[r]): cs = [COL_HEADING_CLASS, "level%s" % r, "col%s" % c] cs.extend(cell_context.get( "col_headings", {}).get(r, {}).get(c, [])) es = { "type": "th", "value": value, "display_value": value, "class": " ".join(cs), "is_visible": _is_visible(c, r, col_lengths), } colspan = col_lengths.get((r, c), 0) if colspan > 1: es["attributes"] = [ format_attr({"key": "colspan", "value": colspan}) ] row_es.append(es) head.append(row_es) if self.data.index.names and not all(x is None for x in self.data.index.names): index_header_row = [] for c, name in enumerate(self.data.index.names): cs = [INDEX_NAME_CLASS, "level%s" % c] name = '' if name is None else name index_header_row.append({"type": "th", "value": name, "class": " ".join(cs)}) index_header_row.extend( [{"type": "th", "value": BLANK_VALUE, "class": " ".join([BLANK_CLASS]) }] * len(clabels[0])) head.append(index_header_row) body = [] for r, idx in enumerate(self.data.index): row_es = [] for c, value in enumerate(rlabels[r]): es = { "type": "th", "is_visible": _is_visible(r, c, idx_lengths), "value": value, "display_value": value, "class": " ".join([ROW_HEADING_CLASS, "level%s" % c, "row%s" % r]), } rowspan = idx_lengths.get((c, r), 0) if rowspan > 1: es["attributes"] = [ format_attr({"key": "rowspan", "value": rowspan}) ] row_es.append(es) for c, col in enumerate(self.data.columns): cs = [DATA_CLASS, "row%s" % r, "col%s" % c] cs.extend(cell_context.get("data", {}).get(r, {}).get(c, [])) formatter = self._display_funcs[(r, c)] value = self.data.iloc[r, c] row_es.append({ "type": "td", "value": value, "class": " ".join(cs), "id": "_".join(cs[1:]), "display_value": formatter(value) }) props = [] for x in ctx[r, c]: # have to handle empty styles like [''] if x.count(":"): props.append(x.split(":")) else: props.append(['', '']) cellstyle.append({'props': props, 'selector': "row%s_col%s" % (r, c)}) body.append(row_es) return dict(head=head, cellstyle=cellstyle, body=body, uuid=uuid, precision=precision, table_styles=table_styles, caption=caption, table_attributes=self.table_attributes)
class TestDataFrameOperators(TestData): def test_operators(self): garbage = random.random(4) colSeries = Series(garbage, index=np.array(self.frame.columns)) idSum = self.frame + self.frame seriesSum = self.frame + colSeries for col, series in compat.iteritems(idSum): for idx, val in compat.iteritems(series): origVal = self.frame[col][idx] * 2 if not np.isnan(val): assert val == origVal else: assert np.isnan(origVal) for col, series in compat.iteritems(seriesSum): for idx, val in compat.iteritems(series): origVal = self.frame[col][idx] + colSeries[col] if not np.isnan(val): assert val == origVal else: assert np.isnan(origVal) added = self.frame2 + self.frame2 expected = self.frame2 * 2 assert_frame_equal(added, expected) df = DataFrame({'a': ['a', None, 'b']}) assert_frame_equal(df + df, DataFrame({'a': ['aa', np.nan, 'bb']})) # Test for issue #10181 for dtype in ('float', 'int64'): frames = [ DataFrame(dtype=dtype), DataFrame(columns=['A'], dtype=dtype), DataFrame(index=[0], dtype=dtype), ] for df in frames: assert (df + df).equals(df) assert_frame_equal(df + df, df) def test_ops_np_scalar(self): vals, xs = np.random.rand(5, 3), [nan, 7, -23, 2.718, -3.14, np.inf] f = lambda x: DataFrame(x, index=list('ABCDE'), columns=['jim', 'joe', 'jolie']) df = f(vals) for x in xs: assert_frame_equal(df / np.array(x), f(vals / x)) assert_frame_equal(np.array(x) * df, f(vals * x)) assert_frame_equal(df + np.array(x), f(vals + x)) assert_frame_equal(np.array(x) - df, f(x - vals)) def test_operators_boolean(self): # GH 5808 # empty frames, non-mixed dtype result = DataFrame(index=[1]) & DataFrame(index=[1]) assert_frame_equal(result, DataFrame(index=[1])) result = DataFrame(index=[1]) | DataFrame(index=[1]) assert_frame_equal(result, DataFrame(index=[1])) result = DataFrame(index=[1]) & DataFrame(index=[1, 2]) assert_frame_equal(result, DataFrame(index=[1, 2])) result = DataFrame(index=[1], columns=['A']) & DataFrame( index=[1], columns=['A']) assert_frame_equal(result, DataFrame(index=[1], columns=['A'])) result = DataFrame(True, index=[1], columns=['A']) & DataFrame( True, index=[1], columns=['A']) assert_frame_equal(result, DataFrame(True, index=[1], columns=['A'])) result = DataFrame(True, index=[1], columns=['A']) | DataFrame( True, index=[1], columns=['A']) assert_frame_equal(result, DataFrame(True, index=[1], columns=['A'])) # boolean ops result = DataFrame(1, index=[1], columns=['A']) | DataFrame( True, index=[1], columns=['A']) assert_frame_equal(result, DataFrame(1, index=[1], columns=['A'])) def f(): DataFrame(1.0, index=[1], columns=['A']) | DataFrame( True, index=[1], columns=['A']) pytest.raises(TypeError, f) def f(): DataFrame('foo', index=[1], columns=['A']) | DataFrame( True, index=[1], columns=['A']) pytest.raises(TypeError, f) def test_operators_none_as_na(self): df = DataFrame({"col1": [2, 5.0, 123, None], "col2": [1, 2, 3, 4]}, dtype=object) ops = [operator.add, operator.sub, operator.mul, operator.truediv] # since filling converts dtypes from object, changed expected to be # object for op in ops: filled = df.fillna(np.nan) result = op(df, 3) expected = op(filled, 3).astype(object) expected[com.isna(expected)] = None assert_frame_equal(result, expected) result = op(df, df) expected = op(filled, filled).astype(object) expected[com.isna(expected)] = None assert_frame_equal(result, expected) result = op(df, df.fillna(7)) assert_frame_equal(result, expected) result = op(df.fillna(7), df) assert_frame_equal(result, expected, check_dtype=False) def test_comparison_invalid(self): def check(df, df2): for (x, y) in [(df, df2), (df2, df)]: pytest.raises(TypeError, lambda: x == y) pytest.raises(TypeError, lambda: x != y) pytest.raises(TypeError, lambda: x >= y) pytest.raises(TypeError, lambda: x > y) pytest.raises(TypeError, lambda: x < y) pytest.raises(TypeError, lambda: x <= y) # GH4968 # invalid date/int comparisons df = DataFrame(np.random.randint(10, size=(10, 1)), columns=['a']) df['dates'] = date_range('20010101', periods=len(df)) df2 = df.copy() df2['dates'] = df['a'] check(df, df2) df = DataFrame(np.random.randint(10, size=(10, 2)), columns=['a', 'b']) df2 = DataFrame({'a': date_range('20010101', periods=len( df)), 'b': date_range('20100101', periods=len(df))}) check(df, df2) def test_timestamp_compare(self): # make sure we can compare Timestamps on the right AND left hand side # GH4982 df = DataFrame({'dates1': date_range('20010101', periods=10), 'dates2': date_range('20010102', periods=10), 'intcol': np.random.randint(1000000000, size=10), 'floatcol': np.random.randn(10), 'stringcol': list(tm.rands(10))}) df.loc[np.random.rand(len(df)) > 0.5, 'dates2'] = pd.NaT ops = {'gt': 'lt', 'lt': 'gt', 'ge': 'le', 'le': 'ge', 'eq': 'eq', 'ne': 'ne'} for left, right in ops.items(): left_f = getattr(operator, left) right_f = getattr(operator, right) # no nats expected = left_f(df, Timestamp('20010109')) result = right_f(Timestamp('20010109'), df) assert_frame_equal(result, expected) # nats expected = left_f(df, Timestamp('nat')) result = right_f(Timestamp('nat'), df) assert_frame_equal(result, expected) def test_modulo(self): # GH3590, modulo as ints p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) # this is technically wrong as the integer portion is coerced to float # ### expected = DataFrame({'first': Series([0, 0, 0, 0], dtype='float64'), 'second': Series([np.nan, np.nan, np.nan, 0])}) result = p % p assert_frame_equal(result, expected) # numpy has a slightly different (wrong) treatement with np.errstate(all='ignore'): arr = p.values % p.values result2 = DataFrame(arr, index=p.index, columns=p.columns, dtype='float64') result2.iloc[0:3, 1] = np.nan assert_frame_equal(result2, expected) result = p % 0 expected = DataFrame(np.nan, index=p.index, columns=p.columns) assert_frame_equal(result, expected) # numpy has a slightly different (wrong) treatement with np.errstate(all='ignore'): arr = p.values.astype('float64') % 0 result2 = DataFrame(arr, index=p.index, columns=p.columns) assert_frame_equal(result2, expected) # not commutative with series p = DataFrame(np.random.randn(10, 5)) s = p[0] res = s % p res2 = p % s assert not res.fillna(0).equals(res2.fillna(0)) def test_div(self): # integer div, but deal with the 0's (GH 9144) p = DataFrame({'first': [3, 4, 5, 8], 'second': [0, 0, 0, 3]}) result = p / p expected = DataFrame({'first': Series([1.0, 1.0, 1.0, 1.0]), 'second': Series([nan, nan, nan, 1])}) assert_frame_equal(result, expected) with np.errstate(all='ignore'): arr = p.values.astype('float') / p.values result2 = DataFrame(arr, index=p.index, columns=p.columns) assert_frame_equal(result2, expected) result = p / 0 expected = DataFrame(np.inf, index=p.index, columns=p.columns) expected.iloc[0:3, 1] = nan assert_frame_equal(result, expected) # numpy has a slightly different (wrong) treatement with np.errstate(all='ignore'): arr = p.values.astype('float64') / 0 result2 = DataFrame(arr, index=p.index, columns=p.columns) assert_frame_equal(result2, expected) p = DataFrame(np.random.randn(10, 5)) s = p[0] res = s / p res2 = p / s assert not res.fillna(0).equals(res2.fillna(0)) def test_logical_operators(self): def _check_bin_op(op): result = op(df1, df2) expected = DataFrame(op(df1.values, df2.values), index=df1.index, columns=df1.columns) assert result.values.dtype == np.bool_ assert_frame_equal(result, expected) def _check_unary_op(op): result = op(df1) expected = DataFrame(op(df1.values), index=df1.index, columns=df1.columns) assert result.values.dtype == np.bool_ assert_frame_equal(result, expected) df1 = {'a': {'a': True, 'b': False, 'c': False, 'd': True, 'e': True}, 'b': {'a': False, 'b': True, 'c': False, 'd': False, 'e': False}, 'c': {'a': False, 'b': False, 'c': True, 'd': False, 'e': False}, 'd': {'a': True, 'b': False, 'c': False, 'd': True, 'e': True}, 'e': {'a': True, 'b': False, 'c': False, 'd': True, 'e': True}} df2 = {'a': {'a': True, 'b': False, 'c': True, 'd': False, 'e': False}, 'b': {'a': False, 'b': True, 'c': False, 'd': False, 'e': False}, 'c': {'a': True, 'b': False, 'c': True, 'd': False, 'e': False}, 'd': {'a': False, 'b': False, 'c': False, 'd': True, 'e': False}, 'e': {'a': False, 'b': False, 'c': False, 'd': False, 'e': True}} df1 = DataFrame(df1) df2 = DataFrame(df2) _check_bin_op(operator.and_) _check_bin_op(operator.or_) _check_bin_op(operator.xor) # operator.neg is deprecated in numpy >= 1.9 _check_unary_op(operator.inv) @pytest.mark.parametrize('op,res', [('__eq__', False), ('__ne__', True)]) def test_logical_typeerror_with_non_valid(self, op, res): # we are comparing floats vs a string result = getattr(self.frame, op)('foo') assert bool(result.all().all()) is res def test_logical_with_nas(self): d = DataFrame({'a': [np.nan, False], 'b': [True, True]}) # GH4947 # bool comparisons should return bool result = d['a'] | d['b'] expected = Series([False, True]) assert_series_equal(result, expected) # GH4604, automatic casting here result = d['a'].fillna(False) | d['b'] expected = Series([True, True]) assert_series_equal(result, expected) result = d['a'].fillna(False, downcast=False) | d['b'] expected = Series([True, True]) assert_series_equal(result, expected) def test_neg(self): # what to do? assert_frame_equal(-self.frame, -1 * self.frame) def test_invert(self): assert_frame_equal(-(self.frame < 0), ~(self.frame < 0)) def test_arith_flex_frame(self): ops = ['add', 'sub', 'mul', 'div', 'truediv', 'pow', 'floordiv', 'mod'] if not compat.PY3: aliases = {} else: aliases = {'div': 'truediv'} for op in ops: try: alias = aliases.get(op, op) f = getattr(operator, alias) result = getattr(self.frame, op)(2 * self.frame) exp = f(self.frame, 2 * self.frame) assert_frame_equal(result, exp) # vs mix float result = getattr(self.mixed_float, op)(2 * self.mixed_float) exp = f(self.mixed_float, 2 * self.mixed_float) assert_frame_equal(result, exp) _check_mixed_float(result, dtype=dict(C=None)) # vs mix int if op in ['add', 'sub', 'mul']: result = getattr(self.mixed_int, op)(2 + self.mixed_int) exp = f(self.mixed_int, 2 + self.mixed_int) # no overflow in the uint dtype = None if op in ['sub']: dtype = dict(B='uint64', C=None) elif op in ['add', 'mul']: dtype = dict(C=None) assert_frame_equal(result, exp) _check_mixed_int(result, dtype=dtype) # rops r_f = lambda x, y: f(y, x) result = getattr(self.frame, 'r' + op)(2 * self.frame) exp = r_f(self.frame, 2 * self.frame) assert_frame_equal(result, exp) # vs mix float result = getattr(self.mixed_float, op)( 2 * self.mixed_float) exp = f(self.mixed_float, 2 * self.mixed_float) assert_frame_equal(result, exp) _check_mixed_float(result, dtype=dict(C=None)) result = getattr(self.intframe, op)(2 * self.intframe) exp = f(self.intframe, 2 * self.intframe) assert_frame_equal(result, exp) # vs mix int if op in ['add', 'sub', 'mul']: result = getattr(self.mixed_int, op)( 2 + self.mixed_int) exp = f(self.mixed_int, 2 + self.mixed_int) # no overflow in the uint dtype = None if op in ['sub']: dtype = dict(B='uint64', C=None) elif op in ['add', 'mul']: dtype = dict(C=None) assert_frame_equal(result, exp) _check_mixed_int(result, dtype=dtype) except: printing.pprint_thing("Failing operation %r" % op) raise # ndim >= 3 ndim_5 = np.ones(self.frame.shape + (3, 4, 5)) msg = "Unable to coerce to Series/DataFrame" with tm.assert_raises_regex(ValueError, msg): f(self.frame, ndim_5) with tm.assert_raises_regex(ValueError, msg): getattr(self.frame, op)(ndim_5) # res_add = self.frame.add(self.frame) # res_sub = self.frame.sub(self.frame) # res_mul = self.frame.mul(self.frame) # res_div = self.frame.div(2 * self.frame) # assert_frame_equal(res_add, self.frame + self.frame) # assert_frame_equal(res_sub, self.frame - self.frame) # assert_frame_equal(res_mul, self.frame * self.frame) # assert_frame_equal(res_div, self.frame / (2 * self.frame)) const_add = self.frame.add(1) assert_frame_equal(const_add, self.frame + 1) # corner cases result = self.frame.add(self.frame[:0]) assert_frame_equal(result, self.frame * np.nan) result = self.frame[:0].add(self.frame) assert_frame_equal(result, self.frame * np.nan) with tm.assert_raises_regex(NotImplementedError, 'fill_value'): self.frame.add(self.frame.iloc[0], fill_value=3) with tm.assert_raises_regex(NotImplementedError, 'fill_value'): self.frame.add(self.frame.iloc[0], axis='index', fill_value=3) def test_binary_ops_align(self): # test aligning binary ops # GH 6681 index = MultiIndex.from_product([list('abc'), ['one', 'two', 'three'], [1, 2, 3]], names=['first', 'second', 'third']) df = DataFrame(np.arange(27 * 3).reshape(27, 3), index=index, columns=['value1', 'value2', 'value3']).sort_index() idx = pd.IndexSlice for op in ['add', 'sub', 'mul', 'div', 'truediv']: opa = getattr(operator, op, None) if opa is None: continue x = Series([1.0, 10.0, 100.0], [1, 2, 3]) result = getattr(df, op)(x, level='third', axis=0) expected = pd.concat([opa(df.loc[idx[:, :, i], :], v) for i, v in x.iteritems()]).sort_index() assert_frame_equal(result, expected) x = Series([1.0, 10.0], ['two', 'three']) result = getattr(df, op)(x, level='second', axis=0) expected = (pd.concat([opa(df.loc[idx[:, i], :], v) for i, v in x.iteritems()]) .reindex_like(df).sort_index()) assert_frame_equal(result, expected) # GH9463 (alignment level of dataframe with series) midx = MultiIndex.from_product([['A', 'B'], ['a', 'b']]) df = DataFrame(np.ones((2, 4), dtype='int64'), columns=midx) s = pd.Series({'a': 1, 'b': 2}) df2 = df.copy() df2.columns.names = ['lvl0', 'lvl1'] s2 = s.copy() s2.index.name = 'lvl1' # different cases of integer/string level names: res1 = df.mul(s, axis=1, level=1) res2 = df.mul(s2, axis=1, level=1) res3 = df2.mul(s, axis=1, level=1) res4 = df2.mul(s2, axis=1, level=1) res5 = df2.mul(s, axis=1, level='lvl1') res6 = df2.mul(s2, axis=1, level='lvl1') exp = DataFrame(np.array([[1, 2, 1, 2], [1, 2, 1, 2]], dtype='int64'), columns=midx) for res in [res1, res2]: assert_frame_equal(res, exp) exp.columns.names = ['lvl0', 'lvl1'] for res in [res3, res4, res5, res6]: assert_frame_equal(res, exp) def test_arith_mixed(self): left = DataFrame({'A': ['a', 'b', 'c'], 'B': [1, 2, 3]}) result = left + left expected = DataFrame({'A': ['aa', 'bb', 'cc'], 'B': [2, 4, 6]}) assert_frame_equal(result, expected) def test_arith_getitem_commute(self): df = DataFrame({'A': [1.1, 3.3], 'B': [2.5, -3.9]}) self._test_op(df, operator.add) self._test_op(df, operator.sub) self._test_op(df, operator.mul) self._test_op(df, operator.truediv) self._test_op(df, operator.floordiv) self._test_op(df, operator.pow) self._test_op(df, lambda x, y: y + x) self._test_op(df, lambda x, y: y - x) self._test_op(df, lambda x, y: y * x) self._test_op(df, lambda x, y: y / x) self._test_op(df, lambda x, y: y ** x) self._test_op(df, lambda x, y: x + y) self._test_op(df, lambda x, y: x - y) self._test_op(df, lambda x, y: x * y) self._test_op(df, lambda x, y: x / y) self._test_op(df, lambda x, y: x ** y) @staticmethod def _test_op(df, op): result = op(df, 1) if not df.columns.is_unique: raise ValueError("Only unique columns supported by this test") for col in result.columns: assert_series_equal(result[col], op(df[col], 1)) def test_bool_flex_frame(self): data = np.random.randn(5, 3) other_data = np.random.randn(5, 3) df = DataFrame(data) other = DataFrame(other_data) ndim_5 = np.ones(df.shape + (1, 3)) # Unaligned def _check_unaligned_frame(meth, op, df, other): part_o = other.loc[3:, 1:].copy() rs = meth(part_o) xp = op(df, part_o.reindex(index=df.index, columns=df.columns)) assert_frame_equal(rs, xp) # DataFrame assert df.eq(df).values.all() assert not df.ne(df).values.any() for op in ['eq', 'ne', 'gt', 'lt', 'ge', 'le']: f = getattr(df, op) o = getattr(operator, op) # No NAs assert_frame_equal(f(other), o(df, other)) _check_unaligned_frame(f, o, df, other) # ndarray assert_frame_equal(f(other.values), o(df, other.values)) # scalar assert_frame_equal(f(0), o(df, 0)) # NAs msg = "Unable to coerce to Series/DataFrame" assert_frame_equal(f(np.nan), o(df, np.nan)) with tm.assert_raises_regex(ValueError, msg): f(ndim_5) # Series def _test_seq(df, idx_ser, col_ser): idx_eq = df.eq(idx_ser, axis=0) col_eq = df.eq(col_ser) idx_ne = df.ne(idx_ser, axis=0) col_ne = df.ne(col_ser) assert_frame_equal(col_eq, df == Series(col_ser)) assert_frame_equal(col_eq, -col_ne) assert_frame_equal(idx_eq, -idx_ne) assert_frame_equal(idx_eq, df.T.eq(idx_ser).T) assert_frame_equal(col_eq, df.eq(list(col_ser))) assert_frame_equal(idx_eq, df.eq(Series(idx_ser), axis=0)) assert_frame_equal(idx_eq, df.eq(list(idx_ser), axis=0)) idx_gt = df.gt(idx_ser, axis=0) col_gt = df.gt(col_ser) idx_le = df.le(idx_ser, axis=0) col_le = df.le(col_ser) assert_frame_equal(col_gt, df > Series(col_ser)) assert_frame_equal(col_gt, -col_le) assert_frame_equal(idx_gt, -idx_le) assert_frame_equal(idx_gt, df.T.gt(idx_ser).T) idx_ge = df.ge(idx_ser, axis=0) col_ge = df.ge(col_ser) idx_lt = df.lt(idx_ser, axis=0) col_lt = df.lt(col_ser) assert_frame_equal(col_ge, df >= Series(col_ser)) assert_frame_equal(col_ge, -col_lt) assert_frame_equal(idx_ge, -idx_lt) assert_frame_equal(idx_ge, df.T.ge(idx_ser).T) idx_ser = Series(np.random.randn(5)) col_ser = Series(np.random.randn(3)) _test_seq(df, idx_ser, col_ser) # list/tuple _test_seq(df, idx_ser.values, col_ser.values) # NA df.loc[0, 0] = np.nan rs = df.eq(df) assert not rs.loc[0, 0] rs = df.ne(df) assert rs.loc[0, 0] rs = df.gt(df) assert not rs.loc[0, 0] rs = df.lt(df) assert not rs.loc[0, 0] rs = df.ge(df) assert not rs.loc[0, 0] rs = df.le(df) assert not rs.loc[0, 0] # complex arr = np.array([np.nan, 1, 6, np.nan]) arr2 = np.array([2j, np.nan, 7, None]) df = DataFrame({'a': arr}) df2 = DataFrame({'a': arr2}) rs = df.gt(df2) assert not rs.values.any() rs = df.ne(df2) assert rs.values.all() arr3 = np.array([2j, np.nan, None]) df3 = DataFrame({'a': arr3}) rs = df3.gt(2j) assert not rs.values.any() # corner, dtype=object df1 = DataFrame({'col': ['foo', np.nan, 'bar']}) df2 = DataFrame({'col': ['foo', datetime.now(), 'bar']}) result = df1.ne(df2) exp = DataFrame({'col': [False, True, False]}) assert_frame_equal(result, exp) def test_return_dtypes_bool_op_costant(self): # GH15077 df = DataFrame({'x': [1, 2, 3], 'y': [1., 2., 3.]}) const = 2 # not empty DataFrame for op in ['eq', 'ne', 'gt', 'lt', 'ge', 'le']: result = getattr(df, op)(const).get_dtype_counts() tm.assert_series_equal(result, Series([2], ['bool'])) # empty DataFrame empty = df.iloc[:0] for op in ['eq', 'ne', 'gt', 'lt', 'ge', 'le']: result = getattr(empty, op)(const).get_dtype_counts() tm.assert_series_equal(result, Series([2], ['bool'])) def test_dti_tz_convert_to_utc(self): base = pd.DatetimeIndex(['2011-01-01', '2011-01-02', '2011-01-03'], tz='UTC') idx1 = base.tz_convert('Asia/Tokyo')[:2] idx2 = base.tz_convert('US/Eastern')[1:] df1 = DataFrame({'A': [1, 2]}, index=idx1) df2 = DataFrame({'A': [1, 1]}, index=idx2) exp = DataFrame({'A': [np.nan, 3, np.nan]}, index=base) assert_frame_equal(df1 + df2, exp) def test_arith_flex_series(self): df = self.simple row = df.xs('a') col = df['two'] # after arithmetic refactor, add truediv here ops = ['add', 'sub', 'mul', 'mod'] for op in ops: f = getattr(df, op) op = getattr(operator, op) assert_frame_equal(f(row), op(df, row)) assert_frame_equal(f(col, axis=0), op(df.T, col).T) # special case for some reason assert_frame_equal(df.add(row, axis=None), df + row) # cases which will be refactored after big arithmetic refactor assert_frame_equal(df.div(row), df / row) assert_frame_equal(df.div(col, axis=0), (df.T / col).T) # broadcasting issue in GH7325 df = DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype='int64') expected = DataFrame([[nan, np.inf], [1.0, 1.5], [1.0, 1.25]]) result = df.div(df[0], axis='index') assert_frame_equal(result, expected) df = DataFrame(np.arange(3 * 2).reshape((3, 2)), dtype='float64') expected = DataFrame([[np.nan, np.inf], [1.0, 1.5], [1.0, 1.25]]) result = df.div(df[0], axis='index') assert_frame_equal(result, expected) def test_arith_non_pandas_object(self): df = self.simple val1 = df.xs('a').values added = DataFrame(df.values + val1, index=df.index, columns=df.columns) assert_frame_equal(df + val1, added) added = DataFrame((df.values.T + val1).T, index=df.index, columns=df.columns) assert_frame_equal(df.add(val1, axis=0), added) val2 = list(df['two']) added = DataFrame(df.values + val2, index=df.index, columns=df.columns) assert_frame_equal(df + val2, added) added = DataFrame((df.values.T + val2).T, index=df.index, columns=df.columns) assert_frame_equal(df.add(val2, axis='index'), added) val3 = np.random.rand(*df.shape) added = DataFrame(df.values + val3, index=df.index, columns=df.columns) assert_frame_equal(df.add(val3), added) @pytest.mark.parametrize('values', [[1, 2], (1, 2), np.array([1, 2]), range(1, 3), deque([1, 2])]) def test_arith_alignment_non_pandas_object(self, values): # GH 17901 df = DataFrame({'A': [1, 1], 'B': [1, 1]}) expected = DataFrame({'A': [2, 2], 'B': [3, 3]}) result = df + values assert_frame_equal(result, expected) def test_combineFrame(self): frame_copy = self.frame.reindex(self.frame.index[::2]) del frame_copy['D'] frame_copy['C'][:5] = nan added = self.frame + frame_copy indexer = added['A'].dropna().index exp = (self.frame['A'] * 2).copy() tm.assert_series_equal(added['A'].dropna(), exp.loc[indexer]) exp.loc[~exp.index.isin(indexer)] = np.nan tm.assert_series_equal(added['A'], exp.loc[added['A'].index]) assert np.isnan(added['C'].reindex(frame_copy.index)[:5]).all() # assert(False) assert np.isnan(added['D']).all() self_added = self.frame + self.frame tm.assert_index_equal(self_added.index, self.frame.index) added_rev = frame_copy + self.frame assert np.isnan(added['D']).all() assert np.isnan(added_rev['D']).all() # corner cases # empty plus_empty = self.frame + self.empty assert np.isnan(plus_empty.values).all() empty_plus = self.empty + self.frame assert np.isnan(empty_plus.values).all() empty_empty = self.empty + self.empty assert empty_empty.empty # out of order reverse = self.frame.reindex(columns=self.frame.columns[::-1]) assert_frame_equal(reverse + self.frame, self.frame * 2) # mix vs float64, upcast added = self.frame + self.mixed_float _check_mixed_float(added, dtype='float64') added = self.mixed_float + self.frame _check_mixed_float(added, dtype='float64') # mix vs mix added = self.mixed_float + self.mixed_float2 _check_mixed_float(added, dtype=dict(C=None)) added = self.mixed_float2 + self.mixed_float _check_mixed_float(added, dtype=dict(C=None)) # with int added = self.frame + self.mixed_int _check_mixed_float(added, dtype='float64') def test_combineSeries(self): # Series series = self.frame.xs(self.frame.index[0]) added = self.frame + series for key, s in compat.iteritems(added): assert_series_equal(s, self.frame[key] + series[key]) larger_series = series.to_dict() larger_series['E'] = 1 larger_series = Series(larger_series) larger_added = self.frame + larger_series for key, s in compat.iteritems(self.frame): assert_series_equal(larger_added[key], s + series[key]) assert 'E' in larger_added assert np.isnan(larger_added['E']).all() # no upcast needed added = self.mixed_float + series _check_mixed_float(added) # vs mix (upcast) as needed added = self.mixed_float + series.astype('float32') _check_mixed_float(added, dtype=dict(C=None)) added = self.mixed_float + series.astype('float16') _check_mixed_float(added, dtype=dict(C=None)) # these raise with numexpr.....as we are adding an int64 to an # uint64....weird vs int # added = self.mixed_int + (100*series).astype('int64') # _check_mixed_int(added, dtype = dict(A = 'int64', B = 'float64', C = # 'int64', D = 'int64')) # added = self.mixed_int + (100*series).astype('int32') # _check_mixed_int(added, dtype = dict(A = 'int32', B = 'float64', C = # 'int32', D = 'int64')) # TimeSeries ts = self.tsframe['A'] # 10890 # we no longer allow auto timeseries broadcasting # and require explict broadcasting added = self.tsframe.add(ts, axis='index') for key, col in compat.iteritems(self.tsframe): result = col + ts assert_series_equal(added[key], result, check_names=False) assert added[key].name == key if col.name == ts.name: assert result.name == 'A' else: assert result.name is None smaller_frame = self.tsframe[:-5] smaller_added = smaller_frame.add(ts, axis='index') tm.assert_index_equal(smaller_added.index, self.tsframe.index) smaller_ts = ts[:-5] smaller_added2 = self.tsframe.add(smaller_ts, axis='index') assert_frame_equal(smaller_added, smaller_added2) # length 0, result is all-nan result = self.tsframe.add(ts[:0], axis='index') expected = DataFrame(np.nan, index=self.tsframe.index, columns=self.tsframe.columns) assert_frame_equal(result, expected) # Frame is all-nan result = self.tsframe[:0].add(ts, axis='index') expected = DataFrame(np.nan, index=self.tsframe.index, columns=self.tsframe.columns) assert_frame_equal(result, expected) # empty but with non-empty index frame = self.tsframe[:1].reindex(columns=[]) result = frame.mul(ts, axis='index') assert len(result) == len(ts) def test_combineFunc(self): result = self.frame * 2 tm.assert_numpy_array_equal(result.values, self.frame.values * 2) # vs mix result = self.mixed_float * 2 for c, s in compat.iteritems(result): tm.assert_numpy_array_equal( s.values, self.mixed_float[c].values * 2) _check_mixed_float(result, dtype=dict(C=None)) result = self.empty * 2 assert result.index is self.empty.index assert len(result.columns) == 0 def test_comparisons(self): df1 = tm.makeTimeDataFrame() df2 = tm.makeTimeDataFrame() row = self.simple.xs('a') ndim_5 = np.ones(df1.shape + (1, 1, 1)) def test_comp(func): result = func(df1, df2) tm.assert_numpy_array_equal(result.values, func(df1.values, df2.values)) with tm.assert_raises_regex(ValueError, 'Wrong number of dimensions'): func(df1, ndim_5) result2 = func(self.simple, row) tm.assert_numpy_array_equal(result2.values, func(self.simple.values, row.values)) result3 = func(self.frame, 0) tm.assert_numpy_array_equal(result3.values, func(self.frame.values, 0)) with tm.assert_raises_regex(ValueError, 'Can only compare identically' '-labeled DataFrame'): func(self.simple, self.simple[:2]) test_comp(operator.eq) test_comp(operator.ne) test_comp(operator.lt) test_comp(operator.gt) test_comp(operator.ge) test_comp(operator.le) def test_comparison_protected_from_errstate(self): missing_df = tm.makeDataFrame() missing_df.iloc[0]['A'] = np.nan with np.errstate(invalid='ignore'): expected = missing_df.values < 0 with np.errstate(invalid='raise'): result = (missing_df < 0).values tm.assert_numpy_array_equal(result, expected) def test_string_comparison(self): df = DataFrame([{"a": 1, "b": "foo"}, {"a": 2, "b": "bar"}]) mask_a = df.a > 1 assert_frame_equal(df[mask_a], df.loc[1:1, :]) assert_frame_equal(df[-mask_a], df.loc[0:0, :]) mask_b = df.b == "foo" assert_frame_equal(df[mask_b], df.loc[0:0, :]) assert_frame_equal(df[-mask_b], df.loc[1:1, :]) def test_float_none_comparison(self): df = DataFrame(np.random.randn(8, 3), index=lrange(8), columns=['A', 'B', 'C']) pytest.raises(TypeError, df.__eq__, None) def test_boolean_comparison(self): # GH 4576 # boolean comparisons with a tuple/list give unexpected results df = DataFrame(np.arange(6).reshape((3, 2))) b = np.array([2, 2]) b_r = np.atleast_2d([2, 2]) b_c = b_r.T l = (2, 2, 2) tup = tuple(l) # gt expected = DataFrame([[False, False], [False, True], [True, True]]) result = df > b assert_frame_equal(result, expected) result = df.values > b assert_numpy_array_equal(result, expected.values) result = df > l assert_frame_equal(result, expected) result = df > tup assert_frame_equal(result, expected) result = df > b_r assert_frame_equal(result, expected) result = df.values > b_r assert_numpy_array_equal(result, expected.values) pytest.raises(ValueError, df.__gt__, b_c) pytest.raises(ValueError, df.values.__gt__, b_c) # == expected = DataFrame([[False, False], [True, False], [False, False]]) result = df == b assert_frame_equal(result, expected) result = df == l assert_frame_equal(result, expected) result = df == tup assert_frame_equal(result, expected) result = df == b_r assert_frame_equal(result, expected) result = df.values == b_r assert_numpy_array_equal(result, expected.values) pytest.raises(ValueError, lambda: df == b_c) assert df.values.shape != b_c.shape # with alignment df = DataFrame(np.arange(6).reshape((3, 2)), columns=list('AB'), index=list('abc')) expected.index = df.index expected.columns = df.columns result = df == l assert_frame_equal(result, expected) result = df == tup assert_frame_equal(result, expected) def test_boolean_comparison_error(self): # GH 4576 # boolean comparisons with a tuple/list give unexpected results df = DataFrame(np.arange(6).reshape((3, 2))) # not shape compatible pytest.raises(ValueError, lambda: df == (2, 2)) pytest.raises(ValueError, lambda: df == [2, 2]) def test_combine_generic(self): df1 = self.frame df2 = self.frame.loc[self.frame.index[:-5], ['A', 'B', 'C']] combined = df1.combine(df2, np.add) combined2 = df2.combine(df1, np.add) assert combined['D'].isna().all() assert combined2['D'].isna().all() chunk = combined.loc[combined.index[:-5], ['A', 'B', 'C']] chunk2 = combined2.loc[combined2.index[:-5], ['A', 'B', 'C']] exp = self.frame.loc[self.frame.index[:-5], ['A', 'B', 'C']].reindex_like(chunk) * 2 assert_frame_equal(chunk, exp) assert_frame_equal(chunk2, exp) def test_inplace_ops_alignment(self): # inplace ops / ops alignment # GH 8511 columns = list('abcdefg') X_orig = DataFrame(np.arange(10 * len(columns)) .reshape(-1, len(columns)), columns=columns, index=range(10)) Z = 100 * X_orig.iloc[:, 1:-1].copy() block1 = list('bedcf') subs = list('bcdef') # add X = X_orig.copy() result1 = (X[block1] + Z).reindex(columns=subs) X[block1] += Z result2 = X.reindex(columns=subs) X = X_orig.copy() result3 = (X[block1] + Z[block1]).reindex(columns=subs) X[block1] += Z[block1] result4 = X.reindex(columns=subs) assert_frame_equal(result1, result2) assert_frame_equal(result1, result3) assert_frame_equal(result1, result4) # sub X = X_orig.copy() result1 = (X[block1] - Z).reindex(columns=subs) X[block1] -= Z result2 = X.reindex(columns=subs) X = X_orig.copy() result3 = (X[block1] - Z[block1]).reindex(columns=subs) X[block1] -= Z[block1] result4 = X.reindex(columns=subs) assert_frame_equal(result1, result2) assert_frame_equal(result1, result3) assert_frame_equal(result1, result4) def test_inplace_ops_identity(self): # GH 5104 # make sure that we are actually changing the object s_orig = Series([1, 2, 3]) df_orig = DataFrame(np.random.randint(0, 5, size=10).reshape(-1, 5)) # no dtype change s = s_orig.copy() s2 = s s += 1 assert_series_equal(s, s2) assert_series_equal(s_orig + 1, s) assert s is s2 assert s._data is s2._data df = df_orig.copy() df2 = df df += 1 assert_frame_equal(df, df2) assert_frame_equal(df_orig + 1, df) assert df is df2 assert df._data is df2._data # dtype change s = s_orig.copy() s2 = s s += 1.5 assert_series_equal(s, s2) assert_series_equal(s_orig + 1.5, s) df = df_orig.copy() df2 = df df += 1.5 assert_frame_equal(df, df2) assert_frame_equal(df_orig + 1.5, df) assert df is df2 assert df._data is df2._data # mixed dtype arr = np.random.randint(0, 10, size=5) df_orig = DataFrame({'A': arr.copy(), 'B': 'foo'}) df = df_orig.copy() df2 = df df['A'] += 1 expected = DataFrame({'A': arr.copy() + 1, 'B': 'foo'}) assert_frame_equal(df, expected) assert_frame_equal(df2, expected) assert df._data is df2._data df = df_orig.copy() df2 = df df['A'] += 1.5 expected = DataFrame({'A': arr.copy() + 1.5, 'B': 'foo'}) assert_frame_equal(df, expected) assert_frame_equal(df2, expected) assert df._data is df2._data @pytest.mark.parametrize('op', ['add', 'and', 'div', 'floordiv', 'mod', 'mul', 'or', 'pow', 'sub', 'truediv', 'xor']) def test_inplace_ops_identity2(self, op): if compat.PY3 and op == 'div': return df = DataFrame({'a': [1., 2., 3.], 'b': [1, 2, 3]}) operand = 2 if op in ('and', 'or', 'xor'): # cannot use floats for boolean ops df['a'] = [True, False, True] df_copy = df.copy() iop = '__i{}__'.format(op) op = '__{}__'.format(op) # no id change and value is correct getattr(df, iop)(operand) expected = getattr(df_copy, op)(operand) assert_frame_equal(df, expected) expected = id(df) assert id(df) == expected def test_alignment_non_pandas(self): index = ['A', 'B', 'C'] columns = ['X', 'Y', 'Z'] df = pd.DataFrame(np.random.randn(3, 3), index=index, columns=columns) align = pd.core.ops._align_method_FRAME for val in [[1, 2, 3], (1, 2, 3), np.array([1, 2, 3], dtype=np.int64), range(1, 4)]: tm.assert_series_equal(align(df, val, 'index'), Series([1, 2, 3], index=df.index)) tm.assert_series_equal(align(df, val, 'columns'), Series([1, 2, 3], index=df.columns)) # length mismatch msg = 'Unable to coerce to Series, length must be 3: given 2' for val in [[1, 2], (1, 2), np.array([1, 2]), range(1, 3)]: with tm.assert_raises_regex(ValueError, msg): align(df, val, 'index') with tm.assert_raises_regex(ValueError, msg): align(df, val, 'columns') val = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) tm.assert_frame_equal(align(df, val, 'index'), DataFrame(val, index=df.index, columns=df.columns)) tm.assert_frame_equal(align(df, val, 'columns'), DataFrame(val, index=df.index, columns=df.columns)) # shape mismatch msg = 'Unable to coerce to DataFrame, shape must be' val = np.array([[1, 2, 3], [4, 5, 6]]) with tm.assert_raises_regex(ValueError, msg): align(df, val, 'index') with tm.assert_raises_regex(ValueError, msg): align(df, val, 'columns') val = np.zeros((3, 3, 3)) with pytest.raises(ValueError): align(df, val, 'index') with pytest.raises(ValueError): align(df, val, 'columns')
def test_slice_locs_indexerror(): times = [ datetime(2000, 1, 1) + timedelta(minutes=i * 10) for i in range(100000) ] s = Series(lrange(100000), times) s.loc[datetime(1900, 1, 1):datetime(2100, 1, 1)]
def _parse_excel(self, sheetname=0, header=0, skiprows=None, skip_footer=0, index_col=None, has_index_names=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, chunksize=None, convert_float=True, **kwds): import xlrd from xlrd import (xldate, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN, XL_CELL_NUMBER) epoch1904 = self.book.datemode # xlrd >= 0.9.3 can return datetime objects directly. if LooseVersion(xlrd.__VERSION__) >= LooseVersion("0.9.3"): xlrd_0_9_3 = True else: xlrd_0_9_3 = False if isinstance(sheetname, compat.string_types): sheet = self.book.sheet_by_name(sheetname) else: # assume an integer if not a string sheet = self.book.sheet_by_index(sheetname) data = [] should_parse = {} for i in range(sheet.nrows): row = [] for j, (value, typ) in enumerate(zip(sheet.row_values(i), sheet.row_types(i))): if parse_cols is not None and j not in should_parse: should_parse[j] = self._should_parse(j, parse_cols) if parse_cols is None or should_parse[j]: if typ == XL_CELL_DATE: if xlrd_0_9_3: # Use the newer xlrd datetime handling. value = xldate.xldate_as_datetime(value, epoch1904) # Excel doesn't distinguish between dates and time, # so we treat dates on the epoch as times only. # Also, Excel supports 1900 and 1904 epochs. year = (value.timetuple())[0:3] if ((not epoch1904 and year == (1899, 12, 31)) or (epoch1904 and year == (1904, 1, 1))): value = datetime.time(value.hour, value.minute, value.second, value.microsecond) else: # Use the xlrd <= 0.9.2 date handling. dt = xldate.xldate_as_tuple(value, epoch1904) if dt[0] < datetime.MINYEAR: value = datetime.time(*dt[3:]) else: value = datetime.datetime(*dt) elif typ == XL_CELL_ERROR: value = np.nan elif typ == XL_CELL_BOOLEAN: value = bool(value) elif convert_float and typ == XL_CELL_NUMBER: # GH5394 - Excel 'numbers' are always floats # it's a minimal perf hit and less suprising val = int(value) if val == value: value = val row.append(value) data.append(row) if header is not None: data[header] = _trim_excel_header(data[header]) parser = TextParser(data, header=header, index_col=index_col, has_index_names=has_index_names, na_values=na_values, thousands=thousands, parse_dates=parse_dates, date_parser=date_parser, skiprows=skiprows, skip_footer=skip_footer, chunksize=chunksize, **kwds) return parser.read()
def __iter__(self): for i in range(len(self)): yield self._get_val_at(i) raise StopIteration
def _bin(x, width): "return int(x) as a base2 string of given width" return ''.join(str((x >> i) & 1) for i in range(width - 1, -1, -1))
def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0, index_col=None, has_index_names=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, chunksize=None, **kwds): from xlrd import (xldate_as_tuple, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN) datemode = self.book.datemode if isinstance(sheetname, compat.string_types): sheet = self.book.sheet_by_name(sheetname) else: # assume an integer if not a string sheet = self.book.sheet_by_index(sheetname) data = [] should_parse = {} for i in range(sheet.nrows): row = [] for j, (value, typ) in enumerate( zip(sheet.row_values(i), sheet.row_types(i))): if parse_cols is not None and j not in should_parse: should_parse[j] = self._should_parse(j, parse_cols) if parse_cols is None or should_parse[j]: if typ == XL_CELL_DATE: dt = xldate_as_tuple(value, datemode) # how to produce this first case? if dt[0] < datetime.MINYEAR: # pragma: no cover value = datetime.time(*dt[3:]) else: value = datetime.datetime(*dt) elif typ == XL_CELL_ERROR: value = np.nan elif typ == XL_CELL_BOOLEAN: value = bool(value) row.append(value) data.append(row) if header is not None: data[header] = _trim_excel_header(data[header]) parser = TextParser(data, header=header, index_col=index_col, has_index_names=has_index_names, na_values=na_values, thousands=thousands, parse_dates=parse_dates, date_parser=date_parser, skiprows=skiprows, skip_footer=skip_footer, chunksize=chunksize, **kwds) return parser.read()
def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True): """ Create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects (hierarchical indexes) on the index and columns of the result DataFrame Parameters ---------- data : DataFrame values : column to aggregate, optional index : a column, Grouper, array which has the same length as data, or list of them. Keys to group by on the pivot table index. If an array is passed, it is being used as the same manner as column values. columns : a column, Grouper, array which has the same length as data, or list of them. Keys to group by on the pivot table column. If an array is passed, it is being used as the same manner as column values. aggfunc : function, default numpy.mean, or list of functions If list of functions passed, the resulting pivot table will have hierarchical columns whose top level are the function names (inferred from the function objects themselves) fill_value : scalar, default None Value to replace missing values with margins : boolean, default False Add all row / columns (e.g. for subtotal / grand totals) dropna : boolean, default True Do not include columns whose entries are all NaN rows : kwarg only alias of index [deprecated] cols : kwarg only alias of columns [deprecated] Examples -------- >>> df A B C D 0 foo one small 1 1 foo one large 2 2 foo one large 2 3 foo two small 3 4 foo two small 3 5 bar one large 4 6 bar one small 5 7 bar two small 6 8 bar two large 7 >>> table = pivot_table(df, values='D', index=['A', 'B'], ... columns=['C'], aggfunc=np.sum) >>> table small large foo one 1 4 two 6 NaN bar one 5 4 two 6 7 Returns ------- table : DataFrame """ index = _convert_by(index) columns = _convert_by(columns) if isinstance(aggfunc, list): pieces = [] keys = [] for func in aggfunc: table = pivot_table(data, values=values, index=index, columns=columns, fill_value=fill_value, aggfunc=func, margins=margins) pieces.append(table) keys.append(func.__name__) return concat(pieces, keys=keys, axis=1) keys = index + columns values_passed = values is not None if values_passed: if isinstance(values, (list, tuple)): values_multi = True else: values_multi = False values = [values] else: values = list(data.columns.drop(keys)) if values_passed: to_filter = [] for x in keys + values: if isinstance(x, Grouper): x = x.key try: if x in data: to_filter.append(x) except TypeError: pass if len(to_filter) < len(data.columns): data = data[to_filter] grouped = data.groupby(keys) agged = grouped.agg(aggfunc) table = agged if table.index.nlevels > 1: to_unstack = [ agged.index.names[i] for i in range(len(index), len(keys)) ] table = agged.unstack(to_unstack) if not dropna: try: m = MultiIndex.from_arrays(cartesian_product(table.index.levels)) table = table.reindex_axis(m, axis=0) except AttributeError: pass # it's a single level try: m = MultiIndex.from_arrays(cartesian_product(table.columns.levels)) table = table.reindex_axis(m, axis=1) except AttributeError: pass # it's a single level or a series if isinstance(table, DataFrame): if isinstance(table.columns, MultiIndex): table = table.sortlevel(axis=1) else: table = table.sort_index(axis=1) if fill_value is not None: table = table.fillna(value=fill_value, downcast='infer') if margins: table = _add_margins(table, data, values, rows=index, cols=columns, aggfunc=aggfunc) # discard the top level if values_passed and not values_multi: table = table[values[0]] if len(index) == 0 and len(columns) > 0: table = table.T return table
def test_to_frame(): tuples = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')] index = MultiIndex.from_tuples(tuples) result = index.to_frame(index=False) expected = DataFrame(tuples) tm.assert_frame_equal(result, expected) result = index.to_frame() expected.index = index tm.assert_frame_equal(result, expected) tuples = [(1, 'one'), (1, 'two'), (2, 'one'), (2, 'two')] index = MultiIndex.from_tuples(tuples, names=['first', 'second']) result = index.to_frame(index=False) expected = DataFrame(tuples) expected.columns = ['first', 'second'] tm.assert_frame_equal(result, expected) result = index.to_frame() expected.index = index tm.assert_frame_equal(result, expected) # See GH-22580 index = MultiIndex.from_tuples(tuples) result = index.to_frame(index=False, name=['first', 'second']) expected = DataFrame(tuples) expected.columns = ['first', 'second'] tm.assert_frame_equal(result, expected) result = index.to_frame(name=['first', 'second']) expected.index = index expected.columns = ['first', 'second'] tm.assert_frame_equal(result, expected) msg = "'name' must be a list / sequence of column names." with tm.assert_raises_regex(TypeError, msg): index.to_frame(name='first') msg = "'name' should have same length as number of levels on index." with tm.assert_raises_regex(ValueError, msg): index.to_frame(name=['first']) # Tests for datetime index index = MultiIndex.from_product( [range(5), pd.date_range('20130101', periods=3)]) result = index.to_frame(index=False) expected = DataFrame({ 0: np.repeat(np.arange(5, dtype='int64'), 3), 1: np.tile(pd.date_range('20130101', periods=3), 5) }) tm.assert_frame_equal(result, expected) result = index.to_frame() expected.index = index tm.assert_frame_equal(result, expected) # See GH-22580 result = index.to_frame(index=False, name=['first', 'second']) expected = DataFrame({ 'first': np.repeat(np.arange(5, dtype='int64'), 3), 'second': np.tile(pd.date_range('20130101', periods=3), 5) }) tm.assert_frame_equal(result, expected) result = index.to_frame(name=['first', 'second']) expected.index = index tm.assert_frame_equal(result, expected)
def get_data_yahoo_actions(symbol, start=None, end=None, retry_count=3, pause=0.001): """ Returns DataFrame of historical corporate actions (dividends and stock splits) from symbols, over date range, start to end. All dates in the resulting DataFrame correspond with dividend and stock split ex-dates. Parameters ---------- sym : string with a single Single stock symbol (ticker). start : string, (defaults to '1/1/2010') Starting date, timestamp. Parses many different kind of date representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980') end : string, (defaults to today) Ending date, timestamp. Same format as starting date. retry_count : int, default 3 Number of times to retry query request. pause : int, default 0 Time, in seconds, of the pause between retries. """ start, end = _sanitize_dates(start, end) url = (_HISTORICAL_YAHOO_ACTIONS_URL + 's=%s' % symbol + '&a=%s' % (start.month - 1) + '&b=%s' % start.day + '&c=%s' % start.year + '&d=%s' % (end.month - 1) + '&e=%s' % end.day + '&f=%s' % end.year + '&g=v') for _ in range(retry_count): time.sleep(pause) try: with urlopen(url) as resp: lines = resp.read() except _network_error_classes: pass else: actions_index = [] actions_entries = [] for line in csv.reader(StringIO(bytes_to_str(lines))): # Ignore lines that aren't dividends or splits (Yahoo # add a bunch of irrelevant fields.) if len(line) != 3 or line[0] not in ('DIVIDEND', 'SPLIT'): continue action, date, value = line if action == 'DIVIDEND': actions_index.append(to_datetime(date)) actions_entries.append({ 'action': action, 'value': float(value) }) elif action == 'SPLIT' and ':' in value: # Convert the split ratio to a fraction. For example a # 4:1 split expressed as a fraction is 1/4 = 0.25. denominator, numerator = value.split(':', 1) split_fraction = float(numerator) / float(denominator) actions_index.append(to_datetime(date)) actions_entries.append({ 'action': action, 'value': split_fraction }) return DataFrame(actions_entries, index=actions_index) raise IOError("after %d tries, Yahoo! did not " "return a 200 for url %r" % (retry_count, url))
def _unstack_multiple(data, clocs): from pandas.core.groupby import decons_obs_group_ids if len(clocs) == 0: return data # NOTE: This doesn't deal with hierarchical columns yet index = data.index clocs = [index._get_level_number(i) for i in clocs] rlocs = [i for i in range(index.nlevels) if i not in clocs] clevels = [index.levels[i] for i in clocs] clabels = [index.labels[i] for i in clocs] cnames = [index.names[i] for i in clocs] rlevels = [index.levels[i] for i in rlocs] rlabels = [index.labels[i] for i in rlocs] rnames = [index.names[i] for i in rlocs] shape = [len(x) for x in clevels] group_index = get_group_index(clabels, shape, sort=False, xnull=False) comp_ids, obs_ids = _compress_group_index(group_index, sort=False) recons_labels = decons_obs_group_ids(comp_ids, obs_ids, shape, clabels, xnull=False) dummy_index = MultiIndex(levels=rlevels + [obs_ids], labels=rlabels + [comp_ids], names=rnames + ['__placeholder__'], verify_integrity=False) if isinstance(data, Series): dummy = Series(data.values, index=dummy_index) unstacked = dummy.unstack('__placeholder__') new_levels = clevels new_names = cnames new_labels = recons_labels else: if isinstance(data.columns, MultiIndex): result = data for i in range(len(clocs)): val = clocs[i] result = result.unstack(val) clocs = [val if i > val else val - 1 for val in clocs] return result dummy = DataFrame(data.values, index=dummy_index, columns=data.columns) unstacked = dummy.unstack('__placeholder__') if isinstance(unstacked, Series): unstcols = unstacked.index else: unstcols = unstacked.columns new_levels = [unstcols.levels[0]] + clevels new_names = [data.columns.name] + cnames new_labels = [unstcols.labels[0]] for rec in recons_labels: new_labels.append(rec.take(unstcols.labels[-1])) new_columns = MultiIndex(levels=new_levels, labels=new_labels, names=new_names, verify_integrity=False) if isinstance(unstacked, Series): unstacked.index = new_columns else: unstacked.columns = new_columns return unstacked
def _do_test(df, r_dtype=None, c_dtype=None, rnlvl=None, cnlvl=None, dupe_col=False): kwargs = dict(parse_dates=False) if cnlvl: if rnlvl is not None: kwargs['index_col'] = lrange(rnlvl) kwargs['header'] = lrange(cnlvl) with ensure_clean('__tmp_to_csv_moar__') as path: df.to_csv(path, encoding='utf8', chunksize=chunksize, tupleize_cols=False) recons = DataFrame.from_csv(path, tupleize_cols=False, **kwargs) else: kwargs['header'] = 0 with ensure_clean('__tmp_to_csv_moar__') as path: df.to_csv(path, encoding='utf8', chunksize=chunksize) recons = DataFrame.from_csv(path, **kwargs) def _to_uni(x): if not isinstance(x, compat.text_type): return x.decode('utf8') return x if dupe_col: # read_Csv disambiguates the columns by # labeling them dupe.1,dupe.2, etc'. monkey patch columns recons.columns = df.columns if rnlvl and not cnlvl: delta_lvl = [ recons.iloc[:, i].values for i in range(rnlvl - 1) ] ix = MultiIndex.from_arrays([list(recons.index)] + delta_lvl) recons.index = ix recons = recons.iloc[:, rnlvl - 1:] type_map = dict(i='i', f='f', s='O', u='O', dt='O', p='O') if r_dtype: if r_dtype == 'u': # unicode r_dtype = 'O' recons.index = np.array(lmap(_to_uni, recons.index), dtype=r_dtype) df.index = np.array(lmap(_to_uni, df.index), dtype=r_dtype) elif r_dtype == 'dt': # unicode r_dtype = 'O' recons.index = np.array(lmap(Timestamp, recons.index), dtype=r_dtype) df.index = np.array(lmap(Timestamp, df.index), dtype=r_dtype) elif r_dtype == 'p': r_dtype = 'O' recons.index = np.array(list( map(Timestamp, to_datetime(recons.index))), dtype=r_dtype) df.index = np.array(list( map(Timestamp, df.index.to_timestamp())), dtype=r_dtype) else: r_dtype = type_map.get(r_dtype) recons.index = np.array(recons.index, dtype=r_dtype) df.index = np.array(df.index, dtype=r_dtype) if c_dtype: if c_dtype == 'u': c_dtype = 'O' recons.columns = np.array(lmap(_to_uni, recons.columns), dtype=c_dtype) df.columns = np.array(lmap(_to_uni, df.columns), dtype=c_dtype) elif c_dtype == 'dt': c_dtype = 'O' recons.columns = np.array(lmap(Timestamp, recons.columns), dtype=c_dtype) df.columns = np.array(lmap(Timestamp, df.columns), dtype=c_dtype) elif c_dtype == 'p': c_dtype = 'O' recons.columns = np.array(lmap(Timestamp, to_datetime( recons.columns)), dtype=c_dtype) df.columns = np.array(lmap(Timestamp, df.columns.to_timestamp()), dtype=c_dtype) else: c_dtype = type_map.get(c_dtype) recons.columns = np.array(recons.columns, dtype=c_dtype) df.columns = np.array(df.columns, dtype=c_dtype) assert_frame_equal(df, recons, check_names=False, check_less_precise=True)