def _init_dict(self, data, axes, dtype=None):
    """
    Build a consolidated BlockManager from a dict of data.

    Parameters
    ----------
    data : dict
        Mapping of item key to values; filtered down to ``axes[0]`` when that
        axis is given.
    axes : sequence
        Only ``axes[0]`` (the requested items, or None) is read here.
    dtype : optional
        Not used by the visible body.

    Returns
    -------
    The result of ``BlockManager(...).consolidate()``.

    NOTE(review): this body reads ``index`` before it is ever bound, and
    ``intersect`` and ``homogenized`` are not bound anywhere in the function
    (the line that produced ``homogenized`` is commented out). Unless those
    names exist at module level the function cannot run to completion; the
    statements are kept exactly as found - confirm the intended wiring.
    """
    items = axes[0]

    if items is None:
        # no explicit items: derive them from the (sorted, if sortable) keys
        items = Index(_try_sort(data.keys()))
    else:
        # prefilter: keep only the entries whose key was requested
        items = _ensure_index(items)
        data = dict((key, value) for key, value in data.iteritems()
                    if key in items)

    # figure out the index, if necessary
    # NOTE(review): ``index`` is assigned further down, so it is a local name
    # and this read happens before any binding.
    if index is None:
        index = extract_index(data)

    # don't force copy because getting jammed in an ndarray anyway
    # (previous call: homogenized = _homogenize(data, index, columns, dtype))
    data, index, columns = _homogenize(data, intersect=intersect)

    # segregates dtypes and forms blocks matching to columns
    # NOTE(review): ``homogenized`` is never bound in this function.
    blocks = form_blocks(homogenized, index, columns)

    # consolidate for now
    return BlockManager(blocks, [columns, index]).consolidate()
def test_get(self):
    """``BlockManager.get`` returns, for each item label, its row of values."""
    labels = list('abc')
    cols = Index(labels)
    values = np.random.rand(3, 3)

    # the block gets its own copy so ``values`` stays the reference data
    block = make_block(values=values.copy(), placement=np.arange(3))
    mgr = BlockManager(blocks=[block], axes=[cols, np.arange(3)])

    # slow path first, for every label ...
    for pos, label in enumerate(labels):
        assert_almost_equal(mgr.get(label, fastpath=False), values[pos])

    # ... then the default path, compared through ``internal_values()``
    for pos, label in enumerate(labels):
        assert_almost_equal(mgr.get(label).internal_values(), values[pos])
def test_duplicate_ref_loc_failure(self):
    """Overlapping block locations are rejected; distinct ones are accepted."""
    source_mgr = create_mgr('a:bool; a: f8')
    axes = source_mgr.axes
    blocks = source_mgr.blocks

    # both blocks claim location 0
    blocks[0].mgr_locs = np.array([0])
    blocks[1].mgr_locs = np.array([0])

    # test trying to create block manager with overlapping ref locs
    with self.assertRaises(AssertionError):
        BlockManager(blocks, axes)

    # give each block its own location and construction must succeed
    blocks[0].mgr_locs = np.array([0])
    blocks[1].mgr_locs = np.array([1])
    BlockManager(blocks, axes).iget(1)
def test_equals_block_order_different_dtypes(self):
    """``equals`` holds in both directions for every ordering of the blocks."""
    # GH 9330
    descriptions = (
        "a:i8;b:f8",  # basic case
        "a:i8;b:f8;c:c8;d:b",  # many types
        "a:i8;e:dt;f:td;g:string",  # more types
        "a:i8;b:category;c:category2;d:category2",  # categories
        "c:sparse;d:sparse_na;b:f8",  # sparse
    )

    for description in descriptions:
        reference = create_mgr(description)
        for ordering in itertools.permutations(reference.blocks):
            shuffled = BlockManager(ordering, reference.axes)
            self.assertTrue(reference.equals(shuffled))
            self.assertTrue(shuffled.equals(reference))
def test_equals(self, mgr_string):
    """A manager equals one built from the same blocks in reverse order."""
    # unique items
    forward = create_mgr(mgr_string)
    reversed_blocks = forward.blocks[::-1]
    backward = BlockManager(reversed_blocks, forward.axes)
    assert forward.equals(backward)
def decode(obj):
    """
    Decoder for deserializing numpy data types.

    Parameters
    ----------
    obj : mapping
        Decoded payload. Its ``u'typ'`` entry selects how the remaining
        fields are turned back into an object.

    Returns
    -------
    The reconstructed object, or ``obj`` itself when it has no ``u'typ'``
    entry or the value is not one of the handled type tags.

    Notes
    -----
    Required fields are read with ``obj[...]``, so a missing field surfaces
    as the mapping's own lookup error.

    NOTE(security): several branches resolve ``obj[u'klass']`` through
    ``globals()`` / ``getattr(internals, ...)`` and call the result, i.e. the
    payload chooses which module-level callable runs. Only decode data from
    a trusted source.
    """
    typ = obj.get(u'typ')
    if typ is None:
        return obj
    elif typ == u'timestamp':
        return Timestamp(obj[u'value'], tz=obj[u'tz'], offset=obj[u'offset'])
    elif typ == u'nat':
        return NaT
    elif typ == u'period':
        return Period(ordinal=obj[u'ordinal'], freq=obj[u'freq'])
    elif typ == u'index':
        dtype = dtype_for(obj[u'dtype'])
        data = unconvert(obj[u'data'], dtype, obj.get(u'compress'))
        return globals()[obj[u'klass']](data, dtype=dtype, name=obj[u'name'])
    elif typ == u'range_index':
        return globals()[obj[u'klass']](obj[u'start'],
                                        obj[u'stop'],
                                        obj[u'step'],
                                        name=obj[u'name'])
    elif typ == u'multi_index':
        dtype = dtype_for(obj[u'dtype'])
        data = unconvert(obj[u'data'], dtype, obj.get(u'compress'))
        data = [tuple(x) for x in data]
        return globals()[obj[u'klass']].from_tuples(data, names=obj[u'names'])
    elif typ == u'period_index':
        data = unconvert(obj[u'data'], np.int64, obj.get(u'compress'))
        d = dict(name=obj[u'name'], freq=obj[u'freq'])
        return globals()[obj[u'klass']](data, **d)
    elif typ == u'datetime_index':
        data = unconvert(obj[u'data'], np.int64, obj.get(u'compress'))
        d = dict(name=obj[u'name'], freq=obj[u'freq'], verify_integrity=False)
        result = globals()[obj[u'klass']](data, **d)
        tz = obj[u'tz']

        # reverse tz conversion
        if tz is not None:
            result = result.tz_localize('UTC').tz_convert(tz)
        return result
    elif typ == u'category':
        from_codes = globals()[obj[u'klass']].from_codes
        return from_codes(codes=obj[u'codes'],
                          categories=obj[u'categories'],
                          ordered=obj[u'ordered'],
                          name=obj[u'name'])
    elif typ == u'series':
        dtype = dtype_for(obj[u'dtype'])
        # resolve the pandas dtype once; its ``base`` is used to build the
        # Series and its ``tz`` (if any) to restore the timezone afterwards
        pd_dtype = pandas_dtype(dtype)
        np_dtype = pd_dtype.base

        index = obj[u'index']
        result = globals()[obj[u'klass']](unconvert(obj[u'data'], dtype,
                                                    obj[u'compress']),
                                          index=index,
                                          dtype=np_dtype,
                                          name=obj[u'name'])
        tz = getattr(pd_dtype, 'tz', None)
        if tz:
            result = result.dt.tz_localize('UTC').dt.tz_convert(tz)
        return result
    elif typ == u'block_manager':
        axes = obj[u'axes']

        def create_block(b):
            values = unconvert(b[u'values'], dtype_for(b[u'dtype']),
                               b[u'compress']).reshape(b[u'shape'])

            # locs handles duplicate column names, and should be used instead
            # of items; see GH 9618
            if u'locs' in b:
                placement = b[u'locs']
            else:
                placement = axes[0].get_indexer(b[u'items'])
            return make_block(values=values,
                              klass=getattr(internals, b[u'klass']),
                              placement=placement,
                              dtype=b[u'dtype'])

        blocks = [create_block(b) for b in obj[u'blocks']]
        return globals()[obj[u'klass']](BlockManager(blocks, axes))
    elif typ == u'datetime':
        return parse(obj[u'data'])
    elif typ == u'datetime64':
        return np.datetime64(parse(obj[u'data']))
    elif typ == u'date':
        return parse(obj[u'data']).date()
    elif typ == u'timedelta':
        return timedelta(*obj[u'data'])
    elif typ == u'timedelta64':
        return np.timedelta64(int(obj[u'data']))
    # NOTE: 'sparse_series', 'sparse_dataframe' and 'sparse_panel' payloads
    # have no decoder; they fall through and are returned unchanged.
    elif typ == u'block_index':
        return globals()[obj[u'klass']](obj[u'length'], obj[u'blocs'],
                                        obj[u'blengths'])
    elif typ == u'int_index':
        return globals()[obj[u'klass']](obj[u'length'], obj[u'indices'])
    elif typ == u'ndarray':
        return unconvert(obj[u'data'], np.typeDict[obj[u'dtype']],
                         obj.get(u'compress')).reshape(obj[u'shape'])
    elif typ == u'np_scalar':
        if obj.get(u'sub_typ') == u'np_complex':
            return c2f(obj[u'real'], obj[u'imag'], obj[u'dtype'])
        else:
            dtype = dtype_for(obj[u'dtype'])
            try:
                return dtype(obj[u'data'])
            except (ValueError, TypeError):
                # ``dtype`` is not directly callable with the value: go
                # through its scalar type. Only conversion failures are
                # caught so KeyboardInterrupt / SystemExit still propagate.
                return dtype.type(obj[u'data'])
    elif typ == u'np_complex':
        return complex(obj[u'real'] + u'+' + obj[u'imag'] + u'j')

    # unrecognized type tag: hand the payload back untouched
    return obj
def decode(obj):
    """
    Decoder for deserializing numpy data types.

    Parameters
    ----------
    obj : mapping
        Decoded payload. Its ``u"typ"`` entry selects how the remaining
        fields are turned back into an object.

    Returns
    -------
    The reconstructed object, or ``obj`` itself when it has no ``u"typ"``
    entry or the value is not one of the handled type tags.

    Raises
    ------
    ValueError
        For a ``period_index`` payload without a frequency (non-legacy path),
        and for a block whose class is ``DatetimeTZBlock``.

    Notes
    -----
    NOTE(security): several branches resolve ``obj[u"klass"]`` through
    ``globals()`` / ``getattr(internals, ...)`` and call the result, i.e. the
    payload chooses which module-level callable runs. Only decode data from
    a trusted source.
    """
    typ = obj.get(u"typ")
    if typ is None:
        return obj
    elif typ == u"timestamp":
        # newer payloads carry "freq"; older ones stored it as "offset"
        freq = obj[u"freq"] if "freq" in obj else obj[u"offset"]
        return Timestamp(obj[u"value"], tz=obj[u"tz"], freq=freq)
    elif typ == u"nat":
        return NaT
    elif typ == u"period":
        return Period(ordinal=obj[u"ordinal"], freq=obj[u"freq"])
    elif typ == u"index":
        dtype = dtype_for(obj[u"dtype"])
        data = unconvert(obj[u"data"], dtype, obj.get(u"compress"))
        return globals()[obj[u"klass"]](data, dtype=dtype, name=obj[u"name"])
    elif typ == u"range_index":
        return globals()[obj[u"klass"]](obj[u"start"],
                                        obj[u"stop"],
                                        obj[u"step"],
                                        name=obj[u"name"])
    elif typ == u"multi_index":
        dtype = dtype_for(obj[u"dtype"])
        data = unconvert(obj[u"data"], dtype, obj.get(u"compress"))
        data = [tuple(x) for x in data]
        return globals()[obj[u"klass"]].from_tuples(data, names=obj[u"names"])
    elif typ == u"period_index":
        data = unconvert(obj[u"data"], np.int64, obj.get(u"compress"))
        d = dict(name=obj[u"name"], freq=obj[u"freq"])

        if _is_pandas_legacy_version:
            # legacy
            return globals()[obj[u"klass"]](data, **d)

        freq = d['freq']
        if freq is None:
            raise ValueError(
                'freq is not specified and cannot be inferred')
        values = [Period(ordinal=x, freq=freq) for x in data]
        # pass the decoded name along so it survives the round trip, as it
        # does on the legacy path
        return PeriodIndex(values, name=d['name'])
    elif typ == u"datetime_index":
        data = unconvert(obj[u"data"], np.int64, obj.get(u"compress"))
        d = dict(name=obj[u"name"], freq=obj[u"freq"])
        result = globals()[obj[u"klass"]](data, **d)
        tz = obj[u"tz"]

        # reverse tz conversion
        if tz is not None:
            result = result.tz_localize("UTC").tz_convert(tz)
        return result
    elif typ == u"category":
        from_codes = globals()[obj[u"klass"]].from_codes
        return from_codes(codes=obj[u"codes"],
                          categories=obj[u"categories"],
                          ordered=obj[u"ordered"])
    elif typ == u"series":
        dtype = dtype_for(obj[u"dtype"])
        pd_dtype = pandas_dtype(dtype)

        index = obj[u"index"]
        result = globals()[obj[u"klass"]](
            unconvert(obj[u"data"], dtype, obj[u"compress"]),
            index=index,
            dtype=pd_dtype,
            name=obj[u"name"],
        )
        return result
    elif typ == u"block_manager":
        axes = obj[u"axes"]

        def create_block(b):
            values = _safe_reshape(
                unconvert(b[u"values"], dtype_for(b[u"dtype"]),
                          b[u"compress"]),
                b[u"shape"],
            )

            # locs handles duplicate column names, and should be used instead
            # of items; see GH 9618
            if u"locs" in b:
                placement = b[u"locs"]
            else:
                placement = axes[0].get_indexer(b[u"items"])

            klass = getattr(internals, b[u"klass"])
            if klass == DatetimeTZBlock:
                raise ValueError(
                    "Lost the ability to parse datetime with timezone. Sorry")

            return make_block(
                values=values.copy(),
                klass=klass,
                placement=placement,
                dtype=b[u"dtype"],
            )

        blocks = [create_block(b) for b in obj[u"blocks"]]
        return globals()[obj[u"klass"]](BlockManager(blocks, axes))
    elif typ == u"datetime":
        return parse(obj[u"data"])
    elif typ == u"datetime64":
        return np.datetime64(parse(obj[u"data"]))
    elif typ == u"date":
        return parse(obj[u"data"]).date()
    elif typ == u"timedelta":
        return timedelta(*obj[u"data"])
    elif typ == u"timedelta64":
        return np.timedelta64(int(obj[u"data"]))
    # NOTE: 'sparse_series', 'sparse_dataframe' and 'sparse_panel' payloads
    # have no decoder; they fall through and are returned unchanged.
    elif typ == u"block_index":
        return globals()[obj[u"klass"]](obj[u"length"], obj[u"blocs"],
                                        obj[u"blengths"])
    elif typ == u"int_index":
        return globals()[obj[u"klass"]](obj[u"length"], obj[u"indices"])
    elif typ == u"ndarray":
        return unconvert(obj[u"data"], np.typeDict[obj[u"dtype"]],
                         obj.get(u"compress")).reshape(obj[u"shape"])
    elif typ == u"np_scalar":
        if obj.get(u"sub_typ") == u"np_complex":
            return c2f(obj[u"real"], obj[u"imag"], obj[u"dtype"])
        else:
            dtype = dtype_for(obj[u"dtype"])
            try:
                return dtype(obj[u"data"])
            except (ValueError, TypeError):
                # ``dtype`` is not directly callable with the value: go
                # through its scalar type. Only conversion failures are
                # caught so KeyboardInterrupt / SystemExit still propagate.
                return dtype.type(obj[u"data"])
    elif typ == u"np_complex":
        return complex(obj[u"real"] + u"+" + obj[u"imag"] + u"j")

    # unrecognized type tag: hand the payload back untouched
    return obj
def create_mgr(descr, item_shape=None):
    """
    Construct BlockManager from string description.

    The description resembles an ``np.matrix`` initializer::

        a,b,c: f8; d,e,f: i8

    Rules:

    * the supported datatypes are those understood by `create_block`
    * components are separated by semicolons
    * every component reads `NAME,NAME,NAME: DTYPE_ID`
    * whitespace around colons & semicolons is stripped
    * components sharing a DTYPE_ID end up in one block
    * append '-SUFFIX' to keep same-dtype components in separate blocks::

        'a:f8-1; b:f8-2; c:f8-foobar'

    Parameters
    ----------
    descr : str
        The description string.
    item_shape : tuple of int, optional
        Shape of a single item; defaults to ``(N, )``.
    """
    if item_shape is None:
        item_shape = (N, )

    mgr_items = []
    block_placements = {}
    for component in descr.split(";"):
        component = component.strip()
        if not len(component):
            continue

        names_part, _, blockstr = component.partition(":")
        blockstr = blockstr.strip()
        names = names_part.strip().split(",")

        # positions of this component's names within the overall item list
        start = len(mgr_items)
        mgr_items.extend(names)
        placement = list(np.arange(len(names)) + start)
        block_placements.setdefault(blockstr, []).extend(placement)

    mgr_items = Index(mgr_items)

    blocks = []
    num_offset = 0
    for blockstr, placement in block_placements.items():
        # anything after '-' only serves to keep blocks apart
        typestr = blockstr.split("-")[0]
        new_block = create_block(typestr,
                                 placement,
                                 item_shape=item_shape,
                                 num_offset=num_offset)
        blocks.append(new_block)
        num_offset += len(placement)

    # order the blocks by the first location each one occupies
    blocks.sort(key=lambda b: b.mgr_locs[0])
    item_axes = [Index(np.arange(n)) for n in item_shape]
    return BlockManager(tuple(blocks), [mgr_items] + item_axes)
def decode(obj):
    """
    Decoder for deserializing numpy data types.

    Parameters
    ----------
    obj : mapping
        Decoded payload. Its ``'typ'`` entry selects how the remaining fields
        are turned back into an object.

    Returns
    -------
    The reconstructed object, or ``obj`` itself when it has no ``'typ'``
    entry or the value is not one of the handled type tags.

    Notes
    -----
    NOTE(security): several branches resolve ``obj['klass']`` through
    ``globals()`` / ``getattr(internals, ...)`` and call the result, i.e. the
    payload chooses which module-level callable runs. Only decode data from
    a trusted source.
    """
    typ = obj.get('typ')
    if typ is None:
        return obj
    elif typ == 'timestamp':
        return Timestamp(obj['value'], tz=obj['tz'], offset=obj['offset'])
    elif typ == 'period':
        return Period(ordinal=obj['ordinal'], freq=obj['freq'])
    elif typ == 'index':
        dtype = dtype_for(obj['dtype'])
        data = unconvert(obj['data'], np.typeDict[obj['dtype']],
                         obj.get('compress'))
        return globals()[obj['klass']](data, dtype=dtype, name=obj['name'])
    elif typ == 'multi_index':
        data = unconvert(obj['data'], np.typeDict[obj['dtype']],
                         obj.get('compress'))
        data = [tuple(x) for x in data]
        return globals()[obj['klass']].from_tuples(data, names=obj['names'])
    elif typ == 'period_index':
        data = unconvert(obj['data'], np.int64, obj.get('compress'))
        return globals()[obj['klass']](data, name=obj['name'],
                                       freq=obj['freq'])
    elif typ == 'datetime_index':
        data = unconvert(obj['data'], np.int64, obj.get('compress'))
        result = globals()[obj['klass']](data, freq=obj['freq'],
                                         name=obj['name'])
        tz = obj['tz']

        # reverse tz conversion
        if tz is not None:
            result = result.tz_localize('UTC').tz_convert(tz)
        return result
    elif typ == 'series':
        dtype = dtype_for(obj['dtype'])
        index = obj['index']
        return globals()[obj['klass']](unconvert(obj['data'], dtype,
                                                 obj['compress']),
                                       index=index, name=obj['name'])
    elif typ == 'block_manager':
        axes = obj['axes']

        def create_block(b):
            dtype = dtype_for(b['dtype'])
            return make_block(unconvert(b['values'], dtype, b['compress'])
                              .reshape(b['shape']), b['items'], axes[0],
                              klass=getattr(internals, b['klass']))

        blocks = [create_block(b) for b in obj['blocks']]
        return globals()[obj['klass']](BlockManager(blocks, axes))
    elif typ == 'datetime':
        return parse(obj['data'])
    elif typ == 'datetime64':
        return np.datetime64(parse(obj['data']))
    elif typ == 'date':
        return parse(obj['data']).date()
    elif typ == 'timedelta':
        return timedelta(*obj['data'])
    elif typ == 'timedelta64':
        return np.timedelta64(int(obj['data']))
    # NOTE: 'sparse_series', 'sparse_dataframe' and 'sparse_panel' payloads
    # have no decoder; they fall through and are returned unchanged.
    elif typ == 'block_index':
        return globals()[obj['klass']](obj['length'], obj['blocs'],
                                       obj['blengths'])
    elif typ == 'int_index':
        return globals()[obj['klass']](obj['length'], obj['indices'])
    elif typ == 'ndarray':
        return unconvert(obj['data'], np.typeDict[obj['dtype']],
                         obj.get('compress')).reshape(obj['shape'])
    elif typ == 'np_scalar':
        if obj.get('sub_typ') == 'np_complex':
            return c2f(obj['real'], obj['imag'], obj['dtype'])
        else:
            dtype = dtype_for(obj['dtype'])
            try:
                return dtype(obj['data'])
            except (ValueError, TypeError):
                # ``dtype`` is not directly callable with the value: go
                # through its scalar type. Only conversion failures are
                # caught so KeyboardInterrupt / SystemExit still propagate.
                return dtype.type(obj['data'])
    elif typ == 'np_complex':
        return complex(obj['real'] + '+' + obj['imag'] + 'j')

    # unrecognized type tag: hand the payload back untouched
    return obj
def decode(obj):
    """
    Decoder for deserializing numpy data types.

    The ``"typ"`` entry of ``obj`` selects the reconstruction; every handled
    tag returns immediately, and a payload with no tag or an unhandled tag is
    returned unchanged.
    """
    typ = obj.get("typ")
    if typ is None:
        return obj

    if typ == "timestamp":
        # newer payloads carry "freq"; older ones stored it as "offset"
        freq = obj["freq"] if "freq" in obj else obj["offset"]
        return Timestamp(obj["value"], tz=obj["tz"], freq=freq)
    if typ == "nat":
        return NaT
    if typ == "period":
        return Period(ordinal=obj["ordinal"], freq=obj["freq"])

    if typ == "index":
        dtype = dtype_for(obj["dtype"])
        unpacked = unconvert(obj["data"], dtype, obj.get("compress"))
        return Index(unpacked, dtype=dtype, name=obj["name"])
    if typ == "range_index":
        return RangeIndex(obj["start"], obj["stop"], obj["step"],
                          name=obj["name"])
    if typ == "multi_index":
        dtype = dtype_for(obj["dtype"])
        unpacked = unconvert(obj["data"], dtype, obj.get("compress"))
        tuples = [tuple(row) for row in unpacked]
        return MultiIndex.from_tuples(tuples, names=obj["names"])
    if typ == "period_index":
        ordinals = unconvert(obj["data"], np.int64, obj.get("compress"))
        name = obj["name"]
        freq = obj["freq"]
        return PeriodIndex(PeriodArray(ordinals, freq), name=name)
    if typ == "datetime_index":
        stamps = unconvert(obj["data"], np.int64, obj.get("compress"))
        result = DatetimeIndex(stamps, name=obj["name"], freq=obj["freq"])
        tz = obj["tz"]
        if tz is None:
            return result
        # reverse tz conversion
        return result.tz_localize("UTC").tz_convert(tz)
    if typ in ("interval_index", "interval_array"):
        builder = globals()[obj["klass"]].from_arrays
        return builder(obj["left"], obj["right"], obj["closed"],
                       name=obj["name"])
    if typ == "category":
        builder = globals()[obj["klass"]].from_codes
        return builder(codes=obj["codes"], categories=obj["categories"],
                       ordered=obj["ordered"])
    if typ == "interval":
        return Interval(obj["left"], obj["right"], obj["closed"])

    if typ == "series":
        dtype = dtype_for(obj["dtype"])
        index = obj["index"]
        unpacked = unconvert(obj["data"], dtype, obj["compress"])
        return Series(unpacked, index=index, dtype=dtype, name=obj["name"])

    if typ == "block_manager":
        axes = obj["axes"]

        def create_block(b):
            flat = unconvert(b["values"], dtype_for(b["dtype"]),
                             b["compress"])
            values = _safe_reshape(flat, b["shape"])

            # locs handles duplicate column names, and should be used instead
            # of items; see GH 9618
            placement = (b["locs"] if "locs" in b
                         else axes[0].get_indexer(b["items"]))

            if is_datetime64tz_dtype(b["dtype"]):
                assert isinstance(values, np.ndarray), type(values)
                assert values.dtype == "M8[ns]", values.dtype
                values = DatetimeArray(values, dtype=b["dtype"])

            return make_block(
                values=values,
                klass=getattr(internals, b["klass"]),
                placement=placement,
                dtype=b["dtype"],
            )

        blocks = [create_block(b) for b in obj["blocks"]]
        return globals()[obj["klass"]](BlockManager(blocks, axes))

    if typ == "datetime":
        return parse(obj["data"])
    if typ == "datetime64":
        return np.datetime64(parse(obj["data"]))
    if typ == "date":
        return parse(obj["data"]).date()
    if typ == "timedelta":
        return timedelta(*obj["data"])
    if typ == "timedelta64":
        return np.timedelta64(int(obj["data"]))

    # NOTE: 'sparse_series' and 'sparse_dataframe' payloads have no decoder;
    # they fall through and are returned unchanged.

    if typ == "block_index":
        return globals()[obj["klass"]](obj["length"], obj["blocs"],
                                       obj["blengths"])
    if typ == "int_index":
        return globals()[obj["klass"]](obj["length"], obj["indices"])
    if typ == "ndarray":
        flat = unconvert(obj["data"], np.typeDict[obj["dtype"]],
                         obj.get("compress"))
        return flat.reshape(obj["shape"])
    if typ == "np_scalar":
        if obj.get("sub_typ") == "np_complex":
            return c2f(obj["real"], obj["imag"], obj["dtype"])
        dtype = dtype_for(obj["dtype"])
        try:
            return dtype(obj["data"])
        except (ValueError, TypeError):
            # fall back to the dtype's scalar type
            return dtype.type(obj["data"])
    if typ == "np_complex":
        return complex(obj["real"] + "+" + obj["imag"] + "j")

    # unhandled tag: the payload is handed back as-is
    return obj
def empty(types, size, cats=None, cols=None, index_type=None, index_name=None):
    """
    Create empty DataFrame to assign into

    Parameters
    ----------
    types: like np record structure, 'i4,u2,f4,f2,f4,M8,m8', or using tuples
        One entry per column; the entry 'category' marks a categorical
        column. A comma-separated string is split into its entries.
    size: int
        Number of rows to allocate
    cats: dict {col: labels}
        Location and labels for categorical columns, e.g., {1: ['mary', 'mo']}
        will create column index 1 (inserted amongst the numerical columns)
        with two possible values. If labels is an integer, `{'col': 5}`,
        will generate temporary labels using range. If None, or the column
        name is missing, ``RangeIndex(0, 2**14)`` is used as the categories.
    cols: list of labels
        assigned column names, including categorical ones. Defaults to
        ``range(len(types))``.
    index_type: optional
        Type of the index; 'category' builds a categorical index, anything
        else is used as a numpy dtype. None/False gives ``RangeIndex(size)``.
    index_name: optional
        Name of the index; required whenever ``index_type`` is given.

    Returns
    -------
    - dataframe with correct shape and data-types
    - dict of numpy views of the columns of the dataframe, keyed by column
      name (plus the index name, if any). Assign to these. Categorical
      columns get the codes under their name and the categorical itself
      under ``name + '-catdef'``.

    Raises
    ------
    ValueError
        If ``index_type`` is given without ``index_name``.
    """
    views = {}

    # Split the type string first: the default column labels are derived
    # from the number of types.
    if isinstance(types, STR_TYPE):
        types = types.split(',')
    cols = cols if cols is not None else range(len(types))

    def _empty_categorical(col):
        """Return a zero-length Categorical with the categories set up for `col`."""
        if cats is None or col not in cats:
            categories = RangeIndex(0, 2**14)
        elif isinstance(cats[col], int):
            categories = RangeIndex(0, cats[col])
        else:  # explicit labels list
            categories = cats[col]
        return Categorical([], categories=categories, fastpath=True)

    # zero-length frame, used only to let pandas work out the block layout
    df = DataFrame()
    for t, col in zip(types, cols):
        if str(t) == 'category':
            df[str(col)] = _empty_categorical(col)
        else:
            df[str(col)] = np.empty(0, dtype=t)

    if index_type is not None and index_type is not False:
        if index_name is None:
            raise ValueError('If using an index, must give an index name')
        if str(index_type) == 'category':
            c = _empty_categorical(index_name)
            vals = np.empty(size, dtype=c.codes.dtype)
            index = CategoricalIndex(c)
            # swap in full-size codes so the view and the index share them
            index._data._codes = vals
            views[index_name] = vals
        else:
            index = np.empty(size, dtype=index_type)
            views[index_name] = index
        axes = [df._data.axes[0], index]
    else:
        axes = [df._data.axes[0], RangeIndex(size)]

    # allocate and create blocks
    blocks = []
    for block in df._data.blocks:
        if block.is_categorical:
            categories = block.values.categories
            code = np.zeros(shape=size, dtype=block.values.codes.dtype)
            values = Categorical(values=code, categories=categories,
                                 fastpath=True)
        else:
            new_shape = (block.values.shape[0], size)
            values = np.empty(shape=new_shape, dtype=block.values.dtype)

        new_block = block.make_block_same_class(values=values)
        blocks.append(new_block)

    # create block manager
    df = DataFrame(BlockManager(blocks, axes))

    # create views
    for block in df._data.blocks:
        dtype = block.dtype
        inds = block.mgr_locs.indexer
        if isinstance(inds, slice):
            inds = list(range(inds.start, inds.stop, inds.step))
        for i, ind in enumerate(inds):
            col = df.columns[ind]
            if str(dtype) == 'category':
                views[col] = block.values._codes
                views[col + '-catdef'] = block.values
            else:
                views[col] = block.values[i]

    if index_name is not None and index_name is not False:
        df.index.name = index_name
        if str(index_type) == 'category':
            views[index_name + '-catdef'] = df._data.axes[1].values
    return df, views
def decode(obj):
    """
    Decoder for deserializing numpy data types.

    The ``'typ'`` entry of ``obj`` selects the reconstruction; every handled
    tag returns immediately, and a payload with no tag or an unhandled tag is
    returned unchanged.
    """
    typ = obj.get('typ')
    if typ is None:
        return obj

    if typ == 'timestamp':
        # newer payloads carry 'freq'; older ones stored it as 'offset'
        freq = obj['freq'] if 'freq' in obj else obj['offset']
        return Timestamp(obj['value'], tz=obj['tz'], freq=freq)
    if typ == 'nat':
        return NaT
    if typ == 'period':
        return Period(ordinal=obj['ordinal'], freq=obj['freq'])

    if typ == 'index':
        dtype = dtype_for(obj['dtype'])
        unpacked = unconvert(obj['data'], dtype, obj.get('compress'))
        return Index(unpacked, dtype=dtype, name=obj['name'])
    if typ == 'range_index':
        return RangeIndex(obj['start'], obj['stop'], obj['step'],
                          name=obj['name'])
    if typ == 'multi_index':
        dtype = dtype_for(obj['dtype'])
        unpacked = unconvert(obj['data'], dtype, obj.get('compress'))
        tuples = [tuple(row) for row in unpacked]
        return MultiIndex.from_tuples(tuples, names=obj['names'])
    if typ == 'period_index':
        ordinals = unconvert(obj['data'], np.int64, obj.get('compress'))
        name = obj['name']
        freq = obj['freq']
        return PeriodIndex(PeriodArray(ordinals, freq), name=name)
    if typ == 'datetime_index':
        stamps = unconvert(obj['data'], np.int64, obj.get('compress'))
        result = DatetimeIndex(stamps, name=obj['name'], freq=obj['freq'])
        tz = obj['tz']
        if tz is None:
            return result
        # reverse tz conversion
        return result.tz_localize('UTC').tz_convert(tz)
    if typ in ('interval_index', 'interval_array'):
        builder = globals()[obj['klass']].from_arrays
        return builder(obj['left'], obj['right'], obj['closed'],
                       name=obj['name'])
    if typ == 'category':
        builder = globals()[obj['klass']].from_codes
        return builder(codes=obj['codes'], categories=obj['categories'],
                       ordered=obj['ordered'])
    if typ == 'interval':
        return Interval(obj['left'], obj['right'], obj['closed'])

    if typ == 'series':
        dtype = dtype_for(obj['dtype'])
        pd_dtype = pandas_dtype(dtype)
        index = obj['index']
        unpacked = unconvert(obj['data'], dtype, obj['compress'])
        return Series(unpacked, index=index, dtype=pd_dtype, name=obj['name'])

    if typ == 'block_manager':
        axes = obj['axes']

        def create_block(b):
            flat = unconvert(b['values'], dtype_for(b['dtype']),
                             b['compress'])
            values = _safe_reshape(flat, b['shape'])

            # locs handles duplicate column names, and should be used instead
            # of items; see GH 9618
            placement = (b['locs'] if 'locs' in b
                         else axes[0].get_indexer(b['items']))

            if is_datetime64tz_dtype(b['dtype']):
                assert isinstance(values, np.ndarray), type(values)
                assert values.dtype == 'M8[ns]', values.dtype
                values = DatetimeArray(values, dtype=b['dtype'])

            return make_block(values=values,
                              klass=getattr(internals, b['klass']),
                              placement=placement,
                              dtype=b['dtype'])

        blocks = [create_block(b) for b in obj['blocks']]
        return globals()[obj['klass']](BlockManager(blocks, axes))

    if typ == 'datetime':
        return parse(obj['data'])
    if typ == 'datetime64':
        return np.datetime64(parse(obj['data']))
    if typ == 'date':
        return parse(obj['data']).date()
    if typ == 'timedelta':
        return timedelta(*obj['data'])
    if typ == 'timedelta64':
        return np.timedelta64(int(obj['data']))

    # NOTE: 'sparse_series', 'sparse_dataframe' and 'sparse_panel' payloads
    # have no decoder; they fall through and are returned unchanged.

    if typ == 'block_index':
        return globals()[obj['klass']](obj['length'], obj['blocs'],
                                       obj['blengths'])
    if typ == 'int_index':
        return globals()[obj['klass']](obj['length'], obj['indices'])
    if typ == 'ndarray':
        flat = unconvert(obj['data'], np.typeDict[obj['dtype']],
                         obj.get('compress'))
        return flat.reshape(obj['shape'])
    if typ == 'np_scalar':
        if obj.get('sub_typ') == 'np_complex':
            return c2f(obj['real'], obj['imag'], obj['dtype'])
        dtype = dtype_for(obj['dtype'])
        try:
            return dtype(obj['data'])
        except (ValueError, TypeError):
            # fall back to the dtype's scalar type
            return dtype.type(obj['data'])
    if typ == 'np_complex':
        return complex(obj['real'] + '+' + obj['imag'] + 'j')

    # unhandled tag: the payload is handed back as-is
    return obj
def _init_arrays(self, arrays, arr_names, axes):
    """
    Build a consolidated BlockManager from a collection of arrays.

    ``form_blocks`` segregates the dtypes and forms blocks matching to
    columns; the resulting manager is consolidated before being returned.
    """
    dtype_blocks = form_blocks(arrays, arr_names, axes)
    return BlockManager(dtype_blocks, axes).consolidate()
def empty(types, size, cats=None, cols=None, index_types=None, index_names=None,
          timezones=None):
    """
    Create empty DataFrame to assign into

    Parameters
    ----------
    types: like np record structure, 'i4,u2,f4,f2,f4,M8,m8', or using tuples
        One entry per column; the entry 'category' marks a categorical
        column. A comma-separated string is split into its entries.
    size: int
        Number of rows to allocate
    cats: dict {col: labels}
        Location and labels for categorical columns, e.g., {1: ['mary', 'mo']}
        will create column index 1 (inserted amongst the numerical columns)
        with two possible values. If labels is an integer, `{'col': 5}`,
        will generate temporary labels using range. If None, or the column
        name is missing, ``RangeIndex(0, 2**14)`` is used as the categories.
    cols: list of labels
        assigned column names, including categorical ones.
    index_types: list
        Types of the index column(s), paired with ``index_names``.
    index_names: list
        Names of the index column(s); no index is built when empty/None.
    timezones: dict {col: timezone_str}
        for timestamp type columns, apply this timezone to the pandas series;
        the numpy view will be UTC.

    Returns
    -------
    - dataframe with correct shape and data-types
    - dict of numpy views of the columns of the dataframe, keyed by name.
      Assign to these. With more than one index, the built indexes are
      stored under ``'__fastparquet_multiindex__'``.
    """
    views = {}
    if not timezones:
        timezones = {}

    if isinstance(types, STR_TYPE):
        types = types.split(',')
    if cols is None:
        cols = range(len(types))

    def cat(col):
        """Return the categories to use for column `col`."""
        if cats is None or col not in cats:
            return RangeIndex(0, 2**14)
        labels = cats[col]
        if isinstance(labels, int):
            return RangeIndex(0, labels)
        # explicit labels list
        return labels

    # ---- index column(s) -------------------------------------------------
    indexes = []
    if index_names:
        for index_type, index_col in zip(index_types, index_names):
            if index_col is None:
                raise ValueError('If using an index, must give an index name')
            if str(index_type) == 'category':
                template = Categorical([], categories=cat(index_col),
                                       fastpath=True)
                codes = np.zeros(size, dtype=template.codes.dtype)
                index = CategoricalIndex(template)
                # swap in full-size codes so the view and the index share them
                index._data._codes = codes
                views[index_col] = codes
                views[index_col + '-catdef'] = index._data
            else:
                # (timezone localisation of index columns is disabled here)
                index = Index(np.empty(size, dtype=index_type))
                views[index_col] = index.values
            index.name = _index_name(index_col)
            indexes.append(index)

    # ---- zero-length frame, used to obtain the block layout ---------------
    columns = OrderedDict()
    for t, col in zip(types, cols):
        label = six.text_type(col)
        if str(t) == 'category':
            columns[label] = Categorical([], categories=cat(col),
                                         fastpath=True)
            continue
        column = np.empty(0, dtype=t)
        if column.dtype.kind == "M" and label in timezones:
            column = Series(column).dt.tz_localize(timezones[label])
        columns[label] = column
    df = DataFrame(columns)

    # ---- allocate and create blocks ---------------------------------------
    blocks = []
    for block in df._data.blocks:
        if block.is_categorical:
            codes = np.zeros(shape=size, dtype=block.values.codes.dtype)
            values = Categorical(values=codes,
                                 categories=block.values.categories,
                                 fastpath=True)
            new_block = block.make_block_same_class(values=values)
        elif getattr(block.dtype, 'tz', None):
            values = np.empty(shape=(size, ),
                              dtype=block.values.values.dtype)
            new_block = block.make_block_same_class(
                values=values, dtype=block.values.dtype)
        else:
            values = np.empty(shape=(block.values.shape[0], size),
                              dtype=block.values.dtype)
            new_block = block.make_block_same_class(values=values)
        blocks.append(new_block)

    # ---- create block manager ---------------------------------------------
    axes = [df._data.axes[0], RangeIndex(size)]
    n_indexes = len(indexes)
    if n_indexes == 1:
        axes[1] = indexes[0]
    elif n_indexes > 1:
        views['__fastparquet_multiindex__'] = indexes
    df = DataFrame(BlockManager(blocks, axes))

    # ---- create views -----------------------------------------------------
    for block in df._data.blocks:
        dtype = block.dtype
        inds = block.mgr_locs.indexer
        if isinstance(inds, slice):
            inds = list(range(inds.start, inds.stop, inds.step))
        for row, loc in enumerate(inds):
            col = df.columns[loc]
            if is_categorical_dtype(dtype):
                views[col] = block.values._codes
                views[col + '-catdef'] = block.values
            elif getattr(block.dtype, 'tz', None):
                views[col] = block.values.values
            else:
                views[col] = block.values[row]

    return df, views
def _read_panel_table(self, group, where=None):
    """
    Read the ``table`` node of ``group`` and return it as a Panel.

    Rows are selected with ``where``. When every (index, column) pair occurs
    once the Panel is assembled directly from a single 3-d block; otherwise
    the rows are de-duplicated through a long-format DataFrame first. If the
    selection carries a column filter, the minor axis is reindexed to the
    sorted intersection with it.
    """
    table = group.table
    attrs = table._v_attrs

    # create the selection
    sel = Selection(table, where, attrs.index_kind)
    sel.select()
    fields = attrs.fields

    columns = _maybe_convert(sel.values['column'], attrs.columns_kind)
    index = _maybe_convert(sel.values['index'], attrs.index_kind)
    values = sel.values['values']

    major = Factor.from_array(index)
    minor = Factor.from_array(columns)

    n_major = len(major.levels)
    n_minor = len(minor.levels)
    # one integer key per (major, minor) pair
    key = major.labels * n_minor + minor.labels

    has_duplicates = len(unique(key)) != len(key)
    if has_duplicates:
        if not self._quiet:  # pragma: no cover
            print('Duplicate entries in table, taking most recently '
                  'appended')

        # reconstruct
        long_index = MultiIndex.from_arrays([index, columns])
        lp = DataFrame(values, index=long_index, columns=fields)

        # need a better algorithm
        tuple_index = long_index._tuple_index
        unique_tuples = _asarray_tuplesafe(lib.fast_unique(tuple_index))

        indexer = com._ensure_platform_int(match(unique_tuples, tuple_index))

        new_index = long_index.take(indexer)
        new_values = lp.values.take(indexer, axis=0)

        lp = DataFrame(new_values, index=new_index, columns=lp.columns)
        wp = lp.to_panel()
    else:
        sorter, _ = lib.groupsort_indexer(com._ensure_int64(key),
                                          n_major * n_minor)
        sorter = com._ensure_platform_int(sorter)

        # the data need to be sorted
        sorted_values = values.take(sorter, axis=0)
        major_labels = major.labels.take(sorter)
        minor_labels = minor.labels.take(sorter)

        block = block2d_to_block3d(sorted_values, fields,
                                   (n_major, n_minor),
                                   major_labels, minor_labels)

        mgr = BlockManager([block],
                           [block.ref_items, major.levels, minor.levels])
        wp = Panel(mgr)

    if sel.column_filter:
        new_minor = sorted(set(wp.minor_axis) & sel.column_filter)
        wp = wp.reindex(minor=new_minor)
    return wp
def empty(types, size, cats=None, cols=None, index_types=None,
          index_names=None, timezones=None):
    """
    Create empty DataFrame to assign into

    In the simplest case, will return a Pandas dataframe of the given size,
    with columns of the given names and types. The second return value
    `views` is a dictionary of numpy arrays into which you can assign values
    that show up in the dataframe.

    For categorical columns, you get two views to assign into: if the
    column name is "col", you get both "col" (the category codes) and
    "col-catdef" (the category labels).

    For a single categorical index, you should use the `.set_categories`
    method of the appropriate "-catdef" columns, passing an Index of values

    ``views['index-catdef'].set_categories(pd.Index(newvalues), fastpath=True)``

    Multi-indexes work a lot like categoricals, even if the types of each
    index are not themselves categories, and will also have "-catdef" entries
    in the views. However, these will be Dummy instances, providing only a
    ``._set_categories`` callable (note the leading underscore), which takes
    the new level values as its first argument.

    Parameters
    ----------
    types: like np record structure, 'i4,u2,f4,f2,f4,M8,m8', or using tuples
        applies to non-categorical columns. If there are only categorical
        columns, an empty string or None will do. A comma-separated string
        is split into one type per column.
    size: int
        Number of rows to allocate
    cats: dict {col: labels}
        Location and labels for categorical columns, e.g., {1: ['mary', 'mo']}
        will create column index 1 (inserted amongst the numerical columns)
        with two possible values. If labels is an integer, `{'col': 5}`,
        will generate temporary labels using range. If None, or column name
        is missing, a placeholder ``RangeIndex(0, 2**14)`` is used as the
        categories.
    cols: list of labels
        assigned column names, including categorical ones. Defaults to
        ``range(len(types))``; names are converted to text.
    index_types: list of str
        For one or more index columns, make them have this type. See general
        description, above, for caveats about multi-indexing. If None, the
        index will be the default RangeIndex.
    index_names: list of str
        Names of the index column(s), if using an index. Names matching
        ``__index_level_<n>__`` are replaced by None on the returned frame.
    timezones: dict {col: timezone_str}
        for timestamp type columns, apply this timezone to the pandas series;
        the numpy view will be UTC.

    Returns
    -------
    - dataframe with correct shape and data-types
    - dict of views keyed by column/index name (plus "<name>-catdef" entries
      for categorical columns and indexes). Assign to these.

    Raises
    ------
    ValueError
        If exactly one index type is given and its name is None.
    """
    views = {}
    timezones = timezones or {}

    if isinstance(types, STR_TYPE):
        types = types.split(',')
    cols = cols if cols is not None else range(len(types))

    def cat(col):
        # Resolve the categories to use for column/index ``col``.
        if cats is None or col not in cats:
            return RangeIndex(0, 2**14)
        elif isinstance(cats[col], int):
            return RangeIndex(0, cats[col])
        else:  # explicit labels list
            return cats[col]

    # First build a zero-length frame: it is only used below for its column
    # axis and its block layout (``df._data``).
    df = OrderedDict()
    for t, col in zip(types, cols):
        if str(t) == 'category':
            df[six.text_type(col)] = Categorical([], categories=cat(col),
                                                 fastpath=True)
        else:
            d = np.empty(0, dtype=t)
            if d.dtype.kind == "M" and six.text_type(col) in timezones:
                d = Series(d).dt.tz_localize(timezones[six.text_type(col)])
            df[six.text_type(col)] = d

    df = DataFrame(df)
    if not index_types:
        index = RangeIndex(size)
    elif len(index_types) == 1:
        t, col = index_types[0], index_names[0]
        if col is None:
            raise ValueError('If using an index, must give an index name')
        if str(t) == 'category':
            c = Categorical([], categories=cat(col), fastpath=True)
            vals = np.zeros(size, dtype=c.codes.dtype)
            index = CategoricalIndex(c)
            # swap in the pre-sized codes array so that ``views[col]`` and
            # the index refer to the same array object
            index._data._codes = vals
            views[col] = vals
            views[col+'-catdef'] = index._data
        else:
            d = np.empty(size, dtype=t)
            index = Index(d)
            views[col] = index.values
    else:
        index = MultiIndex([[]], [[]])
        # index = MultiIndex.from_arrays(indexes)
        # reset the private containers and fill them level by level below
        index._levels = list()
        index._labels = list()
        index._codes = list()
        for i, col in enumerate(index_names):
            # placeholder level; replaced by the first ``set_cats`` call
            index._levels.append(Index([None]))

            # ``i=i, col=col`` bind the current loop values at definition
            # time, so each closure targets its own level
            def set_cats(values, i=i, col=col, **kwargs):
                values.name = col
                if index._levels[i][0] is None:
                    index._levels[i] = values
                elif not index._levels[i].equals(values):
                    raise RuntimeError("Different dictionaries encountered"
                                       " while building categorical")

            x = Dummy()
            x._set_categories = set_cats

            d = np.zeros(size, dtype=int)
            # the codes are stored on ``_codes`` for pandas >= 0.24.0 and on
            # ``_labels`` for older versions
            if LooseVersion(pdver) >= LooseVersion("0.24.0"):
                index._codes = list(index._codes) + [d]
            else:
                index._labels.append(d)
            views[col] = d
            views[col+'-catdef'] = x

    axes = [df._data.axes[0], index]

    # allocate and create blocks
    blocks = []
    for block in df._data.blocks:
        if block.is_categorical:
            categories = block.values.categories
            code = np.zeros(shape=size, dtype=block.values.codes.dtype)
            values = Categorical(values=code, categories=categories,
                                 fastpath=True)
            new_block = block.make_block_same_class(values=values)
        elif getattr(block.dtype, 'tz', None):
            # tz-aware block: allocate plain 'M8[ns]' storage of length
            # ``size`` and pass the original values dtype alongside it
            new_shape = (size, )
            values = np.empty(shape=new_shape, dtype='M8[ns]')
            new_block = block.make_block_same_class(
                values=values, dtype=block.values.dtype)
        else:
            # keep the block's first dimension, widen the second to ``size``
            new_shape = (block.values.shape[0], size)
            values = np.empty(shape=new_shape, dtype=block.values.dtype)
            new_block = block.make_block_same_class(values=values)

        blocks.append(new_block)

    # create block manager
    df = DataFrame(BlockManager(blocks, axes))

    # create views
    for block in df._data.blocks:
        dtype = block.dtype
        inds = block.mgr_locs.indexer
        if isinstance(inds, slice):
            inds = list(range(inds.start, inds.stop, inds.step))
        # ``ind`` is the column position in the frame, ``i`` the position
        # of that column within this block
        for i, ind in enumerate(inds):
            col = df.columns[ind]
            if is_categorical_dtype(dtype):
                views[col] = block.values._codes
                views[col+'-catdef'] = block.values
            elif getattr(block.dtype, 'tz', None):
                views[col] = np.asarray(block.values, dtype='M8[ns]')
            else:
                views[col] = block.values[i]

    if index_names:
        df.index.names = [
            None if re.match(r'__index_level_\d+__', n) else n
            for n in index_names
        ]
    return df, views
def block_concat(dfs, idx, columns):
    """Assemble the blocks of ``dfs`` into one frame and return a copy of it.

    The blocks yielded by ``iter_blocks(dfs)`` are placed in a single
    ``BlockManager`` whose axes are ``[columns, idx]``; the manager is
    wrapped in a DataFrame and that frame's ``.copy()`` is returned.
    """
    blocks = iter_blocks(dfs)
    axes = [columns, idx]
    combined = pd.DataFrame(BlockManager(blocks, axes))
    return combined.copy()