def __init__(
    self,
    partitions,
    index,
    columns,
    row_lengths=None,
    column_widths=None,
    dtypes=None,
):
    """Initialize a dataframe.

    Args:
        partitions: A 2D numpy array of partitions. Must contain
            partition objects.
        index: The index object for the dataframe. Converts to a
            pandas.Index.
        columns: The columns object for the dataframe. Converts to a
            pandas.Index.
        row_lengths: (optional) The lengths of each partition in the rows.
            The "height" of each of the block partitions. Is computed if
            not provided.
        column_widths: (optional) The width of each partition in the
            columns. The "width" of each of the block partitions. Is
            computed if not provided.
        dtypes: (optional) The data types for the dataframe.
    """
    self._partitions = partitions
    self._index_cache = ensure_index(index)
    self._columns_cache = ensure_index(columns)
    self._row_lengths_cache = row_lengths
    self._column_widths_cache = column_widths
    self._dtypes = dtypes
    self._filter_empties()
def reorder_arrays(arrays, arr_columns, columns):
    # reorder according to the columns
    if (columns is not None and len(columns) and arr_columns is not None
            and len(arr_columns)):
        indexer = ensure_index(arr_columns).get_indexer(columns)
        arr_columns = ensure_index([arr_columns[i] for i in indexer])
        arrays = [arrays[i] for i in indexer]
    return arrays, arr_columns
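# Usage sketch (illustrative, not from the original source): reorder_arrays
# aligns a list of column arrays with a requested column order by asking the
# existing Index where each requested label sits.
import pandas as pd

arrays = [[10, 20], [1, 2]]                 # column data, currently ordered b, a
arr_columns = pd.Index(["b", "a"])
columns = pd.Index(["a", "b"])
indexer = arr_columns.get_indexer(columns)  # array([1, 0]): 'a' sits at position 1
reordered = [arrays[i] for i in indexer]    # [[1, 2], [10, 20]], now ordered a, b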
def to_manager(sdf, columns, index):
    """ create and return the block manager from a dataframe of series,
    columns, index
    """
    # from BlockManager perspective
    axes = [ensure_index(columns), ensure_index(index)]
    return create_block_manager_from_arrays([sdf[c] for c in columns],
                                            columns, axes)
def _get_axes(N, K, index, columns):
    # helper to create the axes as indexes
    # return axes or defaults

    if index is None:
        index = ibase.default_index(N)
    else:
        index = ensure_index(index)

    if columns is None:
        columns = ibase.default_index(K)
    else:
        columns = ensure_index(columns)
    return index, columns
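# Illustration (public-API equivalent, not part of the original source): when
# no index/columns are supplied, pandas falls back to default integer axes,
# which is exactly the ibase.default_index path above.
import numpy as np
import pandas as pd

df = pd.DataFrame(np.zeros((3, 2)))
print(df.index)    # RangeIndex(start=0, stop=3, step=1)
print(df.columns)  # RangeIndex(start=0, stop=2, step=1)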
def __init__(self, bins, binlabels, filter_empty=False, mutated=False,
             indexer=None):
    self.bins = ensure_int64(bins)
    self.binlabels = ensure_index(binlabels)
    self._filter_empty_groups = filter_empty
    self.mutated = mutated
    self.indexer = indexer
def init_dict(data, index, columns, dtype=None):
    """
    Segregate Series based on type and coerce into matrices.
    Needs to handle a lot of exceptional cases.
    """
    if columns is not None:
        from pandas.core.series import Series
        arrays = Series(data, index=columns, dtype=object)
        data_names = arrays.index

        missing = arrays.isnull()
        if index is None:
            # GH10856
            # raise ValueError if only scalars in dict
            index = extract_index(arrays[~missing])
        else:
            index = ensure_index(index)

        # no obvious "empty" int column
        if missing.any() and not is_integer_dtype(dtype):
            if dtype is None or np.issubdtype(dtype, np.flexible):
                # GH#1783
                nan_dtype = object
            else:
                nan_dtype = dtype
            v = construct_1d_arraylike_from_scalar(np.nan, len(index),
                                                   nan_dtype)
            arrays.loc[missing] = [v] * missing.sum()

    else:
        keys = com.dict_keys_to_ordered_list(data)
        columns = data_names = Index(keys)
        arrays = [data[k] for k in keys]

    return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
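# Illustration (public-API demo, not from the original source): the "missing"
# branch above fills columns requested via `columns` but absent from the dict
# with NaN, choosing object dtype by default (per GH#1783).
import pandas as pd

df = pd.DataFrame({"a": [1, 2]}, columns=["a", "b"])
print(df["b"].isna().all())  # True: 'b' was created as an all-NaN column
print(df["b"].dtype)         # object (the nan_dtype chosen above)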
def _init_dict(self, data, index, columns, dtype=None):
    # pre-filter out columns if we passed it
    if columns is not None:
        columns = ensure_index(columns)
        data = {k: v for k, v in data.items() if k in columns}
    else:
        keys = com.dict_keys_to_ordered_list(data)
        columns = Index(keys)

    if index is None:
        index = extract_index(list(data.values()))

    def sp_maker(x):
        return SparseArray(
            x,
            kind=self._default_kind,
            fill_value=self._default_fill_value,
            copy=True,
            dtype=dtype,
        )

    sdict = {}
    for k, v in data.items():
        if isinstance(v, Series):
            # Force alignment, no copy necessary
            if not v.index.equals(index):
                v = v.reindex(index)

            if not isinstance(v, SparseSeries):
                v = sp_maker(v.values)
        elif isinstance(v, SparseArray):
            v = v.copy()
        else:
            if isinstance(v, dict):
                v = [v.get(i, np.nan) for i in index]

            v = sp_maker(v)

        if index is not None and len(v) != len(index):
            msg = "Length of passed values is {}, index implies {}"
            raise ValueError(msg.format(len(v), len(index)))
        sdict[k] = v

    if len(columns.difference(sdict)):
        # TODO: figure out how to handle this case, all nan's?
        # add in any other columns we want to have (completeness)
        nan_arr = np.empty(len(index), dtype="float64")
        nan_arr.fill(np.nan)

        nan_arr = SparseArray(
            nan_arr,
            kind=self._default_kind,
            fill_value=self._default_fill_value,
            copy=False,
        )
        sdict.update((c, nan_arr) for c in columns if c not in sdict)

    return to_manager(sdict, columns, index)
def extract_index(data):
    index = None
    if len(data) == 0:
        index = Index([])
    elif len(data) > 0:
        raw_lengths = []
        indexes = []

        have_raw_arrays = False
        have_series = False
        have_dicts = False
        have_ordered = False

        for val in data:
            if isinstance(val, ABCSeries):
                have_series = True
                indexes.append(val.index)
            elif isinstance(val, dict):
                have_dicts = True
                if isinstance(val, OrderedDict):
                    have_ordered = True
                indexes.append(list(val.keys()))
            elif is_list_like(val) and getattr(val, "ndim", 1) == 1:
                have_raw_arrays = True
                raw_lengths.append(len(val))

        if not indexes and not raw_lengths:
            raise ValueError(
                "If using all scalar values, you must pass an index")

        if have_series:
            index = _union_indexes(indexes)
        elif have_dicts:
            index = _union_indexes(indexes,
                                   sort=not (compat.PY36 or have_ordered))

        if have_raw_arrays:
            lengths = list(set(raw_lengths))
            if len(lengths) > 1:
                raise ValueError("arrays must all be same length")

            if have_dicts:
                raise ValueError(
                    "Mixing dicts with non-Series may lead to ambiguous "
                    "ordering."
                )

            if have_series:
                if lengths[0] != len(index):
                    msg = ("array length {length} does not match index "
                           "length {idx_len}".format(length=lengths[0],
                                                     idx_len=len(index)))
                    raise ValueError(msg)
            else:
                index = ibase.default_index(lengths[0])

    return ensure_index(index)
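# Illustration (public-API demo, not from the original source): the two
# ValueErrors raised by extract_index surface through the DataFrame
# constructor (exact wording varies by pandas version).
import pandas as pd

pd.DataFrame({"a": 1}, index=[0])  # OK: scalars allowed once an index exists
try:
    pd.DataFrame({"a": [1, 2], "b": [1, 2, 3]})
except ValueError as err:
    print(err)  # arrays must all be same length
try:
    pd.DataFrame({"a": 1})
except ValueError as err:
    print(err)  # If using all scalar values, you must pass an index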
def arrays_to_mgr(arrays, arr_names, index, columns, dtype=None):
    """
    Segregate Series based on type and coerce into matrices.
    Needs to handle a lot of exceptional cases.
    """
    # figure out the index, if necessary
    if index is None:
        index = extract_index(arrays)
    else:
        index = ensure_index(index)

    # don't force copy because getting jammed in an ndarray anyway
    arrays = _homogenize(arrays, index, dtype)

    # from BlockManager perspective
    axes = [ensure_index(columns), index]

    return create_block_manager_from_arrays(arrays, arr_names, axes)
def __init__(
    self,
    partitions,
    index,
    columns,
    row_lengths=None,
    column_widths=None,
    dtypes=None,
):
    """Initialize a dataframe.

    Args:
        partitions: A 2D numpy array of partitions. Must contain
            partition objects.
        index: The index object for the dataframe. Converts to a
            pandas.Index.
        columns: The columns object for the dataframe. Converts to a
            pandas.Index.
        row_lengths: (optional) The lengths of each partition in the rows.
            The "height" of each of the block partitions. Is computed if
            not provided.
        column_widths: (optional) The width of each partition in the
            columns. The "width" of each of the block partitions. Is
            computed if not provided.
        dtypes: (optional) The data types for the dataframe.
    """
    self._partitions = partitions
    self._index_cache = ensure_index(index)
    self._columns_cache = ensure_index(columns)
    if row_lengths is not None and len(self.index) > 0:
        ErrorMessage.catch_bugs_and_request_email(
            sum(row_lengths) != len(self._index_cache),
            "Row lengths: {} != {}".format(
                sum(row_lengths), len(self._index_cache)
            ),
        )
    self._row_lengths_cache = row_lengths
    if column_widths is not None and len(self.columns) > 0:
        ErrorMessage.catch_bugs_and_request_email(
            sum(column_widths) != len(self._columns_cache),
            "Column widths: {} != {}".format(
                sum(column_widths), len(self._columns_cache)
            ),
        )
    self._column_widths_cache = column_widths
    self._dtypes = dtypes
    self._filter_empties()
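# Sketch (standalone, made-up values): the consistency check above in plain
# terms -- block-partition heights must sum to the frame's row count, and
# likewise partition widths to the column count.
row_lengths = [4, 4, 2]
index = range(10)
assert sum(row_lengths) == len(index), "Row lengths: {} != {}".format(
    sum(row_lengths), len(index)
)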
def _set_index(self, new_index):
    """Replaces the current row labels with new labels.

    Args:
        new_index: The replacement row labels.
    """
    if self._index_cache is None:
        self._index_cache = ensure_index(new_index)
    else:
        new_index = self._validate_set_axis(new_index, self._index_cache)
        self._index_cache = new_index
    self._apply_index_objs(axis=0)
def masked_rec_array_to_mgr(data, index, columns, dtype, copy):
    """
    Extract from a masked rec array and create the manager.
    """
    # essentially process a record array then fill it
    fill_value = data.fill_value
    fdata = ma.getdata(data)
    if index is None:
        index = get_names_from_index(fdata)
        if index is None:
            index = ibase.default_index(len(data))
    index = ensure_index(index)

    if columns is not None:
        columns = ensure_index(columns)
    arrays, arr_columns = to_arrays(fdata, columns)

    # fill if needed
    new_arrays = []
    for fv, arr, col in zip(fill_value, arrays, arr_columns):
        # TODO: numpy docs suggest fv must be scalar, but could it be
        #  non-scalar for object dtype?
        assert lib.is_scalar(fv), fv
        mask = ma.getmaskarray(data[col])
        if mask.any():
            arr, fv = maybe_upcast(arr, fill_value=fv, copy=True)
            arr[mask] = fv
        new_arrays.append(arr)

    # create the manager
    arrays, arr_columns = reorder_arrays(new_arrays, arr_columns, columns)
    if columns is None:
        columns = arr_columns

    mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype)

    if copy:
        mgr = mgr.copy()
    return mgr
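# Illustration (hedged, not from the original source; assumes a pandas version
# that accepts numpy MaskedRecords in the DataFrame constructor): masked cells
# are upcast and filled by the function above, surfacing as NaN.
import numpy as np
import numpy.ma.mrecords as mrecords
import pandas as pd

arrays = [np.ma.array([1, 2], mask=[False, True]),
          np.ma.array([1.5, 2.5], mask=[False, False])]
rec = mrecords.fromarrays(arrays, names="a,b")
df = pd.DataFrame(rec)          # column 'a' is upcast to float64
print(df["a"].isna().tolist())  # [False, True]: the masked cell became NaN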
def _set_columns(self, new_columns):
    """Replaces the current column labels with new labels.

    Args:
        new_columns: The replacement column labels.
    """
    if self._columns_cache is None:
        self._columns_cache = ensure_index(new_columns)
    else:
        new_columns = self._validate_set_axis(new_columns,
                                              self._columns_cache)
        self._columns_cache = new_columns
        if self._dtypes is not None:
            self._dtypes.index = new_columns
    self._apply_index_objs(axis=1)
def masked_rec_array_to_mgr(data, index, columns, dtype, copy):
    """
    Extract from a masked rec array and create the manager.
    """
    # essentially process a record array then fill it
    fill_value = data.fill_value
    fdata = ma.getdata(data)
    if index is None:
        index = get_names_from_index(fdata)
        if index is None:
            index = ibase.default_index(len(data))
    index = ensure_index(index)

    if columns is not None:
        columns = ensure_index(columns)
    arrays, arr_columns = to_arrays(fdata, columns)

    # fill if needed
    new_arrays = []
    for fv, arr, col in zip(fill_value, arrays, arr_columns):
        mask = ma.getmaskarray(data[col])
        if mask.any():
            arr, fv = maybe_upcast(arr, fill_value=fv, copy=True)
            arr[mask] = fv
        new_arrays.append(arr)

    # create the manager
    arrays, arr_columns = reorder_arrays(new_arrays, arr_columns, columns)
    if columns is None:
        columns = arr_columns

    mgr = arrays_to_mgr(arrays, arr_columns, index, columns, dtype)

    if copy:
        mgr = mgr.copy()
    return mgr
def _init_dict(self, data, index, columns, dtype=None):
    # pre-filter out columns if we passed it
    if columns is not None:
        columns = ensure_index(columns)
        data = {k: v for k, v in compat.iteritems(data) if k in columns}
    else:
        keys = com.dict_keys_to_ordered_list(data)
        columns = Index(keys)

    if index is None:
        index = extract_index(list(data.values()))

    def sp_maker(x):
        return SparseArray(x, kind=self._default_kind,
                           fill_value=self._default_fill_value,
                           copy=True, dtype=dtype)

    sdict = {}
    for k, v in compat.iteritems(data):
        if isinstance(v, Series):
            # Force alignment, no copy necessary
            if not v.index.equals(index):
                v = v.reindex(index)

            if not isinstance(v, SparseSeries):
                v = sp_maker(v.values)
        elif isinstance(v, SparseArray):
            v = v.copy()
        else:
            if isinstance(v, dict):
                v = [v.get(i, np.nan) for i in index]

            v = sp_maker(v)

        if index is not None and len(v) != len(index):
            msg = "Length of passed values is {}, index implies {}"
            raise ValueError(msg.format(len(v), len(index)))
        sdict[k] = v

    if len(columns.difference(sdict)):
        # TODO: figure out how to handle this case, all nan's?
        # add in any other columns we want to have (completeness)
        nan_arr = np.empty(len(index), dtype='float64')
        nan_arr.fill(np.nan)

        nan_arr = SparseArray(nan_arr, kind=self._default_kind,
                              fill_value=self._default_fill_value,
                              copy=False)
        sdict.update((c, nan_arr) for c in columns if c not in sdict)

    return to_manager(sdict, columns, index)
def _get_concat_axis(self):
    """
    Return index to be used along concatenation axis.
    """
    if self._is_series:
        if self.axis == 0:
            indexes = [x.index for x in self.objs]
        elif self.ignore_index:
            idx = ibase.default_index(len(self.objs))
            return idx
        elif self.keys is None:
            names = [None] * len(self.objs)
            num = 0
            has_names = False
            for i, x in enumerate(self.objs):
                if not isinstance(x, Series):
                    raise TypeError(
                        "Cannot concatenate type 'Series' "
                        "with object of type {type!r}".format(type=type(x).__name__)
                    )
                if x.name is not None:
                    names[i] = x.name
                    has_names = True
                else:
                    names[i] = num
                    num += 1
            if has_names:
                return Index(names)
            else:
                return ibase.default_index(len(self.objs))
        else:
            return ensure_index(self.keys).set_names(self.names)
    else:
        indexes = [x._data.axes[self.axis] for x in self.objs]

    if self.ignore_index:
        idx = ibase.default_index(sum(len(i) for i in indexes))
        return idx

    if self.keys is None:
        concat_axis = _concat_indexes(indexes)
    else:
        concat_axis = _make_concat_multiindex(
            indexes, self.keys, self.levels, self.names
        )

    self._maybe_check_integrity(concat_axis)

    return concat_axis
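# Illustration (public-API demo, not from the original source): the three
# concat-axis strategies above -- Series names, a default integer index, or
# explicit keys -- driven through pd.concat.
import pandas as pd

s1 = pd.Series([1], name="x")
s2 = pd.Series([2], name="y")
print(pd.concat([s1, s2], axis=1).columns.tolist())                     # ['x', 'y']
print(pd.concat([s1, s2], axis=1, ignore_index=True).columns.tolist())  # [0, 1]
print(pd.concat([s1, s2], axis=1, keys=["k1", "k2"]).columns.tolist())  # ['k1', 'k2']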
def extract_index(data):
    index = None
    if len(data) == 0:
        index = Index([])
    elif len(data) > 0:
        raw_lengths = []
        indexes = []

        have_raw_arrays = False
        have_series = False
        have_dicts = False

        for val in data:
            if isinstance(val, ABCSeries):
                have_series = True
                indexes.append(val.index)
            elif isinstance(val, dict):
                have_dicts = True
                indexes.append(list(val.keys()))
            elif is_list_like(val) and getattr(val, 'ndim', 1) == 1:
                have_raw_arrays = True
                raw_lengths.append(len(val))

        if not indexes and not raw_lengths:
            raise ValueError('If using all scalar values, you must pass'
                             ' an index')

        if have_series or have_dicts:
            index = _union_indexes(indexes)

        if have_raw_arrays:
            lengths = list(set(raw_lengths))
            if len(lengths) > 1:
                raise ValueError('arrays must all be same length')

            if have_dicts:
                raise ValueError('Mixing dicts with non-Series may lead to '
                                 'ambiguous ordering.')

            if have_series:
                if lengths[0] != len(index):
                    msg = ('array length {length} does not match index '
                           'length {idx_len}'
                           .format(length=lengths[0], idx_len=len(index)))
                    raise ValueError(msg)
            else:
                index = ibase.default_index(lengths[0])

    return ensure_index(index)
def __set_index(self, index):
    if self._index is None:
        if not isinstance(index, list):
            from pandas.core.index import ensure_index
            index = ensure_index(index)
        self._index = index
    else:
        index = self._validate_set_axis(index, self._index)
        self._index = index

    if not isinstance(self._index, BaseIndex):
        storage = self._runtime.create_storage(len(self._index))
        self._index = self._runtime.create_index_from_pandas(
            storage, self._index
        )
def extract_index(data):
    index = None
    if len(data) == 0:
        index = Index([])
    elif len(data) > 0:
        raw_lengths = []
        indexes = []

        have_raw_arrays = False
        have_series = False
        have_dicts = False

        for v in data:
            if isinstance(v, ABCSeries):
                have_series = True
                indexes.append(v.index)
            elif isinstance(v, dict):
                have_dicts = True
                indexes.append(list(v.keys()))
            elif is_list_like(v) and getattr(v, 'ndim', 1) == 1:
                have_raw_arrays = True
                raw_lengths.append(len(v))

        if not indexes and not raw_lengths:
            raise ValueError('If using all scalar values, you must pass'
                             ' an index')

        if have_series or have_dicts:
            index = _union_indexes(indexes)

        if have_raw_arrays:
            lengths = list(set(raw_lengths))
            if len(lengths) > 1:
                raise ValueError('arrays must all be same length')

            if have_dicts:
                raise ValueError('Mixing dicts with non-Series may lead to '
                                 'ambiguous ordering.')

            if have_series:
                if lengths[0] != len(index):
                    msg = ('array length %d does not match index length %d' %
                           (lengths[0], len(index)))
                    raise ValueError(msg)
            else:
                index = ibase.default_index(lengths[0])

    return ensure_index(index)
def _get_concat_axis(self):
    """
    Return index to be used along concatenation axis.
    """
    if self._is_series:
        if self.axis == 0:
            indexes = [x.index for x in self.objs]
        elif self.ignore_index:
            idx = ibase.default_index(len(self.objs))
            return idx
        elif self.keys is None:
            names = [None] * len(self.objs)
            num = 0
            has_names = False
            for i, x in enumerate(self.objs):
                if not isinstance(x, Series):
                    raise TypeError("Cannot concatenate type 'Series' "
                                    "with object of type {type!r}"
                                    .format(type=type(x).__name__))
                if x.name is not None:
                    names[i] = x.name
                    has_names = True
                else:
                    names[i] = num
                    num += 1
            if has_names:
                return Index(names)
            else:
                return ibase.default_index(len(self.objs))
        else:
            return ensure_index(self.keys).set_names(self.names)
    else:
        indexes = [x._data.axes[self.axis] for x in self.objs]

    if self.ignore_index:
        idx = ibase.default_index(sum(len(i) for i in indexes))
        return idx

    if self.keys is None:
        concat_axis = _concat_indexes(indexes)
    else:
        concat_axis = _make_concat_multiindex(indexes, self.keys,
                                              self.levels, self.names)

    self._maybe_check_integrity(concat_axis)

    return concat_axis
def __init__(
    self,
    bins,
    binlabels,
    filter_empty: bool = False,
    mutated: bool = False,
    indexer=None,
):
    self.bins = ensure_int64(bins)
    self.binlabels = ensure_index(binlabels)
    self._filter_empty_groups = filter_empty
    self.mutated = mutated
    self.indexer = indexer

    # These lengths must match, otherwise we could call agg_series
    # with empty self.bins, which would raise in libreduction.
    assert len(self.binlabels) == len(self.bins)
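# Illustration (hedged, not from the original source): resampling is one
# public path that builds such a bin-based grouper internally -- bin edges
# plus bin labels of equal length, as the assert above requires.
import pandas as pd

s = pd.Series(range(4), index=pd.date_range("2020-01-01", periods=4, freq="D"))
print(s.resample("2D").sum())  # two bins, labeled 2020-01-01 and 2020-01-03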
def init_dict(data, index, columns, dtype=None):
    """
    Segregate Series based on type and coerce into matrices.
    Needs to handle a lot of exceptional cases.
    """
    if columns is not None:
        from pandas.core.series import Series

        arrays = Series(data, index=columns, dtype=object)
        data_names = arrays.index

        missing = arrays.isna()
        if index is None:
            # GH10856
            # raise ValueError if only scalars in dict
            index = extract_index(arrays[~missing])
        else:
            index = ensure_index(index)

        # no obvious "empty" int column
        if missing.any() and not is_integer_dtype(dtype):
            if dtype is None or np.issubdtype(dtype, np.flexible):
                # GH#1783
                nan_dtype = object
            else:
                nan_dtype = dtype
            val = construct_1d_arraylike_from_scalar(np.nan, len(index),
                                                     nan_dtype)
            arrays.loc[missing] = [val] * missing.sum()

    else:
        keys = com.dict_keys_to_ordered_list(data)
        columns = data_names = Index(keys)
        arrays = (com.maybe_iterable_to_list(data[k]) for k in keys)
        # GH#24096 need copy to be deep for datetime64tz case
        # TODO: See if we can avoid these copies
        arrays = [
            arr if not isinstance(arr, ABCIndexClass) else arr._data
            for arr in arrays
        ]
        arrays = [
            arr if not is_datetime64tz_dtype(arr) else arr.copy()
            for arr in arrays
        ]
    return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
def _validate_set_axis(self, new_labels, old_labels):
    """Validates the index or columns replacement against the old labels.

    Args:
        new_labels: The labels to replace with.
        old_labels: The labels to replace.

    Returns:
        The validated labels.
    """
    new_labels = ensure_index(new_labels)
    old_len = len(old_labels)
    new_len = len(new_labels)
    if old_len != new_len:
        raise ValueError("Length mismatch: Expected axis has %d elements, "
                         "new values have %d elements" % (old_len, new_len))
    return new_labels
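# Illustration (public-API demo, not from the original source): the same
# length check guards axis assignment on a regular pandas DataFrame.
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
try:
    df.index = [0, 1]  # 2 labels for 3 rows
except ValueError as err:
    print(err)  # Length mismatch: Expected axis has 3 elements, new values have 2 elements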
def init_dict(data, index, columns, dtype=None):
    """
    Segregate Series based on type and coerce into matrices.
    Needs to handle a lot of exceptional cases.
    """
    if columns is not None:
        from pandas.core.series import Series
        arrays = Series(data, index=columns, dtype=object)
        data_names = arrays.index

        missing = arrays.isnull()
        if index is None:
            # GH10856
            # raise ValueError if only scalars in dict
            index = extract_index(arrays[~missing])
        else:
            index = ensure_index(index)

        # no obvious "empty" int column
        if missing.any() and not is_integer_dtype(dtype):
            if dtype is None or np.issubdtype(dtype, np.flexible):
                # GH#1783
                nan_dtype = object
            else:
                nan_dtype = dtype
            val = construct_1d_arraylike_from_scalar(np.nan, len(index),
                                                     nan_dtype)
            arrays.loc[missing] = [val] * missing.sum()

    else:
        for key in data:
            if (isinstance(data[key], ABCDatetimeIndex) and
                    data[key].tz is not None):
                # GH#24096 need copy to be deep for datetime64tz case
                # TODO: See if we can avoid these copies
                data[key] = data[key].copy(deep=True)

        keys = com.dict_keys_to_ordered_list(data)
        columns = data_names = Index(keys)
        arrays = [data[k] for k in keys]

    return arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype)
def _make_concat_multiindex(indexes, keys, levels=None, names=None):

    if ((levels is None and isinstance(keys[0], tuple)) or
            (levels is not None and len(levels) > 1)):
        zipped = compat.lzip(*keys)
        if names is None:
            names = [None] * len(zipped)

        if levels is None:
            _, levels = _factorize_from_iterables(zipped)
        else:
            levels = [ensure_index(x) for x in levels]
    else:
        zipped = [keys]
        if names is None:
            names = [None]

        if levels is None:
            levels = [ensure_index(keys)]
        else:
            levels = [ensure_index(x) for x in levels]

    if not _all_indexes_same(indexes):
        codes_list = []

        # things are potentially different sizes, so compute the exact codes
        # for each level and pass those to MultiIndex.from_arrays

        for hlevel, level in zip(zipped, levels):
            to_concat = []
            for key, index in zip(hlevel, indexes):
                try:
                    i = level.get_loc(key)
                except KeyError:
                    raise ValueError('Key {key!s} not in level {level!s}'
                                     .format(key=key, level=level))

                to_concat.append(np.repeat(i, len(index)))
            codes_list.append(np.concatenate(to_concat))

        concat_index = _concat_indexes(indexes)

        # these go at the end
        if isinstance(concat_index, MultiIndex):
            levels.extend(concat_index.levels)
            codes_list.extend(concat_index.codes)
        else:
            codes, categories = _factorize_from_iterable(concat_index)
            levels.append(categories)
            codes_list.append(codes)

        if len(names) == len(levels):
            names = list(names)
        else:
            # make sure that all of the passed indices have the same nlevels
            if not len({idx.nlevels for idx in indexes}) == 1:
                raise AssertionError("Cannot concat indices that do"
                                     " not have the same number of levels")

            # also copies
            names = names + _get_consensus_names(indexes)

        return MultiIndex(levels=levels, codes=codes_list, names=names,
                          verify_integrity=False)

    new_index = indexes[0]
    n = len(new_index)
    kpieces = len(indexes)

    # also copies
    new_names = list(names)
    new_levels = list(levels)

    # construct codes
    new_codes = []

    # do something a bit more speedy

    for hlevel, level in zip(zipped, levels):
        hlevel = ensure_index(hlevel)
        mapped = level.get_indexer(hlevel)

        mask = mapped == -1
        if mask.any():
            raise ValueError('Values not found in passed level: {hlevel!s}'
                             .format(hlevel=hlevel[mask]))

        new_codes.append(np.repeat(mapped, n))

    if isinstance(new_index, MultiIndex):
        new_levels.extend(new_index.levels)
        new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes])
    else:
        new_levels.append(new_index)
        new_codes.append(np.tile(np.arange(n), kpieces))

    if len(new_names) < len(new_levels):
        new_names.extend(new_index.names)

    return MultiIndex(levels=new_levels, codes=new_codes, names=new_names,
                      verify_integrity=False)
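# Illustration (public-API demo, not from the original source): passing keys
# to pd.concat routes through _make_concat_multiindex and yields a MultiIndex;
# a key absent from an explicit `levels` raises a ValueError as above.
import pandas as pd

s1 = pd.Series([1, 2])
s2 = pd.Series([3, 4])
out = pd.concat([s1, s2], keys=["x", "y"])
print(out.index.tolist())  # [('x', 0), ('x', 1), ('y', 0), ('y', 1)]
try:
    pd.concat([s1, s2], keys=["x", "z"], levels=[["x", "y"]])
except ValueError as err:
    print(err)  # 'z' is not in the passed level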
def __init__(self, data=None, index=None, sparse_index=None, kind='block',
             fill_value=None, name=None, dtype=None, copy=False,
             fastpath=False):

    # we are called internally, so short-circuit
    if fastpath:
        # data is an ndarray, index is defined
        if not isinstance(data, SingleBlockManager):
            data = SingleBlockManager(data, index, fastpath=True)
        if copy:
            data = data.copy()

    else:
        if data is None:
            data = []

        if isinstance(data, Series) and name is None:
            name = data.name

        if isinstance(data, SparseArray):
            if index is not None:
                assert (len(index) == len(data))
            sparse_index = data.sp_index
            if fill_value is None:
                fill_value = data.fill_value

            data = np.asarray(data)

        elif isinstance(data, SparseSeries):
            if index is None:
                index = data.index.view()
            if fill_value is None:
                fill_value = data.fill_value
            # extract the SingleBlockManager
            data = data._data

        elif isinstance(data, (Series, dict)):
            data = Series(data, index=index)
            index = data.index.view()

            res = make_sparse(data, kind=kind, fill_value=fill_value)
            data, sparse_index, fill_value = res

        elif isinstance(data, (tuple, list, np.ndarray)):
            # array-like
            if sparse_index is None:
                res = make_sparse(data, kind=kind, fill_value=fill_value)
                data, sparse_index, fill_value = res
            else:
                assert (len(data) == sparse_index.npoints)

        elif isinstance(data, SingleBlockManager):
            if dtype is not None:
                data = data.astype(dtype)
            if index is None:
                index = data.index.view()
            elif not data.index.equals(index) or copy:  # pragma: no cover
                # GH#19275 SingleBlockManager input should only be called
                # internally
                raise AssertionError('Cannot pass both SingleBlockManager '
                                     '`data` argument and a different '
                                     '`index` argument. `copy` must '
                                     'be False.')

        else:
            length = len(index)

            if data == fill_value or (isna(data) and isna(fill_value)):
                if kind == 'block':
                    sparse_index = BlockIndex(length, [], [])
                else:
                    sparse_index = IntIndex(length, [])
                data = np.array([])

            else:
                if kind == 'block':
                    locs, lens = ([0], [length]) if length else ([], [])
                    sparse_index = BlockIndex(length, locs, lens)
                else:
                    sparse_index = IntIndex(length, index)
                v = data
                data = np.empty(length)
                data.fill(v)

        if index is None:
            index = ibase.default_index(sparse_index.length)
        index = ensure_index(index)

        # create/copy the manager
        if isinstance(data, SingleBlockManager):
            if copy:
                data = data.copy()
        else:
            # create a sparse array
            if not isinstance(data, SparseArray):
                data = SparseArray(data, sparse_index=sparse_index,
                                   fill_value=fill_value, dtype=dtype,
                                   copy=copy)

            data = SingleBlockManager(data, index)

    generic.NDFrame.__init__(self, data)

    self.index = index
    self.name = name
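# Illustration (hedged: SparseSeries belongs to pre-1.0 pandas and was removed
# in 1.0): the constructor above sparsifies its input around fill_value, so
# only non-fill values are stored. A pre-1.0 sketch:
import pandas as pd

ss = pd.SparseSeries([0.0, 1.0, 0.0, 2.0], fill_value=0.0, kind="block")
print(ss.sp_values)  # array([1., 2.]): the zeros are implicit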
def __init__(self, data=None, index=None, columns=None, default_kind=None,
             default_fill_value=None, dtype=None, copy=False):

    # pick up the defaults from the Sparse structures
    if isinstance(data, SparseDataFrame):
        if index is None:
            index = data.index
        if columns is None:
            columns = data.columns
        if default_fill_value is None:
            default_fill_value = data.default_fill_value
        if default_kind is None:
            default_kind = data.default_kind
    elif isinstance(data, (SparseSeries, SparseArray)):
        if index is None:
            index = data.index
        if default_fill_value is None:
            default_fill_value = data.fill_value
        if columns is None and hasattr(data, 'name'):
            columns = [data.name]
        if columns is None:
            raise Exception("cannot pass a series w/o a name or columns")
        data = {columns[0]: data}

    if default_fill_value is None:
        default_fill_value = np.nan
    if default_kind is None:
        default_kind = 'block'

    self._default_kind = default_kind
    self._default_fill_value = default_fill_value

    if is_scipy_sparse(data):
        mgr = self._init_spmatrix(data, index, columns, dtype=dtype,
                                  fill_value=default_fill_value)
    elif isinstance(data, dict):
        mgr = self._init_dict(data, index, columns, dtype=dtype)
    elif isinstance(data, (np.ndarray, list)):
        mgr = self._init_matrix(data, index, columns, dtype=dtype)
    elif isinstance(data, SparseDataFrame):
        mgr = self._init_mgr(data._data,
                             dict(index=index, columns=columns),
                             dtype=dtype, copy=copy)
    elif isinstance(data, DataFrame):
        mgr = self._init_dict(data, data.index, data.columns, dtype=dtype)
    elif isinstance(data, Series):
        mgr = self._init_dict(data.to_frame(), data.index,
                              columns=None, dtype=dtype)
    elif isinstance(data, BlockManager):
        mgr = self._init_mgr(data, axes=dict(index=index, columns=columns),
                             dtype=dtype, copy=copy)
    elif data is None:
        data = DataFrame()

        if index is None:
            index = Index([])
        else:
            index = ensure_index(index)

        if columns is None:
            columns = Index([])
        else:
            for c in columns:
                data[c] = SparseArray(np.nan, index=index,
                                      kind=self._default_kind,
                                      fill_value=self._default_fill_value)
        mgr = to_manager(data, columns, index)
        if dtype is not None:
            mgr = mgr.astype(dtype)
    else:
        msg = ('SparseDataFrame called with unknown type "{data_type}" '
               'for data argument')
        raise TypeError(msg.format(data_type=type(data).__name__))

    generic.NDFrame.__init__(self, mgr)
def _make_concat_multiindex(indexes, keys, levels=None, names=None):

    if (levels is None and isinstance(keys[0], tuple)) or (
        levels is not None and len(levels) > 1
    ):
        zipped = list(zip(*keys))
        if names is None:
            names = [None] * len(zipped)

        if levels is None:
            _, levels = _factorize_from_iterables(zipped)
        else:
            levels = [ensure_index(x) for x in levels]
    else:
        zipped = [keys]
        if names is None:
            names = [None]

        if levels is None:
            levels = [ensure_index(keys)]
        else:
            levels = [ensure_index(x) for x in levels]

    if not _all_indexes_same(indexes):
        codes_list = []

        # things are potentially different sizes, so compute the exact codes
        # for each level and pass those to MultiIndex.from_arrays

        for hlevel, level in zip(zipped, levels):
            to_concat = []
            for key, index in zip(hlevel, indexes):
                try:
                    i = level.get_loc(key)
                except KeyError:
                    raise ValueError(
                        "Key {key!s} not in level {level!s}".format(
                            key=key, level=level))

                to_concat.append(np.repeat(i, len(index)))
            codes_list.append(np.concatenate(to_concat))

        concat_index = _concat_indexes(indexes)

        # these go at the end
        if isinstance(concat_index, MultiIndex):
            levels.extend(concat_index.levels)
            codes_list.extend(concat_index.codes)
        else:
            codes, categories = _factorize_from_iterable(concat_index)
            levels.append(categories)
            codes_list.append(codes)

        if len(names) == len(levels):
            names = list(names)
        else:
            # make sure that all of the passed indices have the same nlevels
            if not len({idx.nlevels for idx in indexes}) == 1:
                raise AssertionError("Cannot concat indices that do"
                                     " not have the same number of levels")

            # also copies
            names = names + _get_consensus_names(indexes)

        return MultiIndex(levels=levels, codes=codes_list, names=names,
                          verify_integrity=False)

    new_index = indexes[0]
    n = len(new_index)
    kpieces = len(indexes)

    # also copies
    new_names = list(names)
    new_levels = list(levels)

    # construct codes
    new_codes = []

    # do something a bit more speedy

    for hlevel, level in zip(zipped, levels):
        hlevel = ensure_index(hlevel)
        mapped = level.get_indexer(hlevel)

        mask = mapped == -1
        if mask.any():
            raise ValueError(
                "Values not found in passed level: {hlevel!s}".format(
                    hlevel=hlevel[mask]))

        new_codes.append(np.repeat(mapped, n))

    if isinstance(new_index, MultiIndex):
        new_levels.extend(new_index.levels)
        new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes])
    else:
        new_levels.append(new_index)
        new_codes.append(np.tile(np.arange(n), kpieces))

    if len(new_names) < len(new_levels):
        new_names.extend(new_index.names)

    return MultiIndex(levels=new_levels, codes=new_codes, names=new_names,
                      verify_integrity=False)
def __init__(
    self,
    partitions=None,
    index=None,
    columns=None,
    row_lengths=None,
    column_widths=None,
    dtypes=None,
    op=None,
    index_cols=None,
    uses_rowid=False,
    force_execution_mode=None,
):
    assert dtypes is not None

    self.id = str(type(self)._next_id[0])
    type(self)._next_id[0] += 1

    if index is not None:
        index = ensure_index(index)
    columns = ensure_index(columns)
    self._op = op
    self._index_cols = index_cols
    self._partitions = partitions
    self._index_cache = index
    self._columns_cache = columns
    self._row_lengths_cache = row_lengths
    self._column_widths_cache = column_widths
    if self._op is None:
        self._op = FrameNode(self)

    self._table_cols = columns.tolist()
    if self._index_cols is not None:
        self._table_cols = self._index_cols + self._table_cols

    assert len(dtypes) == len(
        self._table_cols
    ), f"unaligned dtypes ({dtypes}) and table columns ({self._table_cols})"
    if isinstance(dtypes, list):
        if self._index_cols is not None:
            # Table stores both index and data columns but those are accessed
            # differently if we have a MultiIndex for columns. To unify access
            # to dtype we extend index column names to tuples to have a
            # MultiIndex of dtypes.
            if isinstance(columns, MultiIndex):
                tail = [""] * (columns.nlevels - 1)
                index_tuples = [(col, *tail) for col in self._index_cols]
                dtype_index = MultiIndex.from_tuples(index_tuples).append(columns)
                self._dtypes = pd.Series(dtypes, index=dtype_index)
            else:
                self._dtypes = pd.Series(dtypes, index=self._table_cols)
        else:
            self._dtypes = pd.Series(dtypes, index=columns)
    else:
        self._dtypes = dtypes

    if partitions is not None:
        self._filter_empties()

    # This frame uses encoding for column names to support exotic
    # (e.g. non-string and reserved words) column names. Encoded
    # names are used in OmniSci tables and corresponding Arrow tables.
    # If we import Arrow table, we have to rename its columns for
    # proper processing.
    if self._has_arrow_table() and self._partitions.size > 0:
        assert self._partitions.size == 1
        table = self._partitions[0][0].get()
        if table.column_names[0] != f"F_{self._table_cols[0]}":
            new_names = [f"F_{col}" for col in table.column_names]
            new_table = table.rename_columns(new_names)
            self._partitions[0][0] = self._frame_mgr_cls._partition_class.put_arrow(
                new_table
            )

    self._uses_rowid = uses_rowid
    # Tests use forced execution mode to take control over frame
    # execution process. Supported values:
    # "lazy" - RuntimeError is raised if execution is triggered for the frame
    # "arrow" - RuntimeError is raised if execution is triggered, but we cannot
    #           execute it using Arrow API (have to use OmniSci for execution)
    self._force_execution_mode = force_execution_mode