def test_periodindex(self):
    """Factorizing a PeriodIndex gives sorted unique levels and matching labels."""
    monthly_2014 = PeriodIndex(['2014-01', '2014-02', '2014-03'], freq='M')
    monthly_2013 = PeriodIndex(['2013-05', '2013-07', '2013-08', '2013-09',
                                '2013-10', '2013-11', '2013-12'], freq='M')

    # (input periods, expected labels, expected levels)
    cases = [
        # already sorted, with repeats
        (['2014-01', '2014-01', '2014-02',
          '2014-02', '2014-03', '2014-03'],
         [0, 0, 1, 1, 2, 2],
         monthly_2014),
        # unsorted, with repeats
        (['2014-03', '2014-03', '2014-02',
          '2014-01', '2014-03', '2014-01'],
         [2, 2, 1, 0, 2, 0],
         monthly_2014),
        # strictly descending, with a gap in the months
        (['2013-12', '2013-11', '2013-10', '2013-09',
          '2013-08', '2013-07', '2013-05'],
         [6, 5, 4, 3, 2, 1, 0],
         monthly_2013),
    ]

    for periods, expected_labels, expected_levels in cases:
        cat = Categorical.from_array(PeriodIndex(periods, freq='M'))
        self.assert_numpy_array_equal(cat.labels, np.array(expected_labels))
        self.assertTrue(cat.levels.equals(expected_levels))
def panel_index(time, panels, names=['time', 'panel']):
    """
    Build a two-level MultiIndex for a panel-like DataFrame

    Parameters
    ----------
    time : array-like
        Time index, does not have to repeat
    panels : array-like
        Panel index, does not have to repeat
    names : list, optional
        List containing the names of the indices

    Returns
    -------
    multi_index : MultiIndex
        Time index is the first level, the panels are the second level.

    Examples
    --------
    >>> years = range(1960,1963)
    >>> panels = ['A', 'B', 'C']
    >>> panel_idx = panel_index(years, panels)
    >>> panel_idx
    MultiIndex([(1960, 'A'), (1961, 'A'), (1962, 'A'), (1960, 'B'),
                (1961, 'B'), (1962, 'B'), (1960, 'C'), (1961, 'C'),
                (1962, 'C')], dtype=object)

    or

    >>> import numpy as np
    >>> years = np.repeat(range(1960,1963), 3)
    >>> panels = np.tile(['A', 'B', 'C'], 3)
    >>> panel_idx = panel_index(years, panels)
    >>> panel_idx
    MultiIndex([(1960, 'A'), (1960, 'B'), (1960, 'C'), (1961, 'A'),
                (1961, 'B'), (1961, 'C'), (1962, 'A'), (1962, 'B'),
                (1962, 'C')], dtype=object)
    """
    time, panels = _ensure_like_indices(time, panels)

    # Factorize each axis; the time factor comes first, then the panel factor.
    factors = [Categorical.from_array(time), Categorical.from_array(panels)]
    labels = [factor.labels for factor in factors]
    levels = [factor.levels for factor in factors]

    return MultiIndex(levels, labels, sortorder=None, names=names,
                      verify_integrity=False)
def test_constructor_unsortable(self):
    # Disabled for now: everything below the raise is unreachable until the
    # skip is removed.
    raise nose.SkipTest('skipping for now')

    mixed = np.array([1, 2, 3, datetime.now()], dtype='O')

    # it works! (smoke test only, the result is not inspected)
    Categorical.from_array(mixed)
def get_result(self):
    """
    Assemble the reshaped DataFrame from the computed pieces.

    Combines the results of ``get_new_values``, ``get_new_columns`` and
    ``get_new_index``. Columns with no observed value are dropped when
    some level values were never observed. If the source data was
    categorical, every output column is rebuilt as a Categorical.

    Returns
    -------
    DataFrame
    """
    # TODO: find a better way than this masking business
    values, value_mask = self.get_new_values()
    columns = self.get_new_columns()
    index = self.get_new_index()

    # filter out missing levels
    if values.shape[1] > 0:
        _, obs_ids = _compress_group_index(self.sorted_labels[-1])
        # rare case, level values not observed
        if len(obs_ids) < self.full_shape[1]:
            inds = (value_mask.sum(0) > 0).nonzero()[0]
            values = com.take_nd(values, inds, axis=1)
            columns = columns[inds]

    # may need to coerce categoricals here
    if self.is_categorical is not None:
        categories = self.is_categorical.categories
        # Carry over the source's orderedness instead of forcing
        # ordered=True, which turned unordered categoricals into ordered
        # ones.
        ordered = self.is_categorical.ordered
        values = [Categorical(values[:, i], categories=categories,
                              ordered=ordered)
                  for i in range(values.shape[-1])]

    return DataFrame(values, index=index, columns=columns)
def get_dummies(data, prefix=None, prefix_sep='_'):
    """
    Convert categorical variable into dummy/indicator variables

    Parameters
    ----------
    data : array-like or Series
    prefix : string, default None
        String to prepend to the DataFrame column names
    prefix_sep : string, default '_'
        If a prefix is given, separator/delimiter placed between the prefix
        and the level value

    Returns
    -------
    dummies : DataFrame
        One column per distinct level of ``data``. The index is
        ``data.index`` when ``data`` is a Series, otherwise the default
        index.
    """
    cat = Categorical.from_array(np.asarray(data))
    # Row i of the identity matrix is the indicator vector for level i.
    dummy_mat = np.eye(len(cat.levels)).take(cat.labels, axis=0)

    if prefix is not None:
        # Let %-formatting convert the level value: an explicit str() call
        # raises on non-ASCII unicode values under Python 2.
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v)
                      for v in cat.levels]
    else:
        dummy_cols = cat.levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def get_result(self):
    """
    Assemble the reshaped DataFrame from the computed pieces.

    Combines the results of ``get_new_values``, ``get_new_columns`` and
    ``get_new_index``. Columns with no observed value are dropped when
    some level values were never observed. If the source data was
    categorical, every output column is rebuilt as a Categorical with the
    source's categories and orderedness.

    Returns
    -------
    DataFrame
    """
    # TODO: find a better way than this masking business
    values, value_mask = self.get_new_values()
    columns = self.get_new_columns()
    index = self.get_new_index()

    # filter out missing levels
    if values.shape[1] > 0:
        _, obs_ids = _compress_group_index(self.sorted_labels[-1])
        # rare case, level values not observed
        if len(obs_ids) < self.full_shape[1]:
            inds = (value_mask.sum(0) > 0).nonzero()[0]
            values = algos.take_nd(values, inds, axis=1)
            columns = columns[inds]

    # may need to coerce categoricals here
    if self.is_categorical is not None:
        categories = self.is_categorical.categories
        ordered = self.is_categorical.ordered
        # Categorical.from_array is a deprecated alias that only forwards
        # to the constructor, so call the constructor directly.
        values = [Categorical(values[:, i], categories=categories,
                              ordered=ordered)
                  for i in range(values.shape[-1])]

    return DataFrame(values, index=index, columns=columns)
def make_axis_dummies(frame, axis='minor', transform=None):
    """
    Build 1-0 indicator columns for the labels of one index level

    Parameters
    ----------
    frame : object whose ``index`` exposes ``levels`` and ``labels``
    axis : {'major', 'minor'}, default 'minor'
        Which index level to use; any other value is used directly as the
        level number
    transform : function, default None
        Function to apply to axis labels first. For example, to
        get "day of week" dummies in a time series regression you might
        call:
            make_axis_dummies(panel, axis='major',
                              transform=lambda d: d.weekday())

    Returns
    -------
    dummies : DataFrame
        Column names taken from chosen axis
    """
    level_number = {'major': 0, 'minor': 1}.get(axis, axis)

    items = frame.index.levels[level_number]
    labels = frame.index.labels[level_number]

    if transform is not None:
        # Re-factorize on the transformed labels so that levels mapping to
        # the same value share one dummy column.
        factor = Categorical.from_array(items.map(transform).take(labels))
        labels, items = factor.labels, factor.levels

    indicator = np.eye(len(items), dtype=float).take(labels, axis=0)
    return DataFrame(indicator, columns=items, index=frame.index)
def make_axis_dummies(frame, axis="minor", transform=None):
    """
    Build 1-0 indicator columns for the labels of one index level

    Parameters
    ----------
    frame : object whose ``index`` exposes ``levels`` and ``labels``
    axis : {'major', 'minor'}, default 'minor'
        Which index level to use; any other value is used directly as the
        level number
    transform : function, default None
        Function to apply to axis labels first. For example, to
        get "day of week" dummies in a time series regression you might
        call:
            make_axis_dummies(panel, axis='major',
                              transform=lambda d: d.weekday())

    Returns
    -------
    dummies : DataFrame
        Column names taken from chosen axis
    """
    axis_to_level = {"major": 0, "minor": 1}
    level_number = axis_to_level.get(axis, axis)

    frame_index = frame.index
    items = frame_index.levels[level_number]
    labels = frame_index.labels[level_number]

    if transform is not None:
        # Factorize the transformed labels; levels that map to the same
        # value collapse into one dummy column.
        transformed = items.map(transform)
        factor = Categorical.from_array(transformed.take(labels))
        labels = factor.labels
        items = factor.levels

    identity = np.eye(len(items), dtype=float)
    return DataFrame(identity.take(labels, axis=0), columns=items,
                     index=frame.index)
def get_dummies(data, prefix=None, prefix_sep='_'):
    """
    Convert categorical variable into dummy/indicator variables

    Parameters
    ----------
    data : array-like or Series
    prefix : string, default None
        String to prepend to the DataFrame column names
    prefix_sep : string, default '_'
        If a prefix is given, separator/delimiter placed between the prefix
        and the level value

    Returns
    -------
    dummies : DataFrame
        One indicator column per distinct level of ``data``; indexed by
        ``data.index`` when ``data`` is a Series.
    """
    cat = Categorical.from_array(np.asarray(data))
    # Selecting rows of the identity matrix by label gives the indicators.
    dummy_mat = np.eye(len(cat.levels)).take(cat.labels, axis=0)

    if prefix is not None:
        # No explicit str(): %-formatting handles the conversion and, unlike
        # str(), does not raise on non-ASCII unicode under Python 2.
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v)
                      for v in cat.levels]
    else:
        dummy_cols = cat.levels

    index = data.index if isinstance(data, Series) else None

    return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
                    sparse=False):
    """
    Build the dummy/indicator frame for a single 1-d input.

    Parameters
    ----------
    data : array-like or Series
    prefix : string or None
        Prepended (with ``prefix_sep``) to every column name when not None
    prefix_sep : string, default '_'
    dummy_na : bool, default False
        Add a trailing column flagging missing values; when False, rows
        with a missing value are all zero
    sparse : bool, default False
        Return a SparseDataFrame instead of a DataFrame

    Returns
    -------
    DataFrame or SparseDataFrame
    """
    # Series avoids inconsistent NaN handling.
    # Categorical.from_array is a deprecated alias that only forwards to
    # the constructor, so call the constructor directly.
    cat = Categorical(Series(data), ordered=True)
    levels = cat.categories

    # if all NaN
    if not dummy_na and len(levels) == 0:
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        if not sparse:
            return DataFrame(index=index)
        else:
            return SparseDataFrame(index=index)

    # copy so that the categorical's own codes are never modified
    codes = cat.codes.copy()
    if dummy_na:
        # missing values (code -1) get their own trailing NaN level
        codes[codes == -1] = len(cat.categories)
        levels = np.append(cat.categories, np.nan)

    number_of_cols = len(levels)

    if prefix is not None:
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) for v in levels]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        sparse_series = {}
        N = len(data)
        # sp_indices[k] collects the row positions whose code is k
        sp_indices = [[] for _ in range(len(dummy_cols))]
        for ndx, code in enumerate(codes):
            if code == -1:
                # Blank entries if not dummy_na and code == -1, #GH4446
                continue
            sp_indices[code].append(ndx)

        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(np.ones(len(ixs)),
                               sparse_index=IntIndex(N, ixs), fill_value=0)
            sparse_series[col] = SparseSeries(data=sarr, index=index)

        return SparseDataFrame(sparse_series, index=index,
                               columns=dummy_cols)

    else:
        dummy_mat = np.eye(number_of_cols).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446 (take() maps code -1 onto the last row)
            dummy_mat[codes == -1] = 0

        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
                    sparse=False):
    """
    Build the dummy/indicator frame for a single 1-d input.

    Parameters
    ----------
    data : array-like or Series
    prefix : string or None
        Prepended (with ``prefix_sep``) to every column name when not None
    prefix_sep : string, default '_'
    dummy_na : bool, default False
        Add a trailing column flagging missing values; when False, rows
        with a missing value are all zero
    sparse : bool, default False
        Return a SparseDataFrame instead of a DataFrame

    Returns
    -------
    DataFrame or SparseDataFrame
    """
    # Series avoids inconsistent NaN handling.
    # The constructor replaces the deprecated Categorical.from_array alias,
    # which did nothing but forward to it.
    cat = Categorical(Series(data), ordered=True)
    levels = cat.categories

    # if all NaN
    if not dummy_na and len(levels) == 0:
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        if not sparse:
            return DataFrame(index=index)
        else:
            return SparseDataFrame(index=index)

    # work on a copy; the categorical's own codes stay untouched
    codes = cat.codes.copy()
    if dummy_na:
        # route missing values (code -1) to an extra trailing NaN level
        codes[codes == -1] = len(cat.categories)
        levels = np.append(cat.categories, np.nan)

    number_of_cols = len(levels)

    if prefix is not None:
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) for v in levels]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        sparse_series = {}
        N = len(data)
        # one list of row positions per output column
        sp_indices = [[] for _ in range(len(dummy_cols))]
        for ndx, code in enumerate(codes):
            if code == -1:
                # Blank entries if not dummy_na and code == -1, #GH4446
                continue
            sp_indices[code].append(ndx)

        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(np.ones(len(ixs)),
                               sparse_index=IntIndex(N, ixs), fill_value=0)
            sparse_series[col] = SparseSeries(data=sarr, index=index)

        return SparseDataFrame(sparse_series, index=index,
                               columns=dummy_cols)

    else:
        dummy_mat = np.eye(number_of_cols).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446 (take() maps code -1 onto the last row)
            dummy_mat[codes == -1] = 0

        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def get_result(self):
    """
    Assemble the reshaped DataFrame from the computed pieces.

    Combines the results of ``get_new_values``, ``get_new_columns`` and
    ``get_new_index``. Columns with no observed value are dropped when
    some level values were never observed. When the new index and the
    values disagree in length, the values are padded with NaN rows at the
    null index positions, or the index is restricted to
    ``self.unique_groups`` if it has no nulls. If the source data was
    categorical, every output column is rebuilt as a Categorical.

    Returns
    -------
    DataFrame
    """
    # TODO: find a better way than this masking business
    values, value_mask = self.get_new_values()
    columns = self.get_new_columns()
    index = self.get_new_index()

    # filter out missing levels
    if values.shape[1] > 0:
        _, obs_ids = _compress_group_index(self.sorted_labels[-1])
        # rare case, level values not observed
        if len(obs_ids) < self.full_shape[1]:
            inds = (value_mask.sum(0) > 0).nonzero()[0]
            values = com.take_nd(values, inds, axis=1)
            columns = columns[inds]

    # we might have a missing index
    if len(index) != values.shape[0]:
        mask = isnull(index)
        if mask.any():
            positions = np.arange(len(index))
            # Allocate a NaN-filled block with one row per index entry and
            # copy the existing rows into the non-null positions, in order.
            values, orig_values = (np.empty((len(index), values.shape[1])),
                                   values)
            values.fill(np.nan)
            values_indexer = com._ensure_int64(positions[~mask])
            for i, j in enumerate(values_indexer):
                values[j] = orig_values[i]
        else:
            index = index.take(self.unique_groups)

    # may need to coerce categoricals here
    if self.is_categorical is not None:
        categories = self.is_categorical.categories
        # Keep the source's orderedness instead of letting the constructor
        # re-infer it from the values.
        ordered = self.is_categorical.ordered
        values = [Categorical(values[:, i], categories=categories,
                              ordered=ordered)
                  for i in range(values.shape[-1])]

    return DataFrame(values, index=index, columns=columns)
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False):
    """
    Build the dummy/indicator DataFrame for a single 1-d input.

    Parameters
    ----------
    data : array-like or Series
    prefix : string or None
        Prepended (with ``prefix_sep``) to every column name when not None
    prefix_sep : string, default '_'
    dummy_na : bool, default False
        Add a trailing column flagging missing values; when False, rows
        with a missing value are all zero

    Returns
    -------
    DataFrame
    """
    # Series avoids inconsistent NaN handling.
    # Categorical.from_array is a deprecated alias that only forwards to
    # the constructor, so call the constructor directly.
    cat = Categorical(Series(data))
    levels = cat.categories

    # if all NaN
    if not dummy_na and len(levels) == 0:
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    number_of_cols = len(levels)
    if dummy_na:
        number_of_cols += 1

    # With dummy_na the extra last column is the NaN indicator: take()
    # maps the missing-value code -1 onto the last identity row.
    dummy_mat = np.eye(number_of_cols).take(cat.codes, axis=0)

    if dummy_na:
        levels = np.append(cat.categories, np.nan)
    else:
        # reset NaN GH4446
        dummy_mat[cat.codes == -1] = 0

    if prefix is not None:
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) for v in levels]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def sparse_dummies(categorical_values):
    """
    One-hot encode ``categorical_values`` into a sparse CSR matrix.

    Parameters
    ----------
    categorical_values : array-like
        Values to encode; they are factorized with ``Categorical``.

    Returns
    -------
    scipy.sparse.csr_matrix
        Shape ``(len(categorical_values), number of categories)``. Entry
        ``(i, k)`` is 1.0 when value ``i`` belongs to category ``k``. Rows
        for missing values contain no entries.
    """
    categories = Categorical(categorical_values)
    codes = np.asarray(categories.codes)
    N = len(codes)

    # Missing values are coded -1 and csr_matrix rejects negative column
    # indices, so those rows are left without entries.
    observed = codes >= 0
    row_numbers = np.arange(N, dtype=int)[observed]
    ones = np.ones((int(observed.sum()), ))

    # Pass the shape explicitly: csr_matrix cannot infer it from empty
    # index arrays, and inference would drop trailing all-missing rows.
    shape = (N, len(categories.categories))
    return csr_matrix((ones, (row_numbers, codes[observed])), shape=shape)
def _make_concat_multiindex(indexes, keys, levels=None, names=None):
    """
    Build the MultiIndex for a keyed concatenation.

    The outer level(s) come from ``keys`` (several levels when the keys
    are tuples or several ``levels`` are passed); the inner level(s) come
    from the concatenated ``indexes``.

    Parameters
    ----------
    indexes : list of Index
        Index of each concatenated piece
    keys : sequence
        One key (or tuple of keys) per piece
    levels : list of sequences, default None
        Explicit levels for the key part; derived from ``keys`` when None
    names : list, default None
        Names for the resulting levels

    Returns
    -------
    MultiIndex

    Raises
    ------
    ValueError
        If a key is not found in the corresponding level
    AssertionError
        If the pieces differ in their number of index levels and names
        have to be derived from them
    """
    if ((levels is None and isinstance(keys[0], tuple)) or
            (levels is not None and len(levels) > 1)):
        zipped = lzip(*keys)
        if names is None:
            names = [None] * len(zipped)

        if levels is None:
            # Categorical.from_array is a deprecated alias that only
            # forwards to the constructor.
            levels = [Categorical(zp, ordered=True).categories
                      for zp in zipped]
        else:
            levels = [_ensure_index(x) for x in levels]
    else:
        zipped = [keys]
        if names is None:
            names = [None]

        if levels is None:
            levels = [_ensure_index(keys)]
        else:
            levels = [_ensure_index(x) for x in levels]

    if not _all_indexes_same(indexes):
        label_list = []

        # things are potentially different sizes, so compute the exact labels
        # for each level and pass those to MultiIndex.from_arrays

        for hlevel, level in zip(zipped, levels):
            to_concat = []
            for key, index in zip(hlevel, indexes):
                try:
                    i = level.get_loc(key)
                except KeyError:
                    raise ValueError('Key %s not in level %s'
                                     % (str(key), str(level)))

                to_concat.append(np.repeat(i, len(index)))
            label_list.append(np.concatenate(to_concat))

        concat_index = _concat_indexes(indexes)

        # these go at the end
        if isinstance(concat_index, MultiIndex):
            levels.extend(concat_index.levels)
            label_list.extend(concat_index.labels)
        else:
            factor = Categorical(concat_index, ordered=True)
            levels.append(factor.categories)
            label_list.append(factor.codes)

        if len(names) == len(levels):
            names = list(names)
        else:
            # make sure that all of the passed indices have the same nlevels
            if len(set(idx.nlevels for idx in indexes)) != 1:
                raise AssertionError("Cannot concat indices that do"
                                     " not have the same number of levels")

            # also copies
            names = names + _get_consensus_names(indexes)

        return MultiIndex(levels=levels, labels=label_list, names=names,
                          verify_integrity=False)

    # All pieces share the same index, so the labels can be built by
    # repeating/tiling instead of looking up each key per piece.
    new_index = indexes[0]
    n = len(new_index)
    kpieces = len(indexes)

    # also copies
    new_names = list(names)
    new_levels = list(levels)

    # construct labels
    new_labels = []

    # do something a bit more speedy

    for hlevel, level in zip(zipped, levels):
        hlevel = _ensure_index(hlevel)
        mapped = level.get_indexer(hlevel)

        mask = mapped == -1
        if mask.any():
            raise ValueError('Values not found in passed level: %s'
                             % str(hlevel[mask]))

        new_labels.append(np.repeat(mapped, n))

    if isinstance(new_index, MultiIndex):
        new_levels.extend(new_index.levels)
        new_labels.extend([np.tile(lab, kpieces)
                           for lab in new_index.labels])
    else:
        new_levels.append(new_index)
        new_labels.append(np.tile(np.arange(n), kpieces))

    if len(new_names) < len(new_levels):
        new_names.extend(new_index.names)

    return MultiIndex(levels=new_levels, labels=new_labels, names=new_names,
                      verify_integrity=False)
def setUp(self):
    """Create the categorical fixture stored on ``self.factor``."""
    values = ["a", "b", "b", "a", "a", "c", "c", "c"]
    self.factor = Categorical.from_array(values)
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False,
                    sparse=False, drop_first=False):
    """
    Build the uint8 dummy/indicator frame for a single 1-d input.

    Parameters
    ----------
    data : array-like or Series
    prefix : string or None
        Prepended (with ``prefix_sep``) to every column name when not None
    prefix_sep : string, default '_'
    dummy_na : bool, default False
        Add a trailing column flagging missing values; when False, rows
        with a missing value are all zero
    sparse : bool, default False
        Return a SparseDataFrame instead of a DataFrame
    drop_first : bool, default False
        Drop the first level's column, giving k-1 dummies for k levels

    Returns
    -------
    DataFrame or SparseDataFrame
    """
    # Series avoids inconsistent NaN handling.
    # Categorical.from_array is a deprecated alias that only forwards to
    # the constructor, so call the constructor directly.
    cat = Categorical(Series(data), ordered=True)
    levels = cat.categories

    def get_empty_Frame(data, sparse):
        # column-less frame that still carries one row per input value
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        if not sparse:
            return DataFrame(index=index)
        else:
            return SparseDataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_Frame(data, sparse)

    # copy so that the categorical's own codes are never modified
    codes = cat.codes.copy()
    if dummy_na:
        codes[codes == -1] = len(cat.categories)
        levels = np.append(cat.categories, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_Frame(data, sparse)

    number_of_cols = len(levels)

    if prefix is not None:
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) for v in levels]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        sparse_series = {}
        N = len(data)
        # sp_indices[k] collects the row positions whose code is k
        sp_indices = [[] for _ in range(len(dummy_cols))]
        for ndx, code in enumerate(codes):
            if code == -1:
                # Blank entries if not dummy_na and code == -1, #GH4446
                continue
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(np.ones(len(ixs), dtype=np.uint8),
                               sparse_index=IntIndex(N, ixs), fill_value=0,
                               dtype=np.uint8)
            sparse_series[col] = SparseSeries(data=sarr, index=index)

        out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols,
                              dtype=np.uint8)
        return out

    else:
        dummy_mat = np.eye(number_of_cols, dtype=np.uint8).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446 (take() maps code -1 onto the last row)
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def data(self, convert_dates=True, convert_categoricals=True, index=None):
    """
    Read all observations from the Stata file into a DataFrame

    Parameters
    ----------
    convert_dates : boolean, defaults to True
        Convert date variables to DataFrame time values
    convert_categoricals : boolean, defaults to True
        Read value labels and convert columns to Categorical/Factor
        variables
    index : identifier of index column
        identifier of column that should be used as index of the DataFrame

    Returns
    -------
    y : DataFrame instance

    Raises
    ------
    Exception
        If the data has already been read from this reader
    """
    if self._data_read:
        raise Exception("Data has already been read.")
    self._data_read = True

    if self.format_version >= 117:
        self._read_strls()

    rows = []
    for record in self._dataset():
        # doesn't handle missing value objects, just casts
        # None will only work without missing value object.
        for pos, value in enumerate(record):
            # NOTE: This will only be scalar types because missing strings
            # are empty not None in Stata
            if value is None:
                record[pos] = np.nan
        rows.append(tuple(record))

    if convert_categoricals:
        self._read_value_labels()

    frame = DataFrame(rows, columns=self.varlist, index=index)

    # Re-cast every non-object column that has a dtype entry.
    for i in np.where(self.dtyplist)[0]:
        if self.dtyplist[i] is not None:
            name = frame.columns[i]
            if frame[name].dtype is not np.dtype(object):
                frame[name] = Series(frame[name], frame[name].index,
                                     self.dtyplist[i])

    if convert_dates:
        is_date = lmap(lambda x: x in _date_formats, self.fmtlist)
        for i in np.where(is_date)[0]:
            name = frame.columns[i]
            frame[name] = frame[name].apply(_stata_elapsed_date_to_datetime,
                                            args=(self.fmtlist[i],))

    if convert_categoricals:
        has_labels = lmap(
            lambda x: x in compat.iterkeys(self.value_label_dict),
            self.lbllist)
        for i in np.where(has_labels)[0]:
            name = frame.columns[i]
            # Substitute each coded value by its label on an object copy,
            # then factorize the labelled column.
            labeled = np.copy(frame[name]).astype(object)
            value_labels = self.value_label_dict[self.lbllist[i]]
            for code, text in compat.iteritems(value_labels):
                labeled[(frame[name] == code).values] = text
            frame[name] = Categorical.from_array(labeled)

    return frame
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False):
    """
    Convert categorical variable into dummy/indicator variables

    Parameters
    ----------
    data : array-like or Series
    prefix : string, default None
        String to prepend to the DataFrame column names
    prefix_sep : string, default '_'
        If a prefix is given, separator/delimiter placed between the prefix
        and the level value
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.

    Returns
    -------
    dummies : DataFrame

    Examples
    --------
    >>> s = pd.Series(list('abca'))

    >>> get_dummies(s)
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> s1 = ['a', 'b', np.nan]

    >>> get_dummies(s1)
       a  b
    0  1  0
    1  0  1
    2  0  0

    >>> get_dummies(s1, dummy_na=True)
       a  b  NaN
    0  1  0    0
    1  0  1    0
    2  0  0    1

    See also ``Series.str.get_dummies``.

    """
    # Series avoids inconsistent NaN handling
    cat = Categorical.from_array(Series(data))
    levels = cat.levels

    # if all NaN
    if not dummy_na and len(levels) == 0:
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        return DataFrame(index=index)

    number_of_cols = len(levels)
    if dummy_na:
        number_of_cols += 1

    # With dummy_na the extra last column is the NaN indicator: take()
    # maps the missing-value label -1 onto the last identity row.
    dummy_mat = np.eye(number_of_cols).take(cat.labels, axis=0)

    if dummy_na:
        levels = np.append(cat.levels, np.nan)
    else:
        # reset NaN GH4446
        dummy_mat[cat.labels == -1] = 0

    if prefix is not None:
        # Let %-formatting convert the level value: an explicit str() call
        # raises on non-ASCII unicode values under Python 2.
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v)
                      for v in levels]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def setUp(self):
    """Create the categorical fixture stored on ``self.factor``."""
    # two 'a', 'b' pairs interleaved, followed by a run of 'c'
    raw_values = ['a', 'b', 'b', 'a',
                  'a', 'c', 'c', 'c']
    factor = Categorical.from_array(raw_values)
    self.factor = factor
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False):
    """
    Turn a categorical variable into dummy/indicator columns

    Parameters
    ----------
    data : array-like or Series
    prefix : string, default None
        String placed in front of each DataFrame column name
    prefix_sep : string, default '_'
        Separator/delimiter between the prefix and the level value
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.

    Returns
    -------
    dummies : DataFrame

    Examples
    --------
    >>> s = pd.Series(list('abca'))

    >>> get_dummies(s)
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> s1 = ['a', 'b', np.nan]

    >>> get_dummies(s1)
       a  b
    0  1  0
    1  0  1
    2  0  0

    >>> get_dummies(s1, dummy_na=True)
       a  b  NaN
    0  1  0    0
    1  0  1    0
    2  0  0    1

    See also ``Series.str.get_dummies``.

    """
    # Series avoids inconsistent NaN handling
    cat = Categorical.from_array(Series(data))
    levels = cat.levels
    is_series = isinstance(data, Series)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        empty_index = data.index if is_series else np.arange(len(data))
        return DataFrame(index=empty_index)

    # one extra column for the NaN indicator when requested
    n_cols = (len(levels) + 1) if dummy_na else len(levels)
    dummy_mat = np.eye(n_cols).take(cat.labels, axis=0)

    if not dummy_na:
        # reset NaN GH4446
        dummy_mat[cat.labels == -1] = 0
    else:
        levels = np.append(cat.levels, np.nan)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v)
                      for v in levels]

    return DataFrame(dummy_mat, index=data.index if is_series else None,
                     columns=dummy_cols)
def test_constructor_unsortable(self):
    """Factorizing an object array holding ints and a datetime must not raise."""
    unsortable = np.array([1, 2, 3, datetime.now()], dtype='O')

    # it works! (smoke test only, the result is not inspected)
    Categorical.from_array(unsortable)
def setUp(self):
    """Create the categorical fixture stored on ``self.factor``."""
    sample = ['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']
    self.factor = Categorical.from_array(sample)
def data(self, convert_dates=True, convert_categoricals=True, index=None):
    """
    Read all observations from the Stata file into a DataFrame

    Parameters
    ----------
    convert_dates : boolean, defaults to True
        Convert date variables to DataFrame time values
    convert_categoricals : boolean, defaults to True
        Read value labels and convert columns to Categorical/Factor
        variables
    index : identifier of index column
        identifier of column that should be used as index of the DataFrame

    Returns
    -------
    y : DataFrame instance

    Raises
    ------
    Exception
        If the data has already been read from this reader
    """
    if self._data_read:
        raise Exception("Data has already been read.")
    self._data_read = True

    observations = []
    for record in self._dataset():
        # doesn't handle missing value objects, just casts
        # None will only work without missing value object.
        for pos, value in enumerate(record):
            # NOTE: This will only be scalar types because missing strings
            # are empty not None in Stata
            if value is None:
                record[pos] = np.nan
        observations.append(tuple(record))

    if convert_categoricals:
        self._read_value_labels()

    frame = DataFrame(observations, columns=self.varlist, index=index)

    # Re-cast every non-object column that has a dtype entry.
    for i in np.where(self.dtyplist)[0]:
        if self.dtyplist[i] is not None:
            name = frame.columns[i]
            if frame[name].dtype is not np.dtype(object):
                frame[name] = Series(frame[name], frame[name].index,
                                     self.dtyplist[i])

    if convert_dates:
        date_flags = lmap(lambda x: x in _date_formats, self.fmtlist)
        for i in np.where(date_flags)[0]:
            name = frame.columns[i]
            frame[name] = frame[name].apply(_stata_elapsed_date_to_datetime,
                                            args=(self.fmtlist[i], ))

    if convert_categoricals:
        label_flags = lmap(
            lambda x: x in compat.iterkeys(self.value_label_dict),
            self.lbllist)
        for i in np.where(label_flags)[0]:
            name = frame.columns[i]
            # Substitute each coded value by its label on an object copy,
            # then factorize the labelled column.
            labeled = np.copy(frame[name]).astype(object)
            mapping = self.value_label_dict[self.lbllist[i]]
            for code, text in compat.iteritems(mapping):
                labeled[(frame[name] == code).values] = text
            frame[name] = Categorical.from_array(labeled)

    return frame
def _get_dummies_1d(data, prefix, prefix_sep="_", dummy_na=False,
                    sparse=False, drop_first=False):
    """
    Build the dummy/indicator frame for a single 1-d input.

    Parameters
    ----------
    data : array-like or Series
    prefix : string or None
        Prepended (with ``prefix_sep``) to every column name when not None
    prefix_sep : string, default '_'
    dummy_na : bool, default False
        Add a trailing column flagging missing values; when False, rows
        with a missing value are all zero
    sparse : bool, default False
        Return a SparseDataFrame instead of a DataFrame
    drop_first : bool, default False
        Drop the first level's column, giving k-1 dummies for k levels

    Returns
    -------
    DataFrame or SparseDataFrame
    """
    # Series avoids inconsistent NaN handling.
    # Categorical.from_array is a deprecated alias that only forwards to
    # the constructor, so call the constructor directly.
    cat = Categorical(Series(data), ordered=True)
    levels = cat.categories

    def get_empty_Frame(data, sparse):
        # column-less frame that still carries one row per input value
        if isinstance(data, Series):
            index = data.index
        else:
            index = np.arange(len(data))
        if not sparse:
            return DataFrame(index=index)
        else:
            return SparseDataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_Frame(data, sparse)

    # copy so that the categorical's own codes are never modified
    codes = cat.codes.copy()
    if dummy_na:
        codes[codes == -1] = len(cat.categories)
        levels = np.append(cat.categories, np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_Frame(data, sparse)

    number_of_cols = len(levels)

    if prefix is not None:
        dummy_cols = ["%s%s%s" % (prefix, prefix_sep, v) for v in levels]
    else:
        dummy_cols = levels

    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        sparse_series = {}
        N = len(data)
        # sp_indices[k] collects the row positions whose code is k
        sp_indices = [[] for _ in range(len(dummy_cols))]
        for ndx, code in enumerate(codes):
            if code == -1:
                # Blank entries if not dummy_na and code == -1, #GH4446
                continue
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(np.ones(len(ixs)),
                               sparse_index=IntIndex(N, ixs), fill_value=0)
            sparse_series[col] = SparseSeries(data=sarr, index=index)

        return SparseDataFrame(sparse_series, index=index,
                               columns=dummy_cols)

    else:
        dummy_mat = np.eye(number_of_cols).take(codes, axis=0)

        if not dummy_na:
            # reset NaN GH4446 (take() maps code -1 onto the last row)
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=index, columns=dummy_cols)