def make_axis_dummies(frame, axis='minor', transform=None): """ Construct 1-0 dummy variables corresponding to designated axis labels Parameters ---------- frame : DataFrame axis : {'major', 'minor'}, default 'minor' transform : function, default None Function to apply to axis labels first. For example, to get "day of week" dummies in a time series regression you might call:: make_axis_dummies(panel, axis='major', transform=lambda d: d.weekday()) Returns ------- dummies : DataFrame Column names taken from chosen axis """ numbers = {'major': 0, 'minor': 1} num = numbers.get(axis, axis) items = frame.index.levels[num] codes = frame.index.codes[num] if transform is not None: mapped_items = items.map(transform) codes, items = _factorize_from_iterable(mapped_items.take(codes)) values = np.eye(len(items), dtype=float) values = values.take(codes, axis=0) return DataFrame(values, columns=items, index=frame.index)
def make_axis_dummies(frame, axis="minor", transform=None): """ Construct 1-0 dummy variables corresponding to designated axis labels Parameters ---------- frame : DataFrame axis : {'major', 'minor'}, default 'minor' transform : function, default None Function to apply to axis labels first. For example, to get "day of week" dummies in a time series regression you might call:: make_axis_dummies(panel, axis='major', transform=lambda d: d.weekday()) Returns ------- dummies : DataFrame Column names taken from chosen axis """ numbers = {"major": 0, "minor": 1} num = numbers.get(axis, axis) items = frame.index.levels[num] codes = frame.index.codes[num] if transform is not None: mapped_items = items.map(transform) codes, items = _factorize_from_iterable(mapped_items.take(codes)) values = np.eye(len(items), dtype=float) values = values.take(codes, axis=0) return DataFrame(values, columns=items, index=frame.index)
def _get_dummies_1d( data, prefix, prefix_sep="_", dummy_na=False, sparse=False, drop_first=False, dtype=None, ): from pandas.core.reshape.concat import concat # Series avoids inconsistent NaN handling codes, levels = _factorize_from_iterable(Series(data)) if dtype is None: dtype = np.uint8 dtype = np.dtype(dtype) if is_object_dtype(dtype): raise ValueError("dtype=object is not a valid dtype for get_dummies") def get_empty_frame(data): if isinstance(data, Series): index = data.index else: index = np.arange(len(data)) return DataFrame(index=index) # if all NaN if not dummy_na and len(levels) == 0: return get_empty_frame(data) codes = codes.copy() if dummy_na: codes[codes == -1] = len(levels) levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again if drop_first and len(levels) == 1: return get_empty_frame(data) number_of_cols = len(levels) if prefix is None: dummy_cols = levels else: # PY2 embedded unicode, gh-22084 def _make_col_name(prefix, prefix_sep, level): fstr = "{prefix}{prefix_sep}{level}" return fstr.format(prefix=prefix, prefix_sep=prefix_sep, level=level) dummy_cols = [ _make_col_name(prefix, prefix_sep, level) for level in levels ] if isinstance(data, Series): index = data.index else: index = None if sparse: if is_integer_dtype(dtype): fill_value = 0 elif dtype == bool: fill_value = False else: fill_value = 0.0 sparse_series = [] N = len(data) sp_indices = [[] for _ in range(len(dummy_cols))] mask = codes != -1 codes = codes[mask] n_idx = np.arange(N)[mask] for ndx, code in zip(n_idx, codes): sp_indices[code].append(ndx) if drop_first: # remove first categorical level to avoid perfect collinearity # GH12042 sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): sarr = SparseArray( np.ones(len(ixs), dtype=dtype), sparse_index=IntIndex(N, ixs), fill_value=fill_value, dtype=dtype, ) sparse_series.append(Series(data=sarr, index=index, name=col)) out = concat(sparse_series, axis=1, copy=False) return out else: dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0) if not dummy_na: # reset NaN GH4446 dummy_mat[codes == -1] = 0 if drop_first: # remove first GH12042 dummy_mat = dummy_mat[:, 1:] dummy_cols = dummy_cols[1:] return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def _stack_multi_columns(frame, level_num=-1, dropna=True): def _convert_level_number(level_num, columns): """ Logic for converting the level number to something we can safely pass to swaplevel: We generally want to convert the level number into a level name, except when columns do not have names, in which case we must leave as a level number """ if level_num in columns.names: return columns.names[level_num] else: if columns.names[level_num] is None: return level_num else: return columns.names[level_num] this = frame.copy() # this makes life much simpler if level_num != frame.columns.nlevels - 1: # roll levels to put selected level at end roll_columns = this.columns for i in range(level_num, frame.columns.nlevels - 1): # Need to check if the ints conflict with level names lev1 = _convert_level_number(i, roll_columns) lev2 = _convert_level_number(i + 1, roll_columns) roll_columns = roll_columns.swaplevel(lev1, lev2) this.columns = roll_columns if not this.columns.is_lexsorted(): # Workaround the edge case where 0 is one of the column names, # which interferes with trying to sort based on the first # level level_to_sort = _convert_level_number(0, this.columns) this = this.sort_index(level=level_to_sort, axis=1) # tuple list excluding level for grouping columns if len(frame.columns.levels) > 2: tuples = list( zip(*[ lev.take(level_codes) for lev, level_codes in zip( this.columns.levels[:-1], this.columns.codes[:-1]) ])) unique_groups = [key for key, _ in itertools.groupby(tuples)] new_names = this.columns.names[:-1] new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) else: new_columns = this.columns.levels[0]._shallow_copy( name=this.columns.names[0]) unique_groups = new_columns # time to ravel the values new_data = {} level_vals = this.columns.levels[-1] level_codes = sorted(set(this.columns.codes[-1])) level_vals_used = level_vals[level_codes] levsize = len(level_codes) drop_cols = [] for key in unique_groups: try: loc = this.columns.get_loc(key) except KeyError: drop_cols.append(key) continue # can make more efficient? # we almost always return a slice # but if unsorted can get a boolean # indexer if not isinstance(loc, slice): slice_len = len(loc) else: slice_len = loc.stop - loc.start if slice_len != levsize: chunk = this.loc[:, this.columns[loc]] chunk.columns = level_vals.take(chunk.columns.codes[-1]) value_slice = chunk.reindex(columns=level_vals_used).values else: if frame._is_homogeneous_type and is_extension_array_dtype( frame.dtypes.iloc[0]): dtype = this[this.columns[loc]].dtypes.iloc[0] subset = this[this.columns[loc]] value_slice = dtype.construct_array_type()._concat_same_type( [x._values for _, x in subset.items()]) N, K = this.shape idx = np.arange(N * K).reshape(K, N).T.ravel() value_slice = value_slice.take(idx) elif frame._is_mixed_type: value_slice = this[this.columns[loc]].values else: value_slice = this.values[:, loc] if value_slice.ndim > 1: # i.e. not extension value_slice = value_slice.ravel() new_data[key] = value_slice if len(drop_cols) > 0: new_columns = new_columns.difference(drop_cols) N = len(this) if isinstance(this.index, MultiIndex): new_levels = list(this.index.levels) new_names = list(this.index.names) new_codes = [lab.repeat(levsize) for lab in this.index.codes] else: old_codes, old_levels = _factorize_from_iterable(this.index) new_levels = [old_levels] new_codes = [old_codes.repeat(levsize)] new_names = [this.index.name] # something better? new_levels.append(level_vals) new_codes.append(np.tile(level_codes, N)) new_names.append(frame.columns.names[level_num]) new_index = MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False) result = frame._constructor(new_data, index=new_index, columns=new_columns) # more efficient way to go about this? can do the whole masking biz but # will only save a small amount of time... if dropna: result = result.dropna(axis=0, how="all") return result
def factorize(index): if index.is_unique: return index, np.arange(len(index)) codes, categories = _factorize_from_iterable(index) return categories, codes
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False, drop_first=False, dtype=None): # Series avoids inconsistent NaN handling codes, levels = _factorize_from_iterable(Series(data)) if dtype is None: dtype = np.uint8 dtype = np.dtype(dtype) if is_object_dtype(dtype): raise ValueError("dtype=object is not a valid dtype for get_dummies") def get_empty_Frame(data, sparse): if isinstance(data, Series): index = data.index else: index = np.arange(len(data)) if not sparse: return DataFrame(index=index) else: return SparseDataFrame(index=index, default_fill_value=0) # if all NaN if not dummy_na and len(levels) == 0: return get_empty_Frame(data, sparse) codes = codes.copy() if dummy_na: codes[codes == -1] = len(levels) levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again if drop_first and len(levels) == 1: return get_empty_Frame(data, sparse) number_of_cols = len(levels) if prefix is not None: dummy_strs = [u'{prefix}{sep}{level}' if isinstance(v, text_type) else '{prefix}{sep}{level}' for v in levels] dummy_cols = [dummy_str.format(prefix=prefix, sep=prefix_sep, level=v) for dummy_str, v in zip(dummy_strs, levels)] else: dummy_cols = levels if isinstance(data, Series): index = data.index else: index = None if sparse: sparse_series = {} N = len(data) sp_indices = [[] for _ in range(len(dummy_cols))] mask = codes != -1 codes = codes[mask] n_idx = np.arange(N)[mask] for ndx, code in zip(n_idx, codes): sp_indices[code].append(ndx) if drop_first: # remove first categorical level to avoid perfect collinearity # GH12042 sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): sarr = SparseArray(np.ones(len(ixs), dtype=dtype), sparse_index=IntIndex(N, ixs), fill_value=0, dtype=dtype) sparse_series[col] = SparseSeries(data=sarr, index=index) out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols, default_fill_value=0, dtype=dtype) return out else: dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0) if not dummy_na: # reset NaN GH4446 dummy_mat[codes == -1] = 0 if drop_first: # remove first GH12042 dummy_mat = dummy_mat[:, 1:] dummy_cols = dummy_cols[1:] return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def _make_concat_multiindex(indexes, keys, levels=None, names=None): if (levels is None and isinstance(keys[0], tuple)) or (levels is not None and len(levels) > 1): zipped = list(zip(*keys)) if names is None: names = [None] * len(zipped) if levels is None: _, levels = _factorize_from_iterables(zipped) else: levels = [ensure_index(x) for x in levels] else: zipped = [keys] if names is None: names = [None] if levels is None: levels = [ensure_index(keys)] else: levels = [ensure_index(x) for x in levels] if not _all_indexes_same(indexes): codes_list = [] # things are potentially different sizes, so compute the exact codes # for each level and pass those to MultiIndex.from_arrays for hlevel, level in zip(zipped, levels): to_concat = [] for key, index in zip(hlevel, indexes): try: i = level.get_loc(key) except KeyError: raise ValueError( "Key {key!s} not in level {level!s}".format( key=key, level=level)) to_concat.append(np.repeat(i, len(index))) codes_list.append(np.concatenate(to_concat)) concat_index = _concat_indexes(indexes) # these go at the end if isinstance(concat_index, MultiIndex): levels.extend(concat_index.levels) codes_list.extend(concat_index.codes) else: codes, categories = _factorize_from_iterable(concat_index) levels.append(categories) codes_list.append(codes) if len(names) == len(levels): names = list(names) else: # make sure that all of the passed indices have the same nlevels if not len({idx.nlevels for idx in indexes}) == 1: raise AssertionError("Cannot concat indices that do" " not have the same number of levels") # also copies names = names + _get_consensus_names(indexes) return MultiIndex(levels=levels, codes=codes_list, names=names, verify_integrity=False) new_index = indexes[0] n = len(new_index) kpieces = len(indexes) # also copies new_names = list(names) new_levels = list(levels) # construct codes new_codes = [] # do something a bit more speedy for hlevel, level in zip(zipped, levels): hlevel = ensure_index(hlevel) mapped = level.get_indexer(hlevel) mask = mapped == -1 if mask.any(): raise ValueError( "Values not found in passed level: {hlevel!s}".format( hlevel=hlevel[mask])) new_codes.append(np.repeat(mapped, n)) if isinstance(new_index, MultiIndex): new_levels.extend(new_index.levels) new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes]) else: new_levels.append(new_index) new_codes.append(np.tile(np.arange(n), kpieces)) if len(new_names) < len(new_levels): new_names.extend(new_index.names) return MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False)
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False, drop_first=False, dtype=None): from pandas.core.reshape.concat import concat # Series avoids inconsistent NaN handling codes, levels = _factorize_from_iterable(Series(data)) if dtype is None: dtype = np.uint8 dtype = np.dtype(dtype) if is_object_dtype(dtype): raise ValueError("dtype=object is not a valid dtype for get_dummies") def get_empty_frame(data): if isinstance(data, Series): index = data.index else: index = np.arange(len(data)) return DataFrame(index=index) # if all NaN if not dummy_na and len(levels) == 0: return get_empty_frame(data) codes = codes.copy() if dummy_na: codes[codes == -1] = len(levels) levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again if drop_first and len(levels) == 1: return get_empty_frame(data) number_of_cols = len(levels) if prefix is None: dummy_cols = levels else: # PY2 embedded unicode, gh-22084 def _make_col_name(prefix, prefix_sep, level): fstr = '{prefix}{prefix_sep}{level}' return fstr.format(prefix=prefix, prefix_sep=prefix_sep, level=level) dummy_cols = [_make_col_name(prefix, prefix_sep, level) for level in levels] if isinstance(data, Series): index = data.index else: index = None if sparse: if is_integer_dtype(dtype): fill_value = 0 elif dtype == bool: fill_value = False else: fill_value = 0.0 sparse_series = [] N = len(data) sp_indices = [[] for _ in range(len(dummy_cols))] mask = codes != -1 codes = codes[mask] n_idx = np.arange(N)[mask] for ndx, code in zip(n_idx, codes): sp_indices[code].append(ndx) if drop_first: # remove first categorical level to avoid perfect collinearity # GH12042 sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): sarr = SparseArray(np.ones(len(ixs), dtype=dtype), sparse_index=IntIndex(N, ixs), fill_value=fill_value, dtype=dtype) sparse_series.append(Series(data=sarr, index=index, name=col)) out = concat(sparse_series, axis=1, copy=False) return out else: dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0) if not dummy_na: # reset NaN GH4446 dummy_mat[codes == -1] = 0 if drop_first: # remove first GH12042 dummy_mat = dummy_mat[:, 1:] dummy_cols = dummy_cols[1:] return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def _get_dummies_1d(data, prefix, prefix_sep='_', dummy_na=False, sparse=False, drop_first=False, dtype=None): # Series avoids inconsistent NaN handling codes, levels = _factorize_from_iterable(Series(data)) if dtype is None: dtype = np.uint8 dtype = np.dtype(dtype) if is_object_dtype(dtype): raise ValueError("dtype=object is not a valid dtype for get_dummies") def get_empty_Frame(data, sparse): if isinstance(data, Series): index = data.index else: index = np.arange(len(data)) if not sparse: return DataFrame(index=index) else: return SparseDataFrame(index=index, default_fill_value=0) # if all NaN if not dummy_na and len(levels) == 0: return get_empty_Frame(data, sparse) codes = codes.copy() if dummy_na: codes[codes == -1] = len(levels) levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again if drop_first and len(levels) == 1: return get_empty_Frame(data, sparse) number_of_cols = len(levels) if prefix is not None: dummy_strs = [u'{prefix}{sep}{level}' if isinstance(v, text_type) else '{prefix}{sep}{level}' for v in levels] dummy_cols = [dummy_str.format(prefix=prefix, sep=prefix_sep, level=v) for dummy_str, v in zip(dummy_strs, levels)] else: dummy_cols = levels if isinstance(data, Series): index = data.index else: index = None if sparse: sparse_series = {} N = len(data) sp_indices = [[] for _ in range(len(dummy_cols))] for ndx, code in enumerate(codes): if code == -1: # Blank entries if not dummy_na and code == -1, #GH4446 continue sp_indices[code].append(ndx) if drop_first: # remove first categorical level to avoid perfect collinearity # GH12042 sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): sarr = SparseArray(np.ones(len(ixs), dtype=dtype), sparse_index=IntIndex(N, ixs), fill_value=0, dtype=dtype) sparse_series[col] = SparseSeries(data=sarr, index=index) out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols, default_fill_value=0, dtype=dtype) return out else: dummy_mat = np.eye(number_of_cols, dtype=dtype).take(codes, axis=0) if not dummy_na: # reset NaN GH4446 dummy_mat[codes == -1] = 0 if drop_first: # remove first GH12042 dummy_mat = dummy_mat[:, 1:] dummy_cols = dummy_cols[1:] return DataFrame(dummy_mat, index=index, columns=dummy_cols)
def _make_concat_multiindex(indexes, keys, levels=None, names=None): if ((levels is None and isinstance(keys[0], tuple)) or (levels is not None and len(levels) > 1)): zipped = compat.lzip(*keys) if names is None: names = [None] * len(zipped) if levels is None: _, levels = _factorize_from_iterables(zipped) else: levels = [ensure_index(x) for x in levels] else: zipped = [keys] if names is None: names = [None] if levels is None: levels = [ensure_index(keys)] else: levels = [ensure_index(x) for x in levels] if not _all_indexes_same(indexes): codes_list = [] # things are potentially different sizes, so compute the exact codes # for each level and pass those to MultiIndex.from_arrays for hlevel, level in zip(zipped, levels): to_concat = [] for key, index in zip(hlevel, indexes): try: i = level.get_loc(key) except KeyError: raise ValueError('Key {key!s} not in level {level!s}' .format(key=key, level=level)) to_concat.append(np.repeat(i, len(index))) codes_list.append(np.concatenate(to_concat)) concat_index = _concat_indexes(indexes) # these go at the end if isinstance(concat_index, MultiIndex): levels.extend(concat_index.levels) codes_list.extend(concat_index.codes) else: codes, categories = _factorize_from_iterable(concat_index) levels.append(categories) codes_list.append(codes) if len(names) == len(levels): names = list(names) else: # make sure that all of the passed indices have the same nlevels if not len({idx.nlevels for idx in indexes}) == 1: raise AssertionError("Cannot concat indices that do" " not have the same number of levels") # also copies names = names + _get_consensus_names(indexes) return MultiIndex(levels=levels, codes=codes_list, names=names, verify_integrity=False) new_index = indexes[0] n = len(new_index) kpieces = len(indexes) # also copies new_names = list(names) new_levels = list(levels) # construct codes new_codes = [] # do something a bit more speedy for hlevel, level in zip(zipped, levels): hlevel = ensure_index(hlevel) mapped = level.get_indexer(hlevel) mask = mapped == -1 if mask.any(): raise ValueError('Values not found in passed level: {hlevel!s}' .format(hlevel=hlevel[mask])) new_codes.append(np.repeat(mapped, n)) if isinstance(new_index, MultiIndex): new_levels.extend(new_index.levels) new_codes.extend([np.tile(lab, kpieces) for lab in new_index.codes]) else: new_levels.append(new_index) new_codes.append(np.tile(np.arange(n), kpieces)) if len(new_names) < len(new_levels): new_names.extend(new_index.names) return MultiIndex(levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False)
def _get_dummies_1d(self, data, prefix, prefix_sep='_', dummy_na=False, sparse=False, drop_first=False): # Series avoids inconsistent NaN handling codes, levels = _factorize_from_iterable(Series(data)) def get_empty_Frame(data, sparse): if isinstance(data, Series): index = data.index else: index = np.arange(len(data)) if not sparse: return DataFrame(index=index) else: return SparseDataFrame(index=index, default_fill_value=0) # if all NaN if not dummy_na and len(levels) == 0: return get_empty_Frame(data, sparse) codes = codes.copy() if dummy_na: codes[codes == -1] = len(levels) levels = np.append(levels, np.nan) # if dummy_na, we just fake a nan level. drop_first will drop it again if drop_first and len(levels) == 1: return get_empty_Frame(data, sparse) number_of_cols = len(levels) if prefix is not None: dummy_cols = ['%s%s%s' % (prefix, prefix_sep, v) for v in levels] else: dummy_cols = levels if isinstance(data, Series): index = data.index else: index = None if sparse: sparse_series = {} N = len(data) sp_indices = [[] for _ in range(len(dummy_cols))] for ndx, code in enumerate(codes): if code == -1: # Blank entries if not dummy_na and code == -1, #GH4446 continue sp_indices[code].append(ndx) if drop_first: # remove first categorical level to avoid perfect collinearity # GH12042 sp_indices = sp_indices[1:] dummy_cols = dummy_cols[1:] for col, ixs in zip(dummy_cols, sp_indices): sarr = SparseArray(np.ones(len(ixs), dtype=np.uint8), sparse_index=IntIndex(N, ixs), fill_value=0, dtype=np.uint8) sparse_series[col] = SparseSeries(data=sarr, index=index) out = SparseDataFrame(sparse_series, index=index, columns=dummy_cols, default_fill_value=0, dtype=np.uint8) return out else: dummy_mat = np.eye(number_of_cols, dtype=np.uint8).take(codes, axis=0) if not dummy_na: # reset NaN GH4446 dummy_mat[codes == -1] = 0 if drop_first: # remove first GH12042 dummy_mat = dummy_mat[:, 1:] dummy_cols = dummy_cols[1:] return DataFrame(dummy_mat, index=index, columns=dummy_cols)