def na_op(x, y):
    try:
        result = expressions.evaluate(op, str_rep, x, y,
                                      raise_on_error=True, **eval_kwargs)
    except TypeError:
        xrav = x.ravel()
        if isinstance(y, (np.ndarray, pd.Series)):
            dtype = np.find_common_type([x.dtype, y.dtype], [])
            result = np.empty(x.size, dtype=dtype)
            yrav = y.ravel()
            mask = notnull(xrav) & notnull(yrav)
            xrav = xrav[mask]
            yrav = yrav[mask]
            if np.prod(xrav.shape) and np.prod(yrav.shape):
                result[mask] = op(xrav, yrav)
        else:
            result = np.empty(x.size, dtype=x.dtype)
            mask = notnull(xrav)
            xrav = xrav[mask]
            if np.prod(xrav.shape):
                result[mask] = op(xrav, y)

        result, changed = com._maybe_upcast_putmask(result, ~mask, np.nan)
        result = result.reshape(x.shape)

    result = com._fill_zeros(result, x, y, name, fill_zeros)

    return result
def df_lambda_replace(self, func_dict, df_list=None, inplace=False):
    """
    Replace values column-wise using a dict of functions ('func_dict')
    keyed by column name; each column is looked up under the
    ('X', column_name) tuple in self.ALL.

    Parameters
    ----------
    func_dict : dict
        Mapping of column name to replacement function. An output dtype
        may optionally be stored under the literal key 1; float is
        assumed when it is absent.
    df_list : list
        List of dataframes. If None then self.ALL is used. All dataframes
        in df_list must be _df_XY_split.

    Notes
    -----
    NaN / missing values are skipped.
    """
    # NOTE: df_list is currently unused; self.ALL is always operated on
    df = self.ALL
    for key in func_dict:
        idx = ('X', key)
        try:
            df.loc[com.notnull(df[idx]), idx] = \
                df.loc[com.notnull(df[idx]), idx].\
                map(func_dict[key]).astype(func_dict[1])
        except Exception:
            print('No type specified in func_dict so float is assumed.')
            df.loc[com.notnull(df[idx]), idx] = \
                df.loc[com.notnull(df[idx]), idx].\
                map(func_dict[key]).astype(float)
    if not inplace:
        return self
def na_op(x, y):
    try:
        result = expressions.evaluate(op, str_rep, x, y,
                                      raise_on_error=True, **eval_kwargs)
    except TypeError:
        xrav = x.ravel()
        if isinstance(y, (np.ndarray, pd.Series)):
            dtype = np.find_common_type([x.dtype, y.dtype], [])
            result = np.empty(x.size, dtype=dtype)
            yrav = y.ravel()
            mask = notnull(xrav) & notnull(yrav)
            xrav = xrav[mask]
            yrav = yrav[mask]
            if np.prod(xrav.shape) and np.prod(yrav.shape):
                result[mask] = op(xrav, yrav)
        elif hasattr(x, 'size'):
            result = np.empty(x.size, dtype=x.dtype)
            mask = notnull(xrav)
            xrav = xrav[mask]
            if np.prod(xrav.shape):
                result[mask] = op(xrav, y)
        else:
            raise TypeError("cannot perform operation {op} between objects "
                            "of type {x} and {y}".format(op=name, x=type(x),
                                                         y=type(y)))

        result, changed = com._maybe_upcast_putmask(result, ~mask, np.nan)
        result = result.reshape(x.shape)

    result = com._fill_zeros(result, x, y, name, fill_zeros)

    return result
def relabel(key):
    pos = index_map[key]
    xlab = xlabels[pos]
    ylab = ylabels[pos]
    return '%sx%s' % (int(xlab) if com.notnull(xlab) else 'NULL',
                      int(ylab) if com.notnull(ylab) else 'NULL')
def nancov(a, b):
    assert len(a) == len(b)
    if len(a) == 0:
        return np.nan

    valid = notnull(a) & notnull(b)
    if not valid.all():
        a = a[valid]
        b = b[valid]

    return np.cov(a, b)[0, 1]
def test_notnull():
    assert notnull(1.)
    assert not notnull(None)
    assert not notnull(np.NaN)

    with cf.option_context("mode.use_inf_as_null", False):
        assert notnull(np.inf)
        assert notnull(-np.inf)

        arr = np.array([1.5, np.inf, 3.5, -np.inf])
        result = notnull(arr)
        assert result.all()

    with cf.option_context("mode.use_inf_as_null", True):
        assert not notnull(np.inf)
        assert not notnull(-np.inf)

        arr = np.array([1.5, np.inf, 3.5, -np.inf])
        result = notnull(arr)
        assert result.sum() == 2

    with cf.option_context("mode.use_inf_as_null", False):
        for s in [tm.makeFloatSeries(), tm.makeStringSeries(),
                  tm.makeObjectSeries(), tm.makeTimeSeries(),
                  tm.makePeriodSeries()]:
            assert isinstance(isnull(s), Series)
def test_isnull_datetime():
    assert not isnull(datetime.now())
    assert notnull(datetime.now())

    idx = date_range('1/1/1990', periods=20)
    assert notnull(idx).all()

    idx = np.asarray(idx)
    idx[0] = iNaT
    idx = DatetimeIndex(idx)
    mask = isnull(idx)
    assert mask[0]
    assert not mask[1:].any()
def nancov(a, b):
    if len(a) != len(b):
        raise AssertionError('Operands to nancov must have same size')

    valid = notnull(a) & notnull(b)
    if not valid.all():
        a = a[valid]
        b = b[valid]

    if len(a) == 0:
        return np.nan

    return np.cov(a, b)[0, 1]
def nancorr(a, b, method='pearson'):
    """
    a, b: ndarrays
    """
    assert len(a) == len(b)
    if len(a) == 0:
        return np.nan

    valid = notnull(a) & notnull(b)
    if not valid.all():
        a = a[valid]
        b = b[valid]

    f = get_corr_func(method)
    return f(a, b)
def test_isnull_datetime():
    assert not isnull(datetime.now())
    assert notnull(datetime.now())

    idx = date_range("1/1/1990", periods=20)
    assert notnull(idx).all()

    import pandas.lib as lib
    idx = np.asarray(idx)
    idx[0] = lib.iNaT
    idx = DatetimeIndex(idx)
    mask = isnull(idx)
    assert mask[0]
    assert not mask[1:].any()
def value_counts(values, sort=True, ascending=False):
    """
    Compute a histogram of the counts of non-null values

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series
    from collections import defaultdict

    if com.is_integer_dtype(values.dtype):
        values = com._ensure_int64(values)
        keys, counts = lib.value_count_int64(values)
        result = Series(counts, index=keys)
    else:
        counter = defaultdict(lambda: 0)
        values = values[com.notnull(values)]
        for value in values:
            counter[value] += 1
        result = Series(counter)

    if sort:
        result.sort()

        if not ascending:
            result = result[::-1]

    return result
def _bucketpanel_cat(series, xcat, ycat):
    xlabels, xmapping = _intern(xcat)
    ylabels, ymapping = _intern(ycat)

    shift = 10 ** (np.ceil(np.log10(ylabels.max())))
    labels = xlabels * shift + ylabels

    sorter = labels.argsort()
    sorted_labels = labels.take(sorter)
    sorted_xlabels = xlabels.take(sorter)
    sorted_ylabels = ylabels.take(sorter)

    unique_labels = np.unique(labels)
    unique_labels = unique_labels[com.notnull(unique_labels)]

    locs = sorted_labels.searchsorted(unique_labels)
    xkeys = sorted_xlabels.take(locs)
    ykeys = sorted_ylabels.take(locs)

    stringified = ['(%s, %s)' % arg
                   for arg in zip(xmapping.take(xkeys),
                                  ymapping.take(ykeys))]

    result = bucketcat(series, labels)
    result.columns = stringified

    return result
def stack(frame, level=-1, dropna=True):
    """
    Convert DataFrame to Series with multi-level Index. Columns become the
    second level of the resulting hierarchical index

    Returns
    -------
    stacked : Series
    """
    N, K = frame.shape
    if isinstance(frame.columns, MultiIndex):
        # forward the caller's dropna setting
        return _stack_multi_columns(frame, level=level, dropna=dropna)
    elif isinstance(frame.index, MultiIndex):
        new_levels = list(frame.index.levels)
        new_levels.append(frame.columns)

        new_labels = [lab.repeat(K) for lab in frame.index.labels]
        new_labels.append(np.tile(np.arange(K), N).ravel())

        new_names = list(frame.index.names)
        new_names.append("columns")
        new_index = MultiIndex(levels=new_levels, labels=new_labels,
                               names=new_names)
    else:
        ilabels = np.arange(N).repeat(K)
        clabels = np.tile(np.arange(K), N).ravel()
        new_index = MultiIndex(levels=[frame.index, frame.columns],
                               labels=[ilabels, clabels])

    new_values = frame.values.ravel()
    if dropna:
        mask = notnull(new_values)
        new_values = new_values[mask]
        new_index = new_index[mask]

    return Series(new_values, index=new_index)
def nancov(a, b, min_periods=None):
    if len(a) != len(b):
        raise AssertionError('Operands to nancov must have same size')

    if min_periods is None:
        min_periods = 1

    valid = notnull(a) & notnull(b)
    if not valid.all():
        a = a[valid]
        b = b[valid]

    if len(a) < min_periods:
        return np.nan

    return np.cov(a, b)[0, 1]
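# Illustrative usage sketch for nancov above (not part of the original
# source; assumes the module-level notnull import is in scope): pairs with
# a NaN in either operand are masked out before np.cov is applied, and
# fewer than min_periods surviving pairs yields NaN.
import numpy as np

a = np.array([1.0, 2.0, np.nan, 4.0])
b = np.array([2.0, np.nan, 6.0, 8.0])
# only indices 0 and 3 survive the notnull(a) & notnull(b) mask, so this
# is the sample covariance of [1, 4] and [2, 8]:
print(nancov(a, b, min_periods=2))  # -> 9.0
print(nancov(a, b, min_periods=3))  # -> nan (only 2 valid pairs remain)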
def make_sparse(arr, kind='block', fill_value=nan):
    """
    Convert ndarray to sparse format

    Parameters
    ----------
    arr : ndarray
    kind : {'block', 'integer'}
    fill_value : NaN or another value

    Returns
    -------
    (sparse_values, index) : (ndarray, SparseIndex)
    """
    arr = _sanitize_values(arr)

    if arr.ndim > 1:
        raise TypeError("expected dimension <= 1 data")

    if com.isnull(fill_value):
        mask = com.notnull(arr)
    else:
        mask = arr != fill_value

    length = len(arr)
    if length != mask.size:
        # the arr is a SparseArray
        indices = mask.sp_index.indices
    else:
        indices = np.arange(length, dtype=np.int32)[mask]

    index = _make_index(length, indices, kind)
    sparsified_values = arr[mask]
    return sparsified_values, index
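# Illustrative usage sketch for make_sparse above (hypothetical, assuming
# the sparse helpers _sanitize_values / _make_index from this module are
# available): NaN is the default fill value, so only non-null entries are
# retained as sparse values.
import numpy as np

arr = np.array([0.0, np.nan, 1.0, np.nan, 2.0])
sparse_values, index = make_sparse(arr, kind='block')
# sparse_values -> array([0., 1., 2.])
# index         -> a block-style SparseIndex covering positions 0, 2 and 4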
def value_counts(values, sort=True, ascending=False):
    """
    Compute a histogram of the counts of non-null values

    Returns
    -------
    value_counts : Series
    """
    from collections import defaultdict

    if com.is_integer_dtype(values.dtype):
        values = com._ensure_int64(values)
        keys, counts = lib.value_count_int64(values)
        result = Series(counts, index=keys)
    else:
        counter = defaultdict(lambda: 0)
        values = values[com.notnull(values)]
        for value in values:
            counter[value] += 1
        result = Series(counter)

    if sort:
        result.sort()

        if not ascending:
            result = result[::-1]

    return result
def median(self):
    """
    Compute median value of non-null values
    """
    arr = self.values
    arr = arr[notnull(arr)]
    return tseries.median(arr)
def _format_strings(self):
    if self.float_format is None:
        float_format = print_config.float_format
        if float_format is None:
            fmt_str = '%% .%dg' % print_config.precision
            float_format = lambda x: fmt_str % x
    else:
        float_format = self.float_format

    formatter = com.pprint_thing if self.formatter is None else self.formatter

    def _format(x):
        if self.na_rep is not None and lib.checknull(x):
            if x is None:
                return 'None'
            return self.na_rep
        else:
            # object dtype
            return '%s' % formatter(x)

    vals = self.values

    is_float = lib.map_infer(vals, com.is_float) & notnull(vals)
    leading_space = is_float.any()

    fmt_values = []
    for i, v in enumerate(vals):
        if not is_float[i] and leading_space:
            fmt_values.append(' %s' % _format(v))
        elif is_float[i]:
            fmt_values.append(float_format(v))
        else:
            fmt_values.append(' %s' % _format(v))

    return fmt_values
def dropna(self, axis=0, how='any'):
    """
    Drop 2D from panel, holding passed axis constant

    Parameters
    ----------
    axis : int, default 0
        Axis to hold constant. E.g. axis=1 will drop major_axis entries
        having a certain amount of NA data
    how : {'all', 'any'}, default 'any'
        'any': one or more values are NA in the DataFrame along the
        axis. For 'all' they all must be.

    Returns
    -------
    dropped : Panel
    """
    axis = self._get_axis_number(axis)

    values = self.values
    mask = com.notnull(values)

    for ax in reversed(sorted(set(range(self._AXIS_LEN)) - set([axis]))):
        mask = mask.sum(ax)

    per_slice = np.prod(values.shape[:axis] + values.shape[axis + 1:])

    if how == 'all':
        cond = mask > 0
    else:
        cond = mask == per_slice

    new_ax = self._get_axis(axis)[cond]
    return self.reindex_axis(new_ax, axis=axis)
def bucketcat(series, cats):
    """
    Produce DataFrame from a Series bucketed by category

    Parameters
    ----------
    series : Series
    cats : Series or same-length array
        categories to bucket by; mutually exclusive with 'by'

    Returns
    -------
    DataFrame
    """
    if not isinstance(series, Series):
        series = Series(series, index=np.arange(len(series)))

    cats = np.asarray(cats)

    unique_labels = np.unique(cats)
    unique_labels = unique_labels[com.notnull(unique_labels)]

    # group by
    data = {}
    for label in unique_labels:
        data[label] = series[cats == label]

    return DataFrame(data, columns=unique_labels)
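# Illustrative usage sketch for bucketcat above (hypothetical data): each
# distinct non-null category becomes one column holding the matching
# Series values, with NaN where a position belongs to another category.
import numpy as np
from pandas import Series

s = Series([1.0, 2.0, 3.0, 4.0])
cats = np.array(['a', 'b', 'a', 'b'])
buckets = bucketcat(s, cats)
# columns 'a' and 'b'; column 'a' holds the values at positions 0 and 2,
# column 'b' those at positions 1 and 3, NaN elsewhere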
def na_op(x, y):
    try:
        result = expressions.evaluate(op, str_rep, x, y,
                                      raise_on_error=True, **eval_kwargs)
    except TypeError:
        result = pa.empty(len(x), dtype=x.dtype)
        if isinstance(y, (pa.Array, pd.Series)):
            mask = notnull(x) & notnull(y)
            result[mask] = op(x[mask], y[mask])
        else:
            mask = notnull(x)
            result[mask] = op(x[mask], y)

        result, changed = com._maybe_upcast_putmask(result, ~mask, pa.NA)

    result = com._fill_zeros(result, y, fill_zeros)
    return result
def cumprod(self, axis=0, dtype=None, out=None):
    """
    Overriding numpy's built-in cumprod functionality
    """
    arr = self.copy()
    okLocs = notnull(arr)
    arr[okLocs] = np.cumprod(arr.view(ndarray)[okLocs])
    return arr
def f(arr):
    mask = common.notnull(arr)
    if skipna:
        return _tseries.median(arr[mask])
    else:
        if not mask.all():
            return np.nan
        return _tseries.median(arr)
def nancorr(a, b, method='pearson'):
    """
    a, b: ndarrays
    """
    if len(a) != len(b):
        raise AssertionError('Operands to nancorr must have same size')

    valid = notnull(a) & notnull(b)
    if not valid.all():
        a = a[valid]
        b = b[valid]

    if len(a) == 0:
        return np.nan

    f = get_corr_func(method)
    return f(a, b)
def count(self):
    """
    Return number of observations of Series.

    Returns
    -------
    nobs : int
    """
    return notnull(self.values).sum()
def na_op(x, y):
    try:
        result = expressions.evaluate(op, str_rep, x, y,
                                      raise_on_error=True, **eval_kwargs)
    except TypeError:
        if isinstance(y, (pa.Array, pd.Series)):
            dtype = np.find_common_type([x.dtype, y.dtype], [])
            result = np.empty(x.size, dtype=dtype)
            mask = notnull(x) & notnull(y)
            result[mask] = op(x[mask], y[mask])
        else:
            result = pa.empty(len(x), dtype=x.dtype)
            mask = notnull(x)
            result[mask] = op(x[mask], y)

        result, changed = com._maybe_upcast_putmask(result, ~mask, pa.NA)

    result = com._fill_zeros(result, x, y, name, fill_zeros)
    return result
def _get_workbook_rowdicts(self):
    '''
    Return a list of key-value dicts for all rows in the sheet, with keys
    taken from the first row. Empty values are removed.
    '''
    rows = self.workbook.parse().to_dict(outtype='records')
    rows_ret = list()
    for row in rows:
        ret = dict((k, v) for k, v in row.iteritems() if notnull(v))
        rows_ret.append(ret)
    return rows_ret
def cumsum(self, axis=0, dtype=None, out=None):
    """
    Overriding numpy's built-in cumsum functionality
    """
    arr = self.copy()
    okLocs = notnull(arr)
    result = np.cumsum(arr.view(ndarray)[okLocs])
    arr = arr.astype(result.dtype)
    arr[okLocs] = result
    return arr
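# A minimal standalone sketch of the same NaN-skipping cumsum idea on a
# plain ndarray (illustrative, not from the source): the cumulative sum is
# computed over the non-null slots only, and NaN positions are left alone.
import numpy as np

arr = np.array([1.0, np.nan, 2.0, 3.0])
ok = ~np.isnan(arr)
out = arr.copy()
out[ok] = np.cumsum(arr[ok])
# out -> array([ 1., nan,  3.,  6.])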
def _reindex_columns(self, columns, copy, level, fill_value):
    if level is not None:
        raise Exception("Reindex by level not supported for sparse")

    if com.notnull(fill_value):
        raise NotImplementedError

    # TODO: fill value handling
    sdict = dict((k, v) for k, v in self.iteritems() if k in columns)
    return SparseDataFrame(sdict, index=self.index, columns=columns,
                           default_fill_value=self.default_fill_value)
def stack(frame, level=-1, dropna=True):
    """
    Convert DataFrame to Series with multi-level Index. Columns become the
    second level of the resulting hierarchical index

    Returns
    -------
    stacked : Series
    """
    def factorize(index):
        if index.is_unique:
            return index, np.arange(len(index))
        cat = Categorical(index, ordered=True)
        return cat.categories, cat.codes

    N, K = frame.shape
    if isinstance(frame.columns, MultiIndex):
        if frame.columns._reference_duplicate_name(level):
            msg = ("Ambiguous reference to {0}. The column "
                   "names are not unique.".format(level))
            raise ValueError(msg)

    # Will also convert negative level numbers and check if out of bounds.
    level_num = frame.columns._get_level_number(level)

    if isinstance(frame.columns, MultiIndex):
        return _stack_multi_columns(frame, level_num=level_num,
                                    dropna=dropna)
    elif isinstance(frame.index, MultiIndex):
        new_levels = list(frame.index.levels)
        new_labels = [lab.repeat(K) for lab in frame.index.labels]

        clev, clab = factorize(frame.columns)
        new_levels.append(clev)
        new_labels.append(np.tile(clab, N).ravel())

        new_names = list(frame.index.names)
        new_names.append(frame.columns.name)
        new_index = MultiIndex(levels=new_levels, labels=new_labels,
                               names=new_names, verify_integrity=False)
    else:
        levels, (ilab, clab) = zip(*map(factorize, (frame.index,
                                                    frame.columns)))
        labels = ilab.repeat(K), np.tile(clab, N).ravel()
        new_index = MultiIndex(levels=levels, labels=labels,
                               names=[frame.index.name, frame.columns.name],
                               verify_integrity=False)

    new_values = frame.values.ravel()
    if dropna:
        mask = notnull(new_values)
        new_values = new_values[mask]
        new_index = new_index[mask]

    return Series(new_values, index=new_index)
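# Behavior sketch for stack above, driven through the public
# DataFrame.stack API (which dispatches to this function in pandas of this
# era); with dropna=True the null cell simply disappears from the result.
import numpy as np
import pandas as pd

df = pd.DataFrame([[1.0, np.nan], [3.0, 4.0]],
                  index=['r0', 'r1'], columns=['a', 'b'])
stacked = df.stack()
# MultiIndex Series: ('r0','a') -> 1.0, ('r1','a') -> 3.0,
# ('r1','b') -> 4.0; the null ('r0','b') entry is dropped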
def stack(frame, level=-1, dropna=True):
    """
    Convert DataFrame to Series with multi-level Index. Columns become the
    second level of the resulting hierarchical index

    Returns
    -------
    stacked : Series
    """
    N, K = frame.shape
    if isinstance(level, int) and level < 0:
        level += frame.columns.nlevels

    level = frame.columns._get_level_number(level)

    if isinstance(frame.columns, MultiIndex):
        # forward the caller's dropna setting
        return _stack_multi_columns(frame, level=level, dropna=dropna)
    elif isinstance(frame.index, MultiIndex):
        new_levels = list(frame.index.levels)
        new_levels.append(frame.columns)

        new_labels = [lab.repeat(K) for lab in frame.index.labels]
        new_labels.append(np.tile(np.arange(K), N).ravel())

        new_names = list(frame.index.names)
        new_names.append(frame.columns.name)
        new_index = MultiIndex(levels=new_levels, labels=new_labels,
                               names=new_names)
    else:
        ilabels = np.arange(N).repeat(K)
        clabels = np.tile(np.arange(K), N).ravel()
        new_index = MultiIndex(levels=[frame.index, frame.columns],
                               labels=[ilabels, clabels],
                               names=[frame.index.name, frame.columns.name])

    new_values = frame.values.ravel()
    if dropna:
        mask = notnull(new_values)
        new_values = new_values[mask]
        new_index = new_index[mask]

    return Series(new_values, index=new_index)
def dropna(self, axis=0, how='any', inplace=False, **kwargs):
    """
    Drop 2D from panel, holding passed axis constant

    Parameters
    ----------
    axis : int, default 0
        Axis to hold constant. E.g. axis=1 will drop major_axis entries
        having a certain amount of NA data
    how : {'all', 'any'}, default 'any'
        'any': one or more values are NA in the DataFrame along the
        axis. For 'all' they all must be.
    inplace : bool, default False
        If True, do operation inplace and return None.

    Returns
    -------
    dropped : Panel
    """
    axis = self._get_axis_number(axis)

    values = self.values
    mask = com.notnull(values)

    for ax in reversed(sorted(set(range(self._AXIS_LEN)) - set([axis]))):
        mask = mask.sum(ax)

    per_slice = np.prod(values.shape[:axis] + values.shape[axis + 1:])

    if how == 'all':
        cond = mask > 0
    else:
        cond = mask == per_slice

    new_ax = self._get_axis(axis)[cond]
    result = self.reindex_axis(new_ax, axis=axis)
    if inplace:
        self._update_inplace(result)
    else:
        return result
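# Usage sketch for Panel.dropna above, assuming a pandas version that
# still ships Panel (it was removed in 0.25); with how='any' a single NaN
# in a slice is enough to drop that axis entry.
import numpy as np
import pandas as pd

vals = np.random.randn(2, 3, 4)
vals[0, 0, 0] = np.nan
p = pd.Panel(vals)
trimmed = p.dropna(axis=1, how='any')
# the first major_axis entry is dropped because one of its values is NaN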
def _format_strings(self, use_unicode=False):
    if self.float_format is None:
        float_format = print_config.float_format
        if float_format is None:
            fmt_str = '%% .%dg' % print_config.precision
            float_format = lambda x: fmt_str % x
    else:
        float_format = self.float_format

    if use_unicode:
        def _strify(x):
            return _stringify(x, print_config.encoding)
        formatter = _strify if self.formatter is None else self.formatter
    else:
        formatter = str if self.formatter is None else self.formatter

    def _format(x):
        if self.na_rep is not None and lib.checknull(x):
            if x is None:
                return 'None'
            return self.na_rep
        else:
            # object dtype
            return '%s' % formatter(x)

    vals = self.values

    is_float = lib.map_infer(vals, com.is_float) & notnull(vals)
    leading_space = is_float.any()

    fmt_values = []
    for i, v in enumerate(vals):
        if not is_float[i] and leading_space:
            fmt_values.append(' %s' % _format(v))
        elif is_float[i]:
            fmt_values.append(float_format(v))
        else:
            fmt_values.append(' %s' % _format(v))

    return fmt_values
def to_frame(self, filter_observations=True):
    """
    Transform wide format into long (stacked) format as DataFrame

    Parameters
    ----------
    filter_observations : boolean, default True
        Drop (major, minor) pairs without a complete set of observations
        across all the items

    Returns
    -------
    y : DataFrame
    """
    _, N, K = self.shape

    if filter_observations:
        mask = com.notnull(self.values).all(axis=0)
        # size = mask.sum()
        selector = mask.ravel()
    else:
        # size = N * K
        selector = slice(None, None)

    data = {}
    for item in self.items:
        data[item] = self[item].values.ravel()[selector]

    major_labels = np.arange(N).repeat(K)[selector]

    # Anyone think of a better way to do this? np.repeat does not
    # do what I want
    minor_labels = np.arange(K).reshape(1, K)[np.zeros(N, dtype=int)]
    minor_labels = minor_labels.ravel()[selector]

    index = MultiIndex(levels=[self.major_axis, self.minor_axis],
                       labels=[major_labels, minor_labels],
                       names=['major', 'minor'])

    return DataFrame(data, index=index, columns=self.items)
def _attempt_YYYYMMDD(arg, coerce):
    """ try to parse the YYYYMMDD/%Y%m%d format, try to deal with NaT-like,
        arg is passed in as an object dtype, but could really be ints/strings
        with nan-like/or floats (e.g. with nan) """

    def calc(carg):
        # calculate the actual result
        carg = carg.astype(object)
        return tslib.array_to_datetime(lib.try_parse_year_month_day(
            carg / 10000, carg / 100 % 100, carg % 100), coerce=coerce)

    def calc_with_mask(carg, mask):
        result = np.empty(carg.shape, dtype='M8[ns]')
        iresult = result.view('i8')
        iresult[~mask] = tslib.iNaT
        result[mask] = calc(carg[mask].astype(np.float64).astype(
            np.int64)).astype('M8[ns]')
        return result

    # try intlike / strings that are ints
    try:
        return calc(arg.astype(np.int64))
    except:
        pass

    # a float with actual np.nan
    try:
        carg = arg.astype(np.float64)
        return calc_with_mask(carg, com.notnull(carg))
    except:
        pass

    # string with NaN-like
    try:
        mask = ~lib.ismember(arg, tslib._nat_strings)
        return calc_with_mask(arg, mask)
    except:
        pass

    return None
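# Behavior sketch for _attempt_YYYYMMDD above via the public to_datetime
# entry point, which routes format='%Y%m%d' through this fast path in
# pandas of this era (the spelling of the coerce/errors option varies by
# version):
import numpy as np
import pandas as pd

arg = pd.Series([20130101, 20130102, np.nan])
print(pd.to_datetime(arg, format='%Y%m%d', errors='coerce'))
# -> 2013-01-01, 2013-01-02, NaT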
def _reindex_columns(self, columns, copy, level, fill_value, limit=None,
                     takeable=False):
    if level is not None:
        raise TypeError('Reindex by level not supported for sparse')

    if com.notnull(fill_value):
        raise NotImplementedError

    if limit:
        raise NotImplementedError

    # TODO: fill value handling
    sdict = dict((k, v) for k, v in compat.iteritems(self) if k in columns)
    return SparseDataFrame(sdict, index=self.index, columns=columns,
                           default_fill_value=self._default_fill_value)
def value_counts(values, sort=True, ascending=False):
    """
    Compute a histogram of the counts of non-null values

    Parameters
    ----------
    values : ndarray (1-d)
    sort : boolean, default True
        Sort by values
    ascending : boolean, default False
        Sort in ascending order

    Returns
    -------
    value_counts : Series
    """
    from pandas.core.series import Series
    from collections import defaultdict

    values = np.asarray(values)

    if com.is_integer_dtype(values.dtype):
        values = com._ensure_int64(values)
        keys, counts = lib.value_count_int64(values)
        result = Series(counts, index=keys)
    else:
        counter = defaultdict(lambda: 0)
        values = values[com.notnull(values)]
        for value in values:
            counter[value] += 1
        result = Series(counter)

    if sort:
        result.sort()

        if not ascending:
            result = result[::-1]

    return result
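# Illustrative usage sketch for value_counts above (assuming the module's
# lib/com internals are importable): integer input takes the cython fast
# path, everything else the defaultdict path with nulls dropped first.
import numpy as np

vals = np.array([3, 1, 3, 2, 3, 1])
print(value_counts(vals))
# counts sorted descending by default: 3 -> 3, 1 -> 2, 2 -> 1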
def _format_strings(self):
    if self.float_format is None:
        float_format = get_option("display.float_format")
        if float_format is None:
            fmt_str = '%% .%dg' % get_option("display.precision")
            float_format = lambda x: fmt_str % x
    else:
        float_format = self.float_format

    formatter = (lambda x: com.pprint_thing(x,
                                            escape_chars=('\t', '\r', '\n'))) \
        if self.formatter is None else self.formatter

    def _format(x):
        if self.na_rep is not None and lib.checknull(x):
            if x is None:
                return 'None'
            return self.na_rep
        else:
            # object dtype
            return '%s' % formatter(x)

    vals = self.values

    is_float = lib.map_infer(vals, com.is_float) & notnull(vals)
    leading_space = is_float.any()

    fmt_values = []
    for i, v in enumerate(vals):
        if not is_float[i] and leading_space:
            fmt_values.append(' %s' % _format(v))
        elif is_float[i]:
            fmt_values.append(float_format(v))
        else:
            fmt_values.append(' %s' % _format(v))

    return fmt_values
def asOf(self, date):
    """
    Return last good (non-NaN) value in TimeSeries if value is NaN for
    requested date.

    If there is no good value, NaN is returned.

    Parameters
    ----------
    date : datetime or similar value

    Notes
    -----
    Dates are assumed to be sorted

    Returns
    -------
    value or NaN
    """
    if isinstance(date, basestring):
        date = datetools.to_datetime(date)

    v = self.get(date)

    if isnull(v):
        candidates = self.index[notnull(self)]
        index = candidates.searchsorted(date)

        if index > 0:
            asOfDate = candidates[index - 1]
        else:
            return NaN

        return self.get(asOfDate)
    else:
        return v
def _format_with(self, fmt_str):
    fmt_values = [fmt_str % x if notnull(x) else self.na_rep
                  for x in self.values]
    return _trim_zeros(fmt_values, self.na_rep)
def to_frame(self, filter_observations=True):
    """
    Transform wide format into long (stacked) format as DataFrame whose
    columns are the Panel's items and whose index is a MultiIndex formed
    of the Panel's major and minor axes.

    Parameters
    ----------
    filter_observations : boolean, default True
        Drop (major, minor) pairs without a complete set of observations
        across all the items

    Returns
    -------
    y : DataFrame
    """
    _, N, K = self.shape

    if filter_observations:
        # shaped like the return DataFrame
        mask = com.notnull(self.values).all(axis=0)
        # size = mask.sum()
        selector = mask.ravel()
    else:
        # size = N * K
        selector = slice(None, None)

    data = {}
    for item in self.items:
        data[item] = self[item].values.ravel()[selector]

    def construct_multi_parts(idx, n_repeat, n_shuffle=1):
        axis_idx = idx.to_hierarchical(n_repeat, n_shuffle)
        labels = [x[selector] for x in axis_idx.labels]
        levels = axis_idx.levels
        names = axis_idx.names
        return labels, levels, names

    def construct_index_parts(idx, major=True):
        levels = [idx]
        if major:
            labels = [np.arange(N).repeat(K)[selector]]
            names = idx.name or 'major'
        else:
            labels = np.arange(K).reshape(1, K)[np.zeros(N, dtype=int)]
            labels = [labels.ravel()[selector]]
            names = idx.name or 'minor'
        names = [names]
        return labels, levels, names

    if isinstance(self.major_axis, MultiIndex):
        major_labels, major_levels, major_names = construct_multi_parts(
            self.major_axis, n_repeat=K)
    else:
        major_labels, major_levels, major_names = construct_index_parts(
            self.major_axis)

    if isinstance(self.minor_axis, MultiIndex):
        minor_labels, minor_levels, minor_names = construct_multi_parts(
            self.minor_axis, n_repeat=N, n_shuffle=K)
    else:
        minor_labels, minor_levels, minor_names = construct_index_parts(
            self.minor_axis, major=False)

    levels = major_levels + minor_levels
    labels = major_labels + minor_labels
    names = major_names + minor_names

    index = MultiIndex(levels=levels, labels=labels, names=names,
                       verify_integrity=False)

    return DataFrame(data, index=index, columns=self.items)
def lreshape(data, groups, dropna=True, label=None):
    """
    Reshape long-format data to wide. Generalized inverse of DataFrame.pivot

    Parameters
    ----------
    data : DataFrame
    groups : dict
        {new_name : list_of_columns}
    dropna : boolean, default True

    Examples
    --------
    >>> data
       hr1  hr2     team  year1  year2
    0  514  545  Red Sox   2007   2008
    1  573  526  Yankees   2007   2008

    >>> pd.lreshape(data, {'year': ['year1', 'year2'],
    ...                    'hr': ['hr1', 'hr2']})
          team   hr  year
    0  Red Sox  514  2007
    1  Yankees  573  2007
    2  Red Sox  545  2008
    3  Yankees  526  2008

    Returns
    -------
    reshaped : DataFrame
    """
    if isinstance(groups, dict):
        keys = groups.keys()
        values = groups.values()
    else:
        keys, values = zip(*groups)

    all_cols = list(set.union(*[set(x) for x in values]))
    id_cols = list(data.columns.diff(all_cols))

    K = len(values[0])

    for seq in values:
        if len(seq) != K:
            raise ValueError('All column lists must be same length')

    mdata = {}
    pivot_cols = []

    for target, names in zip(keys, values):
        mdata[target] = com._concat_compat([data[col].values
                                            for col in names])
        pivot_cols.append(target)

    for col in id_cols:
        mdata[col] = np.tile(data[col].values, K)

    if dropna:
        mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
        for c in pivot_cols:
            mask &= notnull(mdata[c])
        if not mask.all():
            mdata = dict((k, v[mask]) for k, v in mdata.iteritems())

    return DataFrame(mdata, columns=id_cols + pivot_cols)
def _guess_datetime_format_for_array(arr, **kwargs):
    # Try to guess the format based on the first non-NaN element
    non_nan_elements = com.notnull(arr).nonzero()[0]
    if len(non_nan_elements):
        return _guess_datetime_format(arr[non_nan_elements[0]], **kwargs)
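# Illustrative usage sketch for _guess_datetime_format_for_array above
# (hypothetical array, assuming the _guess_datetime_format helper from
# this module): the guess is based solely on the first non-null element.
import numpy as np

arr = np.array([np.nan, '2013-01-01', '01/02/2013'], dtype=object)
fmt = _guess_datetime_format_for_array(arr)
# -> '%Y-%m-%d', guessed from the first non-null entry only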
def test_notnull():
    assert notnull(1.)
    assert not notnull(None)
    assert not notnull(np.NaN)
    assert not notnull(np.inf)
    assert not notnull(-np.inf)
def remove_na(arr):
    """
    Return array containing only true/non-NaN values, possibly empty.
    """
    return arr[notnull(arr)]
def test_isnull_datetime():
    assert not isnull(datetime.now())
    assert notnull(datetime.now())
def scatter_matrix_all(frame, alpha=0.5, figsize=None, grid=False,
                       diagonal='hist', marker='.', density_kwds=None,
                       hist_kwds=None, range_padding=0.05, **kwds):

    import matplotlib.pyplot as plt
    from matplotlib.artist import setp
    import numpy as np
    import pandas as pd  # needed for pd.DataFrame in the boxplot branch
    import pandas.core.common as com
    from pandas.compat import range, lrange, zip
    from statsmodels.nonparametric.smoothers_lowess import lowess

    df = frame
    num_cols = frame._get_numeric_data().columns.values
    n = df.columns.size
    fig, axes = plt.subplots(nrows=n, ncols=n, figsize=figsize,
                             squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    mask = com.notnull(df)
    marker = _get_marker_compat(marker)

    hist_kwds = hist_kwds or {}
    density_kwds = density_kwds or {}

    # workaround because `c='b'` is hardcoded in matplotlibs scatter method
    kwds.setdefault('c', plt.rcParams['patch.facecolor'])

    boundaries_list = []
    for a in df.columns:
        if a in num_cols:
            values = df[a].values[mask[a].values]
        else:
            values = df[a].value_counts()
        rmin_, rmax_ = np.min(values), np.max(values)
        rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
        boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))

    for i, a in zip(lrange(n), df.columns):
        for j, b in zip(lrange(n), df.columns):
            ax = axes[i, j]

            if i == j:
                if a in num_cols:  # numerical variable
                    values = df[a].values[mask[a].values]
                    # Deal with the diagonal by drawing a histogram there.
                    if diagonal == 'hist':
                        ax.hist(values, **hist_kwds)
                    elif diagonal in ('kde', 'density'):
                        from scipy.stats import gaussian_kde
                        y = values
                        gkde = gaussian_kde(y)
                        ind = np.linspace(y.min(), y.max(), 1000)
                        ax.plot(ind, gkde.evaluate(ind), **density_kwds)
                    ax.set_xlim(boundaries_list[i])
                else:  # categorical variable
                    values = df[a].value_counts()
                    ax.bar(list(range(df[a].nunique())), values)
            else:
                common = (mask[a] & mask[b]).values
                # two numerical variables
                if a in num_cols and b in num_cols:
                    if i > j:
                        ax.scatter(df[b][common], df[a][common],
                                   marker=marker, alpha=alpha, **kwds)
                        # The following 2 lines add the lowess smoothing
                        ys = lowess(df[a][common], df[b][common])
                        ax.plot(ys[:, 0], ys[:, 1], 'red')
                    else:
                        pearR = df[[a, b]].corr()
                        ax.text(df[b].min(), df[a].min(),
                                'r = %.4f' % (pearR.iloc[0][1]))
                    ax.set_xlim(boundaries_list[j])
                    ax.set_ylim(boundaries_list[i])
                # two categorical variables
                elif a not in num_cols and b not in num_cols:
                    if i > j:
                        from statsmodels.graphics import mosaicplot
                        mosaicplot.mosaic(df, [b, a], ax,
                                          labelizer=lambda k: '')
                # one numerical variable and one categorical variable
                else:
                    if i > j:
                        tol = pd.DataFrame(df[[a, b]])
                        if a in num_cols:
                            label = [k for k, v in tol.groupby(b)]
                            values = [v[a].tolist()
                                      for k, v in tol.groupby(b)]
                            ax.boxplot(values, labels=label)
                        else:
                            label = [k for k, v in tol.groupby(a)]
                            values = [v[b].tolist()
                                      for k, v in tol.groupby(a)]
                            ax.boxplot(values, labels=label, vert=False)

            ax.set_xlabel('')
            ax.set_ylabel('')

            _label_axis(ax, kind='x', label=b, position='bottom',
                        rotate=True)
            _label_axis(ax, kind='y', label=a, position='left')

            if j != 0:
                ax.yaxis.set_visible(False)
            if i != n - 1:
                ax.xaxis.set_visible(False)

    for ax in axes.flat:
        setp(ax.get_xticklabels(), fontsize=8)
        setp(ax.get_yticklabels(), fontsize=8)

    return fig
def scatter_matrix_lowess1(frame, alpha=0.5, figsize=None, grid=False,
                           diagonal='hist', marker='.', density_kwds=None,
                           hist_kwds=None, range_padding=0.05, **kwds):
    """
    Draw a matrix of scatter plots with lowess smoother.

    This is an adapted version of the pandas scatter_matrix function.

    Parameters
    ----------
    frame : DataFrame
    alpha : float, optional
        amount of transparency applied
    figsize : (float,float), optional
        a tuple (width, height) in inches
    ax : Matplotlib axis object, optional
    grid : bool, optional
        setting this to True will show the grid
    diagonal : {'hist', 'kde'}
        pick between 'kde' and 'hist' for either Kernel Density Estimation
        or Histogram plot in the diagonal
    marker : str, optional
        Matplotlib marker type, default '.'
    hist_kwds : other plotting keyword arguments
        To be passed to hist function
    density_kwds : other plotting keyword arguments
        To be passed to kernel density estimate plot
    range_padding : float, optional
        relative extension of axis range in x and y with respect to
        (x_max - x_min) or (y_max - y_min), default 0.05
    kwds : other plotting keyword arguments
        To be passed to scatter function

    Examples
    --------
    >>> df = DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])
    >>> scatter_matrix_lowess1(df, alpha=0.2)
    """
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.artist import setp
    import pandas.core.common as com
    from pandas.compat import range, lrange, lmap, map, zip
    from statsmodels.nonparametric.smoothers_lowess import lowess

    df = frame._get_numeric_data()
    n = df.columns.size
    fig, axes = plt.subplots(nrows=n, ncols=n, figsize=figsize,
                             squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    mask = com.notnull(df)
    marker = _get_marker_compat(marker)

    hist_kwds = hist_kwds or {}
    density_kwds = density_kwds or {}

    # workaround because `c='b'` is hardcoded in matplotlibs scatter method
    kwds.setdefault('c', plt.rcParams['patch.facecolor'])

    boundaries_list = []
    for a in df.columns:
        values = df[a].values[mask[a].values]
        rmin_, rmax_ = np.min(values), np.max(values)
        rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
        boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))

    for i, a in zip(lrange(n), df.columns):
        for j, b in zip(lrange(n), df.columns):
            ax = axes[i, j]

            if i == j:
                values = df[a].values[mask[a].values]

                # Deal with the diagonal by drawing a histogram there.
                if diagonal == 'hist':
                    ax.hist(values, **hist_kwds)
                elif diagonal in ('kde', 'density'):
                    from scipy.stats import gaussian_kde
                    y = values
                    gkde = gaussian_kde(y)
                    ind = np.linspace(y.min(), y.max(), 1000)
                    ax.plot(ind, gkde.evaluate(ind), **density_kwds)
                ax.set_xlim(boundaries_list[i])
            else:
                common = (mask[a] & mask[b]).values
                ax.scatter(df[b][common], df[a][common], marker=marker,
                           alpha=alpha, **kwds)
                # The following 2 lines are new and add the lowess smoothing
                ys = lowess(df[a][common], df[b][common])
                ax.plot(ys[:, 0], ys[:, 1], 'red', linewidth=1)

                ax.set_xlim(boundaries_list[j])
                ax.set_ylim(boundaries_list[i])

            ax.set_xlabel('')
            ax.set_ylabel('')

            _label_axis(ax, kind='x', label=b, position='bottom',
                        rotate=True)
            _label_axis(ax, kind='y', label=a, position='left')

            if j != 0:
                ax.yaxis.set_visible(False)
            if i != n - 1:
                ax.xaxis.set_visible(False)

    for ax in axes.flat:
        setp(ax.get_xticklabels(), fontsize=8)
        setp(ax.get_yticklabels(), fontsize=8)

    return fig
def get_median(x):
    mask = notnull(x)
    if not skipna and not mask.all():
        return np.nan
    return algos.median(x[mask])
def na_op(x, y):

    # dispatch to the categorical if we have a categorical
    # in either operand
    if is_categorical_dtype(x):
        return op(x, y)
    elif is_categorical_dtype(y) and not isscalar(y):
        return op(y, x)

    if is_object_dtype(x.dtype):
        if isinstance(y, list):
            y = lib.list_to_object_array(y)

        if isinstance(y, (np.ndarray, pd.Series)):
            if not is_object_dtype(y.dtype):
                result = lib.vec_compare(x, y.astype(np.object_), op)
            else:
                result = lib.vec_compare(x, y, op)
        else:
            result = lib.scalar_compare(x, y, op)
    else:

        # we want to compare like types
        # we only want to convert to integer like if
        # we are not NotImplemented, otherwise
        # we would allow datetime64 (but viewed as i8) against
        # integer comparisons
        if is_datetimelike_v_numeric(x, y):
            raise TypeError("invalid type comparison")

        # numpy does not like comparisons vs None
        if isscalar(y) and isnull(y):
            y = np.nan

        # we have a datetime/timedelta and may need to convert
        mask = None
        if needs_i8_conversion(x) or (not isscalar(y) and
                                      needs_i8_conversion(y)):

            if isscalar(y):
                y = _index.convert_scalar(x, _values_from_object(y))
            else:
                y = y.view('i8')

            if name == '__ne__':
                mask = notnull(x)
            else:
                mask = isnull(x)

            x = x.view('i8')

        try:
            result = getattr(x, name)(y)
            if result is NotImplemented:
                raise TypeError("invalid type comparison")
        except AttributeError:
            result = op(x, y)

        if mask is not None and mask.any():
            result[mask] = False

    return result
def test_notnull():
    assert notnull(1.)
    assert not notnull(None)
    assert not notnull(np.NaN)

    with cf.option_context("mode.use_inf_as_null", False):
        assert notnull(np.inf)
        assert notnull(-np.inf)

        arr = np.array([1.5, np.inf, 3.5, -np.inf])
        result = notnull(arr)
        assert result.all()

    with cf.option_context("mode.use_inf_as_null", True):
        assert not notnull(np.inf)
        assert not notnull(-np.inf)

        arr = np.array([1.5, np.inf, 3.5, -np.inf])
        result = notnull(arr)
        assert result.sum() == 2

    with cf.option_context("mode.use_inf_as_null", False):
        float_series = Series(np.random.randn(5))
        obj_series = Series(np.random.randn(5), dtype=object)
        assert isinstance(notnull(float_series), Series)
        assert isinstance(notnull(obj_series), Series)
def _valid_sp_values(self):
    sp_vals = self.sp_values
    mask = com.notnull(sp_vals)
    return sp_vals[mask]
def get_median(x):
    mask = notnull(x)
    if not skipna and not mask.all():
        return np.nan
    return algos.median(_values_from_object(x[mask]))
def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
                  precision=3, name=None):
    if name is None and isinstance(x, Series):
        name = x.name
    x = np.asarray(x)

    side = 'left' if right else 'right'
    ids = bins.searchsorted(x, side=side)

    na_mask = com.notnull(x)
    above = na_mask & (ids == len(bins))
    below = na_mask & (ids == 0)

    if above.any():
        raise ValueError('Values fall past last bin: %s' % str(x[above]))

    if below.any():
        raise ValueError('Values fall before first bin: %s' % str(x[below]))

    mask = com.isnull(x)
    has_nas = mask.any()

    if labels is not False:
        if labels is None:
            fmt = lambda v: _format_label(v, precision=precision)
            if right:
                levels = ['(%s, %s]' % (fmt(a), fmt(b))
                          for a, b in zip(bins, bins[1:])]
            else:
                levels = ['[%s, %s)' % (fmt(a), fmt(b))
                          for a, b in zip(bins, bins[1:])]
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError('Bin labels must be one fewer than '
                                 'the number of bin edges')
            levels = labels

        levels = np.asarray(levels, dtype=object)
        if has_nas:
            np.putmask(ids, mask, 0)
        fac = Factor(ids - 1, levels, name=name)
    else:
        fac = ids - 1
        if has_nas:
            # cast the shifted ids to float so NaN can be inserted
            fac = fac.astype(np.float64)
            np.putmask(fac, mask, np.nan)

    if not retbins:
        return fac

    return fac, bins
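# Behavior sketch for _bins_to_cuts above through the public pd.cut, which
# wraps this helper in pandas of this era: values are binned right-closed
# by default and NaNs propagate into the result.
import numpy as np
import pandas as pd

out = pd.cut(np.array([1.0, 4.0, np.nan, 9.0]), bins=[0, 3, 10])
# -> (0, 3], (3, 10], NaN, (3, 10]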
def plots_2D_vs_response(self, range_padding=0.05):
    """
    Make 2D correlation plots (hexbin) for numerical data vs the output
    variable 'Response' (from the Prudential kaggle competition).

    Strongly inspired by / code copied from scatter_matrix() from
    pandas/tools/plotting.py.

    Plot normal and with log-z enabled next to each other.
    """
    df = self._df._get_numeric_data()
    n = df.columns.size

    naxes = n * n
    mask = com.notnull(df)

    j = -1
    boundaries_list = []
    nbins_list = []
    for a in df.columns:
        values = df[a].values[mask[a].values]
        rmin_, rmax_ = np.min(values), np.max(values)
        rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
        boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))
        nbins = len(df[a].unique())
        if nbins > 10:
            nbins_list.append(10)
        else:
            nbins_list.append(nbins)
        #print(nbins)
        if a == 'Response':
            # j is used below to access the boundaries_list variable
            j = len(boundaries_list) - 1

    if j < 0:
        print("Error: Response-variable not found")

    kwds = {'bins': 'log'}
    for i, a in zip(lrange(n), df.columns):
        if a == 'Response':
            continue
        elif a == 'Unnamed: 0':
            continue

        fs = fig_summary()
        #fs.mean = average(df[var_name])

        # http://matplotlib.org/examples/pylab_examples/subplots_demo.html
        #fig, axes = plt.subplots(1, 2, sharey=True)
        fig, axes = plt.subplots(1, 2)
        plt.subplots_adjust(wspace=0.4)

        common = (mask[a] & mask['Response']).values

        for k, ax in enumerate(axes):
            #cmap=plt.cm.YlOrRd_r
            if k == 0:
                img = ax.hexbin(df['Response'][common], df[a][common],
                                gridsize=(nbins_list[j], nbins_list[i]),
                                cmap=plt.cm.Blues_r)
            else:
                img = ax.hexbin(df['Response'][common], df[a][common],
                                gridsize=(nbins_list[j], nbins_list[i]),
                                cmap=plt.cm.Blues_r, **kwds)
            ax.set_xlim(boundaries_list[j])
            ax.set_ylim(boundaries_list[i])
            ax.set_xlabel('Response')
            ax.set_ylabel(a)
            cb = plt.colorbar(img, ax=ax)
            if k == 0:
                cb.set_label('entries')
            else:
                cb.set_label('log(entries)')

        fs.xvar = 'Response'
        fs.yvar = a
        fs.label = "%s_%s" % (a, 'Response')
        #print(fs.label)
        fs.fig_path = self._output_dir
        fs.fig_rel_path = self._rel_dir + fs.label + ".png"
        self.list_fig_summary.append(fs)

        #plt.show()
        print("figure made: ", fs.fig_path + fs.fig_rel_path)
        plt.savefig(fs.fig_path + fs.fig_rel_path)
def scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False,
                   diagonal='hist', marker='.', **kwds):
    """
    Draw a matrix of scatter plots.

    Parameters
    ----------
    alpha : amount of transparency applied
    figsize : a tuple (width, height) in inches
    ax : Matplotlib axis object
    grid : setting this to True will show the grid
    diagonal : pick between 'kde' and 'hist' for either Kernel Density
        Estimation or Histogram plot in the diagonal
    kwds : other plotting keyword arguments
        To be passed to scatter function

    Examples
    --------
    >>> df = DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])
    >>> scatter_matrix(df, alpha=0.2)
    """
    df = frame._get_numeric_data()
    n = df.columns.size
    fig, axes = _subplots(nrows=n, ncols=n, figsize=figsize, ax=ax,
                          squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    mask = com.notnull(df)

    for i, a in zip(range(n), df.columns):
        for j, b in zip(range(n), df.columns):
            if i == j:
                values = df[a].values[mask[a].values]

                # Deal with the diagonal by drawing a histogram there.
                if diagonal == 'hist':
                    axes[i, j].hist(values)
                elif diagonal in ('kde', 'density'):
                    from scipy.stats import gaussian_kde
                    y = values
                    gkde = gaussian_kde(y)
                    ind = np.linspace(y.min(), y.max(), 1000)
                    axes[i, j].plot(ind, gkde.evaluate(ind), **kwds)
            else:
                common = (mask[a] & mask[b]).values
                axes[i, j].scatter(df[b][common], df[a][common],
                                   marker=marker, alpha=alpha, **kwds)

            axes[i, j].set_xlabel('')
            axes[i, j].set_ylabel('')
            axes[i, j].set_xticklabels([])
            axes[i, j].set_yticklabels([])
            ticks = df.index

            is_datetype = ticks.inferred_type in ('datetime', 'date',
                                                  'datetime64')

            if ticks.is_numeric() or is_datetype:
                """
                Matplotlib supports numeric values or datetime objects as
                xaxis values. Taking LBYL approach here, by the time
                matplotlib raises exception when using non numeric/datetime
                values for xaxis, several actions are already taken by plt.
                """
                ticks = ticks._mpl_repr()

            # setup labels
            if i == 0 and j % 2 == 1:
                axes[i, j].set_xlabel(b, visible=True)
                #axes[i, j].xaxis.set_visible(True)
                axes[i, j].set_xlabel(b)
                axes[i, j].set_xticklabels(ticks)
                axes[i, j].xaxis.set_ticks_position('top')
                axes[i, j].xaxis.set_label_position('top')
            if i == n - 1 and j % 2 == 0:
                axes[i, j].set_xlabel(b, visible=True)
                #axes[i, j].xaxis.set_visible(True)
                axes[i, j].set_xlabel(b)
                axes[i, j].set_xticklabels(ticks)
                axes[i, j].xaxis.set_ticks_position('bottom')
                axes[i, j].xaxis.set_label_position('bottom')
            if j == 0 and i % 2 == 0:
                axes[i, j].set_ylabel(a, visible=True)
                #axes[i, j].yaxis.set_visible(True)
                axes[i, j].set_ylabel(a)
                axes[i, j].set_yticklabels(ticks)
                axes[i, j].yaxis.set_ticks_position('left')
                axes[i, j].yaxis.set_label_position('left')
            if j == n - 1 and i % 2 == 1:
                axes[i, j].set_ylabel(a, visible=True)
                #axes[i, j].yaxis.set_visible(True)
                axes[i, j].set_ylabel(a)
                axes[i, j].set_yticklabels(ticks)
                axes[i, j].yaxis.set_ticks_position('right')
                axes[i, j].yaxis.set_label_position('right')

            axes[i, j].grid(b=grid)

    return axes