def compare(s, name): a = getattr(s.dt, prop) b = get_expected(s, prop) if not (is_list_like(a) and is_list_like(b)): self.assertEqual(a, b) else: tm.assert_series_equal(a, b)
def cartesian_product(X): """ Numpy version of itertools.product or pandas.compat.product. Sometimes faster (for large inputs)... Parameters ---------- X : list-like of list-likes Returns ------- product : list of ndarrays Examples -------- >>> cartesian_product([list('ABC'), [1, 2]]) [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='|S1'), array([1, 2, 1, 2, 1, 2])] See also -------- itertools.product : Cartesian product of input iterables. Equivalent to nested for-loops. pandas.compat.product : An alias for itertools.product. """ msg = "Input must be a list-like of list-likes" if not is_list_like(X): raise TypeError(msg) for x in X: if not is_list_like(x): raise TypeError(msg) if len(X) == 0: return [] lenX = np.fromiter((len(x) for x in X), dtype=np.intp) cumprodX = np.cumproduct(lenX) a = np.roll(cumprodX, 1) a[0] = 1 if cumprodX[-1] != 0: b = cumprodX[-1] / cumprodX else: # if any factor is empty, the cartesian product is empty b = np.zeros_like(cumprodX) return [np.tile(np.repeat(np.asarray(com.values_from_object(x)), b[i]), np.product(a[i])) for i, x in enumerate(X)]
def _delegate_property_get(self, name): from pandas import Series result = getattr(self.values, name) # maybe need to upcast (ints) if isinstance(result, np.ndarray): if is_integer_dtype(result): result = result.astype('int64') elif not is_list_like(result): return result result = np.asarray(result) # blow up if we operate on categories if self.orig is not None: result = take_1d(result, self.orig.cat.codes) # return the result as a Series, which is by definition a copy result = Series(result, index=self.index, name=self.name) # setting this object will show a SettingWithCopyWarning/Error result.is_copy = ("modifications to a property of a datetimelike " "object are not supported and are discarded. " "Change values on the original.") return result
def conform(self, rhs): """ inplace conform rhs """ if not is_list_like(rhs): rhs = [rhs] if isinstance(rhs, np.ndarray): rhs = rhs.ravel() return rhs
def wrapper(self, other): msg = "cannot compare a TimedeltaIndex with type {0}" func = getattr(super(TimedeltaIndex, self), opname) if _is_convertible_to_td(other) or other is NaT: try: other = _to_m8(other) except ValueError: # failed to parse as timedelta raise TypeError(msg.format(type(other))) result = func(other) if isna(other): result.fill(nat_result) else: if not is_list_like(other): raise TypeError(msg.format(type(other))) other = TimedeltaIndex(other).values result = func(other) result = _values_from_object(result) if isinstance(other, Index): o_mask = other.values.view('i8') == iNaT else: o_mask = other.view('i8') == iNaT if o_mask.any(): result[o_mask] = nat_result if self.hasnans: result[self._isnan] = nat_result # support of bool dtype indexers if is_bool_dtype(result): return result return Index(result)
def wrapper(self, other): op = getattr(self.asi8, opname) if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): return NotImplemented if is_list_like(other) and len(other) != len(self): raise ValueError("Lengths must match") if isinstance(other, Period): self._check_compatible_with(other) result = op(other.ordinal) elif isinstance(other, cls): self._check_compatible_with(other) result = op(other.asi8) mask = self._isnan | other._isnan if mask.any(): result[mask] = nat_result return result elif other is NaT: result = np.empty(len(self.asi8), dtype=bool) result.fill(nat_result) else: other = Period(other, freq=self.freq) result = op(other.ordinal) if self._hasnans: result[self._isnan] = nat_result return result
def wrapper(self, other): if _is_convertible_to_td(other) or other is NaT: try: other = _to_m8(other) except ValueError: # failed to parse as timedelta return ops.invalid_comparison(self, other, op) result = meth(self, other) if isna(other): result.fill(nat_result) elif not is_list_like(other): return ops.invalid_comparison(self, other, op) elif len(other) != len(self): raise ValueError("Lengths must match") else: try: other = type(self)._from_sequence(other)._data except (ValueError, TypeError): return ops.invalid_comparison(self, other, op) result = meth(self, other) result = com.values_from_object(result) o_mask = np.array(isna(other)) if o_mask.any(): result[o_mask] = nat_result if self._hasnans: result[self._isnan] = nat_result return result
def _gotitem(self, key, ndim, subset=None): """ Sub-classes to define. Return a sliced object. Parameters ---------- key : string / list of selections ndim : 1,2 requested ndim of result subset : object, default None subset to act on """ # create a new object to prevent aliasing if subset is None: subset = self.obj # we need to make a shallow copy of ourselves # with the same groupby kwargs = {attr: getattr(self, attr) for attr in self._attributes} # Try to select from a DataFrame, falling back to a Series try: groupby = self._groupby[key] except IndexError: groupby = self._groupby self = self.__class__(subset, groupby=groupby, parent=self, **kwargs) self._reset_cache() if subset.ndim == 2: if is_scalar(key) and key in subset or is_list_like(key): self._selection = key return self
def prep_ndarray(values, copy=True): if not isinstance(values, (np.ndarray, ABCSeries, Index)): if len(values) == 0: return np.empty((0, 0), dtype=object) def convert(v): return maybe_convert_platform(v) # we could have a 1-dim or 2-dim list here # this is equiv of np.asarray, but does object conversion # and platform dtype preservation try: if is_list_like(values[0]) or hasattr(values[0], 'len'): values = np.array([convert(v) for v in values]) elif isinstance(values[0], np.ndarray) and values[0].ndim == 0: # GH#21861 values = np.array([convert(v) for v in values]) else: values = convert(values) except (ValueError, TypeError): values = convert(values) else: # drop subclass info, do not copy data values = np.asarray(values) if copy: values = values.copy() if values.ndim == 1: values = values.reshape((values.shape[0], 1)) elif values.ndim != 2: raise ValueError('Must pass 2-d input') return values
def _is_dtype_compat(self, other): """ *this is an internal non-public method* provide a comparison between the dtype of self and other (coercing if needed) Raises ------ TypeError if the dtypes are not compatible """ if is_categorical_dtype(other): if isinstance(other, CategoricalIndex): other = other._values if not other.is_dtype_equal(self): raise TypeError("categories must match existing categories " "when appending") else: values = other if not is_list_like(values): values = [values] other = CategoricalIndex(self._create_categorical( self, other, categories=self.categories, ordered=self.ordered)) if not other.isin(values).all(): raise TypeError("cannot append a non-category item to a " "CategoricalIndex") return other
def _gotitem(self, key, ndim, subset=None): """ sub-classes to define return a sliced object Parameters ---------- key : string / list of selections ndim : 1,2 requested ndim of result subset : object, default None subset to act on """ # create a new object to prevent aliasing if subset is None: subset = self.obj # we need to make a shallow copy of ourselves # with the same groupby kwargs = dict([(attr, getattr(self, attr)) for attr in self._attributes]) self = self.__class__(subset, groupby=self._groupby[key], parent=self, **kwargs) self._reset_cache() if subset.ndim == 2: if is_scalar(key) and key in subset or is_list_like(key): self._selection = key return self
def cmp_method(self, other): op_name = op.__name__ mask = None if isinstance(other, (ABCSeries, ABCIndexClass)): # Rely on pandas to unbox and dispatch to us. return NotImplemented if isinstance(other, IntegerArray): other, mask = other._data, other._mask elif is_list_like(other): other = np.asarray(other) if other.ndim > 0 and len(self) != len(other): raise ValueError('Lengths must match to compare') other = lib.item_from_zerodim(other) # numpy will show a DeprecationWarning on invalid elementwise # comparisons, this will raise in the future with warnings.catch_warnings(): warnings.filterwarnings("ignore", "elementwise", FutureWarning) with np.errstate(all='ignore'): result = op(self._data, other) # nans propagate if mask is None: mask = self._mask else: mask = self._mask | mask result[mask] = op_name == 'ne' return result
def delete(self, loc): """ Make a new TimedeltaIndex with passed location(s) deleted. Parameters ---------- loc: int, slice or array of ints Indicate which sub-arrays to remove. Returns ------- new_index : TimedeltaIndex """ new_tds = np.delete(self.asi8, loc) freq = 'infer' if is_integer(loc): if loc in (0, -len(self), -1, len(self) - 1): freq = self.freq else: if is_list_like(loc): loc = lib.maybe_indices_to_slice( ensure_int64(np.array(loc)), len(self)) if isinstance(loc, slice) and loc.step in (1, None): if (loc.start in (0, None) or loc.stop in (len(self), None)): freq = self.freq return TimedeltaIndex(new_tds, name=self.name, freq=freq)
def _get_skiprows(skiprows): """Get an iterator given an integer, slice or container. Parameters ---------- skiprows : int, slice, container The iterator to use to skip rows; can also be a slice. Raises ------ TypeError * If `skiprows` is not a slice, integer, or Container Returns ------- it : iterable A proper iterator to use to skip rows of a DataFrame. """ if isinstance(skiprows, slice): return lrange(skiprows.start or 0, skiprows.stop, skiprows.step or 1) elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows): return skiprows elif skiprows is None: return 0 raise TypeError('%r is not a valid type for skipping rows' % type(skiprows).__name__)
def wrapper(self, other): msg = "cannot compare a {cls} with type {typ}" meth = getattr(dtl.DatetimeLikeArrayMixin, opname) if _is_convertible_to_td(other) or other is NaT: try: other = _to_m8(other) except ValueError: # failed to parse as timedelta raise TypeError(msg.format(cls=type(self).__name__, typ=type(other).__name__)) result = meth(self, other) if isna(other): result.fill(nat_result) elif not is_list_like(other): raise TypeError(msg.format(cls=type(self).__name__, typ=type(other).__name__)) else: other = type(self)(other)._data result = meth(self, other) result = com.values_from_object(result) o_mask = np.array(isna(other)) if o_mask.any(): result[o_mask] = nat_result if self.hasnans: result[self._isnan] = nat_result return result
def _pop_header_name(row, index_col): """ Pop the header name for MultiIndex parsing. Parameters ---------- row : list The data row to parse for the header name. index_col : int, list The index columns for our data. Assumed to be non-null. Returns ------- header_name : str The extracted header name. trimmed_row : list The original data row with the header name removed. """ # Pop out header name and fill w/blank. i = index_col if not is_list_like(index_col) else max(index_col) header_name = row[i] header_name = None if header_name == "" else header_name return header_name, row[:i] + [''] + row[i + 1:]
def _evaluate_compare(self, other, op): """ We have been called because a comparison between 8 aware arrays. numpy >= 1.11 will now warn about NaT comparisons """ # Called by comparison methods when comparing datetimelike # with datetimelike if not isinstance(other, type(self)): # coerce to a similar object if not is_list_like(other): # scalar other = [other] elif lib.is_scalar(lib.item_from_zerodim(other)): # ndarray scalar other = [other.item()] other = type(self)(other) # compare result = op(self.asi8, other.asi8) # technically we could support bool dtyped Index # for now just return the indexing array directly mask = (self._isnan) | (other._isnan) filler = iNaT if is_bool_dtype(result): filler = False result[mask] = filler return result
def wrapper(self, other): msg = "cannot compare a {cls} with type {typ}" func = getattr(super(TimedeltaIndex, self), opname) if _is_convertible_to_td(other) or other is NaT: try: other = _to_m8(other) except ValueError: # failed to parse as timedelta raise TypeError(msg.format(cls=type(self).__name__, typ=type(other).__name__)) result = func(other) if isna(other): result.fill(nat_result) elif not is_list_like(other): raise TypeError(msg.format(cls=type(self).__name__, typ=type(other).__name__)) else: other = TimedeltaIndex(other).values result = func(other) result = com._values_from_object(result) o_mask = np.array(isna(other)) if o_mask.any(): result[o_mask] = nat_result if self.hasnans: result[self._isnan] = nat_result # support of bool dtype indexers if is_bool_dtype(result): return result return Index(result)
def _sanitize_values(arr): """ return an ndarray for our input, in a platform independent manner """ if hasattr(arr, 'values'): arr = arr.values else: # scalar if is_scalar(arr): arr = [arr] # ndarray if isinstance(arr, np.ndarray): pass elif is_list_like(arr) and len(arr) > 0: arr = maybe_convert_platform(arr) else: arr = np.asarray(arr) return arr
def cmp_method(self, other): op_name = op.__name__ mask = None if isinstance(other, IntegerArray): other, mask = other._data, other._mask elif is_list_like(other): other = np.asarray(other) if other.ndim > 0 and len(self) != len(other): raise ValueError('Lengths must match to compare') # numpy will show a DeprecationWarning on invalid elementwise # comparisons, this will raise in the future with warnings.catch_warnings(record=True): with np.errstate(all='ignore'): result = op(self._data, other) # nans propagate if mask is None: mask = self._mask else: mask = self._mask | mask result[mask] = True if op_name == 'ne' else False return result
def __contains__(self, key): try: res = self.get_loc(key) return (is_scalar(res) or isinstance(res, slice) or (is_list_like(res) and len(res))) except (KeyError, TypeError, ValueError): return False
def check_len(item, name): length_msg = ("Length of '{0}' ({1}) did not match the length of " "the columns being encoded ({2}).") if is_list_like(item): if not len(item) == len(columns_to_encode): raise ValueError(length_msg.format(name, len(item), len(columns_to_encode)))
def get_expected(s, name): result = getattr(Index(s._values), prop) if isinstance(result, np.ndarray): if is_integer_dtype(result): result = result.astype('int64') elif not is_list_like(result): return result return Series(result, index=s.index, name=s.name)
def check_len(item, name): len_msg = ("Length of '{name}' ({len_item}) did not match the " "length of the columns being encoded ({len_enc}).") if is_list_like(item): if not len(item) == data_to_encode.shape[1]: len_msg = len_msg.format(name=name, len_item=len(item), len_enc=data_to_encode.shape[1]) raise ValueError(len_msg)
def integer_arithmetic_method(self, other): op_name = op.__name__ mask = None if isinstance(other, (ABCSeries, ABCIndexClass)): # Rely on pandas to unbox and dispatch to us. return NotImplemented if getattr(other, 'ndim', 0) > 1: raise NotImplementedError( "can only perform ops with 1-d structures") if isinstance(other, IntegerArray): other, mask = other._data, other._mask elif getattr(other, 'ndim', None) == 0: other = other.item() elif is_list_like(other): other = np.asarray(other) if not other.ndim: other = other.item() elif other.ndim == 1: if not (is_float_dtype(other) or is_integer_dtype(other)): raise TypeError( "can only perform ops with numeric values") else: if not (is_float(other) or is_integer(other)): raise TypeError("can only perform ops with numeric values") # nans propagate if mask is None: mask = self._mask else: mask = self._mask | mask # 1 ** np.nan is 1. So we have to unmask those. if op_name == 'pow': mask = np.where(self == 1, False, mask) elif op_name == 'rpow': mask = np.where(other == 1, False, mask) with np.errstate(all='ignore'): result = op(self._data, other) # divmod returns a tuple if op_name == 'divmod': div, mod = result return (self._maybe_mask_result(div, mask, other, 'floordiv'), self._maybe_mask_result(mod, mask, other, 'mod')) return self._maybe_mask_result(result, mask, other, op_name)
def _not_in(x, y): """Compute the vectorized membership of ``x not in y`` if possible, otherwise use Python. """ try: return ~x.isin(y) except AttributeError: if is_list_like(x): try: return ~y.isin(x) except AttributeError: pass return x not in y
def _maybe_convert_i8(self, key): """ Maybe convert a given key to it's equivalent i8 value(s). Used as a preprocessing step prior to IntervalTree queries (self._engine), which expects numeric data. Parameters ---------- key : scalar or list-like The key that should maybe be converted to i8. Returns ------- key: scalar or list-like The original key if no conversion occured, int if converted scalar, Int64Index if converted list-like. """ original = key if is_list_like(key): key = ensure_index(key) if not self._needs_i8_conversion(key): return original scalar = is_scalar(key) if is_interval_dtype(key) or isinstance(key, Interval): # convert left/right and reconstruct left = self._maybe_convert_i8(key.left) right = self._maybe_convert_i8(key.right) constructor = Interval if scalar else IntervalIndex.from_arrays return constructor(left, right, closed=self.closed) if scalar: # Timestamp/Timedelta key_dtype, key_i8 = infer_dtype_from_scalar(key, pandas_dtype=True) else: # DatetimeIndex/TimedeltaIndex key_dtype, key_i8 = key.dtype, Index(key.asi8) if key.hasnans: # convert NaT from it's i8 value to np.nan so it's not viewed # as a valid value, maybe causing errors (e.g. is_overlapping) key_i8 = key_i8.where(~key._isnan) # ensure consistency with IntervalIndex subtype subtype = self.dtype.subtype msg = ('Cannot index an IntervalIndex of subtype {subtype} with ' 'values of dtype {other}') if not is_dtype_equal(subtype, key_dtype): raise ValueError(msg.format(subtype=subtype, other=key_dtype)) return key_i8
def _args_adjust(self): if is_integer(self.bins): # create common bin edge values = (self.data._convert(datetime=True)._get_numeric_data()) values = np.ravel(values) values = values[~isna(values)] hist, self.bins = np.histogram( values, bins=self.bins, range=self.kwds.get('range', None), weights=self.kwds.get('weights', None)) if is_list_like(self.bottom): self.bottom = np.array(self.bottom)
def extract_index(data): index = None if len(data) == 0: index = Index([]) elif len(data) > 0: raw_lengths = [] indexes = [] have_raw_arrays = False have_series = False have_dicts = False for val in data: if isinstance(val, ABCSeries): have_series = True indexes.append(val.index) elif isinstance(val, dict): have_dicts = True indexes.append(list(val.keys())) elif is_list_like(val) and getattr(val, 'ndim', 1) == 1: have_raw_arrays = True raw_lengths.append(len(val)) if not indexes and not raw_lengths: raise ValueError('If using all scalar values, you must pass' ' an index') if have_series or have_dicts: index = _union_indexes(indexes) if have_raw_arrays: lengths = list(set(raw_lengths)) if len(lengths) > 1: raise ValueError('arrays must all be same length') if have_dicts: raise ValueError('Mixing dicts with non-Series may lead to ' 'ambiguous ordering.') if have_series: if lengths[0] != len(index): msg = ('array length {length} does not match index ' 'length {idx_len}' .format(length=lengths[0], idx_len=len(index))) raise ValueError(msg) else: index = ibase.default_index(lengths[0]) return ensure_index(index)
def _delegate_method(self, name, *args, **kwargs): from pandas import Series method = getattr(self.values, name) result = method(*args, **kwargs) if not is_list_like(result): return result result = Series(result, index=self.index, name=self.name) # setting this object will show a SettingWithCopyWarning/Error result.is_copy = ("modifications to a method of a datetimelike object " "are not supported and are discarded. Change " "values on the original.") return result
def parse(self, sheet_name=0, header=0, names=None, index_col=None, usecols=None, squeeze=False, dtype=None, true_values=None, false_values=None, skiprows=None, nrows=None, na_values=None, verbose=False, parse_dates=False, date_parser=None, thousands=None, comment=None, skipfooter=0, convert_float=True, mangle_dupe_cols=True, **kwds): _validate_header_arg(header) ret_dict = False # Keep sheetname to maintain backwards compatibility. if isinstance(sheet_name, list): sheets = sheet_name ret_dict = True elif sheet_name is None: sheets = self.sheet_names ret_dict = True else: sheets = [sheet_name] # handle same-type duplicates. sheets = list(OrderedDict.fromkeys(sheets).keys()) output = OrderedDict() for asheetname in sheets: if verbose: print("Reading sheet {sheet}".format(sheet=asheetname)) if isinstance(asheetname, str): sheet = self.get_sheet_by_name(asheetname) else: # assume an integer if not a string sheet = self.get_sheet_by_index(asheetname) data = self.get_sheet_data(sheet, convert_float) usecols = _maybe_convert_usecols(usecols) if not data: output[asheetname] = DataFrame() continue if is_list_like(header) and len(header) == 1: header = header[0] # forward fill and pull out names for MultiIndex column header_names = None if header is not None and is_list_like(header): header_names = [] control_row = [True] * len(data[0]) for row in header: if is_integer(skiprows): row += skiprows data[row], control_row = _fill_mi_header( data[row], control_row) if index_col is not None: header_name, _ = _pop_header_name(data[row], index_col) header_names.append(header_name) if is_list_like(index_col): # Forward fill values for MultiIndex index. if not is_list_like(header): offset = 1 + header else: offset = 1 + max(header) # Check if we have an empty dataset # before trying to collect data. if offset < len(data): for col in index_col: last = data[offset][col] for row in range(offset + 1, len(data)): if data[row][col] == "" or data[row][col] is None: data[row][col] = last else: last = data[row][col] has_index_names = is_list_like(header) and len(header) > 1 # GH 12292 : error when read one empty column from excel file try: parser = TextParser(data, names=names, header=header, index_col=index_col, has_index_names=has_index_names, squeeze=squeeze, dtype=dtype, true_values=true_values, false_values=false_values, skiprows=skiprows, nrows=nrows, na_values=na_values, parse_dates=parse_dates, date_parser=date_parser, thousands=thousands, comment=comment, skipfooter=skipfooter, usecols=usecols, mangle_dupe_cols=mangle_dupe_cols, **kwds) output[asheetname] = parser.read(nrows=nrows) if not squeeze or isinstance(output[asheetname], DataFrame): if header_names: output[asheetname].columns = output[ asheetname].columns.set_names(header_names) except EmptyDataError: # No Data, return an empty DataFrame output[asheetname] = DataFrame() if ret_dict: return output else: return output[asheetname]
def pivot_table( data, values=None, index=None, columns=None, aggfunc="mean", fill_value=None, margins=False, dropna=True, margins_name="All", observed=False, ) -> "DataFrame": index = _convert_by(index) columns = _convert_by(columns) if isinstance(aggfunc, list): pieces: List[DataFrame] = [] keys = [] for func in aggfunc: table = pivot_table( data, values=values, index=index, columns=columns, fill_value=fill_value, aggfunc=func, margins=margins, dropna=dropna, margins_name=margins_name, observed=observed, ) pieces.append(table) keys.append(getattr(func, "__name__", func)) return concat(pieces, keys=keys, axis=1) keys = index + columns values_passed = values is not None if values_passed: if is_list_like(values): values_multi = True values = list(values) else: values_multi = False values = [values] # GH14938 Make sure value labels are in data for i in values: if i not in data: raise KeyError(i) to_filter = [] for x in keys + values: if isinstance(x, Grouper): x = x.key try: if x in data: to_filter.append(x) except TypeError: pass if len(to_filter) < len(data.columns): data = data[to_filter] else: values = data.columns for key in keys: try: values = values.drop(key) except (TypeError, ValueError, KeyError): pass values = list(values) grouped = data.groupby(keys, observed=observed) agged = grouped.agg(aggfunc) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): agged = agged.dropna(how="all") # gh-21133 # we want to down cast if # the original values are ints # as we grouped with a NaN value # and then dropped, coercing to floats for v in values: if (v in data and is_integer_dtype(data[v]) and v in agged and not is_integer_dtype(agged[v])): agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype) table = agged # GH17038, this check should only happen if index is defined (not None) if table.index.nlevels > 1 and index: # Related GH #17123 # If index_names are integers, determine whether the integers refer # to the level position or name. index_names = agged.index.names[:len(index)] to_unstack = [] for i in range(len(index), len(keys)): name = agged.index.names[i] if name is None or name in index_names: to_unstack.append(i) else: to_unstack.append(name) table = agged.unstack(to_unstack) if not dropna: if isinstance(table.index, MultiIndex): m = MultiIndex.from_arrays(cartesian_product(table.index.levels), names=table.index.names) table = table.reindex(m, axis=0) if isinstance(table.columns, MultiIndex): m = MultiIndex.from_arrays(cartesian_product(table.columns.levels), names=table.columns.names) table = table.reindex(m, axis=1) if isinstance(table, ABCDataFrame): table = table.sort_index(axis=1) if fill_value is not None: _table = table.fillna(fill_value, downcast="infer") assert _table is not None # needed for mypy table = _table if margins: if dropna: data = data[data.notna().all(axis=1)] table = _add_margins( table, data, values, rows=index, cols=columns, aggfunc=aggfunc, observed=dropna, margins_name=margins_name, fill_value=fill_value, ) # discard the top level if (values_passed and not values_multi and not table.empty and (table.columns.nlevels > 1)): table = table[values[0]] if len(index) == 0 and len(columns) > 0: table = table.T # GH 15193 Make sure empty columns are removed if dropna=True if isinstance(table, ABCDataFrame) and dropna: table = table.dropna(how="all", axis=1) return table
def sanitize_array( data, index: Optional[Index], dtype: Optional[DtypeObj] = None, copy: bool = False, raise_cast_failure: bool = False, ) -> ArrayLike: """ Sanitize input data to an ndarray or ExtensionArray, copy if specified, coerce to the dtype if specified. """ if isinstance(data, ma.MaskedArray): data = sanitize_masked_array(data) # extract ndarray or ExtensionArray, ensure we have no PandasArray data = extract_array(data, extract_numpy=True) if isinstance(data, np.ndarray) and data.ndim == 0: if dtype is None: dtype = data.dtype data = lib.item_from_zerodim(data) # GH#846 if isinstance(data, np.ndarray): if dtype is not None and is_float_dtype(data.dtype) and is_integer_dtype(dtype): # possibility of nan -> garbage try: subarr = _try_cast(data, dtype, copy, True) except ValueError: if copy: subarr = data.copy() else: subarr = np.array(data, copy=False) else: # we will try to copy by-definition here subarr = _try_cast(data, dtype, copy, raise_cast_failure) elif isinstance(data, ABCExtensionArray): # it is already ensured above this is not a PandasArray subarr = data if dtype is not None: subarr = subarr.astype(dtype, copy=copy) elif copy: subarr = subarr.copy() return subarr elif isinstance(data, (list, tuple, abc.Set, abc.ValuesView)) and len(data) > 0: # TODO: deque, array.array if isinstance(data, set): # Raise only for unordered sets, e.g., not for dict_keys raise TypeError("Set type is unordered") data = list(data) if dtype is not None: subarr = _try_cast(data, dtype, copy, raise_cast_failure) else: subarr = maybe_convert_platform(data) subarr = maybe_cast_to_datetime(subarr, dtype) elif isinstance(data, range): # GH#16804 arr = np.arange(data.start, data.stop, data.step, dtype="int64") subarr = _try_cast(arr, dtype, copy, raise_cast_failure) elif not is_list_like(data): if index is None: raise ValueError("index must be specified when data is not list-like") subarr = construct_1d_arraylike_from_scalar(data, len(index), dtype) else: subarr = _try_cast(data, dtype, copy, raise_cast_failure) subarr = _sanitize_ndim(subarr, data, dtype, index) if not (is_extension_array_dtype(subarr.dtype) or is_extension_array_dtype(dtype)): subarr = _sanitize_str_dtypes(subarr, data, dtype, copy) is_object_or_str_dtype = is_object_dtype(dtype) or is_string_dtype(dtype) if is_object_dtype(subarr.dtype) and not is_object_or_str_dtype: inferred = lib.infer_dtype(subarr, skipna=False) if inferred in {"interval", "period"}: subarr = array(subarr) return subarr
def wide_to_long(df, stubnames, i, j, sep="", suffix=r'\d+'): r""" Wide panel to long format. Less flexible but more user-friendly than melt. With stubnames ['A', 'B'], this function expects to find one or more group of columns with format Asuffix1, Asuffix2,..., Bsuffix1, Bsuffix2,... You specify what you want to call this suffix in the resulting long format with `j` (for example `j='year'`) Each row of these wide variables are assumed to be uniquely identified by `i` (can be a single column name or a list of column names) All remaining variables in the data frame are left intact. Parameters ---------- df : DataFrame The wide-format DataFrame stubnames : str or list-like The stub name(s). The wide format variables are assumed to start with the stub names. i : str or list-like Column(s) to use as id variable(s) j : str The name of the subobservation variable. What you wish to name your suffix in the long format. sep : str, default "" A character indicating the separation of the variable names in the wide format, to be stripped from the names in the long format. For example, if your column names are A-suffix1, A-suffix2, you can strip the hypen by specifying `sep='-'` .. versionadded:: 0.20.0 suffix : str, default '\\d+' A regular expression capturing the wanted suffixes. '\\d+' captures numeric suffixes. Suffixes with no numbers could be specified with the negated character class '\\D+'. You can also further disambiguate suffixes, for example, if your wide variables are of the form Aone, Btwo,.., and you have an unrelated column Arating, you can ignore the last one by specifying `suffix='(!?one|two)'` .. versionadded:: 0.20.0 Returns ------- DataFrame A DataFrame that contains each stub name as a variable, with new index (i, j) Examples -------- >>> import pandas as pd >>> import numpy as np >>> np.random.seed(123) >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"}, ... "A1980" : {0 : "d", 1 : "e", 2 : "f"}, ... "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7}, ... "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1}, ... "X" : dict(zip(range(3), np.random.randn(3))) ... }) >>> df["id"] = df.index >>> df A1970 A1980 B1970 B1980 X id 0 a d 2.5 3.2 -1.085631 0 1 b e 1.2 1.3 0.997345 1 2 c f 0.7 0.1 0.282978 2 >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year") ... # doctest: +NORMALIZE_WHITESPACE X A B id year 0 1970 -1.085631 a 2.5 1 1970 0.997345 b 1.2 2 1970 0.282978 c 0.7 0 1980 -1.085631 d 3.2 1 1980 0.997345 e 1.3 2 1980 0.282978 f 0.1 With multuple id columns >>> df = pd.DataFrame({ ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3], ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3], ... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1], ... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9] ... }) >>> df birth famid ht1 ht2 0 1 1 2.8 3.4 1 2 1 2.9 3.8 2 3 1 2.2 2.9 3 1 2 2.0 3.2 4 2 2 1.8 2.8 5 3 2 1.9 2.4 6 1 3 2.2 3.3 7 2 3 2.3 3.4 8 3 3 2.1 2.9 >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age') >>> l ... # doctest: +NORMALIZE_WHITESPACE ht famid birth age 1 1 1 2.8 2 3.4 2 1 2.9 2 3.8 3 1 2.2 2 2.9 2 1 1 2.0 2 3.2 2 1 1.8 2 2.8 3 1 1.9 2 2.4 3 1 1 2.2 2 3.3 2 1 2.3 2 3.4 3 1 2.1 2 2.9 Going from long back to wide just takes some creative use of `unstack` >>> w = l.reset_index().set_index(['famid', 'birth', 'age']).unstack() >>> w.columns = pd.Index(w.columns).str.join('') >>> w.reset_index() famid birth ht1 ht2 0 1 1 2.8 3.4 1 1 2 2.9 3.8 2 1 3 2.2 2.9 3 2 1 2.0 3.2 4 2 2 1.8 2.8 5 2 3 1.9 2.4 6 3 1 2.2 3.3 7 3 2 2.3 3.4 8 3 3 2.1 2.9 Less wieldy column names are also handled >>> np.random.seed(0) >>> df = pd.DataFrame({'A(quarterly)-2010': np.random.rand(3), ... 'A(quarterly)-2011': np.random.rand(3), ... 'B(quarterly)-2010': np.random.rand(3), ... 'B(quarterly)-2011': np.random.rand(3), ... 'X' : np.random.randint(3, size=3)}) >>> df['id'] = df.index >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS A(quarterly)-2010 A(quarterly)-2011 B(quarterly)-2010 ... 0 0.548814 0.544883 0.437587 ... 1 0.715189 0.423655 0.891773 ... 2 0.602763 0.645894 0.963663 ... X id 0 0 0 1 1 1 2 1 2 >>> pd.wide_to_long(df, ['A(quarterly)', 'B(quarterly)'], i='id', ... j='year', sep='-') ... # doctest: +NORMALIZE_WHITESPACE X A(quarterly) B(quarterly) id year 0 2010 0 0.548814 0.437587 1 2010 1 0.715189 0.891773 2 2010 1 0.602763 0.963663 0 2011 0 0.544883 0.383442 1 2011 1 0.423655 0.791725 2 2011 1 0.645894 0.528895 If we have many columns, we could also use a regex to find our stubnames and pass that list on to wide_to_long >>> stubnames = sorted( ... set([match[0] for match in df.columns.str.findall( ... r'[A-B]\(.*\)').values if match != [] ]) ... ) >>> list(stubnames) ['A(quarterly)', 'B(quarterly)'] Notes ----- All extra variables are left untouched. This simply uses `pandas.melt` under the hood, but is hard-coded to "do the right thing" in a typicaly case. """ def get_var_names(df, stub, sep, suffix): regex = "^{stub}{sep}{suffix}".format(stub=re.escape(stub), sep=re.escape(sep), suffix=suffix) return df.filter(regex=regex).columns.tolist() def melt_stub(df, stub, i, j, value_vars, sep): newdf = melt(df, id_vars=i, value_vars=value_vars, value_name=stub.rstrip(sep), var_name=j) newdf[j] = Categorical(newdf[j]) newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "") return newdf.set_index(i + [j]) if any(map(lambda s: s in df.columns.tolist(), stubnames)): raise ValueError("stubname can't be identical to a column name") if not is_list_like(stubnames): stubnames = [stubnames] else: stubnames = list(stubnames) if not is_list_like(i): i = [i] else: i = list(i) if df[i].duplicated().any(): raise ValueError("the id variables need to uniquely identify each row") value_vars = list( map(lambda stub: get_var_names(df, stub, sep, suffix), stubnames)) value_vars_flattened = [e for sublist in value_vars for e in sublist] id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened)) melted = [] for s, v in zip(stubnames, value_vars): melted.append(melt_stub(df, s, i, j, v, sep)) melted = melted[0].join(melted[1:], how='outer') if len(i) == 1: new = df[id_vars].set_index(i).join(melted) return new new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j]) return new
def _aggregate(self, arg, *args, **kwargs): """ provide an implementation for the aggregators Parameters ---------- arg : string, dict, function *args : args to pass on to the function **kwargs : kwargs to pass on to the function Returns ------- tuple of result, how Notes ----- how can be a string describe the required post-processing, or None if not required """ is_aggregator = lambda x: isinstance(x, (list, tuple, dict)) is_nested_renamer = False _axis = kwargs.pop("_axis", None) if _axis is None: _axis = getattr(self, "axis", 0) _level = kwargs.pop("_level", None) if isinstance(arg, str): return self._try_aggregate_string_function(arg, *args, **kwargs), None if isinstance(arg, dict): # aggregate based on the passed dict if _axis != 0: # pragma: no cover raise ValueError("Can only pass dict with axis=0") obj = self._selected_obj def nested_renaming_depr(level=4): # deprecation of nested renaming # GH 15931 msg = textwrap.dedent( """\ using a dict with renaming is deprecated and will be removed in a future version. For column-specific groupby renaming, use named aggregation >>> df.groupby(...).agg(name=('column', aggfunc)) """ ) warnings.warn(msg, FutureWarning, stacklevel=level) # if we have a dict of any non-scalars # eg. {'A' : ['mean']}, normalize all to # be list-likes if any(is_aggregator(x) for x in arg.values()): new_arg = OrderedDict() for k, v in arg.items(): if not isinstance(v, (tuple, list, dict)): new_arg[k] = [v] else: new_arg[k] = v # the keys must be in the columns # for ndim=2, or renamers for ndim=1 # ok for now, but deprecated # {'A': { 'ra': 'mean' }} # {'A': { 'ra': ['mean'] }} # {'ra': ['mean']} # not ok # {'ra' : { 'A' : 'mean' }} if isinstance(v, dict): is_nested_renamer = True if k not in obj.columns: msg = ( "cannot perform renaming for {key} with a " "nested dictionary" ).format(key=k) raise SpecificationError(msg) nested_renaming_depr(4 + (_level or 0)) elif isinstance(obj, ABCSeries): nested_renaming_depr() elif isinstance(obj, ABCDataFrame) and k not in obj.columns: raise KeyError("Column '{col}' does not exist!".format(col=k)) arg = new_arg else: # deprecation of renaming keys # GH 15931 keys = list(arg.keys()) if isinstance(obj, ABCDataFrame) and len( obj.columns.intersection(keys) ) != len(keys): nested_renaming_depr() from pandas.core.reshape.concat import concat def _agg_1dim(name, how, subset=None): """ aggregate a 1-dim with how """ colg = self._gotitem(name, ndim=1, subset=subset) if colg.ndim != 1: raise SpecificationError( "nested dictionary is ambiguous in aggregation" ) return colg.aggregate(how, _level=(_level or 0) + 1) def _agg_2dim(name, how): """ aggregate a 2-dim with how """ colg = self._gotitem(self._selection, ndim=2, subset=obj) return colg.aggregate(how, _level=None) def _agg(arg, func): """ run the aggregations over the arg with func return an OrderedDict """ result = OrderedDict() for fname, agg_how in arg.items(): result[fname] = func(fname, agg_how) return result # set the final keys keys = list(arg.keys()) result = OrderedDict() # nested renamer if is_nested_renamer: result = list(_agg(arg, _agg_1dim).values()) if all(isinstance(r, dict) for r in result): result, results = OrderedDict(), result for r in results: result.update(r) keys = list(result.keys()) else: if self._selection is not None: keys = None # some selection on the object elif self._selection is not None: sl = set(self._selection_list) # we are a Series like object, # but may have multiple aggregations if len(sl) == 1: result = _agg( arg, lambda fname, agg_how: _agg_1dim(self._selection, agg_how) ) # we are selecting the same set as we are aggregating elif not len(sl - set(keys)): result = _agg(arg, _agg_1dim) # we are a DataFrame, with possibly multiple aggregations else: result = _agg(arg, _agg_2dim) # no selection else: try: result = _agg(arg, _agg_1dim) except SpecificationError: # we are aggregating expecting all 1d-returns # but we have 2d result = _agg(arg, _agg_2dim) # combine results def is_any_series(): # return a boolean if we have *any* nested series return any(isinstance(r, ABCSeries) for r in result.values()) def is_any_frame(): # return a boolean if we have *any* nested series return any(isinstance(r, ABCDataFrame) for r in result.values()) if isinstance(result, list): return concat(result, keys=keys, axis=1, sort=True), True elif is_any_frame(): # we have a dict of DataFrames # return a MI DataFrame return concat([result[k] for k in keys], keys=keys, axis=1), True elif isinstance(self, ABCSeries) and is_any_series(): # we have a dict of Series # return a MI Series try: result = concat(result) except TypeError: # we want to give a nice error here if # we have non-same sized objects, so # we don't automatically broadcast raise ValueError( "cannot perform both aggregation " "and transformation operations " "simultaneously" ) return result, True # fall thru from pandas import DataFrame, Series try: result = DataFrame(result) except ValueError: # we have a dict of scalars result = Series(result, name=getattr(self, "name", None)) return result, True elif is_list_like(arg): # we require a list, but not an 'str' return self._aggregate_multiple_funcs(arg, _level=_level, _axis=_axis), None else: result = None f = self._get_cython_func(arg) if f and not args and not kwargs: return getattr(self, f)(), None # caller can react return result, True
def flatten_axes(axes: Union["Axes", Sequence["Axes"]]) -> np.ndarray: if not is_list_like(axes): return np.array([axes]) elif isinstance(axes, (np.ndarray, ABCIndex)): return np.asarray(axes).ravel() return np.array(axes)
def create_subplots( naxes: int, sharex: bool = False, sharey: bool = False, squeeze: bool = True, subplot_kw=None, ax=None, layout=None, layout_type: str = "box", **fig_kw, ): """ Create a figure with a set of subplots already made. This utility wrapper makes it convenient to create common layouts of subplots, including the enclosing figure object, in a single call. Parameters ---------- naxes : int Number of required axes. Exceeded axes are set invisible. Default is nrows * ncols. sharex : bool If True, the X axis will be shared amongst all subplots. sharey : bool If True, the Y axis will be shared amongst all subplots. squeeze : bool If True, extra dimensions are squeezed out from the returned axis object: - if only one subplot is constructed (nrows=ncols=1), the resulting single Axis object is returned as a scalar. - for Nx1 or 1xN subplots, the returned object is a 1-d numpy object array of Axis objects are returned as numpy 1-d arrays. - for NxM subplots with N>1 and M>1 are returned as a 2d array. If False, no squeezing is done: the returned axis object is always a 2-d array containing Axis instances, even if it ends up being 1x1. subplot_kw : dict Dict with keywords passed to the add_subplot() call used to create each subplots. ax : Matplotlib axis object, optional layout : tuple Number of rows and columns of the subplot grid. If not specified, calculated from naxes and layout_type layout_type : {'box', 'horizontal', 'vertical'}, default 'box' Specify how to layout the subplot grid. fig_kw : Other keyword arguments to be passed to the figure() call. Note that all keywords not recognized above will be automatically included here. Returns ------- fig, ax : tuple - fig is the Matplotlib Figure object - ax can be either a single axis object or an array of axis objects if more than one subplot was created. The dimensions of the resulting array can be controlled with the squeeze keyword, see above. Examples -------- x = np.linspace(0, 2*np.pi, 400) y = np.sin(x**2) # Just a figure and one subplot f, ax = plt.subplots() ax.plot(x, y) ax.set_title('Simple plot') # Two subplots, unpack the output array immediately f, (ax1, ax2) = plt.subplots(1, 2, sharey=True) ax1.plot(x, y) ax1.set_title('Sharing Y axis') ax2.scatter(x, y) # Four polar axes plt.subplots(2, 2, subplot_kw=dict(polar=True)) """ import matplotlib.pyplot as plt if subplot_kw is None: subplot_kw = {} if ax is None: fig = plt.figure(**fig_kw) else: if is_list_like(ax): if squeeze: ax = flatten_axes(ax) if layout is not None: warnings.warn( "When passing multiple axes, layout keyword is ignored", UserWarning) if sharex or sharey: warnings.warn( "When passing multiple axes, sharex and sharey " "are ignored. These settings must be specified when creating axes", UserWarning, stacklevel=4, ) if ax.size == naxes: fig = ax.flat[0].get_figure() return fig, ax else: raise ValueError( f"The number of passed axes must be {naxes}, the " "same as the output plot") fig = ax.get_figure() # if ax is passed and a number of subplots is 1, return ax as it is if naxes == 1: if squeeze: return fig, ax else: return fig, flatten_axes(ax) else: warnings.warn( "To output multiple subplots, the figure containing " "the passed axes is being cleared", UserWarning, stacklevel=4, ) fig.clear() nrows, ncols = _get_layout(naxes, layout=layout, layout_type=layout_type) nplots = nrows * ncols # Create empty object array to hold all axes. It's easiest to make it 1-d # so we can just append subplots upon creation, and then axarr = np.empty(nplots, dtype=object) # Create first subplot separately, so we can share it if requested ax0 = fig.add_subplot(nrows, ncols, 1, **subplot_kw) if sharex: subplot_kw["sharex"] = ax0 if sharey: subplot_kw["sharey"] = ax0 axarr[0] = ax0 # Note off-by-one counting because add_subplot uses the MATLAB 1-based # convention. for i in range(1, nplots): kwds = subplot_kw.copy() # Set sharex and sharey to None for blank/dummy axes, these can # interfere with proper axis limits on the visible axes if # they share axes e.g. issue #7528 if i >= naxes: kwds["sharex"] = None kwds["sharey"] = None ax = fig.add_subplot(nrows, ncols, i + 1, **kwds) axarr[i] = ax if naxes != nplots: for ax in axarr[naxes:]: ax.set_visible(False) handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey) if squeeze: # Reshape the array to have the final desired dimension (nrow,ncol), # though discarding unneeded dimensions that equal 1. If we only have # one subplot, just return it instead of a 1-element array. if nplots == 1: axes = axarr[0] else: axes = axarr.reshape(nrows, ncols).squeeze() else: # returned axis array will be always 2-d, even if nrows=ncols=1 axes = axarr.reshape(nrows, ncols) return fig, axes
def _get_standard_colors(num_colors=None, colormap=None, color_type='default', color=None): import matplotlib.pyplot as plt if color is None and colormap is not None: if isinstance(colormap, str): import matplotlib.cm as cm cmap = colormap colormap = cm.get_cmap(colormap) if colormap is None: raise ValueError("Colormap {0} is not recognized".format(cmap)) colors = lmap(colormap, np.linspace(0, 1, num=num_colors)) elif color is not None: if colormap is not None: warnings.warn("'color' and 'colormap' cannot be used " "simultaneously. Using 'color'") colors = list(color) if is_list_like(color) else color else: if color_type == 'default': # need to call list() on the result to copy so we don't # modify the global rcParams below try: colors = [ c['color'] for c in list(plt.rcParams['axes.prop_cycle']) ] except KeyError: colors = list( plt.rcParams.get('axes.color_cycle', list('bgrcmyk'))) if isinstance(colors, str): colors = list(colors) colors = colors[0:num_colors] elif color_type == 'random': import pandas.core.common as com def random_color(column): """ Returns a random color represented as a list of length 3""" # GH17525 use common._random_state to avoid resetting the seed rs = com.random_state(column) return rs.rand(3).tolist() colors = lmap(random_color, lrange(num_colors)) else: raise ValueError("color_type must be either 'default' or 'random'") if isinstance(colors, str): import matplotlib.colors conv = matplotlib.colors.ColorConverter() def _maybe_valid_colors(colors): try: [conv.to_rgba(c) for c in colors] return True except ValueError: return False # check whether the string can be convertible to single color maybe_single_color = _maybe_valid_colors([colors]) # check whether each character can be convertible to colors maybe_color_cycle = _maybe_valid_colors(list(colors)) if maybe_single_color and maybe_color_cycle and len(colors) > 1: hex_color = [ c['color'] for c in list(plt.rcParams['axes.prop_cycle']) ] colors = [hex_color[int(colors[1])]] elif maybe_single_color: colors = [colors] else: # ``colors`` is regarded as color cycle. # mpl will raise error any of them is invalid pass # Append more colors by cycling if there is not enough color. # Extra colors will be ignored by matplotlib if there are more colors # than needed and nothing needs to be done here. if len(colors) < num_colors: try: multiple = num_colors // len(colors) - 1 except ZeroDivisionError: raise ValueError("Invalid color argument: ''") mod = num_colors % len(colors) colors += multiple * colors colors += colors[:mod] return colors
def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, box=True, format=None, exact=True, unit=None, infer_datetime_format=False, origin='unix', cache=False): """ Convert argument to datetime. Parameters ---------- arg : integer, float, string, datetime, list, tuple, 1-d array, Series .. versionadded:: 0.18.1 or DataFrame/dict-like errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as NaT - If 'ignore', then invalid parsing will return the input dayfirst : boolean, default False Specify a date parse order if `arg` is str or its list-likes. If True, parses dates with the day first, eg 10/11/12 is parsed as 2012-11-10. Warning: dayfirst=True is not strict, but will prefer to parse with day first (this is a known bug, based on dateutil behavior). yearfirst : boolean, default False Specify a date parse order if `arg` is str or its list-likes. - If True parses dates with the year first, eg 10/11/12 is parsed as 2010-11-12. - If both dayfirst and yearfirst are True, yearfirst is preceded (same as dateutil). Warning: yearfirst=True is not strict, but will prefer to parse with year first (this is a known bug, based on dateutil behavior). .. versionadded:: 0.16.1 utc : boolean, default None Return UTC DatetimeIndex if True (converting any tz-aware datetime.datetime objects as well). box : boolean, default True - If True returns a DatetimeIndex or Index-like object - If False returns ndarray of values. format : string, default None strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse all the way up to nanoseconds. exact : boolean, True by default - If True, require an exact format match. - If False, allow the format to match anywhere in the target string. unit : string, default 'ns' unit of the arg (D,s,ms,us,ns) denote the unit, which is an integer or float number. This will be based off the origin. Example, with unit='ms' and origin='unix' (the default), this would calculate the number of milliseconds to the unix epoch start. infer_datetime_format : boolean, default False If True and no `format` is given, attempt to infer the format of the datetime strings, and if it can be inferred, switch to a faster method of parsing them. In some cases this can increase the parsing speed by ~5-10x. origin : scalar, default is 'unix' Define the reference date. The numeric values would be parsed as number of units (defined by `unit`) since this reference date. - If 'unix' (or POSIX) time; origin is set to 1970-01-01. - If 'julian', unit must be 'D', and origin is set to beginning of Julian Calendar. Julian day number 0 is assigned to the day starting at noon on January 1, 4713 BC. - If Timestamp convertible, origin is set to Timestamp identified by origin. .. versionadded:: 0.20.0 cache : boolean, default False If True, use a cache of unique, converted dates to apply the datetime conversion. May produce significant speed-up when parsing duplicate date strings, especially ones with timezone offsets. .. versionadded:: 0.23.0 Returns ------- ret : datetime if parsing succeeded. Return type depends on input: - list-like: DatetimeIndex - Series: Series of datetime64 dtype - scalar: Timestamp In case when it is not possible to return designated types (e.g. when any element of input is before Timestamp.min or after Timestamp.max) return will have datetime.datetime type (or corresponding array/Series). Examples -------- Assembling a datetime from multiple columns of a DataFrame. The keys can be common abbreviations like ['year', 'month', 'day', 'minute', 'second', 'ms', 'us', 'ns']) or plurals of the same >>> df = pd.DataFrame({'year': [2015, 2016], 'month': [2, 3], 'day': [4, 5]}) >>> pd.to_datetime(df) 0 2015-02-04 1 2016-03-05 dtype: datetime64[ns] If a date does not meet the `timestamp limitations <http://pandas.pydata.org/pandas-docs/stable/timeseries.html #timeseries-timestamp-limits>`_, passing errors='ignore' will return the original input instead of raising any exception. Passing errors='coerce' will force an out-of-bounds date to NaT, in addition to forcing non-dates (or non-parseable dates) to NaT. >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore') datetime.datetime(1300, 1, 1, 0, 0) >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') NaT Passing infer_datetime_format=True can often-times speedup a parsing if its not an ISO8601 format exactly, but in a regular format. >>> s = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000']*1000) >>> s.head() 0 3/11/2000 1 3/12/2000 2 3/13/2000 3 3/11/2000 4 3/12/2000 dtype: object >>> %timeit pd.to_datetime(s,infer_datetime_format=True) 100 loops, best of 3: 10.4 ms per loop >>> %timeit pd.to_datetime(s,infer_datetime_format=False) 1 loop, best of 3: 471 ms per loop Using a unix epoch time >>> pd.to_datetime(1490195805, unit='s') Timestamp('2017-03-22 15:16:45') >>> pd.to_datetime(1490195805433502912, unit='ns') Timestamp('2017-03-22 15:16:45.433502912') .. warning:: For float arg, precision rounding might happen. To prevent unexpected behavior use a fixed-width exact type. Using a non-unix epoch origin >>> pd.to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01')) 0 1960-01-02 1 1960-01-03 2 1960-01-04 See also -------- pandas.DataFrame.astype : Cast argument to a specified dtype. pandas.to_timedelta : Convert argument to timedelta. """ if arg is None: return None if origin != 'unix': arg = _adjust_to_origin(arg, origin, unit) tz = 'utc' if utc else None convert_listlike = partial(_convert_listlike_datetimes, tz=tz, unit=unit, dayfirst=dayfirst, yearfirst=yearfirst, errors=errors, exact=exact, infer_datetime_format=infer_datetime_format) if isinstance(arg, Timestamp): result = arg elif isinstance(arg, ABCSeries): cache_array = _maybe_cache(arg, format, cache, convert_listlike) if not cache_array.empty: result = arg.map(cache_array) else: from pandas import Series values = convert_listlike(arg._values, True, format) result = Series(values, index=arg.index, name=arg.name) elif isinstance(arg, (ABCDataFrame, MutableMapping)): result = _assemble_from_unit_mappings(arg, errors=errors) elif isinstance(arg, ABCIndexClass): cache_array = _maybe_cache(arg, format, cache, convert_listlike) if not cache_array.empty: result = _convert_and_box_cache(arg, cache_array, box, errors, name=arg.name) else: convert_listlike = partial(convert_listlike, name=arg.name) result = convert_listlike(arg, box, format) elif is_list_like(arg): cache_array = _maybe_cache(arg, format, cache, convert_listlike) if not cache_array.empty: result = _convert_and_box_cache(arg, cache_array, box, errors) else: result = convert_listlike(arg, box, format) else: result = convert_listlike(np.array([arg]), box, format)[0] return result
def get_grouper( obj: FrameOrSeries, key=None, axis: int = 0, level=None, sort: bool = True, observed: bool = False, mutated: bool = False, validate: bool = True, dropna: bool = True, ) -> Tuple[ops.BaseGrouper, Set[Hashable], FrameOrSeries]: """ Create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. This may be composed of multiple Grouping objects, indicating multiple groupers Groupers are ultimately index mappings. They can originate as: index mappings, keys to columns, functions, or Groupers Groupers enable local references to axis,level,sort, while the passed in axis, level, and sort are 'global'. This routine tries to figure out what the passing in references are and then creates a Grouping for each one, combined into a BaseGrouper. If observed & we have a categorical grouper, only show the observed values. If validate, then check for key/level overlaps. """ group_axis = obj._get_axis(axis) # validate that the passed single level is compatible with the passed # axis of the object if level is not None: # TODO: These if-block and else-block are almost same. # MultiIndex instance check is removable, but it seems that there are # some processes only for non-MultiIndex in else-block, # eg. `obj.index.name != level`. We have to consider carefully whether # these are applicable for MultiIndex. Even if these are applicable, # we need to check if it makes no side effect to subsequent processes # on the outside of this condition. # (GH 17621) if isinstance(group_axis, MultiIndex): if is_list_like(level) and len(level) == 1: level = level[0] if key is None and is_scalar(level): # Get the level values from group_axis key = group_axis.get_level_values(level) level = None else: # allow level to be a length-one list-like object # (e.g., level=[0]) # GH 13901 if is_list_like(level): nlevels = len(level) if nlevels == 1: level = level[0] elif nlevels == 0: raise ValueError("No group keys passed!") else: raise ValueError( "multiple levels only valid with MultiIndex") if isinstance(level, str): if obj._get_axis(axis).name != level: raise ValueError(f"level name {level} is not the name " f"of the {obj._get_axis_name(axis)}") elif level > 0 or level < -1: raise ValueError( "level > 0 or level < -1 only valid with MultiIndex") # NOTE: `group_axis` and `group_axis.get_level_values(level)` # are same in this section. level = None key = group_axis # a passed-in Grouper, directly convert if isinstance(key, Grouper): binner, grouper, obj = key._get_grouper(obj, validate=False) if key.key is None: return grouper, set(), obj else: return grouper, {key.key}, obj # already have a BaseGrouper, just return it elif isinstance(key, ops.BaseGrouper): return key, set(), obj if not isinstance(key, list): keys = [key] match_axis_length = False else: keys = key match_axis_length = len(keys) == len(group_axis) # what are we after, exactly? any_callable = any(callable(g) or isinstance(g, dict) for g in keys) any_groupers = any(isinstance(g, Grouper) for g in keys) any_arraylike = any( isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys) # is this an index replacement? if (not any_callable and not any_arraylike and not any_groupers and match_axis_length and level is None): if isinstance(obj, DataFrame): all_in_columns_index = all(g in obj.columns or g in obj.index.names for g in keys) else: assert isinstance(obj, Series) all_in_columns_index = all(g in obj.index.names for g in keys) if not all_in_columns_index: keys = [com.asarray_tuplesafe(keys)] if isinstance(level, (tuple, list)): if key is None: keys = [None] * len(level) levels = level else: levels = [level] * len(keys) groupings: List[Grouping] = [] exclusions: Set[Hashable] = set() # if the actual grouper should be obj[key] def is_in_axis(key) -> bool: if not _is_label_like(key): # items -> .columns for DataFrame, .index for Series items = obj.axes[-1] try: items.get_loc(key) except (KeyError, TypeError, InvalidIndexError): # TypeError shows up here if we pass e.g. Int64Index return False return True # if the grouper is obj[name] def is_in_obj(gpr) -> bool: if not hasattr(gpr, "name"): return False try: return gpr is obj[gpr.name] except (KeyError, IndexError): # IndexError reached in e.g. test_skip_group_keys when we pass # lambda here return False for i, (gpr, level) in enumerate(zip(keys, levels)): if is_in_obj(gpr): # df.groupby(df['name']) in_axis, name = True, gpr.name exclusions.add(name) elif is_in_axis(gpr): # df.groupby('name') if gpr in obj: if validate: obj._check_label_or_level_ambiguity(gpr, axis=axis) in_axis, name, gpr = True, gpr, obj[gpr] exclusions.add(name) elif obj._is_level_reference(gpr, axis=axis): in_axis, name, level, gpr = False, None, gpr, None else: raise KeyError(gpr) elif isinstance(gpr, Grouper) and gpr.key is not None: # Add key to exclusions exclusions.add(gpr.key) in_axis, name = False, None else: in_axis, name = False, None if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]: raise ValueError( f"Length of grouper ({len(gpr)}) and axis ({obj.shape[axis]}) " "must be same length") # create the Grouping # allow us to passing the actual Grouping as the gpr ping = (Grouping( group_axis, gpr, obj=obj, name=name, level=level, sort=sort, observed=observed, in_axis=in_axis, dropna=dropna, ) if not isinstance(gpr, Grouping) else gpr) groupings.append(ping) if len(groupings) == 0 and len(obj): raise ValueError("No group keys passed!") elif len(groupings) == 0: groupings.append( Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) # create the internals grouper grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated, dropna=dropna) return grouper, exclusions, obj
def groupby_agg(self, by, axis, agg, groupby_args, **kwargs): # Currently we only expect 'by' to be a projection of the same frame. # If 'by' holds a list of columns/series, then we create such projection # to re-use code. if not isinstance(by, DFAlgQueryCompiler): if is_list_like(by): by_cols = [] by_frames = [] for obj in by: if isinstance(obj, str): by_cols.append(obj) elif hasattr(obj, "_query_compiler"): by_frames.append(obj._query_compiler._modin_frame) else: raise NotImplementedError("unsupported groupby args") by_cols = Index.__new__(Index, data=by_cols, dtype=self.columns.dtype) by_frame = self.mask(col_indices=by_cols) if by_frames: by_frame = by_frame._concat(axis=1, other_modin_frames=by_frames, ignore_index=True) else: raise NotImplementedError("unsupported groupby args") else: by_frame = by._modin_frame if axis != 0: raise NotImplementedError("groupby is supported for axis = 0 only") base = by_frame._find_common_projections_base(self) if base is None: raise NotImplementedError("unsupported groupby args") if groupby_args["level"] is not None: raise NotImplementedError("levels are not supported for groupby") groupby_cols = by_frame.columns.tolist() agg_cols = [col for col in self.columns if col not in by_frame.columns] # Create new base where all required columns are computed. We don't allow # complex expressions to be a group key or an aggeregate operand. assert isinstance(by_frame._op, TransformNode), "unexpected by_frame" exprs = OrderedDict(((col, by_frame.ref(col)) for col in groupby_cols)) exprs.update(((col, self.ref(col)) for col in agg_cols)) exprs = translate_exprs_to_base(exprs, base) base_cols = Index.__new__(Index, data=list(exprs.keys()), dtype=self.columns.dtype) base = self.__constructor__( columns=base_cols, dtypes=self._dtypes_for_exprs(exprs), op=TransformNode(base, exprs, fold=True), index_cols=None, force_execution_mode=self._force_execution_mode, ) new_columns = [] index_cols = None if groupby_args["as_index"]: index_cols = groupby_cols.copy() else: new_columns = groupby_cols.copy() new_dtypes = by_frame._dtypes[groupby_cols].tolist() agg_exprs = OrderedDict() if isinstance(agg, str): for col in agg_cols: agg_exprs[col] = AggregateExpr(agg, base.ref(col)) else: assert isinstance(agg, dict), "unsupported aggregate type" multiindex = any(isinstance(v, list) for v in agg.values()) for k, v in agg.items(): if isinstance(v, list): for item in v: agg_exprs[(k, item)] = AggregateExpr(item, base.ref(k)) else: col_name = (k, v) if multiindex else k agg_exprs[col_name] = AggregateExpr(v, base.ref(k)) new_columns.extend(agg_exprs.keys()) new_dtypes.extend((x._dtype for x in agg_exprs.values())) new_columns = Index.__new__(Index, data=new_columns, dtype=self.columns.dtype) new_op = GroupbyAggNode(base, groupby_cols, agg_exprs, groupby_args) new_frame = self.__constructor__( columns=new_columns, dtypes=new_dtypes, op=new_op, index_cols=index_cols, force_execution_mode=self._force_execution_mode, ) return new_frame
def align_method_FRAME(left, right, axis, flex: Optional[bool] = False, level: Level = None): """ Convert rhs to meet lhs dims if input is list, tuple or np.ndarray. Parameters ---------- left : DataFrame right : Any axis: int, str, or None flex: bool or None, default False Whether this is a flex op, in which case we reindex. None indicates not to check for alignment. level : int or level name, default None Returns ------- left : DataFrame right : Any """ def to_series(right): msg = "Unable to coerce to Series, length must be {req_len}: given {given_len}" if axis is not None and left._get_axis_name(axis) == "index": if len(left.index) != len(right): raise ValueError( msg.format(req_len=len(left.index), given_len=len(right))) right = left._constructor_sliced(right, index=left.index) else: if len(left.columns) != len(right): raise ValueError( msg.format(req_len=len(left.columns), given_len=len(right))) right = left._constructor_sliced(right, index=left.columns) return right if isinstance(right, np.ndarray): if right.ndim == 1: right = to_series(right) elif right.ndim == 2: if right.shape == left.shape: right = left._constructor(right, index=left.index, columns=left.columns) elif right.shape[0] == left.shape[0] and right.shape[1] == 1: # Broadcast across columns right = np.broadcast_to(right, left.shape) right = left._constructor(right, index=left.index, columns=left.columns) elif right.shape[1] == left.shape[1] and right.shape[0] == 1: # Broadcast along rows right = to_series(right[0, :]) else: raise ValueError("Unable to coerce to DataFrame, shape " f"must be {left.shape}: given {right.shape}") elif right.ndim > 2: raise ValueError("Unable to coerce to Series/DataFrame, " f"dimension must be <= 2: {right.shape}") elif is_list_like(right) and not isinstance(right, (ABCSeries, ABCDataFrame)): # GH 36702. Raise when attempting arithmetic with list of array-like. if any(is_array_like(el) for el in right): raise ValueError( f"Unable to coerce list of {type(right[0])} to Series/DataFrame" ) # GH17901 right = to_series(right) if flex is not None and isinstance(right, ABCDataFrame): if not left._indexed_same(right): if flex: left, right = left.align(right, join="outer", level=level, copy=False) else: raise ValueError( "Can only compare identically-labeled DataFrame objects") elif isinstance(right, ABCSeries): # axis=1 is default for DataFrame-with-Series op axis = left._get_axis_number(axis) if axis is not None else 1 if not flex: if not left.axes[axis].equals(right.index): warnings.warn( "Automatic reindexing on DataFrame vs Series comparisons " "is deprecated and will raise ValueError in a future version. " "Do `left, right = left.align(right, axis=1, copy=False)` " "before e.g. `left == right`", FutureWarning, stacklevel=5, ) left, right = left.align(right, join="outer", axis=axis, level=level, copy=False) right = _maybe_align_series_as_frame(left, right, axis) return left, right
def _extract_index(data) -> Index: """ Try to infer an Index from the passed data, raise ValueError on failure. """ index = None if len(data) == 0: index = Index([]) elif len(data) > 0: raw_lengths = [] indexes: list[list[Hashable] | Index] = [] have_raw_arrays = False have_series = False have_dicts = False for val in data: if isinstance(val, ABCSeries): have_series = True indexes.append(val.index) elif isinstance(val, dict): have_dicts = True indexes.append(list(val.keys())) elif is_list_like(val) and getattr(val, "ndim", 1) == 1: have_raw_arrays = True raw_lengths.append(len(val)) elif isinstance(val, np.ndarray) and val.ndim > 1: raise ValueError( "Per-column arrays must each be 1-dimensional") if not indexes and not raw_lengths: raise ValueError( "If using all scalar values, you must pass an index") if have_series: index = union_indexes(indexes) elif have_dicts: index = union_indexes(indexes, sort=False) if have_raw_arrays: lengths = list(set(raw_lengths)) if len(lengths) > 1: raise ValueError("All arrays must be of the same length") if have_dicts: raise ValueError( "Mixing dicts with non-Series may lead to ambiguous ordering." ) if have_series: assert index is not None # for mypy if lengths[0] != len(index): msg = (f"array length {lengths[0]} does not match index " f"length {len(index)}") raise ValueError(msg) else: index = ibase.default_index(lengths[0]) # error: Argument 1 to "ensure_index" has incompatible type "Optional[Index]"; # expected "Union[Union[Union[ExtensionArray, ndarray], Index, Series], # Sequence[Any]]" return ensure_index(index) # type: ignore[arg-type]
def get_grouper( obj: FrameOrSeries, key=None, axis: int = 0, level=None, sort: bool = True, observed: bool = False, mutated: bool = False, validate: bool = True, ) -> Tuple[BaseGrouper, List[Hashable], FrameOrSeries]: """ Create and return a BaseGrouper, which is an internal mapping of how to create the grouper indexers. This may be composed of multiple Grouping objects, indicating multiple groupers Groupers are ultimately index mappings. They can originate as: index mappings, keys to columns, functions, or Groupers Groupers enable local references to axis,level,sort, while the passed in axis, level, and sort are 'global'. This routine tries to figure out what the passing in references are and then creates a Grouping for each one, combined into a BaseGrouper. If observed & we have a categorical grouper, only show the observed values. If validate, then check for key/level overlaps. """ group_axis = obj._get_axis(axis) # validate that the passed single level is compatible with the passed # axis of the object if level is not None: # TODO: These if-block and else-block are almost same. # MultiIndex instance check is removable, but it seems that there are # some processes only for non-MultiIndex in else-block, # eg. `obj.index.name != level`. We have to consider carefully whether # these are applicable for MultiIndex. Even if these are applicable, # we need to check if it makes no side effect to subsequent processes # on the outside of this condition. # (GH 17621) if isinstance(group_axis, MultiIndex): if is_list_like(level) and len(level) == 1: level = level[0] if key is None and is_scalar(level): # Get the level values from group_axis key = group_axis.get_level_values(level) level = None else: # allow level to be a length-one list-like object # (e.g., level=[0]) # GH 13901 if is_list_like(level): nlevels = len(level) if nlevels == 1: level = level[0] elif nlevels == 0: raise ValueError("No group keys passed!") else: raise ValueError( "multiple levels only valid with MultiIndex") if isinstance(level, str): if obj.index.name != level: raise ValueError( "level name {level} is not the name of the index". format(level=level)) elif level > 0 or level < -1: raise ValueError( "level > 0 or level < -1 only valid with MultiIndex") # NOTE: `group_axis` and `group_axis.get_level_values(level)` # are same in this section. level = None key = group_axis # a passed-in Grouper, directly convert if isinstance(key, Grouper): binner, grouper, obj = key._get_grouper(obj, validate=False) if key.key is None: return grouper, [], obj else: return grouper, [key.key], obj # already have a BaseGrouper, just return it elif isinstance(key, BaseGrouper): return key, [], obj # In the future, a tuple key will always mean an actual key, # not an iterable of keys. In the meantime, we attempt to provide # a warning. We can assume that the user wanted a list of keys when # the key is not in the index. We just have to be careful with # unhashable elements of `key`. Any unhashable elements implies that # they wanted a list of keys. # https://github.com/pandas-dev/pandas/issues/18314 if isinstance(key, tuple): all_hashable = is_hashable(key) if (all_hashable and key not in obj and set(key).issubset(obj)) or not all_hashable: # column names ('a', 'b') -> ['a', 'b'] # arrays like (a, b) -> [a, b] msg = ("Interpreting tuple 'by' as a list of keys, rather than " "a single key. Use 'by=[...]' instead of 'by=(...)'. In " "the future, a tuple will always mean a single key.") warnings.warn(msg, FutureWarning, stacklevel=5) key = list(key) if not isinstance(key, list): keys = [key] match_axis_length = False else: keys = key match_axis_length = len(keys) == len(group_axis) # what are we after, exactly? any_callable = any(callable(g) or isinstance(g, dict) for g in keys) any_groupers = any(isinstance(g, Grouper) for g in keys) any_arraylike = any( isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys) # is this an index replacement? if (not any_callable and not any_arraylike and not any_groupers and match_axis_length and level is None): if isinstance(obj, DataFrame): all_in_columns_index = all(g in obj.columns or g in obj.index.names for g in keys) else: assert isinstance(obj, Series) all_in_columns_index = all(g in obj.index.names for g in keys) if not all_in_columns_index: keys = [com.asarray_tuplesafe(keys)] if isinstance(level, (tuple, list)): if key is None: keys = [None] * len(level) levels = level else: levels = [level] * len(keys) groupings = [] # type: List[Grouping] exclusions = [] # type: List[Hashable] # if the actual grouper should be obj[key] def is_in_axis(key) -> bool: if not _is_label_like(key): items = obj._data.items try: items.get_loc(key) except (KeyError, TypeError): # TypeError shows up here if we pass e.g. Int64Index return False return True # if the grouper is obj[name] def is_in_obj(gpr) -> bool: if not hasattr(gpr, "name"): return False try: return gpr is obj[gpr.name] except (KeyError, IndexError): return False for i, (gpr, level) in enumerate(zip(keys, levels)): if is_in_obj(gpr): # df.groupby(df['name']) in_axis, name = True, gpr.name exclusions.append(name) elif is_in_axis(gpr): # df.groupby('name') if gpr in obj: if validate: obj._check_label_or_level_ambiguity(gpr, axis=axis) in_axis, name, gpr = True, gpr, obj[gpr] exclusions.append(name) elif obj._is_level_reference(gpr, axis=axis): in_axis, name, level, gpr = False, None, gpr, None else: raise KeyError(gpr) elif isinstance(gpr, Grouper) and gpr.key is not None: # Add key to exclusions exclusions.append(gpr.key) in_axis, name = False, None else: in_axis, name = False, None if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]: raise ValueError( ("Length of grouper ({len_gpr}) and axis ({len_axis})" " must be same length".format(len_gpr=len(gpr), len_axis=obj.shape[axis]))) # create the Grouping # allow us to passing the actual Grouping as the gpr ping = (Grouping( group_axis, gpr, obj=obj, name=name, level=level, sort=sort, observed=observed, in_axis=in_axis, ) if not isinstance(gpr, Grouping) else gpr) groupings.append(ping) if len(groupings) == 0 and len(obj): raise ValueError("No group keys passed!") elif len(groupings) == 0: groupings.append( Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) # create the internals grouper grouper = BaseGrouper(group_axis, groupings, sort=sort, mutated=mutated) return grouper, exclusions, obj
def to_datetime( arg: DatetimeScalarOrArrayConvertible, errors: str = "raise", dayfirst: bool = False, yearfirst: bool = False, utc: Optional[bool] = None, format: Optional[str] = None, exact: bool = True, unit: Optional[str] = None, infer_datetime_format: bool = False, origin="unix", cache: bool = True, ) -> Union[DatetimeIndex, Series, DatetimeScalar, NaTType]: """ Convert argument to datetime. Parameters ---------- arg : int, float, str, datetime, list, tuple, 1-d array, Series, DataFrame/dict-like The object to convert to a datetime. errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception. - If 'coerce', then invalid parsing will be set as NaT. - If 'ignore', then invalid parsing will return the input. dayfirst : bool, default False Specify a date parse order if `arg` is str or its list-likes. If True, parses dates with the day first, eg 10/11/12 is parsed as 2012-11-10. Warning: dayfirst=True is not strict, but will prefer to parse with day first (this is a known bug, based on dateutil behavior). yearfirst : bool, default False Specify a date parse order if `arg` is str or its list-likes. - If True parses dates with the year first, eg 10/11/12 is parsed as 2010-11-12. - If both dayfirst and yearfirst are True, yearfirst is preceded (same as dateutil). Warning: yearfirst=True is not strict, but will prefer to parse with year first (this is a known bug, based on dateutil behavior). utc : bool, default None Return UTC DatetimeIndex if True (converting any tz-aware datetime.datetime objects as well). format : str, default None The strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse all the way up to nanoseconds. See strftime documentation for more information on choices: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior. exact : bool, True by default Behaves as: - If True, require an exact format match. - If False, allow the format to match anywhere in the target string. unit : str, default 'ns' The unit of the arg (D,s,ms,us,ns) denote the unit, which is an integer or float number. This will be based off the origin. Example, with unit='ms' and origin='unix' (the default), this would calculate the number of milliseconds to the unix epoch start. infer_datetime_format : bool, default False If True and no `format` is given, attempt to infer the format of the datetime strings based on the first non-NaN element, and if it can be inferred, switch to a faster method of parsing them. In some cases this can increase the parsing speed by ~5-10x. origin : scalar, default 'unix' Define the reference date. The numeric values would be parsed as number of units (defined by `unit`) since this reference date. - If 'unix' (or POSIX) time; origin is set to 1970-01-01. - If 'julian', unit must be 'D', and origin is set to beginning of Julian Calendar. Julian day number 0 is assigned to the day starting at noon on January 1, 4713 BC. - If Timestamp convertible, origin is set to Timestamp identified by origin. cache : bool, default True If True, use a cache of unique, converted dates to apply the datetime conversion. May produce significant speed-up when parsing duplicate date strings, especially ones with timezone offsets. The cache is only used when there are at least 50 values. The presence of out-of-bounds values will render the cache unusable and may slow down parsing. .. versionchanged:: 0.25.0 - changed default value from False to True. Returns ------- datetime If parsing succeeded. Return type depends on input: - list-like: DatetimeIndex - Series: Series of datetime64 dtype - scalar: Timestamp In case when it is not possible to return designated types (e.g. when any element of input is before Timestamp.min or after Timestamp.max) return will have datetime.datetime type (or corresponding array/Series). See Also -------- DataFrame.astype : Cast argument to a specified dtype. to_timedelta : Convert argument to timedelta. convert_dtypes : Convert dtypes. Examples -------- Assembling a datetime from multiple columns of a DataFrame. The keys can be common abbreviations like ['year', 'month', 'day', 'minute', 'second', 'ms', 'us', 'ns']) or plurals of the same >>> df = pd.DataFrame({'year': [2015, 2016], ... 'month': [2, 3], ... 'day': [4, 5]}) >>> pd.to_datetime(df) 0 2015-02-04 1 2016-03-05 dtype: datetime64[ns] If a date does not meet the `timestamp limitations <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html #timeseries-timestamp-limits>`_, passing errors='ignore' will return the original input instead of raising any exception. Passing errors='coerce' will force an out-of-bounds date to NaT, in addition to forcing non-dates (or non-parseable dates) to NaT. >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore') datetime.datetime(1300, 1, 1, 0, 0) >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') NaT Passing infer_datetime_format=True can often-times speedup a parsing if its not an ISO8601 format exactly, but in a regular format. >>> s = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000'] * 1000) >>> s.head() 0 3/11/2000 1 3/12/2000 2 3/13/2000 3 3/11/2000 4 3/12/2000 dtype: object >>> %timeit pd.to_datetime(s, infer_datetime_format=True) # doctest: +SKIP 100 loops, best of 3: 10.4 ms per loop >>> %timeit pd.to_datetime(s, infer_datetime_format=False) # doctest: +SKIP 1 loop, best of 3: 471 ms per loop Using a unix epoch time >>> pd.to_datetime(1490195805, unit='s') Timestamp('2017-03-22 15:16:45') >>> pd.to_datetime(1490195805433502912, unit='ns') Timestamp('2017-03-22 15:16:45.433502912') .. warning:: For float arg, precision rounding might happen. To prevent unexpected behavior use a fixed-width exact type. Using a non-unix epoch origin >>> pd.to_datetime([1, 2, 3], unit='D', ... origin=pd.Timestamp('1960-01-01')) DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], \ dtype='datetime64[ns]', freq=None) """ if arg is None: return None if origin != "unix": arg = _adjust_to_origin(arg, origin, unit) tz = "utc" if utc else None convert_listlike = partial( _convert_listlike_datetimes, tz=tz, unit=unit, dayfirst=dayfirst, yearfirst=yearfirst, errors=errors, exact=exact, infer_datetime_format=infer_datetime_format, ) if isinstance(arg, Timestamp): result = arg if tz is not None: if arg.tz is not None: result = result.tz_convert(tz) else: result = result.tz_localize(tz) elif isinstance(arg, ABCSeries): cache_array = _maybe_cache(arg, format, cache, convert_listlike) if not cache_array.empty: result = arg.map(cache_array) else: values = convert_listlike(arg._values, format) result = arg._constructor(values, index=arg.index, name=arg.name) elif isinstance(arg, (ABCDataFrame, abc.MutableMapping)): result = _assemble_from_unit_mappings(arg, errors, tz) elif isinstance(arg, Index): cache_array = _maybe_cache(arg, format, cache, convert_listlike) if not cache_array.empty: result = _convert_and_box_cache(arg, cache_array, name=arg.name) else: result = convert_listlike(arg, format, name=arg.name) elif is_list_like(arg): try: cache_array = _maybe_cache(arg, format, cache, convert_listlike) except OutOfBoundsDatetime: # caching attempts to create a DatetimeIndex, which may raise # an OOB. If that's the desired behavior, then just reraise... if errors == "raise": raise # ... otherwise, continue without the cache. from pandas import Series cache_array = Series([], dtype=object) # just an empty array if not cache_array.empty: result = _convert_and_box_cache(arg, cache_array) else: result = convert_listlike(arg, format) else: result = convert_listlike(np.array([arg]), format)[0] return result
def _adjust_to_origin(arg, origin, unit): """ Helper function for to_datetime. Adjust input argument to the specified origin Parameters ---------- arg : list, tuple, ndarray, Series, Index date to be adjusted origin : 'julian' or Timestamp origin offset for the arg unit : string passed unit from to_datetime, must be 'D' Returns ------- ndarray or scalar of adjusted date(s) """ if origin == 'julian': original = arg j0 = Timestamp(0).to_julian_date() if unit != 'D': raise ValueError("unit must be 'D' for origin='julian'") try: arg = arg - j0 except TypeError: raise ValueError("incompatible 'arg' type for given " "'origin'='julian'") # premptively check this for a nice range j_max = Timestamp.max.to_julian_date() - j0 j_min = Timestamp.min.to_julian_date() - j0 if np.any(arg > j_max) or np.any(arg < j_min): raise tslibs.OutOfBoundsDatetime( "{original} is Out of Bounds for " "origin='julian'".format(original=original)) else: # arg must be numeric if not ((is_scalar(arg) and (is_integer(arg) or is_float(arg))) or is_numeric_dtype(np.asarray(arg))): raise ValueError( "'{arg}' is not compatible with origin='{origin}'; " "it must be numeric with a unit specified ".format( arg=arg, origin=origin)) # we are going to offset back to unix / epoch time try: offset = Timestamp(origin) except tslibs.OutOfBoundsDatetime: raise tslibs.OutOfBoundsDatetime( "origin {origin} is Out of Bounds".format(origin=origin)) except ValueError: raise ValueError("origin {origin} cannot be converted " "to a Timestamp".format(origin=origin)) if offset.tz is not None: raise ValueError( "origin offset {} must be tz-naive".format(offset)) offset -= Timestamp(0) # convert the offset to the unit of the arg # this should be lossless in terms of precision offset = offset // tslibs.Timedelta(1, unit=unit) # scalars & ndarray-like can handle the addition if is_list_like(arg) and not isinstance( arg, (ABCSeries, ABCIndexClass, np.ndarray)): arg = np.asarray(arg) arg = arg + offset return arg
def _align_method_FRAME(left, right, axis, flex: Optional[bool] = False, level: Level = None): """ Convert rhs to meet lhs dims if input is list, tuple or np.ndarray. Parameters ---------- left : DataFrame right : Any axis: int, str, or None flex: bool or None, default False Whether this is a flex op, in which case we reindex. None indicates not to check for alignment. level : int or level name, default None Returns ------- left : DataFrame right : Any """ def to_series(right): msg = "Unable to coerce to Series, length must be {req_len}: given {given_len}" if axis is not None and left._get_axis_name(axis) == "index": if len(left.index) != len(right): raise ValueError( msg.format(req_len=len(left.index), given_len=len(right))) right = left._constructor_sliced(right, index=left.index) else: if len(left.columns) != len(right): raise ValueError( msg.format(req_len=len(left.columns), given_len=len(right))) right = left._constructor_sliced(right, index=left.columns) return right if isinstance(right, np.ndarray): if right.ndim == 1: right = to_series(right) elif right.ndim == 2: if right.shape == left.shape: right = left._constructor(right, index=left.index, columns=left.columns) elif right.shape[0] == left.shape[0] and right.shape[1] == 1: # Broadcast across columns right = np.broadcast_to(right, left.shape) right = left._constructor(right, index=left.index, columns=left.columns) elif right.shape[1] == left.shape[1] and right.shape[0] == 1: # Broadcast along rows right = to_series(right[0, :]) else: raise ValueError("Unable to coerce to DataFrame, shape " f"must be {left.shape}: given {right.shape}") elif right.ndim > 2: raise ValueError("Unable to coerce to Series/DataFrame, " f"dimension must be <= 2: {right.shape}") elif is_list_like(right) and not isinstance(right, (ABCSeries, ABCDataFrame)): # GH17901 right = to_series(right) if flex is not None and isinstance(right, ABCDataFrame): if not left._indexed_same(right): if flex: left, right = left.align(right, join="outer", level=level, copy=False) else: raise ValueError( "Can only compare identically-labeled DataFrame objects") elif isinstance(right, ABCSeries): # axis=1 is default for DataFrame-with-Series op axis = left._get_axis_number(axis) if axis is not None else 1 left, right = left.align(right, join="outer", axis=axis, level=level, copy=False) return left, right
def logical_op(left: Union[np.ndarray, ABCExtensionArray], right: Any, op) -> Union[np.ndarray, ABCExtensionArray]: """ Evaluate a logical operation `|`, `&`, or `^`. Parameters ---------- left : np.ndarray or ExtensionArray right : object Cannot be a DataFrame, Series, or Index. op : {operator.and_, operator.or_, operator.xor} Or one of the reversed variants from roperator. Returns ------- ndarrray or ExtensionArray """ from pandas.core.ops import should_extension_dispatch, dispatch_to_extension_op fill_int = lambda x: x def fill_bool(x, left=None): # if `left` is specifically not-boolean, we do not cast to bool if x.dtype.kind in ["c", "f", "O"]: # dtypes that can hold NA mask = isna(x) if mask.any(): x = x.astype(object) x[mask] = False if left is None or is_bool_dtype(left.dtype): x = x.astype(bool) return x is_self_int_dtype = is_integer_dtype(left.dtype) right = lib.item_from_zerodim(right) if is_list_like(right) and not hasattr(right, "dtype"): # e.g. list, tuple right = construct_1d_object_array_from_listlike(right) # NB: We assume extract_array has already been called on left and right lvalues = left rvalues = right if should_extension_dispatch(lvalues, rvalues): res_values = dispatch_to_extension_op(op, lvalues, rvalues) else: if isinstance(rvalues, np.ndarray): is_other_int_dtype = is_integer_dtype(rvalues.dtype) rvalues = rvalues if is_other_int_dtype else fill_bool( rvalues, lvalues) else: # i.e. scalar is_other_int_dtype = lib.is_integer(rvalues) # For int vs int `^`, `|`, `&` are bitwise operators and return # integer dtypes. Otherwise these are boolean ops filler = fill_int if is_self_int_dtype and is_other_int_dtype else fill_bool res_values = na_logical_op(lvalues, rvalues, op) res_values = filler(res_values) # type: ignore return res_values
def treat_as_nested(data) -> bool: """ Check if we should use nested_data_to_arrays. """ return len(data) > 0 and is_list_like(data[0]) and getattr( data[0], "ndim", 1) == 1
def to_time(arg, format=None, infer_time_format=False, errors='raise'): """ Parse time strings to time objects using fixed strptime formats ("%H:%M", "%H%M", "%I:%M%p", "%I%M%p", "%H:%M:%S", "%H%M%S", "%I:%M:%S%p", "%I%M%S%p") Use infer_time_format if all the strings are in the same format to speed up conversion. Parameters ---------- arg : string in time format, datetime.time, list, tuple, 1-d array, Series format : str, default None Format used to convert arg into a time object. If None, fixed formats are used. infer_time_format: bool, default False Infer the time format based on the first non-NaN element. If all strings are in the same format, this will speed up conversion. errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as None - If 'ignore', then invalid parsing will return the input Returns ------- datetime.time """ from pandas.core.series import Series def _convert_listlike(arg, format): if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a string, datetime, list, tuple, ' '1-d array, or Series') arg = _ensure_object(arg) if infer_time_format and format is None: format = _guess_time_format_for_array(arg) times = [] if format is not None: for element in arg: try: times.append(datetime.strptime(element, format).time()) except (ValueError, TypeError): if errors == 'raise': raise ValueError("Cannot convert %s to a time with " "given format %s" % (element, format)) elif errors == 'ignore': return arg else: times.append(None) else: formats = _time_formats[:] format_found = False for element in arg: time_object = None for time_format in formats: try: time_object = datetime.strptime(element, time_format).time() if not format_found: # Put the found format in front fmt = formats.pop(formats.index(time_format)) formats.insert(0, fmt) format_found = True break except (ValueError, TypeError): continue if time_object is not None: times.append(time_object) elif errors == 'raise': raise ValueError("Cannot convert arg {arg} to " "a time".format(arg=arg)) elif errors == 'ignore': return arg else: times.append(None) return times if arg is None: return arg elif isinstance(arg, time): return arg elif isinstance(arg, Series): values = _convert_listlike(arg._values, format) return Series(values, index=arg.index, name=arg.name) elif isinstance(arg, ABCIndexClass): return _convert_listlike(arg, format) elif is_list_like(arg): return _convert_listlike(arg, format) return _convert_listlike(np.array([arg]), format)[0]
def quantile(self, q=0.5, **kwargs): if is_list_like(q): return self._default_to_pandas( lambda df: df.quantile(q=q, **kwargs)) return self._apply_agg_function(lambda df: df.quantile(q, **kwargs))
def _parse_errorbars(self, label, err): """ Look for error keyword arguments and return the actual errorbar data or return the error DataFrame/dict Error bars can be specified in several ways: Series: the user provides a pandas.Series object of the same length as the data ndarray: provides a np.ndarray of the same length as the data DataFrame/dict: error values are paired with keys matching the key in the plotted DataFrame str: the name of the column within the plotted DataFrame """ if err is None: return None def match_labels(data, e): e = e.reindex(data.index) return e # key-matched DataFrame if isinstance(err, ABCDataFrame): err = match_labels(self.data, err) # key-matched dict elif isinstance(err, dict): pass # Series of error values elif isinstance(err, ABCSeries): # broadcast error series across data err = match_labels(self.data, err) err = np.atleast_2d(err) err = np.tile(err, (self.nseries, 1)) # errors are a column in the dataframe elif isinstance(err, str): evalues = self.data[err].values self.data = self.data[self.data.columns.drop(err)] err = np.atleast_2d(evalues) err = np.tile(err, (self.nseries, 1)) elif is_list_like(err): if is_iterator(err): err = np.atleast_2d(list(err)) else: # raw error values err = np.atleast_2d(err) err_shape = err.shape # asymmetrical error bars if err.ndim == 3: if (err_shape[0] != self.nseries) or \ (err_shape[1] != 2) or \ (err_shape[2] != len(self.data)): msg = "Asymmetrical error bars should be provided " + \ "with the shape (%u, 2, %u)" % \ (self.nseries, len(self.data)) raise ValueError(msg) # broadcast errors to each data series if len(err) == 1: err = np.tile(err, (self.nseries, 1)) elif is_number(err): err = np.tile([err], (self.nseries, len(self.data))) else: msg = "No valid {label} detected".format(label=label) raise ValueError(msg) return err
def convert_values(param): if isinstance(param, ExtensionArray) or is_list_like(param): ovalues = param else: # Assume its an object ovalues = [param] * len(self) return ovalues
def _args_adjust(self): if is_list_like(self.bottom): self.bottom = np.array(self.bottom) if is_list_like(self.left): self.left = np.array(self.left)
def melt(frame, id_vars=None, value_vars=None, var_name=None, value_name='value', col_level=None): # TODO: what about the existing index? if id_vars is not None: if not is_list_like(id_vars): id_vars = [id_vars] elif (isinstance(frame.columns, MultiIndex) and not isinstance(id_vars, list)): raise ValueError('id_vars must be a list of tuples when columns' ' are a MultiIndex') else: id_vars = list(id_vars) else: id_vars = [] if value_vars is not None: if not is_list_like(value_vars): value_vars = [value_vars] elif (isinstance(frame.columns, MultiIndex) and not isinstance(value_vars, list)): raise ValueError('value_vars must be a list of tuples when' ' columns are a MultiIndex') else: value_vars = list(value_vars) frame = frame.loc[:, id_vars + value_vars] else: frame = frame.copy() if col_level is not None: # allow list or other? # frame is a copy frame.columns = frame.columns.get_level_values(col_level) if var_name is None: if isinstance(frame.columns, MultiIndex): if len(frame.columns.names) == len(set(frame.columns.names)): var_name = frame.columns.names else: var_name = [ 'variable_{i}'.format(i=i) for i in range(len(frame.columns.names)) ] else: var_name = [ frame.columns.name if frame.columns.name is not None else 'variable' ] if isinstance(var_name, compat.string_types): var_name = [var_name] N, K = frame.shape K -= len(id_vars) mdata = {} for col in id_vars: mdata[col] = np.tile(frame.pop(col).values, K) mcolumns = id_vars + var_name + [value_name] mdata[value_name] = frame.values.ravel('F') for i, col in enumerate(var_name): # asanyarray will keep the columns as an Index mdata[col] = np.asanyarray( frame.columns._get_level_values(i)).repeat(N) return DataFrame(mdata, columns=mcolumns)
def _try_cast( arr, dtype: Optional[Union[np.dtype, "ExtensionDtype"]], copy: bool, raise_cast_failure: bool, ): """ Convert input to numpy ndarray and optionally cast to a given dtype. Parameters ---------- arr : ndarray, list, tuple, iterator (catchall) Excludes: ExtensionArray, Series, Index. dtype : np.dtype, ExtensionDtype or None copy : bool If False, don't copy the data if not needed. raise_cast_failure : bool If True, and if a dtype is specified, raise errors during casting. Otherwise an object array is returned. """ # perf shortcut as this is the most common case if isinstance(arr, np.ndarray): if maybe_castable(arr) and not copy and dtype is None: return arr try: # GH#15832: Check if we are requesting a numeric dype and # that we can convert the data to the requested dtype. if is_integer_dtype(dtype): subarr = maybe_cast_to_integer_array(arr, dtype) subarr = maybe_cast_to_datetime(arr, dtype) # Take care in creating object arrays (but iterators are not # supported): if is_object_dtype(dtype) and ( is_list_like(subarr) and not (is_iterator(subarr) or isinstance(subarr, np.ndarray))): subarr = construct_1d_object_array_from_listlike(subarr) elif not is_extension_array_dtype(subarr): subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy) except OutOfBoundsDatetime: # in case of out of bound datetime64 -> always raise raise except (ValueError, TypeError): if is_categorical_dtype(dtype): # We *do* allow casting to categorical, since we know # that Categorical is the only array type for 'category'. dtype = cast(CategoricalDtype, dtype) subarr = dtype.construct_array_type()(arr, dtype.categories, ordered=dtype.ordered) elif is_extension_array_dtype(dtype): # create an extension array from its dtype dtype = cast(ExtensionDtype, dtype) array_type = dtype.construct_array_type()._from_sequence subarr = array_type(arr, dtype=dtype, copy=copy) elif dtype is not None and raise_cast_failure: raise else: subarr = np.array(arr, dtype=object, copy=copy) return subarr
def _flatten(axes): if not is_list_like(axes): return np.array([axes]) elif isinstance(axes, (np.ndarray, Index)): return axes.ravel() return np.array(axes)
def integer_arithmetic_method(self, other): omask = None if getattr(other, "ndim", 0) > 1: raise NotImplementedError( "can only perform ops with 1-d structures") if isinstance(other, IntegerArray): other, omask = other._data, other._mask elif is_list_like(other): other = np.asarray(other) if other.ndim > 1: raise NotImplementedError( "can only perform ops with 1-d structures") if len(self) != len(other): raise ValueError("Lengths must match") if not (is_float_dtype(other) or is_integer_dtype(other)): raise TypeError("can only perform ops with numeric values") else: if not (is_float(other) or is_integer(other) or other is libmissing.NA): raise TypeError("can only perform ops with numeric values") if omask is None: mask = self._mask.copy() if other is libmissing.NA: mask |= True else: mask = self._mask | omask if op_name == "pow": # 1 ** x is 1. mask = np.where((self._data == 1) & ~self._mask, False, mask) # x ** 0 is 1. if omask is not None: mask = np.where((other == 0) & ~omask, False, mask) elif other is not libmissing.NA: mask = np.where(other == 0, False, mask) elif op_name == "rpow": # 1 ** x is 1. if omask is not None: mask = np.where((other == 1) & ~omask, False, mask) elif other is not libmissing.NA: mask = np.where(other == 1, False, mask) # x ** 0 is 1. mask = np.where((self._data == 0) & ~self._mask, False, mask) if other is libmissing.NA: result = np.ones_like(self._data) else: with np.errstate(all="ignore"): result = op(self._data, other) # divmod returns a tuple if op_name == "divmod": div, mod = result return ( self._maybe_mask_result(div, mask, other, "floordiv"), self._maybe_mask_result(mod, mask, other, "mod"), ) return self._maybe_mask_result(result, mask, other, op_name)
def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any: """ Check if `indexer` is a valid array indexer for `array`. For a boolean mask, `array` and `indexer` are checked to have the same length. The dtype is validated, and if it is an integer or boolean ExtensionArray, it is checked if there are missing values present, and it is converted to the appropriate numpy array. Other dtypes will raise an error. Non-array indexers (integer, slice, Ellipsis, tuples, ..) are passed through as is. .. versionadded:: 1.0.0 Parameters ---------- array : array-like The array that is being indexed (only used for the length). indexer : array-like or list-like The array-like that's used to index. List-like input that is not yet a numpy array or an ExtensionArray is converted to one. Other input types are passed through as is. Returns ------- numpy.ndarray The validated indexer as a numpy array that can be used to index. Raises ------ IndexError When the lengths don't match. ValueError When `indexer` cannot be converted to a numpy ndarray to index (e.g. presence of missing values). See Also -------- api.types.is_bool_dtype : Check if `key` is of boolean dtype. Examples -------- When checking a boolean mask, a boolean ndarray is returned when the arguments are all valid. >>> mask = pd.array([True, False]) >>> arr = pd.array([1, 2]) >>> pd.api.indexers.check_array_indexer(arr, mask) array([ True, False]) An IndexError is raised when the lengths don't match. >>> mask = pd.array([True, False, True]) >>> pd.api.indexers.check_array_indexer(arr, mask) Traceback (most recent call last): ... IndexError: Boolean index has wrong length: 3 instead of 2. NA values in a boolean array are treated as False. >>> mask = pd.array([True, pd.NA]) >>> pd.api.indexers.check_array_indexer(arr, mask) array([ True, False]) A numpy boolean mask will get passed through (if the length is correct): >>> mask = np.array([True, False]) >>> pd.api.indexers.check_array_indexer(arr, mask) array([ True, False]) Similarly for integer indexers, an integer ndarray is returned when it is a valid indexer, otherwise an error is (for integer indexers, a matching length is not required): >>> indexer = pd.array([0, 2], dtype="Int64") >>> arr = pd.array([1, 2, 3]) >>> pd.api.indexers.check_array_indexer(arr, indexer) array([0, 2]) >>> indexer = pd.array([0, pd.NA], dtype="Int64") >>> pd.api.indexers.check_array_indexer(arr, indexer) Traceback (most recent call last): ... ValueError: Cannot index with an integer indexer containing NA values For non-integer/boolean dtypes, an appropriate error is raised: >>> indexer = np.array([0., 2.], dtype="float64") >>> pd.api.indexers.check_array_indexer(arr, indexer) Traceback (most recent call last): ... IndexError: arrays used as indices must be of integer or boolean type """ from pandas.core.construction import array as pd_array # whathever is not an array-like is returned as-is (possible valid array # indexers that are not array-like: integer, slice, Ellipsis, None) # In this context, tuples are not considered as array-like, as they have # a specific meaning in indexing (multi-dimensional indexing) if is_list_like(indexer): if isinstance(indexer, tuple): return indexer else: return indexer # convert list-likes to array if not is_array_like(indexer): indexer = pd_array(indexer) if len(indexer) == 0: # empty list is converted to float array by pd.array indexer = np.array([], dtype=np.intp) dtype = indexer.dtype if is_bool_dtype(dtype): if is_extension_array_dtype(dtype): indexer = indexer.to_numpy(dtype=bool, na_value=False) else: indexer = np.asarray(indexer, dtype=bool) # GH26658 if len(indexer) != len(array): raise IndexError(f"Boolean index has wrong length: " f"{len(indexer)} instead of {len(array)}") elif is_integer_dtype(dtype): try: indexer = np.asarray(indexer, dtype=np.intp) except ValueError: raise ValueError( "Cannot index with an integer indexer containing NA values") else: raise IndexError( "arrays used as indices must be of integer or boolean type") return indexer
def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False, utc=None, box=True, format=None, exact=True, unit=None, infer_datetime_format=False, origin='unix'): """ Convert argument to datetime. Parameters ---------- arg : integer, float, string, datetime, list, tuple, 1-d array, Series .. versionadded: 0.18.1 or DataFrame/dict-like errors : {'ignore', 'raise', 'coerce'}, default 'raise' - If 'raise', then invalid parsing will raise an exception - If 'coerce', then invalid parsing will be set as NaT - If 'ignore', then invalid parsing will return the input dayfirst : boolean, default False Specify a date parse order if `arg` is str or its list-likes. If True, parses dates with the day first, eg 10/11/12 is parsed as 2012-11-10. Warning: dayfirst=True is not strict, but will prefer to parse with day first (this is a known bug, based on dateutil behavior). yearfirst : boolean, default False Specify a date parse order if `arg` is str or its list-likes. - If True parses dates with the year first, eg 10/11/12 is parsed as 2010-11-12. - If both dayfirst and yearfirst are True, yearfirst is preceded (same as dateutil). Warning: yearfirst=True is not strict, but will prefer to parse with year first (this is a known bug, based on dateutil beahavior). .. versionadded: 0.16.1 utc : boolean, default None Return UTC DatetimeIndex if True (converting any tz-aware datetime.datetime objects as well). box : boolean, default True - If True returns a DatetimeIndex - If False returns ndarray of values. format : string, default None strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse all the way up to nanoseconds. exact : boolean, True by default - If True, require an exact format match. - If False, allow the format to match anywhere in the target string. unit : string, default 'ns' unit of the arg (D,s,ms,us,ns) denote the unit, which is an integer or float number. This will be based off the origin. Example, with unit='ms' and origin='unix' (the default), this would calculate the number of milliseconds to the unix epoch start. infer_datetime_format : boolean, default False If True and no `format` is given, attempt to infer the format of the datetime strings, and if it can be inferred, switch to a faster method of parsing them. In some cases this can increase the parsing speed by ~5-10x. origin : scalar, default is 'unix' Define the reference date. The numeric values would be parsed as number of units (defined by `unit`) since this reference date. - If 'unix' (or POSIX) time; origin is set to 1970-01-01. - If 'julian', unit must be 'D', and origin is set to beginning of Julian Calendar. Julian day number 0 is assigned to the day starting at noon on January 1, 4713 BC. - If Timestamp convertible, origin is set to Timestamp identified by origin. .. versionadded: 0.20.0 Returns ------- ret : datetime if parsing succeeded. Return type depends on input: - list-like: DatetimeIndex - Series: Series of datetime64 dtype - scalar: Timestamp In case when it is not possible to return designated types (e.g. when any element of input is before Timestamp.min or after Timestamp.max) return will have datetime.datetime type (or correspoding array/Series). Examples -------- Assembling a datetime from multiple columns of a DataFrame. The keys can be common abbreviations like ['year', 'month', 'day', 'minute', 'second', 'ms', 'us', 'ns']) or plurals of the same >>> df = pd.DataFrame({'year': [2015, 2016], 'month': [2, 3], 'day': [4, 5]}) >>> pd.to_datetime(df) 0 2015-02-04 1 2016-03-05 dtype: datetime64[ns] If a date does not meet the `timestamp limitations <http://pandas.pydata.org/pandas-docs/stable/timeseries.html #timeseries-timestamp-limits>`_, passing errors='ignore' will return the original input instead of raising any exception. Passing errors='coerce' will force an out-of-bounds date to NaT, in addition to forcing non-dates (or non-parseable dates) to NaT. >>> pd.to_datetime('13000101', format='%Y%m%d', errors='ignore') datetime.datetime(1300, 1, 1, 0, 0) >>> pd.to_datetime('13000101', format='%Y%m%d', errors='coerce') NaT Passing infer_datetime_format=True can often-times speedup a parsing if its not an ISO8601 format exactly, but in a regular format. >>> s = pd.Series(['3/11/2000', '3/12/2000', '3/13/2000']*1000) >>> s.head() 0 3/11/2000 1 3/12/2000 2 3/13/2000 3 3/11/2000 4 3/12/2000 dtype: object >>> %timeit pd.to_datetime(s,infer_datetime_format=True) 100 loops, best of 3: 10.4 ms per loop >>> %timeit pd.to_datetime(s,infer_datetime_format=False) 1 loop, best of 3: 471 ms per loop Using a unix epoch time >>> pd.to_datetime(1490195805, unit='s') Timestamp('2017-03-22 15:16:45') >>> pd.to_datetime(1490195805433502912, unit='ns') Timestamp('2017-03-22 15:16:45.433502912') .. warning:: For float arg, precision rounding might happen. To prevent unexpected behavior use a fixed-width exact type. Using a non-unix epoch origin >>> pd.to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01')) 0 1960-01-02 1 1960-01-03 2 1960-01-04 """ from pandas.core.indexes.datetimes import DatetimeIndex tz = 'utc' if utc else None def _convert_listlike(arg, box, format, name=None, tz=tz): if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') # these are shortcutable if is_datetime64tz_dtype(arg): if not isinstance(arg, DatetimeIndex): return DatetimeIndex(arg, tz=tz, name=name) if utc: arg = arg.tz_convert(None).tz_localize('UTC') return arg elif is_datetime64_ns_dtype(arg): if box and not isinstance(arg, DatetimeIndex): try: return DatetimeIndex(arg, tz=tz, name=name) except ValueError: pass return arg elif unit is not None: if format is not None: raise ValueError("cannot specify both format and unit") arg = getattr(arg, 'values', arg) result = tslib.array_with_unit_to_datetime(arg, unit, errors=errors) if box: if errors == 'ignore': from pandas import Index return Index(result) return DatetimeIndex(result, tz=tz, name=name) return result elif getattr(arg, 'ndim', 1) > 1: raise TypeError('arg must be a string, datetime, list, tuple, ' '1-d array, or Series') arg = _ensure_object(arg) require_iso8601 = False if infer_datetime_format and format is None: format = _guess_datetime_format_for_array(arg, dayfirst=dayfirst) if format is not None: # There is a special fast-path for iso8601 formatted # datetime strings, so in those cases don't use the inferred # format because this path makes process slower in this # special case format_is_iso8601 = _format_is_iso(format) if format_is_iso8601: require_iso8601 = not infer_datetime_format format = None try: result = None if format is not None: # shortcut formatting here if format == '%Y%m%d': try: result = _attempt_YYYYMMDD(arg, errors=errors) except: raise ValueError("cannot convert the input to " "'%Y%m%d' date format") # fallback if result is None: try: result = tslib.array_strptime(arg, format, exact=exact, errors=errors) except tslib.OutOfBoundsDatetime: if errors == 'raise': raise result = arg except ValueError: # if format was inferred, try falling back # to array_to_datetime - terminate here # for specified formats if not infer_datetime_format: if errors == 'raise': raise result = arg if result is None and (format is None or infer_datetime_format): result = tslib.array_to_datetime( arg, errors=errors, utc=utc, dayfirst=dayfirst, yearfirst=yearfirst, require_iso8601=require_iso8601) if is_datetime64_dtype(result) and box: result = DatetimeIndex(result, tz=tz, name=name) return result except ValueError as e: try: values, tz = tslib.datetime_to_datetime64(arg) return DatetimeIndex._simple_new(values, name=name, tz=tz) except (ValueError, TypeError): raise e if arg is None: return None # handle origin if origin == 'julian': original = arg j0 = tslib.Timestamp(0).to_julian_date() if unit != 'D': raise ValueError("unit must be 'D' for origin='julian'") try: arg = arg - j0 except: raise ValueError("incompatible 'arg' type for given " "'origin'='julian'") # premptively check this for a nice range j_max = tslib.Timestamp.max.to_julian_date() - j0 j_min = tslib.Timestamp.min.to_julian_date() - j0 if np.any(arg > j_max) or np.any(arg < j_min): raise tslib.OutOfBoundsDatetime( "{original} is Out of Bounds for " "origin='julian'".format(original=original)) elif origin not in ['unix', 'julian']: # arg must be a numeric original = arg if not ((is_scalar(arg) and (is_integer(arg) or is_float(arg))) or is_numeric_dtype(np.asarray(arg))): raise ValueError( "'{arg}' is not compatible with origin='{origin}'; " "it must be numeric with a unit specified ".format( arg=arg, origin=origin)) # we are going to offset back to unix / epoch time try: offset = tslib.Timestamp(origin) - tslib.Timestamp(0) except tslib.OutOfBoundsDatetime: raise tslib.OutOfBoundsDatetime( "origin {} is Out of Bounds".format(origin)) except ValueError: raise ValueError("origin {} cannot be converted " "to a Timestamp".format(origin)) # convert the offset to the unit of the arg # this should be lossless in terms of precision offset = offset // tslib.Timedelta(1, unit=unit) # scalars & ndarray-like can handle the addition if is_list_like(arg) and not isinstance( arg, (ABCSeries, ABCIndexClass, np.ndarray)): arg = np.asarray(arg) arg = arg + offset if isinstance(arg, tslib.Timestamp): result = arg elif isinstance(arg, ABCSeries): from pandas import Series values = _convert_listlike(arg._values, False, format) result = Series(values, index=arg.index, name=arg.name) elif isinstance(arg, (ABCDataFrame, MutableMapping)): result = _assemble_from_unit_mappings(arg, errors=errors) elif isinstance(arg, ABCIndexClass): result = _convert_listlike(arg, box, format, name=arg.name) elif is_list_like(arg): result = _convert_listlike(arg, box, format) else: result = _convert_listlike(np.array([arg]), box, format)[0] return result