def __setitem__(self, key, value):
    """Assign ``value`` to the position(s) selected by ``key``.

    Parameters
    ----------
    key : scalar, slice or array-like
        Indexer; normalised with ``check_array_indexer`` before use.
    value : str, NA-like scalar, or sequence of those
        ``pd.Index``, ``pd.Series`` and instances of this array type are
        converted with ``to_numpy()`` first.

    Raises
    ------
    ValueError
        If a sequence is assigned to a scalar ``key``, if a scalar value is
        neither missing (per ``pd.isna``) nor a ``str``, or if a non-empty
        sequence fails ``lib.is_string_array(..., skipna=True)``.
    """
    # Unwrap pandas containers and sibling arrays to plain NumPy data.
    if isinstance(value, (pd.Index, pd.Series, type(self))):
        value = value.to_numpy()

    key = check_array_indexer(self, key)
    value_is_scalar = is_scalar(value)
    if is_scalar(key) and not value_is_scalar:
        raise ValueError("setting an array element with a sequence.")

    # Validate the incoming item(s).
    if not value_is_scalar:
        if not is_array_like(value):
            value = np.asarray(value, dtype=object)
        if len(value) and not lib.is_string_array(value, skipna=True):
            raise ValueError("Must provide strings.")
    elif pd.isna(value):
        # Every missing-value marker is stored as None.
        value = None
    elif not isinstance(value, str):
        raise ValueError(
            f"Cannot set non-string value '{value}' into a ArrowStringArray."
        )

    if not self._use_arrow:
        self._ndarray[key] = value
        return

    # Arrow-backed storage: materialise as NumPy, assign there, then rebuild
    # the chunked array from the modified buffer.
    buffer = np.asarray(self._arrow_array.to_pandas())
    buffer[key] = value
    self._arrow_array = pa.chunked_array([pa.array(buffer)])
def fillna(self, value=None, method=None, limit=None):
    """Return a new array with missing entries filled.

    Parameters
    ----------
    value : scalar or array-like, optional
        Replacement for missing entries.  An array-like must have the same
        length as ``self``; only its entries at missing positions are used.
    method : str, optional
        Fill method, normalised by pandas' ``validate_fillna_kwargs``.
        ``"pad"`` selects ``pad_1d``; anything else selects ``backfill_1d``.
    limit : int, optional
        Forwarded to the pad/backfill helper when ``method`` is given.  It is
        not used when filling with ``value``.

    Returns
    -------
    A new array.  When nothing is missing this is ``self.copy()``.

    Raises
    ------
    ValueError
        If an array-like ``value`` differs in length from ``self``.
        ``validate_fillna_kwargs`` may also raise for the arguments it
        rejects.
    """
    from pandas.api.types import is_array_like
    from pandas.util._validators import validate_fillna_kwargs

    value, method = validate_fillna_kwargs(value, method)
    mask = self.isna()

    if is_array_like(value):
        if len(value) != len(self):
            raise ValueError(
                "Length of 'value' does not match. Got ({}) "
                "expected {}".format(len(value), len(self))
            )
        value = value[mask]

    if not mask.any():
        return self.copy()

    if method is not None:
        # Imported only here: these are private pandas helpers that not every
        # pandas version provides, and value-based filling must keep working
        # when they are unavailable.
        from pandas.core.missing import backfill_1d, pad_1d

        func = pad_1d if method == "pad" else backfill_1d
        new_values = func(self.astype(object), limit=limit, mask=mask)
        return self._from_sequence(new_values, self._dtype)

    # Fill with value.
    new_values = np.asarray(self)
    if isinstance(value, Geometry):
        # NOTE(review): presumably wrapped so NumPy assigns the geometry as a
        # single element instead of trying to unpack it -- confirm.
        value = [value]
    new_values[mask] = value
    return self.__class__(new_values, dtype=self.dtype)
def __getitem__(self, item):
    """Select a subset of self.

    Parameters
    ----------
    item : int, slice, or iterable of int / bool
        * int: the position in ``self`` to get; negative values are offset
          by ``len(self)``.
        * slice: a slice object, applied to ``self.data``.
        * iterable: integer positions, or a boolean mask; both are resolved
          through ``self.take``.

    Returns
    -------
    scalar, None, or instance of ``type(self)``
        For an int, ``self.data[item]``, or ``None`` when the (offset)
        position is ``>= len(self)``.  For a slice, a new instance wrapping
        the sliced data.  For an iterable, whatever ``self.take`` returns.

    Raises
    ------
    IndexError
        If an iterable ``item`` has neither an integer nor a boolean dtype.
    """
    if isinstance(item, slice):
        start = item.start or 0
        stop = item.stop if item.stop is not None else len(self.data)
        stop = min(stop, len(self.data))
        if stop - start == 0:
            # Empty selection: build an empty container of the same type
            # directly instead of slicing the backing data.
            return type(self)(xnd.xnd([], type=self.data.type))
    elif isinstance(item, Iterable):
        if not is_array_like(item):
            item = np.array(item)
        if is_integer_dtype(item):
            return self.take(item)
        elif is_bool_dtype(item):
            # Turn the mask into the positions of its True entries.
            indices = np.argwhere(np.array(item)).flatten()
            return self.take(indices)
        else:
            # Adjacent literals rather than a backslash continuation inside
            # the string, which would embed the source indentation in the
            # message.
            raise IndexError(
                "Only integers, slices and integer or boolean arrays "
                "are valid indices."
            )
    elif is_integer(item):
        if item < 0:
            item += len(self)
        if item >= len(self):
            # NOTE(review): out-of-range positions yield None instead of
            # raising IndexError; kept as-is for existing callers.
            return None
        return self.data[item]

    return type(self)(self.data[item])
def fillna(self, value=None, method=None, limit=None):
    """
    Fill NA/NaN values using the specified value or method.

    Parameters
    ----------
    value : scalar, array-like
        If a scalar value is passed it is used to fill all missing values.
        Alternatively, an array-like 'value' can be given. It must have the
        same length as 'self'; only its entries at missing positions are
        used.
    method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
        Method to use for filling holes.
        pad / ffill: propagate last valid observation forward to next valid
        backfill / bfill: use NEXT valid observation to fill gap
    limit : int, default None
        Forwarded to the pad/backfill helper when 'method' is specified.
        It is not consulted when filling with 'value'.

    Returns
    -------
    filled : ExtensionArray with NA/NaN filled
        Always a new object; ``self.copy()`` when nothing is missing.

    Raises
    ------
    ValueError
        If an array-like 'value' does not have the same length as 'self'.
        ``validate_fillna_kwargs`` may also raise for the arguments it
        rejects.
    """
    from pandas.api.types import is_array_like
    from pandas.util._validators import validate_fillna_kwargs

    value, method = validate_fillna_kwargs(value, method)

    mask = self.isna()

    if is_array_like(value):
        if len(value) != len(self):
            raise ValueError(
                "Length of 'value' does not match. Got ({}) "
                "expected {}".format(len(value), len(self))
            )
        value = value[mask]

    if not mask.any():
        return self.copy()

    if method is not None:
        # Imported lazily: these private pandas helpers are only needed for
        # method-based filling, and not every pandas version ships them.
        # Importing them up front would break value-based filling as well.
        from pandas.core.missing import backfill_1d, pad_1d

        func = pad_1d if method == "pad" else backfill_1d
        new_values = func(self.astype(object), limit=limit, mask=mask)
        return self._from_sequence(new_values, dtype=self.dtype)

    # fill with value
    new_values = self.copy()
    new_values[mask] = value
    return new_values
def fillna(self, value=None, method=None, limit=None):
    """
    Fill NA/NaN values using the specified value or method.

    Parameters
    ----------
    value : scalar, array-like
        A scalar is used to fill every missing position. An array-like must
        be as long as 'self'; it is indexed with the missing-value mask, so
        only the entries at missing positions are used.
    method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
        pad / ffill: propagate last valid observation forward to next valid
        backfill / bfill: use NEXT valid observation to fill gap
    limit : int, default None
        Passed through to the pad/backfill helper when 'method' is given.
        The value-filling path does not use it.

    Returns
    -------
    filled : ExtensionArray with NA/NaN filled
        A new object in every case (a plain copy when nothing is missing).

    Raises
    ------
    ValueError
        When an array-like 'value' has a different length than 'self'.
        ``validate_fillna_kwargs`` may also raise for the arguments it
        rejects.
    """
    from pandas.api.types import is_array_like
    from pandas.util._validators import validate_fillna_kwargs

    value, method = validate_fillna_kwargs(value, method)
    missing = self.isna()

    if is_array_like(value):
        if len(value) != len(self):
            raise ValueError(
                "Length of 'value' does not match. Got ({}) "
                "expected {}".format(len(value), len(self))
            )
        value = value[missing]

    if missing.any() and method is not None:
        # Deferred import: the pad/backfill helpers are private to pandas
        # and absent from some versions. Keeping the import here lets the
        # other code paths run without them.
        from pandas.core.missing import backfill_1d, pad_1d

        func = pad_1d if method == 'pad' else backfill_1d
        new_values = func(self.astype(object), limit=limit, mask=missing)
        return self._from_sequence(new_values, dtype=self.dtype)

    new_values = self.copy()
    if missing.any():
        # fill with value
        new_values[missing] = value
    return new_values
def __init__(self, array, dtype=None, copy=None):
    """Build the array from list-like data or an existing pyarrow array.

    Parameters
    ----------
    array : list, array-like, pa.Array or pa.ChunkedArray
        List-likes have every element passed through ``_unwrap_geometry``
        and are then converted with ``pa.array``.  A ``pa.Array`` is stored
        as-is; a ``pa.ChunkedArray`` is flattened with ``pa.concat_arrays``.
    dtype : GeometryDtype, pa.DataType, element dtype or None
        Determines the arrow type used when converting list-like input.
        ``None`` or the NumPy object dtype lets arrow infer the type.  For
        empty input with no dtype, ``'float64'`` is assumed.
    copy : ignored
        Accepted but not referenced by this constructor.

    Raises
    ------
    ValueError
        If ``array`` is none of the supported container types.
    """
    # Choose default dtype for empty arrays.  Only a failing len() on an
    # unsized object is tolerated here; a bare ``except`` would also hide
    # KeyboardInterrupt / SystemExit.
    if dtype is None:
        try:
            if len(array) == 0:
                dtype = 'float64'
        except TypeError:
            # len() is not supported; such input is rejected further down
            # unless it is one of the supported container types.
            pass

    # See if we can determine arrow array type
    if isinstance(dtype, GeometryDtype):
        # Use arrow type as-is
        arrow_dtype = dtype.arrow_dtype
    elif isinstance(dtype, pa.DataType):
        arrow_dtype = dtype
    elif dtype is not None and dtype != np.dtype('object'):
        # Scalar element dtype
        arrow_dtype = self._arrow_type_from_numpy_element_dtype(dtype)
    else:
        # Let arrow infer type
        arrow_dtype = None

    if is_array_like(array) or isinstance(array, list):
        # Unwrap geometry elements before handing the data to arrow.
        array = [_unwrap_geometry(el, self._element_type) for el in array]
        array = pa.array(array, type=arrow_dtype)
    elif isinstance(array, pa.Array):
        # Nothing to do
        pass
    elif isinstance(array, pa.ChunkedArray):
        array = pa.concat_arrays(array.chunks)
    else:
        raise ValueError(
            "Unsupported type passed for {}: {}".format(
                self.__class__.__name__, type(array)
            )
        )

    # Save off pyarrow array
    self.data = array

    # Derive the NumPy element type and extension dtype from the arrow type.
    np_type = self._numpy_element_dtype_from_arrow_type(self.data.type)
    self._numpy_element_type = np.dtype(np_type)
    self._dtype = self._dtype_class(np_type)

    # Initialize backing property for spatial index
    self._sindex = None
def __init__(self, array, dtype=None, copy=None):
    """Wrap ``array`` in a ``pa.ChunkedArray`` and record dtype and offsets.

    Parameters
    ----------
    array : list, array-like, pa.Array or pa.ChunkedArray
        List-likes are converted with ``pa.array(array, type=dtype)`` and
        wrapped in a single chunk; a ``pa.Array`` becomes a single chunk; a
        ``pa.ChunkedArray`` is stored unchanged.
    dtype : optional
        Only used as the ``type`` argument when converting list-like input.
    copy : ignored
        Not used at the moment.  Its only effect will come once ``array``
        may itself be a FletcherArray.

    Raises
    ------
    ValueError
        If ``array`` is none of the supported types.
    """
    if is_array_like(array) or isinstance(array, list):
        converted = pa.array(array, type=dtype)
        chunked = pa.chunked_array([converted])
    elif isinstance(array, pa.Array):
        # TODO: Assert dtype
        chunked = pa.chunked_array([array])
    elif isinstance(array, pa.ChunkedArray):
        # TODO: Assert dtype
        chunked = array
    else:
        message = "Unsupported type passed for {}: {}".format(
            self.__class__.__name__, type(array)
        )
        raise ValueError(message)

    self.data = chunked
    self._dtype = FletcherDtype(self.data.type)
    self.offsets = self._calculate_chunk_offsets()
def fillna(self, value=None, method=None, limit=None):
    """Return a new array with missing entries filled.

    Parameters
    ----------
    value : scalar or array-like, optional
        Replacement for missing entries.  An array-like must be as long as
        ``self`` and is reduced to its entries at missing positions; any
        other value is wrapped in a one-element list.
    method : str, optional
        Fill method, normalised by pandas' ``validate_fillna_kwargs``.
        ``"pad"`` selects ``pad_1d``; anything else selects ``backfill_1d``.
    limit : int, optional
        Forwarded to the pad/backfill helper when ``method`` is given.

    Returns
    -------
    A new array; ``self.copy()`` when nothing is missing.

    Raises
    ------
    ValueError
        If an array-like ``value`` has the wrong length, or if the fill
        value is inferred to be neither ``"string"`` nor ``"bytes"``.
    """
    from pandas.util._validators import validate_fillna_kwargs

    value, method = validate_fillna_kwargs(value, method)
    mask = self.isna()

    from pandas.api.types import infer_dtype, is_array_like

    if not is_array_like(value):
        # pandas' infer_dtype cannot work on a bare scalar, so wrap the
        # value in a list.
        value = [value]
    elif len(value) == len(self):
        value = value[mask]
    else:
        raise ValueError(
            f"Length of 'value' does not match. Got ({len(value)}) "
            f"expected {len(self)}")

    if not mask.any():
        return self.copy()

    if method is not None:
        from pandas.core.missing import backfill_1d
        from pandas.core.missing import pad_1d

        filler = pad_1d if method == "pad" else backfill_1d
        filled = filler(self.astype(object), limit=limit, mask=mask)
        return self._from_sequence(filled, dtype=self.dtype)

    # Translate the fill value unless it already carries a GeoDtype.
    dtype_or_value = getattr(value, "dtype", value)
    if not isinstance(dtype_or_value, (GeoDtype, type(None))):
        kind = infer_dtype(value, skipna=True)
        if kind == "string":
            value = arctern.ST_GeomFromText(value)
        elif kind != "bytes":
            raise ValueError(
                "can only fillna with wkt formed string or wkb formed bytes"
            )

    # Fill with value.
    result = self.copy()
    result[mask] = value
    return result
def __init__(self, array, dtype=None, copy=None):
    """Wrap ``array`` in a ``pa.ChunkedArray`` and record its dtype.

    Parameters
    ----------
    array : list, array-like, pa.Array or pa.ChunkedArray
        List-likes are converted with ``pa.array(array, type=dtype)`` and
        wrapped in a single chunk; a ``pa.Array`` becomes a single chunk; a
        ``pa.ChunkedArray`` is stored unchanged.
    dtype : optional
        Only used as the ``type`` argument when converting list-like input.
    copy : ignored
        Not used at the moment.  Its only effect will come once ``array``
        may itself be a FletcherArray.

    Raises
    ------
    ValueError
        If ``array`` is none of the supported types.
    """
    if is_array_like(array) or isinstance(array, list):
        converted = pa.array(array, type=dtype)
        chunked = pa.chunked_array([converted])
    elif isinstance(array, pa.Array):
        # ARROW-7008: pyarrow.chunked_array([array]) fails on an array whose
        # buffers are all None, so rebuild an empty array of the same type.
        if len(array) == 0 and all(buf is None for buf in array.buffers()):
            array = pa.array([], type=array.type)
        # TODO: Assert dtype
        chunked = pa.chunked_array([array])
    elif isinstance(array, pa.ChunkedArray):
        # TODO: Assert dtype
        chunked = array
    else:
        raise ValueError(
            f"Unsupported type passed for {self.__class__.__name__}: {type(array)}"
        )

    self.data = chunked
    self._dtype = FletcherDtype(self.data.type)
def infer_dtype_bydata(data):
    """Infer a ``DataType`` for ``data``.

    The checks run in this order:

    1. A scalar is classified by ``infer_dtype_by_scaladata``.
    2. For list-like or array-like input, ``infer_dtype`` is tried.  When it
       succeeds the result is final: ``DataType.FLOAT_VECTOR`` if the mapped
       type is numeric, otherwise ``DataType.UNKNOWN``.
    3. If ``infer_dtype`` raised ``TypeError`` (or the input is neither
       list-like nor array-like), the first element is probed and, if it is
       a non-None scalar, classified by ``infer_dtype_by_scaladata``.
    4. If the type is still unknown and ``data`` has a ``dtype`` attribute,
       that dtype is mapped with ``map_numpy_dtype_to_datatype``.

    Parameters
    ----------
    data : scalar, list-like or array-like

    Returns
    -------
    DataType
        ``DataType.UNKNOWN`` when nothing could be inferred.
    """
    if is_scalar(data):
        return infer_dtype_by_scaladata(data)

    if is_list_like(data) or is_array_like(data):
        try:
            type_str = infer_dtype(data)
        except TypeError:
            # Fall through to probing the first element / dtype attribute.
            pass
        else:
            d_type = dtype_str_map.get(type_str, DataType.UNKNOWN)
            if is_numeric_datatype(d_type):
                return DataType.FLOAT_VECTOR
            return DataType.UNKNOWN

    d_type = DataType.UNKNOWN

    # Best-effort probe of the first element.  ``data`` can be any container,
    # so indexing may fail in many ways (IndexError, KeyError, TypeError, ...).
    # Catch Exception rather than using a bare ``except`` so that
    # KeyboardInterrupt / SystemExit still propagate.
    try:
        elem = data[0]
    except Exception:
        elem = None

    if elem is not None and is_scalar(elem):
        d_type = infer_dtype_by_scaladata(elem)

    if d_type == DataType.UNKNOWN:
        _dtype = getattr(data, "dtype", None)
        if _dtype is not None:
            d_type = map_numpy_dtype_to_datatype(_dtype)

    return d_type
def to_geometry_array(data, dtype=None):
    """Convert ``data`` to a geometry array.

    Parameters
    ----------
    data : geometry array, array-like, list, tuple, GeoSeries or GeometryArray
        Input that already satisfies ``is_geometry_array`` is returned
        unchanged.
    dtype : optional
        Target dtype (a string is normalised through ``pd.array``).  When
        given, the dtype's array type is constructed from ``data`` directly.

    Returns
    -------
    A geometry array.  Without ``dtype`` the array type is chosen from the
    first non-None element: a ``Geometry`` scalar supplies its own array
    type, a shapely geometry is looked up in a shapely-to-array mapping.

    Raises
    ------
    ValueError
        If ``data`` is not a supported container, is empty while ``dtype``
        is None, or its first non-None element is not a recognised geometry.
    """
    from . import (
        LineArray,
        MultiLineArray,
        MultiPointArray,
        MultiPolygonArray,
        PointArray,
        PolygonArray,
        RingArray,
    )

    shapely_to_spatialpandas = {}
    if sg is not None:
        shapely_to_spatialpandas = {
            sg.Point: PointArray,
            sg.MultiPoint: MultiPointArray,
            sg.LineString: LineArray,
            sg.LinearRing: RingArray,
            sg.MultiLineString: MultiLineArray,
            sg.Polygon: PolygonArray,
            sg.MultiPolygon: MultiPolygonArray,
        }

    # Normalize dtype from string
    if dtype is not None:
        dtype = pd.array([], dtype=dtype).dtype

    err_msg = "Unable to convert data argument to a GeometryList array"

    if is_geometry_array(data):
        # Already a geometry array: keep as is.
        return data

    is_collection = (
        is_array_like(data)
        or isinstance(data, (list, tuple))
        or gp and isinstance(data, (gp.GeoSeries, gp.array.GeometryArray))
    )
    if not is_collection:
        raise ValueError(err_msg)

    if dtype is not None:
        return dtype.construct_array_type()(data, dtype=dtype)

    if len(data) == 0:
        raise ValueError(
            "Cannot infer spatialpandas geometry type from empty collection "
            "without dtype.\n"
        )

    # The first non-None element decides which array type to build.
    first_valid = next((val for val in data if val is not None), None)

    if isinstance(first_valid, Geometry):
        # Pass data to constructor of appropriate geometry array
        return first_valid.construct_array_type()(data)

    if type(first_valid) not in shapely_to_spatialpandas:
        raise ValueError(err_msg)

    # A collection may mix single and multi geometries (LineString with
    # MultiLineString, Polygon with MultiPolygon).  If a multi variant is
    # present, let it pick the array type.
    if isinstance(first_valid, sg.LineString):
        multi_type = sg.MultiLineString
    elif isinstance(first_valid, sg.Polygon):
        multi_type = sg.MultiPolygon
    else:
        multi_type = None

    if multi_type is not None:
        first_valid = next(
            (val for val in data if isinstance(val, multi_type)), first_valid
        )

    array_type = shapely_to_spatialpandas[type(first_valid)]
    return array_type.from_geopandas(data)
def take(self, indices: Sequence[int], allow_fill: bool = False, fill_value: Any = None):
    """
    Take elements from the array.

    Parameters
    ----------
    indices : sequence of int
        Positions to take.
    allow_fill : bool, default False
        How negative values in `indices` are interpreted.

        * False: negative values are positions counted from the right
          (similar to :func:`numpy.take`).
        * True: negative values mark missing entries, which are set to
          `fill_value`.  The indices are checked with ``validate_indices``.

    fill_value : any, optional
        Value written to the missing entries when `allow_fill` is True.
        If it is NA (per ``isna``), the entries are left as produced by the
        underlying take.

    Returns
    -------
    An instance of ``type(self)``.

    Raises
    ------
    IndexError
        When taking a non-negative position from an empty array, or when
        the largest index is ``>= len(self._data)``.

    See Also
    --------
    numpy.take
    api.extensions.take
    """
    # TODO: Remove once we got rid of the (indices < 0) check
    if is_array_like(indices):
        # error: Incompatible types in assignment (expression has type
        # "Sequence[int]", variable has type "ndarray")
        indices_array = indices  # type: ignore[assignment]
    else:
        indices_array = np.asanyarray(indices)

    size = len(self._data)
    if size == 0 and (indices_array >= 0).any():
        raise IndexError("cannot do a non-empty take")
    if indices_array.size > 0 and indices_array.max() >= size:
        raise IndexError("out of bounds value in 'indices'.")

    if not allow_fill:
        # TODO(ARROW-9432): Treat negative indices as indices from the right.
        if (indices_array < 0).any():
            # Work on a copy so the caller's indices are not modified.
            wrapped = np.copy(indices_array)
            wrapped[wrapped < 0] += size
            indices_array = wrapped
        return type(self)(self._data.take(indices_array))

    fill_mask = indices_array < 0
    if not fill_mask.any():
        # Nothing to fill
        return type(self)(self._data.take(indices))

    validate_indices(indices_array, size)
    # TODO(ARROW-9433): Treat negative indices as NULL
    masked_indices = pa.array(indices_array, mask=fill_mask)
    taken = self._data.take(masked_indices)
    if isna(fill_value):
        return type(self)(taken)

    # TODO: ArrowNotImplementedError: Function fill_null has no
    # kernel matching input types (array[string], scalar[string])
    result = type(self)(taken)
    result[fill_mask] = fill_value
    return result
def __getitem__(self, item):
    """Select a subset of self.

    Parameters
    ----------
    item : int, slice, or iterable of int / bool
        * int: the position in ``self`` to get; negative values are offset
          by ``len(self)``.
        * slice: a slice object whose ``start``, ``stop`` and ``step`` are
          integers or None.
        * iterable: integer positions, or a boolean mask; both are resolved
          through ``self.take``.

    Returns
    -------
    scalar, None, or instance of ``type(self)``
        For an int, the Python value of the element (``as_py()``), or
        ``None`` when the (offset) position is ``>= len(self)``.  For a
        slice, an instance of ``type(self)``.  For an iterable, whatever
        ``self.take`` returns.

    Raises
    ------
    IndexError
        If an iterable ``item`` has neither an integer nor a boolean dtype.
    """
    if isinstance(item, Iterable):
        if not is_array_like(item):
            item = np.array(item)
        if is_integer_dtype(item):
            return self.take(item)
        if is_bool_dtype(item):
            # Convert the mask to the positions of its True entries.
            positions = np.argwhere(np.array(item)).flatten()
            return self.take(positions)
        raise IndexError(
            "Only integers, slices and integer or boolean arrays are valid indices."
        )

    if isinstance(item, slice):
        n_rows = len(self.data)
        lower = item.start or 0
        upper = n_rows if item.stop is None else item.stop
        upper = min(upper, n_rows)
        stride = 1 if item.step is None else item.step

        # Arrow can't handle slices with steps other than 1
        # https://issues.apache.org/jira/browse/ARROW-2714
        if stride != 1:
            arr = np.asarray(self)[item]
            arrow_type = self.dtype.arrow_dtype
            # ARROW-2806: Inconsistent handling of np.nan requires adding a mask
            is_numeric = pa.types.is_integer(arrow_type) or pa.types.is_floating(
                arrow_type
            )
            mask = pd.isna(arr) if is_numeric else None
            return type(self)(pa.array(arr, type=arrow_type, mask=mask))

        # Workaround for an Arrow bug that segfaults on empty slices: build
        # the empty array directly instead of slicing.
        if upper - lower == 0:
            return type(self)(pa.array([], type=self.data.type))
    elif is_integer(item):
        if item < 0:
            item += len(self)
        if item >= len(self):
            return None

    selected = self.data[item]
    if isinstance(selected, pa.ChunkedArray):
        return type(self)(selected)
    return selected.as_py()