def get_ind(sdf, ind):
    """Resolve the KDE evaluation points for the column(s) selected by ``sdf``.

    Parameters
    ----------
    sdf : Spark DataFrame selecting the numeric column(s) to plot.
    ind : None, integer, or a precomputed sequence of evaluation points.

    Returns
    -------
    ``ind`` unchanged when it is already a sequence; otherwise a
    ``numpy.linspace`` spanning the data's min/max padded outward by half
    the sample range — 1000 points when ``ind`` is None, ``ind`` points
    when it is an integer.
    """

    def _extremes():
        # One Spark job returning (global min, global max). With several
        # columns, aggregate per-column and combine with least/greatest;
        # with one column, aggregate it directly.
        if len(sdf.columns) > 1:
            lo = F.least(*map(F.min, sdf))
            hi = F.greatest(*map(F.max, sdf))
        else:
            lo = F.min(sdf.columns[-1])
            hi = F.max(sdf.columns[-1])
        return sdf.select(lo, hi).first()

    # Map both "auto" cases onto a point count; any other ind passes through.
    num_points = 1000 if ind is None else (ind if is_integer(ind) else None)
    if num_points is not None:
        low, high = _extremes()
        spread = high - low
        ind = np.linspace(low - 0.5 * spread, high + 0.5 * spread, num_points)
    return ind
def _get_ind(self, y):
    """Return the KDE evaluation points for ``y``.

    Parameters
    ----------
    y : Spark DataFrame that selects exactly one column.

    Returns
    -------
    ``self.ind`` unchanged when it is already a sequence; otherwise a
    ``numpy.linspace`` over the column's min/max padded outward by half the
    sample range (1000 points when ``self.ind`` is None, ``self.ind`` points
    when it is an integer).
    """

    def _extremes():
        # Single Spark job computing both the column minimum and maximum.
        return y.select(F.min(y.columns[-1]), F.max(y.columns[-1])).first()

    if self.ind is None:
        min_val, max_val = _extremes()
        sample_range = max_val - min_val
        ind = np.linspace(
            min_val - 0.5 * sample_range,
            max_val + 0.5 * sample_range,
            1000,
        )
    elif is_integer(self.ind):
        min_val, max_val = _extremes()
        # BUG FIX: this branch previously computed
        # ``np.nanmax(y) - np.nanmin(y)`` directly on the Spark DataFrame,
        # which is not a valid numpy operand and was inconsistent with the
        # Spark-side aggregation used in the branch above. Use the
        # aggregated min/max, matching the ``ind is None`` branch.
        sample_range = max_val - min_val
        ind = np.linspace(
            min_val - 0.5 * sample_range,
            max_val + 0.5 * sample_range,
            self.ind,
        )
    else:
        # Caller supplied explicit evaluation points; use them as-is.
        ind = self.ind
    return ind
def ensure_python_int(value: int | np.integer) -> int:
    """
    Ensure that a value is a python int.

    Parameters
    ----------
    value: int or numpy.integer

    Returns
    -------
    int

    Raises
    ------
    TypeError: if the value isn't an int or can't be converted to one.
    """
    # Reject anything that isn't an integral or floating scalar up front,
    # distinguishing non-scalars from scalars of the wrong type.
    if not (is_integer(value) or is_float(value)):
        if is_scalar(value):
            raise TypeError(f"Wrong type {type(value)} for value {value}")
        raise TypeError(
            f"Value needs to be a scalar value, was type {type(value).__name__}"
        )
    # Converting must be lossless: a float like 2.5 truncates and fails the
    # equality check, surfacing as the same TypeError as a failed int().
    try:
        converted = int(value)
        assert converted == value
    except (TypeError, ValueError, AssertionError) as err:
        raise TypeError(f"Wrong type {type(value)} for value {value}") from err
    return converted
def prepare_hist_data(data, bins):
    """Validate histogram input and resolve an integer bin count to edges.

    Parameters
    ----------
    data : the plotting input; delegated to ``NumericPlotBase`` for
        numeric-column selection and validation.
    bins : integer bin count or a precomputed sequence of bin boundaries.

    Returns
    -------
    Tuple of (numeric data, bin boundaries).
    """
    data, numeric_data = NumericPlotBase.prepare_numeric_data(data)
    # An integer count is expanded into explicit boundaries over the data;
    # an already-computed sequence of edges passes through untouched.
    bins = (
        HistogramPlotBase.get_bins(data._to_spark(), bins)
        if is_integer(bins)
        else bins
    )
    return numeric_data, bins
def _args_adjust(self):
    """Normalize plot arguments before rendering.

    Resolves an integer ``self.bins`` into explicit bin boundaries via the
    data summary, and coerces a list-like ``self.bottom`` to an ndarray.
    """
    bin_spec = self.bins
    if is_integer(bin_spec):
        # An integer bin count needs the column summary to compute the
        # actual boundaries.
        self.bins = KoalasHistPlotSummary(self.data, self.data.name).get_bins(bin_spec)
    if is_list_like(self.bottom):
        self.bottom = np.array(self.bottom)
def _finalize_output(self, frame: DataFrame) -> DataFrame:
    """
    Processes data read in based on kwargs.

    Applies header/name handling, date conversions, index-column resolution,
    and an optional dtype cast, mutating ``self.names`` / ``self.index_col``
    along the way.

    Parameters
    ----------
    frame: DataFrame
        The DataFrame to process.

    Returns
    -------
    DataFrame
        The processed DataFrame.
    """
    num_cols = len(frame.columns)
    # Set to False only when names are padded below, so the padded
    # (placeholder) index names can be cleared at the end.
    multi_index_named = True
    if self.header is None:
        if self.names is None:
            if self.prefix is not None:
                # Synthesize column names from the user-supplied prefix.
                self.names = [f"{self.prefix}{i}" for i in range(num_cols)]
            elif self.header is None:
                # No names and no prefix: fall back to positional names.
                self.names = range(num_cols)
        if len(self.names) != num_cols:
            # usecols is passed through to pyarrow, we only handle index col here
            # The only way self.names is not the same length as number of cols is
            # if we have int index_col. We should just pad the names(they will get
            # removed anyways) to expected length then.
            self.names = list(
                range(num_cols - len(self.names))) + self.names
            multi_index_named = False
        frame.columns = self.names
    # we only need the frame not the names
    # error: Incompatible types in assignment (expression has type
    # "Union[List[Union[Union[str, int, float, bool], Union[Period, Timestamp,
    # Timedelta, Any]]], Index]", variable has type "Index") [assignment]
    frame.columns, frame = self._do_date_conversions(  # type: ignore[assignment]
        frame.columns, frame)
    if self.index_col is not None:
        # Resolve integer positions in index_col to actual column labels and
        # validate string labels before setting the index.
        for i, item in enumerate(self.index_col):
            if is_integer(item):
                self.index_col[i] = frame.columns[item]
            else:
                # String case
                if item not in frame.columns:
                    raise ValueError(f"Index {item} invalid")
        frame.set_index(self.index_col, drop=True, inplace=True)
        # Clear names if headerless and no name given
        if self.header is None and not multi_index_named:
            frame.index.names = [None] * len(frame.index.names)

    if self.kwds.get("dtype") is not None:
        try:
            frame = frame.astype(self.kwds.get("dtype"))
        except TypeError as e:
            # GH#44901 reraise to keep api consistent
            raise ValueError(e)
    return frame
def _finalize_output(self, frame: DataFrame) -> DataFrame:
    """
    Processes data read in based on kwargs.

    Applies header/name handling, date conversions, index-column resolution,
    and an optional dtype cast, mutating ``self.names`` / ``self.index_col``
    along the way.

    Parameters
    ----------
    frame: DataFrame
        The DataFrame to process.

    Returns
    -------
    DataFrame
        The processed DataFrame.
    """
    num_cols = len(frame.columns)
    # Set to False only when names are padded below, so the padded
    # (placeholder) index names can be cleared at the end.
    multi_index_named = True
    if self.header is None:
        if self.names is None:
            if self.prefix is not None:
                # Synthesize column names from the user-supplied prefix.
                self.names = [f"{self.prefix}{i}" for i in range(num_cols)]
            elif self.header is None:
                # No names and no prefix: fall back to positional names.
                self.names = range(num_cols)
        if len(self.names) != num_cols:
            # usecols is passed through to pyarrow, we only handle index col here
            # The only way self.names is not the same length as number of cols is
            # if we have int index_col. We should just pad the names(they will get
            # removed anyways) to expected length then.
            self.names = list(
                range(num_cols - len(self.names))) + self.names
            multi_index_named = False
        frame.columns = self.names
    # we only need the frame not the names
    frame.columns, frame = self._do_date_conversions(frame.columns, frame)
    if self.index_col is not None:
        # Resolve integer positions in index_col to actual column labels and
        # validate string labels before setting the index.
        for i, item in enumerate(self.index_col):
            if is_integer(item):
                self.index_col[i] = frame.columns[item]
            else:
                # String case
                if item not in frame.columns:
                    raise ValueError(f"Index {item} invalid")
        frame.set_index(self.index_col, drop=True, inplace=True)
        # Clear names if headerless and no name given
        if self.header is None and not multi_index_named:
            frame.index.names = [None] * len(frame.index.names)

    if self.kwds.get("dtype") is not None:
        frame = frame.astype(self.kwds.get("dtype"))
    return frame
def prepare_hist_data(data, bins):
    """Select the numeric columns of ``data`` and resolve ``bins`` to edges.

    Parameters
    ----------
    data : pyspark.pandas Series or DataFrame.
    bins : integer bin count or a precomputed sequence of boundaries.

    Returns
    -------
    Tuple of (numeric-only data, bin boundaries).

    Raises
    ------
    TypeError
        When no numeric columns remain after filtering.
    """
    # TODO: this logic is similar with KdePlotBase. Might have to deduplicate it.
    from pyspark.pandas.series import Series

    frame = data.to_frame() if isinstance(data, Series) else data

    plottable_types = [
        "byte", "decimal", "integer", "float", "long", "double", np.datetime64
    ]
    numeric_data = frame.select_dtypes(include=plottable_types)

    # no empty frames or series allowed
    if not len(numeric_data.columns):
        raise TypeError("Empty {0!r}: no numeric data to "
                        "plot".format(numeric_data.__class__.__name__))

    if is_integer(bins):
        # computes boundaries for the column
        bins = HistogramPlotBase.get_bins(frame.to_spark(), bins)

    return numeric_data, bins
def _compute_plot_data(self):
    """Filter ``self.data`` down to numeric columns and resolve bins.

    Replaces ``self.data`` with its numeric-only projection and, when
    ``self.bins`` is an integer, expands it into explicit bin boundaries.

    Raises
    ------
    TypeError
        When no numeric columns remain after filtering.
    """
    # TODO: this logic is same with KdePlot. Might have to deduplicate it.
    from databricks.koalas.series import Series

    frame = self.data
    if isinstance(frame, Series):
        frame = frame.to_frame()

    plottable_types = [
        "byte", "decimal", "integer", "float", "long", "double", np.datetime64
    ]
    numeric_data = frame.select_dtypes(include=plottable_types)

    # no empty frames or series allowed
    if not len(numeric_data.columns):
        raise TypeError(
            "Empty {0!r}: no numeric data to " "plot".format(numeric_data.__class__.__name__)
        )

    if is_integer(self.bins):
        # computes boundaries for the column
        self.bins = self._get_bins(frame.to_spark(), self.bins)

    self.data = numeric_data
def get_ind(sdf, ind):
    """Resolve the KDE evaluation points for the column selected by ``sdf``.

    Parameters
    ----------
    sdf : Spark DataFrame that selects one column.
    ind : None, integer, or a precomputed sequence of evaluation points.

    Returns
    -------
    ``ind`` unchanged when it is already a sequence; otherwise a
    ``numpy.linspace`` over the column's min/max padded outward by half the
    sample range (1000 points when ``ind`` is None, ``ind`` points when it
    is an integer).
    """
    # 'sdf' is a Spark DataFrame that selects one column.
    if ind is None:
        min_val, max_val = sdf.select(F.min(sdf.columns[-1]), F.max(sdf.columns[-1])).first()
        sample_range = max_val - min_val
        ind = np.linspace(
            min_val - 0.5 * sample_range,
            max_val + 0.5 * sample_range,
            1000,
        )
    elif is_integer(ind):
        min_val, max_val = sdf.select(F.min(sdf.columns[-1]), F.max(sdf.columns[-1])).first()
        # BUG FIX: this was ``min_val - max_val``, producing a negative
        # range that flipped the padding inward (shrinking the evaluation
        # window instead of extending it). The ``ind is None`` branch above
        # has the correct orientation.
        sample_range = max_val - min_val
        ind = np.linspace(
            min_val - 0.5 * sample_range,
            max_val + 0.5 * sample_range,
            ind,
        )
    return ind
def _args_adjust(self):
    """Prepare histogram arguments from ``self.data``.

    Verifies that numeric columns exist, resolves an integer ``self.bins``
    into explicit boundaries via the column summary, and coerces a
    list-like ``self.bottom`` to an ndarray.

    Raises
    ------
    TypeError
        When no numeric columns remain after filtering.
    """
    from databricks.koalas.series import Series

    frame = self.data
    if isinstance(frame, Series):
        frame = frame.to_frame()

    plottable_types = ['byte', 'decimal', 'integer', 'float', 'long',
                       'double', np.datetime64]
    numeric_data = frame.select_dtypes(include=plottable_types)

    # no empty frames or series allowed
    if len(numeric_data.columns) == 0:
        raise TypeError('Empty {0!r}: no numeric data to '
                        'plot'.format(numeric_data.__class__.__name__))

    if is_integer(self.bins):
        # computes boundaries for the column
        col_summary = KoalasHistPlotSummary(self.data, self.data.name)
        self.bins = col_summary.get_bins(self.bins)

    if is_list_like(self.bottom):
        self.bottom = np.array(self.bottom)