def __setitem__(self, key, value):
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import _col

    super(iLocIndexer, self).__setitem__(key, value)

    if self._is_series:
        internal = self._kdf_or_kser._internal
        sdf = internal.spark_frame.select(internal.index_spark_columns + [internal.spark_column])
        internal = internal.copy(
            spark_frame=sdf,
            column_labels=[internal.column_labels[0] or ("0",)],
            data_spark_columns=[scol_for(sdf, internal.data_spark_column_names[0])],
            spark_column=None,
        )
        kser = _col(DataFrame(internal))

        self._kdf_or_kser._internal = kser._internal
        self._kdf_or_kser._kdf = kser._kdf
    else:
        assert self._is_df

    # Clean up implicitly cached properties to be able to reuse the indexer.
    del self._internal
    del self._sequence_col
def __setitem__(self, key, value):
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import _col

    super(iLocIndexer, self).__setitem__(key, value)

    if self._is_series:
        internal = self._kdf_or_kser._internal
        sdf = internal.spark_frame.select(internal.index_spark_columns + [internal.spark_column])
        internal = internal.copy(
            spark_frame=sdf,
            column_labels=[internal.column_labels[0] or ("0",)],
            data_spark_columns=[scol_for(sdf, internal.data_spark_column_names[0])],
            spark_column=None,
        )
        kser = _col(DataFrame(internal))

        self._kdf_or_kser._internal = kser._internal
        self._kdf_or_kser._kdf = kser._kdf
    else:
        assert self._is_df
        # TODO: support DataFrame.

    delattr(self, "_lazy__internal")
    delattr(self, "_lazy__sequence_col")
def rank(self, method='average', ascending=True):
    kdf = super(SeriesGroupBy, self).rank(method, ascending).to_dataframe()
    return _col(DataFrame(kdf._internal.copy(
        sdf=kdf._sdf.select(kdf._internal.data_scols),
        index_map=[])))  # index is lost.
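# Illustrative usage sketch (not part of the snippet above; data made up): how the
# SeriesGroupBy.rank implementation above is typically reached through the public API.
# Per the trailing comment in the implementation ("index is lost"), this version does
# not carry the original index on the result.
import databricks.koalas as ks

kdf = ks.DataFrame({'A': [1, 1, 2, 2], 'B': [10, 30, 20, 40]})
ranked = kdf.groupby('A')['B'].rank()  # average rank of B within each group of A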
def _apply_as_series_or_frame(self, func):
    """
    Wraps a function that handles Spark column in order to support it in both Koalas Series and
    DataFrame. Note that the given `func` name should be same as the API's method name.
    """
    from databricks.koalas import DataFrame
    from databricks.koalas.series import _col
    from databricks.koalas.groupby import SeriesGroupBy

    kdf = self.kdf
    sdf = self.kdf._sdf

    # Here we need to include grouped key as an index, and shift previous index.
    #   [index_column0, index_column1] -> [grouped key, index_column0, index_column1]
    new_index_scols = []
    new_index_map = []
    for groupkey in self._groupkeys:
        new_index_scols.append(
            # NOTE THAT this code intentionally uses `F.col` instead of `scol` in
            # given series. This is because, in case of series, we convert it into
            # DataFrame. So, if the given `groupkeys` is a series, they end up with
            # being a different series.
            F.col(name_like_string(groupkey.name))
            .alias(SPARK_INDEX_NAME_FORMAT(len(new_index_scols))))
        new_index_map.append((SPARK_INDEX_NAME_FORMAT(len(new_index_map)),
                              groupkey._internal.column_index[0]))

    for new_index_scol, index_map in zip(kdf._internal.index_scols, kdf._internal.index_map):
        new_index_scols.append(
            new_index_scol.alias(SPARK_INDEX_NAME_FORMAT(len(new_index_scols))))
        _, name = index_map
        new_index_map.append((SPARK_INDEX_NAME_FORMAT(len(new_index_map)), name))

    applied = []
    for column in kdf.columns:
        applied.append(kdf[column]._with_new_scol(func(kdf[column]._scol))
                       .rename(kdf[column].name))

    # Seems like pandas filters out when grouped key is NA.
    cond = self._groupkeys[0]._scol.isNotNull()
    for c in self._groupkeys:
        cond = cond | c._scol.isNotNull()

    sdf = sdf.select(new_index_scols + [c._scol for c in applied]).filter(cond)

    internal = _InternalFrame(
        sdf=sdf,
        data_columns=[c._internal.data_columns[0] for c in applied],
        index_map=new_index_map)

    ret = DataFrame(internal)
    if isinstance(self._groupby, SeriesGroupBy):
        return _col(ret)
    else:
        return ret
def __setitem__(self, key, value):
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series, _col

    if (not isinstance(key, tuple)) or (len(key) != 2):
        raise SparkPandasNotImplementedError(
            description="Only accepts pairs of candidates",
            pandas_function=".loc[..., ...] = ...",
            spark_target_function="withColumn, select")

    rows_sel, cols_sel = key

    if (not isinstance(rows_sel, slice)) or (rows_sel != slice(None)):
        raise SparkPandasNotImplementedError(
            description="""Can only assign value to the whole dataframe, the row index
            has to be `slice(None)` or `:`""",
            pandas_function=".loc[..., ...] = ...",
            spark_target_function="withColumn, select")

    if not isinstance(cols_sel, str):
        raise ValueError("""only column names can be assigned""")

    if isinstance(value, DataFrame):
        if len(value.columns) == 1:
            self._kdf[cols_sel] = _col(value)
        else:
            raise ValueError("Only a dataframe with one column can be assigned")
    else:
        self._kdf[cols_sel] = value
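# Illustrative usage sketch (assumption: public Koalas API with a local Spark session;
# the frame is made up). This older `.loc` setter only supports assigning to a whole
# column, i.e. the row selector must be `:`; anything else raises
# SparkPandasNotImplementedError.
import databricks.koalas as ks

kdf = ks.DataFrame({'a': [1, 2, 3]})
kdf.loc[:, 'b'] = 0  # scalar broadcast to every row of a (new) column 'b'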
def from_pandas(self, pdf):
    if isinstance(pdf, pd.Series):
        return _col(self.from_pandas(pd.DataFrame(pdf)))
    metadata = Metadata.from_pandas(pdf)
    reset_index = pdf.reset_index()
    reset_index.columns = metadata.all_fields
    df = self.createDataFrame(reset_index)
    df._metadata = metadata
    return df
def __setitem__(self, key, value):
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series, _col

    if (not isinstance(key, tuple)) or (len(key) != 2):
        raise SparkPandasNotImplementedError(
            description="Only accepts pairs of candidates",
            pandas_function=".loc[..., ...] = ...",
            spark_target_function="withColumn, select")

    rows_sel, cols_sel = key

    if (not isinstance(rows_sel, slice)) or (rows_sel != slice(None)):
        if isinstance(rows_sel, list):
            if isinstance(cols_sel, str):
                cols_sel = [cols_sel]
            kdf = self._kdf.copy()
            for col_sel in cols_sel:
                # Uses `kdf` to allow operations on different DataFrames.
                # TODO: avoid temp column name or declare `__` prefix is
                #  reserved for Koalas' internal columns.
                kdf["__indexing_temp_col__"] = value
                new_col = kdf["__indexing_temp_col__"]._scol
                kdf[col_sel] = Series(
                    kdf[col_sel]._internal.copy(
                        scol=F.when(kdf._internal.index_scols[0].isin(rows_sel), new_col)
                        .otherwise(kdf[col_sel]._scol)),
                    anchor=kdf)
                kdf = kdf.drop(labels=['__indexing_temp_col__'])

            self._kdf._internal = kdf._internal.copy()
        else:
            raise SparkPandasNotImplementedError(
                description="""Can only assign value to the whole dataframe, the row index
                has to be `slice(None)` or `:`""",
                pandas_function=".loc[..., ...] = ...",
                spark_target_function="withColumn, select")

    if not isinstance(cols_sel, (str, list)):
        raise ValueError("""only column names or list of column names can be assigned""")

    if isinstance(value, DataFrame):
        if len(value.columns) == 1:
            self._kdf[cols_sel] = _col(value)
        else:
            raise ValueError("Only a dataframe with one column can be assigned")
    else:
        if isinstance(cols_sel, str):
            cols_sel = [cols_sel]
        if (not isinstance(rows_sel, list)) and (isinstance(cols_sel, list)):
            for col_sel in cols_sel:
                self._kdf[col_sel] = value
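# Illustrative usage sketch (assumption: public Koalas API; the frame and labels are
# made up). This variant additionally accepts a list of index labels as the row
# selector and rewrites the target column with a conditional `isin(rows_sel)` expression
# under the hood.
import databricks.koalas as ks

kdf = ks.DataFrame({'max_speed': [1, 4, 7], 'shield': [2, 5, 8]},
                   index=['cobra', 'viper', 'sidewinder'])
kdf.loc[['viper', 'sidewinder'], 'shield'] = 50  # only the selected rows are updated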
def size(self):
    """
    Compute group sizes.

    See Also
    --------
    databricks.koalas.Series.groupby
    databricks.koalas.DataFrame.groupby

    Examples
    --------
    >>> df = ks.DataFrame({'A': [1, 2, 2, 3, 3, 3],
    ...                    'B': [1, 1, 2, 3, 3, 3]},
    ...                   columns=['A', 'B'])
    >>> df
       A  B
    0  1  1
    1  2  1
    2  2  2
    3  3  3
    4  3  3
    5  3  3

    >>> df.groupby('A').size().sort_index()  # doctest: +NORMALIZE_WHITESPACE
    A
    1    1
    2    2
    3    3
    Name: count, dtype: int64

    >>> df.groupby(['A', 'B']).size().sort_index()  # doctest: +NORMALIZE_WHITESPACE
    A  B
    1  1    1
    2  1    1
       2    1
    3  3    3
    Name: count, dtype: int64
    """
    groupkeys = self._groupkeys
    groupkey_cols = [s._scol.alias('__index_level_{}__'.format(i))
                     for i, s in enumerate(groupkeys)]
    sdf = self._kdf._sdf
    sdf = sdf.groupby(*groupkey_cols).count()
    if (len(self._agg_columns) > 0) and (self._have_agg_columns):
        name = self._agg_columns[0].name
        sdf = sdf.withColumnRenamed('count', name)
    else:
        name = 'count'
    internal = _InternalFrame(sdf=sdf,
                              data_columns=[name],
                              index_map=[('__index_level_{}__'.format(i), s.name)
                                         for i, s in enumerate(groupkeys)])
    return _col(DataFrame(internal))
def from_pandas(pdf):
    """Create DataFrame from pandas DataFrame.

    This is similar to `DataFrame.createDataFrame()` with pandas DataFrame, but this also picks
    the index in the given pandas DataFrame.

    :param pdf: :class:`pandas.DataFrame`
    """
    if isinstance(pdf, pd.Series):
        return _col(from_pandas(pd.DataFrame(pdf)))
    metadata = Metadata.from_pandas(pdf)
    reset_index = pdf.reset_index()
    reset_index.columns = metadata.all_fields
    df = default_session().createDataFrame(reset_index)
    df._metadata = metadata
    return df
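# Illustrative usage sketch (assumption: the module-level `from_pandas` above is exposed
# as `ks.from_pandas`; data made up). A pandas Series round-trips through a one-column
# DataFrame and comes back as a Koalas Series via `_col`; a pandas DataFrame keeps its index.
import pandas as pd
import databricks.koalas as ks

kser = ks.from_pandas(pd.Series([1, 2, 3], name='x'))              # -> Koalas Series
kdf = ks.from_pandas(pd.DataFrame({'a': [1, 2]}, index=[10, 20]))  # index is preserved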
def __getitem__(self, key):
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series

    def raiseNotImplemented(description):
        raise SparkPandasNotImplementedError(
            description=description,
            pandas_function=".loc[..., ...]",
            spark_target_function="select, where")

    rows_sel, cols_sel = _unfold(key, self._ks)

    sdf = self._kdf._sdf
    if isinstance(rows_sel, Series):
        sdf_for_check_schema = sdf.select(rows_sel._scol)
        assert isinstance(sdf_for_check_schema.schema.fields[0].dataType, BooleanType), \
            (str(sdf_for_check_schema), sdf_for_check_schema.schema.fields[0].dataType)
        sdf = sdf.where(rows_sel._scol)
    elif isinstance(rows_sel, slice):
        if rows_sel.step is not None:
            raiseNotImplemented("Cannot use step with Spark.")
        if rows_sel == slice(None):
            # If slice is None - select everything, so nothing to do
            pass
        elif len(self._kdf._internal.index_columns) == 0:
            raiseNotImplemented("Cannot use slice for Spark if no index provided.")
        elif len(self._kdf._internal.index_columns) == 1:
            start = rows_sel.start
            stop = rows_sel.stop

            index_column = self._kdf.index.to_series()
            index_data_type = index_column.schema[0].dataType
            cond = []
            if start is not None:
                cond.append(index_column._scol >= F.lit(start).cast(index_data_type))
            if stop is not None:
                cond.append(index_column._scol <= F.lit(stop).cast(index_data_type))

            if len(cond) > 0:
                sdf = sdf.where(reduce(lambda x, y: x & y, cond))
        else:
            raiseNotImplemented("Cannot use slice for MultiIndex with Spark.")
    elif isinstance(rows_sel, str):
        raiseNotImplemented("Cannot use a scalar value for row selection with Spark.")
    else:
        try:
            rows_sel = list(rows_sel)
        except TypeError:
            raiseNotImplemented("Cannot use a scalar value for row selection with Spark.")
        if len(rows_sel) == 0:
            sdf = sdf.where(F.lit(False))
        elif len(self._kdf._internal.index_columns) == 1:
            index_column = self._kdf.index.to_series()
            index_data_type = index_column.schema[0].dataType
            if len(rows_sel) == 1:
                sdf = sdf.where(
                    index_column._scol == F.lit(rows_sel[0]).cast(index_data_type))
            else:
                sdf = sdf.where(index_column._scol.isin(
                    [F.lit(r).cast(index_data_type) for r in rows_sel]))
        else:
            raiseNotImplemented("Cannot select with MultiIndex with Spark.")

    # make cols_sel a 1-tuple of string if a single string
    column_index = self._kdf._internal.column_index
    if isinstance(cols_sel, str):
        if column_index is not None:
            return self[rows_sel, [cols_sel]]._get_from_multiindex_column((cols_sel,))
        else:
            cols_sel = _make_col(cols_sel)
    elif isinstance(cols_sel, Series):
        cols_sel = _make_col(cols_sel)
    elif isinstance(cols_sel, slice) and cols_sel != slice(None):
        raise raiseNotImplemented("Can only select columns either by name or reference or all")
    elif isinstance(cols_sel, slice) and cols_sel == slice(None):
        cols_sel = None

    if cols_sel is None:
        columns = self._kdf._internal.data_scols
    elif isinstance(cols_sel, spark.Column):
        columns = [cols_sel]
    else:
        if column_index is not None:
            column_to_index = list(zip(self._kdf._internal.data_columns,
                                       self._kdf._internal.column_index))
            columns, column_index = zip(*[(_make_col(column), idx)
                                          for key in cols_sel
                                          for column, idx in column_to_index
                                          if idx[0] == key])
            columns, column_index = list(columns), list(column_index)
        else:
            columns = [_make_col(c) for c in cols_sel]

    try:
        kdf = DataFrame(sdf.select(self._kdf._internal.index_scols + columns))
    except AnalysisException:
        raise KeyError('[{}] don\'t exist in columns'
                       .format([col._jc.toString() for col in columns]))
    kdf._internal = kdf._internal.copy(
        data_columns=kdf._internal.data_columns[-len(columns):],
        index_map=self._kdf._internal.index_map,
        column_index=column_index)

    if cols_sel is not None and isinstance(cols_sel, spark.Column):
        from databricks.koalas.series import _col
        return _col(kdf)
    else:
        return kdf
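# Illustrative usage sketch (assumption: public Koalas API; data made up). Selecting a
# list of column names in the getter above goes through the `columns`/`column_index`
# branch and returns a DataFrame, while a single column name is routed to
# `_get_from_multiindex_column` and comes back as a Series.
import databricks.koalas as ks

kdf = ks.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
sub_frame = kdf.loc[0:1, ['a', 'c']]  # label-based row slice, two columns -> DataFrame
sub_series = kdf.loc[0:1, 'a']        # single column name -> Series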
def __getitem__(self, key):
    from pyspark.sql.functions import lit
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series

    def raiseNotImplemented(description):
        raise SparkPandasNotImplementedError(
            description=description,
            pandas_function=".loc[..., ...]",
            spark_target_function="select, where")

    rows_sel, cols_sel = _unfold(key, self._ks)

    sdf = self._kdf._sdf
    if isinstance(rows_sel, Series):
        sdf_for_check_schema = sdf.select(rows_sel._scol)
        assert isinstance(sdf_for_check_schema.schema.fields[0].dataType, BooleanType), \
            (str(sdf_for_check_schema), sdf_for_check_schema.schema.fields[0].dataType)
        sdf = sdf.where(rows_sel._scol)
    elif isinstance(rows_sel, slice):
        if rows_sel.step is not None:
            raiseNotImplemented("Cannot use step with Spark.")
        if rows_sel == slice(None):
            # If slice is None - select everything, so nothing to do
            pass
        elif len(self._kdf._index_columns) == 0:
            raiseNotImplemented("Cannot use slice for Spark if no index provided.")
        elif len(self._kdf._index_columns) == 1:
            start = rows_sel.start
            stop = rows_sel.stop

            index_column = self._kdf.index
            index_data_type = index_column.schema[0].dataType
            cond = []
            if start is not None:
                cond.append(index_column._scol >= lit(start).cast(index_data_type))
            if stop is not None:
                cond.append(index_column._scol <= lit(stop).cast(index_data_type))

            if len(cond) > 0:
                sdf = sdf.where(reduce(lambda x, y: x & y, cond))
        else:
            raiseNotImplemented("Cannot use slice for MultiIndex with Spark.")
    elif isinstance(rows_sel, str):
        raiseNotImplemented("Cannot use a scalar value for row selection with Spark.")
    else:
        try:
            rows_sel = list(rows_sel)
        except TypeError:
            raiseNotImplemented("Cannot use a scalar value for row selection with Spark.")
        if len(rows_sel) == 0:
            sdf = sdf.where(lit(False))
        elif len(self._kdf._index_columns) == 1:
            index_column = self._kdf.index
            index_data_type = index_column.schema[0].dataType
            if len(rows_sel) == 1:
                sdf = sdf.where(
                    index_column._scol == lit(rows_sel[0]).cast(index_data_type))
            else:
                sdf = sdf.where(index_column._scol.isin(
                    [lit(r).cast(index_data_type) for r in rows_sel]))
        else:
            raiseNotImplemented("Cannot select with MultiIndex with Spark.")

    if cols_sel is None:
        columns = [_make_col(c) for c in self._kdf._metadata.data_columns]
    elif isinstance(cols_sel, spark.Column):
        columns = [cols_sel]
    else:
        columns = [_make_col(c) for c in cols_sel]

    try:
        kdf = DataFrame(sdf.select(self._kdf._metadata.index_columns + columns))
    except AnalysisException:
        raise KeyError('[{}] don\'t exist in columns'
                       .format([col._jc.toString() for col in columns]))
    kdf._metadata = self._kdf._metadata.copy(
        data_columns=kdf._metadata.data_columns[-len(columns):])

    if cols_sel is not None and isinstance(cols_sel, spark.Column):
        from databricks.koalas.series import _col
        return _col(kdf)
    else:
        return kdf
def _reduce_for_stat_function(self, sfun, only_numeric):
    return _col(super(SeriesGroupBy, self)._reduce_for_stat_function(sfun, only_numeric))
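# For reference: `_col`, used throughout these snippets, is essentially "take the single
# data column of a one-column DataFrame as a Series". A minimal sketch of that contract;
# the helper name `_col_sketch` is ours, not the library's.
def _col_sketch(df):
    # assumes `df` holds exactly one data column
    return df[df.columns[0]]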
def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
    """
    Return a Series containing counts of unique values.
    The resulting object will be in descending order so that the
    first element is the most frequently-occurring element.
    Excludes NA values by default.

    Parameters
    ----------
    normalize : boolean, default False
        If True then the object returned will contain the relative
        frequencies of the unique values.
    sort : boolean, default True
        Sort by values.
    ascending : boolean, default False
        Sort in ascending order.
    bins : Not Yet Supported
    dropna : boolean, default True
        Don't include counts of NaN.

    Returns
    -------
    counts : Series

    See Also
    --------
    Series.count: Number of non-NA elements in a Series.

    Examples
    --------
    For Series

    >>> df = ks.DataFrame({'x':[0, 0, 1, 1, 1, np.nan]})
    >>> df.x.value_counts()  # doctest: +NORMALIZE_WHITESPACE
    1.0    3
    0.0    2
    Name: x, dtype: int64

    With `normalize` set to `True`, returns the relative frequency by
    dividing all values by the sum of values.

    >>> df.x.value_counts(normalize=True)  # doctest: +NORMALIZE_WHITESPACE
    1.0    0.6
    0.0    0.4
    Name: x, dtype: float64

    **dropna**
    With `dropna` set to `False` we can also see NaN index values.

    >>> df.x.value_counts(dropna=False)  # doctest: +NORMALIZE_WHITESPACE
    1.0    3
    0.0    2
    NaN    1
    Name: x, dtype: int64

    For Index

    >>> from databricks.koalas.indexes import Index
    >>> idx = Index([3, 1, 2, 3, 4, np.nan])
    >>> idx
    Float64Index([3.0, 1.0, 2.0, 3.0, 4.0, nan], dtype='float64')

    >>> idx.value_counts().sort_index()
    1.0    1
    2.0    1
    3.0    2
    4.0    1
    Name: count, dtype: int64

    **sort**

    With `sort` set to `False`, the result wouldn't be sorted by number of count.

    >>> idx.value_counts(sort=True).sort_index()
    1.0    1
    2.0    1
    3.0    2
    4.0    1
    Name: count, dtype: int64

    **normalize**

    With `normalize` set to `True`, returns the relative frequency by
    dividing all values by the sum of values.

    >>> idx.value_counts(normalize=True).sort_index()
    1.0    0.2
    2.0    0.2
    3.0    0.4
    4.0    0.2
    Name: count, dtype: float64

    **dropna**

    With `dropna` set to `False` we can also see NaN index values.

    >>> idx.value_counts(dropna=False).sort_index()  # doctest: +SKIP
    1.0    1
    2.0    1
    3.0    2
    4.0    1
    NaN    1
    Name: count, dtype: int64

    For MultiIndex.

    >>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
    ...                       ['speed', 'weight', 'length']],
    ...                      [[0, 0, 0, 1, 1, 1, 2, 2, 2],
    ...                       [1, 1, 1, 1, 1, 2, 1, 2, 2]])
    >>> s = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
    >>> s.index  # doctest: +SKIP
    MultiIndex([(  'lama', 'weight'),
                (  'lama', 'weight'),
                (  'lama', 'weight'),
                (   'cow', 'weight'),
                (   'cow', 'weight'),
                (   'cow', 'length'),
                ('falcon', 'weight'),
                ('falcon', 'length'),
                ('falcon', 'length')],
               )

    >>> s.index.value_counts().sort_index()
    (cow, length)       1
    (cow, weight)       2
    (falcon, length)    2
    (falcon, weight)    1
    (lama, weight)      3
    Name: count, dtype: int64

    >>> s.index.value_counts(normalize=True).sort_index()
    (cow, length)       0.111111
    (cow, weight)       0.222222
    (falcon, length)    0.222222
    (falcon, weight)    0.111111
    (lama, weight)      0.333333
    Name: count, dtype: float64

    If Index has name, keep the name up.

    >>> idx = Index([0, 0, 0, 1, 1, 2, 3], name='koalas')
    >>> idx.value_counts().sort_index()
    0    3
    1    2
    2    1
    3    1
    Name: koalas, dtype: int64
    """
    from databricks.koalas.series import _col

    if bins is not None:
        raise NotImplementedError("value_counts currently does not support bins")

    if dropna:
        sdf_dropna = self._internal._sdf.select(self.spark_column).dropna()
    else:
        sdf_dropna = self._internal._sdf.select(self.spark_column)
    index_name = SPARK_DEFAULT_INDEX_NAME
    column_name = self._internal.data_spark_column_names[0]
    sdf = sdf_dropna.groupby(scol_for(sdf_dropna, column_name).alias(index_name)).count()
    if sort:
        if ascending:
            sdf = sdf.orderBy(F.col("count"))
        else:
            sdf = sdf.orderBy(F.col("count").desc())

    if normalize:
        sum = sdf_dropna.count()
        sdf = sdf.withColumn("count", F.col("count") / F.lit(sum))

    column_labels = self._internal.column_labels
    if (column_labels[0] is None) or (None in column_labels[0]):
        internal = _InternalFrame(
            spark_frame=sdf,
            index_map=OrderedDict({index_name: None}),
            data_spark_columns=[scol_for(sdf, "count")],
        )
    else:
        internal = _InternalFrame(
            spark_frame=sdf,
            index_map=OrderedDict({index_name: None}),
            column_labels=column_labels,
            data_spark_columns=[scol_for(sdf, "count")],
            column_label_names=self._internal.column_label_names,
        )

    return _col(DataFrame(internal))
def __getitem__(self, key):
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series

    def raiseNotImplemented(description):
        raise SparkPandasNotImplementedError(
            description=description,
            pandas_function=".loc[..., ...]",
            spark_target_function="select, where")

    rows_sel, cols_sel = _unfold(key, self._kser)

    sdf = self._kdf._sdf
    if isinstance(rows_sel, Series):
        sdf_for_check_schema = sdf.select(rows_sel._scol)
        assert isinstance(sdf_for_check_schema.schema.fields[0].dataType, BooleanType), \
            (str(sdf_for_check_schema), sdf_for_check_schema.schema.fields[0].dataType)
        sdf = sdf.where(rows_sel._scol)
    elif isinstance(rows_sel, slice):
        assert len(self._kdf._internal.index_columns) > 0
        if rows_sel.step is not None:
            raiseNotImplemented("Cannot use step with Spark.")
        if rows_sel == slice(None):
            # If slice is None - select everything, so nothing to do
            pass
        elif len(self._kdf._internal.index_columns) == 1:
            start = rows_sel.start
            stop = rows_sel.stop

            index_column = self._kdf.index.to_series()
            index_data_type = index_column.spark_type
            cond = []
            if start is not None:
                cond.append(index_column._scol >= F.lit(start).cast(index_data_type))
            if stop is not None:
                cond.append(index_column._scol <= F.lit(stop).cast(index_data_type))

            if len(cond) > 0:
                sdf = sdf.where(reduce(lambda x, y: x & y, cond))
        else:
            raiseNotImplemented("Cannot use slice for MultiIndex with Spark.")
    elif isinstance(rows_sel, str):
        raiseNotImplemented("Cannot use a scalar value for row selection with Spark.")
    else:
        try:
            rows_sel = list(rows_sel)
        except TypeError:
            raiseNotImplemented("Cannot use a scalar value for row selection with Spark.")
        if len(rows_sel) == 0:
            sdf = sdf.where(F.lit(False))
        elif len(self._kdf._internal.index_columns) == 1:
            index_column = self._kdf.index.to_series()
            index_data_type = index_column.spark_type
            if len(rows_sel) == 1:
                sdf = sdf.where(
                    index_column._scol == F.lit(rows_sel[0]).cast(index_data_type))
            else:
                sdf = sdf.where(index_column._scol.isin(
                    [F.lit(r).cast(index_data_type) for r in rows_sel]))
        else:
            raiseNotImplemented("Cannot select with MultiIndex with Spark.")

    # make cols_sel a 1-tuple of string if a single string
    column_index = self._kdf._internal.column_index
    if isinstance(cols_sel, str):
        kdf = DataFrame(self._kdf._internal.copy(sdf=sdf))
        return kdf._get_from_multiindex_column((cols_sel,))
    elif isinstance(cols_sel, Series):
        cols_sel = _make_col(cols_sel)
    elif isinstance(cols_sel, slice) and cols_sel != slice(None):
        raise raiseNotImplemented("Can only select columns either by name or reference or all")
    elif isinstance(cols_sel, slice) and cols_sel == slice(None):
        cols_sel = None

    if cols_sel is None:
        columns = self._kdf._internal.column_scols
    elif isinstance(cols_sel, spark.Column):
        columns = [cols_sel]
        column_index = None
    elif all(isinstance(key, Series) for key in cols_sel):
        columns = [_make_col(key) for key in cols_sel]
        column_index = [key._internal.column_index[0] for key in cols_sel]
    elif all(isinstance(key, spark.Column) for key in cols_sel):
        columns = cols_sel
        column_index = None
    elif (any(isinstance(key, str) for key in cols_sel)
            and any(isinstance(key, tuple) for key in cols_sel)):
        raise TypeError('Expected tuple, got str')
    else:
        if all(isinstance(key, tuple) for key in cols_sel):
            level = self._kdf._internal.column_index_level
            if any(len(key) != level for key in cols_sel):
                raise ValueError('All the key level should be the same as column index level.')

        column_to_index = list(zip(self._kdf._internal.data_columns,
                                   self._kdf._internal.column_index))

        columns = []
        column_index = []
        for key in cols_sel:
            found = False
            for column, idx in column_to_index:
                if idx == key or idx[0] == key:
                    columns.append(_make_col(column))
                    column_index.append(idx)
                    found = True
            if not found:
                raise KeyError("['{}'] not in index".format(key))

    try:
        sdf = sdf.select(self._kdf._internal.index_scols + columns)
        index_columns = self._kdf._internal.index_columns
        data_columns = [column for column in sdf.columns if column not in index_columns]
        column_scols = [scol_for(sdf, col) for col in data_columns]
        internal = _InternalFrame(sdf=sdf,
                                  index_map=self._kdf._internal.index_map,
                                  column_index=column_index,
                                  column_scols=column_scols)
        kdf = DataFrame(internal)
    except AnalysisException:
        raise KeyError('[{}] don\'t exist in columns'
                       .format([col._jc.toString() for col in columns]))

    if cols_sel is not None and isinstance(cols_sel, spark.Column):
        from databricks.koalas.series import _col
        return _col(kdf)
    else:
        return kdf
def value_counts(self, sort=None, ascending=None, dropna=True):
    """
    Compute group sizes.

    Parameters
    ----------
    sort : boolean, default None
        Sort by frequencies.
    ascending : boolean, default False
        Sort in ascending order.
    dropna : boolean, default True
        Don't include counts of NaN.

    See Also
    --------
    databricks.koalas.Series.groupby
    databricks.koalas.DataFrame.groupby

    Examples
    --------
    >>> df = ks.DataFrame({'A': [1, 2, 2, 3, 3, 3],
    ...                    'B': [1, 1, 2, 3, 3, 3]},
    ...                   columns=['A', 'B'])
    >>> df
       A  B
    0  1  1
    1  2  1
    2  2  2
    3  3  3
    4  3  3
    5  3  3

    >>> df.groupby('A')['B'].value_counts().sort_index()  # doctest: +NORMALIZE_WHITESPACE
    A  B
    1  1    1
    2  1    1
       2    1
    3  3    3
    Name: B, dtype: int64
    """
    groupkeys = self._groupkeys + self._agg_columns
    groupkey_cols = [s._scol.alias('__index_level_{}__'.format(i))
                     for i, s in enumerate(groupkeys)]
    sdf = self._kdf._sdf
    agg_column = self._agg_columns[0].name
    sdf = sdf.groupby(*groupkey_cols).count().withColumnRenamed('count', agg_column)

    if sort:
        if ascending:
            sdf = sdf.orderBy(F.col(agg_column).asc())
        else:
            sdf = sdf.orderBy(F.col(agg_column).desc())

    internal = _InternalFrame(sdf=sdf,
                              data_columns=[agg_column],
                              index_map=[('__index_level_{}__'.format(i), s.name)
                                         for i, s in enumerate(groupkeys)])
    return _col(DataFrame(internal))
def __getitem__(self, key):
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.indexes import Index
    from databricks.koalas.series import Series

    def raiseNotImplemented(description):
        raise SparkPandasNotImplementedError(
            description=description,
            pandas_function=".iloc[..., ...]",
            spark_target_function="select, where")

    rows_sel, cols_sel = _unfold(key, self._ks)

    sdf = self._kdf._sdf
    if isinstance(rows_sel, Index):
        sdf_for_check_schema = sdf.select(rows_sel._scol)
        assert isinstance(sdf_for_check_schema.schema.fields[0].dataType, BooleanType), \
            (str(sdf_for_check_schema), sdf_for_check_schema.schema.fields[0].dataType)
        sdf = sdf.where(rows_sel._scol)
    elif isinstance(rows_sel, slice):
        if rows_sel == slice(None):
            # If slice is None - select everything, so nothing to do
            pass
        elif (rows_sel.start is not None) or (rows_sel.step is not None):
            raiseNotImplemented("Cannot use start or step with Spark.")
        elif not isinstance(rows_sel.stop, int):
            raise TypeError("cannot do slice indexing with these indexers [{}] of {}"
                            .format(rows_sel.stop, type(rows_sel.stop)))
        elif rows_sel.stop >= 0:
            sdf = sdf.limit(rows_sel.stop)
        else:
            sdf = sdf.limit(sdf.count() + rows_sel.stop)
    else:
        raiseNotImplemented(".iloc requires numeric slice or conditional boolean Index, "
                            "got {}".format(rows_sel))

    # make cols_sel a 1-tuple of string if a single string
    if isinstance(cols_sel, Series):
        columns = [cols_sel._scol]
    elif isinstance(cols_sel, int):
        columns = [self._kdf._internal.data_scols[cols_sel]]
    elif cols_sel is None or cols_sel == slice(None):
        columns = self._kdf._internal.data_scols
    elif isinstance(cols_sel, slice):
        if all(s is None or isinstance(s, int)
               for s in (cols_sel.start, cols_sel.stop, cols_sel.step)):
            columns = self._kdf._internal.data_scols[cols_sel]
        else:
            not_none = cols_sel.start if cols_sel.start is not None \
                else cols_sel.stop if cols_sel.stop is not None else cols_sel.step
            raise TypeError('cannot do slice indexing with these indexers {} of {}'
                            .format(not_none, type(not_none)))
    elif is_list_like(cols_sel):
        if all(isinstance(s, int) for s in cols_sel):
            columns = [self._kdf._internal.scol_for(col)
                       for col in self._kdf.columns[cols_sel]]
        else:
            raise TypeError('cannot perform reduce with flexible type')
    else:
        raise ValueError("Location based indexing can only have [integer, integer slice, "
                         "listlike of integers, boolean array] types, got {}".format(cols_sel))

    try:
        kdf = DataFrame(sdf.select(self._kdf._internal.index_scols + columns))
    except AnalysisException:
        raise KeyError('[{}] don\'t exist in columns'
                       .format([col._jc.toString() for col in columns]))

    column_index = self._kdf._internal.column_index
    if column_index is not None:
        if cols_sel is not None and isinstance(cols_sel, (Series, int)):
            column_index = None
        else:
            column_index = pd.MultiIndex.from_tuples(column_index)[cols_sel].tolist()

    kdf._internal = kdf._internal.copy(
        data_columns=kdf._internal.data_columns[-len(columns):],
        index_map=self._kdf._internal.index_map,
        column_index=column_index)

    if cols_sel is not None and isinstance(cols_sel, (Series, int)):
        from databricks.koalas.series import _col
        return _col(kdf)
    else:
        return kdf
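# Illustrative usage sketch (assumption: public Koalas API; data made up). In the
# version above, `.iloc` only supports stop-only row slices (mapped to Spark's `limit`)
# plus positional column selection; a negative stop is translated to `count() + stop`.
import databricks.koalas as ks

kdf = ks.DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8], 'c': [9, 10, 11, 12]})
head3 = kdf.iloc[:3]             # first three rows, all columns
two_cols = kdf.iloc[:3, [0, 2]]  # first three rows, columns at positions 0 and 2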
def _reduce_for_stat_function(self, sfun, only_numeric):
    return _col(super(SeriesGroupBy, self)._reduce_for_stat_function(sfun, only_numeric))
def __getitem__(self, key):
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series

    rows_sel, cols_sel = _unfold(key, self._kdf_or_kser if self._is_series else None)

    sdf = self._internal.sdf
    cond, limit = self._select_rows(rows_sel)
    if cond is not None:
        sdf = sdf.where(cond)
    if limit is not None:
        if limit >= 0:
            sdf = sdf.limit(limit)
        else:
            sdf = sdf.limit(sdf.count() + limit)

    # make cols_sel a 1-tuple of string if a single string
    if isinstance(cols_sel, Series) and cols_sel._equals(self._kdf_or_kser):
        columns = cols_sel._internal.column_scols
        column_index = cols_sel._internal.column_index
    elif isinstance(cols_sel, int):
        columns = [self._internal.column_scols[cols_sel]]
        column_index = [self._internal.column_index[cols_sel]]
    elif cols_sel is None or cols_sel == slice(None):
        columns = self._internal.column_scols
        column_index = self._internal.column_index
    elif isinstance(cols_sel, slice):
        if all(s is None or isinstance(s, int)
               for s in (cols_sel.start, cols_sel.stop, cols_sel.step)):
            columns = self._internal.column_scols[cols_sel]
            column_index = self._internal.column_index[cols_sel]
        else:
            not_none = cols_sel.start if cols_sel.start is not None \
                else cols_sel.stop if cols_sel.stop is not None else cols_sel.step
            raise TypeError('cannot do slice indexing with these indexers {} of {}'
                            .format(not_none, type(not_none)))
    elif is_list_like(cols_sel):
        if all(isinstance(s, bool) for s in cols_sel):
            cols_sel = [i for i, s in enumerate(cols_sel) if s]
        if all(isinstance(s, int) for s in cols_sel):
            columns = [self._internal.column_scols[s] for s in cols_sel]
            column_index = [self._internal.column_index[s] for s in cols_sel]
        else:
            raise TypeError('cannot perform reduce with flexible type')
    else:
        raise ValueError("Location based indexing can only have [integer, integer slice, "
                         "listlike of integers, boolean array] types, got {}".format(cols_sel))

    try:
        sdf = sdf.select(self._internal.index_scols + columns)
        internal = _InternalFrame(sdf=sdf,
                                  index_map=self._internal.index_map,
                                  column_index=column_index,
                                  column_index_names=self._internal.column_index_names)
        kdf = DataFrame(internal)
    except AnalysisException:
        raise KeyError('[{}] don\'t exist in columns'
                       .format([col._jc.toString() for col in columns]))

    if cols_sel is not None and isinstance(cols_sel, (Series, int)):
        from databricks.koalas.series import _col
        return _col(kdf)
    else:
        return kdf
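# Illustrative usage sketch (assumption: public Koalas API; data made up). The newer
# implementation above also accepts a boolean list as the column selector, which is
# first converted to integer positions before resolving `column_scols`.
import databricks.koalas as ks

kdf = ks.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6], 'c': [7, 8, 9]})
masked = kdf.iloc[:2, [True, False, True]]  # columns 'a' and 'c', first two rows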
def transform(self, func):
    return _col(super(SeriesGroupBy, self).transform(func))
def __getitem__(self, key):
    from pyspark.sql.functions import lit
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series

    def raiseNotImplemented(description):
        raise SparkPandasNotImplementedError(
            description=description,
            pandas_function=".loc[..., ...]",
            spark_target_function="select, where")

    rows_sel, cols_sel = _unfold(key, self._ks)

    sdf = self._kdf._sdf
    if isinstance(rows_sel, Series):
        sdf_for_check_schema = sdf.select(rows_sel._scol)
        assert isinstance(sdf_for_check_schema.schema.fields[0].dataType, BooleanType), \
            (str(sdf_for_check_schema), sdf_for_check_schema.schema.fields[0].dataType)
        sdf = sdf.where(rows_sel._scol)
    elif isinstance(rows_sel, slice):
        if rows_sel.step is not None:
            raiseNotImplemented("Cannot use step with Spark.")
        if rows_sel == slice(None):
            # If slice is None - select everything, so nothing to do
            pass
        elif len(self._kdf._index_columns) == 0:
            raiseNotImplemented("Cannot use slice for Spark if no index provided.")
        elif len(self._kdf._index_columns) == 1:
            start = rows_sel.start
            stop = rows_sel.stop

            index_column = self._kdf.index
            index_data_type = index_column.schema[0].dataType
            cond = []
            if start is not None:
                cond.append(index_column._scol >= lit(start).cast(index_data_type))
            if stop is not None:
                cond.append(index_column._scol <= lit(stop).cast(index_data_type))

            if len(cond) > 0:
                sdf = sdf.where(reduce(lambda x, y: x & y, cond))
        else:
            raiseNotImplemented("Cannot use slice for MultiIndex with Spark.")
    elif isinstance(rows_sel, str):
        raiseNotImplemented("Cannot use a scalar value for row selection with Spark.")
    else:
        try:
            rows_sel = list(rows_sel)
        except TypeError:
            raiseNotImplemented("Cannot use a scalar value for row selection with Spark.")
        if len(rows_sel) == 0:
            sdf = sdf.where(lit(False))
        elif len(self._kdf._index_columns) == 1:
            index_column = self._kdf.index
            index_data_type = index_column.schema[0].dataType
            if len(rows_sel) == 1:
                sdf = sdf.where(
                    index_column._scol == lit(rows_sel[0]).cast(index_data_type))
            else:
                sdf = sdf.where(index_column._scol.isin(
                    [lit(r).cast(index_data_type) for r in rows_sel]))
        else:
            raiseNotImplemented("Cannot select with MultiIndex with Spark.")

    if cols_sel is None:
        columns = [_make_col(c) for c in self._kdf._metadata.data_columns]
    elif isinstance(cols_sel, spark.Column):
        columns = [cols_sel]
    else:
        columns = [_make_col(c) for c in cols_sel]

    try:
        kdf = DataFrame(sdf.select(self._kdf._metadata.index_columns + columns))
    except AnalysisException:
        raise KeyError('[{}] don\'t exist in columns'
                       .format([col._jc.toString() for col in columns]))
    kdf._metadata = self._kdf._metadata.copy(
        data_columns=kdf._metadata.data_columns[-len(columns):])

    if cols_sel is not None and isinstance(cols_sel, spark.Column):
        from databricks.koalas.series import _col
        return _col(kdf)
    else:
        return kdf
def concat(objs, axis=0, join='outer', ignore_index=False):
    """
    Concatenate pandas objects along a particular axis with optional set logic
    along the other axes.

    Parameters
    ----------
    objs : a sequence of Series or DataFrame
        Any None objects will be dropped silently unless
        they are all None in which case a ValueError will be raised
    axis : {0/'index'}, default 0
        The axis to concatenate along.
    join : {'inner', 'outer'}, default 'outer'
        How to handle indexes on other axis(es)
    ignore_index : boolean, default False
        If True, do not use the index values along the concatenation axis. The
        resulting axis will be labeled 0, ..., n - 1. This is useful if you are
        concatenating objects where the concatenation axis does not have
        meaningful indexing information. Note the index values on the other
        axes are still respected in the join.

    Returns
    -------
    concatenated : object, type of objs
        When concatenating all ``Series`` along the index (axis=0), a
        ``Series`` is returned. When ``objs`` contains at least one
        ``DataFrame``, a ``DataFrame`` is returned.

    See Also
    --------
    DataFrame.merge

    Examples
    --------
    Combine two ``Series``.

    >>> s1 = ks.Series(['a', 'b'])
    >>> s2 = ks.Series(['c', 'd'])
    >>> ks.concat([s1, s2])
    0    a
    1    b
    0    c
    1    d
    Name: 0, dtype: object

    Clear the existing index and reset it in the result
    by setting the ``ignore_index`` option to ``True``.

    >>> ks.concat([s1, s2], ignore_index=True)
    0    a
    1    b
    2    c
    3    d
    Name: 0, dtype: object

    Combine two ``DataFrame`` objects with identical columns.

    >>> df1 = ks.DataFrame([['a', 1], ['b', 2]],
    ...                    columns=['letter', 'number'])
    >>> df1
      letter  number
    0      a       1
    1      b       2
    >>> df2 = ks.DataFrame([['c', 3], ['d', 4]],
    ...                    columns=['letter', 'number'])
    >>> df2
      letter  number
    0      c       3
    1      d       4
    >>> ks.concat([df1, df2])
      letter  number
    0      a       1
    1      b       2
    0      c       3
    1      d       4

    Combine ``DataFrame`` and ``Series`` objects with different columns.

    >>> ks.concat([df2, s1, s2])
          0 letter  number
    0  None      c     3.0
    1  None      d     4.0
    0     a   None     NaN
    1     b   None     NaN
    0     c   None     NaN
    1     d   None     NaN

    Combine ``DataFrame`` objects with overlapping columns
    and return everything. Columns outside the intersection will
    be filled with ``None`` values.

    >>> df3 = ks.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
    ...                    columns=['letter', 'number', 'animal'])
    >>> df3
      letter  number animal
    0      c       3    cat
    1      d       4    dog
    >>> ks.concat([df1, df3])
      animal letter  number
    0   None      a       1
    1   None      b       2
    0    cat      c       3
    1    dog      d       4

    Combine ``DataFrame`` objects with overlapping columns
    and return only those that are shared by passing ``inner`` to
    the ``join`` keyword argument.

    >>> ks.concat([df1, df3], join="inner")
      letter  number
    0      a       1
    1      b       2
    0      c       3
    1      d       4
    """
    if not isinstance(objs, (dict, Iterable)):
        raise TypeError('first argument must be an iterable of koalas '
                        'objects, you passed an object of type '
                        '"{name}"'.format(name=type(objs).__name__))

    if axis not in [0, 'index']:
        raise ValueError('axis should be either 0 or "index" currently.')

    if all(map(lambda obj: obj is None, objs)):
        raise ValueError("All objects passed were None")
    objs = list(filter(lambda obj: obj is not None, objs))

    for obj in objs:
        if not isinstance(obj, (Series, DataFrame)):
            raise TypeError('cannot concatenate object of type '
                            "'{name}"
                            '; only ks.Series '
                            'and ks.DataFrame are valid'.format(name=type(objs).__name__))

    # Series, Series ...
    # We should return Series if objects are all Series.
    should_return_series = all(map(lambda obj: isinstance(obj, Series), objs))

    # DataFrame, Series ... & Series, Series ...
    # In this case, we should return DataFrame.
    new_objs = []
    for obj in objs:
        if isinstance(obj, Series):
            obj = obj.to_dataframe()
        new_objs.append(obj)
    objs = new_objs

    # DataFrame, DataFrame, ...
    # All Series are converted into DataFrame and then compute concat.
    if not ignore_index:
        indices_of_kdfs = [kdf._internal.index_map for kdf in objs]
        index_of_first_kdf = indices_of_kdfs[0]
        for index_of_kdf in indices_of_kdfs:
            if index_of_first_kdf != index_of_kdf:
                raise ValueError(
                    'Index type and names should be same in the objects to concatenate. '
                    'You passed different indices '
                    '{index_of_first_kdf} and {index_of_kdf}'.format(
                        index_of_first_kdf=index_of_first_kdf, index_of_kdf=index_of_kdf))

    columns_of_kdfs = [kdf._internal.columns for kdf in objs]
    first_kdf = objs[0]
    if ignore_index:
        columns_of_first_kdf = first_kdf._internal.data_columns
    else:
        columns_of_first_kdf = first_kdf._internal.columns
    if all(current_kdf == columns_of_first_kdf for current_kdf in columns_of_kdfs):
        # If all columns are in the same order and values, use it.
        kdfs = objs
    else:
        if ignore_index:
            columns_to_apply = [kdf._internal.data_columns for kdf in objs]
        else:
            columns_to_apply = [kdf._internal.columns for kdf in objs]

        if join == "inner":
            interested_columns = set.intersection(*map(set, columns_to_apply))
            # Keep the column order with its firsts DataFrame.
            interested_columns = list(map(
                lambda c: columns_of_first_kdf[columns_of_first_kdf.index(c)],
                interested_columns))

            kdfs = []
            for kdf in objs:
                sdf = kdf._sdf.select(interested_columns)
                if ignore_index:
                    kdfs.append(DataFrame(sdf))
                else:
                    kdfs.append(DataFrame(first_kdf._internal.copy(sdf=sdf)))
        elif join == "outer":
            # If there are columns unmatched, just sort the column names.
            merged_columns = set(itertools.chain.from_iterable(columns_to_apply))

            kdfs = []
            for kdf in objs:
                if ignore_index:
                    columns_to_add = merged_columns - set(kdf._internal.data_columns)
                else:
                    columns_to_add = merged_columns - set(kdf._internal.columns)

                # TODO: NaN and None difference for missing values. pandas seems filling NaN.
                kdf = kdf.assign(**dict(zip(columns_to_add, [None] * len(columns_to_add))))

                if ignore_index:
                    sdf = kdf._sdf.select(sorted(kdf._internal.data_columns))
                else:
                    sdf = kdf._sdf.select(
                        kdf._internal.index_columns + sorted(kdf._internal.data_columns))

                kdf = DataFrame(kdf._internal.copy(
                    sdf=sdf, data_columns=sorted(kdf._internal.data_columns)))
                kdfs.append(kdf)
        else:
            raise ValueError(
                "Only can inner (intersect) or outer (union) join the other axis.")

    concatenated = kdfs[0]._sdf
    for kdf in kdfs[1:]:
        concatenated = concatenated.unionByName(kdf._sdf)

    if ignore_index:
        result_kdf = DataFrame(concatenated.select(kdfs[0]._internal.data_columns))
    else:
        result_kdf = DataFrame(kdfs[0]._internal.copy(sdf=concatenated))

    if should_return_series:
        # If all input were Series, we should return Series.
        return _col(result_kdf)
    else:
        return result_kdf
def __setitem__(self, key, value):
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series, _col

    if self._is_series:
        if (isinstance(key, Series) and key._kdf is not self._kdf_or_kser._kdf) or (
                isinstance(value, Series) and value._kdf is not self._kdf_or_kser._kdf):
            kdf = self._kdf_or_kser.to_frame()
            temp_natural_order = verify_temp_column_name(kdf, "__temp_natural_order__")
            temp_key_col = verify_temp_column_name(kdf, "__temp_key_col__")
            temp_value_col = verify_temp_column_name(kdf, "__temp_value_col__")
            kdf[temp_natural_order] = F.monotonically_increasing_id()
            if isinstance(key, Series):
                kdf[temp_key_col] = key
            if isinstance(value, Series):
                kdf[temp_value_col] = value
            kdf = kdf.sort_values(temp_natural_order).drop(temp_natural_order)

            kser = kdf[self._kdf_or_kser.name]
            if isinstance(key, Series):
                key = kdf[temp_key_col]
            if isinstance(value, Series):
                value = kdf[temp_value_col]

            type(self)(kser)[key] = value

            self._kdf_or_kser._internal = kser._internal
            self._kdf_or_kser._kdf = kser._kdf
            return

        if isinstance(value, DataFrame):
            raise ValueError("Incompatible indexer with DataFrame")

        cond, limit, remaining_index = self._select_rows(key)
        if cond is None:
            cond = F.lit(True)
        if limit is not None:
            cond = cond & (self._internal.spark_frame[self._sequence_col] < F.lit(limit))

        if isinstance(value, Series):
            if remaining_index is not None and remaining_index == 0:
                raise ValueError(
                    "No axis named {} for object type {}".format(key, type(value)))
            value = value._scol
        else:
            value = F.lit(value)
        scol = (F.when(cond, value)
                .otherwise(self._internal.spark_column)
                .alias(name_like_string(self._kdf_or_kser.name or "0")))
        internal = self._internal.copy(spark_column=scol)
        self._kdf_or_kser._internal = internal
    else:
        assert self._is_df

        if isinstance(key, tuple):
            if len(key) != 2:
                raise SparkPandasIndexingError("Only accepts pairs of candidates")
            rows_sel, cols_sel = key
        else:
            rows_sel = key
            cols_sel = None

        if isinstance(value, DataFrame):
            if len(value.columns) == 1:
                value = _col(value)
            else:
                raise ValueError("Only a dataframe with one column can be assigned")

        if (isinstance(rows_sel, Series) and rows_sel._kdf is not self._kdf_or_kser) or (
                isinstance(value, Series) and value._kdf is not self._kdf_or_kser):
            kdf = self._kdf_or_kser.copy()
            temp_natural_order = verify_temp_column_name(kdf, "__temp_natural_order__")
            temp_key_col = verify_temp_column_name(kdf, "__temp_key_col__")
            temp_value_col = verify_temp_column_name(kdf, "__temp_value_col__")
            kdf[temp_natural_order] = F.monotonically_increasing_id()
            if isinstance(rows_sel, Series):
                kdf[temp_key_col] = rows_sel
            if isinstance(value, Series):
                kdf[temp_value_col] = value
            kdf = kdf.sort_values(temp_natural_order)

            if isinstance(rows_sel, Series):
                rows_sel = kdf[temp_key_col]
            if isinstance(value, Series):
                value = kdf[temp_value_col]

            type(self)(kdf)[rows_sel, cols_sel] = value

            self._kdf_or_kser._internal = kdf[list(self._kdf_or_kser.columns)]._internal
            return

        cond, limit, remaining_index = self._select_rows(rows_sel)
        missing_keys = []
        _, data_spark_columns, _ = self._select_cols(cols_sel, missing_keys=missing_keys)

        if cond is None:
            cond = F.lit(True)
        if limit is not None:
            cond = cond & (self._internal.spark_frame[self._sequence_col] < F.lit(limit))

        if isinstance(value, Series):
            if remaining_index is not None and remaining_index == 0:
                raise ValueError("Incompatible indexer with Series")
            if len(data_spark_columns) > 1:
                raise ValueError("shape mismatch")
            value = value._scol
        else:
            value = F.lit(value)

        new_data_spark_columns = []
        for new_scol, spark_column_name in zip(
                self._internal.data_spark_columns, self._internal.data_spark_column_names):
            for scol in data_spark_columns:
                if new_scol._jc.equals(scol._jc):
                    new_scol = F.when(cond, value).otherwise(scol).alias(spark_column_name)
                    break
            new_data_spark_columns.append(new_scol)

        column_labels = self._internal.column_labels.copy()
        for label in missing_keys:
            if isinstance(label, str):
                label = (label,)
            if len(label) < self._internal.column_labels_level:
                label = tuple(list(label)
                              + ([""] * (self._internal.column_labels_level - len(label))))
            elif len(label) > self._internal.column_labels_level:
                raise KeyError("Key length ({}) exceeds index depth ({})".format(
                    len(label), self._internal.column_labels_level))
            column_labels.append(label)
            new_data_spark_columns.append(F.when(cond, value).alias(name_like_string(label)))

        internal = self._internal.with_new_columns(new_data_spark_columns, column_labels)
        self._kdf_or_kser._internal = internal
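# Illustrative usage sketch (assumption: a `.loc`-style indexer backed by the
# `__setitem__` above; data made up). A conditional row selector updates matching rows
# in place, and a column label not resolved by `_select_cols` is collected in
# `missing_keys` and appended as a new column.
import databricks.koalas as ks

kdf = ks.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
kdf.loc[kdf.a > 1, 'b'] = 0      # update an existing column where the condition holds
kdf.loc[kdf.a > 1, 'flag'] = 1   # 'flag' is a missing key -> added as a new column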