def drop(self, labels):
    """
    Make new Index with passed list of labels deleted.

    Parameters
    ----------
    labels : array-like

    Returns
    -------
    dropped : Index

    Examples
    --------
    >>> index = ks.Index([1, 2, 3])
    >>> index
    Int64Index([1, 2, 3], dtype='int64')

    >>> index.drop([1])
    Int64Index([2, 3], dtype='int64')
    """
    if not isinstance(labels, (tuple, list)):
        labels = [labels]
    sdf = self._internal.sdf[~self._internal.index_scols[0].isin(labels)]
    return Index(
        DataFrame(
            _InternalFrame(sdf=sdf, index_map=self._kdf._internal.index_map)))
def _apply_as_series_or_frame(self, func):
    """
    Wraps a function that handles a Spark column in order to support it in
    both Koalas Series and DataFrame. Note that the given `func` name should
    be the same as the API's method name.
    """
    from databricks.koalas import DataFrame
    from databricks.koalas.series import _col
    from databricks.koalas.groupby import SeriesGroupBy

    kdf = self.kdf
    sdf = self.kdf._sdf

    # Here we need to include grouped key as an index, and shift previous index.
    #   [index_column0, index_column1] -> [grouped key, index_column0, index_column1]
    new_index_scols = []
    new_index_map = []
    for groupkey in self._groupkeys:
        new_index_scols.append(
            # NOTE THAT this code intentionally uses `F.col` instead of `scol` in
            # the given series. This is because, in case of series, we convert it into
            # DataFrame. So, if the given `groupkeys` is a series, it would end up
            # being a different series.
            F.col(name_like_string(groupkey.name)
                  ).alias(SPARK_INDEX_NAME_FORMAT(len(new_index_scols))))
        new_index_map.append((SPARK_INDEX_NAME_FORMAT(len(new_index_map)),
                              groupkey._internal.column_index[0]))

    for new_index_scol, index_map in zip(kdf._internal.index_scols, kdf._internal.index_map):
        new_index_scols.append(
            new_index_scol.alias(SPARK_INDEX_NAME_FORMAT(len(new_index_scols))))
        _, name = index_map
        new_index_map.append((SPARK_INDEX_NAME_FORMAT(len(new_index_map)), name))

    applied = []
    for column in kdf.columns:
        applied.append(
            kdf[column]._with_new_scol(func(kdf[column]._scol)).rename(kdf[column].name))

    # Seems like pandas filters out rows whose grouped key is NA, so require
    # every group key to be non-null.
    cond = self._groupkeys[0]._scol.isNotNull()
    for c in self._groupkeys[1:]:
        cond = cond & c._scol.isNotNull()

    sdf = sdf.select(new_index_scols + [c._scol for c in applied]).filter(cond)
    internal = _InternalFrame(
        sdf=sdf,
        data_columns=[c._internal.data_columns[0] for c in applied],
        index_map=new_index_map)
    ret = DataFrame(internal)
    if isinstance(self._groupby, SeriesGroupBy):
        return _col(ret)
    else:
        return ret
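# [Illustrative sketch, not part of the library] How a wrapper like
# `_apply_as_series_or_frame` is typically consumed: a window-style method hands it a
# function over a Spark column and lets the wrapper re-index the result by the group
# keys. The `RollingGroupbySketch` class, its `_window` attribute and its `count`
# method are assumptions for illustration only.
from pyspark.sql import functions as F

class RollingGroupbySketch:
    def count(self):
        # Per the docstring above, the inner function is named after the API method.
        def count(scol):
            # `scol` is a Spark column; `self._window` is assumed to be a window spec
            # prepared by the surrounding rolling/expanding machinery.
            return F.count(scol).over(self._window)
        return self._apply_as_series_or_frame(count)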
def _reduce_for_stat_function(self, sfun, only_numeric):
    groupkeys = self._groupkeys
    groupkey_cols = [s._scol.alias('__index_level_{}__'.format(i))
                     for i, s in enumerate(groupkeys)]
    sdf = self._kdf._sdf

    data_columns = []
    if len(self._agg_columns) > 0:
        stat_exprs = []
        for ks in self._agg_columns:
            spark_type = ks.spark_type
            # TODO: we should have a function that takes dataframes and converts the numeric
            #   types. Converting the NaNs is used in a few places, it should be in utils.
            # Special-handle floating point types because Spark's count treats nan as a valid
            # value, whereas pandas count doesn't include nan.
            if isinstance(spark_type, (DoubleType, FloatType)):
                stat_exprs.append(sfun(F.nanvl(ks._scol, F.lit(None))).alias(ks.name))
                data_columns.append(ks.name)
            elif isinstance(spark_type, NumericType) or not only_numeric:
                stat_exprs.append(sfun(ks._scol).alias(ks.name))
                data_columns.append(ks.name)
        sdf = sdf.groupby(*groupkey_cols).agg(*stat_exprs)
    else:
        sdf = sdf.select(*groupkey_cols).distinct()
    sdf = sdf.sort(*groupkey_cols)
    internal = _InternalFrame(sdf=sdf,
                              data_columns=data_columns,
                              index_map=[('__index_level_{}__'.format(i), s.name)
                                         for i, s in enumerate(groupkeys)])
    return DataFrame(internal)
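# [Illustrative sketch, not part of the library] `_reduce_for_stat_function` is the
# shared path for GroupBy reductions: the caller passes a Spark aggregate function and
# says whether non-numeric columns should be skipped. The wrapper methods below are
# assumptions showing the intended call shape, not the library's actual definitions.
from pyspark.sql import functions as F

class GroupBySketch:
    def sum(self):
        # Numeric-only reduction: string columns are dropped from the aggregation.
        return self._reduce_for_stat_function(F.sum, only_numeric=True)

    def max(self):
        # `max` is well-defined for non-numeric columns too, so keep them.
        return self._reduce_for_stat_function(F.max, only_numeric=False)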
def drop_duplicates(self):
    """
    Return Index with duplicate values removed.

    Returns
    -------
    deduplicated : Index

    See Also
    --------
    Series.drop_duplicates : Equivalent method on Series.
    DataFrame.drop_duplicates : Equivalent method on DataFrame.

    Examples
    --------
    Generate an Index with duplicate values.

    >>> idx = ks.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'])

    >>> idx.drop_duplicates()  # doctest: +SKIP
    Index(['lama', 'cow', 'beetle', 'hippo'], dtype='object')
    """
    sdf = self._internal.sdf.select(self._internal.index_scols).drop_duplicates()
    internal = _InternalFrame(sdf=sdf, index_map=self._kdf._internal.index_map)
    result = DataFrame(internal).index
    return result
def size(self):
    """
    Compute group sizes.

    See Also
    --------
    databricks.koalas.Series.groupby
    databricks.koalas.DataFrame.groupby

    Examples
    --------
    >>> df = ks.DataFrame({'A': [1, 2, 2, 3, 3, 3],
    ...                    'B': [1, 1, 2, 3, 3, 3]},
    ...                   columns=['A', 'B'])
    >>> df
       A  B
    0  1  1
    1  2  1
    2  2  2
    3  3  3
    4  3  3
    5  3  3

    >>> df.groupby('A').size().sort_index()  # doctest: +NORMALIZE_WHITESPACE
    A
    1    1
    2    2
    3    3
    Name: count, dtype: int64

    >>> df.groupby(['A', 'B']).size().sort_index()  # doctest: +NORMALIZE_WHITESPACE
    A  B
    1  1    1
    2  1    1
       2    1
    3  3    3
    Name: count, dtype: int64
    """
    groupkeys = self._groupkeys
    groupkey_cols = [s._scol.alias('__index_level_{}__'.format(i))
                     for i, s in enumerate(groupkeys)]
    sdf = self._kdf._sdf
    sdf = sdf.groupby(*groupkey_cols).count()
    if len(self._agg_columns) > 0 and self._have_agg_columns:
        name = self._agg_columns[0].name
        sdf = sdf.withColumnRenamed('count', name)
    else:
        name = 'count'
    internal = _InternalFrame(sdf=sdf,
                              data_columns=[name],
                              index_map=[('__index_level_{}__'.format(i), s.name)
                                         for i, s in enumerate(groupkeys)])
    return _col(DataFrame(internal))
def dropna(self):
    """
    Return Index or MultiIndex without NA/NaN values.

    Examples
    --------
    >>> df = ks.DataFrame([[1, 2], [4, 5], [7, 8]],
    ...                   index=['cobra', 'viper', None],
    ...                   columns=['max_speed', 'shield'])
    >>> df
           max_speed  shield
    cobra          1       2
    viper          4       5
    NaN            7       8

    >>> df.index.dropna()
    Index(['cobra', 'viper'], dtype='object')

    Also supports MultiIndex.

    >>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
    ...                       [None, 'weight', 'length']],
    ...                      [[0, 1, 1, 1, 1, 1, 2, 2, 2],
    ...                       [0, 1, 1, 0, 1, 2, 1, 1, 2]])
    >>> s = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, None],
    ...               index=midx)
    >>> s
    lama    NaN        45.0
    cow     weight    200.0
            weight      1.2
            NaN        30.0
            weight    250.0
            length      1.5
    falcon  weight    320.0
            weight      1.0
            length      NaN
    Name: 0, dtype: float64

    >>> s.index.dropna()  # doctest: +SKIP
    MultiIndex([(   'cow', 'weight'),
                (   'cow', 'weight'),
                (   'cow', 'weight'),
                (   'cow', 'length'),
                ('falcon', 'weight'),
                ('falcon', 'weight'),
                ('falcon', 'length')],
               )
    """
    kdf = self._kdf.copy()
    sdf = kdf._internal.sdf.select(self._internal.index_scols).dropna()
    internal = _InternalFrame(sdf=sdf, index_map=self._internal.index_map)
    kdf = DataFrame(internal)
    return Index(kdf) if type(self) == Index else MultiIndex(kdf)
def __getitem__(self, key): from databricks.koalas.frame import DataFrame from databricks.koalas.series import Series rows_sel, cols_sel = _unfold(key, self._kdf_or_kser if self._is_series else None) cond, limit = self._select_rows(rows_sel) column_index, columns, returns_series = self._select_cols(cols_sel) if cond is None and limit is None and returns_series: if self._is_series: return self._kdf_or_kser._with_new_scol(columns[0]) else: return Series(self._internal.copy(scol=columns[0], column_index=[column_index[0]]), anchor=self._kdf_or_kser) else: try: sdf = self._internal._sdf if cond is not None: sdf = sdf.where(cond) if limit is not None: if limit >= 0: sdf = sdf.limit(limit) else: sdf = sdf.limit(sdf.count() + limit) sdf = sdf.select(self._internal.index_scols + columns) if self._internal.column_index_names is None: column_index_names = None else: # Manage column index names level = column_index_level(column_index) column_index_names = self._internal.column_index_names[-level:] internal = _InternalFrame(sdf=sdf, index_map=self._internal.index_map, column_index=column_index, column_index_names=column_index_names) kdf = DataFrame(internal) except AnalysisException: raise KeyError('[{}] don\'t exist in columns' .format([col._jc.toString() for col in columns])) if returns_series: return Series(kdf._internal.copy(scol=kdf._internal.column_scols[0]), anchor=kdf) else: return kdf
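# [Illustrative usage, assumed API] Two calls that exercise the branches above: a plain
# column selection with no row condition short-circuits to a Series without touching the
# Spark plan, while a boolean-Series row condition goes through the filtered-DataFrame
# path and a fresh _InternalFrame. Outputs are omitted since they depend on the data.
import databricks.koalas as ks

kdf = ks.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
ser = kdf.loc[:, 'a']                  # cond is None, returns_series -> fast path
sub = kdf.loc[kdf.a > 1, ['a', 'b']]   # cond is not None -> filter + select path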
def drop(self, labels, level=None):
    """
    Make new MultiIndex with passed list of labels deleted.

    Parameters
    ----------
    labels : array-like
        Must be a list of tuples
    level : int or level name, default None

    Returns
    -------
    dropped : MultiIndex

    Examples
    --------
    >>> index = ks.MultiIndex.from_tuples([('a', 'x'), ('b', 'y'), ('c', 'z')])
    >>> index  # doctest: +SKIP
    MultiIndex([('a', 'x'),
                ('b', 'y'),
                ('c', 'z')],
               )

    >>> index.drop(['a'])  # doctest: +SKIP
    MultiIndex([('b', 'y'),
                ('c', 'z')],
               )

    >>> index.drop(['x', 'y'], level=1)  # doctest: +SKIP
    MultiIndex([('c', 'z')],
               )
    """
    sdf = self._internal.sdf
    index_scols = self._internal.index_scols
    if level is None:
        scol = index_scols[0]
    else:
        scol = index_scols[level] if isinstance(level, int) else sdf[level]
    sdf = sdf[~scol.isin(labels)]
    return MultiIndex(
        DataFrame(
            _InternalFrame(sdf=sdf, index_map=self._kdf._internal.index_map)))
def unique(self, level=None):
    """
    Return unique values in the index.

    Be aware that the order of the unique values may differ from
    pandas.Index.unique.

    :param level: int or str, optional, default is None
    :return: Index without duplicates

    Examples
    --------
    >>> ks.DataFrame({'a': ['a', 'b', 'c']}, index=[1, 1, 3]).index.unique()
    Int64Index([1, 3], dtype='int64')

    >>> ks.DataFrame({'a': ['a', 'b', 'c']}, index=['d', 'e', 'e']).index.unique()
    Index(['e', 'd'], dtype='object')
    """
    if level is not None:
        self._validate_index_level(level)
    sdf = self._kdf._sdf.select(self._scol.alias(self._internal.index_columns[0])).distinct()
    return DataFrame(_InternalFrame(sdf=sdf, index_map=self._kdf._internal.index_map)).index
def _apply(self, func, return_schema): index_columns = self._kdf._internal.index_columns index_names = self._kdf._internal.index_names data_columns = self._kdf._internal.data_columns def rename_output(pdf): # TODO: This logic below was borrowed from `DataFrame.pandas_df` to set the index # within each pdf properly. we might have to deduplicate it. import pandas as pd if len(index_columns) > 0: append = False for index_field in index_columns: drop = index_field not in data_columns pdf = pdf.set_index(index_field, drop=drop, append=append) append = True pdf = pdf[data_columns] if len(index_names) > 0: if isinstance(pdf.index, pd.MultiIndex): pdf.index.names = index_names else: pdf.index.name = index_names[0] pdf = func(pdf) # For now, just positionally map the column names to given schema's. pdf = pdf.rename( columns=dict(zip(pdf.columns, return_schema.fieldNames()))) return pdf grouped_map_func = pandas_udf(return_schema, PandasUDFType.GROUPED_MAP)(rename_output) sdf = self._kdf._sdf input_groupkeys = [s._scol for s in self._groupkeys] sdf = sdf.groupby(*input_groupkeys).apply(grouped_map_func) internal = _InternalFrame(sdf=sdf, data_columns=return_schema.fieldNames(), index_map=[]) # index is lost. return DataFrame(internal)
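# [Illustrative sketch] How this `_apply` might be driven when the return schema is
# supplied explicitly: the caller builds a Spark StructType (here with made-up column
# names c0/c1), and the grouped result is renamed positionally to that schema while the
# original index is dropped (index_map=[]). The schema and the commented call below are
# assumptions, not the library's actual wiring.
from pyspark.sql.types import StructType, StructField, DoubleType

def demean(pdf):
    # Runs on a pandas DataFrame per group and must return a pandas DataFrame;
    # assumed here to contain only numeric columns.
    return pdf - pdf.mean()

return_schema = StructType([StructField('c0', DoubleType()),
                            StructField('c1', DoubleType())])
# applied = grouped._apply(demean, return_schema)   # `grouped` would be a GroupBy object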
def aggregate(self, func_or_funcs, *args, **kwargs): """Aggregate using one or more operations over the specified axis. Parameters ---------- func : dict a dict mapping from column name (string) to aggregate functions (string or list of strings). Returns ------- Series or DataFrame The return can be: * Series : when DataFrame.agg is called with a single function * DataFrame : when DataFrame.agg is called with several functions Return Series or DataFrame. Notes ----- `agg` is an alias for `aggregate`. Use the alias. See Also -------- databricks.koalas.Series.groupby databricks.koalas.DataFrame.groupby Examples -------- >>> df = ks.DataFrame({'A': [1, 1, 2, 2], ... 'B': [1, 2, 3, 4], ... 'C': [0.362, 0.227, 1.267, -0.562]}, ... columns=['A', 'B', 'C']) >>> df A B C 0 1 1 0.362 1 1 2 0.227 2 2 3 1.267 3 2 4 -0.562 Different aggregations per column >>> aggregated = df.groupby('A').agg({'B': 'min', 'C': 'sum'}) >>> aggregated[['B', 'C']] # doctest: +NORMALIZE_WHITESPACE B C A 1 1 0.589 2 3 0.705 >>> aggregated = df.groupby('A').agg({'B': ['min', 'max']}) >>> aggregated # doctest: +NORMALIZE_WHITESPACE B min max A 1 1 2 2 3 4 """ if not isinstance(func_or_funcs, dict) or \ not all(isinstance(key, str) and (isinstance(value, str) or isinstance(value, list) and all(isinstance(v, str) for v in value)) for key, value in func_or_funcs.items()): raise ValueError( "aggs must be a dict mapping from column name (string) to aggregate " "functions (string or list of strings).") sdf = self._kdf._sdf groupkeys = self._groupkeys groupkey_cols = [ s._scol.alias('__index_level_{}__'.format(i)) for i, s in enumerate(groupkeys) ] multi_aggs = any(isinstance(v, list) for v in func_or_funcs.values()) reordered = [] data_columns = [] column_index = [] for key, value in func_or_funcs.items(): for aggfunc in [value] if isinstance(value, str) else value: data_col = "('{0}', '{1}')".format( key, aggfunc) if multi_aggs else key data_columns.append(data_col) column_index.append((key, aggfunc)) if aggfunc == "nunique": reordered.append( F.expr('count(DISTINCT `{0}`) as `{1}`'.format( key, data_col))) else: reordered.append( F.expr('{1}(`{0}`) as `{2}`'.format( key, aggfunc, data_col))) sdf = sdf.groupby(*groupkey_cols).agg(*reordered) internal = _InternalFrame( sdf=sdf, data_columns=data_columns, column_index=column_index if multi_aggs else None, index_map=[('__index_level_{}__'.format(i), s.name) for i, s in enumerate(groupkeys)]) kdf = DataFrame(internal) if not self._as_index: kdf = kdf.reset_index() return kdf
def aggregate(self, func_or_funcs, *args, **kwargs): """Aggregate using one or more operations over the specified axis. Parameters ---------- func : dict a dict mapping from column name (string) to aggregate functions (string). Returns ------- Series or DataFrame The return can be: * Series : when DataFrame.agg is called with a single function * DataFrame : when DataFrame.agg is called with several functions Return Series or DataFrame. Notes ----- `agg` is an alias for `aggregate`. Use the alias. See Also -------- databricks.koalas.Series.groupby databricks.koalas.DataFrame.groupby Examples -------- >>> df = ks.DataFrame({'A': [1, 1, 2, 2], ... 'B': [1, 2, 3, 4], ... 'C': [0.362, 0.227, 1.267, -0.562]}, ... columns=['A', 'B', 'C']) >>> df A B C 0 1 1 0.362 1 1 2 0.227 2 2 3 1.267 3 2 4 -0.562 Different aggregations per column >>> aggregated = df.groupby('A').agg({'B': 'min', 'C': 'sum'}) >>> aggregated[['B', 'C']] # doctest: +NORMALIZE_WHITESPACE B C A 1 1 0.589 2 3 0.705 """ if not isinstance(func_or_funcs, dict) or \ not all(isinstance(key, str) and isinstance(value, str) for key, value in func_or_funcs.items()): raise ValueError( "aggs must be a dict mapping from column name (string) to aggregate " "functions (string).") sdf = self._kdf._sdf groupkeys = self._groupkeys groupkey_cols = [ s._scol.alias('__index_level_{}__'.format(i)) for i, s in enumerate(groupkeys) ] reordered = [] for key, value in func_or_funcs.items(): if value == "nunique": reordered.append( F.expr('count(DISTINCT {0}) as {0}'.format(key))) else: reordered.append(F.expr('{1}({0}) as {0}'.format(key, value))) sdf = sdf.groupby(*groupkey_cols).agg(*reordered) internal = _InternalFrame( sdf=sdf, data_columns=[key for key, _ in func_or_funcs.items()], index_map=[('__index_level_{}__'.format(i), s.name) for i, s in enumerate(groupkeys)]) return DataFrame(internal)
def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
    """
    Return a Series containing counts of unique values.
    The resulting object will be in descending order so that the
    first element is the most frequently-occurring element.
    Excludes NA values by default.

    Parameters
    ----------
    normalize : boolean, default False
        If True then the object returned will contain the relative
        frequencies of the unique values.
    sort : boolean, default True
        Sort by values.
    ascending : boolean, default False
        Sort in ascending order.
    bins : Not Yet Supported
    dropna : boolean, default True
        Don't include counts of NaN.

    Returns
    -------
    counts : Series

    See Also
    --------
    Series.count: Number of non-NA elements in a Series.

    Examples
    --------
    For Series

    >>> df = ks.DataFrame({'x':[0, 0, 1, 1, 1, np.nan]})
    >>> df.x.value_counts()  # doctest: +NORMALIZE_WHITESPACE
    1.0    3
    0.0    2
    Name: x, dtype: int64

    With `normalize` set to `True`, returns the relative frequency by
    dividing all values by the sum of values.

    >>> df.x.value_counts(normalize=True)  # doctest: +NORMALIZE_WHITESPACE
    1.0    0.6
    0.0    0.4
    Name: x, dtype: float64

    **dropna**

    With `dropna` set to `False` we can also see NaN index values.

    >>> df.x.value_counts(dropna=False)  # doctest: +NORMALIZE_WHITESPACE
    1.0    3
    0.0    2
    NaN    1
    Name: x, dtype: int64

    For Index

    >>> from databricks.koalas.indexes import Index
    >>> idx = Index([3, 1, 2, 3, 4, np.nan])
    >>> idx
    Float64Index([3.0, 1.0, 2.0, 3.0, 4.0, nan], dtype='float64')

    >>> idx.value_counts().sort_index()
    1.0    1
    2.0    1
    3.0    2
    4.0    1
    Name: count, dtype: int64

    **sort**

    With `sort` set to `False`, the result won't be sorted by counts.

    >>> idx.value_counts(sort=False).sort_index()
    1.0    1
    2.0    1
    3.0    2
    4.0    1
    Name: count, dtype: int64

    **normalize**

    With `normalize` set to `True`, returns the relative frequency by
    dividing all values by the sum of values.

    >>> idx.value_counts(normalize=True).sort_index()
    1.0    0.2
    2.0    0.2
    3.0    0.4
    4.0    0.2
    Name: count, dtype: float64

    **dropna**

    With `dropna` set to `False` we can also see NaN index values.

    >>> idx.value_counts(dropna=False).sort_index()  # doctest: +SKIP
    1.0    1
    2.0    1
    3.0    2
    4.0    1
    NaN    1
    Name: count, dtype: int64

    For MultiIndex.

    >>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
    ...                       ['speed', 'weight', 'length']],
    ...                      [[0, 0, 0, 1, 1, 1, 2, 2, 2],
    ...                       [1, 1, 1, 1, 1, 2, 1, 2, 2]])
    >>> s = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
    >>> s.index  # doctest: +SKIP
    MultiIndex([(  'lama', 'weight'),
                (  'lama', 'weight'),
                (  'lama', 'weight'),
                (   'cow', 'weight'),
                (   'cow', 'weight'),
                (   'cow', 'length'),
                ('falcon', 'weight'),
                ('falcon', 'length'),
                ('falcon', 'length')],
               )

    >>> s.index.value_counts().sort_index()
    (cow, length)       1
    (cow, weight)       2
    (falcon, length)    2
    (falcon, weight)    1
    (lama, weight)      3
    Name: count, dtype: int64

    >>> s.index.value_counts(normalize=True).sort_index()
    (cow, length)       0.111111
    (cow, weight)       0.222222
    (falcon, length)    0.222222
    (falcon, weight)    0.111111
    (lama, weight)      0.333333
    Name: count, dtype: float64

    If the Index has a name, the name is kept in the result.

    >>> idx = Index([0, 0, 0, 1, 1, 2, 3], name='koalas')
    >>> idx.value_counts().sort_index()
    0    3
    1    2
    2    1
    3    1
    Name: koalas, dtype: int64
    """
    from databricks.koalas.series import _col

    if bins is not None:
        raise NotImplementedError("value_counts currently does not support bins")

    if dropna:
        sdf_dropna = self._internal._sdf.select(self.spark_column).dropna()
    else:
        sdf_dropna = self._internal._sdf.select(self.spark_column)
    index_name = SPARK_DEFAULT_INDEX_NAME
    column_name = self._internal.data_spark_column_names[0]
    sdf = sdf_dropna.groupby(scol_for(sdf_dropna, column_name).alias(index_name)).count()
    if sort:
        if ascending:
            sdf = sdf.orderBy(F.col("count"))
        else:
            sdf = sdf.orderBy(F.col("count").desc())

    if normalize:
        sum = sdf_dropna.count()
        sdf = sdf.withColumn("count", F.col("count") / F.lit(sum))

    column_labels = self._internal.column_labels
    if (column_labels[0] is None) or (None in column_labels[0]):
        internal = _InternalFrame(
            spark_frame=sdf,
            index_map=OrderedDict({index_name: None}),
            data_spark_columns=[scol_for(sdf, "count")],
        )
    else:
        internal = _InternalFrame(
            spark_frame=sdf,
            index_map=OrderedDict({index_name: None}),
            column_labels=column_labels,
            data_spark_columns=[scol_for(sdf, "count")],
            column_label_names=self._internal.column_label_names,
        )

    return _col(DataFrame(internal))
def _apply(self, func, return_schema, retain_index): should_infer_schema = return_schema is None input_groupnames = [s.name for s in self._groupkeys] if should_infer_schema: # Here we execute with the first 1000 to get the return type. # If the records were less than 1000, it uses pandas API directly for a shortcut. limit = 1000 pdf = self._kdf.head(limit + 1).to_pandas() pdf = pdf.groupby(input_groupnames).apply(func) kdf = DataFrame(pdf) return_schema = kdf._sdf.schema if len(pdf) <= limit: return kdf index_columns = self._kdf._internal.index_columns index_names = self._kdf._internal.index_names data_columns = self._kdf._internal.data_columns def rename_output(pdf): # TODO: This logic below was borrowed from `DataFrame.pandas_df` to set the index # within each pdf properly. we might have to deduplicate it. import pandas as pd if len(index_columns) > 0: append = False for index_field in index_columns: drop = index_field not in data_columns pdf = pdf.set_index(index_field, drop=drop, append=append) append = True pdf = pdf[data_columns] if len(index_names) > 0: if isinstance(pdf.index, pd.MultiIndex): pdf.index.names = index_names else: pdf.index.name = index_names[0] pdf = func(pdf) if retain_index: # If schema should be inferred, we don't restore index. Pandas seems restoring # the index in some cases. # When Spark output type is specified, without executing it, we don't know # if we should restore the index or not. For instance, see the example in # https://github.com/databricks/koalas/issues/628. # TODO: deduplicate this logic with _InternalFrame.from_pandas columns = pdf.columns index = pdf.index index_map = [] if isinstance(index, pd.MultiIndex): if index.names is None: index_map = [('__index_level_{}__'.format(i), None) for i in range(len(index.levels))] else: index_map = [('__index_level_{}__'.format(i) if name is None else name, name) for i, name in enumerate(index.names)] else: index_map = [(index.name if index.name is not None else '__index_level_0__', index.name)] new_index_columns = [ index_column for index_column, _ in index_map ] new_data_columns = [str(col) for col in columns] reset_index = pdf.reset_index() reset_index.columns = new_index_columns + new_data_columns for name, col in reset_index.iteritems(): dt = col.dtype if is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt): continue reset_index[name] = col.replace({np.nan: None}) pdf = reset_index # Just positionally map the column names to given schema's. pdf = pdf.rename( columns=dict(zip(pdf.columns, return_schema.fieldNames()))) return pdf grouped_map_func = pandas_udf(return_schema, PandasUDFType.GROUPED_MAP)(rename_output) sdf = self._kdf._sdf input_groupkeys = [s._scol for s in self._groupkeys] sdf = sdf.groupby(*input_groupkeys).apply(grouped_map_func) if should_infer_schema: # If schema is inferred, we can restore indexes too. internal = kdf._internal.copy(sdf=sdf) else: # Otherwise, it loses index. internal = _InternalFrame(sdf=sdf, data_columns=return_schema.fieldNames(), index_map=[]) return DataFrame(internal)
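# [Illustrative usage, hedged] The two entry paths into this `_apply`: without a return
# type annotation the schema is inferred from the first rows (and the index can be
# restored); with an explicit `ks.DataFrame[...]` annotation nothing is pre-executed but
# the index of the result is lost. Data and column names below are made up.
import numpy as np
import databricks.koalas as ks

kdf = ks.DataFrame({'A': ['a', 'a', 'b'], 'B': [1.0, 2.0, 3.0]})

# Inferred-schema path: head(limit + 1) is converted to pandas to infer the schema.
res1 = kdf.groupby('A').apply(lambda pdf: pdf.assign(B=pdf.B - pdf.B.mean()))

# Explicit-schema path: the annotation supplies the schema; the index is not retained.
def demean(pdf) -> ks.DataFrame[np.float64]:
    return pdf[['B']] - pdf[['B']].mean()

res2 = kdf.groupby('A').apply(demean)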
def symmetric_difference(self, other, result_name=None, sort=None): """ Compute the symmetric difference of two Index objects. Parameters ---------- other : Index or array-like result_name : str sort : True or None, default None Whether to sort the resulting index. * True : Attempt to sort the result. * None : Do not sort the result. Returns ------- symmetric_difference : Index Notes ----- ``symmetric_difference`` contains elements that appear in either ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates dropped. Examples -------- >>> s1 = ks.Series([1, 2, 3, 4], index=[1, 2, 3, 4]) >>> s2 = ks.Series([1, 2, 3, 4], index=[2, 3, 4, 5]) >>> s1.index.symmetric_difference(s2.index) Int64Index([5, 1], dtype='int64') You can set name of result Index. >>> s1.index.symmetric_difference(s2.index, result_name='koalas') Int64Index([5, 1], dtype='int64', name='koalas') You can set sort to `True`, if you want to sort the resulting index. >>> s1.index.symmetric_difference(s2.index, sort=True) Int64Index([1, 5], dtype='int64') You can also use the ``^`` operator: >>> s1.index ^ s2.index Int64Index([5, 1], dtype='int64') """ if type(self) != type(other): raise NotImplementedError( "Doesn't support symmetric_difference between Index & MultiIndex for now" ) sdf_self = self._kdf._sdf.select(self._internal.index_scols) sdf_other = other._kdf._sdf.select(other._internal.index_scols) sdf_symdiff = sdf_self.union(sdf_other) \ .subtract(sdf_self.intersect(sdf_other)) if sort: sdf_symdiff = sdf_symdiff.sort(self._internal.index_scols) internal = _InternalFrame(sdf=sdf_symdiff, index_map=self._internal.index_map) result = Index(DataFrame(internal)) if result_name: result.name = result_name return result
def symmetric_difference(self, other, result_name=None, sort=None):
    """
    Compute the symmetric difference of two MultiIndex objects.

    Parameters
    ----------
    other : Index or array-like
    result_name : list
    sort : True or None, default None
        Whether to sort the resulting index.
        * True : Attempt to sort the result.
        * None : Do not sort the result.

    Returns
    -------
    symmetric_difference : MultiIndex

    Notes
    -----
    ``symmetric_difference`` contains elements that appear in either
    ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by
    ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates
    dropped.

    Examples
    --------
    >>> midx1 = pd.MultiIndex([['lama', 'cow', 'falcon'],
    ...                        ['speed', 'weight', 'length']],
    ...                       [[0, 0, 0, 1, 1, 1, 2, 2, 2],
    ...                        [0, 0, 0, 0, 1, 2, 0, 1, 2]])
    >>> midx2 = pd.MultiIndex([['koalas', 'cow', 'falcon'],
    ...                        ['speed', 'weight', 'length']],
    ...                       [[0, 0, 0, 1, 1, 1, 2, 2, 2],
    ...                        [0, 0, 0, 0, 1, 2, 0, 1, 2]])
    >>> s1 = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
    ...                index=midx1)
    >>> s2 = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
    ...                index=midx2)

    >>> s1.index.symmetric_difference(s2.index)  # doctest: +SKIP
    MultiIndex([('koalas', 'speed'),
                (  'lama', 'speed')],
               )

    You can set the names of the resulting Index.

    >>> s1.index.symmetric_difference(s2.index, result_name=['a', 'b'])  # doctest: +SKIP
    MultiIndex([('koalas', 'speed'),
                (  'lama', 'speed')],
               names=['a', 'b'])

    You can set sort to `True` if you want to sort the resulting index.

    >>> s1.index.symmetric_difference(s2.index, sort=True)  # doctest: +SKIP
    MultiIndex([('koalas', 'speed'),
                (  'lama', 'speed')],
               )

    You can also use the ``^`` operator:

    >>> s1.index ^ s2.index  # doctest: +SKIP
    MultiIndex([('koalas', 'speed'),
                (  'lama', 'speed')],
               )
    """
    if type(self) != type(other):
        raise NotImplementedError(
            "Doesn't support symmetric_difference between Index & MultiIndex for now")

    sdf_self = self._kdf._sdf.select(self._internal.index_scols)
    sdf_other = other._kdf._sdf.select(other._internal.index_scols)

    sdf_symdiff = sdf_self.union(sdf_other) \
        .subtract(sdf_self.intersect(sdf_other))

    if sort:
        sdf_symdiff = sdf_symdiff.sort(self._internal.index_scols)

    internal = _InternalFrame(sdf=sdf_symdiff, index_map=self._internal.index_map)
    result = MultiIndex(DataFrame(internal))

    if result_name:
        result.names = result_name

    return result
def __getitem__(self, key): from databricks.koalas.frame import DataFrame from databricks.koalas.indexes import Index from databricks.koalas.series import Series def raiseNotImplemented(description): raise SparkPandasNotImplementedError( description=description, pandas_function=".iloc[..., ...]", spark_target_function="select, where") rows_sel, cols_sel = _unfold(key, self._kser) sdf = self._kdf._sdf if isinstance(rows_sel, Index): sdf_for_check_schema = sdf.select(rows_sel._scol) assert isinstance(sdf_for_check_schema.schema.fields[0].dataType, BooleanType), \ (str(sdf_for_check_schema), sdf_for_check_schema.schema.fields[0].dataType) sdf = sdf.where(rows_sel._scol) elif isinstance(rows_sel, slice): if rows_sel == slice(None): # If slice is None - select everything, so nothing to do pass elif (rows_sel.start is not None) or (rows_sel.step is not None): raiseNotImplemented("Cannot use start or step with Spark.") elif not isinstance(rows_sel.stop, int): raise TypeError( "cannot do slice indexing with these indexers [{}] of {}". format(rows_sel.stop, type(rows_sel.stop))) elif rows_sel.stop >= 0: sdf = sdf.limit(rows_sel.stop) else: sdf = sdf.limit(sdf.count() + rows_sel.stop) else: raiseNotImplemented( ".iloc requires numeric slice or conditional boolean Index, " "got {}".format(rows_sel)) # make cols_sel a 1-tuple of string if a single string if isinstance(cols_sel, Series): columns = [cols_sel._scol] elif isinstance(cols_sel, int): columns = [self._kdf._internal.column_scols[cols_sel]] elif cols_sel is None or cols_sel == slice(None): columns = self._kdf._internal.column_scols elif isinstance(cols_sel, slice): if all(s is None or isinstance(s, int) for s in (cols_sel.start, cols_sel.stop, cols_sel.step)): columns = self._kdf._internal.column_scols[cols_sel] else: not_none = cols_sel.start if cols_sel.start is not None \ else cols_sel.stop if cols_sel.stop is not None else cols_sel.step raise TypeError( 'cannot do slice indexing with these indexers {} of {}'. format(not_none, type(not_none))) elif is_list_like(cols_sel): if all(isinstance(s, int) for s in cols_sel): columns = [ self._kdf._internal.scol_for(col) for col in self._kdf.columns[cols_sel] ] else: raise TypeError('cannot perform reduce with flexible type') else: raise ValueError( "Location based indexing can only have [integer, integer slice, " "listlike of integers, boolean array] types, got {}".format( cols_sel)) try: sdf = sdf.select(self._kdf._internal.index_scols + columns) internal = _InternalFrame(sdf=sdf, index_map=self._kdf._internal.index_map) kdf = DataFrame(internal) except AnalysisException: raise KeyError('[{}] don\'t exist in columns'.format( [col._jc.toString() for col in columns])) column_index = self._kdf._internal.column_index if cols_sel is not None: if isinstance(cols_sel, (Series, int)): column_index = None else: column_index = \ pd.MultiIndex.from_tuples(self._kdf._internal.column_index)[cols_sel].tolist() kdf = DataFrame(kdf._internal.copy(column_index=column_index)) if cols_sel is not None and isinstance(cols_sel, (Series, int)): from databricks.koalas.series import _col return _col(kdf) else: return kdf
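# [Illustrative usage, assumed behavior] Calls that reach the branches above: this iloc
# implementation only supports a stop-only row slice (mapped to Spark's limit) or a
# boolean Index for rows, plus positional column selection. Results are omitted on purpose.
import databricks.koalas as ks

kdf = ks.DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]})
kdf.iloc[:2]        # row slice with only a stop -> sdf.limit(2)
kdf.iloc[:, [0]]    # list of integer positions -> selected column scols
kdf.iloc[:, 0]      # single integer -> unwrapped to a Series via _col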
def __getitem__(self, key): from databricks.koalas.frame import DataFrame from databricks.koalas.series import Series rows_sel, cols_sel = _unfold(key, self._kdf_or_kser if self._is_series else None) cond = self._select_rows(rows_sel) # make cols_sel a 1-tuple of string if a single string if isinstance(cols_sel, Series): cols_sel = _make_col(cols_sel) elif isinstance(cols_sel, slice) and cols_sel != slice(None): raise LocIndexer._raiseNotImplemented( "Can only select columns either by name or reference or all") elif isinstance(cols_sel, slice) and cols_sel == slice(None): cols_sel = None returns_series = cols_sel is not None and isinstance(cols_sel, spark.Column) if cols_sel is None: column_index = self._internal.column_index columns = self._internal.column_scols elif isinstance(cols_sel, (str, tuple)): if isinstance(cols_sel, str): cols_sel = (cols_sel,) column_index, columns, returns_series = \ self._get_from_multiindex_column(cols_sel) elif isinstance(cols_sel, spark.Column): columns = [cols_sel] column_index = None elif all(isinstance(key, Series) for key in cols_sel): columns = [_make_col(key) for key in cols_sel] column_index = [key._internal.column_index[0] for key in cols_sel] elif all(isinstance(key, spark.Column) for key in cols_sel): columns = cols_sel column_index = None elif (any(isinstance(key, str) for key in cols_sel) and any(isinstance(key, tuple) for key in cols_sel)): raise TypeError('Expected tuple, got str') else: if all(isinstance(key, tuple) for key in cols_sel): level = self._internal.column_index_level if any(len(key) != level for key in cols_sel): raise ValueError('All the key level should be the same as column index level.') column_to_index = list(zip(self._internal.data_columns, self._internal.column_index)) columns = [] column_index = [] for key in cols_sel: found = False for column, idx in column_to_index: if idx == key or idx[0] == key: columns.append(_make_col(column)) column_index.append(idx) found = True if not found: raise KeyError("['{}'] not in index".format(key)) if cond is None and returns_series: if self._is_series: return self._kdf_or_kser._with_new_scol(columns[0]) else: return Series(self._internal.copy(scol=columns[0], column_index=[column_index[0]]), anchor=self._kdf_or_kser) else: try: sdf = self._internal._sdf if cond is not None: sdf = sdf.where(cond) sdf = sdf.select(self._internal.index_scols + columns) if self._internal.column_index_names is None: column_index_names = None else: # Manage column index names level = column_index_level(column_index) column_index_names = self._internal.column_index_names[-level:] internal = _InternalFrame(sdf=sdf, index_map=self._internal.index_map, column_index=column_index, column_index_names=column_index_names) kdf = DataFrame(internal) except AnalysisException: raise KeyError('[{}] don\'t exist in columns' .format([col._jc.toString() for col in columns])) if returns_series: return Series(kdf._internal.copy(scol=kdf._internal.column_scols[0]), anchor=kdf) else: return kdf
def __getitem__(self, key): from databricks.koalas.frame import DataFrame from databricks.koalas.series import Series if self._is_series: if isinstance(key, Series) and key._kdf is not self._kdf_or_kser._kdf: kdf = self._kdf_or_kser.to_frame() kdf["__temp_col__"] = key return type(self)(kdf[self._kdf_or_kser.name])[kdf["__temp_col__"]] cond, limit, remaining_index = self._select_rows(key) if cond is None and limit is None: return self._kdf_or_kser column_labels = self._internal.column_labels data_spark_columns = self._internal.data_spark_columns returns_series = True else: assert self._is_df if isinstance(key, tuple): if len(key) != 2: raise SparkPandasIndexingError("Only accepts pairs of candidates") rows_sel, cols_sel = key else: rows_sel = key cols_sel = None if isinstance(rows_sel, Series) and rows_sel._kdf is not self._kdf_or_kser: kdf = self._kdf_or_kser.copy() kdf["__temp_col__"] = rows_sel return type(self)(kdf)[kdf["__temp_col__"], cols_sel][ list(self._kdf_or_kser.columns) ] cond, limit, remaining_index = self._select_rows(rows_sel) column_labels, data_spark_columns, returns_series = self._select_cols(cols_sel) if cond is None and limit is None and returns_series: return self._kdf_or_kser._kser_for(column_labels[0]) if remaining_index is not None: index_scols = self._internal.index_spark_columns[-remaining_index:] index_map = OrderedDict(list(self._internal.index_map.items())[-remaining_index:]) else: index_scols = self._internal.index_spark_columns index_map = self._internal.index_map if len(column_labels) > 0: column_labels = column_labels.copy() column_labels_level = max( len(label) if label is not None else 1 for label in column_labels ) none_column = 0 for i, label in enumerate(column_labels): if label is None: label = (str(none_column),) none_column += 1 if len(label) < column_labels_level: label = tuple(list(label) + ([""]) * (column_labels_level - len(label))) column_labels[i] = label if self._internal.column_label_names is None: column_label_names = None else: # Manage column index names column_label_names = self._internal.column_label_names[-column_labels_level:] else: column_label_names = None try: sdf = self._internal._sdf if cond is not None: sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(cond) if limit is not None: if limit >= 0: sdf = sdf.limit(limit) else: sdf = sdf.limit(sdf.count() + limit) data_columns = sdf.select(data_spark_columns).columns sdf = sdf.select(index_scols + data_spark_columns) except AnalysisException: raise KeyError( "[{}] don't exist in columns".format( [col._jc.toString() for col in data_spark_columns] ) ) internal = _InternalFrame( spark_frame=sdf, index_map=index_map, column_labels=column_labels, data_spark_columns=[scol_for(sdf, col) for col in data_columns], column_label_names=column_label_names, ) kdf = DataFrame(internal) if returns_series: kdf_or_kser = Series( kdf._internal.copy(spark_column=kdf._internal.data_spark_columns[0]), anchor=kdf ) else: kdf_or_kser = kdf if remaining_index is not None and remaining_index == 0: pdf_or_pser = kdf_or_kser.head(2).to_pandas() length = len(pdf_or_pser) if length == 0: raise KeyError(name_like_string(key)) elif length == 1: return pdf_or_pser.iloc[0] else: return kdf_or_kser else: return kdf_or_kser
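# [Illustrative usage, hedged] The `remaining_index == 0` branch above is what turns a
# fully-specified selection into a scalar: when both selectors are single labels, all
# index levels are consumed, head(2) materializes the result, and a scalar (or a KeyError
# for a missing label) is returned instead of a Series.
import databricks.koalas as ks

kdf = ks.DataFrame({'max_speed': [1, 4, 7], 'shield': [2, 5, 8]},
                   index=['cobra', 'viper', 'sidewinder'])
kdf.loc['viper', 'shield']    # scalar: 5
kdf.loc[['viper'], 'shield']  # list selector keeps an index level -> Series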
def __getitem__(self, key): from databricks.koalas.frame import DataFrame from databricks.koalas.series import Series def raiseNotImplemented(description): raise SparkPandasNotImplementedError( description=description, pandas_function=".loc[..., ...]", spark_target_function="select, where") rows_sel, cols_sel = _unfold(key, self._kser) sdf = self._kdf._sdf if isinstance(rows_sel, Series): sdf_for_check_schema = sdf.select(rows_sel._scol) assert isinstance(sdf_for_check_schema.schema.fields[0].dataType, BooleanType), \ (str(sdf_for_check_schema), sdf_for_check_schema.schema.fields[0].dataType) sdf = sdf.where(rows_sel._scol) elif isinstance(rows_sel, slice): assert len(self._kdf._internal.index_columns) > 0 if rows_sel.step is not None: raiseNotImplemented("Cannot use step with Spark.") if rows_sel == slice(None): # If slice is None - select everything, so nothing to do pass elif len(self._kdf._internal.index_columns) == 1: start = rows_sel.start stop = rows_sel.stop index_column = self._kdf.index.to_series() index_data_type = index_column.spark_type cond = [] if start is not None: cond.append(index_column._scol >= F.lit(start).cast( index_data_type)) if stop is not None: cond.append( index_column._scol <= F.lit(stop).cast(index_data_type) ) if len(cond) > 0: sdf = sdf.where(reduce(lambda x, y: x & y, cond)) else: raiseNotImplemented( "Cannot use slice for MultiIndex with Spark.") elif isinstance(rows_sel, str): raiseNotImplemented( "Cannot use a scalar value for row selection with Spark.") else: try: rows_sel = list(rows_sel) except TypeError: raiseNotImplemented( "Cannot use a scalar value for row selection with Spark.") if len(rows_sel) == 0: sdf = sdf.where(F.lit(False)) elif len(self._kdf._internal.index_columns) == 1: index_column = self._kdf.index.to_series() index_data_type = index_column.spark_type if len(rows_sel) == 1: sdf = sdf.where(index_column._scol == F.lit( rows_sel[0]).cast(index_data_type)) else: sdf = sdf.where( index_column._scol.isin([ F.lit(r).cast(index_data_type) for r in rows_sel ])) else: raiseNotImplemented( "Cannot select with MultiIndex with Spark.") # make cols_sel a 1-tuple of string if a single string column_index = self._kdf._internal.column_index if isinstance(cols_sel, str): kdf = DataFrame(self._kdf._internal.copy(sdf=sdf)) return kdf._get_from_multiindex_column((cols_sel, )) elif isinstance(cols_sel, Series): cols_sel = _make_col(cols_sel) elif isinstance(cols_sel, slice) and cols_sel != slice(None): raise raiseNotImplemented( "Can only select columns either by name or reference or all") elif isinstance(cols_sel, slice) and cols_sel == slice(None): cols_sel = None if cols_sel is None: columns = self._kdf._internal.column_scols elif isinstance(cols_sel, spark.Column): columns = [cols_sel] column_index = None elif all(isinstance(key, Series) for key in cols_sel): columns = [_make_col(key) for key in cols_sel] column_index = [key._internal.column_index[0] for key in cols_sel] elif all(isinstance(key, spark.Column) for key in cols_sel): columns = cols_sel column_index = None elif (any(isinstance(key, str) for key in cols_sel) and any(isinstance(key, tuple) for key in cols_sel)): raise TypeError('Expected tuple, got str') else: if all(isinstance(key, tuple) for key in cols_sel): level = self._kdf._internal.column_index_level if any(len(key) != level for key in cols_sel): raise ValueError( 'All the key level should be the same as column index level.' 
) column_to_index = list( zip(self._kdf._internal.data_columns, self._kdf._internal.column_index)) columns = [] column_index = [] for key in cols_sel: found = False for column, idx in column_to_index: if idx == key or idx[0] == key: columns.append(_make_col(column)) column_index.append(idx) found = True if not found: raise KeyError("['{}'] not in index".format(key)) try: sdf = sdf.select(self._kdf._internal.index_scols + columns) index_columns = self._kdf._internal.index_columns data_columns = [ column for column in sdf.columns if column not in index_columns ] column_scols = [scol_for(sdf, col) for col in data_columns] internal = _InternalFrame(sdf=sdf, index_map=self._kdf._internal.index_map, column_index=column_index, column_scols=column_scols) kdf = DataFrame(internal) except AnalysisException: raise KeyError('[{}] don\'t exist in columns'.format( [col._jc.toString() for col in columns])) if cols_sel is not None and isinstance(cols_sel, spark.Column): from databricks.koalas.series import _col return _col(kdf) else: return kdf
def transform(self, func):
    """
    Apply function column-by-column to the GroupBy object.

    The function passed to `transform` must take a Series as its first
    argument and return a Series. The given function is executed for
    each series in each grouped data.

    While `transform` is a very flexible method, its downside is that
    using it can be quite a bit slower than using more specific methods
    like `agg`. Koalas offers a wide range of methods that will
    be much faster than using `transform` for their specific purposes, so try
    to use them before reaching for `transform`.

    .. note:: unlike pandas, it is required for ``func`` to specify a return type hint.

    .. note:: the series within ``func`` is actually a pandas series. Therefore,
        any pandas APIs within this function are allowed.

    Parameters
    ----------
    func : callable
        A callable that takes a Series as its first argument, and
        returns a Series.

    Returns
    -------
    applied : DataFrame

    See Also
    --------
    aggregate : Apply aggregate function to the GroupBy object.
    Series.apply : Apply a function to a Series.

    Examples
    --------
    >>> df = ks.DataFrame({'A': [0, 0, 1],
    ...                    'B': [1, 2, 3],
    ...                    'C': [4, 6, 5]}, columns=['A', 'B', 'C'])

    >>> g = df.groupby('A')

    Notice that ``g`` has two groups, ``0`` and ``1``.
    Calling `transform` in various ways, we can get different grouping results:

    Below, the function passed to `transform` takes a Series as
    its argument and returns a Series. `transform` applies the function on each series
    in each grouped data, and combines them into a new DataFrame:

    >>> def convert_to_string(x) -> ks.Series[str]:
    ...     return x.apply("a string {}".format)
    >>> g.transform(convert_to_string)  # doctest: +NORMALIZE_WHITESPACE
                B           C
    0  a string 1  a string 4
    1  a string 2  a string 6
    2  a string 3  a string 5

    >>> def plus_max(x) -> ks.Series[np.int]:
    ...     return x + x.max()
    >>> g.transform(plus_max)  # doctest: +NORMALIZE_WHITESPACE
       B   C
    0  3  10
    1  4  12
    2  6  10
    """
    # TODO: codes here are similar with GroupBy.apply. Needs to deduplicate.
    if not isinstance(func, Callable):
        raise TypeError("%s object is not callable" % type(func))

    assert callable(func), "the first argument should be a callable function."
    spec = inspect.getfullargspec(func)
    return_sig = spec.annotations.get("return", None)
    if return_sig is None:
        raise ValueError("Given function must have return type hint; however, not found.")

    return_type = _infer_return_type(func).tpe
    input_groupnames = [s.name for s in self._groupkeys]
    data_columns = self._kdf._internal.data_columns
    return_schema = StructType([
        StructField(c, return_type) for c in data_columns if c not in input_groupnames])

    index_columns = self._kdf._internal.index_columns
    index_names = self._kdf._internal.index_names
    data_columns = self._kdf._internal.data_columns

    def rename_output(pdf):
        # TODO: This logic below was borrowed from `DataFrame.pandas_df` to set the index
        #   within each pdf properly. we might have to deduplicate it.
        import pandas as pd

        if len(index_columns) > 0:
            append = False
            for index_field in index_columns:
                drop = index_field not in data_columns
                pdf = pdf.set_index(index_field, drop=drop, append=append)
                append = True
            pdf = pdf[data_columns]

        if len(index_names) > 0:
            if isinstance(pdf.index, pd.MultiIndex):
                pdf.index.names = index_names
            else:
                pdf.index.name = index_names[0]

        # pandas GroupBy.transform drops grouping columns.
        pdf = pdf.drop(columns=input_groupnames)
        pdf = pdf.transform(func)
        # Remaps to the original name, positionally.
        pdf = pdf.rename(columns=dict(zip(pdf.columns, return_schema.fieldNames())))
        return pdf

    grouped_map_func = pandas_udf(return_schema, PandasUDFType.GROUPED_MAP)(rename_output)

    sdf = self._kdf._sdf
    input_groupkeys = [s._scol for s in self._groupkeys]
    sdf = sdf.groupby(*input_groupkeys).apply(grouped_map_func)
    internal = _InternalFrame(sdf=sdf,
                              data_columns=return_schema.fieldNames(),
                              index_map=[])  # index is lost.
    return DataFrame(internal)
def value_counts(self, sort=None, ascending=None, dropna=True): """ Compute group sizes. Parameters ---------- sort : boolean, default None Sort by frequencies. ascending : boolean, default False Sort in ascending order. dropna : boolean, default True Don't include counts of NaN. See Also -------- databricks.koalas.Series.groupby databricks.koalas.DataFrame.groupby Examples -------- >>> df = ks.DataFrame({'A': [1, 2, 2, 3, 3, 3], ... 'B': [1, 1, 2, 3, 3, 3]}, ... columns=['A', 'B']) >>> df A B 0 1 1 1 2 1 2 2 2 3 3 3 4 3 3 5 3 3 >>> df.groupby('A')['B'].value_counts().sort_index() # doctest: +NORMALIZE_WHITESPACE A B 1 1 1 2 1 1 2 1 3 3 3 Name: B, dtype: int64 """ groupkeys = self._groupkeys + self._agg_columns groupkey_cols = [ s._scol.alias('__index_level_{}__'.format(i)) for i, s in enumerate(groupkeys) ] sdf = self._kdf._sdf agg_column = self._agg_columns[0].name sdf = sdf.groupby(*groupkey_cols).count().withColumnRenamed( 'count', agg_column) if sort: if ascending: sdf = sdf.orderBy(F.col(agg_column).asc()) else: sdf = sdf.orderBy(F.col(agg_column).desc()) internal = _InternalFrame(sdf=sdf, data_columns=[agg_column], index_map=[('__index_level_{}__'.format(i), s.name) for i, s in enumerate(groupkeys)]) return _col(DataFrame(internal))
def __getitem__(self, key): from databricks.koalas.frame import DataFrame from databricks.koalas.series import Series if self._is_series: if isinstance(key, tuple): if len(key) > 1: raise SparkPandasIndexingError('Too many indexers') key = key[0] if isinstance(key, Series) and key._kdf is not self._kdf_or_kser._kdf: kdf = self._kdf_or_kser.to_frame() kdf['__temp_col__'] = key return type(self)( kdf[self._kdf_or_kser.name])[kdf['__temp_col__']] cond, limit = self._select_rows(key) if cond is None and limit is None: return self._kdf_or_kser column_index = self._internal.column_index column_scols = self._internal.column_scols returns_series = True else: assert self._is_df if isinstance(key, tuple): if len(key) != 2: raise SparkPandasIndexingError( "Only accepts pairs of candidates") rows_sel, cols_sel = key else: rows_sel = key cols_sel = None if isinstance(rows_sel, Series) and rows_sel._kdf is not self._kdf_or_kser: kdf = self._kdf_or_kser.copy() kdf['__temp_col__'] = rows_sel return type(self)(kdf)[kdf['__temp_col__'], cols_sel][list( self._kdf_or_kser.columns)] cond, limit = self._select_rows(rows_sel) column_index, column_scols, returns_series = self._select_cols( cols_sel) if cond is None and limit is None and returns_series: return Series(self._internal.copy( scol=column_scols[0], column_index=[column_index[0]]), anchor=self._kdf_or_kser) try: sdf = self._internal._sdf if cond is not None: sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(cond) if limit is not None: if limit >= 0: sdf = sdf.limit(limit) else: sdf = sdf.limit(sdf.count() + limit) sdf = sdf.select(self._internal.index_scols + column_scols) if self._internal.column_index_names is None: column_index_names = None else: # Manage column index names level = column_index_level(column_index) column_index_names = self._internal.column_index_names[-level:] internal = _InternalFrame(sdf=sdf, index_map=self._internal.index_map, column_index=column_index, column_index_names=column_index_names) kdf = DataFrame(internal) except AnalysisException: raise KeyError('[{}] don\'t exist in columns'.format( [col._jc.toString() for col in column_scols])) if returns_series: return Series( kdf._internal.copy(scol=kdf._internal.column_scols[0]), anchor=kdf) else: return kdf
def sort_values(self, ascending=True): """ Return a sorted copy of the index. .. note:: This method is not supported for pandas when index has NaN value. pandas raises unexpected TypeError, but we support treating NaN as the smallest value. Parameters ---------- ascending : bool, default True Should the index values be sorted in an ascending order. Returns ------- sorted_index : ks.Index or ks.MultiIndex Sorted copy of the index. See Also -------- Series.sort_values : Sort values of a Series. DataFrame.sort_values : Sort values in a DataFrame. Examples -------- >>> idx = ks.Index([10, 100, 1, 1000]) >>> idx Int64Index([10, 100, 1, 1000], dtype='int64') Sort values in ascending order (default behavior). >>> idx.sort_values() Int64Index([1, 10, 100, 1000], dtype='int64') Sort values in descending order. >>> idx.sort_values(ascending=False) Int64Index([1000, 100, 10, 1], dtype='int64') Support for MultiIndex. >>> kidx = ks.MultiIndex.from_tuples([('a', 'x', 1), ('c', 'y', 2), ('b', 'z', 3)]) >>> kidx # doctest: +SKIP MultiIndex([('a', 'x', 1), ('c', 'y', 2), ('b', 'z', 3)], ) >>> kidx.sort_values() # doctest: +SKIP MultiIndex([('a', 'x', 1), ('b', 'z', 3), ('c', 'y', 2)], ) >>> kidx.sort_values(ascending=False) # doctest: +SKIP MultiIndex([('c', 'y', 2), ('b', 'z', 3), ('a', 'x', 1)], ) """ sdf = self._internal.sdf sdf = sdf.orderBy(self._internal.index_scols, ascending=ascending) internal = _InternalFrame(sdf=sdf.select(self._internal.index_scols), index_map=self._kdf._internal.index_map) result = DataFrame(internal).index return result
def __getitem__(self, key): from databricks.koalas.frame import DataFrame from databricks.koalas.series import Series rows_sel, cols_sel = _unfold(key, self._kdf_or_kser if self._is_series else None) sdf = self._internal.sdf cond, limit = self._select_rows(rows_sel) if cond is not None: sdf = sdf.where(cond) if limit is not None: if limit >= 0: sdf = sdf.limit(limit) else: sdf = sdf.limit(sdf.count() + limit) # make cols_sel a 1-tuple of string if a single string if isinstance(cols_sel, Series) and cols_sel._equals(self._kdf_or_kser): columns = cols_sel._internal.column_scols column_index = cols_sel._internal.column_index elif isinstance(cols_sel, int): columns = [self._internal.column_scols[cols_sel]] column_index = [self._internal.column_index[cols_sel]] elif cols_sel is None or cols_sel == slice(None): columns = self._internal.column_scols column_index = self._internal.column_index elif isinstance(cols_sel, slice): if all(s is None or isinstance(s, int) for s in (cols_sel.start, cols_sel.stop, cols_sel.step)): columns = self._internal.column_scols[cols_sel] column_index = self._internal.column_index[cols_sel] else: not_none = cols_sel.start if cols_sel.start is not None \ else cols_sel.stop if cols_sel.stop is not None else cols_sel.step raise TypeError('cannot do slice indexing with these indexers {} of {}' .format(not_none, type(not_none))) elif is_list_like(cols_sel): if all(isinstance(s, bool) for s in cols_sel): cols_sel = [i for i, s in enumerate(cols_sel) if s] if all(isinstance(s, int) for s in cols_sel): columns = [self._internal.column_scols[s] for s in cols_sel] column_index = [self._internal.column_index[s] for s in cols_sel] else: raise TypeError('cannot perform reduce with flexible type') else: raise ValueError("Location based indexing can only have [integer, integer slice, " "listlike of integers, boolean array] types, got {}".format(cols_sel)) try: sdf = sdf.select(self._internal.index_scols + columns) internal = _InternalFrame(sdf=sdf, index_map=self._internal.index_map, column_index=column_index, column_index_names=self._internal.column_index_names) kdf = DataFrame(internal) except AnalysisException: raise KeyError('[{}] don\'t exist in columns' .format([col._jc.toString() for col in columns])) if cols_sel is not None and isinstance(cols_sel, (Series, int)): from databricks.koalas.series import _col return _col(kdf) else: return kdf
def __getitem__(self, key): from databricks.koalas.frame import DataFrame from databricks.koalas.series import Series if self._is_series: if isinstance(key, Series) and key._kdf is not self._kdf_or_kser._kdf: kdf = self._kdf_or_kser.to_frame() kdf["__temp_col__"] = key return type(self)( kdf[self._kdf_or_kser.name])[kdf["__temp_col__"]] cond, limit, remaining_index = self._select_rows(key) if cond is None and limit is None: return self._kdf_or_kser column_labels = self._internal.column_labels column_scols = self._internal.column_scols returns_series = True else: assert self._is_df if isinstance(key, tuple): if len(key) != 2: raise SparkPandasIndexingError( "Only accepts pairs of candidates") rows_sel, cols_sel = key else: rows_sel = key cols_sel = None if isinstance(rows_sel, Series) and rows_sel._kdf is not self._kdf_or_kser: kdf = self._kdf_or_kser.copy() kdf["__temp_col__"] = rows_sel return type(self)(kdf)[kdf["__temp_col__"], cols_sel][list( self._kdf_or_kser.columns)] cond, limit, remaining_index = self._select_rows(rows_sel) column_labels, column_scols, returns_series = self._select_cols( cols_sel) if cond is None and limit is None and returns_series: return Series( self._internal.copy(scol=column_scols[0], column_labels=[column_labels[0]]), anchor=self._kdf_or_kser, ) if remaining_index is not None: index_scols = self._internal.index_scols[-remaining_index:] index_map = self._internal.index_map[-remaining_index:] else: index_scols = self._internal.index_scols index_map = self._internal.index_map if self._internal.column_label_names is None: column_label_names = None else: # Manage column index names level = column_labels_level(column_labels) column_label_names = self._internal.column_label_names[-level:] try: sdf = self._internal._sdf if cond is not None: sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(cond) if limit is not None: if limit >= 0: sdf = sdf.limit(limit) else: sdf = sdf.limit(sdf.count() + limit) sdf = sdf.select(index_scols + column_scols) except AnalysisException: raise KeyError("[{}] don't exist in columns".format( [col._jc.toString() for col in column_scols])) internal = _InternalFrame( sdf=sdf, index_map=index_map, column_labels=column_labels, column_label_names=column_label_names, ) kdf = DataFrame(internal) if returns_series: kdf_or_kser = Series( kdf._internal.copy(scol=kdf._internal.column_scols[0]), anchor=kdf) else: kdf_or_kser = kdf if remaining_index is not None and remaining_index == 0: pdf_or_pser = kdf_or_kser.head(2).to_pandas() length = len(pdf_or_pser) if length == 0: raise KeyError(name_like_string(key)) elif length == 1: return pdf_or_pser.iloc[0] else: return kdf_or_kser else: return kdf_or_kser
def apply(self, func):
    """
    Apply function `func` group-wise and combine the results together.

    The function passed to `apply` must take a DataFrame as its first
    argument and return a DataFrame. `apply` will
    then take care of combining the results back together into a single
    dataframe. `apply` is therefore a highly flexible
    grouping method.

    While `apply` is a very flexible method, its downside is that
    using it can be quite a bit slower than using more specific methods
    like `agg` or `transform`. Koalas offers a wide range of methods that will
    be much faster than using `apply` for their specific purposes, so try to
    use them before reaching for `apply`.

    .. note:: unlike pandas, it is required for ``func`` to specify a return type hint.

    .. note:: the output column names are `c0, c1, c2 ... cn`. These names
        are positionally mapped to the returned DataFrame in ``func``. See examples below.

    .. note:: the dataframe within ``func`` is actually a pandas dataframe. Therefore,
        any pandas APIs within this function are allowed.

    Parameters
    ----------
    func : callable
        A callable that takes a DataFrame as its first argument, and
        returns a dataframe.

    Returns
    -------
    applied : DataFrame

    See Also
    --------
    aggregate : Apply aggregate function to the GroupBy object.
    Series.apply : Apply a function to a Series.

    Examples
    --------
    >>> df = ks.DataFrame({'A': 'a a b'.split(),
    ...                    'B': [1, 2, 3],
    ...                    'C': [4, 6, 5]}, columns=['A', 'B', 'C'])

    >>> g = df.groupby('A')

    Notice that ``g`` has two groups, ``a`` and ``b``.
    Calling `apply` in various ways, we can get different grouping results:

    Below, the function passed to `apply` takes a DataFrame as
    its argument and returns a DataFrame. `apply` combines the result for
    each group together into a new DataFrame:

    >>> def pandas_div_sum(x) -> ks.DataFrame[float, float]:
    ...     return x[['B', 'C']] / x[['B', 'C']].sum()
    >>> g.apply(pandas_div_sum)  # doctest: +NORMALIZE_WHITESPACE
             c0   c1
    0  1.000000  1.0
    1  0.333333  0.4
    2  0.666667  0.6

    >>> def plus_max(x) -> ks.DataFrame[str, np.int, np.int]:
    ...     return x + x.max()
    >>> g.apply(plus_max)  # doctest: +NORMALIZE_WHITESPACE
       c0  c1  c2
    0  bb   6  10
    1  aa   3  10
    2  aa   4  12
    """
    if not isinstance(func, Callable):
        raise TypeError("%s object is not callable" % type(func))

    assert callable(func), "the first argument should be a callable function."
    spec = inspect.getfullargspec(func)
    return_sig = spec.annotations.get("return", None)
    if return_sig is None:
        raise ValueError("Given function must have return type hint; however, not found.")

    return_schema = _infer_return_type(func).tpe

    index_columns = self._kdf._internal.index_columns
    index_names = self._kdf._internal.index_names
    data_columns = self._kdf._internal.data_columns

    def rename_output(pdf):
        # TODO: This logic below was borrowed from `DataFrame.pandas_df` to set the index
        #   within each pdf properly. we might have to deduplicate it.
        import pandas as pd

        if len(index_columns) > 0:
            append = False
            for index_field in index_columns:
                drop = index_field not in data_columns
                pdf = pdf.set_index(index_field, drop=drop, append=append)
                append = True
            pdf = pdf[data_columns]

        if len(index_names) > 0:
            if isinstance(pdf.index, pd.MultiIndex):
                pdf.index.names = index_names
            else:
                pdf.index.name = index_names[0]

        pdf = func(pdf)
        # For now, just positionally map the column names to given schema's.
        pdf = pdf.rename(columns=dict(zip(pdf.columns, return_schema.fieldNames())))
        return pdf

    grouped_map_func = pandas_udf(return_schema, PandasUDFType.GROUPED_MAP)(rename_output)

    sdf = self._kdf._sdf
    input_groupkeys = [s._scol for s in self._groupkeys]
    sdf = sdf.groupby(*input_groupkeys).apply(grouped_map_func)
    internal = _InternalFrame(sdf=sdf,
                              data_columns=return_schema.fieldNames(),
                              index_map=[])  # index is lost.
    return DataFrame(internal)