def __getitem__(self, key):
    """Positional indexing (``.iloc``-style) backed by a Spark DataFrame.

    ``key`` is unfolded into ``(rows_sel, cols_sel)``. Row selection only
    supports a boolean ``Index`` condition or a limited numeric slice
    (``[:n]`` / ``[:-n]``); column selection supports a ``Series``, an
    integer position, a slice of integer positions, or a list-like of
    integer positions. Returns a ``DataFrame``, or a ``Series`` (via
    ``_col``) when a single column was selected.

    Raises ``SparkPandasNotImplementedError`` for selections Spark cannot
    express, ``TypeError``/``ValueError`` for invalid indexers, and
    ``KeyError`` when the selected columns do not exist.
    """
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.indexes import Index
    from databricks.koalas.series import Series

    def raiseNotImplemented(description):
        # Helper that always raises; used for selections with no Spark analogue.
        raise SparkPandasNotImplementedError(
            description=description,
            pandas_function=".iloc[..., ...]",
            spark_target_function="select, where")

    rows_sel, cols_sel = _unfold(key, self._ks)

    sdf = self._kdf._sdf
    if isinstance(rows_sel, Index):
        # Boolean-condition row filter: the Index column must be BooleanType.
        sdf_for_check_schema = sdf.select(rows_sel._scol)
        assert isinstance(sdf_for_check_schema.schema.fields[0].dataType,
                          BooleanType), \
            (str(sdf_for_check_schema),
             sdf_for_check_schema.schema.fields[0].dataType)
        sdf = sdf.where(rows_sel._scol)
    elif isinstance(rows_sel, slice):
        if rows_sel == slice(None):
            # If slice is None - select everything, so nothing to do
            pass
        elif (rows_sel.start is not None) or (rows_sel.step is not None):
            # Spark rows are unordered; only a plain head slice maps to limit().
            raiseNotImplemented("Cannot use start or step with Spark.")
        elif not isinstance(rows_sel.stop, int):
            raise TypeError(
                "cannot do slice indexing with these indexers [{}] of {}".
                format(rows_sel.stop, type(rows_sel.stop)))
        elif rows_sel.stop >= 0:
            # [:n] -> take the first n rows.
            sdf = sdf.limit(rows_sel.stop)
        else:
            # [:-n] -> drop the last n rows; requires a count() (full scan).
            sdf = sdf.limit(sdf.count() + rows_sel.stop)
    else:
        raiseNotImplemented(
            ".iloc requires numeric slice or conditional boolean Index, "
            "got {}".format(rows_sel))

    # make cols_sel a 1-tuple of string if a single string
    if isinstance(cols_sel, Series):
        columns = [cols_sel._scol]
    elif isinstance(cols_sel, int):
        columns = [self._kdf._internal.data_scols[cols_sel]]
    elif cols_sel is None or cols_sel == slice(None):
        # No column restriction: keep every data column.
        columns = self._kdf._internal.data_scols
    elif isinstance(cols_sel, slice):
        if all(s is None or isinstance(s, int)
               for s in (cols_sel.start, cols_sel.stop, cols_sel.step)):
            columns = self._kdf._internal.data_scols[cols_sel]
        else:
            # Report the first non-None, non-int component in the error.
            not_none = cols_sel.start if cols_sel.start is not None \
                else cols_sel.stop if cols_sel.stop is not None \
                else cols_sel.step
            raise TypeError(
                'cannot do slice indexing with these indexers {} of {}'.
                format(not_none, type(not_none)))
    elif is_list_like(cols_sel):
        if all(isinstance(s, int) for s in cols_sel):
            # Positional list: resolve names via the pandas-style columns index.
            columns = [
                self._kdf._internal.scol_for(col)
                for col in self._kdf.columns[cols_sel]
            ]
        else:
            raise TypeError('cannot perform reduce with flexible type')
    else:
        raise ValueError(
            "Location based indexing can only have [integer, integer slice, "
            "listlike of integers, boolean array] types, got {}".format(
                cols_sel))

    try:
        kdf = DataFrame(
            sdf.select(self._kdf._internal.index_scols + columns))
    except AnalysisException:
        raise KeyError('[{}] don\'t exist in columns'.format(
            [col._jc.toString() for col in columns]))

    column_index = self._kdf._internal.column_index
    if column_index is not None:
        if cols_sel is not None and isinstance(cols_sel, (Series, int)):
            # Single-column result becomes a Series; multi-index labels dropped.
            column_index = None
        else:
            # Narrow the column MultiIndex to the selected positions.
            column_index = pd.MultiIndex.from_tuples(
                column_index)[cols_sel].tolist()

    # Selected data columns are the trailing ones after the index columns.
    kdf._internal = kdf._internal.copy(
        data_columns=kdf._internal.data_columns[-len(columns):],
        index_map=self._kdf._internal.index_map,
        column_index=column_index)
    if cols_sel is not None and isinstance(cols_sel, (Series, int)):
        # Single column selected: unwrap the one-column frame into a Series.
        from databricks.koalas.series import _col
        return _col(kdf)
    else:
        return kdf
def __getitem__(self, key):
    """Label indexing (``.loc``-style) backed by a Spark DataFrame.

    ``key`` is unfolded into ``(rows_sel, cols_sel)``. Row selection
    supports a boolean ``Series`` condition, a step-less label slice
    against a single-column index, or a list-like of index labels.
    Column selection supports a name, a ``Series``, a list of names, or
    ``slice(None)`` (all columns). Returns a ``DataFrame``, or a
    ``Series`` (via ``_col``) when a single column was selected.

    Raises ``SparkPandasNotImplementedError`` for selections Spark cannot
    express and ``KeyError`` when the selected columns do not exist.
    """
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series

    def raiseNotImplemented(description):
        # Helper that always raises; used for selections with no Spark analogue.
        raise SparkPandasNotImplementedError(
            description=description,
            pandas_function=".loc[..., ...]",
            spark_target_function="select, where")

    rows_sel, cols_sel = _unfold(key, self._ks)

    sdf = self._kdf._sdf
    if isinstance(rows_sel, Series):
        # Boolean-condition row filter: the Series column must be BooleanType.
        sdf_for_check_schema = sdf.select(rows_sel._scol)
        assert isinstance(sdf_for_check_schema.schema.fields[0].dataType,
                          BooleanType), \
            (str(sdf_for_check_schema),
             sdf_for_check_schema.schema.fields[0].dataType)
        sdf = sdf.where(rows_sel._scol)
    elif isinstance(rows_sel, slice):
        if rows_sel.step is not None:
            raiseNotImplemented("Cannot use step with Spark.")
        if rows_sel == slice(None):
            # If slice is None - select everything, so nothing to do
            pass
        elif len(self._kdf._internal.index_columns) == 0:
            raiseNotImplemented(
                "Cannot use slice for Spark if no index provided.")
        elif len(self._kdf._internal.index_columns) == 1:
            # Label slice on a single-column index -> range predicate.
            # NOTE: like pandas .loc, both endpoints are inclusive.
            start = rows_sel.start
            stop = rows_sel.stop
            index_column = self._kdf.index.to_series()
            index_data_type = index_column.schema[0].dataType
            cond = []
            if start is not None:
                cond.append(index_column._scol >= F.lit(start).cast(
                    index_data_type))
            if stop is not None:
                cond.append(
                    index_column._scol <= F.lit(stop).cast(index_data_type)
                )
            if len(cond) > 0:
                sdf = sdf.where(reduce(lambda x, y: x & y, cond))
        else:
            raiseNotImplemented(
                "Cannot use slice for MultiIndex with Spark.")
    elif isinstance(rows_sel, str):
        raiseNotImplemented(
            "Cannot use a scalar value for row selection with Spark.")
    else:
        # Anything else must be an iterable of index labels.
        try:
            rows_sel = list(rows_sel)
        except TypeError:
            raiseNotImplemented(
                "Cannot use a scalar value for row selection with Spark.")
        if len(rows_sel) == 0:
            # Empty label list selects no rows.
            sdf = sdf.where(F.lit(False))
        elif len(self._kdf._internal.index_columns) == 1:
            index_column = self._kdf.index.to_series()
            index_data_type = index_column.schema[0].dataType
            if len(rows_sel) == 1:
                sdf = sdf.where(index_column._scol == F.lit(
                    rows_sel[0]).cast(index_data_type))
            else:
                sdf = sdf.where(
                    index_column._scol.isin([
                        F.lit(r).cast(index_data_type) for r in rows_sel
                    ]))
        else:
            raiseNotImplemented(
                "Cannot select with MultiIndex with Spark.")

    # make cols_sel a 1-tuple of string if a single string
    column_index = self._kdf._internal.column_index
    if isinstance(cols_sel, str):
        if column_index is not None:
            # Multi-level columns: select the top level, then drill down.
            return self[rows_sel, [cols_sel]]._get_from_multiindex_column(
                (cols_sel, ))
        else:
            cols_sel = _make_col(cols_sel)
    elif isinstance(cols_sel, Series):
        cols_sel = _make_col(cols_sel)
    elif isinstance(cols_sel, slice) and cols_sel != slice(None):
        # BUG FIX: was `raise raiseNotImplemented(...)`. The helper never
        # returns (it raises internally), so the outer `raise` was dead and
        # would be `raise None` (a TypeError) if the helper ever returned.
        raiseNotImplemented(
            "Can only select columns either by name or reference or all")
    elif isinstance(cols_sel, slice) and cols_sel == slice(None):
        cols_sel = None

    if cols_sel is None:
        # No column restriction: keep every data column.
        columns = self._kdf._internal.data_scols
    elif isinstance(cols_sel, spark.Column):
        columns = [cols_sel]
    else:
        if column_index is not None:
            # Resolve each requested top-level key against the column
            # MultiIndex, keeping every matching (column, index) pair.
            column_to_index = list(
                zip(self._kdf._internal.data_columns,
                    self._kdf._internal.column_index))
            columns, column_index = zip(
                *[(_make_col(column), idx) for key in cols_sel
                  for column, idx in column_to_index if idx[0] == key])
            columns, column_index = list(columns), list(column_index)
        else:
            columns = [_make_col(c) for c in cols_sel]

    try:
        kdf = DataFrame(
            sdf.select(self._kdf._internal.index_scols + columns))
    except AnalysisException:
        raise KeyError('[{}] don\'t exist in columns'.format(
            [col._jc.toString() for col in columns]))

    # Selected data columns are the trailing ones after the index columns.
    kdf._internal = kdf._internal.copy(
        data_columns=kdf._internal.data_columns[-len(columns):],
        index_map=self._kdf._internal.index_map,
        column_index=column_index)
    if cols_sel is not None and isinstance(cols_sel, spark.Column):
        # Single column selected: unwrap the one-column frame into a Series.
        from databricks.koalas.series import _col
        return _col(kdf)
    else:
        return kdf