示例#1
0
    def __getitem__(self, key):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.indexes import Index
        from databricks.koalas.series import Series

        def raiseNotImplemented(description):
            raise SparkPandasNotImplementedError(
                description=description,
                pandas_function=".iloc[..., ...]",
                spark_target_function="select, where")

        rows_sel, cols_sel = _unfold(key, self._ks)

        sdf = self._kdf._sdf
        if isinstance(rows_sel, Index):
            sdf_for_check_schema = sdf.select(rows_sel._scol)
            assert isinstance(sdf_for_check_schema.schema.fields[0].dataType, BooleanType), \
                (str(sdf_for_check_schema), sdf_for_check_schema.schema.fields[0].dataType)
            sdf = sdf.where(rows_sel._scol)
        elif isinstance(rows_sel, slice):
            if rows_sel == slice(None):
                # If slice is None - select everything, so nothing to do
                pass
            elif (rows_sel.start is not None) or (rows_sel.step is not None):
                raiseNotImplemented("Cannot use start or step with Spark.")
            elif not isinstance(rows_sel.stop, int):
                raise TypeError(
                    "cannot do slice indexing with these indexers [{}] of {}".
                    format(rows_sel.stop, type(rows_sel.stop)))
            elif rows_sel.stop >= 0:
                sdf = sdf.limit(rows_sel.stop)
            else:
                sdf = sdf.limit(sdf.count() + rows_sel.stop)
        else:
            raiseNotImplemented(
                ".iloc requires numeric slice or conditional boolean Index, "
                "got {}".format(rows_sel))

        # make cols_sel a 1-tuple of string if a single string
        if isinstance(cols_sel, Series):
            columns = [cols_sel._scol]
        elif isinstance(cols_sel, int):
            columns = [self._kdf._internal.data_scols[cols_sel]]
        elif cols_sel is None or cols_sel == slice(None):
            columns = self._kdf._internal.data_scols
        elif isinstance(cols_sel, slice):
            if all(s is None or isinstance(s, int)
                   for s in (cols_sel.start, cols_sel.stop, cols_sel.step)):
                columns = self._kdf._internal.data_scols[cols_sel]
            else:
                not_none = cols_sel.start if cols_sel.start is not None \
                    else cols_sel.stop if cols_sel.stop is not None else cols_sel.step
                raise TypeError(
                    'cannot do slice indexing with these indexers {} of {}'.
                    format(not_none, type(not_none)))
        elif is_list_like(cols_sel):
            if all(isinstance(s, int) for s in cols_sel):
                columns = [
                    self._kdf._internal.scol_for(col)
                    for col in self._kdf.columns[cols_sel]
                ]
            else:
                raise TypeError('cannot perform reduce with flexible type')
        else:
            raise ValueError(
                "Location based indexing can only have [integer, integer slice, "
                "listlike of integers, boolean array] types, got {}".format(
                    cols_sel))

        try:
            kdf = DataFrame(
                sdf.select(self._kdf._internal.index_scols + columns))
        except AnalysisException:
            raise KeyError('[{}] don\'t exist in columns'.format(
                [col._jc.toString() for col in columns]))

        column_index = self._kdf._internal.column_index
        if column_index is not None:
            if cols_sel is not None and isinstance(cols_sel, (Series, int)):
                column_index = None
            else:
                column_index = pd.MultiIndex.from_tuples(
                    column_index)[cols_sel].tolist()

        kdf._internal = kdf._internal.copy(
            data_columns=kdf._internal.data_columns[-len(columns):],
            index_map=self._kdf._internal.index_map,
            column_index=column_index)
        if cols_sel is not None and isinstance(cols_sel, (Series, int)):
            from databricks.koalas.series import _col
            return _col(kdf)
        else:
            return kdf
示例#2
0
    def __getitem__(self, key):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        def raiseNotImplemented(description):
            raise SparkPandasNotImplementedError(
                description=description,
                pandas_function=".loc[..., ...]",
                spark_target_function="select, where")

        rows_sel, cols_sel = _unfold(key, self._ks)

        sdf = self._kdf._sdf
        if isinstance(rows_sel, Series):
            sdf_for_check_schema = sdf.select(rows_sel._scol)
            assert isinstance(sdf_for_check_schema.schema.fields[0].dataType, BooleanType), \
                (str(sdf_for_check_schema), sdf_for_check_schema.schema.fields[0].dataType)
            sdf = sdf.where(rows_sel._scol)
        elif isinstance(rows_sel, slice):
            if rows_sel.step is not None:
                raiseNotImplemented("Cannot use step with Spark.")
            if rows_sel == slice(None):
                # If slice is None - select everything, so nothing to do
                pass
            elif len(self._kdf._internal.index_columns) == 0:
                raiseNotImplemented(
                    "Cannot use slice for Spark if no index provided.")
            elif len(self._kdf._internal.index_columns) == 1:
                start = rows_sel.start
                stop = rows_sel.stop

                index_column = self._kdf.index.to_series()
                index_data_type = index_column.schema[0].dataType
                cond = []
                if start is not None:
                    cond.append(index_column._scol >= F.lit(start).cast(
                        index_data_type))
                if stop is not None:
                    cond.append(
                        index_column._scol <= F.lit(stop).cast(index_data_type)
                    )

                if len(cond) > 0:
                    sdf = sdf.where(reduce(lambda x, y: x & y, cond))
            else:
                raiseNotImplemented(
                    "Cannot use slice for MultiIndex with Spark.")
        elif isinstance(rows_sel, str):
            raiseNotImplemented(
                "Cannot use a scalar value for row selection with Spark.")
        else:
            try:
                rows_sel = list(rows_sel)
            except TypeError:
                raiseNotImplemented(
                    "Cannot use a scalar value for row selection with Spark.")
            if len(rows_sel) == 0:
                sdf = sdf.where(F.lit(False))
            elif len(self._kdf._internal.index_columns) == 1:
                index_column = self._kdf.index.to_series()
                index_data_type = index_column.schema[0].dataType
                if len(rows_sel) == 1:
                    sdf = sdf.where(index_column._scol == F.lit(
                        rows_sel[0]).cast(index_data_type))
                else:
                    sdf = sdf.where(
                        index_column._scol.isin([
                            F.lit(r).cast(index_data_type) for r in rows_sel
                        ]))
            else:
                raiseNotImplemented(
                    "Cannot select with MultiIndex with Spark.")

        # make cols_sel a 1-tuple of string if a single string
        column_index = self._kdf._internal.column_index
        if isinstance(cols_sel, str):
            if column_index is not None:
                return self[rows_sel, [cols_sel]]._get_from_multiindex_column(
                    (cols_sel, ))
            else:
                cols_sel = _make_col(cols_sel)
        elif isinstance(cols_sel, Series):
            cols_sel = _make_col(cols_sel)
        elif isinstance(cols_sel, slice) and cols_sel != slice(None):
            raise raiseNotImplemented(
                "Can only select columns either by name or reference or all")
        elif isinstance(cols_sel, slice) and cols_sel == slice(None):
            cols_sel = None

        if cols_sel is None:
            columns = self._kdf._internal.data_scols
        elif isinstance(cols_sel, spark.Column):
            columns = [cols_sel]
        else:
            if column_index is not None:
                column_to_index = list(
                    zip(self._kdf._internal.data_columns,
                        self._kdf._internal.column_index))
                columns, column_index = zip(
                    *[(_make_col(column), idx) for key in cols_sel
                      for column, idx in column_to_index if idx[0] == key])
                columns, column_index = list(columns), list(column_index)
            else:
                columns = [_make_col(c) for c in cols_sel]

        try:
            kdf = DataFrame(
                sdf.select(self._kdf._internal.index_scols + columns))
        except AnalysisException:
            raise KeyError('[{}] don\'t exist in columns'.format(
                [col._jc.toString() for col in columns]))

        kdf._internal = kdf._internal.copy(
            data_columns=kdf._internal.data_columns[-len(columns):],
            index_map=self._kdf._internal.index_map,
            column_index=column_index)
        if cols_sel is not None and isinstance(cols_sel, spark.Column):
            from databricks.koalas.series import _col
            return _col(kdf)
        else:
            return kdf