Example #1
def _unfold(key, col):
    """ Return row selection and column selection pair.

    If col parameter is not None, the key should be row selection and the column selection will be
    the col parameter itself. Otherwise check the key contains column selection, and the selection
    is acceptable.
    """
    from databricks.koalas.series import Series
    if col is not None:
        if isinstance(key, tuple):
            if len(key) > 1:
                raise SparkPandasIndexingError('Too many indexers')
            key = key[0]
        rows_sel = key
        cols_sel = col._scol
    elif isinstance(key, tuple):
        if len(key) != 2:
            raise SparkPandasIndexingError("Only accepts pairs of candidates")
        rows_sel, cols_sel = key

        # convert a single column label or Series into a Spark column
        if isinstance(cols_sel, (str, Series)):
            cols_sel = _make_col(cols_sel)
        elif isinstance(cols_sel, slice) and cols_sel != slice(None):
            raise SparkPandasNotImplementedError(
                description="Can only select columns either by name or reference or all",
                pandas_function="loc",
                spark_target_function="select, where, withColumn")
        elif isinstance(cols_sel, slice) and cols_sel == slice(None):
            cols_sel = None
    else:
        rows_sel = key
        cols_sel = None

    return rows_sel, cols_sel
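
A minimal usage sketch of this helper. It assumes _unfold and SparkPandasIndexingError are importable from databricks.koalas.indexing and databricks.koalas.exceptions, as in the Koalas release this snippet was taken from; only the Spark-free paths are shown, so no active session is needed.

from databricks.koalas.indexing import _unfold
from databricks.koalas.exceptions import SparkPandasIndexingError

# A bare key with no explicit column: the column selection defaults to None.
assert _unfold(slice(1, 3), None) == (slice(1, 3), None)

# A (rows, columns) pair whose column part is slice(None) also yields None,
# meaning "select all columns".
assert _unfold((slice(1, 3), slice(None)), None) == (slice(1, 3), None)

# Anything other than a pair of indexers is rejected.
try:
    _unfold((1, 2, 3), None)
except SparkPandasIndexingError as e:
    print(e)  # Only accepts pairs of candidates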
Example #2
    def _select_rows(self, rows_sel):
        from databricks.koalas.indexes import Index

        if isinstance(rows_sel, tuple) and len(rows_sel) > 1:
            raise SparkPandasIndexingError("Too many indexers")
        elif isinstance(rows_sel, Index):
            assert isinstance(rows_sel.spark_type,
                              BooleanType), rows_sel.spark_type
            return rows_sel._scol, None, None
        elif isinstance(rows_sel, slice):
            if rows_sel == slice(None):
                # slice(None) selects everything, so there is nothing to do
                return None, None, None
            elif (rows_sel.start is not None) or (rows_sel.step is not None):
                iLocIndexer._raiseNotImplemented(
                    "Cannot use start or step with Spark.")
            elif not isinstance(rows_sel.stop, int):
                raise TypeError(
                    "cannot do slice indexing with these indexers [{}] of {}".
                    format(rows_sel.stop, type(rows_sel.stop)))
            else:
                return None, rows_sel.stop, None
        elif isinstance(rows_sel, int):
            sdf = self._internal.sdf
            return (sdf[self._sequence_col] == rows_sel), None, 0
        else:
            iLocIndexer._raiseNotImplemented(
                ".iloc requires numeric slice or conditional "
                "boolean Index, got {}".format(type(rows_sel)))
Example #3
def _unfold(key, kseries):
    """ Return row selection and column selection pair.

    If kseries parameter is not None, the key should be row selection and the column selection will
    be the kseries parameter.

    >>> s = ks.Series([1, 2, 3], name='a')
    >>> _unfold(slice(1, 2), s)
    (slice(1, 2, None), 0    1
    1    2
    2    3
    Name: a, dtype: int64)

    >>> _unfold((slice(1, 2), slice(None)), None)
    (slice(1, 2, None), slice(None, None, None))

    >>> _unfold((slice(1, 2), s), None)
    (slice(1, 2, None), 0    1
    1    2
    2    3
    Name: a, dtype: int64)

    >>> _unfold((slice(1, 2), 'col'), None)
    (slice(1, 2, None), 'col')
    """
    if kseries is not None:
        if isinstance(key, tuple):
            if len(key) > 1:
                raise SparkPandasIndexingError('Too many indexers')
            key = key[0]
        rows_sel = key
        cols_sel = kseries
    elif isinstance(key, tuple):
        if len(key) != 2:
            raise SparkPandasIndexingError("Only accepts pairs of candidates")
        rows_sel, cols_sel = key
    else:
        rows_sel = key
        cols_sel = None

    return rows_sel, cols_sel
Example #4
    def _select_rows(self, rows_sel):
        from databricks.koalas.series import Series

        if isinstance(rows_sel, Series):
            assert isinstance(rows_sel.spark_type,
                              BooleanType), rows_sel.spark_type
            return rows_sel._scol, None, None
        elif isinstance(rows_sel, slice):
            assert len(self._internal.index_columns) > 0
            if rows_sel.step is not None:
                LocIndexer._raiseNotImplemented("Cannot use step with Spark.")
            if rows_sel == slice(None):
                # slice(None) selects everything, so there is nothing to do
                return None, None, None
            elif len(self._internal.index_columns) == 1:
                sdf = self._internal.sdf
                index = self._kdf_or_kser.index
                index_column = index.to_series()
                index_data_type = index_column.spark_type
                start = rows_sel.start
                stop = rows_sel.stop

                # look up the '__natural_order__' values for the start and stop labels
                # so that the original row order is preserved.
                start_and_stop = (sdf.select(
                    index_column._scol, NATURAL_ORDER_COLUMN_NAME
                ).where(
                    (index_column._scol == F.lit(start).cast(index_data_type))
                    | (index_column._scol == F.lit(stop).cast(index_data_type))
                ).collect())

                start = [row[1] for row in start_and_stop if row[0] == start]
                start = start[0] if len(start) > 0 else None

                stop = [row[1] for row in start_and_stop if row[0] == stop]
                stop = stop[-1] if len(stop) > 0 else None

                cond = []
                if start is not None:
                    cond.append(
                        F.col(NATURAL_ORDER_COLUMN_NAME) >= F.lit(start).cast(
                            LongType()))
                if stop is not None:
                    cond.append(
                        F.col(NATURAL_ORDER_COLUMN_NAME) <= F.lit(stop).cast(
                            LongType()))

                # if the index is neither monotonically increasing nor decreasing
                # and the specified values don't exist in the index, raise a KeyError
                if (start is None and rows_sel.start is not None) or (
                        stop is None and rows_sel.stop is not None):
                    inc, dec = (sdf.select(
                        index_column._is_monotonic()._scol.alias(
                            "__increasing__"),
                        index_column._is_monotonic_decreasing()._scol.alias(
                            "__decreasing__"),
                    ).select(
                        F.min(F.coalesce("__increasing__", F.lit(True))),
                        F.min(F.coalesce("__decreasing__", F.lit(True))),
                    ).first())
                    if start is None and rows_sel.start is not None:
                        start = rows_sel.start
                        if inc is not False:
                            cond.append(index_column._scol >= F.lit(
                                start).cast(index_data_type))
                        elif dec is not False:
                            cond.append(index_column._scol <= F.lit(
                                start).cast(index_data_type))
                        else:
                            raise KeyError(rows_sel.start)
                    if stop is None and rows_sel.stop is not None:
                        stop = rows_sel.stop
                        if inc is not False:
                            cond.append(index_column._scol <= F.lit(stop).cast(
                                index_data_type))
                        elif dec is not False:
                            cond.append(index_column._scol >= F.lit(stop).cast(
                                index_data_type))
                        else:
                            raise KeyError(rows_sel.stop)

                if len(cond) > 0:
                    return reduce(lambda x, y: x & y, cond), None, None
            else:
                LocIndexer._raiseNotImplemented(
                    "Cannot use slice for MultiIndex with Spark.")
        elif is_list_like(rows_sel) and not isinstance(rows_sel, tuple):
            rows_sel = list(rows_sel)
            if len(rows_sel) == 0:
                return F.lit(False), None, None
            elif len(self._internal.index_columns) == 1:
                index_column = self._kdf_or_kser.index.to_series()
                index_data_type = index_column.spark_type
                if len(rows_sel) == 1:
                    return (
                        index_column._scol == F.lit(
                            rows_sel[0]).cast(index_data_type),
                        None,
                        None,
                    )
                else:
                    return (
                        index_column._scol.isin([
                            F.lit(r).cast(index_data_type) for r in rows_sel
                        ]),
                        None,
                        None,
                    )
            else:
                LocIndexer._raiseNotImplemented(
                    "Cannot select with MultiIndex with Spark.")
        else:
            if not isinstance(rows_sel, tuple):
                rows_sel = (rows_sel, )
            if len(rows_sel) > len(self._internal.index_map):
                raise SparkPandasIndexingError("Too many indexers")

            rows = [
                scol == value
                for scol, value in zip(self._internal.index_scols, rows_sel)
            ]
            return (
                reduce(lambda x, y: x & y, rows),
                None,
                len(self._internal.index_map) - len(rows_sel),
            )
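
For orientation, a rough sketch of the selections this LocIndexer variant turns into Spark conditions; it assumes a running Koalas/Spark environment and a single-level index, and the values are illustrative.

import databricks.koalas as ks

kdf = ks.DataFrame({"a": [1, 2, 3]}, index=[10, 20, 30])

kdf.loc[10:20]       # label slice: both endpoints inclusive, resolved via __natural_order__
kdf.loc[[10, 30]]    # list of labels: becomes an isin() condition on the index column
kdf.loc[20]          # single label: remaining_index == 0, so the matching row is materialized

# kdf.loc[10:20:2]   would raise: step cannot be used with Spark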
Example #5
    def __getitem__(self, key):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        if self._is_series:
            if isinstance(key,
                          Series) and key._kdf is not self._kdf_or_kser._kdf:
                kdf = self._kdf_or_kser.to_frame()
                kdf["__temp_col__"] = key
                return type(self)(
                    kdf[self._kdf_or_kser.name])[kdf["__temp_col__"]]

            cond, limit, remaining_index = self._select_rows(key)
            if cond is None and limit is None:
                return self._kdf_or_kser

            column_labels = self._internal.column_labels
            column_scols = self._internal.column_scols
            returns_series = True
        else:
            assert self._is_df
            if isinstance(key, tuple):
                if len(key) != 2:
                    raise SparkPandasIndexingError(
                        "Only accepts pairs of candidates")
                rows_sel, cols_sel = key
            else:
                rows_sel = key
                cols_sel = None

            if isinstance(rows_sel,
                          Series) and rows_sel._kdf is not self._kdf_or_kser:
                kdf = self._kdf_or_kser.copy()
                kdf["__temp_col__"] = rows_sel
                return type(self)(kdf)[kdf["__temp_col__"], cols_sel][list(
                    self._kdf_or_kser.columns)]

            cond, limit, remaining_index = self._select_rows(rows_sel)
            column_labels, column_scols, returns_series = self._select_cols(
                cols_sel)

            if cond is None and limit is None and returns_series:
                return Series(
                    self._internal.copy(scol=column_scols[0],
                                        column_labels=[column_labels[0]]),
                    anchor=self._kdf_or_kser,
                )

        if remaining_index is not None:
            index_scols = self._internal.index_scols[-remaining_index:]
            index_map = self._internal.index_map[-remaining_index:]
        else:
            index_scols = self._internal.index_scols
            index_map = self._internal.index_map

        if self._internal.column_label_names is None:
            column_label_names = None
        else:
            # Manage column index names
            level = column_labels_level(column_labels)
            column_label_names = self._internal.column_label_names[-level:]

        try:
            sdf = self._internal._sdf
            if cond is not None:
                sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(cond)
            if limit is not None:
                if limit >= 0:
                    sdf = sdf.limit(limit)
                else:
                    sdf = sdf.limit(sdf.count() + limit)

            sdf = sdf.select(index_scols + column_scols)
        except AnalysisException:
            raise KeyError("[{}] don't exist in columns".format(
                [col._jc.toString() for col in column_scols]))

        internal = _InternalFrame(
            sdf=sdf,
            index_map=index_map,
            column_labels=column_labels,
            column_label_names=column_label_names,
        )
        kdf = DataFrame(internal)

        if returns_series:
            kdf_or_kser = Series(
                kdf._internal.copy(scol=kdf._internal.column_scols[0]),
                anchor=kdf)
        else:
            kdf_or_kser = kdf

        if remaining_index is not None and remaining_index == 0:
            pdf_or_pser = kdf_or_kser.head(2).to_pandas()
            length = len(pdf_or_pser)
            if length == 0:
                raise KeyError(name_like_string(key))
            elif length == 1:
                return pdf_or_pser.iloc[0]
            else:
                return kdf_or_kser
        else:
            return kdf_or_kser
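
A short illustration of the main paths through this __getitem__, assuming a standard Koalas setup; the boolean mask is built from the same DataFrame, so the '__temp_col__' detour is not taken.

import databricks.koalas as ks

kdf = ks.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, index=["x", "y", "z"])

kdf.loc[kdf["a"] > 1]    # a boolean Series becomes `cond` and filters the Spark frame
kdf.loc[:, ["a"]]        # cond and limit are both None; a column list keeps a DataFrame
kdf.loc["x", "b"]        # full index key plus a single column: remaining_index == 0,
                         # head(2) is converted to pandas and the scalar 4 is returned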
Example #6
    def __getitem__(self, key):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        if self._is_series:
            if isinstance(key, tuple):
                if len(key) > 1:
                    raise SparkPandasIndexingError('Too many indexers')
                key = key[0]

            if isinstance(key,
                          Series) and key._kdf is not self._kdf_or_kser._kdf:
                kdf = self._kdf_or_kser.to_frame()
                kdf['__temp_col__'] = key
                return type(self)(
                    kdf[self._kdf_or_kser.name])[kdf['__temp_col__']]

            cond, limit = self._select_rows(key)
            if cond is None and limit is None:
                return self._kdf_or_kser

            column_index = self._internal.column_index
            column_scols = self._internal.column_scols
            returns_series = True
        else:
            assert self._is_df
            if isinstance(key, tuple):
                if len(key) != 2:
                    raise SparkPandasIndexingError(
                        "Only accepts pairs of candidates")
                rows_sel, cols_sel = key
            else:
                rows_sel = key
                cols_sel = None

            if isinstance(rows_sel,
                          Series) and rows_sel._kdf is not self._kdf_or_kser:
                kdf = self._kdf_or_kser.copy()
                kdf['__temp_col__'] = rows_sel
                return type(self)(kdf)[kdf['__temp_col__'], cols_sel][list(
                    self._kdf_or_kser.columns)]

            cond, limit = self._select_rows(rows_sel)
            column_index, column_scols, returns_series = self._select_cols(
                cols_sel)

            if cond is None and limit is None and returns_series:
                return Series(self._internal.copy(
                    scol=column_scols[0], column_index=[column_index[0]]),
                              anchor=self._kdf_or_kser)

        try:
            sdf = self._internal._sdf
            if cond is not None:
                sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(cond)
            if limit is not None:
                if limit >= 0:
                    sdf = sdf.limit(limit)
                else:
                    sdf = sdf.limit(sdf.count() + limit)

            sdf = sdf.select(self._internal.index_scols + column_scols)

            if self._internal.column_index_names is None:
                column_index_names = None
            else:
                # Manage column index names
                level = column_index_level(column_index)
                column_index_names = self._internal.column_index_names[-level:]

            internal = _InternalFrame(sdf=sdf,
                                      index_map=self._internal.index_map,
                                      column_index=column_index,
                                      column_index_names=column_index_names)
            kdf = DataFrame(internal)
        except AnalysisException:
            raise KeyError('[{}] don\'t exist in columns'.format(
                [col._jc.toString() for col in column_scols]))

        if returns_series:
            return Series(
                kdf._internal.copy(scol=kdf._internal.column_scols[0]),
                anchor=kdf)
        else:
            return kdf
Example #7
    def _select_rows(self, rows_sel):
        from databricks.koalas.indexes import Index

        if isinstance(rows_sel, tuple) and len(rows_sel) > 1:
            raise SparkPandasIndexingError("Too many indexers")
        elif isinstance(rows_sel, Index):
            assert isinstance(rows_sel.spark_type, BooleanType), rows_sel.spark_type
            return rows_sel._scol, None, None
        elif isinstance(rows_sel, slice):

            def verify_type(i):
                if not isinstance(i, int):
                    raise TypeError(
                        "cannot do slice indexing with these indexers [{}] of {}".format(i, type(i))
                    )

            has_negative = False
            start = rows_sel.start
            if start is not None:
                verify_type(start)
                if start == 0:
                    start = None
                elif start < 0:
                    has_negative = True
            stop = rows_sel.stop
            if stop is not None:
                verify_type(stop)
                if stop == 0:
                    stop = None
                elif stop < 0:
                    has_negative = True

            step = rows_sel.step
            if step is not None:
                verify_type(step)
                if step == 0:
                    raise ValueError("slice step cannot be zero")
            else:
                step = 1

            if start is None and step == 1:
                return None, stop, None

            sdf = self._internal.spark_frame
            sequence_scol = sdf[self._sequence_col]

            if has_negative or (step < 0 and start is None):
                cnt = sdf.count()

            cond = []
            if start is not None:
                if start < 0:
                    start = start + cnt
                if step >= 0:
                    cond.append(sequence_scol >= F.lit(start).cast(LongType()))
                else:
                    cond.append(sequence_scol <= F.lit(start).cast(LongType()))
            if stop is not None:
                if stop < 0:
                    stop = stop + cnt
                if step >= 0:
                    cond.append(sequence_scol < F.lit(stop).cast(LongType()))
                else:
                    cond.append(sequence_scol > F.lit(stop).cast(LongType()))
            if step != 1:
                if step > 0:
                    start = start or 0
                else:
                    start = start or (cnt - 1)
                cond.append(((sequence_scol - start) % F.lit(step).cast(LongType())) == F.lit(0))

            return reduce(lambda x, y: x & y, cond), None, None
        elif isinstance(rows_sel, int):
            sdf = self._internal.spark_frame
            return (sdf[self._sequence_col] == rows_sel), None, 0
        else:
            iLocIndexer._raiseNotImplemented(
                ".iloc requires numeric slice or conditional "
                "boolean Index, got {}".format(type(rows_sel))
            )
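
A brief sketch of what the extended slice handling above allows, assuming a Koalas version that includes this variant and a running Spark session.

import databricks.koalas as ks

kdf = ks.DataFrame({"a": list(range(10))})

kdf.iloc[:3]        # stop-only slice with step 1 still maps onto sdf.limit(3)
kdf.iloc[2:8:2]     # start/stop/step become conditions on the hidden sequence column
kdf.iloc[-3:]       # negative bounds trigger a count() so they can be turned into offsets

# kdf.iloc[::0]     would raise ValueError: slice step cannot be zero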
Example #8
    def __getitem__(self, key):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        if self._is_series:
            if isinstance(key, Series) and key._kdf is not self._kdf_or_kser._kdf:
                kdf = self._kdf_or_kser.to_frame()
                kdf["__temp_col__"] = key
                return type(self)(kdf[self._kdf_or_kser.name])[kdf["__temp_col__"]]

            cond, limit, remaining_index = self._select_rows(key)
            if cond is None and limit is None:
                return self._kdf_or_kser

            column_labels = self._internal.column_labels
            data_spark_columns = self._internal.data_spark_columns
            returns_series = True
        else:
            assert self._is_df
            if isinstance(key, tuple):
                if len(key) != 2:
                    raise SparkPandasIndexingError("Only accepts pairs of candidates")
                rows_sel, cols_sel = key
            else:
                rows_sel = key
                cols_sel = None

            if isinstance(rows_sel, Series) and rows_sel._kdf is not self._kdf_or_kser:
                kdf = self._kdf_or_kser.copy()
                kdf["__temp_col__"] = rows_sel
                return type(self)(kdf)[kdf["__temp_col__"], cols_sel][
                    list(self._kdf_or_kser.columns)
                ]

            cond, limit, remaining_index = self._select_rows(rows_sel)
            column_labels, data_spark_columns, returns_series = self._select_cols(cols_sel)

            if cond is None and limit is None and returns_series:
                return self._kdf_or_kser._kser_for(column_labels[0])

        if remaining_index is not None:
            index_scols = self._internal.index_spark_columns[-remaining_index:]
            index_map = OrderedDict(list(self._internal.index_map.items())[-remaining_index:])
        else:
            index_scols = self._internal.index_spark_columns
            index_map = self._internal.index_map

        if len(column_labels) > 0:
            column_labels = column_labels.copy()
            column_labels_level = max(
                len(label) if label is not None else 1 for label in column_labels
            )
            none_column = 0
            for i, label in enumerate(column_labels):
                if label is None:
                    label = (str(none_column),)
                    none_column += 1
                if len(label) < column_labels_level:
                    label = tuple(list(label) + ([""]) * (column_labels_level - len(label)))
                column_labels[i] = label

            if self._internal.column_label_names is None:
                column_label_names = None
            else:
                # Manage column index names
                column_label_names = self._internal.column_label_names[-column_labels_level:]
        else:
            column_label_names = None

        try:
            sdf = self._internal._sdf
            if cond is not None:
                sdf = sdf.drop(NATURAL_ORDER_COLUMN_NAME).filter(cond)
            if limit is not None:
                if limit >= 0:
                    sdf = sdf.limit(limit)
                else:
                    sdf = sdf.limit(sdf.count() + limit)

            data_columns = sdf.select(data_spark_columns).columns
            sdf = sdf.select(index_scols + data_spark_columns)
        except AnalysisException:
            raise KeyError(
                "[{}] don't exist in columns".format(
                    [col._jc.toString() for col in data_spark_columns]
                )
            )

        internal = _InternalFrame(
            spark_frame=sdf,
            index_map=index_map,
            column_labels=column_labels,
            data_spark_columns=[scol_for(sdf, col) for col in data_columns],
            column_label_names=column_label_names,
        )
        kdf = DataFrame(internal)

        if returns_series:
            kdf_or_kser = Series(
                kdf._internal.copy(spark_column=kdf._internal.data_spark_columns[0]), anchor=kdf
            )
        else:
            kdf_or_kser = kdf

        if remaining_index is not None and remaining_index == 0:
            pdf_or_pser = kdf_or_kser.head(2).to_pandas()
            length = len(pdf_or_pser)
            if length == 0:
                raise KeyError(name_like_string(key))
            elif length == 1:
                return pdf_or_pser.iloc[0]
            else:
                return kdf_or_kser
        else:
            return kdf_or_kser
Example #9
    def _select_rows(self, rows_sel):
        from databricks.koalas.indexes import MultiIndex
        from databricks.koalas.series import Series

        if isinstance(rows_sel, Series):
            assert isinstance(rows_sel.spark_type,
                              BooleanType), rows_sel.spark_type
            return rows_sel._scol, None, None
        elif isinstance(rows_sel, slice):
            assert len(self._internal.index_spark_column_names) > 0
            if rows_sel.step is not None:
                LocIndexer._raiseNotImplemented("Cannot use step with Spark.")
            if rows_sel == slice(None):
                # slice(None) selects everything, so there is nothing to do
                return None, None, None
            elif len(self._internal.index_spark_column_names) == 1:
                sdf = self._internal.spark_frame
                index = self._kdf_or_kser.index
                index_column = index.to_series()
                index_data_type = index_column.spark_type
                start = rows_sel.start
                stop = rows_sel.stop

                # look up the '__natural_order__' values for the start and stop labels
                # so that the original row order is preserved.
                start_and_stop = (sdf.select(
                    index_column._scol, NATURAL_ORDER_COLUMN_NAME
                ).where(
                    (index_column._scol == F.lit(start).cast(index_data_type))
                    | (index_column._scol == F.lit(stop).cast(index_data_type))
                ).collect())

                start = [row[1] for row in start_and_stop if row[0] == start]
                start = start[0] if len(start) > 0 else None

                stop = [row[1] for row in start_and_stop if row[0] == stop]
                stop = stop[-1] if len(stop) > 0 else None

                cond = []
                if start is not None:
                    cond.append(
                        F.col(NATURAL_ORDER_COLUMN_NAME) >= F.lit(start).cast(
                            LongType()))
                if stop is not None:
                    cond.append(
                        F.col(NATURAL_ORDER_COLUMN_NAME) <= F.lit(stop).cast(
                            LongType()))

                # if the index is neither monotonically increasing nor decreasing
                # and the specified values don't exist in the index, raise a KeyError
                if (start is None and rows_sel.start is not None) or (
                        stop is None and rows_sel.stop is not None):

                    inc = index_column.is_monotonic_increasing
                    if inc is False:
                        dec = index_column.is_monotonic_decreasing

                    if start is None and rows_sel.start is not None:
                        start = rows_sel.start
                        if inc is not False:
                            cond.append(index_column._scol >= F.lit(
                                start).cast(index_data_type))
                        elif dec is not False:
                            cond.append(index_column._scol <= F.lit(
                                start).cast(index_data_type))
                        else:
                            raise KeyError(rows_sel.start)
                    if stop is None and rows_sel.stop is not None:
                        stop = rows_sel.stop
                        if inc is not False:
                            cond.append(index_column._scol <= F.lit(stop).cast(
                                index_data_type))
                        elif dec is not False:
                            cond.append(index_column._scol >= F.lit(stop).cast(
                                index_data_type))
                        else:
                            raise KeyError(rows_sel.stop)

                return reduce(lambda x, y: x & y, cond), None, None
            else:
                index = self._kdf_or_kser.index
                index_data_type = [
                    f.dataType for f in index.to_series().spark_type
                ]

                start = rows_sel.start
                if start is not None:
                    if not isinstance(start, tuple):
                        start = (start, )
                    if len(start) == 0:
                        start = None
                stop = rows_sel.stop
                if stop is not None:
                    if not isinstance(stop, tuple):
                        stop = (stop, )
                    if len(stop) == 0:
                        stop = None

                depth = max(
                    len(start) if start is not None else 0,
                    len(stop) if stop is not None else 0)
                if depth == 0:
                    return None, None, None
                elif (depth > len(self._internal.index_map)
                      or not index.droplevel(
                          list(range(len(self._internal.index_map))
                               [depth:])).is_monotonic):
                    raise KeyError(
                        "Key length ({}) was greater than MultiIndex sort depth"
                        .format(depth))

                conds = []
                if start is not None:
                    cond = F.lit(True)
                    for scol, value, dt in list(
                            zip(self._internal.index_spark_columns, start,
                                index_data_type))[::-1]:
                        compare = MultiIndex._comparator_for_monotonic_increasing(
                            dt)
                        cond = F.when(scol.eqNullSafe(F.lit(value).cast(dt)),
                                      cond).otherwise(
                                          compare(scol,
                                                  F.lit(value).cast(dt),
                                                  spark.Column.__gt__))
                    conds.append(cond)
                if stop is not None:
                    cond = F.lit(True)
                    for scol, value, dt in list(
                            zip(self._internal.index_spark_columns, stop,
                                index_data_type))[::-1]:
                        compare = MultiIndex._comparator_for_monotonic_increasing(
                            dt)
                        cond = F.when(scol.eqNullSafe(F.lit(value).cast(dt)),
                                      cond).otherwise(
                                          compare(scol,
                                                  F.lit(value).cast(dt),
                                                  spark.Column.__lt__))
                    conds.append(cond)

                return reduce(lambda x, y: x & y, conds), None, None
        elif is_list_like(rows_sel) and not isinstance(rows_sel, tuple):
            rows_sel = list(rows_sel)
            if len(rows_sel) == 0:
                return F.lit(False), None, None
            elif len(self._internal.index_spark_column_names) == 1:
                index_column = self._kdf_or_kser.index.to_series()
                index_data_type = index_column.spark_type
                if len(rows_sel) == 1:
                    return (
                        index_column._scol == F.lit(
                            rows_sel[0]).cast(index_data_type),
                        None,
                        None,
                    )
                else:
                    return (
                        index_column._scol.isin([
                            F.lit(r).cast(index_data_type) for r in rows_sel
                        ]),
                        None,
                        None,
                    )
            else:
                LocIndexer._raiseNotImplemented(
                    "Cannot select with MultiIndex with Spark.")
        else:
            if not isinstance(rows_sel, tuple):
                rows_sel = (rows_sel, )
            if len(rows_sel) > len(self._internal.index_map):
                raise SparkPandasIndexingError("Too many indexers")

            rows = [
                scol == value for scol, value in zip(
                    self._internal.index_spark_columns, rows_sel)
            ]
            return (
                reduce(lambda x, y: x & y, rows),
                None,
                len(self._internal.index_map) - len(rows_sel),
            )
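
A rough sketch of the MultiIndex slice support this variant adds; it assumes a Koalas build containing this code and that a MultiIndex frame can be constructed the pandas way, so the constructor call below is illustrative.

import databricks.koalas as ks

kdf = ks.DataFrame(
    {"v": [1, 2, 3, 4]},
    index=[["a", "a", "b", "b"], [1, 2, 1, 2]],
)

kdf.loc[("a", 1):("b", 1)]   # tuple endpoints are compared level by level, both ends inclusive
kdf.loc["a":"a"]             # partial endpoints are wrapped into 1-tuples; the sliced levels
                             # must be monotonic, otherwise a KeyError is raised
kdf.loc[("a", 2)]            # a full key leaves no remaining index levels, so the row is
                             # materialized through head(2) and returned as a pandas object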
Example #10
    def __setitem__(self, key, value):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series, _col

        if self._is_series:
            if (isinstance(key, Series)
                    and key._kdf is not self._kdf_or_kser._kdf) or (
                        isinstance(value, Series)
                        and value._kdf is not self._kdf_or_kser._kdf):
                kdf = self._kdf_or_kser.to_frame()
                temp_natural_order = verify_temp_column_name(
                    kdf, "__temp_natural_order__")
                temp_key_col = verify_temp_column_name(kdf, "__temp_key_col__")
                temp_value_col = verify_temp_column_name(
                    kdf, "__temp_value_col__")

                kdf[temp_natural_order] = F.monotonically_increasing_id()
                if isinstance(key, Series):
                    kdf[temp_key_col] = key
                if isinstance(value, Series):
                    kdf[temp_value_col] = value
                kdf = kdf.sort_values(temp_natural_order).drop(
                    temp_natural_order)

                kser = kdf[self._kdf_or_kser.name]
                if isinstance(key, Series):
                    key = kdf[temp_key_col]
                if isinstance(value, Series):
                    value = kdf[temp_value_col]

                type(self)(kser)[key] = value

                self._kdf_or_kser._internal = kser._internal
                self._kdf_or_kser._kdf = kser._kdf
                return

            if isinstance(value, DataFrame):
                raise ValueError("Incompatible indexer with DataFrame")

            cond, limit, remaining_index = self._select_rows(key)
            if cond is None:
                cond = F.lit(True)
            if limit is not None:
                cond = cond & (self._internal.spark_frame[self._sequence_col] <
                               F.lit(limit))

            if isinstance(value, Series):
                if remaining_index is not None and remaining_index == 0:
                    raise ValueError(
                        "No axis named {} for object type {}".format(
                            key, type(value)))
                value = value._scol
            else:
                value = F.lit(value)
            scol = (F.when(cond,
                           value).otherwise(self._internal.spark_column).alias(
                               name_like_string(self._kdf_or_kser.name
                                                or "0")))
            internal = self._internal.copy(spark_column=scol)
            self._kdf_or_kser._internal = internal
        else:
            assert self._is_df

            if isinstance(key, tuple):
                if len(key) != 2:
                    raise SparkPandasIndexingError(
                        "Only accepts pairs of candidates")
                rows_sel, cols_sel = key
            else:
                rows_sel = key
                cols_sel = None

            if isinstance(value, DataFrame):
                if len(value.columns) == 1:
                    value = _col(value)
                else:
                    raise ValueError(
                        "Only a dataframe with one column can be assigned")

            if (isinstance(rows_sel, Series)
                    and rows_sel._kdf is not self._kdf_or_kser) or (
                        isinstance(value, Series)
                        and value._kdf is not self._kdf_or_kser):
                kdf = self._kdf_or_kser.copy()
                temp_natural_order = verify_temp_column_name(
                    kdf, "__temp_natural_order__")
                temp_key_col = verify_temp_column_name(kdf, "__temp_key_col__")
                temp_value_col = verify_temp_column_name(
                    kdf, "__temp_value_col__")

                kdf[temp_natural_order] = F.monotonically_increasing_id()
                if isinstance(rows_sel, Series):
                    kdf[temp_key_col] = rows_sel
                if isinstance(value, Series):
                    kdf[temp_value_col] = value
                kdf = kdf.sort_values(temp_natural_order)

                if isinstance(rows_sel, Series):
                    rows_sel = kdf[temp_key_col]
                if isinstance(value, Series):
                    value = kdf[temp_value_col]

                type(self)(kdf)[rows_sel, cols_sel] = value

                self._kdf_or_kser._internal = kdf[list(
                    self._kdf_or_kser.columns)]._internal
                return

            cond, limit, remaining_index = self._select_rows(rows_sel)
            missing_keys = []
            _, data_spark_columns, _ = self._select_cols(
                cols_sel, missing_keys=missing_keys)

            if cond is None:
                cond = F.lit(True)
            if limit is not None:
                cond = cond & (self._internal.spark_frame[self._sequence_col] <
                               F.lit(limit))

            if isinstance(value, Series):
                if remaining_index is not None and remaining_index == 0:
                    raise ValueError("Incompatible indexer with Series")
                if len(data_spark_columns) > 1:
                    raise ValueError("shape mismatch")
                value = value._scol
            else:
                value = F.lit(value)

            new_data_spark_columns = []
            for new_scol, spark_column_name in zip(
                    self._internal.data_spark_columns,
                    self._internal.data_spark_column_names):
                for scol in data_spark_columns:
                    if new_scol._jc.equals(scol._jc):
                        new_scol = F.when(
                            cond,
                            value).otherwise(scol).alias(spark_column_name)
                        break
                new_data_spark_columns.append(new_scol)

            column_labels = self._internal.column_labels.copy()
            for label in missing_keys:
                if isinstance(label, str):
                    label = (label, )
                if len(label) < self._internal.column_labels_level:
                    label = tuple(
                        list(label) +
                        ([""] *
                         (self._internal.column_labels_level - len(label))))
                elif len(label) > self._internal.column_labels_level:
                    raise KeyError(
                        "Key length ({}) exceeds index depth ({})".format(
                            len(label), self._internal.column_labels_level))
                column_labels.append(label)
                new_data_spark_columns.append(
                    F.when(cond, value).alias(name_like_string(label)))

            internal = self._internal.with_new_columns(new_data_spark_columns,
                                                       column_labels)
            self._kdf_or_kser._internal = internal
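
A compact sketch of the assignment paths this __setitem__ covers, assuming a standard Koalas environment; the mask is built from the target frame itself, so the temporary-column detour for foreign Series is not exercised.

import databricks.koalas as ks

kdf = ks.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})

kdf.loc[kdf["a"] > 1, "b"] = 0     # matching rows of 'b' are replaced via F.when(cond, value)
kdf.loc[kdf["a"] > 2, "c"] = 99    # an unknown label goes through missing_keys and is
                                   # appended as a new column, null where cond is False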
Example #11
    def _select_rows(self, rows_sel):
        from databricks.koalas.indexes import Index

        if isinstance(rows_sel, tuple) and len(rows_sel) > 1:
            raise SparkPandasIndexingError("Too many indexers")
        elif isinstance(rows_sel, Index):
            assert isinstance(rows_sel.spark_type,
                              BooleanType), rows_sel.spark_type
            return rows_sel._scol, None, None
        elif isinstance(rows_sel, slice):

            def verify_type(i):
                if not isinstance(i, int):
                    raise TypeError(
                        "cannot do slice indexing with these indexers [{}] of {}"
                        .format(i, type(i)))

            has_negative = False
            start = rows_sel.start
            if start is not None:
                verify_type(start)
                if start == 0:
                    start = None
                elif start < 0:
                    has_negative = True
            stop = rows_sel.stop
            if stop is not None:
                verify_type(stop)
                if stop == 0:
                    stop = None
                elif stop < 0:
                    has_negative = True

            step = rows_sel.step
            if step is not None:
                verify_type(step)
                if step == 0:
                    raise ValueError("slice step cannot be zero")
            else:
                step = 1

            if start is None and step == 1:
                return None, stop, None

            sdf = self._internal.spark_frame
            sequence_scol = sdf[self._sequence_col]

            if has_negative or (step < 0 and start is None):
                cnt = sdf.count()

            cond = []
            if start is not None:
                if start < 0:
                    start = start + cnt
                if step >= 0:
                    cond.append(sequence_scol >= F.lit(start).cast(LongType()))
                else:
                    cond.append(sequence_scol <= F.lit(start).cast(LongType()))
            if stop is not None:
                if stop < 0:
                    stop = stop + cnt
                if step >= 0:
                    cond.append(sequence_scol < F.lit(stop).cast(LongType()))
                else:
                    cond.append(sequence_scol > F.lit(stop).cast(LongType()))
            if step != 1:
                if step > 0:
                    start = start or 0
                else:
                    start = start or (cnt - 1)
                cond.append(((sequence_scol - start) %
                             F.lit(step).cast(LongType())) == F.lit(0))

            return reduce(lambda x, y: x & y, cond), None, None
        elif isinstance(rows_sel, int):
            sdf = self._internal.spark_frame
            return (sdf[self._sequence_col] == rows_sel), None, 0
        elif isinstance(rows_sel, Iterable):
            sdf = self._internal.spark_frame

            if any(
                    isinstance(key, (int, np.int, np.int64,
                                     np.int32)) and key < 0
                    for key in rows_sel):
                offset = sdf.count()
            else:
                offset = 0

            new_rows_sel = []
            for key in list(rows_sel):
                if not isinstance(key, (int, np.int, np.int64, np.int32)):
                    raise TypeError(
                        "cannot do positional indexing with these indexers [{}] of {}"
                        .format(key, type(key)))
                if key < 0:
                    key = key + offset
                new_rows_sel.append(key)

            if len(new_rows_sel) != len(set(new_rows_sel)):
                raise NotImplementedError(
                    "Duplicated row selection is not currently supported; "
                    "however, normalised index was [%s]" % new_rows_sel)

            sequence_scol = sdf[self._sequence_col]
            cond = []
            for key in new_rows_sel:
                cond.append(sequence_scol == F.lit(int(key)).cast(LongType()))

            if len(cond) == 0:
                cond = [F.lit(False)]
            return reduce(lambda x, y: x | y, cond), None, None
        else:
            iLocIndexer._raiseNotImplemented(
                ".iloc requires numeric slice, conditional "
                "boolean Index or a sequence of positions as int, "
                "got {}".format(type(rows_sel)))