Example #1
    def __repr__(self):
        max_display_count = get_option("display.max_rows")
        sdf = self._kdf._sdf.select(self._scol)

        if max_display_count is None:
            return repr(
                DataFrame(
                    self._kdf._internal.copy(
                        sdf=sdf,
                        index_map=[(sdf.schema[0].name,
                                    self._kdf._internal.index_names[0])],
                        data_columns=[],
                        column_index=[],
                        column_index_names=None)).index.to_pandas())

        sdf = sdf.limit(max_display_count + 1)
        internal = self._kdf._internal.copy(
            sdf=sdf,
            index_map=[(sdf.schema[0].name, self._kdf._internal.index_names[0])
                       ],
            data_columns=[],
            column_index=[],
            column_index_names=None)
        pindex = DataFrame(internal).index.to_pandas()

        pindex_length = len(pindex)
        repr_string = repr(pindex[:max_display_count])

        if pindex_length > max_display_count:
            footer = '\nShowing only the first {}'.format(max_display_count)
            return repr_string + footer
        return repr_string
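A minimal usage sketch of the behaviour this __repr__ implements, assuming databricks.koalas is importable as ks and that the option API (ks.set_option / ks.reset_option) is available in the version at hand:

import databricks.koalas as ks

# Make the index longer than the display limit so the footer is triggered.
ks.set_option("display.max_rows", 10)
idx = ks.range(1000).index
# Only max_rows + 1 rows are fetched; the repr should end with a
# "Showing only the first 10" footer instead of collecting all 1000 values.
print(repr(idx))
ks.reset_option("display.max_rows")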
Example #2
    def value_counts(self,
                     normalize=False,
                     sort=True,
                     ascending=False,
                     bins=None,
                     dropna=True):
        if bins is not None:
            raise NotImplementedError(
                "value_counts currently does not support bins")

        if dropna:
            sdf_dropna = self._kdf._sdf.filter(self.notna()._scol)
        else:
            sdf_dropna = self._kdf._sdf
        sdf = sdf_dropna.groupby(self._scol).count()
        if sort:
            if ascending:
                sdf = sdf.orderBy(F.col('count'))
            else:
                sdf = sdf.orderBy(F.col('count').desc())

        if normalize:
            sum = sdf_dropna.count()
            sdf = sdf.withColumn('count', F.col('count') / F.lit(sum))

        index_name = 'index' if self.name != 'index' else 'level_0'
        kdf = DataFrame(sdf)
        kdf.columns = [index_name, self.name]
        kdf._metadata = Metadata(column_fields=[self.name],
                                 index_info=[(index_name, None)])
        return _col(kdf)
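A short sketch of the public Series.value_counts that this method implements (assuming databricks.koalas as ks; exact row order can differ since the result comes from Spark):

import databricks.koalas as ks

s = ks.Series([1, 2, 2, 3, 3, 3])
# Counts per distinct value, most frequent first (sort=True, ascending=False).
print(s.value_counts())
# Relative frequencies instead of raw counts.
print(s.value_counts(normalize=True))
# Keep nulls as their own bucket instead of filtering them out.
print(ks.Series([1.0, None, None]).value_counts(dropna=False))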
Example #3
    def aggregate(self, func_or_funcs, *args, **kwargs):
        """Compute aggregates and returns the result as a :class:`DataFrame`.

        The available aggregate functions can be built-in aggregation functions, such as `avg`,
        `max`, `min`, `sum`, `count`.

        :param func_or_funcs: a dict mapping from column name (string) to aggregate functions
                              (string).
        """
        if not isinstance(func_or_funcs, dict) or \
            not all(isinstance(key, string_types) and isinstance(value, string_types)
                    for key, value in func_or_funcs.items()):
            raise ValueError(
                "aggs must be a dict mapping from column name (string) to aggregate "
                "functions (string).")
        sdf = self._groupdata.agg(func_or_funcs)

        reorder = [
            '%s(%s)' % (value, key)
            for key, value in iter(func_or_funcs.items())
        ]
        kdf = DataFrame(sdf.select(reorder))
        kdf.columns = [key for key in iter(func_or_funcs.keys())]

        return kdf
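A hedged sketch of the dict form accepted here, going through the public groupby API (column names and aggregate functions are illustrative):

import databricks.koalas as ks

kdf = ks.DataFrame({'A': [1, 1, 2, 2],
                    'B': [1, 2, 3, 4],
                    'C': [0.5, 0.5, 1.0, 1.0]})
# One aggregate function name (a string) per column, keyed by column name.
agg = kdf.groupby('A').agg({'B': 'min', 'C': 'sum'})
print(agg)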
Example #4
    def rank(self, method='average', ascending=True):
        kdf = super(SeriesGroupBy, self).rank(method, ascending).to_dataframe()
        return _col(
            DataFrame(
                kdf._internal.copy(sdf=kdf._sdf.select(
                    kdf._internal.data_scols),
                                   index_map=[])))  # index is lost.
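A minimal sketch of the groupby rank this wraps (column names are illustrative; as the comment notes, the original index is lost, so row order is not guaranteed):

import databricks.koalas as ks

kdf = ks.DataFrame({'key': ['a', 'a', 'b', 'b'],
                    'val': [1, 2, 2, 1]})
# Rank within each group; 'method' and 'ascending' mirror the pandas options.
ranked = kdf.groupby('key')['val'].rank(method='average', ascending=True)
print(ranked)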
Example #5
    def __setitem__(self, key, value):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import _col

        super(iLocIndexer, self).__setitem__(key, value)

        if self._is_series:
            internal = self._kdf_or_kser._internal
            sdf = internal.spark_frame.select(internal.index_spark_columns +
                                              [internal.spark_column])
            internal = internal.copy(
                spark_frame=sdf,
                column_labels=[internal.column_labels[0] or ("0", )],
                data_spark_columns=[
                    scol_for(sdf, internal.data_spark_column_names[0])
                ],
                spark_column=None,
            )
            kser = _col(DataFrame(internal))

            self._kdf_or_kser._internal = kser._internal
            self._kdf_or_kser._kdf = kser._kdf
        else:
            assert self._is_df

        # Clean up implicitly cached properties to be able to reuse the indexer.
        del self._internal
        del self._sequence_col
Example #6
    def __setitem__(self, key, value):
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import _col

        super(iLocIndexer, self).__setitem__(key, value)

        if self._is_series:
            internal = self._kdf_or_kser._internal
            sdf = internal.spark_frame.select(internal.index_spark_columns +
                                              [internal.spark_column])
            internal = internal.copy(
                spark_frame=sdf,
                column_labels=[internal.column_labels[0] or ("0", )],
                data_spark_columns=[
                    scol_for(sdf, internal.data_spark_column_names[0])
                ],
                spark_column=None,
            )
            kser = _col(DataFrame(internal))

            self._kdf_or_kser._internal = kser._internal
            self._kdf_or_kser._kdf = kser._kdf
        else:
            assert self._is_df

            # TODO: support DataFrame.

        delattr(self, "_lazy__internal")
        delattr(self, "_lazy__sequence_col")
Example #7
File: sql.py  Project: zhouzach/koalas
    def execute(self) -> DataFrame:
        """
        Returns a DataFrame for which the SQL statement has been executed by
        the underlying SQL engine.

        >>> str0 = 'abc'
        >>> ks.sql("select {str0}")
           abc
        0  abc

        >>> str1 = 'abc"abc'
        >>> str2 = "abc'abc"
        >>> ks.sql("select {str0}, {str1}, {str2}")
           abc  abc"abc  abc'abc
        0  abc  abc"abc  abc'abc

        >>> strs = ['a', 'b']
        >>> ks.sql("select 'a' in {strs} as cond1, 'c' in {strs} as cond2")
           cond1  cond2
        0   True  False
        """
        blocks = _string.formatter_parser(self._statement)
        # TODO: use a string builder
        res = ""
        try:
            for (pre, inner, _, _) in blocks:
                var_next = "" if inner is None else self._convert(inner)
                res = res + pre + var_next
            self._normalized_statement = res

            sdf = self._session.sql(self._normalized_statement)
        finally:
            for v in self._temp_views:
                self._session.catalog.dropTempView(v)
        return DataFrame(sdf)
Example #8
    def from_arrays(arrays, sortorder=None, names=None):
        """
        Convert arrays to MultiIndex.

        Parameters
        ----------
        arrays : list / sequence of array-likes
            Each array-like gives one level’s value for each data point. len(arrays)
            is the number of levels.
        sortorder : int or None
            Level of sortedness (must be lexicographically sorted by that level).
        names : list / sequence of str, optional
            Names for the levels in the index.

        Returns
        -------
        index: MultiIndex

        Examples
        --------

        >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']]
        >>> ks.MultiIndex.from_arrays(arrays, names=('number', 'color'))  # doctest: +SKIP
        MultiIndex([(1,  'red'),
                    (1, 'blue'),
                    (2,  'red'),
                    (2, 'blue')],
                   names=['number', 'color'])
        """
        return DataFrame(index=pd.MultiIndex.from_arrays(
            arrays=arrays, sortorder=sortorder, names=names
        )).index
Example #9
    def _reduce_for_stat_function(self, sfun, only_numeric):
        groupkeys = self._groupkeys
        groupkey_cols = [
            s._scol.alias('__index_level_{}__'.format(i))
            for i, s in enumerate(groupkeys)
        ]
        sdf = self._kdf._sdf

        data_columns = []
        if len(self._agg_columns) > 0:
            stat_exprs = []
            for ks in self._agg_columns:
                spark_type = ks.spark_type
                # TODO: we should have a function that takes dataframes and converts the numeric
                # types. Converting the NaNs is used in a few places, it should be in utils.
                # Special handle floating point types because Spark's count treats nan as a valid
                # value, whereas Pandas count doesn't include nan.
                if isinstance(spark_type, DoubleType) or isinstance(
                        spark_type, FloatType):
                    stat_exprs.append(
                        sfun(F.nanvl(ks._scol, F.lit(None))).alias(ks.name))
                    data_columns.append(ks.name)
                elif isinstance(spark_type, NumericType) or not only_numeric:
                    stat_exprs.append(sfun(ks._scol).alias(ks.name))
                    data_columns.append(ks.name)
            sdf = sdf.groupby(*groupkey_cols).agg(*stat_exprs)
        else:
            sdf = sdf.select(*groupkey_cols).distinct()
        sdf = sdf.sort(*groupkey_cols)
        internal = _InternalFrame(sdf=sdf,
                                  data_columns=data_columns,
                                  index_map=[('__index_level_{}__'.format(i),
                                              s.name)
                                             for i, s in enumerate(groupkeys)])
        return DataFrame(internal)
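This helper backs the groupby reductions; a small sketch of public calls that route through it, assuming databricks.koalas as ks:

import databricks.koalas as ks

kdf = ks.DataFrame({'A': [1, 1, 2],
                    'B': [3.0, float('nan'), 5.0],
                    'C': ['x', 'y', 'z']})
# Numeric-only reduction: the NaN in 'B' is nulled out first so the Spark
# aggregate skips it the way pandas does.
print(kdf.groupby('A').mean())
# count() also covers non-numeric columns (the only_numeric=False path).
print(kdf.groupby('A').count())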
Example #10
def from_pandas(
        pobj: Union['pd.DataFrame',
                    'pd.Series']) -> Union['Series', 'DataFrame']:
    """Create a Koalas DataFrame or Series from a pandas DataFrame or Series.

    This is similar to Spark's `SparkSession.createDataFrame()` with pandas DataFrame,
    but this also works with pandas Series and picks the index.

    Parameters
    ----------
    pobj : pandas.DataFrame or pandas.Series
        pandas DataFrame or Series to read.

    Returns
    -------
    Series or DataFrame
        If a pandas Series is passed in, this function returns a Koalas Series.
        If a pandas DataFrame is passed in, this function returns a Koalas DataFrame.
    """
    if isinstance(pobj, pd.Series):
        return Series(pobj)
    elif isinstance(pobj, pd.DataFrame):
        return DataFrame(pobj)
    else:
        raise ValueError("Unknown data type: {}".format(type(pobj)))
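A minimal usage sketch for both accepted input types (assuming databricks.koalas as ks):

import pandas as pd
import databricks.koalas as ks

pser = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
pdf = pd.DataFrame({'x': [1, 2], 'y': [3, 4]})

kser = ks.from_pandas(pser)   # Koalas Series; the pandas index is kept
kdf = ks.from_pandas(pdf)     # Koalas DataFrame
# Anything else (e.g. a plain list) raises ValueError("Unknown data type: ...").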
Example #11
def read_table(name: str) -> DataFrame:
    """
    Read a Spark table and return a DataFrame.

    Parameters
    ----------
    name : string
        Table name in Spark.

    Returns
    -------
    DataFrame

    See Also
    --------
    DataFrame.to_table
    read_delta
    read_parquet
    read_spark_io

    Examples
    --------
    >>> ks.range(1).to_table('%s.my_table' % db)
    >>> ks.read_table('%s.my_table' % db)
       id
    0   0
    """
    sdf = default_session().read.table(name)
    return DataFrame(sdf)
Example #12
    def _cum(self, func):
        # This is used for cummin, cummax, cumsum, etc.
        if func == F.min:
            func = "cummin"
        elif func == F.max:
            func = "cummax"
        elif func == F.sum:
            func = "cumsum"
        elif func.__name__ == "cumprod":
            func = "cumprod"

        if len(self._kdf._internal.index_columns) == 0:
            raise ValueError("Index must be set.")

        applied = []
        kdf = self._kdf
        groupkey_columns = [s.name for s in self._groupkeys]
        for column in kdf._internal.data_columns:
            # pandas groupby.cumxxx ignores the grouping key itself.
            if column not in groupkey_columns:
                applied.append(
                    getattr(kdf[column].groupby(self._groupkeys), func)())

        sdf = kdf._sdf.select(kdf._internal.index_scols +
                              [c._scol for c in applied])
        internal = kdf._internal.copy(sdf=sdf,
                                      data_columns=[c.name for c in applied])
        return DataFrame(internal)
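A short sketch of the cumulative groupby operations built on this helper (assuming databricks.koalas as ks):

import databricks.koalas as ks

kdf = ks.DataFrame({'key': ['a', 'a', 'b', 'b'],
                    'val': [1, 2, 3, 4]})
# The grouping column itself is excluded from the result, matching pandas.
print(kdf.groupby('key').cumsum().sort_index())
print(kdf.groupby('key').cummax().sort_index())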
Example #13
    def fillna(self, value):
        """
        Fill NA/NaN values with the specified value.

        Parameters
        ----------
        value : scalar
            Scalar value to use to fill holes (e.g. 0). This value cannot be a list-like.

        Returns
        -------
        Index :
            filled with value

        Examples
        --------
        >>> ki = ks.DataFrame({'a': ['a', 'b', 'c']}, index=[1, 2, None]).index
        >>> ki
        Float64Index([1.0, 2.0, nan], dtype='float64')

        >>> ki.fillna(0)
        Float64Index([1.0, 2.0, 0.0], dtype='float64')
        """
        if not isinstance(value, (float, int, str, bool)):
            raise TypeError("Unsupported type %s" % type(value))
        sdf = self._internal.sdf.fillna(value)
        result = DataFrame(self._kdf._internal.copy(sdf=sdf)).index
        return result
Example #14
    def drop_duplicates(self):
        """
        Return Index with duplicate values removed.

        Returns
        -------
        deduplicated : Index

        See Also
        --------
        Series.drop_duplicates : Equivalent method on Series.
        DataFrame.drop_duplicates : Equivalent method on DataFrame.

        Examples
        --------
        Generate a pandas.Index with duplicate values.

        >>> idx = ks.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'])

        >>> idx.drop_duplicates() # doctest: +SKIP
        Index(['lama', 'cow', 'beetle', 'hippo'], dtype='object')
        """
        sdf = self._internal.sdf.select(
            self._internal.index_scols).drop_duplicates()
        internal = _InternalFrame(sdf=sdf,
                                  index_map=self._kdf._internal.index_map)
        result = DataFrame(internal).index
        return result
Example #15
    def unique(self):
        """
        Return unique values of Series object.

        Uniques are returned in order of appearance. Hash table-based unique,
        therefore does NOT sort.

        .. note:: This method returns a newly created Series, whereas pandas returns
                  the unique values as a NumPy array.

        Returns
        -------
        Returns the unique values as a Series.

        See Examples section.

        Examples
        --------
        >>> ks.Series([2, 1, 3, 3], name='A').unique()
        0    1
        1    3
        2    2
        Name: A, dtype: int64

        >>> ks.Series([pd.Timestamp('2016-01-01') for _ in range(3)]).unique()
        0   2016-01-01
        Name: 0, dtype: datetime64[ns]
        """
        sdf = self.to_dataframe()._sdf
        return _col(DataFrame(sdf.select(self._scol).distinct()))
Example #16
    def drop(self, labels):
        """
        Make new Index with passed list of labels deleted.

        Parameters
        ----------
        labels : array-like

        Returns
        -------
        dropped : Index

        Examples
        --------
        >>> index = ks.Index([1, 2, 3])
        >>> index
        Int64Index([1, 2, 3], dtype='int64')

        >>> index.drop([1])
        Int64Index([2, 3], dtype='int64')
        """
        if not isinstance(labels, (tuple, list)):
            labels = [labels]
        sdf = self._internal.sdf[~self._internal.index_scols[0].isin(labels)]
        return Index(
            DataFrame(
                _InternalFrame(sdf=sdf,
                               index_map=self._kdf._internal.index_map)))
Example #17
def read_parquet(path, columns=None):
    """Load a parquet object from the file path, returning a DataFrame.

    Parameters
    ----------
    path : string
        File path
    columns : list, default=None
        If not None, only these columns will be read from the file.

    Returns
    -------
    DataFrame

    Examples
    --------
    >>> ks.read_parquet('data.parquet', columns=['name', 'gender'])  # doctest: +SKIP
    """
    if columns is not None:
        columns = list(columns)
    if columns is None or len(columns) > 0:
        sdf = default_session().read.parquet(path)
        if columns is not None:
            fields = [field.name for field in sdf.schema]
            cols = [col for col in columns if col in fields]
            if len(cols) > 0:
                sdf = sdf.select(cols)
            else:
                sdf = default_session().createDataFrame([],
                                                        schema=StructType())
    else:
        sdf = default_session().createDataFrame([], schema=StructType())
    return DataFrame(sdf)
Example #18
    def to_pandas(self) -> pd.Index:
        """
        Return a pandas Index.

        .. note:: This method should only be used if the resulting Pandas object is expected
                  to be small, as all the data is loaded into the driver's memory. If the input
                  is large, set the max_rows parameter.

        Examples
        --------
        >>> df = ks.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
        ...                   columns=['dogs', 'cats'],
        ...                   index=list('abcd'))
        >>> df['dogs'].index.to_pandas()
        Index(['a', 'b', 'c', 'd'], dtype='object')
        """
        sdf = self._kdf._sdf.select(self._scol)
        internal = self._kdf._internal.copy(
            sdf=sdf,
            index_map=[(sdf.schema[0].name, self._kdf._internal.index_names[0])
                       ],
            data_columns=[],
            column_index=[],
            column_index_names=None)
        return DataFrame(internal)._to_internal_pandas().index
Example #19
    def from_tuples(tuples, sortorder=None, names=None):
        """
        Convert list of tuples to MultiIndex.

        Parameters
        ----------
        tuples : list / sequence of tuple-likes
            Each tuple is the index of one row/column.
        sortorder : int or None
            Level of sortedness (must be lexicographically sorted by that level).
        names : list / sequence of str, optional
            Names for the levels in the index.

        Returns
        -------
        index : MultiIndex

        Examples
        --------

        >>> tuples = [(1, 'red'), (1, 'blue'),
        ...           (2, 'red'), (2, 'blue')]
        >>> ks.MultiIndex.from_tuples(tuples, names=('number', 'color'))  # doctest: +SKIP
        MultiIndex([(1,  'red'),
                    (1, 'blue'),
                    (2,  'red'),
                    (2, 'blue')],
                   names=['number', 'color'])
        """
        return DataFrame(index=pd.MultiIndex.from_tuples(
            tuples=tuples, sortorder=sortorder, names=names)).index
Example #20
    def intersection(self, other) -> "MultiIndex":
        """
        Form the intersection of two Index objects.

        This returns a new Index with elements common to the index and `other`.

        Parameters
        ----------
        other : Index or array-like

        Returns
        -------
        intersection : MultiIndex

        Examples
        --------
        >>> midx1 = ks.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
        >>> midx2 = ks.MultiIndex.from_tuples([("c", "z"), ("d", "w")])
        >>> midx1.intersection(midx2).sort_values()  # doctest: +SKIP
        MultiIndex([('c', 'z')],
                   )
        """
        if isinstance(other, Series) or not is_list_like(other):
            raise TypeError("other must be a MultiIndex or a list of tuples")
        elif isinstance(other, DataFrame):
            raise ValueError("Index data must be 1-dimensional")
        elif isinstance(other, MultiIndex):
            spark_frame_other = other.to_frame().to_spark()
            keep_name = self.names == other.names
        elif isinstance(other, Index):
            # Always returns an empty MultiIndex if `other` is Index.
            return self.to_frame().head(0).index  # type: ignore
        elif not all(isinstance(item, tuple) for item in other):
            raise TypeError("other must be a MultiIndex or a list of tuples")
        else:
            other = MultiIndex.from_tuples(list(other))
            spark_frame_other = other.to_frame().to_spark()
            keep_name = True

        default_name = [
            SPARK_INDEX_NAME_FORMAT(i) for i in range(self.nlevels)
        ]
        spark_frame_self = self.to_frame(name=default_name).to_spark()
        spark_frame_intersected = spark_frame_self.intersect(spark_frame_other)
        if keep_name:
            index_names = self._internal.index_names
        else:
            index_names = None
        internal = InternalFrame(  # TODO: dtypes?
            spark_frame=spark_frame_intersected,
            index_spark_columns=[
                scol_for(spark_frame_intersected, col) for col in default_name
            ],
            index_names=index_names,
        )
        return cast(MultiIndex, DataFrame(internal).index)
Example #21
    def size(self):
        """
        Compute group sizes.

        See Also
        --------
        databricks.koalas.Series.groupby
        databricks.koalas.DataFrame.groupby

        Examples
        --------
        >>> df = ks.DataFrame({'A': [1, 2, 2, 3, 3, 3],
        ...                    'B': [1, 1, 2, 3, 3, 3]},
        ...                   columns=['A', 'B'])
        >>> df
           A  B
        0  1  1
        1  2  1
        2  2  2
        3  3  3
        4  3  3
        5  3  3

        >>> df.groupby('A').size().sort_index()  # doctest: +NORMALIZE_WHITESPACE
        A
        1  1
        2  2
        3  3
        Name: count, dtype: int64

        >>> df.groupby(['A', 'B']).size().sort_index()  # doctest: +NORMALIZE_WHITESPACE
        A  B
        1  1    1
        2  1    1
           2    1
        3  3    3
        Name: count, dtype: int64
        """
        groupkeys = self._groupkeys
        groupkey_cols = [
            s._scol.alias('__index_level_{}__'.format(i))
            for i, s in enumerate(groupkeys)
        ]
        sdf = self._kdf._sdf
        sdf = sdf.groupby(*groupkey_cols).count()
        if (len(self._agg_columns) > 0) and (self._have_agg_columns):
            name = self._agg_columns[0].name
            sdf = sdf.withColumnRenamed('count', name)
        else:
            name = 'count'
        internal = _InternalFrame(sdf=sdf,
                                  data_columns=[name],
                                  index_map=[('__index_level_{}__'.format(i),
                                              s.name)
                                             for i, s in enumerate(groupkeys)])
        return _col(DataFrame(internal))
Example #22
File: indexes.py  Project: zatang007/koalas
    def __init__(self,
                 kdf: DataFrame,
                 scol: Optional[spark.Column] = None) -> None:
        assert len(kdf._metadata._index_map) == 1
        if scol is None:
            self._kdf = kdf
            self._scol = self._columns[0]
        else:
            self._kdf = kdf.copy()
            self._scol = scol
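User code does not call this constructor directly; an Index is obtained from the DataFrame it stays anchored to. A minimal sketch:

import databricks.koalas as ks

kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[10, 20, 30])
idx = kdf.index   # Index backed by kdf's single index column
print(idx)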
Example #23
    def _init_from_pandas(self, s, *args):
        """
        Creates Koalas Series from Pandas Series.

        :param s: Pandas Series
        """

        kdf = DataFrame(pd.DataFrame(s))
        self._init_from_spark(kdf._sdf[kdf._metadata.column_fields[0]],
                              kdf, kdf._metadata.index_info)
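This is the path taken when a Koalas Series is built from a pandas Series; a minimal sketch:

import pandas as pd
import databricks.koalas as ks

pser = pd.Series([0.1, 0.2, 0.3], name='ratio')
kser = ks.Series(pser)   # wrapped in a one-column Koalas DataFrame internally
print(kser)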
Example #24
def range(start: int,
          end: Optional[int] = None,
          step: int = 1,
          num_partitions: Optional[int] = None) -> DataFrame:
    """
    Create a DataFrame with some range of numbers.

    The resulting DataFrame has a single int64 column named `id`, containing elements in a range
    from ``start`` to ``end`` (exclusive) with step value ``step``. If only the first parameter
    (i.e. start) is specified, we treat it as the end value with the start value being 0.

    This is similar to the range function in SparkSession and is used primarily for testing.

    Parameters
    ----------
    start : int
        the start value (inclusive)
    end : int, optional
        the end value (exclusive)
    step : int, optional, default 1
        the incremental step
    num_partitions : int, optional
        the number of partitions of the DataFrame

    Returns
    -------
    DataFrame

    Examples
    --------
    When the first parameter is specified, we generate a range of values up to that number.

    >>> ks.range(5)
       id
    0   0
    1   1
    2   2
    3   3
    4   4

    When start, end, and step are specified:

    >>> ks.range(start = 100, end = 200, step = 20)
        id
    0  100
    1  120
    2  140
    3  160
    4  180
    """
    sdf = default_session().range(start=start,
                                  end=end,
                                  step=step,
                                  numPartitions=num_partitions)
    return DataFrame(sdf)
Example #25
    def dropna(self):
        """
        Return Index or MultiIndex without NA/NaN values

        Examples
        --------

        >>> df = ks.DataFrame([[1, 2], [4, 5], [7, 8]],
        ...                   index=['cobra', 'viper', None],
        ...                   columns=['max_speed', 'shield'])
        >>> df
               max_speed  shield
        cobra          1       2
        viper          4       5
        NaN            7       8

        >>> df.index.dropna()
        Index(['cobra', 'viper'], dtype='object')

        Also support for MultiIndex

        >>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
        ...                       [None, 'weight', 'length']],
        ...                      [[0, 1, 1, 1, 1, 1, 2, 2, 2],
        ...                       [0, 1, 1, 0, 1, 2, 1, 1, 2]])
        >>> s = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, None],
        ...               index=midx)
        >>> s
        lama    NaN        45.0
        cow     weight    200.0
                weight      1.2
                NaN        30.0
                weight    250.0
                length      1.5
        falcon  weight    320.0
                weight      1.0
                length      NaN
        Name: 0, dtype: float64

        >>> s.index.dropna()  # doctest: +SKIP
        MultiIndex([(   'cow', 'weight'),
                    (   'cow', 'weight'),
                    (   'cow', 'weight'),
                    (   'cow', 'length'),
                    ('falcon', 'weight'),
                    ('falcon', 'weight'),
                    ('falcon', 'length')],
                   )
        """
        kdf = self._kdf.copy()
        sdf = kdf._internal.sdf.select(self._internal.index_scols).dropna()
        internal = _InternalFrame(sdf=sdf, index_map=self._internal.index_map)
        kdf = DataFrame(internal)
        return Index(kdf) if type(self) == Index else MultiIndex(kdf)
Example #26
    def rename(self, name, inplace=False):
        """
        Alter Index name.
        Able to set new names without level. Defaults to returning new index.

        Parameters
        ----------
        name : label or list of labels
            Name(s) to set.
        inplace : boolean, default False
            Modifies the object directly, instead of creating a new Index.

        Returns
        -------
        Index
            The same type as the caller or None if inplace is True.

        Examples
        --------
        >>> df = ks.DataFrame({'a': ['A', 'C'], 'b': ['A', 'B']}, columns=['a', 'b'])
        >>> df.index.rename("c")
        Int64Index([0, 1], dtype='int64', name='c')

        >>> df.set_index("a", inplace=True)
        >>> df.index.rename("d")
        Index(['A', 'C'], dtype='object', name='d')

        You can also change the index name in place.

        >>> df.index.rename("e", inplace=True)
        Index(['A', 'C'], dtype='object', name='e')

        >>> df  # doctest: +NORMALIZE_WHITESPACE
           b
        e
        A  A
        C  B
        """
        index_columns = self._kdf._internal.index_columns
        assert len(index_columns) == 1

        sdf = self._kdf._sdf.select([self._scol] +
                                    self._kdf._internal.data_scols)
        internal = self._kdf._internal.copy(sdf=sdf,
                                            index_map=[(sdf.schema[0].name,
                                                        name)])

        if inplace:
            self._kdf._internal = internal
            return self
        else:
            return DataFrame(internal).index
Example #27
    def analyzed(self) -> "ks.Series":
        """
        Returns a new Series with the analyzed Spark DataFrame.

        After multiple operations, the underlying Spark plan could grow huge
        and make the Spark planner take a long time to finish the planning.

        This function is for the workaround to avoid it.

        .. note:: After analyzed, operations between the analyzed Series and the original one
            will **NOT** work without setting a config `compute.ops_on_diff_frames` to `True`.

        Returns
        -------
        Series

        Examples
        --------
        >>> ser = ks.Series([1, 2, 3])
        >>> ser
        0    1
        1    2
        2    3
        dtype: int64

        The analyzed one should return the same value.

        >>> ser.spark.analyzed
        0    1
        1    2
        2    3
        dtype: int64

        However, it won't work with the same anchor Series.

        >>> ser + ser.spark.analyzed
        Traceback (most recent call last):
        ...
        ValueError: ... enable 'compute.ops_on_diff_frames' option.

        >>> with ks.option_context('compute.ops_on_diff_frames', True):
        ...     (ser + ser.spark.analyzed).sort_index()
        0    2
        1    4
        2    6
        dtype: int64
        """
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import first_series

        return first_series(DataFrame(self._data._internal.resolved_copy))
Example #28
def from_pandas(pdf):
    """Create DataFrame from pandas DataFrame.

    This is similar to `SparkSession.createDataFrame()` with a pandas DataFrame, but this also picks
    the index in the given pandas DataFrame.

    :param pdf: :class:`pandas.DataFrame`
    """
    if isinstance(pdf, pd.Series):
        return Series(pdf)
    elif isinstance(pdf, pd.DataFrame):
        return DataFrame(pdf)
    else:
        raise ValueError("Unknown data type: {}".format(type(pdf)))
Example #29
def _spark_col_apply(kdf_or_ks, sfun):
    """
    Applies a function to every cell of a DataFrame; the function must be a known Spark SQL function.
    """
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series
    if isinstance(kdf_or_ks, Series):
        ks = kdf_or_ks
        return Series(ks._kdf._internal.copy(scol=sfun(kdf_or_ks._scol)), anchor=ks._kdf)
    assert isinstance(kdf_or_ks, DataFrame)
    kdf = kdf_or_ks
    sdf = kdf._sdf
    sdf = sdf.select([sfun(sdf[col]).alias(col) for col in kdf.columns])
    return DataFrame(sdf)
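The helper is internal, but the same column-wise pattern can be reconstructed with the public API; a hedged sketch using F.abs as an example Spark SQL function (assuming to_spark() is available in the version at hand):

import databricks.koalas as ks
from pyspark.sql import functions as F

kdf = ks.DataFrame({'x': [-1, 2, -3], 'y': [4, -5, 6]})
sdf = kdf.to_spark()
# Apply one SQL function to every column, keeping the original column names.
sdf = sdf.select([F.abs(sdf[c]).alias(c) for c in sdf.columns])
print(ks.DataFrame(sdf))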
Example #30
    def coalesce(self, num_partitions: int) -> "ks.DataFrame":
        """
        Returns a new DataFrame that has exactly `num_partitions` partitions.

        .. note:: This operation results in a narrow dependency, e.g. if you go from 1000
            partitions to 100 partitions, there will not be a shuffle, instead each of the 100 new
            partitions will claim 10 of the current partitions. If a larger number of partitions is
            requested, it will stay at the current number of partitions. However, if you're doing a
            drastic coalesce, e.g. to num_partitions = 1, this may result in your computation taking
            place on fewer nodes than you like (e.g. one node in the case of num_partitions = 1). To
            avoid this, you can call repartition(). This will add a shuffle step, but means the
            current upstream partitions will be executed in parallel (per whatever the current
            partitioning is).

        Parameters
        ----------
        num_partitions : int
            The target number of partitions.

        Returns
        -------
        DataFrame

        Examples
        --------
        >>> kdf = ks.DataFrame({"age": [5, 5, 2, 2],
        ...         "name": ["Bob", "Bob", "Alice", "Alice"]}).set_index("age")
        >>> kdf.sort_index()  # doctest: +NORMALIZE_WHITESPACE
              name
        age
        2    Alice
        2    Alice
        5      Bob
        5      Bob
        >>> new_kdf = kdf.spark.coalesce(1)
        >>> new_kdf.to_spark().rdd.getNumPartitions()
        1
        >>> new_kdf.sort_index()   # doctest: +NORMALIZE_WHITESPACE
              name
        age
        2    Alice
        2    Alice
        5      Bob
        5      Bob
        """
        from databricks.koalas.frame import DataFrame

        internal = self._kdf._internal.resolved_copy
        coalesced_sdf = internal.spark_frame.coalesce(num_partitions)
        return DataFrame(internal.with_new_sdf(coalesced_sdf))
Example #31
    def __getitem__(self, key):
        from pyspark.sql.functions import lit
        from databricks.koalas.frame import DataFrame
        from databricks.koalas.series import Series

        def raiseNotImplemented(description):
            raise SparkPandasNotImplementedError(
                description=description,
                pandas_function=".loc[..., ...]",
                spark_target_function="select, where")

        rows_sel, cols_sel = _unfold(key, self._ks)

        sdf = self._kdf._sdf
        if isinstance(rows_sel, Series):
            sdf_for_check_schema = sdf.select(rows_sel._scol)
            assert isinstance(sdf_for_check_schema.schema.fields[0].dataType, BooleanType), \
                (str(sdf_for_check_schema), sdf_for_check_schema.schema.fields[0].dataType)
            sdf = sdf.where(rows_sel._scol)
        elif isinstance(rows_sel, slice):
            if rows_sel.step is not None:
                raiseNotImplemented("Cannot use step with Spark.")
            if rows_sel == slice(None):
                # If slice is None - select everything, so nothing to do
                pass
            elif len(self._kdf._index_columns) == 0:
                raiseNotImplemented("Cannot use slice for Spark if no index provided.")
            elif len(self._kdf._index_columns) == 1:
                start = rows_sel.start
                stop = rows_sel.stop

                index_column = self._kdf.index
                index_data_type = index_column.schema[0].dataType
                cond = []
                if start is not None:
                    cond.append(index_column._scol >= lit(start).cast(index_data_type))
                if stop is not None:
                    cond.append(index_column._scol <= lit(stop).cast(index_data_type))

                if len(cond) > 0:
                    sdf = sdf.where(reduce(lambda x, y: x & y, cond))
            else:
                raiseNotImplemented("Cannot use slice for MultiIndex with Spark.")
        elif isinstance(rows_sel, str):
            raiseNotImplemented("Cannot use a scalar value for row selection with Spark.")
        else:
            try:
                rows_sel = list(rows_sel)
            except TypeError:
                raiseNotImplemented("Cannot use a scalar value for row selection with Spark.")
            if len(rows_sel) == 0:
                sdf = sdf.where(lit(False))
            elif len(self._kdf._index_columns) == 1:
                index_column = self._kdf.index
                index_data_type = index_column.schema[0].dataType
                if len(rows_sel) == 1:
                    sdf = sdf.where(
                        index_column._scol == lit(rows_sel[0]).cast(index_data_type))
                else:
                    sdf = sdf.where(index_column._scol.isin(
                        [lit(r).cast(index_data_type) for r in rows_sel]))
            else:
                raiseNotImplemented("Cannot select with MultiIndex with Spark.")
        if cols_sel is None:
            columns = [_make_col(c) for c in self._kdf._metadata.data_columns]
        elif isinstance(cols_sel, spark.Column):
            columns = [cols_sel]
        else:
            columns = [_make_col(c) for c in cols_sel]
        try:
            kdf = DataFrame(sdf.select(self._kdf._metadata.index_columns + columns))
        except AnalysisException:
            raise KeyError('[{}] don\'t exist in columns'
                           .format([col._jc.toString() for col in columns]))
        kdf._metadata = self._kdf._metadata.copy(
            data_columns=kdf._metadata.data_columns[-len(columns):])
        if cols_sel is not None and isinstance(cols_sel, spark.Column):
            from databricks.koalas.series import _col
            return _col(kdf)
        else:
            return kdf
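A sketch of the .loc forms this implementation supports: a boolean Series, a label slice on a single-level index, or a list of labels (scalar row labels hit the "not implemented" error above). Assuming databricks.koalas as ks:

import databricks.koalas as ks

kdf = ks.DataFrame({'max_speed': [1, 4, 7], 'shield': [2, 5, 8]},
                   index=['cobra', 'viper', 'sidewinder'])

# Boolean-Series row selection (translated into a Spark `where`).
print(kdf.loc[kdf['shield'] > 6])
# Label slice: the bounds are compared by value (>= start, <= stop).
print(kdf.loc['cobra':'viper'])
# List of labels plus an explicit column selection.
print(kdf.loc[['viper', 'sidewinder'], ['max_speed']])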