def rank(self, method='average', ascending=True):
    kdf = super(SeriesGroupBy, self).rank(method, ascending).to_dataframe()
    return _col(DataFrame(kdf._internal.copy(
        sdf=kdf._sdf.select(kdf._internal.data_scols), index_map=[])))  # index is lost.
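# A minimal usage sketch for the grouped rank above, under the assumption of a standard
# Koalas setup; the column names 'a' and 'b' are illustrative, not taken from the source.
import databricks.koalas as ks

kdf = ks.DataFrame({'a': [1, 1, 2, 2], 'b': [10, 20, 20, 30]})
# Rank values of 'b' within each group of 'a'; as the comment in the code notes,
# the result comes back without the original index.
ranked = kdf.groupby('a')['b'].rank(method='average', ascending=True)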
def drop(self, labels):
    """
    Make new Index with passed list of labels deleted.

    Parameters
    ----------
    labels : array-like

    Returns
    -------
    dropped : Index

    Examples
    --------
    >>> index = ks.Index([1, 2, 3])
    >>> index
    Int64Index([1, 2, 3], dtype='int64')

    >>> index.drop([1])
    Int64Index([2, 3], dtype='int64')
    """
    if not isinstance(labels, (tuple, list)):
        labels = [labels]
    sdf = self._internal.sdf[~self._internal.index_scols[0].isin(labels)]
    return Index(DataFrame(_InternalFrame(sdf=sdf, index_map=self._kdf._internal.index_map)))
def __setitem__(self, key, value):
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import _col

    super(iLocIndexer, self).__setitem__(key, value)

    if self._is_series:
        internal = self._kdf_or_kser._internal
        sdf = internal.spark_frame.select(internal.index_spark_columns + [internal.spark_column])
        internal = internal.copy(
            spark_frame=sdf,
            column_labels=[internal.column_labels[0] or ("0",)],
            data_spark_columns=[scol_for(sdf, internal.data_spark_column_names[0])],
            spark_column=None,
        )
        kser = _col(DataFrame(internal))

        self._kdf_or_kser._internal = kser._internal
        self._kdf_or_kser._kdf = kser._kdf
    else:
        assert self._is_df
        # TODO: support DataFrame.

    delattr(self, "_lazy__internal")
    delattr(self, "_lazy__sequence_col")
def execute(self) -> DataFrame:
    """
    Returns a DataFrame for which the SQL statement has been executed by
    the underlying SQL engine.

    >>> str0 = 'abc'
    >>> ks.sql("select {str0}")
       abc
    0  abc

    >>> str1 = 'abc"abc'
    >>> str2 = "abc'abc"
    >>> ks.sql("select {str0}, {str1}, {str2}")
       abc  abc"abc  abc'abc
    0  abc  abc"abc  abc'abc

    >>> strs = ['a', 'b']
    >>> ks.sql("select 'a' in {strs} as cond1, 'c' in {strs} as cond2")
       cond1  cond2
    0   True  False
    """
    blocks = _string.formatter_parser(self._statement)
    # TODO: use a string builder
    res = ""
    try:
        for (pre, inner, _, _) in blocks:
            var_next = "" if inner is None else self._convert(inner)
            res = res + pre + var_next
        self._normalized_statement = res

        sdf = self._session.sql(self._normalized_statement)
    finally:
        for v in self._temp_views:
            self._session.catalog.dropTempView(v)
    return DataFrame(sdf)
def aggregate(self, func_or_funcs, *args, **kwargs):
    """Compute aggregates and returns the result as a :class:`DataFrame`.

    The available aggregate functions can be built-in aggregation functions, such as
    `avg`, `max`, `min`, `sum`, `count`.

    :param func_or_funcs: a dict mapping from column name (string) to
                          aggregate functions (string).
    """
    if not isinstance(func_or_funcs, dict) or \
            not all(isinstance(key, string_types) and isinstance(value, string_types)
                    for key, value in func_or_funcs.items()):
        raise ValueError("aggs must be a dict mapping from column name (string) to aggregate "
                         "functions (string).")

    sdf = self._groupdata.agg(func_or_funcs)

    reorder = ['%s(%s)' % (value, key) for key, value in iter(func_or_funcs.items())]
    kdf = DataFrame(sdf.select(reorder))
    kdf.columns = [key for key in iter(func_or_funcs.keys())]

    return kdf
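# Hedged usage sketch for the dict-only aggregate() above; the column names are
# illustrative and not taken from the source.
import databricks.koalas as ks

kdf = ks.DataFrame({'A': [1, 1, 2], 'B': [1, 2, 3], 'C': [4, 5, 6]})
# Maps column name (string) to aggregate function name (string), as the
# implementation requires; anything else raises ValueError.
agg_kdf = kdf.groupby('A').aggregate({'B': 'min', 'C': 'sum'})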
def _reduce_for_stat_function(self, sfun, only_numeric):
    groupkeys = self._groupkeys
    groupkey_cols = [s._scol.alias('__index_level_{}__'.format(i))
                     for i, s in enumerate(groupkeys)]
    sdf = self._kdf._sdf

    data_columns = []
    if len(self._agg_columns) > 0:
        stat_exprs = []
        for ks in self._agg_columns:
            spark_type = ks.spark_type
            # TODO: we should have a function that takes dataframes and converts the numeric
            # types. Converting the NaNs is used in a few places, it should be in utils.
            # Special handle floating point types because Spark's count treats nan as a valid
            # value, whereas Pandas count doesn't include nan.
            if isinstance(spark_type, DoubleType) or isinstance(spark_type, FloatType):
                stat_exprs.append(sfun(F.nanvl(ks._scol, F.lit(None))).alias(ks.name))
                data_columns.append(ks.name)
            elif isinstance(spark_type, NumericType) or not only_numeric:
                stat_exprs.append(sfun(ks._scol).alias(ks.name))
                data_columns.append(ks.name)
        sdf = sdf.groupby(*groupkey_cols).agg(*stat_exprs)
    else:
        sdf = sdf.select(*groupkey_cols).distinct()
    sdf = sdf.sort(*groupkey_cols)
    internal = _InternalFrame(sdf=sdf,
                              data_columns=data_columns,
                              index_map=[('__index_level_{}__'.format(i), s.name)
                                         for i, s in enumerate(groupkeys)])
    return DataFrame(internal)
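# This private helper backs the grouped reductions (max, min, sum, ...). A hedged sketch
# of how it is typically reached through the public API; column names are illustrative.
import databricks.koalas as ks

kdf = ks.DataFrame({'A': [1, 1, 2], 'B': [0.1, 0.2, float('nan')]})
# GroupBy.max() is expected to dispatch to _reduce_for_stat_function with sfun=F.max,
# with the NaN handling above applied to the floating-point column 'B'.
grouped_max = kdf.groupby('A').max()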
def drop_duplicates(self):
    """
    Return Index with duplicate values removed.

    Returns
    -------
    deduplicated : Index

    See Also
    --------
    Series.drop_duplicates : Equivalent method on Series.
    DataFrame.drop_duplicates : Equivalent method on DataFrame.

    Examples
    --------
    Generate a pandas.Index with duplicate values.

    >>> idx = ks.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo'])

    >>> idx.drop_duplicates()  # doctest: +SKIP
    Index(['lama', 'cow', 'beetle', 'hippo'], dtype='object')
    """
    sdf = self._internal.sdf.select(self._internal.index_scols).drop_duplicates()
    internal = _InternalFrame(sdf=sdf, index_map=self._kdf._internal.index_map)
    result = DataFrame(internal).index
    return result
def _cum(self, func):
    # This is used for cummin, cummax, cumsum, etc.
    if func == F.min:
        func = "cummin"
    elif func == F.max:
        func = "cummax"
    elif func == F.sum:
        func = "cumsum"
    elif func.__name__ == "cumprod":
        func = "cumprod"

    if len(self._kdf._internal.index_columns) == 0:
        raise ValueError("Index must be set.")

    applied = []
    kdf = self._kdf
    groupkey_columns = [s.name for s in self._groupkeys]
    for column in kdf._internal.data_columns:
        # pandas groupby.cumxxx ignores the grouping key itself.
        if column not in groupkey_columns:
            applied.append(getattr(kdf[column].groupby(self._groupkeys), func)())

    sdf = kdf._sdf.select(kdf._internal.index_scols + [c._scol for c in applied])
    internal = kdf._internal.copy(sdf=sdf, data_columns=[c.name for c in applied])
    return DataFrame(internal)
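# Hedged sketch of the public entry points that go through _cum above
# (cummin/cummax/cumsum/cumprod); column names are illustrative.
import databricks.koalas as ks

kdf = ks.DataFrame({'A': [1, 1, 2, 2], 'B': [1, 2, 3, 4]}).set_index('A')
# The grouping key itself is excluded from the result, mirroring pandas.
cumulative = kdf.groupby(kdf.index).cumsum()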
def from_tuples(tuples, sortorder=None, names=None):
    """
    Convert list of tuples to MultiIndex.

    Parameters
    ----------
    tuples : list / sequence of tuple-likes
        Each tuple is the index of one row/column.
    sortorder : int or None
        Level of sortedness (must be lexicographically sorted by that level).
    names : list / sequence of str, optional
        Names for the levels in the index.

    Returns
    -------
    index : MultiIndex

    Examples
    --------
    >>> tuples = [(1, 'red'), (1, 'blue'),
    ...           (2, 'red'), (2, 'blue')]
    >>> ks.MultiIndex.from_tuples(tuples, names=('number', 'color'))  # doctest: +SKIP
    MultiIndex([(1,  'red'),
                (1, 'blue'),
                (2,  'red'),
                (2, 'blue')],
               names=['number', 'color'])
    """
    return DataFrame(index=pd.MultiIndex.from_tuples(
        tuples=tuples, sortorder=sortorder, names=names)).index
def from_arrays(arrays, sortorder=None, names=None):
    """
    Convert arrays to MultiIndex.

    Parameters
    ----------
    arrays : list / sequence of array-likes
        Each array-like gives one level's value for each data point.
        len(arrays) is the number of levels.
    sortorder : int or None
        Level of sortedness (must be lexicographically sorted by that level).
    names : list / sequence of str, optional
        Names for the levels in the index.

    Returns
    -------
    index : MultiIndex

    Examples
    --------
    >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']]
    >>> ks.MultiIndex.from_arrays(arrays, names=('number', 'color'))  # doctest: +SKIP
    MultiIndex([(1,  'red'),
                (1, 'blue'),
                (2,  'red'),
                (2, 'blue')],
               names=['number', 'color'])
    """
    return DataFrame(index=pd.MultiIndex.from_arrays(
        arrays=arrays, sortorder=sortorder, names=names)).index
def read_table(name: str) -> DataFrame:
    """
    Read a Spark table and return a DataFrame.

    Parameters
    ----------
    name : string
        Table name in Spark.

    Returns
    -------
    DataFrame

    See Also
    --------
    DataFrame.to_table
    read_delta
    read_parquet
    read_spark_io

    Examples
    --------
    >>> ks.range(1).to_table('%s.my_table' % db)
    >>> ks.read_table('%s.my_table' % db)
       id
    0   0
    """
    sdf = default_session().read.table(name)
    return DataFrame(sdf)
def from_pandas(pobj: Union['pd.DataFrame', 'pd.Series']) -> Union['Series', 'DataFrame']:
    """Create a Koalas DataFrame or Series from a pandas DataFrame or Series.

    This is similar to Spark's `SparkSession.createDataFrame()` with pandas DataFrame,
    but this also works with pandas Series and picks the index.

    Parameters
    ----------
    pobj : pandas.DataFrame or pandas.Series
        pandas DataFrame or Series to read.

    Returns
    -------
    Series or DataFrame
        If a pandas Series is passed in, this function returns a Koalas Series.
        If a pandas DataFrame is passed in, this function returns a Koalas DataFrame.
    """
    if isinstance(pobj, pd.Series):
        return Series(pobj)
    elif isinstance(pobj, pd.DataFrame):
        return DataFrame(pobj)
    else:
        raise ValueError("Unknown data type: {}".format(type(pobj)))
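# Hedged usage sketch for from_pandas(); the data is illustrative.
import pandas as pd
import databricks.koalas as ks

pdf = pd.DataFrame({'x': [1, 2, 3]}, index=['a', 'b', 'c'])
kdf = ks.from_pandas(pdf)        # Koalas DataFrame, index preserved
kser = ks.from_pandas(pdf['x'])  # Koalas Series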
def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
    if bins is not None:
        raise NotImplementedError("value_counts currently does not support bins")

    if dropna:
        sdf_dropna = self._kdf._sdf.filter(self.notna()._scol)
    else:
        sdf_dropna = self._kdf._sdf
    sdf = sdf_dropna.groupby(self._scol).count()
    if sort:
        if ascending:
            sdf = sdf.orderBy(F.col('count'))
        else:
            sdf = sdf.orderBy(F.col('count').desc())

    if normalize:
        sum = sdf_dropna.count()
        sdf = sdf.withColumn('count', F.col('count') / F.lit(sum))

    index_name = 'index' if self.name != 'index' else 'level_0'
    kdf = DataFrame(sdf)
    kdf.columns = [index_name, self.name]
    kdf._metadata = Metadata(column_fields=[self.name], index_info=[(index_name, None)])
    return _col(kdf)
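# Hedged usage sketch for value_counts(); the data is illustrative.
import databricks.koalas as ks

ser = ks.Series(['a', 'b', 'a', None])
counts = ser.value_counts()                # NaN dropped by default, sorted descending
ratios = ser.value_counts(normalize=True)  # relative frequencies instead of counts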
def read_parquet(path, columns=None):
    """Load a parquet object from the file path, returning a DataFrame.

    Parameters
    ----------
    path : string
        File path
    columns : list, default=None
        If not None, only these columns will be read from the file.

    Returns
    -------
    DataFrame

    Examples
    --------
    >>> ks.read_parquet('data.parquet', columns=['name', 'gender'])  # doctest: +SKIP
    """
    if columns is not None:
        columns = list(columns)
    if columns is None or len(columns) > 0:
        sdf = default_session().read.parquet(path)
        if columns is not None:
            fields = [field.name for field in sdf.schema]
            cols = [col for col in columns if col in fields]
            if len(cols) > 0:
                sdf = sdf.select(cols)
            else:
                sdf = default_session().createDataFrame([], schema=StructType())
    else:
        sdf = default_session().createDataFrame([], schema=StructType())
    return DataFrame(sdf)
def to_pandas(self) -> pd.Index:
    """
    Return a pandas Index.

    .. note:: This method should only be used if the resulting pandas object is expected
              to be small, as all the data is loaded into the driver's memory.

    Examples
    --------
    >>> df = ks.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
    ...                   columns=['dogs', 'cats'],
    ...                   index=list('abcd'))
    >>> df['dogs'].index.to_pandas()
    Index(['a', 'b', 'c', 'd'], dtype='object')
    """
    sdf = self._kdf._sdf.select(self._scol)
    internal = self._kdf._internal.copy(
        sdf=sdf,
        index_map=[(sdf.schema[0].name, self._kdf._internal.index_names[0])],
        data_columns=[], column_index=[], column_index_names=None)
    return DataFrame(internal)._to_internal_pandas().index
def fillna(self, value):
    """
    Fill NA/NaN values with the specified value.

    Parameters
    ----------
    value : scalar
        Scalar value to use to fill holes (e.g. 0). This value cannot be a list-like.

    Returns
    -------
    Index : filled with value

    Examples
    --------
    >>> ki = ks.DataFrame({'a': ['a', 'b', 'c']}, index=[1, 2, None]).index
    >>> ki
    Float64Index([1.0, 2.0, nan], dtype='float64')

    >>> ki.fillna(0)
    Float64Index([1.0, 2.0, 0.0], dtype='float64')
    """
    if not isinstance(value, (float, int, str, bool)):
        raise TypeError("Unsupported type %s" % type(value))
    sdf = self._internal.sdf.fillna(value)
    result = DataFrame(self._kdf._internal.copy(sdf=sdf)).index
    return result
def __setitem__(self, key, value):
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import _col

    super(iLocIndexer, self).__setitem__(key, value)

    if self._is_series:
        internal = self._kdf_or_kser._internal
        sdf = internal.spark_frame.select(internal.index_spark_columns + [internal.spark_column])
        internal = internal.copy(
            spark_frame=sdf,
            column_labels=[internal.column_labels[0] or ("0",)],
            data_spark_columns=[scol_for(sdf, internal.data_spark_column_names[0])],
            spark_column=None,
        )
        kser = _col(DataFrame(internal))

        self._kdf_or_kser._internal = kser._internal
        self._kdf_or_kser._kdf = kser._kdf
    else:
        assert self._is_df

    # Clean up implicitly cached properties to be able to reuse the indexer.
    del self._internal
    del self._sequence_col
def unique(self):
    """
    Return unique values of Series object.

    Uniques are returned in order of appearance. Hash table-based unique,
    therefore does NOT sort.

    .. note:: This method returns a newly created Series, whereas pandas returns
        the unique values as a NumPy array.

    Returns
    -------
    Returns the unique values as a Series. See Examples section.

    Examples
    --------
    >>> ks.Series([2, 1, 3, 3], name='A').unique()
    0    1
    1    3
    2    2
    Name: A, dtype: int64

    >>> ks.Series([pd.Timestamp('2016-01-01') for _ in range(3)]).unique()
    0   2016-01-01
    Name: 0, dtype: datetime64[ns]
    """
    sdf = self.to_dataframe()._sdf
    return _col(DataFrame(sdf.select(self._scol).distinct()))
def intersection(self, other) -> "MultiIndex":
    """
    Form the intersection of two Index objects.

    This returns a new Index with elements common to the index and `other`.

    Parameters
    ----------
    other : Index or array-like

    Returns
    -------
    intersection : MultiIndex

    Examples
    --------
    >>> midx1 = ks.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
    >>> midx2 = ks.MultiIndex.from_tuples([("c", "z"), ("d", "w")])
    >>> midx1.intersection(midx2).sort_values()  # doctest: +SKIP
    MultiIndex([('c', 'z')],
               )
    """
    if isinstance(other, Series) or not is_list_like(other):
        raise TypeError("other must be a MultiIndex or a list of tuples")
    elif isinstance(other, DataFrame):
        raise ValueError("Index data must be 1-dimensional")
    elif isinstance(other, MultiIndex):
        spark_frame_other = other.to_frame().to_spark()
        keep_name = self.names == other.names
    elif isinstance(other, Index):
        # Always returns an empty MultiIndex if `other` is Index.
        return self.to_frame().head(0).index  # type: ignore
    elif not all(isinstance(item, tuple) for item in other):
        raise TypeError("other must be a MultiIndex or a list of tuples")
    else:
        other = MultiIndex.from_tuples(list(other))
        spark_frame_other = other.to_frame().to_spark()
        keep_name = True

    default_name = [SPARK_INDEX_NAME_FORMAT(i) for i in range(self.nlevels)]
    spark_frame_self = self.to_frame(name=default_name).to_spark()
    spark_frame_intersected = spark_frame_self.intersect(spark_frame_other)
    if keep_name:
        index_names = self._internal.index_names
    else:
        index_names = None

    internal = InternalFrame(  # TODO: dtypes?
        spark_frame=spark_frame_intersected,
        index_spark_columns=[
            scol_for(spark_frame_intersected, col) for col in default_name
        ],
        index_names=index_names,
    )
    return cast(MultiIndex, DataFrame(internal).index)
def size(self):
    """
    Compute group sizes.

    See Also
    --------
    databricks.koalas.Series.groupby
    databricks.koalas.DataFrame.groupby

    Examples
    --------
    >>> df = ks.DataFrame({'A': [1, 2, 2, 3, 3, 3],
    ...                    'B': [1, 1, 2, 3, 3, 3]},
    ...                   columns=['A', 'B'])
    >>> df
       A  B
    0  1  1
    1  2  1
    2  2  2
    3  3  3
    4  3  3
    5  3  3

    >>> df.groupby('A').size().sort_index()  # doctest: +NORMALIZE_WHITESPACE
    A
    1    1
    2    2
    3    3
    Name: count, dtype: int64

    >>> df.groupby(['A', 'B']).size().sort_index()  # doctest: +NORMALIZE_WHITESPACE
    A  B
    1  1    1
    2  1    1
       2    1
    3  3    3
    Name: count, dtype: int64
    """
    groupkeys = self._groupkeys
    groupkey_cols = [s._scol.alias('__index_level_{}__'.format(i))
                     for i, s in enumerate(groupkeys)]
    sdf = self._kdf._sdf
    sdf = sdf.groupby(*groupkey_cols).count()
    if (len(self._agg_columns) > 0) and (self._have_agg_columns):
        name = self._agg_columns[0].name
        sdf = sdf.withColumnRenamed('count', name)
    else:
        name = 'count'
    internal = _InternalFrame(sdf=sdf,
                              data_columns=[name],
                              index_map=[('__index_level_{}__'.format(i), s.name)
                                         for i, s in enumerate(groupkeys)])
    return _col(DataFrame(internal))
def _init_from_pandas(self, s, *args):
    """
    Creates Koalas Series from Pandas Series.

    :param s: Pandas Series
    """
    kdf = DataFrame(pd.DataFrame(s))
    self._init_from_spark(kdf._sdf[kdf._metadata.column_fields[0]],
                          kdf, kdf._metadata.index_info)
def range(start: int,
          end: Optional[int] = None,
          step: int = 1,
          num_partitions: Optional[int] = None) -> DataFrame:
    """
    Create a DataFrame with some range of numbers.

    The resulting DataFrame has a single int64 column named `id`, containing elements in a range
    from ``start`` to ``end`` (exclusive) with step value ``step``. If only the first parameter
    (i.e. start) is specified, we treat it as the end value with the start value being 0.

    This is similar to the range function in SparkSession and is used primarily for testing.

    Parameters
    ----------
    start : int
        the start value (inclusive)
    end : int, optional
        the end value (exclusive)
    step : int, optional, default 1
        the incremental step
    num_partitions : int, optional
        the number of partitions of the DataFrame

    Returns
    -------
    DataFrame

    Examples
    --------
    When the first parameter is specified, we generate a range of values up till that number.

    >>> ks.range(5)
       id
    0   0
    1   1
    2   2
    3   3
    4   4

    When start, end, and step are specified:

    >>> ks.range(start=100, end=200, step=20)
        id
    0  100
    1  120
    2  140
    3  160
    4  180
    """
    sdf = default_session().range(start=start, end=end, step=step, numPartitions=num_partitions)
    return DataFrame(sdf)
def dropna(self):
    """
    Return Index or MultiIndex without NA/NaN values.

    Examples
    --------
    >>> df = ks.DataFrame([[1, 2], [4, 5], [7, 8]],
    ...                   index=['cobra', 'viper', None],
    ...                   columns=['max_speed', 'shield'])
    >>> df
           max_speed  shield
    cobra          1       2
    viper          4       5
    NaN            7       8

    >>> df.index.dropna()
    Index(['cobra', 'viper'], dtype='object')

    Also supports MultiIndex.

    >>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
    ...                       [None, 'weight', 'length']],
    ...                      [[0, 1, 1, 1, 1, 1, 2, 2, 2],
    ...                       [0, 1, 1, 0, 1, 2, 1, 1, 2]])
    >>> s = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, None],
    ...               index=midx)
    >>> s
    lama    NaN        45.0
    cow     weight    200.0
            weight      1.2
            NaN        30.0
            weight    250.0
            length      1.5
    falcon  weight    320.0
            weight      1.0
            length      NaN
    Name: 0, dtype: float64

    >>> s.index.dropna()  # doctest: +SKIP
    MultiIndex([(   'cow', 'weight'),
                (   'cow', 'weight'),
                (   'cow', 'weight'),
                (   'cow', 'length'),
                ('falcon', 'weight'),
                ('falcon', 'weight'),
                ('falcon', 'length')],
               )
    """
    kdf = self._kdf.copy()
    sdf = kdf._internal.sdf.select(self._internal.index_scols).dropna()
    internal = _InternalFrame(sdf=sdf, index_map=self._internal.index_map)
    kdf = DataFrame(internal)
    return Index(kdf) if type(self) == Index else MultiIndex(kdf)
def rename(self, name, inplace=False):
    """
    Alter Index name.
    Able to set new names without level. Defaults to returning new index.

    Parameters
    ----------
    name : label or list of labels
        Name(s) to set.
    inplace : boolean, default False
        Modifies the object directly, instead of creating a new Index.

    Returns
    -------
    Index
        The same type as the caller or None if inplace is True.

    Examples
    --------
    >>> df = ks.DataFrame({'a': ['A', 'C'], 'b': ['A', 'B']}, columns=['a', 'b'])
    >>> df.index.rename("c")
    Int64Index([0, 1], dtype='int64', name='c')

    >>> df.set_index("a", inplace=True)
    >>> df.index.rename("d")
    Index(['A', 'C'], dtype='object', name='d')

    You can also change the index name in place.

    >>> df.index.rename("e", inplace=True)
    Index(['A', 'C'], dtype='object', name='e')

    >>> df  # doctest: +NORMALIZE_WHITESPACE
       b
    e
    A  A
    C  B
    """
    index_columns = self._kdf._internal.index_columns
    assert len(index_columns) == 1

    sdf = self._kdf._sdf.select([self._scol] + self._kdf._internal.data_scols)
    internal = self._kdf._internal.copy(sdf=sdf, index_map=[(sdf.schema[0].name, name)])

    if inplace:
        self._kdf._internal = internal
        return self
    else:
        return DataFrame(internal).index
def __repr__(self):
    sdf = self._kdf._sdf.select(self._scol).limit(max_display_count + 1)
    internal = self._kdf._internal.copy(
        sdf=sdf,
        index_map=[(sdf.schema[0].name, self._kdf._internal.index_names[0])],
        data_columns=[])
    pindex = DataFrame(internal).index.to_pandas()

    pindex_length = len(pindex)
    repr_string = repr(pindex[:max_display_count])

    if pindex_length > max_display_count:
        footer = '\nShowing only the first {}'.format(max_display_count)
        return repr_string + footer
    return repr_string
def analyzed(self) -> "ks.Series": """ Returns a new Series with the analyzed Spark DataFrame. After multiple operations, the underlying Spark plan could grow huge and make the Spark planner take a long time to finish the planning. This function is for the workaround to avoid it. .. note:: After analyzed, operations between the analyzed Series and the original one will **NOT** work without setting a config `compute.ops_on_diff_frames` to `True`. Returns ------- Series Examples -------- >>> ser = ks.Series([1, 2, 3]) >>> ser 0 1 1 2 2 3 dtype: int64 The analyzed one should return the same value. >>> ser.spark.analyzed 0 1 1 2 2 3 dtype: int64 However, it won't work with the same anchor Series. >>> ser + ser.spark.analyzed Traceback (most recent call last): ... ValueError: ... enable 'compute.ops_on_diff_frames' option. >>> with ks.option_context('compute.ops_on_diff_frames', True): ... (ser + ser.spark.analyzed).sort_index() 0 2 1 4 2 6 dtype: int64 """ from databricks.koalas.frame import DataFrame from databricks.koalas.series import first_series return first_series(DataFrame(self._data._internal.resolved_copy))
def from_pandas(pdf):
    """Create a Koalas DataFrame or Series from a pandas DataFrame or Series.

    This is similar to `SparkSession.createDataFrame()` with pandas DataFrame,
    but this also picks the index in the given pandas DataFrame.

    :param pdf: :class:`pandas.DataFrame`
    """
    if isinstance(pdf, pd.Series):
        return Series(pdf)
    elif isinstance(pdf, pd.DataFrame):
        return DataFrame(pdf)
    else:
        raise ValueError("Unknown data type: {}".format(type(pdf)))
def _spark_col_apply(kdf_or_ks, sfun):
    """
    Applies a function to all cells of a DataFrame or Series, the function being a
    known SQL function.
    """
    from databricks.koalas.frame import DataFrame
    from databricks.koalas.series import Series
    if isinstance(kdf_or_ks, Series):
        ks = kdf_or_ks
        return Series(ks._kdf._internal.copy(scol=sfun(kdf_or_ks._scol)), anchor=ks._kdf)
    assert isinstance(kdf_or_ks, DataFrame)
    kdf = kdf_or_ks
    sdf = kdf._sdf
    sdf = sdf.select([sfun(sdf[col]).alias(col) for col in kdf.columns])
    return DataFrame(sdf)
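# Hedged sketch of how this private helper might be invoked. F.abs is a real Spark SQL
# function, but the call site below is illustrative rather than taken from the source.
import databricks.koalas as ks
from pyspark.sql import functions as F

kdf = ks.DataFrame({'x': [-1, 2, -3]})
absolute = _spark_col_apply(kdf, F.abs)  # applies F.abs to every column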
def coalesce(self, num_partitions: int) -> "ks.DataFrame":
    """
    Returns a new DataFrame that has exactly `num_partitions` partitions.

    .. note:: This operation results in a narrow dependency, e.g. if you go from 1000
        partitions to 100 partitions, there will not be a shuffle, instead each of the 100 new
        partitions will claim 10 of the current partitions. If a larger number of partitions is
        requested, it will stay at the current number of partitions. However, if you're doing a
        drastic coalesce, e.g. to num_partitions = 1, this may result in your computation taking
        place on fewer nodes than you like (e.g. one node in the case of num_partitions = 1). To
        avoid this, you can call repartition(). This will add a shuffle step, but means the
        current upstream partitions will be executed in parallel (per whatever the current
        partitioning is).

    Parameters
    ----------
    num_partitions : int
        The target number of partitions.

    Returns
    -------
    DataFrame

    Examples
    --------
    >>> kdf = ks.DataFrame({"age": [5, 5, 2, 2],
    ...                     "name": ["Bob", "Bob", "Alice", "Alice"]}).set_index("age")
    >>> kdf.sort_index()  # doctest: +NORMALIZE_WHITESPACE
          name
    age
    2    Alice
    2    Alice
    5      Bob
    5      Bob
    >>> new_kdf = kdf.spark.coalesce(1)
    >>> new_kdf.to_spark().rdd.getNumPartitions()
    1
    >>> new_kdf.sort_index()  # doctest: +NORMALIZE_WHITESPACE
          name
    age
    2    Alice
    2    Alice
    5      Bob
    5      Bob
    """
    from databricks.koalas.frame import DataFrame

    internal = self._kdf._internal.resolved_copy
    coalesced_sdf = internal.spark_frame.coalesce(num_partitions)
    return DataFrame(internal.with_new_sdf(coalesced_sdf))
def analyzed(self) -> "ks.DataFrame": """ Returns a new DataFrame with the analyzed Spark DataFrame. After multiple operations, the underlying Spark plan could grow huge and make the Spark planner take a long time to finish the planning. This function is for the workaround to avoid it. .. note:: After analyzed, operations between the analyzed DataFrame and the original one will **NOT** work without setting a config `compute.ops_on_diff_frames` to `True`. Returns ------- DataFrame Examples -------- >>> df = ks.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, columns=["a", "b"]) >>> df a b 0 1 4 1 2 5 2 3 6 The analyzed one should return the same value. >>> df.spark.analyzed a b 0 1 4 1 2 5 2 3 6 However, it won't work with the same anchor Series. >>> df + df.spark.analyzed Traceback (most recent call last): ... ValueError: ... enable 'compute.ops_on_diff_frames' option. >>> with ks.option_context('compute.ops_on_diff_frames', True): ... (df + df.spark.analyzed).sort_index() a b 0 2 8 1 4 10 2 6 12 """ from databricks.koalas.frame import DataFrame return DataFrame(self._kdf._internal.resolved_copy)