def _verify_for_rename(self, name):
    """
    Validate new index names and normalize them to label tuples.

    Parameters
    ----------
    name : list-like
        The new names, one per index level.

    Returns
    -------
    list of tuple
        Each given name, wrapped in a 1-tuple unless
        ``is_name_like_tuple`` already accepts it as is.

    Raises
    ------
    TypeError
        If `name` is not list-like, or any of its elements is not hashable.
    ValueError
        If the number of names differs from the number of index levels.
    """
    if not is_list_like(name):
        raise TypeError("Must pass list-like as `names`.")

    expected_levels = self._internal.index_level
    if expected_levels != len(name):
        raise ValueError(
            "Length of new names must be {}, got {}".format(
                self._internal.index_level, len(name)
            )
        )

    if not all(is_hashable(candidate) for candidate in name):
        raise TypeError("MultiIndex.name must be a hashable type")

    normalized = []
    for candidate in name:
        # Labels are kept as tuples internally; wrap plain values.
        if is_name_like_tuple(candidate):
            normalized.append(candidate)
        else:
            normalized.append((candidate,))
    return normalized
def attach_id_column(self, id_type: str, column: Name) -> "DataFrame":
    """
    Attach a column to be used as an identifier of rows, similar to the default index.

    See also `Default Index type
    <https://koalas.readthedocs.io/en/latest/user_guide/options.html#default-index-type>`_.

    Parameters
    ----------
    id_type : string
        The id type.

        - 'sequence' : a sequence that increases one by one.

          .. note:: this uses Spark's Window without specifying partition specification.
              This leads to move all data into single partition in single machine and
              could cause serious performance degradation.
              Avoid this method against very large dataset.

        - 'distributed-sequence' : a sequence that increases one by one,
          by group-by and group-map approach in a distributed manner.
        - 'distributed' : a monotonically increasing sequence simply by using PySpark's
          monotonically_increasing_id function in a fully distributed manner.

    column : string or tuple of string
        The column name.

    Returns
    -------
    DataFrame
        A new DataFrame with the id column appended after the existing columns.

    Raises
    ------
    ValueError
        If `id_type` is not one of the supported values, if `column` does not have
        as many levels as the existing column labels, or if `column` already exists.

    Examples
    --------
    >>> df = ps.DataFrame({"x": ['a', 'b', 'c']})
    >>> df.pandas_on_spark.attach_id_column(id_type="sequence", column="id")
       x  id
    0  a   0
    1  b   1
    2  c   2

    >>> df.pandas_on_spark.attach_id_column(id_type="distributed-sequence", column=0)
       x  0
    0  a  0
    1  b  1
    2  c  2

    >>> df.pandas_on_spark.attach_id_column(id_type="distributed", column=0.0)
    ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
       x  0.0
    0  a  ...
    1  b  ...
    2  c  ...

    For multi-index columns:

    >>> df = ps.DataFrame({("x", "y"): ['a', 'b', 'c']})
    >>> df.pandas_on_spark.attach_id_column(id_type="sequence", column=("id-x", "id-y"))
       x id-x
       y id-y
    0  a    0
    1  b    1
    2  c    2

    >>> df.pandas_on_spark.attach_id_column(id_type="distributed-sequence", column=(0, 1.0))
       x   0
       y 1.0
    0  a   0
    1  b   1
    2  c   2
    """
    from pyspark.pandas.frame import DataFrame

    # Pick the InternalFrame helper that matches the requested id type.
    attachers = (
        ("sequence", InternalFrame.attach_sequence_column),
        ("distributed-sequence", InternalFrame.attach_distributed_sequence_column),
        ("distributed", InternalFrame.attach_distributed_column),
    )
    attach_func = next((func for key, func in attachers if id_type == key), None)
    if attach_func is None:
        raise ValueError(
            "id_type should be one of 'sequence', 'distributed-sequence' and 'distributed'"
        )

    assert is_name_like_value(column, allow_none=False), column
    # Column labels are handled as tuples from here on.
    new_label = column if is_name_like_tuple(column) else (column,)

    internal = self._psdf._internal

    if len(new_label) != internal.column_labels_level:
        raise ValueError(
            "The given column `{}` must be the same length as the existing columns.".format(
                new_label
            )
        )
    if new_label in internal.column_labels:
        raise ValueError(
            "The given column `{}` already exists.".format(name_like_string(new_label))
        )

    id_column_name = name_like_string(new_label)

    # Make sure the underlying Spark column names are the form of
    # `name_like_string(column_label)`.
    index_aliases = [
        index_scol.alias(SPARK_INDEX_NAME_FORMAT(position))
        for position, index_scol in enumerate(internal.index_spark_columns)
    ]
    data_aliases = [
        data_scol.alias(name_like_string(existing_label))
        for data_scol, existing_label in zip(
            internal.data_spark_columns, internal.column_labels
        )
    ]
    sdf = internal.spark_frame.select(index_aliases + data_aliases)
    sdf = attach_func(sdf, id_column_name)

    # Re-resolve every column against the new Spark frame.
    index_scols = [
        scol_for(sdf, SPARK_INDEX_NAME_FORMAT(position))
        for position in range(internal.index_level)
    ]
    data_scols = [
        scol_for(sdf, name_like_string(existing_label))
        for existing_label in internal.column_labels
    ]
    data_scols.append(scol_for(sdf, id_column_name))

    id_field = InternalField.from_struct_field(
        StructField(id_column_name, LongType(), nullable=False)
    )

    updated_internal = InternalFrame(
        spark_frame=sdf,
        index_spark_columns=index_scols,
        index_names=internal.index_names,
        index_fields=internal.index_fields,
        column_labels=internal.column_labels + [new_label],
        data_spark_columns=data_scols,
        data_fields=internal.data_fields + [id_field],
        column_label_names=internal.column_label_names,
    )
    return DataFrame(updated_internal.resolved_copy)
def to_frame(  # type: ignore[override]
    self, index: bool = True, name: Optional[List[Name]] = None
) -> DataFrame:
    """
    Create a DataFrame with the levels of the MultiIndex as columns.

    Column ordering is determined by the DataFrame constructor with data as
    a dict.

    Parameters
    ----------
    index : boolean, default True
        Set the index of the returned DataFrame as the original MultiIndex.
    name : list / sequence of strings, optional
        The passed names should substitute index level names. When omitted,
        the index level names are used, and a level without a name is
        labelled by its position.

    Returns
    -------
    DataFrame : a DataFrame containing the original MultiIndex data.

    Raises
    ------
    ValueError
        If `name` does not have one entry per index level.
    TypeError
        If `name` is given but is not list-like.

    See Also
    --------
    DataFrame

    Examples
    --------
    >>> tuples = [(1, 'red'), (1, 'blue'),
    ...           (2, 'red'), (2, 'blue')]
    >>> idx = ps.MultiIndex.from_tuples(tuples, names=('number', 'color'))
    >>> idx  # doctest: +SKIP
    MultiIndex([(1,  'red'),
                (1, 'blue'),
                (2,  'red'),
                (2, 'blue')],
               names=['number', 'color'])
    >>> idx.to_frame()  # doctest: +NORMALIZE_WHITESPACE
                  number color
    number color
    1      red         1   red
           blue        1  blue
    2      red         2   red
           blue        2  blue

    By default, the original Index is reused. To enforce a new Index:

    >>> idx.to_frame(index=False)
       number color
    0       1   red
    1       1  blue
    2       2   red
    3       2  blue

    To override the name of the resulting column, specify `name`:

    >>> idx.to_frame(name=['n', 'c'])  # doctest: +NORMALIZE_WHITESPACE
                  n     c
    number color
    1      red    1   red
           blue   1  blue
    2      red    2   red
           blue   2  blue
    """
    if name is None:
        # Fall back to the level names; unnamed levels use their position.
        column_names = [
            level_name if level_name is not None else (position,)
            for position, level_name in enumerate(self._internal.index_names)
        ]
    elif not is_list_like(name):
        raise TypeError("'name' must be a list / sequence of column names.")
    else:
        if len(name) != self._internal.index_level:
            raise ValueError(
                "'name' should have same length as number of levels on index."
            )
        column_names = [
            given if is_name_like_tuple(given) else (given,) for given in name
        ]

    return self._to_frame(index=index, names=column_names)
def from_frame(df: DataFrame, names: Optional[List[Name]] = None) -> "MultiIndex":
    """
    Make a MultiIndex from a DataFrame.

    Parameters
    ----------
    df : DataFrame
        DataFrame to be converted to MultiIndex.
    names : list-like, optional
        If no names are provided, use the column names, or tuple of column
        names if the columns is a MultiIndex. If a sequence, overwrite
        names with the given sequence.

    Returns
    -------
    MultiIndex
        The MultiIndex representation of the given DataFrame.

    Raises
    ------
    TypeError
        If `df` is not a DataFrame, or `names` is given but is not list-like.

    See Also
    --------
    MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
    MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
    MultiIndex.from_product : Make a MultiIndex from cartesian product
                              of iterables.

    Examples
    --------
    >>> df = ps.DataFrame([['HI', 'Temp'], ['HI', 'Precip'],
    ...                    ['NJ', 'Temp'], ['NJ', 'Precip']],
    ...                   columns=['a', 'b'])
    >>> df  # doctest: +SKIP
          a       b
    0    HI    Temp
    1    HI  Precip
    2    NJ    Temp
    3    NJ  Precip

    >>> ps.MultiIndex.from_frame(df)  # doctest: +SKIP
    MultiIndex([('HI',   'Temp'),
                ('HI', 'Precip'),
                ('NJ',   'Temp'),
                ('NJ', 'Precip')],
               names=['a', 'b'])

    Using explicit names, instead of the column names

    >>> ps.MultiIndex.from_frame(df, names=['state', 'observation'])  # doctest: +SKIP
    MultiIndex([('HI',   'Temp'),
                ('HI', 'Precip'),
                ('NJ',   'Temp'),
                ('NJ', 'Precip')],
               names=['state', 'observation'])
    """
    if not isinstance(df, DataFrame):
        raise TypeError("Input must be a DataFrame")

    spark_df = df.to_spark()

    if names is None:
        # Reuse the frame's own column labels as the index level names.
        level_names = df._internal.column_labels
    elif is_list_like(names):
        level_names = []
        for given in names:
            level_names.append(given if is_name_like_tuple(given) else (given,))
    else:
        raise TypeError("Names should be list-like for a MultiIndex")

    # Every Spark column of the frame becomes one index level.
    index_scols = [scol_for(spark_df, col_name) for col_name in spark_df.columns]
    internal = InternalFrame(
        spark_frame=spark_df,
        index_spark_columns=index_scols,
        index_names=level_names,
    )
    return cast(MultiIndex, DataFrame(internal).index)