def astype(self, dtype): """ Cast a Koalas object to a specified dtype ``dtype``. Parameters ---------- dtype : data type Use a numpy.dtype or Python type to cast entire pandas object to the same type. Returns ------- casted : same type as caller See Also -------- to_datetime : Convert argument to datetime. Examples -------- >>> ser = ks.Series([1, 2], dtype='int32') >>> ser 0 1 1 2 Name: 0, dtype: int32 >>> ser.astype('int64') 0 1 1 2 Name: 0, dtype: int64 >>> ser.rename("a").to_frame().set_index("a").index.astype('int64') Int64Index([1, 2], dtype='int64', name='a') """ from databricks.koalas.typedef import as_spark_type spark_type = as_spark_type(dtype) if not spark_type: raise ValueError("Type {} not understood".format(dtype)) return self._with_new_scol(self._scol.cast(spark_type))
def astype(self, dtype) -> 'Series':
    """
    Cast a Koalas object to a specified dtype ``dtype``.

    Parameters
    ----------
    dtype : data type
        Use a numpy.dtype or Python type to cast entire pandas object to
        the same type.

    Returns
    -------
    casted : same type as caller

    See Also
    --------
    to_datetime : Convert argument to datetime.

    Examples
    --------
    >>> ser = ks.Series([1, 2], dtype='int32')
    >>> ser
    0    1
    1    2
    Name: 0, dtype: int32

    >>> ser.astype('int64')
    0    1
    1    2
    Name: 0, dtype: int64
    """
    from databricks.koalas.typedef import as_spark_type

    spark_type = as_spark_type(dtype)
    if not spark_type:
        raise ValueError("Type {} not understood".format(dtype))
    return Series(self._scol.cast(spark_type), anchor=self._kdf, index=self._index_map)

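# A minimal usage sketch for the astype implementations above; not part of the
# library source. It assumes a running Spark session and that Koalas is
# importable as `ks`. The truncation on the float-to-long cast follows Spark's
# cast semantics.
import databricks.koalas as ks

ser = ks.Series([1.5, 2.5, 3.5])
casted = ser.astype('int64')        # resolved via as_spark_type to LongType
print(casted.to_pandas().tolist())  # [1, 2, 3]
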
def __init__(
    self,
    spark_frame: spark.DataFrame,
    index_map: Optional[Dict[str, Optional[Tuple]]],
    column_labels: Optional[List[Tuple]] = None,
    data_spark_columns: Optional[List[spark.Column]] = None,
    column_label_names: Optional[List[Optional[Tuple[str, ...]]]] = None,
) -> None:
    """
    Create a new internal immutable DataFrame to manage Spark DataFrame, column fields
    and index fields and names.

    :param spark_frame: Spark DataFrame to be managed.
    :param index_map: dictionary of string pairs.
        Each pair holds the index field name which exists in Spark fields,
        and the index name.
    :param column_labels: list of tuples with the same length.
        The multi-level values in the tuples.
    :param data_spark_columns: list of Spark Column.
        Spark Columns to appear as columns. If this is None, calculated from spark_frame.
    :param column_label_names: Names for each of the column index levels.

    See the examples below for what each parameter means.

    >>> column_labels = pd.MultiIndex.from_tuples(
    ...     [('a', 'x'), ('a', 'y'), ('b', 'z')],
    ...     names=["column_labels_a", "column_labels_b"])
    >>> row_index = pd.MultiIndex.from_tuples(
    ...     [('foo', 'bar'), ('foo', 'bar'), ('zoo', 'bar')],
    ...     names=["row_index_a", "row_index_b"])
    >>> kdf = ks.DataFrame(
    ...     [[1, 2, 3], [4, 5, 6], [7, 8, 9]], index=row_index, columns=column_labels)
    >>> kdf.set_index(('a', 'x'), append=True, inplace=True)
    >>> kdf  # doctest: +NORMALIZE_WHITESPACE
    column_labels_a                  a  b
    column_labels_b                  y  z
    row_index_a row_index_b (a, x)
    foo         bar         1        2  3
                            4        5  6
    zoo         bar         7        8  9

    >>> internal = kdf._internal

    >>> internal._sdf.show()  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    +-----------------+-----------------+------+------+------+...
    |__index_level_0__|__index_level_1__|(a, x)|(a, y)|(b, z)|...
    +-----------------+-----------------+------+------+------+...
    |              foo|              bar|     1|     2|     3|...
    |              foo|              bar|     4|     5|     6|...
    |              zoo|              bar|     7|     8|     9|...
    +-----------------+-----------------+------+------+------+...

    >>> internal._index_map  # doctest: +NORMALIZE_WHITESPACE
    OrderedDict([('__index_level_0__', ('row_index_a',)),
                 ('__index_level_1__', ('row_index_b',)),
                 ('(a, x)', ('a', 'x'))])

    >>> internal._column_labels
    [('a', 'y'), ('b', 'z')]

    >>> internal._data_spark_columns
    [Column<b'(a, y)'>, Column<b'(b, z)'>]

    >>> internal._column_label_names
    [('column_labels_a',), ('column_labels_b',)]
    """
    assert isinstance(spark_frame, spark.DataFrame)
    assert not spark_frame.isStreaming, "Koalas does not support Structured Streaming."

    if index_map is None:
        assert not any(
            SPARK_INDEX_NAME_PATTERN.match(name) for name in spark_frame.columns
        ), (
            "Index columns should not appear in columns of the Spark DataFrame. Avoid "
            "index column names [%s]." % SPARK_INDEX_NAME_PATTERN
        )
        if data_spark_columns is not None:
            spark_frame = spark_frame.select(data_spark_columns)

        # Create default index.
        spark_frame = InternalFrame.attach_default_index(spark_frame)
        index_map = OrderedDict({SPARK_DEFAULT_INDEX_NAME: None})

        if data_spark_columns is not None:
            data_spark_columns = [
                scol_for(spark_frame, col)
                for col in spark_frame.columns
                if col != SPARK_DEFAULT_INDEX_NAME
            ]

    if NATURAL_ORDER_COLUMN_NAME not in spark_frame.columns:
        spark_frame = spark_frame.withColumn(
            NATURAL_ORDER_COLUMN_NAME, F.monotonically_increasing_id()
        )

    assert isinstance(index_map, OrderedDict), index_map
    assert all(
        isinstance(index_field, str)
        and (
            index_name is None
            or (
                isinstance(index_name, tuple)
                and all(
                    name is None or as_spark_type(type(name)) is not None
                    for name in index_name
                )
            )
        )
        for index_field, index_name in index_map.items()
    ), index_map
    assert data_spark_columns is None or all(
        isinstance(scol, spark.Column) for scol in data_spark_columns
    )

    self._sdf = spark_frame  # type: spark.DataFrame
    self._index_map = index_map  # type: Dict[str, Optional[Tuple]]

    if data_spark_columns is None:
        index_columns = set(index_column for index_column in self._index_map)
        self._data_spark_columns = [
            scol_for(spark_frame, col)
            for col in spark_frame.columns
            if col not in index_columns and col not in HIDDEN_COLUMNS
        ]
    else:
        self._data_spark_columns = data_spark_columns

    if column_labels is None:
        self._column_labels = [
            (col,) for col in spark_frame.select(self._data_spark_columns).columns
        ]  # type: List[Tuple]
    else:
        assert len(column_labels) == len(self._data_spark_columns), (
            len(column_labels),
            len(self._data_spark_columns),
        )
        if len(column_labels) == 1:
            column_label = column_labels[0]
            assert column_label is None or (
                isinstance(column_label, tuple)
                and len(column_label) > 0
                and all(
                    label is None or as_spark_type(type(label)) is not None
                    for label in column_label
                )
            ), column_label
        else:
            assert all(
                isinstance(column_label, tuple)
                and len(column_label) > 0
                and all(
                    label is None or as_spark_type(type(label)) is not None
                    for label in column_label
                )
                for column_label in column_labels
            ), column_labels
            assert len(set(len(label) for label in column_labels)) <= 1, column_labels
        self._column_labels = column_labels

    if column_label_names is None:
        self._column_label_names = [None] * column_labels_level(
            self._column_labels
        )  # type: List[Optional[Tuple[str, ...]]]
    else:
        if len(self._column_labels) > 0:
            assert len(column_label_names) == column_labels_level(self._column_labels), (
                len(column_label_names),
                column_labels_level(self._column_labels),
            )
        else:
            assert len(column_label_names) > 0, len(column_label_names)
        assert all(
            column_label_name is None
            or (
                isinstance(column_label_name, tuple)
                and all(
                    name is None or as_spark_type(type(name)) is not None
                    for name in column_label_name
                )
            )
            for column_label_name in column_label_names
        ), column_label_names
        self._column_label_names = column_label_names

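# A minimal sketch of the default-index branch above; not part of the library
# source, and it pokes at private attributes, so treat it as illustrative only.
# Assumptions: a running Spark session, and that InternalFrame is importable
# from databricks.koalas.internal. Passing index_map=None attaches the default
# sequential index column and records it in the index map with no index name.
import databricks.koalas as ks
from databricks.koalas.internal import InternalFrame

sdf = ks.range(3).to_spark()  # a plain Spark DataFrame with one 'id' column
internal = InternalFrame(spark_frame=sdf, index_map=None)
print(internal._index_map)      # OrderedDict([('__index_level_0__', None)])
print(internal._column_labels)  # [('id',)]
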
def astype(self, dtype):
    # Resolve the numpy/Python dtype to its Spark counterpart and cast the
    # underlying Spark column, keeping the result anchored to the same
    # DataFrame and index.
    from databricks.koalas.typedef import as_spark_type

    spark_type = as_spark_type(dtype)
    if not spark_type:
        raise ValueError("Type {} not understood".format(dtype))
    return Series(self._scol.cast(spark_type), self._kdf, self._index_info)

def test_as_spark_type(self):
    type_mapper = {
        # binary
        np.character: BinaryType(),
        np.bytes_: BinaryType(),
        np.string_: BinaryType(),
        bytes: BinaryType(),
        # integer
        np.int8: ByteType(),
        np.byte: ByteType(),
        np.int16: ShortType(),
        np.int32: IntegerType(),
        np.int64: LongType(),
        np.int: LongType(),
        int: LongType(),
        # floating
        np.float32: FloatType(),
        np.float: DoubleType(),
        np.float64: DoubleType(),
        float: DoubleType(),
        # string
        np.str: StringType(),
        np.unicode_: StringType(),
        str: StringType(),
        # bool
        np.bool: BooleanType(),
        bool: BooleanType(),
        # datetime
        np.datetime64: TimestampType(),
        datetime.datetime: TimestampType(),
        # DateType
        datetime.date: DateType(),
        # DecimalType
        decimal.Decimal: DecimalType(38, 18),
        # ArrayType
        np.ndarray: ArrayType(StringType()),
        List[bytes]: ArrayType(BinaryType()),
        List[np.character]: ArrayType(BinaryType()),
        List[np.bytes_]: ArrayType(BinaryType()),
        List[np.string_]: ArrayType(BinaryType()),
        List[bool]: ArrayType(BooleanType()),
        List[np.bool]: ArrayType(BooleanType()),
        List[datetime.date]: ArrayType(DateType()),
        List[np.int8]: ArrayType(ByteType()),
        List[np.byte]: ArrayType(ByteType()),
        List[decimal.Decimal]: ArrayType(DecimalType(38, 18)),
        List[float]: ArrayType(DoubleType()),
        List[np.float]: ArrayType(DoubleType()),
        List[np.float64]: ArrayType(DoubleType()),
        List[np.float32]: ArrayType(FloatType()),
        List[np.int32]: ArrayType(IntegerType()),
        List[int]: ArrayType(LongType()),
        List[np.int]: ArrayType(LongType()),
        List[np.int64]: ArrayType(LongType()),
        List[np.int16]: ArrayType(ShortType()),
        List[str]: ArrayType(StringType()),
        List[np.unicode_]: ArrayType(StringType()),
        List[datetime.datetime]: ArrayType(TimestampType()),
        List[np.datetime64]: ArrayType(TimestampType()),
    }

    for numpy_or_python_type, spark_type in type_mapper.items():
        self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)

def test_as_spark_type_koalas_dtype(self):
    type_mapper = {
        # binary
        np.character: (np.character, BinaryType()),
        np.bytes_: (np.bytes_, BinaryType()),
        np.string_: (np.bytes_, BinaryType()),
        bytes: (np.bytes_, BinaryType()),
        # integer
        np.int8: (np.int8, ByteType()),
        np.byte: (np.int8, ByteType()),
        np.int16: (np.int16, ShortType()),
        np.int32: (np.int32, IntegerType()),
        np.int64: (np.int64, LongType()),
        np.int: (np.int64, LongType()),
        int: (np.int64, LongType()),
        # floating
        np.float32: (np.float32, FloatType()),
        np.float: (np.float64, DoubleType()),
        np.float64: (np.float64, DoubleType()),
        float: (np.float64, DoubleType()),
        # string
        np.str: (np.unicode_, StringType()),
        np.unicode_: (np.unicode_, StringType()),
        str: (np.unicode_, StringType()),
        # bool
        np.bool: (np.bool, BooleanType()),
        bool: (np.bool, BooleanType()),
        # datetime
        np.datetime64: (np.datetime64, TimestampType()),
        datetime.datetime: (np.dtype("datetime64[ns]"), TimestampType()),
        # DateType
        datetime.date: (np.dtype("object"), DateType()),
        # DecimalType
        decimal.Decimal: (np.dtype("object"), DecimalType(38, 18)),
        # ArrayType
        np.ndarray: (np.dtype("object"), ArrayType(StringType())),
        List[bytes]: (np.dtype("object"), ArrayType(BinaryType())),
        List[np.character]: (np.dtype("object"), ArrayType(BinaryType())),
        List[np.bytes_]: (np.dtype("object"), ArrayType(BinaryType())),
        List[np.string_]: (np.dtype("object"), ArrayType(BinaryType())),
        List[bool]: (np.dtype("object"), ArrayType(BooleanType())),
        List[np.bool]: (np.dtype("object"), ArrayType(BooleanType())),
        List[datetime.date]: (np.dtype("object"), ArrayType(DateType())),
        List[np.int8]: (np.dtype("object"), ArrayType(ByteType())),
        List[np.byte]: (np.dtype("object"), ArrayType(ByteType())),
        List[decimal.Decimal]: (np.dtype("object"), ArrayType(DecimalType(38, 18))),
        List[float]: (np.dtype("object"), ArrayType(DoubleType())),
        List[np.float]: (np.dtype("object"), ArrayType(DoubleType())),
        List[np.float64]: (np.dtype("object"), ArrayType(DoubleType())),
        List[np.float32]: (np.dtype("object"), ArrayType(FloatType())),
        List[np.int32]: (np.dtype("object"), ArrayType(IntegerType())),
        List[int]: (np.dtype("object"), ArrayType(LongType())),
        List[np.int]: (np.dtype("object"), ArrayType(LongType())),
        List[np.int64]: (np.dtype("object"), ArrayType(LongType())),
        List[np.int16]: (np.dtype("object"), ArrayType(ShortType())),
        List[str]: (np.dtype("object"), ArrayType(StringType())),
        List[np.unicode_]: (np.dtype("object"), ArrayType(StringType())),
        List[datetime.datetime]: (np.dtype("object"), ArrayType(TimestampType())),
        List[np.datetime64]: (np.dtype("object"), ArrayType(TimestampType())),
        # CategoricalDtype
        CategoricalDtype(categories=["a", "b", "c"]): (
            CategoricalDtype(categories=["a", "b", "c"]),
            LongType(),
        ),
    }

    for numpy_or_python_type, (dtype, spark_type) in type_mapper.items():
        self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
        self.assertEqual(koalas_dtype(numpy_or_python_type), (dtype, spark_type))

    with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."):
        as_spark_type(np.dtype("uint64"))

    with self.assertRaisesRegex(TypeError, "Type object was not understood."):
        as_spark_type(np.dtype("object"))

    with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."):
        koalas_dtype(np.dtype("uint64"))

    with self.assertRaisesRegex(TypeError, "Type object was not understood."):
        koalas_dtype(np.dtype("object"))

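# A short sketch of the two helpers the tests above exercise; not part of the
# test suite. It assumes databricks.koalas.typedef exposes as_spark_type and
# koalas_dtype with exactly the behavior asserted above: a numpy/Python type
# maps to a Spark type, and koalas_dtype additionally returns the numpy dtype.
import numpy as np
from typing import List
from databricks.koalas.typedef import as_spark_type, koalas_dtype

print(as_spark_type(int))          # LongType
print(as_spark_type(List[float]))  # ArrayType(DoubleType,true)
print(koalas_dtype(float))         # (dtype('float64'), DoubleType)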