def test_data_type_ops(self):
    _mock_spark_type = DataType()
    _mock_dtype = ExtensionDtype()
    _mappings = (
        (CategoricalDtype(), _mock_spark_type, CategoricalOps),
        (_mock_dtype, DecimalType(), DecimalOps),
        (_mock_dtype, FractionalType(), FractionalOps),
        (_mock_dtype, IntegralType(), IntegralOps),
        (_mock_dtype, StringType(), StringOps),
        (_mock_dtype, BooleanType(), BooleanOps),
        (_mock_dtype, TimestampType(), DatetimeOps),
        (_mock_dtype, TimestampNTZType(), DatetimeNTZOps),
        (_mock_dtype, DateType(), DateOps),
        (_mock_dtype, DayTimeIntervalType(), TimedeltaOps),
        (_mock_dtype, BinaryType(), BinaryOps),
        (_mock_dtype, ArrayType(StringType()), ArrayOps),
        (_mock_dtype, MapType(StringType(), IntegralType()), MapOps),
        (_mock_dtype, StructType(), StructOps),
        (_mock_dtype, NullType(), NullOps),
        (_mock_dtype, UserDefinedType(), UDTOps),
    )
    for _dtype, _spark_type, _ops in _mappings:
        self.assertIsInstance(DataTypeOps(_dtype, _spark_type), _ops)

    _unknown_spark_type = _mock_spark_type
    self.assertRaises(TypeError, DataTypeOps, BooleanType(), _unknown_spark_type)
def setUpClass(cls):
    from datetime import date, datetime
    from decimal import Decimal

    super(ArrowTests, cls).setUpClass()
    cls.warnings_lock = threading.Lock()

    # Synchronize default timezone between Python and Java
    cls.tz_prev = os.environ.get("TZ", None)  # save current tz if set
    tz = "America/Los_Angeles"
    os.environ["TZ"] = tz
    time.tzset()

    cls.spark.conf.set("spark.sql.session.timeZone", tz)

    # Test fallback
    cls.spark.conf.set("spark.sql.execution.arrow.enabled", "false")
    assert cls.spark.conf.get("spark.sql.execution.arrow.pyspark.enabled") == "false"
    cls.spark.conf.set("spark.sql.execution.arrow.enabled", "true")
    assert cls.spark.conf.get("spark.sql.execution.arrow.pyspark.enabled") == "true"

    cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "true")
    assert cls.spark.conf.get("spark.sql.execution.arrow.pyspark.fallback.enabled") == "true"
    cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "false")
    assert cls.spark.conf.get("spark.sql.execution.arrow.pyspark.fallback.enabled") == "false"

    # Enable Arrow optimization in these tests.
    cls.spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
    # Disable fallback by default to easily detect the failures.
    cls.spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled", "false")

    cls.schema_wo_null = StructType([
        StructField("1_str_t", StringType(), True),
        StructField("2_int_t", IntegerType(), True),
        StructField("3_long_t", LongType(), True),
        StructField("4_float_t", FloatType(), True),
        StructField("5_double_t", DoubleType(), True),
        StructField("6_decimal_t", DecimalType(38, 18), True),
        StructField("7_date_t", DateType(), True),
        StructField("8_timestamp_t", TimestampType(), True),
        StructField("9_binary_t", BinaryType(), True)
    ])
    cls.schema = cls.schema_wo_null.add("10_null_t", NullType(), True)
    cls.data_wo_null = [
        (u"a", 1, 10, 0.2, 2.0, Decimal("2.0"),
         date(1969, 1, 1), datetime(1969, 1, 1, 1, 1, 1), bytearray(b"a")),
        (u"b", 2, 20, 0.4, 4.0, Decimal("4.0"),
         date(2012, 2, 2), datetime(2012, 2, 2, 2, 2, 2), bytearray(b"bb")),
        (u"c", 3, 30, 0.8, 6.0, Decimal("6.0"),
         date(2100, 3, 3), datetime(2100, 3, 3, 3, 3, 3), bytearray(b"ccc")),
        (u"d", 4, 40, 1.0, 8.0, Decimal("8.0"),
         date(2262, 4, 12), datetime(2262, 3, 3, 3, 3, 3), bytearray(b"dddd")),
    ]
    cls.data = [tuple(list(d) + [None]) for d in cls.data_wo_null]
def makeSchema(columns):
    struct_field_map = {'string': StringType(),
                        'date': TimestampType(),
                        'double': DoubleType(),
                        'int': IntegerType(),
                        'none': NullType()}
    fields = [StructField(k, struct_field_map[v], True) for k, v in columns]
    return StructType(fields)
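# A minimal usage sketch for makeSchema above. The column list here is
# hypothetical (not from the original code): each (name, type-key) pair is
# resolved through struct_field_map, so 'date' maps to TimestampType and
# 'none' maps to NullType.
example_columns = [('event', 'string'), ('when', 'date'),
                   ('score', 'double'), ('missing', 'none')]
example_schema = makeSchema(example_columns)
# example_schema is a StructType with nullable fields in the order given above.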
def from_arrow_type(at):
    """ Convert pyarrow type to Spark data type. """
    from distutils.version import LooseVersion
    import pyarrow as pa
    import pyarrow.types as types

    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_map(at):
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = MapType(from_arrow_type(at.key_type), from_arrow_type(at.item_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError("Nested StructType not supported in conversion from Arrow: " + str(at))
        return StructType(
            [StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
             for field in at])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type
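# A short usage sketch for from_arrow_type above: converting a pyarrow schema
# field by field into a Spark StructType. The schema and column names are
# illustrative, not part of the original code.
import pyarrow as pa
from pyspark.sql.types import StructField, StructType

arrow_schema = pa.schema([
    ("id", pa.int64()),            # -> LongType
    ("price", pa.decimal128(10, 2)),  # -> DecimalType(10, 2)
    ("tags", pa.list_(pa.string())),  # -> ArrayType(StringType())
])
spark_schema = StructType([
    StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
    for field in arrow_schema
])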
def test_unsupported_types(self):
    common_err_msg = 'Invalid return type.*grouped map Pandas UDF.*'
    unsupported_types = [
        StructField('arr_ts', ArrayType(TimestampType())),
        StructField('null', NullType()),
        StructField('struct', StructType([StructField('l', LongType())])),
    ]
    for unsupported_type in unsupported_types:
        schema = StructType([StructField('id', LongType(), True), unsupported_type])
        with QuietTest(self.sc):
            with self.assertRaisesRegex(NotImplementedError, common_err_msg):
                pandas_udf(lambda x: x, schema, PandasUDFType.GROUPED_MAP)
def test_get_col_info(self):
    with spark_session('test_get_col_info') as spark:
        data = [
            [0, 0.0, None, [1, 1], DenseVector([1.0, 1.0]),
             SparseVector(2, {1: 1.0}), DenseVector([1.0, 1.0])],
            [1, None, None, [1, 1], DenseVector([1.0, 1.0]),
             SparseVector(2, {1: 1.0}), SparseVector(2, {1: 1.0})],
        ]
        schema = StructType([
            StructField('int', IntegerType()),
            StructField('float', FloatType()),
            StructField('null', NullType()),
            StructField('array', ArrayType(IntegerType())),
            StructField('dense', VectorUDT()),
            StructField('sparse', VectorUDT()),
            StructField('mixed', VectorUDT()),
        ])
        df = create_test_data_from_schema(spark, data, schema)
        all_col_types, col_shapes, col_max_sizes = util._get_col_info(df)

        expected = [
            ('int', {int}, 1, 1),
            ('float', {float, NullType}, 1, 1),
            ('null', {NullType}, 1, 1),
            ('array', {list}, 2, 2),
            ('dense', {DenseVector}, 2, 2),
            ('sparse', {SparseVector}, 2, 1),
            ('mixed', {DenseVector, SparseVector}, 2, 2),
        ]
        for expected_col_info in expected:
            col_name, col_types, col_shape, col_size = expected_col_info
            assert all_col_types[col_name] == col_types, col_name
            assert col_shapes[col_name] == col_shape, col_name
            assert col_max_sizes[col_name] == col_size, col_name
def test_toPandas_empty_df_arrow_enabled(self):
    # SPARK-30537 test that toPandas() on an empty dataframe has the correct dtypes
    # when arrow is enabled
    from datetime import date
    from decimal import Decimal

    schema = StructType([
        StructField("a", StringType(), True),
        StructField("a", IntegerType(), True),
        StructField("c", TimestampType(), True),
        StructField("d", NullType(), True),
        StructField("e", LongType(), True),
        StructField("f", FloatType(), True),
        StructField("g", DateType(), True),
        StructField("h", BinaryType(), True),
        StructField("i", DecimalType(38, 18), True),
        StructField("k", TimestampNTZType(), True),
        StructField("L", DayTimeIntervalType(0, 3), True),
    ])
    df = self.spark.createDataFrame(self.spark.sparkContext.emptyRDD(), schema=schema)
    non_empty_df = self.spark.createDataFrame(
        [(
            "a",
            1,
            datetime.datetime(1969, 1, 1, 1, 1, 1),
            None,
            10,
            0.2,
            date(1969, 1, 1),
            bytearray(b"a"),
            Decimal("2.0"),
            datetime.datetime(1969, 1, 1, 1, 1, 1),
            datetime.timedelta(microseconds=123),
        )],
        schema=schema,
    )

    pdf, pdf_arrow = self._toPandas_arrow_toggle(df)
    pdf_non_empty, pdf_arrow_non_empty = self._toPandas_arrow_toggle(non_empty_df)
    assert_frame_equal(pdf, pdf_arrow)
    self.assertTrue(pdf_arrow.dtypes.equals(pdf_arrow_non_empty.dtypes))
    self.assertTrue(pdf_arrow.dtypes.equals(pdf_non_empty.dtypes))
package_id_product_id_map = entity_package_data_frame.dropna(
    subset=['PACKAGE_FDC_ID', 'ESD_PRODUCT_FDC_ID']).set_index(
        'PACKAGE_FDC_ID')['ESD_PRODUCT_FDC_ID'].to_dict()

# In[ ]:

# package_id_product_id_map

# In[ ]:

from pyspark.sql.types import NullType

audit_data_frame = audit_data_frame.dropna(subset=['PACKAGE_FDC_ID'])

# Return None (rather than a NullType() instance) for unmapped package ids, so
# that the later dropna on PRODUCT_FDC_ID can filter those rows out; NullType
# is a schema type, not a null value.
get_product_id = spark_session.udf.register(
    'get_product_id',
    lambda package_id: package_id_product_id_map[package_id]
    if package_id in package_id_product_id_map else None)

audit_data_frame = audit_data_frame.withColumn(
    'PRODUCT_FDC_ID', get_product_id('PACKAGE_FDC_ID'))
audit_data_frame = audit_data_frame.dropna(subset=['PRODUCT_FDC_ID'])
audit_data_frame.head(1)

# In[ ]:

audit_data_frame.count()

# In[ ]:

from pyspark.ml.feature import StringIndexer

str_indexer = StringIndexer(inputCol='PRODUCT_FDC_ID', outputCol='label')
audit_data_frame = str_indexer.fit(audit_data_frame).transform(audit_data_frame)
def test_supported_types(self):
    values = [
        1, 2, 3, 4, 5, 1.1, 2.2, Decimal(1.123),
        [1, 2, 2], True, 'hello', bytearray([0x01, 0x02]), None
    ]
    output_fields = [('id', IntegerType()), ('byte', ByteType()), ('short', ShortType()),
                     ('int', IntegerType()), ('long', LongType()), ('float', FloatType()),
                     ('double', DoubleType()), ('decim', DecimalType(10, 3)),
                     ('array', ArrayType(IntegerType())), ('bool', BooleanType()),
                     ('str', StringType()), ('bin', BinaryType()), ('null', NullType())]
    output_schema = StructType([StructField(*x) for x in output_fields])
    df = self.spark.createDataFrame([values], schema=output_schema)

    # Different forms of group map pandas UDF, results of these are the same
    udf1 = pandas_udf(
        lambda pdf: pdf.assign(byte=pdf.byte * 2,
                               short=pdf.short * 2,
                               int=pdf.int * 2,
                               long=pdf.long * 2,
                               float=pdf.float * 2,
                               double=pdf.double * 2,
                               decim=pdf.decim * 2,
                               bool=False if pdf.bool else True,
                               str=pdf.str + 'there',
                               array=pdf.array,
                               bin=pdf.bin,
                               null=pdf.null),
        output_schema,
        PandasUDFType.GROUPED_MAP)

    udf2 = pandas_udf(
        lambda _, pdf: pdf.assign(byte=pdf.byte * 2,
                                  short=pdf.short * 2,
                                  int=pdf.int * 2,
                                  long=pdf.long * 2,
                                  float=pdf.float * 2,
                                  double=pdf.double * 2,
                                  decim=pdf.decim * 2,
                                  bool=False if pdf.bool else True,
                                  str=pdf.str + 'there',
                                  array=pdf.array,
                                  bin=pdf.bin,
                                  null=pdf.null),
        output_schema,
        PandasUDFType.GROUPED_MAP)

    udf3 = pandas_udf(
        lambda key, pdf: pdf.assign(id=key[0],
                                    byte=pdf.byte * 2,
                                    short=pdf.short * 2,
                                    int=pdf.int * 2,
                                    long=pdf.long * 2,
                                    float=pdf.float * 2,
                                    double=pdf.double * 2,
                                    decim=pdf.decim * 2,
                                    bool=False if pdf.bool else True,
                                    str=pdf.str + 'there',
                                    array=pdf.array,
                                    bin=pdf.bin,
                                    null=pdf.null),
        output_schema,
        PandasUDFType.GROUPED_MAP)

    result1 = df.groupby('id').apply(udf1).sort('id').toPandas()
    expected1 = df.toPandas().groupby('id').apply(udf1.func).reset_index(drop=True)

    result2 = df.groupby('id').apply(udf2).sort('id').toPandas()
    expected2 = expected1

    result3 = df.groupby('id').apply(udf3).sort('id').toPandas()
    expected3 = expected1

    assert_frame_equal(expected1, result1)
    assert_frame_equal(expected2, result2)
    assert_frame_equal(expected3, result3)
"null": "null", "vector": "vector", "timestamp": "datetime" } SPARK_DTYPES_DICT = {"string": StringType, "int": IntegerType, "float": FloatType, "double": DoubleType, "boolean": BooleanType, "struct": StructType, "array": ArrayType, "bigint": LongType, "date": DateType, "byte": ByteType, "short": ShortType, "datetime": TimestampType, "binary": BinaryType, "null": NullType, "vector": VectorUDT } SPARK_DTYPES_DICT_OBJECTS = \ {"string": StringType(), "int": IntegerType(), "float": FloatType(), "double": DoubleType(), "boolean": BooleanType(), "struct": StructType(), "array": ArrayType(StringType()), "bigint": LongType(), "date": DateType(), "byte": ByteType(), "short": ShortType(), "datetime": TimestampType(), "binary": BinaryType(), "null": NullType() } # Profiler PROFILER_COLUMN_TYPES = {"categorical", "numeric", "date", "null", "array", "binary"} SPARK_DTYPES_TO_PROFILER = {"int": ["smallint", "tinyint", "bigint", "int"], "decimal": ["float", "double"], "string": "string", "date": {"date", "timestamp"}, "boolean": "boolean", "binary": "binary", "array": "array", "object": "object", "null": "null", "missing": "missing"} from enum import Enum class Actions(Enum): """ Actions that modify a columns.
def test_supported_types(self):
    values = [
        1, 2, 3, 4, 5, 1.1, 2.2, Decimal(1.123),
        [1, 2, 2], True, "hello", bytearray([0x01, 0x02]), None,
    ]
    output_fields = [
        ("id", IntegerType()),
        ("byte", ByteType()),
        ("short", ShortType()),
        ("int", IntegerType()),
        ("long", LongType()),
        ("float", FloatType()),
        ("double", DoubleType()),
        ("decim", DecimalType(10, 3)),
        ("array", ArrayType(IntegerType())),
        ("bool", BooleanType()),
        ("str", StringType()),
        ("bin", BinaryType()),
        ("null", NullType()),
    ]
    output_schema = StructType([StructField(*x) for x in output_fields])
    df = self.spark.createDataFrame([values], schema=output_schema)

    # Different forms of group map pandas UDF, results of these are the same
    udf1 = pandas_udf(
        lambda pdf: pdf.assign(
            byte=pdf.byte * 2,
            short=pdf.short * 2,
            int=pdf.int * 2,
            long=pdf.long * 2,
            float=pdf.float * 2,
            double=pdf.double * 2,
            decim=pdf.decim * 2,
            bool=False if pdf.bool else True,
            str=pdf.str + "there",
            array=pdf.array,
            bin=pdf.bin,
            null=pdf.null,
        ),
        output_schema,
        PandasUDFType.GROUPED_MAP,
    )

    udf2 = pandas_udf(
        lambda _, pdf: pdf.assign(
            byte=pdf.byte * 2,
            short=pdf.short * 2,
            int=pdf.int * 2,
            long=pdf.long * 2,
            float=pdf.float * 2,
            double=pdf.double * 2,
            decim=pdf.decim * 2,
            bool=False if pdf.bool else True,
            str=pdf.str + "there",
            array=pdf.array,
            bin=pdf.bin,
            null=pdf.null,
        ),
        output_schema,
        PandasUDFType.GROUPED_MAP,
    )

    udf3 = pandas_udf(
        lambda key, pdf: pdf.assign(
            id=key[0],
            byte=pdf.byte * 2,
            short=pdf.short * 2,
            int=pdf.int * 2,
            long=pdf.long * 2,
            float=pdf.float * 2,
            double=pdf.double * 2,
            decim=pdf.decim * 2,
            bool=False if pdf.bool else True,
            str=pdf.str + "there",
            array=pdf.array,
            bin=pdf.bin,
            null=pdf.null,
        ),
        output_schema,
        PandasUDFType.GROUPED_MAP,
    )

    result1 = df.groupby("id").apply(udf1).sort("id").toPandas()
    expected1 = df.toPandas().groupby("id").apply(udf1.func).reset_index(drop=True)

    result2 = df.groupby("id").apply(udf2).sort("id").toPandas()
    expected2 = expected1

    result3 = df.groupby("id").apply(udf3).sort("id").toPandas()
    expected3 = expected1

    assert_frame_equal(expected1, result1)
    assert_frame_equal(expected2, result2)
    assert_frame_equal(expected3, result3)
def sql_types_example(spark):
    from datetime import datetime

    def show_type(instance, internal_in=1, external_in=1):
        """Exercise the round-trip and metadata methods shared by every DataType."""
        python_obj = instance.fromInternal(internal_in)
        print(python_obj, type(python_obj))
        sql_obj = instance.toInternal(external_in)
        print(sql_obj, type(sql_obj))
        print(instance.json())
        print(instance.jsonValue())
        print(instance.needConversion())
        print(instance.simpleString())
        print(type(instance).typeName())

    # Base, null and abstract numeric types
    show_type(DataType())
    show_type(NullType())
    show_type(AtomicType())
    show_type(NumericType())
    show_type(IntegralType())
    show_type(FractionalType())

    # Atomic types
    show_type(StringType())
    show_type(BinaryType())
    show_type(BooleanType())

    today = datetime.today()
    show_type(DateType(), internal_in=1000, external_in=today)
    show_type(TimestampType(), internal_in=365000000, external_in=today)

    show_type(DecimalType())
    show_type(DoubleType())
    show_type(FloatType())
    show_type(ByteType())
    show_type(IntegerType())
    show_type(LongType())
    show_type(ShortType())

    # ArrayType
    show_type(ArrayType(DataType()))
    print(ArrayType.fromJson({"containsNull": True, "elementType": "string"}))

    # MapType
    show_type(MapType(DataType(), DataType()))
    print(MapType.fromJson({"valueContainsNull": True, "keyType": "string",
                            "valueType": "integer"}))

    # StructField
    sf = StructField("first_struct", DataType())
    python_obj = sf.fromInternal(1)
    print(python_obj, type(python_obj))
    sql_obj = sf.toInternal(1)
    print(sql_obj, type(sql_obj))
    print(sf.json())
    print(sf.jsonValue())
    print(sf.needConversion())
    print(sf.simpleString())
    print(StructField.fromJson({"metadata": None, "nullable": True,
                                "name": "first_struct", "type": "string"}))

    # StructType
    st = StructType([StructField("first_struct", StringType()),
                     StructField("second_struct", DataType())])
    print("------")
    print(st.names)
    print(st.fields)
    print(st._needConversion)
    print(st._needSerializeAnyField)
    python_obj = st.fromInternal(["first_struct", "second_struct"])
    print(python_obj, type(python_obj))
    sql_obj = st.toInternal(["first_struct", "second_struct"])
    print(sql_obj, type(sql_obj))
    print(st.json())
    print(st.jsonValue())
    print(st.needConversion())
    print(st.simpleString())
    print(st.fieldNames())
    fields = {
        "fields": [
            {"metadata": None, "nullable": True, "name": "first", "type": "string"},
            {"metadata": None, "nullable": True, "name": "second", "type": "integer"}
        ]
    }
    print(st.fromJson(fields))
    st.add(StructField("first_struct", StringType()))
    print("st.add success!")
    print("Finish running types module API")
def test_merge_type(self):
    self.assertEqual(_merge_type(LongType(), NullType()), LongType())
    self.assertEqual(_merge_type(NullType(), LongType()), LongType())

    self.assertEqual(_merge_type(LongType(), LongType()), LongType())

    self.assertEqual(
        _merge_type(ArrayType(LongType()), ArrayType(LongType())),
        ArrayType(LongType()))
    with self.assertRaisesRegexp(TypeError, 'element in array'):
        _merge_type(ArrayType(LongType()), ArrayType(DoubleType()))

    self.assertEqual(
        _merge_type(MapType(StringType(), LongType()), MapType(StringType(), LongType())),
        MapType(StringType(), LongType()))
    with self.assertRaisesRegexp(TypeError, 'key of map'):
        _merge_type(
            MapType(StringType(), LongType()),
            MapType(DoubleType(), LongType()))
    with self.assertRaisesRegexp(TypeError, 'value of map'):
        _merge_type(
            MapType(StringType(), LongType()),
            MapType(StringType(), DoubleType()))

    self.assertEqual(
        _merge_type(
            StructType([StructField("f1", LongType()), StructField("f2", StringType())]),
            StructType([StructField("f1", LongType()), StructField("f2", StringType())])),
        StructType([StructField("f1", LongType()), StructField("f2", StringType())]))
    with self.assertRaisesRegexp(TypeError, 'field f1'):
        _merge_type(
            StructType([StructField("f1", LongType()), StructField("f2", StringType())]),
            StructType([StructField("f1", DoubleType()), StructField("f2", StringType())]))

    self.assertEqual(
        _merge_type(
            StructType([StructField("f1", StructType([StructField("f2", LongType())]))]),
            StructType([StructField("f1", StructType([StructField("f2", LongType())]))])),
        StructType([StructField("f1", StructType([StructField("f2", LongType())]))]))
    with self.assertRaisesRegexp(TypeError, 'field f2 in field f1'):
        _merge_type(
            StructType([StructField("f1", StructType([StructField("f2", LongType())]))]),
            StructType([StructField("f1", StructType([StructField("f2", StringType())]))]))

    self.assertEqual(
        _merge_type(
            StructType([
                StructField("f1", ArrayType(LongType())),
                StructField("f2", StringType())]),
            StructType([
                StructField("f1", ArrayType(LongType())),
                StructField("f2", StringType())])),
        StructType([
            StructField("f1", ArrayType(LongType())),
            StructField("f2", StringType())]))
    with self.assertRaisesRegexp(TypeError, 'element in array field f1'):
        _merge_type(
            StructType([
                StructField("f1", ArrayType(LongType())),
                StructField("f2", StringType())]),
            StructType([
                StructField("f1", ArrayType(DoubleType())),
                StructField("f2", StringType())]))

    self.assertEqual(
        _merge_type(
            StructType([
                StructField("f1", MapType(StringType(), LongType())),
                StructField("f2", StringType())]),
            StructType([
                StructField("f1", MapType(StringType(), LongType())),
                StructField("f2", StringType())])),
        StructType([
            StructField("f1", MapType(StringType(), LongType())),
            StructField("f2", StringType())]))
    with self.assertRaisesRegexp(TypeError, 'value of map field f1'):
        _merge_type(
            StructType([
                StructField("f1", MapType(StringType(), LongType())),
                StructField("f2", StringType())]),
            StructType([
                StructField("f1", MapType(StringType(), DoubleType())),
                StructField("f2", StringType())]))

    self.assertEqual(
        _merge_type(
            StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))]),
            StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))])),
        StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))]))
    with self.assertRaisesRegexp(TypeError, 'key of map element in array field f1'):
        _merge_type(
            StructType([StructField("f1", ArrayType(MapType(StringType(), LongType())))]),
            StructType([StructField("f1", ArrayType(MapType(DoubleType(), LongType())))]))
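# A small illustration (assuming an active SparkSession named `spark`) of why
# merging with NullType matters: when a schema is inferred from Python rows,
# the None in the first row infers NullType, which _merge_type then reconciles
# with the LongType inferred from the second row.
df = spark.createDataFrame([(None,), (1,)], ["x"])
# df.schema["x"].dataType -> LongType(), and the column stays nullable.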
spark = SparkSession.builder.master("local[2]").appName( 'link-prediction').getOrCreate() spark.sparkContext.setLogLevel("ERROR") # Create spark context sc = spark.sparkContext # Create a dataframe from training_set and testing_set trainingRDD = sc.textFile("training_set.txt").map( lambda x: x.strip().split(' ')) trainingDF = trainingRDD.toDF(['from_node_id', 'to_node_id', 'label']).sample(False, 0.3, 10) predRDD = sc.textFile("testing_set.txt").map( lambda x: x.strip().split(' ')) predDF = predRDD.toDF(['from_node_id', 'to_node_id']) predictDF = predDF.withColumn('label', lit(None).cast(NullType())) combinedDF = trainingDF.union(predictDF) print("Input DataFrame contains %d elements" % trainingDF.count()) print("To predict DataFrame contains %d elements" % predictDF.count()) print("The combined DataFrame contains %d elements" % combinedDF.count()) # Create a dataframe for paper information (title, authors, abstract, etc) infoRDD = sc.textFile("node_information.csv") infoRDD = infoRDD.mapPartitions(lambda x: csv.reader(x)) infoDF = infoRDD.toDF( ['node_id', 'year', 'title', 'authors', 'journal', 'abstract']) infoDF.printSchema() infoDF.show(5) raw_input("Press enter ... ")
    '_TIMESTAMP_TYPE', '_DATETIME_TYPES',
    '_DECIMAL_10_0_TYPE', '_DECIMAL_38_18_TYPE', '_DECIMAL_TYPE_PREFIX',
    '_ARRAY_TYPE_PREFIX', '_MAP_TYPE_PREFIX', '_STRUCT_TYPE_PREFIX',
    '_VECTOR_TYPE',
)

__null_type: NullType = NullType()
_NULL_TYPE: str = __null_type.simpleString()
assert _NULL_TYPE == __null_type.typeName()

__bool_type: BooleanType = BooleanType()
_BOOL_TYPE: str = __bool_type.simpleString()
assert _BOOL_TYPE == __bool_type.typeName()

__str_type: StringType = StringType()
_STR_TYPE: str = __str_type.simpleString()
assert _STR_TYPE == __str_type.typeName()

__binary_type: BinaryType = BinaryType()
    #                 |                    |
    # Complex Types
    #                 |                    |
    ArrayType,        # array<...>         | tuple, list, array |
    MapType,          # map<...>           | dict               |
    StructField,      # ... : ...          |                    |
    StructType,       # struct<...>        | tuple, list, dict  |
    _atomic_types, _all_atomic_types, _all_complex_types,
    _type_mappings,
    _array_signed_int_typecode_ctype_mappings,
    _array_unsigned_int_typecode_ctype_mappings,
    _array_type_mappings,
    _acceptable_types)

__null_type = NullType()
_NULL_TYPE = __null_type.simpleString()
assert _NULL_TYPE == __null_type.typeName()

__bool_type = BooleanType()
_BOOL_TYPE = __bool_type.simpleString()
assert _BOOL_TYPE == __bool_type.typeName()

__str_type = StringType()
_STR_TYPE = __str_type.simpleString()
assert _STR_TYPE == __str_type.typeName()

__binary_type = BinaryType()
_BINARY_TYPE = __binary_type.simpleString()
assert _BINARY_TYPE == __binary_type.typeName()