def sqlType(cls):
    """Return the SparkSQL struct used to store values of this UDT.

    The schema carries a one-byte ``type`` tag plus three nullable
    fields (``size``, ``indices``, ``values``) — presumably a
    sparse/dense vector encoding where unused fields stay null;
    confirm against the serializer that writes these rows.
    """
    fields = [
        StructField("type", ByteType(), False),
        StructField("size", IntegerType(), True),
        StructField("indices", ArrayType(IntegerType(), False), True),
        StructField("values", ArrayType(DoubleType(), False), True),
    ]
    return StructType(fields)
def dshape_to_schema(ds):
    """Convert a datashape to the SparkSQL type system.

    Parameters
    ----------
    ds : str or datashape type
        A datashape (or a string parseable by ``dshape``) describing a
        scalar, record, or dimensioned collection.

    Returns
    -------
    A SparkSQL type (``StructType``, ``ArrayType``, or a scalar type
    looked up in ``dshape_to_sparksql``).

    Raises
    ------
    TypeError
        If ``ds`` is a ``Tuple`` — column types must come as a Record.
    NotImplementedError
        If no SparkSQL equivalent is known for ``ds``.

    Examples
    --------
    >>> print(dshape_to_schema('int32'))  # doctest: +SKIP
    IntegerType
    >>> print(dshape_to_schema('5 * int32'))  # doctest: +SKIP
    ArrayType(IntegerType,false)
    >>> print(dshape_to_schema('5 * ?int32'))  # doctest: +SKIP
    ArrayType(IntegerType,true)
    >>> print(dshape_to_schema('{name: string, amount: int32}'))  # doctest: +SKIP
    StructType(List(StructField(name,StringType,false),StructField(amount,IntegerType,false)))
    >>> print(dshape_to_schema('10 * {name: string, amount: ?int32}'))  # doctest: +SKIP
    ArrayType(StructType(List(StructField(name,StringType,false),StructField(amount,IntegerType,true))),false)
    """
    # Strings are parsed, then dispatched recursively on the result.
    if isinstance(ds, str):
        return dshape_to_schema(dshape(ds))
    if isinstance(ds, Tuple):
        raise TypeError('Please provide a Record dshape for these column '
                        'types: %s' % (ds.dshapes,))
    if isinstance(ds, Record):
        # An Option field becomes a nullable StructField; strip the
        # Option wrapper before converting the inner type.
        return StructType([
            StructField(name,
                        dshape_to_schema(deoption(typ)),
                        isinstance(typ, Option))
            for name, typ in ds.fields])
    if isinstance(ds, DataShape):
        if isdimension(ds[0]):
            # Leading dimension -> ArrayType of the element type;
            # Option elements make the array contents nullable.
            elem = ds.subshape[0]
            if isinstance(elem, DataShape) and len(elem) == 1:
                elem = elem[0]
            return ArrayType(dshape_to_schema(deoption(elem)),
                             isinstance(elem, Option))
        else:
            # Measure-only DataShape: unwrap and convert the measure.
            return dshape_to_schema(ds[0])
    if ds in dshape_to_sparksql:
        return dshape_to_sparksql[ds]
    raise NotImplementedError('No SparkSQL type known for datashape: %r' % (ds,))
# coding: utf-8 from pyspark import SparkConf, SparkContext from pyspark.sql import HiveContext from pyspark.sql import StructType, StructField, ArrayType, MapType conf = SparkConf().setAppName("spark_sql_datatype_complex") sc = SparkContext(conf=conf) hc = HiveContext(sc) source = sc.parallelize([([1, 2, 3], {"key1": 1, "key2": 2}, (1, 2.0, "3.0"))]) schema = StructType([ StructField("array", ArrayType(IntegerType(), False), False), StructField("col_map", MapType(StringType(), IntegerType(), False), False), StructField( "struct", StructType([ StructField("first", IntegerType(), False), StructField("second", FloatType(), False), StructField("third", StringType(), False) ]), False) ]) table = hc.applySchema(source, schema) table.registerAsTable("temp_table") rows = hc.sql(
# Register Python UDFs returning scalar, array and struct values, then
# query each through SQL against the previously registered temp table.

# Zero-argument UDF returning a constant int.
def func_int():
    return 123

hc.registerFunction("func_int", func_int, IntegerType())
rows = hc.sql("select func_int() from temp_table").collect()

def func_array():
    # list or tuple
    return [1, 2, 3]

hc.registerFunction("func_array", func_array, ArrayType(IntegerType()))
# Index into the array result via a subquery alias.
rows = hc.sql(
    "select val[0], val[1], val[2] from (select func_array() as val from temp_table) t"
).collect()

def func_struct():
    # tuple
    return (1, 2.0, "3")

# NOTE(review): this call is truncated in this chunk; the remaining
# StructField entries and closing parens continue beyond the visible source.
hc.registerFunction(
    "func_struct", func_struct, StructType([
        StructField("first", IntegerType()),