Example #1
from pyspark.sql.types import (ArrayType, ByteType, DoubleType, IntegerType,
                               StructField, StructType)


@classmethod
def sqlType(cls):
    # Storage schema for the UDT: a byte type tag, an optional size, and
    # optional index/value arrays (nullable fields are left null when unused).
    return StructType([
        StructField("type", ByteType(), False),
        StructField("size", IntegerType(), True),
        StructField("indices", ArrayType(IntegerType(), False), True),
        StructField("values", ArrayType(DoubleType(), False), True)
    ])
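This sqlType hook follows PySpark's UserDefinedType protocol (the same schema appears in pyspark.mllib.linalg.VectorUDT): Spark calls it to learn the storage StructType for the user-defined type. A minimal sketch of exercising that schema directly, assuming a local SparkContext and the Spark 1.x-era SQLContext; the app name and sample row are illustrative, not from the original:

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import (ArrayType, ByteType, DoubleType, IntegerType,
                               StructField, StructType)

sc = SparkContext("local", "sqltype_demo")  # hypothetical app name
sqlc = SQLContext(sc)

schema = StructType([
    StructField("type", ByteType(), False),
    StructField("size", IntegerType(), True),
    StructField("indices", ArrayType(IntegerType(), False), True),
    StructField("values", ArrayType(DoubleType(), False), True),
])

# One sparse-vector-style row: type tag 0, logical size 4, two stored entries.
df = sqlc.createDataFrame([(0, 4, [0, 3], [1.0, 2.5])], schema)
df.printSchema()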
Example #2
import datashape
from datashape import DataShape, Option, Record, Tuple, dshape
from datashape.predicates import isdimension
from pyspark.sql.types import ArrayType, StructField, StructType


def dshape_to_schema(ds):
    """Convert datashape to SparkSQL type system.

    Examples
    --------
    >>> print(dshape_to_schema('int32'))  # doctest: +SKIP
    IntegerType
    >>> print(dshape_to_schema('5 * int32'))  # doctest: +SKIP
    ArrayType(IntegerType,false)
    >>> print(dshape_to_schema('5 * ?int32'))  # doctest: +SKIP
    ArrayType(IntegerType,true)
    >>> print(dshape_to_schema('{name: string, amount: int32}'))  # doctest: +SKIP
    StructType(List(StructField(name,StringType,false),StructField(amount,IntegerType,false)))
    >>> print(dshape_to_schema('10 * {name: string, amount: ?int32}'))  # doctest: +SKIP
    ArrayType(StructType(List(StructField(name,StringType,false),StructField(amount,IntegerType,true))),false)
    """
    if isinstance(ds, str):
        return dshape_to_schema(dshape(ds))
    if isinstance(ds, Tuple):
        raise TypeError('Please provide a Record dshape for these column '
                        'types: %s' % (ds.dshapes, ))
    if isinstance(ds, Record):
        return StructType([
            StructField(name, dshape_to_schema(deoption(typ)),
                        isinstance(typ, datashape.Option))
            for name, typ in ds.fields
        ])
    if isinstance(ds, DataShape):
        if isdimension(ds[0]):
            elem = ds.subshape[0]
            if isinstance(elem, DataShape) and len(elem) == 1:
                elem = elem[0]
            return ArrayType(dshape_to_schema(deoption(elem)),
                             isinstance(elem, Option))
        else:
            return dshape_to_schema(ds[0])
    if ds in dshape_to_sparksql:
        return dshape_to_sparksql[ds]
    raise NotImplementedError()
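dshape_to_schema relies on two names defined elsewhere in its module: deoption, which unwraps a datashape Option (?int32 becomes int32), and the dshape_to_sparksql mapping from datashape scalar types to SparkSQL types. A plausible minimal sketch of both, assuming the datashape package's public scalar objects; the real definitions may differ:

import datashape
from datashape import Option
from pyspark.sql.types import (BooleanType, DoubleType, IntegerType,
                               LongType, StringType)


def deoption(ds):
    # Unwrap ?int32 -> int32; pass non-Option types through unchanged.
    return ds.ty if isinstance(ds, Option) else ds


# Sketch of the scalar mapping; extend as needed (dates, shorter ints, ...).
dshape_to_sparksql = {
    datashape.int32: IntegerType(),
    datashape.int64: LongType(),
    datashape.float64: DoubleType(),
    datashape.string: StringType(),
    datashape.bool_: BooleanType(),
}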
Example #3
# coding: utf-8

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import (ArrayType, FloatType, IntegerType, MapType,
                               StringType, StructField, StructType)

conf = SparkConf().setAppName("spark_sql_datatype_complex")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

# One row holding an array, a map, and a struct-shaped tuple.
source = sc.parallelize([([1, 2, 3], {"key1": 1, "key2": 2}, (1, 2.0, "3.0"))])

schema = StructType([
    StructField("array", ArrayType(IntegerType(), False), False),
    StructField("col_map", MapType(StringType(), IntegerType(), False), False),
    StructField(
        "struct",
        StructType([
            StructField("first", IntegerType(), False),
            StructField("second", FloatType(), False),
            StructField("third", StringType(), False)
        ]), False)
])

# applySchema/registerAsTable are the Spark 1.x HiveContext APIs (later
# superseded by createDataFrame/createOrReplaceTempView).
table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

# The query is truncated on the source page; this stand-in shows how the
# three complex columns can be addressed in Spark SQL.
rows = hc.sql(
    "select `array`[0], col_map['key1'], struct.first from temp_table"
).collect()
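For reference, on Spark 2.x+ the deprecated HiveContext/applySchema/registerAsTable calls map onto SparkSession equivalents (a sketch, not part of the original example):

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("spark_sql_datatype_complex").getOrCreate()
table = spark.createDataFrame(source, schema)
table.createOrReplaceTempView("temp_table")
rows = spark.sql("select `array`[0], col_map['key1'], struct.first "
                 "from temp_table").collect()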
Example #4
def func_int():
    return 123


hc.registerFunction("func_int", func_int, IntegerType())

rows = hc.sql("select func_int() from temp_table").collect()


def func_array():
    # list or tuple
    return [1, 2, 3]


hc.registerFunction("func_array", func_array, ArrayType(IntegerType()))

rows = hc.sql(
    "select val[0], val[1], val[2] from (select func_array() as val from temp_table) t"
).collect()


def func_struct():
    # tuple
    return (1, 2.0, "3")


hc.registerFunction(
    "func_struct", func_struct,
    StructType([
        StructField("first", IntegerType()),