StructField("timestamp", TimestampType(), False), StructField("date", DateType(), False), StructField("array", ArrayType(IntegerType(), False), False), StructField("col_map", MapType(StringType(), StringType(), False), False), StructField( "struct", StructType( [ StructField("first", IntegerType(), False), StructField("second", FloatType(), False), StructField("third", StringType(), False), ] ), False, ), ] ) table = hc.applySchema(source, schema) table.registerAsTable("temp_table") rows = hc.sql( "select byte, short, int, long, float, double, decimal, string, boolean, timestamp, date, array[0], array[1], array[2], col_map['key'], struct.first, struct.second, struct.third from temp_table" ).collect() sc.stop() for row in rows: print row
# Build a 3-column string table from whitespace-delimited text rows,
# register it as a temp table, and print the RDD lineage of a grouped count.
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

raw_lines = sc.parallelize([
    "row1_col1 row1_col2 row1_col3",
    "row2_col1 row2_col2 row3_col3",
    "row3_col1 row3_col2 row3_col3",
])

# Keep only lines that split into exactly three fields.
triples = (
    raw_lines
    .map(lambda line: line.split(" "))
    .filter(lambda parts: parts and len(parts) == 3)
    .map(lambda parts: (parts[0], parts[1], parts[2]))
)

# All three columns are non-nullable strings.
schema = StructType([
    StructField("col1", StringType(), False),
    StructField("col2", StringType(), False),
    StructField("col3", StringType(), False),
])

table = hc.applySchema(triples, schema)
table.registerAsTable("temp_mytable")

rdd = hc.sql("select count(*) from temp_mytable where col1 = '' group by col2")
# Print the debug/lineage string rather than collecting results.
print(rdd.toDebugString())
sc.stop()
# Build a small (id, name, age) table, register it as a temp table,
# and print every person whose age is strictly between 28 and 30.
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

# Raw rows: "<id> <name> <age>", single-space separated.
datas = ["1 a 28", "2 b 29", "3 c 30"]
source = sc.parallelize(datas)
splits = source.map(lambda line: line.split(" "))
rows = splits.map(lambda words: (int(words[0]), words[1], int(words[2])))

# BUG FIX: the original wrote a period instead of a comma between the
# StructField arguments (e.g. `StructField("id", IntegerType(). True)`),
# which is an attribute access on the type instance and fails at runtime.
fields = [
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
]
schema = StructType(fields)

people = hc.applySchema(rows, schema)
people.registerTempTable("people")
results = hc.sql("select * from people where age>28 and age<30").collect()
sc.stop()

for result in results:
    print("id: %s, name: %s, age: %s" % (result.id, result.name, result.age))
def parse(line):
    """Match *line* against the module-level `pattern`.

    Returns the regex groups tuple on a match, otherwise None.
    """
    # NOTE(review): `pattern` is defined elsewhere in this file — assumed to
    # capture exactly three groups per line.
    m = pattern.match(line)
    return m.groups() if m else None


# Keep only lines that parsed into exactly three captured columns.
parsed = source.map(parse).filter(lambda cols: cols and len(cols) == 3)
triples = parsed.map(lambda cols: (cols[0], cols[1], cols[2]))

# Three non-nullable string columns.
schema = StructType([
    StructField("col1", StringType(), False),
    StructField("col2", StringType(), False),
    StructField("col3", StringType(), False),
])

table = hc.applySchema(triples, schema)
table.registerAsTable("temp_mytable")

datas = hc.sql("select * from temp_mytable").collect()
sc.stop()

if datas:
    for data in datas:
        print(data)
# Exercise complex Spark SQL types: a single row holding an array,
# a map, and a nested struct, then query every element of each.
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

one_row = ([1, 2, 3], {"key1": 1, "key2": 2}, (1, 2.0, "3.0"))
source = sc.parallelize([one_row])

# Nested struct: (int, float, string).
inner_struct = StructType([
    StructField("first", IntegerType(), False),
    StructField("second", FloatType(), False),
    StructField("third", StringType(), False),
])

schema = StructType([
    StructField("array", ArrayType(IntegerType(), False), False),
    StructField("col_map", MapType(StringType(), IntegerType(), False), False),
    StructField("struct", inner_struct, False),
])

table = hc.applySchema(source, schema)
table.registerAsTable("temp_table")

query = (
    "select array[0], array[1], array[2], "
    "col_map['key1'], col_map['key2'], "
    "struct.first, struct.second, struct.third from temp_table"
)
rows = hc.sql(query).collect()
sc.stop()

for row in rows:
    print(row)