StructField("timestamp", TimestampType(), False), StructField("date", DateType(), False), StructField("array", ArrayType(IntegerType(), False), False), StructField("col_map", MapType(StringType(), StringType(), False), False), StructField( "struct", StructType( [ StructField("first", IntegerType(), False), StructField("second", FloatType(), False), StructField("third", StringType(), False), ] ), False, ), ] ) table = hc.applySchema(source, schema) table.registerAsTable("temp_table") rows = hc.sql( "select byte, short, int, long, float, double, decimal, string, boolean, timestamp, date, array[0], array[1], array[2], col_map['key'], struct.first, struct.second, struct.third from temp_table" ).collect() sc.stop() for row in rows: print row
# Build a 3-column string table from whitespace-delimited text rows,
# register it as a temp table, and print the RDD lineage of a grouped count.
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

raw_lines = sc.parallelize([
    "row1_col1 row1_col2 row1_col3",
    "row2_col1 row2_col2 row3_col3",
    "row3_col1 row3_col2 row3_col3",
])

# Keep only lines that split into exactly three fields.
triples = (
    raw_lines
    .map(lambda line: line.split(" "))
    .filter(lambda parts: parts and len(parts) == 3)
    .map(lambda parts: (parts[0], parts[1], parts[2]))
)

# All three columns are non-nullable strings.
schema = StructType([
    StructField("col1", StringType(), False),
    StructField("col2", StringType(), False),
    StructField("col3", StringType(), False),
])

table = hc.applySchema(triples, schema)
table.registerAsTable("temp_mytable")

rdd = hc.sql("select count(*) from temp_mytable where col1 = '' group by col2")
# Print the debug/lineage string rather than collecting results.
print(rdd.toDebugString())
sc.stop()
# Build a small (id, name, age) table, register it as a temp table,
# and print every person whose age is strictly between 28 and 30.
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

# Raw rows: "<id> <name> <age>", single-space separated.
datas = ["1 a 28", "2 b 29", "3 c 30"]
source = sc.parallelize(datas)
splits = source.map(lambda line: line.split(" "))
rows = splits.map(lambda words: (int(words[0]), words[1], int(words[2])))

# BUG FIX: the original wrote a period instead of a comma between the
# StructField arguments (e.g. `StructField("id", IntegerType(). True)`),
# which is an attribute access on the type instance and fails at runtime.
fields = [
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
]
schema = StructType(fields)

people = hc.applySchema(rows, schema)
people.registerTempTable("people")
results = hc.sql("select * from people where age>28 and age<30").collect()
sc.stop()

for result in results:
    print("id: %s, name: %s, age: %s" % (result.id, result.name, result.age))
def parse(line):
    """Match *line* against the module-level `pattern`.

    Returns the regex groups tuple on a match, otherwise None.
    """
    # NOTE(review): `pattern` is defined elsewhere in this file — assumed to
    # capture exactly three groups per line.
    m = pattern.match(line)
    return m.groups() if m else None


# Keep only lines that parsed into exactly three captured columns.
parsed = source.map(parse).filter(lambda cols: cols and len(cols) == 3)
triples = parsed.map(lambda cols: (cols[0], cols[1], cols[2]))

# Three non-nullable string columns.
schema = StructType([
    StructField("col1", StringType(), False),
    StructField("col2", StringType(), False),
    StructField("col3", StringType(), False),
])

table = hc.applySchema(triples, schema)
table.registerAsTable("temp_mytable")

datas = hc.sql("select * from temp_mytable").collect()
sc.stop()

if datas:
    for data in datas:
        print(data)
# Exercise complex Spark SQL types: a single row holding an array,
# a map, and a nested struct, then query every element of each.
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

one_row = ([1, 2, 3], {"key1": 1, "key2": 2}, (1, 2.0, "3.0"))
source = sc.parallelize([one_row])

# Nested struct: (int, float, string).
inner_struct = StructType([
    StructField("first", IntegerType(), False),
    StructField("second", FloatType(), False),
    StructField("third", StringType(), False),
])

schema = StructType([
    StructField("array", ArrayType(IntegerType(), False), False),
    StructField("col_map", MapType(StringType(), IntegerType(), False), False),
    StructField("struct", inner_struct, False),
])

table = hc.applySchema(source, schema)
table.registerAsTable("temp_table")

query = (
    "select array[0], array[1], array[2], "
    "col_map['key1'], col_map['key2'], "
    "struct.first, struct.second, struct.third from temp_table"
)
rows = hc.sql(query).collect()
sc.stop()

for row in rows:
    print(row)