from pyspark import SparkConf, SparkContext from pyspark.sql import HiveContext, Row from pyspark.sql.types import StringType conf = SparkConf().setAppName("spark_sql_udf") sc = SparkContext(conf=conf) hc = HiveContext(sc) lines = sc.parallelize(["a", "b", "c"]) people = lines.map(lambda value: Row(name=value)) peopleSchema = hc.inferSchema(people) peopleSchema.registerTempTable("people") def myfunc(value): return value.upper() hc.registerFunction("myfunc", myfunc, StringType()) rows = hc.sql("select myfunc(name) from people").rdd.filter( lambda row: isinstance(row, tuple)).collect() sc.stop() for row in rows:
# Find the panda lovers
rows.registerTempTable("people")
pandaFriends = hiveCtx.sql("SELECT name FROM people WHERE favouriteAnimal = \"panda\"")
print "Panda friends"
print pandaFriends.map(lambda row: row.name).collect()

# Save the result as a Parquet file
pandaFriends.saveAsParquetFile("hdfs://...")

# Load JSON data with Spark SQL in Python
input = hiveCtx.jsonFile(inputFile)

# Create a SchemaRDD from a Row (named tuple) in Python
happyPeopleRDD = sc.parallelize([Row(name="holden", favouriteBeverage="coffee")])
happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
happyPeopleSchemaRDD.registerTempTable("happy_people")

# **************** Using the Beeline JDBC/ODBC server ****************

# User-defined functions (UDFs)
# Write a UDF that computes the length of a string
hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
lengthSchemaRDD = hiveCtx.sql("SELECT strLenPython('text') FROM tweets LIMIT 10")

# --------------------------------------------------------------------
# Spark Streaming uses a discretized stream, called a DStream, as its abstraction.
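# The DStream note above has no accompanying code. Below is a minimal sketch of a
# PySpark Streaming word count, not part of the original examples: the app name,
# host, port, and 10-second batch interval are illustrative assumptions.
from pyspark import SparkConf, SparkContext
from pyspark.streaming import StreamingContext

conf = SparkConf().setAppName("dstream_sketch")
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc, 10)  # one RDD per 10-second batch

lines = ssc.socketTextStream("localhost", 9999)  # DStream of text lines
counts = lines.flatMap(lambda line: line.split(" ")) \
              .map(lambda word: (word, 1)) \
              .reduceByKey(lambda a, b: a + b)
counts.pprint()  # print a sample of each batch's counts

ssc.start()
ssc.awaitTermination()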
from pyspark import SparkConf, SparkContext from pyspark.sql import HiveContext, Row conf = SparkConf().setAppName("spark_sql_delimiter_infer_schema") sc = SparkContext(conf=conf) hc = HiveContext(sc) source = sc.parallelize(["row1_col1 row1_col2 row1_col3", "row2_col1 row2_col2 row3_col3", "row3_col1 row3_col2 row3_col3"]) columns = source.map(lambda line: line.split(" ")).filter( lambda columns: columns and len(columns) == 3) rows = columns.map( lambda columns: Row(col1=columns[0], col2=columns[1], col3=columns[2])) table = hc.inferSchema(rows) table.registerAsTable("temp_mytable") datas = hc.sql("select * from temp_mytable").collect() sc.stop() if datas: for data in datas: print data
from pyspark import SparkConf, SparkContext from pyspark.sql import HiveContext from pyspark.sql.types import Row conf = SparkConf().setAppName("spark_sql_cache_table_extend") sc = SparkContext(conf=conf) hc = HiveContext(sc) dataRDD = sc.textFile( "/user/hdfs/rawlog/app_weibomobile03x4ts1kl_mwb_interface/" ).map(lambda line: line.split(",")).filter(lambda words: len(words) >= 3).map( lambda words: Row(col1=words[0], col2=words[1], col3=words[2])) sourceRDD = hc.inferSchema(dataRDD) sourceRDD.registerAsTable("source") hc.cacheTable("source") hc.sql("select count(*) from source").collect() hc.sql("select col2, max(col3) from source group by col2").collect() hc.sql("select col3, min(col2) from source group by col3").collect() # hc.uncacheTable("source") sc.stop()
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import IntegerType
import json
import sys

if __name__ == "__main__":
    inputFile = sys.argv[1]
    conf = SparkConf().setAppName("TwitterAnalytics")
    sc = SparkContext(conf=conf)
    hiveCtx = HiveContext(sc)

    print "Loading tweets from " + inputFile
    input = hiveCtx.jsonFile(inputFile)
    input.registerTempTable("tweets")

    topTweets = hiveCtx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
    print topTweets.collect()
    topTweetText = topTweets.map(lambda row: row.text)
    print topTweetText.collect()

    # Make a happy person row
    happyPeopleRDD = sc.parallelize([Row(name="ganguly", favouriteBeverage="coffee")])
    happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
    happyPeopleSchemaRDD.registerTempTable("strong_people")

    # Make a UDF to tell us how long some text is
    hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
    lengthSchemaRDD = hiveCtx.sql("SELECT strLenPython('text') FROM tweets LIMIT 10")
    print lengthSchemaRDD.collect()

    sc.stop()
# Regex-based variant of the delimiter example above. The imports, context, app
# name, and sample input below are assumed to mirror that example; the original
# fragment began at the pattern definition.
import re

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row

conf = SparkConf().setAppName("spark_sql_regex_infer_schema")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

source = sc.parallelize(["row1_col1 row1_col2 row1_col3",
                         "row2_col1 row2_col2 row2_col3",
                         "row3_col1 row3_col2 row3_col3"])

pattern = re.compile("(.*) (.*) (.*)")


def parse(line):
    matcher = pattern.match(line)
    if matcher:
        return matcher.groups()
    else:
        return None


columns = source.map(parse).filter(
    lambda columns: columns and len(columns) == 3)
rows = columns.map(
    lambda columns: Row(col1=columns[0], col2=columns[1], col3=columns[2]))

table = hc.inferSchema(rows)
table.registerAsTable("temp_mytable")

datas = hc.sql("select * from temp_mytable").collect()

sc.stop()

if datas:
    for data in datas:
        print data
# Imports and application name are assumptions; the original fragment began at the
# SparkContext creation.
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row

conf = SparkConf().setAppName("AvgSalesPerDay")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)
hc.sql("set spark.sql.shuffle.partitions = 10")

# Orders: (orderID, orderDate) rows, keeping only the date part of the timestamp
orderMap = sc.textFile("hdfs:///user/hive/warehouse/retaildb.db/orders")\
    .map(lambda record: record.split(","))\
    .map(lambda record: Row(orderID=int(record[0]), orderDate=record[1][:11]))

# Order items: total sales per orderID
itemMap = sc.textFile("hdfs:///user/hive/warehouse/retaildb.db/order_items")\
    .map(lambda record: record.split(","))\
    .map(lambda row: (int(row[1]), float(row[4])))\
    .reduceByKey(lambda x, y: x + y)\
    .map(lambda record: Row(orderID=int(record[0]), Total=record[1]))

oSchema = hc.inferSchema(orderMap)
iSchema = hc.inferSchema(itemMap)
oSchema.registerTempTable("orders")
iSchema.registerTempTable("items")

avgSalesPerDay = hc.sql("SELECT o.orderDate, avg(i.Total) as avgSales \
                         from orders o join items i \
                         where o.orderID = i.orderID \
                         group by o.orderDate \
                         order by avgSales DESC")

avgSalesPerDay.map(lambda row: ",".join([row.orderDate, str(row.avgSales)]))\
    .coalesce(1)\
    .saveAsTextFile("AvgSalesPerDay2")
from pyspark import SparkConf, SparkContext from pyspark.sql import HiveContext, Row from pyspark.sql.types import StringType conf = SparkConf().setAppName("spark_sql_udf") sc = SparkContext(conf=conf) hc = HiveContext(sc) lines = sc.parallelize(["a", "b", "c"]) people = lines.map(lambda value: Row(name=value)) peopleSchema = hc.inferSchema(people) peopleSchema.registerTempTable("people") def myfunc(value): return value.upper() hc.registerFunction("myfunc", myfunc, StringType()) rows = hc.sql("select myfunc(name) from people").rdd.filter( lambda row: isinstance(row, tuple)).collect() sc.stop() for row in rows: print row, type(row[0])
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, Row

conf = SparkConf().setAppName('spar_sql_test')
sc = SparkContext(conf=conf)
# sqlContext = SQLContext(sc)
hc = HiveContext(sc)

# Parallelize a list and convert each line to a Row, e.g. Row(id=1, name="a", age=28)
# datas -> Spark RDD source, type = str
datas = ['1 a 28', '2 b 39', '3 c 30']
source = sc.parallelize(datas)
splits = source.map(lambda line: line.split(" "))
rows = splits.map(lambda words: Row(id=int(words[0]), name=words[1], age=int(words[2])))

# Infer the schema and register the SchemaRDD as a table
people = hc.inferSchema(rows)
people.printSchema()

# SQL can be run over a SchemaRDD that has been registered as a table
people.registerTempTable("people")
results = hc.sql('select * from people where age > 28 and age < 30')
results.printSchema()

# The result of a SQL query is a SchemaRDD, so register it as a table too
results.registerTempTable("people2")
results2 = hc.sql('select name from people2')
results2.printSchema()

# A SchemaRDD supports all the normal RDD operations
results3 = results2.map(lambda row: row.name.upper()).collect()
for result in results3:
    print result


from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

conf = SparkConf().setAppName("bleh")
sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)

sql = """
select distinct date_category
from ck_membership.date_driver
LIMIT 10
"""
r = sqlContext.sql(sql)
for i in r.collect():
    print(i)

# Save the result as Parquet and expose it as an external Hive table
r.saveAsParquetFile("hdfs://nameservice1/data/unmanaged/datascience_ck/vish/mydata")
sqlContext.sql("CREATE EXTERNAL TABLE vsubr2.some_date_driver1 (date_category String) STORED AS PARQUET LOCATION 'hdfs://nameservice1/data/unmanaged/datascience_ck/vish/mydata'")

# Scratch notes from the original (incomplete alternatives):
# r2 = sqlContext.inferSchema(r)
# r.createExternalTable('vsubr2.some_Date_driver', path='hdfs://nameservice1/data/unmanaged/datascience_ck/vish/')
# sqlContext.sql("create table if not exists vsubr2.some_date_driver1")
# create table if not exists vsubr2.some_date_driver1 LOCATION 'hdfs://nameservice1/data/unmanaged/datascience_ck/vish/' as
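# The last scratch note above breaks off after "as". A hedged sketch of the CTAS it
# appears to be heading toward: the table name and location come from the note, but
# the SELECT is an assumption reusing the distinct-date query earlier in this snippet.
sqlContext.sql("""
    create table if not exists vsubr2.some_date_driver1
    LOCATION 'hdfs://nameservice1/data/unmanaged/datascience_ck/vish/'
    as select distinct date_category from ck_membership.date_driver
""")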