# A simple Hive demo. If you do not have a table to load from, run
# MakeHiveTable.py first.
from pyspark import SparkContext
from pyspark.sql import HiveContext
import sys

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print "Error usage: LoadHive [sparkmaster] [inputtable]"
        sys.exit(-1)
    master = sys.argv[1]
    inputTable = sys.argv[2]
    sc = SparkContext(master, "LoadHive")
    hiveCtx = HiveContext(sc)
    # Query Hive for the keys and values in the input table.
    input = hiveCtx.hql("FROM " + inputTable + " SELECT key, value")
    # Square each key.
    data = input.map(lambda x: x['key'] * x['key'])
    result = data.collect()
    for element in result:
        print "Got data " + str(element)
    sc.stop()
    print "Done!"
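# A hedged usage sketch: this demo is normally launched with spark-submit.
# The master URL and table name below are hypothetical; the table should
# already exist (see MakeHiveTable.py further down).
#
#   spark-submit LoadHive.py local[2] my_input_table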
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#########################################################################
# File Name: loadHive.py
# Author: lpqiu
# mail: [email protected]
# Created Time: Sunday, November 30, 2014, 21:31:06
#########################################################################
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, SQLContext

conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf=conf)
hiveCtx = HiveContext(sc)
# Query the Hive table "src" and pull out the keys.
rows = hiveCtx.hql("SELECT key, value FROM src")
keys = rows.map(lambda row: row["key"])

# Parquet load example
sqlCtx = SQLContext(sc)
rows = sqlCtx.parquetFile("people.parquet")
names = rows.map(lambda row: row["name"])
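# A minimal sketch, assuming the Spark 1.x SchemaRDD API, of how the
# "people.parquet" file read above could be produced. It reuses the sc and
# sqlCtx defined in the script above; the Row fields are hypothetical.
from pyspark.sql import Row

peopleRDD = sc.parallelize([Row(name="holden", age=30),
                            Row(name="sparky", age=4)])
peopleSchemaRDD = sqlCtx.inferSchema(peopleRDD)
peopleSchemaRDD.saveAsParquetFile("people.parquet")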
# Creates a Hive table and loads an input file into it.
# For input you can use examples/src/main/resources/kv1.txt from the Spark
# distribution.
from pyspark import SparkContext
from pyspark.sql import HiveContext
import sys

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print "Error usage: LoadHive [sparkmaster] [inputFile] [inputtable]"
        sys.exit(-1)
    master = sys.argv[1]
    inputFile = sys.argv[2]
    inputTable = sys.argv[3]
    sc = SparkContext(master, "LoadHive")
    hiveCtx = HiveContext(sc)
    # Load some data into Hive.
    hiveCtx.hql(
        "CREATE TABLE IF NOT EXISTS " + inputTable + " (key INT, value STRING)")
    hiveCtx.hql(
        "LOAD DATA LOCAL INPATH '" + inputFile + "' INTO TABLE " + inputTable)
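# A hedged usage sketch (hypothetical master URL and table name): this
# creates and populates the table that the LoadHive demo at the top expects.
#
#   spark-submit MakeHiveTable.py local[2] examples/src/main/resources/kv1.txt my_input_table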
# A simple demo for working with Spark SQL and tweets.
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row, IntegerType
import sys

if __name__ == "__main__":
    inputFile = sys.argv[1]
    conf = SparkConf().setAppName("SparkSQLTwitter")
    sc = SparkContext(conf=conf)
    hiveCtx = HiveContext(sc)
    print "Loading tweets from " + inputFile
    input = hiveCtx.jsonFile(inputFile)
    input.registerTempTable("tweets")
    topTweets = hiveCtx.hql(
        "SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
    print topTweets.collect()
    topTweetText = topTweets.map(lambda row: row.text)
    print topTweetText.collect()
    # Make a happy person row.
    happyPeopleRDD = sc.parallelize(
        [Row(name="holden", favouriteBeverage="coffee")])
    happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
    happyPeopleSchemaRDD.registerTempTable("happy_people")
    # Make a UDF to tell us how long some text is.
    hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
    lengthSchemaRDD = hiveCtx.hql(
        "SELECT strLenPython(text) FROM tweets LIMIT 10")
    print lengthSchemaRDD.collect()
    sc.stop()
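# A minimal sketch of building an input file for the demo above:
# hiveCtx.jsonFile expects one JSON object per line, so we write a couple of
# fake tweets containing the fields the queries use (text, retweetCount).
# The file name and tweet contents are made up.
import json

fakeTweets = [
    {"text": "hello world", "retweetCount": 5},
    {"text": "spark sql demo", "retweetCount": 2},
]
with open("fake_tweets.json", "w") as f:
    for tweet in fakeTweets:
        f.write(json.dumps(tweet) + "\n")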