# Example 1
# A simple Hive demo. If you do not have a table to load from, run
# MakeHiveTable.py first to create one.
# Usage: LoadHive [sparkmaster] [inputtable]
from pyspark import SparkContext
from pyspark.sql import HiveContext
import json
import sys

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print "Error usage: LoadHive [sparkmaster] [inputtable]"
        sys.exit(-1)
    master = sys.argv[1]
    inputTable = sys.argv[2]
    sc = SparkContext(master, "LoadHive")
    hiveCtx = HiveContext(sc)
    # Query hive. FIX: renamed `input` -> `rows` so we no longer shadow
    # the `input` builtin.
    rows = hiveCtx.hql("FROM " + inputTable + " SELECT key, value")
    # Square each key; assumes `key` is numeric (the demo table uses INT keys).
    data = rows.map(lambda x: x['key'] * x['key'])
    result = data.collect()
    for element in result:
        print "Got data " + str(element)
    sc.stop()
    print "Done!"
# Example 2
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#########################################################################
# File Name: loadHive.py
# Author: lpqiu
# mail: [email protected]
# Created Time: Sunday 2014-11-30 21:31:06
#########################################################################


# FIX: the original imported the non-existent name `StoreLevel`, which
# raises ImportError before anything runs; the pyspark class is
# `StorageLevel`.
from pyspark import SparkConf, SparkContext, StorageLevel
from pyspark.sql import HiveContext, SQLContext


conf = SparkConf().setMaster("local").setAppName("My App")
# FIX: SparkContext's first positional parameter is `master` (a string),
# so the SparkConf must be passed by keyword.
sc = SparkContext(conf=conf)

# Query a Hive table and project out the key column of each row.
hiveCtx = HiveContext(sc)
rows = hiveCtx.hql("SELECT key,value FROM src")
keys = rows.map(lambda row: row["key"])

# Parquet load example.
# FIX: the SQLContext method is `parquetFile`, not `ParquetFile`.
sqlCtx = SQLContext(sc)
rows = sqlCtx.parquetFile("people.parquet")
names = rows.map(lambda row: row["name"])
# Example 3
# Creates a Hive table and loads an input file into it.
# For input you can use examples/src/main/resources/kv1.txt from the spark distribution
from pyspark import SparkContext
from pyspark.sql import HiveContext
import json
import sys

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print "Error usage: LoadHive [sparkmaster] [inputFile] [inputtable]"
        sys.exit(-1)
    sparkMaster = sys.argv[1]
    sourceFile = sys.argv[2]
    tableName = sys.argv[3]
    sc = SparkContext(sparkMaster, "LoadHive")
    hiveCtx = HiveContext(sc)
    # Ensure the target table exists, then bulk-load the local file into it.
    createStmt = ("CREATE TABLE IF NOT EXISTS " + tableName +
                  " (key INT, value STRING)")
    hiveCtx.hql(createStmt)
    loadStmt = ("LOAD DATA LOCAL INPATH '" + sourceFile + "' INTO TABLE " +
                tableName)
    hiveCtx.hql(loadStmt)
# A simple demo for working with SparkSQL and Tweets
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row, IntegerType
import json
import sys

if __name__ == "__main__":
    inputFile = sys.argv[1]
    conf = SparkConf().setAppName("SparkSQLTwitter")
    sc = SparkContext()
    hiveCtx = HiveContext(sc)
    print "Loading tweets from " + inputFile
    input = hiveCtx.jsonFile(inputFile)
    input.registerTempTable("tweets")
    topTweets = hiveCtx.hql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
    print topTweets.collect()
    topTweetText = topTweets.map(lambda row : row.text)
    print topTweetText.collect()
    # Make a happy person row
    happyPeopleRDD = sc.parallelize([Row(name="holden", favouriteBeverage="coffee")])
    happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
    happyPeopleSchemaRDD.registerTempTable("happy_people")
    # Make a UDF to tell us how long some text is
    hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
    lengthSchemaRDD = hiveCtx.hql("SELECT strLenPython('text') FROM tweets LIMIT 10")
    print lengthSchemaRDD.collect()
    sc.stop()
# Example 5
# Creates a Hive table and loads an input file into it.
# For input you can use examples/src/main/resources/kv1.txt from the spark
# distribution
from pyspark import SparkContext
from pyspark.sql import HiveContext
import json
import sys


def _create_and_load(hive_ctx, table, path):
    # Ensure the table exists, then load the local file into it.
    hive_ctx.hql(
        "CREATE TABLE IF NOT EXISTS " +
        table +
        " (key INT, value STRING)")
    hive_ctx.hql(
        "LOAD DATA LOCAL INPATH '" + path + "' INTO TABLE " + table)


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print "Error usage: LoadHive [sparkmaster] [inputFile] [inputtable]"
        sys.exit(-1)
    sc = SparkContext(sys.argv[1], "LoadHive")
    _create_and_load(HiveContext(sc), sys.argv[3], sys.argv[2])