from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import IntegerType
import json
import sys

if __name__ == "__main__":
    inputFile = sys.argv[1]
    conf = SparkConf().setAppName("TwitterAnalytics")
    sc = SparkContext(conf=conf)
    hiveCtx = HiveContext(sc)
    print "Loading tweets from " + inputFile
    input = hiveCtx.jsonFile(inputFile)
    input.registerTempTable("tweets")
    topTweets = hiveCtx.sql("SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
    print topTweets.collect()
    topTweetText = topTweets.map(lambda row : row.text)
    print topTweetText.collect()
    # Make a happy person row
    happyPeopleRDD = sc.parallelize([Row(name="ganguly", favouriteBeverage="coffee")])
    happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
    happyPeopleSchemaRDD.registerTempTable("strong_people")
    # Make a UDF to tell us how long some text is
    hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
    lengthSchemaRDD = hiveCtx.sql("SELECT strLenPython(text) FROM tweets LIMIT 10")
    print lengthSchemaRDD.collect()
    sc.stop()
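# For reference only: a minimal sketch of the same UDF flow on the Spark 2.x+
# SparkSession API (an assumption for newer clusters, not part of this example):
#
#     from pyspark.sql import SparkSession
#     from pyspark.sql.types import IntegerType
#
#     spark = SparkSession.builder.enableHiveSupport().getOrCreate()
#     tweets = spark.read.json(inputFile)
#     tweets.createOrReplaceTempView("tweets")
#     spark.udf.register("strLenPython", lambda s: len(s), IntegerType())
#     spark.sql("SELECT strLenPython(text) FROM tweets LIMIT 10").show()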
Example #2
"""
def convert(mydict):
    return Row(**mydict)

convertRDD = hc.sql(
    "select col1, col2, col3 from temp_source").map(convert)

mytable = hc.inferSchema(convertRDD)

mytable.registerTempTable("temp_mytable")
"""


def convert(val):
    return val.upper()

hc.registerFunction("temp_convert", convert)

convertRDD = hc.sql(
    "select temp_convert(col1) as col1, col2, col3 from temp_source")

convertRDD.registerAsTable("temp_mytable")


hc.cacheTable("temp_mytable")


def printRows(rows):
    for row in rows:
        print row

datas = hc.sql("select * from temp_mytable").collect()
Example #3
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import StringType

conf = SparkConf().setAppName("spark_sql_udf")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

lines = sc.parallelize(["a", "b", "c"])

people = lines.map(lambda value: Row(name=value))

peopleSchema = hc.inferSchema(people)

peopleSchema.registerTempTable("people")


def myfunc(value):
    return value.upper()

hc.registerFunction("myfunc", myfunc, StringType())

rows = hc.sql("select myfunc(name) from people").rdd.filter(
    lambda row: isinstance(row, tuple)).collect()
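# Note: pyspark's Row is a subclass of tuple, so the isinstance(row, tuple)
# filter above keeps every row; it only guards against non-Row results.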

sc.stop()

for row in rows:
    print row, type(row[0])
    table = hc.applySchema(rows, schema)

    table.registerTempTable("temp_table")

    def parseCDN(video_cdn):
        if not video_cdn:
            return ""

        words = video_cdn.split("s=")

        if len(words) >= 2:
            return words[1].split(",")[0]

        return ""

    hc.registerFunction("parseCDN", parseCDN)

    def cal_buffer_num(buffer_values):
        buffer_count = 0
        buffer_t_sum = 0
        buffer_smaller_500ms_count = 0
        buffer_bigger_2min_count = 0

        if buffer_values is None:
            pass
        else:
            for s in buffer_values:
                if 500 <= s <= 120000:
                    buffer_count = buffer_count + 1
sc = SparkContext(conf=conf)

hc = HiveContext(sc)


def split_idc(idc):
    if idc is None or idc == '' or (not isinstance(idc, basestring)):
        return ''
    else:
        words = idc.split('.')
        if len(words) >= 2:
            return words[0] + '.' + words[1]
        else:
            return ''

hc.registerFunction("temp_split_idc", split_idc)

#--------------------------2.0 RDD-----------------------
spark_sql = '''select '1' as job_date,cdn,province,isp,ua,idc,play_process_group,version,init_timetag,buffer_count,
             sum(sum_play_process) as sum_play_process,
             sum(sum_video_init_duration) as sum_video_init_duration,
             sum(sum_buffer_t_sum) as sum_buffer_t_sum,
             sum(num) as num
             from(
             select cdn,province,isp,ua,play_process_group,version,init_timetag,buffer_count,sum_play_process,sum_video_init_duration,sum_buffer_t_sum,num,
             temp_split_idc(idc) as idc
             from datacubic.app_picserversweibof6vwt_wapvideodownload
             where log_dir= '20151012110000' and version>='5.4.5' limit 10
             )a
             group by cdn,province,isp,ua,idc,play_process_group,version,init_timetag,buffer_count'''
from __future__ import absolute_import, print_function, division, unicode_literals

import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import IntegerType

if __name__ == '__main__':
    conf = SparkConf().setAppName('Restaurants Parquet')
    sc = SparkContext(conf=conf)
    hive_ctx = HiveContext(sc)

    inputs = hive_ctx.parquetFile(sys.argv[1])
    inputs.registerTempTable('restaurants')

    hive_ctx.registerFunction("LEN", lambda s: len(s), IntegerType())

    print('### Schema ###')
    inputs.printSchema()
    print()

    print('### Restaurants in Tokyo ###')
    restaurants_in_tokyo = hive_ctx.sql("""
        SELECT
            r.id,
            r.alphabet
        FROM
            restaurants r
        WHERE
            r.pref_id = '13'
        AND r.alphabet <> ''
Example #7
jsonRDD = hc.jsonFile("hdfs://dip.cdh5.dev:8020/user/hdfs/rawlog/app_saesinacomkafka12345_nginx/2015_10_22/09")

hc.registerRDDAsTable(jsonRDD, "temp_schema")


def if_in_top_10_domain(domain):
    if domain is None or domain == "" or len(domain) < 3:
        return "no"
    elif domain in top_domain_dict:
        return top_domain_dict[domain]
    else:
        return "no"


hc.registerFunction("temp_if_in_top_10_domain", if_in_top_10_domain)

spark_sql = """select domain,url,cast(sum(body_bytes_sent) as bigint) as flow from (
                select domain,
                split(request,'\\\\?')[0] as url,
                body_bytes_sent
                from temp_schema
                where body_bytes_sent>0 and temp_if_in_top_10_domain(domain)!='no'
                )A
           group by domain,url limit 100
"""

rows_temp = hc.sql(spark_sql).map(lambda row: ((row.domain, if_in_top_10_domain(row.domain), row.url, row.flow), None))
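# Two details worth noting about the query and mapping above:
#  - Escaping: the four backslashes in the Python literal '\\\\?' become '\\?'
#    in the SQL text, which Hive's regex-based split() reads as a literal '?',
#    so the request is cut at the start of its query string.
#  - The same Python function is applied twice: once inside the SQL as the
#    registered UDF temp_if_in_top_10_domain, and once more directly in the
#    map() lambda when building the key tuple.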


def partitionFunc(key):
Example #8

import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

if __name__ == "__main__":

    if len(sys.argv) != 2:
        print("Usage: hive input file")
        exit(-1)

    path = sys.argv[1]

    conf = SparkConf().setAppName("spark_sql_hive")

    sc = SparkContext(conf=conf)

    hc = HiveContext(sc)

    # Create the table
    hc.sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
    # Load the data
    hc.sql("LOAD DATA INPATH '%s' INTO TABLE src" % path)
    # Register the UDF
    hc.registerFunction("myfunc", lambda name: name.upper())

    rows = hc.sql("select key, myfunc(value) from src").take(5)

    for row in rows:
        print row

    sc.stop()
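    # Note: when no return type is passed (as with "myfunc" above),
    # registerFunction defaults the UDF's return type to StringType. A sketch
    # of the same registration with the type spelled out explicitly:
    #
    #     from pyspark.sql.types import StringType
    #     hc.registerFunction("myfunc", lambda name: name.upper(), StringType())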

Example #9
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row
from pyspark.sql.types import IntegerType
import json
import sys

if __name__ == "__main__":
    inputFile = sys.argv[1]
    conf = SparkConf().setAppName("SparkSQLTwitter")
    sc = SparkContext(conf=conf)
    hiveCtx = HiveContext(sc)
    print "Loading tweets from " + inputFile
    input = hiveCtx.jsonFile(inputFile)
    input.registerTempTable("tweets")
    topTweets = hiveCtx.sql(
        "SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10")
    print topTweets.collect()
    topTweetText = topTweets.map(lambda row: row.text)
    print topTweetText.collect()
    # Make a happy person row
    happyPeopleRDD = sc.parallelize(
        [Row(name="holden", favouriteBeverage="coffee")])
    happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
    happyPeopleSchemaRDD.registerTempTable("happy_people")
    # Make a UDF to tell us how long some text is
    hiveCtx.registerFunction("strLenPython", lambda x: len(x), IntegerType())
    lengthSchemaRDD = hiveCtx.sql(
        "SELECT strLenPython(text) FROM tweets LIMIT 10")
    print lengthSchemaRDD.collect()
    sc.stop()
Example #10
"""
def convert(mydict):
    return Row(**mydict)

convertRDD = hc.sql(
    "select col1, col2, col3 from temp_source").map(convert)

mytable = hc.inferSchema(convertRDD)

mytable.registerTempTable("temp_mytable")
"""


def convert(val):
    return val.upper()


hc.registerFunction("temp_convert", convert)

convertRDD = hc.sql(
    "select temp_convert(col1) as col1, col2, col3 from temp_source")

convertRDD.registerAsTable("temp_mytable")

hc.cacheTable("temp_mytable")


def printRows(rows):
    for row in rows:
        print row


datas = hc.sql("select * from temp_mytable").collect()
Example #11
from pyspark.sql import HiveContext
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

hc = HiveContext(sc)  # assumes an existing SparkContext `sc`

source = sc.parallelize([("value",)])

schema = StructType([StructField("col", StringType(), False)])

table = hc.applySchema(source, schema)

table.registerTempTable("temp_table")


def func_string():
    return "abc"

hc.registerFunction("func_string", func_string)

rows = hc.sql("select func_string() from temp_table").collect()


def func_int():
    return 123

hc.registerFunction("func_int", func_int, IntegerType())

rows = hc.sql("select func_int() from temp_table").collect()


def func_array():
    # list or tuple
    return [1, 2, 3]
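
# A plausible continuation (an assumption, not from the source): register the
# array-returning UDF with an explicit element type, then query it.
from pyspark.sql.types import ArrayType

hc.registerFunction("func_array", func_array, ArrayType(IntegerType()))

rows = hc.sql("select func_array() from temp_table").collect()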