Example #1
# imports
from pyspark import SparkContext
from pyspark.sql import HiveContext

# start Spark and Hive SQL contexts
sc = SparkContext("local", "demo app")
hc = HiveContext(sc)

# list the tables in the database
print("Printing tables in DB:")
print(hc.tableNames())

print "Printing first 10 rows in zip_neighborhood_borough_xref table."
sqlQuery = "SELECT * FROM zip_neighborhood_borough_xref limit 10"
hc.sql(sqlQuery).show()
Example #2
from pyspark.sql import HiveContext


def sql_hive_context_example(spark):
    # create hive context object.
    hive_ctx = HiveContext(spark.sparkContext)

    # createDataFrame
    l = [('Alice', 18), ('Bob', 20), ('Charley', 22)]
    df = hive_ctx.createDataFrame(l, ('name', 'age'))
    print("createDataFrame API finished")

    # registerDataFrameAsTable 
    hive_ctx.registerDataFrameAsTable(df, "table1")
    print("registerDataFrameAsTable API finished")

    # sql
    tmp_df = hive_ctx.sql("select * from table1")
    tmp_df.show()
    print("sql API finished")

    # table
    tmp_df = hive_ctx.table("table1")
    tmp_df.show()
    print("table API finished")

    # tableNames
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("tableNames API finished")

    # tables
    tables = hive_ctx.tables()
    print(tables)
    print("tables API finished")

    # range
    tmp_df = hive_ctx.range(1,10,2)
    tmp_df.show()
    print("range API finished")

    # dropTempTable
    hive_ctx.dropTempTable("table1")
    table_names = hive_ctx.tableNames()
    print(table_names)
    print("dropTempTable API finished")

    # cacheTable & uncacheTable & clearCache
    df = hive_ctx.range(1,10,2)
    hive_ctx.registerDataFrameAsTable(df, "table")
    hive_ctx.cacheTable("table")
    hive_ctx.uncacheTable("table")
    hive_ctx.clearCache()
    print("cacheTable & uncacheTable & clearCache API finished")

    # createExternalTable
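    # Hedged sketch (not in the original example): createExternalTable registers a table
    # backed by files at an external location; the path below is only a placeholder, so
    # the call stays commented out.
    # hive_ctx.createExternalTable("ext_table", path="/path/to/parquet", source="parquet")
    # print("createExternalTable API finished")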

    # newSession
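    # A minimal sketch, assuming standard SQLContext behaviour: newSession returns a new
    # context that shares the SparkContext but keeps its own temporary tables and conf.
    new_session_ctx = hive_ctx.newSession()
    print("newSession API finished")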

    # registerFunction
    # Deprecated in 2.3.0. Use :func:`spark.udf.register` instead

    # registerJavaFunction
    # Deprecated in 2.3.0. Use :func:`spark.udf.registerJavaFunction` instead
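    # Hedged sketch of the replacement API (assuming `spark` is a SparkSession, as the
    # call to spark.sparkContext above implies): UDF registration now lives on spark.udf.
    from pyspark.sql.types import IntegerType
    spark.udf.register("add_one", lambda x: x + 1, IntegerType())
    spark.sql("SELECT add_one(1) AS two").show()
    # A Java UDF would go through spark.udf.registerJavaFunction; the class name below is
    # a placeholder and needs the compiled class on the classpath, so it stays commented out.
    # spark.udf.registerJavaFunction("javaStrLen", "com.example.StrLenUDF", IntegerType())
    print("spark.udf.register sketch finished")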

    # setConf & getConf
    hive_ctx.setConf("key1", "value1")
    value = hive_ctx.getConf("key1")
    print(value)
    print("setConf & getConf API finished")

    # refreshTable
    # Exception: An error occurred while calling o26.refreshTable:
    # Method refreshTable([class java.lang.String]) does not exist
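    # Hedged workaround (version dependent, and assuming `spark` and hive_ctx share the
    # same underlying session): the Catalog API exposes the same refresh.
    # spark.catalog.refreshTable("table")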
    
    print("Finish running HiveContext API")
Example #3
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import *

from udf.pyspark.udfs import *


if __name__ == "__main__":
    sc = SparkContext(appName="SparkSQL:[demo][pysparkdemo]")
    sqlContext = HiveContext(sc)

    # create a DataFrame from the session log Parquet files
    df = sqlContext.read.parquet("/mvad/warehouse/session/dspan/date=2015-09-01/")
    df.registerTempTable("sessionlog")
    for table in sqlContext.tableNames():
        print(table)
    df.printSchema()

    sqlContext.udf.register("toNormalCookie",toNormalCookie )
    sql1 = """ select toNormalCookie(cookie) as cookiestr,eventTime,eventType,geoInfo.country as country,
      geoInfo.province as province from sessionlog limit 10 """.replace('\n',' ')
    sample = sqlContext.sql(sql1)
    sample.show()


    sql2 = """select eventType, count(cookie) as count from sessionlog
      group by eventType """.replace('\n',' ')
    result = sqlContext.sql(sql2)
    result.cache()

    # only show 20 records
    result.show()
Example #4
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerType

if __name__ == "__main__":
    sc = SparkContext(appName="SparkSQL:[com.mvad.spark.demo][pysparkdemo]")
    sqlContext = HiveContext(sc)

    # create a DataFrame from the session log Parquet files
    df = sqlContext.read.parquet(
        "/mvad/warehouse/session/dspan/date=2015-05-01/")
    df.registerTempTable("sessionlog")
    for table in sqlContext.tableNames():
        print(table)
    df.printSchema()

    sqlContext.udf.register("intarr2str",
                            lambda array: "".join(map(str, array)))
    sql1 = """ select intarr2str(cookie) as cookiestr,eventTime,eventType,geoInfo.country as country,
      geoInfo.province as province from sessionlog limit 10 """.replace(
        '\n', ' ')
    sample = sqlContext.sql(sql1)
    sample.show()

    sql2 = """select eventType, count(cookie) as count from sessionlog
      group by eventType """.replace('\n', ' ')
    result = sqlContext.sql(sql2)
    result.cache()

    # only show 20 records
    result.show()
Example #5
spark_df = sqlCtx.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load("./data/clicks_test.csv")
spark_df.registerTempTable("clicks_train")

from pyspark.sql.utils import AnalysisException

try:
    sqlCtx.sql("drop table table_7")
except AnalysisException:
    # table_7 does not exist yet, so there is nothing to drop
    pass

print("*** CREATING TABLE 7 ***")
# table 7 is the train join, but the geo location still needs to be encoded.
# I don't know how to encode it in SQL, so I will just drop it for now
# (a hedged split() sketch follows after the CREATE TABLE below).
sqlCtx.sql("create table table_7 as select a.document_id, a.platform, "
           "a.traffic_source, a.display_id, a.source_id, a.publisher_id, "
           "a.category_id, b.ad_id, a.topic_id from table_6 a inner join clicks_train b on a.display_id = b.display_id")

print("*** FINISHED CREATING TABLE 7 ***")


# create train file from table_7
train_spark_df = sqlCtx.sql("select * from table_7")
train_spark_df.write.csv('./cleaned_data/test_files_from_spark')

# TODO do test file

spark_tables = sqlCtx.tableNames()
print(spark_tables)