import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext, HiveContext
from pyspark.sql.functions import *

inputs = sys.argv[1]

conf = SparkConf().setAppName('Month Wise Top 3 Crime')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
hiveContext = HiveContext(sc)

crime = sqlContext.read.parquet(inputs)
crime.registerTempTable('Crime')

Crime_month = sqlContext.sql('''select Month, Category, count(Category) as cnt
                                from Crime
                                group by Month, Category
                                order by Month''')
# Replace numeric month codes with month names.
Crime_month = Crime_month.na.replace(
    ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12'],
    ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
    'Month')
Crime_month.registerTempTable('Crime_month')

# Save only the aggregated records as CSV so that Hive has far fewer records to query.
Crime_month.coalesce(1).write.format('com.databricks.spark.csv').save('MonthCategory')

# Use Hive to create a table over the CSV output.
hiveContext.sql("DROP TABLE IF EXISTS Crime_month")
hiveContext.sql(
    "CREATE TABLE Crime_month (Month STRING, Category STRING, counts INT) "
    "row format delimited fields terminated by ',' stored as textfile")
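# Not in the original (the script above is cut off after CREATE TABLE): a
# hedged sketch of how it likely continues, following the sibling scripts --
# load the saved CSV into the Hive table, then rank categories within each
# month to get the top 3 the script's name promises. The INPATH is an
# assumption based on the save() call above.
hiveContext.sql("LOAD DATA INPATH 'MonthCategory' INTO TABLE Crime_month")
top3_monthly = hiveContext.sql(
    "SELECT Month, Category, counts FROM ("
    "  SELECT Month, Category, counts, "
    "         rank() OVER (PARTITION BY Month ORDER BY counts DESC) AS rnk "
    "  FROM Crime_month) t "
    "WHERE rnk <= 3")
top3_monthly.show()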
from pyspark import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.context import HiveContext
from pyspark.sql.functions import explode, least
from decimal import Decimal
from pyspark.sql.types import *
import json

conf = SparkConf().setMaster("local").setAppName("My application").set(
    "spark.executor.memory", "1g")
sc = SparkContext()
# sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)

df = sqlContext.read.format("json").load(
    "/home/mihai/ArhivaDateLema/somedata/temp/testDB.json")

# print df.count()
# a = df.filter(max(df['temp_apa_r']['S1'], df['temp_apa_r']['S2']) > 1).count()
# df.select(df.data, df.bloc_soft.valoare).show()
# df.filter(df.temp_apa_r.S1 != '-').show()
# print df.take(1)[0].asDict()
# print k
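# Hedged sketch (not in the original): the commented-out filter above applies
# Python's max() to two Columns, which does not work; Spark's greatest() is
# the column-wise equivalent. Column names are taken from the comments above.
from pyspark.sql.functions import greatest
a = df.filter(greatest(df['temp_apa_r']['S1'], df['temp_apa_r']['S2']) > 1).count()
print a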
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext, HiveContext
from pyspark.sql.functions import *

inputs = sys.argv[1]

conf = SparkConf().setAppName('Time-wise Top 3 Crimes')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
hiveContext = HiveContext(sc)

crime = sqlContext.read.parquet(inputs)
crime.registerTempTable('Crime')

Crime_Time = sqlContext.sql(
    '''select SUBSTR(Time, 1, 2) as hour, Category, count(Category) as cnt
       from Crime
       group by SUBSTR(Time, 1, 2), Category
       order by SUBSTR(Time, 1, 2)''')
Crime_Time.registerTempTable('Crime_Time')

# Save only the aggregated records as CSV so that Hive has far fewer records to query.
Crime_Time.coalesce(1).write.format('com.databricks.spark.csv').save('TimeCategory')

# Use Hive to create a table over the CSV output.
hiveContext.sql("DROP TABLE IF EXISTS TimeCategory")
hiveContext.sql(
    "CREATE TABLE TimeCategory (Hour STRING, Category STRING, counts INT) "
    "row format delimited fields terminated by ',' stored as textfile")
# Load the CSV contents into the Hive table. The original is cut off mid-call
# here; the path is an assumption that follows the save() above and the LOAD
# DATA pattern of the sibling script.
hiveContext.sql("LOAD DATA INPATH 'TimeCategory' INTO TABLE TimeCategory")
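# Hedged sketch (not in the original): rank categories within each hour to get
# the top 3 the script's name promises, mirroring the month-wise script.
top3_hourly = hiveContext.sql(
    "SELECT Hour, Category, counts FROM ("
    "  SELECT Hour, Category, counts, "
    "         rank() OVER (PARTITION BY Hour ORDER BY counts DESC) AS rnk "
    "  FROM TimeCategory) t "
    "WHERE rnk <= 3")
top3_hourly.show()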
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import SQLContext, HiveContext
from pyspark.sql.functions import *

inputs = sys.argv[1]

conf = SparkConf().setAppName('District wise Analysis')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
hiveContext = HiveContext(sc)

crime = sqlContext.read.parquet(inputs)
crime.registerTempTable('Crime')

Crime_District = sqlContext.sql('''select PdDistrict, Category, count(Category) as cnt
                                   from Crime
                                   group by PdDistrict, Category
                                   order by PdDistrict''')
Crime_District.registerTempTable('Crime_District')

# Keep only categories that occur more than once.
Crime_count = sqlContext.sql('''select * from Crime_District where cnt <> 1''')

# Save only the aggregated records as CSV so that Hive has far fewer records to query.
Crime_count.coalesce(1).write.format('com.databricks.spark.csv').save('District_top')

# Use Hive to create a table over the CSV output.
hiveContext.sql("DROP TABLE IF EXISTS district")
hiveContext.sql(
    "CREATE TABLE district (PdDistrict STRING, Category STRING, counts INT) "
    "row format delimited fields terminated by ',' stored as textfile")
# Load the CSV contents into the Hive table.
hiveContext.sql("LOAD DATA INPATH '/user/chandras/District_top' INTO TABLE district")
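# Hedged sketch (not in the original): a follow-up query on the Hive table --
# total incidents per district, busiest district first.
hiveContext.sql(
    "SELECT PdDistrict, SUM(counts) AS total "
    "FROM district GROUP BY PdDistrict ORDER BY total DESC").show()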
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql.context import SQLContext, HiveContext
from pyspark.sql.types import StructType, StructField, StringType

# ZooKeeper address
zkHost = '192.168.10.130:2181'
# Topic that user-behavior events are written to
topic = 'userBehavior'
# Behavior types to process
behaviorsSet = {'click', 'buy', 'addCart'}
# Application name (left undefined in the original; this value is an assumption)
appName = 'kafka-streaming-hive'

if __name__ == '__main__':
    sparkContext = SparkContext(appName=appName)
    streamingContext = StreamingContext(sparkContext, 15)
    # Spark SQL support
    sqlContext = SQLContext(sparkContext)
    # Add Hive support
    hiveContext = HiveContext(sparkContext)
    # Create the Kafka connection
    kafkaStream = KafkaUtils.createStream(streamingContext, zkHost,
                                          "kafka-streaming-hive", {topic: 1})

    # Update the user's recommendation list on click or add-to-cart events.
    def updateUserRecommendList(msg):
        if msg[2] == "click" or msg[2] == 'addCart':
            print 'click---->' + msg[3]
        return msg

    # Schema of the table that stores behaviors
    schemaString = "time uid type content"
    fields = list(map(lambda fieldName: StructField(fieldName, StringType(), nullable=True),
                      schemaString.split(" ")))
    schema = StructType(fields)
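    # Hedged sketch (not in the original): one way to wire the pieces above
    # together -- parse each Kafka message (assumed here to be a tab-separated
    # "time uid type content" line), keep only the behaviors of interest, and
    # append each batch to a Hive table using the schema defined above. The
    # table name user_behavior is an assumption.
    def saveBehaviors(time, rdd):
        rows = rdd.map(lambda kv: kv[1].split('\t')) \
                  .filter(lambda msg: msg[2] in behaviorsSet)
        if not rows.isEmpty():
            hiveContext.createDataFrame(rows, schema) \
                       .write.mode('append').saveAsTable('user_behavior')

    kafkaStream.foreachRDD(saveBehaviors)
    streamingContext.start()
    streamingContext.awaitTermination()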
import json
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import HiveContext
from pyspark.mllib.stat import Statistics
import pandas as pd
import numpy as np

conf = SparkConf().setMaster("local").setAppName("My application").set(
    "spark.executor.memory", "1g")
sc = SparkContext()
# sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)

df = sqlContext.read.format("json").load(
    "/home/mihai/ArhivaDateLema/somedata/temp/testDB.json")

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType, BooleanType
import operator


# True if any field of the row holds the value "0x30".
def is_value_in_marime(row):
    d = row.asDict()
    return "0x30" in d.values()
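# Hedged sketch (not in the original): wiring the helper into a filter. A
# Python UDF receives Columns, so the whole row is packed into a struct first;
# inside the UDF it arrives as a Row, which is what the asDict() call expects.
from pyspark.sql.functions import struct
has_0x30 = udf(is_value_in_marime, BooleanType())
df.filter(has_0x30(struct(*df.columns))).show()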
import os
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import HiveContext

# Base directory of this script (ROOT_DIR was undefined in the original; this
# definition is an assumption).
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))

# Read the input parameter from the command line.
if len(sys.argv) == 2:
    print "Base path " + ROOT_DIR + os.sep
    file_name = sys.argv[1]
    print "performing analysis on file_name " + file_name

# Read the input file using spark-csv and create a Spark dataframe.
conf = SparkConf().setAppName('Spark-Assignment').setMaster('local[*]')

# Create the Spark context and SQL context.
sc = SparkContext(conf=conf)
hive_context = HiveContext(sc)

# Read the input data file into a Spark dataframe using the
# com.databricks.spark.csv library.
input_dataframe = hive_context.read.format("com.databricks.spark.csv") \
    .option("header", "false") \
    .option("inferschema", "true") \
    .option("delimiter", ",") \
    .option("mode", "DROPMALFORMED") \
    .load("file://" + file_name)

# Dataframe schema based on the data.
column_list = [
    "date_value", "date", "register", "private_agency", "state", "district",
    "sub_district", "pincode", "gender", "age", "aadhaar_generated",
    "rejected", "mobile_number", "email_id"
]
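# Hedged sketch (not in the original): the header-less CSV load gives columns
# named _c0, _c1, ...; toDF(*column_list) renames them to the schema above.
input_dataframe = input_dataframe.toDF(*column_list)
input_dataframe.printSchema()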
from time import time
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import HiveContext

'''
How to run:
/usr/local/spark-2.0.2/bin/spark-submit /home/mandar/Downloads/Spark_Example/pyspark/example/dataframe/SparkExample3.py
'''

if __name__ == '__main__':
    # Set the Spark properties used to create the SparkContext.
    conf = SparkConf().setAppName('SparkExample1').setMaster('local[*]')

    # Create the Spark context and SQL context.
    sc = SparkContext(conf=conf)
    hive_context = HiveContext(sc)

    # Read the input data file and create a Spark dataframe:
    # one raw record per line.
    record_dataframe = hive_context.read.format("com.databricks.spark.csv") \
        .option("header", "false") \
        .option("inferschema", "true") \
        .option("delimiter", "\n") \
        .load("file:///home/mandar/Downloads/Spark_Example/resources/1") \
        .withColumnRenamed("_c0", "record")

    # Meta config dataframe.
    metaconfig_dataframe = hive_context.read.format("com.databricks.spark.csv") \
        .option("header", "true") \
        .option("inferschema", "true") \
        .option("delimiter", "\t") \
        .load("file:///home/mandar/Downloads/Spark_Example/resources/meta_config")
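    # Hedged sketch (not in the original): one plausible use of the meta
    # config -- treating it as a fixed-width layout with hypothetical columns
    # field_name, start, and length, and slicing each raw record into named
    # fields accordingly.
    from pyspark.sql.functions import substring
    layout = metaconfig_dataframe.collect()
    parsed_dataframe = record_dataframe.select(
        *[substring(record_dataframe['record'], row['start'], row['length'])
          .alias(row['field_name']) for row in layout])
    parsed_dataframe.show()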
from subprocess import call
import math
from collections import OrderedDict
from time import time
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import HiveContext
from pyspark.sql.functions import monotonically_increasing_id

# /usr/local/spark-2.0.2/bin/spark-submit /home/mandar/ProjectWorkspace/Example/com/spark/example/DataDifferenceSpark.py

if __name__ == '__main__':
    # Set the Spark properties used to create the SparkContext.
    conf = SparkConf().setAppName('SparkExample1').setMaster('local[*]')

    # Create the Spark context and SQL context.
    sc = SparkContext(conf=conf)
    hive_context = HiveContext(sc)

    # Read the input data file and create a Spark dataframe.
    type_2_dataframe = hive_context.read.format("com.databricks.spark.csv") \
        .option("header", "false") \
        .option("inferschema", "true") \
        .option("delimiter", "|") \
        .option("mode", "DROPMALFORMED") \
        .load("/home/mandar/ProjectWorkspace/Example/resources/data_difference_input")
    type_2_dataframe = type_2_dataframe.withColumnRenamed('_c0', 'date_value')

    # Register the dataframe so it can be queried with SQL.
    type_2_dataframe.registerTempTable("date_table")

    # Get hash values for the first and second columns.
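    # Hedged sketch (not in the original): what the dangling comment above
    # appears to lead to -- a per-row hash over the data columns, a common way
    # to detect changed rows between two versions of a dataset. The column
    # names _c1 and _c2 follow the header-less CSV naming and are assumptions.
    hashed = hive_context.sql(
        "SELECT date_value, md5(concat_ws('|', _c1, _c2)) AS row_hash "
        "FROM date_table")
    hashed.show()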
from pyspark import SparkConf, SparkContext
from pyspark.sql.context import HiveContext


def initSparkConf(isLocal, appName):
    conf = SparkConf()
    conf.setAppName(appName)
    if isLocal:
        conf.setMaster("local[*]")
    return conf


def initSparkContext(conf):
    return SparkContext(conf=conf)


conf = initSparkConf(False, "HiveDataSource")
sc = initSparkContext(conf)
hiveContext = HiveContext(sc)

hiveContext.sql("DROP TABLE IF EXISTS student_infos")
hiveContext.sql(
    "CREATE TABLE IF NOT EXISTS student_infos (name STRING, age INT) "
    "row format delimited fields terminated by '\t'")
hiveContext.sql("LOAD DATA "
                "LOCAL INPATH '/root/resource/student_infos' "
                "INTO TABLE student_infos")

hiveContext.sql("DROP TABLE IF EXISTS student_scores")
hiveContext.sql(
    "CREATE TABLE IF NOT EXISTS student_scores (name STRING, score INT) "
    "row format delimited fields terminated by '\t'")
hiveContext.sql("LOAD DATA "
                "LOCAL INPATH '/root/resource/student_scores' "
                "INTO TABLE student_scores")
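# Hedged sketch (not in the original): the natural next step for this kind of
# Hive example -- join the two tables and keep the students with good scores.
# The score threshold is an arbitrary illustration.
good_students = hiveContext.sql(
    "SELECT si.name, si.age, ss.score "
    "FROM student_infos si JOIN student_scores ss ON si.name = ss.name "
    "WHERE ss.score >= 80")
good_students.show()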