def main():
    conf = SparkConf().setAppName('Sentiment Analysis_TfIdf')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    train_inputs = sys.argv[1]
    test_inputs = sys.argv[2]

    schema = StructType([
        StructField('reviewText', StringType(), False),
        StructField('overall', DoubleType(), False),
    ])
    read_json = sqlContext.read.json(train_inputs, schema)
    read_json.registerTempTable('read_json')
    lowercase = sqlContext.sql("""
        SELECT lower(reviewText) AS reviewText, overall AS label FROM read_json
    """)

    regexTokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W")
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    hashingTF = HashingTF(numFeatures=10, inputCol="filtered", outputCol="rawFeatures")
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)
    pipeline = Pipeline(stages=[regexTokenizer, remover, hashingTF, idf, lr])

    paramGrid = ParamGridBuilder() \
        .addGrid(hashingTF.numFeatures, [10, 100, 1000]) \
        .addGrid(lr.regParam, [0.1, 0.01]) \
        .build()
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=RegressionEvaluator(),
                              numFolds=5)
    cvModel = crossval.fit(lowercase)

    testDF = sqlContext.read.json(test_inputs, schema)
    testDF.registerTempTable('test_data')
    test_data = sqlContext.sql("""
        SELECT lower(reviewText) AS reviewText, overall AS label FROM test_data
    """)

    train_prediction = cvModel.transform(lowercase)
    test_prediction = cvModel.transform(test_data)
    evaluator = RegressionEvaluator()
    print("Training dataset RMSE: %s" % str(evaluator.evaluate(train_prediction)))
    print("Testing dataset RMSE: %s" % str(evaluator.evaluate(test_prediction)))
def func2(rdd):
    '''
    A review of earlier material:
    create a DataFrame from an RDD,
    register the DataFrame as a table,
    run SQL against it,
    and save the resulting DataFrame as needed.
    '''
    sqlContext = SQLContext(rdd.context)
    newrdd = rdd.map(lambda line: [line])
    df = sqlContext.createDataFrame(
        newrdd, StructType([StructField(name="content", dataType=StringType())]))
    df.createOrReplaceTempView("data")
    sqlContext.sql("select content from data").show()
def main():
    conf = SparkConf().setAppName('ingest logs')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    inputs = sys.argv[1]
    output = sys.argv[2]

    # Read the input file, then match each line against the log pattern
    file_data = sc.textFile(inputs)
    linere = re.compile("^(\\S+) - - \\[(\\S+) [+-]\\d+\\] \"[A-Z]+ (\\S+) HTTP/\\d\\.\\d\" \\d+ (\\d+)$")

    # Extract the required values from the NASA web server log lines
    KeyValue = file_data.map(lambda line: linere.split(line)) \
        .filter(lambda x: len(x) == 6) \
        .map(lambda y: (y[1], dt.datetime.strptime(y[2], '%d/%b/%Y:%H:%M:%S'), y[3], y[-2])) \
        .cache()

    # Map the KeyValue RDD into the required four-column format
    Nasa = KeyValue.map(lambda p: {"host": p[0], "datetime": p[1], "path": p[2], "bytes": int(p[3])})

    # Convert Nasa to a DataFrame and register it as a table
    schemaNasa = sqlContext.createDataFrame(Nasa)
    schemaNasa.registerTempTable("NasaLogs")

    # Write the data into a Parquet file
    schemaNasa.write.format('parquet').save(output)

    # Read the data back from the Parquet file and register it as a table
    parquetdata = sqlContext.read.parquet(output)
    parquetdata.registerTempTable("parquetTable")

    # Run a SQL query to count the total number of bytes transferred using SUM(bytes)
    totalbytes = sqlContext.sql("""SELECT SUM(bytes) FROM parquetTable""")
    totalbytes.show()
def run_spark():
    spark = SparkSession.builder.master("local").appName("Word Count").getOrCreate()
    df = spark.read.format("csv").option("header", "true").load("hdfs:///project/samples")
    sqlContext = SQLContext(spark)
    sqlContext.registerDataFrameAsTable(df, "table1")
    sentences = sqlContext.sql("""SELECT `reviews.rating`, `reviews.text` FROM table1""").rdd
    alist = sentences.map(lambda x: x[0] if x[0] is not None else 0).collect()
    blist = sentences.map(lambda x: get_score(x[1]) if x[1] is not None else 0).collect()
    return alist, blist


def draw_plots():
    alist, blist = run_spark()
    t_plt, = plt.plot(np.arange(1, len(alist) + 1), alist, 'r')
    v_plt, = plt.plot(np.arange(1, len(alist) + 1), blist)
    plt.title('NLP Emotion Analysis')
    # plt.xlabel('epoch')
    # plt.ylabel('score')
    # plt.legend((t_plt, v_plt), ('rating', 'score'))
    # plt.savefig("result.png")
    with open("x1.txt", "w+") as out_file:
        out_file.write(','.join([str(x) for x in alist]))
        out_file.write(','.join([str(x) for x in blist]))


if __name__ == '__main__':
    run_spark()
def main():
    conf = SparkConf().setAppName('reddit relative sql score')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    inputs = sys.argv[1]
    output = sys.argv[2]

    # Define the schema
    schema = StructType([
        StructField('subreddit', StringType(), False),
        StructField('score', IntegerType(), False),
    ])

    # Read the reddit JSON files
    comments = sqlContext.read.json(inputs, schema)

    # Register the JSON data as a table and query it for subreddit names and average scores
    comments.registerTempTable('comments')
    averages = sqlContext.sql("""
        SELECT subreddit, AVG(score) FROM comments GROUP BY subreddit
    """)
    averages.write.save(output, format='json', mode='overwrite')
def run(): conf = SparkConf().setAppName("word count") \ .setMaster("local[2]") context = SparkContext(conf=conf) sqlContext = SQLContext(context) context.setLogLevel('ERROR') data = [('Song', 25), ('Trump', 22), ('Yong', 20), ('Obama', 26)] rdd = context.parallelize(data, 2).map(lambda x: Row(name=x[0], age=int(x[1]))) people = sqlContext.createDataFrame(rdd).cache() people.printSchema() old_guy = people.orderBy('age', ascending=False).take(1) print(old_guy) same_old_guy = [ Row(name=x['name'], age=x['age'], other=1) for x in old_guy ] print(same_old_guy) total = people.groupBy().sum('age').collect()[0][0] print('Total age is {}'.format(total)) people.createTempView('people_table') new_people = sqlContext.sql( 'select name, age from people_table order by age desc limit 1') new_people.show()
def query2(sc, file_in_name, file_out_name):
    rdd_file_data = sc.textFile(file_in_name)
    data_header = rdd_file_data \
        .filter(lambda l: "datetime" in l)
    cites = weather.gen_city_keys(sc)
    header_position = run2.get_position(data_header)
    data = rdd_file_data \
        .subtract(data_header) \
        .flatMap(lambda line: generate_tuple(header_position, line, cites))

    sqlc = SQLContext(sc)
    df = sqlc.createDataFrame(data)
    # df.show()
    df.createOrReplaceTempView("dati")

    query1 = "SELECT country, year, month, " \
             "cast(min(value) as decimal(10,2)) as my_min, " \
             "cast(max(value) as decimal(10,2)) as my_max, " \
             "cast(avg(value) as decimal(10,2)) as my_avg, " \
             "cast(stddev(value) as decimal(10,2)) as my_std " \
             "FROM dati " \
             "GROUP BY country, year, month"
    df2 = sqlc.sql(query1).orderBy('dati.country', 'dati.year', 'dati.month')
    df2.show()

    # Save data in HDFS
    df2.coalesce(1).write.format("json").save(file_out_name)
class SparkConnector:
    def __init__(self):
        # findspark.init()
        sc = SparkSession.builder \
            .master("local[*]") \
            .appName('SCA') \
            .config('spark.jars.packages', 'com.datastax.spark:spark-cassandra-connector_2.11:2.3.0') \
            .config('spark.cassandra.connection.host', os.environ['DB_ADDR']) \
            .config('spark.cassandra.auth.username', os.environ['DB_USER']) \
            .config('spark.cassandra.auth.password', os.environ['DB_PASS']) \
            .config('spark.executor.memory', '15g') \
            .config('spark.driver.memory', '6g') \
            .getOrCreate()
        self.sqlContext = SQLContext(sc)
        self.sqlContext.setConf('spark.sql.shuffle.partitions', '10')

    def submit_sql(self, query):
        return self.sqlContext.sql(query).collect()

    def load_and_get_table_df(self, keys_space_name, table_name):
        table_df = self.sqlContext.read \
            .format("org.apache.spark.sql.cassandra") \
            .options(table=table_name, keyspace=keys_space_name) \
            .load()
        return table_df
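# A minimal usage sketch for the SparkConnector class above; not part of the
# original code. It assumes the DB_ADDR, DB_USER and DB_PASS environment
# variables are set and that a hypothetical keyspace "my_keyspace" with a
# table "my_table" exists in Cassandra.
connector = SparkConnector()

# Load one Cassandra table into a DataFrame and register it for SQL queries.
df = connector.load_and_get_table_df("my_keyspace", "my_table")
df.createOrReplaceTempView("my_table")

# submit_sql() collects all rows to the driver, so keep result sets small.
rows = connector.submit_sql("SELECT count(*) FROM my_table")
print(rows)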
def main():
    keyspace = sys.argv[1]
    output = sys.argv[2]
    orderkeys = sys.argv[3:]

    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host', ','.join(cluster_seeds))
    conf.set('spark.dynamicAllocation.maxExecutors', 20)
    conf.setAppName('tpch orders sql')
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    df_for_orders(keyspace, sc, sqlContext)
    df_for_lineitem(keyspace, sc, sqlContext)
    df_for_part(keyspace, sc, sqlContext)

    df_orders = sqlContext.sql("""SELECT o.orderkey, o.totalprice, p.name
        FROM orders o
        JOIN lineitem l ON (o.orderkey = l.orderkey)
        JOIN part p ON (l.partkey = p.partkey)
        WHERE o.orderkey IN (""" + ",".join(orderkeys) + ")")

    rdd_orders = df_orders.rdd
    rdd_orders = rdd_orders.map(map_key) \
        .reduceByKey(reduce_parts) \
        .sortByKey() \
        .map(map_output).coalesce(1)
    rdd_orders.saveAsTextFile(output)
def sample_function(sc: SparkContext):
    schema = StructType([StructField("odd_numbers", IntegerType(), True)])
    print(" Odd numbers sample")
    big_list = range(10)
    rdd = sc.parallelize(big_list, 2)
    odds = rdd.filter(lambda x: x % 2 != 0)
    odds.foreach(my_print)

    sql_context = SQLContext(sc)
    odd_numbers = sql_context.createDataFrame(odds.map(lambda _: Row(_)), schema)
    odd_numbers.printSchema()
    odd_numbers.show(truncate=False)
    print("odd_numbers count: " + str(odd_numbers.count()))

    odd_numbers.createOrReplaceTempView("odd_numbers_table")
    sql_context.sql("select * from odd_numbers_table limit 2").show()
    return odd_numbers
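# A small driver for sample_function above, assuming a local Spark install.
# my_print is referenced but not defined in this snippet; the stand-in below
# is illustrative only, and since it runs on the executors its output lands
# in the executor logs rather than on the driver console.
from pyspark import SparkConf, SparkContext


def my_print(x):
    print(x)


if __name__ == "__main__":
    sc = SparkContext(conf=SparkConf().setMaster("local[2]").setAppName("sample"))
    odd_numbers = sample_function(sc)  # prints the schema, the rows, and the count
    sc.stop()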
def load_parquet(database, table, quiet):
    sc = SparkContext()
    if quiet:
        sc = quiet_log(sc)
    sqlContext = SQLContext(sc)
    sqlContext.setConf('spark.sql.parquet.binaryAsString', 'True')
    print(database, table)
    return sqlContext.sql(
        'Select * from parquet.`/user/hive/warehouse/{:s}.db/{:s}`'.format(
            database, table)), sc, sqlContext
def execute(self, sql):
    """
    Execute the given SQL query.

    :param sql: SQL query to be executed
    :return: newly created PySpark DataFrame
    """
    if sql is None:
        raise ValueError("Query is None")
    self.logger.info("# Executing an SQL query")
    sqlContext = SQLContext(self.spark)
    df = sqlContext.sql(sql)
    return df
def main():
    input_keyspace = sys.argv[1]
    output_keyspace = sys.argv[2]

    cluster_seeds = ['199.60.17.136', '199.60.17.173']
    conf = SparkConf().set('spark.cassandra.connection.host', ','.join(cluster_seeds))
    conf.set('spark.dynamicAllocation.maxExecutors', 20)
    conf.setAppName('tpch orders denormalize cassandra')
    sc = pyspark_cassandra.CassandraSparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    df_for_part(input_keyspace, sc, sqlContext)
    df_lineitems = df_for_lineitem(input_keyspace, sc, sqlContext)
    df_part = sqlContext.sql("""SELECT l.orderkey, p.name
        FROM lineitem l
        JOIN part p ON (l.partkey = p.partkey)""")
    df_part_names = rows_to_list(df_part, "orderkey", "name", "names", sqlContext)
    df_part_names.registerTempTable("part_names")

    df_orders = df_for_orders(input_keyspace, sc, sqlContext)
    df_orders = sqlContext.sql("""SELECT o.*, p.names AS part_names
        FROM orders o
        JOIN part_names p ON (o.orderkey = p.orderkey)""")

    rdd_orders = df_orders.rdd.map(map_primary_key)
    rdd_orders.saveToCassandra(output_keyspace, 'orders_parts', parallelism_level=64)
def main():
    conf = SparkConf().setAppName('artist_career')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'
    sqlContext = SQLContext(sc)
    inputs = sys.argv[1]
    output = sys.argv[2]

    customSchema = StructType([
        StructField('SongNumber', StringType(), False),
        StructField('SongID', StringType(), False),
        StructField('AlbumID', StringType(), False),
        StructField('AlbumName', StringType(), False),
        StructField('ArtistID', StringType(), False),
        StructField('ArtistLatitude', StringType(), False),
        StructField('ArtistLocation', StringType(), False),
        StructField('ArtistLongitude', StringType(), False),
        StructField('ArtistName', StringType(), False),
        StructField('Danceability', StringType(), False),
        StructField('Duration', StringType(), False),
        StructField('KeySignature', StringType(), False),
        StructField('KeySignatureConfidence', StringType(), False),
        StructField('Tempo', StringType(), False),
        StructField('TimeSignature', StringType(), False),
        StructField('TimeSignatureConfidence', StringType(), False),
        StructField('Title', StringType(), False),
        StructField('Year', StringType(), False),
        StructField('Energy', StringType(), False),
        StructField('ArtistFamiliarity', StringType(), False),
        StructField('ArtistMbid', StringType(), False),
        StructField('SongHotttnesss', StringType(), False),
        StructField('Loudness', StringType(), False),
        StructField('StartOfFadeOut', StringType(), False),
        StructField('EndOfFadeIn', StringType(), False),
        StructField('ModeConfidence', StringType(), False),
    ])

    df = sqlContext.read.format('com.databricks.spark.csv') \
        .options(header='true').load(inputs, schema=customSchema)
    df.registerTempTable('artist_data')
    million_song = sqlContext.sql(
        "SELECT SongNumber, SongID, AlbumID, AlbumName, ArtistID, ArtistLatitude, "
        "ArtistLocation, ArtistLongitude, ArtistName, Danceability, Duration, "
        "KeySignature, KeySignatureConfidence, Tempo, TimeSignature, "
        "TimeSignatureConfidence, Title, Year, Energy, ArtistFamiliarity, ArtistMbid, "
        "SongHotttnesss, Loudness, StartOfFadeOut, EndOfFadeIn, ModeConfidence "
        "FROM artist_data WHERE Year != 0 AND ArtistFamiliarity != 'nan'")
    million_song.write.format('parquet').save(output)
def main(): sc = SparkContext("local", "Query 1") rawWeather, weatherHeader, cities = run.getRDDFromCSV( sc, Constants.WEATHER_DESCRIPTION_FILE) weatherDescription = rawWeather \ .subtract(weatherHeader) \ .filter(lambda l: re.search('^\d{4}-03|^\d{4}-04|^\d{4}-05', l)) # month filter daysOfMonth = weatherDescription \ .flatMap(lambda line: generateTuple(line, cities)) sqlc = SQLContext(sc) df = sqlc.createDataFrame(daysOfMonth) df.show() df.createOrReplaceTempView("dati") query1 = "SELECT city, year, month, day, sum(sunny) as n_sunny_h FROM dati GROUP BY city, year, month, day" df2 = sqlc.sql(query1) df2.show() #applicazione regola sunny day (75%) df2.createOrReplaceTempView("dati") query2 = "SELECT city, year, month, day FROM dati where n_sunny_h >13" df3 = sqlc.sql(query2) df3.show() #almeno 15 gg sereno al mese df3.createOrReplaceTempView("dati") query2 = "SELECT city, year, month, count(*) AS n_day FROM dati GROUP BY year,city, month" df4 = sqlc.sql(query2) df4.show() #filtra n_giorni df4.createOrReplaceTempView("dati") query2 = "SELECT city, year, month, n_day FROM dati where n_day>=15" df5 = sqlc.sql(query2) df5.show() # filtra n mesi = 3 df5.createOrReplaceTempView("dati") query2 = "SELECT city, year, count(*) AS n_month FROM dati GROUP BY city, year " df6 = sqlc.sql(query2) df6.show() df6.createOrReplaceTempView("dati") query2 = "SELECT city, year FROM dati WHERE n_month = 3 " df7 = sqlc.sql(query2) df7.show()
def run(sparkContext, source_table_name1, source_table_name2, sink_table_name):
    SparkApp.logger.info(sparkContext.appName + " Starting run()")
    sqlContext = SQLContext(sparkContext)

    SparkApp.logger.info(sparkContext.appName + " Starting jdbc read()!")
    df_hl7 = sqlContext.read.jdbc(url=ConfigFramework.getPostgres_URL(),
                                  table=source_table_name1,
                                  properties=ConfigFramework.getPostgres_Properties())
    df_fhir = sqlContext.read.jdbc(url=ConfigFramework.getPostgres_URL(),
                                   table=source_table_name2,
                                   properties=ConfigFramework.getPostgres_Properties())
    SparkApp.logger.info(sparkContext.appName + " End jdbc read()!")

    df_hl7.show()
    df_fhir.show()

    df_hl7.registerTempTable("lab_obs")
    df_fhir.registerTempTable("medications")
    df_sql_result = sqlContext.sql(
        "SELECT m.patient_id, m.resource_medicationReference_display, m.resource_authoredOn_dt FROM medications m "
        "JOIN (SELECT patient_id, "
        "CASE WHEN ob_value_noted = 'Notdetected' AND lag(ob_value_noted) "
        "OVER (partition by patient_id order by l.ob_dt_cleaned) = 'Detected' "
        "THEN 1 ELSE 0 END as qualify "
        "FROM lab_obs l) l ON l.patient_id = m.patient_id AND l.qualify = 1")
    # df_sql_result = sqlContext.sql("select m.resource_medicationReference_display, m.patient_id from medications m where "
    #     "m.resource_authoredOn_dt not in (select l.ob_dt_cleaned from lab_obs l where l.patient_id = m.patient_id and l.ob_value_noted = 'Detected')"
    #     "and "
    #     "m.resource_authoredOn_dt < any (select l.ob_dt_cleaned from lab_obs l where l.patient_id = m.patient_id and l.ob_value_noted = 'Notdetected')"
    # )
    df_sql_result.show()

    SparkApp.logger.info(sparkContext.appName + " Starting jdbc write()!")
    df_sql_result.write.jdbc(url=ConfigFramework.getPostgres_URL(), table=sink_table_name,
                             mode="overwrite",
                             properties=ConfigFramework.getPostgres_Properties())
    SparkApp.logger.info(sparkContext.appName + " End jdbc write()!")
    SparkApp.logger.info(sparkContext.appName + " Ending run()")
def main():
    start = time.time()

    conf = SparkConf().setMaster("local[*]") \
        .setAppName("secondaryStructureSegmentDemo")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Retrieve PDB annotation: binding affinities (Ki, Kd),
    # group name of the ligand (hetId), and the
    # Enzyme Classification number (ecNo)
    ds = customReportService.getDataset(["Ki", "Kd", "hetId", "ecNo"])

    # Show the schema of this dataset
    ds.printSchema()

    # Select structures that either have Ki or Kd value(s) and
    # are protein-serine/threonine kinases (EC 2.7.11.*):

    # A. By using dataset operations
    ds = ds.filter(
        "(Ki IS NOT NULL OR Kd IS NOT NULL) AND ecNo LIKE '2.7.11.%'")
    ds.show(10)

    # B. By creating a temporary view and running SQL
    ds.createOrReplaceTempView("table")
    ds = sqlContext.sql(
        "SELECT * from table WHERE (Ki IS NOT NULL OR Kd IS NOT NULL) AND ecNo LIKE '2.7.11.%'"
    )
    ds.show(10)

    end = time.time()
    print("Time: %f sec." % (end - start))

    sc.stop()
lines = sc.textFile(file_path)
info = lines.map(lambda lines: lines.split("----")) \
    .map(lambda info: Row(email=info[0], username=info[1], realname=info[2],
                          idcard=info[3], password=info[4], phone=info[5]))
schemaInfo = sqlContext.createDataFrame(info)
schemaInfo.registerTempTable("information")
# Cache the table
# sqlContext.cacheTable("information")
# sqlContext.uncacheTable("information")

"""
Email analysis and statistics
"""
email_str = "SELECT analysis_email(email) AS email FROM information"
emailSQL = sqlContext.sql(email_str)
# Total count
count = emailSQL.count()
# Grouped counts
emailCollect = emailSQL.groupBy("email").count().collect()
# Email analysis results
result_email(emailCollect, count)

"""
Username and real-name analysis and statistics
"""
# Username length statistics
username_len_str = "SELECT LENGTH(username) AS username_len FROM information"
usernameSQL = sqlContext.sql(username_len_str)
usernameLenCollect = usernameSQL.groupBy("username_len").count().collect()
result_username_len(usernameLenCollect, count)
from pyspark import SparkConf, SparkContext, SQLContext

conf = SparkConf()
context = SparkContext(conf=conf)
sql_context = SQLContext(context)

# Load data into a DataFrame
dfmain = sql_context.read.json("/home/spark/Desktop/reviews_Musical_Instruments.json")
# Validate that the data loaded properly
dfmain.show()
# Create a temp view from the DataFrame
dfmain.createOrReplaceTempView("json_view")
# Form a new DataFrame with the desired fields
dfnew = sql_context.sql("select reviewerID, asin, overall from json_view")
# Validate that the data loaded properly
dfnew.show()
# Convert the DataFrame into an RDD
rdd_allreviews = dfnew.select('reviewerID', 'asin', 'overall').rdd
# Print the first 10 rows to verify the data loaded properly
print(rdd_allreviews.take(10))
# Split the original RDD into train and test sets using randomSplit
training_RDD, test_RDD = rdd_allreviews.randomSplit([8, 2], seed=0)
print ("Successfully imported Spark Modules -- `SparkContext, SQLContext`") print_horizontal() except ImportError as e: print ("Can not import Spark Modules", e) sys.exit(1) sqlContext = SQLContext(sparkContext=sc) # Loads parquet file located in AWS S3 into RDD Data Frame parquetFile = sqlContext.read.parquet("s3://jon-parquet-format/nation.plain.parquet") # Stores the DataFrame into an "in-memory temporary table" parquetFile.registerTempTable("parquetFile") # Run standard SQL queries against temporary table nations_all_sql = sqlContext.sql("SELECT * FROM parquetFile") # Print the result set nations_all = nations_all_sql.map(lambda p: "Country: {0:15} Ipsum Comment: {1}".format(p.name, p.comment_col)) print("All Nations and Comments -- `SELECT * FROM parquetFile`") print_horizontal() for nation in nations_all.collect(): print(nation) # Use standard SQL to filter nations_filtered_sql = sqlContext.sql("SELECT name FROM parquetFile WHERE name LIKE '%UNITED%'") # Print the result set nations_filtered = nations_filtered_sql.map(lambda p: "Country: {0:20}".format(p.name))
sqc = SQLContext(sc)

# 1) Create RDD emps by importing that file into Spark.
emps = sc.textFile(spark_root_dir + '/data/emps.txt')

# 2) Next create a new RDD emps_fields by transforming the content of every line in RDD emps
#    into a tuple with three individual elements by splitting the lines on commas.
emps_fields = emps.map(lambda line: line.split(',')).map(lambda row: tuple(row))
# emps_fields = emps.map(lambda line: line.split(','))

# 3) Apply the "constructor" Row to every tuple in RDD emps_fields.
employees = emps_fields.map(
    lambda e: Row(name=e[0], age=int(e[1]), salary=float(e[2])))

# Generate a DataFrame by passing an RDD of Row elements to the method
# createDataFrame() of class SQLContext.
df = sqc.createDataFrame(employees)

# Show the content of the new DataFrame.
df.show()

tmp_table_name = 'tmp_employees'
# Transform this DataFrame into a temporary table.
df.registerTempTable(tmp_table_name)

# Select all employees who have a salary greater than 3500.
sqc.sql('SELECT * FROM {0} WHERE salary > 3500'.format(tmp_table_name)).show()

# print(emps_fields.collect())
# print(type(emps_fields.take(1)[0]))
# print(employees.take(1))
# paragraphA = linesA.flatMap(lambda line: line.split(' ')).map(lambda word: clean(word))
sqlContext = SQLContext(sc)

# Paths to the Hillary/Enron Avro data
enr = sqlContext.read.format(
    "com.databricks.spark.avro").load(
    "s3n://datasets-396316040607/enron_data/*.avro").repartition(16)
hil = sqlContext.read.format(
    "com.databricks.spark.avro").load(
    "s3n://datasets-396316040607/hillary/*.avro").repartition(16)

# Register tables
sqlContext.registerDataFrameAsTable(hil, "hillary")
sqlContext.registerDataFrameAsTable(enr, "enron")

# Register the UDF
sqlContext.registerFunction(
    "getCos", lambda x, y: get_cosine(text_to_vector(x), text_to_vector(y))
)

# Compute cosine similarity on the text and keep the top 1000 matches
out = sqlContext.sql("SELECT h.author h_auth, e.author e_auth, "
                     "e.contents e_mail, h.contents h_mail, "
                     "getCos(e.contents, h.contents) as cos_sim "
                     "from hillary as h join enron as e order by cos_sim "
                     "desc limit 1000")

# Write back out to S3
out.save("s3n://datasets-396316040607/cos_sim/", format="json")
# Databricks notebook source
from pyspark import SparkContext
from pyspark import SQLContext
from pyspark.sql.functions import *

target_query = "SELECT * FROM database_name.byod_dbfs_table"

sparkContext = SparkContext.getOrCreate()
sqlContext = SQLContext(sparkContext)
dataframe = sqlContext.sql(target_query)
dataframe.repartition(1).write.format('com.databricks.spark.csv') \
    .options(delimiter=",") \
    .save("s3n://amgen-edl-acux-aaaa123-bkt/BYOD/", header="true", mode="overwrite")

# COMMAND ----------

print("test2")

# COMMAND ----------

print("test3")
def filterData(record):
    flag = True
    if (int(record[-4]) < 1) or (record[-2] not in ['1', '4']) or (record[-1] != ''):
        flag = False
    return flag


if __name__ == '__main__':
    sc = SparkContext(appName='CF_prod_in_transaction')
    sqlContext = SQLContext(sc)

    in_file = sc.textFile(sys.argv[1])
    data = in_file.map(oritentData).filter(filterData).map(lambda x: [int(i) for i in x[:-3]])

    Record = Row('customer_id', 'product_id', 'invoice_id', 'units')
    data = data.map(lambda x: Record(*x))
    data = sqlContext.createDataFrame(data)
    sqlContext.registerDataFrameAsTable(data, 'table1')

    df = sqlContext.sql('select customer_id, product_id, sum(units) as prod_in_transactions '
                        'from table1 group by customer_id, product_id')
    df.rdd.map(lambda x: ','.join([str(r) for r in x])).saveAsTextFile(sys.argv[2])
    sc.stop()
lines_with_keyword = textFile.filter(lambda line: "@TMobileHelp" in line)
print("Number of tweets with TMobileHelp: " + str(lines_with_keyword.count()))
print(lines_with_keyword.lookup("text"))
# print(lines_with_keyword)
# schemaTweets = sqlContext.createDataFrame(lines_with_keyword)
# schemaTweets.registerTempTable("tweets")
# row = Row("text")
# lines_with_keyword.map(row).toDF()
# lines_with_keyword.printSchema()
# print(tweets.take(5))
# print(keyword_onlytext.take(5))

df = sqlContext.read.json(currentFile)
# df.printSchema()
# df_distinctlang = df.select("lang").distinct()
# df_lang = sql.read.json("TwitterLanguages.json")
# df_join = df_distinctlang.join(df_lang, df_distinctlang.lang == df_lang.code).drop(df_lang.code).drop(df_lang.status)
# df_join.show()

df.registerTempTable("df")
df_text = df.select("text")
# df_text.printSchema()
df_sql = sqlContext.sql("SELECT text FROM df WHERE text like '%TMobileHelp%'")
print(df_sql.collect())
# df_sql.rdd.map(lambda x: ",".join(map(str, x))).coalesce(1).saveAsTextFile("file.csv")
# df.toPandas().to_csv('mycsv.csv')
if __name__ == "__main__": file_path = os.path.abspath("../doc/book.txt") print file_path conf = SparkConf().setAppName("schema_test").setMaster("local") sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) lines = sc.textFile(file_path) # 切分 parts = lines.map(lambda lines: lines.split(",")) # 隐射表间关系(定义表结构) book = parts.map(lambda book: Row(name=book[0], author=book[1], price=float(book[2]), publish=book[3])) # 转换成schema并注册 schemaPeople = sqlContext.createDataFrame(book) schemaPeople.registerTempTable("book") # 定义sqk语句(查询prize在50、60之间的书) book = sqlContext.sql("SELECT * FROM book WHERE price > 50.0 AND price < 60 OR name LIKE '%Spark%'") # 查询结果进行隐射 bookMap = book.map(lambda books: (books.name, books.author, books.price, books.publish)) for book in bookMap.collect(): print "|Name: " + book[0], "|Author: " + book[1], "|Price: " + str(book[2]), "|Publish: " + book[3] + "|" sc.stop()
print "|Time: " + row[0], "|Name: " + row[1], "|Oid: " + row[2], "|Value: " + str(row[3]) + "|" his.registerTempTable("his_data_zadd") #sqlContext.cacheTable("his_data_zadd") #assets = sqlContext.sql("SELECT his.name, his.oid FROM his_data_zadd as his WHERE his.value > 200 AND his.oid < 3000000") #sql_str = "SELECT his.name, his.oid FROM (SELECT MAX(temp_t.value), temp_t.name, temp_t.oid FROM his_data_zadd AS temp_t) his" #sql_str = "SELECT his.name, his.oid, his.value FROM his_data_zadd AS his ORDER BY his.value DESC LIMIT 10" #sql_str = 'SELECT his.name, his.oid FROM his_data_zadd AS his WHERE EXISTS (SELECT MAX(temp_t.value) FROM his_data_zadd AS temp_t)' """ Spark 1.5 does not support subquery. """ #sql_str = 'SELECT MAX(his_t.value) as max_value FROM his_data_zadd AS his_t GROUP BY his_t.name,his_t.oid,his_t.collect_time,his_t.value' sql_str = 'SELECT * FROM his_data_zadd his_t ORDER BY his_t.value DESC LIMIT 1' #sql_str = 'SELECT * FROM (SELECT his_t.value,his_t.name FROM his_data_zadd his_t WHERE his_t.value > 100)' assets = sqlContext.sql(sql_str) # 结果存储parquet格式 #assets.saveAsParquetFile("hdfs://localhost:9000/user/parquet_1") assets.show() # 查询结果进行隐射 # assetMap = assets.map(lambda asset: (asset.name, asset.value)).foreachPartition(print_fun) #assetMap = assets.map(lambda asset: (asset.name, asset.value)) #assetMap = assets.map(lambda asset: (asset.max_value)) assetMap = assets.map(lambda asset: (asset.name, asset.oid, asset.value)) collect_asset = assetMap.collect() print_fun(collect_asset)
sparkConf = (SparkConf()
             .set(key="carbon.insert.batch.size", value="1000")
             .set(key="spark.driver.allowMultipleContexts", value="true")
             .set(key="spark.executor.extraJavaOptions",
                  value="-Dwso2_custom_conf_dir=/home/supun/Downloads/wso2das-3.1.0/repository/conf"))

# Create a new SparkContext using the above SparkConf.
sparkCtx = SparkContext(conf=sparkConf)

# Check the Spark master.
print(sparkConf.get("spark.master"))

###################### Get data from DAS table #####################
sqlCtx = SQLContext(sparkCtx)
sqlCtx.sql('CREATE TEMPORARY TABLE table1 '
           'USING org.wso2.carbon.analytics.spark.core.sources.AnalyticsRelationProvider '
           'OPTIONS (tenantId "-1234", tableName "IRIS_DATA_STREAM")')
df = sqlCtx.sql("SELECT * FROM table1")

##################### Prepare the data #####################
assembler = VectorAssembler(inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
                            outputCol="features")
assembledDf = assembler.transform(df)
assembledDf.show()
transformedDf = assembledDf.select(col("class").alias("label"), col("features"))
df = sqlContext.read.parquet(logFile)
destDF = df.select('logintype', 'logtype', 'hosid', 'suppid', 'logtime', 'usermac', 'gwid') \
    .rdd.map(lambda x: trimf(x))

fields = [
    StructField('logintype', StringType(), True),
    StructField('logtype', StringType(), True),
    StructField('hosid', StringType(), True),
    StructField('suppid', StringType(), True),
    StructField('logtime', LongType(), True),
    StructField('usermac', StringType(), True),
    StructField('gwid', StringType(), True)
]
schema = StructType(fields)
schemaDest = sqlContext.createDataFrame(destDF, schema)
schemaDest.registerTempTable("loginflowlog")

sqlContext.registerFunction("todatestr", lambda x: longTime2str(x), StringType())
sqlContext.registerFunction("trimx", lambda x: trimx(x), StringType())

midDF = sqlContext.sql("select count(1) userlogintimes, count(distinct(usermac)) userlogincount, "
                       "hosid, gwid, todatestr(logtime) day from loginflowlog "
                       "where logtype like '5-%-arrive' and gwid != '' "
                       "group by hosid, gwid, todatestr(logtime)")

hosiddayList = midDF.rdd.map(lambda x: (x[2], x[3], x[4], x[2], x[3], x[4])).collect()
resultList = midDF.rdd.collect()

dao = MysqlDao()
dao.insertMany('INSERT INTO `bblink_data`.`bblink_data_hos_subject` (`hosid`,`gwid`,`day`) '
               'VALUES(%s,%s,%s) ON DUPLICATE KEY UPDATE hosid=%s,gwid=%s,day=%s', hosiddayList)
dao.insertMany("update `bblink_data`.`bblink_data_hos_subject` set userlogintimes=%s, userlogincount=%s "
               "where hosid=%s and gwid=%s and day=%s", resultList)
def main():
    conf = SparkConf().setAppName('Scenario')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'
    sqlContext = SQLContext(sc)

    read_parq = sqlContext.read.parquet(inputs).cache()
    read_parq.registerTempTable('data')

    scenario_1 = sqlContext.sql("""
        SELECT ArtistName, MAX(ArtistFamiliarity) AS MaxFamiliarity
        FROM data
        WHERE ArtistFamiliarity != 'nan' AND ArtistLocation IS NOT NULL
        GROUP BY ArtistName
        ORDER BY MAX(ArtistFamiliarity) DESC LIMIT 100
    """)
    # scenario_1.show()
    scenario_1.registerTempTable('scenario1')

    joined_artist = sqlContext.sql("""
        SELECT A.ArtistName, B.ArtistFamiliarity, B.SongID, B.SongHotttnesss,
               B.ArtistLocation, B.ArtistLatitude, B.ArtistLongitude
        FROM scenario1 A INNER JOIN data B ON A.ArtistName = B.ArtistName
    """)
    # joined_artist.show()
    joined_artist.registerTempTable('joined_artist')

    AvgFamiliarity = sqlContext.sql("""
        SELECT first(ArtistName) AS Artist_Name, MIN(ArtistFamiliarity) AS MinFamiliarity,
               MAX(ArtistFamiliarity) AS MaxFamiliarity, ArtistLocation AS Location,
               first(ArtistLatitude) AS Latitude, first(ArtistLongitude) AS Longitude
        FROM joined_artist
        WHERE ArtistFamiliarity != 'nan' AND ArtistLatitude <> '' AND ArtistLatitude IS NOT NULL
        GROUP BY ArtistLocation
        ORDER BY MAX(ArtistFamiliarity) DESC
    """)
    # AvgFamiliarity.show()
    AvgFamiliarity.rdd.map(tuple).coalesce(1).saveAsTextFile(output + '/scenario1')

    scenario_2 = sqlContext.sql("""
        SELECT ArtistName, MAX(SongHotttnesss) AS MaxHotttness
        FROM data
        WHERE SongHotttnesss != 'nan'
        GROUP BY ArtistName
        ORDER BY MAX(SongHotttnesss) DESC LIMIT 10
    """)
    scenario_2.registerTempTable('scenario2')

    joined_artist_hotness = sqlContext.sql("""
        SELECT B.Title, A.ArtistName, B.SongHotttnesss, B.ArtistFamiliarity
        FROM scenario2 A INNER JOIN data B
        ON A.ArtistName = B.ArtistName AND A.MaxHotttness = B.SongHotttnesss
    """)
    # joined_artist_hotness.show()
    joined_artist_hotness.rdd.map(tuple).coalesce(1).saveAsTextFile(output + '/scenario2')

    # For a particular artist
    scenario_3 = sqlContext.sql("""
        SELECT ArtistName, Year, AVG(ArtistFamiliarity) AS AvgFamiliarity, COUNT(SongID) AS Total_Songs
        FROM data
        WHERE ArtistName = 'Britney Spears' AND Year != 0
        GROUP BY Year, ArtistName
        ORDER BY Year
    """)
    scenario_3.rdd.map(tuple).coalesce(1).saveAsTextFile(output + '/scenario3')

    scenario_4 = sqlContext.sql("""
        SELECT ArtistName, MAX(ArtistFamiliarity) - MIN(ArtistFamiliarity) AS Difference
        FROM data
        GROUP BY ArtistName
    """)
    scenario_4.registerTempTable('scenario4')

    top_diff = sqlContext.sql("""
        SELECT ArtistName, MAX(Difference) AS MAXDiff
        FROM scenario4
        GROUP BY ArtistName
        ORDER BY MAXDiff DESC LIMIT 10
    """)
def main(): conf = SparkConf().setAppName("climate") sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) climateSchema = StructType( [ StructField("station", StringType(), False), StructField("date", IntegerType(), False), StructField("element", StringType(), False), StructField("value", IntegerType(), True), StructField("mflag", StringType(), True), StructField("qflag", StringType(), True), StructField("sflag", StringType(), True), StructField("obstime", StringType(), True), ] ) info = sqlContext.read.format("com.databricks.spark.csv").options(header="false").schema(climateSchema).load(inputs) info.registerTempTable("info") stationinfo = sqlContext.sql("SELECT station, date, element, value, FLOOR(date/10000) as yy FROM info ") stationinfo.registerTempTable("stationinfo") stationinfo.cache() prcpTable = sqlContext.sql("SELECT station, date, value as prcp, yy FROM stationinfo WHERE element='PRCP' ") prcpTable.registerTempTable("prcpTable") prcpTable.cache() # prcpTable.show() # create 3 tables that hold the monthly average of min, max temperature and prcp yearlyprcp = sqlContext.sql( "SELECT station, yy, ROUND(Avg(prcp),0) as avg_prcp FROM prcpTable GROUP BY station, yy " ) yearlyprcp.registerTempTable("prcpMean") # yearlyprcp.show() # get information about stations from stations.txt def getdata(line): line = line.split(" ") values = [x.strip() for x in line] return values stations = sc.textFile(input2) stations = stations.map(getdata) stations = stations.map(lambda (a, b, c): Row(station=a, latitude=float(b), longitude=float(c))).cache() stationDF = sqlContext.createDataFrame(stations) stationDF.registerTempTable("StationTable") stationDF.cache() # param = sqlContext.sql("SELECT MAX(latitude) as max_lat, Min(latitude) as min_lat, MAX(longitude) as max_long, Min(longitude) as min_long FROM StationTable") # param.show() # Join to station file to add latitude and longitude and stationID result = ( stationDF.join(yearlyprcp) .where(stationDF.station == yearlyprcp.station) .select(yearlyprcp.avg_prcp, yearlyprcp.station, yearlyprcp.yy, stationDF.latitude, stationDF.longitude) ) # save into parquet file result.write.format("parquet").save(output)
productsFile.write.mode("overwrite").format("parquet").option("path", "<s3 path>").saveAsTable("staging.products") # Load orders data from S3 into Datafram ordersFile = sqlContext.read.parquet("<s3 path>") ordersFile.registerTempTable("orders") #ordersFile.write.mode("overwrite").saveAsTable("retail.orders") ordersFile.write.mode("overwrite").format("parquet").option("path", "<s3 path>").saveAsTable("staging.orders") # Load orders data from S3 into Datafram customersFile = sqlContext.read.parquet("<s3 path>") customersFile.registerTempTable("customers") #ordersFile.write.mode("overwrite").saveAsTable("retail.orders") customersFile.write.mode("overwrite").format("parquet").option("path", "<s3 path>").saveAsTable("staging.customers") # Join orders and products to get the sales rollup sales_breakup_sql = sqlContext.sql("SELECT sum(orders.price) total_sales, products.sku, products.product_category " " FROM orders join products where orders.sku = products.sku group by products.sku, products.product_category") #products_all = products_sql.map(lambda p: "Counts: {0} Ipsum Comment: {1}".format(p.name, p.comment_col)) sales_breakup_sql.show(n=2) # Write output back to s3 under processed sales_breakup_sql.write.mode('overwrite').format("parquet").option("path", "<s3 path>").saveAsTable("processed.sales")
from pyspark import SQLContext
from pyspark import SparkContext, SparkConf

conf = SparkConf().setAppName("Problem 13")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

employee = sqlContext.read.json("/user/smartlin1/data/employee.json")

# Register a temp table using registerTempTable
employee.registerTempTable("Employee")

# Query and print the data
employeeInfo = sqlContext.sql("SELECT * FROM Employee")
for row in employeeInfo.collect():
    print(row)
# Set the configuration for the SparkConf object
conf = (SparkConf()
        .setMaster("local[4]")
        .setAppName("convert.py")
        .set("spark.executor.memory", "1g"))

# Create the SparkContext
sc = SparkContext(conf=conf)

# Create the sqlContext that will be used
sqlContext = SQLContext(sc)

# Read the Parquet file
# Change this line to be the directory where the Parquet file exists
parquetFile = sqlContext.read.parquet('data/test2')
parquetFile.registerTempTable("parquetFile")

# Queries are made from base + command:
# base SELECTs the elements you are interested in, up to the WHERE clause;
# command is the predicate of the query you make.
base = "SELECT * FROM parquetFile WHERE"
command = ' ip_len >= 1500'

test = sqlContext.sql(base + command)
test.show()
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

df_wordgame = sqlContext.read.format("csv").option("header", "true").csv('wordgame.csv')
df_wordgame.registerTempTable("wordgame")
print("-----------------------------wordgame---------------------------------")
df_wordgame.show()

# Remove duplicates
after_duplicate = sqlContext.sql(
    'select author, word1 as id, word2, source, sourceID from wordgame '
    'group by author, word1, word2, source, sourceID limit 1000'
)
after_duplicate.registerTempTable("newwordgame")

vertices = sqlContext.sql(
    'select author, id, word2, source, sourceID from newwordgame '
    'group by author, id, word2, source, sourceID'
)
edges = sqlContext.sql(
    'select id as src, word2 as dst, source as source from newwordgame '
    'group by id, word2, source'
)

# Import GraphFrames
from graphframes import *

graph = GraphFrame(vertices, edges)
pageRank = graph.pageRank(resetProbability=0.15, maxIter=1)
print("---------------------------vertices----------------------")
root
 |-- lpep_pickup_datetime: string (nullable = true)
 |-- Pickup_longitude: double (nullable = true)
 |-- Pickup_latitude: double (nullable = true)
 |-- Dropoff_longitude: double (nullable = true)
 |-- Dropoff_latitude: double (nullable = true)
 |-- Passenger_count: integer (nullable = true)
 |-- Total_amount: double (nullable = true)
 |-- Trip_type: integer (nullable = true)
"""

tripsData.registerTempTable("tripsData")

# The trips whose pickup time is between 2am and 6am
GIS_Points = sqlContext.sql("select lpep_pickup_datetime, Pickup_longitude, Pickup_latitude from tripsData "
                            "where lpep_pickup_datetime like '%2015-__-__ 02:%' "
                            "or lpep_pickup_datetime like '%2015-__-__ 03:%' "
                            "or lpep_pickup_datetime like '%2015-__-__ 04:%' "
                            "or lpep_pickup_datetime like '%2015-__-__ 05:%' "
                            ).cache()
sqlContext.dropTempTable("tripsData")

# This is a list
shapeRecs = nyc_shapefile.shapeRecords()


def point_in_poly(x, y, poly):
    # Ray-casting test: does the point (x, y) fall inside the polygon?
    n = len(poly)
    inside = False
    p1x, p1y = poly[0]
    for i in range(n + 1):
        p2x, p2y = poly[i % n]
if __name__ == "__main__": json_path = os.path.abspath("../doc/book.json") txt_path = os.path.abspath("../doc/book.txt") conf = SparkConf().setAppName("mul_sources").setMaster("local[2]") sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) # json读取并隐射 json_df = sqlContext.read.json(json_path) json_df.registerTempTable("json_book") # txt读取并隐射 lines = sc.textFile(txt_path) parts = lines.map(lambda lines: lines.split(",")) book = parts.map(lambda book: Row(name=book[0], author=book[1], price=float(book[2]), publish=book[3])) schemaPeople = sqlContext.createDataFrame(book) schemaPeople.registerTempTable("txt_book") # sql_book = sqlContext.sql("SELECT * FROM json_book AS jbook LEFT JOIN txt_book AS tbook ON tbook.name=jbook.name") sql_book = sqlContext.sql("SELECT * FROM json_book AS jbook , txt_book AS tbook " "WHERE jbook.name=tbook.name ORDER BY tbook.price") bookMap = sql_book.map(lambda books: (books.name, books.author, books.price, books.publish, books.place)) for book in bookMap.collect(): print book[0], book[1], book[2], book[3], book[4] sc.stop()
spark_home = '/opt/cloud/spark'
os.environ['SPARK_HOME'] = spark_home

conf = (SparkConf()
        .setMaster(master)
        .setAppName(appName)
        .set("spark.sql.parquet.binaryAsString", "true"))
sc = SparkContext(conf=conf)
sql_context = SQLContext(sc)
sql_context.registerFunction("to_mac", lambda x: normal_mac(x), StringType())

parquet_df = sql_context.read.parquet(input)
sql_context.registerDataFrameAsTable(parquet_df, "loginflowlog")

# _sql = "select to_mac(upper(usermac)),count(distinct dat) days from loginflowlog group by to_mac(upper(usermac))"
_sql = "select to_mac(upper(usermac)),count(distinct logtime) days from loginflowlog group by to_mac(upper(usermac))"
rs_df = sql_context.sql(_sql)
rs = rs_df.collect()
logger.info("---->" + str(len(rs)))

lists = []
for r in rs:
    usermac = r[0]
    days = r[1]
    t = (usermac, days)
    lists.append(t)
    # logger.debug(t)

dao = MysqlDao()
_sql = "TRUNCATE TABLE user_days"
dao.insert(_sql)
# Spark SQL - DataFrame API

#### # 3. Filtering (10 points) Count the number of appearances of the word 'ATTRIBUTE' ####

schema = StructType([StructField("word", StringType(), True),
                     StructField("count1", IntegerType(), True),
                     StructField("count2", IntegerType(), True),
                     StructField("count3", IntegerType(), True)])
df = sqlContext.read.csv('gbooks', schema=schema, sep='\t')

# df.select("word", df.word.like("ATTRIBUTE")).show(10)
# df.word.like("ATTRIBUTE").show(10)

df.createOrReplaceTempView("tableTemp")
sqlContext.sql("SELECT count(*) from tableTemp where word = 'ATTRIBUTE'").show()

# Spark SQL
# +--------+
# |count(1)|
# +--------+
# |     201|
# +--------+
from pyspark import SparkContext, SparkConf
from pyspark import SQLContext
import itertools


def print_fun(collect):
    for item in itertools.chain(collect):
        print("|Name: " + item[0], "|Value: " + str(item[1]), "|Attribute: " + item[2])


if __name__ == "__main__":
    conf = SparkConf().setAppName("json_ops").setMaster("local[3]")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Parallelize the JSON strings directly into an RDD
    equipmentRDD = sc.parallelize(['{"name":"asset1","value":1,"attribute":"属性1"}',
                                   '{"name":"asset2","value":2,"attribute":"属性2"}',
                                   '{"name":"asset3","value":3,"attribute":"属性3"}'])
    equipment = sqlContext.jsonRDD(equipmentRDD)
    equipment.registerTempTable("equipment")

    assets = sqlContext.sql("SELECT * FROM equipment as eq WHERE eq.value >= 1 AND eq.value <= 2")
    assets.show()

    # Map over the query results
    assetMap = assets.rdd.map(lambda asset: (asset.name, asset.value, asset.attribute)).foreachPartition(print_fun)
    sc.stop()
# ## Filtering
#
# ### Select structures that either have Ki or Kd value(s) and are protein-serine/threonine kinases (EC 2.7.11.*)
#
# #### A. By using dataset operations

# In[5]:

ds = ds.filter("(Ki IS NOT NULL OR Kd IS NOT NULL) AND ecNo LIKE '2.7.11.%'")
ds.show(10)

# #### B. By creating a temporary query and running SQL

# In[6]:

ds.createOrReplaceTempView("table")
ds = sqlContext.sql(
    "SELECT * from table WHERE (Ki IS NOT NULL OR Kd IS NOT NULL) AND ecNo LIKE '2.7.11.%'"
)
ds.show(10)

# ## Terminate Spark

# In[7]:

sc.stop()
conf = SparkConf().setAppName('bag_words')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

with open('clusterFinal.pickle', 'rb') as f:
    cluster = pickle.load(f)

schema = StructType([
    StructField('reviewText', StringType(), False),
    StructField('overall', FloatType(), False),
    StructField('reviewTime', StringType(), False),
])
df = sqlContext.read.json(input, schema=schema)
df.registerTempTable('review_table')

sd = sqlContext.sql("""SELECT reviewText FROM review_table""")
fin = sd.rdd.map(lambda x: str(x.reviewText)).map(clean_words)
sparse_vectors = fin.map(get_sparseVector)

time = sqlContext.sql("""SELECT reviewTime FROM review_table""")
time_split = time.rdd.map(lambda x: str(x.reviewTime)).map(lambda line: line.split(', '))
year_list = time_split.map(lambda pair: pair[1]).collect()

score = sqlContext.sql("""SELECT overall FROM review_table""")
score_list = score.rdd.map(lambda x: str(x.overall)).collect()

sparse_list = sparse_vectors.collect()
zip_list = zip(sparse_list, year_list, score_list)
from pyspark import SQLContext
from pyspark import SparkContext

sc = SparkContext()
sqlContext = SQLContext(sc)
df = sqlContext.read.format('com.databricks.spark.csv').options(
    header='true', inferschema='true').load(
    '/Users/mohiulalamprince/work/python/cricstat/player-info-[0-9]*')
df.registerTempTable('players')
sqlContext.sql("select count(distinct Player) from players").show()
print("------- ******* Task 1 ******* -------") columns = ['id', 'price', 'date', 'zip', 'type', 'new', 'duration', 'PAON', 'SAON', 'street', 'locality', 'town', 'district', 'county', 'ppd', 'status'] rdd = sc.textFile(datadir + "prop-prices.csv") df = pyspark_csv.csvToDataFrame(sqlContext, rdd, columns=columns) df.registerTempTable("properties") df.persist() # Task 2: let's do some basic analysis on the data. # Find how many records we have per year, and print them out sorted by year. print("------- ******* Task 2 ******* -------") year_count = sqlContext.sql( """select year(date) as year, count(*) as count from properties group by year(date) order by year(date)""").collect() print(year_count) # Task 3: Everyone knows that properties in London are expensive. # Find the average property price by county, # and print the top 10 most expensive counties print("------- ******* Task 3 ******* -------") county_prices = sqlContext.sql( """select county, avg(price) as avg_price from properties group by county order by avg(price) desc limit 10""").collect() print(county_prices)
from pyspark import SparkContext
from pyspark import SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

if __name__ == '__main__':
    sc = SparkContext.getOrCreate()
    sqlContext = SQLContext(sc)
    spark = SparkSession.builder.getOrCreate()

    df = spark.read.csv(
        'linkToS3Bucket',
        inferSchema=True,
        header=True,
    )
    cleaningdf = df.dropna(thresh=3)
    cleaningdf = cleaningdf.fillna(0)
    cleaningdf.registerTempTable('df_table')
    sqlContext.sql('select * from df_table').show()
    cleaningdf.write.csv('linkToS3Bucket')
    batch_no user_type supp_id
'''
user_login = parts.map(lambda p: (p[1].strip(), p[2].strip(), p[17].strip(), p[3].strip(), p[16].strip(),
                                  p[4].strip(), p[5].strip(), p[6].strip(), p[7].strip(), p[8].strip(),
                                  p[9].strip(), p[10].strip(), p[11].strip(), p[12].strip(), p[13].strip(),
                                  p[14].strip(), p[15].strip()))

schema_string = "id gw_id supp_id user_id user_type " \
                "user_name login_time logout_time mac ip " \
                "user_agent download_flow upload_flow os browser " \
                "ratio batch_no"
fields = [StructField(field_name, StringType(), True)
          for field_name in schema_string.split(' ')]
schema = StructType(fields)
df = sql_context.createDataFrame(user_login, schema)
df.registerTempTable("tb_user_login_info")

# _sql = "select distinct mac,gw_id,'%s' as day,'1' as flag from tb_user_login_info" % date
_sql = "select distinct user_name,gw_id,'%s' as day,'1' as flag from tb_user_login_info" % date
rs = sql_context.sql(_sql)
re_rdd = rs.rdd.map(lambda r: (r.user_name + "_" + r.gw_id, r.day + sep + r.flag)) \
    .reduceByKey(lambda a, b: a, 1)

lines = []
for t in re_rdd.collect():
    line = t[0] + sep + t[1]
    lines.append(line)
BaseService()._write_file(lines, output)
sc.stop()
# callDf.toPandas().to_csv('time.csv', index=False)

# Convert string to timestamp
# Method 1
# callDf1 = callDf.withColumn("start_time_new", callDf['start_time'].cast(TimestampType()))

# Convert string to timestamp
# Method 2
# callDf_time = callDf.select('*', to_timestamp('start_time', 'HH:mm:ss').cast(TimestampType()).alias('new_start_time'))
# callDf_time = callDf_time.select('*', to_timestamp('end_time', 'HH:mm:ss').cast(TimestampType()).alias('new_end_time'))
# callDf_time.show()

# Average number of calls per day
callNumber = sparksql.sql(
    "SELECT calling_nbr, round(COUNT(Call.calling_nbr)/29, 2) AS num FROM Call GROUP BY calling_nbr"
)
callNumber.toPandas().to_csv('../myresult/callNumber.csv', index=False)

# Share of each operator under each call type
# Local calls
local = sparksql.sql(
    "SELECT called_optr, round(COUNT(Call.called_nbr) * 1.000/(select COUNT(Call.called_nbr) from Call WHERE call_type == 1), 3) AS num FROM Call WHERE call_type == 1 GROUP BY called_optr"
)
local.toPandas().to_csv('../myresult/local.csv', index=False)

# Long-distance calls
longDistance = sparksql.sql(
    "SELECT called_optr, round(COUNT(Call.called_nbr) * 1.000/(select COUNT(Call.called_nbr) from Call WHERE call_type == 2), 3) AS num FROM Call WHERE call_type == 2 GROUP BY called_optr"
)
longDistance.toPandas().to_csv('../myresult/longDistance.csv', index=False)
    {'uid': '3', 'adid': 'c', 'guuid': 'aa', 'guuidctime': 1, 'url': '', 'referer': '', 'hosid': '132', 'gwid': '', 'ua': '', 'ip': '', 'createtime': 1450823550766},
    {'uid': '4', 'adid': 'd', 'guuid': 'bb', 'guuidctime': 1, 'url': '', 'referer': '', 'hosid': '133', 'gwid': '', 'ua': '', 'ip': '', 'createtime': 1450823268766},
]).registerAsTable("adload")

_adPlayDF = sqlContext.createDataFrame([
    {'uid': '1', 'adid': 'a', 'guuid': 'aa', 'createtime': 1450823568766},
    {'uid': '2', 'adid': 'b', 'guuid': 'aa', 'createtime': 1450823569766},
    {'uid': '4', 'adid': 'd', 'guuid': 'bb', 'createtime': 1450823268766},
]).registerAsTable("adplay")

_adClickDF = sqlContext.createDataFrame([
    {'uid': '1', 'adid': 'a', 'guuid': 'aa', 'createtime': 1450823580766},
]).registerAsTable("adclick")
'''

sqlContext.registerFunction("dateformat", lambda x: longTime2str(x), StringType())

adLoadDf = sqlContext.sql('select hosid,dateformat(createtime) day,adid,count(guuid) pv,count(distinct guuid) uv '
                          'from adload where createtime is not null and dateformat(createtime)=%s '
                          'group by adid,hosid,dateformat(createtime)' % (lastdate)).registerAsTable("radload")
adPlayDf = sqlContext.sql('select gh.hosid,dateformat(ap.createtime) day,adid,count(ap.guuid) pv,count(distinct ap.guuid) uv '
                          'from adplay ap left join ghid gh on ap.guuid=gh.guuid where dateformat(ap.createtime)=%s '
                          'group by ap.adid,gh.hosid,dateformat(ap.createtime)' % (lastdate)).registerAsTable("radplay")
# sqlContext.sql('select sum(pv) from radplay').foreach(printx)
adClick = sqlContext.sql('select gh.hosid,dateformat(ac.createtime) day,ac.adid,count(ac.guuid) pv,count(distinct ac.guuid) uv '
                         'from adclick ac left join ghid gh on ac.guuid=gh.guuid where dateformat(ac.createtime)=%s '
                         'group by ac.adid,gh.hosid,dateformat(ac.createtime)' % (lastdate)).registerAsTable("radclick")

_df = sqlContext.sql('select A.hosid,A.day,A.adid,A.pv,A.uv,B.pv,B.uv,C.pv,C.uv from radload A '
                     'left join radplay B on A.hosid=B.hosid and A.adid=B.adid '
payload['data'] = []
if len(pub_data[key]['data']) > batch_size:
    total_len = len(pub_data[key]['data'])
    # Send the data in batch_size chunks; Python slicing clamps the final
    # slice to the end of the list, so no bounds check is needed.
    for index in range(0, total_len, batch_size):
        payload['data'] = pub_data[key]['data'][index:index + batch_size]
        pushToRedis(payload)
else:
    payload['data'] = pub_data[key]['data']
    pushToRedis(payload)

# Create publisher-related data from the daily data
pub_dfp_df = sqlContext.sql('select publisher_id,id5 as user_identifier, collect_list(device_finger_print) as dfps , collect_list(c_dfp) as dfp_counts from (select publisher_id ,id5, device_finger_print ,count(device_finger_print) as c_dfp from testtable where id5!="" group by publisher_id,id5,device_finger_print) group by publisher_id,id5')
pub_ip_df = sqlContext.sql('select publisher_id,id5 as user_identifier, collect_list(ip) as ips , collect_list(c_ip) as ip_counts from (select publisher_id ,id5, ip ,count(ip) as c_ip from testtable where id5!="" group by publisher_id,id5,ip) group by publisher_id,id5')
pub_session_df = sqlContext.sql('select publisher_id,id5 as user_identifier, collect_list(sessionId) as sessions , collect_list(c_session) as session_counts from (select publisher_id ,id5, sessionId ,count(sessionId) as c_session from testtable where id5!="" group by publisher_id,id5,sessionId) group by publisher_id,id5')

pub_df = pub_dfp_df.join(pub_ip_df, ['publisher_id', 'user_identifier'], 'outer')
pub_df = pub_df.join(pub_session_df, ['publisher_id', 'user_identifier'], 'outer')

pub_daily_df = sqlContext.read.json(daily_fetch_path)
if len(pub_daily_df.columns) == 0:
    # The daily file was empty: give it the same columns as pub_df so the union works
    for dtype in pub_df.dtypes:
        pub_daily_df = pub_daily_df.withColumn(dtype[0], lit(None).cast(dtype[1]))
pub_daily_df = pub_daily_df.select(pub_df.columns)
pub_df = pub_df.union(pub_daily_df)
import os
import sys

from pyspark import SQLContext
from pyspark import SparkContext

# os.environ["SPARK_HOME"] = "/opt/spark-1.6.1-bin-hadoop2.6"
# os.environ["HADOOP_HOME"] = "/opt/hadoop"
# os.environ["HADOOP_PREFIX"] = "/opt/hadoop"
# os.environ["HIVE_HOME"] = "/opt/hive"

sc = SparkContext('local[1]')
sql_context = SQLContext(sc)
sql_context.setConf("spark.sql.shuffle.partitions", "1")
sql_context.sql("""
    use fex_test
""")
import datetime

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext


class DataHandler:
    def __init__(self):
        self.conf = (SparkConf()
                     .setAppName("BandCard")
                     .set("spark.cores.max", "2")
                     .set('spark.executor.extraClassPath', '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar'))
        self.sc = SparkContext(conf=self.conf)
        self.sqlctx = SQLContext(self.sc)
        # project-local helper (a sketch follows this class)
        self.mysql_helper = MySQLHelper('core', host='10.9.29.212')

    def load_from_mysql(self, table, database='core'):
        url = "jdbc:mysql://10.9.29.212:3306/%s?user=root&characterEncoding=UTF-8" % database
        df = self.sqlctx.read.format("jdbc").options(url=url, dbtable=table, driver="com.mysql.jdbc.Driver").load()
        return df

    def prepare_life_cycle(self, year, season):
        '''
        Prepare the life-cycle data. Daily AUM figures come from t_CMMS_ASSLIB_ASSET.

        Prepared columns:
            saum1 (last season's summed AUM)
            saum2 (current season's summed AUM)
            aum_now
            account_age (months)
            last_tr_date (days since last transaction)

        :param year:
        :param season: 1,2,3,4
        :return:
        '''
        # work out the months of each season
        print('----------------------Life Cycle - Start----------------------')
        print('Preparing life-cycle data...')
        print('Computing season months')
        if season == 1:
            # date1: months of the current season
            date1 = [str(year) + '-01', str(year) + '-02', str(year) + '-03']
            # date2: months of the previous season
            date2 = [str(year - 1) + '-10', str(year - 1) + '-11', str(year - 1) + '-12']
        elif season == 4:
            date1 = [str(year) + '-10', str(year) + '-11', str(year) + '-12']
            # zero-padded '-09' so the string matches STAT_DAT's 'YYYY-MM' format
            date2 = [str(year) + '-07', str(year) + '-08', str(year) + '-09']
        else:
            date1 = [str(year) + '-0' + str(3 * season - 2), str(year) + '-0' + str(3 * season - 1),
                     str(year) + '-0' + str(3 * season)]
            date2 = [str(year) + '-0' + str(3 * season - 5), str(year) + '-0' + str(3 * season - 4),
                     str(year) + '-0' + str(3 * season - 3)]

        print('current season months (new):', date1)
        print('previous season months (old):', date2)

        # load the AUM table
        aum = self.load_from_mysql('t_CMMS_ASSLIB_ASSET_c').cache()

        # stitch together the three monthly slices of each season
        season_new = aum.filter(aum.STAT_DAT == date1[0]).unionAll(aum.filter(aum.STAT_DAT == date1[1])).unionAll(
            aum.filter(aum.STAT_DAT == date1[2]))
        season_old = aum.filter(aum.STAT_DAT == date2[0]).unionAll(aum.filter(aum.STAT_DAT == date2[1])).unionAll(
            aum.filter(aum.STAT_DAT == date2[2]))

        # sum the AUM of each season
        aum_season_old = season_old.select('CUST_NO', season_old.AUM.alias('AUM1')).groupBy('CUST_NO').sum('AUM1')
        aum_season_new = season_new.select('CUST_NO', season_new.AUM.alias('AUM2')).groupBy('CUST_NO').sum('AUM2')

        # outer-join the two seasons
        '''
        +-----------+---------+---------+
        |    CUST_NO|sum(AUM2)|sum(AUM1)|
        +-----------+---------+---------+
        |81005329523|     null|294844.59|
        |81011793167|     null|   365.20|
        |81015319088|     null|  9640.96|
        +-----------+---------+---------+
        '''
        union_season = aum_season_old.join(aum_season_new, 'CUST_NO', 'outer')

        # current AUM: pick the most recent STAT_DAT per customer
        temp_result = aum.select('CUST_NO', 'AUM', 'STAT_DAT').groupBy('CUST_NO', 'STAT_DAT').sum('AUM').sort(
            'CUST_NO').sort(aum.STAT_DAT.desc())
        temp_result.select('CUST_NO', temp_result['sum(AUM)'].alias('AUM'), 'STAT_DAT').registerTempTable('group_in')
        aum_now_sql = "select CUST_NO, first(AUM) as AUM_NOW from group_in group by CUST_NO"
        aum_now = self.sqlctx.sql(aum_now_sql)
        # drop the temp table
        self.sqlctx.dropTempTable('group_in')

        # join in the current AUM
        union_season_aumnow = union_season.join(aum_now, 'CUST_NO', 'outer')

        # account age in months since the account was opened
        account = self.load_from_mysql('t_CMMS_ACCOUNT_LIST').cache()
        account.select('CUST_NO', 'OPEN_DAT').registerTempTable('account')
        account_age_sql = "select CUST_NO, first(ACCOUNT_AGE) as ACCOUNT_AGE from " \
                          "(select CUST_NO, round(datediff(now(), OPEN_DAT) / 30) as ACCOUNT_AGE " \
                          "from account order by CUST_NO, ACCOUNT_AGE desc) as t group by CUST_NO"
        account_age = self.sqlctx.sql(account_age_sql)

        # calculate last transaction date
        account_1 = account.select('CUST_NO', 'ACC_NO15')
        detail = self.load_from_mysql('t_CMMS_ACCOUNT_DETAIL').select('ACC_NO15', 'TRAN_DAT')
        a_d = account_1.join(detail, 'ACC_NO15', 'outer')
        a_d.filter(a_d.CUST_NO != '').registerTempTable('adtable')
        last_tr_date_sql = "select CUST_NO, first(TRAN_DAT) as LAST_TR_DATE from " \
                           "(select CUST_NO, TRAN_DAT from adtable order by TRAN_DAT desc) as t group by CUST_NO"
        last_tr_date = self.sqlctx.sql(last_tr_date_sql)

        # join season sums, aum_now, account_age and last_tr_date
        unions = union_season_aumnow.join(account_age, 'CUST_NO', 'outer').join(last_tr_date, 'CUST_NO', 'outer')

        # drop temp tables and cached data
        self.sqlctx.dropTempTable('account')
        self.sqlctx.dropTempTable('adtable')
        self.sqlctx.clearCache()

        # insert the result into the staging table
        print('Inserting results into staging table t_CMMS_TEMP_LIFECYCLE...')
        insert_lifecycle_sql = "replace into t_CMMS_TEMP_LIFECYCLE(CUST_NO,SAUM1,SAUM2,INCREASE,ACCOUNT_AGE,AUM_NOW,LAST_TR_DATE) values(%s,%s,%s,%s,%s,%s,%s)"

        # write buffer
        temp = []
        for row in unions.collect():
            row_dic = row.asDict()

            if len(temp) >= 1000:
                # flush the buffer to the database in batches
                self.mysql_helper.executemany(insert_lifecycle_sql, temp)
                temp.clear()

            # growth rate, expressed as a percentage so it matches the
            # +/-20 thresholds used later in calculate_life_cycle
            try:
                increase = (row_dic['sum(AUM2)'] - row_dic['sum(AUM1)']) / row_dic['sum(AUM1)'] * 100
            except Exception:
                increase = 0

            # account age in months; treat missing values as "older than 6 months"
            if row_dic['ACCOUNT_AGE'] is None:
                row_dic['ACCOUNT_AGE'] = 7

            # last transaction date
            ltd = row_dic['LAST_TR_DATE']
            if ltd is not None:
                try:
                    ltd = datetime.datetime.strptime(ltd, '%Y-%m-%d')
                except Exception:
                    # fall back to the compact 'YYYYMMDD' form
                    ltd = ltd[:4] + '-' + ltd[4:6] + '-' + ltd[6:]
                    ltd = datetime.datetime.strptime(ltd, '%Y-%m-%d')
                days = (datetime.datetime.now() - ltd).days
            else:
                days = 366

            temp.append((row_dic['CUST_NO'], row_dic['sum(AUM1)'], row_dic['sum(AUM2)'], increase,
                         row_dic['ACCOUNT_AGE'], row_dic['AUM_NOW'], days))

        # flush whatever is left in the buffer
        if len(temp) != 0:
            self.mysql_helper.executemany(insert_lifecycle_sql, temp)
            temp.clear()

    def calculate_life_cycle(self):
        '''
        Derive the life-cycle stage from the AUM changes.
        :return:
        '''
        print('Calculating life-cycle stages...')
        life_cycle = self.load_from_mysql('t_CMMS_TEMP_LIFECYCLE').cache()

        def clcmap(line):
            cust_no = line['CUST_NO']
            account_age = line['ACCOUNT_AGE']
            last_tr_date = line['LAST_TR_DATE']
            aum_now = line['AUM_NOW']
            increase = line['INCREASE']

            period = 0
            if aum_now is None:
                period = 9  # unknown
            elif aum_now < 1000 and last_tr_date > 365:
                period = 3  # churned
            else:
                if increase > 20 or account_age < 6:
                    period = 0  # growth
                elif -20 <= increase <= 20:
                    period = 1  # mature
                else:
                    period = 2  # stable

            return period, cust_no

        map_result = life_cycle.rdd.map(clcmap).collect()
        # clear the life_cycle cache
        self.sqlctx.clearCache()

        temp = []
        print('Updating staging table t_CMMS_TEMP_LIFECYCLE...')
        update_life_period_sql = "update t_CMMS_TEMP_LIFECYCLE set PERIOD = %s where CUST_NO = %s"
        for row in map_result:
            if len(temp) >= 1000:
                self.mysql_helper.executemany(update_life_period_sql, temp)
                temp.clear()
            temp.append(row)

        # flush the remaining rows
        if len(temp) != 0:
            self.mysql_helper.executemany(update_life_period_sql, temp)
            temp.clear()

    def lifecycle_to_real_table(self, year, season):
        '''
        Copy the life-cycle data from the staging table into the real table.
        :return:
        '''
        print('Writing life-cycle data into the real table...')
        life_cycle = self.load_from_mysql('t_CMMS_TEMP_LIFECYCLE').select('CUST_NO', 'PERIOD')
        cust_info = self.load_from_mysql('t_CMMS_INFO_CUSTOMER').select('CUST_NO', 'CUST_ID', 'CUST_NAM')
        union = life_cycle.join(cust_info, 'CUST_NO', 'left_outer').cache()

        temp = []
        sql = "replace into t_CMMS_ANALYSE_LIFE(CUST_NO,CUST_ID,CUST_NM,LIFE_CYC,QUARTER,UPDATE_TIME) values(%s,%s,%s,%s,%s,now())"
        quarter = str(year) + '-' + str(season)

        for row in union.collect():
            if len(temp) >= 1000:
                self.mysql_helper.executemany(sql, temp)
                temp.clear()
            cust_id = row['CUST_ID'] if row['CUST_ID'] is not None else '0'
            temp.append((row['CUST_NO'], cust_id, row['CUST_NAM'], row['PERIOD'], quarter))

        # flush the remaining rows
        if len(temp) != 0:
            self.mysql_helper.executemany(sql, temp)
            temp.clear()

        self.sqlctx.clearCache()

    def run_life_cycle(self, year, season):
        '''
        Run the complete life-cycle flow:
        1 prepare the life-cycle data, i.e. AUM and how much it changed
        2 derive the life-cycle stage from the change
        3 move the data from the staging table to the real table
        :param year:
        :param season:
        :return:
        '''
        self.prepare_life_cycle(year, season)
        self.calculate_life_cycle()
        self.lifecycle_to_real_table(year, season)

    # ------------------------------------------------------------ end of life cycle ------------------------------------------------------------ #

    def customer_value(self, year, half_year):
        '''
        Calculate customer value.
        :param year: which year to calculate
        :param half_year: 0 for months 1-6, 1 for months 7-12
        :return:
        '''
        print('---------------------------Customer Value - Start--------------------------')
        cust_info = self.load_from_mysql('t_CMMS_INFO_CUSTOMER').select('CUST_NO', 'CUST_ID', 'CUST_NAM').cache()
        aum = self.load_from_mysql('t_CMMS_ASSLIB_ASSET_c').select('CUST_NO', 'STAT_DAT', 'AUM', 'ASS_TYPE').cache()

        base = half_year * 6
        aum_slot_filter = None
        for i in range(1, 7):
            i = base + i
            if i < 10:
                i = '0' + str(i)
            else:
                i = str(i)
            slot = str(year) + '-' + i

            slot_filter = aum.filter(aum.STAT_DAT == slot)
            if aum_slot_filter is None:
                aum_slot_filter = slot_filter
            else:
                aum_slot_filter = aum_slot_filter.unionAll(slot_filter)

        # demand deposits (ASS_TYPE '1') at full weight,
        # time deposits (ASS_TYPE '2') weighted by 0.8
        huoqi_aum = aum_slot_filter.select('CUST_NO', 'ASS_TYPE', aum_slot_filter['AUM'].alias('AUM_HQ')).filter(
            aum_slot_filter.ASS_TYPE == '1').groupBy('CUST_NO').sum('AUM_HQ')
        dingqi_aum = aum_slot_filter.select('CUST_NO', 'ASS_TYPE', (aum_slot_filter.AUM * 0.8).alias('AUM_DQ')).filter(
            aum_slot_filter.ASS_TYPE == '2').groupBy('CUST_NO').sum('AUM_DQ')

        # both deposit types aggregated: sum(AUM_HQ), sum(AUM_DQ)
        j = huoqi_aum.join(dingqi_aum, 'CUST_NO', 'outer')
        # j.show()

        # wipe out the old data
        self.mysql_helper.execute('truncate core.t_CMMS_ANALYSE_VALUE')

        # join in the customer info
        all_col = j.join(cust_info, 'CUST_NO', 'outer')
        print(j.count(), cust_info.count())
        # all_col.show()

        # map a customer value to a rank
        def calculate_rank(value):
            if value < 1000:
                return 0
            elif value < 10000:
                return 1
            elif value < 100000:
                return 2
            elif value < 500000:
                return 3
            elif value < 2000000:
                return 4
            elif value < 5000000:
                return 5
            else:
                return 6

        temp = []
        print('Replacing the data into the real table...')
        update_value_sql = "replace into t_CMMS_ANALYSE_VALUE(CUST_ID,CUST_NO,CUST_NM,CUST_VALUE,CUST_RANK,SLOT,UPDATE_TIME) values(%s,%s,%s,%s,%s,%s,now())"

        for row in all_col.collect():
            if len(temp) >= 1000:
                self.mysql_helper.executemany(update_value_sql, temp)
                temp.clear()

            val_dq = row['sum(AUM_DQ)'] if row['sum(AUM_DQ)'] is not None else 0
            val_hq = row['sum(AUM_HQ)'] if row['sum(AUM_HQ)'] is not None else 0

            cust_val = float(val_dq) + float(val_hq)
            cust_rank = calculate_rank(cust_val)
            slot = str(year) + '-' + str(half_year)
            cust_id = row['CUST_ID'] if row['CUST_ID'] is not None else 1

            temp.append((cust_id, row['CUST_NO'], row['CUST_NAM'], cust_val, cust_rank, slot))

        # flush the remaining rows
        if len(temp) != 0:
            self.mysql_helper.executemany(update_value_sql, temp)
            temp.clear()

    def aum_total(self):
        '''
        Aggregate total AUM; data for t_CMMS_ASSLIB_ASSTOT.
        :return:
        '''
        print('---------------------------Total Assets - Start--------------------------')
        # TODO switch t_CMMS_ASSLIB_ASSET_c to the real table t_CMMS_ASSLIB_ASSET
        df_asset = self.load_from_mysql('t_CMMS_ASSLIB_ASSET_c').select('CUST_NO', 'CUST_ID', 'STAT_DAT', 'AUM',
                                                                        'CUR', 'ACC_NAM').cache()
        # print(df_asset.count(), df_asset.columns)

        other_col = df_asset.select('CUST_NO', 'CUST_ID', 'CUR', 'ACC_NAM').distinct()
        # print(other_col.count(), other_col.columns)

        aum = df_asset.select('CUST_NO', 'STAT_DAT', 'AUM')
        # print(aum.count(), aum.columns)

        # latest AUM snapshot per customer
        aum = aum.select('CUST_NO', 'STAT_DAT', 'AUM').groupBy(['CUST_NO', 'STAT_DAT']).sum('AUM').sort(
            ['CUST_NO', aum.STAT_DAT.desc()]) \
            .groupBy('CUST_NO').agg({'sum(AUM)': 'first', 'STAT_DAT': 'first'})
        # print(aum.count(), aum.columns)

        total = aum.select('CUST_NO', aum['first(sum(AUM))'].alias('AUM'), aum['first(STAT_DAT)'].alias('STAT_DAT')). \
            join(other_col, 'CUST_NO', 'left_outer').distinct()

        # total.filter(total.STAT_DAT == '2016-06-').show()

        # prepare params
        def list_map(line):
            return line['CUST_ID'], line['CUST_NO'], line['ACC_NAM'], line['STAT_DAT'], line['CUR'], line['AUM']

        df = total.rdd.map(list_map)

        # clear old data
        self.mysql_helper.execute('truncate t_CMMS_ASSLIB_ASSTOT')

        sql = "insert into t_CMMS_ASSLIB_ASSTOT(CUST_ID,CUST_NO,ACC_NAM,STAT_DAT,CUR,AUM) values(%s,%s,%s,%s,%s,%s)"

        # execute sql
        self.mysql_helper.batch_operate(sql, df, 100)

    def debt_total(self):
        '''
        Prepare data for total debt.
        :return:
        '''
        print('---------------------------Total Debt - Start--------------------------')
        df_debt = self.load_from_mysql('t_CMMS_ASSLIB_DEBT').select('LOAN_ACC', 'CUST_NO', 'CUST_ID', 'CUST_NAM',
                                                                    'BAL_AMT', 'GRANT_AMT', 'CUR')
        df_debt = df_debt.filter(df_debt.LOAN_ACC != '')

        df_sum = df_debt.groupBy('CUST_NO').sum('GRANT_AMT', 'BAL_AMT')
        df_other = df_debt.groupBy('CUST_NO').agg({'CUST_ID': 'first', 'CUST_NAM': 'first', 'CUR': 'first'})

        df_total = df_sum.join(df_other, 'CUST_NO', 'left_outer').distinct()

        stat_dat = datetime.datetime.now().strftime('%Y%m%d')

        def m(line):
            return line['CUST_NO'], line['first(CUST_ID)'], line['first(CUST_NAM)'], line['first(CUR)'], \
                   line['sum(GRANT_AMT)'], line['sum(BAL_AMT)'], stat_dat

        df = df_total.rdd.map(m)

        sql = "replace into t_CMMS_ASSLIB_DEBTOT(CUST_NO,CUST_ID,ACC_NAM,CUR,LOAN_AMT,BAL_AMT,STAT_DAT) values(%s,%s,%s,%s,%s,%s,%s)"
        self.mysql_helper.batch_operate(sql, df)

    def run(self):
        # life cycle: year, season 1-4
        self.run_life_cycle(2016, 2)
        # customer value: 0 for the first half of the year, 1 for the second
        self.customer_value(2016, 0)
        # total assets
        self.aum_total()
        # total debt
        self.debt_total()
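# DataHandler relies on a project-local MySQLHelper that is not shown in this file.
# The sketch below is a minimal, assumed implementation (pymysql-based; the
# signatures are inferred from the execute/executemany/batch_operate calls made
# above), included only so the snippet is self-contained — not the original code.
import pymysql


class MySQLHelper:
    def __init__(self, database, host, user='root', password='', port=3306):
        self.conn = pymysql.connect(host=host, port=port, user=user,
                                    password=password, database=database,
                                    charset='utf8')

    def execute(self, sql):
        # run a single statement and commit
        with self.conn.cursor() as cur:
            cur.execute(sql)
        self.conn.commit()

    def executemany(self, sql, rows):
        # batch-execute one parameterised statement over many rows
        with self.conn.cursor() as cur:
            cur.executemany(sql, rows)
        self.conn.commit()

    def batch_operate(self, sql, rdd, batch_size=1000):
        # drain an RDD into MySQL in fixed-size batches instead of collect()ing
        # everything into driver memory at once
        buf = []
        for row in rdd.toLocalIterator():
            buf.append(row)
            if len(buf) >= batch_size:
                self.executemany(sql, buf)
                buf = []
        if buf:
            self.executemany(sql, buf)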
info = lines.map(lambda line: line.split("----")) \
    .map(lambda info: Row(email=info[0], username=info[1], realname=info[2],
                          idcard=info[3], password=info[4], phone=info[5]))
schemaInfo = sqlContext.createDataFrame(info)
schemaInfo.registerTempTable("information")

# cache/uncache the table if it will be queried repeatedly
#sqlContext.cacheTable("information")
#sqlContext.uncacheTable("information")

"""
Email analysis and statistics
"""
email_str = "SELECT analysis_email(email) AS email FROM information"
emailSQL = sqlContext.sql(email_str)
# total row count
count = emailSQL.count()
# counts per group
emailCollect = emailSQL.groupBy("email").count().collect()
# report the email analysis result
result_email(emailCollect, count)

"""
Username and real-name analysis and statistics
"""
# username length statistics
username_len_str = "SELECT LENGTH(username) AS username_len FROM information"
usernameSQL = sqlContext.sql(username_len_str)
usernameLenCollect = usernameSQL.groupBy("username_len").count().collect()
result_username_len(usernameLenCollect, count)
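# analysis_email is a UDF registered elsewhere in the project. A minimal sketch
# of what it might look like, assuming it reduces an address to its provider
# domain so the groupBy above counts providers — the body is an assumption,
# not the original implementation:
def analysis_email(email):
    # keep only the provider part, e.g. 'user@163.com' -> '163.com'
    return email.split('@')[-1] if email and '@' in email else 'invalid'

sqlContext.registerFunction("analysis_email", analysis_email)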
import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

masterip = sys.argv[1]
inputfile = sys.argv[2]
outputfile = sys.argv[3]

conf = SparkConf().setMaster(masterip).setAppName("TableData")
sqlContext = SQLContext(SparkContext(conf=conf))

# read the CSV with a header row, register it as a global temp view,
# and write it back out sorted by country code and timestamp
data = sqlContext.read.csv(inputfile, header='true')
data.createGlobalTempView('Table')
sqlContext.sql('''SELECT * FROM global_temp.Table ORDER BY cca2, timestamp''').coalesce(1).write.csv(outputfile)
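# Example invocation (script name and paths are illustrative, not from the source):
#
#   spark-submit tabledata.py spark://master:7077 hdfs:///data/logs.csv hdfs:///data/sorted
#
# coalesce(1) funnels the sorted result into a single output part-file, which is
# convenient for small results but serialises the final write through one task.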
    StructField('suppid', StringType(), True),
    StructField('logtime', LongType(), True),
    StructField('usermac', StringType(), True)
]
schema = StructType(fields)

rdd1 = rdd.map(convert_logtype).filter(lambda tup: tup is not None)
# rdd1.foreach(printx)
# sc.stop()
ret_df = sqlContext.createDataFrame(rdd1, schema)
ret_df.registerTempTable("loginflowlog_overall")

# overall PV/UV per log type
_sql = "SELECT count(usermac) pv, count(distinct usermac) uv, logtype " \
       "from loginflowlog_overall " \
       "group by logtype"
rs_df = sqlContext.sql(_sql)

service = LoginflowlogMysqlService()
ret_overall_list = service.getRetOverall(rs_df.collect(), day)

# replace the day's counters: delete the old rows, then insert the new ones
_sql_delete = "delete from login_flow_global_count where date ='%s'" % day
_sql_insert = "insert into login_flow_global_count(date," \
              "prelogin_num,prelogin_pnum,login_num,login_pnum," \
              "login_click_num,login_click_pnum,forward_num,forward_pnum," \
              "preArrive_num,preArrive_pnum,arrive_num,arrive_pnum) " \
              "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
service.write_mysql(ret_overall_list, _sql_delete, _sql_insert)
logger.info(len(ret_overall_list))

# detail
rdd2 = rdd.map(convert_list).filter(lambda tup: tup is not None) \
    .map(convert_copy_logtype).values().flatMap(list)
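# convert_logtype is defined elsewhere in this job. A minimal sketch of the shape
# it must have given the schema above — the raw field layout and separator are
# assumptions for illustration, not the original parser:
def convert_logtype(line):
    # assumed raw format: "logtype|suppid|logtime|usermac"
    parts = line.split('|')
    if len(parts) != 4:
        return None  # malformed lines are dropped by the filter above
    logtype, suppid, logtime, usermac = parts
    try:
        return (logtype, suppid, long(logtime), usermac)
    except ValueError:
        return None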
import os

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

if __name__ == "__main__":
    conf = SparkConf().setMaster("local[2]").setAppName("sql_udf")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    json_path = os.path.abspath("../doc/book.json")

    # read the JSON file and register it as a table
    json_df = sqlContext.read.json(json_path)
    json_df.registerTempTable("json_book")

    # register the user-defined function (defined elsewhere; a sketch follows this snippet)
    sqlContext.registerFunction("name_place", name_place)
    evalRDD = sqlContext.sql("SELECT name_place(name, place, price, evaluation) AS book_eval FROM json_book")
    #bookMap = lengthRDD.map(lambda books: (books.name, books.author, books.price, books.publish, books.place))
    evalRDD.show()

    # map the query result
    bookMap = evalRDD.rdd.map(lambda books: books.book_eval)

    general_list = []
    good_list = []
    for book in bookMap.collect():
        book = book.encode("utf-8").split(',')
        print book[0], book[1]
        # bucket the titles by their evaluation (the original snippet breaks off
        # here; this completion follows the obvious intent of the two lists)
        if book[1] == "general":
            general_list.append(book[0])
        else:
            good_list.append(book[0])
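# name_place is the UDF registered above but not shown in this snippet. A minimal
# sketch consistent with how its result is consumed (a comma-separated
# "name,evaluation" string); the price threshold is an assumed stand-in for the
# real rating logic:
def name_place(name, place, price, evaluation):
    # label cheap books "general" and the rest "good", then pack the two
    # fields the caller splits on ','
    label = "general" if float(price) < 50 else "good"
    return "%s,%s" % (name, label)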
import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, StringType, DateType

def main():
    conf = SparkConf().setAppName('housingprice')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    # input directory prefix; assumed to arrive as the first CLI argument,
    # matching the other jobs in this file
    inputs = sys.argv[1]

    taxreportSchema = StructType([
        StructField('PID', StringType(), False),
        StructField('Legal_Type', StringType(), False),
        StructField('FOLIO', StringType(), False),
        StructField('Coordinates', StringType(), True),
        StructField('ZoneName', StringType(), True),
        StructField('ZoneCat', StringType(), True),
        StructField('LOT', StringType(), True),
        StructField('Block', StringType(), True),
        StructField('plan', StringType(), True),
        StructField('DisLot', StringType(), True),
        StructField('FCiviNum', StringType(), True),
        StructField('TCiviNum', StringType(), True),
        StructField('StreetName', StringType(), True),
        StructField('PostalCode', StringType(), True),
        StructField('NLegalName1', StringType(), True),
        StructField('NLegalName2', StringType(), True),
        StructField('NLegalName3', StringType(), True),
        StructField('NLegalName4', StringType(), True),
        StructField('NLegalName5', StringType(), True),
        StructField('CurVal', StringType(), True),
        StructField('CurImpVal', StringType(), True),
        StructField('Taxassess', StringType(), True),
        StructField('prevVal', StringType(), True),
        StructField('prevImpVal', StringType(), True),
        StructField('YearBuilt', StringType(), True),
        StructField('BigImpYear', StringType(), True),
        StructField('Tax_levy', StringType(), True),
        StructField('NeighbourhoodCode', StringType(), True),
    ])

    conversionSchema = StructType([
        StructField('date', StringType(), False),
        StructField('USD', StringType(), False),
        StructField('rate', StringType(), False),
        StructField('reciprate', StringType(), False),
    ])

    crudeoilSchema = StructType([
        StructField('date', DateType(), False),
        StructField('oilprice', StringType(), False),
    ])

    def fixdate(convVal):
        # "M/D/YY rate" -> ("20YY/M", rate)
        a = convVal.split(" ")
        dates = a[0].split("/")
        alldate = "20" + dates[2] + '/' + dates[0]
        return (alldate, a[1])

    def filterYear(dates):
        # keep only pre-2016 rows; the year is the first component of "20YY/M"
        a = dates.split('/')
        return a[0] != '2016'

    def processDate(df):
        def splitMonth(cols):
            a = cols.split('/')
            return a[1]

        def splitYear(cols):
            a = cols.split('/')
            return a[0]

        fUDF = udf(splitMonth, StringType())
        df1 = df.withColumn("month", fUDF('year'))
        fUDFyear = udf(splitYear, StringType())
        return df1.withColumn("year", fUDFyear('year'))

    #Reading the Tax Report dataset
    taxreportinfo = sqlContext.read.format('com.databricks.spark.csv').options(header='true').schema(taxreportSchema).load(inputs + "taxreport/test")
    taxreportinfo.registerTempTable("taxreport")

    #Selecting the price, tax-assessment year and postal code of each property
    propertyVal = sqlContext.sql("SELECT CurVal, Taxassess, PostalCode FROM taxreport")
    propertyVal.registerTempTable("propertyVal")

    #Reading the CAD-to-USD conversion dataset
    conversion = sqlContext.read.format('com.databricks.spark.csv').options(header='true').schema(conversionSchema).load(inputs + "conversion")
    conversion.registerTempTable("Conversion")

    #Selecting only the date and rate
    conversionrate = sqlContext.sql("SELECT date, rate FROM Conversion WHERE rate regexp '^[0-9]+'")
    conversionRDD = conversionrate.repartition(40).rdd.map(lambda w: (w.date + " " + w.rate))
    conversiondates = conversionRDD.map(fixdate).filter(lambda (w, x): filterYear(w)).map(lambda l: Row(date=l[0], rate=l[1]))
    schemaConv = sqlContext.createDataFrame(conversiondates)
    schemaConv.registerTempTable("ConversionDate")
    ConverDF = sqlContext.sql("SELECT date, CAST(AVG(rate) AS DECIMAL(4,2)) as conversionrate FROM ConversionDate WHERE rate IS NOT NULL GROUP BY date")
    ConverDF.cache()

    #Reading the Canada crude oil price dataset
    crudeoil = sc.textFile(inputs + "crudeoil")
    crudeoilRDD = crudeoil.map(lambda l: l.split()).map(lambda l: Row(date=l[0], oilprice=l[1]))
    crudeoilDF = sqlContext.createDataFrame(crudeoilRDD)
    crudeoilDF.registerTempTable("crudeoil")

    #Selecting the date in year/month format and the oil price
    # 'y' (calendar year) rather than 'Y' (week-year) in the format pattern
    oilprice = sqlContext.sql("SELECT DATE_FORMAT(date,'y/M') as date, oilprice FROM crudeoil")
    oilprice.registerTempTable('oilprice')

    #Reading the BC interest-rate dataset
    interestRate = sqlContext.read.format('com.databricks.spark.csv').options(header='true').load(inputs + "interestrate")
    interestRate.registerTempTable("interest")

    #Selecting the date and the 5-year fixed posted mortgage rate
    interestDF = sqlContext.sql("SELECT DATE_FORMAT(date,'y/M') as date, CAST(`5y-fixed-posted` AS DECIMAL(4,2)) AS interestrate FROM interest WHERE date >= '2006-01' AND date <= '2015-12'")
    interestDF.registerTempTable("allrates")

    #Averaging each month over the days whose value is not null
    avgInterest = sqlContext.sql("SELECT date, AVG(interestrate) as interestrates FROM allrates WHERE interestrate IS NOT NULL GROUP BY date")
    avgInterest.cache()

    joinedTable = avgInterest.join(oilprice, (avgInterest['date'] == oilprice['date'])) \
        .select(avgInterest['date'], avgInterest['interestrates'], oilprice['oilprice'])
    JoinedConversion = joinedTable.join(ConverDF, (joinedTable['date'] == ConverDF['date'])) \
        .select(joinedTable['date'].alias('year'), joinedTable['interestrates'],
                joinedTable['oilprice'], ConverDF['conversionrate'])
    JoinedConversion.registerTempTable("joinedConversion")
    ls = processDate(JoinedConversion)
    ls.show()
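    # The excerpt stops after assembling the monthly macro features. A minimal,
    # assumed continuation that persists them for a later join against the
    # propertyVal table registered above (the output path is illustrative):
    ls.registerTempTable("macro_features")
    ls.write.format('com.databricks.spark.csv').options(header='true').save(inputs + "macro_features_out")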