def hash_rating(author_subreddit_rating_rdd, sc):
    """Replace author and subreddit names with numeric ids.

    Args:
        author_subreddit_rating_rdd: RDD of ``(author, subreddit, rating)``
            records.
        sc: SparkContext used to create the SQLContext.

    Returns:
        A tuple ``(aid_rdd, sid_rdd, aid_sid_r_rdd)`` where ``aid_rdd`` holds
        ``(author, author_id)`` pairs, ``sid_rdd`` holds
        ``(subreddit, subreddit_id)`` pairs and ``aid_sid_r_rdd`` holds
        ``(author_id, subreddit_id, rating)`` tuples.
    """
    sql_context = SQLContext(sc)

    author_sub_schema = StructType([
        StructField("author", StringType(), True),
        StructField("subreddit", StringType(), True),
        StructField("rating", LongType(), True)
    ])
    asr_df = sql_context.createDataFrame(author_subreddit_rating_rdd,
                                         author_sub_schema)

    # Index access instead of `lambda (a, s, r): a`: tuple parameters were
    # removed in Python 3, while indexing works on both Python 2 and 3.
    author_rdd = author_subreddit_rating_rdd.map(lambda asr: asr[0])
    aid_rdd = author_rdd.distinct().zipWithUniqueId().cache()
    author_id_schema = StructType([
        StructField("author", StringType(), True),
        StructField("author_id", LongType(), True)
    ])
    aid_df = sql_context.createDataFrame(aid_rdd, author_id_schema)
    # Swap the author name for its id.
    aid_s_r_df = aid_df.join(asr_df, on='author').drop('author').cache()

    subreddit_rdd = author_subreddit_rating_rdd.map(lambda asr: asr[1])
    sid_rdd = subreddit_rdd.distinct().zipWithUniqueId().cache()
    subreddit_id_schema = StructType([
        StructField("subreddit", StringType(), True),
        StructField("subreddit_id", LongType(), True)
    ])
    sid_df = sql_context.createDataFrame(sid_rdd, subreddit_id_schema)
    # Swap the subreddit name for its id.
    aid_sid_r_df = sid_df.join(aid_s_r_df, on='subreddit').drop('subreddit').cache()

    row_aid_sid_r_rdd = aid_sid_r_df.rdd
    aid_sid_r_rdd = row_aid_sid_r_rdd.map(
        lambda row: (row.author_id, row.subreddit_id, row.rating))

    return aid_rdd, sid_rdd, aid_sid_r_rdd
def _get_data(self):
    """Return a one-row DataFrame with ``text`` and ``features`` columns."""
    sql_context = SQLContext(self.sc)
    rows = [
        (
            "I dont know why people think this is such a bad movie.",
            # A size-3 sparse vector only has indices 0..2. The previous
            # mapping {1, 2, 3} placed an entry at the out-of-range index 3.
            # NOTE(review): indices 0..2 chosen to keep the dimension at 3,
            # matching the dense vectors in _get_train_data — confirm.
            Vectors.sparse(3, {0: 1.0, 1: 1.0, 2: 1.0})
        ),
    ]
    return sql_context.createDataFrame(rows, ['text', 'features'])
def _get_train_data(self):
    """Return a four-row DataFrame with ``id``, ``features`` and ``label`` columns.

    Every row carries the same dense feature vector ``[1, 2, 3]``; the labels
    alternate between 1.0 and 0.0.
    """
    sql_context = SQLContext(self.sc)
    column_names = ['id', 'features', 'label']
    id_and_label = [(1, 1.0), (2, 0.0), (3, 1.0), (4, 0.0)]
    rows = [
        (row_id, Vectors.dense([1, 2, 3]), label)
        for row_id, label in id_and_label
    ]
    return sql_context.createDataFrame(rows, column_names)
def main(): conf = SparkConf().setAppName('artist_career') sc = SparkContext(conf=conf) assert sc.version >= '1.5.1' sqlContext=SQLContext(sc) inputs = sys.argv[1] output = sys.argv[2] customSchema = StructType([StructField('SongNumber', StringType(),False),StructField('SongID', StringType(),False),StructField('AlbumID', StringType(),False),StructField('AlbumName', StringType(),False),StructField('ArtistID', StringType(),False),StructField('ArtistLatitude', StringType(),False),StructField('ArtistLocation', StringType(),False),StructField('ArtistLongitude', StringType(),False),StructField('ArtistName', StringType(),False),StructField('Danceability', StringType(),False),StructField('Duration', StringType(),False),StructField('KeySignature', StringType(),False),StructField('KeySignatureConfidence', StringType(),False),StructField('Tempo', StringType(),False),StructField('TimeSignature', StringType(),False),StructField('TimeSignatureConfidence', StringType(),False),StructField('Title', StringType(),False),StructField('Year', StringType(),False),StructField('Energy', StringType(),False),StructField('ArtistFamiliarity', StringType(),False),StructField('ArtistMbid', StringType(),False),StructField('SongHotttnesss', StringType(),False),StructField('Loudness', StringType(),False),StructField('StartOfFadeOut', StringType(),False),StructField('EndOfFadeIn', StringType(),False),StructField('ModeConfidence', StringType(),False)]) df= sqlContext.read.format('com.databricks.spark.csv').options(header='true').load(inputs,schema = customSchema) df.registerTempTable('artist_data') million_song=sqlContext.sql("SELECT SongNumber,SongID,AlbumID,AlbumName,ArtistID,ArtistLatitude,ArtistLocation,ArtistLongitude,ArtistName,Danceability,Duration,KeySignature,KeySignatureConfidence,Tempo,TimeSignature,TimeSignatureConfidence,Title,Year,Energy,ArtistFamiliarity,ArtistMbid,SongHotttnesss,Loudness,StartOfFadeOut,EndOfFadeIn,ModeConfidence from artist_data where Year!=0 AND 
ArtistFamiliarity!='nan'") million_song.write.format('parquet').save(output)
def __init__(self):
    """Set up the Spark configuration, contexts and the MySQL helper.

    The Spark app is named "BandCard", limited to two cores, and has the
    MySQL connector jar on the executor class path.
    """
    mysql_connector_jar = '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar'

    spark_conf = SparkConf().setAppName("BandCard")
    spark_conf = spark_conf.set("spark.cores.max", "2")
    spark_conf = spark_conf.set('spark.executor.extraClassPath', mysql_connector_jar)
    self.conf = spark_conf

    self.sc = SparkContext(conf=self.conf)
    self.sqlctx = SQLContext(self.sc)
    self.mysql_helper = MySQLHelper('core', host='10.9.29.212')
def ALS_fit():
    """Fit the ALS model and pickle rating predictions for one user.

    Reads ``usern`` and ``key`` from the request's query arguments. Unknown
    users get a "can't find user" JSON reply. When ``key`` matches, a Spark
    context is obtained, the final model is fitted and the user's predictions
    for all beers and for the filtered beer list are pickled under
    ``pred_path``.

    Returns:
        A JSON response built with ``jsonify``.
    """
    usern = request.args.get('usern')
    users_df = pd.read_sql_query('''SELECT DISTINCT mt3ratings.user, user_id FROM mt3ratings WHERE appdata = 1''', engine)
    if usern not in users_df['user'].values:
        return_str = "can't find user"
        return jsonify(result=return_str)
    user_id = users_df.user_id[users_df.user == usern].values[0]

    # A missing 'key' argument simply yields None here.
    key = request.args.get('key')

    if key == 'abcd':
        # start spark
        conf = SparkConf().setAppName("BeerSleuthALS").set("spark.executor.memory", "4g")
        # getOrCreate reuses a context that is already running. The previous
        # `except ValueError: pass` left `sc` unbound in that situation.
        sc = SparkContext.getOrCreate(conf=conf)
        try:
            sqlContext = SQLContext(sc)
            ratings_sqldf = modeling.get_item_user_rev_from_pg(engine, sqlContext)
            sqlContext.registerDataFrameAsTable(ratings_sqldf, "ratings")
            print('fitting model')
            model = modeling.fit_final_model(ratings_sqldf)

            # (user, beer) pairs to score; lists so they work on Python 2 and 3.
            to_predict = [(user_id, beer_id) for beer_id in beer_dict.values()]
            to_predict_top20 = [(user_id, beer_id) for beer_id in beer_id_filt]
            user_preds = model.predictAll(sc.parallelize(to_predict)).collect()
            user_preds_top20 = model.predictAll(sc.parallelize(to_predict_top20)).collect()
            print('got preds')

            # Map the second field of each prediction to its third field.
            preds = Counter({x[1]: x[2] for x in user_preds})
            preds_top20 = Counter({x[1]: x[2] for x in user_preds_top20})
            with open('%s%s_preds.pkl' % (pred_path, user_id), 'wb') as f:
                pickle.dump(preds, f)
            with open('%s%s_preds_top20.pkl' % (pred_path, user_id), 'wb') as f:
                pickle.dump(preds_top20, f)
            print('done')
        finally:
            # Release the context even when fitting or predicting fails.
            sc.stop()

    # NOTE(review): the original indentation was lost, so it is unclear whether
    # this reply was meant only for a matching key — confirm.
    return jsonify(result="Model training complete, you may now get predictions")
def batching(spark, i):
    """Extract waveform features for the patients of batch ``i`` and write them to S3.

    Patients are the common prefixes under ``1.0/p0<i>/`` in the bucket. For
    ``i < 6`` train features of all patients are accumulated and written once
    under ``TrainFeatures/<i>``; otherwise test features are written per
    patient under ``PredictFeatures/<patient id>``.

    Args:
        spark: context object passed to ``SQLContext``.
        i: batch number; selects the S3 prefix and the train/test mode.
    """
    bucket = "mimic3waveforms3"
    p = '1.0/p0' + str(i) + '/'
    session = boto3.session.Session()
    client = session.client('s3')
    s3 = session.resource('s3')

    result = client.list_objects(Bucket=bucket, Prefix=p, Delimiter='/')
    # Treat a response without 'CommonPrefixes' as "no patients" instead of
    # failing on iteration over None.
    patientList = [patient.get('Prefix')
                   for patient in result.get('CommonPrefixes') or []]

    my_bucket = s3.Bucket(bucket)
    patientInfoObj = client.get_object(Bucket=bucket, Key='PATIENTS.csv')

    f_schema = StructType([
        StructField('min', DoubleType(), True),
        StructField('max', DoubleType(), True),
        StructField('mean', DoubleType(), True),
        StructField('median', DoubleType(), True),
        StructField('mode', DoubleType(), True),
        StructField('std', DoubleType(), True),
        StructField('kurtosis', DoubleType(), True),
        StructField('mortality_flag', IntegerType(), True),
        StructField('patient_id', IntegerType(), True)])

    sqlContext = SQLContext(spark)
    # Seed row with patient_id 0; it is removed by the patient_id > 0 filter
    # before anything is written.
    data = [(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0, 0)]
    df_features = sqlContext.createDataFrame(data, f_schema)
    df = pd.read_csv(patientInfoObj['Body'])

    for patient in patientList:
        patientRecords = []
        for object_summary in my_bucket.objects.filter(Prefix=patient):
            patientRecord = {}
            patientRecord['file_name'] = object_summary.key
            # Only keys of exactly 32 characters that are not layout files are read.
            if len(patientRecord['file_name']) != 32 or 'layout' in patientRecord['file_name']:
                continue
            patientRecord['body'] = object_summary.get()['Body'].read()
            patientRecords.append(patientRecord)

        # The six characters before the trailing '/' of the prefix are the id.
        patientId = int(patient[-7:-1])
        mortality = (df[df['SUBJECT_ID'] == patientId]['EXPIRE_FLAG'])

        # for first 6 batch of records extract train features
        if i < 6:
            df_new_features = featureExtraction.TrainFeatureExtraction(
                sqlContext, patientRecords, f_schema, mortality, patientId)
            df_features = df_features.union(df_new_features)
        # for last 2 batchs of records extract test features
        else:
            df_new_features = featureExtraction.TestFeatureExtraction(
                sqlContext, patientRecords, df_features, f_schema,
                int(mortality.values), patientId)
            df_features = df_features.union(df_new_features)
            df_features = df_features.filter(col('patient_id') > 0)
            df_features.write.csv("s3a://" + bucket + "/PredictFeatures/" + str(patientId))

    if i < 6:
        df_features = df_features.filter(col('patient_id') > 0)
        df_features.write.csv("s3a://" + bucket + "/TrainFeatures/" + str(i))
def _load_data(filePaths, dataset_name, spark_context, groupfiles, groupsize):
    """Read the JSON data at ``filePaths`` into a DataFrame.

    ``dataset_name``, ``groupfiles`` and ``groupsize`` are accepted but not
    used by this implementation.
    """
    json_reader = SQLContext(spark_context).read
    return json_reader.json(filePaths)
import itertools
from math import sqrt
from operator import add
from os.path import join, isfile, dirname
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating

# Cloud SQL connection settings, read positionally from the command line.
# NOTE(review): `sys` is used here but no `import sys` is visible in this
# section — confirm it is imported earlier in the file.
CLOUDSQL_INSTANCE_IP = sys.argv[1]
CLOUDSQL_NAME = sys.argv[2]
CLOUDSQL_USER = sys.argv[3]
CLOUDSQL_PWD = sys.argv[4]

conf = SparkConf().setAppName("app_collaborative")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

jdbcDriver = 'com.mysql.jdbc.Driver'
# The user name and password are embedded in the JDBC URL itself.
jdbcUrl = 'jdbc:mysql://%s:3306/%s?user=%s&password=%s' % (CLOUDSQL_INSTANCE_IP, CLOUDSQL_NAME, CLOUDSQL_USER, CLOUDSQL_PWD)

#[START how_far]
def howFarAreWe(model, against, sizeAgainst):
    # Ignore the rating column
    againstNoRatings = against.map(lambda x: (int(x[0]), int(x[1])) )

    # Keep the rating to compare against
    againstWiRatings = against.map(lambda x: ((int(x[0]),int(x[1])), int(x[2])) )

    # Make a prediction and map it for later comparison
    # The map has to be ((user,product), rating) not ((product,user), rating)
    predictions = model.predictAll(againstNoRatings).map(lambda p: ( (p[0],p[1]), p[2]) )
sqlc.registerDataFrameAsTable(pw_df, "pw_df") sqlc.sql('Select sum(h) from pw_df').show() # MAIN sc_conf = SparkConf() sc_conf.setAppName("pwned") # sc_conf.set('spark.executor.memory', '2g') # sc_conf.set('spark.driver.memory', '4g') # sc_conf.set('spark.cores.max', '4') sc_conf.set('spark.sql.crossJoin.enabled', True) sc = SparkContext(conf=sc_conf) sqlc = SQLContext(sc) print(sys.version_info) print('Spark version %s running.' % sc.version) print('Config values of Spark context: ') print(sc.getConf().getAll()) runner = rdd_approach, df_sql_approach, count_all_occurrences for f in runner: f() print('Finished.') # hack to keep spark ui alive raw_input("Press ctrl+c to exit")
from pyspark.sql.utils import AnalysisException
import os, math, time


# In[2]:

conf = (
    SparkConf()
    #.setMaster('spark://10.100.5.182:7077')
    #.setMaster("local")
    .setAppName("hw1"))


# In[3]:

# getOrCreate returns the running context when this cell is executed again.
# The previous `try ... except ValueError: pass` could leave `sc` and `sql_sc`
# undefined if creation failed before they had ever been assigned.
sc = SparkContext.getOrCreate(conf=conf)
sql_sc = SQLContext(sc)


# In[4]:

# Try the local copy first and fall back to the HDFS copy when Spark cannot
# resolve the local path.
try:
    data = sql_sc.read.csv('./household_power_consumption.txt', sep=';', header=True)
except AnalysisException:
    data = sql_sc.read.csv('hdfs:///bdm/hw1/household_power_consumption.txt', sep=';', header=True)


# In[5]:
if __name__ == '__main__':
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    # Directory that holds the workshop data files (machine-specific path).
    datadir = "/Users/eyalbenivri/Developer/projects/spark-workshop/data/"

    # sudo dpkg --configure -a
    # sudo apt-get install python-setuptools
    # sudo easy_install dateutils
    # Download pyspark_csv.py from https://github.com/seahboonsiew/pyspark-csv
    sys.path.append('/Users/eyalbenivri/Developer/libs/pyspark_libs')  # replace as necessary
    # Imported here because the module only becomes importable after the
    # sys.path entry above has been added.
    import pyspark_csv
    sc.addFile('/Users/eyalbenivri/Developer/libs/pyspark_libs/pyspark_csv.py')  # ditto
    sqlContext = SQLContext(sc)

    # Task 1: load the prop-prices.csv file as an RDD, and use the csvToDataFrame function from the pyspark_csv module
    # to create a DataFrame and register it as a temporary table so that you can run SQL queries:
    print("------- ******* Task 1 ******* -------")
    columns = ['id', 'price', 'date', 'zip', 'type', 'new', 'duration', 'PAON',
               'SAON', 'street', 'locality', 'town', 'district', 'county', 'ppd', 'status']

    rdd = sc.textFile(datadir + "prop-prices.csv")
    df = pyspark_csv.csvToDataFrame(sqlContext, rdd, columns=columns)
    # Queryable from SQL under the name "properties".
    df.registerTempTable("properties")
    df.persist()

    # Task 2: let's do some basic analysis on the data.
    # Find how many records we have per year, and print them out sorted by year.
# Databricks notebook source
from pyspark import SparkContext
from pyspark import SQLContext
from pyspark.sql.functions import *

# Export the result of a SQL query as a single CSV part file with a header.
# NOTE(review): `databse_name` looks like a misspelt placeholder — confirm the
# database name before running.
target_query="SELECT * from databse_name.byod_dbfs_table"
sparkContext = SparkContext.getOrCreate()
sqlContext = SQLContext(sparkContext)
dataframe = sqlContext.sql(target_query)
# repartition(1) collapses the output to one partition; existing output at the
# target location is overwritten.
dataframe.repartition(1).write.format('com.databricks.spark.csv').options(delimiter=",").save("s3n://amgen-edl-acux-aaaa123-bkt/BYOD/", header="true", mode="overwrite")

# COMMAND ----------

# Call form of print: valid on Python 3 and prints the same text on Python 2.
print("test2")

# COMMAND ----------

print("test3")
from pyspark.sql import Row
from pyspark import SparkContext, SQLContext
from pyspark.sql.functions import udf, lit, col
from pyspark.sql.types import ArrayType, StringType

# HDFS locations of the two parquet inputs.
FEATURES_DATA = 'hdfs:///user/harshdee/base_features_complete.parquet'
SELECTED_NEWSPAPERS = 'hdfs:///user/harshdee/newspapers_citations.parquet'

sc = SparkContext()
sqlContext = SQLContext(sc)
sqlContext.setConf('spark.sql.parquet.compression.codec', 'snappy')

features = sqlContext.read.parquet(FEATURES_DATA)
features = features.withColumnRenamed('page_title', 'page_title_')
# Pull the nested fields of `citations_features` up into named top-level
# columns. Only these five columns are selected, so the renamed
# `page_title_` column is not carried forward.
features = features.select(
    col('citations_features._1').alias('retrieved_citation'),
    col('citations_features._2').alias('ref_index'),
    col('citations_features._3').alias('total_words'),
    col('citations_features._4._1').alias('neighboring_words'),
    col('citations_features._4._2').alias('neighboring_tags')
)

selected_newspapers = sqlContext.read.parquet(SELECTED_NEWSPAPERS)

## def array_to_string(my_list):
##     return '[' + ','.join([str(elem) for elem in my_list]) + ']'
## array_to_string_udf = udf(array_to_string,StringType())

# Keep the feature rows whose citation also appears in the newspapers set.
results = features.join(selected_newspapers, features['retrieved_citation'] == selected_newspapers['citations'])

## results = results.withColumn('neighboring_words', array_to_string_udf(results["neighboring_words"]))
from pyspark.sql import DataFrameStatFunctions, DataFrame from pyspark.sql.types import * #inicializar cluster conf = SparkConf() #conf.set("spark.driver.memory", "16g") #conf.set("spark.driver.cores", 4) #conf.set("spark.driver.memoryOverhead", 0.9) #conf.set("spark.executor.memory", "32g") #conf.set("spark.executor.cores", 12) #conf.set("spark.jars", "/home/jaa6766") sc = SparkContext(master="local[*]", sparkHome="/usr/local/spark/", appName="tarea-mge-8-parqueteo", conf=conf) spark = SQLContext(sc) #leer csv fuente original data = spark.read.csv("s3a://jorge-altamirano/profeco/data.csv", schema = StructType() \ .add("producto", StringType(), False) \ .add("presentacion", StringType(), True) \ .add("marca", StringType(), True) \ .add("categoria", StringType(), True) \ .add("catalogo", StringType(), True) \ .add("precio", DecimalType(precision=16, scale=4), True) \ .add("fechaRegistro", TimestampType(), True) \ .add("cadenaComercial", StringType(), True) \ .add("giro", StringType(), True) \ .add("nombreComercial", StringType(), True) \ .add("direccion", StringType(), True) \
def getAccuracyRate(predDataFrame):
    """Return the percentage of rows whose ``prediction`` equals ``label``.

    Args:
        predDataFrame: DataFrame with ``label`` and ``prediction`` columns.

    Returns:
        Accuracy as a float between 0.0 and 100.0; 0.0 for an empty frame
        (previously an empty frame caused a failure instead).
    """
    numPredictions = predDataFrame.count()
    if numPredictions == 0:
        return 0.0

    predDataFrame = predDataFrame.withColumn(
        'isSame',
        when(predDataFrame['label'] == predDataFrame['prediction'], 1.0).otherwise(0.0))
    # NOTE(review): `sum` is given a column name, so it is presumably
    # pyspark.sql.functions.sum brought in by a star import — confirm.
    correctPredictions = predDataFrame.select(sum('isSame')).collect()[0][0]
    return (float(correctPredictions) / float(numPredictions)) * 100.0


### Main Program ###
# `reload` and `sys.setdefaultencoding` only exist on Python 2, so the
# default-encoding hack is skipped on Python 3.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf8')

# `if __name__:` was always true; run the script body only when executed
# directly.
if __name__ == '__main__':
    conf = SparkConf().setAppName("Project2Part3")
    sparkContxt = SparkContext(conf = conf)
    sqlContext = SQLContext(sparkContxt)

    # sys.argv[1] is used as a plain string prefix for both file names.
    directPath = sys.argv[1]
    trainFilePath = directPath + 'adult.data.csv'
    testFilePath = directPath + 'adult.test.csv'

    trainingDataFrame = sqlContext.read.load(trainFilePath,
                                             format = 'com.databricks.spark.csv',
                                             header = 'true',
                                             inferSchema = 'true',
                                             ignoreLeadingWhiteSpace='true',
                                             ignoreTrailingWhiteSpace='true')
    nRows = trainingDataFrame.count()
    nColumns = len(trainingDataFrame.columns)
    trainingDataFrame.show(5, False)
    print('# Initial Training Rows:', nRows, '\t# Initial Training Columns:', nColumns, '\n')

    testDataFrame = sqlContext.read.load(testFilePath,
                                         format = 'com.databricks.spark.csv',
                                         header = 'true',
                                         inferSchema = 'true',
                                         ignoreLeadingWhiteSpace='true',
                                         ignoreTrailingWhiteSpace='true')
    nRows = testDataFrame.count()
    nColumns = len(testDataFrame.columns)
    testDataFrame.show(5, False)
    print('# Initial Test Rows    :', nRows, '\t# Initial Test Columns    :', nColumns, '\n')
from pyspark.sql.types import * from sqlalchemy import create_engine # import pandas as pd from pyspark.sql.functions import * from pyspark.sql.functions import UserDefinedFunction, monotonically_increasing_id from datetime import datetime from pyspark.sql.functions import lit from pandas.io import sql import sys import boto3 import os conf = SparkConf().setMaster("spark://*****:*****@148.251.19.66/nlp_live') # connection= engine.connect() url = "jdbc:mysql://148.251.19.66/bdt_live?user=bigdata_user&password=dbphuv8qeB28JTBW" print 'data fetch started' myrdd = sc.textFile( "s3a://nlplive.hi.raw.data/logs/nlp_session-www*.nlpcaptcha.in-{201708180[4-5]*}" ) data_df = sqlContext.read.json(myrdd) data_df.registerTempTable('hi_raw_data') print 'data fetched' # data_df.printSchema() mydf2 = sqlContext.sql( "Select sessionId,publisher_id,device_finger_print,browserTimeStamp,ct,id1,id2,id3,id4,id5,source,browser,device,ip,country,nlpbot,event_type,event_value,referal_url,user_agent,url,token,time_stamp from hi_raw_data where ip='122.160.157.46'"
class UserSummary(object):
    """Extract per-user keywords from Weibo blog data for tagged users."""

    def __init__(self):
        """Create the Spark session / SQL context and set input and output paths."""
        # local test
        # self.spark = SparkSession.builder.appName("group_by_fans").master("local").config(
        #     conf=SparkConf()).getOrCreate()
        # self.sql_context = SQLContext(sparkContext=self.spark.sparkContext)
        # self.start_time = "2017-08-01 00:00"
        # "publish_time": "2017-02-22 19:20"
        # self.time_format = "%Y-%m-%d %H:%M"

        # prod
        spark_conf = SparkConf().setAppName("weibo_user_summary")
        self.spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()
        self.sql_context = SQLContext(sparkContext=self.spark.sparkContext)
        self.days_list = self.dateRange("2017-10-16", "2017-10-21")
        self.path = 'hdfs:///ssymmetry_db/raw_db/sina_weibo_fans/sina_weibo_fans_item/2017/%s/*'
        self.output = '/home/spark/hxkTest/out/'

    def dateRange(self, begin_date, end_date):
        """Return the ``YYYY-MM-DD`` day strings from ``begin_date`` to ``end_date`` inclusive.

        The loop condition compares the day strings themselves, so an empty
        list is returned when ``begin_date`` sorts after ``end_date``.
        """
        one_day = datetime.timedelta(1)
        cursor = datetime.datetime.strptime(begin_date, "%Y-%m-%d")
        day = begin_date[:]
        days = []
        while day <= end_date:
            days.append(day)
            cursor = cursor + one_day
            day = cursor.strftime("%Y-%m-%d")
        return days

    def read_dataframe(self, path, time_list):
        """Read the JSON data of every day in ``time_list`` into one DataFrame.

        ``path`` is a ``%``-format template that receives each day string.
        """
        data_path = [path % each_day for each_day in time_list]
        # local test
        # dataframe = self.spark.read.json("sina_weibo_fans_data_2017-11-09-10-18.json")
        return self.spark.read.json(data_path)

    def read_blog_data(self, dataframe):
        """Return the de-duplicated blog columns of rows that have a ``blog_id``.

        Nulls in the result are filled with a single space.
        """
        blog_rows = dataframe.filter("blog_id is not NULL")
        blog_columns = blog_rows.select('uid', 'blog_content', 'forward_content')
        return blog_columns.drop_duplicates().fillna(" ")

    def main(self):
        """Join blogs with tagged users, extract keywords and write them to CSV."""
        # Category number -> user tags that belong to it; any tag not listed
        # here maps to category 0.
        tag_groups = [
            (1, [u"电视剧", u"电台", u"电影", u"动漫", u"广播电台", u"媒体传播",
                 u"媒体人", u"美女模特", u"美女帅哥", u"休闲娱乐", u"娱乐明星", u"综艺"]),
            (2, [u"动物萌宠", u"萌宠"]),
            (3, [u"法律"]),
            (4, [u"房产"]),
            (5, [u"搞笑", u"搞笑幽默"]),
            (6, [u"互联网"]),
            (7, [u"健身", u"运动健身"]),
            (8, [u"教育", u"公益"]),
            (9, [u"科学"]),
            (10, [u"理财", u"投资理财"]),
            (11, [u"历史"]),
            (12, [u"旅游", u"旅游出行"]),
            (13, [u"美食"]),
            (14, [u"美妆"]),
            (15, [u"汽车", u"交通"]),
            (16, [u"社会时政", u"军事", u"政府政务", u"时事"]),
            (17, [u"数码"]),
            (18, [u"体育", u"体育竞技", u"游戏"]),
            (19, [u"养生", u"医疗", u"医疗健康", u"育儿"]),
            (20, [u"作家", u"艺术", u"音乐", u"收藏", u"设计", u"摄影", u"时尚"]),
            (21, [u"职场"]),
            (22, [u"宗教"]),
            (23, [u"星座命理", u"情感", u"婚庆"]),
            (24, [u"商界名人"]),
        ]
        tag_to_num = {}
        for number, tags in tag_groups:
            for tag in tags:
                tag_to_num[tag] = number

        def user_tag_to_num(x):
            # (uid, category number) for one row of the tagged-user list
            return (x["uid"], tag_to_num.get(x["user_tag"], 0))

        # prod
        dataframe = self.read_dataframe(self.path, self.days_list).persist()
        blog_df = self.read_blog_data(dataframe)

        # read approved user list
        df = self.spark.read.csv(
            "hdfs:///ssymmetry_db/raw_db/sina_user_tag/sina_user_tag_item/weibo_uid_with_user_tag.csv",
            header=True).select("uid", "user_tag")
        user_tag_num_df = self.sql_context.createDataFrame(
            df.rdd.map(user_tag_to_num), ["uid", "user_tag"])

        # local test
        # dataframe = self.spark.read.json("sina_weibo_fans_data_2017-11-09-10-18.json")
        # blog_df = self.read_blog_data(dataframe).fillna(" ")
        # blog_rdd = blog_df.rdd

        # select the blogs for the tagged users
        joined = blog_df.join(user_tag_num_df, blog_df.uid == user_tag_num_df.uid)
        tagged_user_blog = joined.select(
            blog_df.uid, blog_df.blog_content, blog_df.forward_content,
            user_tag_num_df.user_tag)

        def preprocess_data(x):
            forward_content = x["forward_content"]
            # When the marker occurs after position 0, keep the piece that
            # follows its first occurrence.
            if forward_content.rfind(u"*****") > 0:
                forward_content = forward_content.split(u"*****")[1]
            text = x["blog_content"] + forward_content
            return (x["uid"], (text, x["user_tag"]))

        def extract_keywords(x):
            uid, (text, user_tag) = x
            # prod
            # ja.set_stop_words("/home/spark/hxkTest/movie_data/stopwords_cn.txt")
            ja.set_stop_words(
                "/home/spark/hxkTest/spark_script/weibo_user_summary/stopwords_cn.txt"
            )
            # local test
            # ja.set_stop_words("stopwords_cn.txt")
            return (uid, (ja.extract_tags(text), user_tag))

        # the rdd contains uid, keywords and the user_tag, next step we need to convert the keywords to a matrix
        per_user_text = tagged_user_blog.rdd.map(preprocess_data).reduceByKey(
            lambda x, y: (x[0] + y[0], x[1]))
        data = per_user_text.map(extract_keywords).collect()

        # Every keyword is followed by "_" in the per-user keyword string.
        uid_list = [elem[0] for elem in data]
        keywords_list = ["".join(word + "_" for word in elem[1][0]) for elem in data]
        user_tag_list = [elem[1][1] for elem in data]

        result_dict = {
            "uid": uid_list,
            "keywords": keywords_list,
            "user_tag": user_tag_list
        }
        pd.DataFrame(result_dict, index=None).to_csv(
            "/home/spark/hxkTest/spark_script/weibo_user_summary/tagged_user_keyword.csv",
            encoding="utf-8")
import pyspark
from pyspark import SparkContext, SQLContext

sc = SparkContext.getOrCreate()
sql = SQLContext(sc)

# Build a small student table and print it.
Student = sql.createDataFrame(
    [
        ('009001', 'Anuj', '70%', 'B.tech(cs)'),
        ('009002', 'Sachin', '80%', 'B.tech(cs)'),
        ('008005', 'Yogesh', '94%', 'MCA'),
        ('007014', 'Ananya', '98%', 'MCA'),
    ],
    ['Roll_Num', 'Name', 'Percentage', 'Department']
)
Student.show()
def read_csv(sc):
    """Load ``./filteredC.small.training`` as a DataFrame.

    The first line is treated as the header and column types are inferred.
    """
    csv_reader = SQLContext(sc).read
    return csv_reader.csv("./filteredC.small.training", header=True, inferSchema=True)
master = "local[*]"
spark_home = '/opt/cloud/spark'
os.environ['SPARK_HOME'] = spark_home

# NOTE: `input` shadows the Python builtin of the same name from here on.
#input = "/impala/parquet/back/back-portal-loginflowlog/dat=%s" % day
input = "/input/loginfowlog/02*"
#output = "/impala/parquet/back"
output = "/output"

conf = (SparkConf()
        .setMaster(master)
        .setAppName("user_visit_day")
        #.set("spark.kryoserializer.buffer.mb", "256")
        .set("spark.sql.parquet.binaryAsString", "true"))
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

df = sqlContext.read.parquet(input)
# Despite its name, `rdd` holds the result of DataFrame.select, i.e. the six
# selected columns of the input.
rdd = df.select('logintype', 'logtype', 'hosid', 'suppid', 'logtime', 'usermac')

# Schema with three nullable string columns and three non-nullable integer
# page-count columns.
fields = [
    StructField('day', StringType(), True),
    StructField('mac', StringType(), True),
    StructField('hosid', StringType(), True),
    StructField('loginPage', IntegerType(), False),
    StructField('forwardPage', IntegerType(), False),
    StructField('arrivePage', IntegerType(), False)
]
schema = StructType(fields)

# compute pages
class Analysiser:
    """Load publication records from ``data_o.xlsx`` and plot citation statistics.

    The cleaned records (publication years 2006-2016, non-empty author field)
    are kept in ``self.df`` with the columns TI, SO, C1, first_author, TC, PY
    and UT.
    """

    def __init__(self):
        """Start Spark, read the spreadsheet and build the cleaned DataFrame."""
        conf = SparkConf().setAppName('Analysiser').set("spark.sql.crossJoin.enabled", True)
        self.sc = SparkContext(conf=conf)
        self.sqlctx = SQLContext(self.sc)
        # NOTE(review): `sheetname` and `parse_cols` are the older pandas
        # spellings of `sheet_name` and `usecols` — confirm the pandas version
        # this runs against.
        self.pdf = pd.read_excel('data_o.xlsx', sheetname=0, header=0,
                                 parse_cols=[9, 10, 23, 32, 45, 60])
        schema = StructType([
            StructField('TI', StringType(), True),
            StructField('SO', StringType(), True),
            StructField('C1', StringType(), True),
            StructField('TC', StringType(), True),
            StructField('PY', StringType(), True),
            StructField('UT', StringType(), True)
        ])
        df = self.sqlctx.createDataFrame(self.pdf, schema)

        def m_clean(x):
            # Yield one cleaned row, or nothing when the row is unusable.
            try:
                py = int(x['PY'])
                int(x['TC'])  # validation only: rows with a non-numeric TC are dropped
                authors = x['C1']
                if 2006 <= py <= 2016 and authors != '':
                    # Text between the leading '[' and the first ']', up to
                    # the first '; '.
                    first_author = authors[1:].split(']')[0].split('; ')[0]
                    return [(x['TI'], x['SO'], x['C1'], first_author, x['TC'], py, x['UT'])]
                return []
            except Exception:
                # Best effort: malformed rows are skipped rather than failing the job.
                return []

        schema2 = StructType([
            StructField('TI', StringType(), True),
            StructField('SO', StringType(), True),
            StructField('C1', StringType(), True),
            StructField('first_author', StringType(), True),
            StructField('TC', StringType(), True),
            StructField('PY', IntegerType(), True),
            StructField('UT', StringType(), True)
        ])
        self.df = self.sqlctx.createDataFrame(df.rdd.flatMap(m_clean), schema2)
        #self.df.show()

    # def parse(self):
    #     .wb = load_workbook('data_min.xlsx')
    #     sheet = wb.get_sheet_by_name('all')
    #     new_wb = openpyxl.Workbook()
    #     new_sheet = new_wb.create_sheet('simple')
    #     new_sheet.append(['TI', 'SO', 'C1', 'TC', 'PY', 'UT'])
    #
    #
    #     for row in list(sheet.rows)[2:100]:
    #         r = [c.value for c in row]
    #         r_min = [r[9],r[10],r[23],r[32],r[45],r[60]]
    #         print(r_min)
    #         new_sheet.append(r_min)
    #     new_wb.save('export.xlsx')

    def parse2(self):
        """Write the cleaned records to ``output.xls``.

        A Spark DataFrame has no ``ExcelWriter`` attribute, so the data is
        converted to pandas and written with ``to_excel`` (without the pandas
        row index).
        """
        self.df.toPandas().to_excel('output.xls', index=False)

    def func1(self):
        """Scatter-plot citation count (TC) against publication year (PY)."""
        df = self.df.toPandas()
        #print(df.head())
        plt.figure(figsize=(9, 6))
        plt.scatter(df['PY'], df['TC'], s=25, alpha=0.4, marker='o')
        # T: colour of the scatter points
        # s: size of the scatter points
        # alpha: transparency
        plt.show()

    @staticmethod
    def _h_index(citations):
        """Return the largest ``h`` such that ``h`` of the counts are at least ``h``."""
        ranked = sorted(citations, reverse=True)
        return sum(1 for rank, count in enumerate(ranked, start=1) if count >= rank)

    def func2(self):
        """Scatter-plot each first author's h-index against their latest publication year."""
        df = self.df
        first_author_df = (df.select('first_author', 'PY')
                           .groupBy('first_author')
                           .max('PY')
                           .withColumnRenamed('max(PY)', 'maxPY'))
        self.sqlctx.registerDataFrameAsTable(df.drop('first_author'), 'df')
        self.sqlctx.registerDataFrameAsTable(first_author_df, 'fa')
        # Match every record whose author field contains the first author's name.
        # NOTE(review): a bare `outer join` (no LEFT/RIGHT/FULL) — confirm how
        # Spark SQL interprets it.
        sql = "select first_author,TC from (fa outer join df on C1 like CONCAT('%',first_author,'%'))"
        join = self.sqlctx.sql(sql)
        # Collect each author's citation counts into one '-'-separated string.
        join_rdd = (join.rdd
                    .map(lambda x: (x['first_author'], x['TC']))
                    .reduceByKey(lambda x, y: x + '-' + y))
        # for r in join_rdd.collect():
        #     print(r)

        h_index = Analysiser._h_index

        def m_h(x):
            # The previous loop stopped at the first rank i with i >= count and
            # reported i, which over-counts by one whenever i > count, and it
            # dropped authors for whom no such rank existed.
            cts = [int(c) for c in x[1].split('-')]
            return [(x[0], h_index(cts))]

        author_h_rdd = join_rdd.flatMap(m_h)
        author_h_df = self.sqlctx.createDataFrame(author_h_rdd, ['first_author', 'h'])
        final_df = (author_h_df
                    .join(first_author_df, 'first_author', 'left_outer')
                    .select('h', 'maxPY'))
        pdf = final_df.toPandas()
        plt.figure(figsize=(9, 6))
        plt.scatter(pdf['maxPY'], pdf['h'], s=25, alpha=0.4, marker='o')
        # T: colour of the scatter points
        # s: size of the scatter points
        # alpha: transparency
        plt.show()
ids.append(cluster[j]) bag_words = {} for i in ids: bag_words[i]=(float(ids.count(i))/len(ids)) # Create a SparseVector sv = Vectors.sparse(2000, bag_words) return sv input = sys.argv[1] output = sys.argv[2] conf = SparkConf().setAppName('bag_words') sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) with open('clusterFinal.pickle', 'rb') as f: cluster=pickle.load(f) schema = StructType([ StructField('reviewText', StringType(), False),StructField('overall', FloatType(), False),StructField('reviewTime', StringType(), False) ]) df = sqlContext.read.json(input, schema=schema) df.registerTempTable('review_table') sd=sqlContext.sql(""" SELECT reviewText FROM review_table """) fin=sd.rdd.map(lambda x: str(x.reviewText)).map(clean_words)
from pyspark.sql import DataFrameStatFunctions, DataFrame from pyspark.sql.types import * #inicializar cluster conf = SparkConf() #conf.set("spark.driver.memory", "16g") #conf.set("spark.driver.cores", 4) #conf.set("spark.driver.memoryOverhead", 0.9) #conf.set("spark.executor.memory", "32g") #conf.set("spark.executor.cores", 12) #conf.set("spark.jars", "/home/jaa6766") sc = SparkContext(master="local[*]", sparkHome="/usr/local/spark/", appName="tarea-mge-8-parqueteo", conf=conf) spark = SQLContext(sc) #lectura del parquet data = spark.read.parquet("s3a://jorge-altamirano/profeco/data.parquet") #funcion que hace lo mismo que summary, pero #irónicamente más rápido que spark (esta es para numéricos) def summary_j3a(col): min1 = data.select(min(data[col]).alias("min")).toPandas().transpose() max1 = data.select(max(data[col]).alias("max")).toPandas().transpose() avg1 = data.select(mean(data[col]).alias("avg")).toPandas().transpose() std1 = data.select(stddev( data[col]).alias("stddev")).toPandas().transpose() probs = [0.25, 0.5, 0.75] qnt1 = pd.DataFrame( \
def main():
    """Run the Reddit batch layer.

    Loads raw Reddit comment JSON from S3, drops comments whose author is
    '[deleted]', attaches a randomly chosen (title, url) pair to each comment,
    writes the comments through ``insert_into_cassandra`` and then writes the
    user-to-user graph (number of shared posts per user pair) through
    ``insert_graph``.

    When ``smallBatch`` is truthy only one hard-coded file is processed;
    otherwise every bucket key on or after ``FROM_YEAR_MONTH`` is processed.
    """
    sc = SparkContext(SPARK_ADDRESS, appName="RedditBatchLayer")
    # sc = SparkContext("local[*]", appName="RedditBatchLayer")
    bcURL = sc.broadcast(urlTitlePool)
    sqlContext = SQLContext(sc)
    conn = S3Connection(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    # conn = boto.connect_s3(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
    bucket = conn.get_bucket(RAW_JSON_REDDIT_BUCKET)

    def addTitleURL(cmtTuple):
        # 150,000 / 3000 = avg 50 comments/topic
        # NOTE(review): if randint is random.randint the upper bound is
        # inclusive, so the pool must hold at least 3001 entries - confirm.
        onePst = bcURL.value[randint(0, 3000)]
        return cmtTuple + (onePst[0], onePst[1])  # adding title and url

    def process_log(logFile, year, month):
        """Load one JSON dump, store its comments and its user graph."""
        # df = sqlContext.read.json(logFile)
        df = sqlContext.jsonFile(logFile)
        users_rdd = df.filter(df['author'] != '[deleted]')
        # tuple layout: 0 author, 1 year_month, 2 created_utc, 3 subreddit,
        # 4 id, 5 body, 6 score, 7 ups, 8 controversiality, 9 title, 10 url
        users_row = users_rdd.map(lambda json: (json.author,
                                                '{0}_{1}'.format(year, month),
                                                json.created_utc,
                                                json.subreddit,
                                                json.id,
                                                json.body,
                                                json.score,
                                                json.ups,
                                                json.controversiality))\
                             .map(addTitleURL)
        # .repartition(REPARTITION_SIZE)
        users_row.foreachPartition(insert_into_cassandra)

        # calculate user relationship graph from (URL, user) tuples
        post2user = users_row.map(lambda x: (x[10], x[0]))
        graph = (post2user.join(post2user)                    # self join: users sharing a post
                 .filter(lambda x: x[1][0] != x[1][1])        # remove self-linked relationships
                 .map(makeAscOrder)                           # order each pair by user name
                 .distinct()                                  # relationship is mutual: drop duplicates
                 .map(lambda x: (x[1], 1))                    # ready to count common edges
                 .reduceByKey(lambda x, y: x + y)             # total count per edge
                 .map(lambda x: (x[0][0], x[1], x[0][1])))    # flatten, ready to write
        # .repartition(REPARTITION_SIZE)
        graph.foreachPartition(insert_graph)

    if smallBatch:
        process_log('s3a://reddit-comments/2007/RC_2007-10', 2007, 12)
    else:
        # The start threshold does not change per key, so parse it once.
        from_parts = FROM_YEAR_MONTH.split('_')
        from_year = int(from_parts[0])
        from_month = int(from_parts[1])
        for key in bucket.list():
            key_name = key.name.encode('utf-8')
            if '-' not in key_name:  # filter out folders and _SUCCESS
                continue
            logFile = 's3a://{0}/{1}'.format(RAW_JSON_REDDIT_BUCKET, key_name)
            pieces = logFile.split('-')
            year = pieces[1][-4:]
            month = pieces[2]
            # skip every dump older than the configured starting year/month
            if (int(year), int(month)) < (from_year, from_month):
                continue
            process_log(logFile, year, month)
    sc.stop()
# This is an edited version of https://github.com/minhptx/iswc-2016-semantic-labeling, which was edited to use it as a baseline for Tab2KG (https://github.com/sgottsch/Tab2KG).
import os

from gensim.models import Word2Vec
from pyspark import SparkContext, SQLContext

# Shared Spark entry points for the whole package.
sc = SparkContext()
sql_context = SQLContext(sc)

# Directory that contains this module; data folders are resolved against it.
root_dir = os.path.dirname(os.path.realpath(__file__))
data_dir = os.path.join(root_dir, "data/datasets")
train_model_dir = os.path.join(root_dir, "data/train_models")

# word2vec = Word2Vec.load_word2vec_format(os.path.join("/Users/minhpham/tools/", 'GoogleNews-vectors-negative300.bin'), binary=True)

# Debug output file, opened at import time and left open for other modules.
file_write = open('debug.txt', 'w')

# Silence the chatty JVM-side loggers: only FATAL messages get through.
logger = sc._jvm.org.apache.log4j
for _noisy_logger in ("org", "akka"):
    logger.LogManager.getLogger(_noisy_logger).setLevel(logger.Level.FATAL)
select_userlogin_repeat = "select get_date(login_time) as day,hos_id,mac,login_time from wxcity_userlogin_info where login_time >= '%s 00:00:00' and login_time <='%s 23:59:59' order by day,hos_id,mac,login_time" select_userlogin_repeat_sta = "select day,hos_id,sum(t2),sum(t5),sum(t10),sum(t30),sum(t60) from repeat_login_list group by day,hos_id" if __name__ == '__main__': if len(sys.argv) != 5: print("Usage: spark_streaming.py <master> <begin> <end> <input>", file=sys.stderr) exit(-1) master, time_begin, time_end, input = sys.argv[1:] input_path = input + '/' + time_begin + '.csv' logger.info("--->" + master + " " + input_path) sc = SparkContext(master, 'wxcity_userlogin_repeat_app') sql_context = SQLContext(sc) lines = sc.hadoopFile(input, 'org.apache.hadoop.mapred.TextInputFormat', 'org.apache.hadoop.io.LongWritable', 'org.apache.hadoop.io.Text' ) rs_tuples = MysqlDao().findWithQuery(ConfigPortalSql.select_mysql_hos_gw_sup) gwid_hosid_dict = {} for r in rs_tuples: hos_id = str(r[0]) gw_id = r[1] gwid_hosid_dict[gw_id] = hos_id logger.debug('-->gwid_hosid:' + str(gwid_hosid_dict.__len__())) users = lines.map(lambda x: x[1].split(',')).filter(lambda x: len(x) == 17) \
from pyspark import SparkContext, SQLContext
from pyspark import SparkFiles
from pyspark.sql import *
from pyspark.sql.functions import *

sc = SparkContext(appName="phoenix-spark-sparkapi")
sqlContext = SQLContext(sc)

# Approach 1 - Using the Spark driver
zkUrl = 'zookeeper.example.com:2181/hbase-secure'

# Load the Phoenix table through the phoenix-spark data source.
_phoenix_reader = sqlContext.read.format('org.apache.phoenix.spark')
_phoenix_reader = _phoenix_reader.option('table', 'schema.table')
_phoenix_reader = _phoenix_reader.option('zkUrl', zkUrl)
df_phoenixTable = _phoenix_reader.load()

## Use Spark API to filter
_col1_matches = df_phoenixTable['col1'] == lit('98765')
_col2_matches = df_phoenixTable['col2'] == 'A'
_col3_matches = df_phoenixTable['col3'] == lit('123456')
df_phoenixTable_filtered = df_phoenixTable.filter(
    _col1_matches & _col2_matches & _col3_matches)
df_phoenixTable_filtered.show()

## Use SQL to filter
df_phoenixTable.registerTempTable("df_phoenixTable")
df_phoenixTable_sql = sqlContext.sql(
    "SELECT col1,col2,col3 FROM df_phoenixTable "
    "WHERE col1=98765 AND col2='A' AND col3='123456'")
df_phoenixTable_sql.show()

# Sample upsert back to Phoenix
_phoenix_writer = df_phoenixTable_sql.write.format('org.apache.phoenix.spark')
_phoenix_writer = _phoenix_writer.mode('overwrite')
_phoenix_writer = _phoenix_writer.option('table', 'schema.targetTable')
_phoenix_writer = _phoenix_writer.option('zkUrl', zkUrl)
_phoenix_writer.save()
# Day stamp (YYYYMMDD) derived from pro_time, which is set before this fragment.
ymd = pro_time.strftime("%Y%m%d")
master = "spark://hadoop:7077"
appName = "spark_loginflowlog"
#input = "/impala/parquet/back/back-portal-loginflowlog/dat=%s*" % ym
# NOTE: `input` shadows the builtin of the same name.
input = '/input/loginfowlog/*'
spark_home = '/opt/cloud/spark'
os.environ['SPARK_HOME'] = spark_home
conf = (SparkConf()
        .setMaster(master)
        .setAppName(appName)
        .set("spark.sql.parquet.binaryAsString","true")
        )
sc = SparkContext(conf = conf)
sql_context = SQLContext(sc)
# Expose normal_mac to SQL as to_mac(); it is applied to the upper-cased usermac below.
sql_context.registerFunction("to_mac", lambda x: normal_mac(x), StringType())
parquet_df = sql_context.read.parquet(input)
sql_context.registerDataFrameAsTable(parquet_df, "loginflowlog")
#_sql = "select to_mac(upper(usermac)),count(distinct dat) days from loginflowlog group by to_mac(upper(usermac))"
# NOTE(review): the column is aliased `days` but counts distinct logtime
# values, unlike the commented-out variant that counts distinct dat - confirm.
_sql = "select to_mac(upper(usermac)),count(distinct logtime) days from loginflowlog group by to_mac(upper(usermac))"
rs_df = sql_context.sql(_sql)
# collect() pulls the whole aggregated result onto the driver.
rs = rs_df.collect()
logger.info("---->" + str(len(rs)))
lists = []
for r in rs:
    # r[0] is the normalised MAC, r[1] the distinct count from the query above.
    usermac = r[0]
    days = r[1]
    t = (usermac,days)
"""
Given an RDD of dictionaries and a column to decile, add decile of column to
each row's dictionary by pre-computing decile thresholds to avoid out of
memory errors when sorting an RDD with too many columns.
"""
from pyspark import SparkContext, SQLContext#, HiveContext
from datetime import datetime

sc = SparkContext()
sqlContext = SQLContext(sc)
# hiveContext = HiveContext(sc)
sc.setLogLevel("FATAL")

#
# setup
#
import numpy as np

# create random data
n = 52
# Prices are 5 plus the absolute value of a scaled standard-normal draw.
prices = [float(5 + abs(np.random.randn()) * 100) for _ in range(n)]
# np.random.randint excludes its upper bound, so use 13 and 29 to cover every
# month (1-12) and every day that is valid in all months (1-28).
dates = [datetime(year=np.random.randint(2000, 2016),
                  month=np.random.randint(1, 13),
                  day=np.random.randint(1, 29)).date()
         for _ in range(n)]
groups = [np.random.randint(1, 100) for _ in range(n)]
data = [{"price": price, "date": _date, "group": group}
        for price, _date, group in zip(prices, dates, groups)]
df = sqlContext.createDataFrame(data)
# In[47]: #from pyspark.sql import Row #from pyspark.sql.types import * # In[98]: dataRow.take(1) # In[94]: dataRow.distinct().count() # In[38]: sqlContext = SQLContext(sc) # In[ ]: dfData = sqlContext.createDataFrame(dataRow).toDF("basket_hash", "baskey_key", "payload") # In[40]: dfData.show() # In[41]: dfData.printSchema() # In[43]:
# -*- coding: utf-8 -*- """ Created on Tue Jun 21 09:31:37 2016 @author: rsk """ from pyspark import SparkContext from pyspark import SQLContext sc = SparkContext("local","recommendationEngineApp") sqlContext = SQLContext(sc) from pyspark.sql import SQLContext,Row #from pyspark.sql import Functions as F dataDir = "/home/rsk/Documents/Spark" userData = sc.textFile(dataDir+"/ml-100k/u.user").map(lambda x : x.split("|")) movieData = sc.textFile(dataDir+"/ml-100k/u.item").map(lambda x : x.split("|")) ratingData = sc.textFile(dataDir+"/ml-100k/u.data").map(lambda x : x.split("\t")) #%% ratingDataDF = ratingData.map(lambda x : Row(userID = int(x[0]), movieID = int(x[1]), rating=float(x[2]), timestamp = int(x[3]))) ratingDataDF = sqlContext.createDataFrame(ratingDataDF) userDataDF = userData.map(lambda x : Row(userID=int(x[0]), age = int(x[1]), gender = x[2],
if __name__ == '__main__':
    # --set datetime
    # Process the partition of DAY_OFFSET days ago (yesterday by default).
    DAY_OFFSET = 1
    now = datetime.datetime.now()
    pro_time = now - datetime.timedelta(days=DAY_OFFSET)
    day = pro_time.strftime("%Y%m%d")
    master = "spark://hadoop:7077"
    appName = "spark_pageflow_outflow"
    # NOTE: `input` shadows the builtin of the same name.
    input = "/impala/parquet/site/site-pageflowv1/dat=%s" % day
    spark_home = '/opt/cloud/spark'
    os.environ['SPARK_HOME'] = spark_home
    sc = SparkContext(master, appName)
    sql_context = SQLContext(sc)
    # SQL wrappers around the mill_date_str / bytearray_str helpers.
    sql_context.registerFunction("to_day", lambda x: mill_date_str(x), StringType())
    sql_context.registerFunction("to_str", lambda x: bytearray_str(x), StringType())
    parquet_df = sql_context.read.parquet(input)
    sql_context.registerDataFrameAsTable(parquet_df, "site_pageflowv1")
    # Per (url, day): pv = row count, uv = distinct guuid count, restricted to
    # rows whose name is 'outflow'. `day` is generated above, not user input.
    _sql = "select to_str(url),to_day(createtime) day,count(1) pv,count(distinct to_str(guuid)) uv " \
           "from site_pageflowv1 where dat= %s and to_str(name)='outflow' " \
           "group by to_str(url),to_day(createtime)" % day
    rs_df = sql_context.sql(_sql)
    # collect() pulls the whole aggregated result onto the driver.
    rs = rs_df.collect()
    logger.info("---->" + str(len(rs)))
    # NOTE: `list` shadows the builtin of the same name.
    list = []
from pyspark.sql.functions import udf
from pyspark.sql import Row


def toInt(s):
    """Encode a string as the integer formed by concatenating its code points.

    Each character contributes the decimal digits of ``ord(char)``; for
    example ``"ab"`` becomes ``9798``.

    :param s: value to encode
    :return: the encoded integer, or ``None`` when ``s`` is not a ``str`` or
        is empty (an empty string has no digits to parse)
    """
    if not isinstance(s, str) or not s:
        return None
    return int(''.join(str(ord(ch)) for ch in s))


if __name__ == '__main__':
    # Local import: only this demo needs LongType.
    from pyspark.sql.types import LongType

    conf = pyspark.SparkConf()
    sc = pyspark.SparkContext.getOrCreate(conf=conf)
    spark = SQLContext(sc)
    schema = StructType([
        StructField("sales", FloatType(), True),
        StructField("employee", StringType(), True),
        StructField("ID", IntegerType(), True)
    ])
    data = [[10.2, "Fred", 123]]
    df = spark.createDataFrame(data, schema=schema)
    # The encoded value for "Fred" (70114101100) does not fit a 32-bit
    # IntegerType, and Spark turns out-of-range UDF results into null, so the
    # UDF is declared as LongType. Names longer than roughly six characters
    # still exceed 64 bits and will come back as null.
    colsInt = udf(lambda z: toInt(z), LongType())
    spark.udf.register("colsInt", colsInt)
    df2 = df.withColumn('semployee', colsInt('employee'))
from pyspark import SparkContext, SparkConf, SQLContext

conf = SparkConf().setAppName("pyspark-readFromJSONinHDFS-py")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# SQLContext.jsonFile was deprecated in Spark 1.4 and removed in 2.0;
# the DataFrameReader API is its replacement.
departmentsJson = sqlContext.read.json(
    "/user/joseluisillana1709/department_json/department.json")
departmentsJson.registerTempTable("departmentsTable")
departmentsData = sqlContext.sql("select * from departmentsTable")

# collect() brings every row to the driver before printing.
for rec in departmentsData.collect():
    print(rec)

# Writing data in json format
departmentsData.toJSON().saveAsTextFile(
    "/user/joseluisillana1709/pruebas_spark/result/departmentsJson")
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql.types import StructType, StructField, StringType

conf = SparkConf().setMaster("local").setAppName("CustomerOrders")
# SQLContext needs a SparkContext *instance*; the original passed the class
# itself and never used the configuration built above.
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# NOTE(review): read.json resolves paths through Hadoop file systems; an
# https:// URL may not be readable without extra setup - confirm.
df = sqlContext.read.json(
    "https://s3-eu-west-1.amazonaws.com/dwh-test-resources/recipes.json")
df.show()
from pyspark import SQLContext, SparkConf, SparkContext
from pyspark.sql.types import *

# Spark configuration: local master with 4 threads and 1g of executor memory.
conf = SparkConf()
conf.setMaster("local[4]")
conf.setAppName("convert.py")
conf.set("spark.executor.memory", "1g")

# Spark context built from that configuration, plus the SQL context on top.
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Read the parquet data and expose it to SQL.
# Change this path to the directory where the parquet file exists.
parquetFile = sqlContext.read.parquet('data/test2')
parquetFile.registerTempTable("parquetFile")

# A query is the fixed SELECT prefix (base) followed by the condition
# (command); the condition carries its own leading space.
base = "SELECT * FROM parquetFile WHERE"
command = ' ip_len >= 1500'
_query = "".join((base, command))
test = sqlContext.sql(_query)
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json
from datetime import tzinfo, datetime
import pytz
import re
from pyspark import SparkContext, SQLContext

sc = SparkContext()
sqlContext = SQLContext(sc)
sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy")

###############################################


def _title_rows(record):
    """Return one (id, title) Row per entry of record.sections."""
    return [Row(id=record.id, title=name) for name in record.sections]


def _combine(left, right):
    """Reducer shared by both reduceByKey calls (sum / list concatenation)."""
    return left + right


# One row per (document id, section title).
sections_dataframe = sqlContext.createDataFrame(
    sections_titles.flatMap(_title_rows))

# Occurrences of each section title.
_title_ones = sections_dataframe.map(lambda row: (row.title, 1))
_title_totals = _title_ones.reduceByKey(_combine)
counts_rdd = _title_totals.map(
    lambda pair: Row(sections_title=pair[0], count=pair[1]))

# Keep only the titles that occur more than 10 times.
counts_dataframe = sqlContext.createDataFrame(counts_rdd).filter("count > 10")

# Restrict the (id, title) rows to those titles ...
_same_title = sections_dataframe.title == counts_dataframe.sections_title
joined = sections_dataframe.join(counts_dataframe, _same_title)

# ... and gather the surviving titles per document id.
_id_titles = joined.map(lambda row: (row.id, [row.title]))
filtered = _id_titles.reduceByKey(_combine)
class DataHandler:
    """Spark jobs that derive customer analytics tables from the MySQL source.

    Covers the customer life-cycle stage, the customer value/rank and the
    total assets / total debt tables. Data is read over JDBC into Spark
    DataFrames and the results are written back through ``MySQLHelper``.
    """

    # Exclusive upper bounds of customer ranks 0..5; anything above is rank 6.
    _RANK_UPPER_BOUNDS = (1000, 10000, 100000, 500000, 2000000, 5000000)

    def __init__(self):
        self.conf = (SparkConf()
                     .setAppName("BandCard")
                     .set("spark.cores.max", "2")
                     .set('spark.executor.extraClassPath',
                          '/usr/local/env/lib/mysql-connector-java-5.1.38-bin.jar'))
        self.sc = SparkContext(conf=self.conf)
        self.sqlctx = SQLContext(self.sc)
        self.mysql_helper = MySQLHelper('core', host='10.9.29.212')

    def load_from_mysql(self, table, database='core'):
        """Load a MySQL table as a DataFrame through the JDBC data source.

        :param table: table name
        :param database: database (schema) name
        :return: DataFrame over the table
        """
        url = "jdbc:mysql://10.9.29.212:3306/%s?user=root&characterEncoding=UTF-8" % database
        df = self.sqlctx.read.format("jdbc").options(url=url, dbtable=table,
                                                     driver="com.mysql.jdbc.Driver").load()
        return df

    @staticmethod
    def _quarter_months(year, season):
        """Return ``(current, previous)`` lists of 'YYYY-MM' month labels.

        ``current`` holds the three months of quarter ``season`` of ``year``;
        ``previous`` holds the three months of the quarter before it, which is
        Q4 of the previous year when ``season`` is 1. Months are always
        zero-padded (the original code produced 'YYYY-9' for September).
        """
        last_month = 3 * season
        current = ['{}-{:02d}'.format(year, m)
                   for m in range(last_month - 2, last_month + 1)]
        if season == 1:
            previous = ['{}-{:02d}'.format(year - 1, m) for m in (10, 11, 12)]
        else:
            previous = ['{}-{:02d}'.format(year, m)
                        for m in range(last_month - 5, last_month - 2)]
        return current, previous

    @staticmethod
    def _calculate_rank(value):
        """Map a customer value to its rank (0..6) using the rank bounds."""
        for rank, upper in enumerate(DataHandler._RANK_UPPER_BOUNDS):
            if value < upper:
                return rank
        return len(DataHandler._RANK_UPPER_BOUNDS)

    def _write_batches(self, sql, rows, batch_size=1000):
        """Send ``rows`` to ``mysql_helper.executemany`` in fixed-size batches.

        Every row is written, including a final batch of exactly
        ``batch_size`` rows, and nothing is sent when ``rows`` is empty.

        :param sql: parameterised statement
        :param rows: iterable of parameter tuples
        :param batch_size: maximum number of rows per ``executemany`` call
        """
        batch = []
        for params in rows:
            batch.append(params)
            if len(batch) >= batch_size:
                self.mysql_helper.executemany(sql, batch)
                batch = []
        if batch:
            self.mysql_helper.executemany(sql, batch)

    def prepare_life_cycle(self, year, season):
        '''
        Prepare the life-cycle data.
        Reads the AUM data from t_CMMS_ASSLIB_ASSET_c and fills
        t_CMMS_TEMP_LIFECYCLE with:
            saum1 (last season sum aum)
            saum2 (current season sum aum)
            aum_now
            account_age (months)
            last_tr_date (days)
        :param year:
        :param season: 1,2,3,4
        :return:
        '''
        print('----------------------生命周期-Start----------------------')
        print('开始准备生命周期数据...')
        print('开始计算月份')
        # date1: months of the current quarter, date2: months of the previous one
        date1, date2 = self._quarter_months(year, season)
        print('当前季度月份 new:', date1)
        print('上一季度月份 old:', date2)

        # load the AUM table
        aum = self.load_from_mysql('t_CMMS_ASSLIB_ASSET_c').cache()

        # stitch together the three months of each quarter
        season_new = aum.filter(aum.STAT_DAT == date1[0]).unionAll(aum.filter(aum.STAT_DAT == date1[1])).unionAll(
            aum.filter(aum.STAT_DAT == date1[2]))
        season_old = aum.filter(aum.STAT_DAT == date2[0]).unionAll(aum.filter(aum.STAT_DAT == date2[1])).unionAll(
            aum.filter(aum.STAT_DAT == date2[2]))

        # AUM per customer for each quarter
        aum_season_old = season_old.select('CUST_NO', season_old.AUM.alias('AUM1')).groupBy('CUST_NO').sum('AUM1')
        aum_season_new = season_new.select('CUST_NO', season_new.AUM.alias('AUM2')).groupBy('CUST_NO').sum('AUM2')

        # outer join of the two quarters, e.g.
        # +-----------+---------+---------+
        # |    CUST_NO|sum(AUM2)|sum(AUM1)|
        # +-----------+---------+---------+
        # |81005329523|     null|294844.59|
        # |81011793167|     null|   365.20|
        # |81015319088|     null|  9640.96|
        # +-----------+---------+---------+
        union_season = aum_season_old.join(aum_season_new, 'CUST_NO', 'outer')

        # current AUM: AUM per (customer, month), then first() per customer
        # NOTE(review): the second sort() replaces the first one, and first()
        # after a group-by relies on input order - confirm this really picks
        # the latest month.
        temp_result = aum.select('CUST_NO', 'AUM', 'STAT_DAT').groupBy('CUST_NO', 'STAT_DAT').sum('AUM').sort(
            'CUST_NO').sort(aum.STAT_DAT.desc())
        temp_result.select('CUST_NO', temp_result['sum(AUM)'].alias('AUM'), 'STAT_DAT').registerTempTable('group_in')
        aum_now_sql = "select CUST_NO,first(AUM) as AUM_NOW from group_in group by CUST_NO"
        aum_now = self.sqlctx.sql(aum_now_sql)
        # drop the temporary table
        self.sqlctx.dropTempTable('group_in')

        union_season_aumnow = union_season.join(aum_now, 'CUST_NO', 'outer')

        # time since the account was opened (months), from the account table
        account = self.load_from_mysql('t_CMMS_ACCOUNT_LIST').cache()
        account.select('CUST_NO', 'OPEN_DAT').registerTempTable('account')
        account_age_aql = "select  CUST_NO, first(ACCOUNT_AGE) as ACCOUNT_AGE  from " \
                          "(select CUST_NO, round(datediff(now(), OPEN_DAT) / 30) as ACCOUNT_AGE " \
                          "from account order by CUST_NO, ACCOUNT_AGE desc ) as t group by CUST_NO"
        account_age = self.sqlctx.sql(account_age_aql)

        # calculate last tran date
        account_1 = account.select('CUST_NO', 'ACC_NO15')
        detail = self.load_from_mysql('t_CMMS_ACCOUNT_DETAIL').select('ACC_NO15', 'TRAN_DAT')
        a_d = account_1.join(detail, 'ACC_NO15', 'outer')
        a_d.filter(a_d.CUST_NO != '').registerTempTable('adtable')
        last_tr_date_sql = "select CUST_NO,first(TRAN_DAT) as LAST_TR_DATE from (select CUST_NO,TRAN_DAT from adtable order by TRAN_DAT desc) as t group by CUST_NO"
        last_tr_date = self.sqlctx.sql(last_tr_date_sql)

        # join season, aum_now, account_age and last_tr_date
        unions = union_season_aumnow.join(account_age, 'CUST_NO', 'outer').join(last_tr_date, 'CUST_NO', 'outer')

        # drop temporary tables and cached data
        self.sqlctx.dropTempTable('account')
        self.sqlctx.dropTempTable('adtable')
        self.sqlctx.clearCache()

        # write the result
        print('结果插入临时表:t_CMMS_TEMP_LIFECYCLE...')
        insert_lifecycle_sql = "replace into t_CMMS_TEMP_LIFECYCLE(CUST_NO,SAUM1,SAUM2,INCREASE,ACCOUNT_AGE,AUM_NOW,LAST_TR_DATE) values(%s,%s,%s,%s,%s,%s,%s)"

        def lifecycle_rows():
            for row in unions.collect():
                row_dic = row.asDict()
                saum_old = row_dic['sum(AUM1)']
                saum_new = row_dic['sum(AUM2)']
                try:
                    # growth rate between the two quarters
                    increase = (saum_new - saum_old) / saum_old
                except (TypeError, ArithmeticError):
                    # a quarter is missing (None) or the old AUM is zero
                    increase = 0

                # account age in months; unknown counts as more than 6 months
                account_age_months = row_dic['ACCOUNT_AGE']
                if account_age_months is None:
                    account_age_months = 7

                # days since the last transaction; 366 when there is none
                ltd = row_dic['LAST_TR_DATE']
                if ltd is not None:
                    try:
                        ltd = datetime.datetime.strptime(ltd, '%Y-%m-%d')
                    except ValueError:
                        # fall back to the compact YYYYMMDD form
                        ltd = ltd[:4] + '-' + ltd[4:6] + '-' + ltd[6:]
                        ltd = datetime.datetime.strptime(ltd, '%Y-%m-%d')
                    days = (datetime.datetime.now() - ltd).days
                else:
                    days = 366

                yield (row_dic['CUST_NO'], saum_old, saum_new, increase,
                       account_age_months, row_dic['AUM_NOW'], days)

        self._write_batches(insert_lifecycle_sql, lifecycle_rows())

    def calculate_life_cycle(self):
        '''
        Derive the life-cycle period of every customer from the AUM change
        stored in t_CMMS_TEMP_LIFECYCLE and write it back to the PERIOD column.
        :return:
        '''
        print('开始计算生命周期...')
        life_cycle = self.load_from_mysql('t_CMMS_TEMP_LIFECYCLE').cache()

        # Kept as a nested function so the closure shipped to Spark does not
        # capture self.
        def clcmap(line):
            cust_no = line['CUST_NO']
            account_age = line['ACCOUNT_AGE']
            last_tr_date = line['LAST_TR_DATE']
            aum_now = line['AUM_NOW']
            increase = line['INCREASE']
            # NOTE(review): prepare_life_cycle stores INCREASE as a ratio,
            # while the thresholds below are +/-20 - confirm the intended unit.
            if aum_now is None:
                period = 9  # unknown
            elif aum_now < 1000 and last_tr_date > 365:
                period = 3  # churned
            elif increase > 20 or account_age < 6:
                period = 0  # growing
            elif -20 <= increase <= 20:
                period = 1  # mature
            else:
                period = 2  # stable
            return period, cust_no

        map_result = life_cycle.map(clcmap).collect()
        # clear the life_cycle cache
        self.sqlctx.clearCache()

        print('结果更新到临时表:t_CMMS_TEMP_LIFECYCLE...')
        update_life_period_sql = "update t_CMMS_TEMP_LIFECYCLE set PERIOD = %s where CUST_NO = %s"
        self._write_batches(update_life_period_sql, map_result)

    def lifecycle_to_real_table(self, year, season):
        '''
        Copy the life-cycle data from the temporary table into
        t_CMMS_ANALYSE_LIFE, enriched with the customer id and name.
        :param year:
        :param season:
        :return:
        '''
        print('开始将生命周期数据写入正式表中...')
        life_cycle = self.load_from_mysql('t_CMMS_TEMP_LIFECYCLE').select('CUST_NO', 'PERIOD')
        cust_info = self.load_from_mysql('t_CMMS_INFO_CUSTOMER').select('CUST_NO', 'CUST_ID', 'CUST_NAM')
        union = life_cycle.join(cust_info, 'CUST_NO', 'left_outer').cache()

        sql = "replace into t_CMMS_ANALYSE_LIFE(CUST_NO,CUST_ID,CUST_NM,LIFE_CYC,QUARTER,UPDATE_TIME) values(%s,%s,%s,%s,%s,now())"
        quarter = str(year) + '-' + str(season)

        def life_rows():
            for row in union.collect():
                # customers without a matching info row get the id '0'
                cust_id = row['CUST_ID'] if row['CUST_ID'] is not None else '0'
                yield (row['CUST_NO'], cust_id, row['CUST_NAM'], row['PERIOD'], quarter)

        self._write_batches(sql, life_rows())
        self.sqlctx.clearCache()

    def run_life_cycle(self, year, season):
        '''
        Run the complete life-cycle flow:
        1 prepare the data (AUM and its change between quarters)
        2 derive the life-cycle period from the change
        3 move the data from the temporary table to the real table
        :param year:
        :param season:
        :return:
        '''
        self.prepare_life_cycle(year, season)
        self.calculate_life_cycle()
        self.lifecycle_to_real_table(year, season)

    # ---------------------------- end of life cycle ---------------------------- #

    def customer_value(self, year, half_year):
        '''
        calculate customer value
        :param year: which year to calculate
        :param half_year: 0 for month 1-6 , 1 for month 7-12
        :return:
        '''
        print('---------------------------客户价值-Start--------------------------')
        cust_info = self.load_from_mysql('t_CMMS_INFO_CUSTOMER').select('CUST_NO', 'CUST_ID', 'CUST_NAM').cache()
        aum = self.load_from_mysql('t_CMMS_ASSLIB_ASSET_c').select('CUST_NO', 'STAT_DAT', 'AUM', 'ASS_TYPE').cache()

        # union of the six monthly slices of the requested half year
        base = half_year * 6
        aum_slot_filter = None
        for offset in range(1, 7):
            slot = '{}-{:02d}'.format(year, base + offset)
            slot_filter = aum.filter(aum.STAT_DAT == slot)
            if aum_slot_filter is None:
                aum_slot_filter = slot_filter
            else:
                aum_slot_filter = aum_slot_filter.unionAll(slot_filter)

        # per customer: sum of ASS_TYPE '1' AUM, and of ASS_TYPE '2' AUM weighted 0.8
        huoqi_aum = aum_slot_filter.select('CUST_NO', 'ASS_TYPE', aum_slot_filter['AUM'].alias('AUM_HQ')).filter(
            aum_slot_filter.ASS_TYPE == '1').groupBy('CUST_NO').sum('AUM_HQ')
        dingqi_aum = aum_slot_filter.select('CUST_NO', 'ASS_TYPE',
                                            (aum_slot_filter.AUM * 0.8).alias('AUM_DQ')).filter(
            aum_slot_filter.ASS_TYPE == '2').groupBy('CUST_NO').sum('AUM_DQ')

        # both sums are ready: sum(AUM_HQ), sum(AUM_DQ)
        j = huoqi_aum.join(dingqi_aum, 'CUST_NO', 'outer')
        # j.show()

        # remove the existing data
        self.mysql_helper.execute('truncate core.t_CMMS_ANALYSE_VALUE')

        # join with the customer info
        all_col = j.join(cust_info, 'CUST_NO', 'outer')
        print(j.count(), cust_info.count())
        # all_col.show()

        print('将数据replace到正式表...')
        update_value_sql = "replace into t_CMMS_ANALYSE_VALUE(CUST_ID,CUST_NO,CUST_NM,CUST_VALUE,CUST_RANK,SLOT,UPDATE_TIME) values(%s,%s,%s,%s,%s,%s,now())"
        slot = str(year) + '-' + str(half_year)

        def value_rows():
            for row in all_col.collect():
                val_dq = row['sum(AUM_DQ)'] if row['sum(AUM_DQ)'] is not None else 0
                val_hq = row['sum(AUM_HQ)'] if row['sum(AUM_HQ)'] is not None else 0
                cust_val = float(val_dq) + float(val_hq)
                # the customer rank follows from the customer value
                cust_rank = self._calculate_rank(cust_val)
                cust_id = row['CUST_ID'] if row['CUST_ID'] is not None else 1
                yield (cust_id, row['CUST_NO'], row['CUST_NAM'], cust_val, cust_rank, slot)

        self._write_batches(update_value_sql, value_rows())

    def aum_total(self):
        '''
        Compute the AUM total per customer.
        data for t_CMMS_ASSLIB_ASSTOT
        :return:
        '''
        print('---------------------------总资产-Start--------------------------')
        # TODO t_CMMS_ASSLIB_ASSET_c should become the real table t_CMMS_ASSLIB_ASSET
        df_asset = self.load_from_mysql('t_CMMS_ASSLIB_ASSET_c').select('CUST_NO', 'CUST_ID', 'STAT_DAT', 'AUM', 'CUR',
                                                                        'ACC_NAM').cache()
        # print(df_asset.count(), df_asset.columns)
        other_col = df_asset.select('CUST_NO', 'CUST_ID', 'CUR', 'ACC_NAM').distinct()
        # print(other_col.count(),other_col.columns)
        aum = df_asset.select('CUST_NO', 'STAT_DAT', 'AUM')
        # print(aum.count(), aum.columns)
        # AUM per (customer, month), then the first entry per customer
        aum = aum.select('CUST_NO', 'STAT_DAT', 'AUM').groupBy(['CUST_NO', 'STAT_DAT']).sum('AUM').sort(
            ['CUST_NO', aum.STAT_DAT.desc()]) \
            .groupBy('CUST_NO').agg({'sum(AUM)': 'first', 'STAT_DAT': 'first'})
        # print(aum.count(), aum.columns)
        total = aum.select('CUST_NO', aum['first(sum(AUM))'].alias('AUM'), aum['first(STAT_DAT)'].alias('STAT_DAT')). \
            join(other_col, 'CUST_NO', 'left_outer').distinct()
        # total.filter(total.STAT_DAT == '2016-06-') .show()

        # prepare params (nested so the Spark closure does not capture self)
        def list_map(line):
            return line['CUST_ID'], line['CUST_NO'], line['ACC_NAM'], line['STAT_DAT'], line['CUR'], line['AUM']

        df = total.map(list_map)

        # clear old data
        self.mysql_helper.execute('truncate t_CMMS_ASSLIB_ASSTOT')
        sql = "insert into t_CMMS_ASSLIB_ASSTOT(CUST_ID,CUST_NO,ACC_NAM,STAT_DAT,CUR,AUM) values(%s,%s,%s,%s,%s,%s)"
        # execute sql
        self.mysql_helper.batch_operate(sql, df, 100)

    def debt_total(self):
        '''
        prepare data for total debt
        :return:
        '''
        print('---------------------------总负债-Start--------------------------')
        df_debt = self.load_from_mysql('t_CMMS_ASSLIB_DEBT').select('LOAN_ACC', 'CUST_NO', 'CUST_ID', 'CUST_NAM',
                                                                    'BAL_AMT', 'GRANT_AMT', 'CUR')
        df_debt = df_debt.filter(df_debt.LOAN_ACC != '')
        df_sum = df_debt.groupBy('CUST_NO').sum('GRANT_AMT', 'BAL_AMT')
        df_other = df_debt.groupBy('CUST_NO').agg({'CUST_ID': 'first', 'CUST_NAM': 'first', 'CUR': 'first'})
        df_total = df_sum.join(df_other, 'CUST_NO', 'left_outer').distinct()
        stat_dat = datetime.datetime.now().strftime('%Y%m%d')

        # nested so the Spark closure captures only stat_dat, not self
        def m(line):
            return line['CUST_NO'], line['first(CUST_ID)'], line['first(CUST_NAM)'], line['first(CUR)'], line[
                'sum(GRANT_AMT)'], line['sum(BAL_AMT)'], stat_dat

        df = df_total.map(m)
        sql = "replace into t_CMMS_ASSLIB_DEBTOT(CUST_NO,CUST_ID,ACC_NAM,CUR,LOAN_AMT,BAL_AMT,STAT_DAT) values(%s,%s,%s,%s,%s,%s,%s)"
        self.mysql_helper.batch_operate(sql, df)

    def run(self):
        """Run every job on this instance with the hard-coded 2016 periods."""
        # life cycle: year, quarter 1,2,3,4
        self.run_life_cycle(2016, 2)
        # customer value: first half year 0, second half year 1
        self.customer_value(2016, 0)
        # total assets
        self.aum_total()
        # total debt
        self.debt_total()
from pyspark import SparkContext, SQLContext
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import StringType, IntegerType

sc = SparkContext()
sqlContext = SQLContext(sc)

####
# 1. Setup (10 points): Download the gbook file and write a function to load it in an RDD & DataFrame
####

# RDD API
# Columns:
# 0: place (string), 1: count1 (int), 2: count2 (int), 3: count3 (int)

# Spark SQL - DataFrame API

####
# 5. Joining (10 points): The following program construct a new dataframe out of 'df' with a much smaller size.
####

# One nullable string column followed by three nullable integer columns.
_schema_fields = [StructField("word", StringType(), True)]
for _count_column in ("count1", "count2", "count3"):
    _schema_fields.append(StructField(_count_column, IntegerType(), True))
schema = StructType(_schema_fields)

# The gbooks file is tab separated.
df = sqlContext.read.csv('gbooks', schema=schema, sep='\t')
''' day = "20151212" master = "local[*]" spark_home = '/opt/cloud/spark' os.environ['SPARK_HOME'] = spark_home # logFile = 'hdfs://master:8020/impala/parquet/back/back-portal-loginflowlog/dat=' + day logFile = "/input/loginfowlog/02*" conf = (SparkConf() .setMaster(master) .setAppName("loginflowlog2mysql") # .set("spark.kryoserializer.buffer.mb", "256") .set("spark.sql.parquet.binaryAsString", "true")) sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) sqlContext.registerFunction("to_datestr", lambda x: longTime2str(x), StringType()) df = sqlContext.read.parquet(logFile) rdd = df.select('logintype', 'logtype', 'hosid', 'suppid', 'logtime', 'usermac') fields = [ StructField('logintype', StringType(), True), StructField('logtype', StringType(), True), StructField('hosid', StringType(), True), StructField('suppid', StringType(), True), StructField('logtime', LongType(), True), StructField('usermac', StringType(), True) ]
"""Phase calibration: read phase values from CSV, double them, write CSV."""
from pyspark import SparkContext
from pyspark import SQLContext
from pyspark.sql.types import *
from pyspark.sql import Row

sc = SparkContext("local", "PhaseCalib App")
sqlContext = SQLContext(sc)

# Single non-nullable float column named "phase".
custom_schema = StructType([StructField("phase", FloatType(), False)])
df = sqlContext.read.options(
    header='true').schema(custom_schema).csv('new_phase_data.csv')
df.describe().show()


def calibrate(row):
    """Return a one-field Row holding twice the row's phase value."""
    return Row(row['phase'] * 2)


calibrated_rdd = df.rdd.map(calibrate)
calibrated_df = sqlContext.createDataFrame(calibrated_rdd, custom_schema)
calibrated_df.describe().show()

# save() without an explicit format writes Spark's default source (parquet)
# and ignores the header option; csv() actually produces CSV with a header.
calibrated_df.write.option("header", "true").csv("calibrated_phase_data.csv")
sc.stop()
# Grid-search logger: DEBUG level, additionally written to a file under ../logs.
logging.basicConfig(format='%(asctime)s %(message)s')
nn_gridsearch = logging.getLogger('nn_gridsearch')
nn_gridsearch.setLevel(logging.DEBUG)
handler = logging.FileHandler('../logs/nn_gridsearch.txt')
nn_gridsearch.addHandler(handler)
# Two separator lines and a timestamp mark the start of this run in the log.
nn_gridsearch.debug('-'*40)
nn_gridsearch.debug('-'*40)
nn_gridsearch.debug('Execution time: %s' % str(datetime.now()))
# with open('~/.aws/credentials.json') as f:
#     CREDENTIALS = json.load(f)
sc = set_spark_context()
conn = S3Connection()
sqc = SQLContext(sc)
sm = SparkModel(sc, conn, rdd_path='rdd.pkl')
# Join the two keyed RDDs, keep (label, bow) pairs and sample half of them
# with a fixed seed. The tuple-unpacking lambda is Python 2 only syntax.
bow_rdd = sm.RDD.join(sm.target).map(lambda (key, (bow, label)): (label, bow)) \
    .sample(withReplacement=False, fraction=.5, seed=1)
df = sqc.createDataFrame(bow_rdd, ['string_label', 'raw'])
# 80/20 split with a fixed seed; despite the names these are DataFrames.
train_rdd, test_rdd = df.randomSplit([.8, .2], seed=1)
results = []
num_features = 5000
min_doc_freq = 20
# Candidate layer-size lists tried one after another by the loop below.
layers = [[5000, 2056, 512, 128, 2], [5000, 1000, 128, 2], [5000, 100, 2], [5000, 5000, 2]]
for l in layers:
    remover = StopWordsRemover(inputCol="raw", outputCol="words")
def spark_table_exists(sql_ctx: SQLContext, view: str) -> bool:
    """Tell whether ``view`` is among the names returned by ``sql_ctx.tableNames()``.

    :param sql_ctx: context whose ``tableNames()`` result is searched
    :param view: table or view name to look for
    :return: ``True`` when the name is listed, ``False`` otherwise
    """
    registered_names = sql_ctx.tableNames()
    return view in registered_names
import itertools
from math import sqrt
from operator import add
from os.path import join, isfile, dirname
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.sql.types import StructType, StructField, StringType, FloatType

# Cloud SQL connection settings.
# NOTE(review): credentials are hard-coded and end up inside the JDBC URL;
# consider moving them out of the source.
CLOUDSQL_INSTANCE_IP = '104.155.188.32'  # CHANGE (database server IP)
CLOUDSQL_DB_NAME = 'recommendation_spark'
CLOUDSQL_USER = '******'
CLOUDSQL_PWD = 'root'  # CHANGE

conf = SparkConf().setAppName("train_model")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

jdbcDriver = 'com.mysql.jdbc.Driver'
jdbcUrl = 'jdbc:mysql://%s:3306/%s?user=%s&password=%s' % (
    CLOUDSQL_INSTANCE_IP,
    CLOUDSQL_DB_NAME,
    CLOUDSQL_USER,
    CLOUDSQL_PWD,
)

# checkpointing helps prevent stack overflow errors
sc.setCheckpointDir('checkpoint/')

# Load the ratings and accommodations tables from Cloud SQL over JDBC.
dfRates = (
    sqlContext.read.format('jdbc')
    .options(driver=jdbcDriver, url=jdbcUrl, dbtable='Rating')
    .load()
)
dfAccos = (
    sqlContext.read.format('jdbc')
    .options(driver=jdbcDriver, url=jdbcUrl, dbtable='Accommodation')
    .load()
)
print("read ...")

# Train the ALS model with rank 20 and 20 iterations; these could be tuned,
# but they are reasonable choices.
model = ALS.train(dfRates.rdd, rank=20, iterations=20)
print("trained ...")
return [notebooks_path, nn['notebook']['path']] except Exception as e: if not explicit_process_name: print( 'WARN Unable to automatically extract pyspark notebook name' ) return ['', explicit_process_name or 'Unknown pyspark filename'] notebooks_path, notebook_name = get_notebook_name() except: pass spark = create_spark_kensu(project, None, "Lab", offline=OFFLINE) from pyspark import SQLContext sql = SQLContext(spark) # DEMO DATASOURCE if "DB_USER" not in os.environ or "DB_PASSWORD" not in os.environ or "DB_CONNECTION_URL" not in os.environ: print("Var env DB_USER or DB_PASSWORD or DB_CONNECTION_URL missing") notebook_segments = os.path.split(notebook_name) offline_file_name = notebook_segments[len(notebook_segments) - 1] + ".jsonl" dam = DamProvider().initDam(api_url="https://api-demo102.usnek.com", auth_token=token, process_name=notebook_name, user_name=os.environ["USER"], code_location=os.environ['DAM_CODE_REPOSITORY'], init_context=True, do_report=True,
''' DAY_OFFSET=1 #--set datetime now =datetime.datetime.now() pro_time=now-datetime.timedelta(days=DAY_OFFSET) dest_time_str=pro_time.strftime("%Y%m%d") ''' master = "spark://master:7077" sep = "\t" app_name = 'user_sign_in_app' ''' spark_home='/opt/cloud/spark' os.environ['SPARK_HOME']=spark_home ''' sc = SparkContext(master, app_name) sql_context = SQLContext(sc) lines = sc.textFile(input) parts = lines.map(lambda l: l.split(sep)).filter(lambda x: len(x) == 18) ''' portal id(*) gw_id user_id user_name login_time logout_time(*) mac ip user_agent download_flow(*) upload_flow(*) os browser ratio batch_no user_type supp_id ''' user_login = parts.map(lambda p: (p[1].strip(), p[2].strip(),p[17].strip(),p[3].strip(),p[16].strip(), p[4].strip(),p[5].strip(),p[6].strip(),p[7].strip(),p[8].strip(), p[9].strip(),p[10].strip(),p[11].strip(),p[12].strip(),p[13].strip(), p[14].strip(),p[15].strip())) schema_string = "id gw_id supp_id user_id user_type " \ "user_name login_time logout_time mac ip " \
# Derive the test name from the file name of the running __main__ module.
MODULE_NAME = os.path.basename(sys.modules['__main__'].__file__)
TEST_NAME = os.path.splitext(MODULE_NAME)[0]
LOGGER = logger.get_logger(TEST_NAME)

# Specify some constants
URLPATH1 = "s3a://dask-avro-data/application-data/app-*.avro"
URLPATH2 = "s3a://dask-avro-data/fulfillment-data/fulfillment-*.avro"

# Start
LOGGER.info('START: Creating spark conf')
# A backslash continuation cannot be followed by a comment (SyntaxError), so
# the builder chain is wrapped in parentheses to keep the per-setting notes.
Sconf = (
    SparkConf()
    .setMaster('local[4]')  # 12 on c5.9xlarge
    .set('spark.driver.memory', '4g')  # 4g on c5.9xlarge
    .set('spark.executor.memory', '6g')  # 5g on c5.9xlarge
)
sc = SparkContext(appName="my_test", conf=Sconf)
sqlContext = SQLContext(sparkContext=sc)
LOGGER.info('FINISH: Finished creating spark conf')

LOGGER.info('START: Creating spark dataframe 1')
df1 = sqlContext.read.format("com.databricks.spark.avro").load(URLPATH1)
# Keep only rows whose payload.originationCountryCode is 'CAN', then flatten
# the payload fields that are used into top-level columns.
df1 = df1.filter(df1.payload.originationCountryCode == 'CAN')
df1 = df1.selectExpr(
    "payload.applicationId as applicationId",
    "payload.creationTimestamp as creationTimestamp",
    "payload.approved as approved",
    "payload.creditLimit as creditLimit"
)
LOGGER.info('FINISH: Spark dataframe 1 created')

LOGGER.info('START: Creating spark dataframe 2')
df2 = sqlContext.read.format("com.databricks.spark.avro").load(URLPATH2)
import os
import sys

from pyspark import SQLContext
from pyspark import SparkContext

# Environment overrides kept for reference (disabled):
#os.environ["SPARK_HOME"] = "/opt/spark-1.6.1-bin-hadoop2.6"
#os.environ["HADOOP_HOME"] = "/opt/hadoop"
#os.environ["HADOOP_PREFIX"] = "/opt/hadoop"
#os.environ["HIVE_HOME"] = "/opt/hive"

# Single-threaded local context with a single shuffle partition.
sc = SparkContext(master='local[1]')
sql_context = SQLContext(sc)
sql_context.setConf("spark.sql.shuffle.partitions", "1")

# Switch the current database.
# NOTE(review): assumes the `fex_test` database is visible to this context — confirm.
sql_context.sql(""" use fex_test """)
def _aggregate_oa_by_affiliation(papers):
    """
    Aggregate open-access counts and ratios per affiliation

    Rows are de-duplicated, grouped by ``affiliationid`` and reduced to paper /
    OA / green / gold counts, then three ratio columns are appended.

    :param papers: DataFrame with affiliation columns plus ``paperid``,
        ``source_is_oa`` and ``source_oa_status``
    :return: DataFrame with one row per affiliation
    """
    def count_where(condition, alias):
        # Sum of 1/0 flags == number of rows satisfying `condition`.
        return F.sum(F.when(condition, 1).otherwise(0)).alias(alias)

    aggregated = papers.distinct() \
        .groupby("affiliationid") \
        .agg(F.first("latitude"), F.first("longitude"), F.first("normalizedname"), F.first("rank"),
             F.count("paperid").alias("count_paper"),
             count_where(F.col("source_is_oa") == True, "count_oa"),  # noqa: E712 (Column expression)
             count_where(F.col("source_oa_status") == "green", "count_green"),
             count_where(F.col("source_oa_status") == "gold", "count_gold"))

    return aggregated \
        .withColumn("source_oa_score", F.col("count_oa") / F.col("count_paper")) \
        .withColumn("source_oa_gold", F.col("count_gold") / F.col("count_oa")) \
        .withColumn("source_oa_green", F.col("count_green") / F.col("count_oa"))


def analyze(ss, cfg):
    """
    Run job

    Joins the Q1A papers with Unpaywall OA information and MAG affiliations,
    then writes per-affiliation OA statistics as CSV: once for all papers and
    once for each of three ``source_year`` windows.

    :param ss: SparkSession
    :param cfg: app configuration
    :return: None
    """
    logger = logging.getLogger(__name__)
    logger.info('Python version: {}'.format(sys.version))
    logger.info('Exporting data to support answer to dataset_selection_query_1 : What % of papers coming from a ' +
                'university are OA. This program just retrieves dois of all papers published by the input university ' +
                'and saves it to a file. Another program has to be called on top of this dataset to answer the question'
                )

    # MAG dataset to use
    db_name = "mag2020"

    # Read through the session directly; SQLContext expects a SparkContext,
    # not the SparkSession that is passed in here.
    q1a = ss.read.parquet("hdfs:///project/core/Q1A_raw")
    unpaywall = ss.read.parquet("hdfs:///project/core/unpaywall/unpaywall.parquet") \
        .withColumnRenamed("is_oa", "source_is_oa") \
        .withColumnRenamed("oa_status", "source_oa_status")

    # Left join keeps papers that have no Unpaywall record.
    q1a_source_oa = q1a.join(unpaywall, q1a.source_doi == unpaywall.doi, "left") \
        .select("source_paperid", "source_doi", "source_year", "source_is_oa", "source_oa_status") \
        .distinct()

    paperauthoraffiliations = ss.table(db_name + ".paperauthoraffiliations").select("paperid", "affiliationid")
    affiliations = ss.table(db_name + ".affiliations").select("affiliationid", "latitude", "longitude",
                                                              "normalizedname", "officialpage", "displayname",
                                                              "rank")
    authoraffiliations = paperauthoraffiliations.join(affiliations, ["affiliationid"])

    q1a_source_oa_latlon = authoraffiliations.join(
        q1a_source_oa, authoraffiliations.paperid == q1a_source_oa.source_paperid)

    # (row filter, output path) pairs; None means "all years". The predicates
    # and paths are the ones the four copy-pasted pipelines used.
    source_year = F.col("source_year")
    exports = [
        (None, "hdfs:///project/core/Q1A_sourceoa"),
        ((source_year < 2011) & (source_year >= 2006), "hdfs:///project/core/Q1A_sourceoa_2006_2011"),
        ((source_year <= 2015) & (source_year >= 2011), "hdfs:///project/core/Q1A_sourceoa_2011_2015"),
        ((source_year > 2015) & (source_year <= 2020), "hdfs:///project/core/Q1A_sourceoa_2015_2020"),
    ]
    for year_filter, output_path in exports:
        papers = q1a_source_oa_latlon if year_filter is None else q1a_source_oa_latlon.filter(year_filter)
        _aggregate_oa_by_affiliation(papers).write.csv(output_path)
logger.info(day) ''' # spark_home = '/opt/cloud/spark' os.environ['SPARK_HOME'] = spark_home #master = "spark://hadoop:7077" master = "local[1]" app_name = "spark_transferdata" sep = "\t" #input = "/data/140301_150731.csv" input = "/input/loginlog/2015" output = "/output/loginlog/2015" sc = SparkContext(master, app_name) sqlContext = SQLContext(sc) # load lines = sc.textFile(input) rdd = lines.map(lambda l: l.split(sep))\ .filter(lambda l:len(l)==11)\ .map(lambda l:(l[0],l[1],l[2],to_long(l[3]),l[4], long(l[5]),long(l[6]),l[7],l[8],l[9], to_long(l[10]))) # uid,adid,guuid,guuidctime,url,referer,hosid,gwid,ua,ip,createtime # uid,adid,guuid,createtime fields = [ StructField('uid', StringType(), True), StructField('adid', StringType(), True), StructField('guuid', StringType(), True), StructField('guuidctime', LongType(), True), StructField('url', StringType(), True),
# Path for pyspark and py4j sys.path.append("/Users/dustinchen/Documents/APP/spark-1.6.1-bin-hadoop2.6/python") sys.path.append("/Users/dustinchen/Documents/APP/spark-1.6.1-bin-hadoop2.6/python/lib/py4j-0.9-src.zip") try: from pyspark import SparkConf, SparkContext, SQLContext from pyspark.sql.functions import regexp_extract from pyspark.sql import Row except ImportError as e: print ("Can not import Spark Modules", e) if __name__ == "__main__": conf = SparkConf().setAppName("GISAPP").setMaster("local") sc = SparkContext(conf=conf) sqlContext = SQLContext(sc) nyc_shapefile = shapefile.Reader("/Users/dustinchen/Documents/APP/Resources/NY_counties_clip/NY_counties_clip.shp") """ 0 ('deletionflag', 'c', 1, 0) 1 ['objectid', 'n', 9, 0] 2 ['statefp', 'c', 2, 0] 3 ['countyfp', 'c', 3, 0] 4 ['countyns', 'c', 8, 0] 5 ['geoid', 'c', 5, 0] 6 ['name', 'c', 100, 0] ['namelsad', 'c', 100, 0] ['lsad', 'c', 2, 0] ['classfp', 'c', 2, 0] ['mtfcc', 'c', 5, 0] ['csafp', 'c', 3, 0] ['cbsafp', 'c', 5, 0]
for i in range(LINE_LENGTH): sys.stdout.write('-') print("") try: from pyspark import SparkContext from pyspark import SQLContext print ("Successfully imported Spark Modules -- `SparkContext, SQLContext`") print_horizontal() except ImportError as e: print ("Can not import Spark Modules", e) sys.exit(1) sqlContext = SQLContext(sparkContext=sc) # Loads parquet file located in AWS S3 into RDD Data Frame parquetFile = sqlContext.read.parquet("s3://jon-parquet-format/nation.plain.parquet") # Stores the DataFrame into an "in-memory temporary table" parquetFile.registerTempTable("parquetFile") # Run standard SQL queries against temporary table nations_all_sql = sqlContext.sql("SELECT * FROM parquetFile") # Print the result set nations_all = nations_all_sql.map(lambda p: "Country: {0:15} Ipsum Comment: {1}".format(p.name, p.comment_col)) print("All Nations and Comments -- `SELECT * FROM parquetFile`") print_horizontal()
from pyspark import SparkContext, SparkConf, SQLContext
import sys

# Usage: <script> <input avro path> <output text path>
conf = SparkConf().setAppName("DocSimilarity_Avro")
sc = SparkContext(conf=conf)
sqlcontext = SQLContext(sc)
sqlcontext.setConf("spark.sql.avro.compression.codec", "uncompressed")

# Load the Avro input named by the first command-line argument.
avro_reader = sqlcontext.read.format("com.databricks.spark.avro")
sim_matrix_df = avro_reader.load(sys.argv[1])
sim_matrix_rdd = sim_matrix_df.rdd

# Take the 10 rows with the largest second field (negated key => descending),
# then keep only each row's first field.
similar_docs = sim_matrix_rdd.takeOrdered(10, key=lambda record: -record[1])
similar_docs = [record[0] for record in similar_docs]

# Write the selected values to the path named by the second argument.
similar_docs_1 = sc.parallelize(similar_docs)
similar_docs_1.saveAsTextFile(sys.argv[2])
# In[2]: try: try: timespan=str(sys.argv[1]) except IndexError: print 'please pass timespan in argument' sys.exit() conf = (SparkConf().setMaster("local").setAppName("hi_report_app").set("spark.executor.memory", "1g")) sc = SparkContext(conf = conf) sc.setLogLevel("Error") sqlContext = SQLContext(sc) # In[2]: config_url='https://s3-ap-southeast-1.amazonaws.com/nlplive.humanindex.data/config.json' try: config_response=requests.get(config_url) config = json.loads(config_response.content) except: print "Cannot fetch Config......" # In[23]: try: fetch_response=requests.get(str(config['baseAPIUrl'])+'/'+str(config['version'])+'/preProcessing/GetPredictionFileJob/'+timespan+'/publisher') #check if api request is successfull or not if(fetch_response.status_code==200):
""" STEP 4 """ from pyspark.sql import * from pyspark.sql.functions import * from pyspark.sql.types import * import json from datetime import tzinfo, datetime import pytz import re from pyspark import SparkContext, SQLContext sc = SparkContext() sqlContext = SQLContext(sc) sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy") white_list = [ "itwiki", "enwiki", "dewiki", "fawiki", "nlwiki", "frwiki", "eswiki" ] links = sqlContext.read.parquet( 'hdfs:///user/piccardi/parquet/wikidata_links.parquet') def pivot_row(row): result = [] for l in white_list: result.append(Row(id=row.id, lang=l, title=row[l])) return result
import sys
import itertools
from math import sqrt
from operator import add
from os.path import join, isfile, dirname
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.sql.types import StructType, StructField, StringType, FloatType

conf = SparkConf().setAppName("app_collaborative")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

USER_ID = 0

# Example values kept for reference (disabled):
#CLOUDSQL_INSTANCE_IP = 173.194.251.148
#BEST_RANK = 20
#BEST_ITERATION = 10
#BEST_REGULATION = 0.1

# Positional command-line arguments 1-4: Cloud SQL connection settings.
CLOUDSQL_INSTANCE_IP = sys.argv[1]
CLOUDSQL_NAME = sys.argv[2]
CLOUDSQL_USER = sys.argv[3]
CLOUDSQL_PWD = sys.argv[4]

# Positional command-line arguments 5-7: numeric settings, parsed up front
# so that a malformed value fails here.
BEST_RANK = int(sys.argv[5])
BEST_ITERATION = int(sys.argv[6])
BEST_REGULATION = float(sys.argv[7])
return df def deleteColumn(df): columns_to_drop = ['_c0', '_c1', '_c2', '_c3', '_c4', '_c5'] df = df.drop(*columns_to_drop) return df if __name__ == "__main__": # create Spark context with Spark configuration conf = SparkConf().setAppName("Practica 4") sc = SparkContext(conf=conf) sql = SQLContext(sc) # Load training data df = sql.read.format("csv").load("./data_training.csv") df = castColumns(df) ignore = ['_c6'] assembler = VectorAssembler( inputCols=[x for x in df.columns if x not in ignore], outputCol='features') data = assembler.transform(df) data = data.withColumnRenamed("_c6", "label") data = deleteColumn(data) data.show()
from pyspark import SQLContext
import os
import json


# Classify a book by its evaluation score.
def name_place(name, place, price, evaluation):
    """Return "<name>,general" for scores <= 3 and "<name>,good" for scores in (3, 5].

    Any other score yields None. ``place`` and ``price`` are accepted but not used.
    """
    if evaluation <= 3:
        grade = "general"
    elif 3 < evaluation <= 5:
        grade = "good"
    else:
        return None
    return name + "," + grade


if __name__ == "__main__":
    conf = SparkConf().setMaster("local[2]").setAppName("sql_udf")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Read the JSON file and expose it as a temporary table.
    json_path = os.path.abspath("../doc/book.json")
    json_df = sqlContext.read.json(json_path)
    json_df.registerTempTable("json_book")

    # Register the user-defined function so it can be called from SQL.
    sqlContext.registerFunction("name_place", name_place)

    query = "SELECT name_place(name, place, price,evaluation) AS book_eval FROM json_book"
    evalRDD = sqlContext.sql(query)
    #bookMap = lengthRDD.map(lambda books: (books.name, books.author, books.price, books.publish, books.place))
    evalRDD.show()
import itertools
from math import sqrt
from operator import add
from os.path import join, isfile, dirname
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.sql.types import StructType, StructField, StringType, FloatType
import pandas as pd

conf = SparkConf().setAppName("app_collaborative")
sc = SparkContext(conf=conf)
sc.setCheckpointDir('checkpoint/')
sqlContext = SQLContext(sc)


def howFarAreWe(model, against, sizeAgainst):
    """Return sqrt(sum of squared prediction errors / sizeAgainst).

    ``against`` holds records whose first three fields are cast to int and
    used as (key part 1, key part 2, rating). ``model.predictAll`` is asked
    for the same key pairs, and predictions are matched to ratings by key.
    This is the RMSE when ``sizeAgainst`` equals the number of matched pairs
    (presumably the caller passes the record count — verify).
    """
    def as_pair(record):
        return (int(record[0]), int(record[1]))

    pairs_only = against.map(lambda record: as_pair(record))
    rating_by_pair = against.map(lambda record: (as_pair(record), int(record[2])))
    predicted_by_pair = model.predictAll(pairs_only).map(
        lambda pred: ((pred[0], pred[1]), pred[2]))

    # Inner join on the key pair gives (predicted, actual) tuples.
    predicted_vs_actual = predicted_by_pair.join(rating_by_pair).values()
    squared_error_total = predicted_vs_actual.map(
        lambda both: (both[0] - both[1]) ** 2).reduce(add)
    return sqrt(squared_error_total / float(sizeAgainst))