        select a.customer_no as customer_no,
               b.industry as stock_type,
               sum(a.business_balance) as total_exchange_amount
        from ''' + db_name + '''.fact_cust_stock_detail a
        left outer join ''' + db_name + '''.dim_stock b
            on a.stock_no=b.stock_no
        where a.part_date>=''' + six_months_ago + '''
          and a.part_date<=''' + run_date + '''
          and a.entrust_bs='买入'
        group by a.customer_no, b.industry
    ) c
''')
industry_prefer_detail.registerTempTable('industry_prefer_detail_table')  # register as a temp table
hqlContext.cacheTable('industry_prefer_detail_table')

# Top N preferred industries per customer
industry_prefer_topN = hqlContext.sql('''
    select customer_no,
           concat_ws(',', collect_list(stock_type)) as stock_industry_n
    from industry_prefer_detail_table
    where rank<=3
    group by customer_no
''')
industry_prefer_topN.registerTempTable('industry_prefer_topN_table')  # register as a temp table

# Holding ratio: average holding ratio over the last 6 months
stock_hold_ratio = hqlContext.sql('''
    select a.customer_no as customer_no,
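# Note: the outer query that adds the "rank" column consumed by the top-N query above is not
# shown in this fragment. A hedged sketch of what it likely looks like, mirroring the
# row_number() pattern used for the stock-preference detail elsewhere in this file:
#
#     select customer_no, stock_type, total_exchange_amount,
#            row_number() over (partition by customer_no
#                               order by total_exchange_amount desc) as rank
#     from ( ... ) c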
import os

from pyspark import SparkContext, SparkConf, SQLContext, HiveContext

if __name__ == "__main__":
    localClusterURL = "local[2]"
    clusterMasterURL = "spark://Master:7077"

    print(os.environ['SPARK_HOME'])
    print(os.environ['HADOOP_HOME'])

    conf = SparkConf().setAppName("RatingData").setMaster(clusterMasterURL)
    sc = SparkContext.getOrCreate(conf)
    sqlContext = SQLContext(sc)
    hc = HiveContext(sc)

    hc.cacheTable('ratings')  # cacheTable returns None, so the result is not assigned
    count = hc.sql("select count(*) from ratings").first()[0]

    # Split the data into training and test sets
    percent = 0.6
    trainingDataCount = int(count * percent)
    testDataCount = int(count * (1 - percent))

    # Rating data ordered by timestamp, ascending
    trainingDataASC = hc.sql('select userId,movieId,rating from ratings order by ts asc')
    trainingDataASC.write.mode('overwrite').saveAsTable('trainingDataASC')

    # Rating data ordered by timestamp, descending
    trainingDataDESC = hc.sql('select userId,movieId,rating from ratings order by ts desc')
    trainingDataDESC.write.mode('overwrite').saveAsTable('trainingDataDESC')
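    # The snippet stops before the split is materialized. A hedged sketch of the follow-up,
    # mirroring the limit-based pattern used in DataSizer.sizerData further below; the
    # trainingData / testData table names here are assumptions, not from the original.
    trainingData = hc.sql('select * from trainingDataASC limit %d' % trainingDataCount)
    trainingData.write.mode('overwrite').saveAsTable('trainingData')

    testData = hc.sql('select * from trainingDataDESC limit %d' % testDataCount)
    testData.write.mode('overwrite').saveAsTable('testData')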
    where part_date>=''' + one_year_ago + '''
      and part_date<=''' + run_date + '''
''').rdd  # the result set after collect() is of type list

# Collapse each (customer, stock, type) key's daily positions into start/end holding dates
# (see the parse_s_e_date sketch after this fragment)
s_e_date_rdd = stock_hold_cur_detail. \
    map(lambda row_item: (row_item['customer_no'] + ',' + row_item['stock_no'] + ',' + row_item['stock_type'],
                          row_item['init_date'] + ':' + str(row_item['current_qty']))). \
    reduceByKey(lambda x, y: x + ' ' + y). \
    flatMap(lambda i: parse_s_e_date(i))

# A DataFrame can be created from a list, tuple, or RDD
s_e_date_df = hqlContext.createDataFrame(s_e_date_rdd, [
    'customer_no', 'stock_no', 'stock_type', 'stock_start_date', 'stock_end_date'])
s_e_date_df.registerTempTable("stock_hold_date_table")  # register all holding info as a table
hqlContext.cacheTable('stock_hold_date_table')

# Preferred stock detail
stock_prefer_detail = hqlContext.sql('''
    select a.customer_no as customer_no,
           a.stock_no as stock_no,
           a.stock_name as stock_name,
           row_number() over (partition by customer_no order by total_exchange_amount) as rank
    from (
        select customer_no,
               stock_no,
               stock_name,
               sum(business_balance) as total_exchange_amount
        from ''' + db_name + '''.fact_cust_stock_detail
        where part_date>=''' + one_year_ago + '''
    part_date <= ''' + run_date + '''
      and business_type='证券买卖'
    ) a
    left outer join (
        select stock_no, close_price, init_date
        from ''' + db_name + '''.fact_stock_market_detail
        where init_date >= ''' + half_year_ago + '''
          and init_date <= ''' + run_date + '''
    ) b
    on a.stock_no = b.stock_no and a.init_date = b.init_date
''')
pmgz_detail.registerTempTable('pmgz_detail_table')
hqlContext.cacheTable('pmgz_detail_table')

# Buy-side market perception (computed only for customers who have buy transactions;
# customers with no buy transactions should get a perception of 0, same below)
market_perception_bought = hqlContext.sql('''
    select customer_no,
           count(case when business_price < close_price then 1 else null end)/count(customer_no) as percep_in
    from pmgz_detail_table
    where entrust_bs = '买入'
    group by customer_no
''')
market_perception_bought.registerTempTable('market_perception_bought_table')

# Sell-side market perception
market_perception_sold = hqlContext.sql('''
    select customer_no,
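# parse_s_e_date, used in the s_e_date_rdd flatMap above, is not defined in this fragment.
# A minimal sketch of what it might do, assuming each input item has the shape
# ("customer_no,stock_no,stock_type", "init_date1:qty1 init_date2:qty2 ...") and the goal is
# one (customer_no, stock_no, stock_type, start_date, end_date) tuple per holding period:
def parse_s_e_date(item):
    key, value = item
    customer_no, stock_no, stock_type = key.split(',')
    # sort the daily (date, qty) pairs by date
    records = sorted(p.split(':') for p in value.split(' '))
    results, start_date = [], None
    for idx, (init_date, qty) in enumerate(records):
        holding = float(qty) > 0
        if holding and start_date is None:
            start_date = init_date  # a holding period opens
        if start_date is not None and (not holding or idx == len(records) - 1):
            # the period closes when the position drops to zero or the data ends
            results.append((customer_no, stock_no, stock_type, start_date, init_date))
            start_date = None
    return results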
class DataSizer:
    def __init__(self):
        self.localClusterURL = "local[2]"
        self.clusterMasterURL = "spark://Master:7077"
        self.conf = SparkConf().setAppName('DataSizer').setMaster(self.localClusterURL)
        self.sc = SparkContext.getOrCreate(self.conf)
        self.sqlContext = SQLContext(self.sc)
        self.hc = HiveContext(self.sc)
        self.jdbcURL = "jdbc:mysql://Master:3306/recommend?useUnicode=true&characterEncoding=utf-8&useSSL=false"
        self.prop = {
            'driver': 'com.mysql.jdbc.Driver',
            'user': '******',
            'password': '******'
        }

        # Location of the user/rating/links/tags files in HDFS,
        # i.e. the archive path of the raw recommendation data
        self.hdfs_data_path = 'hdfs://Master:9000/movie/data/'
        self.movies_path = self.hdfs_data_path + 'movies.txt'
        self.ratings_path = self.hdfs_data_path + 'ratings.txt'
        self.links_path = self.hdfs_data_path + 'links.txt'
        self.tags_path = self.hdfs_data_path + 'tags.txt'

        # MySQL tables holding the various result data
        self.default5Table = 'MovieSizer.operation_default5recommend'
        self.top5Table = 'MovieSizer.oertion_top5recomm'
        self.alsTable = 'MovieSizer.movies_alsTab'
        self.similarTable = 'MovieSizer.movies_movidesimilar'
        self.usesrTable = 'MovieSizer.usesr_userprofile'
        self.ratingTable = 'MovieSizer.operation_rating'
        self.movieTab = 'MovieSizer.movies_movieinfo'
        self.tagTab = 'MovieSizer.movies_movieinfo_typelist'

        # The number of RDD partitions is best set to an integer multiple of the
        # CPU cores the cluster allocates to the application.
        self.minPartitions = 8

    def sizerData(self):
        self.hc.cacheTable('ratings')  # cacheTable returns None, so the result is not assigned
        count = self.hc.sql("select count(*) from ratings").first()[0]

        # Split the data into training and test sets
        percent = 0.6
        trainingDataCount = int(count * percent)
        testDataCount = int(count * (1 - percent))

        # Ratings ordered by time ascending, deduplicated by timestamp:
        # keep each user's most recent rating for each movie
        # trainingDataASC = self.hc.sql('select userId,movieId,rating from ratings order by ts asc')
        trainingDataASC = self.hc.sql(
            'select id, user_id, movie_id, rating, ds from '
            '(select *, row_number() over (partition by user_id, movie_id order by ds desc) num '
            'from moviesizer.ratings) t where t.num=1 order by ds'
        )
        # trainingDataASC.show()
        trainingDataASC.write.mode('overwrite').saveAsTable('moviesizer.trainingDataASC')

        # Ratings ordered by time descending
        trainingDataDESC = self.hc.sql(
            'select id, user_id, movie_id, rating, ds from '
            '(select *, row_number() over (partition by user_id, movie_id order by ds desc) num '
            'from moviesizer.ratings) t where t.num=1 order by ds desc'
        )
        trainingDataDESC.write.mode('overwrite').saveAsTable('moviesizer.trainingDataDESC')

        # 60% of the data is used to train the model
        trainingData = self.hc.sql('select * from trainingDataAsc limit %d' % trainingDataCount)
        trainingData.write.mode('overwrite').saveAsTable('moviesizer.trainingData')

        # 40% is used to test the model
        testData = self.hc.sql('select * from trainingDataDesc limit %d' % testDataCount)
        testData.write.mode('overwrite').saveAsTable('moviesizer.testData')
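# Minimal usage sketch (assumed; the original fragment does not show an entry point):
if __name__ == "__main__":
    sizer = DataSizer()
    sizer.sizerData()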
# where part_date>'''+six_month_ago+''' and
#       part_date<='''+run_date+''') m
#     ) n
#     where n.rn<=1
# ''')

# Total number of trading days per customer over the last six months
cust_hy_days = hqlContext.sql('''
    select customer_no,
           count(distinct init_date) as cust_hy
    from ''' + db_name[0] + '''.Fact_Cust_Fund_Detail
    where part_date>''' + six_month_ago + '''
      and part_date<=''' + run_date + '''
    group by customer_no
''')
cust_hy_days.registerTempTable('cust_hy_days_table')
hqlContext.cacheTable('cust_hy_days_table')

# Customer activity: weighted sum of min-max normalized turnover rate, trading days and financing frequency
activity_df = hqlContext.sql('''
    select zczzl.customer_no,
           0.5*nvl(zczzl.turnover_rate_year,0) + 0.2*nvl(hyts.hy,0) + 0.3*nvl(rzpl.fin_frequency,0) as cust_activity
    from (
        select t1.customer_no,
               (t1.cust_hy-t2.min_hy)/(t2.max_hy-t2.min_hy) as hy
        from cust_hy_days_table t1,
             (select min(cust_hy) as min_hy, max(cust_hy) as max_hy from cust_hy_days_table) t2
    ) hyts
    full outer join (
        select c1.customer_no,
               (c1.turnover_rate_year-c2.min_try)/(c2.max_try-c2.min_try) as turnover_rate_year
        from (
            select customer_no,
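# For reference, the min-max normalization used in the SQL above, written as plain Python
# (a sketch; the function and variable names are illustrative, not from the original):
def min_max(value, min_v, max_v):
    # map value into [0, 1]; guard against a degenerate range
    return 0.0 if max_v == min_v else float(value - min_v) / (max_v - min_v)

# cust_activity = 0.5*min_max(turnover_rate_year, min_try, max_try) \
#               + 0.2*min_max(cust_hy, min_hy, max_hy) \
#               + 0.3*min_max(fin_frequency, min_freq, max_freq)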
def load_data():
    # load data from files and return query results / aggregates
    hiveContext = HiveContext(sc)

    # 1027
    # path = '/home/brandon/PycharmProjects/markov_chain/data/raw_tx/'
    # path = '/home/brandon/PycharmProjects/markov_chain/data/raw_tx_fraud/train/'
    # AMAZON AWS EMR
    path = 'hdfs:///tmp/files/'  # HDFS

    # new segment files
    tx_files = [path + 'l_adults_2550_female_rural.csv', path + 'l_adults_2550_female_urban.csv',
                path + 'l_adults_2550_male_rural.csv', path + 'l_adults_2550_male_urban.csv',
                path + 'l_young_adults_female_rural.csv', path + 'l_young_adults_female_urban.csv',
                path + 'l_young_adults_male_rural.csv', path + 'l_young_adults_male_urban.csv',
                path + 'l_adults_50up_female_rural.csv', path + 'l_adults_50up_female_urban.csv',
                path + 'l_adults_50up_male_rural.csv', path + 'l_adults_50up_male_urban.csv']

    # small file for debugging
    # 1027
    # tx_files = [path + 's_l_male_30_40_smaller_cities.csv']
    # tx_files = [path + 'sorted_fraud_male_30_40_smaller_cities.csv']
    # tx_files = [path+'40_60_bigger_cities.csv',path+'40_60_smaller_cities.csv',path+'all_60_up.csv'
    #     ,path+'female_30_40_bigger_cities.csv',path+'female_30_40_smaller_cities.csv'
    #     ,path+'male_30_40_bigger_cities.csv',path+'male_30_40_smaller_cities.csv'
    #     ,path+'millenials.csv',path+'young_adults.csv']
    # 1027
    # tx_files = [path+'l_40_60_bigger_cities.csv',path+'l_40_60_smaller_cities.csv',path+'l_all_60_up.csv'
    #     ,path+'l_female_30_40_bigger_cities.csv',path+'l_female_30_40_smaller_cities.csv'
    #     ,path+'l_male_30_40_bigger_cities.csv',path+'l_male_30_40_smaller_cities.csv'
    #     ,path+'l_millenials.csv',path+'l_young_adults.csv']

    # read all segment files as one RDD with 600 partitions
    all_tx = sc.textFile(','.join(tx_files), 600)

    # 1027
    # txSchemaString = 'ssn|cc_num|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|acct_num|profile|trans_num|trans_date|trans_time|unix_time|category|amt|merchant|merch_lat|merch_long'
    txSchemaString = 'ssn|cc_num|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|acct_num|profile|trans_num|trans_date|trans_time|unix_time|category|amt|is_fraud|merchant|merch_lat|merch_long'
    txFields = [StructField(field_name, StringType(), True) for field_name in txSchemaString.split('|')]
    txFields[17] = StructField('trans_date', DateType(), True)
    txSchema = StructType(txFields)

    # drop the header line(s), then parse each pipe-delimited record
    txHeader = all_tx.filter(lambda l: "ssn|" in l)
    txNoHeader = all_tx.subtract(txHeader)
    temp_tx = txNoHeader.map(lambda k: k.split("|")).map(lambda p: (
        p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13],
        p[14], p[15], p[16], datetime.datetime.strptime(p[17], '%Y-%m-%d').date(),
        p[18], p[19], p[20], p[21], p[22], p[23], p[24], p[25]))
    h_tx_df = hiveContext.createDataFrame(temp_tx, txSchema)
    h_tx_df.registerTempTable("htx")
    hiveContext.cacheTable("htx")

    # HBASE CODE HERE
    # create dataframe with all records
    # map using hbase_process to extract record into individual components
    # and create a dictionary to store in hbase
    # h_data = hiveContext.sql("SELECT * FROM htx")
    # h_data.map(hbase_process).foreachPartition(store_full_data)

    # get cust mean time between transactions
    time_lag_eval = hiveContext.sql(
        "SELECT cc_num, unix_time, LAG(htx.unix_time) OVER (PARTITION BY htx.cc_num ORDER BY htx.unix_time) as lag_time "
        "from htx order by cc_num, unix_time asc")
    time_lag_eval.registerTempTable("ts_lag")

    user_avg_time = hiveContext.sql(
        "SELECT cc_num, AVG(unix_time - lag_time) as time_diff, "
        "percentile_approx((unix_time - lag_time),0.1) as low_bound, "
        "percentile_approx((unix_time - lag_time),0.90) as high_bound "
        "from ts_lag where lag_time is not null group by cc_num")
    user_avg_time.registerTempTable("avg_time")

    # get cust mean per category
    mean_per_cat = hiveContext.sql(
        "SELECT cc_num, category, avg(amt) as mean_exp, "
        "(avg(amt)-2*(stddev_pop(amt))) as low_bound, "
        "(avg(amt)+2*(stddev_pop(amt))) as high_bound "
        "from htx group by cc_num, category")
    mean_per_cat.registerTempTable("mean_per_cat")

    # evaluate amount for HML and time of purchase for normal/abnormal
    test = hiveContext.sql(
        # "SELECT htx.cc_num, profile, htx.category, htx.trans_date,htx.trans_time, htx.unix_time,IF(htx.amt>(2 * m.mean_exp),'H',(IF(htx.amt<(0.5 * m.mean_exp),'L','N'))) as EXP, IF(htx.category like '%_net%','N','P') as CNP, htx.amt, LAG(htx.unix_time) OVER (PARTITION BY htx.cc_num ORDER BY htx.unix_time) as lag_time from htx join mean_per_cat m on htx.cc_num=m.cc_num and m.category =htx.category")
        "SELECT htx.cc_num, profile, htx.category, htx.trans_date, htx.trans_time, htx.unix_time, "
        "IF(htx.amt>m.high_bound,'H',(IF(htx.amt < m.low_bound,'L','N'))) as EXP, "
        "IF(cast(SUBSTR(htx.trans_time,0,2) as int)<05,'A',IF(cast(SUBSTR(htx.trans_time,0,2) as int)>21,'A','N')) as NAT, "
        "htx.amt, LAG(htx.unix_time) OVER (PARTITION BY htx.cc_num ORDER BY htx.unix_time) as lag_time "
        "from htx join mean_per_cat m on htx.cc_num=m.cc_num and m.category =htx.category")
    test.registerTempTable("full_table")

    # evaluate for transaction time (HML)
    full_data = hiveContext.sql(
        "SELECT full_table.cc_num, profile, category, trans_date, trans_time, unix_time, lag_time, "
        "IF(lag_time is null,100000,unix_time-lag_time) as time_since, amt, EXP, NAT, "
        "IF((unix_time-lag_time)<avg_time.low_bound,'H',IF((unix_time-lag_time)>avg_time.high_bound,'L','N')) as VEL "
        "from full_table left join avg_time on avg_time.cc_num = full_table.cc_num")
    full_data.registerTempTable("full_data")

    # return full tx data for user with reduced HML/AN/HML variables
    per_cust_transactions = hiveContext.sql(
        "SELECT cc_num as cust_id, concat(EXP,NAT,VEL) as trans_list from full_data order by cc_num, unix_time asc")

    # return full tx data for profile with reduced HML/NP/HML variables in sorted order
    # pre_sort_
    per_profile_transactions = hiveContext.sql(
        "SELECT profile as cust_id, concat(EXP,NAT,VEL) as trans_list from full_data order by profile, unix_time asc")
    # pre_sort_per_profile_transactions.registerTempTable("pre_sort")
    # we only need cust_id (really profile name here) and trans_list, but we had to include cc_num above in our sort
    # per_profile_transactions = hiveContext.sql("SELECT cust_id,trans_list from pre_sort")

    # gets pre-computed reference values for each customer and stores in redis:
    #   avg spent per category, n transactions, last unix time stamp
    agg_info = hiveContext.sql(
        "SELECT CONCAT(category, '_', cc_num) as cust_id, category, "
        "concat(low_bound,',',high_bound) as low_high from mean_per_cat")
    avg_cat_data = agg_info.rdd.map(lambda x: [str(x.cust_id), str(x.low_high)])

    agg_n_tx = hiveContext.sql(
        "SELECT CONCAT('count_', cc_num) as cust_id, count(cc_num) as tx_count from full_data group by cc_num")
    n_tx = agg_n_tx.rdd.map(lambda x: [str(x.cust_id), str(x.tx_count)])

    agg_unix_ts = hiveContext.sql(
        "SELECT CONCAT('timestamp_', cc_num) as cust_id, max(unix_time) as last_unix_time from full_data group by cc_num")
    n_ts = agg_unix_ts.rdd.map(lambda x: [str(x.cust_id), str(x.last_unix_time)])

    agg_vel_info = hiveContext.sql(
        "SELECT CONCAT('velocity_', cc_num) as cust_id, concat(low_bound,',',high_bound) as low_high from avg_time")
    avg_vel_data = agg_vel_info.rdd.map(lambda x: [str(x.cust_id), str(x.low_high)])

    # compile our final string per customer for all tx's
    per_cust_transactions_r = per_cust_transactions.map(lambda p: (str(p.cust_id), str(p.trans_list))) \
        .reduceByKey(lambda y, z: y + ',' + z).map(lambda x: ''.join(x[0]) + ',' + x[1])

    # compile our final string per profile for all tx's
    per_profile_transactions_r = per_profile_transactions.map(lambda p: (str(p.cust_id), str(p.trans_list))) \
        .reduceByKey(lambda y, z: y + ',' + z).map(lambda x: ''.join(x[0]) + ',' + x[1])

    # return tx data and aggregates
    return_dict = {}
    return_dict['profile'] = per_profile_transactions_r
    return_dict['customer'] = per_cust_transactions_r
    return avg_cat_data, n_tx, n_ts, return_dict, avg_vel_data
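# Hedged usage sketch (the caller is not shown in the original): unpack the aggregates and
# the per-customer / per-profile transaction-string RDDs returned by load_data().
avg_cat_data, n_tx, n_ts, tx_strings, avg_vel_data = load_data()
per_customer_rdd = tx_strings['customer']   # one "cc_num,EXPNATVEL,EXPNATVEL,..." line per card
per_profile_rdd = tx_strings['profile']
print(per_customer_rdd.take(1))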