Example #1
            select a.customer_no as customer_no,
                   b.industry as stock_type,
                   sum(a.business_balance) as total_exchange_amount
            from ''' + db_name + '''.fact_cust_stock_detail a
            left outer join ''' + db_name + '''.dim_stock b
            on a.stock_no=b.stock_no
            where a.part_date>=''' + six_months_ago + '''
              and a.part_date<=''' + run_date + '''
              and a.entrust_bs='买入'
            group by a.customer_no,
                     b.industry
        ) c
    ''')
    industry_prefer_detail.registerTempTable(
        'industry_prefer_detail_table')  # register as a temp table
    hqlContext.cacheTable('industry_prefer_detail_table')
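
    # Note: the outer query that produces the "rank" column used below is cut
    # off at the top of this fragment; by analogy with stock_prefer_detail in
    # a later example, it presumably wraps subquery c with something like
    #   row_number() over (partition by customer_no
    #                      order by total_exchange_amount desc) as rank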

    # Get the top N preferred industries
    industry_prefer_topN = hqlContext.sql('''
        select customer_no,
               concat_ws(',', collect_list(stock_type)) as stock_industry_n
        from industry_prefer_detail_table
        where rank<=3
        group by customer_no
    ''')
    industry_prefer_topN.registerTempTable(
        'industry_prefer_topN_table')  # register as a temp table

    # Holding ratio: average holding ratio over the last 6 months
    stock_hold_ratio = hqlContext.sql('''   
        select a.customer_no as customer_no,
Example #2
import os

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, HiveContext

if __name__ == "__main__":
    localClusterURL = "local[2]"
    clusterMasterURL = "spark://Master:7077"
    print(os.environ['SPARK_HOME'])
    print(os.environ['HADOOP_HOME'])
    conf = SparkConf().setAppName("RatingData").setMaster(clusterMasterURL)
    sc = SparkContext.getOrCreate(conf)
    sqlContext = SQLContext(sc)
    hc = HiveContext(sc)

    # cache the Hive table 'ratings' (cacheTable returns None, so no assignment)
    hc.cacheTable('ratings')
    count = hc.sql("select count(*) from ratings").first()[0]

    # Split the data into a training set and a test set
    percent = 0.6
    trainingDataCount = int((count * percent))
    testDataCount = int(count * (1 - percent))

    # Rating data sorted by time, ascending
    trainingDataASC = hc.sql('select userId,movieId,rating from ratings order by ts asc')
    trainingDataASC.write.mode('overwrite').saveAsTable('trainingDataASC')

    # Rating data sorted by time, descending
    trainingDataDESC = hc.sql('select userId,movieId,rating from ratings order by ts desc')
    trainingDataDESC.write.mode('overwrite').saveAsTable('trainingDataDESC')
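
    # The counts above are unused in this fragment. A minimal sketch
    # (assumption, mirroring the DataSizer.sizerData example further below) of
    # materializing the 60/40 split:
    trainingData = hc.sql('select * from trainingDataASC limit %d' % trainingDataCount)
    trainingData.write.mode('overwrite').saveAsTable('trainingData')
    testData = hc.sql('select * from trainingDataDESC limit %d' % testDataCount)
    testData.write.mode('overwrite').saveAsTable('testData')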
Example #3
        where part_date>=''' + one_year_ago + '''
          and part_date<=''' + run_date + '''
    ''').rdd
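
    # parse_s_e_date is not defined in this fragment. A minimal sketch
    # (assumption): it splits the combined key back into its three fields,
    # orders the "init_date:current_qty" entries by date, and emits one
    # holding interval per key; the real logic may split intervals on
    # zero-quantity gaps.
    def parse_s_e_date(kv):
        key, value = kv
        customer_no, stock_no, stock_type = key.split(',')
        entries = sorted(item.split(':') for item in value.split(' '))
        start_date, end_date = entries[0][0], entries[-1][0]
        return [(customer_no, stock_no, stock_type, start_date, end_date)]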

    # the result set after collect is of type list
    s_e_date_rdd = stock_hold_cur_detail. \
        map(lambda row_item: (row_item['customer_no'] + ',' + row_item['stock_no'] + ',' + row_item['stock_type'],
                              row_item['init_date'] + ':' + str(row_item['current_qty']))). \
        reduceByKey(lambda x, y: x + ' ' + y). \
        flatMap(lambda i: parse_s_e_date(i))

    # A DataFrame can be created from a list, tuple, or RDD
    s_e_date_df = hqlContext.createDataFrame(s_e_date_rdd, [
        'customer_no', 'stock_no', 'stock_type', 'stock_start_date', 'stock_end_date'])
    s_e_date_df.registerTempTable("stock_hold_date_table")  # register all holding info as a table
    hqlContext.cacheTable('stock_hold_date_table')

    # Preferred stock detail
    stock_prefer_detail = hqlContext.sql('''
        select a.customer_no as customer_no,
               a.stock_no as stock_no,
               a.stock_name as stock_name,
               row_number() over (partition by customer_no order by total_exchange_amount desc) as rank
        from
        (
        select customer_no,
               stock_no,
               stock_name,
               sum(business_balance) as total_exchange_amount
        from ''' + db_name + '''.fact_cust_stock_detail
        where part_date>=''' + one_year_ago + '''
Example #4
                    part_date <= ''' + run_date + ''' and 
                    business_type='证券买卖'
              ) a
        left outer join 
            (select stock_no,
                    close_price,
                    init_date
            from ''' + db_name + '''.fact_stock_market_detail
            where init_date >= ''' + half_year_ago + ''' and
                  init_date <= ''' + run_date + '''
            ) b
        on a.stock_no = b.stock_no and
           a.init_date = b.init_date
    ''')
    pmgz_detail.registerTempTable('pmgz_detail_table')
    hqlContext.cacheTable('pmgz_detail_table')

    # Buy-side market perception (computed only for customers with buy trades; customers without buy trades should have a perception of 0, same below)
    market_perception_bought = hqlContext.sql('''
        select customer_no,
               count(case when business_price < close_price then 1 else null end)/count(customer_no) as percep_in
        from pmgz_detail_table
        where entrust_bs = '买入'
        group by customer_no
    ''')
    market_perception_bought.registerTempTable(
        'market_perception_bought_table')
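
    # The comment above notes that customers with no buy trades should get a
    # perception of 0. A minimal sketch (assumption: every customer of interest
    # appears in pmgz_detail_table) of back-filling those zeros:
    market_perception_bought_full = hqlContext.sql('''
        select t.customer_no,
               nvl(b.percep_in, 0) as percep_in
        from (select distinct customer_no from pmgz_detail_table) t
        left outer join market_perception_bought_table b
        on t.customer_no = b.customer_no
    ''')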

    # Sell-side market perception
    market_perception_sold = hqlContext.sql('''
        select customer_no,
Example #5
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, HiveContext


class DataSizer:
    def __init__(self):
        self.localClusterURL = "local[2]"
        self.clusterMasterURL = "spark://Master:7077"
        self.conf = SparkConf().setAppName('DataSizer').setMaster(
            self.localClusterURL)
        self.sc = SparkContext.getOrCreate(self.conf)
        self.sqlContext = SQLContext(self.sc)
        self.hc = HiveContext(self.sc)

        self.jdbcURL = "jdbc:mysql://Master:3306/recommend?useUnicode=true&characterEncoding=utf-8&useSSL=false"

        self.prop = {
            'driver': 'com.mysql.jdbc.Driver',
            'user': '******',
            'password': '******'
        }
        # Locations of the user/rating/links/tags files in HDFS ===> i.e. the archive paths of the raw recommendation data in HDFS
        self.hdfs_data_path = 'hdfs://Master:9000/movie/data/'
        self.movies_path = self.hdfs_data_path + 'movies.txt'
        self.ratings_path = self.hdfs_data_path + 'ratings.txt'
        self.links_path = self.hdfs_data_path + 'links.txt'
        self.tags_path = self.hdfs_data_path + 'tags.txt'

        # MySQL tables for the various result data
        self.default5Table = 'MovieSizer.operation_default5recommend'
        self.top5Table = 'MovieSizer.oertion_top5recomm'

        self.alsTable = 'MovieSizer.movies_alsTab'
        self.similarTable = 'MovieSizer.movies_movidesimilar'
        self.usesrTable = 'MovieSizer.usesr_userprofile'
        self.ratingTable = 'MovieSizer.operation_rating'

        self.movieTab = 'MovieSizer.movies_movieinfo'
        self.tagTab = 'MovieSizer.movies_movieinfo_typelist'

        # The number of RDD partitions is generally best set to an integer multiple of the CPU cores the cluster allocates to the application.
        self.minPartitions = 8
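        # For example (assumption about where the raw files are read), the
        # partition count would be passed to textFile:
        #   self.sc.textFile(self.ratings_path, self.minPartitions)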

    def sizerData(self):
        # cache the Hive table 'ratings' (cacheTable returns None, so no assignment)
        self.hc.cacheTable('ratings')
        count = self.hc.sql("select count(*) from ratings").first()[0]

        # Split the data into a training set and a test set
        percent = 0.6
        trainingDataCount = int((count * percent))
        testDataCount = int(count * (1 - percent))

        # Rating data sorted ascending by time and de-duplicated by timestamp: keep each user's most recent rating for each movie
        # trainingDataASC = self.hc.sql('select userId,movieId,rating from ratings order by ts asc')
        trainingDataASC = self.hc.sql(
            'select id, user_id, movie_id, rating, ds from (select *, row_number() over (partition by user_id, movie_id order by ds desc) num from moviesizer.ratings) t where t.num=1 order by ds'
        )
        # trainingDataASC.show()
        trainingDataASC.write.mode('overwrite').saveAsTable(
            'moviesizer.trainingDataASC')

        # Rating data sorted by time, descending
        trainingDataDESC = self.hc.sql(
            'select id, user_id, movie_id, rating, ds from (select *, row_number() over (partition by user_id, movie_id order by ds desc) num from moviesizer.ratings) t where t.num=1 order by ds desc'
        )
        trainingDataDESC.write.mode('overwrite').saveAsTable(
            'moviesizer.trainingDataDESC')

        # 60% of the data is used to train the model
        trainingData = self.hc.sql('select * from moviesizer.trainingDataASC limit %d' %
                                   trainingDataCount)
        trainingData.write.mode('overwrite').saveAsTable(
            'moviesizer.trainingData')

        # 40% is used to test the model
        testData = self.hc.sql('select * from moviesizer.trainingDataDESC limit %d' %
                               testDataCount)
        testData.write.mode('overwrite').saveAsTable('moviesizer.testData')
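

# A hedged usage sketch (assumption): run the sizing job as a script, mirroring
# the standalone example above.
if __name__ == "__main__":
    DataSizer().sizerData()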
Example #6
    # where part_date>'''+six_month_ago+''' and
    # part_date<='''+run_date+''') m
    # ) n
    # where n.rn<=1
    # ''')
    # Total number of trading days per customer over the last 6 months
    cust_hy_days = hqlContext.sql('''
        select customer_no,
               count(distinct init_date) as cust_hy
        from ''' + db_name[0] + '''.Fact_Cust_Fund_Detail 
        where part_date>''' + six_month_ago + ''' 
          and part_date<=''' + run_date + '''
        group by customer_no
    ''')
    cust_hy_days.registerTempTable('cust_hy_days_table')
    hqlContext.cacheTable('cust_hy_days_table')

    activity_df = hqlContext.sql('''
        select zczzl.customer_no,
               0.5*nvl(zczzl.turnover_rate_year,0) + 0.2*nvl(hyts.hy,0) + 0.3*nvl(rzpl.fin_frequency,0) as cust_activity
        from (select t1.customer_no,
                     (t1.cust_hy-t2.min_hy)/(t2.max_hy-t2.min_hy) as hy
              from cust_hy_days_table t1,
                    (select min(cust_hy) as min_hy,
                            max(cust_hy) as max_hy
                     from cust_hy_days_table) t2
             ) hyts
        full outer join 
             (select c1.customer_no,
                     (c1.turnover_rate_year-c2.min_try)/(c2.max_try-c2.min_try) as turnover_rate_year
              from (select customer_no,
Example #7
import datetime

from pyspark.sql import HiveContext
from pyspark.sql.types import StructType, StructField, StringType, DateType


def load_data():
    # load data from files (a SparkContext named sc is assumed to exist globally)
    # and return query results / aggregates.

    hiveContext = HiveContext(sc)
    # 1027
    # path = '/home/brandon/PycharmProjects/markov_chain/data/raw_tx/'
    # path = '/home/brandon/PycharmProjects/markov_chain/data/raw_tx_fraud/train/'

    # AMAZON AWS EMR
    path = 'hdfs:///tmp/files/'    #HDFS


    # new segment files

    tx_files = [path + 'l_adults_2550_female_rural.csv', path + 'l_adults_2550_female_urban.csv', path + 'l_adults_2550_male_rural.csv', \
    path + 'l_adults_2550_male_urban.csv', path + 'l_young_adults_female_rural.csv', path + 'l_young_adults_female_urban.csv',\
    path + 'l_young_adults_male_rural.csv', path + 'l_young_adults_male_urban.csv', path + 'l_adults_50up_female_rural.csv', \
    path + 'l_adults_50up_female_urban.csv', path + 'l_adults_50up_male_rural.csv', path + 'l_adults_50up_male_urban.csv' ]

    # small file for debugging
    # 1027

    # tx_files = [path + 's_l_male_30_40_smaller_cities.csv']
    # tx_files = [path + 'sorted_fraud_male_30_40_smaller_cities.csv']

    # tx_files = [path+'40_60_bigger_cities.csv',path+'40_60_smaller_cities.csv',path+'all_60_up.csv'\
    #         ,path+'female_30_40_bigger_cities.csv',path+'female_30_40_smaller_cities.csv'\
    #         ,path+'male_30_40_bigger_cities.csv',path+'male_30_40_smaller_cities.csv'\
    #         ,path+'millenials.csv',path+'young_adults.csv']

    # 1027
    # tx_files = [path+'l_40_60_bigger_cities.csv',path+'l_40_60_smaller_cities.csv',path+'l_all_60_up.csv'\
    #         ,path+'l_female_30_40_bigger_cities.csv',path+'l_female_30_40_smaller_cities.csv'\
    #         ,path+'l_male_30_40_bigger_cities.csv',path+'l_male_30_40_smaller_cities.csv'\
    #         ,path+'l_millenials.csv',path+'l_young_adults.csv']



    all_tx = sc.textFile(','.join(tx_files),600)

    # 1027
    # txSchemaString = 'ssn|cc_num|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|acct_num|profile|trans_num|trans_date|trans_time|unix_time|category|amt|merchant|merch_lat|merch_long'
    txSchemaString = 'ssn|cc_num|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|acct_num|profile|trans_num|trans_date|trans_time|unix_time|category|amt|is_fraud|merchant|merch_lat|merch_long'
    txFields = [StructField(field_name, StringType(), True) for field_name in txSchemaString.split('|')]
    txFields[17] = StructField('trans_date', DateType(), True)

    txSchema = StructType(txFields)
    # ssn|cc_num|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|acct_num|profile|trans_num|trans_date|trans_time|unix_time|category|amt|merchant|merch_lat|merch_long
    txHeader = all_tx.filter(lambda l: "ssn|" in l)
    txNoHeader = all_tx.subtract(txHeader)

    temp_tx = txNoHeader.map(lambda k: k.split("|")).map(lambda p: (
    p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13], p[14], p[15], p[16],
    datetime.datetime.strptime(p[17], '%Y-%m-%d').date(), p[18], p[19], p[20], p[21], p[22], p[23], p[24], p[25]))

    h_tx_df = hiveContext.createDataFrame(temp_tx, txSchema)
    h_tx_df.registerTempTable("htx")

    hiveContext.cacheTable("htx")

    # HBASE CODE HERE
    # create dataframe with all records
    # map using hbase_process to extract record into individual components
    # and create a dictionary to store in hbase
    #h_data = hiveContext.sql("SELECT * FROM htx")
    #h_data.map(hbase_process).foreachPartition(store_full_data)

    # get cust mean time between transactions
    time_lag_eval = hiveContext.sql(
    "SELECT cc_num, unix_time, LAG(htx.unix_time) OVER (PARTITION BY htx.cc_num ORDER BY  htx.unix_time) as lag_time from htx order by cc_num, unix_time asc")
    time_lag_eval.registerTempTable("ts_lag")

    user_avg_time = hiveContext.sql("SELECT cc_num, AVG(unix_time - lag_time) as time_diff, percentile_approx((unix_time - lag_time),0.1) as low_bound, percentile_approx((unix_time - lag_time),0.90) as high_bound from ts_lag where lag_time is not null group by cc_num")
    user_avg_time.registerTempTable("avg_time")


    # get cust mean per category
    mean_per_cat = hiveContext.sql("SELECT cc_num, category, avg(amt) as mean_exp, (avg(amt)-2*(stddev_pop(amt))) as low_bound, (avg(amt)+2*(stddev_pop(amt))) as high_bound from htx group by cc_num, category")
    mean_per_cat.registerTempTable("mean_per_cat")

    # evaluate amount for HML and time of purchase for normal/abnormal
    test = hiveContext.sql(
    # #    "SELECT htx.cc_num, profile, htx.category, htx.trans_date,htx.trans_time, htx.unix_time,IF(htx.amt>(2 * m.mean_exp),'H',(IF(htx.amt<(0.5 * m.mean_exp),'L','N'))) as EXP, IF(htx.category like '%_net%','N','P') as CNP, htx.amt, LAG(htx.unix_time) OVER (PARTITION BY htx.cc_num ORDER BY  htx.unix_time) as lag_time from htx join mean_per_cat m on htx.cc_num=m.cc_num and m.category =htx.category")
    "SELECT htx.cc_num, profile, htx.category, htx.trans_date,htx.trans_time, htx.unix_time,IF(htx.amt>m.high_bound,'H',(IF(htx.amt < m.low_bound,'L','N'))) as EXP, IF(cast(SUBSTR(htx.trans_time,0,2) as int)<05,'A',IF(cast(SUBSTR(htx.trans_time,0,2) as int)>21,'A','N')) as NAT, htx.amt, LAG(htx.unix_time) OVER (PARTITION BY htx.cc_num ORDER BY  htx.unix_time) as lag_time from htx join mean_per_cat m on htx.cc_num=m.cc_num and m.category =htx.category")
    test.registerTempTable("full_table")

    # evaluate for transaction time (HML)
    full_data = hiveContext.sql(
         "SELECT full_table.cc_num, profile, category, trans_date, trans_time, unix_time,lag_time,IF(lag_time is null,100000,unix_time-lag_time) as time_since,amt, EXP,NAT,IF((unix_time-lag_time)<avg_time.low_bound,'H',IF((unix_time-lag_time)>avg_time.high_bound,'L','N')) as VEL from full_table left join avg_time on avg_time.cc_num = full_table.cc_num")
    full_data.registerTempTable("full_data")


    # return full tx data for user with reduced HML/AN/HML variables
    per_cust_transactions = hiveContext.sql(
        "SELECT cc_num as cust_id,concat(EXP,NAT, VEL) as trans_list from full_data order by cc_num, unix_time asc")

    # return full tx data for profile with reduced HML/NP/HML variables in sorted order
    #pre_sort_
    per_profile_transactions = hiveContext.sql(
        "SELECT profile as cust_id,concat(EXP,NAT,VEL) as trans_list from full_data order by profile, unix_time asc")
    #pre_sort_per_profile_transactions.registerTempTable("pre_sort")



    # we only need cust_id (really profile name here) and trans_list, but we had to include cc_num above in our sort
    #per_profile_transactions = hiveContext.sql("SELECT cust_id,trans_list from pre_sort")

    # gets pre-computed reference values for each customer and stores in redis
    #   avg spent per category
    #   n transactions
    #   last unix time stamp
    agg_info = hiveContext.sql(
        "SELECT CONCAT(category, '_', cc_num) as cust_id, category, concat(low_bound,',',high_bound) as low_high from mean_per_cat")
    avg_cat_data = agg_info.rdd.map(lambda x: [str(x.cust_id), str(x.low_high)])


    agg_n_tx = hiveContext.sql(
        "SELECT CONCAT('count_', cc_num) as cust_id, count(cc_num) as tx_count from full_data group by cc_num")
    n_tx = agg_n_tx.rdd.map(lambda x: [str(x.cust_id), str(x.tx_count)])

    agg_unix_ts = hiveContext.sql(
        "SELECT CONCAT('timestamp_', cc_num) as cust_id, max(unix_time) as last_unix_time from full_data group by cc_num")
    n_ts = agg_unix_ts.rdd.map(lambda x: [str(x.cust_id), str(x.last_unix_time)])

    agg_vel_info = hiveContext.sql(
        "SELECT CONCAT('velocity_', cc_num) as cust_id, concat(low_bound,',',high_bound) as low_high from avg_time")
    avg_vel_data = agg_vel_info.rdd.map(lambda x: [str(x.cust_id), str(x.low_high)])




    # compile our final string per customer for all tx's
    per_cust_transactions_r = per_cust_transactions.rdd.map(lambda p: (str(p.cust_id), str(p.trans_list))) \
        .reduceByKey(lambda y, z: y + ',' + z).map(lambda x: ''.join(x[0]) + ',' + x[1])



    # compile our final string per profile for all tx's
    per_profile_transactions_r = per_profile_transactions.rdd.map(lambda p: (str(p.cust_id), str(p.trans_list))) \
        .reduceByKey(lambda y, z: y + ',' + z).map(lambda x: ''.join(x[0]) + ',' + x[1])


    # return tx data and aggregates
    return_dict = {}
    return_dict['profile'] = per_profile_transactions_r
    return_dict['customer'] = per_cust_transactions_r

    return avg_cat_data, n_tx, n_ts, return_dict, avg_vel_data
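

# A hedged usage sketch (assumption: the aggregates are later persisted, e.g.
# to Redis/HBase as the comments above suggest):
# avg_cat_data, n_tx, n_ts, tx_dict, avg_vel_data = load_data()
# for cust_id, low_high in avg_cat_data.collect():
#     print(cust_id, low_high)  # per-category (low_bound, high_bound) spend amounts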