def run_hive():
    dic = [{'id': '1,2,3'}]
    # df = pd.DataFrame(dic)
    sc = SparkContext()
    # sc.parallelize(dic)
    sql_ctx = HiveContext(sc)
    # sql_ctx.registerDataFrameAsTable(df, "aaa")
    sdf = sql_ctx.createDataFrame(dic)
    sdf.registerTempTable('aaa')
    # sdf.show()
    # df2 = sql_ctx.sql('select split(id, ",") from aaa')
    # explode() is a UDTF and cannot be nested inside collect_list(),
    # so flatten the split values through a lateral view first.
    df2 = sql_ctx.sql(
        'select collect_list(cast(t.val as string)) '
        'from aaa lateral view explode(split(id, ",")) t as val'
    )
    df2.show()
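# A minimal alternative sketch (an assumption, not part of the original snippet):
# the same flattening done with the DataFrame API instead of lateral-view HiveQL.
# `sdf` refers to the DataFrame built inside run_hive() above.
from pyspark.sql.functions import split, explode, collect_list, col

def run_hive_df_api(sdf):
    exploded = sdf.select(explode(split(col('id'), ',')).alias('val'))
    exploded.agg(collect_list('val')).show()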
        phone = ob.get("bankpremobile", None)
    else:
        phone = None
    return (phone, idCard, idBank, name)


# xiaoshudian_app_key = "1186159692"
# xiaoshudian_tec_api = ["tel", "address_getbymobile", "channel_NameIDCardAccountVerify",
#                        "channel_cellphone", "operator_capricorn", "address_match",
#                        "channel_idcard", "channel_bankby3", "channel_idNameFase",
#                        "channel_criminal", "channel_blacklistverify", "credit_implement"]
# def filter(app_key, api):
#     not (app_key in xiaoshudian_app_key and api not in xiaoshudian_tec_api)

data_rdd = data.rdd.map(lambda a: (a.app_key_param, a.date, standard_params(a.params),
                                   a.interface, a.api_type)) \
    .map(lambda (a, b, c, d, e): (a, b, c[0], c[1], c[2], c[3], d, e))
'''
c[0] -> phone
c[1] -> idcard
c[2] -> idbank
c[3] -> name
'''

schemaStr = "app_key date phone idcard idbank name interface api_type"
fields = [
    StructField(field_name, StringType(), True)
    for field_name in schemaStr.split()
]
schema = StructType(fields)

data_df = hc.createDataFrame(data_rdd, schema).distinct()
dfw = DataFrameWriter(data_df)
dfw.saveAsTable("wl_analysis.t_lel_record_data_backflow", mode="overwrite")
stock_concentrate_top1_industry_info.open_price = stock_concentrate_top1_industry_info.open_price.astype('float')
stock_concentrate_top1_industry_info.close_price = stock_concentrate_top1_industry_info.close_price.astype('float')
stock_concentrate_top1_industry_info.high_price = stock_concentrate_top1_industry_info.high_price.astype('float')
stock_concentrate_top1_industry_info.low_price = stock_concentrate_top1_industry_info.low_price.astype('float')
stock_concentrate_top1_industry_info.amount = stock_concentrate_top1_industry_info.amount.astype('float')
stock_concentrate_top1_industry_info.volume = stock_concentrate_top1_industry_info.volume.astype('float')
stock_concentrate_top1_industry_info.change_rate = stock_concentrate_top1_industry_info.change_rate.astype('float')

workday_list = ['five_', 'four_', 'three_', 'two_', 'one_']

# Opening price
top1_open_price_info = pd.pivot_table(stock_concentrate_top1_industry_info,
                                      index=['customer_no', 'op_date'],
                                      columns=['date_rank'],
                                      values=['open_price'])
top1_open_price_info.columns = [x + 'top1_industry_open_price' for x in workday_list]
top1_open_price_info = top1_open_price_info.reset_index()
top1_open_price_info = add_statistics(top1_open_price_info, 'top1_industry_open_price')
top1_open_price_info = hql.createDataFrame(top1_open_price_info)
top1_open_price_info.registerTempTable('top1_open_price_info')

# Closing price
top1_close_price_info = pd.pivot_table(stock_concentrate_top1_industry_info,
                                       index=['customer_no', 'op_date'],
                                       columns=['date_rank'],
                                       values=['close_price'])
top1_close_price_info.columns = [x + 'top1_industry_close_price' for x in workday_list]
top1_close_price_info = top1_close_price_info.reset_index()
top1_close_price_info = add_statistics(top1_close_price_info, 'top1_industry_close_price')
top1_close_price_info = hql.createDataFrame(top1_close_price_info)
top1_close_price_info.registerTempTable('top1_close_price_info')

# Highest price
top1_high_price_info = pd.pivot_table(stock_concentrate_top1_industry_info,
                                      index=['customer_no', 'op_date'],
                                      columns=['date_rank'],
                                      values=['high_price'])
top1_high_price_info.columns = [x + 'top1_industry_high_price' for x in workday_list]
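# Illustrative toy sketch (made-up data, not from the production tables): pivoting
# date_rank into columns yields one column per workday, and the positional renaming
# above maps ranks 1..5 to the 'five_'..'one_' prefixes in workday_list.
toy = pd.DataFrame({'customer_no': ['c1'] * 5,
                    'op_date': ['2020-01-06'] * 5,
                    'date_rank': [1, 2, 3, 4, 5],
                    'open_price': [10.0, 10.2, 10.1, 10.4, 10.3]})
toy_pivot = pd.pivot_table(toy, index=['customer_no', 'op_date'],
                           columns=['date_rank'], values=['open_price'])
toy_pivot.columns = [x + 'toy_open_price' for x in workday_list]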
pdf['is_polluter'] = model.predict(pdf[predict_cols])

'''
print "Predictions: \n"
print pdf.head()
'''

# set a threshold of 85% probability to flag as spammer
polluters = pdf[pdf.is_polluter > 0.85]  # .to_json(orient="records")

links = polluters.tweeted_urls
links_df = sqlContext.createDataFrame(pd.DataFrame(links))
links_df = links_df.select(['tweeted_urls.url', 'tweeted_urls.expanded_url'])
uniqueLinks = links_df.dropDuplicates(['url', 'expanded_url'])

# forget about S3:
# uniqueLinks.repartition(1).save("s3n://w205twitterproject/temp_urls", "json")

print "saving file to /data/w205Project/honeypot/temp_urls.json...."

# instead, save to file on local disk for use by scrapy
uniqueLinks.toPandas().to_json(
    orient="records",
    path_or_buf='/data/w205Project/python/url_spider/url_spider/logs/temp_urls.log')

#################################################################################################################
# APPEND CLASSIFIED DATA TO POSTGRES FOR DASHBOARD
#################################################################################################################

from sqlalchemy import create_engine
])

genband_stats_schema = StructType([
    StructField("iteration_id", StringType(), False),
    StructField("file_id", StringType(), False),
    StructField("file_name", StringType(), False),
    StructField("check_sum", StringType(), False),
    StructField("total_cnt", StringType(), False),
    StructField("success_cnt", StringType(), False),
    StructField("failed_cnt", StringType(), False),
    StructField("timeframe_day", LongType(), False),
    StructField("timeframe_hr", IntegerType(), False)
])

## The code below handles Hive storage
calls_df = sqlContext.createDataFrame(final, genband_schema)
# sqlContext.sql("drop table if exists gtt.GENBAND_CDR")
# sqlContext.sql("CREATE TABLE gtt.GENBAND_CDR (file_id string, file_name string, connect_datetime timestamp, originating_number string, terminating_number string, "
#                "elapsed_time decimal(12,2), dom_int_indicator string, trunkid1 string, trunkid2 string, call_code string, "
#                "completion_ind string, answer_ind string) partitioned by (timeframe_day bigint, timeframe_hr int)")

stats_df = sqlContext.createDataFrame(final_stats, genband_stats_schema)
# sqlContext.sql("drop table if exists gtt.GENBAND_PARSER_STATS")
# sqlContext.sql("CREATE TABLE gtt.GENBAND_PARSER_STATS (iteration_id string, file_id string, file_name string, "
#                "check_sum string, total_cnt string, success_cnt string, failed_cnt string) partitioned by (timeframe_day bigint, timeframe_hr int)")

print(calls_df.columns)
print(calls_df.count())
# calls_df.write.format("parquet").mode("append").saveAsTable("gtt.genband_cdr")
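# Hedged sketch (an assumption, not taken from the original job): persisting both
# frames into the partitioned Hive tables would mirror the commented-out saveAsTable
# call above, with partitionBy matching the (timeframe_day, timeframe_hr) spec.
# calls_df.write.format("parquet").mode("append") \
#     .partitionBy("timeframe_day", "timeframe_hr").saveAsTable("gtt.genband_cdr")
# stats_df.write.format("parquet").mode("append") \
#     .partitionBy("timeframe_day", "timeframe_hr").saveAsTable("gtt.genband_parser_stats")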
# list_ids_in_db = [i424["id"] for i424 in list_ids_in_db_row if i424["id"] != '']
# list_ids_in_db

list_2_load_keys = list(set(msg_2_days_dict.keys()).difference(set(list_ids_in_db)))[0:2]
if len(list_2_load_keys) == 0:
    print("No data to load")
    exit()

list_2_load_str = "/* ".join(filter(None, list_2_load_keys)) + "/*"
list_2_load_str

ids_2_load = [[i425, msg_2_days_dict[i425]] for i425 in list_2_load_keys]
# ids_2_load = [i400.split(" ") for i400 in list_ids_2_load]

R = Row('id', 'date_')
ids_2_load_df = sqlContext.createDataFrame([R(i421[0], i421[1]) for i421 in ids_2_load])

commands.getoutput("hadoop fs -cat " + list_2_load_str + "| perl -p -e 's/{\"entitydata\":/\n{\"entitydata\":/g'| grep -v \"^$\" |hadoop fs -put -f - JSON_INPUT_gcct")


# https://stackoverflow.com/questions/5508509/how-do-i-check-if-a-string-is-valid-json-in-python
def is_json(myjson):
    try:
        json_object = json.loads(myjson)
    except ValueError:
        return {"_corrupt_record_data": "True"}
    return json_object


def create_table_stmnt(db_location, db, table):
def load_data():
    # load data from files
    # and return query results / aggregates.
    hiveContext = HiveContext(sc)

    # 1027
    # path = '/home/brandon/PycharmProjects/markov_chain/data/raw_tx/'
    # path = '/home/brandon/PycharmProjects/markov_chain/data/raw_tx_fraud/train/'

    # AMAZON AWS EMR
    path = 'hdfs:///tmp/files/'  # HDFS

    # new segment files
    tx_files = [path + 'l_adults_2550_female_rural.csv', path + 'l_adults_2550_female_urban.csv',
                path + 'l_adults_2550_male_rural.csv', path + 'l_adults_2550_male_urban.csv',
                path + 'l_young_adults_female_rural.csv', path + 'l_young_adults_female_urban.csv',
                path + 'l_young_adults_male_rural.csv', path + 'l_young_adults_male_urban.csv',
                path + 'l_adults_50up_female_rural.csv', path + 'l_adults_50up_female_urban.csv',
                path + 'l_adults_50up_male_rural.csv', path + 'l_adults_50up_male_urban.csv']

    # small file for debugging
    # 1027
    # tx_files = [path + 's_l_male_30_40_smaller_cities.csv']
    # tx_files = [path + 'sorted_fraud_male_30_40_smaller_cities.csv']
    # tx_files = [path+'40_60_bigger_cities.csv', path+'40_60_smaller_cities.csv', path+'all_60_up.csv',
    #             path+'female_30_40_bigger_cities.csv', path+'female_30_40_smaller_cities.csv',
    #             path+'male_30_40_bigger_cities.csv', path+'male_30_40_smaller_cities.csv',
    #             path+'millenials.csv', path+'young_adults.csv']
    # 1027
    # tx_files = [path+'l_40_60_bigger_cities.csv', path+'l_40_60_smaller_cities.csv', path+'l_all_60_up.csv',
    #             path+'l_female_30_40_bigger_cities.csv', path+'l_female_30_40_smaller_cities.csv',
    #             path+'l_male_30_40_bigger_cities.csv', path+'l_male_30_40_smaller_cities.csv',
    #             path+'l_millenials.csv', path+'l_young_adults.csv']

    all_tx = sc.textFile(','.join(tx_files), 600)

    # 1027
    # txSchemaString = 'ssn|cc_num|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|acct_num|profile|trans_num|trans_date|trans_time|unix_time|category|amt|merchant|merch_lat|merch_long'
    txSchemaString = 'ssn|cc_num|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|acct_num|profile|trans_num|trans_date|trans_time|unix_time|category|amt|is_fraud|merchant|merch_lat|merch_long'
    txFields = [StructField(field_name, StringType(), True) for field_name in txSchemaString.split('|')]
    txFields[17] = StructField('trans_date', DateType(), True)
    txSchema = StructType(txFields)

    # ssn|cc_num|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|acct_num|profile|trans_num|trans_date|trans_time|unix_time|category|amt|merchant|merch_lat|merch_long
    txHeader = all_tx.filter(lambda l: "ssn|" in l)
    txNoHeader = all_tx.subtract(txHeader)

    temp_tx = txNoHeader.map(lambda k: k.split("|")).map(lambda p: (
        p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13],
        p[14], p[15], p[16], datetime.datetime.strptime(p[17], '%Y-%m-%d').date(),
        p[18], p[19], p[20], p[21], p[22], p[23], p[24], p[25]))

    h_tx_df = hiveContext.createDataFrame(temp_tx, txSchema)
    h_tx_df.registerTempTable("htx")
    hiveContext.cacheTable("htx")

    # HBASE CODE HERE
    # create dataframe with all records
    # map using hbase_process to extract record into individual components
    # and create a dictionary to store in hbase
    # h_data = hiveContext.sql("SELECT * FROM htx")
    # h_data.map(hbase_process).foreachPartition(store_full_data)

    # get cust mean time between transactions
    time_lag_eval = hiveContext.sql(
        "SELECT cc_num, unix_time, LAG(htx.unix_time) OVER (PARTITION BY htx.cc_num ORDER BY htx.unix_time) as lag_time "
        "from htx order by cc_num, unix_time asc")
    time_lag_eval.registerTempTable("ts_lag")

    user_avg_time = hiveContext.sql(
        "SELECT cc_num, AVG(unix_time - lag_time) as time_diff, "
        "percentile_approx((unix_time - lag_time),0.1) as low_bound, "
        "percentile_approx((unix_time - lag_time),0.90) as high_bound "
        "from ts_lag where lag_time is not null group by cc_num")
    user_avg_time.registerTempTable("avg_time")

    # get cust mean per category
    mean_per_cat = hiveContext.sql(
        "SELECT cc_num, category, avg(amt) as mean_exp, "
        "(avg(amt)-2*(stddev_pop(amt))) as low_bound, "
        "(avg(amt)+2*(stddev_pop(amt))) as high_bound "
        "from htx group by cc_num, category")
    mean_per_cat.registerTempTable("mean_per_cat")

    # evaluate amount for HML and time of purchase for normal/abnormal
    test = hiveContext.sql(
        # "SELECT htx.cc_num, profile, htx.category, htx.trans_date, htx.trans_time, htx.unix_time, "
        # "IF(htx.amt>(2 * m.mean_exp),'H',(IF(htx.amt<(0.5 * m.mean_exp),'L','N'))) as EXP, "
        # "IF(htx.category like '%_net%','N','P') as CNP, htx.amt, "
        # "LAG(htx.unix_time) OVER (PARTITION BY htx.cc_num ORDER BY htx.unix_time) as lag_time "
        # "from htx join mean_per_cat m on htx.cc_num=m.cc_num and m.category =htx.category")
        "SELECT htx.cc_num, profile, htx.category, htx.trans_date, htx.trans_time, htx.unix_time, "
        "IF(htx.amt>m.high_bound,'H',(IF(htx.amt < m.low_bound,'L','N'))) as EXP, "
        "IF(cast(SUBSTR(htx.trans_time,0,2) as int)<05,'A',IF(cast(SUBSTR(htx.trans_time,0,2) as int)>21,'A','N')) as NAT, "
        "htx.amt, LAG(htx.unix_time) OVER (PARTITION BY htx.cc_num ORDER BY htx.unix_time) as lag_time "
        "from htx join mean_per_cat m on htx.cc_num=m.cc_num and m.category =htx.category")
    test.registerTempTable("full_table")

    # evaluate for transaction time (HML)
    full_data = hiveContext.sql(
        "SELECT full_table.cc_num, profile, category, trans_date, trans_time, unix_time, lag_time, "
        "IF(lag_time is null,100000,unix_time-lag_time) as time_since, amt, EXP, NAT, "
        "IF((unix_time-lag_time)<avg_time.low_bound,'H',IF((unix_time-lag_time)>avg_time.high_bound,'L','N')) as VEL "
        "from full_table left join avg_time on avg_time.cc_num = full_table.cc_num")
    full_data.registerTempTable("full_data")

    # return full tx data for user with reduced HML/AN/HML variables
    per_cust_transactions = hiveContext.sql(
        "SELECT cc_num as cust_id, concat(EXP, NAT, VEL) as trans_list "
        "from full_data order by cc_num, unix_time asc")

    # return full tx data for profile with reduced HML/NP/HML variables in sorted order
    # pre_sort_
    per_profile_transactions = hiveContext.sql(
        "SELECT profile as cust_id, concat(EXP, NAT, VEL) as trans_list "
        "from full_data order by profile, unix_time asc")
    # pre_sort_per_profile_transactions.registerTempTable("pre_sort")

    # we only need cust_id (really profile name here) and trans_list,
    # but we had to include cc_num above in our sort
    # per_profile_transactions = hiveContext.sql("SELECT cust_id, trans_list from pre_sort")

    # gets pre-computed reference values for each customer and stores in redis:
    # avg spent per category, n transactions, last unix time stamp
    agg_info = hiveContext.sql(
        "SELECT CONCAT(category, '_', cc_num) as cust_id, category, "
        "concat(low_bound,',',high_bound) as low_high from mean_per_cat")
    avg_cat_data = agg_info.rdd.map(lambda x: [str(x.cust_id), str(x.low_high)])

    agg_n_tx = hiveContext.sql(
        "SELECT CONCAT('count_', cc_num) as cust_id, count(cc_num) as tx_count "
        "from full_data group by cc_num")
    n_tx = agg_n_tx.rdd.map(lambda x: [str(x.cust_id), str(x.tx_count)])

    agg_unix_ts = hiveContext.sql(
        "SELECT CONCAT('timestamp_', cc_num) as cust_id, max(unix_time) as last_unix_time "
        "from full_data group by cc_num")
    n_ts = agg_unix_ts.rdd.map(lambda x: [str(x.cust_id), str(x.last_unix_time)])

    agg_vel_info = hiveContext.sql(
        "SELECT CONCAT('velocity_', cc_num) as cust_id, "
        "concat(low_bound,',',high_bound) as low_high from avg_time")
    avg_vel_data = agg_vel_info.rdd.map(lambda x: [str(x.cust_id), str(x.low_high)])

    # compile our final string per customer for all tx's
    per_cust_transactions_r = per_cust_transactions.map(lambda p: (str(p.cust_id), str(p.trans_list))) \
        .reduceByKey(lambda y, z: y + ',' + z).map(lambda x: ''.join(x[0]) + ',' + x[1])

    # compile our final string per profile for all tx's
    per_profile_transactions_r = per_profile_transactions.map(lambda p: (str(p.cust_id), str(p.trans_list))) \
        .reduceByKey(lambda y, z: y + ',' + z).map(lambda x: ''.join(x[0]) + ',' + x[1])

    # return tx data and aggregates
    return_dict = {}
    return_dict['profile'] = per_profile_transactions_r
    return_dict['customer'] = per_cust_transactions_r

    return avg_cat_data, n_tx, n_ts, return_dict, avg_vel_data
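# Hedged sketch (not part of the original pipeline): the same per-customer time lag
# expressed with the DataFrame window API instead of the HiveQL LAG query above;
# `h_tx_df` is the DataFrame registered as "htx" inside load_data().
from pyspark.sql import functions as F
from pyspark.sql.window import Window

def time_lag_df_api(h_tx_df):
    w = Window.partitionBy('cc_num').orderBy('unix_time')
    return h_tx_df.select('cc_num', 'unix_time',
                          F.lag('unix_time').over(w).alias('lag_time'))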
    StructField('price', FloatType(), True),
    StructField('postPrice', FloatType(), True),
    StructField('userNick', StringType(), True),
    StructField('categoryId', StringType(), True),
    StructField('categoryName', StringType(), True),
    StructField('fishPoolId', StringType(), True),
    StructField('fishpoolName', StringType(), True),
    StructField('bar', StringType(), True),
    StructField('barInfo', StringType(), True),
    StructField('abbr', StringType(), True),
    StructField('shiren', StringType(), True),
    StructField('zhima', StringType(), True),
    StructField('ts', StringType(), True)
])

hc = HiveContext(sc)
df = hc.createDataFrame(data, schema)
hc.registerDataFrameAsTable(df, "xianyu_iteminfo")
hc.sql(
    "insert OVERWRITE table wl_base.`t_base_ec_xianyu_iteminfo_parquet` PARTITION(ds = '" + lastday + "') "
    "select "
    "case when t1.itemid is null then t2.itemid else t1.itemid end, "
    "case when t1.itemid is null then t2.userid else t1.userid end, "
    "case when t1.itemid is null then t2.phone else t1.phone end, "
    "case when t1.itemid is null then t2.contacts else t1.contacts end, "
    "case when t1.itemid is null then t2.title else t1.title end, "
    "case when t1.itemid is null then t2.province else t1.province end, "
    "case when t1.itemid is null then t2.city else t1.city end, "
    "case when t1.itemid is null then t2.area else t1.area end, "
    "case when t1.itemid is null then t2.auctionType else t1.auctionType end, "
    "case when t1.itemid is null then t2.description else t1.description end, "
    sqlstr = sqlstr[:len(sqlstr) - 2]
    sqlstr += "\n) stored as orc"
    print sqlstr
    sql_context.sql(sqlstr)
    df.insertInto(tableName, overwrite)


if __name__ == '__main__':
    # log.debug("debug")
    # a = eval("(1,[2,3])")
    # print "xxxxxxx", a[1][0]
    # a = {1: 1.0, 3: 5.5}
    # str_a = str(a)
    # a = eval(str_a)
    # print a[1]
    # print json.loads("""{1:1}""")
    sc = SparkContext("local[1]", appName="bintrade.ml.diff_feature")
    sql_context = HiveContext(sc)
    sql_context.sql("""
        use fex_test
    """)
    sql_context.setConf("spark.sql.shuffle.partitions", "1")

    ldict = [{"symbol": "AAA", "date": "2010-01-01", "close": 1.0},
             {"symbol": "AAA", "date": "2010-01-01", "close": 1.0}]
    df = sql_context.createDataFrame(ldict)
    dfToTableWithPar(sql_context, df, "test_eod_AAA")
workday_list = ['five_', 'four_', 'three_', 'two_', 'one_']

## Financing balance
market_sh_finance_financevalue = pd.pivot_table(
    market_sh_finance_info,
    index=['customer_no', 'op_date'],
    columns=['date_rank'],
    values=['financevalue'])
market_sh_finance_financevalue.columns = [
    x + 'sh_market_financevalue' for x in workday_list
]
market_sh_finance_financevalue = market_sh_finance_financevalue.reset_index()
market_sh_finance_financevalue = add_statistics(market_sh_finance_financevalue,
                                                'sh_market_financevalue')
market_sh_finance_financevalue = hql.createDataFrame(
    market_sh_finance_financevalue)
market_sh_finance_financevalue.registerTempTable(
    'market_sh_finance_financevalue')

## Financing purchase amount
market_sh_finance_financebuyvalue = pd.pivot_table(
    market_sh_finance_info,
    index=['customer_no', 'op_date'],
    columns=['date_rank'],
    values=['financebuyvalue'])
market_sh_finance_financebuyvalue.columns = [
    x + 'sh_market_financebuyvalue' for x in workday_list
]
market_sh_finance_financebuyvalue = market_sh_finance_financebuyvalue.reset_index()
market_sh_finance_financebuyvalue = add_statistics(
    else:
        result1 = float(numerator1) / denominator1
    return result1


## Which URL has the most ranks below 10 across all keywords over the period?
doc = sc.textFile("maprfs:///user/root/serp.csv", use_unicode=False).map(lambda x: x.replace(",", "\t"))
header = doc.first()
data = doc.filter(lambda row: row != header).map(lambda x: x.split("\t"))
clean_data = data.filter(lambda line: len(line) == 7)
seo_data = clean_data.map(lambda x: (x[0], x[1], x[2], x[3], x[4], x[5], x[6]))

good_urls = seo_data.filter(lambda x: float(x[5]) < 10)
good_urls_df = sqlContext.createDataFrame(
    good_urls,
    schema=['keyword', 'market', 'location', 'device', 'date', 'rank', 'url'])
good_urls_df.registerTempTable('good_urls_df')

best_urls_df = good_urls_df[['url']].groupby(['url']).count().sort(col("count").desc())
best_urls_df.registerTempTable('best_urls_df')

sqlContext.sql('truncate table serp.best_rank_urls')
sqlContext.sql('insert into serp.best_rank_urls SELECT * FROM best_urls_df limit 1')

# Provide the set of keywords (keyword information) where the rank 1 URL changes the most over the period.
# A change, for the purpose of this question, is when a given keyword's rank 1 URL is different from the previous day's URL.
keyval = seo_data.map(lambda x: (x[0], x[1], x[4], x[5], x[6]))
keyval_rank = keyval.filter(lambda x: float(x[3]) == 1)
b_labels = sc.broadcast(labels)

# Result format of each element: [t_id, cluster_size, tags_str, id_str]
interest_based_matrix = sc.parallelize(id_list).map(
    lambda cno: (str(cno), b_indicators.value, b_labels.value)).map(
        lambda t: get_similar_industry(t[0], t[1], t[2])).collect()

# Based on similar asset level, store the final result
# Result format: [[t_id, cluster_size, tags_str, id_str]]
asset_based_matrix = sc.parallelize(id_list).map(
    lambda cno: (str(cno), b_indicators.value, b_labels.value)).map(
        lambda t: get_similar_asset(t[0], t[1], t[2])).collect()

# Interest-based
interest_based_df = hqlContext.createDataFrame(
    pd.DataFrame(
        interest_based_matrix,
        columns=['customer_no', 'cluster_size', 'tags', 'ids']))

# Asset-level-based
asset_based_df = hqlContext.createDataFrame(
    pd.DataFrame(
        asset_based_matrix,
        columns=['customer_no', 'cluster_size', 'tags', 'ids']))

# Register as temp tables to simplify later join operations
interest_based_df.registerTempTable('interest_based_table')
asset_based_df.registerTempTable('asset_based_table')

# Representative labels of customers for hot industry sectors
hot_based_labels = get_hot_industry(indicators, labels)
hot_based_df = hqlContext.createDataFrame(
def create_table(df, table_name, sqlContext, cols=None, size_limit=30):
    df.persist()
    no_plot_cols = []
    output = []
    cols_complete = []
    var_cols = [
        'colm', 'col_type', 'uniques', 'missing', 'mean', 'stddev', 'graph'
    ]
    type_dict = {
        'float': 'numeric',
        'long': 'numeric',
        'integer': 'numeric',
        'smallint': 'numeric',
        'int': 'numeric',
        'bigint': 'numeric',
        'string': 'categorical',
        'timestamp': 'date',
        'binary': 'indicator',
        'decimal(9,2)': 'numeric'
    }

    if cols is None:
        cols = df.columns

    for c in cols:
        print 'Getting {:s} data'.format(c)
        sys.stdout.flush()
        # print("Producing graphs" + str(col_graphs))
        cols_complete.append(c)
        rem_cols = list(set(df.columns) - set(cols_complete))

        # Initialize columns
        uniq = 0
        null = 0
        mean = 0
        std_dev = 0
        g = 0
        g_path = 0
        col_g = []
        # col_g_paths = []
        # col_g.extend(np.zeros(len(col_graphs)))

        uniq = df.select(c).distinct().count()
        print('... uniques: {:d}'.format(uniq))

        col_type = df.select(c).dtypes[0][1]
        col_type = type_dict[col_type]
        if uniq == 2:
            col_type = 'indicator'
        print('... column type: {:s}'.format(col_type))

        null = df.where(F.col(c).isNull()).count()
        print('... nulls: {:d}'.format(null))

        if (uniq < size_limit) & (col_type in ['categorical', 'indicator']):
            g, g_path = g_barplot(df, c)

        if col_type in ['numeric']:
            df_sum = df.select(c).agg(F.avg(F.col(c)), F.stddev(F.col(c))).take(1)
            mean = df_sum[0][0]
            std_dev = df_sum[0][1]
            print('... numerical summary: {:0.2f}, {:0.2f}'.format(mean, std_dev))
            g, g_path = g_histogram(df, c)

        print('... Single Graph Done')
        output.append(tuple([c, col_type, uniq, null, mean, std_dev, g_path]))
        # 2 factor charts here

    # create the table
    schema_list = [
        T.StructField("colm", T.StringType(), True),
        T.StructField("col_type", T.StringType(), True),
        T.StructField("uniques", T.IntegerType(), True),
        T.StructField("missing", T.IntegerType(), True),
        T.StructField("mean", T.FloatType(), True),
        T.StructField("stddev", T.FloatType(), True),
        T.StructField("graph", T.StringType(), True)
    ]
    # graph_schema_list = [T.StructField(x, T.StringType(), True) for x in col_graphs]
    # schema_list.extend(graph_schema_list)
    schema = T.StructType(schema_list)
    print schema

    rdd = sc.parallelize(output)
    hive = HiveContext(sc)
    hive.createDataFrame(rdd, schema=schema)\
        .write.mode('overwrite')\
        .saveAsTable('datamining.' + table_name, format='parquet')

    df.unpersist()
    print '... {:s} saved to cluster'.format(table_name)
    sys.stdout.flush()
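# Hypothetical usage sketch (the source table and DataFrame names are assumptions):
# profile a DataFrame and persist its column summary via the helper above.
# profile_df = sqlContext.table('datamining.some_source_table')
# create_table(profile_df, 'some_source_table_profile', sqlContext, size_limit=30)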
workday_list = ['five_', 'four_', 'three_', 'two_', 'one_']

# Pivot table: turn date_rank rows into columns
week_every_date_stock_position = pd.pivot_table(
    week_every_date_stock_position,
    index=['customer_no', 'op_date'],
    columns=['date_rank'],
    values=['position'])
week_every_date_stock_position.columns = [
    x + 'stock_position' for x in workday_list
]
week_every_date_stock_position = week_every_date_stock_position.reset_index()
week_every_date_stock_position = add_statistics(week_every_date_stock_position,
                                                'stock_position')
week_every_date_stock_position = hql.createDataFrame(
    week_every_date_stock_position)
week_every_date_stock_position.registerTempTable(
    'week_every_date_stock_position')

# 1.7 Maintenance margin ratio for each day of the past week
week_every_date_deposit_rate = hql.sql('''
    select customer_no,op_date,date_rank,deposit_rate
    from
    (select a.customer_no,op_date,init_date,deposit_rate,
            rank()over(partition by a.customer_no,op_date order by init_date) as date_rank
     from
     (select customer_no,close_rate as deposit_rate,
             concat(substr(init_date, 0,4), '-', substr(init_date, 5,2),'-', substr(init_date, 7,2)) as init_date
      from agg_cust_balance_workday)a
     join
     (select customer_no,op_date,op_before_one_date,op_before_week_date
# df = diff_std_5(df, 'top1_industry_volume')
# df = diff_std_5(df, 'top1_industry_change_rate')
# df = diff_std_5(df, 'top2_industry_open_price')
# df = diff_std_5(df, 'top2_industry_close_price')
# df = diff_std_5(df, 'top2_industry_high_price')
# df = diff_std_5(df, 'top2_industry_low_price')
# df = diff_std_5(df, 'top2_industry_amount')
# df = diff_std_5(df, 'top2_industry_volume')
# df = diff_std_5(df, 'top2_industry_change_rate')

# if int(flag) == 0:
#     df = pd.concat([df_id, df, df_label], axis=1)
# else:
#     df = pd.concat([df_id, df], axis=1)

spark_df = hql.createDataFrame(df)
try:
    if int(flag) == 0:
        spark_df.write.mode('overwrite').saveAsTable(
            'cust_mining.c99_feature_engine_train_{}'.format(run_date))
        print('write cust_mining.c99_feature_engine_train_' + run_date +
              ' to hive Successfully')
    else:
        spark_df.write.mode('overwrite').saveAsTable(
            'cust_mining.c99_feature_engine_test_{}'.format(run_date))
        print('write cust_mining.c99_feature_engine_test_' + run_date +
              ' to hive Successfully')
except:
    print('write ' + run_date + ' to hive Failed !!!')
           nvl(stock_type, '') as stock_type,
           nvl(current_qty, 0) as current_qty
    from ''' + db_name + '''.agg_cust_stock
    where part_date>=''' + one_year_ago + ''' and part_date<=''' + run_date + '''
''').rdd  # the result set returned by collect() is a list

s_e_date_rdd = stock_hold_cur_detail. \
    map(lambda row_item: (row_item['customer_no'] + ',' + row_item['stock_no'] + ',' + row_item['stock_type'],
                          row_item['init_date'] + ':' + str(row_item['current_qty']))). \
    reduceByKey(lambda x, y: x + ' ' + y). \
    flatMap(lambda i: parse_s_e_date(i))

# A DataFrame can be built from a list, tuple or RDD
s_e_date_df = hqlContext.createDataFrame(s_e_date_rdd, [
    'customer_no', 'stock_no', 'stock_type', 'stock_start_date', 'stock_end_date'])
s_e_date_df.registerTempTable("stock_hold_date_table")  # register all position-holding info as a table
hqlContext.cacheTable('stock_hold_date_table')

# Details of preferred stocks
stock_prefer_detail = hqlContext.sql('''
    select a.customer_no as customer_no,
           a.stock_no as stock_no,
           a.stock_name as stock_name,
           row_number() over (partition by customer_no order by total_exchange_amount) as rank
    from
    (
        select customer_no,
               stock_no,
               stock_name,
               sum(business_balance) as total_exchange_amount
list_2_load_keys = list(
    set(msg_2_days_dict.keys()).difference(set(list_ids_in_db)))[0:3]
if len(list_2_load_keys) == 0:
    print("No Process ids to load")
    exit()

list_2_load_str = "/* ".join(filter(None, list_2_load_keys)) + "/*"
list_2_load_keys_ids = [str(i429).split("/")[-1] for i429 in list_2_load_keys]
print("\n\nThe following processids are loaded " +
      str(list_2_load_keys_ids).replace("[", "").replace("]", ""))

ids_2_load = [[i425, msg_2_days_dict[i425]] for i425 in list_2_load_keys]
# ids_2_load = [i400.split(" ") for i400 in list_ids_2_load]

R = Row('id', 'date_')
ids_2_load_df = sqlContext.createDataFrame(
    [R(i421[0], i421[1]) for i421 in ids_2_load])
ids_2_load_df = ids_2_load_df.withColumn("Process__id", lit(process__id))
ids_2_load_df = ids_2_load_df.withColumn("gdia_load_date", lit(str(today_full)))

# commands.getoutput("hadoop fs -cat "+list_2_load_str+"| perl -p -e 's/{\"entitydata\":/\n{\"entitydata\":/g'| grep -v \"^$\" |hadoop fs -put -f - JSON_INPUT_FSM")
commands.getoutput(
    "hadoop fs -cat " + list_2_load_str +
    "| perl -p -e 's/{\"entitydata\":/\n{\"entitydata\":/g'| grep -v \"^$\" |hadoop fs -put -f - " +
    db_location + db + "/raw/JSON_INPUT")


# https://stackoverflow.com/questions/5508509/how-do-i-check-if-a-string-is-valid-json-in-python
def is_json(myjson, index_value, index_col):
    try:
        json_object = json.loads(myjson)
workday_list = ['five_', 'four_', 'three_', 'two_', 'one_']

### Opening price
stock_concentrate_top2_openprice = pd.pivot_table(
    stock_concentrate_top2_info,
    index=['customer_no', 'op_date'],
    columns=['date_rank'],
    values=['openprice'])
stock_concentrate_top2_openprice.columns = [
    x + 'top2_stock_openprice' for x in workday_list
]
stock_concentrate_top2_openprice = stock_concentrate_top2_openprice.reset_index()
stock_concentrate_top2_openprice = add_statistics(
    stock_concentrate_top2_openprice, 'top2_stock_openprice')
stock_concentrate_top2_openprice = hql.createDataFrame(
    stock_concentrate_top2_openprice)
stock_concentrate_top2_openprice.registerTempTable(
    'stock_concentrate_top2_openprice')

### Closing price
stock_concentrate_top2_closeprice = pd.pivot_table(
    stock_concentrate_top2_info,
    index=['customer_no', 'op_date'],
    columns=['date_rank'],
    values=['closeprice'])
stock_concentrate_top2_closeprice.columns = [
    x + 'top2_stock_closeprice' for x in workday_list
]
stock_concentrate_top2_closeprice = stock_concentrate_top2_closeprice.reset_index()
stock_concentrate_top2_closeprice = add_statistics(
    df.insertInto(tableName, overwrite)


if __name__ == '__main__':
    # log.debug("debug")
    # a = eval("(1,[2,3])")
    # print "xxxxxxx", a[1][0]
    # a = {1: 1.0, 3: 5.5}
    # str_a = str(a)
    # a = eval(str_a)
    # print a[1]
    # print json.loads("""{1:1}""")
    sc = SparkContext("local[1]", appName="bintrade.ml.diff_feature")
    sql_context = HiveContext(sc)
    sql_context.sql("""
        use fex_test
    """)
    sql_context.setConf("spark.sql.shuffle.partitions", "1")

    ldict = [{
        "symbol": "AAA",
        "date": "2010-01-01",
        "close": 1.0
    }, {
        "symbol": "AAA",
        "date": "2010-01-01",
        "close": 1.0
    }]
    df = sql_context.createDataFrame(ldict)
    dfToTableWithPar(sql_context, df, "test_eod_AAA")