def run_hive():
    dic = [{'id': '1,2,3'}]
    # df = pd.DataFrame(dic)
    sc = SparkContext()
    # sc.parallelize(dic)
    sql_ctx = HiveContext(sc)
    # sql_ctx.registerDataFrameAsTable(df, "aaa")
    sdf = sql_ctx.createDataFrame(dic)
    sdf.registerTempTable('aaa')
    # sdf.show()
    # df2 = sql_ctx.sql('select split(id, ",") from aaa')
    # explode() is a UDTF and cannot be nested inside collect_list(),
    # so flatten the split values through a lateral view first.
    df2 = sql_ctx.sql(
        'select collect_list(cast(t.val as string)) '
        'from aaa lateral view explode(split(id, ",")) t as val'
    )
    df2.show()
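# A minimal alternative sketch (an assumption, not part of the original snippet):
# the same flattening done with the DataFrame API instead of lateral-view HiveQL.
# `sdf` refers to the DataFrame built inside run_hive() above.
from pyspark.sql.functions import split, explode, collect_list, col

def run_hive_df_api(sdf):
    exploded = sdf.select(explode(split(col('id'), ',')).alias('val'))
    exploded.agg(collect_list('val')).show()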
        phone = ob.get("bankpremobile", None)
    else:
        phone = None
    return (phone, idCard, idBank, name)


# xiaoshudian_app_key = "1186159692"
# xiaoshudian_tec_api = ["tel", "address_getbymobile", "channel_NameIDCardAccountVerify",
#                        "channel_cellphone", "operator_capricorn", "address_match",
#                        "channel_idcard", "channel_bankby3", "channel_idNameFase",
#                        "channel_criminal", "channel_blacklistverify", "credit_implement"]
# def filter(app_key, api):
#     not (app_key in xiaoshudian_app_key and api not in xiaoshudian_tec_api)

data_rdd = data.rdd.map(lambda a: (a.app_key_param, a.date, standard_params(a.params),
                                   a.interface, a.api_type)) \
    .map(lambda (a, b, c, d, e): (a, b, c[0], c[1], c[2], c[3], d, e))
'''
c[0] -> phone
c[1] -> idcard
c[2] -> idbank
c[3] -> name
'''

schemaStr = "app_key date phone idcard idbank name interface api_type"
fields = [
    StructField(field_name, StringType(), True)
    for field_name in schemaStr.split()
]
schema = StructType(fields)

data_df = hc.createDataFrame(data_rdd, schema).distinct()
dfw = DataFrameWriter(data_df)
dfw.saveAsTable("wl_analysis.t_lel_record_data_backflow", mode="overwrite")
stock_concentrate_top1_industry_info.open_price = stock_concentrate_top1_industry_info.open_price.astype('float')
stock_concentrate_top1_industry_info.close_price = stock_concentrate_top1_industry_info.close_price.astype('float')
stock_concentrate_top1_industry_info.high_price = stock_concentrate_top1_industry_info.high_price.astype('float')
stock_concentrate_top1_industry_info.low_price = stock_concentrate_top1_industry_info.low_price.astype('float')
stock_concentrate_top1_industry_info.amount = stock_concentrate_top1_industry_info.amount.astype('float')
stock_concentrate_top1_industry_info.volume = stock_concentrate_top1_industry_info.volume.astype('float')
stock_concentrate_top1_industry_info.change_rate = stock_concentrate_top1_industry_info.change_rate.astype('float')

workday_list = ['five_', 'four_', 'three_', 'two_', 'one_']

# Opening price
top1_open_price_info = pd.pivot_table(stock_concentrate_top1_industry_info,
                                      index=['customer_no', 'op_date'],
                                      columns=['date_rank'],
                                      values=['open_price'])
top1_open_price_info.columns = [x + 'top1_industry_open_price' for x in workday_list]
top1_open_price_info = top1_open_price_info.reset_index()
top1_open_price_info = add_statistics(top1_open_price_info, 'top1_industry_open_price')
top1_open_price_info = hql.createDataFrame(top1_open_price_info)
top1_open_price_info.registerTempTable('top1_open_price_info')

# Closing price
top1_close_price_info = pd.pivot_table(stock_concentrate_top1_industry_info,
                                       index=['customer_no', 'op_date'],
                                       columns=['date_rank'],
                                       values=['close_price'])
top1_close_price_info.columns = [x + 'top1_industry_close_price' for x in workday_list]
top1_close_price_info = top1_close_price_info.reset_index()
top1_close_price_info = add_statistics(top1_close_price_info, 'top1_industry_close_price')
top1_close_price_info = hql.createDataFrame(top1_close_price_info)
top1_close_price_info.registerTempTable('top1_close_price_info')

# Highest price
top1_high_price_info = pd.pivot_table(stock_concentrate_top1_industry_info,
                                      index=['customer_no', 'op_date'],
                                      columns=['date_rank'],
                                      values=['high_price'])
top1_high_price_info.columns = [x + 'top1_industry_high_price' for x in workday_list]
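# Illustrative toy sketch (made-up data, not from the production tables): pivoting
# date_rank into columns yields one column per workday, and the positional renaming
# above maps ranks 1..5 to the 'five_'..'one_' prefixes in workday_list.
toy = pd.DataFrame({'customer_no': ['c1'] * 5,
                    'op_date': ['2020-01-06'] * 5,
                    'date_rank': [1, 2, 3, 4, 5],
                    'open_price': [10.0, 10.2, 10.1, 10.4, 10.3]})
toy_pivot = pd.pivot_table(toy, index=['customer_no', 'op_date'],
                           columns=['date_rank'], values=['open_price'])
toy_pivot.columns = [x + 'toy_open_price' for x in workday_list]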
pdf['is_polluter'] = model.predict(pdf[predict_cols])

'''
print "Predictions: \n"
print pdf.head()
'''

# set a threshold of 85% probability to flag as spammer
polluters = pdf[pdf.is_polluter > 0.85]  # .to_json(orient="records")

links = polluters.tweeted_urls
links_df = sqlContext.createDataFrame(pd.DataFrame(links))
links_df = links_df.select(['tweeted_urls.url', 'tweeted_urls.expanded_url'])
uniqueLinks = links_df.dropDuplicates(['url', 'expanded_url'])

# forget about S3:
# uniqueLinks.repartition(1).save("s3n://w205twitterproject/temp_urls", "json")

print "saving file to /data/w205Project/honeypot/temp_urls.json...."

# instead, save to file on local disk for use by scrapy
uniqueLinks.toPandas().to_json(
    orient="records",
    path_or_buf='/data/w205Project/python/url_spider/url_spider/logs/temp_urls.log')

#################################################################################################################
# APPEND CLASSIFIED DATA TO POSTGRES FOR DASHBOARD
#################################################################################################################

from sqlalchemy import create_engine
])

genband_stats_schema = StructType([
    StructField("iteration_id", StringType(), False),
    StructField("file_id", StringType(), False),
    StructField("file_name", StringType(), False),
    StructField("check_sum", StringType(), False),
    StructField("total_cnt", StringType(), False),
    StructField("success_cnt", StringType(), False),
    StructField("failed_cnt", StringType(), False),
    StructField("timeframe_day", LongType(), False),
    StructField("timeframe_hr", IntegerType(), False)
])

## The code below handles Hive storage
calls_df = sqlContext.createDataFrame(final, genband_schema)
# sqlContext.sql("drop table if exists gtt.GENBAND_CDR")
# sqlContext.sql("CREATE TABLE gtt.GENBAND_CDR (file_id string, file_name string, connect_datetime timestamp, originating_number string, terminating_number string, "
#                "elapsed_time decimal(12,2), dom_int_indicator string, trunkid1 string, trunkid2 string, call_code string, "
#                "completion_ind string, answer_ind string) partitioned by (timeframe_day bigint, timeframe_hr int)")

stats_df = sqlContext.createDataFrame(final_stats, genband_stats_schema)
# sqlContext.sql("drop table if exists gtt.GENBAND_PARSER_STATS")
# sqlContext.sql("CREATE TABLE gtt.GENBAND_PARSER_STATS (iteration_id string, file_id string, file_name string, "
#                "check_sum string, total_cnt string, success_cnt string, failed_cnt string) partitioned by (timeframe_day bigint, timeframe_hr int)")

print(calls_df.columns)
print(calls_df.count())
# calls_df.write.format("parquet").mode("append").saveAsTable("gtt.genband_cdr")
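# Hedged sketch (an assumption, not taken from the original job): persisting both
# frames into the partitioned Hive tables would mirror the commented-out saveAsTable
# call above, with partitionBy matching the (timeframe_day, timeframe_hr) spec.
# calls_df.write.format("parquet").mode("append") \
#     .partitionBy("timeframe_day", "timeframe_hr").saveAsTable("gtt.genband_cdr")
# stats_df.write.format("parquet").mode("append") \
#     .partitionBy("timeframe_day", "timeframe_hr").saveAsTable("gtt.genband_parser_stats")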
# list_ids_in_db = [i424["id"] for i424 in list_ids_in_db_row if i424["id"] != '']
# list_ids_in_db

list_2_load_keys = list(set(msg_2_days_dict.keys()).difference(set(list_ids_in_db)))[0:2]
if len(list_2_load_keys) == 0:
    print("No data to load")
    exit()

list_2_load_str = "/* ".join(filter(None, list_2_load_keys)) + "/*"
list_2_load_str

ids_2_load = [[i425, msg_2_days_dict[i425]] for i425 in list_2_load_keys]
# ids_2_load = [i400.split(" ") for i400 in list_ids_2_load]

R = Row('id', 'date_')
ids_2_load_df = sqlContext.createDataFrame([R(i421[0], i421[1]) for i421 in ids_2_load])

commands.getoutput("hadoop fs -cat " + list_2_load_str + "| perl -p -e 's/{\"entitydata\":/\n{\"entitydata\":/g'| grep -v \"^$\" |hadoop fs -put -f - JSON_INPUT_gcct")


# https://stackoverflow.com/questions/5508509/how-do-i-check-if-a-string-is-valid-json-in-python
def is_json(myjson):
    try:
        json_object = json.loads(myjson)
    except ValueError:
        return {"_corrupt_record_data": "True"}
    return json_object


def create_table_stmnt(db_location, db, table):
def load_data():
    # load data from files
    # and return query results / aggregates.
    hiveContext = HiveContext(sc)

    # 1027
    # path = '/home/brandon/PycharmProjects/markov_chain/data/raw_tx/'
    # path = '/home/brandon/PycharmProjects/markov_chain/data/raw_tx_fraud/train/'

    # AMAZON AWS EMR
    path = 'hdfs:///tmp/files/'  # HDFS

    # new segment files
    tx_files = [path + 'l_adults_2550_female_rural.csv', path + 'l_adults_2550_female_urban.csv',
                path + 'l_adults_2550_male_rural.csv', path + 'l_adults_2550_male_urban.csv',
                path + 'l_young_adults_female_rural.csv', path + 'l_young_adults_female_urban.csv',
                path + 'l_young_adults_male_rural.csv', path + 'l_young_adults_male_urban.csv',
                path + 'l_adults_50up_female_rural.csv', path + 'l_adults_50up_female_urban.csv',
                path + 'l_adults_50up_male_rural.csv', path + 'l_adults_50up_male_urban.csv']

    # small file for debugging
    # 1027
    # tx_files = [path + 's_l_male_30_40_smaller_cities.csv']
    # tx_files = [path + 'sorted_fraud_male_30_40_smaller_cities.csv']
    # tx_files = [path+'40_60_bigger_cities.csv', path+'40_60_smaller_cities.csv', path+'all_60_up.csv',
    #             path+'female_30_40_bigger_cities.csv', path+'female_30_40_smaller_cities.csv',
    #             path+'male_30_40_bigger_cities.csv', path+'male_30_40_smaller_cities.csv',
    #             path+'millenials.csv', path+'young_adults.csv']
    # 1027
    # tx_files = [path+'l_40_60_bigger_cities.csv', path+'l_40_60_smaller_cities.csv', path+'l_all_60_up.csv',
    #             path+'l_female_30_40_bigger_cities.csv', path+'l_female_30_40_smaller_cities.csv',
    #             path+'l_male_30_40_bigger_cities.csv', path+'l_male_30_40_smaller_cities.csv',
    #             path+'l_millenials.csv', path+'l_young_adults.csv']

    all_tx = sc.textFile(','.join(tx_files), 600)

    # 1027
    # txSchemaString = 'ssn|cc_num|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|acct_num|profile|trans_num|trans_date|trans_time|unix_time|category|amt|merchant|merch_lat|merch_long'
    txSchemaString = 'ssn|cc_num|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|acct_num|profile|trans_num|trans_date|trans_time|unix_time|category|amt|is_fraud|merchant|merch_lat|merch_long'
    txFields = [StructField(field_name, StringType(), True) for field_name in txSchemaString.split('|')]
    txFields[17] = StructField('trans_date', DateType(), True)
    txSchema = StructType(txFields)

    # ssn|cc_num|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|acct_num|profile|trans_num|trans_date|trans_time|unix_time|category|amt|merchant|merch_lat|merch_long
    txHeader = all_tx.filter(lambda l: "ssn|" in l)
    txNoHeader = all_tx.subtract(txHeader)

    temp_tx = txNoHeader.map(lambda k: k.split("|")).map(lambda p: (
        p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], p[8], p[9], p[10], p[11], p[12], p[13],
        p[14], p[15], p[16], datetime.datetime.strptime(p[17], '%Y-%m-%d').date(),
        p[18], p[19], p[20], p[21], p[22], p[23], p[24], p[25]))

    h_tx_df = hiveContext.createDataFrame(temp_tx, txSchema)
    h_tx_df.registerTempTable("htx")
    hiveContext.cacheTable("htx")

    # HBASE CODE HERE
    # create dataframe with all records
    # map using hbase_process to extract record into individual components
    # and create a dictionary to store in hbase
    # h_data = hiveContext.sql("SELECT * FROM htx")
    # h_data.map(hbase_process).foreachPartition(store_full_data)

    # get cust mean time between transactions
    time_lag_eval = hiveContext.sql(
        "SELECT cc_num, unix_time, LAG(htx.unix_time) OVER (PARTITION BY htx.cc_num ORDER BY htx.unix_time) as lag_time "
        "from htx order by cc_num, unix_time asc")
    time_lag_eval.registerTempTable("ts_lag")

    user_avg_time = hiveContext.sql(
        "SELECT cc_num, AVG(unix_time - lag_time) as time_diff, "
        "percentile_approx((unix_time - lag_time),0.1) as low_bound, "
        "percentile_approx((unix_time - lag_time),0.90) as high_bound "
        "from ts_lag where lag_time is not null group by cc_num")
    user_avg_time.registerTempTable("avg_time")

    # get cust mean per category
    mean_per_cat = hiveContext.sql(
        "SELECT cc_num, category, avg(amt) as mean_exp, "
        "(avg(amt)-2*(stddev_pop(amt))) as low_bound, "
        "(avg(amt)+2*(stddev_pop(amt))) as high_bound "
        "from htx group by cc_num, category")
    mean_per_cat.registerTempTable("mean_per_cat")

    # evaluate amount for HML and time of purchase for normal/abnormal
    test = hiveContext.sql(
        # "SELECT htx.cc_num, profile, htx.category, htx.trans_date, htx.trans_time, htx.unix_time, "
        # "IF(htx.amt>(2 * m.mean_exp),'H',(IF(htx.amt<(0.5 * m.mean_exp),'L','N'))) as EXP, "
        # "IF(htx.category like '%_net%','N','P') as CNP, htx.amt, "
        # "LAG(htx.unix_time) OVER (PARTITION BY htx.cc_num ORDER BY htx.unix_time) as lag_time "
        # "from htx join mean_per_cat m on htx.cc_num=m.cc_num and m.category =htx.category")
        "SELECT htx.cc_num, profile, htx.category, htx.trans_date, htx.trans_time, htx.unix_time, "
        "IF(htx.amt>m.high_bound,'H',(IF(htx.amt < m.low_bound,'L','N'))) as EXP, "
        "IF(cast(SUBSTR(htx.trans_time,0,2) as int)<05,'A',IF(cast(SUBSTR(htx.trans_time,0,2) as int)>21,'A','N')) as NAT, "
        "htx.amt, LAG(htx.unix_time) OVER (PARTITION BY htx.cc_num ORDER BY htx.unix_time) as lag_time "
        "from htx join mean_per_cat m on htx.cc_num=m.cc_num and m.category =htx.category")
    test.registerTempTable("full_table")

    # evaluate for transaction time (HML)
    full_data = hiveContext.sql(
        "SELECT full_table.cc_num, profile, category, trans_date, trans_time, unix_time, lag_time, "
        "IF(lag_time is null,100000,unix_time-lag_time) as time_since, amt, EXP, NAT, "
        "IF((unix_time-lag_time)<avg_time.low_bound,'H',IF((unix_time-lag_time)>avg_time.high_bound,'L','N')) as VEL "
        "from full_table left join avg_time on avg_time.cc_num = full_table.cc_num")
    full_data.registerTempTable("full_data")

    # return full tx data for user with reduced HML/AN/HML variables
    per_cust_transactions = hiveContext.sql(
        "SELECT cc_num as cust_id, concat(EXP, NAT, VEL) as trans_list "
        "from full_data order by cc_num, unix_time asc")

    # return full tx data for profile with reduced HML/NP/HML variables in sorted order
    # pre_sort_
    per_profile_transactions = hiveContext.sql(
        "SELECT profile as cust_id, concat(EXP, NAT, VEL) as trans_list "
        "from full_data order by profile, unix_time asc")
    # pre_sort_per_profile_transactions.registerTempTable("pre_sort")

    # we only need cust_id (really profile name here) and trans_list,
    # but we had to include cc_num above in our sort
    # per_profile_transactions = hiveContext.sql("SELECT cust_id, trans_list from pre_sort")

    # gets pre-computed reference values for each customer and stores in redis:
    # avg spent per category, n transactions, last unix time stamp
    agg_info = hiveContext.sql(
        "SELECT CONCAT(category, '_', cc_num) as cust_id, category, "
        "concat(low_bound,',',high_bound) as low_high from mean_per_cat")
    avg_cat_data = agg_info.rdd.map(lambda x: [str(x.cust_id), str(x.low_high)])

    agg_n_tx = hiveContext.sql(
        "SELECT CONCAT('count_', cc_num) as cust_id, count(cc_num) as tx_count "
        "from full_data group by cc_num")
    n_tx = agg_n_tx.rdd.map(lambda x: [str(x.cust_id), str(x.tx_count)])

    agg_unix_ts = hiveContext.sql(
        "SELECT CONCAT('timestamp_', cc_num) as cust_id, max(unix_time) as last_unix_time "
        "from full_data group by cc_num")
    n_ts = agg_unix_ts.rdd.map(lambda x: [str(x.cust_id), str(x.last_unix_time)])

    agg_vel_info = hiveContext.sql(
        "SELECT CONCAT('velocity_', cc_num) as cust_id, "
        "concat(low_bound,',',high_bound) as low_high from avg_time")
    avg_vel_data = agg_vel_info.rdd.map(lambda x: [str(x.cust_id), str(x.low_high)])

    # compile our final string per customer for all tx's
    per_cust_transactions_r = per_cust_transactions.map(lambda p: (str(p.cust_id), str(p.trans_list))) \
        .reduceByKey(lambda y, z: y + ',' + z).map(lambda x: ''.join(x[0]) + ',' + x[1])

    # compile our final string per profile for all tx's
    per_profile_transactions_r = per_profile_transactions.map(lambda p: (str(p.cust_id), str(p.trans_list))) \
        .reduceByKey(lambda y, z: y + ',' + z).map(lambda x: ''.join(x[0]) + ',' + x[1])

    # return tx data and aggregates
    return_dict = {}
    return_dict['profile'] = per_profile_transactions_r
    return_dict['customer'] = per_cust_transactions_r

    return avg_cat_data, n_tx, n_ts, return_dict, avg_vel_data
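# Hedged sketch (not part of the original pipeline): the same per-customer time lag
# expressed with the DataFrame window API instead of the HiveQL LAG query above;
# `h_tx_df` is the DataFrame registered as "htx" inside load_data().
from pyspark.sql import functions as F
from pyspark.sql.window import Window

def time_lag_df_api(h_tx_df):
    w = Window.partitionBy('cc_num').orderBy('unix_time')
    return h_tx_df.select('cc_num', 'unix_time',
                          F.lag('unix_time').over(w).alias('lag_time'))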
    StructField('price', FloatType(), True),
    StructField('postPrice', FloatType(), True),
    StructField('userNick', StringType(), True),
    StructField('categoryId', StringType(), True),
    StructField('categoryName', StringType(), True),
    StructField('fishPoolId', StringType(), True),
    StructField('fishpoolName', StringType(), True),
    StructField('bar', StringType(), True),
    StructField('barInfo', StringType(), True),
    StructField('abbr', StringType(), True),
    StructField('shiren', StringType(), True),
    StructField('zhima', StringType(), True),
    StructField('ts', StringType(), True)
])

hc = HiveContext(sc)
df = hc.createDataFrame(data, schema)
hc.registerDataFrameAsTable(df, "xianyu_iteminfo")
hc.sql(
    "insert OVERWRITE table wl_base.`t_base_ec_xianyu_iteminfo_parquet` PARTITION(ds = '" + lastday + "') "
    "select "
    "case when t1.itemid is null then t2.itemid else t1.itemid end, "
    "case when t1.itemid is null then t2.userid else t1.userid end, "
    "case when t1.itemid is null then t2.phone else t1.phone end, "
    "case when t1.itemid is null then t2.contacts else t1.contacts end, "
    "case when t1.itemid is null then t2.title else t1.title end, "
    "case when t1.itemid is null then t2.province else t1.province end, "
    "case when t1.itemid is null then t2.city else t1.city end, "
    "case when t1.itemid is null then t2.area else t1.area end, "
    "case when t1.itemid is null then t2.auctionType else t1.auctionType end, "
    "case when t1.itemid is null then t2.description else t1.description end, "
    sqlstr = sqlstr[:len(sqlstr) - 2]
    sqlstr += "\n) stored as orc"
    print sqlstr
    sql_context.sql(sqlstr)
    df.insertInto(tableName, overwrite)


if __name__ == '__main__':
    # log.debug("debug")
    # a = eval("(1,[2,3])")
    # print "xxxxxxx", a[1][0]
    # a = {1: 1.0, 3: 5.5}
    # str_a = str(a)
    # a = eval(str_a)
    # print a[1]
    # print json.loads("""{1:1}""")
    sc = SparkContext("local[1]", appName="bintrade.ml.diff_feature")
    sql_context = HiveContext(sc)
    sql_context.sql("""
        use fex_test
    """)
    sql_context.setConf("spark.sql.shuffle.partitions", "1")

    ldict = [{"symbol": "AAA", "date": "2010-01-01", "close": 1.0},
             {"symbol": "AAA", "date": "2010-01-01", "close": 1.0}]
    df = sql_context.createDataFrame(ldict)
    dfToTableWithPar(sql_context, df, "test_eod_AAA")
workday_list = ['five_', 'four_', 'three_', 'two_', 'one_']

## Financing balance
market_sh_finance_financevalue = pd.pivot_table(
    market_sh_finance_info,
    index=['customer_no', 'op_date'],
    columns=['date_rank'],
    values=['financevalue'])
market_sh_finance_financevalue.columns = [
    x + 'sh_market_financevalue' for x in workday_list
]
market_sh_finance_financevalue = market_sh_finance_financevalue.reset_index()
market_sh_finance_financevalue = add_statistics(market_sh_finance_financevalue,
                                                'sh_market_financevalue')
market_sh_finance_financevalue = hql.createDataFrame(
    market_sh_finance_financevalue)
market_sh_finance_financevalue.registerTempTable(
    'market_sh_finance_financevalue')

## Financing purchase amount
market_sh_finance_financebuyvalue = pd.pivot_table(
    market_sh_finance_info,
    index=['customer_no', 'op_date'],
    columns=['date_rank'],
    values=['financebuyvalue'])
market_sh_finance_financebuyvalue.columns = [
    x + 'sh_market_financebuyvalue' for x in workday_list
]
market_sh_finance_financebuyvalue = market_sh_finance_financebuyvalue.reset_index()
market_sh_finance_financebuyvalue = add_statistics(
    else:
        result1 = float(numerator1) / denominator1
    return result1


## Which URL has the most ranks below 10 across all keywords over the period?
doc = sc.textFile("maprfs:///user/root/serp.csv", use_unicode=False).map(lambda x: x.replace(",", "\t"))
header = doc.first()
data = doc.filter(lambda row: row != header).map(lambda x: x.split("\t"))
clean_data = data.filter(lambda line: len(line) == 7)
seo_data = clean_data.map(lambda x: (x[0], x[1], x[2], x[3], x[4], x[5], x[6]))

good_urls = seo_data.filter(lambda x: float(x[5]) < 10)
good_urls_df = sqlContext.createDataFrame(
    good_urls,
    schema=['keyword', 'market', 'location', 'device', 'date', 'rank', 'url'])
good_urls_df.registerTempTable('good_urls_df')

best_urls_df = good_urls_df[['url']].groupby(['url']).count().sort(col("count").desc())
best_urls_df.registerTempTable('best_urls_df')

sqlContext.sql('truncate table serp.best_rank_urls')
sqlContext.sql('insert into serp.best_rank_urls SELECT * FROM best_urls_df limit 1')

# Provide the set of keywords (keyword information) where the rank 1 URL changes the most over the period.
# A change, for the purpose of this question, is when a given keyword's rank 1 URL is different from the previous day's URL.
keyval = seo_data.map(lambda x: (x[0], x[1], x[4], x[5], x[6]))
keyval_rank = keyval.filter(lambda x: float(x[3]) == 1)
b_labels = sc.broadcast(labels)

# Result format of each element: [t_id, cluster_size, tags_str, id_str]
interest_based_matrix = sc.parallelize(id_list).map(
    lambda cno: (str(cno), b_indicators.value, b_labels.value)).map(
        lambda t: get_similar_industry(t[0], t[1], t[2])).collect()

# Based on similar asset level, store the final result
# Result format: [[t_id, cluster_size, tags_str, id_str]]
asset_based_matrix = sc.parallelize(id_list).map(
    lambda cno: (str(cno), b_indicators.value, b_labels.value)).map(
        lambda t: get_similar_asset(t[0], t[1], t[2])).collect()

# Interest-based
interest_based_df = hqlContext.createDataFrame(
    pd.DataFrame(
        interest_based_matrix,
        columns=['customer_no', 'cluster_size', 'tags', 'ids']))

# Asset-level-based
asset_based_df = hqlContext.createDataFrame(
    pd.DataFrame(
        asset_based_matrix,
        columns=['customer_no', 'cluster_size', 'tags', 'ids']))

# Register as temp tables to simplify later join operations
interest_based_df.registerTempTable('interest_based_table')
asset_based_df.registerTempTable('asset_based_table')

# Representative labels of customers for hot industry sectors
hot_based_labels = get_hot_industry(indicators, labels)
hot_based_df = hqlContext.createDataFrame(
def create_table(df, table_name, sqlContext, cols=None, size_limit=30):
    df.persist()
    no_plot_cols = []
    output = []
    cols_complete = []
    var_cols = [
        'colm', 'col_type', 'uniques', 'missing', 'mean', 'stddev', 'graph'
    ]
    type_dict = {
        'float': 'numeric',
        'long': 'numeric',
        'integer': 'numeric',
        'smallint': 'numeric',
        'int': 'numeric',
        'bigint': 'numeric',
        'string': 'categorical',
        'timestamp': 'date',
        'binary': 'indicator',
        'decimal(9,2)': 'numeric'
    }

    if cols is None:
        cols = df.columns

    for c in cols:
        print 'Getting {:s} data'.format(c)
        sys.stdout.flush()
        # print("Producing graphs" + str(col_graphs))
        cols_complete.append(c)
        rem_cols = list(set(df.columns) - set(cols_complete))

        # Initialize columns
        uniq = 0
        null = 0
        mean = 0
        std_dev = 0
        g = 0
        g_path = 0
        col_g = []
        # col_g_paths = []
        # col_g.extend(np.zeros(len(col_graphs)))

        uniq = df.select(c).distinct().count()
        print('... uniques: {:d}'.format(uniq))

        col_type = df.select(c).dtypes[0][1]
        col_type = type_dict[col_type]
        if uniq == 2:
            col_type = 'indicator'
        print('... column type: {:s}'.format(col_type))

        null = df.where(F.col(c).isNull()).count()
        print('... nulls: {:d}'.format(null))

        if (uniq < size_limit) & (col_type in ['categorical', 'indicator']):
            g, g_path = g_barplot(df, c)

        if col_type in ['numeric']:
            df_sum = df.select(c).agg(F.avg(F.col(c)), F.stddev(F.col(c))).take(1)
            mean = df_sum[0][0]
            std_dev = df_sum[0][1]
            print('... numerical summary: {:0.2f}, {:0.2f}'.format(mean, std_dev))
            g, g_path = g_histogram(df, c)

        print('... Single Graph Done')
        output.append(tuple([c, col_type, uniq, null, mean, std_dev, g_path]))
        # 2 factor charts here

    # create the table
    schema_list = [
        T.StructField("colm", T.StringType(), True),
        T.StructField("col_type", T.StringType(), True),
        T.StructField("uniques", T.IntegerType(), True),
        T.StructField("missing", T.IntegerType(), True),
        T.StructField("mean", T.FloatType(), True),
        T.StructField("stddev", T.FloatType(), True),
        T.StructField("graph", T.StringType(), True)
    ]
    # graph_schema_list = [T.StructField(x, T.StringType(), True) for x in col_graphs]
    # schema_list.extend(graph_schema_list)
    schema = T.StructType(schema_list)
    print schema

    rdd = sc.parallelize(output)
    hive = HiveContext(sc)
    hive.createDataFrame(rdd, schema=schema)\
        .write.mode('overwrite')\
        .saveAsTable('datamining.' + table_name, format='parquet')

    df.unpersist()
    print '... {:s} saved to cluster'.format(table_name)
    sys.stdout.flush()
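# Hypothetical usage sketch (the source table and DataFrame names are assumptions):
# profile a DataFrame and persist its column summary via the helper above.
# profile_df = sqlContext.table('datamining.some_source_table')
# create_table(profile_df, 'some_source_table_profile', sqlContext, size_limit=30)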
workday_list = ['five_', 'four_', 'three_', 'two_', 'one_']

# Pivot table: turn date_rank rows into columns
week_every_date_stock_position = pd.pivot_table(
    week_every_date_stock_position,
    index=['customer_no', 'op_date'],
    columns=['date_rank'],
    values=['position'])
week_every_date_stock_position.columns = [
    x + 'stock_position' for x in workday_list
]
week_every_date_stock_position = week_every_date_stock_position.reset_index()
week_every_date_stock_position = add_statistics(week_every_date_stock_position,
                                                'stock_position')
week_every_date_stock_position = hql.createDataFrame(
    week_every_date_stock_position)
week_every_date_stock_position.registerTempTable(
    'week_every_date_stock_position')

# 1.7 Maintenance margin ratio for each day of the past week
week_every_date_deposit_rate = hql.sql('''
    select customer_no,op_date,date_rank,deposit_rate
    from
    (select a.customer_no,op_date,init_date,deposit_rate,
            rank()over(partition by a.customer_no,op_date order by init_date) as date_rank
     from
     (select customer_no,close_rate as deposit_rate,
             concat(substr(init_date, 0,4), '-', substr(init_date, 5,2),'-', substr(init_date, 7,2)) as init_date
      from agg_cust_balance_workday)a
     join
     (select customer_no,op_date,op_before_one_date,op_before_week_date
# df = diff_std_5(df, 'top1_industry_volume')
# df = diff_std_5(df, 'top1_industry_change_rate')
# df = diff_std_5(df, 'top2_industry_open_price')
# df = diff_std_5(df, 'top2_industry_close_price')
# df = diff_std_5(df, 'top2_industry_high_price')
# df = diff_std_5(df, 'top2_industry_low_price')
# df = diff_std_5(df, 'top2_industry_amount')
# df = diff_std_5(df, 'top2_industry_volume')
# df = diff_std_5(df, 'top2_industry_change_rate')

# if int(flag) == 0:
#     df = pd.concat([df_id, df, df_label], axis=1)
# else:
#     df = pd.concat([df_id, df], axis=1)

spark_df = hql.createDataFrame(df)
try:
    if int(flag) == 0:
        spark_df.write.mode('overwrite').saveAsTable(
            'cust_mining.c99_feature_engine_train_{}'.format(run_date))
        print('write cust_mining.c99_feature_engine_train_' + run_date +
              ' to hive Successfully')
    else:
        spark_df.write.mode('overwrite').saveAsTable(
            'cust_mining.c99_feature_engine_test_{}'.format(run_date))
        print('write cust_mining.c99_feature_engine_test_' + run_date +
              ' to hive Successfully')
except:
    print('write ' + run_date + ' to hive Failed !!!')
           nvl(stock_type, '') as stock_type,
           nvl(current_qty, 0) as current_qty
    from ''' + db_name + '''.agg_cust_stock
    where part_date>=''' + one_year_ago + ''' and part_date<=''' + run_date + '''
''').rdd  # the result set returned by collect() is a list

s_e_date_rdd = stock_hold_cur_detail. \
    map(lambda row_item: (row_item['customer_no'] + ',' + row_item['stock_no'] + ',' + row_item['stock_type'],
                          row_item['init_date'] + ':' + str(row_item['current_qty']))). \
    reduceByKey(lambda x, y: x + ' ' + y). \
    flatMap(lambda i: parse_s_e_date(i))

# A DataFrame can be built from a list, tuple or RDD
s_e_date_df = hqlContext.createDataFrame(s_e_date_rdd, [
    'customer_no', 'stock_no', 'stock_type', 'stock_start_date', 'stock_end_date'])
s_e_date_df.registerTempTable("stock_hold_date_table")  # register all position-holding info as a table
hqlContext.cacheTable('stock_hold_date_table')

# Details of preferred stocks
stock_prefer_detail = hqlContext.sql('''
    select a.customer_no as customer_no,
           a.stock_no as stock_no,
           a.stock_name as stock_name,
           row_number() over (partition by customer_no order by total_exchange_amount) as rank
    from
    (
        select customer_no,
               stock_no,
               stock_name,
               sum(business_balance) as total_exchange_amount
list_2_load_keys = list(
    set(msg_2_days_dict.keys()).difference(set(list_ids_in_db)))[0:3]
if len(list_2_load_keys) == 0:
    print("No Process ids to load")
    exit()

list_2_load_str = "/* ".join(filter(None, list_2_load_keys)) + "/*"
list_2_load_keys_ids = [str(i429).split("/")[-1] for i429 in list_2_load_keys]
print("\n\nThe following processids are loaded " +
      str(list_2_load_keys_ids).replace("[", "").replace("]", ""))

ids_2_load = [[i425, msg_2_days_dict[i425]] for i425 in list_2_load_keys]
# ids_2_load = [i400.split(" ") for i400 in list_ids_2_load]

R = Row('id', 'date_')
ids_2_load_df = sqlContext.createDataFrame(
    [R(i421[0], i421[1]) for i421 in ids_2_load])
ids_2_load_df = ids_2_load_df.withColumn("Process__id", lit(process__id))
ids_2_load_df = ids_2_load_df.withColumn("gdia_load_date", lit(str(today_full)))

# commands.getoutput("hadoop fs -cat "+list_2_load_str+"| perl -p -e 's/{\"entitydata\":/\n{\"entitydata\":/g'| grep -v \"^$\" |hadoop fs -put -f - JSON_INPUT_FSM")
commands.getoutput(
    "hadoop fs -cat " + list_2_load_str +
    "| perl -p -e 's/{\"entitydata\":/\n{\"entitydata\":/g'| grep -v \"^$\" |hadoop fs -put -f - " +
    db_location + db + "/raw/JSON_INPUT")


# https://stackoverflow.com/questions/5508509/how-do-i-check-if-a-string-is-valid-json-in-python
def is_json(myjson, index_value, index_col):
    try:
        json_object = json.loads(myjson)
workday_list = ['five_', 'four_', 'three_', 'two_', 'one_']

### Opening price
stock_concentrate_top2_openprice = pd.pivot_table(
    stock_concentrate_top2_info,
    index=['customer_no', 'op_date'],
    columns=['date_rank'],
    values=['openprice'])
stock_concentrate_top2_openprice.columns = [
    x + 'top2_stock_openprice' for x in workday_list
]
stock_concentrate_top2_openprice = stock_concentrate_top2_openprice.reset_index()
stock_concentrate_top2_openprice = add_statistics(
    stock_concentrate_top2_openprice, 'top2_stock_openprice')
stock_concentrate_top2_openprice = hql.createDataFrame(
    stock_concentrate_top2_openprice)
stock_concentrate_top2_openprice.registerTempTable(
    'stock_concentrate_top2_openprice')

### Closing price
stock_concentrate_top2_closeprice = pd.pivot_table(
    stock_concentrate_top2_info,
    index=['customer_no', 'op_date'],
    columns=['date_rank'],
    values=['closeprice'])
stock_concentrate_top2_closeprice.columns = [
    x + 'top2_stock_closeprice' for x in workday_list
]
stock_concentrate_top2_closeprice = stock_concentrate_top2_closeprice.reset_index()
stock_concentrate_top2_closeprice = add_statistics(
    df.insertInto(tableName, overwrite)


if __name__ == '__main__':
    # log.debug("debug")
    # a = eval("(1,[2,3])")
    # print "xxxxxxx", a[1][0]
    # a = {1: 1.0, 3: 5.5}
    # str_a = str(a)
    # a = eval(str_a)
    # print a[1]
    # print json.loads("""{1:1}""")
    sc = SparkContext("local[1]", appName="bintrade.ml.diff_feature")
    sql_context = HiveContext(sc)
    sql_context.sql("""
        use fex_test
    """)
    sql_context.setConf("spark.sql.shuffle.partitions", "1")

    ldict = [{
        "symbol": "AAA",
        "date": "2010-01-01",
        "close": 1.0
    }, {
        "symbol": "AAA",
        "date": "2010-01-01",
        "close": 1.0
    }]
    df = sql_context.createDataFrame(ldict)
    dfToTableWithPar(sql_context, df, "test_eod_AAA")