def fee_cnt_time(dataObj):
    """Build recharge amount, count and interval features per day window.

    Reads ``dataObj.last_modify_time`` and ``dataObj.df_recharges`` (columns
    ``recharge_time`` and ``amount`` are used) and computes features for the
    trailing 30/90/180-day windows.

    The helpers ``cal_median``, ``cal_max``, ``cal_min`` and ``division`` are
    defined elsewhere in the project; their handling of empty input is not
    visible from this function.

    Args:
        dataObj: object exposing ``last_modify_time`` and ``df_recharges``.

    Returns:
        dict mapping feature name to value.
    """
    prefix = 'yysRecharge'
    result = dict()
    last_modify_time = dataObj.last_modify_time
    df_recharges = dataObj.df_recharges
    window_days = (30, 90, 180)
    amount_cut = [10, 20, 50, 100, 150, 200]

    # Age of every recharge in whole days, computed once for all windows.
    age_days = (last_modify_time - df_recharges['recharge_time']).dt.days

    for days in window_days:
        label = f'{days}d'
        temp = df_recharges[age_days <= days][['recharge_time', 'amount']]
        amounts = temp['amount'].values
        recharge_cnt = temp.shape[0]

        # Total recharge amount in the window.
        result[f'{prefix}_sum_recharge_amout_{label}'] = sum(amounts)
        # Number of recharges in the window.
        result[f'{prefix}_recharge_cnt_sum_{label}'] = recharge_cnt
        # Median single-recharge amount in the window.
        result[f'{prefix}_recharge_amount_median_{label}'] = cal_median(
            amounts)
        # Largest single-recharge amount in the window.
        result[f'{prefix}_recharge_amount_max_{label}'] = cal_max(amounts)
        # Mean recharge amount per recharge.
        result[f'{prefix}_recharge_amount_mean_{label}_fm_cnt'] = division(
            sum(amounts), recharge_cnt)

        # Number of recharges above each cut-off; both the threshold and the
        # label embedded in the key are 10 * cut.
        for cut in amount_cut:
            amount = 10 * cut
            result[f'{prefix}_recharge_amount{amount}_cnt_{label}'] = temp[
                temp.amount > amount].shape[0]

        if label in ('90d', '180d'):
            # Day gaps between chronologically adjacent recharges.
            times = sorted(temp['recharge_time'])
            gaps = [(later - earlier).days
                    for earlier, later in zip(times, times[1:])]
            # Largest gap between adjacent recharges.
            result[f'{prefix}_recharge_timespan_max_{label}'] = cal_max(gaps)
            # Smallest gap between adjacent recharges.
            result[f'{prefix}_recharge_timespan_min_{label}'] = cal_min(gaps)
            # Mean gap between adjacent recharges.
            result[f'{prefix}_recharge_timespan_mean_{label}'] = division(
                sum(gaps), len(gaps))

    # Days between the most recent recharge and the crawl time.  With no
    # recharge records at all there is no "most recent" one; emit '' (the
    # empty marker this file already tests for on cal_std results) instead of
    # raising IndexError.
    recharge_times = sorted(df_recharges['recharge_time'])
    if recharge_times:
        result[f'{prefix}_trade_recent_time_span'] = (
            last_modify_time - recharge_times[-1]).days
    else:
        result[f'{prefix}_trade_recent_time_span'] = ''

    print('recharge fee cnt time count:', len(result))
    return result
def fee_contact_day(dataObj):
    """Build SMS fee and contact-frequency features per day window.

    Reads ``dataObj.last_modify_time`` and ``dataObj.df_smses`` (columns
    ``peer_number``, ``fee`` and ``time`` are used) and computes features for
    the trailing 30/90/180-day windows.

    ``cal_max`` is defined elsewhere in the project.

    Args:
        dataObj: object exposing ``last_modify_time`` and ``df_smses``.

    Returns:
        dict mapping feature name to value.
    """
    prefix = 'yysMsg'
    result = dict()
    last_modify_time = dataObj.last_modify_time
    df_smses = dataObj.df_smses

    # Age of every SMS in whole days, computed once for all windows.
    age_days = (last_modify_time - df_smses['time']).dt.days

    for days in (30, 90, 180):
        label = f'{days}d'
        temp = df_smses[age_days <= days][['peer_number', 'fee', 'time']]

        # Total SMS fee in the window.
        result[f'{prefix}_msg_fee_sum_{label}'] = sum(temp['fee'].values)
        # Largest single SMS fee in the window.
        result[f'{prefix}_msg_fee_max_day_{label}'] = cal_max(
            temp['fee'].values)

        # Number of contacts with more than 2 / 5 / 10 messages.  Counting on
        # the value_counts() Series directly avoids the column names produced
        # by reset_index(), which changed in pandas 2.0 ('index' no longer
        # exists there).
        per_contact = temp['peer_number'].value_counts()
        for threshold in (2, 5, 10):
            result[f'{prefix}_msg_cnt{threshold}_num_{label}'] = int(
                (per_contact > threshold).sum())

        # Number of calendar months whose total SMS fee exceeds 10 / 50.
        # The monthly totals are taken from this window's records; the
        # previous code grouped the whole df_smses here, so every window
        # received the same value.
        monthly = temp.assign(year=temp['time'].dt.year,
                              month=temp['time'].dt.month)
        sum_fee_list = [
            sum(month_rows['fee'].values)
            for _, month_rows in monthly.groupby(['year', 'month'])
        ]
        for threshold in (10, 50):
            result[f'{prefix}_msg_fee{threshold}_cnt_{label}'] = sum(
                1 for month_fee in sum_fee_list if month_fee > threshold)

    print('msg fee day feature count:', len(result))
    return result
def recharge_aver_median(dataObj):
    """Build monthly recharge count/amount averages, maxima and stability.

    The recharge history is cut into six consecutive 30-day periods counted
    back from ``dataObj.last_modify_time``; features are produced for the
    most recent three periods ('90d') and for all six ('180d').

    ``cal_median``, ``cal_max``, ``cal_std`` and ``division`` are defined
    elsewhere in the project.  ``cal_std`` is compared against '' below, so
    it is treated as returning '' when it has no value to report.

    Args:
        dataObj: object exposing ``open_data``, ``last_modify_time`` and
            ``df_recharges`` (columns ``recharge_time`` and ``amount``).

    Returns:
        dict mapping feature name to value.
    """
    prefix = 'yysRecharge'
    result = dict()
    open_date = dataObj.open_data
    last_modify_time = dataObj.last_modify_time
    df_recharge = dataObj.df_recharges

    # Period boundaries: crawl time, then 30, 60, ... 180 days earlier.
    time_limit_list = [last_modify_time] + [
        last_modify_time - datetime.timedelta(days=30 * k)
        for k in (1, 2, 3, 4, 5, 6)
    ]

    # One frame per 30-day period, newest first; each period is
    # (lower, upper].  The timestamp column is 'recharge_time', the column
    # fee_cnt_time reads from the same frame (this function used '.time').
    recharge_time = df_recharge['recharge_time']
    every_month_data = [
        df_recharge[(recharge_time > lower) & (recharge_time <= upper)]
        for upper, lower in zip(time_limit_list, time_limit_list[1:])
    ]

    # Median of the per-period recharge counts over all six periods.
    result[f'{prefix}_rechange_cnt_median'] = cal_median(
        [period.shape[0] for period in every_month_data])

    windows = [('90d', 90, every_month_data[0:3]),
               ('180d', 180, every_month_data)]
    for label, days, periods in windows:
        recharge_cnt_list = [period.shape[0] for period in periods]
        recharge_amount_list = [
            sum(period['amount'].values) for period in periods
        ]

        if last_modify_time - datetime.timedelta(days) >= open_date:
            # The account is at least as old as the window: average over
            # every period in the window.
            month_cnt = len(periods)
        else:
            # Account younger than the window: average over the number of
            # 30-day periods since it was opened.  '.days' is required;
            # a timedelta cannot be added to the integer 1.
            month_cnt = (last_modify_time - open_date).days // 30 + 1

        amount_mean_key = f'{prefix}_recharge_amount_mean_{label}_fm_month'
        cnt_mean_key = f'{prefix}_recharge_cnt_mean_{label}'
        # Mean recharge amount per month.
        result[amount_mean_key] = division(sum(recharge_amount_list),
                                           month_cnt)
        # Mean recharge count per month.
        result[cnt_mean_key] = division(sum(recharge_cnt_list), month_cnt)

        # Largest monthly recharge count.
        result[f'{prefix}_recharge_cnt_max_{label}'] = cal_max(
            recharge_cnt_list)
        # Largest monthly recharge amount.
        result[f'{prefix}_recharge_amount_month_max_{label}'] = cal_max(
            recharge_amount_list)

        # Stability of the monthly count: std / mean.  The mean is read from
        # the key written above (the old lookup used a key that is never
        # set).
        cnt_std = cal_std(recharge_cnt_list)
        result[f'{prefix}_recharge_cnt_month_stab_{label}'] = (
            division(cnt_std, result[cnt_mean_key])
            if cnt_std != '' else '')

        # Stability of the monthly amount: std / mean.  Stored under its own
        # key so it no longer overwrites the count stability.
        amount_std = cal_std(recharge_amount_list)
        result[f'{prefix}_recharge_amount_month_stab_{label}'] = (
            division(amount_std, result[amount_mean_key])
            if amount_std != '' else '')

    print('yysRecharge recharge aver median feature count:', len(result))
    return result
def fee_time(dataObj):
    """Build bill-fee features: overall mean/median plus 3- and 6-month stats.

    Reads ``dataObj.last_modify_time`` and ``dataObj.df_bills`` (a datetime
    ``bill_month`` column plus the fee columns listed in ``fee_columns``).

    ``cal_max`` and ``division`` are defined elsewhere in the project.

    Args:
        dataObj: object exposing ``last_modify_time`` and ``df_bills``.

    Returns:
        dict mapping feature name to value.
    """
    prefix = 'yysBill'
    result = dict()
    last_modify_time = dataObj.last_modify_time
    df_bill = dataObj.df_bills
    fee_columns = ('total_fee', 'actual_fee', 'voice_fee',
                   'extra_service_fee', 'sms_fee', 'extra_fee', 'web_fee')
    # Columns for which a share of the total fee is also reported.
    rate_columns = ('voice_fee', 'extra_service_fee', 'sms_fee', 'extra_fee',
                    'web_fee')

    # Mean of each fee over all bills.
    for column in fee_columns:
        result[f'{prefix}_bill_{column}_month_mean'] = df_bill[column].mean()
    # Median of each fee over all bills.  These get their own '_median'
    # keys; they used to be written to the '_mean' keys and silently
    # replaced the means.
    for column in fee_columns:
        result[f'{prefix}_bill_{column}_month_median'] = (
            df_bill[column].median())

    # Months are compared as a single running index (year * 12 + month) so
    # a window that crosses a year boundary still keeps the months of the
    # current year.
    bill_month_index = (df_bill.bill_month.dt.year * 12
                        + df_bill.bill_month.dt.month)
    current_month_index = (last_modify_time.year * 12
                           + last_modify_time.month)

    # A tuple (not a set) keeps the order of the generated keys stable.
    day_list = ((3, '90d'), (6, '180d'))
    fee_list = [10, 20, 50, 100, 150, 200]
    for months, label in day_list:
        # Bills dated from `months` calendar months back up to now.
        df_temple = df_bill[bill_month_index >= current_month_index - months]
        # Sums use at most `months` rows; iloc[:months] returns the whole
        # frame when it is not longer than that.
        df1 = df_temple.iloc[:months]

        total_fee_sum = df1.total_fee.sum()
        for column in fee_columns:
            column_sum = df1[column].sum()
            # Sum of this fee over the window.
            result[f'{prefix}_bill_{column}_sum_{label}'] = column_sum
            if column in rate_columns:
                # Share of this fee in the total fee.
                result[f'{prefix}_bill_{column}_sum_rate_{label}'] = (
                    division(column_sum, total_fee_sum))

        # Monthly maximum of each fee (taken over the untrimmed window).
        for column in fee_columns:
            result[f'{prefix}_bill_{column}_month_max_{label}'] = cal_max(
                df_temple[column].values)

        # Number of months whose total / web fee exceeds each threshold,
        # capped at the window length.  The comparison uses fee * 100
        # (presumably amounts are stored in cents - TODO confirm).
        for fee in fee_list:
            total_hits = int((df_temple.total_fee > fee * 100).sum())
            web_hits = int((df_temple.web_fee > fee * 100).sum())
            result[f'{prefix}_bill_totalfee{fee}_cnt_{label}'] = min(
                total_hits, months)
            result[f'{prefix}_bill_webfee{fee}_cnt_{label}'] = min(
                web_hits, months)

    print('fee time feature count:', len(result))
    return result
def duration_day(dataObj):
    """Build call count, duration and contact features per day window.

    Windows are 3/7/15/30/90/180 days.  The 3- and 15-day frames are filtered
    here from ``dataObj.df_callRecord``; the 7/30/90-day frames are read from
    ``dataObj.df_callRecord_7day`` / ``_1m`` / ``_3m`` and the full
    ``df_callRecord`` is used as the 180-day frame.  Columns used:
    ``peer_number``, ``dial_type`` ('DIAL' / 'DIALED'), ``duration``, ``time``.

    ``cal_max``, ``cal_median`` and ``division`` are defined elsewhere in the
    project.

    Args:
        dataObj: object exposing ``last_modify_time`` and the call frames.

    Returns:
        dict mapping feature name to value.
    """
    prefix = 'yysCall'
    result = dict()
    last_modify_time = dataObj.last_modify_time
    # Call-duration buckets in seconds, open on the left and closed on the
    # right; 'up' means no upper bound.
    duration_list = [(0, 30), (30, 60), (60, 180), (180, 300), (300, 600),
                     (600, 'up')]
    columns = ['peer_number', 'dial_type', 'duration']

    df_all = dataObj.df_callRecord
    age_days = (last_modify_time - df_all['time']).dt.days
    day_list = [
        ('3d', df_all[age_days <= 3]),
        ('7d', dataObj.df_callRecord_7day),
        ('15d', df_all[age_days <= 15]),
        ('30d', dataObj.df_callRecord_1m),
        ('90d', dataObj.df_callRecord_3m),
        ('180d', df_all),
    ]

    for label, temp in day_list:
        is_dial = temp.dial_type == 'DIAL'
        is_dialed = temp.dial_type == 'DIALED'
        dial_duration = temp['duration'][is_dial]
        dialed_duration = temp['duration'][is_dialed]
        # Contacts the user called / was called by, and those with both.
        dial_peers = set(temp['peer_number'][is_dial].values)
        dialed_peers = set(temp['peer_number'][is_dialed].values)
        mutual_peer_cnt = len(dial_peers & dialed_peers)
        peer_cnt = len(set(temp['peer_number'].values))
        call_cnt = temp.shape[0]

        # Number of contacts with calls in both directions.
        result[f'{prefix}_dial_dialed_contacter_cnt_{label}'] = (
            mutual_peer_cnt)
        # Number of distinct contacts.
        result[f'{prefix}_contacter_cnt_{label}'] = peer_cnt
        # Number of calls.
        result[f'{prefix}_call_cnt_{label}'] = call_cnt
        # Number of outgoing calls.
        result[f'{prefix}_dial_cnt_{label}'] = dial_duration.shape[0]
        # Number of incoming calls.
        result[f'{prefix}_dialed_cnt_{label}'] = dialed_duration.shape[0]
        # Total call duration.
        result[f'{prefix}_call_time_{label}'] = sum(temp['duration'].values)
        # Total outgoing duration.
        result[f'{prefix}_dial_time_{label}'] = sum(dial_duration.values)
        # Total incoming duration.
        result[f'{prefix}_dialed_time_{label}'] = sum(dialed_duration.values)
        # Median incoming duration.
        result[f'{prefix}_dialed_time_median_{label}'] = cal_median(
            dialed_duration.values.tolist())
        # Median outgoing duration.
        result[f'{prefix}_dial_time_median_{label}'] = cal_median(
            dial_duration.values.tolist())
        # Longest call.
        result[f'{prefix}_call_max_time_{label}'] = cal_max(
            temp['duration'].values)
        # Longest outgoing call.
        result[f'{prefix}_dial_max_time_{label}'] = cal_max(
            dial_duration.values)
        # Longest incoming call (this used to filter on 'DIAL' and so
        # duplicated the outgoing maximum).
        result[f'{prefix}_dialed_max_time_{label}'] = cal_max(
            dialed_duration.values)

        for low, high in duration_list:
            bucket = f'{low}_{high}'
            # Calls whose duration falls in (low, high].
            in_bucket = temp.duration > low
            if high != 'up':
                in_bucket = in_bucket & (temp.duration <= high)
            match_df = temp.loc[in_bucket, columns]
            match_dial = match_df[match_df.dial_type == 'DIAL']
            match_dialed = match_df[match_df.dial_type == 'DIALED']
            calling = match_dial['peer_number'].values.tolist()
            called = match_dialed['peer_number'].values.tolist()
            called_set = set(called)

            # Total duration of the calls in the bucket.
            result[f'{prefix}_duration_{bucket}_sum_{label}'] = sum(
                match_df['duration'].values.tolist())
            # Total duration of the outgoing calls in the bucket.
            result[f'{prefix}_dial_duration_{bucket}_sum_{label}'] = sum(
                match_dial['duration'].values)
            # Total duration of the incoming calls in the bucket.
            result[f'{prefix}_dialed_duration_{bucket}_sum_{label}'] = sum(
                match_dialed['duration'].values)
            # Distinct contacts with a call in the bucket.
            bucket_peer_cnt = len(set(match_df['peer_number'].values))
            result[f'{prefix}_contacter_{bucket}_cnt_{label}'] = (
                bucket_peer_cnt)
            # Contacts with calls in both directions inside the bucket.
            bucket_mutual_cnt = len(set(calling) & called_set)
            result[f'{prefix}_dial_dialed_contacter_{bucket}_cnt_{label}'] = (
                bucket_mutual_cnt)
            # Share of the window's contacts that appear in the bucket.
            result[f'{prefix}_contacter_{bucket}_rate_{label}'] = division(
                bucket_peer_cnt, peer_cnt)
            # Share of the window's two-way contacts that are two-way
            # inside the bucket.
            result[
                f'{prefix}_dial_dialed_contacter_{bucket}_rate_{label}'] = (
                    division(bucket_mutual_cnt, mutual_peer_cnt))
            # Number of calls in the bucket.
            result[f'{prefix}_call_{bucket}_cnt_{label}'] = match_df.shape[0]
            # Share of the window's calls that fall in the bucket.
            result[f'{prefix}_call_{bucket}_rate_{label}'] = division(
                match_df.shape[0], call_cnt)
            # Outgoing calls in the bucket.
            result[f'{prefix}_caller_{bucket}_cnt_{label}'] = (
                match_dial.shape[0])
            # Incoming calls in the bucket.
            result[f'{prefix}_called_{bucket}_cnt_{label}'] = (
                match_dialed.shape[0])
            # Outgoing calls in the bucket whose contact also has an
            # incoming call in the bucket (set lookup instead of scanning
            # the list for every call).
            result[f'{prefix}_calls_{bucket}_cnt_{label}'] = sum(
                1 for number in calling if number in called_set)

    print('duration day feature count:', len(result))
    return result
def msg_aver_median(dataObj):
    """Build monthly SMS count averages, fee maximum and stability features.

    The SMS history is cut into six consecutive 30-day periods counted back
    from ``dataObj.last_modify_time``; features are produced for the most
    recent three periods ('90d') and for all six ('180d').

    ``cal_median``, ``cal_max``, ``cal_std`` and ``division`` are defined
    elsewhere in the project.  ``cal_std`` is compared against '' below, so
    it is treated as returning '' when it has no value to report.

    Args:
        dataObj: object exposing ``open_data`` (account opening time),
            ``last_modify_time`` (crawl time) and ``df_smses`` (columns
            ``time``, ``send_type`` and ``fee`` are used).

    Returns:
        dict mapping feature name to value.
    """
    prefix = 'yysMsg'
    result = dict()
    # Account opening time.
    open_date = dataObj.open_data
    # Time the carrier data was crawled.
    last_modify_time = dataObj.last_modify_time
    df_smses = dataObj.df_smses

    # Period boundaries: crawl time, then 30, 60, ... 180 days earlier.
    time_limit_list = [last_modify_time] + [
        last_modify_time - datetime.timedelta(days=30 * k)
        for k in (1, 2, 3, 4, 5, 6)
    ]

    # One frame per 30-day period, newest first; each period is
    # (lower, upper].
    every_month_data = [
        df_smses[(df_smses.time > lower) & (df_smses.time <= upper)]
        for upper, lower in zip(time_limit_list, time_limit_list[1:])
    ]

    # Median of the per-period SMS counts over all six periods.
    result['median_sms_cnt'] = cal_median(
        [period.shape[0] for period in every_month_data])

    windows = [('90d', 90, every_month_data[0:3]),
               ('180d', 180, every_month_data)]
    for label, days, periods in windows:
        msg_cnt_list = [period.shape[0] for period in periods]
        msg_send_list = [
            period[period.send_type == 'SEND'].shape[0] for period in periods
        ]
        msg_receive_list = [
            period[period.send_type == 'RECEIVE'].shape[0]
            for period in periods
        ]
        msg_fee_list = [sum(period['fee'].values) for period in periods]

        if last_modify_time - datetime.timedelta(days) >= open_date:
            # The account is at least as old as the window: average over
            # every period in the window.
            month_cnt = len(periods)
        else:
            # Account younger than the window: average over the number of
            # 30-day periods since it was opened.  '.days' is required;
            # a timedelta cannot be added to the integer 1.
            month_cnt = (last_modify_time - open_date).days // 30 + 1

        send_mean_key = f'{prefix}_msg_send_cnt_mean_{label}'
        receive_mean_key = f'{prefix}_msg_receive_cnt_mean_{label}'
        # Mean SMS count per month.
        result[f'{prefix}_msg_cnt_mean_{label}'] = division(
            sum(msg_cnt_list), month_cnt)
        # Mean sent count per month.
        result[send_mean_key] = division(sum(msg_send_list), month_cnt)
        # Mean received count per month.
        result[receive_mean_key] = division(sum(msg_receive_list), month_cnt)

        # Largest monthly SMS fee.
        result[f'{prefix}_msg_fee_max_month_{label}'] = cal_max(msg_fee_list)

        # Stability of the monthly sent count: std / mean.
        send_std = cal_std(msg_send_list)
        result[f'{prefix}_msg_send_cnt_month_stab_{label}'] = (
            division(send_std, result[send_mean_key])
            if send_std != '' else '')

        # Stability of the monthly received count: std / mean.  The
        # numerator is the std of the received counts (it used to be the std
        # of the sent counts).
        receive_std = cal_std(msg_receive_list)
        result[f'{prefix}_msg_receive_cnt_month_stab_{label}'] = (
            division(receive_std, result[receive_mean_key])
            if receive_std != '' else '')

    return result