def fee_cnt_time(dataObj):
    """Build recharge amount, count and interval features over trailing windows.

    Reads ``dataObj.last_modify_time`` and ``dataObj.df_recharges`` (columns
    ``recharge_time`` and ``amount``) and produces ``yysRecharge_*`` features
    for the trailing 30, 90 and 180 days, plus the number of days between the
    most recent recharge and ``last_modify_time``.

    Args:
        dataObj: object exposing ``last_modify_time`` and ``df_recharges``.

    Returns:
        dict mapping feature name to value. Statistics come from the
        ``cal_median`` / ``cal_max`` / ``cal_min`` / ``division`` helpers
        defined elsewhere in this file.
    """
    prefix = 'yysRecharge'
    result = dict()
    last_modify_time = dataObj.last_modify_time
    df_recharges = dataObj.df_recharges
    amount_cut = [10, 20, 50, 100, 150, 200]

    # Age of every recharge in whole days; shared by all three windows.
    days_since_recharge = (
        last_modify_time - df_recharges['recharge_time']).dt.days
    day_list = [
        (f'{n_days}d', df_recharges[days_since_recharge <= n_days])
        for n_days in (30, 90, 180)
    ]

    for label, window_df in day_list:
        temp = window_df[['recharge_time', 'amount']]
        amounts = temp['amount'].values
        recharge_cnt = temp.shape[0]
        total_amount = sum(amounts)

        # Total recharge amount in the window.
        result[f'{prefix}_sum_recharge_amout_{label}'] = total_amount
        # Number of recharges in the window.
        result[f'{prefix}_recharge_cnt_sum_{label}'] = recharge_cnt
        # Median amount of a single recharge.
        result[f'{prefix}_recharge_amount_median_{label}'] = cal_median(amounts)
        # Largest single recharge.
        result[f'{prefix}_recharge_amount_max_{label}'] = cal_max(amounts)
        # Mean amount per recharge (denominator is the recharge count).
        result[f'{prefix}_recharge_amount_mean_{label}_fm_cnt'] = division(
            total_amount, recharge_cnt)

        # Number of recharges strictly above each threshold; the threshold
        # (10 * cut) is also what appears in the feature name.
        for ac in amount_cut:
            amount = 10 * ac
            result[f'{prefix}_recharge_amount{amount}_cnt_{label}'] = temp[
                temp.amount > amount].shape[0]

        # Gaps between consecutive recharges are only produced for 90d/180d.
        if label in ('90d', '180d'):
            ordered_times = sorted(temp['recharge_time'])
            gaps = [
                (later - earlier).days
                for earlier, later in zip(ordered_times, ordered_times[1:])
            ]
            # Longest gap (days) between two adjacent recharges.
            result[f'{prefix}_recharge_timespan_max_{label}'] = cal_max(gaps)
            # Shortest gap (days) between two adjacent recharges.
            result[f'{prefix}_recharge_timespan_min_{label}'] = cal_min(gaps)
            # Mean gap (days) between two adjacent recharges.
            result[f'{prefix}_recharge_timespan_mean_{label}'] = division(
                sum(gaps), len(gaps))

    # Days between the most recent recharge and the last-modified time.
    recharge_time = sorted(df_recharges['recharge_time'])
    if recharge_time:
        result[f'{prefix}_trade_recent_time_span'] = (
            last_modify_time - recharge_time[-1]).days
    else:
        # No recharges at all: indexing the empty list used to raise
        # IndexError. '' is the missing-value marker other feature builders
        # in this file already store in their result dicts.
        result[f'{prefix}_trade_recent_time_span'] = ''

    print('recharge fee cnt time count:', len(result))
    return result
def fee_day(dataObj):
    """Count calls and distinct contacts per call-fee bucket.

    For each of the 30d / 90d / 180d call-record frames on ``dataObj`` the
    calls are split into left-open, right-closed fee buckets, and for every
    bucket the call count, its share of all calls in the window, and the
    number of distinct ``peer_number`` values are recorded.

    The 3d / 7d / 15d windows were commented out of the window list in the
    original code, so the filtered frames that used to be built for them
    (and never read) are no longer computed.

    Args:
        dataObj: object exposing ``df_callRecord_1m``, ``df_callRecord_3m``
            and ``df_callRecord``; each frame needs ``peer_number`` and
            ``fee`` columns.

    Returns:
        dict mapping ``yysCall_*`` feature names to values.
    """
    prefix = 'yysCall'
    result = dict()
    day_list = [
        ('30d', dataObj.df_callRecord_1m),
        ('90d', dataObj.df_callRecord_3m),
        ('180d', dataObj.df_callRecord),
    ]
    # Fee buckets (low, high]; 'up' marks an open-ended top bucket. The
    # original author's note says the unit is fen (1/100 yuan).
    fee_range = [(0, 20), (20, 50), (50, 100), (100, 500), (500, 'up')]

    for label, window_df in day_list:
        temp = window_df[['peer_number', 'fee']]
        for low, high in fee_range:
            if high == 'up':
                match_df = temp[temp.fee > low]
            else:
                match_df = temp[(temp.fee > low) & (temp.fee <= high)]
            # Number of calls whose fee falls in (low, high].
            result[f'{prefix}_call_fee_{low}_{high}_cnt_{label}'] = match_df.shape[0]
            # Share of the window's calls whose fee falls in (low, high].
            result[f'{prefix}_call_fee_{low}_{high}_rate_{label}'] = division(
                match_df.shape[0], temp.shape[0])
            # Number of distinct contacts with a call in (low, high].
            result[f'{prefix}_contacter_call_fee_{low}_{high}_cnt_{label}'] = len(
                set(match_df['peer_number'].values))

    print('fee day feature count:', len(result))
    return result
def contact_tag(dataObj):
    """Build call features for the emergency numbers 110, 120 and 119.

    For each trailing window (3d, 7d, 15d, 30d, 90d, 180d) and each of the
    three numbers, records call counts (total / 'DIAL' / 'DIALED'), call
    durations (total / 'DIAL' / 'DIALED'), and the number's share of the
    window's call count and call duration.

    Args:
        dataObj: object exposing ``last_modify_time``, ``df_callRecord``,
            ``df_callRecord_7day``, ``df_callRecord_1m`` and
            ``df_callRecord_3m``; frames need ``time``, ``peer_number``,
            ``duration`` and ``dial_type`` columns.

    Returns:
        dict mapping ``yysCall_*`` feature names to values.
    """
    prefix = 'yysCall'
    result = dict()
    last_modify_time = dataObj.last_modify_time

    # The 3-day and 15-day windows are not precomputed on dataObj, so they
    # are derived here from the age (in whole days) of every call record.
    record_age_days = (last_modify_time - dataObj.df_callRecord['time']).dt.days
    recent_3d = dataObj.df_callRecord[record_age_days <= 3]
    recent_15d = dataObj.df_callRecord[record_age_days <= 15]

    windows = (
        ('3d', recent_3d),
        ('7d', dataObj.df_callRecord_7day),
        ('15d', recent_15d),
        ('30d', dataObj.df_callRecord_1m),
        ('90d', dataObj.df_callRecord_3m),
        ('180d', dataObj.df_callRecord),
    )

    for label, frame in windows:
        calls = frame[['peer_number', 'duration', 'dial_type']]
        window_call_cnt = calls.shape[0]
        window_duration = sum(calls['duration'].values.tolist())

        for number in ('110', '120', '119'):
            # All calls in the window whose peer is this number.
            with_number = calls[calls.peer_number == number]
            is_dial = with_number.dial_type == 'DIAL'
            is_dialed = with_number.dial_type == 'DIALED'
            number_call_cnt = with_number.shape[0]

            # Call count with this number.
            result[f'{prefix}_call_{number}_cnt_{label}'] = number_call_cnt
            # Outgoing ('DIAL') call count.
            result[f'{prefix}_call_dial_{number}_cnt_{label}'] = with_number[is_dial].shape[0]
            # Incoming ('DIALED') call count.
            result[f'{prefix}_call_dialed_{number}_cnt_{label}'] = with_number[is_dialed].shape[0]
            # Share of the window's calls made with this number.
            result[f'{prefix}_call_{number}_rate_{label}'] = division(
                number_call_cnt, window_call_cnt)
            # Outgoing call duration.
            result[f'{prefix}_call_dial_{number}_time_{label}'] = sum(
                with_number['duration'][is_dial].values)
            # Incoming call duration.
            result[f'{prefix}_call_dialed_{number}_time_{label}'] = sum(
                with_number['duration'][is_dialed].values)

            number_duration = sum(with_number['duration'].values.tolist())
            # Total call duration with this number.
            result[f'{prefix}_call_{number}_time_{label}'] = number_duration
            # Share of the window's call duration spent with this number.
            result[f'{prefix}_call_{number}_time_rate_{label}'] = division(
                number_duration, window_duration)

    print('contact tag feature count:', len(result))
    return result
def time_interval_day(dataObj):
    """Build call features per time-of-day interval and trailing window.

    Every call is assigned to the time-of-day interval that contains its
    start time, using the half-open range [start, end) measured in minutes
    since midnight. For each trailing window (3d .. 180d) and each interval
    the call count, its share of the window's calls, the total duration and
    the number of distinct contacts are recorded.

    The previous implementation compared the hour and the minute columns
    independently (e.g. ``hour >= 5 & minute >= 30 & hour <= 9 & minute < 0``),
    which made some intervals always empty and dropped calls such as 09:45
    from 'morning'. Comparing minutes-since-midnight fixes that; intervals
    whose end is earlier than their start are treated as wrapping past
    midnight.

    Args:
        dataObj: object exposing ``last_modify_time``, ``df_callRecord``,
            ``df_callRecord_7day``, ``df_callRecord_1m`` and
            ``df_callRecord_3m``; frames need ``time``, ``peer_number``,
            ``duration`` and ``dial_type`` columns.

    Returns:
        dict mapping ``yysCall_*`` feature names to values.
    """
    prefix = 'yysCall'
    result = dict()
    # (name, start_hour, start_minute, end_hour, end_minute)
    time_interval_list = [
        ('early_morning', 5, 30, 9, 0),
        ('morning', 9, 0, 11, 30),
        ('nooning', 11, 30, 13, 30),
        ('afternoon', 13, 30, 17, 30),
        ('toward_evening', 17, 30, 19, 30),
        ('evening', 19, 30, 23, 30),
        ('small_hour', 23, 30, 1, 30),
        ('midnight', 1, 30, 5, 30),
    ]
    last_modify_time = dataObj.last_modify_time
    record_age_days = (last_modify_time - dataObj.df_callRecord['time']).dt.days
    df_callRecord_3day = dataObj.df_callRecord[record_age_days <= 3]
    df_callRecord_15day = dataObj.df_callRecord[record_age_days <= 15]
    day_list = [
        ('3d', df_callRecord_3day),
        ('7d', dataObj.df_callRecord_7day),
        ('15d', df_callRecord_15day),
        ('30d', dataObj.df_callRecord_1m),
        ('90d', dataObj.df_callRecord_3m),
        ('180d', dataObj.df_callRecord),
    ]

    for label, frame in day_list:
        # Calls inside this trailing window.
        temp = frame[['time', 'peer_number', 'duration', 'dial_type']]
        minute_of_day = temp['time'].dt.hour * 60 + temp['time'].dt.minute

        for name, start_hour, start_minute, end_hour, end_minute in time_interval_list:
            start = start_hour * 60 + start_minute
            end = end_hour * 60 + end_minute
            if start <= end:
                in_interval = (minute_of_day >= start) & (minute_of_day < end)
            else:
                # Interval wraps past midnight (e.g. 23:30 -> 01:30).
                in_interval = (minute_of_day >= start) | (minute_of_day < end)
            match_df = temp[in_interval]

            # Number of calls that start inside the interval.
            result[f'{prefix}_call_{name}_cnt_{label}'] = match_df.shape[0]
            # Share of the window's calls that start inside the interval.
            result[f'{prefix}_call_{name}_rate_{label}'] = division(
                match_df.shape[0], temp.shape[0])
            # Total duration of the calls that start inside the interval.
            result[f'{prefix}_call_{name}_time_{label}'] = sum(
                match_df['duration'].values.tolist())
            # Number of distinct contacts called inside the interval.
            result[f'{prefix}_contacter_{name}_cnt_{label}'] = len(
                set(match_df['peer_number'].values))

    print('time interval day feature count:', len(result))
    return result
def recharge_aver_median(dataObj):
    """Build per-month recharge statistics over the last 90 and 180 days.

    The 180 days before ``last_modify_time`` are cut into six consecutive
    30-day slices (newest first). From the per-slice recharge counts and
    amounts the function derives the median monthly count and, for the 90d
    and 180d horizons, monthly means, maxima and a stability ratio
    (std / mean).

    When the account (``dataObj.open_data``) is younger than the horizon, the
    means are divided by the number of 30-day months since opening instead
    of by the number of slices.

    Fixes relative to the previous version:
      * the month count was computed as ``timedelta // 30 + 1``, which raises
        TypeError; it now uses ``.days``;
      * the count-stability feature looked up a key that is never written
        (``..._recharge_cnt_mean_{w}_fm_month``), raising KeyError;
      * the amount-stability value overwrote the count-stability key; it now
        has its own ``..._recharge_amount_month_stab_{w}`` key;
      * rows are filtered on ``recharge_time``, the column ``fee_cnt_time``
        reads from this same frame, instead of ``time``.
        NOTE(review): confirm the recharge frame has no separate ``time``
        column that was intended here.

    Args:
        dataObj: object exposing ``open_data``, ``last_modify_time`` and
            ``df_recharges`` (columns ``recharge_time`` and ``amount``).

    Returns:
        dict mapping ``yysRecharge_*`` feature names to values; stability
        features are ``''`` when ``cal_std`` returns ``''``.
    """
    prefix = 'yysRecharge'
    result = dict()
    open_date = dataObj.open_data
    last_modify_time = dataObj.last_modify_time
    df_recharge = dataObj.df_recharges

    # Slice boundaries: now, now-30d, ..., now-180d.
    time_limit_list = [
        last_modify_time - datetime.timedelta(days=30 * i) for i in range(7)
    ]
    recharge_time = df_recharge['recharge_time']
    # Slice k holds recharges in (now-30(k+1)d, now-30kd].
    every_month_data = [
        df_recharge[(recharge_time > older) & (recharge_time <= newer)]
        for newer, older in zip(time_limit_list, time_limit_list[1:])
    ]

    # Median of the six monthly recharge counts.
    result[f'{prefix}_rechange_cnt_median'] = cal_median(
        [month_df.shape[0] for month_df in every_month_data])

    horizons = [('90d', 90, every_month_data[0:3]),
                ('180d', 180, every_month_data)]
    for label, horizon_days, month_frames in horizons:
        recharge_cnt_list = [m.shape[0] for m in month_frames]
        recharge_amount_list = [sum(m['amount'].values) for m in month_frames]

        if last_modify_time - datetime.timedelta(horizon_days) >= open_date:
            # Account is at least as old as the horizon: average per slice.
            month_cnt = len(month_frames)
        else:
            # Younger account: average over the months it has existed.
            month_cnt = (last_modify_time - open_date).days // 30 + 1

        amount_mean_key = f'{prefix}_recharge_amount_mean_{label}_fm_month'
        cnt_mean_key = f'{prefix}_recharge_cnt_mean_{label}'
        # Mean monthly recharge amount.
        result[amount_mean_key] = division(sum(recharge_amount_list), month_cnt)
        # Mean monthly recharge count.
        result[cnt_mean_key] = division(sum(recharge_cnt_list), month_cnt)
        # Largest monthly recharge count.
        result[f'{prefix}_recharge_cnt_max_{label}'] = cal_max(recharge_cnt_list)
        # Largest monthly recharge amount.
        result[f'{prefix}_recharge_amount_month_max_{label}'] = cal_max(
            recharge_amount_list)

        # Stability of the monthly recharge count: std / mean.
        cnt_std = cal_std(recharge_cnt_list)
        result[f'{prefix}_recharge_cnt_month_stab_{label}'] = (
            division(cnt_std, result[cnt_mean_key]) if cnt_std != '' else '')
        # Stability of the monthly recharge amount: std / mean.
        amount_std = cal_std(recharge_amount_list)
        result[f'{prefix}_recharge_amount_month_stab_{label}'] = (
            division(amount_std, result[amount_mean_key])
            if amount_std != '' else '')

    print('yysRecharge recharge aver median feature count:', len(result))
    return result
def fee_time(dataObj):
    """Build monthly-bill fee features (means, medians, sums, maxima, counts).

    Reads ``dataObj.df_bills`` (columns ``bill_month``, ``total_fee``,
    ``actual_fee``, ``voice_fee``, ``extra_service_fee``, ``sms_fee``,
    ``extra_fee``, ``web_fee``) and ``dataObj.last_modify_time``.

    Fixes relative to the previous version:
      * medians were written to the ``*_month_mean`` keys, overwriting the
        means; they now go to ``*_month_median`` and the mean keys keep the
        mean. NOTE(review): downstream consumers that relied on
        ``*_month_mean`` holding the median must be updated;
      * the "last N months" filter compared year and month independently
        (``year >= y & month >= m``), which drops months after a year
        rollover; it now compares a single year*12+month index;
      * the truncated branch called ``web_fee.sum(<sum>, <sum>)`` instead of
        storing the web-fee sum and its ratio; both branches are now one
        code path because ``iloc[:n]`` already returns the whole frame when
        it has at most ``n`` rows;
      * windows are iterated from a tuple rather than a set so the key order
        of the result is deterministic.

    Args:
        dataObj: object exposing ``last_modify_time`` and ``df_bills``.

    Returns:
        dict mapping ``yysBill_*`` feature names to values.
    """
    prefix = 'yysBill'
    result = dict()
    last_modify_time = dataObj.last_modify_time
    df_bill = dataObj.df_bills
    fee_columns = ('total_fee', 'actual_fee', 'voice_fee', 'extra_service_fee',
                   'sms_fee', 'extra_fee', 'web_fee')
    # Columns reported both as a sum and as a ratio to total_fee.
    ratio_columns = ('voice_fee', 'extra_service_fee', 'sms_fee', 'extra_fee',
                     'web_fee')

    # Mean of each fee column over all bills in the frame.
    for column in fee_columns:
        result[f'{prefix}_bill_{column}_month_mean'] = df_bill[column].mean()
    # Median of each fee column over all bills in the frame.
    for column in fee_columns:
        result[f'{prefix}_bill_{column}_month_median'] = df_bill[column].median()

    # year*12+month is monotonic in calendar order, so "at most N months
    # before last_modify_time" becomes a single integer comparison.
    bill_month_index = (df_bill.bill_month.dt.year * 12
                        + df_bill.bill_month.dt.month)
    current_month_index = last_modify_time.year * 12 + last_modify_time.month

    day_list = ((3, '90d'), (6, '180d'))
    fee_list = [10, 20, 50, 100, 150, 200]
    for months, label in day_list:
        df_temple = df_bill[bill_month_index >= current_month_index - months]

        # Fee sums use at most the first `months` rows of the window.
        df1 = df_temple.iloc[:months]
        total_fee_sum = df1.total_fee.sum()
        # Total fee.
        result[f'{prefix}_bill_total_fee_sum_{label}'] = total_fee_sum
        # Actual fee.
        result[f'{prefix}_bill_actual_fee_sum_{label}'] = df1.actual_fee.sum()
        # Per-category fee and its share of the total fee.
        for column in ratio_columns:
            column_sum = df1[column].sum()
            result[f'{prefix}_bill_{column}_sum_{label}'] = column_sum
            result[f'{prefix}_bill_{column}_sum_rate_{label}'] = division(
                column_sum, total_fee_sum)

        # Monthly maxima; as before these use the untruncated window.
        for column in fee_columns:
            result[f'{prefix}_bill_{column}_month_max_{label}'] = cal_max(
                df_temple[column].values)

        # Number of months whose total / web fee exceeds fee * 100, capped
        # at the window length in months.
        for fee in fee_list:
            threshold = fee * 100
            over_total = df_temple[df_temple.total_fee > threshold].shape[0]
            over_web = df_temple[df_temple.web_fee > threshold].shape[0]
            result[f'{prefix}_bill_totalfee{fee}_cnt_{label}'] = min(
                over_total, months)
            result[f'{prefix}_bill_webfee{fee}_cnt_{label}'] = min(
                over_web, months)

    print('fee time feature count:', len(result))
    return result
def duration_day(dataObj):
    """Build call count, duration and contact features per trailing window.

    For each trailing window (3d .. 180d) the function records overall
    counts, duration sums / medians / maxima split by direction ('DIAL' is
    outgoing, 'DIALED' is incoming), and then repeats counts, sums and
    contact statistics for each call-duration bucket. Buckets are left-open,
    right-closed; ``'up'`` marks the open-ended top bucket.

    Fixes relative to the previous version:
      * ``dialed_max_time`` filtered on 'DIAL' and therefore duplicated
        ``dial_max_time``; it now filters on 'DIALED';
      * the mutual-call count tested membership against a list inside a
        loop (quadratic); it now uses a set;
      * per-window values that do not depend on the bucket are computed once
        per window instead of once per bucket.

    Args:
        dataObj: object exposing ``last_modify_time``, ``df_callRecord``,
            ``df_callRecord_7day``, ``df_callRecord_1m`` and
            ``df_callRecord_3m``; frames need ``time``, ``peer_number``,
            ``duration`` and ``dial_type`` columns.

    Returns:
        dict mapping ``yysCall_*`` feature names to values.
    """
    prefix = 'yysCall'
    result = dict()
    last_modify_time = dataObj.last_modify_time
    # Call-duration buckets (low, high]; the original note gives seconds.
    duration_list = [(0, 30), (30, 60), (60, 180), (180, 300), (300, 600),
                     (600, 'up')]
    record_age_days = (last_modify_time - dataObj.df_callRecord['time']).dt.days
    df_callRecord_3day = dataObj.df_callRecord[record_age_days <= 3]
    df_callRecord_15day = dataObj.df_callRecord[record_age_days <= 15]
    day_list = [
        ('3d', df_callRecord_3day),
        ('7d', dataObj.df_callRecord_7day),
        ('15d', df_callRecord_15day),
        ('30d', dataObj.df_callRecord_1m),
        ('90d', dataObj.df_callRecord_3m),
        ('180d', dataObj.df_callRecord),
    ]

    for label, temp in day_list:
        # temp holds the call records of this trailing window.
        is_dial = temp.dial_type == 'DIAL'
        is_dialed = temp.dial_type == 'DIALED'
        dial_durations = temp['duration'][is_dial]
        dialed_durations = temp['duration'][is_dialed]
        dial_peers = temp['peer_number'][is_dial]
        dialed_peers = temp['peer_number'][is_dialed]
        # Contacts with at least one outgoing and one incoming call.
        mutual_contacts = set(dial_peers.values) & set(dialed_peers.values)
        contact_cnt = len(set(temp['peer_number']))

        # Number of mutual contacts.
        result[f'{prefix}_dial_dialed_contacter_cnt_{label}'] = len(mutual_contacts)
        # Number of distinct contacts.
        result[f'{prefix}_contacter_cnt_{label}'] = contact_cnt
        # Number of calls.
        result[f'{prefix}_call_cnt_{label}'] = temp.shape[0]
        # Number of outgoing calls.
        result[f'{prefix}_dial_cnt_{label}'] = dial_peers.shape[0]
        # Number of incoming calls.
        result[f'{prefix}_dialed_cnt_{label}'] = dialed_peers.shape[0]
        # Total call duration.
        result[f'{prefix}_call_time_{label}'] = sum(temp['duration'].values)
        # Total outgoing call duration.
        result[f'{prefix}_dial_time_{label}'] = sum(dial_durations.values)
        # Total incoming call duration.
        result[f'{prefix}_dialed_time_{label}'] = sum(dialed_durations.values)
        # Median incoming call duration.
        result[f'{prefix}_dialed_time_median_{label}'] = cal_median(
            dialed_durations.values.tolist())
        # Median outgoing call duration.
        result[f'{prefix}_dial_time_median_{label}'] = cal_median(
            dial_durations.values.tolist())
        # Longest call.
        result[f'{prefix}_call_max_time_{label}'] = cal_max(temp['duration'].values)
        # Longest outgoing call.
        result[f'{prefix}_dial_max_time_{label}'] = cal_max(dial_durations.values)
        # Longest incoming call.
        result[f'{prefix}_dialed_max_time_{label}'] = cal_max(dialed_durations.values)

        for low, high in duration_list:
            # match_df: calls of this window whose duration is in (low, high].
            in_bucket = temp.duration > low
            if high != 'up':
                in_bucket = in_bucket & (temp.duration <= high)
            match_df = temp[['peer_number', 'dial_type', 'duration']][in_bucket]
            bucket_dial = match_df.dial_type == 'DIAL'
            bucket_dialed = match_df.dial_type == 'DIALED'
            calling = match_df['peer_number'][bucket_dial].values.tolist()
            called = match_df['peer_number'][bucket_dialed].values.tolist()
            called_set = set(called)
            bucket = f'{low}_{high}'

            # Total duration of the calls in the bucket.
            result[f'{prefix}_duration_{bucket}_sum_{label}'] = sum(
                match_df['duration'].values.tolist())
            # Total duration of the outgoing calls in the bucket.
            result[f'{prefix}_dial_duration_{bucket}_sum_{label}'] = sum(
                match_df['duration'][bucket_dial].values)
            # Total duration of the incoming calls in the bucket.
            result[f'{prefix}_dialed_duration_{bucket}_sum_{label}'] = sum(
                match_df['duration'][bucket_dialed].values)
            # Number of distinct contacts in the bucket.
            bucket_contact_cnt = len(set(match_df['peer_number'].values))
            result[f'{prefix}_contacter_{bucket}_cnt_{label}'] = bucket_contact_cnt
            # Number of mutual contacts within the bucket.
            bucket_mutual_cnt = len(set(calling) & called_set)
            result[f'{prefix}_dial_dialed_contacter_{bucket}_cnt_{label}'] = bucket_mutual_cnt
            # Bucket contacts as a share of the window's contacts.
            result[f'{prefix}_contacter_{bucket}_rate_{label}'] = division(
                bucket_contact_cnt, contact_cnt)
            # Bucket mutual contacts as a share of the window's mutual contacts.
            result[f'{prefix}_dial_dialed_contacter_{bucket}_rate_{label}'] = division(
                bucket_mutual_cnt, len(mutual_contacts))
            # Number of calls in the bucket.
            result[f'{prefix}_call_{bucket}_cnt_{label}'] = match_df.shape[0]
            # Bucket calls as a share of the window's calls.
            result[f'{prefix}_call_{bucket}_rate_{label}'] = division(
                match_df.shape[0], temp.shape[0])
            # Number of outgoing calls in the bucket.
            result[f'{prefix}_caller_{bucket}_cnt_{label}'] = len(calling)
            # Number of incoming calls in the bucket.
            result[f'{prefix}_called_{bucket}_cnt_{label}'] = len(called)
            # Outgoing calls in the bucket whose peer also has an incoming
            # call in the bucket.
            result[f'{prefix}_calls_{bucket}_cnt_{label}'] = sum(
                1 for peer in calling if peer in called_set)

    print('duration day feature count:', len(result))
    return result
def call_summary(dataObj):
    """Build monthly-average call count and duration features (90d / 180d).

    The 180 days before ``last_modify_time`` are cut into six consecutive
    30-day slices (newest first). For the 90d horizon (first three slices)
    and the 180d horizon (all six) the function averages per-slice call
    counts and durations, overall and split by direction ('DIAL' outgoing,
    'DIALED' incoming).

    When the account (``dataObj.open_data``) is younger than the horizon, the
    totals are divided by the number of 30-day months since opening instead
    of by the number of slices. That month count was previously computed as
    ``timedelta // 30 + 1``, which raises TypeError; it now uses ``.days``.
    The final log line also reports the feature count, like the sibling
    feature builders do.

    Args:
        dataObj: object exposing ``open_data`` (account opening time),
            ``last_modify_time`` (time the carrier data was fetched) and
            ``df_callRecord`` (columns ``time``, ``duration``, ``dial_type``).

    Returns:
        dict mapping ``yysCall_*`` feature names to values.
    """
    prefix = 'yysCall'
    result = dict()
    open_date = dataObj.open_data
    last_modify_time = dataObj.last_modify_time
    df_call = dataObj.df_callRecord

    # Slice boundaries: now, now-30d, ..., now-180d.
    time_limit_list = [
        last_modify_time - datetime.timedelta(days=30 * i) for i in range(7)
    ]
    # Slice k holds calls in (now-30(k+1)d, now-30kd].
    every_month_data = [
        df_call[(df_call.time > older) & (df_call.time <= newer)]
        for newer, older in zip(time_limit_list, time_limit_list[1:])
    ]

    horizons = [('90d', 90, every_month_data[0:3]),
                ('180d', 180, every_month_data)]
    for label, horizon_days, month_frames in horizons:
        call_cnt_list = []
        dial_cnt_list = []
        dialed_cnt_list = []
        call_time_list = []
        dial_time_list = []
        dialed_time_list = []
        for month_df in month_frames:
            is_dial = month_df.dial_type == 'DIAL'
            is_dialed = month_df.dial_type == 'DIALED'
            call_cnt_list.append(month_df.shape[0])
            dial_cnt_list.append(month_df[is_dial].shape[0])
            dialed_cnt_list.append(month_df[is_dialed].shape[0])
            call_time_list.append(sum(month_df['duration'].values))
            dial_time_list.append(sum(month_df['duration'][is_dial].values))
            dialed_time_list.append(sum(month_df['duration'][is_dialed].values))

        if last_modify_time - datetime.timedelta(horizon_days) >= open_date:
            # Account is at least as old as the horizon: average per slice.
            month_cnt = len(month_frames)
        else:
            # Younger account: average over the months it has existed.
            month_cnt = (last_modify_time - open_date).days // 30 + 1

        # Monthly average number of incoming calls.
        result[f'{prefix}_dialed_avg_month_cnt_{label}'] = division(
            sum(dialed_cnt_list), month_cnt)
        # Monthly average number of outgoing calls.
        result[f'{prefix}_dial_avg_month_cnt_{label}'] = division(
            sum(dial_cnt_list), month_cnt)
        # Monthly average number of calls.
        result[f'{prefix}_call_avg_month_cnt_{label}'] = division(
            sum(call_cnt_list), month_cnt)
        # Monthly average outgoing call duration.
        result[f'{prefix}_dial_avg_month_time_{label}'] = division(
            sum(dial_time_list), month_cnt)
        # Monthly average incoming call duration.
        result[f'{prefix}_dialed_avg_month_time_{label}'] = division(
            sum(dialed_time_list), month_cnt)
        # Monthly average call duration.
        result[f'{prefix}_call_avg_month_time_{label}'] = division(
            sum(call_time_list), month_cnt)

    print('yysCall call summary feature count', len(result))
    return result
def msg_aver_median(dataObj):
    """Build per-month SMS statistics over the last 90 and 180 days.

    The 180 days before ``last_modify_time`` are cut into six consecutive
    30-day slices (newest first). From the per-slice message counts (total,
    ``send_type == 'SEND'``, ``send_type == 'RECEIVE'``) and fee sums the
    function derives the median monthly count and, for the 90d and 180d
    horizons, monthly means, the largest monthly fee and stability ratios
    (std / mean) of the send and receive counts.

    When the account (``dataObj.open_data``) is younger than the horizon, the
    means are divided by the number of 30-day months since opening instead
    of by the number of slices.

    Fixes relative to the previous version:
      * the month count was computed as ``timedelta // 30 + 1``, which raises
        TypeError; it now uses ``.days``;
      * the receive-count stability divided the standard deviation of the
        *send* counts by the receive mean; it now uses the receive counts.

    Args:
        dataObj: object exposing ``open_data`` (account opening time),
            ``last_modify_time`` (time the carrier data was fetched) and
            ``df_smses`` (columns ``time``, ``send_type``, ``fee``).

    Returns:
        dict mapping feature names to values; stability features are ``''``
        when ``cal_std`` returns ``''``.
    """
    prefix = 'yysMsg'
    result = dict()
    open_date = dataObj.open_data
    last_modify_time = dataObj.last_modify_time
    df_smses = dataObj.df_smses

    # Slice boundaries: now, now-30d, ..., now-180d.
    time_limit_list = [
        last_modify_time - datetime.timedelta(days=30 * i) for i in range(7)
    ]
    # Slice k holds messages in (now-30(k+1)d, now-30kd].
    every_month_data = [
        df_smses[(df_smses.time > older) & (df_smses.time <= newer)]
        for newer, older in zip(time_limit_list, time_limit_list[1:])
    ]

    # Median of the six monthly message counts (key has no prefix, as before).
    result['median_sms_cnt'] = cal_median(
        [month_df.shape[0] for month_df in every_month_data])

    horizons = [('90d', 90, every_month_data[0:3]),
                ('180d', 180, every_month_data)]
    for label, horizon_days, month_frames in horizons:
        msg_cnt_list = [m.shape[0] for m in month_frames]
        msg_send_list = [m[m.send_type == 'SEND'].shape[0] for m in month_frames]
        msg_receive_list = [
            m[m.send_type == 'RECEIVE'].shape[0] for m in month_frames
        ]
        msg_fee_list = [sum(m['fee'].values) for m in month_frames]

        if last_modify_time - datetime.timedelta(horizon_days) >= open_date:
            # Account is at least as old as the horizon: average per slice.
            month_cnt = len(month_frames)
        else:
            # Younger account: average over the months it has existed.
            month_cnt = (last_modify_time - open_date).days // 30 + 1

        send_mean_key = f'{prefix}_msg_send_cnt_mean_{label}'
        receive_mean_key = f'{prefix}_msg_receive_cnt_mean_{label}'
        # Monthly average number of messages.
        result[f'{prefix}_msg_cnt_mean_{label}'] = division(
            sum(msg_cnt_list), month_cnt)
        # Monthly average number of sent messages.
        result[send_mean_key] = division(sum(msg_send_list), month_cnt)
        # Monthly average number of received messages.
        result[receive_mean_key] = division(sum(msg_receive_list), month_cnt)

        # Largest monthly SMS fee.
        result[f'{prefix}_msg_fee_max_month_{label}'] = cal_max(msg_fee_list)

        # Stability of the monthly send count: std / mean.
        send_std = cal_std(msg_send_list)
        result[f'{prefix}_msg_send_cnt_month_stab_{label}'] = (
            division(send_std, result[send_mean_key])
            if send_std != '' else '')
        # Stability of the monthly receive count: std / mean.
        receive_std = cal_std(msg_receive_list)
        result[f'{prefix}_msg_receive_cnt_month_stab_{label}'] = (
            division(receive_std, result[receive_mean_key])
            if receive_std != '' else '')

    return result