示例#1
0
def fee_cnt_time(dataObj):
    """Build recharge amount, count and interval features per trailing window.

    Reads ``dataObj.last_modify_time`` and ``dataObj.df_recharges`` (columns
    ``recharge_time`` and ``amount``).  For each of the 30/90/180-day windows
    it stores the amount sum, the number of recharges, the median / max / mean
    single-recharge amount and the number of recharges above each amount
    threshold.  The 90- and 180-day windows additionally get the max / min /
    mean gap in days between consecutive recharges.  Finally the age in days
    of the most recent recharge is stored.

    ``cal_median``, ``cal_max``, ``cal_min`` and ``division`` are helpers
    defined elsewhere in the project.

    Args:
        dataObj: object exposing ``last_modify_time`` and ``df_recharges``.

    Returns:
        dict mapping feature name to feature value.
    """
    prefix = 'yysRecharge'
    result = dict()
    last_modify_time = dataObj.last_modify_time
    df_recharges = dataObj.df_recharges
    amount_cut = [10, 20, 50, 100, 150, 200]
    # Age of every recharge in whole days; computed once, shared by all windows.
    age_days = (last_modify_time - df_recharges['recharge_time']).dt.days

    for window in (30, 90, 180):
        label = f'{window}d'
        temp = df_recharges[age_days <= window][['recharge_time', 'amount']]
        amounts = temp['amount'].values
        recharge_cnt = temp.shape[0]
        amount_sum = sum(amounts)

        # Total amount recharged in the window.
        result[f'{prefix}_sum_recharge_amout_{label}'] = amount_sum
        # Number of recharges in the window.
        result[f'{prefix}_recharge_cnt_sum_{label}'] = recharge_cnt
        # Median single-recharge amount.
        result[f'{prefix}_recharge_amount_median_{label}'] = cal_median(amounts)
        # Largest single-recharge amount.
        result[f'{prefix}_recharge_amount_max_{label}'] = cal_max(amounts)
        # Mean amount per recharge.
        result[f'{prefix}_recharge_amount_mean_{label}_fm_cnt'] = division(
            amount_sum, recharge_cnt)

        # The threshold is 10x the configured cut, and that scaled value is
        # what appears in the feature name.
        for ac in amount_cut:
            amount = 10 * ac
            result[f'{prefix}_recharge_amount{amount}_cnt_{label}'] = temp[
                temp.amount > amount].shape[0]

        if window in (90, 180):
            times = sorted(temp['recharge_time'])
            # Whole-day gaps between each pair of consecutive recharges.
            gaps = [(later - earlier).days
                    for earlier, later in zip(times, times[1:])]
            # Largest gap between consecutive recharges.
            result[f'{prefix}_recharge_timespan_max_{label}'] = cal_max(gaps)
            # Smallest gap between consecutive recharges.
            result[f'{prefix}_recharge_timespan_min_{label}'] = cal_min(gaps)
            # Mean gap between consecutive recharges.
            result[f'{prefix}_recharge_timespan_mean_{label}'] = division(
                sum(gaps), len(gaps))

    # Days between the most recent recharge and the data update time.
    if len(df_recharges) > 0:
        latest_recharge = max(df_recharges['recharge_time'])
        result[f'{prefix}_trade_recent_time_span'] = (
            last_modify_time - latest_recharge).days
    else:
        # No recharge at all: indexing the last element used to raise
        # IndexError here.
        # NOTE(review): '' is used as the "missing" marker because sibling
        # feature functions compare helper results against '' -- confirm.
        result[f'{prefix}_trade_recent_time_span'] = ''
    print('recharge fee cnt time count:', len(result))
    return result
示例#2
0
def fee_day(dataObj):
    """Count calls, call share and distinct contacts per call-fee bucket.

    Uses the pre-filtered call frames ``dataObj.df_callRecord_1m`` (labelled
    ``30d``), ``dataObj.df_callRecord_3m`` (``90d``) and
    ``dataObj.df_callRecord`` (``180d``); each must provide the columns
    ``peer_number`` and ``fee``.  Only these three windows are produced.

    ``division`` is a helper defined elsewhere in the project.

    Args:
        dataObj: object exposing the three call-record DataFrames above.

    Returns:
        dict mapping feature name to feature value.
    """
    prefix = 'yysCall'
    result = dict()
    day_list = [
        ('30d', dataObj.df_callRecord_1m),
        ('90d', dataObj.df_callRecord_3m),
        ('180d', dataObj.df_callRecord),
    ]
    # Fee buckets (low, high]; per the original author's note the unit is fen.
    # 'up' marks a bucket with no upper bound.
    fee_range = [(0, 20), (20, 50), (50, 100), (100, 500), (500, 'up')]
    for label, records in day_list:
        temp = records[['peer_number', 'fee']]
        total_calls = temp.shape[0]
        for low, high in fee_range:
            in_bucket = temp.fee > low
            if high != 'up':
                in_bucket = in_bucket & (temp.fee <= high)
            match_df = temp[in_bucket]
            # Number of calls whose fee falls in (low, high].
            result[f'{prefix}_call_fee_{low}_{high}_cnt_{label}'] = match_df.shape[0]
            # Share of the window's calls whose fee falls in (low, high].
            result[f'{prefix}_call_fee_{low}_{high}_rate_{label}'] = division(
                match_df.shape[0], total_calls)
            # Number of distinct peers with a call fee in (low, high].
            result[f'{prefix}_contacter_call_fee_{low}_{high}_cnt_{label}'] = len(
                set(match_df['peer_number'].values))
    print('fee day feature count:', len(result))
    return result
示例#3
0
def contact_tag(dataObj):
    """Build features about calls with the numbers 110, 120 and 119.

    For each of six trailing windows (3/7/15/30/90/180 days) and each of the
    three numbers, stores the call count, outgoing (``DIAL``) and incoming
    (``DIALED``) counts, the share of all calls, the outgoing / incoming /
    total duration and the share of the window's total duration.

    The 3- and 15-day windows are filtered here from ``dataObj.df_callRecord``
    using ``dataObj.last_modify_time``; the other windows come from frames
    already present on ``dataObj``.  ``division`` is a helper defined
    elsewhere in the project.

    Args:
        dataObj: object exposing ``last_modify_time``, ``df_callRecord``,
            ``df_callRecord_7day``, ``df_callRecord_1m`` and
            ``df_callRecord_3m``.

    Returns:
        dict mapping feature name to feature value.
    """
    prefix = 'yysCall'
    result = dict()
    last_modify_time = dataObj.last_modify_time
    all_calls = dataObj.df_callRecord
    age_days = (last_modify_time - all_calls['time']).dt.days
    windows = (
        ('3d', all_calls[age_days <= 3]),
        ('7d', dataObj.df_callRecord_7day),
        ('15d', all_calls[age_days <= 15]),
        ('30d', dataObj.df_callRecord_1m),
        ('90d', dataObj.df_callRecord_3m),
        ('180d', all_calls),
    )
    for label, records in windows:
        temp = records[['peer_number', 'duration', 'dial_type']]
        window_calls = temp.shape[0]
        window_duration = sum(temp['duration'].values.tolist())
        for phone in ('110', '120', '119'):
            # Calls in this window whose peer is one of 110 / 120 / 119.
            match_df = temp[temp.peer_number == phone]
            outgoing = match_df[match_df.dial_type == 'DIAL']
            incoming = match_df[match_df.dial_type == 'DIALED']
            call_duration = sum(match_df['duration'].values.tolist())
            result.update({
                # Number of calls with this number.
                f'{prefix}_call_{phone}_cnt_{label}': match_df.shape[0],
                # Number of outgoing calls to this number.
                f'{prefix}_call_dial_{phone}_cnt_{label}': outgoing.shape[0],
                # Number of incoming calls from this number.
                f'{prefix}_call_dialed_{phone}_cnt_{label}': incoming.shape[0],
                # Share of the window's calls that involve this number.
                f'{prefix}_call_{phone}_rate_{label}':
                    division(match_df.shape[0], window_calls),
                # Total outgoing duration with this number.
                f'{prefix}_call_dial_{phone}_time_{label}':
                    sum(outgoing['duration'].values),
                # Total incoming duration with this number.
                f'{prefix}_call_dialed_{phone}_time_{label}':
                    sum(incoming['duration'].values),
                # Total call duration with this number.
                f'{prefix}_call_{phone}_time_{label}': call_duration,
                # Share of the window's call duration spent with this number.
                f'{prefix}_call_{phone}_time_rate_{label}':
                    division(call_duration, window_duration),
            })
    print('contact tag feature count:', len(result))
    return result
示例#4
0
def time_interval_day(dataObj):
    """Build per-time-of-day call features for six trailing windows.

    The day is split into eight named slots (see ``time_interval_list``).  For
    every window (3/7/15/30/90/180 days) and slot the function stores the call
    count, the share of the window's calls, the total duration and the number
    of distinct peers.

    A call belongs to a slot when its clock time ``t`` satisfies
    ``start <= t < end``.  Times are compared as minutes since midnight, so
    the hour and minute bounds are applied jointly rather than independently,
    and a slot whose end is earlier than its start (``small_hour``,
    23:30-01:30) wraps past midnight.

    ``division`` is a helper defined elsewhere in the project.

    Args:
        dataObj: object exposing ``last_modify_time``, ``df_callRecord``,
            ``df_callRecord_7day``, ``df_callRecord_1m`` and
            ``df_callRecord_3m``; the frames need the columns ``time``,
            ``peer_number``, ``duration`` and ``dial_type``.

    Returns:
        dict mapping feature name to feature value.
    """
    prefix = 'yysCall'
    result = dict()
    # (slot name, start hour, start minute, end hour, end minute)
    time_interval_list = [
        ('early_morning', 5, 30, 9, 0),
        ('morning', 9, 0, 11, 30),
        ('nooning', 11, 30, 13, 30),
        ('afternoon', 13, 30, 17, 30),
        ('toward_evening', 17, 30, 19, 30),
        ('evening', 19, 30, 23, 30),
        ('small_hour', 23, 30, 1, 30),
        ('midnight', 1, 30, 5, 30)
    ]
    last_modify_time = dataObj.last_modify_time
    df_callRecord = dataObj.df_callRecord
    age_days = (last_modify_time - df_callRecord['time']).dt.days
    day_list = [
        ('3d', df_callRecord[age_days <= 3]),
        ('7d', dataObj.df_callRecord_7day),
        ('15d', df_callRecord[age_days <= 15]),
        ('30d', dataObj.df_callRecord_1m),
        ('90d', dataObj.df_callRecord_3m),
        ('180d', df_callRecord)
    ]
    for label, records in day_list:
        # temp holds the calls of the current window.
        temp = records[['time', 'peer_number', 'duration', 'dial_type']]
        total_calls = temp.shape[0]
        # Clock time of each call as minutes since midnight.
        minute_of_day = temp['time'].dt.hour * 60 + temp['time'].dt.minute
        for name, start_hour, start_minute, end_hour, end_minute in time_interval_list:
            start = start_hour * 60 + start_minute
            end = end_hour * 60 + end_minute
            if start <= end:
                in_slot = (minute_of_day >= start) & (minute_of_day < end)
            else:
                # Slot crosses midnight: late evening part OR early morning part.
                in_slot = (minute_of_day >= start) | (minute_of_day < end)
            match_df = temp[in_slot]
            # Number of calls placed in this slot.
            result[f'{prefix}_call_{name}_cnt_{label}'] = match_df.shape[0]
            # Share of the window's calls placed in this slot.
            result[f'{prefix}_call_{name}_rate_{label}'] = division(
                match_df.shape[0], total_calls)
            # Total call duration in this slot.
            result[f'{prefix}_call_{name}_time_{label}'] = sum(
                match_df['duration'].values.tolist())
            # Number of distinct peers called in this slot.
            result[f'{prefix}_contacter_{name}_cnt_{label}'] = len(
                set(match_df['peer_number'].values))
    print('time interval day feature count:', len(result))
    return result
示例#5
0
def recharge_aver_median(dataObj):
    """Build monthly recharge count/amount statistics for 90 and 180 days.

    The 180 days before ``dataObj.last_modify_time`` are cut into six 30-day
    buckets (newest first).  The median of the six per-bucket recharge counts
    is stored once.  For the 90-day window (first three buckets) and the
    180-day window (all six) the function then stores the monthly mean amount
    and count, the monthly max count and amount, and the "stability" of each,
    computed as ``division(cal_std(values), monthly mean)``.

    When the account was opened (``dataObj.open_data``) less than a full
    window ago, the monthly means divide by the number of 30-day months since
    opening (``days // 30 + 1``) instead of the bucket count.

    ``division``, ``cal_median``, ``cal_max`` and ``cal_std`` are helpers
    defined elsewhere in the project; a ``cal_std`` result of ``''`` is passed
    through as ``''``.

    Args:
        dataObj: object exposing ``open_data``, ``last_modify_time`` and
            ``df_recharges`` (columns ``recharge_time`` and ``amount``).

    Returns:
        dict mapping feature name to feature value.
    """
    prefix = 'yysRecharge'
    result = dict()
    open_date = dataObj.open_data
    last_modify_time = dataObj.last_modify_time
    df_recharge = dataObj.df_recharges
    # Bucket boundaries: now, now-30d, ..., now-180d.
    time_limit_list = [last_modify_time]
    for i in (1, 2, 3, 4, 5, 6):
        time_limit_list.append(last_modify_time -
                               datetime.timedelta(days=30 * i))
    # The recharge timestamp column is ``recharge_time``, as used by the
    # sibling feature function reading the same frame.
    recharge_time = df_recharge['recharge_time']
    every_month_data = [
        df_recharge[(recharge_time > older) & (recharge_time <= newer)]
        for newer, older in zip(time_limit_list, time_limit_list[1:])
    ]
    result[f'{prefix}_rechange_cnt_median'] = cal_median(
        [month.shape[0] for month in every_month_data])

    windows = [('90d', 90, every_month_data[0:3]),
               ('180d', 180, every_month_data)]
    for label, window_days, months in windows:
        recharge_cnt_list = [month.shape[0] for month in months]
        recharge_amount_list = [sum(month['amount'].values) for month in months]

        if last_modify_time - datetime.timedelta(window_days) >= open_date:
            month_count = len(months)
        else:
            # Account younger than the window: average over the months it has
            # existed.  ``.days`` is required; a timedelta cannot be added to 1.
            month_count = (last_modify_time - open_date).days // 30 + 1

        amount_mean_key = f'{prefix}_recharge_amount_mean_{label}_fm_month'
        cnt_mean_key = f'{prefix}_recharge_cnt_mean_{label}'
        # Mean recharge amount per month.
        result[amount_mean_key] = division(sum(recharge_amount_list),
                                           month_count)
        # Mean number of recharges per month.
        result[cnt_mean_key] = division(sum(recharge_cnt_list), month_count)
        # Largest monthly recharge count.
        result[f'{prefix}_recharge_cnt_max_{label}'] = cal_max(
            recharge_cnt_list)
        # Largest monthly recharge amount.
        result[f'{prefix}_recharge_amount_month_max_{label}'] = cal_max(
            recharge_amount_list)
        # Stability of the monthly recharge count (std / mean).
        cnt_std = cal_std(recharge_cnt_list)
        result[f'{prefix}_recharge_cnt_month_stab_{label}'] = division(
            cnt_std, result[cnt_mean_key]) if cnt_std != '' else ''
        # Stability of the monthly recharge amount (std / mean); stored under
        # its own key so it no longer overwrites the count stability.
        amount_std = cal_std(recharge_amount_list)
        result[f'{prefix}_recharge_amount_month_stab_{label}'] = division(
            amount_std, result[amount_mean_key]) if amount_std != '' else ''

    print('yysRecharge recharge aver median feature count:', len(result))
    return result
示例#6
0
def fee_time(dataObj):
    """Build bill-fee features from ``dataObj.df_bills``.

    Stores, for each of the seven fee columns, the mean and the median over
    all bills.  Then, for a 3-month (``90d``) and a 6-month (``180d``) window
    ending at the month of ``dataObj.last_modify_time``, stores per-column
    sums, the share of the total fee for the non-total columns, per-column
    monthly maxima and the number of months whose total / web fee exceeds
    each threshold.

    Bills are selected by comparing ``year * 12 + month`` so that windows
    spanning a year boundary are handled.  Sums use at most the first
    ``months`` rows of the selection; maxima and threshold counts use the
    whole selection, with counts capped at ``months``.

    ``division`` and ``cal_max`` are helpers defined elsewhere in the project.

    Args:
        dataObj: object exposing ``last_modify_time`` and ``df_bills``
            (columns ``bill_month`` plus the fee columns listed below).

    Returns:
        dict mapping feature name to feature value.
    """
    prefix = 'yysBill'
    result = dict()
    last_modify_time = dataObj.last_modify_time
    df_bill = dataObj.df_bills
    fee_columns = ['total_fee', 'actual_fee', 'voice_fee', 'extra_service_fee',
                   'sms_fee', 'extra_fee', 'web_fee']
    # Columns reported as a share of total_fee in addition to their raw sum.
    rate_columns = ('voice_fee', 'extra_service_fee', 'sms_fee', 'extra_fee',
                    'web_fee')

    # Mean of every fee column over all available bills.
    for column in fee_columns:
        result[f'{prefix}_bill_{column}_month_mean'] = df_bill[column].mean()
    # Median of every fee column, stored under its own key so it no longer
    # overwrites the mean.
    for column in fee_columns:
        result[f'{prefix}_bill_{column}_month_median'] = df_bill[column].median()

    # Linear month index so that "N months back" is a plain subtraction.
    bill_month_index = (df_bill.bill_month.dt.year * 12 +
                        df_bill.bill_month.dt.month)
    current_month_index = last_modify_time.year * 12 + last_modify_time.month
    fee_list = [10, 20, 50, 100, 150, 200]

    # A list, not a set, so the feature insertion order is deterministic.
    for months, label in [(3, '90d'), (6, '180d')]:
        df_temple = df_bill[bill_month_index >= current_month_index - months]
        # At most ``months`` rows contribute to the sums.
        # NOTE(review): this keeps the first rows, which presumably are the
        # newest bills -- confirm the ordering of df_bills.
        df_head = df_temple.iloc[:months]
        total_fee_sum = df_head.total_fee.sum()
        for column in fee_columns:
            column_sum = df_head[column].sum()
            result[f'{prefix}_bill_{column}_sum_{label}'] = column_sum
            if column in rate_columns:
                # Share of the total fee taken by this fee type.
                result[f'{prefix}_bill_{column}_sum_rate_{label}'] = division(
                    column_sum, total_fee_sum)

        # Largest monthly value of every fee column.
        for column in fee_columns:
            result[f'{prefix}_bill_{column}_month_max_{label}'] = cal_max(
                df_temple[column].values)

        # Months whose total / web fee exceeds the threshold.
        # NOTE(review): ``fee * 100`` presumably converts yuan to fen -- confirm.
        for fee in fee_list:
            total_cnt = df_temple[df_temple.total_fee > (fee * 100)].shape[0]
            web_cnt = df_temple[df_temple.web_fee > (fee * 100)].shape[0]
            result[f'{prefix}_bill_totalfee{fee}_cnt_{label}'] = min(total_cnt, months)
            result[f'{prefix}_bill_webfee{fee}_cnt_{label}'] = min(web_cnt, months)

    print('fee time feature count:', len(result))
    return result
示例#7
0
def duration_day(dataObj):
    """Build call count, contact and duration features for six windows.

    For each trailing window (3/7/15/30/90/180 days) the function stores
    overall figures (contacts, mutual contacts, call counts and durations
    split by ``DIAL`` / ``DIALED``, medians and maxima), then repeats counts,
    duration sums, contact counts and ratios for every call-duration bucket
    in ``duration_list``.  A "mutual" contact is a peer that appears in both
    outgoing and incoming calls.

    ``division``, ``cal_median`` and ``cal_max`` are helpers defined
    elsewhere in the project.

    Args:
        dataObj: object exposing ``last_modify_time``, ``df_callRecord``,
            ``df_callRecord_7day``, ``df_callRecord_1m`` and
            ``df_callRecord_3m``; the frames need the columns ``time``,
            ``peer_number``, ``duration`` and ``dial_type``.

    Returns:
        dict mapping feature name to feature value.
    """
    prefix = 'yysCall'
    result = dict()
    last_modify_time = dataObj.last_modify_time
    # Call-duration buckets in seconds, left-open / right-closed; 'up' marks a
    # bucket with no upper bound.
    duration_list = [(0, 30), (30, 60), (60, 180), (180, 300), (300, 600), (600, 'up')]
    df_callRecord = dataObj.df_callRecord
    age_days = (last_modify_time - df_callRecord['time']).dt.days
    day_list = [
        ('3d', df_callRecord[age_days <= 3]),
        ('7d', dataObj.df_callRecord_7day),
        ('15d', df_callRecord[age_days <= 15]),
        ('30d', dataObj.df_callRecord_1m),
        ('90d', dataObj.df_callRecord_3m),
        ('180d', df_callRecord)
    ]
    for label, temp in day_list:
        # temp holds the call records of the current window.
        numbers = temp['peer_number']
        durations = temp['duration']
        is_dial = temp.dial_type == 'DIAL'
        is_dialed = temp.dial_type == 'DIALED'
        # Window-level sets, computed once instead of once per bucket.
        mutual_contacts = set(numbers[is_dial].values) & set(numbers[is_dialed].values)
        contact_total = len(set(numbers.values))

        # Number of mutual contacts.
        result[f'{prefix}_dial_dialed_contacter_cnt_{label}'] = len(mutual_contacts)
        # Number of distinct contacts.
        result[f'{prefix}_contacter_cnt_{label}'] = len(set(numbers))
        # Number of calls.
        result[f'{prefix}_call_cnt_{label}'] = temp.shape[0]
        # Number of outgoing calls.
        result[f'{prefix}_dial_cnt_{label}'] = numbers[is_dial].shape[0]
        # Number of incoming calls.
        result[f'{prefix}_dialed_cnt_{label}'] = numbers[is_dialed].shape[0]
        # Total call duration.
        result[f'{prefix}_call_time_{label}'] = sum(durations.values)
        # Total outgoing duration.
        result[f'{prefix}_dial_time_{label}'] = sum(durations[is_dial].values)
        # Total incoming duration.
        result[f'{prefix}_dialed_time_{label}'] = sum(durations[is_dialed].values)
        # Median incoming duration.
        result[f'{prefix}_dialed_time_median_{label}'] = cal_median(
            durations[is_dialed].values.tolist())
        # Median outgoing duration.
        result[f'{prefix}_dial_time_median_{label}'] = cal_median(
            durations[is_dial].values.tolist())
        # Longest call.
        result[f'{prefix}_call_max_time_{label}'] = cal_max(durations.values)
        # Longest outgoing call.
        result[f'{prefix}_dial_max_time_{label}'] = cal_max(durations[is_dial].values)
        # Longest incoming call (filters on DIALED, not DIAL).
        result[f'{prefix}_dialed_max_time_{label}'] = cal_max(durations[is_dialed].values)

        for low, high in duration_list:
            # match_df: calls of the window whose duration lies in (low, high].
            in_bucket = temp.duration > low
            if high != 'up':
                in_bucket = in_bucket & (temp.duration <= high)
            match_df = temp[['peer_number', 'dial_type', 'duration']][in_bucket]
            bucket_dial = match_df[match_df.dial_type == 'DIAL']
            bucket_dialed = match_df[match_df.dial_type == 'DIALED']
            calling = bucket_dial['peer_number'].values.tolist()
            called = set(bucket_dialed['peer_number'].values.tolist())
            bucket = f'{low}_{high}'

            # Total duration of the calls in the bucket.
            result[f'{prefix}_duration_{bucket}_sum_{label}'] = sum(
                match_df['duration'].values.tolist())
            # Total duration of the outgoing calls in the bucket.
            result[f'{prefix}_dial_duration_{bucket}_sum_{label}'] = sum(
                bucket_dial['duration'].values)
            # Total duration of the incoming calls in the bucket.
            result[f'{prefix}_dialed_duration_{bucket}_sum_{label}'] = sum(
                bucket_dialed['duration'].values)
            # Distinct contacts in the bucket.
            contact_cnt = len(set(match_df['peer_number'].values))
            result[f'{prefix}_contacter_{bucket}_cnt_{label}'] = contact_cnt
            # Mutual contacts in the bucket.
            mutual_cnt = len(set(calling) & called)
            result[f'{prefix}_dial_dialed_contacter_{bucket}_cnt_{label}'] = mutual_cnt
            # Bucket contacts as a share of all contacts in the window.
            result[f'{prefix}_contacter_{bucket}_rate_{label}'] = division(
                contact_cnt, contact_total)
            # Bucket mutual contacts as a share of the window's mutual contacts.
            result[f'{prefix}_dial_dialed_contacter_{bucket}_rate_{label}'] = division(
                mutual_cnt, len(mutual_contacts))
            # Number of calls in the bucket.
            result[f'{prefix}_call_{bucket}_cnt_{label}'] = match_df.shape[0]
            # Bucket calls as a share of all calls in the window.
            result[f'{prefix}_call_{bucket}_rate_{label}'] = division(
                match_df.shape[0], temp.shape[0])
            # Number of outgoing calls in the bucket.
            result[f'{prefix}_caller_{bucket}_cnt_{label}'] = bucket_dial.shape[0]
            # Number of incoming calls in the bucket.
            result[f'{prefix}_called_{bucket}_cnt_{label}'] = bucket_dialed.shape[0]
            # Outgoing calls in the bucket whose peer also called in within
            # the same bucket; set membership keeps this linear.
            result[f'{prefix}_calls_{bucket}_cnt_{label}'] = sum(
                1 for number in calling if number in called)
    print('duration day feature count:', len(result))
    return result
示例#8
0
def call_summary(dataObj):
    """Build monthly average call counts and durations for 90 and 180 days.

    The 180 days before ``dataObj.last_modify_time`` are cut into six 30-day
    buckets (newest first).  For the 90-day window (first three buckets) and
    the 180-day window (all six) the function stores the monthly mean of the
    incoming / outgoing / total call count and of the outgoing / incoming /
    total call duration.

    When the account was opened (``dataObj.open_data``) less than a full
    window ago, the means divide by the number of 30-day months since opening
    (``days // 30 + 1``) instead of the bucket count.

    ``division`` is a helper defined elsewhere in the project.

    Args:
        dataObj: object exposing ``open_data``, ``last_modify_time`` and
            ``df_callRecord`` (columns ``time``, ``dial_type``, ``duration``).

    Returns:
        dict mapping feature name to feature value.
    """
    prefix = 'yysCall'
    result = dict()
    # Account opening time.
    open_date = dataObj.open_data
    # Time at which the carrier data was fetched.
    last_modify_time = dataObj.last_modify_time
    df_call = dataObj.df_callRecord
    # Bucket boundaries: now, now-30d, ..., now-180d.
    time_limit_list = [last_modify_time]
    for i in (1, 2, 3, 4, 5, 6):
        time_limit_list.append(last_modify_time - datetime.timedelta(days=30 * i))
    every_month_data = [
        df_call[(df_call.time > older) & (df_call.time <= newer)]
        for newer, older in zip(time_limit_list, time_limit_list[1:])
    ]
    windows = [('90d', 90, every_month_data[0:3]), ('180d', 180, every_month_data)]
    for label, window_days, months in windows:
        call_cnt_list = []
        dial_cnt_list = []
        dialed_cnt_list = []
        call_time_list = []
        dial_time_list = []
        dialed_time_list = []
        for month in months:
            is_dial = month.dial_type == 'DIAL'
            is_dialed = month.dial_type == 'DIALED'
            call_cnt_list.append(month.shape[0])
            dial_cnt_list.append(month[is_dial].shape[0])
            dialed_cnt_list.append(month[is_dialed].shape[0])
            call_time_list.append(sum(month['duration'].values))
            dial_time_list.append(sum(month['duration'][is_dial].values))
            dialed_time_list.append(sum(month['duration'][is_dialed].values))

        if last_modify_time - datetime.timedelta(window_days) >= open_date:
            month_count = len(months)
        else:
            # Account younger than the window: average over the months it has
            # existed.  ``.days`` is required; a timedelta cannot be added to 1.
            month_count = (last_modify_time - open_date).days // 30 + 1

        # Monthly mean number of incoming calls.
        result[f'{prefix}_dialed_avg_month_cnt_{label}'] = division(sum(dialed_cnt_list), month_count)
        # Monthly mean number of outgoing calls.
        result[f'{prefix}_dial_avg_month_cnt_{label}'] = division(sum(dial_cnt_list), month_count)
        # Monthly mean number of calls.
        result[f'{prefix}_call_avg_month_cnt_{label}'] = division(sum(call_cnt_list), month_count)
        # Monthly mean outgoing duration.
        result[f'{prefix}_dial_avg_month_time_{label}'] = division(sum(dial_time_list), month_count)
        # Monthly mean incoming duration.
        result[f'{prefix}_dialed_avg_month_time_{label}'] = division(sum(dialed_time_list), month_count)
        # Monthly mean call duration.
        result[f'{prefix}_call_avg_month_time_{label}'] = division(sum(call_time_list), month_count)
    # Report the number of features, as the sibling feature functions do.
    print('yysCall call summary feature count', len(result))
    return result
示例#9
0
def msg_aver_median(dataObj):
    """Build monthly SMS count statistics for the 90- and 180-day windows.

    The 180 days before ``dataObj.last_modify_time`` are cut into six 30-day
    buckets (newest first).  The median of the six per-bucket SMS counts is
    stored once under ``median_sms_cnt``.  For the 90-day window (first three
    buckets) and the 180-day window (all six) the function stores the monthly
    mean of the total / sent / received SMS count, the largest monthly SMS
    fee, and the "stability" of the sent and received counts, computed as
    ``division(cal_std(values), monthly mean)``.

    When the account was opened (``dataObj.open_data``) less than a full
    window ago, the means divide by the number of 30-day months since opening
    (``days // 30 + 1``) instead of the bucket count.

    ``division``, ``cal_median``, ``cal_max`` and ``cal_std`` are helpers
    defined elsewhere in the project; a ``cal_std`` result of ``''`` is passed
    through as ``''``.

    Args:
        dataObj: object exposing ``open_data``, ``last_modify_time`` and
            ``df_smses`` (columns ``time``, ``send_type``, ``fee``).

    Returns:
        dict mapping feature name to feature value.
    """
    prefix = 'yysMsg'
    result = dict()
    # Account opening time.
    open_date = dataObj.open_data
    # Time at which the carrier data was fetched.
    last_modify_time = dataObj.last_modify_time
    df_smses = dataObj.df_smses
    # Bucket boundaries: now, now-30d, ..., now-180d.
    time_limit_list = [last_modify_time]
    for i in (1, 2, 3, 4, 5, 6):
        time_limit_list.append(last_modify_time -
                               datetime.timedelta(days=30 * i))
    every_month_data = [
        df_smses[(df_smses.time > older) & (df_smses.time <= newer)]
        for newer, older in zip(time_limit_list, time_limit_list[1:])
    ]
    result['median_sms_cnt'] = cal_median(
        [month.shape[0] for month in every_month_data])

    windows = [('90d', 90, every_month_data[0:3]),
               ('180d', 180, every_month_data)]
    for label, window_days, months in windows:
        msg_cnt_list = [month.shape[0] for month in months]
        msg_send_list = [month[month.send_type == 'SEND'].shape[0]
                         for month in months]
        msg_receive_list = [month[month.send_type == 'RECEIVE'].shape[0]
                            for month in months]
        msg_fee_list = [sum(month['fee'].values) for month in months]

        if last_modify_time - datetime.timedelta(window_days) >= open_date:
            month_count = len(months)
        else:
            # Account younger than the window: average over the months it has
            # existed.  ``.days`` is required; a timedelta cannot be added to 1.
            month_count = (last_modify_time - open_date).days // 30 + 1

        send_mean_key = f'{prefix}_msg_send_cnt_mean_{label}'
        receive_mean_key = f'{prefix}_msg_receive_cnt_mean_{label}'
        # Monthly mean number of messages.
        result[f'{prefix}_msg_cnt_mean_{label}'] = division(
            sum(msg_cnt_list), month_count)
        # Monthly mean number of sent messages.
        result[send_mean_key] = division(sum(msg_send_list), month_count)
        # Monthly mean number of received messages.
        result[receive_mean_key] = division(sum(msg_receive_list), month_count)
        # Largest monthly SMS fee.
        result[f'{prefix}_msg_fee_max_month_{label}'] = cal_max(msg_fee_list)
        # Stability of the monthly sent count (std / mean).
        send_std = cal_std(msg_send_list)
        result[f'{prefix}_msg_send_cnt_month_stab_{label}'] = division(
            send_std, result[send_mean_key]) if send_std != '' else ''
        # Stability of the monthly received count (std / mean), based on the
        # received list rather than the sent list.
        receive_std = cal_std(msg_receive_list)
        result[f'{prefix}_msg_receive_cnt_month_stab_{label}'] = division(
            receive_std, result[receive_mean_key]) if receive_std != '' else ''
    return result