def gen_fraud_ratio_feature(kinds='B'):
    """Build and save per-person max/sum/mean category fraud-ratio features.

    Every training record is tagged with its person's ``LABEL``.  FTR51
    categories of the requested ``kinds`` are counted per person and over the
    whole training set, once for all records and once for fraud records
    (``LABEL == 1``) only.  For training persons the person's own counts are
    passed to ``subtract_dict`` together with the global counts before the
    ratio is taken (the "oob" dicts) -- presumably so that a person's own
    label does not leak into their feature; confirm against ``subtract_dict``.
    Test persons use the ratio dict built from the full training set,
    restricted to the categories they have, with 0 for categories that the
    ratio dict does not contain.

    The columns ``max_fraud_ratio``, ``sum_fraud_ratio`` and
    ``mean_fraud_ratio`` are then handed to ``SaveFeature`` and
    ``IsDifferentDistribution``.

    :param kinds: str, the category letters to target-encode; may be any of
        ABCDE or a combination of them
    :return: None
    """

    def person_count_dicts(records, column):
        # One row per PERSONID; `column` holds what ftr51s2cat_count_dict
        # returns for that person's records (presumably category -> count).
        counts = records[['PERSONID', 'FTR51']].groupby('PERSONID').apply(
            lambda df_person: ftr51s2cat_count_dict(df_person, kinds))
        return counts.to_frame(column).reset_index()

    # 0 Read the data and attach each person's label to their records.
    train_id, test_id, train_data, test_data, Ytrain = ReadData(
        Ytrain=True, sort_by_time=True)
    train_id['LABEL'] = Ytrain['LABEL']
    train_data = train_data.merge(train_id, on=['PERSONID'], how='left')
    train_id = train_id.drop(['LABEL'], axis=1)

    # 1 Per-person category counts.
    train_id = train_id.merge(
        person_count_dicts(train_data, 'count_dict_person'),
        on=['PERSONID'], how='left')

    # 2 Per-person category counts over fraud records only.
    fraud_data = train_data[train_data['LABEL'] == 1]
    train_id = train_id.merge(
        person_count_dicts(fraud_data, 'fraud_dict_person'),
        on=['PERSONID'], how='left')
    # Subtle bug fixed by the original author: non-fraud persons have no row
    # in the fraud frame, so the left merge leaves them without a fraud dict.
    # repair_fraud_dict_person fills that gap (per the original note, their
    # fraud values should all be 0).
    train_id['fraud_dict_person'] = train_id[[
        'count_dict_person', 'fraud_dict_person'
    ]].apply(repair_fraud_dict_person, axis=1)

    # 3 Counts over all training records.
    count_dict_all = compute_cat_count_dict_from_ftr51s(
        ','.join(train_data['FTR51'].values), kinds)

    # 4 Counts over all fraud records, aligned to the keys of count_dict_all.
    fraud_counts_seen = compute_cat_count_dict_from_ftr51s(
        ','.join(fraud_data['FTR51'].values), kinds)
    # .get() rather than .setdefault(): this is a pure lookup, there is no
    # reason to write the default back into the source dict.
    fraud_dict_all = {
        key: fraud_counts_seen.get(key, 0)
        for key in count_dict_all.keys()
    }

    # 5 Broadcast the global dicts to every training row (shared references).
    n_train = train_id.shape[0]
    train_id['count_dict_all'] = [count_dict_all] * n_train
    train_id['fraud_dict_all'] = [fraud_dict_all] * n_train

    # 6 oob dicts: global counts combined with the person's own counts.
    train_id['count_dict_oob'] = [
        subtract_dict(count_dict_all, person_counts)
        for person_counts in train_id['count_dict_person']
    ]
    train_id['fraud_dict_oob'] = [
        subtract_dict(fraud_dict_all, person_fraud)
        for person_fraud in train_id['fraud_dict_person']
    ]

    # 7 Category fraud-ratio dicts.
    # train: ratio from the oob counts.
    train_id['cat_fraud_ratio_dict_oob'] = [
        division_dict(count_oob, fraud_oob)
        for count_oob, fraud_oob in zip(train_id['count_dict_oob'],
                                        train_id['fraud_dict_oob'])
    ]
    # test: ratio from the full training set, restricted per person below.
    cat_fraud_ratio_dict_all = division_dict(count_dict_all, fraud_dict_all)
    # Create the column before the merge so that it keeps the position it
    # had in the original column order.
    test_id['cat_fraud_ratio_dict_oob'] = (
        [cat_fraud_ratio_dict_all] * test_id.shape[0])
    test_id = test_id.merge(
        person_count_dicts(test_data, 'count_dict_person'),
        on=['PERSONID'], how='left')
    # Keep only the categories the test person actually has; categories the
    # ratio dict does not know get 0.  .get() leaves the shared ratio dict
    # untouched, whereas .setdefault() grew it while rows were processed.
    test_id['cat_fraud_ratio_dict_oob'] = [
        {
            key: cat_fraud_ratio_dict_all.get(key, 0)
            for key in person_counts.keys()
        }
        for person_counts in test_id['count_dict_person']
    ]

    # 8-10 Turn each person's ratio dict into max / sum / mean features.
    aggregations = (
        ('max_fraud_ratio', pd.Series.max),
        ('sum_fraud_ratio', pd.Series.sum),
        ('mean_fraud_ratio', pd.Series.mean),
    )
    for feat, aggregate in aggregations:
        for id_frame in (train_id, test_id):
            id_frame[feat] = id_frame['cat_fraud_ratio_dict_oob'].map(
                lambda fraud_ratio_dict, aggregate=aggregate: aggregate(
                    pd.Series(fraud_ratio_dict)))

    # 11 Save the features and check their train/test distributions.
    for feat, _ in aggregations:
        SaveFeature(train_id, test_id, feat)
        IsDifferentDistribution(feat)
# Example #2
# 0
def gen_fraud_ratio_feature(kinds='E', stats_name='fraud_ratio_mean_weight'):
    """Build and save one per-person statistic from oob category counts.

    For every person, all categories of the requested ``kinds`` found in
    their FTR51 records are collected, together with each category's "oob"
    total count and fraud count.  Per the original author's example: if B1
    appears twice in one record of a fraud user, B1 gets fraud +2 and
    count +2.  ``stats_by_oob_dict`` then turns each row's count and fraud
    dicts into the statistic selected by ``stats_name``.

    For training persons the oob dicts come from ``subtract_dict`` applied to
    the global dict and the person's own dict.  For test persons they are the
    global training dicts restricted to the categories the person has, with 0
    for categories missing from the training dicts.

    The resulting column, named ``'<stats_name>_<kinds>'``, is handed to
    ``SaveFeature`` and ``IsDifferentDistribution``.

    :param kinds: str, the category letters to target-encode; may be any of
        ABCDE or a combination of them
    :param stats_name: str, name of the statistic, forwarded to
        ``stats_by_oob_dict`` and used as the feature-name prefix
    :return: None
    """
    feature_name = '{}_{}'.format(stats_name, kinds)
    print('computing feature {}'.format(feature_name))

    def person_count_dicts(records, column):
        # One row per PERSONID; `column` holds what ftr51s2cat_count_dict
        # returns for that person's records (presumably category -> count).
        counts = records[['PERSONID', 'FTR51']].groupby('PERSONID').apply(
            lambda df_person: ftr51s2cat_count_dict(df_person, kinds))
        return counts.to_frame(column).reset_index()

    # 0 Read the data and attach each person's label to their records.
    train_id, test_id, train_data, test_data, Ytrain = ReadData(
        Ytrain=True, sort_by_time=True)
    train_id['LABEL'] = Ytrain['LABEL']
    train_data = train_data.merge(train_id, on=['PERSONID'], how='left')
    train_id = train_id.drop(['LABEL'], axis=1)

    # 1 Per-person category counts.
    train_id = train_id.merge(
        person_count_dicts(train_data, 'count_dict_person'),
        on=['PERSONID'], how='left')

    # 2 Per-person category counts over fraud records only.
    fraud_data = train_data[train_data['LABEL'] == 1]
    train_id = train_id.merge(
        person_count_dicts(fraud_data, 'fraud_dict_person'),
        on=['PERSONID'], how='left')
    # Subtle bug fixed by the original author: non-fraud persons have no row
    # in the fraud frame, so the left merge leaves them without a fraud dict.
    # repair_fraud_dict_person fills that gap (per the original note, their
    # fraud values should all be 0).
    train_id['fraud_dict_person'] = train_id[[
        'count_dict_person', 'fraud_dict_person'
    ]].apply(repair_fraud_dict_person, axis=1)

    # 3 Counts over all training records.
    count_dict_all = compute_cat_count_dict_from_ftr51s(
        ','.join(train_data['FTR51'].values), kinds)

    # 4 Counts over all fraud records, aligned to the keys of count_dict_all.
    fraud_counts_seen = compute_cat_count_dict_from_ftr51s(
        ','.join(fraud_data['FTR51'].values), kinds)
    # .get() rather than .setdefault(): this is a pure lookup, there is no
    # reason to write the default back into the source dict.
    fraud_dict_all = {
        key: fraud_counts_seen.get(key, 0)
        for key in count_dict_all.keys()
    }

    # 5 Broadcast the global dicts to every training row (shared references).
    n_train = train_id.shape[0]
    train_id['count_dict_all'] = [count_dict_all] * n_train
    train_id['fraud_dict_all'] = [fraud_dict_all] * n_train

    # 6 oob dicts for train: global counts combined with the person's own.
    train_id['count_dict_oob'] = [
        subtract_dict(count_dict_all, person_counts)
        for person_counts in train_id['count_dict_person']
    ]
    train_id['fraud_dict_oob'] = [
        subtract_dict(fraud_dict_all, person_fraud)
        for person_fraud in train_id['fraud_dict_person']
    ]

    # oob dicts for test: the global training dicts restricted to the
    # categories each test person has; unknown categories get 0.
    test_id = test_id.merge(
        person_count_dicts(test_data, 'count_dict_person'),
        on=['PERSONID'], how='left')
    # .get() instead of .setdefault(): the global dicts are the very objects
    # stored in every train_id row, and setdefault() would add test-only
    # categories to them right before the train statistics are computed.
    test_id['fraud_dict_oob'] = [
        {key: fraud_dict_all.get(key, 0) for key in person_counts.keys()}
        for person_counts in test_id['count_dict_person']
    ]
    test_id['count_dict_oob'] = [
        {key: count_dict_all.get(key, 0) for key in person_counts.keys()}
        for person_counts in test_id['count_dict_person']
    ]

    # Compute the requested statistic row by row, then save it and check the
    # train/test distribution.
    train_id[feature_name] = train_id.apply(
        lambda row: stats_by_oob_dict(row, stats_name), axis=1)
    test_id[feature_name] = test_id.apply(
        lambda row: stats_by_oob_dict(row, stats_name), axis=1)
    SaveFeature(train_id, test_id, feature_name)
    IsDifferentDistribution(feature_name)