# Example 1
def gen_stats_cost_by_non_zero_group(cost, stats_name='mean_mean', size='7d', recompute=False):
    """
    对诊疗次数的统计, 窗口可以是月或全局, 颗粒度天单位
    :param cost: str, 项目名称
    :param stats_name: str, 统计名    :param df_person:
    :param size: str, 下采样时间间隔, 类似'xd',粒度为x天, 或 '1t'粒度为每次
    :param recompute: bool,是否重新计算该特征
    :return:
    """
    # 1 feature name; supported stats_name values:
    # ['len_max', 'len_max_ratio', 'len_mean', 'len_std', 'len_count',
    #  'sum_max', 'sum_max_ratio', 'sum_mean', 'sum_std',
    #  'mean_max', 'mean_std', 'mean_mean']
    feature_name = '{}_{}_by_non_zero_group_{}'.format(stats_name, cost, size)
    if IsAbsense(feature_name) or recompute:
        # 2 compute feature
        print('compute {}'.format(feature_name))
        # 2.1 read data
        train_id, test_id, train_data, test_data = ReadData(Ytrain=False, sort_by_time=True)
        train_test_id = pd.concat([train_id, test_id], axis=0, ignore_index=True)
        train_test_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
        # 2.3 compute the per-person stats DataFrame
        stats_df = train_test_data[['PERSONID', 'CREATETIME', cost]].groupby('PERSONID').apply(
            lambda df_person: stats_cost_by_non_zero_group(df_person, cost, stats_name, size)).to_frame(feature_name).reset_index()
        # 2.4 merge onto the id table
        train_test_id = train_test_id.merge(stats_df, on=['PERSONID'], how='left')
        # 2.5 save the feature (the first 15000 rows are train, the rest are test)
        train_id[feature_name] = train_test_id[feature_name][:15000].values
        test_id[feature_name] = train_test_id[feature_name][15000:].values
        SaveFeature(train_id, test_id, feature_name)
        print('Finished Computing {} \n'.format(feature_name))
        return feature_name, 'gen_stats_cost_by_non_zero_group("{}", "{}", "{}")'.format(cost, stats_name, size)
    else:
        print('The Feature has already been computed \n')
        return feature_name, 'gen_stats_cost_by_non_zero_group("{}", "{}", "{}")'.format(cost, stats_name, size)
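# The per-person helper stats_cost_by_non_zero_group is defined elsewhere in
# the repo; below is a minimal hypothetical sketch of what it plausibly does,
# inferred from the supported stats names above: resample, find runs of
# non-zero bins ("non-zero groups"), aggregate per run, then across runs.
import numpy as np

def stats_cost_by_non_zero_group(df_person, cost, stats_name, size):
    # resample the person's records into `size` bins and sum the cost per bin
    binned = (df_person.set_index('CREATETIME')[cost]
                       .resample(size).sum().fillna(0))
    non_zero = binned > 0
    if not non_zero.any():
        return np.nan
    # label consecutive runs of non-zero bins
    group_id = (non_zero != non_zero.shift()).cumsum()
    per_group = (binned[non_zero].groupby(group_id[non_zero])
                                 .agg(['size', 'sum', 'mean']))
    per_group.columns = ['len', 'sum', 'mean']
    base, agg = stats_name.rsplit('_', 1)       # e.g. 'mean_mean'
    if agg == 'count':                          # 'len_count': number of groups
        return float(per_group.shape[0])
    if agg == 'ratio':                          # e.g. 'sum_max_ratio'
        base, inner = base.rsplit('_', 1)
        return getattr(per_group[base], inner)() / per_group[base].sum()
    return getattr(per_group[base], agg)()      # max / mean / std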
def gen_stats_value_ftr51(stats_name, size='400d'):
    """
    :param stats_name: str,对药品数量进行统计的名字
    :param size: str, 统计的时间粒度 , 7d, 15d, 30d, 45d
    :return:
    """

    feature_name = '{}_ftr51_by_{}'.format(stats_name, size)
    # 0 read data
    train_id, test_id, train_data, test_data = ReadData(Ytrain=False,
                                                        sort_by_time=True)
    train_test_id = pd.concat([train_id, test_id], axis=0, ignore_index=True)
    train_test_data = pd.concat([train_data, test_data],
                                axis=0,
                                ignore_index=True)
    # 1 compute the per-person statistic
    print('1 computing stats value of ftr51 by {}'.format(size))
    ftr51_stats_value_df = train_test_data[[
        'PERSONID', 'CREATETIME', 'FTR51'
    ]].groupby('PERSONID').apply(
        lambda df_person: compute_stats_value_FTR51_by_size(
            df_person, stats_name, size)).to_frame(feature_name).reset_index()
    train_test_id = train_test_id.merge(ftr51_stats_value_df,
                                        on=['PERSONID'],
                                        how='left')
    train_id[feature_name] = train_test_id[feature_name][:15000].values
    test_id[feature_name] = train_test_id[feature_name][15000:].values
    SaveFeature(train_id, test_id, feature_name)
    print('Finished Computing {} \n'.format(feature_name))
    return feature_name, 'gen_stats_value_ftr51("{}", "{}")'.format(
        stats_name, size)
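# Illustrative usage (assumed calling convention): each generator returns the
# feature name plus a call string that reproduces the feature, which the
# experiment logs can store. 'nunique' is an assumed valid stats_name here.
feature_name, call_str = gen_stats_value_ftr51('nunique', size='30d')
print(feature_name)  # nunique_ftr51_by_30d
print(call_str)      # gen_stats_value_ftr51("nunique", "30d")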
# Example 3
def gen_time_feature(stats_name, version='v1', kind='train', agg='mean'):
    """
    :param stats_name:
    :param version:
    :param kind:
    :return:
    """
    # 0 value columns to aggregate
    values = [
        'SBP', 'DBP', 'HEART_RATE_TIMES', 'GLU', 'HEIGHT', 'WEIGHT', 'BMI'
    ]
    # 1 read historical data
    followup = ReadHistData(info='followup_person_info',
                            version=version,
                            kind=kind)
    labels = ReadLabelsData(version, kind)
    # 2 compute features
    labels['stats_dict'] = labels.apply(
        lambda label: compute_time_feature_dict(
            filter_hist_data(label, followup), values, stats_name, agg),
        axis=1)
    v = DictVectorizer()
    stats_matrix = v.fit_transform(labels['stats_dict'].values).toarray()
    value_names = v.get_feature_names()
    feature_names = [
        '{}_{}_{}'.format(value_name, stats_name, agg)
        for value_name in value_names
    ]
    stats_df = pd.DataFrame(data=stats_matrix, columns=feature_names)
    labels = pd.concat([labels, stats_df], axis=1)
    # 3 save features
    for feat in feature_names:
        SaveFeature(labels, feat, version, kind)
    return feature_names
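# For reference, the DictVectorizer step above behaves like this on toy input
# (standard scikit-learn API): every dict key becomes a column, columns are
# sorted by name, and missing keys become 0.
from sklearn.feature_extraction import DictVectorizer

v = DictVectorizer()
m = v.fit_transform([{'SBP': 120.0, 'DBP': 80.0}, {'SBP': 130.0}]).toarray()
print(v.get_feature_names())  # ['DBP', 'SBP']
print(m)                      # [[ 80. 120.] [  0. 130.]]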
def gen_stats_count(stats_name,
                    month='global',
                    size='1d',
                    non_zero=True,
                    recompute=False):
    """
    对诊疗次数的统计, 窗口可以是月或全局, 颗粒度天单位
    :param stats_name: str, 统计名
    :param size: str, 下采样时间间隔, 类似'xd',粒度为x天, 或 '1t'粒度为每次
    :param month: str, 需要统计的时间窗口
    :param non_zero: bool, 只统计非0的时间颗粒
    :param recompute: bool,是否重新计算该特征
    :return:
    """
    # 1 feature name
    feature_name = '{}_count_in_{}_by_{}_{}'.format(stats_name, month, size,
                                                    non_zero)

    if IsAbsense(feature_name) or recompute:
        # 2 compute feature
        print('compute {}'.format(feature_name))
        # 2.1 read data
        train_id, test_id, train_data, test_data = ReadData(Ytrain=False,
                                                            sort_by_time=True)
        train_test_id = pd.concat([train_id, test_id],
                                  axis=0,
                                  ignore_index=True)
        # 2.2 select the data window
        train_data, test_data = SelectDataByMonth(train_data, test_data, month)
        train_test_data = pd.concat([train_data, test_data],
                                    axis=0,
                                    ignore_index=True)
        train_test_data['count'] = 1
        # 2.3 compute the per-person count stats
        stats_df = train_test_data[[
            'PERSONID', 'CREATETIME', 'count'
        ]].groupby('PERSONID').apply(lambda df_person: stats_count_by_size(
            df_person, stats_name, size, non_zero)).to_frame(
                feature_name).reset_index()
        # 2.4 merge onto the id table
        train_test_id = train_test_id.merge(stats_df,
                                            on=['PERSONID'],
                                            how='left')
        count_stats_fillna_by_stats_name(train_test_id, feature_name,
                                         stats_name)
        # 2.5 save the feature
        train_id[feature_name] = train_test_id[feature_name][:15000].values
        test_id[feature_name] = train_test_id[feature_name][15000:].values
        SaveFeature(train_id, test_id, feature_name)
        print('Finished Computing {} \n'.format(feature_name))
        return feature_name, 'gen_stats_count("{}", "{}", "{}", {})'.format(
            stats_name, month, size, non_zero)
    else:
        print('The Feature has already been computed \n')
        return feature_name, 'gen_stats_count("{}", "{}", "{}", {})'.format(
            stats_name, month, size, non_zero)
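# count_stats_fillna_by_stats_name is defined elsewhere; a hypothetical sketch
# of its job: persons with no records in the selected window come out of the
# left merge as NaN, and for count-like stats the natural fill is 0.
def count_stats_fillna_by_stats_name(df, feature_name, stats_name):
    if stats_name in ('len', 'sum', 'count', 'max'):
        df[feature_name] = df[feature_name].fillna(0)
    # mean/std-like stats are left as NaN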
def gen_base_feature2(version, kind='train'):
    """
    Computes age at first follow-up, the gap between confirmation and first
    follow-up, and the gap between the TimePoint and the first follow-up.
    """
    # 1 read historical data
    followup = ReadHistData(info='followup_person_info', version=version, kind=kind)
    labels = ReadLabelsData(version, kind)

    # 2 age at the first follow-up (in days)
    labels = labels.merge(followup.groupby('ID')['DATE_OF_BIRTH'].max().reset_index(), on='ID', how='left')
    first_followup_time_df = followup.groupby('ID')['FOLLOWUP_DATE'].min().to_frame('first_followup_time').reset_index()
    labels = labels.merge(first_followup_time_df, on='ID', how='left')
    labels['first_followup_age'] = (labels['first_followup_time'] - labels['DATE_OF_BIRTH']).dt.days
    SaveFeature(labels, 'first_followup_age', version, kind)

    # 3 days from hypertension confirmation to the first follow-up
    labels = labels.merge(followup.groupby('ID')['CONFIRM_DATE'].max().reset_index(), on='ID', how='left')
    labels['first_followup_time_diff_confirm_time'] = (labels['first_followup_time'] - labels['CONFIRM_DATE']).dt.days
    SaveFeature(labels, 'first_followup_time_diff_confirm_time', version, kind)

    # 4 days from the first follow-up to the TimePoint
    labels['TimePoint_diff_first_followup_time'] = (labels['TimePoint'] - labels['first_followup_time']).dt.days
    SaveFeature(labels, 'TimePoint_diff_first_followup_time', version, kind)
def gen_base_feature(version, kind='train'):
    """
    计算年龄、性别、确认时长
    :param kind:
    :return:
    """
    #  1 读取历史数据
    followup = ReadHistData(info='followup_person_info', version=version, kind=kind)
    labels = ReadLabelsData(version, kind)
    # 1 sex
    labels = labels.merge(followup.groupby('ID')['SEX_CODE'].max().to_frame('SEX_CODE').reset_index(), on=['ID'], how='left')
    SaveFeature(labels, 'SEX_CODE', version, kind)
    # 2 age at the TimePoint (in days)
    labels['age'] = labels.apply(lambda label: compute_age(filter_hist_data(label, followup), label), axis=1)
    SaveFeature(labels, 'age', version, kind)
    # 3 age at hypertension confirmation (in days)
    labels = labels.merge(
        followup.groupby('ID').apply(lambda df_person:
                                     (df_person['CONFIRM_DATE'].max() - df_person['DATE_OF_BIRTH'].max()).days).to_frame('confirm_age').reset_index(), on=['ID'], how='left')
    SaveFeature(labels, 'confirm_age', version, kind)
    # 4 gap between the TimePoint and the confirmation date (in days)
    labels['time_diff_confirm_2TimePoint'] = (labels['age'] - labels['confirm_age'])
    SaveFeature(labels, 'time_diff_confirm_2TimePoint', version, kind)
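# compute_age is defined elsewhere; a hypothetical sketch consistent with the
# code above, where ages are measured in days (time_diff_confirm_2TimePoint is
# the difference of two day counts):
import numpy as np

def compute_age(df_hist, label):
    if df_hist.empty:
        return np.nan
    return (label['TimePoint'] - df_hist['DATE_OF_BIRTH'].max()).days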
def gen_decomposition_stats_vector_from_cat_vector(stats_name, kinds, size='30d', decomp_method='lda', n_components=20):
    """
    :param stats_name: str,对药品数量进行统计的名字
    :param size: str, 统计的时间粒度 1d, 4d, 7d, 15d, 30d, 45d
    :param decomp_method: str, 分解方法
    :param n_components: int , 分解之后的维度
    :return:
    """
    assert decomp_method in ['svd', 'nmf', 'lda']

    stats_matrix_name = '{}_{}_vector_by_{}'.format(stats_name, kinds, size)
    # 0 load the precomputed sparse matrix
    stats_sparse_matrix = sparse.load_npz(get_path() + 'Data/Feature/{}.npz'.format(stats_matrix_name)).toarray()

    if decomp_method == 'svd':
        print(' svd decomposition...')
        svd = TruncatedSVD(n_components=n_components, n_iter=50, random_state=42)
        stats_matrix_decomp = svd.fit_transform(stats_sparse_matrix)

    if decomp_method == 'nmf':
        print(' nmf decomposition...')
        nmf = NMF(n_components=n_components, init='random', random_state=0, max_iter=200)
        stats_matrix_decomp = nmf.fit_transform(stats_sparse_matrix)

    if decomp_method == 'lda':
        print(' lda decomposition...')
        lda = LatentDirichletAllocation(n_components=n_components, max_iter=50,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=0,
                                        n_jobs=-1)
        stats_matrix_decomp = lda.fit_transform(stats_sparse_matrix)

    n = stats_matrix_decomp.shape[1]
    columns = ['{}_{}_{}_vector_by_{}_{}_{}'.format(decomp_method, stats_name, kinds, size, n_components, j) for j in range(n)]
    stats_df = pd.DataFrame(data=stats_matrix_decomp, columns=columns)
    # the first 15000 rows are train, the rest are test
    train = stats_df[:15000].reset_index(drop=True)
    test = stats_df[15000:].reset_index(drop=True)
    for feature in columns:
        SaveFeature(train, test, feature)

    return columns, 'gen_decomposition_stats_vector_from_cat_vector("{}", "{}", "{}", "{}", {})'.format(stats_name, kinds, size, decomp_method, n_components)
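# Illustrative call (assumes the sparse matrix 'count_B_vector_by_30d.npz' was
# produced beforehand by the matching vector generator; note that NMF and LDA
# both require the non-negative matrix loaded above):
columns, call_str = gen_decomposition_stats_vector_from_cat_vector(
    'count', 'B', size='30d', decomp_method='lda', n_components=20)
# -> 20 features named lda_count_B_vector_by_30d_20_0 ... lda_count_B_vector_by_30d_20_19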
def gen_rolling_stats_count(size, stats_name='sumratio2max', recompute=False):
    """
    对诊疗次数进行滑窗统计, 窗口可以是月或全局, 颗粒度天单位
    :param stats_name: str, 统计方法
    :param size: str, 下采样时间间隔, 类似'xd',粒度为x天,
    :param recompute: bool,是否重新计算该特征
    :return:
    """
    # 1 feature name
    feature_name = 'rolling_{}_count_{}'.format(stats_name, size)

    if IsAbsense(feature_name) or recompute:
        # 2 compute feature
        print('compute {}'.format(feature_name))
        # 2.1 read data
        train_id, test_id, train_data, test_data = ReadData(Ytrain=False,
                                                            sort_by_time=True)
        train_test_id = pd.concat([train_id, test_id],
                                  axis=0,
                                  ignore_index=True)
        train_test_data = pd.concat([train_data, test_data],
                                    axis=0,
                                    ignore_index=True)
        train_test_data['count'] = 1
        # 2.3 compute the per-person rolling stats
        stats_df = train_test_data[[
            'PERSONID', 'CREATETIME', 'count'
        ]].groupby('PERSONID').apply(lambda df_person: rolling_stats_count(
            df_person, stats_name, size)).to_frame(feature_name).reset_index()
        # 2.4 merge onto the id table
        train_test_id = train_test_id.merge(stats_df,
                                            on=['PERSONID'],
                                            how='left')
        # 2.5 save the feature
        train_id[feature_name] = train_test_id[feature_name][:15000].values
        test_id[feature_name] = train_test_id[feature_name][15000:].values
        SaveFeature(train_id, test_id, feature_name)
        print('Finished Computing {} \n'.format(feature_name))
        return feature_name, 'gen_rolling_stats_count("{}", "{}")'.format(
            size, stats_name)
    else:
        print('The Feature has already been computed \n')
        return feature_name, 'gen_rolling_stats_count("{}", "{}")'.format(
            size, stats_name)
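# rolling_stats_count is defined elsewhere; a hypothetical sketch for the
# default 'sumratio2max' statistic, read here as "the busiest `size` window's
# share of all visits":
def rolling_stats_count(df_person, stats_name, size):
    daily = (df_person.set_index('CREATETIME')['count']
                      .resample('1d').sum().fillna(0))
    window_sums = daily.rolling(size).sum()
    if stats_name == 'sumratio2max':
        total = daily.sum()
        return window_sums.max() / total if total else 0.0
    raise ValueError('unsupported stats_name: {}'.format(stats_name))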
def gen_action_time_feature(stats_name, version='v2', kind='train'):
    """
    :param stats_name:
    :param version:
    :param kind:
    :return:
    """

    # 1 read historical data
    followup = ReadHistData(info='followup_person_info',
                            version=version,
                            kind=kind)
    labels = ReadLabelsData(version, kind)
    # 2 compute the feature
    labels[stats_name] = labels.apply(
        lambda label: compute_action_time_feature(
            filter_hist_data(label, followup), stats_name),
        axis=1)
    SaveFeature(labels, stats_name, version, kind)
    return stats_name
# Example 10
def gen_missing_ratio(value, version='v1', kind='train'):
    """
    :param value:
    :param version:
    :param kind:
    :return:
    """
    # 0 feature name
    feature_name = '{}_missing_ratio'.format(value)
    # 1 read historical data
    followup = ReadHistData(info='followup_person_info',
                            version=version,
                            kind=kind)
    labels = ReadLabelsData(version, kind)
    # 2 compute the feature
    labels[feature_name] = labels.apply(lambda label: compute_missing_ratio(
        filter_hist_data(label, followup), value),
                                        axis=1)
    # 3 save the feature
    SaveFeature(labels, feature_name, version, kind)
    return
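# compute_missing_ratio is defined elsewhere; a hypothetical sketch, treating
# the -9999/-99999 sentinels used by fill_na below as missing values:
import numpy as np

def compute_missing_ratio(df_hist, value):
    if df_hist.empty:
        return np.nan
    s = df_hist[value].replace([-9999, -99999], np.nan)
    return s.isna().mean()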
def gen_(month, recompute=False):
    """
    Template for a month-window feature; the actual computation is left blank.
    """
    # 1 feature name (to be filled in)
    feature_name = ''

    if IsAbsense(feature_name) or recompute:
        # 2 compute feature
        print('compute {}'.format(feature_name))
        # 2.1 read data
        train_id, test_id, train_data, test_data, Ytrain = ReadData(Ytrain=True)
        train_id['LABEL'] = Ytrain['LABEL'].values
        train_data = train_data.merge(train_id, on=['PERSONID'], how='left')
        # 2.2 select the data window
        train_data, test_data = SelectDataByMonth(train_data, test_data, month)
        # 2.3 compute the feature here (left blank in this template);
        # persons absent in this month get 0
        train_id[feature_name] = train_id[feature_name].fillna(0)
        test_id[feature_name] = test_id[feature_name].fillna(0)
        # 2.4 save the feature
        SaveFeature(train_id, test_id, feature_name)
        print('Finished Computing {} \n'.format(feature_name))
        return feature_name, 'gen_("{}")'.format(month)
    else:
        print('The Feature has already been computed \n')
        return feature_name, 'gen_("{}")'.format(month)
def gen_isolationforest():
    feature_name = 'iso_forest_score'
    # 0 data preparation: reuse the feature list from experiment 43
    log = ReadExperimentLog(43)
    config = log['config']
    Xtrain, Ytrain, Xtest = CombineFeature(config['feature_names'])
    train_test_feature = pd.concat([Xtrain, Xtest], axis=0, ignore_index=True)

    # 1 fit an IsolationForest on train+test; decision_function gives higher
    #   scores to typical samples and lower scores to anomalies
    clf = IsolationForest(n_estimators=500, random_state=42)
    clf.fit(train_test_feature[config['feature_names']].values)
    train_test_feature[feature_name] = clf.decision_function(
        train_test_feature[config['feature_names']].values)

    # 2 split back: the first 15000 rows are train, the rest are test
    Xtrain[feature_name] = train_test_feature[feature_name][:15000].values
    Xtest[feature_name] = train_test_feature[feature_name][15000:].values

    SaveFeature(Xtrain, Xtest, feature_name)

    IsDifferentDistribution(feature_name)

    return
def gen_stats_value_ftr51_in_month(month='month3', stats_name='count_ratio_range'):
    """
    :param stats_name: str,对药品数量进行统计的名字
    :param size: str, 统计的时间粒度 , 7d, 15d, 30d, 45d
    :return:
    """
    # ['nunique', 'nunique_ratio', 'len', 'count_std', 'count_max', 'count_range', 'count_ratio_std', 'count_ratio_max', 'count_ratio_range']
    # pdb.set_trace()
    feature_name = '{}_ftr51_in_{}'.format(stats_name, month)
    # 0 read data
    train_id, test_id, train_data, test_data = ReadData(Ytrain=False, sort_by_time=True)
    train_test_id = pd.concat([train_id, test_id], axis=0, ignore_index=True)
    train_data, test_data = SelectDataByMonth(train_data, test_data, month)
    train_test_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
    # 1 compute the per-person statistic
    print('1 computing stats value of ftr51 in {}'.format(month))
    ftr51_stats_value_df = train_test_data[['PERSONID', 'CREATETIME', 'FTR51']].groupby('PERSONID').apply(
        lambda df_person: compute_stats_value_FTR51_in_month(df_person, stats_name)).to_frame(feature_name).reset_index()
    train_test_id = train_test_id.merge(ftr51_stats_value_df, on=['PERSONID'], how='left')
    train_id[feature_name] = train_test_id[feature_name][:15000].values
    test_id[feature_name] = train_test_id[feature_name][15000:].values
    SaveFeature(train_id, test_id, feature_name)
    print('Finished Computing {} \n'.format(feature_name))
    return feature_name, 'gen_stats_value_ftr51_in_month("{}", "{}")'.format(month, stats_name)
# Example 14
def fill_na(value, train_version='v2', test_version='v2_1'):
    """
    :param value: str, 被预测的指标
    :param train_version: list, 训练回归模型的特征版本
    :param test_version: str, 被预测填充的测试版本
    :return:
    """
    # 1 different test subsets are predicted with different training features
    if test_version == 'v2_1':
        feature_names = [
            'SEX_CODE', 'age', 'confirm_age', 'time_diff_confirm_2TimePoint'
        ]
    elif test_version in ['v2_2', 'v2_3', 'v2_4']:
        feature_names = [
            'SEX_CODE', 'age', 'confirm_age', 'time_diff_confirm_2TimePoint',
            'first_followup_age', 'first_followup_time_diff_confirm_time',
            'TimePoint_diff_first_followup_time'
        ]
    else:
        raise ValueError('unknown test_version: {}'.format(test_version))
    # 2 read labels
    Ytrain = pd.read_pickle(
        get_path_feature() +
        '{}_{}_{}.pkl'.format(train_version, value, 'train'))
    # 3.1 read the train and test sets
    Xtrain = pd.read_pickle(
        get_path_feature() +
        '{}_{}_{}.pkl'.format(train_version, feature_names[0], 'train'))
    print('the shape of Xtrain is', Xtrain.shape)
    Xtest = pd.read_pickle(
        get_path_feature() +
        '{}_{}_{}.pkl'.format(test_version, feature_names[0], 'test'))
    print('the shape of Xtest is', Xtest.shape)
    for feat in feature_names[1:]:
        train_feature = pd.read_pickle(
            get_path_feature() +
            '{}_{}_{}.pkl'.format(train_version, feat, 'train'))
        print('the shape of train feature is', train_feature.shape)
        Xtrain = Xtrain.merge(train_feature,
                              on=['ID', 'TimePoint', 'version'],
                              how='left')
        print('the shape of Xtrain is', Xtrain.shape)
        test_feature = pd.read_pickle(
            get_path_feature() +
            '{}_{}_{}.pkl'.format(test_version, feat, 'test'))
        print('the shape of test feature is', test_feature.shape)
        Xtest = Xtest.merge(test_feature,
                            on=['ID', 'TimePoint', 'version'],
                            how='left')
        print('the shape of Xtest is', Xtest.shape)
    # 3.2 drop samples whose target is a missing-value sentinel
    mask1 = Ytrain[value] != -9999
    mask2 = Ytrain[value] != -99999
    mask = mask1 & mask2
    Xtrain = Xtrain[mask].reset_index(drop=True)
    Ytrain = Ytrain[mask].reset_index(drop=True)

    # 4 data ready; train the regressor and predict
    clf = xgb.XGBRegressor(max_depth=3,
                           learning_rate=0.03,
                           n_estimators=200,
                           silent=True,
                           objective='reg:linear')
    clf.fit(Xtrain[feature_names].values,
            Ytrain[value].values,
            eval_metric='rmse')
    y_pred = clf.predict(Xtest[feature_names].values)

    # 5 if the target column already exists for this test version, fill only
    #   its missing entries; otherwise take the predictions wholesale
    if os.path.exists(get_path_feature() +
                      '{}_{}_{}.pkl'.format(test_version, value, 'test')):
        Ytest = pd.read_pickle(
            get_path_feature() +
            '{}_{}_{}.pkl'.format(test_version, value, 'test'))
        print('the shape of Ytest is', Ytest.shape)
        # fill only the missing entries (rows are assumed aligned with Xtest)
        na_mask = Ytest[value].isna()
        Ytest.loc[na_mask, value] = y_pred[na_mask.values]
        Xtest = Xtest.merge(Ytest,
                            on=['ID', 'TimePoint', 'version'],
                            how='left')
    else:
        Xtest[value] = y_pred

    # 6 prediction finished; save the filled feature
    SaveFeature(feat_df=Xtest,
                feature_name=value,
                version=test_version,
                kind='test')
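# Illustrative call: impute BMI on the v2_2 test subset with a regressor
# trained on the v2 training features ('BMI' is one of the value columns
# listed in gen_time_feature above).
fill_na('BMI', train_version='v2', test_version='v2_2')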
def gen_fraud_ratio_feature(kinds='B'):
    """
    :param kinds: str, 目标编码的 字符, 可以是 ABCDE 或其组合
    :return:
    """
    # 0 read data
    train_id, test_id, train_data, test_data, Ytrain = ReadData(
        Ytrain=True, sort_by_time=True)
    train_id['LABEL'] = Ytrain['LABEL']
    train_data = train_data.merge(train_id, on=['PERSONID'], how='left')
    train_id = train_id.drop(['LABEL'], axis=1)
    # 1 per-person category counts
    df_cat_person_count = train_data[[
        'PERSONID', 'FTR51'
    ]].groupby('PERSONID').apply(lambda df_person: ftr51s2cat_count_dict(
        df_person, kinds)).to_frame('count_dict_person').reset_index()
    train_id = train_id.merge(df_cat_person_count, on=['PERSONID'], how='left')
    # 2 per-person fraud counts
    mask = train_data['LABEL'] == 1
    df_cat_person_fraud = train_data[mask][[
        'PERSONID', 'FTR51'
    ]].groupby('PERSONID').apply(lambda df_person: ftr51s2cat_count_dict(
        df_person, kinds)).to_frame('fraud_dict_person').reset_index()
    train_id = train_id.merge(df_cat_person_fraud, on=['PERSONID'], how='left')
    # ---------------------------------------- subtle bug
    # after the left merge, non-fraud persons have no personal fraud record,
    # so their fraud dicts must be repaired to all-zero dicts instead of NaN
    train_id['fraud_dict_person'] = train_id[[
        'count_dict_person', 'fraud_dict_person'
    ]].apply(lambda x: repair_fraud_dict_person(x), axis=1)
    # ---------------------------------------- subtle bug
    # 3 global counts
    ftr51s_all = ','.join(list(train_data['FTR51'].values))
    count_dict_all = compute_cat_count_dict_from_ftr51s(ftr51s_all, kinds)
    # 4 global fraud counts
    ftr51s_all_fraud = ','.join(list(train_data[mask]['FTR51'].values))
    fraud_dict_all = compute_cat_count_dict_from_ftr51s(
        ftr51s_all_fraud, kinds)
    fraud_dict_all = {
        key: fraud_dict_all.get(key, 0)
        for key in count_dict_all.keys()
    }
    # 5 broadcast the global dicts to every row (all rows share one dict object)
    train_id['count_dict_all'] = [
        count_dict_all for _ in range(train_id.shape[0])
    ]
    train_id['fraud_dict_all'] = [
        fraud_dict_all for _ in range(train_id.shape[0])
    ]
    # 6 out-of-bag (leave-one-person-out) dicts
    train_id['count_dict_oob'] = train_id[[
        'count_dict_all', 'count_dict_person'
    ]].apply(
        lambda s: subtract_dict(s['count_dict_all'], s['count_dict_person']),
        axis=1)

    train_id['fraud_dict_oob'] = train_id[[
        'fraud_dict_all', 'fraud_dict_person'
    ]].apply(
        lambda s: subtract_dict(s['fraud_dict_all'], s['fraud_dict_person']),
        axis=1)
    # 7 per-category fraud-ratio dicts
    # train
    train_id['cat_fraud_ratio_dict_oob'] = train_id[[
        'count_dict_oob', 'fraud_dict_oob'
    ]].apply(lambda s: division_dict(s['count_dict_oob'], s['fraud_dict_oob']),
             axis=1)
    # test
    cat_fraud_ratio_dict_all = division_dict(count_dict_all, fraud_dict_all)
    test_id['cat_fraud_ratio_dict_oob'] = [
        cat_fraud_ratio_dict_all for _ in range(test_id.shape[0])
    ]
    count_dict_person_test = test_data[[
        'PERSONID', 'FTR51'
    ]].groupby('PERSONID').apply(lambda df_person: ftr51s2cat_count_dict(
        df_person, kinds)).to_frame('count_dict_person').reset_index()
    test_id = test_id.merge(count_dict_person_test,
                            on=['PERSONID'],
                            how='left')
    # use .get rather than .setdefault: every test row shares the same dict
    # object, so setdefault would mutate it across rows
    test_id['cat_fraud_ratio_dict_oob'] = test_id.apply(lambda x: {
        key: x['cat_fraud_ratio_dict_oob'].get(key, 0)
        for key in x['count_dict_person'].keys()
    }, axis=1)

    # derive per-person features from the category fraud ratios
    # 8 max_fraud_ratio
    train_id['max_fraud_ratio'] = train_id['cat_fraud_ratio_dict_oob'].map(
        lambda fraud_ratio_dict: pd.Series(fraud_ratio_dict).max())
    test_id['max_fraud_ratio'] = test_id['cat_fraud_ratio_dict_oob'].map(
        lambda fraud_ratio_dict: pd.Series(fraud_ratio_dict).max())

    # 9 sum_fraud_ratio
    train_id['sum_fraud_ratio'] = train_id['cat_fraud_ratio_dict_oob'].map(
        lambda fraud_ratio_dict: pd.Series(fraud_ratio_dict).sum())
    test_id['sum_fraud_ratio'] = test_id['cat_fraud_ratio_dict_oob'].map(
        lambda fraud_ratio_dict: pd.Series(fraud_ratio_dict).sum())

    # 10 mean_fraud_ratio
    train_id['mean_fraud_ratio'] = train_id['cat_fraud_ratio_dict_oob'].map(
        lambda fraud_ratio_dict: pd.Series(fraud_ratio_dict).mean())
    test_id['mean_fraud_ratio'] = test_id['cat_fraud_ratio_dict_oob'].map(
        lambda fraud_ratio_dict: pd.Series(fraud_ratio_dict).mean())

    # 11 save the features and check train/test distributions
    for feat in ['max_fraud_ratio', 'sum_fraud_ratio', 'mean_fraud_ratio']:
        SaveFeature(train_id, test_id, feat)
        IsDifferentDistribution(feat)
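# The dict helpers assumed above are defined elsewhere; hypothetical minimal
# versions that make the out-of-bag target encoding explicit:
def subtract_dict(d_all, d_person):
    # oob counts: global totals minus this person's own counts, so a person's
    # encoding never sees their own label
    return {k: d_all.get(k, 0) - d_person.get(k, 0) for k in d_all}

def division_dict(count_dict, fraud_dict):
    # per-category fraud ratio, guarding against division by zero
    return {k: (fraud_dict.get(k, 0) / count_dict[k]) if count_dict[k] else 0.0
            for k in count_dict}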
# Example 16
def gen_decomposition_stats_vector_ftr51(stats_name,
                                         size='7d',
                                         non_zero=False,
                                         decomp_method='lda',
                                         n_components=5):
    """
    :param stats_name: str,对药品数量进行统计的名字
    :param size: str, 统计的时间粒度 1d, 4d, 7d, 15d, 30d, 45d
    :param non_zero: bool, 统计是否非0
    :param decomp_method: str, 分解方法
    :param n_components: int , 分解之后的维度
    :return:
    """
    assert decomp_method in ['svd', 'nmf', 'lda']
    # sum/max style stats are incompatible with non_zero
    assert not ((stats_name in ['sum', 'max', 'sum_ratio', 'max_ratio']) and non_zero)
    matrix_name = '{}_vector_ftr51_by_{}_{}'.format(stats_name, size, non_zero)
    # 0 load the precomputed sparse matrix
    ftr51_stats_sparse_matrix = sparse.load_npz(
        get_path() + 'Data/Feature/{}.npz'.format(matrix_name)).toarray()

    if decomp_method == 'svd':
        print(' svd decomposition...')
        svd = TruncatedSVD(n_components=n_components,
                           n_iter=50,
                           random_state=42)
        ftr51_stats_matrix_decomp = svd.fit_transform(
            ftr51_stats_sparse_matrix)

    if decomp_method == 'nmf':
        print(' nmf decomposition...')
        nmf = NMF(n_components=n_components,
                  init='random',
                  random_state=0,
                  max_iter=200)
        ftr51_stats_matrix_decomp = nmf.fit_transform(
            ftr51_stats_sparse_matrix)

    if decomp_method == 'lda':
        print(' lda decomposition...')
        lda = LatentDirichletAllocation(n_components=n_components,
                                        max_iter=50,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=0,
                                        n_jobs=1)
        ftr51_stats_matrix_decomp = lda.fit_transform(
            ftr51_stats_sparse_matrix)
        joblib.dump(lda, "lda_{}_{}.m".format(stats_name, size))

    columns = [
        '{}_{}_vector_by_{}_{}_{}_{}'.format(decomp_method, stats_name, size,
                                             non_zero, n_components, j)
        for j in range(ftr51_stats_matrix_decomp.shape[1])
    ]
    stats_df = pd.DataFrame(data=ftr51_stats_matrix_decomp, columns=columns)
    # the first 15000 rows are train, the rest are test
    train = stats_df[:15000].reset_index(drop=True)
    test = stats_df[15000:].reset_index(drop=True)
    for feature in columns:
        SaveFeature(train, test, feature)

    return columns, 'gen_decomposition_stats_vector_ftr51("{}", "{}", {}, "{}", {})'.format(
        stats_name, size, non_zero, decomp_method, n_components)
# Example 17
def gen_fraud_ratio_feature(kinds='E', stats_name='fraud_ratio_mean_weight'):
    """
    计算一个人所有的cat, 计算cat oob 的count, fraud, 例如某欺诈用户如果B1一次记录出现两次,则B1 fraud +2, count +2,
    利用count, fraud 计算统计值
    :param kinds: str, 目标编码的 字符, 可以是 ABCDE 或其组合
    :return:
    """
    feature_name = '{}_{}'.format(stats_name, kinds)
    print('computing feature {}'.format(feature_name))
    # 0 read data
    train_id, test_id, train_data, test_data, Ytrain = ReadData(
        Ytrain=True, sort_by_time=True)
    train_id['LABEL'] = Ytrain['LABEL']
    train_data = train_data.merge(train_id, on=['PERSONID'], how='left')
    train_id = train_id.drop(['LABEL'], axis=1)
    # 1 per-person category counts
    df_cat_person_count = train_data[[
        'PERSONID', 'FTR51'
    ]].groupby('PERSONID').apply(lambda df_person: ftr51s2cat_count_dict(
        df_person, kinds)).to_frame('count_dict_person').reset_index()
    train_id = train_id.merge(df_cat_person_count, on=['PERSONID'], how='left')
    # 2 per-person fraud counts
    mask = train_data['LABEL'] == 1
    df_cat_person_fraud = train_data[mask][[
        'PERSONID', 'FTR51'
    ]].groupby('PERSONID').apply(lambda df_person: ftr51s2cat_count_dict(
        df_person, kinds)).to_frame('fraud_dict_person').reset_index()
    train_id = train_id.merge(df_cat_person_fraud, on=['PERSONID'], how='left')
    # ---------------------------------------- subtle bug
    # after the left merge, non-fraud persons have no personal fraud record,
    # so their fraud dicts must be repaired to all-zero dicts instead of NaN
    train_id['fraud_dict_person'] = train_id[[
        'count_dict_person', 'fraud_dict_person'
    ]].apply(lambda x: repair_fraud_dict_person(x), axis=1)
    # ---------------------------------------- subtle bug
    # 3 global counts
    ftr51s_all = ','.join(list(train_data['FTR51'].values))
    count_dict_all = compute_cat_count_dict_from_ftr51s(ftr51s_all, kinds)
    # 4 global fraud counts
    ftr51s_all_fraud = ','.join(list(train_data[mask]['FTR51'].values))
    fraud_dict_all = compute_cat_count_dict_from_ftr51s(
        ftr51s_all_fraud, kinds)
    fraud_dict_all = {
        key: fraud_dict_all.get(key, 0)
        for key in count_dict_all.keys()
    }
    # 5 broadcast the global dicts to every row (all rows share one dict object)
    train_id['count_dict_all'] = [
        count_dict_all for _ in range(train_id.shape[0])
    ]
    train_id['fraud_dict_all'] = [
        fraud_dict_all for _ in range(train_id.shape[0])
    ]

    # 6 out-of-bag (leave-one-person-out) dicts
    train_id['count_dict_oob'] = train_id[[
        'count_dict_all', 'count_dict_person'
    ]].apply(
        lambda s: subtract_dict(s['count_dict_all'], s['count_dict_person']),
        axis=1)

    train_id['fraud_dict_oob'] = train_id[[
        'fraud_dict_all', 'fraud_dict_person'
    ]].apply(
        lambda s: subtract_dict(s['fraud_dict_all'], s['fraud_dict_person']),
        axis=1)

    count_dict_person_test = test_data[[
        'PERSONID', 'FTR51'
    ]].groupby('PERSONID').apply(lambda df_person: ftr51s2cat_count_dict(
        df_person, kinds)).to_frame('count_dict_person').reset_index()
    test_id = test_id.merge(count_dict_person_test,
                            on=['PERSONID'],
                            how='left')
    test_id['fraud_dict_oob'] = [
        fraud_dict_all for _ in range(test_id.shape[0])
    ]
    test_id['count_dict_oob'] = [
        count_dict_all for _ in range(test_id.shape[0])
    ]

    # use .get rather than .setdefault: every test row shares the same dict
    # object, so setdefault would mutate it across rows
    test_id['count_dict_oob'] = test_id.apply(lambda x: {
        key: x['count_dict_oob'].get(key, 0)
        for key in x['count_dict_person'].keys()
    }, axis=1)
    test_id['fraud_dict_oob'] = test_id.apply(lambda x: {
        key: x['fraud_dict_oob'].get(key, 0)
        for key in x['count_dict_person'].keys()
    }, axis=1)

    # 7 compute the statistic from the oob dicts

    train_id[feature_name] = train_id.apply(
        lambda s: stats_by_oob_dict(s, stats_name), axis=1)
    test_id[feature_name] = test_id.apply(
        lambda s: stats_by_oob_dict(s, stats_name), axis=1)
    SaveFeature(train_id, test_id, feature_name)
    IsDifferentDistribution(feature_name)
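# stats_by_oob_dict is defined elsewhere; a hypothetical sketch for the
# 'fraud_ratio_mean_weight' statistic: each category's oob fraud ratio,
# weighted by how often this person used that category.
def stats_by_oob_dict(s, stats_name):
    counts = s['count_dict_person']
    ratios = {k: (s['fraud_dict_oob'].get(k, 0) / s['count_dict_oob'][k])
                 if s['count_dict_oob'].get(k, 0) else 0.0
              for k in counts}
    if stats_name == 'fraud_ratio_mean_weight':
        total = sum(counts.values())
        return (sum(ratios[k] * counts[k] for k in counts) / total
                if total else 0.0)
    raise ValueError('unsupported stats_name: {}'.format(stats_name))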