Пример #1
0
def gen_time_feature(stats_name, version='v1', kind='train', agg='mean'):
    """Build and save the aggregated time features for every tracked value.

    For each label row the follow-up history is passed through
    ``filter_hist_data`` and then ``compute_time_feature_dict``, whose result
    is expected to be a dict (it is fed to ``DictVectorizer``).  The dicts are
    vectorised into one column per key and every column is saved through
    ``SaveFeature``.

    :param stats_name: statistic name, forwarded to
        ``compute_time_feature_dict`` and embedded in the feature names
    :param version: data version handed to the readers and to ``SaveFeature``
    :param kind: data split handed to the readers and to ``SaveFeature``
    :param agg: aggregation name, forwarded to ``compute_time_feature_dict``
        and embedded in the feature names
    :return: list of saved feature names, each formatted as
        ``<vectorizer key>_<stats_name>_<agg>``
    """
    # 0 Values the statistics are computed over
    values = [
        'SBP', 'DBP', 'HEART_RATE_TIMES', 'GLU', 'HEIGHT', 'WEIGHT', 'BMI'
    ]
    # 1 Read the historical data
    followup = ReadHistData(info='followup_person_info',
                            version=version,
                            kind=kind)
    labels = ReadLabelsData(version, kind)
    # 2 Compute the features
    labels['stats_dict'] = labels.apply(
        lambda label: compute_time_feature_dict(
            filter_hist_data(label, followup), values, stats_name, agg),
        axis=1)
    v = DictVectorizer()
    stats_matrix = v.fit_transform(labels['stats_dict'].values).toarray()
    # get_feature_names() no longer exists in recent scikit-learn releases;
    # use its replacement when available and fall back on older versions.
    if hasattr(v, 'get_feature_names_out'):
        value_names = v.get_feature_names_out()
    else:
        value_names = v.get_feature_names()
    feature_names = [
        '{}_{}_{}'.format(value_name, stats_name, agg)
        for value_name in value_names
    ]
    # Reuse the labels index: concat(axis=1) aligns on index, so a fresh
    # RangeIndex would misalign rows whenever labels is not 0..n-1 indexed.
    stats_df = pd.DataFrame(data=stats_matrix,
                            columns=feature_names,
                            index=labels.index)
    labels = pd.concat([labels, stats_df], axis=1)
    # 3 Save the features
    for feat in feature_names:
        SaveFeature(labels, feat, version, kind)
    return feature_names
def gen_time_base_feature(version, kind):
    """Load the label table and the follow-up history for one data split.

    Both tables are read and then left unused; nothing is computed or saved
    here and the function returns ``None``.

    :param version: data version handed to both readers
    :param kind: data split handed to both readers
    :return: None
    """
    # Same read order as before: labels first, then the follow-up history.
    label_table = ReadLabelsData(version=version, kind=kind)
    followup_table = ReadHistData(info='followup_person_info',
                                  version=version,
                                  kind=kind)
def gen_action_time_feature(stats_name, version='v2', kind='train'):
    """Compute one action-time feature per label row and save it.

    :param stats_name: statistic passed to ``compute_action_time_feature``;
        also the name of the column that is written and saved
    :param version: data version handed to the readers and to ``SaveFeature``
    :param kind: data split handed to the readers and to ``SaveFeature``
    :return: ``stats_name``, unchanged
    """
    # 1 Read the historical data
    history = ReadHistData(info='followup_person_info',
                           version=version,
                           kind=kind)
    labels = ReadLabelsData(version, kind)

    # 2 Compute the feature row by row
    def _feature_for(row):
        row_history = filter_hist_data(row, history)
        return compute_action_time_feature(row_history, stats_name)

    labels[stats_name] = labels.apply(_feature_for, axis=1)

    # 3 Save the feature
    SaveFeature(labels, stats_name, version, kind)
    return stats_name
def gen_base_feature2(version, kind='train'):
    """Derive and save three features based on each ID's first follow-up date.

    Saved columns (all day counts taken from ``.dt.days``):

    * ``first_followup_age`` - first follow-up date minus date of birth
    * ``first_followup_time_diff_confirm_time`` - first follow-up date minus
      confirm date
    * ``TimePoint_diff_first_followup_time`` - ``TimePoint`` minus first
      follow-up date

    :param version: data version handed to the readers and to ``SaveFeature``
    :param kind: data split handed to the readers and to ``SaveFeature``
    :return: None
    """
    # 1 Read the historical data
    history = ReadHistData(info='followup_person_info', version=version, kind=kind)
    labels = ReadLabelsData(version, kind)
    per_id = history.groupby('ID')

    # 2 Age at the first follow-up
    birth_df = per_id['DATE_OF_BIRTH'].max().reset_index()
    labels = labels.merge(birth_df, on='ID', how='left')
    first_visit_df = per_id['FOLLOWUP_DATE'].min().to_frame('first_followup_time').reset_index()
    labels = labels.merge(first_visit_df, on='ID', how='left')
    age_delta = labels['first_followup_time'] - labels['DATE_OF_BIRTH']
    labels['first_followup_age'] = age_delta.dt.days
    SaveFeature(labels, 'first_followup_age', version, kind)

    # 3 Time between the first follow-up and the hypertension confirm date
    confirm_df = per_id['CONFIRM_DATE'].max().reset_index()
    labels = labels.merge(confirm_df, on='ID', how='left')
    confirm_delta = labels['first_followup_time'] - labels['CONFIRM_DATE']
    labels['first_followup_time_diff_confirm_time'] = confirm_delta.dt.days
    SaveFeature(labels, 'first_followup_time_diff_confirm_time', version, kind)

    # 4 Time between the label time point and the first follow-up
    since_first_delta = labels['TimePoint'] - labels['first_followup_time']
    labels['TimePoint_diff_first_followup_time'] = since_first_delta.dt.days
    SaveFeature(labels, 'TimePoint_diff_first_followup_time', version, kind)
def run_hist_data_type(version, kind):
    """Tag every label row with a history data type and pickle one mask per type.

    ``compute_hist_data_type`` is applied to the filtered follow-up history of
    each label row and stored in ``labels['data_type']``.  For each of the
    codes ``'1'``..``'4'`` a boolean mask over the label rows is written to
    ``get_path_labels() + '<version>_mask_<code>_<kind>.pkl'``.

    :param version: data version handed to the readers and used in file names
    :param kind: data split handed to the readers and used in file names
    :return: None
    """
    hist_data = ReadHistData(version=version,
                             info='followup_person_info',
                             kind=kind)
    labels = ReadLabelsData(version=version, kind=kind)
    labels['data_type'] = labels.apply(
        lambda label: compute_hist_data_type(
            filter_hist_data(label=label, followup=hist_data)),
        axis=1)

    # Build the masks from the per-label column computed above.  The earlier
    # code compared hist_data['date_type'], which left labels['data_type']
    # unused and produced masks that were not indexed like the labels.
    # NOTE(review): assumes compute_hist_data_type returns the string codes
    # '1'..'4' - confirm against its implementation.
    for data_type in ['1', '2', '3', '4']:
        mask = labels['data_type'] == data_type
        mask.to_pickle(
            get_path_labels() +
            '{}_mask_{}_{}.pkl'.format(version, data_type, kind))
Пример #6
0
def gen_missing_ratio(value, version='v1', kind='train'):
    """Compute and save the ``<value>_missing_ratio`` feature for each label row.

    :param value: name forwarded to ``compute_missing_ratio`` and used as the
        prefix of the saved feature name
    :param version: data version handed to the readers and to ``SaveFeature``
    :param kind: data split handed to the readers and to ``SaveFeature``
    :return: None
    """
    # 1 Read the historical data
    history = ReadHistData(info='followup_person_info',
                           version=version,
                           kind=kind)
    labels = ReadLabelsData(version, kind)

    # 2 Compute the feature row by row
    def _ratio_for(row):
        row_history = filter_hist_data(row, history)
        return compute_missing_ratio(row_history, value)

    column = '{}_missing_ratio'.format(value)
    labels[column] = labels.apply(_ratio_for, axis=1)

    # 3 Save the feature
    SaveFeature(labels, column, version, kind)
def gen_base_feature(version, kind='train'):
    """Compute and save the sex, age and confirmation-related base features.

    Saved columns, in order: ``SEX_CODE``, ``age``, ``confirm_age`` and
    ``time_diff_confirm_2TimePoint`` (``age`` minus ``confirm_age``).

    :param version: data version handed to the readers and to ``SaveFeature``
    :param kind: data split handed to the readers and to ``SaveFeature``
    :return: None
    """
    # 0 Read the historical data
    history = ReadHistData(info='followup_person_info', version=version, kind=kind)
    labels = ReadLabelsData(version, kind)

    def _age_at_time_point(row):
        return compute_age(filter_hist_data(row, history), row)

    def _confirm_age(person_df):
        # Days between the latest confirm date and the latest birth date.
        span = person_df['CONFIRM_DATE'].max() - person_df['DATE_OF_BIRTH'].max()
        return span.days

    # 1 Sex
    sex_df = history.groupby('ID')['SEX_CODE'].max().to_frame('SEX_CODE').reset_index()
    labels = labels.merge(sex_df, on=['ID'], how='left')
    SaveFeature(labels, 'SEX_CODE', version, kind)

    # 2 Age at the label time point
    labels['age'] = labels.apply(_age_at_time_point, axis=1)
    SaveFeature(labels, 'age', version, kind)

    # 3 Age when hypertension was confirmed
    confirm_age_df = history.groupby('ID').apply(_confirm_age).to_frame('confirm_age').reset_index()
    labels = labels.merge(confirm_age_df, on=['ID'], how='left')
    SaveFeature(labels, 'confirm_age', version, kind)

    # 4 Difference between the time point and the confirmation time
    labels['time_diff_confirm_2TimePoint'] = labels['age'] - labels['confirm_age']
    SaveFeature(labels, 'time_diff_confirm_2TimePoint', version, kind)