Пример #1
0
def get_all_times(events, right_ts=None):
    '''
    Builds the time index: a one-column dataframe holding every distinct
    time-stamp found in the events, in ascending order.

    Parameters:
     > events: dataframe containing the time-stamp column.
     > right_ts: name of the time-stamp column, resolved through
       get_right_ts_name (presumably a project default when None).
    '''

    ts_col = get_right_ts_name(right_ts=right_ts)
    distinct_ts = events[ts_col].unique()
    return pd.DataFrame(sorted(distinct_ts), columns=[ts_col])
Пример #2
0
def load_data(input_path,
              pref=None,
              right_ts=None,
              user_id=None,
              target=None,
              cols=None,
              sample_s=None,
              random_s=None,
              verbose=True):
    '''
    Loads and concatenates dataframes stored as pickles.

    Every "*.pkl" file of input_path (optionally restricted to a file name
    prefix) is loaded, the frames are concatenated and sorted by user and
    time-stamp, rows with a null target are removed and, optionally, a
    random sample of users is kept.

    Parameters:
     > input_path: directory holding the pickled dataframes.
     > pref: file name prefix to filter on; None loads every ".pkl" file.
     > right_ts, user_id, target: column names, resolved through the
       get_*_name helpers.
     > cols: columns to keep from each file; a falsy value keeps all of
       them. Must include the user id, time-stamp and target columns,
       since they are used below.
     > sample_s: number of distinct users to sample; falsy means no
       sampling.
     > random_s: random state (seed) used when sampling users.
     > verbose: if True, progress and summary information is printed.

    Returns: the resulting dataframe.

    Raises: ValueError when no file matches.
    '''

    right_ts = get_right_ts_name(right_ts=right_ts)
    target = get_target_name(target=target)
    user_id = get_user_id_name(user_id=user_id)

    if verbose:
        print('Loading data...')
    df_list = []

    for file in sorted(os.listdir(input_path)):
        if not file.endswith('.pkl'):
            continue
        # Test for None first: str.startswith(None) raises TypeError.
        if pref is not None and not file.startswith(pref):
            continue
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code; only point this function at trusted directories.
        # os.path.join works with or without a trailing separator.
        temp = joblib.load(os.path.join(input_path, file))
        # The loaded frame is not shared with anybody and pd.concat copies
        # the data anyway, so no defensive copy is needed here.
        df_list.append(temp[cols] if cols else temp)
        if verbose:
            print('> File "{}" loaded.'.format(file))

    if not df_list:
        raise ValueError(
            'No ".pkl" file matching prefix {!r} found in "{}".'.format(
                pref, input_path))

    df = pd.concat(df_list).sort_values([user_id, right_ts])
    del df_list

    if verbose:
        print()
        print('> Removing user-periods with null target...')
    df = df.loc[df[target].notnull()]

    if sample_s:
        if verbose:
            print('> Selecting sample data...')
            print()
        sample_ids = df[user_id].drop_duplicates() \
            .sample(sample_s, random_state=random_s)
        df = df.loc[df[user_id].isin(sample_ids)]

    if verbose:
        print('> Unique users ({}): {}'.format(user_id,
                                               df[user_id].unique().size))
        aux = df[right_ts].dt.date.unique()
        print('> Periods: {} ({} - {})'.format(aux.size, aux.min(), aux.max()))
        print('> User-periods: {}'.format(len(df)))

    return df
Пример #3
0
def presequence_padding(df0,
                        val,
                        right_ts=None,
                        user_id=None,
                        target=None,
                        verbose=True):
    '''
    Transforms data such that each sequence has a length equal to a multiple
    of a chosen value.

    Each user's sequence is extended backwards in time, from the user's last
    observed time-stamp, up to the next multiple of val (capped at the
    longest observed sequence). The rows added this way carry no data.

    Parameters:
     > df0: input dataframe (left untouched; a copy is processed).
     > val: the sequence length multiple.
     > right_ts, user_id, target: column names, resolved through the
       get_*_name helpers.
     > verbose: if True, progress information is printed.

    Returns: the padded dataframe, with an extra "weight" column equal to 1
    where the target is not null and 0 otherwise (e.g. on padded rows).
    '''

    right_ts = get_right_ts_name(right_ts=right_ts)
    target = get_target_name(target=target)
    user_id = get_user_id_name(user_id=user_id)

    df = df0.copy()

    # Position of every distinct time-stamp on the global, sorted timeline.
    ts_list = sorted(df[right_ts].unique())
    d = dict(zip(ts_list, range(len(ts_list))))
    df['time_i'] = df[right_ts].map(d)

    user_len = df[user_id].value_counts().to_frame(name='len0')
    u = user_len['len0'].max()
    # Round each length up to a multiple of val, never exceeding the longest
    # sequence. Series.clip_upper was removed in pandas 1.0; clip(upper=...)
    # is the supported spelling and is also available in older versions.
    user_len['len1'] = (val *
                        (((user_len['len0'] - 1) // val) + 1)).clip(upper=u)

    if verbose:
        print('Padding sequence lengths...')
        print('> Sequence length multiple: {}'.format(val))
        fs = user_len['len0'].unique().size
        print('> Initial sequence lengths: {}'.format(fs))
        fs = user_len['len1'].unique().size
        print('> Final sequence lengths:   {}'.format(fs))
        print('> Creating "indexes" (user_id & time-stamp)...')

    # Last timeline position of every user, computed in a single pass
    # instead of one boolean scan of the data per user.
    last_ti = df.groupby(user_id)['time_i'].max()

    USER_TS = []

    for length in sorted(user_len['len1'].unique()):
        uids = user_len.loc[user_len['len1'] == length].index

        for uid in uids:
            mti = last_ti.loc[uid]
            # NOTE(review): when mti - length + 1 is negative (not enough
            # time-stamps before the user's last one) the slice start wraps
            # around and the window can come out empty, which drops the user
            # from the result. Behavior kept as is; confirm it is intended.
            ts = ts_list[(mti - length + 1):(mti + 1)]
            user_ts = pd.DataFrame({user_id: uid, right_ts: ts})
            USER_TS.append(user_ts)

    if verbose:
        print('> Merging "indexes" with session data...')
    # Left merge: time-stamps added by the padding get null data columns.
    df = pd.concat(USER_TS).merge(df, on=[right_ts, user_id], how='left')

    df.drop(columns=['time_i'], inplace=True)
    df['weight'] = df[target].notnull().astype(int)

    return df
Пример #4
0
def adding_target(e, periods, p2p, test=False, user_id=None, right_ts=None):
    '''
    Adds a "target" column to e, in place (nothing is returned).

    For every period whose end_type is "cancer", a threshold date is
    computed as the period's reference date minus (p2p - 1) months. A row
    of e gets target 1.0 when its time-stamp is on or after the threshold
    of its user, 0.0 otherwise (including users without a threshold).

    Parameters:
     > e: dataframe to label; modified in place.
     > periods: dataframe with the columns end_type, dat_can, temp_can and
       train_dat_max / test_dat_max, plus the user id column.
     > p2p: number of months; p2p - 1 months are subtracted from the
       reference date.
     > test: if True, only periods with dat_can >= temp_can are used and
       the reference date is test_dat_max instead of train_dat_max.
     > user_id, right_ts: column names, resolved through the get_*_name
       helpers.
    '''

    uid_col = get_user_id_name(user_id=user_id)
    ts_col = get_right_ts_name(right_ts=right_ts)

    is_cancer = periods['end_type'] == 'cancer'
    can_after_temp = periods['dat_can'] >= periods['temp_can']

    cancer_periods = periods.loc[is_cancer].copy()
    if test:
        cancer_periods = cancer_periods.loc[can_after_temp].copy()
    ref_col = 'test_dat_max' if test else 'train_dat_max'

    lead = relativedelta.relativedelta(months=p2p - 1)
    cancer_periods['c_year'] = cancer_periods[ref_col].apply(
        lambda ref_date: ref_date - lead)

    threshold_by_user = cancer_periods.set_index(uid_col)['c_year'].to_dict()

    # 'c_year' is only a scratch column on e; it is removed again below.
    e['c_year'] = e[uid_col].map(threshold_by_user).copy()
    e['target'] = (e[ts_col] >= e['c_year']).astype(float)
    e.drop(columns=['c_year'], inplace=True)
Пример #5
0
def expand(df, time_index, user_id=None, right_ts=None):
    '''
    Returns: a dataframe holding one row per (user, time-stamp of
    time_index) pair; pairs that are absent from df are filled with
    missing data.

    Parameters:
     > df: event data. Note that its index is reset in place, so the
       caller's dataframe is modified.
     > time_index: dataframe with the time-stamps every user must have.
     > user_id, right_ts: column names, resolved through the get_*_name
       helpers.
    '''

    uid_col = get_user_id_name(user_id=user_id)
    ts_col = get_right_ts_name(right_ts=right_ts)

    df.reset_index(inplace=True)

    def _index_for(uid):
        # Copy of the whole time index, labelled with a single user.
        labelled = time_index.copy()
        labelled[uid_col] = uid
        return labelled

    per_user = [_index_for(uid) for uid in df[uid_col].unique()]
    full_index = pd.concat(per_user, axis=0)

    # Left merge keeps every (user, time-stamp) pair of the full index.
    return full_index.merge(df,
                            on=[uid_col, ts_col],
                            how='left',
                            copy=False)
Пример #6
0
def pivoting_events(events,
                    agg_function,
                    user_id=None,
                    right_ts=None,
                    code=None,
                    verbose=True):
    '''
    Pivots event data into one row per (user, time-stamp) and one column
    per event code.

    Parameters:
     > events: dataframe with the user id, time-stamp and code columns plus
       the value column(s) to aggregate.
     > agg_function: aggregation passed to pd.pivot_table as aggfunc.
     > user_id, right_ts, code: column names, resolved through the
       get_*_name helpers.
     > verbose: if True, the size and shape of the result are printed.

    Returns: the pivoted dataframe, indexed by (user id, time-stamp), with
    the codes (converted to str) as column names.
    '''

    user_id = get_user_id_name(user_id=user_id)
    right_ts = get_right_ts_name(right_ts=right_ts)
    code = get_code_name(code=code)

    # pivot_table already indexes the result by [user_id, right_ts], so no
    # reset_index / set_index round trip is required afterwards.
    input_data = pd.pivot_table(events,
                                index=[user_id, right_ts],
                                columns=code,
                                aggfunc=agg_function)

    # Without an explicit "values" argument the columns are
    # (value column, code) pairs; only the code is kept as column name.
    input_data.columns = [str(code_value)
                          for (_, code_value) in input_data.columns.tolist()]

    if verbose:
        print(input_data.size)
        print(input_data.shape)
    return input_data
Пример #7
0
def trend_enrichment(df0,
                     col,
                     right_ts=None,
                     q2cl=0,
                     q2cu=1,
                     neg_val=True,
                     drop_clip=True,
                     show_plot=True,
                     bins=20):
    '''
    Adds the global per-period trend of a continuous feature to the
    dataset. The input dataframe is not modified; an enriched copy is
    returned.

    Only rows with weight == 1 take part in the trend computation.

    Parameters:
     > df0: input dataframe; must contain col, "weight" and the time-stamp
       column.
     > col: name of the continuous feature.
     > right_ts: name of the time-stamp column, resolved through
       get_right_ts_name.
     > q2cl: quantile used as lower clipping bound (forwarded to
       clip_continuous_f).
     > q2cu: quantile used as upper clipping bound (forwarded to
       clip_continuous_f). The defaults (0 and 1) are meant to disable
       clipping.
     > neg_val: True when negative values make sense for col (forwarded to
       clip_continuous_f).
     > drop_clip: if True, the clipped feature is removed from the result.
     > show_plot: if True, a histogram and the trend line are drawn.
     > bins: number of bins of the histogram.

    Columns added:
     > col_clip: clipped col (only kept when drop_clip is False).
     > col_mean: per period, sum of the clipped col divided by the number
       of rows with weight == 1.
     > col_mean_diff: change of col_mean with respect to the previous
       period (0 for the first period).
    '''

    ts_col = get_right_ts_name(right_ts=right_ts)

    clip_col = col + '_clip'
    mean_col = col + '_mean'
    diff_col = col + '_mean_diff'

    df = df0.copy()

    # Clipped feature, plus the auxiliary series used for the histogram
    # (presumably the non-zero clipped values -- see clip_continuous_f).
    clipped, hist_values = clip_continuous_f(df[col],
                                             q2cl=q2cl,
                                             q2cu=q2cu,
                                             neg_val=neg_val,
                                             return_se1=True)
    df[clip_col] = clipped

    # Per-period trend over the rows with weight == 1.
    weighted = df.loc[df['weight'] == 1]
    users_per_period = weighted[ts_col].value_counts()
    total_per_period = weighted.groupby(ts_col)[clip_col].sum()
    trend = pd.concat([users_per_period, total_per_period],
                      axis=1).sort_index()
    trend.columns = ['users', col]
    trend[mean_col] = trend[col] / trend['users']
    trend[diff_col] = trend[mean_col].diff().fillna(0)

    # Broadcast the per-period values back onto every row.
    for trend_col in (mean_col, diff_col):
        per_period = trend[trend_col].to_dict()
        df[trend_col] = df[ts_col].map(per_period)

    if drop_clip:
        df.drop(columns=clip_col, inplace=True)

    if show_plot:
        _, axes = plt.subplots(1, 2, figsize=(15, 5))

        hist_ax = hist_values.hist(ax=axes[0], bins=bins)
        hist_ax.set_title('User-periods with {} != 0'.format(col))
        hist_ax.set_xlabel('(Clipped) {}'.format(col))
        hist_ax.set_ylabel('User-periods')

        line_ax = trend[mean_col].plot(ax=axes[1])
        line_ax.set_title('Periodic {} per user'.format(col))
        line_ax.set_ylabel('(Clipped) {}'.format(col))

    return df