import os

import joblib
import matplotlib.pyplot as plt
import pandas as pd
from dateutil import relativedelta

# Note: the column-name resolvers (get_right_ts_name, get_user_id_name,
# get_target_name, get_code_name) and clip_continuous_f are assumed to be
# defined elsewhere in this package.


def get_all_times(events, right_ts=None):
    '''Returns a single-column dataframe with all unique, sorted
    timestamps found in events.'''
    right_ts = get_right_ts_name(right_ts=right_ts)
    time_index = pd.DataFrame(sorted(events[right_ts].unique()),
                              columns=[right_ts])
    del events
    return time_index
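# Illustrative usage (a sketch; assumes an events dataframe whose timestamp
# column matches the get_right_ts_name default):
#
#   time_index = get_all_times(events)    # one row per unique period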
def load_data(input_path, pref=None, right_ts=None, user_id=None, target=None,
              cols=None, sample_s=None, random_s=None, verbose=True):
    '''
    Loads and concatenates dataframes stored as pickles.

    Parameters:
    > pref: filename prefix used to filter which pickles are loaded.
    > cols: columns to keep.
    > sample_s: number of unique users to sample.
    > random_s: random state used when sampling users.
    '''
    right_ts = get_right_ts_name(right_ts=right_ts)
    target = get_target_name(target=target)
    user_id = get_user_id_name(user_id=user_id)
    if verbose:
        print('Loading data...')
    file_list = sorted(os.listdir(input_path))
    df_list = []
    for file in file_list:
        # Check for a missing prefix first to avoid calling startswith(None)
        c1 = (pref is None) or file.startswith(pref)
        if c1 and file.endswith('.pkl'):
            temp = joblib.load(os.path.join(input_path, file))
            if cols:
                df_list.append(temp[cols].copy())
            else:
                df_list.append(temp.copy())
            if verbose:
                print('> File "{}" loaded.'.format(file))
            del temp
    df = pd.concat(df_list).sort_values([user_id, right_ts])
    if verbose:
        print()
        print('> Removing user-periods with null target...')
    df = df.loc[df[target].notnull()]
    if sample_s:
        if verbose:
            print('> Selecting sample data...')
            print()
        sample_ids = df[user_id].drop_duplicates() \
            .sample(sample_s, random_state=random_s)
        df = df.loc[df[user_id].isin(sample_ids)]
    if verbose:
        print('> Unique users ({}): {}'.format(user_id,
                                               df[user_id].unique().size))
        aux = df[right_ts].dt.date.unique()
        print('> Periods: {} ({} - {})'.format(aux.size, aux.min(), aux.max()))
        print('> User-periods: {}'.format(len(df)))
    return df
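# Illustrative usage (a sketch; './data/' and the 'events_' prefix are
# hypothetical):
#
#   df = load_data('./data/', pref='events_', sample_s=1000, random_s=42)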
def presequence_padding(df0, val, right_ts=None, user_id=None, target=None,
                        verbose=True):
    '''
    Transforms data such that each sequence has a length equal to a
    multiple of a chosen value. Padded user-periods are marked with
    weight 0; observed user-periods get weight 1.
    '''
    right_ts = get_right_ts_name(right_ts=right_ts)
    target = get_target_name(target=target)
    user_id = get_user_id_name(user_id=user_id)
    df = df0.copy()
    # Map each timestamp to an integer position
    unique_ts = sorted(df[right_ts].unique())
    d = dict(zip(unique_ts, range(len(unique_ts))))
    ts_list = sorted(list(d.keys()))
    df['time_i'] = df[right_ts].map(d)
    user_len = df[user_id].value_counts().to_frame(name='len0')
    u = user_len['len0'].max()
    # Round each length up to the next multiple of val, capped at the
    # longest observed sequence
    user_len['len1'] = (val * (((user_len['len0'] - 1) // val) + 1)) \
        .clip(upper=u)
    if verbose:
        print('Padding sequence lengths...')
        print('> Sequence length multiple: {}'.format(val))
        fs = user_len['len0'].unique().size
        print('> Initial sequence lengths: {}'.format(fs))
        fs = user_len['len1'].unique().size
        print('> Final sequence lengths: {}'.format(fs))
        print('> Creating "indexes" (user_id & time-stamp)...')
    USER_TS = []
    for length in sorted(user_len['len1'].unique()):
        uids = user_len.loc[user_len['len1'] == length].index
        temp = df.loc[df[user_id].isin(uids)].copy()
        for uid in uids:
            # Take the `length` periods ending at the user's last
            # observation (assumes that observation falls at least
            # `length - 1` periods after the first global period)
            mti = temp.loc[temp[user_id] == uid, 'time_i'].max()
            ts = ts_list[(mti - length + 1):(mti + 1)]
            user_ts = pd.DataFrame({user_id: uid, right_ts: ts})
            USER_TS.append(user_ts)
    if verbose:
        print('> Merging "indexes" with session data...')
    df = pd.concat(USER_TS).merge(df, on=[right_ts, user_id], how='left')
    df.drop(columns=['time_i'], inplace=True)
    df['weight'] = df[target].notnull().astype(int)
    del USER_TS
    return df
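# Illustrative usage (a sketch; val=4 pads every user's sequence up to a
# multiple of four periods, e.g. for batching fixed-length model inputs):
#
#   df_padded = presequence_padding(df, val=4)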
def adding_target(e, periods, p2p, test=False, user_id=None, right_ts=None):
    '''Adds, in place, a binary 'target' column to e: 1.0 for the p2p
    periods leading up to each cancer user's maximum date, 0.0 otherwise.'''
    user_id = get_user_id_name(user_id=user_id)
    right_ts = get_right_ts_name(right_ts=right_ts)
    # Select users whose period ends in cancer; in test mode, additionally
    # require dat_can >= temp_can (masks are combined before indexing so
    # the boolean indexer stays aligned with periods)
    m = periods['end_type'] == 'cancer'
    if test:
        m &= (periods['dat_can'] >= periods['temp_can'])
    target = periods.loc[m].copy()
    dat_max = 'test_dat_max' if test else 'train_dat_max'
    # First period counted as positive: dat_max minus (p2p - 1) months
    target['c_year'] = target[dat_max].apply(
        lambda x: x - relativedelta.relativedelta(months=p2p - 1))
    cancers_year = target.set_index(user_id)['c_year'].to_dict()
    e['c_year'] = e[user_id].map(cancers_year).copy()
    e['target'] = (e[right_ts] >= e['c_year']).astype(float)
    e.drop(['c_year'], axis=1, inplace=True)
    del target
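# Illustrative usage (a sketch; `events` and `periods` are hypothetical
# dataframes carrying the columns referenced above):
#
#   adding_target(events, periods, p2p=12)   # mutates `events` in place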
def expand(df, time_index, user_id=None, right_ts=None):
    '''Returns a dataframe where every user has one row per date in
    time_index; dates without events are filled with missing values.'''
    user_id = get_user_id_name(user_id=user_id)
    right_ts = get_right_ts_name(right_ts=right_ts)
    df.reset_index(inplace=True)
    # Build the full (user, period) index
    df_expanded = []
    for i in df[user_id].unique():
        ti = time_index.copy()
        ti[user_id] = i
        df_expanded.append(ti)
    df_expanded = pd.concat(df_expanded, axis=0)
    # Merge the full index with the event data
    df_expanded = df_expanded.merge(df, on=[user_id, right_ts], how='left',
                                    copy=False)
    del df
    return df_expanded
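# Illustrative usage (a sketch; combines expand with get_all_times so every
# user covers the full period range):
#
#   df_full = expand(df, get_all_times(df))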
def pivoting_events(events, agg_function, user_id=None, right_ts=None,
                    code=None, verbose=True):
    '''Pivots the event log into a (user, period) x code matrix,
    aggregating events with agg_function.'''
    user_id = get_user_id_name(user_id=user_id)
    right_ts = get_right_ts_name(right_ts=right_ts)
    code = get_code_name(code=code)
    input_data = pd.pivot_table(events, index=[user_id, right_ts],
                                columns=code, aggfunc=agg_function)
    # Flatten the (value, code) column MultiIndex, keeping only the code
    input_data.columns = [str(s2) for (s1, s2) in input_data.columns.tolist()]
    if verbose:
        print('> Size: {}'.format(input_data.size))
        print('> Shape: {}'.format(input_data.shape))
    del events
    return input_data
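# Illustrative usage (a sketch; counts occurrences of each code per
# user-period, assuming events has at least one value column besides the
# index and code columns):
#
#   X = pivoting_events(events, agg_function='count')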
def trend_enrichment(df0, col, right_ts=None, q2cl=0, q2cu=1, neg_val=True,
                     drop_clip=True, show_plot=True, bins=20):
    '''
    Enriches dataset with global trend data for the chosen continuous
    feature.

    Parameters:
    > q2cl: quantile value used to lower clip the feature.
    > q2cu: quantile value used to upper clip the feature.
      By default, no clipping takes place.
    > neg_val: If True, negative values are coherent for col.
    > drop_clip: If True, the clipped col is excluded from the resulting
      dataset.

    The added columns are:
    > col_clip: clipped col (optional).
    > col_mean: global mean of col per period.
    > col_mean_diff: difference of the global mean between periods.
    '''
    right_ts = get_right_ts_name(right_ts=right_ts)
    df = df0.copy()
    # Clip continuous feature
    df[col + '_clip'], se1 = clip_continuous_f(df[col], q2cl=q2cl, q2cu=q2cu,
                                               neg_val=neg_val,
                                               return_se1=True)
    # Compute trend over active (weight == 1) user-periods
    active_users = df.loc[df['weight'] == 1][right_ts].value_counts()
    global_col = df.loc[df['weight'] == 1] \
        .groupby(right_ts)[col + '_clip'].sum()
    trend = pd.concat([active_users, global_col], axis=1).sort_index()
    trend.columns = ['users', col]
    trend[col + '_mean'] = trend[col] / trend['users']
    trend[col + '_mean_diff'] = trend[col + '_mean'].diff().fillna(0)
    for c in [col + '_mean', col + '_mean_diff']:
        d = trend[c].to_dict()
        df[c] = df[right_ts].map(d)
    if drop_clip:
        df.drop(columns=col + '_clip', inplace=True)
    if show_plot:
        fig, axs = plt.subplots(1, 2, figsize=(15, 5))
        ax0 = se1.hist(ax=axs[0], bins=bins)
        ax0.set_title('User-periods with {} != 0'.format(col))
        ax0.set_xlabel('(Clipped) {}'.format(col))
        ax0.set_ylabel('User-periods')
        ax1 = trend[col + '_mean'].plot(ax=axs[1])
        ax1.set_title('Periodic {} per user'.format(col))
        ax1.set_ylabel('(Clipped) {}'.format(col))
    del se1, active_users, global_col, trend, d
    return df
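# Illustrative usage (a sketch; requires the 'weight' column created by
# presequence_padding, and 'spend' is a hypothetical continuous feature):
#
#   df = trend_enrichment(df, col='spend', q2cu=0.99, show_plot=False)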