def convert_to_npy(df=None, save=True, modalities=None):
    """Stack per-user modality rows into a single NumPy array.

    For every user, the rows of the ``'cpm'`` modality that have fewer than
    ``16 * 12`` missing values are kept. A user contributes to the output
    only when at least 120 such rows remain. For a contributing user, the
    rows of each requested modality that share an index label with a kept
    ``'cpm'`` row are placed side by side (aligned on the index) and
    reshaped so that the modalities form the last axis.

    Args:
        df: Frame with ``'user'`` and ``'modality'`` columns plus the value
            columns. When ``None`` it is read from
            ``<project_dir>/data/interim/data.parq`` and indexed by
            ``'date'``; this relies on a module-level ``project_dir``.
        save: When true, the result is also written to
            ``<project_dir>/data/interim/data.npy``.
        modalities: Modality names to extract, in output order. When
            ``None``, every modality present in ``df`` is used in sorted
            order.

    Returns:
        Array of shape ``(rows, columns, len(modalities))``, where
        ``columns`` is the number of columns left after dropping
        ``'modality'`` (289 in the layout the previous hard-coded reshape
        assumed) and ``rows`` is the total number of kept rows over all
        contributing users.

    Raises:
        ValueError: If no user has enough data to contribute.
        KeyError: If a contributing user has no rows for a requested
            modality.
    """
    if df is None:
        df = ParquetFile(
            os.path.join(project_dir, 'data', 'interim',
                         'data.parq')).to_pandas().set_index('date')

    if modalities is None:
        # Previously this case crashed with "NoneType is not iterable".
        modalities = sorted(df['modality'].dropna().unique())
    else:
        # Materialise once so a one-shot iterator survives the user loop.
        modalities = list(modalities)

    user_data = []
    for _, group in df.groupby('user'):
        # Select activity
        activity = group[group['modality'] == 'cpm']

        # Keep rows with fewer than 16 * 12 missing values (described in the
        # original code as "require 8 hours of data").
        activity = activity[pd.isnull(activity).sum(axis=1) < (16 * 12)]

        # Users with fewer than 120 usable activity rows are skipped.
        if activity['modality'].count() < 120:
            continue

        # Restrict every modality to the index labels of the kept rows.
        group = group.loc[activity.index.tolist()]

        by_modality = group.groupby('modality')
        frames = [
            by_modality.get_group(modality).drop(['modality'], axis=1)
            for modality in modalities
        ]

        # We concatenate on dates to ensure the same dimension across
        # modalities.
        stacked = pd.concat(frames, axis=1)
        # Every frame has the same columns, so the per-modality width is the
        # total width divided by the number of frames.
        width = stacked.shape[1] // len(frames)
        user_data.append(
            stacked.values.reshape(-1, len(frames),
                                   width).transpose(0, 2, 1))

    if not user_data:
        raise ValueError(
            "no user has at least 120 'cpm' rows with enough data")

    data = np.concatenate(user_data, axis=0)
    if save:
        np.save(os.path.join(project_dir, 'data', 'interim', 'data.npy'),
                data)

    return data
예제 #2
0
def load_all():
    """Load the interim parquet data and count rows per modality and user.

    Reads ``<project_dir>/data/interim/data.parq``, indexes it by ``'date'``
    and builds ``data_size``: the number of rows for every
    ``(modality, user)`` pair, pivoted so that modalities are rows and
    users are columns.

    NOTE(review): the visible body ends after computing ``data_size`` and
    returns nothing; the rest of the function may be missing from this
    snippet -- confirm against the original file.
    """
    parquet_path = os.path.join(project_dir, 'data', 'interim', 'data.parq')
    df = ParquetFile(parquet_path).to_pandas()
    df = df.set_index('date')

    pair_counts = df.groupby(['modality', 'user']).size()
    data_size = pair_counts.unstack()
예제 #3
0
    p = sns.heatmap(np.nan_to_num(data[:, :, 0]))
    plt.show(p)


def npy_heatmap():
    """Show a heatmap of channel 0 of the saved ``data.npy`` array.

    Loads ``<project_dir>/data/interim/data.npy`` (relies on a module-level
    ``project_dir``), casts it to ``float32``, drops the last entry along
    axis 1, prints the resulting shape and draws a seaborn heatmap of the
    first 100 entries along axis 0 for index 0 of the last axis.
    """
    data = np.load(os.path.join(project_dir, 'data', 'interim',
                                'data.npy')).astype(np.float32)[:, :-1]
    print(data.shape)
    sns.heatmap(data[:100, :, 0])
    # pyplot.show() displays all open figures; its only parameter, `block`,
    # is keyword-only, so the Axes must not be passed positionally.
    plt.show()


if __name__ == '__main__':
    # Project root: two directory levels above this file.
    project_dir = os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)

    # Load the data for user 194 only.
    # NOTE(review): depending on the fastparquet version, `filters` may only
    # prune whole row groups rather than individual rows -- confirm that no
    # other users remain in `df`.
    df = ParquetFile(
        os.path.join(project_dir, 'data', 'interim',
                     'data.parq')).to_pandas(filters=[('user', '==',
                                                       194)]).set_index('date')

    # One frame per modality (in sorted modality order), values only.
    modality_data = [
        m_group.drop(['modality', 'user'], axis=1)
        for _, m_group in df.groupby('modality')
    ]

    # We concatenate on dates to ensure the same dimension across modalities
    wide = pd.concat(modality_data, axis=1)
    # Derive the reshape dimensions from the data; the original hard-coded
    # them as 6 modalities of 288 columns each.
    n_modalities = len(modality_data)
    n_columns = wide.shape[1] // n_modalities
    stacked = wide.values.reshape(-1, n_modalities, n_columns)

    # Left: last modality taken from the reshaped array; right: the same
    # modality straight from its frame, for visual comparison.
    fig, ax = plt.subplots(ncols=2, figsize=(10, 30))
    sns.heatmap(stacked[:, -1, :], ax=ax[0])
    sns.heatmap(modality_data[-1], ax=ax[1])
    # pyplot.show() only accepts the keyword-only `block` argument; passing
    # the figure positionally is not supported.
    plt.show()