def statistics(step_name, emb_name, by, units=1, data_path=DATA_PATH):
    """
    Aggregates a stepframe into per-bucket statistics and saves the result.

    :param step_name: filename of the stepframe
    :param emb_name: name under which the resulting frame will be saved
    :param by: bucket granularity, either 'hour' or 'minute'
    :param units: number of consecutive buckets to merge (equal-sized buckets only)
    :param data_path: optional, if none utils.storage.DATA_PATH will be used
    """
    df = load_frame(step_name, data_path)

    print("WARNING: this function supports equal-sized buckets only")

    if by == 'hour':
        groupeddf = df.groupby(['day', 'hour']).apply(get_stats).reset_index()
    elif by == 'minute':
        groupeddf = df.groupby(['day', 'hour',
                                'minute']).apply(get_stats).reset_index()
    else:
        raise ValueError("'by' must be either 'hour' or 'minute'")

    if units > 1:
        # merge every `units` consecutive rows into a single bucket
        ret_df = groupeddf.groupby(groupeddf.index // units).sum()
        ret_df['day'] = ret_df.day.apply(lambda x: int(x / units))

        if 'hour' in ret_df.columns:
            ret_df['hour'] = ret_df.hour.apply(lambda x: int(x / units))

        if 'minute' in ret_df.columns:
            ret_df['minute'] = ret_df.minute.apply(lambda x: int(x / units))
    else:
        ret_df = groupeddf

    dump_frame(ret_df, name=emb_name)
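
# Minimal usage sketch for statistics(); the frame names below are hypothetical
# placeholders for files in the data folder.
# statistics('dzne_steps', 'hourly_stats', by='hour')              # one bucket per hour
# statistics('dzne_steps', 'hourly_stats_2h', by='hour', units=2)  # merge two hour buckets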
    def __init__(self, vec_name, hourly_name='hourly_dzne_dsp', desc_name='dzne_desc', threshold=25, in_path=DATA_PATH, out_path=DATA_PATH):
        """
        :param vec_name: filename
        :param hourly_name: filename of hourly aggregated steps
        :param desc_name: filename of attribute descriptions
        :param threshold: threshold value stored on the instance
        :param in_path: optional, if none utils.storage.DATA_PATH will be used
        :param out_path: optional, if none utils.storage.DATA_PATH will be used
        """
        super().__init__(vec_name, in_path, out_path)
        self.threshold = threshold
        hourly = load_frame(hourly_name)
        desc = load_frame(desc_name)
        # the hourly frame holds 7 days per user, so repeat the sex column 7 times
        hourly['sex'] = list(desc.sex) * 7
        m_average = []
        f_average = []
        for hour in range(24):
            m_average.append(hourly[(hourly.sex == 'm')][str(hour)].mean())
            f_average.append(hourly[(hourly.sex == 'f')][str(hour)].mean())
        # absolute male/female difference of the mean step count for each hour
        self.dif = [abs(m - f) for m, f in zip(m_average, f_average)]
    def __init__(self, vec_name, data_path=DATA_PATH, out_path=DATA_PATH):
        """
        Loads a vecframe from file and checks for the correct format.
        :param vec_name: filename
        :param data_path: optional, if none utils.storage.DATA_PATH will be used
        :param out_path: optional, if none utils.storage.DATA_PATH will be used
        """
        super().__init__()
        self.vec_name = vec_name
        self.data_path = data_path
        self.out_path = out_path
        vecframe = load_frame(vec_name, data_path)
        check_if_vecframe(vecframe)
        self.vecframe = vecframe
    def __init__(self,
                 vec_name,
                 desc_name='dzne_desc',
                 threshold=25,
                 in_path=DATA_PATH,
                 out_path=DATA_PATH):
        """
        :param vec_name: filename
        :param desc_name: filename of attribute descriptions
        :param threshold: threshold value stored on the instance
        :param in_path: optional, if none utils.storage.DATA_PATH will be used
        :param out_path: optional, if none utils.storage.DATA_PATH will be used
        """
        super().__init__(vec_name, in_path, out_path)
        self.threshold = threshold
        desc = load_frame(desc_name)
        # column-wise means for male and female users, skipping 'user' and 'desc'
        m_average = self.vecframe[desc.sex == 'm'].mean()[2:]
        f_average = self.vecframe[desc.sex == 'f'].mean()[2:]
        self.dif = [abs(m - f) for m, f in zip(m_average, f_average)]
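
# Hedged usage sketch: the snippet above is only a constructor, so the class
# name 'SexDifferenceWeights' is a hypothetical placeholder.
# weights = SexDifferenceWeights('dzne_dsp', threshold=25)
# weights.dif  # absolute male/female difference of the column means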
    def __init__(self, vf_fname, time_dim=0, in_datapath=DATA_PATH):
        """
        Loads the vecframe containing the compressed embeddings from file.
        :param vf_fname: filename
        :param time_dim: if non-zero, load the frame as a 3d numpy array instead
        :param in_datapath: optional, if none utils.storage.DATA_PATH will be used
        """
        super().__init__()

        self.vf_fname = vf_fname
        self.in_datapath = in_datapath

        if not time_dim:
            vecframe = load_frame(vf_fname, in_datapath)
            check_if_vecframe(vecframe)
            self.vecframe = vecframe
        else:
            self.vecframe = load_frame_as_3d_nparray(vf_fname,
                                                     data_path=in_datapath)
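
# Hedged usage sketch; 'EmbeddingLoader' is a hypothetical placeholder for the
# class this constructor belongs to.
# loader_2d = EmbeddingLoader('1minute_emb_dsp')              # plain vecframe
# loader_3d = EmbeddingLoader('1minute_emb_dsp', time_dim=1)  # 3d numpy array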
def make_weekly_vecframe(step_name, vec_name='{}_week', data_path=DATA_PATH):
    '''
    Transforms a stepframe into a vecframe without splitting the data.
    'desc' will always be 0.

    :param step_name: name of the stepframe
    :param vec_name: name under which vecframe will be saved
    :param data_path: optional, path to data folder
    :return:
    '''
    stepframe = load_frame(step_name, data_path)
    vecframe = stepframe.loc[:, '0':].transpose()
    vecframe.columns = [str(col) for col in vecframe.columns]
    vecframe['user'] = vecframe.index
    vecframe['user'] = vecframe['user'].apply(int)
    vecframe['desc'] = [0] * vecframe.shape[0]
    # move the 'user' and 'desc' columns to the front
    cols = list(vecframe.columns)
    vecframe = vecframe[cols[-2:] + cols[:-2]]
    # vecframe = vecframe.reset_index(drop=True)
    check_if_vecframe(vecframe)
    dump_frame(vecframe, vec_name.format(step_name), data_path)
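
# Usage sketch: with the default vec_name pattern, make_weekly_vecframe('dzne_steps')
# would save the result as 'dzne_steps_week' ('dzne_steps' is a hypothetical name).
# make_weekly_vecframe('dzne_steps')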
def daySplitter(step_name, data_path=DATA_PATH):
    """
    Splits entries into days and saves the result as a vecframe named
    '<step_name>_dsp'.
    """
    stepframe = load_frame(step_name, data_path)
    check_if_stepframe(stepframe)
    vec_len = stepframe.loc[stepframe.day == 0].shape[0]
    columns = ['user', 'desc'] + list(range(vec_len))
    vfs = []
    for day in stepframe.day.unique():
        # skip the 4 leading metadata columns, keep the per-user step columns
        # (999 users hard-coded) and transpose so that each user becomes a row
        vf = stepframe[stepframe.day == day].iloc[:,
                                                  4:999 + 4].T.astype('int32')
        vf.columns = list(range(vec_len))
        vf['user'] = vf.index.to_numpy(dtype=int)
        vf['desc'] = day
        vfs.append(vf)
    vecframe = pd.concat(vfs, sort=False, ignore_index=True)
    vecframe = vecframe[columns]
    vecframe.columns = vecframe.columns.astype(str)
    check_if_vecframe(vecframe)
    dump_frame(vecframe, '{}_dsp'.format(step_name))
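
# Usage sketch ('dzne_steps' is a hypothetical stepframe name); the result is
# saved as 'dzne_steps_dsp' with one row per user and day.
# daySplitter('dzne_steps')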
    def __init__(self,
                 vf_fname,
                 attribute,
                 in_datapath=DATA_PATH,
                 out_datapath=DATA_PATH):
        """
        :param vf_fname: filename of vecframe of input features
        :param attribute: gender or age or edu
        :param in_datapath: location of input features
        :param out_datapath: location where train and test sets will be saved
        """
        # The parent constructor loads the vecframe from vf_fname into
        # self.vecframe and sets in_datapath. This constructor then sets the
        # attribute, out_datapath and train/test filenames, and merges the
        # attribute descriptions with the input features.
        super().__init__(vf_fname, in_datapath)

        assert (attribute in ['sex', 'edu_binary',
                              'age_binary'])  #, 'age_multi', 'edu_multi'

        self.attribute = attribute

        self.att = load_frame(
            "dzne_desc")  # pd.read_csv(DATA_PATH + 'dzne_desc.csv')
        self.att = self.att.loc[:, ['user', 'age', 'edu', 'sex']]
        self.att['user'] = pd.to_numeric(self.att['user'])

        self.merged = self.vecframe.merge(self.att, on='user')

        self.train_fname = self.attribute + '_train_' + self.vf_fname + '.csv'
        self.test_fname = self.attribute + '_test_' + self.vf_fname + '.csv'

        self.out_datapath = out_datapath + self.attribute + '/'

        # create the output folders 0-4 for the individual splits
        for i in range(0, 5):
            if not os.path.exists(self.out_datapath + str(i) + '/'):
                os.makedirs(self.out_datapath + str(i) + '/')
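
# Hedged usage sketch; 'TrainTestSplitter' is a hypothetical placeholder for
# the class this constructor belongs to.
# splitter = TrainTestSplitter('1minute_emb_dsp', attribute='sex')
# splitter.merged  # vecframe joined with the age/edu/sex attributes per user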
    def forward(self, input):
        # encode the input sequence; the last hidden state is the embedding
        out, (latent_state, cell_state) = self.encoder(input)
        emb = latent_state

        # this part is just for decoding: repeat the embedding for each time step
        middle_layer = latent_state.repeat(len(input), 1, 1)
        # decode back to the original feature size
        y, _ = self.decoder(middle_layer)
        return torch.squeeze(y), emb
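
# The forward pass above references self.encoder and self.decoder, whose
# definitions are not part of this snippet. The sketch below shows one minimal
# constructor that would make it work; the class name 'LSTMAutoencoderSketch',
# the feature count and the embedding size are assumptions for illustration.
import torch.nn as nn


class LSTMAutoencoderSketch(nn.Module):
    def __init__(self, n_features=1, emb_dim=16):
        super().__init__()
        # encoder: compresses the input sequence into its final hidden state
        self.encoder = nn.LSTM(n_features, emb_dim)
        # decoder: maps the repeated embedding back to the feature dimension
        self.decoder = nn.LSTM(emb_dim, n_features)

    # together with the forward method above, calling the module on a tensor of
    # shape (seq_len, batch, n_features) returns (reconstruction, embedding)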


import sys

# command-line arguments: vecframe name, number of epochs, batch size
in_dsp_fname = str(sys.argv[1])  # e.g. '1minute_emb_dsp'
epochs, batchsize = int(sys.argv[2]), int(sys.argv[3])

in_dsp = load_frame(in_dsp_fname)
# dzne_dsp = dzne_dsp.iloc[:100, :], data_path="../../data/dzne/"

if 'nor' not in in_dsp_fname:
    # normalize the vecframe column-wise and reload the normalized version
    normalize_vecframe_by_col(in_dsp_fname)
    in_dsp_fname = in_dsp_fname + "_nor"
    print("'nor' not found in filename, vecframe will be normalized and saved to",
          in_dsp_fname)
    in_dsp = load_frame(in_dsp_fname)

else:
    print("'nor' found in filename, not normalizing")