def statistics(step_name, emb_name, by, units=1, data_path=DATA_PATH):
    """
    Aggregates a stepframe per hour or per minute and dumps the result.
    Only equal-sized buckets are supported when units > 1.
    """
    df = load_frame(step_name, data_path)
    print("WARNING! This func supports EQUAL BUCKETS ONLY")
    if by == 'hour':
        groupeddf = df.groupby(['day', 'hour']).apply(get_stats).reset_index()
    elif by == 'minute':
        groupeddf = df.groupby(['day', 'hour', 'minute']).apply(get_stats).reset_index()
    else:
        raise ValueError("by must be 'hour' or 'minute', got {!r}".format(by))
    if units > 1:
        # merge every `units` consecutive buckets into one row
        ret_df = groupeddf.groupby(groupeddf.index // units).sum()
        ret_df['day'] = ret_df.day.apply(lambda x: int(x / units))
        if 'hour' in ret_df.columns:
            ret_df['hour'] = ret_df.hour.apply(lambda x: int(x / units))
        if 'minute' in ret_df.columns:
            ret_df['minute'] = ret_df.minute.apply(lambda x: int(x / units))
    else:
        ret_df = groupeddf
    dump_frame(ret_df, name=emb_name)
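# Illustrative usage only (the stepframe and output names below are assumptions, not files
# shipped with the repo): aggregate per hour, then again with pairs of hours merged.
# get_stats is expected to return one row of aggregates per ('day', 'hour'[, 'minute']) group.
if __name__ == '__main__':
    statistics('dzne_steps', 'dzne_stats_hourly', by='hour')       # one bucket per hour
    statistics('dzne_steps', 'dzne_stats_2h', by='hour', units=2)  # pairs of hourly buckets merged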
def __init__(self, vec_name, hourly_name='hourly_dzne_dsp', desc_name='dzne_desc',
             threshold=25, in_path=DATA_PATH, out_path=DATA_PATH):
    """
    :param vec_name: filename
    :param hourly_name: filename of hourly aggregated steps
    :param desc_name: filename of attribute descriptions
    :param in_path: optional, if None utils.storage.DATA_PATH will be used
    :param out_path: optional, if None utils.storage.DATA_PATH will be used
    """
    super().__init__(vec_name, in_path, out_path)
    self.threshold = threshold
    hourly = load_frame(hourly_name)
    desc = load_frame(desc_name)
    # align the per-user sex labels with the day-split hourly frame (7 days per user)
    hourly['sex'] = list(desc.sex) * 7
    m_average = []
    f_average = []
    for hour in range(24):
        m_average.append(hourly[hourly.sex == 'm'][str(hour)].mean())
        f_average.append(hourly[hourly.sex == 'f'][str(hour)].mean())
    # absolute difference between male and female mean steps for each hour of the day
    self.dif = [abs(m - f) for m, f in zip(m_average, f_average)]
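# Self-contained toy check of the per-hour difference computed above (the DataFrame below is
# fabricated demo data, not DZNE data): group the hourly columns by sex and take the absolute
# difference of the group means.
def _demo_hourly_sex_difference():
    import pandas as pd
    hourly = pd.DataFrame({'0': [100, 120, 80, 90], '1': [50, 60, 40, 45],
                           'sex': ['m', 'm', 'f', 'f']})
    means = hourly.groupby('sex')[['0', '1']].mean()
    return (means.loc['m'] - means.loc['f']).abs().tolist()  # -> [25.0, 12.5]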
def __init__(self, vec_name, data_path=DATA_PATH, out_path=DATA_PATH):
    """
    Loads a vecframe from file and checks that it has the correct format.

    :param vec_name: filename
    :param data_path: optional, if None utils.storage.DATA_PATH will be used
    :param out_path: optional, path where results will be saved
    """
    super().__init__()
    self.vec_name = vec_name
    self.data_path = data_path
    self.out_path = out_path
    vecframe = load_frame(vec_name, data_path)
    check_if_vecframe(vecframe)
    self.vecframe = vecframe
def __init__(self, vec_name, desc_name='dzne_desc', threshold=25, in_path=DATA_PATH, out_path=DATA_PATH):
    """
    :param vec_name: filename
    :param desc_name: filename of attribute descriptions
    :param in_path: optional, if None utils.storage.DATA_PATH will be used
    """
    super().__init__(vec_name, in_path, out_path)
    self.threshold = threshold
    desc = load_frame(desc_name)
    # column-wise means per sex; [2:] drops the leading 'user' and 'desc' columns
    m_average = self.vecframe[desc.sex == 'm'].mean()[2:]
    f_average = self.vecframe[desc.sex == 'f'].mean()[2:]
    # absolute male/female difference per time slot
    self.dif = [abs(m - f) for m, f in zip(m_average, f_average)]
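# Why the `[2:]` slice above (toy illustration with fabricated data): a vecframe starts with the
# 'user' and 'desc' columns, so the column-wise means are trimmed to keep only the averages of
# the per-time-slot columns.
def _demo_vecframe_mean_slice():
    import pandas as pd
    vf = pd.DataFrame({'user': [1, 2], 'desc': [0, 0], '0': [10, 20], '1': [30, 50]})
    return vf.mean()[2:].tolist()  # -> [15.0, 40.0], i.e. only columns '0' and '1' remain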
def __init__(self, vf_fname, time_dim=0, in_datapath=DATA_PATH):
    """
    Loads the vecframe containing the compressed embeddings from file.

    :param vf_fname: filename
    :param time_dim: if non-zero, the data is loaded as a 3-D numpy array instead of a flat vecframe
    :param in_datapath: optional, if None, the default from utils.storage.load_stepframe will be used
    """
    super().__init__()
    self.vf_fname = vf_fname
    self.in_datapath = in_datapath
    if not time_dim:
        # flat vecframe: 'user', 'desc' plus one column per time slot
        vecframe = load_frame(vf_fname, in_datapath)
        check_if_vecframe(vecframe)
        self.vecframe = vecframe
    else:
        # data with an explicit time dimension is loaded as a 3-D numpy array
        self.vecframe = load_frame_as_3d_nparray(vf_fname, data_path=in_datapath)
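# Usage sketch (class and file names are placeholders; the 3-D shape is an assumption based on
# load_frame_as_3d_nparray): time_dim == 0 yields a flat vecframe with 'user', 'desc', '0', '1', ...
# columns, while time_dim > 0 yields a 3-D numpy array, e.g. one matrix of time steps per user.
#
#   emb = SomeEmbedding('dzne_dsp_nor')             # self.vecframe is a DataFrame
#   emb = SomeEmbedding('dzne_emb_3d', time_dim=1)  # self.vecframe is a numpy array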
def make_weekly_vecframe(step_name, vec_name='{}_week', data_path=DATA_PATH):
    '''
    Transforms a stepframe into a vecframe without splitting the data. 'desc' will always be 0.

    :param step_name: name of the stepframe
    :param vec_name: name under which the vecframe will be saved (formatted with step_name)
    :param data_path: optional, path to data folder
    :return:
    '''
    stepframe = load_frame(step_name, data_path)
    # transpose so that each user becomes one row and each time slot one column
    vecframe = stepframe.loc[:, '0':].transpose()
    vecframe.columns = [str(col) for col in vecframe.columns]
    vecframe['user'] = vecframe.index
    vecframe['user'] = vecframe['user'].apply(int)
    vecframe['desc'] = [0] * vecframe.shape[0]
    # move 'user' and 'desc' to the front
    cols = list(vecframe.columns)
    vecframe = vecframe[cols[-2:] + cols[:-2]]
    # vecframe = vecframe.reset_index(drop=True)
    check_if_vecframe(vecframe)
    dump_frame(vecframe, vec_name.format(step_name), data_path)
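# Hypothetical invocation (stepframe name assumed, not shipped with the repo): keeps the whole
# week as one vector per user and saves it as 'dzne_steps_week' via the vec_name template.
#
#   make_weekly_vecframe('dzne_steps')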
def daySplitter(step_name, data_path=DATA_PATH):
    """
    Splits entries into days and saves the result as a vecframe ('desc' holds the day index).
    """
    stepframe = load_frame(step_name, data_path)
    check_if_stepframe(stepframe)
    # number of time slots in one day
    vec_len = stepframe.loc[stepframe.day == 0].shape[0]
    columns = ['user', 'desc'] + list(range(vec_len))
    vfs = []
    for day in stepframe.day.unique():
        # columns 4 .. 4+999 hold the per-user step counts; transpose so each user becomes one row
        vf = stepframe[stepframe.day == day].iloc[:, 4:999 + 4].T.astype('int32')
        vf.columns = list(range(vec_len))
        vf['user'] = vf.index.to_numpy(dtype=int)
        vf['desc'] = day
        vfs.append(vf)
    vecframe = pd.concat(vfs, sort=False, ignore_index=True)
    vecframe = vecframe[columns]
    vecframe.columns = vecframe.columns.astype(str)
    check_if_vecframe(vecframe)
    dump_frame(vecframe, '{}_dsp'.format(step_name))
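# Hypothetical invocation (stepframe name assumed): writes 'dzne_steps_dsp', a vecframe with
# one row per user and day, where 'desc' stores the day index.
if __name__ == '__main__':
    daySplitter('dzne_steps')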
def __init__(self, vf_fname, attribute, in_datapath=DATA_PATH, out_datapath=DATA_PATH):
    """
    :param vf_fname: filename of vecframe of input features
    :param attribute: gender or age or edu
    :param in_datapath: location of input features
    :param out_datapath: location where train and test sets will be saved
    """
    # This constructor sets the attribute, out_datapath and the train/test filenames, and merges
    # the attribute descriptions with the input features in the vecframe. The parent class
    # constructor loads the vecframe from vf_fname into self.vecframe and sets in_datapath.
    super().__init__(vf_fname, in_datapath)
    assert attribute in ['sex', 'edu_binary', 'age_binary']  # , 'age_multi', 'edu_multi'
    self.attribute = attribute
    self.att = load_frame("dzne_desc")  # pd.read_csv(DATA_PATH + 'dzne_desc.csv')
    self.att = self.att.loc[:, ['user', 'age', 'edu', 'sex']]
    self.att['user'] = pd.to_numeric(self.att['user'])
    self.merged = self.vecframe.merge(self.att, on='user')
    self.train_fname = self.attribute + '_train_' + self.vf_fname + '.csv'
    self.test_fname = self.attribute + '_test_' + self.vf_fname + '.csv'
    self.out_datapath = out_datapath + self.attribute + '/'
    # create the output folders 0/ ... 4/ under <out_datapath>/<attribute>/ (one per split)
    for i in range(5):
        if not os.path.exists(self.out_datapath + str(i) + '/'):
            os.makedirs(self.out_datapath + str(i) + '/')
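# Usage sketch (the enclosing class name is not shown in this snippet and the vecframe name is
# assumed; 'TrainTestSplitter' is a placeholder): prepares sex-labelled train/test material from
# a vecframe of features and creates the output folders sex/0 ... sex/4 under out_datapath.
#
#   splitter = TrainTestSplitter('dzne_dsp_nor', attribute='sex')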
# Encoder/decoder computation: the encoder's final hidden state is used as the embedding,
# repeated once per input time step and passed through the decoder to reconstruct the input.
out, (latent_state, cell_state) = self.encoder(input)
emb = latent_state
# This part is just for decoding
middle_layer = latent_state.repeat(len(input), 1, 1)
# Decode
y, _ = self.decoder(middle_layer)
return torch.squeeze(y), emb


# Script entry point: read the vecframe name, number of epochs and batch size from the command line.
epochs, batchsize = int(sys.argv[2]), int(sys.argv[3])
in_dsp_fname = str(sys.argv[1])  # e.g. '1minute_emb_dsp'
in_dsp = load_frame(in_dsp_fname)  # dzne_dsp = dzne_dsp.iloc[:100, :], data_path="../../data/dzne/"
if 'nor' not in in_dsp_fname:
    # normalize the vecframe column-wise, save it under '<name>_nor' and reload it
    normalize_vecframe_by_col(in_dsp_fname)
    in_dsp_fname = in_dsp_fname + "_nor"
    print("'nor' NOT found in filename, vecframe will be normalized and saved to", in_dsp_fname)
    in_dsp = load_frame(in_dsp_fname)
else:
    print("'nor' found in filename, not normalizing!")
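# Example command line for the script above (the script name and the numeric values are
# placeholders): sys.argv[1] is the vecframe name, sys.argv[2] the number of epochs,
# sys.argv[3] the batch size. A name without 'nor' triggers column-wise normalization first.
#
#   python train_autoencoder.py 1minute_emb_dsp 50 64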