import datetime
import os
import sys

import MySQLdb as mdb
import numpy as np
import pandas as pd

# info, loader, and param are project-local modules (DB constants, DB/CSV
# loaders, and path parameters respectively) assumed importable from here.
import info
import loader
import param


def insert_co_exp_ids(profile_ids, modalities, db_con_1, db_con_2):
    """Scan the high-frequency modalities, extract the exp ids that exist in
    every one of them, and insert the not-yet-stored ids into the co_exp_ids
    table."""
    print('\twriting co_exp_ids for sensor data')
    for profile_id in profile_ids:
        high_interval_mods = [mod for mod in modalities
                              if info.MOD_FREQ_TYPE[mod] == info.FREQ_HIGH]
        co_exp_ids = []
        for mod in high_interval_mods:
            exp_ids = loader.load_exp_ids(profile_id, mod, filtered=False,
                                          server_index=1, db_con=db_con_1, close=False)
            if len(exp_ids) > 0:
                # One indicator column per modality, indexed by exp id.
                co_exp_ids.append(pd.DataFrame([0] * len(exp_ids),
                                               index=exp_ids, columns=[mod]))
        if len(co_exp_ids) == 0:
            # pd.concat([]) raises ValueError, so skip users without any logs.
            continue
        # After the column-wise concat, dropna() keeps only the exp ids that
        # appear in every high-frequency modality.
        co_exp_ids = pd.concat(co_exp_ids, axis=1)
        co_exp_ids = co_exp_ids.dropna()
        co_exp_ids = sorted(co_exp_ids.index)

        # Skip ids that a previous run already inserted.
        done_ids = loader.load_co_exp_ids(profile_id, db_con=db_con_2, close=False)
        co_exp_ids = [x for x in co_exp_ids if x not in done_ids]
        if len(co_exp_ids) == 0:
            print('%s: all co_exp_ids are already inserted!' % profile_id)
            continue

        df = pd.DataFrame(co_exp_ids, columns=['expId'])
        df['profile_id'] = profile_id
        # Note: the flavor argument exists only in older pandas releases;
        # recent versions expect an SQLAlchemy connectable instead.
        df.to_sql("co_exp_ids", db_con_2, flavor='mysql',
                  if_exists='append', index=False)
        print('\t\t%s exp ids of user %s were successfully inserted!'
              % (len(df), profile_id))
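
# A minimal usage sketch for insert_co_exp_ids, assuming the info module also
# defines HOST_1 and DB_NAME_1 for the server_index=1 source DB (only HOST_2
# and DB_NAME_2 are referenced in this file, so the *_1 names and the profile
# ids are hypothetical):
#
#   db_con_1 = mdb.connect(info.HOST_1, info.ID, info.PWD, info.DB_NAME_1)
#   db_con_2 = mdb.connect(info.HOST_2, info.ID, info.PWD, info.DB_NAME_2)
#   insert_co_exp_ids(profile_ids=[1001, 1002],
#                     modalities=list(info.MOD_FREQ_TYPE.keys()),
#                     db_con_1=db_con_1, db_con_2=db_con_2)
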
def insert_time_line_filtered_logs(profile_id, modality, permission_free=True, server_index=1):
    """Read the CSV file of mutual time lines and filter the original log
    data set so that only logs falling inside those time lines pass. Passed
    logs are inserted into the matching DB table carrying a '_filtered'
    suffix. Already processed exp ids are skipped automatically, so data
    duplication is not a concern."""
    print('\tNow inserting %s filtered logs.' % modality)
    if not os.path.isfile('%s/sensitive_timeline%s.csv' % (param.TIME_PATH, profile_id)):
        print('\t\ttimeline file of user %s does not exist!' % profile_id)
        return

    already_done_ids = loader.load_exp_ids(profile_id, modality, filtered=True,
                                           server_index=server_index)
    timeline_loaded = loader.read_csv_time_line(profile_id)
    if timeline_loaded is None or len(timeline_loaded) == 0:
        print('\t\ttimeline file of user %s is empty!' % profile_id)
        return

    ids_to_do = list(timeline_loaded['exp_id'].unique())
    if already_done_ids is not None and len(already_done_ids) > 0:
        ids_to_do = [x for x in ids_to_do if x not in already_done_ids]
    if len(ids_to_do) == 0:
        print('\t\tAll exp ids of user %s are already done. Nothing to do!' % profile_id)
        return

    mysql_con = mdb.connect(info.HOST_2, info.ID, info.PWD, info.DB_NAME_2)
    for id_to_do in ids_to_do:
        id_timelines = timeline_loaded.query('exp_id == %s' % id_to_do)
        log_df = loader.load_mod_logs(profile_id, modality, exp_id=id_to_do,
                                      permission_free=permission_free)
        total_selected_df = None
        for i in id_timelines.index:
            try:
                timeline = id_timelines.loc[i]
                # Keep only the logs whose time stamps fall inside this time line.
                selected_df = log_df.query('"%s" <= time_stamp <= "%s"'
                                           % (timeline['start_time'], timeline['end_time']))
                start_df = log_df.query('time_stamp == "%s"' % timeline['start_time'])
                end_df = log_df.query('time_stamp == "%s"' % timeline['end_time'])
                # When no log sits exactly on a boundary, append a NaN row at
                # the boundary time stamp so the chunk spans the full time line.
                if len(start_df) == 0:
                    try:
                        selected_df.loc[timeline['start_time']] = pd.Series(
                            [id_to_do] + [np.nan] * (len(log_df.columns) - 1),
                            index=log_df.columns)
                    except ValueError:
                        print('\t\tfailed to append a start-boundary row for time line %s' % i)
                        sys.exit(-1)
                    # Move the appended row to the front so the frame stays
                    # ordered by time stamp.
                    selected_df = pd.concat([selected_df.iloc[[-1]], selected_df.iloc[:-1, :]])
                if len(end_df) == 0:
                    selected_df.loc[timeline['end_time']] = pd.Series(
                        [id_to_do] + [np.nan] * (len(log_df.columns) - 1),
                        index=log_df.columns)
                selected_df.loc[:, 'chunk_id'] = i
                if total_selected_df is None:
                    total_selected_df = selected_df
                else:
                    total_selected_df = pd.concat([total_selected_df, selected_df])
            except IndexError:
                print('\t\tunexpected IndexError while processing time line %s' % i)
                sys.exit(-1)

        if total_selected_df is not None and len(total_selected_df) > 0:
            # Rename exp_id/chunk_id to the expId/chunkId column names used by
            # the DB table.
            total_selected_df.columns = [
                col.split('_')[0] + 'Id' if col in ('exp_id', 'chunk_id') else col
                for col in total_selected_df.columns]
            total_selected_df.loc[:, 'profile_id'] = profile_id
            total_selected_df.index.name = 'time_stamp'
            total_selected_df = total_selected_df.reset_index()
            # Convert the time stamps to Unix epoch seconds.
            total_selected_df.loc[:, 'time_stamp'] = (
                total_selected_df['time_stamp'] - datetime.datetime(1970, 1, 1)
            ).dt.total_seconds()
            # As above, flavor='mysql' assumes an older pandas release.
            total_selected_df.to_sql(modality + "_filtered", mysql_con,
                                     flavor='mysql', if_exists='append', index=False)
            print('\t\t%s logs of exp id %s of user %s were successfully inserted!'
                  % (len(total_selected_df), id_to_do, profile_id))
    mysql_con.close()
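

if __name__ == '__main__':
    # A hedged driver sketch, not part of the original module: profile id
    # 1001 and the 'accelerometer' modality are hypothetical placeholders
    # that only illustrate the call shape.
    insert_time_line_filtered_logs(1001, 'accelerometer',
                                   permission_free=True, server_index=1)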