def get_ready_for_conditioning(profile_id, exp_chunk_pair, permission_free=True,
                               unit_time=param.UNIT_TIME, filtered=True, server_index=1):
    """
    For the modalities and fields already set up for conditioning in the ida_db_info
    module, load the logs and build a state-duration DataFrame for each category value.

    :return: dictionary of the form {mod_name: {field_name: [dataframe_1, dataframe_2, ...]}}
    """
    cond_info = {}
    for mod_name in feat.COND_MOD_LIST:
        cond_info[mod_name] = {}
        df = loader.load_mod_logs(profile_id, mod_name,
                                  exp_id=exp_chunk_pair[0], chunk_id=exp_chunk_pair[1],
                                  permission_free=permission_free, where_cond='',
                                  filtered=filtered, server_index=server_index)
        sample_df = pc.make_samples_from_logs(df, unit_time=unit_time)
        for field in feat.COND_MOD_FIELD[mod_name]:
            cond_info[mod_name][field] = []
            series = sample_df[field]
            disc_series = series
            categories = get_cat_var_values(mod_name, field)
            # Numeric fields are discretized (equal-frequency bins) before the
            # per-category state durations are computed.
            if info.MOD_FIELD_TYPE[mod_name][field] in info.VAR_NUMERIC:
                disc_series = tf.discretize_series(series, mod_name, field, eq_freq=True)
            cond_info[mod_name][field] += list(map(
                lambda x: tf.get_state_duration_series(disc_series, x), categories))
    print('\tconditioning ready!\n')
    return cond_info
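# Usage sketch (illustrative only; profile id 1 and the exp/chunk pair (3, 0) are
# assumptions -- substitute identifiers that actually exist in your database):
#
#   cond_info = get_ready_for_conditioning(1, (3, 0))
#   for mod_name, fields in cond_info.items():
#       for field, duration_series_list in fields.items():
#           print(mod_name, field, len(duration_series_list))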
def write_mod_chunk_feats_at_once(profile_id, modalities, exp_chunk_pairs, permission_free=True,
                                  unit_time=param.UNIT_TIME, filtered=True, server_index=2):
    """
    The only difference from "write_mod_chunk_feats" is that this version loads the logs
    into memory "at once". It is therefore faster than "write_mod_chunk_feats" and less
    likely to be interrupted by a "connection failed" error. Before using this method,
    check the size of the data set to be loaded against the memory available to the
    interpreter.
    """
    exp_ids = list(pd.Series(map(lambda x: x[0], exp_chunk_pairs)).unique())
    for mod in modalities:
        if not os.path.exists('%s/id_%s/%s/' % (param.FEATURE_PATH, profile_id, mod)):
            os.makedirs('%s/id_%s/%s/' % (param.FEATURE_PATH, profile_id, mod))

    # Map each experiment id to its list of chunk ids.
    exp_chunk_map = {}
    for pair in exp_chunk_pairs:
        if pair[0] in exp_chunk_map:
            exp_chunk_map[pair[0]].append(pair[1])
        else:
            exp_chunk_map[pair[0]] = [pair[1]]

    ### for removing files ###
    # for mod in modalities:
    #     file_list = os.listdir('data_set/features/id_%s/%s/' % (profile_id, mod))
    #     for file_name in file_list:
    #         os.remove('data_set/features/id_%s/%s/%s' % (profile_id, mod, file_name))

    # Load all logs of every modality into memory up front.
    mod_logs = {}
    for mod in modalities:
        temp_logs = loader.load_mod_logs(profile_id, mod, permission_free=permission_free,
                                         filtered=filtered, server_index=server_index)
        print("%s logs are loaded" % mod)
        mod_logs[mod] = temp_logs

    for exp_id in exp_ids[100:]:  # NOTE: skips the first 100 exp ids (resume offset)
        mod_features = {}
        for mod in modalities:
            if info.MOD_FREQ_TYPE[mod] == info.FREQ_HIGH \
                    or info.MOD_FREQ_TYPE[mod] == info.FREQ_EVENT_DRIVEN:
                exp_features = loader.read_csv_chunk_features(profile_id, mod, exp_id) \
                    if os.path.isfile('%s/id_%s/%s/exp_%s.csv'
                                      % (param.FEATURE_PATH, profile_id, mod, exp_id)) else None
                mod_features[mod] = exp_features

        for chunk_id in exp_chunk_map[exp_id]:
            conditioning_info = engineer.get_ready_for_conditioning(
                profile_id, (exp_id, chunk_id), permission_free, unit_time,
                filtered=filtered, server_index=server_index)

            for mod in mod_features.keys():
                if mod_features[mod] is not None:
                    # Drop duplicated chunk columns before appending new features.
                    unique_chunk_ids = pd.Series(mod_features[mod].columns).unique()
                    if len(unique_chunk_ids) < len(mod_features[mod].columns):
                        unique_chunk_ids = list(unique_chunk_ids)
                        unique_chunk_ids.sort()
                        mod_features[mod] = mod_features[mod].loc[:, unique_chunk_ids]
                        mod_features[mod].to_csv(
                            "%s/id_%s/%s/exp_%s.csv" % (param.FEATURE_PATH, profile_id, mod, exp_id))
                    if chunk_id in mod_features[mod].columns:
                        print("\t\t%s, %s, %s already done." % (exp_id, chunk_id, mod))
                        continue

                print("\t\t%s, %s, %s" % (exp_id, chunk_id, mod))
                log_df = mod_logs[mod]
                data_df = pc.make_samples_from_logs(log_df.query('chunk_id == %s' % chunk_id),
                                                    unit_time=unit_time)
                field_index, feature_index, values \
                    = engineer.extract_feats_n_total_heads(data_df, mod,
                                                           conditioning_info=conditioning_info,
                                                           filtered=filtered)
                feature_df = pd.DataFrame(values, index=[field_index, feature_index],
                                          columns=['value'])
                feature_df.index.names = ['field', 'feature']
                feature_df.loc[('info', 'profile_id'), :] = profile_id
                feature_df.loc[('info', 'exp_id'), :] = exp_id
                feature_df.loc[('info', 'chunk_id'), :] = chunk_id
                if data_df is not None:
                    feature_df.loc[('info', 'count'), :] = len(data_df)
                    if len(data_df) > 0:
                        # Chunk duration in seconds (nanosecond delta / 10e8 == 1e9).
                        feature_df.loc[('info', 'duration'), :] = float(
                            (data_df.index[-1] - data_df.index[0]).value) / 10e8
                    else:
                        feature_df.loc[('info', 'duration'), :] = 0.0
                else:
                    feature_df.loc[('info', 'count'), :] = 0.0
                    feature_df.loc[('info', 'duration'), :] = 0.0

                if mod_features[mod] is None:
                    mod_features[mod] = pd.DataFrame(index=feature_df.index, columns=[])
                    mod_features[mod].columns.name = 'chunk'
                mod_features[mod][chunk_id] = feature_df
                mod_features[mod].to_csv(
                    "%s/id_%s/%s/exp_%s.csv" % (param.FEATURE_PATH, profile_id, mod, exp_id))
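# Usage sketch (illustrative; the modality list and the experiment/chunk pairs are
# assumptions and must match what load_mod_logs and param.FEATURE_PATH expect):
#
#   pairs = [(exp_id, chunk_id) for ...]   # e.g. loaded from the experiment index
#   write_mod_chunk_feats_at_once(1, [info.APP], pairs)
#
# One CSV per experiment is written under
# <param.FEATURE_PATH>/id_<profile_id>/<mod>/exp_<exp_id>.csv, one column per chunk id.
# Note the exp_ids[100:] resume offset inside the function: earlier experiments are skipped.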
        if len(install_only) > 0 else np.nan,
    ]
    total_cost = sum(install_only["appPrice"])
    values.append(total_cost)

    if param.FEATURE_SET_EXTENSION_APP:
        item_based = [0] * len(apks)
        content_based = [0] * len(lexicon)
        for apk_name in install_only["packageName"]:
            if apk_name in apks:
                item_based[apks.get(apk_name)] = 1
            if apk_name in apk_contents:
                content = apk_contents.get(apk_name)
                for word in content:
                    if word in lexicon:
                        content_based[lexicon.get(word)] += 1
        values.extend(item_based)
        values.extend(content_based)

    return col_lv1, col_lv2, values


if __name__ == "__main__":
    import data_loader as loader
    import info_ida_db as info

    log_df = loader.load_mod_logs(1, info.APP)
    extract_app_features(log_df)
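# Illustration of the FEATURE_SET_EXTENSION_APP vectors above (toy values; 'apks',
# 'lexicon' and 'apk_contents' are normally built elsewhere in this module):
#
#   apks = {'com.a.app': 0, 'com.b.app': 1}          # package name -> one-hot index
#   lexicon = {'game': 0, 'photo': 1, 'chat': 2}     # word -> bag-of-words index
#   apk_contents = {'com.a.app': ['game', 'chat']}   # package name -> description words
#
# An install-only log containing just 'com.a.app' would then contribute
# item_based == [1, 0] and content_based == [1, 0, 1].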
def insert_time_line_filtered_logs(profile_id, modality, permission_free=True, server_index=1):
    """
    Read the CSV file of mutual time lines and filter the original log data set so that
    only logs falling inside those time lines pass. Passed logs are inserted into the
    corresponding DB table whose name carries the '_filtered' suffix. Time lines that
    have already been processed are detected automatically, so there is no risk of data
    duplication.
    """
    print('\tNow inserting %s filtered logs.' % modality)
    if not os.path.isfile('%s/sensitive_timeline%s.csv' % (param.TIME_PATH, profile_id)):
        print('\t\ttimeline file of user %s does not exist!' % profile_id)
        return
    already_done_ids = loader.load_exp_ids(profile_id, modality, filtered=True,
                                           server_index=server_index)
    timeline_loaded = loader.read_csv_time_line(profile_id)
    if timeline_loaded is None or len(timeline_loaded) == 0:
        print('\t\ttimeline file of user %s is empty!' % profile_id)
        return

    ids_to_do = list(timeline_loaded['exp_id'].unique())
    if already_done_ids is not None and len(already_done_ids) > 0:
        ids_to_do = list(filter(lambda x: x not in already_done_ids, ids_to_do))
    if len(ids_to_do) == 0:
        print('\t\tAll exp ids of user %s are already done. Nothing to do!' % profile_id)
        return

    mysql_con = mdb.connect(info.HOST_2, info.ID, info.PWD, info.DB_NAME_2)
    for id_to_do in ids_to_do:
        id_timelines = timeline_loaded.query('exp_id == %s' % id_to_do)
        log_df = loader.load_mod_logs(profile_id, modality, exp_id=id_to_do,
                                      permission_free=permission_free)
        total_selected_df = None
        for i in id_timelines.index:
            try:
                timeline = id_timelines.loc[i]
                selected_df = log_df.query('"%s" <= time_stamp <= "%s"'
                                           % (timeline['start_time'], timeline['end_time']))
                start_df = log_df.query('time_stamp == "%s"' % timeline['start_time'])
                end_df = log_df.query('time_stamp == "%s"' % timeline['end_time'])
                # If no log exists exactly at a time-line boundary, insert an empty
                # boundary row so that the chunk always spans the full interval.
                if len(start_df) == 0:
                    try:
                        selected_df.loc[timeline['start_time']] \
                            = pd.Series([id_to_do] + [np.nan] * (len(log_df.columns) - 1),
                                        index=log_df.columns)
                    except ValueError:
                        sys.exit(-1)
                    # Move the newly appended start row to the front.
                    selected_df = pd.concat([selected_df.iloc[[-1]], selected_df.iloc[:-1, :]])
                if len(end_df) == 0:
                    selected_df.loc[timeline['end_time']] \
                        = pd.Series([id_to_do] + [np.nan] * (len(log_df.columns) - 1),
                                    index=log_df.columns)
                selected_df.loc[:, 'chunk_id'] = i
                if total_selected_df is None:
                    total_selected_df = selected_df
                else:
                    total_selected_df = pd.concat([total_selected_df, selected_df])
            except IndexError:
                print('why? IndexError??')
                sys.exit(-1)

        if total_selected_df is not None and len(total_selected_df) > 0:
            # total_selected_df = total_selected_df.reset_index()
            # Rename 'exp_id'/'chunk_id' to the DB column names 'expId'/'chunkId'.
            column_list = list(map(lambda x: x.split('_')[0] + 'Id'
                                   if x == 'exp_id' or x == 'chunk_id' else x,
                                   total_selected_df.columns))
            total_selected_df.columns = column_list
            total_selected_df.loc[:, 'profile_id'] = profile_id
            total_selected_df.index.name = 'time_stamp'
            total_selected_df = total_selected_df.reset_index()
            try:
                # Convert timestamps to epoch seconds before insertion.
                total_selected_df.loc[:, 'time_stamp'] \
                    = list(map(lambda x: x.total_seconds(),
                               list(total_selected_df['time_stamp']
                                    - datetime.datetime(1970, 1, 1))))
            except KeyError:
                print('why KeyError?')
                sys.exit(-1)
            total_selected_df.to_sql(modality + "_filtered", mysql_con, flavor='mysql',
                                     if_exists='append', index=False)
            print('\t\t%s logs of exp id %s of user %s were successfully inserted!'
                  % (len(total_selected_df), id_to_do, profile_id))
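# Usage sketch (illustrative; assumes '<TIME_PATH>/sensitive_timeline<profile_id>.csv'
# exists and that a '<modality>_filtered' table is already present in the target DB):
#
#   insert_time_line_filtered_logs(1, info.APP, server_index=1)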