Example #1
def get_ready_for_conditioning(profile_id, exp_chunk_pair, permission_free=True,
                               unit_time=param.UNIT_TIME, filtered=True, server_index=1):
    """
    about modalities and theirs fields which is already set for conditioning in ida_db_info module,
    load logs and make state-duration DataFrame of each category value
    :return: Dictionary object {mod_name : {field_name : [dataframe_1, dataframe_2, ...]}}
    """
    cond_info = {}
    for mod_name in feat.COND_MOD_LIST:
        cond_info[mod_name] = {}
        df = loader.load_mod_logs(profile_id, mod_name, exp_id=exp_chunk_pair[0], chunk_id=exp_chunk_pair[1],
                                  permission_free=permission_free, where_cond='', filtered=filtered,
                                  server_index=server_index)
        sample_df = pc.make_samples_from_logs(df, unit_time=unit_time)
        for field in feat.COND_MOD_FIELD[mod_name]:
            cond_info[mod_name][field] = []
            series = sample_df[field]
            disc_series = series  # numeric fields are discretized below; other fields are used as-is
            categories = get_cat_var_values(mod_name, field)

            if info.MOD_FIELD_TYPE[mod_name][field] in info.VAR_NUMERIC:
                disc_series = tf.discretize_series(series, mod_name, field, eq_freq=True)
            cond_info[mod_name][field] += list(map(lambda x: tf.get_state_duration_series(disc_series, x), categories))
    print('\tconditioning ready!\n')
    return cond_info
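
A minimal usage sketch for the function above (Example #2 calls it through a module aliased `engineer`; the actual import path and the id values below are assumptions, not from the project):

import feature_engineer as engineer  # hypothetical import path

# Build the state-duration frames for every conditioning modality/field.
cond_info = engineer.get_ready_for_conditioning(profile_id=1, exp_chunk_pair=(3, 0))

# Result shape: {mod_name: {field_name: [DataFrame, ...]}}, one frame per category value.
for mod_name, fields in cond_info.items():
    for field_name, frames in fields.items():
        print(mod_name, field_name, len(frames))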
Example #2
File: data_writer.py  Project: heevery/ohp
def write_mod_chunk_feats_at_once(profile_id, modalities, exp_chunk_pairs,
                                  permission_free=True, unit_time=param.UNIT_TIME, filtered=True, server_index=2):
    """
    Only difference with "write_mod_chunk_feats" is that it load logs on memory "at once".
    So it is faster that "write_mod_chunk_feats", and more unlikely to be obstructed by "connection failed error".
    Before using this method, you should check data set size to load and memory options for compiler.
    """
    exp_ids = list(pd.Series([pair[0] for pair in exp_chunk_pairs]).unique())
    for mod in modalities:
        if not os.path.exists('%s/id_%s/%s/' % (param.FEATURE_PATH, profile_id, mod)):
            os.makedirs('%s/id_%s/%s/' % (param.FEATURE_PATH, profile_id, mod))

    exp_chunk_map = {}
    for pair in exp_chunk_pairs:
        if pair[0] in exp_chunk_map:
            exp_chunk_map[pair[0]].append(pair[1])
        else:
            exp_chunk_map[pair[0]] = [pair[1]]
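
    # The grouping above is equivalent to this collections.defaultdict sketch (same behavior):
    #     from collections import defaultdict
    #     exp_chunk_map = defaultdict(list)
    #     for exp_id, chunk_id in exp_chunk_pairs:
    #         exp_chunk_map[exp_id].append(chunk_id)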

    ### for removing files ###
    # for mod in modalities:
    #     file_list = os.listdir('data_set/features/id_%s/%s/' % (profile_id, mod))
    #     for file_name in file_list:
    #         os.remove('data_set/features/id_%s/%s/%s' % (profile_id, mod, file_name))

    mod_logs = {}
    for mod in modalities:
        temp_logs = loader.load_mod_logs(profile_id, mod, permission_free=permission_free,
                                         filtered=filtered, server_index=server_index)
        print('%s logs are loaded' % mod)
        mod_logs[mod] = temp_logs

    for exp_id in exp_ids[100:]:  # note: skips the first 100 exp ids (looks like a hard-coded resume point)
        mod_features = {}
        for mod in modalities:
            if info.MOD_FREQ_TYPE[mod] in (info.FREQ_HIGH, info.FREQ_EVENT_DRIVEN):
                exp_features = loader.read_csv_chunk_features(profile_id, mod, exp_id) if os.path.isfile(
                        '%s/id_%s/%s/exp_%s.csv' % (param.FEATURE_PATH, profile_id, mod, exp_id)) else None
                mod_features[mod] = exp_features

        for chunk_id in exp_chunk_map[exp_id]:
            conditioning_info = engineer.get_ready_for_conditioning(profile_id, (exp_id, chunk_id),
                                                                    permission_free, unit_time, filtered=filtered,
                                                                    server_index=server_index)
            for mod in mod_features.keys():
                if mod_features[mod] is not None:
                    unique_chunk_ids = pd.Series(mod_features[mod].columns).unique()
                    if len(unique_chunk_ids) < len(mod_features[mod].columns):
                        unique_chunk_ids = list(unique_chunk_ids)
                        unique_chunk_ids.sort()
                        mod_features[mod] = mod_features[mod].loc[:, unique_chunk_ids]
                        mod_features[mod].to_csv(
                                "%s/id_%s/%s/exp_%s.csv" % (param.FEATURE_PATH, profile_id, mod, exp_id))

                    if chunk_id in mod_features[mod].columns:
                        print("\t\t%s, %s, %s already done." % (exp_id, chunk_id, mod))
                        continue
                print("\t\t%s, %s, %s" % (exp_id, chunk_id, mod))

                log_df = mod_logs[mod]
                data_df = pc.make_samples_from_logs(log_df.query('chunk_id == %s' % chunk_id), unit_time=unit_time)

                field_index, feature_index, values \
                    = engineer.extract_feats_n_total_heads(data_df, mod, conditioning_info=conditioning_info,
                                                           filtered=filtered)
                feature_df = pd.DataFrame(values, index=[field_index, feature_index], columns=['value'])
                feature_df.index.names = ['field', 'feature']

                feature_df.loc[('info', 'profile_id'), :] = profile_id
                feature_df.loc[('info', 'exp_id'), :] = exp_id
                feature_df.loc[('info', 'chunk_id'), :] = chunk_id
                if data_df is not None:
                    feature_df.loc[('info', 'count'), :] = len(data_df)
                    if len(data_df) > 0:
                        feature_df.loc[('info', 'duration'), :] = float(
                                (data_df.index[-1] - data_df.index[0]).value) / 1e9  # ns -> s
                    else:
                        feature_df.loc[('info', 'duration'), :] = 0.0
                else:
                    feature_df.loc[('info', 'count'), :] = 0.0
                    feature_df.loc[('info', 'duration'), :] = 0.0

                if mod_features[mod] is None:
                    mod_features[mod] = pd.DataFrame(index=feature_df.index, columns=[])
                    mod_features[mod].columns.name = 'chunk'
                mod_features[mod][chunk_id] = feature_df
                mod_features[mod].to_csv("%s/id_%s/%s/exp_%s.csv" % (param.FEATURE_PATH, profile_id, mod, exp_id))
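
A hedged driver sketch for the writer above (the import path and the (exp_id, chunk_id) pairs are assumptions; in the project they would come from the experiment metadata):

import data_writer as writer  # hypothetical import of the file above
import info_ida_db as info    # imported under this name in Example #3

pairs = [(3, 0), (3, 1), (4, 0)]  # made-up (exp_id, chunk_id) pairs
writer.write_mod_chunk_feats_at_once(profile_id=1, modalities=[info.APP], exp_chunk_pairs=pairs)

As the code shows, the function writes one CSV per modality and experiment under param.FEATURE_PATH/id_<profile_id>/<mod>/exp_<exp_id>.csv, adding one column per finished chunk.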
Example #3
            if len(install_only) > 0
            else np.nan,
        ]

        total_cost = sum(install_only["appPrice"])
        values.append(total_cost)

        if param.FEATURE_SET_EXTENSION_APP:
            item_based = [0] * len(apks)
            content_based = [0] * len(lexicon)
            for apk_name in install_only["packageName"]:
                if apk_name in apks:
                    item_based[apks.get(apk_name)] = 1
                if apk_name in apk_contents:
                    content = apk_contents.get(apk_name)
                    for word in content:
                        if word in lexicon:
                            content_based[lexicon.get(word)] += 1
            values.extend(item_based)
            values.extend(content_based)

        return col_lv1, col_lv2, values


if __name__ == "__main__":
    import data_loader as loader
    import info_ida_db as info

    log_df = loader.load_mod_logs(1, info.APP)
    extract_app_features(log_df)
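
The item-based/content-based block in this example is a one-hot membership vector over a fixed app index plus a bag-of-words count vector over a lexicon. A self-contained sketch of the same encoding, with made-up data:

apks = {'com.example.game': 0, 'com.example.chat': 1}   # packageName -> column index
lexicon = {'game': 0, 'chat': 1, 'photo': 2}            # word -> column index
apk_contents = {'com.example.game': ['game', 'game', 'photo']}

installed = ['com.example.game']
item_based = [0] * len(apks)
content_based = [0] * len(lexicon)
for apk_name in installed:
    if apk_name in apks:
        item_based[apks[apk_name]] = 1                  # installed-app flag
    for word in apk_contents.get(apk_name, []):
        if word in lexicon:
            content_based[lexicon[word]] += 1           # word occurrence count

print(item_based)     # [1, 0]
print(content_based)  # [2, 0, 1]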
Example #4
def insert_time_line_filtered_logs(profile_id, modality, permission_free=True, server_index=1):
    """
    Read CSV file about mutual time lines
    and filter the original logs data set passing only in this time lines.
    Passed logs are inserted at a relevant table whose name contains '_filtered' as suffix on DB.
    This method check automatically already done time lines, so don't worry about data duplication.
    """
    print('\tNow inserting %s filtered logs.' % modality)
    if not os.path.isfile('%s/sensitive_timeline%s.csv' % (param.TIME_PATH, profile_id)):
        print('\t\ttimeline file of user %s does not exist!' % profile_id)
        return

    already_done_ids = loader.load_exp_ids(profile_id, modality, filtered=True, server_index=server_index)
    timeline_loaded = loader.read_csv_time_line(profile_id)
    if timeline_loaded is None or len(timeline_loaded) == 0:
        print('\t\ttimeline file of user %s is empty!' % profile_id)
        return

    ids_to_do = list(timeline_loaded['exp_id'].unique())
    if already_done_ids is not None and len(already_done_ids) > 0:
        ids_to_do = list(filter(lambda x: x not in already_done_ids, ids_to_do))
    if len(ids_to_do) == 0:
        print('\t\tAll exp ids of user %s are already done~. Nothing to do!' % profile_id)
        return

    mysql_con = mdb.connect(info.HOST_2, info.ID, info.PWD, info.DB_NAME_2)
    for id_to_do in ids_to_do:
        id_timelines = timeline_loaded.query('exp_id == %s' % id_to_do)
        log_df = loader.load_mod_logs(profile_id, modality, exp_id=id_to_do, permission_free=permission_free)
        total_selected_df = None
        for i in id_timelines.index:
            try:
                timeline = id_timelines.loc[i]
                selected_df = log_df.query('"%s" <= time_stamp <= "%s"'
                                           % (timeline['start_time'], timeline['end_time']))
                start_df = log_df.query('time_stamp == "%s"' % timeline['start_time'])
                end_df = log_df.query('time_stamp == "%s"' % timeline['end_time'])
                if len(start_df) == 0:
                    try:
                        selected_df.loc[timeline['start_time']] \
                            = pd.Series([id_to_do] + [np.nan] * (len(log_df.columns) - 1), index=log_df.columns)
                    except ValueError:
                        sys.exit(-1)
                    selected_df = pd.concat([selected_df.iloc[[-1]], selected_df.iloc[:-1, :]])
                if len(end_df) == 0:
                    selected_df.loc[timeline['end_time']] \
                        = pd.Series([id_to_do] + [np.nan] * (len(log_df.columns) - 1), index=log_df.columns)
                selected_df.loc[:, 'chunk_id'] = i
                if total_selected_df is None:
                    total_selected_df = selected_df
                else:
                    total_selected_df = pd.concat([total_selected_df, selected_df])
            except IndexError:
                print('why? IndexError??')
                sys.exit(-1)
        if total_selected_df is not None and len(total_selected_df) > 0:
            # total_selected_df = total_selected_df.reset_index()
            column_list = list(map(lambda x: x.split('_')[0] + 'Id' if x == 'exp_id' or x == 'chunk_id' else x,
                                   total_selected_df.columns))
            total_selected_df.columns = column_list
            total_selected_df.loc[:, 'profile_id'] = profile_id
            total_selected_df.index.name = 'time_stamp'
            total_selected_df = total_selected_df.reset_index()
            try:
                total_selected_df.loc[:, 'time_stamp'] \
                    = list(map(lambda x: x.total_seconds(),
                               total_selected_df['time_stamp'] - datetime.datetime(1970, 1, 1)))
            except KeyError:
                print('why KeyError?')
                sys.exit(-1)
            # note: flavor='mysql' only works on old pandas; recent versions require a SQLAlchemy engine here
            total_selected_df.to_sql(modality + "_filtered", mysql_con, flavor='mysql', if_exists='append', index=False)
            print('\t\t%s number of logs of exp id %s of user %s are successfully inserted!'
                  % (len(total_selected_df), id_to_do, profile_id))
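
The time_stamp conversion near the end turns pandas timestamps into Unix epoch seconds by subtracting the epoch. A minimal, self-contained illustration of that step (the sample timestamps are made up):

import datetime
import pandas as pd

ts = pd.Series(pd.to_datetime(['2014-05-01 12:00:00', '2014-05-01 12:00:30']))
epoch_seconds = (ts - datetime.datetime(1970, 1, 1)).dt.total_seconds()
print(list(epoch_seconds))  # [1398945600.0, 1398945630.0]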