Example #1
def write_mod_chunk_feats(profile_id, modalities, exp_chunk_pairs,
                          permission_free=True, unit_time=param.UNIT_TIME, filtered=True, server_index=2):
    """
    Extract some user's chunk features for specific modalities.
    It automatically checks already computed chunks and skip this chunks.
    For not computed yet chunks and modalities, every time it computes features, over writes the file.
    """
    exp_ids = list(pd.Series(map(lambda x: x[0], exp_chunk_pairs)).unique())
    for mod in modalities:
        if not os.path.exists('%s/id_%s/%s/' % (param.FEATURE_PATH, profile_id, mod)):
            os.makedirs('%s/id_%s/%s/' % (param.FEATURE_PATH, profile_id, mod))

    exp_chunk_map = {}
    for exp_id, chunk_id in exp_chunk_pairs:
        exp_chunk_map.setdefault(exp_id, []).append(chunk_id)

    ### for removing files ###
    # for mod in modalities:
    #     file_list = os.listdir('data_set/features/id_%s/%s/' % (profile_id, mod))
    #     for file_name in file_list:
    #         os.remove('data_set/features/id_%s/%s/%s' % (profile_id, mod, file_name))

    for exp_id in exp_ids[:100]:
        mod_features = {}
        for mod in modalities:
            if info.MOD_FREQ_TYPE[mod] in (info.FREQ_HIGH, info.FREQ_EVENT_DRIVEN):
                exp_features = loader.read_csv_chunk_features(profile_id, mod, exp_id) if os.path.isfile(
                        '%s/id_%s/%s/exp_%s.csv' % (param.FEATURE_PATH, profile_id, mod, exp_id)) else None
                mod_features[mod] = exp_features

        for chunk_id in exp_chunk_map[exp_id]:
            conditioning_info = engineer.get_ready_for_conditioning(profile_id, (exp_id, chunk_id),
                                                                    permission_free, unit_time, filtered=filtered,
                                                                    server_index=server_index)
            for mod in mod_features.keys():
                if mod_features[mod] is not None:
                    unique_chunk_ids = pd.Series(mod_features[mod].columns).unique()
                    if len(unique_chunk_ids) < len(mod_features[mod].columns):
                        unique_chunk_ids = list(unique_chunk_ids)
                        unique_chunk_ids.sort()
                        mod_features[mod] = mod_features[mod].loc[:, unique_chunk_ids]
                        mod_features[mod].to_csv(
                                "%s/id_%s/%s/exp_%s.csv" % (param.FEATURE_PATH, profile_id, mod, exp_id))

                    if chunk_id in mod_features[mod].columns:
                        print("\t\t%s, %s, %s already done." % (exp_id, chunk_id, mod))
                        continue
                print("\t\t%s, %s, %s" % (exp_id, chunk_id, mod))
                feat_series = engineer.extract_feats(profile_id, mod, exp_ids=[exp_id], chunk_ids=[chunk_id],
                                                     conditioning_info=conditioning_info,
                                                     permission_free=permission_free, unit_time=unit_time,
                                                     filtered=filtered, server_index=server_index)
                if mod_features[mod] is None:
                    mod_features[mod] = DataFrame(index=feat_series.index, columns=[])
                    mod_features[mod].columns.name = 'chunk'
                mod_features[mod][chunk_id] = feat_series
                mod_features[mod].to_csv("%s/id_%s/%s/exp_%s.csv" % (param.FEATURE_PATH, profile_id, mod, exp_id))
Example #2
def aggregate_mod_feats_user_agg_policy(profile_id, modality):
    """
    read already extracted chunk features relevant to specific modality
    aggregate that features with aggregate_chunks method
    :return: DataFrame object
    """
    mod_feat_df = None
    print('\t\tnow aggregating features for user %s, %s' % (profile_id, modality))
    # path = '%s/id_%s/' % (param.FEATURE_AGG_PATH, profile_id)
    # if not os.path.exists(path):
    #     os.makedirs(path)

    chunk_feat_path = '%s/%s%s/%s' % (param.FEATURE_PATH, param.FEATURE_AGG_INNER_PREFIX, profile_id, modality)
    if os.path.exists(chunk_feat_path):
        files = os.listdir(chunk_feat_path)

        if info.MOD_FREQ_TYPE[modality] in (info.FREQ_EVENT_DRIVEN, info.FREQ_ONE_TIME):
            files = [f for f in files if param.FEATURE_NOT_SENSOR_FILE_SUFFIX in f]
            if modality == info.APP and param.FEATURE_SET == param.FEATURE_SET_EXTENSION_APP:
                files = [f for f in files if param.FEATURE_NEW_APP_FILE_SUFFIX in f]
            files.sort()

            for feat_file in files:
                if "_" in feat_file: # each field file
                    f_feat_df = loader.read_csv_user_mod_features(
                        profile_id, modality, feat_file.split("_")[0],
                        feat_file.split("_")[1].split(".")[0], use_agg_feat=False)
                else: # one file for modality
                    f_feat_df = loader.read_csv_user_mod_features(
                        profile_id, modality, suffix=feat_file.split(".")[0], use_agg_feat=False)

                if mod_feat_df is None:
                    mod_feat_df = f_feat_df
                else:
                    mod_feat_df = pd.concat([mod_feat_df, f_feat_df], axis=0)
            mod_feat_df = mod_feat_df.query('field != "info"')
            mod_feat_df['modality'] = modality
            mod_feat_df.set_index('modality', append=True, inplace=True)
            mod_feat_df = mod_feat_df.reorder_levels(['modality', 'field', 'feature'])

        else:
            exp_ids = sorted(int(f.split("_")[1].split(".")[0]) for f in files)
            for exp_id in exp_ids:
                exp_features = loader.read_csv_chunk_features(profile_id, modality, exp_id)
                if mod_feat_df is None:
                    mod_feat_df = exp_features
                else:
                    mod_feat_df = pd.concat([mod_feat_df, exp_features], axis=1)
            mod_feat_df = pc.aggregate_chunks(mod_feat_df, modality)
        mod_feat_df.columns = [profile_id]
        mod_feat_df.columns.name = 'user'
    else:
        print("feature file doesn't exist! You should run data_writer.py first.")
        print("feature file doesn't exist! You should run data_writer.py first.")
        print("feature file doesn't exist! You should run data_writer.py first.")
        sys.exit(-1)
    return mod_feat_df
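For event-driven modalities the returned frame is indexed by (modality, field, feature) with the user id as its only column; a short usage sketch (the profile id is made up):

# Hypothetical call; info.APP is a modality constant used above.
feat_df = aggregate_mod_feats_user_agg_policy(8, info.APP)
print(feat_df.index.names)   # ['modality', 'field', 'feature'] for event-driven modalities
print(feat_df.columns.name)  # 'user'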
Example #3
def write_chunk_feats():
    """
    Feature set on '/features/...' contains various conditional features.
    However when we use chunk as classification instance, conditional features become meaningless.
    By this method, chunk instances are generated with features of various modalities excluding conditional features.
    They are saved on '/chunkFeatures/...'
    """
    file_list = os.listdir(param.FEATURE_PATH)
    id_list = sorted(int(f.split("_")[1]) for f in file_list)
    for p_id in id_list:
        tl_df = loader.read_csv_time_line(p_id)
        exp_ids = list(tl_df['exp_id'].unique())
        exp_ids.sort()

        mod_list = os.listdir('%s/id_%s' % (param.FEATURE_PATH, p_id))
        mod_list = sorted(m for m in mod_list if info.MOD_FREQ_TYPE[m] == info.FREQ_HIGH)

        for exp_id in exp_ids:
            exp_df = []
            for mod in mod_list:
                df = loader.read_csv_chunk_features(p_id, mod, exp_id)
                if len(exp_df) == 0:
                    info_df = df.iloc[[-1]]
                    info_df['modality'] = 'info'
                    info_df.set_index('modality', append=True, inplace=True)
                    info_df = info_df.reorder_levels(['modality', 'field', 'feature'])
                    exp_df.append(info_df)

                df = df.drop(df.index[-5:])  # drop the trailing info rows

                df.reset_index('feature', inplace=True)
                df['condition'] = df['feature'].str.rsplit("_", n=1).str.get(1).values
                df = df.query('condition == "none"')

                df = df.drop('condition', axis=1)
                df.set_index('feature', append=True, inplace=True)
                df['modality'] = mod
                df.set_index('modality', append=True, inplace=True)
                df = df.reorder_levels(['modality', 'field', 'feature'])
                exp_df.append(df)
            exp_df = pd.concat(exp_df, axis=0)

            temp_path = "%s/user_%s" % (param.CHUNK_PATH, p_id)
            if not os.path.isdir(temp_path):
                os.makedirs(temp_path)
            exp_df.to_csv("%s/exp_%s.csv" % (temp_path, exp_id))
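The conditional-feature filter above keys off the suffix after the last underscore in each feature name; a self-contained sketch of that idiom with made-up feature names:

import pandas as pd

# Features whose names end in "_none" are unconditional; the rest are conditional.
df = pd.DataFrame({'value': [1.0, 2.0, 3.0]},
                  index=pd.Index(['mean_none', 'mean_moving', 'std_none'], name='feature'))
df = df.reset_index()
df['condition'] = df['feature'].str.rsplit('_', n=1).str.get(1)
print(df.query('condition == "none"'))  # keeps mean_none and std_none only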
Example #4
def write_mod_chunk_feats_csv_to_db():
    """
    Transport feature data set saved on CSV file to DB.
    But this method must be updated, because DB size gets too huge with it.
    """
    mysql_con = mdb.connect(info.HOST_2, info.ID, info.PWD, info.DB_NAME_2)
    file_list = os.listdir(param.FEATURE_PATH)
    id_list = sorted(int(f.split("_")[1]) for f in file_list)
    for p_id in id_list:
        mod_list = os.listdir('%s/id_%s' % (param.FEATURE_PATH, p_id))
        mod_list = sorted(m for m in mod_list if info.MOD_FREQ_TYPE[m] == info.FREQ_HIGH)

        for mod in mod_list:
            exp_list = os.listdir('%s/id_%s/%s' % (param.FEATURE_PATH, p_id, mod))
            exp_list = [f for f in exp_list if 'chunks' not in f]
            exp_list = sorted(int(f.split("_")[1].split(".")[0]) for f in exp_list)

            pairs = loader.exec_query(
                    "select distinct expId, chunkId from time_lines_checker where profile_id = %s and modality = '%s'"
                    % (p_id, mod), server_index=2)
            pairs = [(int(x[0]), int(x[1])) for x in pairs]
            for exp in exp_list:
                df = loader.read_csv_chunk_features(p_id, mod, exp)
                for column in df.columns:
                    if (exp, column) in pairs:
                        print ("exp %s, chunk %s is already in DB" % (exp, column))
                        continue

                    chunk_df = df.loc[:, [column]]
                    chunk_df.columns = ['value']
                    chunk_df = chunk_df.iloc[:-5] if chunk_df.index[-5][1] == 'profile_id' else chunk_df.iloc[:-6]
                    chunk_df = chunk_df.reset_index()
                    chunk_df['chunkId'] = int(column)
                    chunk_df['expId'] = exp
                    chunk_df['profile_id'] = p_id

                    chunk_df.to_sql(mod + "_chunk_feature", mysql_con, flavor='mysql', if_exists='append', index=False)
                    loader.exec_query(
                            "insert into time_lines_checker (profile_id, expId, chunkId, modality) values (%s, %s, %s, '%s')"
                            % (p_id, exp, column, mod), server_index=2)
                    pairs.append((exp, column))
                    print('\t\t%s number of features of user %s, exp %s, chunk %s are successfully inserted!' % (
                        len(df), p_id, exp, column))
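One detail worth noting: pairs is a list, so the (exp, column) membership test above is linear in the number of processed chunks. A set gives the same dedup logic with constant-time lookups; a minimal, self-contained sketch (the data is made up, not from the original):

# Same already-in-DB check as above, but with O(1) membership tests.
pairs = [(1, 10), (1, 11)]               # made-up (expId, chunkId) rows from the DB
done = set(pairs)
for exp, column in [(1, 10), (2, 20)]:   # made-up candidate chunks
    if (exp, column) in done:
        continue                         # (1, 10) is skipped
    done.add((exp, column))              # (2, 20) is recorded after insertion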
Example #5
def remove_index_from_chunk_feat_df():
    """
    Some CSV file may contain useless index, and duplicate columns.
    This method fixes it, but in many cases this method is not needed.
    """
    file_list = os.listdir(param.FEATURE_PATH)
    id_list = sorted(int(f.split("_")[1]) for f in file_list)
    for p_id in id_list:
        mod_list = os.listdir('%s/id_%s' % (param.FEATURE_PATH, p_id))
        # mod_list = [m for m in mod_list if info.MOD_FREQ_TYPE[m] == info.FREQ_HIGH]
        mod_list.sort()

        for mod in mod_list:
            files = os.listdir('%s/id_%s/%s' % (param.FEATURE_PATH, p_id, mod))
            for file in files:
                if 'all-chunks' in file:
                    if "_" in file:
                        f_feat_df = loader.read_csv_user_mod_features(
                            p_id, mod, file.split("_")[0], file.split("_")[1].split(".")[0], use_agg_feat=False)
                    else:
                        f_feat_df = loader.read_csv_user_mod_features(
                            p_id, mod, suffix=file.split(".")[0], use_agg_feat=False)

                else:
                    exp_id = int(file.split("_")[1].split(".")[0])
                    f_feat_df = loader.read_csv_chunk_features(p_id, mod, exp_id)

                fixed_feat_df = f_feat_df.query('feature != "modality"')
                unique_columns = fixed_feat_df.columns.unique()
                if len(unique_columns) < len(fixed_feat_df.columns):
                    fixed_feat_df = fixed_feat_df.T.groupby(level=0).first().T

                if fixed_feat_df.shape != f_feat_df.shape:
                    print "%s, %s, %s, shape changed! %s ==> %s" % (
                        p_id, mod, file, str(f_feat_df.shape), str(fixed_feat_df.shape))
                    fixed_feat_df.to_csv(('%s/id_%s/%s/' % (param.FEATURE_PATH, p_id, mod)) + file)
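The duplicate-column repair hinges on the df.T.groupby(level=0).first().T idiom; a self-contained illustration with made-up data:

import pandas as pd

# Two columns share the label 'a'; grouping the transpose by label keeps the first.
df = pd.DataFrame([[1, 2, 3]], columns=['a', 'a', 'b'])
deduped = df.T.groupby(level=0).first().T
print(list(deduped.columns))  # ['a', 'b']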
Example #6
def write_mod_chunk_feats_at_once(profile_id, modalities, exp_chunk_pairs,
                                  permission_free=True, unit_time=param.UNIT_TIME, filtered=True, server_index=2):
    """
    Only difference with "write_mod_chunk_feats" is that it load logs on memory "at once".
    So it is faster that "write_mod_chunk_feats", and more unlikely to be obstructed by "connection failed error".
    Before using this method, you should check data set size to load and memory options for compiler.
    """
    exp_ids = list(pd.Series(map(lambda x: x[0], exp_chunk_pairs)).unique())
    for mod in modalities:
        if not os.path.exists('%s/id_%s/%s/' % (param.FEATURE_PATH, profile_id, mod)):
            os.makedirs('%s/id_%s/%s/' % (param.FEATURE_PATH, profile_id, mod))

    exp_chunk_map = {}
    for exp_id, chunk_id in exp_chunk_pairs:
        exp_chunk_map.setdefault(exp_id, []).append(chunk_id)

    ### for removing files ###
    # for mod in modalities:
    #     file_list = os.listdir('data_set/features/id_%s/%s/' % (profile_id, mod))
    #     for file_name in file_list:
    #         os.remove('data_set/features/id_%s/%s/%s' % (profile_id, mod, file_name))

    mod_logs = {}
    for mod in modalities:
        mod_logs[mod] = loader.load_mod_logs(profile_id, mod, permission_free=permission_free,
                                             filtered=filtered, server_index=server_index)
        print("%s logs are loaded" % mod)

    for exp_id in exp_ids[100:]:
        mod_features = {}
        for mod in modalities:
            if info.MOD_FREQ_TYPE[mod] in (info.FREQ_HIGH, info.FREQ_EVENT_DRIVEN):
                exp_features = loader.read_csv_chunk_features(profile_id, mod, exp_id) if os.path.isfile(
                        '%s/id_%s/%s/exp_%s.csv' % (param.FEATURE_PATH, profile_id, mod, exp_id)) else None
                mod_features[mod] = exp_features

        for chunk_id in exp_chunk_map[exp_id]:
            conditioning_info = engineer.get_ready_for_conditioning(profile_id, (exp_id, chunk_id),
                                                                    permission_free, unit_time, filtered=filtered,
                                                                    server_index=server_index)
            for mod in mod_features.keys():
                if mod_features[mod] is not None:
                    unique_chunk_ids = pd.Series(mod_features[mod].columns).unique()
                    if len(unique_chunk_ids) < len(mod_features[mod].columns):
                        unique_chunk_ids = list(unique_chunk_ids)
                        unique_chunk_ids.sort()
                        mod_features[mod] = mod_features[mod].loc[:, unique_chunk_ids]
                        mod_features[mod].to_csv(
                                "%s/id_%s/%s/exp_%s.csv" % (param.FEATURE_PATH, profile_id, mod, exp_id))

                    if chunk_id in mod_features[mod].columns:
                        print("\t\t%s, %s, %s already done." % (exp_id, chunk_id, mod))
                        continue
                print("\t\t%s, %s, %s" % (exp_id, chunk_id, mod))

                log_df = mod_logs[mod]
                data_df = pc.make_samples_from_logs(log_df.query('chunk_id == %s' % chunk_id), unit_time=unit_time)

                field_index, feature_index, values \
                    = engineer.extract_feats_n_total_heads(data_df, mod, conditioning_info=conditioning_info,
                                                           filtered=filtered)
                feature_df = pd.DataFrame(values, index=[field_index, feature_index], columns=['value'])
                feature_df.index.names = ['field', 'feature']

                feature_df.loc[('info', 'profile_id'), :] = profile_id
                feature_df.loc[('info', 'exp_id'), :] = exp_id
                feature_df.loc[('info', 'chunk_id'), :] = chunk_id
                if data_df is not None:
                    feature_df.loc[('info', 'count'), :] = len(data_df)
                    if len(data_df) > 0:
                        feature_df.loc[('info', 'duration'), :] = float(
                                (data_df.index[-1] - data_df.index[0]).value) / 1e9  # ns -> s
                    else:
                        feature_df.loc[('info', 'duration'), :] = 0.0
                else:
                    feature_df.loc[('info', 'count'), :] = 0.0
                    feature_df.loc[('info', 'duration'), :] = 0.0

                if mod_features[mod] is None:
                    mod_features[mod] = DataFrame(index=feature_df.index, columns=[])
                    mod_features[mod].columns.name = 'chunk'
                mod_features[mod][chunk_id] = feature_df['value']
                mod_features[mod].to_csv("%s/id_%s/%s/exp_%s.csv" % (param.FEATURE_PATH, profile_id, mod, exp_id))
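The speed-up comes from the load-once, slice-per-chunk pattern; a minimal sketch of that pattern in isolation (the ids are made up, the chunk_id column name comes from the query string above):

# Hypothetical: query the DB once per modality, then slice per chunk in memory.
logs = loader.load_mod_logs(8, info.APP, permission_free=True, filtered=True, server_index=2)
chunk_logs = logs.query('chunk_id == %s' % 42)
samples = pc.make_samples_from_logs(chunk_logs, unit_time=param.UNIT_TIME)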