Example #1
0
def insert_co_exp_ids(profile_ids, modalities, db_con_1, db_con_2):
    """
    Scan high-frequency-type modalities and extract the experiment ids that
    exist in every one of them, then insert the not-yet-inserted ids into
    the `co_exp_ids` table.

    :param profile_ids: iterable of user profile ids to process
    :param modalities: iterable of modality names; only those whose
        info.MOD_FREQ_TYPE is info.FREQ_HIGH are considered
    :param db_con_1: DB connection used to read per-modality exp ids
    :param db_con_2: DB connection used to read already-done ids and to
        write new rows into `co_exp_ids`
    """
    print('\twriting co_exp_ids for sensor data')
    # The high-frequency modality set does not depend on the profile, so
    # compute it once.  A list comprehension (not filter) keeps this a
    # reusable, re-iterable list on Python 3 as well.
    high_interval_mods = [m for m in modalities
                          if info.MOD_FREQ_TYPE[m] == info.FREQ_HIGH]
    for profile_id in profile_ids:
        # One single-column frame per modality, indexed by exp id; the
        # cell value (0) is a dummy, only the index matters.
        per_mod_frames = []
        for mod in high_interval_mods:
            exp_ids = loader.load_exp_ids(profile_id, mod, filtered=False,
                                          server_index=1, db_con=db_con_1, close=False)
            if len(exp_ids) > 0:
                per_mod_frames.append(
                    pd.DataFrame([0] * len(exp_ids), index=exp_ids, columns=[mod]))

        if not per_mod_frames:
            # pd.concat raises ValueError on an empty list; there is simply
            # nothing to insert for this user.
            print('\t\tno exp ids found for user %s' % profile_id)
            continue

        # Outer-join on exp id; rows with any NaN are missing from at least
        # one modality, so dropna() keeps only the mutually existent ids.
        joined = pd.concat(per_mod_frames, axis=1)
        co_exp_ids = sorted(joined.dropna().index)

        done_ids = loader.load_co_exp_ids(profile_id, db_con=db_con_2, close=False)
        # List comprehension (not filter) so len() below works on Python 3.
        co_exp_ids = [x for x in co_exp_ids if x not in done_ids]
        if len(co_exp_ids) == 0:
            print('%s all co_exp_ids are already inserted!' % profile_id)
            continue

        df = DataFrame(co_exp_ids, columns=['expId'])
        df['profile_id'] = profile_id
        # NOTE(review): flavor= was removed in pandas >= 0.23; kept for
        # consistency with the rest of this file's pandas version.
        df.to_sql("co_exp_ids", db_con_2, flavor='mysql', if_exists='append', index=False)
        print('\t\t%s number of exp ids of user %s are successfully inserted!' % (len(df), profile_id))
Example #2
0
def insert_time_line_filtered_logs(profile_id, modality, permission_free=True, server_index=1):
    """
    Read the CSV file of mutual time lines and filter the original logs,
    passing only the rows that fall inside those time lines.

    Passed logs are inserted into the table named modality + "_filtered"
    on the DB.  Exp ids already present in that table are skipped, so
    re-running this method does not duplicate data.

    :param profile_id: user profile id whose logs are filtered
    :param modality: modality name; also determines the target table
    :param permission_free: forwarded to loader.load_mod_logs
    :param server_index: forwarded to loader.load_exp_ids
    """
    print('\tNow inserting %s filtered logs.' % modality)
    if not os.path.isfile('%s/sensitive_timeline%s.csv' % (param.TIME_PATH, profile_id)):
        print('\t\ttimeline file of user %s does not exist!' % profile_id)
        return

    already_done_ids = loader.load_exp_ids(profile_id, modality, filtered=True, server_index=server_index)
    timeline_loaded = loader.read_csv_time_line(profile_id)
    if timeline_loaded is None or len(timeline_loaded) == 0:
        print('\t\ttimeline file of user %s is empty!' % profile_id)
        return

    ids_to_do = list(timeline_loaded['exp_id'].unique())
    if already_done_ids is not None and len(already_done_ids) > 0:
        # List comprehension (not filter) so len() below works on Python 3,
        # where filter returns a lazy iterator.
        ids_to_do = [x for x in ids_to_do if x not in already_done_ids]
    if len(ids_to_do) == 0:
        print('\t\tAll exp ids of user %s are already done~. Nothing to do!' % profile_id)
        return

    mysql_con = mdb.connect(info.HOST_2, info.ID, info.PWD, info.DB_NAME_2)
    try:
        for id_to_do in ids_to_do:
            id_timelines = timeline_loaded.query('exp_id == %s' % id_to_do)
            log_df = loader.load_mod_logs(profile_id, modality, exp_id=id_to_do, permission_free=permission_free)
            # Collect the per-chunk selections and concat once at the end
            # (DataFrame.append was removed in pandas 2.0).
            selected_frames = []
            for i in id_timelines.index:
                try:
                    timeline = id_timelines.loc[i]
                    selected_df = log_df.query('"%s" <= time_stamp <= "%s"'
                                               % (timeline['start_time'], timeline['end_time']))
                    start_df = log_df.query('time_stamp == "%s"' % timeline['start_time'])
                    end_df = log_df.query('time_stamp == "%s"' % timeline['end_time'])
                    if len(start_df) == 0:
                        # No log exactly at the chunk start: synthesize a
                        # boundary row (exp id + NaNs) and move it to the front
                        # so the chunk still starts at start_time.
                        try:
                            selected_df.loc[timeline['start_time']] \
                                = pd.Series([id_to_do] + [np.nan] * (len(log_df.columns) - 1), index=log_df.columns)
                        except ValueError:
                            sys.exit(-1)
                        selected_df = pd.concat([selected_df.iloc[[-1]], selected_df.iloc[:-1, :]])
                    if len(end_df) == 0:
                        # Same boundary trick for the chunk end; appending at
                        # the tail already leaves it in the right position.
                        selected_df.loc[timeline['end_time']] \
                            = pd.Series([id_to_do] + [np.nan] * (len(log_df.columns) - 1), index=log_df.columns)
                    selected_df.loc[:, 'chunk_id'] = i
                    selected_frames.append(selected_df)
                except IndexError:
                    print('why? IndexError??')
                    sys.exit(-1)

            total_selected_df = pd.concat(selected_frames) if selected_frames else None
            if total_selected_df is not None and len(total_selected_df) > 0:
                # Rename exp_id / chunk_id to the DB's camel-case column names
                # (expId / chunkId); all other columns are kept as-is.
                column_list = list(map(lambda x: x.split('_')[0] + 'Id' if x == 'exp_id' or x == 'chunk_id' else x,
                                       total_selected_df.columns))
                total_selected_df.columns = column_list
                total_selected_df.loc[:, 'profile_id'] = profile_id
                total_selected_df.index.name = 'time_stamp'
                total_selected_df = total_selected_df.reset_index()
                try:
                    # Convert timestamps to Unix epoch seconds.  A list
                    # comprehension (not map) so a real list is assigned into
                    # the column on Python 3 as well.
                    total_selected_df.loc[:, 'time_stamp'] \
                        = [td.total_seconds() for td in
                           list(total_selected_df['time_stamp'] - datetime.datetime(1970, 1, 1))]
                except KeyError:
                    print('why KeyError?')
                    sys.exit(-1)
                # NOTE(review): flavor= was removed in pandas >= 0.23; kept for
                # consistency with the rest of this file's pandas version.
                total_selected_df.to_sql(modality + "_filtered", mysql_con, flavor='mysql', if_exists='append', index=False)
                print('\t\t%s number of logs of exp id %s of user %s are successfully inserted!'
                      % (len(total_selected_df), id_to_do, profile_id))
    finally:
        # The connection was previously leaked; always release it, even when
        # an exception (or sys.exit) fires mid-loop.
        mysql_con.close()