示例#1
0
def _period_to_epoch_seconds(moment: dict) -> float:
    """Convert a {'year','month','day','hour','minute','second'} dict to
    seconds since the epoch, interpreted in the machine's local time zone
    (time.mktime semantics)."""
    dt = datetime.datetime(moment['year'], moment['month'], moment['day'],
                           moment['hour'], moment['minute'], moment['second'])
    return time.mktime(dt.timetuple())


def get_home(data: pd.DataFrame, periods: dict) -> tuple:
    """Infer a participant's home location from raw location rows.

    :param data: frame holding at least 'timestamp', 'double_latitude' and
                 'double_longitude' columns (AWARE schema names; renamed
                 below to what the library expects)
    :param periods: sequence of {'start': {...}, 'end': {...}} dicts whose
                    sub-dicts carry year/month/day/hour/minute/second ints;
                    location data is only considered within these periods
    :return: whatever infer_home returns -- presumably a (latitude,
             longitude) pair; TODO confirm against the library
    """
    # NOTE this is to match the names library expects. Ideally library should work with the
    #      same names as the AWARE database schema
    data.rename(index=str,
                columns={
                    'double_latitude': 'latitude',
                    'double_longitude': 'longitude'
                },
                inplace=True)

    # NOTE this is to imitate data in CMU's use of the library and mainly because
    #      there has been cases (e.g. screen) where columns have been accessed by
    #      their position and not their name
    data = data[['timestamp', 'latitude', 'longitude']]

    # NOTE this is again to imitate data in CMU's use of the library and out of
    #      caution. I do not know if being sorted is assumed by the library.
    # Reassigning instead of sorting inplace: 'data' is a column slice at
    # this point, so an in-place sort would trigger pandas' chained-assignment
    # warning and might silently not take effect.
    data = data.sort_values(by='timestamp', ascending=True)

    convert_timezone(data, 'US/Pacific', {'timestamp': 'time'})
    data.set_index("time", inplace=True)
    data = data.tz_localize(None)
    # QUESTION why to first set timezone as UTC and then change it back?
    #          why not setting it as local time zone from the beginning?

    # np.empty (rather than calling the np.ndarray constructor directly) is
    # the conventional way to allocate an uninitialized array; every cell is
    # overwritten in the loop below.
    periodranges = np.empty(shape=(len(periods), 2), dtype=np.int64)
    for index, period in enumerate(periods):
        periodranges[index, 0] = _period_to_epoch_seconds(period['start'])
        periodranges[index, 1] = _period_to_epoch_seconds(period['end'])
    nightranges = getDaywiseSplitsForEpoch("night")
    home_location = infer_home(data, periodranges, nightranges)

    return home_location
示例#2
0
def extract_activity(data, *args):
    """
    Extract physical-activity features per (epoch, weekday, grouping).

    :param data: a pandas data frame holding the raw but cleaned data for
                 feature extraction; must contain 'timestamp',
                 'activity_type' and 'activity_name' columns
    :return: a dictionary of (epoch, weekdays, grouping):dataframe of
             features, or None when there is no input data or no features
             were extracted
    """

    # TO-DO consider moving these to extract and pass it as arguments
    # ideally these are set when instantiating from FeatureExtraction class
    EPOCHlist = ["allday", "night", "morning", "afternoon", "evening"]
    weekdays = ["", "wkdy", "wkend"]
    GROUPING_FUNCTIONS = {
        "day": groupby_day,
        #    "week": groupby_week, # not of interest to UWEXP
        #    "half_sem": groupby_half_sem, # not of interest to UWEXP
        #    "sem": groupby_all_sem, # TO-DO address the grouping bug
        #    "ema": groupby_ema # TO-DO test
    }

    if data.shape[0] == 0:
        print('no data to extract features from')
        return None

    data = data[['timestamp', 'activity_type', 'activity_name']]
    # NOTE this is to imitate data in CMU's use of the library and mainly because
    #      there has been cases (e.g. screen) where columns have been accessed by
    #      their position and not their name

    # Reassigning instead of sorting inplace: 'data' is a slice of the
    # caller's frame at this point, so an in-place sort would raise pandas'
    # chained-assignment warning and might silently not stick.
    data = data.sort_values(by=['timestamp'], ascending=True)
    # NOTE this is needed because of change calculations below
    # QUESTION: why do we consider unknown activities in feature calculations?
    #           shouldn't we remove them?

    # TO-DO consider moving this to extract if it is applicable to all sensorts
    convert_timezone(data, 'US/Pacific', {'timestamp': 'time'})
    data.set_index("time", inplace=True)
    data = data.tz_localize(None)
    # QUESTION why to first set timezone as UTC and then change it back?
    #          why not setting it as local time zone from the beginning?

    results_out = {}
    for epoch in EPOCHlist:

        if epoch == "allday":
            epoch_data = data.copy()
            # NOTE important to use a copy of data because it is changed in the loop
        else:
            timeranges = getDaywiseSplitsForEpoch(epoch)
            epoch_filter = timerange_filter(data, timeranges * 1000)
            # .copy() because a 'changed' column is assigned below; writing
            # into a boolean-filtered view would be chained assignment
            epoch_data = data[epoch_filter].copy()

        if epoch_data.shape[0] == 0:
            print('no data for epoch {}'.format(epoch))
            continue

        # mark rows where the activity type differs from the previous row,
        # i.e. the activity changed (first row always counts as a change)
        epoch_data['changed'] = (epoch_data['activity_type'] !=
                                 epoch_data['activity_type'].shift())
        for weekday in weekdays:
            data_wk, weekday_suffix = getDataFromDayOfTheWeek(
                epoch_data, weekday)

            if (data_wk is None) or len(data_wk) == 0:
                if weekday == '':
                    weekday_t = 'wk'
                else:
                    weekday_t = weekday
                print('no data for weekday type {}'.format(weekday_t))
                continue

            for gkey, gfunc in GROUPING_FUNCTIONS.items():
                results = all_groups_flexible(data_wk,
                                              ACTIVITY_APPLY['no_args'],
                                              ACTIVITY_APPLY['with_args'], [],
                                              None, {gkey: gfunc})
                results_out[(epoch, weekday, gkey)] = formatResultsFor1Table(
                    None, results, epoch, weekday)
                # NOTE: the first argument of formatResultsFor1Table is not used in it
                #       I pass None to hint that

    # NOTE: if no features are extracted return None
    if len(results_out) == 0:
        print("no features extracted")
        results_out = None

    return results_out
示例#3
0
def extract_audio(data, *args):
    """
    Extract conversation/audio features per (epoch, weekday, grouping).

    :param data: a pandas data frame holding the raw but cleaned data for
                 feature extraction; must contain 'timestamp', 'inference',
                 'double_energy', 'double_convo_start' and 'double_convo_end'
                 columns (AWARE schema names)
    :return: a dictionary of (epoch, weekdays, grouping):dataframe of
             features; None when there is no input data or no features
             were extracted
    """

    # TO-DO consider moving these to extract and pass it as arguments
    # ideally these are set when instantiating from FeatureExtraction class
    EPOCHlist = ["allday", "night", "morning", "afternoon", "evening"]
    weekdays = ["", "wkdy", "wkend"]
    GROUPING_FUNCTIONS = {
        "day": groupby_day,
        # NOTE(review): "week" and "half_sem" are enabled here but are
        # disabled (as "not of interest to UWEXP") in the other extractors
        # of this module -- confirm this difference is intended
        "week": groupby_week,
        "half_sem": groupby_half_sem,
        #    "sem": groupby_all_sem, # TO-DO address the grouping bug
        #    "ema": groupby_ema # TO-DO test
    }

    if data.shape[0] == 0:
        print('no data to extract features from')
        return None

    data.rename(index=str,
                columns={
                    'double_energy': 'energy',
                    'double_convo_start': 'convo_start',
                    'double_convo_end': 'convo_end'
                },
                inplace=True)
    # TO-DO consider changing this. if the library is developed to work with data from AWARE
    #       it is important to use the same column names as that of AWARE tables

    # NOTE this is to imitate data in CMU's use of the library and mainly because
    #      there has been cases (e.g. screen) where columns have been accessed by
    #      their position and not their name
    data = data[[
        'timestamp', 'inference', 'energy', 'convo_start', 'convo_end'
    ]]

    # TO-DO consider moving this to extract if it is applicable to all sensorts
    convert_timezone(data, 'US/Pacific', {'timestamp': 'time'})
    data.set_index("time", inplace=True)
    data = data.tz_localize(None)
    # QUESTION why to first set timezone as UTC and then change it back?
    #          why not setting it as local time zone from the beginning?

    results_out = {}
    for epoch in EPOCHlist:

        if epoch == "allday":
            epoch_data = data.copy()
            # NOTE important to use a copy of data because it is changed in the loop
        else:
            # keep only rows falling inside this epoch's daily time windows
            # (timeranges are in seconds; *1000 converts to ms timestamps)
            timeranges = getDaywiseSplitsForEpoch(epoch)
            epoch_filter = timerange_filter(data, timeranges * 1000)
            epoch_data = data[epoch_filter]

        if epoch_data.shape[0] == 0:
            print('no data for epoch {}'.format(epoch))
            continue

        # YSS Debugging
        #temp = resampleseparatedays_sec(epoch_data, AUDIO_SAMPLE_RATE_IN_SECONDS)
        #print('original: zero convo start / size = {} / {}'.format(epoch_data[epoch_data['convo_start'] != 0].shape[0], epoch_data.shape[0]))
        #print('resampled: zero convo start / size = {} / {}'.format(temp[temp['convo_start'] != 0].shape[0], temp.shape[0]))

        #epoch_data = resampleseparatedays_sec(epoch_data, AUDIO_SAMPLE_RATE_IN_SECONDS)
        # NOTE resampling messes up with the data in such a way that no record with non-zero
        #      convo_start remains -- hence resampling is deliberately disabled here
        for weekday in weekdays:
            data_wk, weekday_suffix = getDataFromDayOfTheWeek(
                epoch_data, weekday)

            if (data_wk is None) or len(data_wk) == 0:
                # '' means "all days of the week"; report it as 'wk'
                if weekday == '':
                    weekday_t = 'wk'
                else:
                    weekday_t = weekday
                print('no data for weekday type {}'.format(weekday_t))
                continue

            for gkey, gfunc in GROUPING_FUNCTIONS.items():
                results = all_groups_flexible(data_wk, AUDIO_APPLY['no_args'],
                                              AUDIO_APPLY['with_args'], [],
                                              None, {gkey: gfunc})
                results_out[(epoch, weekday, gkey)] = formatResultsFor1Table(
                    None, results, epoch, weekday)
                # NOTE: the first argument of formatResultsFor1Table is not used in it
                #       I pass None to hint that

    # NOTE: if no features are extracted return None
    if len(results_out) == 0:
        print("no features extracted")
        results_out = None

    return results_out
示例#4
0
def extract_location(data, *args):
    """
    Extract location features (global clusters, local per-day clusters, and
    home-relative features) per (epoch, weekday, grouping).

    :param data: a pandas data frame holding the raw but cleaned data for
                 feature extraction; must contain 'timestamp',
                 'double_latitude' and 'double_longitude' columns
    :param args: args[0] is a config dict providing 'home_location_file'
                 (path to a JSON cache of per-participant home locations)
                 and 'on_site_periods' (list of start/end date-time dicts);
                 args[1] is the participant id keying the home cache
    :return: a dictionary of (epoch, weekdays, grouping):dataframe of
             features; None when there is no data, no home location, or
             no features were extracted
    """

    # TO-DO consider moving these to extract and pass it as arguments
    # ideally these are set when instantiating from FeatureExtraction class
    EPOCHlist = [
        "allday",
        "night",
        "morning",
        "afternoon",
        "evening"
    ]
    weekdays = [
        "",
        "wkdy",
        "wkend"
    ]
    GROUPING_FUNCTIONS = {
        "day": groupby_day,
    #    "week": groupby_week, # not of interest to UWEXP
    #    "half_sem": groupby_half_sem, # not of interest to UWEXP
    #    "sem": groupby_all_sem, # TO-DO address the grouping bug
    #    "ema": groupby_ema # TO-DO test
    }

    if data.shape[0] == 0:
        print('no data to extract features from')
        return None

    # NOTE this is to match the names library expects. Ideally library should work with the
    #      same names as the AWARE database schema
    data.rename(index=str, 
                columns={'double_latitude': 'latitude', 'double_longitude': 'longitude'},
                inplace=True)

    # NOTE this is to imitate data in CMU's use of the library and mainly because
    #      there has been cases (e.g. screen) where columns have been accessed by
    #      their position and not their name
    data = data[['timestamp', 'latitude', 'longitude']]

    # NOTE this is again to imitation data in CMU's used of the library and out of
    #      caution. I do not know if being sorted is assumed by the library.
    data.sort_values(by='timestamp', ascending=True, inplace=True)

    # TO-DO consider moving this to extract if it is applicable to all sensorts
    convert_timezone(data, 'US/Pacific', {'timestamp':'time'})
    data.set_index("time", inplace=True)
    data = data.tz_localize(None)
    # QUESTION why to first set timezone as UTC and then change it back?
    #          why not setting it as local time zone from the beginning?

    # get the home locations (JSON cache keyed by participant id)
    with open(args[0]['home_location_file'], 'r') as fileObj:
        home_locations = json.load(fileObj)
    
    pid = args[1]
    
    # get the home location if it does not exist for pid
    # NOTE(review): the inferred home is only stored in this in-memory dict;
    #               it is never written back to home_location_file here --
    #               confirm persistence happens elsewhere, otherwise the
    #               inference is repeated on every run
    if pid not in home_locations:
        # get the period of time to use location data within it
        periods = args[0]['on_site_periods']
        periodranges = np.ndarray(shape=(len(periods), 2), dtype=np.int64)
        for index, period in enumerate(periods):
            # convert the start/end date-time dicts to epoch seconds in the
            # machine's local time zone (time.mktime semantics)
            start = period['start']
            start = datetime.datetime(start['year'], 
                                      start['month'], 
                                      start['day'], 
                                      start['hour'], 
                                      start['minute'], 
                                      start['second'])
            start = time.mktime(start.timetuple())
            end = period['end']
            end = datetime.datetime(end['year'],
                                    end['month'], 
                                    end['day'], 
                                    end['hour'], 
                                    end['minute'], 
                                    end['second'])
            end = time.mktime(end.timetuple())
            periodranges[index, 0] = start
            periodranges[index, 1] = end
        nightranges = getDaywiseSplitsForEpoch("night")
        home_locations[pid] = infer_home(data, periodranges, nightranges)

    # assumes home_locations[pid] is indexable with LATITUDE_INDEX /
    # LONGITUDE_INDEX (presumably a [latitude, longitude] pair) -- TODO confirm
    homelatlong = (home_locations[pid][LATITUDE_INDEX], 
                   home_locations[pid][LONGITUDE_INDEX])

    # NOTE(review): bitwise '|' on booleans works but 'or' is the idiomatic
    #               operator for this check
    if((homelatlong[LATITUDE_INDEX] is None) | (homelatlong[LONGITUDE_INDEX] is None)):
        print('home location does not exist')
        return None

    results_out = {}
    for epoch in EPOCHlist:
        
        if epoch == "allday":
            epoch_data = data.copy()
            # NOTE importatnt to use a copy of data because it is changed in the loop
        else:
            # keep only rows falling inside this epoch's daily time windows
            # (timeranges are in seconds; *1000 converts to ms timestamps)
            timeranges = getDaywiseSplitsForEpoch(epoch)
            epoch_filter = timerange_filter(data, timeranges * 1000)
            epoch_data = data[epoch_filter]
            
        if epoch_data.shape[0] == 0:
            print('no data for epoch {}'.format(epoch))
            continue

        #print('original: zero lat / size = {} / {}'.format(epoch_data[epoch_data['latitude'] != 0].shape[0], epoch_data.shape[0])) # YSS Debugging
        #print('original: zero long / size = {} / {}'.format(epoch_data[epoch_data['longitude'] != 0].shape[0], epoch_data.shape[0])) # YSS Debugging
        epoch_data = resampleseparatedays_min(epoch_data, LOCATION_SAMPLE_RATE_IN_MINUTES)
        #print('resampled: zero lat / size = {} / {}'.format(epoch_data[epoch_data['latitude'] != 0].shape[0], epoch_data.shape[0])) # YSS Debugging
        #print('resampled: zero long / size = {} / {}'.format(epoch_data[epoch_data['longitude'] != 0].shape[0], epoch_data.shape[0])) # YSS Debugging
        # keep an unclustered copy for the local (per-group) clustering path
        epoch_data_local = epoch_data.copy()
        epoch_data = cluster_and_label_with_moving(epoch_data, eps=distance_to_degrees(10), min_samples=10)
        # TO-DO figure out what distance_to_degree and min_samples are and why the specific values 
        # above (e.g. 10 and 10) are used 
        for weekday in weekdays:
            data_wk, weekday_suffix = getDataFromDayOfTheWeek(epoch_data, weekday)
            data_local_wk, weekday_suffix = getDataFromDayOfTheWeek(epoch_data_local, weekday)

            # NOTE(review): only data_wk is checked here; data_local_wk is
            #               assumed to be non-empty whenever data_wk is --
            #               plausible since both derive from the same rows,
            #               but worth confirming
            if (data_wk is None) or len(data_wk) == 0:
                if weekday == '':
                    weekday_t = 'wk'
                else:
                    weekday_t = weekday
                print('no data for weekday type {}'.format(weekday_t))
                continue

            for gkey, gfunc in GROUPING_FUNCTIONS.items():
                # single extra-args tuple: the home coordinates, consumed by
                # the 'with_args' feature functions
                argstuplelist = [(homelatlong,)]
                results = all_groups_flexible(data_wk, 
                                              LOCATION_APPLY['no_args'] + LOCATION_APPLY['no_args_global'], 
                                              LOCATION_APPLY['with_args'] + LOCATION_APPLY['with_args_global'],
                                              argstuplelist, 
                                              None, 
                                              {gkey:gfunc})
                results_local = getResultFromLocalClusters(data_local_wk, 
                                                           {gkey:gfunc}, 
                                                           LOCATION_APPLY['no_args'] + LOCATION_APPLY['no_args_local'], 
                                                           LOCATION_APPLY['with_args'] + LOCATION_APPLY['with_args_local'], 
                                                           argstuplelist)
                # TO-DO needs investigations; copied and pasted from CMU code
                # Concatenate global and local
                for k in results.keys():
                    results[k] = pd.concat([results[k], results_local[k]], axis = 1)
                # stupid issue with nan and inf values
                for k in results.keys():
                    results[k] = results[k].replace([np.inf, -np.inf], np.nan)
                    results[k] = results[k].astype(object).where(pd.notnull(results[k]), None)
                results_out[(epoch, weekday, gkey)] = formatResultsFor1Table(None, 
                                                                             results, 
                                                                             epoch, 
                                                                             weekday)
                # NOTE: the first argument of formatResultsFor1Table is not used in it
                #       I pass None to hint that
                
    # NOTE: if no features are extracted return None
    if len(results_out) == 0:
        print("no features extracted")
        results_out = None

    return results_out
示例#5
0
def extract_screen(data, *args):
    """
    Extract screen-usage features per (epoch, weekday, grouping).

    :param data: a pandas data frame holding the raw but cleaned data for
                 feature extraction; must contain 'timestamp' and
                 'screen_status' columns
    :return: a dictionary of (epoch, weekdays, grouping):dataframe of
             features; None when there is no input data or no features
             were extracted

    NOTE(review): unlike the other extractors, this one also passes data
    from the *previous* epoch to the library (all_groups_flexible_with_prev),
    presumably so sessions spanning an epoch boundary are handled -- the
    original author's QUESTION below shows this is unconfirmed.
    """

    # TO-DO consider moving these to extract and pass it as arguments
    # ideally these are set when instantiating from FeatureExtraction class
    EPOCHlist = ["allday", "night", "morning", "afternoon", "evening"]
    weekdays = ["", "wkdy", "wkend"]
    GROUPING_FUNCTIONS = {
        "day": groupby_day,
        #    "week": groupby_week, # not of interest to UWEXP
        #    "half_sem": groupby_half_sem, # not of interest to UWEXP
        #    "sem": groupby_all_sem, # TO-DO address the grouping bug
        #    "ema": groupby_ema # TO-DO test
    }

    if data.shape[0] == 0:
        print('no data to extract features from')
        return None

    data = data[['timestamp', 'screen_status']]
    # NOTE: this is to exactly follow CMU's query for screen data
    #       unfortunately the library functions use the schema of query result table
    #       in the most disappointing way: they assume columns in the the same order
    #       as the query result table and index on positions of those columns instead
    #       of using the column names. Terrible!

    # TO-DO consider moving this to extract if it is applicable to all sensorts
    convert_timezone(data, 'US/Pacific', {'timestamp': 'time'})
    data.set_index("time", inplace=True)
    data = data.tz_localize(None)
    # QUESTION why to first set timezone as UTC and then change it back?
    #          why not setting it as local time zone from the beginning?

    results_out = {}
    for epoch in EPOCHlist:

        if epoch == "allday":
            epoch_data = data.copy()
            # NOTE it is not necessary to use a copy of data because no change is
            #      applied to it within the loop. I use a copy here for the sake of
            #      consistency with other cases where use of copy is important for
            #      correct calculation of features in the next iteration of the loop.
            # for "allday" there is no preceding epoch, so the whole data set
            # doubles as the "previous" data
            prev_epoch_data = data.copy()
        else:
            # rows inside this epoch's daily time windows (timeranges are in
            # seconds; *1000 converts to ms timestamps)
            timeranges = getDaywiseSplitsForEpoch(epoch)
            epoch_filter = timerange_filter(data, timeranges * 1000)
            epoch_data = data[epoch_filter]

            # rows inside the immediately preceding epoch's windows
            timeranges = getDaywiseSplitsForEpoch(getPrevEpoch(epoch))
            epoch_filter = timerange_filter(data, timeranges * 1000)
            prev_epoch_data = data[epoch_filter]

        # QUESTION: I don't understand what prev_epock_data is used for

        if epoch_data.shape[0] == 0:
            print('no data for epoch {}'.format(epoch))
            continue

        for weekday in weekdays:
            data_wk, weekday_suffix = getDataFromDayOfTheWeek(
                epoch_data, weekday)

            if (data_wk is None) or len(data_wk) == 0:
                # '' means "all days of the week"; report it as 'wk'
                if weekday == '':
                    weekday_t = 'wk'
                else:
                    weekday_t = weekday
                print('no data for weekday type {}'.format(weekday_t))
                continue

            # start/end timedeltas delimiting the epoch within a day
            s, e = getEpochStartEndTimedelta(epoch)

            for gkey, gfunc in GROUPING_FUNCTIONS.items():
                # daily groupings use the *_daily feature sets; every other
                # grouping uses the *_multi sets
                if gkey == "day":
                    no_args = SCREEN_APPLY['no_args'] + SCREEN_APPLY[
                        'no_args_daily']
                    with_args = SCREEN_APPLY['with_args'] + SCREEN_APPLY[
                        'with_args_daily']
                else:
                    no_args = SCREEN_APPLY['no_args'] + SCREEN_APPLY[
                        'no_args_multi']
                    with_args = SCREEN_APPLY['with_args'] + SCREEN_APPLY[
                        'with_args_multi']
                # one (start, end) tuple per with_args function -- the lists
                # are paired positionally by the library
                argstuplelist = [(s, e)] * len(with_args)
                results = all_groups_flexible_with_prev(
                    data_wk, prev_epoch_data, True, no_args, with_args,
                    argstuplelist, None, {gkey: gfunc})
                results_out[(epoch, weekday, gkey)] = formatResultsFor1Table(
                    None, results, epoch, weekday)
                # NOTE: the first argument of formatResultsFor1Table is not used in it
                #       I pass None to hint that

    # NOTE: if no features are extracted return None
    if len(results_out) == 0:
        print("no features extracted")
        results_out = None

    return results_out
示例#6
0
def extract_bluetooth(data, *args):
    """
    Extract bluetooth-proximity features per (epoch, weekday, grouping).

    :param data: a pandas data frame holding the raw but cleaned data for
                 feature extraction; must contain 'timestamp', 'bt_address'
                 and 'bt_rssi' columns
    :return: a dictionary of (epoch, weekdays, grouping):dataframe of
             features; None when there is no input data, no address
             frequency data, or no features were extracted
    """

    # TO-DO consider moving these to extract and pass it as arguments
    # ideally these are set when instantiating from FeatureExtraction class
    EPOCHlist = ["allday", "night", "morning", "afternoon", "evening"]
    weekdays = ["", "wkdy", "wkend"]
    GROUPING_FUNCTIONS = {
        "day": groupby_day,
        #    "week": groupby_week, # not of interest to UWEXP
        #    "half_sem": groupby_half_sem, # not of interest to UWEXP
        #    "sem": groupby_all_sem, # TO-DO address the grouping bug
        #    "ema": groupby_ema # TO-DO test
    }

    if data.shape[0] == 0:
        print('no data to extract features from')
        return None

    # NOTE this is to imitate data in CMU's use of the library and mainly because
    #      there has been cases (e.g. screen) where columns have been accessed by
    #      their position and not their name
    data = data[['timestamp', 'bt_address', 'bt_rssi']]

    # TO-DO consider moving this to extract if it is applicable to all sensorts
    convert_timezone(data, 'US/Pacific', {'timestamp': 'time'})
    data.set_index("time", inplace=True)
    data = data.tz_localize(None)
    # QUESTION why to first set timezone as UTC and then change it back?
    #          why not setting it as local time zone from the beginning?

    # per-address scan frequencies, used below to tell the participant's own
    # devices apart from other people's
    baddress_freq_data = badd_data(data)
    if baddress_freq_data is None or len(baddress_freq_data) == 0:
        print("no bluetooth address frequency data (freq data samples = 0)")
        return None
    size = baddress_freq_data.shape[0]
    baddress_freq_data = baddress_freq_data.dropna(
        how='any')  # TO-DO: what is this doing?
    if (baddress_freq_data.shape[0] != size):
        print('removed rows of bluetooth address data with null entries.')
        print('\tfurther investigate as this should not happen.')

    baddress_freq_data, numclust = cluster_address_freq(baddress_freq_data)
    if numclust is not None:
        # BUG FIX: set_index returns a new frame (it was not called with
        # inplace=True), so the original code discarded the result and the
        # call was a no-op; assign it so the frame is actually indexed by
        # bt_address before identifying the participant's own devices
        baddress_freq_data = baddress_freq_data.set_index(['bt_address'])
        own_devices = getOwnDevices(baddress_freq_data)
    else:
        own_devices = [None]

    results_out = {}
    for epoch in EPOCHlist:

        if epoch == "allday":
            epoch_data = data.copy()
            # NOTE it is not necessary to use a copy of data because no change is
            #      applied to it within the loop. I use a copy here for the sake of
            #      consistency with other cases where use of copy is important for
            #      correct calculation of features in the next iteration of the loop.
        else:
            timeranges = getDaywiseSplitsForEpoch(epoch)
            epoch_filter = timerange_filter(data, timeranges * 1000)
            epoch_data = data[epoch_filter]

        if epoch_data.shape[0] == 0:
            print('no data for epoch {}'.format(epoch))
            continue

        # resampling should not be done for battery_charges data as it only stores
        # periods during which the battery charges
        for weekday in weekdays:
            # Get Both/Weekdays/Weekends
            data_wk, weekday_suffix = getDataFromDayOfTheWeek(
                epoch_data, weekday)

            if (data_wk is None) or len(data_wk) == 0:
                if weekday == '':
                    weekday_t = 'wk'
                else:
                    weekday_t = weekday
                print('no data for weekday type {}'.format(weekday_t))
                continue

            for gkey, gfunc in GROUPING_FUNCTIONS.items():
                # one (own_devices,) tuple per with_args function -- lists
                # are paired positionally by the library
                argstuplelist = [(own_devices, )] * len(
                    BLUETOOTH_APPLY['with_args'])
                # NOTE: I don't think we need to pass in a list of tuples
                #       so the following should work too
                # TO-DO test
                #argstuplelist = [own_devices] * len(BLUETOOTH_APPLY['with_args'])
                results = all_groups_flexible(data_wk,
                                              BLUETOOTH_APPLY['no_args'],
                                              BLUETOOTH_APPLY['with_args'],
                                              argstuplelist, None,
                                              {gkey: gfunc})
                results_out[(epoch, weekday, gkey)] = formatResultsFor1Table(
                    None, results, epoch, weekday)
                # NOTE: the first argument of formatResultsFor1Table is not used in it
                #       I pass None to hint that

    # NOTE: if no features are extracted return None
    if len(results_out) == 0:
        print("no features extracted")
        results_out = None

    return results_out
示例#7
0
def extract_app(data, *args):
    """
    Compute application-usage features, one table per (epoch, weekday,
    grouping) combination.

    :param data: a pandas data frame holding the raw but cleaned data for
                 feature extraction; must contain 'timestamp' and
                 'package_name' columns
    :param args: args[0] is a config dict providing 'app_category_file'
                 (JSON mapping used to categorize packages)
    :return: a dictionary of (epoch, weekdays, grouping):dataframe of
             features, or None when nothing could be extracted
    """

    # TO-DO consider moving these to extract and pass it as arguments
    # ideally these are set when instantiating from FeatureExtraction class
    EPOCHlist = ["allday", "night", "morning", "afternoon", "evening"]
    weekdays = ["", "wkdy", "wkend"]
    GROUPING_FUNCTIONS = {
        "day": groupby_day,
        #    "week": groupby_week, # not of interest to UWEXP
        #    "half_sem": groupby_half_sem, # not of interest to UWEXP
        #    "sem": groupby_all_sem, # TO-DO address the grouping bug
        #    "ema": groupby_ema # TO-DO test
    }

    if data.empty:
        print('no data to extract features from')
        return None

    # NOTE I'm specifically not imitating CMU code to only keep the columns
    #      of their query and in the same order as I want to reuse this code
    #      for applications_foreground

    # TO-DO consider moving this to extract if it is applicable to all sensorts
    convert_timezone(data, 'US/Pacific', {'timestamp': 'time'})
    data.set_index("time", inplace=True)
    data = data.tz_localize(None)
    # QUESTION why to first set timezone as UTC and then change it back?
    #          why not setting it as local time zone from the beginning?

    # attach a category to each row based on its package name, using the
    # mapping loaded from the configured category file
    with open(args[0]['app_category_file'], 'r') as fileObj:
        app_categories = json.load(fileObj)

    def categorize(row):
        return assign_category(row['package_name'], app_categories)

    data['package_category'] = data.apply(categorize, axis=1)
    # TO-DO consider implementing this using joins and based on appcategories table

    results_out = {}
    for epoch in EPOCHlist:
        if epoch == "allday":
            # copy only for consistency with extractors that mutate the
            # per-epoch frame inside this loop
            epoch_frame = data.copy()
        else:
            # restrict to this epoch's daily windows (seconds -> ms)
            splits = getDaywiseSplitsForEpoch(epoch)
            epoch_frame = data[timerange_filter(data, splits * 1000)]

        if epoch_frame.empty:
            print('no data for epoch {}'.format(epoch))
            continue

        for weekday in weekdays:
            subset, _suffix = getDataFromDayOfTheWeek(epoch_frame, weekday)

            if subset is None or len(subset) == 0:
                # '' stands for "all days"; report it as 'wk'
                print('no data for weekday type {}'.format(weekday or 'wk'))
                continue

            for gkey, gfunc in GROUPING_FUNCTIONS.items():
                grouped = all_groups_flexible(subset,
                                              APP_FUNCTIONS['no_args'],
                                              APP_FUNCTIONS['with_args'], [],
                                              None, {gkey: gfunc})
                # the first argument of formatResultsFor1Table is unused in
                # it; None is passed to hint at that
                results_out[(epoch, weekday, gkey)] = formatResultsFor1Table(
                    None, grouped, epoch, weekday)

    # hand back None instead of an empty dict when nothing was extracted
    if not results_out:
        print("no features extracted")
        results_out = None

    return results_out
示例#8
0
def extract_call(data, *args):
    """
    Extract call features for every (epoch, weekday, grouping) combination.

    :param data: a pandas data frame holding the raw but cleaned call data
                 (must contain 'timestamp', 'call_type', 'call_duration',
                 'trace' columns) for feature extraction
    :return: a dictionary of (epoch, weekdays, grouping):dataframe of features,
             or None when the input is empty / no features were extracted
    """

    # TO-DO consider moving these to extract and pass it as arguments
    # ideally these are set when instantiating from FeatureExtraction class
    EPOCHlist = [
        "allday",
        "night",
        "morning",
        "afternoon",
        "evening"
    ]
    weekdays = [
        "",
        "wkdy",
        "wkend"
    ]
    GROUPING_FUNCTIONS = {
        "day": groupby_day,
    #    "week": groupby_week, # not of interest to UWEXP
    #    "half_sem": groupby_half_sem, # not of interest to UWEXP
    #    "sem": groupby_all_sem, # TO-DO address the grouping bug
    #    "ema": groupby_ema # TO-DO test
    }

    # guard clause: nothing to do on an empty frame
    if len(data) == 0:
        print('no data to extract features from')
        return None

    # TO-DO load the phone number category information (a.k.a PHC_TABLE in CMU's work)
    #       get the left join of calls in data and category on device_id and trace so
    #       you end up with a new table with the additional column category which is
    #       either None (when category information is unavailable) or is one of family,
    #       friend-in-town, or friend-out-of-town.

    # NOTE  in UW phase I data, iOS traces are UUID of the call session rather than
    #       hashed values of the phone number people called/were called. Given 75% of
    #       participants were iOS users there is very little category information
    #       we can get. Therefore, the feature extraction below is for now implemented
    #       in such a way that does not use category information.


    # NOTE this is to imitate data in CMU's use of the library and mainly because
    #      there has been cases (e.g. screen) where columns have been accessed by
    #      their position and not their name
    data = data[['timestamp', 'call_type', 'call_duration', 'trace']]
    # TO-DO should also keep 'category' column when that data is available

    # TO-DO consider moving this to extract if it is applicable to all sensors
    convert_timezone(data, 'US/Pacific', {'timestamp': 'time'})
    data.set_index("time", inplace=True)
    data = data.tz_localize(None)
    # QUESTION why to first set timezone as UTC and then change it back?
    #          why not setting it as local time zone from the beginning?

    results_out = {}
    for epoch in EPOCHlist:

        if epoch == "allday":
            epoch_data = data.copy()
            # NOTE it is not necessary to use a copy of data because no change is
            #      applied to it within the loop. I use a copy here for the sake of
            #      consistency with other cases where use of copy is important for
            #      correct calculation of features in the next iteration of the loop.
            # TO-DO should get family, friend-in-town, friend-out-of-town
            #       in separate dataframe based on category information
        else:
            # restrict the data to the daywise time ranges of this epoch
            # (ranges are in seconds; timestamps are in milliseconds, hence * 1000)
            timeranges = getDaywiseSplitsForEpoch(epoch)
            epoch_filter = timerange_filter(data, timeranges * 1000)
            epoch_data = data[epoch_filter]
            # TO-DO should get family, friend-in-town, friend-out-of-town
            #       in separate dataframe based on category information

        if len(epoch_data) == 0:
            print('no data for epoch {}'.format(epoch))
            continue

        for weekday in weekdays:
            # the suffix returned alongside the filtered frame is unused here
            data_wk, _ = getDataFromDayOfTheWeek(epoch_data, weekday)
            # TO-DO should similarly get the data_wk information for family,
            #       friends-in-town, and friends-out-of-town

            if data_wk is None or len(data_wk) == 0:
                # '' denotes "whole week"; label it 'wk' in the log message
                print('no data for weekday type {}'.format(weekday or 'wk'))
                continue

            for gkey, gfunc in GROUPING_FUNCTIONS.items():
                results = all_groups_communication(data_wk,
                                                   CALL_APPLY,
                                                   None, [], # family
                                                   None, [], # friends-in-town
                                                   None, [], # friends-out-of-town
                                                   FAMILY_FEATURE_COLUMNS,
                                                   FRIENDSINTOWN_FEATURE_COLUMNS,
                                                   FRIENDSOUTOFTOWN_FEATURE_COLUMNS,
                                                   None,
                                                   {gkey: gfunc})
                results_out[(epoch, weekday, gkey)] = formatResultsFor1Table(None,
                                                                             results,
                                                                             epoch,
                                                                             weekday)
                # NOTE: the first argument of formatResultsFor1Table is not used in it
                #       I pass None to hint that

    # NOTE: if no features are extracted return None
    if len(results_out) == 0:
        print("no features extracted")
        results_out = None

    return results_out