def get_home(data: pd.DataFrame, periods: list) -> tuple:
    """
    Infer a participant's home location from raw GPS samples.

    :param data: data frame with 'timestamp', 'double_latitude' and
                 'double_longitude' columns. NOTE the frame is renamed in
                 place before being narrowed down.
    :param periods: sequence of {'start': {...}, 'end': {...}} dicts (the
                    on-site periods to restrict inference to); each bound
                    holds 'year'/'month'/'day'/'hour'/'minute'/'second'
                    integers in local time.
                    BUG FIX: the annotation previously said ``dict`` —
                    enumerating a dict would yield its keys and
                    ``period['start']`` would fail; the matching code in
                    extract_location builds this from the
                    ``on_site_periods`` JSON list.
    :return: the home location as returned by infer_home
             (presumably a lat/long tuple — see LATITUDE_INDEX /
             LONGITUDE_INDEX usage in extract_location)
    """

    def _to_unix(bound):
        # Convert one {'year', ..., 'second'} dict to a unix timestamp in
        # seconds, interpreted in the machine's local time zone (mktime).
        return time.mktime(
            datetime.datetime(bound['year'], bound['month'], bound['day'],
                              bound['hour'], bound['minute'],
                              bound['second']).timetuple())

    # NOTE this is to match the names the library expects. Ideally the library
    # should work with the same names as the AWARE database schema
    data.rename(index=str,
                columns={
                    'double_latitude': 'latitude',
                    'double_longitude': 'longitude'
                },
                inplace=True)

    # NOTE this is to imitate data in CMU's use of the library and mainly because
    # there have been cases (e.g. screen) where columns have been accessed by
    # their position and not their name.
    # .copy() decouples us from the caller's frame so the in-place sort below
    # is not a pandas chained assignment (SettingWithCopyWarning).
    data = data[['timestamp', 'latitude', 'longitude']].copy()

    # NOTE this is again to imitate data in CMU's use of the library and out of
    # caution. I do not know if being sorted is assumed by the library.
    data.sort_values(by='timestamp', ascending=True, inplace=True)

    convert_timezone(data, 'US/Pacific', {'timestamp': 'time'})
    data.set_index("time", inplace=True)
    data = data.tz_localize(None)
    # QUESTION why first set timezone as UTC and then change it back?
    # why not set it as the local time zone from the beginning?

    # [start, end] unix-second bounds for every on-site period
    # (float from mktime is truncated into the int64 array, as before)
    periodranges = np.ndarray(shape=(len(periods), 2), dtype=np.int64)
    for index, period in enumerate(periods):
        periodranges[index, 0] = _to_unix(period['start'])
        periodranges[index, 1] = _to_unix(period['end'])

    nightranges = getDaywiseSplitsForEpoch("night")
    home_location = infer_home(data, periodranges, nightranges)
    return home_location
def extract_activity(data, *args):
    """
    Extract activity features per (epoch, weekday filter, grouping).

    :param data: a pandas data frame holding the raw but cleaned data for
                 feature extraction; must contain 'timestamp',
                 'activity_type' and 'activity_name' columns
    :param args: unused; kept for signature compatibility with the other
                 extract_* functions
    :return: a dictionary of (epoch, weekdays, grouping):dataframe of
             features, or None when there is no input data or no features
             were extracted
    """
    # TO-DO consider moving these to extract and pass it as arguments
    # ideally these are set when instantiating from FeatureExtraction class
    EPOCHlist = ["allday", "night", "morning", "afternoon", "evening"]
    weekdays = ["", "wkdy", "wkend"]
    GROUPING_FUNCTIONS = {
        "day": groupby_day,
        # "week": groupby_week,          # not of interest to UWEXP
        # "half_sem": groupby_half_sem,  # not of interest to UWEXP
        # "sem": groupby_all_sem,        # TO-DO address the grouping bug
        # "ema": groupby_ema             # TO-DO test
    }

    if data.shape[0] == 0:
        print('no data to extract features from')
        return None

    # NOTE this is to imitate data in CMU's use of the library and mainly because
    # there have been cases (e.g. screen) where columns have been accessed by
    # their position and not their name.
    # FIX: .copy() decouples us from the caller's frame so the in-place sort
    # below is not a pandas chained assignment (SettingWithCopyWarning).
    data = data[['timestamp', 'activity_type', 'activity_name']].copy()

    # NOTE sorting is needed because of the change calculations below
    data.sort_values(by=['timestamp'], ascending=True, inplace=True)

    # QUESTION: why do we consider unknown activities in feature calculations?
    # shouldn't we remove them?

    # TO-DO consider moving this to extract if it is applicable to all sensors
    convert_timezone(data, 'US/Pacific', {'timestamp': 'time'})
    data.set_index("time", inplace=True)
    data = data.tz_localize(None)
    # QUESTION why first set timezone as UTC and then change it back?
    # why not set it as the local time zone from the beginning?

    results_out = {}
    for epoch in EPOCHlist:
        if epoch == "allday":
            # NOTE important to use a copy of data because it is changed
            # ('changed' column added) within the loop
            epoch_data = data.copy()
        else:
            timeranges = getDaywiseSplitsForEpoch(epoch)
            epoch_filter = timerange_filter(data, timeranges * 1000)
            # FIX: .copy() so adding the 'changed' column below writes into an
            # independent frame instead of a boolean-filtered slice
            epoch_data = data[epoch_filter].copy()
        if epoch_data.shape[0] == 0:
            print('no data for epoch {}'.format(epoch))
            continue

        # mark records whose activity type differs from the previous record
        epoch_data['changed'] = (
            epoch_data['activity_type'] != epoch_data['activity_type'].shift())

        for weekday in weekdays:
            data_wk, weekday_suffix = getDataFromDayOfTheWeek(
                epoch_data, weekday)
            if (data_wk is None) or len(data_wk) == 0:
                if weekday == '':
                    weekday_t = 'wk'
                else:
                    weekday_t = weekday
                print('no data for weekday type {}'.format(weekday_t))
                continue
            for gkey, gfunc in GROUPING_FUNCTIONS.items():
                results = all_groups_flexible(data_wk,
                                              ACTIVITY_APPLY['no_args'],
                                              ACTIVITY_APPLY['with_args'], [],
                                              None, {gkey: gfunc})
                # NOTE: the first argument of formatResultsFor1Table is not
                # used in it; I pass None to hint that
                results_out[(epoch, weekday, gkey)] = formatResultsFor1Table(
                    None, results, epoch, weekday)

    # NOTE: if no features are extracted return None
    if len(results_out) == 0:
        print("no features extracted")
        results_out = None
    return results_out
def extract_audio(data, *args):
    """
    Extract audio/conversation features per (epoch, weekday filter, grouping).

    :param data: a pandas data frame holding the raw but cleaned data for
                 feature extraction; must contain 'timestamp', 'inference',
                 'double_energy', 'double_convo_start' and
                 'double_convo_end' columns
    :param args: unused; kept for signature compatibility with the other
                 extract_* functions
    :return: a dictionary of (epoch, weekdays, grouping):dataframe of
             features, or None when there is no input data or no features
             were extracted
    """
    # TO-DO consider moving these to extract and pass it as arguments
    # ideally these are set when instantiating from FeatureExtraction class
    EPOCHlist = ["allday", "night", "morning", "afternoon", "evening"]
    weekdays = ["", "wkdy", "wkend"]
    # CONSISTENCY FIX: "week" and "half_sem" were left enabled here although
    # their own trailing comments said "not of interest to UWEXP" and every
    # other extract_* function in this file has them commented out; they are
    # now disabled to match.
    GROUPING_FUNCTIONS = {
        "day": groupby_day,
        # "week": groupby_week,          # not of interest to UWEXP
        # "half_sem": groupby_half_sem,  # not of interest to UWEXP
        # "sem": groupby_all_sem,        # TO-DO address the grouping bug
        # "ema": groupby_ema             # TO-DO test
    }

    if data.shape[0] == 0:
        print('no data to extract features from')
        return None

    # TO-DO consider changing this. if the library is developed to work with
    # data from AWARE it is important to use the same column names as that of
    # AWARE tables
    data.rename(index=str,
                columns={
                    'double_energy': 'energy',
                    'double_convo_start': 'convo_start',
                    'double_convo_end': 'convo_end'
                },
                inplace=True)

    # NOTE this is to imitate data in CMU's use of the library and mainly because
    # there have been cases (e.g. screen) where columns have been accessed by
    # their position and not their name
    data = data[[
        'timestamp', 'inference', 'energy', 'convo_start', 'convo_end'
    ]]

    # TO-DO consider moving this to extract if it is applicable to all sensors
    convert_timezone(data, 'US/Pacific', {'timestamp': 'time'})
    data.set_index("time", inplace=True)
    data = data.tz_localize(None)
    # QUESTION why first set timezone as UTC and then change it back?
    # why not set it as the local time zone from the beginning?

    results_out = {}
    for epoch in EPOCHlist:
        if epoch == "allday":
            # NOTE important to use a copy of data because it is filtered in
            # the loop; keeps iterations independent
            epoch_data = data.copy()
        else:
            timeranges = getDaywiseSplitsForEpoch(epoch)
            epoch_filter = timerange_filter(data, timeranges * 1000)
            epoch_data = data[epoch_filter]
        if epoch_data.shape[0] == 0:
            print('no data for epoch {}'.format(epoch))
            continue

        # NOTE resampling via resampleseparatedays_sec is intentionally
        # disabled: it messed up the data in such a way that no record with a
        # non-zero convo_start remained (see version history / debug notes)

        for weekday in weekdays:
            data_wk, weekday_suffix = getDataFromDayOfTheWeek(
                epoch_data, weekday)
            if (data_wk is None) or len(data_wk) == 0:
                if weekday == '':
                    weekday_t = 'wk'
                else:
                    weekday_t = weekday
                print('no data for weekday type {}'.format(weekday_t))
                continue
            for gkey, gfunc in GROUPING_FUNCTIONS.items():
                results = all_groups_flexible(data_wk, AUDIO_APPLY['no_args'],
                                              AUDIO_APPLY['with_args'], [],
                                              None, {gkey: gfunc})
                # NOTE: the first argument of formatResultsFor1Table is not
                # used in it; I pass None to hint that
                results_out[(epoch, weekday, gkey)] = formatResultsFor1Table(
                    None, results, epoch, weekday)

    # NOTE: if no features are extracted return None
    if len(results_out) == 0:
        print("no features extracted")
        results_out = None
    return results_out
def extract_location(data, *args):
    """
    Extract location features per (epoch, weekday filter, grouping).

    :param data: a pandas data frame holding the raw but cleaned data for
                 feature extraction; must contain 'timestamp',
                 'double_latitude' and 'double_longitude' columns
    :param args: args[0] is a config dict with at least
                 'home_location_file' (JSON of pid -> home location) and
                 'on_site_periods'; args[1] is the participant id (pid)
    :return: a dictionary of (epoch, weekdays, grouping):dataframe of
             features, or None when there is no data, no home location, or
             no features were extracted
    """
    # TO-DO consider moving these to extract and pass it as arguments
    # ideally these are set when instantiating from FeatureExtraction class
    EPOCHlist = [ "allday", "night", "morning", "afternoon", "evening" ]
    weekdays = [ "", "wkdy", "wkend" ]
    GROUPING_FUNCTIONS = {
        "day": groupby_day,
        # "week": groupby_week, # not of interest to UWEXP
        # "half_sem": groupby_half_sem, # not of interest to UWEXP
        # "sem": groupby_all_sem, # TO-DO address the grouping bug
        # "ema": groupby_ema # TO-DO test
    }
    if data.shape[0] == 0:
        print('no data to extract features from')
        return None
    # NOTE this is to match the names library expects. Ideally library should work with the
    # same names as the AWARE database schema
    data.rename(index=str, columns={'double_latitude': 'latitude', 'double_longitude': 'longitude'}, inplace=True)
    # NOTE this is to imitate data in CMU's use of the library and mainly because
    # there has been cases (e.g. screen) where columns have been accessed by
    # their position and not their name
    data = data[['timestamp', 'latitude', 'longitude']]
    # NOTE this is again to imitate data in CMU's use of the library and out of
    # caution. I do not know if being sorted is assumed by the library.
    data.sort_values(by='timestamp', ascending=True, inplace=True)
    # TO-DO consider moving this to extract if it is applicable to all sensors
    convert_timezone(data, 'US/Pacific', {'timestamp':'time'})
    data.set_index("time", inplace=True)
    data = data.tz_localize(None)
    # QUESTION why to first set timezone as UTC and then change it back?
    # why not setting it as local time zone from the beginning?

    # load previously inferred home locations (pid -> location record)
    with open(args[0]['home_location_file'], 'r') as fileObj:
        home_locations = json.load(fileObj)
    pid = args[1]
    # infer the home location if it does not exist for pid
    # NOTE(review): the inferred value is kept only in memory; it is never
    # written back to home_location_file — confirm whether persisting it was
    # intended
    if pid not in home_locations:
        # build [start, end] unix-second bounds from the on-site periods so
        # inference only uses location data collected while on site
        periods = args[0]['on_site_periods']
        periodranges = np.ndarray(shape=(len(periods), 2), dtype=np.int64)
        for index, period in enumerate(periods):
            start = period['start']
            start = datetime.datetime(start['year'], start['month'], start['day'], start['hour'], start['minute'], start['second'])
            start = time.mktime(start.timetuple())
            end = period['end']
            end = datetime.datetime(end['year'], end['month'], end['day'], end['hour'], end['minute'], end['second'])
            end = time.mktime(end.timetuple())
            periodranges[index, 0] = start
            periodranges[index, 1] = end
        nightranges = getDaywiseSplitsForEpoch("night")
        home_locations[pid] = infer_home(data, periodranges, nightranges)
    homelatlong = (home_locations[pid][LATITUDE_INDEX], home_locations[pid][LONGITUDE_INDEX])
    # NOTE(review): bitwise | on booleans works here but `or` would be the
    # idiomatic choice
    if((homelatlong[LATITUDE_INDEX] is None) | (homelatlong[LONGITUDE_INDEX] is None)):
        print('home location does not exist')
        return None
    results_out = {}
    for epoch in EPOCHlist:
        if epoch == "allday":
            # NOTE important to use a copy of data because it is changed in the loop
            epoch_data = data.copy()
        else:
            timeranges = getDaywiseSplitsForEpoch(epoch)
            epoch_filter = timerange_filter(data, timeranges * 1000)
            epoch_data = data[epoch_filter]
        if epoch_data.shape[0] == 0:
            print('no data for epoch {}'.format(epoch))
            continue
        # YSS Debugging (kept for reference):
        #print('original: zero lat / size = {} / {}'.format(epoch_data[epoch_data['latitude'] != 0].shape[0], epoch_data.shape[0]))
        #print('original: zero long / size = {} / {}'.format(epoch_data[epoch_data['longitude'] != 0].shape[0], epoch_data.shape[0]))
        epoch_data = resampleseparatedays_min(epoch_data, LOCATION_SAMPLE_RATE_IN_MINUTES)
        #print('resampled: zero lat / size = {} / {}'.format(epoch_data[epoch_data['latitude'] != 0].shape[0], epoch_data.shape[0]))
        #print('resampled: zero long / size = {} / {}'.format(epoch_data[epoch_data['longitude'] != 0].shape[0], epoch_data.shape[0]))
        # keep an unclustered copy for the local-cluster features below
        epoch_data_local = epoch_data.copy()
        epoch_data = cluster_and_label_with_moving(epoch_data, eps=distance_to_degrees(10), min_samples=10)
        # TO-DO figure out what distance_to_degrees and min_samples are and why the specific values
        # above (e.g. 10 and 10) are used
        for weekday in weekdays:
            data_wk, weekday_suffix = getDataFromDayOfTheWeek(epoch_data, weekday)
            data_local_wk, weekday_suffix = getDataFromDayOfTheWeek(epoch_data_local, weekday)
            if (data_wk is None) or len(data_wk) == 0:
                if weekday == '':
                    weekday_t = 'wk'
                else:
                    weekday_t = weekday
                print('no data for weekday type {}'.format(weekday_t))
                continue
            for gkey, gfunc in GROUPING_FUNCTIONS.items():
                # home lat/long is the single extra argument the with_args
                # feature functions receive
                argstuplelist = [(homelatlong,)]
                results = all_groups_flexible(data_wk,
                                              LOCATION_APPLY['no_args'] + LOCATION_APPLY['no_args_global'],
                                              LOCATION_APPLY['with_args'] + LOCATION_APPLY['with_args_global'],
                                              argstuplelist,
                                              None,
                                              {gkey:gfunc})
                results_local = getResultFromLocalClusters(data_local_wk,
                                                           {gkey:gfunc},
                                                           LOCATION_APPLY['no_args'] + LOCATION_APPLY['no_args_local'],
                                                           LOCATION_APPLY['with_args'] + LOCATION_APPLY['with_args_local'],
                                                           argstuplelist)
                # TO-DO needs investigations; copied and pasted from CMU code
                # Concatenate global and local features column-wise
                for k in results.keys():
                    results[k] = pd.concat([results[k], results_local[k]], axis = 1)
                # scrub nan and inf values: +/-inf -> NaN -> None so downstream
                # consumers (e.g. DB writes) see proper nulls
                for k in results.keys():
                    results[k] = results[k].replace([np.inf, -np.inf], np.nan)
                    results[k] = results[k].astype(object).where(pd.notnull(results[k]), None)
                # NOTE: the first argument of formatResultsFor1Table is not
                # used in it; I pass None to hint that
                results_out[(epoch, weekday, gkey)] = formatResultsFor1Table(None, results, epoch, weekday)
    # NOTE: if no features are extracted return None
    if len(results_out) == 0:
        print("no features extracted")
        results_out = None
    return results_out
def extract_screen(data, *args):
    """
    Extract screen features per (epoch, weekday filter, grouping).

    :param data: a pandas data frame holding the raw but cleaned data for
                 feature extraction; must contain 'timestamp' and
                 'screen_status' columns
    :param args: unused; kept for signature compatibility with the other
                 extract_* functions
    :return: a dictionary of (epoch, weekdays, grouping):dataframe of
             features, or None when there is no input data or no features
             were extracted
    """
    # TO-DO consider moving these to extract and pass it as arguments
    # ideally these are set when instantiating from FeatureExtraction class
    EPOCHlist = ["allday", "night", "morning", "afternoon", "evening"]
    weekdays = ["", "wkdy", "wkend"]
    GROUPING_FUNCTIONS = {
        "day": groupby_day,
        # "week": groupby_week, # not of interest to UWEXP
        # "half_sem": groupby_half_sem, # not of interest to UWEXP
        # "sem": groupby_all_sem, # TO-DO address the grouping bug
        # "ema": groupby_ema # TO-DO test
    }
    if data.shape[0] == 0:
        print('no data to extract features from')
        return None
    data = data[['timestamp', 'screen_status']]
    # NOTE: this is to exactly follow CMU's query for screen data
    # unfortunately the library functions use the schema of query result table
    # in the most disappointing way: they assume columns in the same order
    # as the query result table and index on positions of those columns instead
    # of using the column names. Terrible!
    # TO-DO consider moving this to extract if it is applicable to all sensors
    convert_timezone(data, 'US/Pacific', {'timestamp': 'time'})
    data.set_index("time", inplace=True)
    data = data.tz_localize(None)
    # QUESTION why to first set timezone as UTC and then change it back?
    # why not setting it as local time zone from the beginning?
    results_out = {}
    for epoch in EPOCHlist:
        if epoch == "allday":
            # NOTE it is not necessary to use a copy of data because no change is
            # applied to it within the loop. I use a copy here for the sake of
            # consistency with other cases where use of copy is important for
            # correct calculation of features in the next iteration of the loop.
            epoch_data = data.copy()
            prev_epoch_data = data.copy()
        else:
            timeranges = getDaywiseSplitsForEpoch(epoch)
            epoch_filter = timerange_filter(data, timeranges * 1000)
            epoch_data = data[epoch_filter]
            # also slice out the data of the epoch immediately preceding this
            # one; it is passed to all_groups_flexible_with_prev below
            timeranges = getDaywiseSplitsForEpoch(getPrevEpoch(epoch))
            epoch_filter = timerange_filter(data, timeranges * 1000)
            prev_epoch_data = data[epoch_filter]
            # QUESTION: I don't understand what prev_epoch_data is used for
            # NOTE(review): presumably it lets duration-style features handle
            # screen sessions that straddle the epoch boundary — confirm in
            # all_groups_flexible_with_prev
        if epoch_data.shape[0] == 0:
            print('no data for epoch {}'.format(epoch))
            continue
        for weekday in weekdays:
            data_wk, weekday_suffix = getDataFromDayOfTheWeek(
                epoch_data, weekday)
            if (data_wk is None) or len(data_wk) == 0:
                if weekday == '':
                    weekday_t = 'wk'
                else:
                    weekday_t = weekday
                print('no data for weekday type {}'.format(weekday_t))
                continue
            # start/end timedeltas of this epoch, forwarded to the
            # with-args feature functions
            s, e = getEpochStartEndTimedelta(epoch)
            for gkey, gfunc in GROUPING_FUNCTIONS.items():
                # daily grouping uses the *_daily feature sets; all other
                # groupings use the *_multi sets
                if gkey == "day":
                    no_args = SCREEN_APPLY['no_args'] + SCREEN_APPLY[
                        'no_args_daily']
                    with_args = SCREEN_APPLY['with_args'] + SCREEN_APPLY[
                        'with_args_daily']
                else:
                    no_args = SCREEN_APPLY['no_args'] + SCREEN_APPLY[
                        'no_args_multi']
                    with_args = SCREEN_APPLY['with_args'] + SCREEN_APPLY[
                        'with_args_multi']
                # one (s, e) tuple per with-args function
                argstuplelist = [(s, e)] * len(with_args)
                results = all_groups_flexible_with_prev(
                    data_wk, prev_epoch_data, True, no_args, with_args,
                    argstuplelist, None, {gkey: gfunc})
                # NOTE: the first argument of formatResultsFor1Table is not
                # used in it; I pass None to hint that
                results_out[(epoch, weekday, gkey)] = formatResultsFor1Table(
                    None, results, epoch, weekday)
    # NOTE: if no features are extracted return None
    if len(results_out) == 0:
        print("no features extracted")
        results_out = None
    return results_out
def extract_bluetooth(data, *args):
    """
    Extract bluetooth features per (epoch, weekday filter, grouping).

    :param data: a pandas data frame holding the raw but cleaned data for
                 feature extraction; must contain 'timestamp', 'bt_address'
                 and 'bt_rssi' columns
    :param args: unused; kept for signature compatibility with the other
                 extract_* functions
    :return: a dictionary of (epoch, weekdays, grouping):dataframe of
             features, or None when there is no input data, no address
             frequency data, or no features were extracted
    """
    # TO-DO consider moving these to extract and pass it as arguments
    # ideally these are set when instantiating from FeatureExtraction class
    EPOCHlist = ["allday", "night", "morning", "afternoon", "evening"]
    weekdays = ["", "wkdy", "wkend"]
    GROUPING_FUNCTIONS = {
        "day": groupby_day,
        # "week": groupby_week,          # not of interest to UWEXP
        # "half_sem": groupby_half_sem,  # not of interest to UWEXP
        # "sem": groupby_all_sem,        # TO-DO address the grouping bug
        # "ema": groupby_ema             # TO-DO test
    }

    if data.shape[0] == 0:
        print('no data to extract features from')
        return None

    # NOTE this is to imitate data in CMU's use of the library and mainly because
    # there have been cases (e.g. screen) where columns have been accessed by
    # their position and not their name
    data = data[['timestamp', 'bt_address', 'bt_rssi']]

    # TO-DO consider moving this to extract if it is applicable to all sensors
    convert_timezone(data, 'US/Pacific', {'timestamp': 'time'})
    data.set_index("time", inplace=True)
    data = data.tz_localize(None)
    # QUESTION why first set timezone as UTC and then change it back?
    # why not set it as the local time zone from the beginning?

    # per-address frequency table, used below to identify the participant's
    # own devices
    baddress_freq_data = badd_data(data)
    if baddress_freq_data is None or len(baddress_freq_data) == 0:
        print("no bluetooth address frequency data (freq data samples = 0)")
        return None

    # drop rows with any null entry; this should be a no-op — warn loudly if
    # it actually removes anything
    size = baddress_freq_data.shape[0]
    baddress_freq_data = baddress_freq_data.dropna(how='any')
    if (baddress_freq_data.shape[0] != size):
        print('removed rows of bluetooth address data with null entries.')
        print('\tfurther investigate as this should not happen.')

    baddress_freq_data, numclust = cluster_address_freq(baddress_freq_data)
    if numclust is not None:
        # BUG FIX (dead code removed): the original called
        # baddress_freq_data.set_index(['bt_address']) without inplace=True
        # and without assigning the result, so it had no effect whatsoever.
        # getOwnDevices has therefore only ever seen the frame with
        # 'bt_address' as a regular column; the no-op call is dropped rather
        # than made effective to preserve that behavior.
        # NOTE(review): if indexing by address was actually intended, use
        # baddress_freq_data = baddress_freq_data.set_index('bt_address')
        # and re-test getOwnDevices.
        own_devices = getOwnDevices(baddress_freq_data)
    else:
        own_devices = [None]

    results_out = {}
    for epoch in EPOCHlist:
        if epoch == "allday":
            # NOTE it is not necessary to use a copy of data because no change is
            # applied to it within the loop. I use a copy here for the sake of
            # consistency with other cases where use of copy is important for
            # correct calculation of features in the next iteration of the loop.
            epoch_data = data.copy()
        else:
            timeranges = getDaywiseSplitsForEpoch(epoch)
            epoch_filter = timerange_filter(data, timeranges * 1000)
            epoch_data = data[epoch_filter]
        if epoch_data.shape[0] == 0:
            print('no data for epoch {}'.format(epoch))
            continue
        for weekday in weekdays:
            # Get Both/Weekdays/Weekends slice of the epoch data
            data_wk, weekday_suffix = getDataFromDayOfTheWeek(
                epoch_data, weekday)
            if (data_wk is None) or len(data_wk) == 0:
                if weekday == '':
                    weekday_t = 'wk'
                else:
                    weekday_t = weekday
                print('no data for weekday type {}'.format(weekday_t))
                continue
            for gkey, gfunc in GROUPING_FUNCTIONS.items():
                # one (own_devices,) tuple per with-args feature function
                argstuplelist = [(own_devices, )] * len(
                    BLUETOOTH_APPLY['with_args'])
                # NOTE: I don't think we need to pass in a list of tuples
                # so the following should work too
                # TO-DO test
                #argstuplelist = [own_devices] * len(BLUETOOTH_APPLY['with_args'])
                results = all_groups_flexible(data_wk,
                                              BLUETOOTH_APPLY['no_args'],
                                              BLUETOOTH_APPLY['with_args'],
                                              argstuplelist, None,
                                              {gkey: gfunc})
                # NOTE: the first argument of formatResultsFor1Table is not
                # used in it; I pass None to hint that
                results_out[(epoch, weekday, gkey)] = formatResultsFor1Table(
                    None, results, epoch, weekday)

    # NOTE: if no features are extracted return None
    if len(results_out) == 0:
        print("no features extracted")
        results_out = None
    return results_out
def extract_app(data, *args):
    """
    Compute app-usage features for every (epoch, weekday filter, grouping).

    :param data: a pandas data frame holding the raw but cleaned data for
                 feature extraction; must contain 'timestamp' and
                 'package_name' columns
    :param args: args[0] is a config dict providing 'app_category_file', a
                 JSON file mapping package names to categories
    :return: a dictionary keyed by (epoch, weekday, grouping) with a
             dataframe of features per key, or None when there is no input
             data or nothing could be extracted
    """
    # TO-DO consider moving these to extract and pass it as arguments;
    # ideally these are set when instantiating from FeatureExtraction class
    epoch_names = ["allday", "night", "morning", "afternoon", "evening"]
    weekday_filters = ["", "wkdy", "wkend"]
    grouping_functions = {
        "day": groupby_day,
        # "week": groupby_week,          # not of interest to UWEXP
        # "half_sem": groupby_half_sem,  # not of interest to UWEXP
        # "sem": groupby_all_sem,        # TO-DO address the grouping bug
        # "ema": groupby_ema             # TO-DO test
    }

    if data.shape[0] == 0:
        print('no data to extract features from')
        return None

    # NOTE deliberately NOT imitating CMU code (which keeps only the queried
    # columns in query order) so this code can be reused for
    # applications_foreground

    # TO-DO consider moving this to extract if it is applicable to all sensors
    convert_timezone(data, 'US/Pacific', {'timestamp': 'time'})
    data.set_index("time", inplace=True)
    data = data.tz_localize(None)
    # QUESTION why first set the timezone as UTC and then change it back?
    # why not set it as the local time zone from the beginning?

    # annotate each record with the category of its package
    with open(args[0]['app_category_file'], 'r') as fileObj:
        app_categories = json.load(fileObj)
    data['package_category'] = data.apply(
        lambda row: assign_category(row['package_name'], app_categories),
        axis=1)
    # TO-DO consider implementing this using joins and based on the
    # appcategories table

    results_out = {}
    for epoch in epoch_names:
        if epoch == "allday":
            # a copy is not strictly needed (nothing mutates data in this
            # loop) but is kept for consistency with the other extractors
            frame = data.copy()
        else:
            ranges = getDaywiseSplitsForEpoch(epoch)
            mask = timerange_filter(data, ranges * 1000)
            frame = data[mask]

        if frame.shape[0] == 0:
            print('no data for epoch {}'.format(epoch))
            continue

        for weekday in weekday_filters:
            frame_wk, _suffix = getDataFromDayOfTheWeek(frame, weekday)
            if frame_wk is None or len(frame_wk) == 0:
                weekday_t = 'wk' if weekday == '' else weekday
                print('no data for weekday type {}'.format(weekday_t))
                continue

            for gkey, gfunc in grouping_functions.items():
                grouped = all_groups_flexible(frame_wk,
                                              APP_FUNCTIONS['no_args'],
                                              APP_FUNCTIONS['with_args'],
                                              [], None, {gkey: gfunc})
                # first argument of formatResultsFor1Table is unused;
                # None is passed to make that explicit
                results_out[(epoch, weekday, gkey)] = formatResultsFor1Table(
                    None, grouped, epoch, weekday)

    # an empty result dictionary signals "nothing extracted" as None
    if not results_out:
        print("no features extracted")
        return None
    return results_out
def extract_call(data, *args):
    """
    Extract call features per (epoch, weekday filter, grouping).

    :param data: a pandas data frame holding the raw but cleaned data for
                 feature extraction; must contain 'timestamp', 'call_type',
                 'call_duration' and 'trace' columns
    :param args: unused; kept for signature compatibility with the other
                 extract_* functions
    :return: a dictionary of (epoch, weekdays, grouping):dataframe of
             features, or None when there is no input data or no features
             were extracted
    """
    # TO-DO consider moving these to extract and pass it as arguments
    # ideally these are set when instantiating from FeatureExtraction class
    EPOCHlist = [ "allday", "night", "morning", "afternoon", "evening" ]
    weekdays = [ "", "wkdy", "wkend" ]
    GROUPING_FUNCTIONS = {
        "day": groupby_day,
        # "week": groupby_week, # not of interest to UWEXP
        # "half_sem": groupby_half_sem, # not of interest to UWEXP
        # "sem": groupby_all_sem, # TO-DO address the grouping bug
        # "ema": groupby_ema # TO-DO test
    }
    if data.shape[0] == 0:
        print('no data to extract features from')
        return None
    # TO-DO load the phone number category information (a.k.a PHC_TABLE in CMU's work)
    # get the left join of calls in data and category on device_id and trace so
    # you end up with a new table with the additional column category which is
    # either None (when category information is unavailable) or is one of family,
    # friend-in-town, or friend-out-of-town.
    # NOTE in UW phase I data, iOS traces are UUID of the call session rather than
    # hashed values of the phone number people called/were called. Given 75% of
    # participants were iOS users there is very little category information
    # we can get. Therefore, the feature extraction below is for now implemented
    # in such a way that does not use category information.
    # NOTE this is to imitate data in CMU's use of the library and mainly because
    # there has been cases (e.g. screen) where columns have been accessed by
    # their position and not their name
    data = data[['timestamp', 'call_type', 'call_duration', 'trace']]
    # TO-DO should also keep 'category' column when that data is available
    # TO-DO consider moving this to extract if it is applicable to all sensors
    convert_timezone(data, 'US/Pacific', {'timestamp':'time'})
    data.set_index("time", inplace=True)
    data = data.tz_localize(None)
    # QUESTION why to first set timezone as UTC and then change it back?
    # why not setting it as local time zone from the beginning?
    results_out = {}
    for epoch in EPOCHlist:
        if epoch == "allday":
            # NOTE it is not necessary to use a copy of data because no change is
            # applied to it within the loop. I use a copy here for the sake of
            # consistency with other cases where use of copy is important for
            # correct calculation of features in the next iteration of the loop.
            epoch_data = data.copy()
            # TO-DO should get family, friend-in-town, friend-out-of-town
            # in separate dataframe based on category information
        else:
            timeranges = getDaywiseSplitsForEpoch(epoch)
            epoch_filter = timerange_filter(data, timeranges * 1000)
            epoch_data = data[epoch_filter]
            # TO-DO should get family, friend-in-town, friend-out-of-town
            # in separate dataframe based on category information
        if epoch_data.shape[0] == 0:
            print('no data for epoch {}'.format(epoch))
            continue
        for weekday in weekdays:
            data_wk, weekday_suffix = getDataFromDayOfTheWeek(epoch_data, weekday)
            # TO-DO should similarly get the data_wk information for family,
            # friends-in-town, and friends-out-of-town
            if (data_wk is None) or len(data_wk) == 0:
                if weekday == '':
                    weekday_t = 'wk'
                else:
                    weekday_t = weekday
                print('no data for weekday type {}'.format(weekday_t))
                continue
            for gkey, gfunc in GROUPING_FUNCTIONS.items():
                # the (None, []) pairs below are placeholders for the
                # per-category data/args (family, friends-in-town,
                # friends-out-of-town) that are unavailable — see the NOTE on
                # iOS traces above
                results = all_groups_communication(data_wk, CALL_APPLY,
                                                   None, [], # family
                                                   None, [], # friends-in-town
                                                   None, [], # friends-out-of-town
                                                   FAMILY_FEATURE_COLUMNS,
                                                   FRIENDSINTOWN_FEATURE_COLUMNS,
                                                   FRIENDSOUTOFTOWN_FEATURE_COLUMNS,
                                                   None, {gkey:gfunc})
                # NOTE: the first argument of formatResultsFor1Table is not
                # used in it; I pass None to hint that
                results_out[(epoch, weekday, gkey)] = formatResultsFor1Table(None, results, epoch, weekday)
    # NOTE: if no features are extracted return None
    if len(results_out) == 0:
        print("no features extracted")
        results_out = None
    return results_out