def clean_sensor_data(input_folder, output_folder, debug_mode=True, scheduler='processes'):
    """Run the preprocessing pipeline over every Actigraph sensor file.

    Files are grouped by participant, sensor id and placement, sorted by
    file timestamp inside each group, preprocessed, joined, and computed
    with the requested dask scheduler.
    """
    pattern = os.path.join(input_folder, '*', 'MasterSynced', '**',
                           'Actigraph*sensor.csv')
    # keep only the participants included in this dataset
    sensor_files = [f for f in glob(pattern, recursive=True)
                    if dataset.is_pid_included(f)]
    groupby = GroupBy(sensor_files, **MhealthWindowing.make_metas(sensor_files))
    grouper = MHealthGrouper(sensor_files)
    groupby.split(grouper.pid_group(),
                  grouper.sid_group(),
                  grouper.auto_init_placement_group(),
                  group_types=['PID', 'SID', 'SENSOR_PLACEMENT'],
                  ingroup_sortkey_func=sort_by_file_timestamp,
                  descending=False)
    groupby.apply(_preprocess_sensor_data,
                  dataset_name=os.path.basename(input_folder))
    groupby.final_join()
    groupby.compute(scheduler=scheduler).get_result()
def _preprocess_sensor_data(item, all_items, **kwargs):
    """Build the delayed chain that cleans one sensor file.

    Loads the file lazily, shifts timestamps by the per-file offset,
    applies the placement-specific orientation correction, and saves
    the cleaned result; returns the bundled delayed node with metadata.
    """
    # session boundary / grouping metadata for this item
    metas = GroupBy.get_meta(item)
    data_file = GroupBy.get_data(item)

    # lazy load of the raw sensor data
    loaded_data = delayed(fileio.load_sensor)(data_file)

    # shift timestamps by the per-file offset (in seconds)
    get_offset = partial(dataset.get_offset, offset_column=1)
    offset_in_secs = delayed(get_offset)(data_file)
    offset_data = delayed(dataframe.offset)(loaded_data, offset_in_secs)

    # flip/swap axes according to the orientation-correction table
    correction = delayed(dataset.get_orientation_correction)(data_file)
    flip_and_swap = apply_on_accelerometer_dataframe(orientation.flip_and_swap)
    corrected_data = delayed(flip_and_swap)(offset_data,
                                            x_flip=correction[0],
                                            y_flip=correction[1],
                                            z_flip=correction[2])

    # persist the cleaned data for this dataset
    corrected_data = delayed(save_to_file)(corrected_data, metas,
                                           kwargs['dataset_name'])
    return GroupBy.bundle(corrected_data, **metas)
def extract_mhealth(self, data_inputs, interval=12.8, step=12.8, scheduler='processes', **kwargs):
    """Extract windowed features from mhealth-format sensor files.

    :param data_inputs: list of mhealth sensor file paths
    :param interval: window length in seconds
    :param step: window step in seconds
    :param scheduler: dask scheduler name ('processes', 'threads', 'sync')
    :return: self (fluent); the joined feature DataFrame is stored in
        ``self._result``
    """
    compute = self._feature_set

    def sort_func(item):
        # order files within a group by the timestamp in their file name
        return dataset.get_file_timestamp(GroupBy.get_data(item))

    def load_data(item, all_items):
        # lazy load of one sensor file, bundled with its group metadata
        metas = GroupBy.get_meta(item)
        data_loader = delayed(fileio.load_sensor)
        return GroupBy.bundle(data_loader(GroupBy.get_data(item)), **metas)

    @delayed
    def join_as_dataframe(groups):
        # concatenate each group's frames, tagging them with one GROUPi
        # column per component of the '-'-joined group name
        group_dfs = []
        groups = GroupBy.get_data_groups(groups)
        for group_name in groups:
            group_names = group_name.split('-')
            group_df = pd.concat(groups[group_name])
            group_col_names = []
            # FIX: enumerate instead of list.index — index() returns the
            # first occurrence, so duplicate components in a group name
            # collapsed into a single GROUP0 column and lost index levels.
            for idx, name in enumerate(group_names):
                col_name = 'GROUP' + str(idx)
                group_col_names.append(col_name)
                group_df[col_name] = name
            group_dfs.append(group_df)
        result = pd.concat(group_dfs, sort=False)
        result.set_index(group_col_names, inplace=True, append=True)
        return result

    @delayed
    @MhealthWindowing.groupby_windowing('sensor')
    def compute_features(df, **kwargs):
        # per-window feature computation over the raw values
        return compute(df.values, **kwargs)

    self._inputs = data_inputs
    self._grouper = MHealthGrouper(data_inputs)
    self._groupby = GroupBy(data_inputs,
                            **MhealthWindowing.make_metas(data_inputs))
    groups = [
        self._grouper.pid_group(),
        self._grouper.sid_group(),
        self._grouper.auto_init_placement_group()
    ]
    self._groupby.split(*groups,
                        ingroup_sortkey_func=sort_func,
                        descending=False)
    self._groupby.apply(load_data)
    self._groupby.apply(compute_features,
                        interval=interval,
                        step=step,
                        **kwargs)
    self._groupby.final_join(join_as_dataframe)
    self._result = self._groupby.compute(scheduler=scheduler,
                                         **kwargs).get_result()
    return self
def preprocess_annotations(item, all_items, **kwargs):
    """Lazily load one annotation file and bundle it with its group metadata."""
    loaded = delayed(fileio.load_annotation)(GroupBy.get_data(item))
    return GroupBy.bundle(loaded, **GroupBy.get_meta(item))
def load_data(item, all_items, **kwargs):
    """Lazily load one sensor file and bundle it with its group metadata."""
    loaded = delayed(fileio.load_sensor)(GroupBy.get_data(item))
    return GroupBy.bundle(loaded, **GroupBy.get_meta(item))
def count_total_rows(data, all_data, **kwargs):
    """Bundle a delayed computation that counts the rows of one csv file."""

    @delayed
    def read_csv_file(path):
        # first column is parsed as timestamps
        return pd.read_csv(path, parse_dates=[0], infer_datetime_format=True)

    @delayed
    def num_rows(frame):
        return frame.shape[0]

    loaded = read_csv_file(GroupBy.get_data(data))
    return GroupBy.bundle(num_rows(loaded))
def load_data(item, all_items, *, old_sr, new_sr, **kwargs):
    """Lazily load a sensor file, resampling when the target rate differs.

    :param old_sr: sampling rate of the raw data in Hz
    :param new_sr: desired sampling rate in Hz; no resampling when equal
    """
    metas = GroupBy.get_meta(item)
    result = delayed(fileio.load_sensor)(GroupBy.get_data(item))
    if old_sr != new_sr:
        print('resampling raw data...from {} to {}'.format(old_sr, new_sr))
        result = resample_data(result, old_sr=old_sr, new_sr=new_sr)
    return GroupBy.bundle(result, **metas)
def as_dataframe(groups):
    """Concatenate all data groups into one DataFrame tagged by GROUP_NAME."""
    data_groups = GroupBy.get_data_groups(groups)
    frames = []
    for name, items in data_groups.items():
        frame = pd.concat(items)
        frame['GROUP_NAME'] = name
        frames.append(frame)
    return pd.concat(frames)
def join_as_dataframe(groups):
    """Concatenate all data groups into one DataFrame indexed by the
    components of each '-'-joined group name (GROUP0, GROUP1, ...).

    FIX: use enumerate() rather than list.index(); index() returns the
    position of the FIRST occurrence, so duplicate components in a group
    name (e.g. 'A-A') collapsed into a single GROUP0 column and dropped
    index levels. Behavior is unchanged when components are unique.
    """
    group_dfs = []
    groups = GroupBy.get_data_groups(groups)
    for group_name in groups:
        group_names = group_name.split('-')
        group_df = pd.concat(groups[group_name])
        group_col_names = []
        for idx, name in enumerate(group_names):
            col_name = 'GROUP' + str(idx)
            group_col_names.append(col_name)
            group_df[col_name] = name
        group_dfs.append(group_df)
    result = pd.concat(group_dfs, sort=False)
    result.set_index(group_col_names, inplace=True, append=True)
    return result
def get_class_map(input_folder, annotation_files, scheduler='synchronous'):
    """Build a class-label map from the dataset's annotation files.

    Annotations are grouped by participant and annotator, merged, made
    mutually exclusive, and matched against the dataset's class label set.
    """
    groupby = GroupBy(annotation_files,
                      **MhealthWindowing.make_metas(annotation_files))
    grouper = MHealthGrouper(annotation_files)
    groupby.split(grouper.pid_group(),
                  grouper.annotator_group(),
                  ingroup_sortkey_func=sort_by_file_timestamp,
                  descending=False)
    groupby.apply(preprocess_annotations)
    groupby.final_join(delayed(join_as_dataframe))
    merged_annotations = groupby.compute(scheduler=scheduler).get_result()
    splitted_annotations = to_mutually_exclusive(merged_annotations)
    class_label_set = os.path.join(input_folder, 'MetaCrossParticipants',
                                   'muss_class_labels.csv')
    return ClassLabeler.from_annotation_set(splitted_annotations,
                                            class_label_set,
                                            interval=12.8)
def get_class_set(annotation_files, class_map, scheduler='synchronous', profiling=True):
    """Compute the windowed class-label set from annotation files.

    :return: tuple of (class set DataFrame, the GroupBy pipeline object)
    """
    groupby = GroupBy(annotation_files,
                      **MhealthWindowing.make_metas(annotation_files))
    grouper = MHealthGrouper(annotation_files)
    groupby.split(grouper.pid_group(),
                  grouper.annotator_group(),
                  group_types=['PID', 'ANNOTATOR'],
                  ingroup_sortkey_func=sort_by_file_timestamp,
                  descending=False)
    groupby.apply(preprocess_annotations)
    groupby.apply(convert_annotations,
                  interval=12.8,
                  step=12.8,
                  class_map=class_map)
    groupby.final_join(delayed(join_as_dataframe))
    class_set = groupby.compute(scheduler=scheduler,
                                profiling=profiling).get_result()
    return (class_set, groupby)
def load_data(item, all_items):
    """Lazily load one sensor file bundled with its group metadata."""
    metas = GroupBy.get_meta(item)
    loaded = delayed(fileio.load_sensor)(GroupBy.get_data(item))
    return GroupBy.bundle(loaded, **metas)
def sum_rows(group_items, **kwargs):
    """Sum the per-item values within one group into a single bundled total."""
    values = [GroupBy.get_data(entry) for entry in group_items]
    return GroupBy.bundle(np.sum(values))
def as_dataframe(groups, group_types):
    """Turn the data groups into a transposed DataFrame (one row per group).

    ``group_types`` is accepted for interface compatibility but is unused.
    """
    data_groups = GroupBy.get_data_groups(groups)
    return pd.DataFrame(data_groups).transpose()
# NOTE(review): collapsed paste — this line begins with the TAIL of an
# `as_dataframe`-style helper (its `def` header is not visible here) followed
# by a `__main__` smoke-test script that wires a GroupBy pipeline over SPADES
# sensor files (count rows per file, sum per group, join into a DataFrame).
# It needs re-indenting/splitting before it can run as Python.
groups = GroupBy.get_data_groups(groups) result = pd.DataFrame(groups) result = result.transpose() return result if __name__ == '__main__': import pprint from glob import glob from padar_parallel.grouper import MHealthGrouper from padar_converter.mhealth import dataset input_files = glob( 'D:/data/spades_lab/SPADES_[1-2]/MasterSynced/**/Actigraph*.sensor.csv', recursive=True) pprint.pprint(input_files) grouper = MHealthGrouper(input_files) groupby_obj = GroupBy(input_files) \ .split(grouper.pid_group(), grouper.sid_group(), group_types=['PID', 'SID'], ingroup_sortkey_func=lambda x: dataset.get_file_timestamp(x['data'])) groupby_obj.apply(count_total_rows) \ .post_join(join_func=sum_rows) \ .final_join(join_func=as_dataframe) groupby_obj.visualize_workflow(filename='test_apply.pdf') result = groupby_obj.compute(scheduler='processes').get_result() print(result)
def greater_than_zero(data, all_data):
    """Bundle a boolean flag: whether the item's data value is positive."""
    value = GroupBy.get_data(data)
    return GroupBy.bundle(value > 0)
def get_feature_set(sensor_files, sampling_rate=80, parallel=False, profiling=True):
    """Compute the per-participant windowed feature set with a 'DW' suffix.

    :param sensor_files: mhealth sensor file paths
    :param sampling_rate: raw accelerometer sampling rate in Hz
    :param parallel: use the multiprocess scheduler when True
    :param profiling: collect dask profiling information
    """
    scheduler = 'processes' if parallel else 'sync'
    groupby = GroupBy(sensor_files,
                      **MhealthWindowing.make_metas(sensor_files))
    grouper = MHealthGrouper(sensor_files)
    groupby.split(grouper.pid_group(),
                  group_types=['PID'],
                  ingroup_sortkey_func=sort_by_file_timestamp,
                  descending=False)
    groupby.apply(load_data)
    groupby.apply(compute_features, interval=12.8, step=12.8,
                  sr=sampling_rate)
    groupby.final_join(delayed(join_as_dataframe))
    feature_set = groupby.compute(scheduler=scheduler,
                                  profiling=profiling).get_result()
    # tag every feature column with the 'DW' (dominant wrist) suffix
    feature_set.columns = [col + '_' + 'DW' for col in feature_set.columns]
    return feature_set.reset_index()
def prepare_feature_set(input_folder, *, output_folder=None, debug=False, sampling_rate=80, resample_sr=80, scheduler='processes', profiling=True, force=True):
    """Compute feature set for "Location Matters" paper by Tang et al.

    Process the given raw dataset (stored in mhealth format) and generate
    feature set file in csv format along with a profiling report and
    feature computation pipeline diagram.

    :param input_folder: Folder path of input raw dataset
    :param output_folder: Use auto path if None
    :param debug: Use this flag to output results to 'debug_run' folder
    :param sampling_rate: The sampling rate of the raw accelerometer data
        in Hz
    :param resample_sr: The new sampling rate we desire to resample the raw
        data to.
    :param scheduler: 'processes': Use multi-core processing;
        'threads': Use python threads (not-in-parallel);
        'sync': Use a single thread in sequential order
    :param profiling: Use profiling or not.
    :param force: Recompute the feature file even when it already exists.
    :return: path of the generated feature csv file
    """
    if output_folder is None:
        output_folder = utils.generate_run_folder(input_folder, debug=debug)
    # exist_ok avoids the check-then-create race; also makes the second
    # pre-write check redundant, so it was removed
    os.makedirs(output_folder, exist_ok=True)
    feature_filepath = os.path.join(output_folder, 'muss.feature.csv')
    if not force and os.path.exists(feature_filepath):
        logging.info('Feature set file exists, skip regenerating it...')
        return feature_filepath
    sensor_files = glob(os.path.join(input_folder, '*', 'MasterSynced', '**',
                                     'Actigraph*sensor.csv'),
                        recursive=True)
    groupby = GroupBy(sensor_files,
                      **MhealthWindowing.make_metas(sensor_files))
    grouper = MHealthGrouper(sensor_files)
    groups = [
        grouper.pid_group(),
        grouper.sid_group(),
        grouper.auto_init_placement_group()
    ]
    groupby.split(*groups,
                  group_types=['PID', 'SID', 'SENSOR_PLACEMENT'],
                  ingroup_sortkey_func=sort_by_file_timestamp,
                  descending=False)
    groupby.apply(load_data, old_sr=sampling_rate, new_sr=resample_sr)
    # after (optional) resampling the effective rate is always resample_sr;
    # the original if/else produced the same value in both branches
    sr = resample_sr
    groupby.apply(compute_features, interval=12.8, step=12.8, sr=sr)
    groupby.final_join(delayed(join_as_dataframe))
    result = groupby.compute(scheduler=scheduler,
                             profiling=profiling).get_result()
    # rename placements to their abbreviations
    result = result.reset_index()
    result.loc[:, 'SENSOR_PLACEMENT'] = result.loc[:, 'SENSOR_PLACEMENT'].apply(
        dataset.get_placement_abbr)
    profiling_filepath = os.path.join(output_folder,
                                      'feature_computation_profiling.html')
    workflow_filepath = os.path.join(output_folder,
                                     'feature_computation_workflow.pdf')
    result.to_csv(feature_filepath, float_format='%.9f', index=False)
    if profiling:
        groupby.show_profiling(file_path=profiling_filepath)
        # graphviz rendering is best-effort; failures must not abort the run
        try:
            groupby.visualize_workflow(filename=workflow_filepath)
        except Exception as e:
            print(e)
            print('skip generating workflow pdf')
    return feature_filepath
def sort_func(item):
    """Sort key: the timestamp embedded in the item's data file name."""
    data_file = GroupBy.get_data(item)
    return dataset.get_file_timestamp(data_file)
def load_data(data, all_data, **kwargs):
    """Lazily load one sensor file, bundled with its group metadata."""
    loaded = delayed(fileio.load_sensor)(GroupBy.get_data(data))
    return GroupBy.bundle(loaded, **GroupBy.get_meta(data))
# NOTE(review): collapsed paste — this line begins with the TAIL of the
# `as_dataframe`/`join_as_dataframe` loop body (its `def` header is not
# visible here) followed by a `__main__` smoke-test script that runs the
# windowed sampling-rate pipeline over SPADES sensor files and writes the
# profiled result to 'test.csv'. It needs re-indenting/splitting before it
# can run as Python.
group_df = pd.concat(groups[group_name]) group_df['GROUP_NAME'] = group_name group_dfs.append(group_df) result = pd.concat(group_dfs) return result if __name__ == '__main__': import pprint from glob import glob from padar_parallel.groupby import GroupBy from padar_parallel.grouper import MHealthGrouper from padar_converter.mhealth import dataset input_files = glob( 'D:/data/spades_lab/SPADES_[1-9]/MasterSynced/**/Actigraph*.sensor.csv', recursive=True) pprint.pprint(input_files) grouper = MHealthGrouper(input_files) groupby_obj = GroupBy( input_files, **MhealthWindowing.make_metas(input_files)) groupby_obj.split(grouper.pid_group(), grouper.sid_group(), group_types = ['PID', 'SID'], ingroup_sortkey_func=lambda x: dataset.get_file_timestamp(GroupBy.get_data(x))) groupby_obj.apply(load_data) groupby_obj.apply(sampling_rate, interval=12.8, step=12.8) \ .final_join(join_func=delayed(join_as_dataframe)) groupby_obj.visualize_workflow(filename='test_apply_by_window.pdf') result = groupby_obj.compute( scheduler='processes').show_profiling().get_result() result.to_csv('test.csv', index=True)