# Example #1
def get_feature_set(sensor_files,
                    sampling_rate=80,
                    parallel=False,
                    profiling=True):
    """Compute the windowed feature set for the given sensor files.

    :param sensor_files: mhealth sensor csv file paths to process.
    :param sampling_rate: sampling rate (Hz) of the raw accelerometer data.
    :param parallel: run with the multi-process scheduler when True,
        otherwise run synchronously.
    :param profiling: enable pipeline profiling.
    :return: DataFrame of windowed features, every column suffixed with
        '_DW' (presumably "dominant wrist" — confirm with callers).
    """
    scheduler = 'processes' if parallel else 'sync'

    pipeline = GroupBy(sensor_files,
                       **MhealthWindowing.make_metas(sensor_files))
    file_grouper = MHealthGrouper(sensor_files)

    # Group by participant id only; files within a group are ordered
    # chronologically by their mhealth file timestamp.
    pipeline.split(file_grouper.pid_group(),
                   group_types=['PID'],
                   ingroup_sortkey_func=sort_by_file_timestamp,
                   descending=False)

    pipeline.apply(load_data)
    pipeline.apply(compute_features, interval=12.8, step=12.8,
                   sr=sampling_rate)
    pipeline.final_join(delayed(join_as_dataframe))

    feature_set = pipeline.compute(scheduler=scheduler,
                                   profiling=profiling).get_result()
    feature_set.columns = [name + '_' + 'DW' for name in feature_set.columns]
    return feature_set.reset_index()
# Example #2
def clean_sensor_data(input_folder,
                      output_folder,
                      debug_mode=True,
                      scheduler='processes'):
    """Preprocess Actigraph sensor files found under ``input_folder``.

    NOTE(review): ``output_folder`` and ``debug_mode`` are accepted but never
    used in this body — kept for interface compatibility; confirm whether
    callers rely on them.

    :param input_folder: root of the mhealth-formatted dataset.
    :param output_folder: unused here (see note above).
    :param debug_mode: unused here (see note above).
    :param scheduler: scheduler name passed to ``GroupBy.compute``.
    """
    pattern = os.path.join(input_folder, '*', 'MasterSynced', '**',
                           'Actigraph*sensor.csv')
    # Keep only files whose participant id is included in the dataset.
    sensor_files = [f for f in glob(pattern, recursive=True)
                    if dataset.is_pid_included(f)]

    pipeline = GroupBy(sensor_files,
                       **MhealthWindowing.make_metas(sensor_files))
    file_grouper = MHealthGrouper(sensor_files)

    pipeline.split(file_grouper.pid_group(),
                   file_grouper.sid_group(),
                   file_grouper.auto_init_placement_group(),
                   group_types=['PID', 'SID', 'SENSOR_PLACEMENT'],
                   ingroup_sortkey_func=sort_by_file_timestamp,
                   descending=False)

    pipeline.apply(_preprocess_sensor_data,
                   dataset_name=os.path.basename(input_folder))
    pipeline.final_join()
    pipeline.compute(scheduler=scheduler).get_result()
# Example #3
def get_class_map(input_folder, annotation_files, scheduler='synchronous'):
    """Build a class map from the merged, mutually-exclusive annotations.

    :param input_folder: dataset root; the class label definition csv is
        read from its ``MetaCrossParticipants`` subfolder.
    :param annotation_files: mhealth annotation csv file paths.
    :param scheduler: scheduler name passed to ``GroupBy.compute``.
    :return: class map constructed by ``ClassLabeler.from_annotation_set``.
    """
    pipeline = GroupBy(annotation_files,
                       **MhealthWindowing.make_metas(annotation_files))
    file_grouper = MHealthGrouper(annotation_files)

    # NOTE(review): unlike get_class_set, no group_types are passed to
    # split() here — confirm the omission is intentional.
    pipeline.split(file_grouper.pid_group(),
                   file_grouper.annotator_group(),
                   ingroup_sortkey_func=sort_by_file_timestamp,
                   descending=False)

    pipeline.apply(preprocess_annotations)
    pipeline.final_join(delayed(join_as_dataframe))
    merged_annotations = pipeline.compute(scheduler=scheduler).get_result()

    splitted_annotations = to_mutually_exclusive(merged_annotations)
    class_label_set = os.path.join(input_folder, 'MetaCrossParticipants',
                                   'muss_class_labels.csv')
    return ClassLabeler.from_annotation_set(splitted_annotations,
                                            class_label_set,
                                            interval=12.8)
# Example #4
def get_class_set(annotation_files,
                  class_map,
                  scheduler='synchronous',
                  profiling=True):
    """Convert annotation files into a windowed class label set.

    :param annotation_files: mhealth annotation csv file paths.
    :param class_map: mapping handed to ``convert_annotations`` for
        translating raw annotations into class labels.
    :param scheduler: scheduler name passed to ``GroupBy.compute``.
    :param profiling: enable pipeline profiling.
    :return: tuple ``(class_set, groupby)`` — the resulting DataFrame and
        the pipeline object that produced it.
    """
    pipeline = GroupBy(annotation_files,
                       **MhealthWindowing.make_metas(annotation_files))
    file_grouper = MHealthGrouper(annotation_files)

    pipeline.split(file_grouper.pid_group(),
                   file_grouper.annotator_group(),
                   group_types=['PID', 'ANNOTATOR'],
                   ingroup_sortkey_func=sort_by_file_timestamp,
                   descending=False)

    pipeline.apply(preprocess_annotations)
    pipeline.apply(convert_annotations,
                   interval=12.8,
                   step=12.8,
                   class_map=class_map)
    pipeline.final_join(delayed(join_as_dataframe))

    class_set = pipeline.compute(scheduler=scheduler,
                                 profiling=profiling).get_result()
    return (class_set, pipeline)
# Example #5
def prepare_feature_set(input_folder,
                        *,
                        output_folder=None,
                        debug=False,
                        sampling_rate=80,
                        resample_sr=80,
                        scheduler='processes',
                        profiling=True,
                        force=True):
    """Compute feature set for "Location Matters" paper by Tang et al.

    Process the given raw dataset (stored in mhealth format) and generate feature set file in csv format along with a profiling report and feature computation pipeline diagram.

    :param input_folder: Folder path of input raw dataset
    :param output_folder: Use auto path if None
    :param debug: Use this flag to output results to 'debug_run' folder
    :param sampling_rate: The sampling rate of the raw accelerometer data in Hz
    :param resample_sr: The new sampling rate we desire to resample the raw data to.
    :param scheduler: 'processes': Use multi-core processing;
                      'threads': Use python threads (not-in-parallel);
                      'sync': Use a single thread in sequential order
    :param profiling: Use profiling or not.
    :param force: Recompute even if the feature set file already exists.
    :return: path of the generated feature set csv file.
    """

    if output_folder is None:
        output_folder = utils.generate_run_folder(input_folder, debug=debug)

    # exist_ok avoids the check-then-create race and replaces the duplicated
    # "if not exists: makedirs" guard the original repeated further down.
    os.makedirs(output_folder, exist_ok=True)

    feature_filepath = os.path.join(output_folder, 'muss.feature.csv')

    if not force and os.path.exists(feature_filepath):
        logging.info('Feature set file exists, skip regenerating it...')
        return feature_filepath

    sensor_files = glob(os.path.join(input_folder, '*', 'MasterSynced', '**',
                                     'Actigraph*sensor.csv'),
                        recursive=True)

    groupby = GroupBy(sensor_files,
                      **MhealthWindowing.make_metas(sensor_files))

    grouper = MHealthGrouper(sensor_files)
    groups = [
        grouper.pid_group(),
        grouper.sid_group(),
        grouper.auto_init_placement_group()
    ]

    groupby.split(*groups,
                  group_types=['PID', 'SID', 'SENSOR_PLACEMENT'],
                  ingroup_sortkey_func=sort_by_file_timestamp,
                  descending=False)

    groupby.apply(load_data, old_sr=sampling_rate, new_sr=resample_sr)

    # The original if/else selected resample_sr when it differed from
    # sampling_rate and sampling_rate when they were equal — which is
    # resample_sr in both branches. Simplified to the single assignment.
    sr = resample_sr

    groupby.apply(compute_features, interval=12.8, step=12.8, sr=sr)

    groupby.final_join(delayed(join_as_dataframe))

    result = groupby.compute(scheduler=scheduler,
                             profiling=profiling).get_result()

    # Rename placements to their abbreviated form.
    result = result.reset_index()
    result.loc[:,
               'SENSOR_PLACEMENT'] = result.loc[:, 'SENSOR_PLACEMENT'].apply(
                   dataset.get_placement_abbr)

    profiling_filepath = os.path.join(output_folder,
                                      'feature_computation_profiling.html')
    workflow_filepath = os.path.join(output_folder,
                                     'feature_computation_workflow.pdf')
    result.to_csv(feature_filepath, float_format='%.9f', index=False)
    if profiling:
        groupby.show_profiling(file_path=profiling_filepath)
        try:
            groupby.visualize_workflow(filename=workflow_filepath)
        except Exception as e:
            # Best-effort: diagram rendering may be unavailable; log (module
            # already uses logging) instead of print and keep going.
            logging.warning('%s; skip generating workflow pdf', e)
    return feature_filepath
# Example #6
class FeatureExtractor:
    """Pipeline wrapper that applies a user-supplied feature computation
    callable to windowed mhealth sensor data."""

    def __init__(self):
        # Callable applied to each window's values; set via add_feature_set.
        self._feature_set = None
        self._groupby = None
        self._grouper = None

    def add_feature_set(self, feature_set):
        """Register the feature computation callable.

        :param feature_set: callable ``(values, **kwargs) -> features``
            applied to each sensor window by ``extract_mhealth``.
        """
        self._feature_set = feature_set

    def extract_mhealth(self,
                        data_inputs,
                        interval=12.8,
                        step=12.8,
                        scheduler='processes',
                        **kwargs):
        """Run the feature extraction pipeline over ``data_inputs``.

        :param data_inputs: mhealth sensor file paths.
        :param interval: window length in seconds.
        :param step: window step in seconds.
        :param scheduler: scheduler name passed to ``GroupBy.compute``.
        :param kwargs: forwarded to the windowed feature computation.
            NOTE(review): the same kwargs are also forwarded to
            ``GroupBy.compute`` below — confirm this double forwarding is
            intentional.
        :return: ``self`` (fluent); the result DataFrame is kept internally
            and written out by :meth:`save`.
        """
        compute = self._feature_set

        def sort_func(item):
            # Order files within each group chronologically by file timestamp.
            return dataset.get_file_timestamp(GroupBy.get_data(item))

        def load_data(item, all_items):
            metas = GroupBy.get_meta(item)
            data_loader = delayed(fileio.load_sensor)
            return GroupBy.bundle(data_loader(GroupBy.get_data(item)), **metas)

        @delayed
        def join_as_dataframe(groups):
            # Concatenate per-group frames, tagging each with GROUP0..GROUPk
            # columns holding the '-'-separated group name components.
            group_dfs = []
            group_col_names = []
            groups = GroupBy.get_data_groups(groups)
            for group_name in groups:
                group_names = group_name.split('-')
                group_df = pd.concat(groups[group_name])
                group_col_names = []
                # BUG FIX: the original used group_names.index(name), which
                # returns the FIRST occurrence — duplicate name components
                # collapsed onto the same GROUP column. enumerate gives each
                # component its own positional column.
                for position, name in enumerate(group_names):
                    column = 'GROUP' + str(position)
                    group_col_names.append(column)
                    group_df[column] = name
                group_dfs.append(group_df)
            result = pd.concat(group_dfs, sort=False)
            result.set_index(group_col_names, inplace=True, append=True)
            return result

        @delayed
        @MhealthWindowing.groupby_windowing('sensor')
        def compute_features(df, **kwargs):
            return compute(df.values, **kwargs)

        self._inputs = data_inputs
        self._grouper = MHealthGrouper(data_inputs)
        self._groupby = GroupBy(data_inputs,
                                **MhealthWindowing.make_metas(data_inputs))
        groups = [
            self._grouper.pid_group(),
            self._grouper.sid_group(),
            self._grouper.auto_init_placement_group()
        ]
        self._groupby.split(*groups,
                            ingroup_sortkey_func=sort_func,
                            descending=False)
        self._groupby.apply(load_data)
        self._groupby.apply(compute_features,
                            interval=interval,
                            step=step,
                            **kwargs)
        self._groupby.final_join(join_as_dataframe)
        self._result = self._groupby.compute(scheduler=scheduler,
                                             **kwargs).get_result()
        return self

    def show_profiling(self):
        """Show profiling of the most recent extract_mhealth run."""
        self._groupby.show_profiling()

    def save(self, filepath):
        """Write the extracted feature set to ``filepath`` as csv.

        Requires a prior :meth:`extract_mhealth` call (which sets
        ``self._result``).
        """
        self._result.to_csv(filepath, float_format='%.9f', index=True)