def source_selector(parent, modality="all", location="all"):
    locations_set = set(get_ancestral_metadata(parent, "locations"))
    assert location in locations_set or location == "all", f"Location {location} not in {locations_set}"

    modality_set = set(get_ancestral_metadata(parent, "modalities"))
    assert modality in modality_set or modality == "all", f"Modality {modality} not in {modality_set}"

    loc, mod = location, modality
    root = parent / f"{loc=}-{mod=}"

    # Prepare a set of viable outputs
    valid_locations = set()
    for pair in parent.meta["sources"]:
        loc, mod = pair["loc"], pair["mod"]
        good_location = location == "all" or pair["loc"] == location
        good_modality = modality == "all" or pair["mod"] == modality
        if good_location and good_modality:
            valid_locations.add(Key(f"{loc=}-{mod=}"))

    # Aggregate all relevant sources
    selected = 0
    for key, node in parent.outputs.items():
        if key in valid_locations:
            selected += 1
            root.acquire_node(key=key, node=node)

    if not selected:
        msg = f"No wearable keys found in {sorted(parent.outputs.keys())}"
        logger.error(msg)
        raise KeyError(msg)

    return root
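# A minimal, self-contained sketch of the key format used above (illustrative
# values only). Python 3.8+ "f-string debugging" syntax expands {loc=} to
# "loc='chest'", so each source is addressed by a key such as
# "loc='chest'-mod='accel'" that source_selector can match exactly against the
# entries of parent.meta["sources"].
def _example_source_keys():
    sources = [dict(loc="chest", mod="accel"), dict(loc="wrist", mod="gyro")]
    keys = []
    for pair in sources:
        loc, mod = pair["loc"], pair["mod"]
        keys.append(f"{loc=}-{mod=}")
    return keys  # ["loc='chest'-mod='accel'", "loc='wrist'-mod='gyro'"]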
def window(parent, win_len, win_inc):
    root = parent / f"{win_len=:03.2f}-{win_inc=:03.2f}"

    fs = get_ancestral_metadata(root, "fs")

    kwargs = dict(index=parent.index["index"], win_len=win_len, win_inc=win_inc, fs=fs)

    # Build index outputs
    for key, node in parent.index.items():
        root.instantiate_node(
            key=key,
            func=PartitionByTrial(window_index),
            kwargs=dict(data=node, **kwargs),
            backend="pandas",
        )

    # Build data outputs
    for key, node in parent.outputs.items():
        root.instantiate_node(
            key=key,
            func=PartitionByTrial(window_data),
            kwargs=dict(data=node, **kwargs),
            backend="none",
        )

    return root
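# Illustrative sketch only (not the library's window_index/window_data): how a
# window length and increment given in seconds map to sample offsets at a
# sampling rate fs. With fs=33 Hz, win_len=3 and win_inc=1 this yields
# 99-sample windows whose start points are 33 samples apart.
import numpy as np

def _example_window_starts(n_samples, win_len, win_inc, fs):
    win_len_samples = int(win_len * fs)
    win_inc_samples = int(win_inc * fs)
    starts = np.arange(0, n_samples - win_len_samples + 1, win_inc_samples)
    return starts, win_len_samples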
def resample(parent, fs_new):
    fs_old = get_ancestral_metadata(parent, "fs")

    root = parent / f"{fs_new}Hz"
    root.meta.insert("fs", fs_new)

    kwargs = dict(fs_old=fs_old, fs_new=fs_new)

    if fs_old != fs_new:
        # Only compute indexes and outputs if the sample rate has changed
        for key, node in parent.index.items():
            root.instantiate_node(
                key=key,
                backend="pandas",
                func=PartitionByTrial(resample_metadata),
                kwargs=dict(index=parent.index["index"], data=node, is_index="index" in str(key), **kwargs),
            )

        for key, node in parent.outputs.items():
            root.instantiate_node(
                key=key,
                func=PartitionByTrial(resample_data),
                kwargs=dict(index=parent.index["index"], data=node, **kwargs),
            )

    return root
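# Illustrative sketch only (not the library's resample_data): resampling a
# uniformly sampled 1-D signal from fs_old to fs_new by linear interpolation
# onto a common time axis.
import numpy as np

def _example_resample(data, fs_old, fs_new):
    duration = len(data) / fs_old
    t_old = np.arange(len(data)) / fs_old
    t_new = np.arange(0, duration, 1.0 / fs_new)
    return np.interp(t_new, t_old, data)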
def statistical_features(parent):
    """
    Two feature categories are defined here:

      1. Time domain
      2. Frequency domain

    These are mapped onto transformed data from two sources:

      1. Accelerometer
      2. Gyroscope

    Assuming these two sources have gone through body/gravity transformations
    (eg from src.transformations.body_grav_filt), there will actually be
    several more sources, eg:

      1. accel-body
      2. accel-body-jerk
      3. accel-grav
      4. gyro-body
      5. gyro-body-jerk

    With more data sources this list grows quickly. The feature types (time
    and frequency domain) are mapped to the transformed sources in a
    particular way. For example, the frequency-domain features are *not*
    calculated on the gravity data sources.

    The loop below iterates through all of the outputs of the previous node in
    the graph, and the logic within the loop manages the correct mapping of
    functions to sources. Consult the dataset table (tables/datasets.md) and
    see anguita2013 for details.
    """
    root = parent / "statistical_features"

    fs = get_ancestral_metadata(root, "fs")

    accel_key = "mod='accel'"
    gyro_key = "mod='gyro'"
    mag_key = "mod='mag'"

    for key, node in parent.outputs.items():
        key_td = f"{key}-feat='td'"
        key_fd = f"{key}-feat='fd'"

        t_kwargs = dict(data=node)
        f_kwargs = dict(data=node, fs=fs)

        if accel_key in key:
            root.instantiate_node(key=key_td, func=t_feat, kwargs=t_kwargs, backend="numpy")
            if "grav" not in key:
                root.instantiate_node(key=key_fd, func=f_feat, kwargs=f_kwargs, backend="numpy")

        if gyro_key in key or mag_key in key:
            root.instantiate_node(key=key_td, func=t_feat, kwargs=t_kwargs, backend="numpy")
            root.instantiate_node(key=key_fd, func=f_feat, kwargs=f_kwargs, backend="numpy")

    return root.instantiate_node(
        key="features",
        func=np.concatenate,
        args=[sorted_node_values(root.outputs)],
        kwargs=dict(axis=1),
    )
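# A pure-Python sketch of the feature-to-source mapping described in the
# docstring above, applied to illustrative keys. Frequency-domain ("fd")
# features are skipped for accelerometer gravity components; all other
# accelerometer, gyroscope and magnetometer sources get both time- and
# frequency-domain features.
def _example_feature_mapping(keys=("mod='accel'-grav", "mod='accel'-body", "mod='gyro'-body")):
    planned = []
    for key in keys:
        if "mod='accel'" in key:
            planned.append((key, "td"))
            if "grav" not in key:
                planned.append((key, "fd"))
        if "mod='gyro'" in key or "mod='mag'" in key:
            planned.append((key, "td"))
            planned.append((key, "fd"))
    return planned  # [("...grav", "td"), ("...body", "td"), ("...body", "fd"), ...]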
def basic_har(
    # Dataset
    dataset_name="pamap2",
    # Representation sources
    modality="all",
    location="all",
    # Task/split
    task_name="har",
    data_partition="predefined",
    # Windowing
    fs_new=33,
    win_len=3,
    win_inc=1,
    # Features
    feat_name="ecdf",
    clf_name="rf",
    # Embedding visualisation
    viz=False,
    evaluate=False,
):
    dataset = dataset_importer(dataset_name)

    # Resample, filter and window the raw sensor data
    wear_windowed = get_windowed_wearables(
        dataset=dataset, modality=modality, location=location, fs_new=fs_new, win_len=win_len, win_inc=win_inc
    )

    # Extract features
    features = get_features(feat_name=feat_name, windowed_data=wear_windowed)

    # Visualise the feature embeddings
    if viz:
        umap_embedding(features, task_name=task_name).evaluate()

    # Fit a classifier for each train/test split
    models = dict()
    train_test_splits = get_ancestral_metadata(features, "data_partitions")[data_partition]
    for train_test_split in randomised_order(train_test_splits):
        models[train_test_split] = get_classifier(
            clf_name=clf_name,
            feature_node=features,
            task_name=task_name,
            data_partition=data_partition,
            evaluate=evaluate,
            train_test_split=train_test_split,
        )

    return features, models
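# A minimal usage sketch for basic_har. The commented import path is
# hypothetical; adjust it to wherever basic_har is defined in this project.
# Running this will download/process the named dataset, so it is guarded.
if __name__ == "__main__":
    # from src.models.har_basic import basic_har  # hypothetical import path
    features, models = basic_har(dataset_name="anguita2013", clf_name="rf", viz=False, evaluate=True)
    for split, model in models.items():
        print(split, model)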
def basic_ensemble(
    dataset_name="anguita2013",
    modality="all",
    location="all",
    task_name="har",
    feat_name="ecdf",
    data_partition="predefined",
    fs_new=33,
    win_len=3,
    win_inc=1,
):
    dataset = dataset_importer(dataset_name)

    windowed_data = get_windowed_wearables(
        dataset=dataset, modality=modality, location=location, fs_new=fs_new, win_len=win_len, win_inc=win_inc
    )

    models = dict()
    train_test_splits = get_ancestral_metadata(windowed_data, "data_partitions")[data_partition]
    for train_test_split in randomised_order(train_test_splits):
        models[train_test_split] = ensemble_classifier(
            feat_name=feat_name,
            task_name=task_name,
            data_partition=data_partition,
            windowed_data=windowed_data,
            train_test_split=train_test_split,
            clf_names=["sgd", "lr", "rf"],
            evaluate=True,
        )

    return models
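# ensemble_classifier is project-specific. As a rough, self-contained analogue
# of combining the "sgd", "lr" and "rf" base learners, scikit-learn's
# VotingClassifier with soft voting averages their class probabilities. This
# mirrors the spirit, not the exact implementation, of the project's ensemble.
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier

def _example_soft_vote(X_train, y_train, X_test):
    ensemble = VotingClassifier(
        estimators=[
            ("sgd", SGDClassifier(loss="log_loss")),  # log_loss so predict_proba is available
            ("lr", LogisticRegression(max_iter=1000)),
            ("rf", RandomForestClassifier(n_estimators=100)),
        ],
        voting="soft",
    )
    ensemble.fit(X_train, y_train)
    return ensemble.predict(X_test)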
def har_chain(
    test_dataset="anguita2013",
    fs_new=33,
    win_len=3,
    win_inc=1,
    task_name="har",
    data_partition="predefined",
    feat_name="ecdf",
    clf_name="sgd",
    evaluate=False,
):
    # Shared keyword arguments for the experiment
    kwargs = dict(
        fs_new=fs_new,
        win_len=win_len,
        win_inc=win_inc,
        task_name=task_name,
        feat_name=feat_name,
        clf_name=clf_name,
    )

    dataset_alignment = dict(
        anguita2013=dict(dataset_name="anguita2013", location="waist", modality="accel"),
        pamap2=dict(dataset_name="pamap2", location="chest", modality="accel"),
        uschad=dict(dataset_name="uschad", location="waist", modality="accel"),
    )

    # Extract the representation for the test dataset
    test_dataset = dataset_alignment.pop(test_dataset)
    features, test_models = basic_har(data_partition="predefined", **test_dataset, **kwargs)

    # Instantiate the root directory
    root = features.graph / f"chained-from-{sorted(dataset_alignment.keys())}"

    # Build up the list of models from the auxiliary datasets
    auxiliary_models = {
        train_test_split: [model] for train_test_split, model in test_models.items()
    }
    for model_name, model_kwargs in dataset_alignment.items():
        aux_features, aux_models = basic_har(data_partition="deployable", **model_kwargs, **kwargs)
        for fi, mi in aux_models.items():
            auxiliary_models[fi].append(mi)

    # Perform the chaining
    models = dict()
    train_test_splits = get_ancestral_metadata(features, "data_partitions")[data_partition]
    for train_test_split in randomised_order(train_test_splits):
        aux_probs = [features] + [
            model.predict_proba(features) for model in auxiliary_models[train_test_split]
        ]
        prob_features = root.instantiate_orphan_node(func=np.concatenate, args=[aux_probs], kwargs=dict(axis=1))
        models[train_test_split] = get_classifier(
            clf_name=clf_name,
            feature_node=prob_features,
            task_name=task_name,
            data_partition=data_partition,
            train_test_split=train_test_split,
            evaluate=evaluate,
        )

    return features, models
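# A self-contained sketch of the chaining step above: the auxiliary models'
# predicted class probabilities are concatenated onto the original feature
# matrix, and a new classifier is then trained on the augmented
# representation. The arguments below are illustrative stand-ins for the
# feature node and fitted graph models.
import numpy as np

def _example_chain_features(features, aux_models):
    aux_probs = [features] + [model.predict_proba(features) for model in aux_models]
    return np.concatenate(aux_probs, axis=1)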
def __init__(self, name, *args, **kwargs):
    super(Dataset, self).__init__(name=f"datasets/{name}", meta=DatasetMeta(name))

    def load_meta(*args, **kwargs):
        return self.meta.meta

    load_meta.__name__ = name

    metadata = self.instantiate_node(key=f"{name}-metadata", backend="yaml", func=load_meta, kwargs=dict())

    zip_name = kwargs.get("unzip_path", lambda x: x)(splitext(basename(self.meta.meta["download_urls"][0]))[0])
    self.unzip_path = join(self.meta.zip_path, splitext(zip_name)[0])

    index = self.instantiate_node(
        key="index",
        func=self.build_index,
        backend="pandas",
        kwargs=dict(path=self.unzip_path, metadata=metadata),
    )

    # Build the data partition indexes
    self.instantiate_node(
        key="predefined",
        func=self.build_predefined,
        backend="pandas",
        kwargs=dict(path=self.unzip_path, metadata=metadata),
    )

    data_partitions = get_ancestral_metadata(self, "data_partitions")

    self.instantiate_node(
        key="loso",
        func=self.build_loso,
        backend="pandas",
        kwargs=dict(index=index, columns=data_partitions["loso"]),
    )

    self.instantiate_node(
        key="deployable",
        func=self.build_deployable,
        backend="pandas",
        kwargs=dict(index=index, columns=data_partitions["deployable"]),
    )

    # Build the task label nodes
    tasks = get_ancestral_metadata(self, "tasks")
    for task in tasks:
        self.instantiate_node(
            key=task,
            func=self.build_label,
            backend="pandas",
            kwargs=dict(
                path=self.unzip_path,
                task=task,
                inv_lookup=self.meta.inv_lookup[task],
                metadata=metadata,
            ),
        )

    # Build the list of data outputs (one per location/modality source)
    for placement_modality in self.meta["sources"]:
        loc = placement_modality["loc"]
        mod = placement_modality["mod"]
        self.instantiate_node(
            key=f"{loc=}-{mod=}",
            func=self.build_data,
            backend="numpy",
            kwargs=dict(loc=loc, mod=mod, metadata=metadata),
        )