def main():
    DATA_PATH = Path('./intermediate_datafiles/')
    DATASET_FNAME = 'chapter2_result.csv'
    RESULT_FNAME = 'chapter3_heart_rate.csv'

    try:
        dataset = pd.read_csv(Path(DATA_PATH / DATASET_FNAME), index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print('File not found, try to run the preceding crowdsignals scripts first!')
        raise e

    DataViz = VisualizeDataset(__file__)

    # Original heart rate values
    # DataViz.plot_imputed_values(dataset, ['original'], 'hr_watch_rate')

    Kalman = KalmanFilters()
    dataset = Kalman.apply_kalman_filter(dataset, 'hr_watch_rate')

    DataViz.plot_dataset(dataset, ['hr_watch_rate', 'hr_watch_rate_kalman'],
                         ['exact', 'exact'], ['line', 'line'])
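# ---------------------------------------------------------------------------
# Illustration (not part of the original script): a minimal sketch of the
# smoothing idea behind the KalmanFilters class, written directly against the
# pykalman package on synthetic data. The series name `noisy_hr` and the
# parameter choices are assumptions made for this example only.
import numpy as np
from pykalman import KalmanFilter

rng = np.random.default_rng(0)
# A slowly drifting "heart rate" with heavy measurement noise.
noisy_hr = 70 + np.cumsum(rng.normal(0, 0.1, 500)) + rng.normal(0, 2.0, 500)

# One-dimensional random-walk model: state and observation are both scalar.
kf = KalmanFilter(transition_matrices=[[1]], observation_matrices=[[1]])
kf = kf.em(noisy_hr, n_iter=5)            # estimate the noise covariances
smoothed_means, _ = kf.smooth(noisy_hr)   # smoothed estimate of the series
# ---------------------------------------------------------------------------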
def main():
    DataViz = VisualizeDataset()
    dataset_path = './intermediate_datafiles/'

    try:
        dataset = pd.read_csv(dataset_path + 'chapter3_result_final.csv', index_col=0)
    except IOError as e:
        print('File not found, try to run previous crowdsignals scripts first!')
        raise e

    dataset.index = pd.to_datetime(dataset.index)
    milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

    # Now we move to the frequency domain, with a 10-second window.
    FreqAbs = FourierTransformation()
    fs = float(1000) / milliseconds_per_instance
    ws = int(float(10000) / milliseconds_per_instance)

    periodic_predictor_cols = ['acc_phone_x', 'acc_phone_y', 'acc_phone_z',
                               'acc_watch_x', 'acc_watch_y', 'acc_watch_z',
                               'gyr_phone_x', 'gyr_phone_y', 'gyr_phone_z',
                               'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
                               'mag_phone_x', 'mag_phone_y', 'mag_phone_z',
                               'mag_watch_x', 'mag_watch_y', 'mag_watch_z']

    data_table = FreqAbs.abstract_frequency(copy.deepcopy(dataset), ['acc_phone_x'], ws, fs)

    # Spectral analysis.
    DataViz.plot_dataset(data_table,
                         ['acc_phone_x_max_freq', 'acc_phone_x_freq_weighted', 'acc_phone_x_pse', 'label'],
                         ['like', 'like', 'like', 'like'],
                         ['line', 'line', 'line', 'points'])

    dataset = FreqAbs.abstract_frequency(dataset, periodic_predictor_cols, ws, fs)

    # Now we only take a certain percentage of overlap in the windows; otherwise
    # our training examples will be too much alike. The percentage of overlap we allow:
    window_overlap = 0.9
    skip_points = int((1 - window_overlap) * ws)
    dataset = dataset.iloc[::skip_points, :]

    DataViz.plot_dataset(dataset,
                         ['acc_phone_x', 'gyr_phone_x', 'hr_watch_rate', 'light_phone_lux',
                          'mag_phone_x', 'press_phone_', 'pca_1', 'label'],
                         ['like', 'like', 'like', 'like', 'like', 'like', 'like', 'like'],
                         ['line', 'line', 'line', 'line', 'line', 'line', 'line', 'points'])
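# ---------------------------------------------------------------------------
# Illustration (not part of the original script): the frequency features that
# abstract_frequency produces (dominant frequency, frequency-weighted average,
# power spectral entropy) can be sketched for one window with plain numpy.
# The window below is synthetic; all names are made up for this example.
import numpy as np

fs_demo = 10.0                                   # assumed sampling rate (Hz)
t = np.arange(0, 10, 1 / fs_demo)
window = np.sin(2 * np.pi * 1.5 * t) + 0.3 * np.random.randn(len(t))

amp = np.abs(np.fft.rfft(window))                # one-sided amplitude spectrum
freqs = np.fft.rfftfreq(len(window), d=1 / fs_demo)

max_freq = freqs[np.argmax(amp[1:]) + 1]         # dominant non-DC frequency
freq_weighted = np.sum(freqs * amp) / np.sum(amp)
psd = amp ** 2 / np.sum(amp ** 2)                # normalised power spectrum
pse = -np.sum(psd * np.log(psd + 1e-12))         # power spectral entropy

print(max_freq, freq_weighted, pse)
# ---------------------------------------------------------------------------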
def main():
    dataset_path = './intermediate_datafiles/'
    dataset = pd.read_csv(dataset_path + 'chapter2_result.csv', index_col=0)
    outlier_columns = ['acc_phone_x', 'light_phone_lux']

    DataViz = VisualizeDataset()
    OutlierDistr = DistributionBasedOutlierDetection()
    OutlierDist = DistanceBasedOutlierDetection()

    dataset.index = pd.to_datetime(dataset.index)

    # input() returns a string in Python 3, so compare against string values.
    start = input("choose method: [1],[2],[3],[4] ")

    if start == '1':
        param = float(input("Chauvenet\ninput parameters: c "))
        for col in outlier_columns:
            dataset = OutlierDistr.chauvenet(dataset, col, param)
            DataViz.plot_binary_outliers(dataset, col, col + '_outlier')
    elif start == '2':
        components, n_iter = input("Mixture model\ninput parameters: components, iter ").split(',')
        components = int(components)
        n_iter = int(n_iter)
        for col in outlier_columns:
            dataset = OutlierDistr.mixture_model(dataset, col, components, n_iter)
            DataViz.plot_dataset(dataset, [col, col + '_mixture'],
                                 ['exact', 'exact'], ['line', 'points'])
    elif start == '3':
        d_min, f_min = input("Simple distance-based\ninput parameters: d_min, f_min ").split()
        d_min = float(d_min)
        f_min = float(f_min)
        for col in outlier_columns:
            dataset = OutlierDist.simple_distance_based(dataset, [col], 'euclidean', d_min, f_min)
            DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier')
    elif start == '4':
        k = int(input("Local outlier factor\ninput parameters: k "))
        for col in outlier_columns:
            dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', k)
            DataViz.plot_dataset(dataset, [col, 'lof'],
                                 ['exact', 'exact'], ['line', 'points'])
    else:
        print("no method selected")
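# ---------------------------------------------------------------------------
# Illustration (not part of the original script): a minimal sketch of
# Chauvenet's criterion, the idea behind OutlierDistr.chauvenet, written with
# scipy on synthetic data. The criterion value c and the data are assumptions.
import numpy as np
from scipy import stats

def chauvenet_mask(values, c=0.5):
    # Flag points whose expected count this extreme, under a normal
    # distribution fitted to the data, falls below the criterion value c.
    values = np.asarray(values, dtype=float)
    n = len(values)
    mean, std = values.mean(), values.std()
    # Two-sided tail probability of a deviation at least this large.
    prob = 2 * stats.norm.sf(np.abs(values - mean) / std)
    return n * prob < c

data = np.concatenate([np.random.normal(0, 1, 1000), [8.0, -9.0]])
print(np.where(chauvenet_mask(data))[0])   # indices of flagged outliers
# ---------------------------------------------------------------------------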
# Optional: inspect normality of each measurement with a grid of Q-Q plots.
# The grid setup below (i, j and the 5x5 subplot figure) is implied by the loop.
# i, j = 0, 0
# fig, ax = plt.subplots(5, 5)
# for col in [c for c in dataset.columns if not 'label' in c]:
#     sm.qqplot(dataset[col].values, line='s', ax=ax[i, j])
#     ax[i, j].set_xticks([])
#     ax[i, j].set_yticks([])
#     ax[i, j].set_xticklabels([])
#     ax[i, j].set_yticklabels([])
#     ax[i, j].set_title(col)
#     i += 1
#     if i == 5:
#         i = 0
#         j += 1
# plt.show()

DataViz.plot_dataset(
    dataset,
    ['acc_', 'press_', 'gyr_', 'mag_', 'linacc_', 'hr_', 'label'],
    ['like', 'like', 'like', 'like', 'like', 'like', 'like'],
    ['line', 'line', 'line', 'line', 'line', 'points', 'points'])

# Compute the number of milliseconds covered by an instance based on the first two rows.
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

# Step 1: Let us see whether we have some outliers we would prefer to remove.

# Determine the columns we want to experiment on: all non-label columns.
# outlier_columns = ['acc_phone_x', 'light_phone_lux']
outlier_columns = [c for c in dataset.columns if not 'label' in c]

# Create the outlier classes.
OutlierDistr = DistributionBasedOutlierDetection()
OutlierDist = DistanceBasedOutlierDetection()
# Plot the data
DataViz = VisualizeDataset(__file__)

# Boxplot
DataViz.plot_dataset_boxplot(dataset, ['acc_mobile_x', 'acc_mobile_y', 'acc_mobile_z'])
# DataViz.plot_dataset_boxplot(dataset, ['gyr_mobile_x', 'gyr_mobile_y', 'gyr_mobile_z'])

# Plot all data
# DataViz.plot_dataset(dataset,
#                      ['acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux', 'mag_', 'press_phone_', 'label'],
#                      ['like', 'like', 'like', 'like', 'like', 'like', 'like'],
#                      ['line', 'line', 'line', 'line', 'line', 'line', 'points'])
DataViz.plot_dataset(dataset,
                     ['acc_mobile_', 'gyr_mobile_', 'mag_mobile_', 'prox_mobile_distance',
                      'loc_mobile_', 'label'],
                     ['like', 'like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'points', 'line', 'points'])

# And print a summary of the dataset.
util.print_statistics(dataset)
datasets.append(copy.deepcopy(dataset))

# If needed, we could save the various versions of the dataset we create in the loop with logical filenames:
# dataset.to_csv(RESULT_PATH / f'chapter2_result_{milliseconds_per_instance}')

# Make a table like the one shown in the book, comparing the two datasets produced.
util.print_latex_table_statistics_two_datasets(datasets[0], datasets[1])

# Finally, store the last dataset we generated (250 ms).
dataset.to_csv(RESULT_PATH / RESULT_FNAME)
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

# Chapter 4: Identifying aggregate attributes.

# First we focus on the time domain.

# Set the window sizes to the number of instances representing 5 seconds, 30 seconds and 5 minutes.
window_sizes = [int(float(5000) / milliseconds_per_instance),
                int(float(0.5 * 60000) / milliseconds_per_instance),
                int(float(5 * 60000) / milliseconds_per_instance)]

NumAbs = NumericalAbstraction()
dataset_copy = copy.deepcopy(dataset)
for ws in window_sizes:
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['ax'], ws, 'mean')
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['ax'], ws, 'std')

DataViz.plot_dataset(dataset_copy, ['ax', 'ax_temp_mean', 'ax_temp_std', 'label'],
                     ['exact', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'points'])

ws = int(float(0.5 * 60000) / milliseconds_per_instance)
selected_predictor_cols = [c for c in dataset.columns if not 'label' in c]
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'mean')
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'std')

CatAbs = CategoricalAbstraction()
dataset = CatAbs.abstract_categorical(dataset, ['label'], ['like'], 0.03,
                                      int(float(5 * 60000) / milliseconds_per_instance), 2)

# Now we move to the frequency domain, with the same window size.
FreqAbs = FourierTransformation()
fs = float(1000) / milliseconds_per_instance

periodic_predictor_cols = ['gFx', 'gFy', 'gFz', 'ax', 'ay', 'az',
                           'wx', 'wy', 'wz', 'p', 'Bx', 'By', 'Bz', 'Azimuth',
                           ]

# data_table = FreqAbs.abstract_frequency(copy.deepcopy(dataset), ['acc_phone_Y'],
#                                         int(float(4000) / milliseconds_per_instance), fs)
# Spectral analysis.
# DataViz.plot_dataset(data_table,
#                      ['acc_phone_Y_max_freq', 'acc_phone_Y_freq_weighted', 'acc_phone_Y_pse', 'label'],
#                      ['like', 'like', 'like', 'like'], ['line', 'line', 'line', 'points'])

# We use a 4-second window.
ws_freq = int(float(4000) / milliseconds_per_instance)
dataset = FreqAbs.abstract_frequency(dataset, periodic_predictor_cols, ws_freq, fs)

# Now we only take a certain percentage of overlap in the windows; otherwise
# our training examples will be too much alike.
ws = int(float(4000) / milliseconds_per_instance)

# The percentage of overlap we allow.
window_overlap = 0.9
skip_points = int((1 - window_overlap) * ws)
dataset = dataset.iloc[::skip_points, :]

dataset.to_csv(DATA_PATH / RESULT_FNAME)

DataViz.plot_dataset(dataset,
                     ['acc_phone_X', 'gyr_phone_X', 'mag_phone_X', 'pca_1', 'label'],
                     ['like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'points'])
# DataViz.plot_dataset(kalman_dataset, ['acc_phone_X', 'acc_phone_X_kalman'],
#                      ['exact', 'exact'], ['line', 'line'])
# We ignore the Kalman filter output for now...

# Let us apply a lowpass filter and reduce the importance of the data above the cutoff frequency.
LowPass = LowPassFilter()

# Determine the sampling frequency.
print("milliseconds_per_instance ", milliseconds_per_instance)
fs = float(4000) / milliseconds_per_instance  # old value: 1000
cutoff = 4

# Let us study acc_phone_X:
new_dataset = LowPass.low_pass_filter(copy.deepcopy(dataset), 'acc_phone_X', fs, cutoff, order=10)
DataViz.plot_dataset(new_dataset.iloc[int(0 * len(new_dataset.index)):int(0.8 * len(new_dataset.index)), :],
                     ['acc_phone_X', 'acc_phone_X_lowpass'],
                     ['exact', 'exact'], ['line', 'line'])

# And now let us include all measurements that have a form of periodicity (and filter them):
periodic_measurements = ['acc_phone_X', 'acc_phone_Y', 'acc_phone_Z',
                         'gyr_phone_X', 'gyr_phone_Y', 'gyr_phone_Z',
                         'mag_phone_X', 'mag_phone_Y', 'mag_phone_Z']

for col in periodic_measurements:
    dataset = LowPass.low_pass_filter(dataset, col, fs, cutoff, order=10)
    DataViz.plot_dataset(dataset.iloc[int(0.5 * len(new_dataset.index)):int(0.8 * len(new_dataset.index))],
                         [col, col + '_lowpass'], ['exact', 'exact'], ['line', 'line'])
    dataset[col] = dataset[col + '_lowpass']
    del dataset[col + '_lowpass']

# Determine the PC's for all but our target columns (the labels and the heart rate).
# We simplify by ignoring both; we could also ignore one first and apply a PC to the remainder.
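# ---------------------------------------------------------------------------
# Illustration (not part of the original script): the LowPassFilter class
# wraps a Butterworth low-pass filter; a minimal sketch with scipy on a
# synthetic signal might look like this. The sampling rate, cutoff and the
# lower filter order are assumptions chosen for this example.
import numpy as np
from scipy.signal import butter, filtfilt

fs_demo = 100.0          # assumed sampling rate (Hz)
cutoff_demo = 1.5        # pass everything below 1.5 Hz
t = np.arange(0, 5, 1 / fs_demo)
signal = np.sin(2 * np.pi * 0.5 * t) + 0.5 * np.sin(2 * np.pi * 10 * t)

# scipy expects the cutoff normalised by the Nyquist frequency.
b, a = butter(N=5, Wn=cutoff_demo / (0.5 * fs_demo), btype='low')
filtered = filtfilt(b, a, signal)   # zero-phase filtering, no time shift
# ---------------------------------------------------------------------------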
dataset = DataSet.data_table

# Plot the data
DataViz = VisualizeDataset()

# Boxplot
DataViz.plot_dataset_boxplot(dataset, ['acc_phone_x', 'acc_phone_y', 'acc_phone_z',
                                       'acc_watch_x', 'acc_watch_y', 'acc_watch_z'])

# Plot all data (one match and one display entry per column pattern).
DataViz.plot_dataset(dataset,
                     ['acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux',
                      'mag_', 'press_phone_', 'label'],
                     ['like', 'like', 'like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'line', 'line', 'points'])

# And print a summary of the dataset.
util.print_statistics(dataset)
datasets.append(copy.deepcopy(dataset))

# And print the table that has been included in the book.
util.print_latex_table_statistics_two_datasets(datasets[0], datasets[1])

# Finally, store the last dataset we have generated (250 ms).
dataset.to_csv(result_dataset_path + 'chapter2_result.csv')
# Determine the columns we want to experiment on.
outlier_columns = ['acc_y', 'lin_acc_x']

# Create the outlier classes.
OutlierDistr = DistributionBasedOutlierDetection()
OutlierDist = DistanceBasedOutlierDetection()

# And investigate the approaches for all relevant attributes.
for col in outlier_columns:
    # And try out all different approaches. Note that we have done some optimization
    # of the parameter values for each of the approaches by visual inspection.
    dataset = OutlierDistr.chauvenet(dataset, col)
    DataViz.plot_binary_outliers(dataset, col, col + '_outlier', "Chauvenet's criterion")

    dataset = OutlierDistr.mixture_model(dataset, col)
    DataViz.plot_dataset(dataset, [col, col + '_mixture'], 'Mixture models',
                         ['exact', 'exact'], ['line', 'points'])

    # The simple distance-based approach requires:
    # n_data_points * n_data_points * point_size =
    # 31839 * 31839 * 64 bits = ~8GB of available memory.
    # try:
    #     dataset = OutlierDist.simple_distance_based(dataset, [col], 'euclidean', 0.10, 0.99)
    #     DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier', 'Simple distance-based approach')
    # except MemoryError as e:
    #     print('Not enough memory available for simple distance-based outlier detection...')
    #     print('Skipping.')

    # try:
    #     dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', 5)
    #     DataViz.plot_dataset(dataset, [col, 'lof'], 'Local outlier factor',
    #                          ['exact', 'exact'], ['line', 'points'])
    # except MemoryError as e:
    #     print('Not enough memory available for lof...')
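# ---------------------------------------------------------------------------
# Illustration (not part of the original script): a sketch of the mixture
# model idea behind OutlierDistr.mixture_model, built on scikit-learn's
# GaussianMixture with synthetic data. The component count and the data are
# assumptions for this example only.
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(1)
values = np.concatenate([rng.normal(0, 1, 900),
                         rng.normal(8, 0.5, 100)]).reshape(-1, 1)

gmm = GaussianMixture(n_components=3, max_iter=100).fit(values)
log_density = gmm.score_samples(values)   # low log-likelihood = candidate outlier
# ---------------------------------------------------------------------------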
                         'label', 'binary')

# Get the resulting pandas data table.
dataset = DataSet.data_table

# Plot the data
DataViz = VisualizeDataset()

# Boxplot
DataViz.plot_dataset_boxplot(dataset, ['acc_x', 'acc_y', 'acc_z'])
DataViz.plot_dataset_boxplot(dataset, ['gyr_x', 'gyr_y', 'gyr_z'])

# Plot all data
DataViz.plot_dataset(dataset,
                     ['acc_', 'mag_', 'gyr_', 'light_', 'loc_', 'lin_acc_', 'label'],
                     ['like', 'like', 'like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'line', 'line', 'points'])

# Print a summary of the dataset.
util.print_statistics(dataset)
datasets.append(copy.deepcopy(dataset))

# And print the table that has been included in the book.
# util.print_latex_table_statistics_two_datasets(datasets[0], datasets[1])

# Finally, store the last dataset we have generated (250 ms).
dataset.to_csv(result_dataset_path + 'aggregation_result.csv')
# Let us apply a lowpass filter and reduce the importance of the data above 1.5 Hz.
LowPass = LowPassFilter()

# Determine the sampling frequency.
fs = float(1000) / milliseconds_per_instance
cutoff = 1.5

# Let us study acc_x. Note: .ix is deprecated, so we slice with .iloc instead.
new_dataset = LowPass.low_pass_filter(copy.deepcopy(dataset), 'acc_x', fs, cutoff, order=10)
DataViz.plot_dataset(new_dataset.iloc[int(0.4 * len(new_dataset.index)):int(0.43 * len(new_dataset.index)), :],
                     ['acc_x', 'acc_x_lowpass'], ['exact', 'exact'], ['line', 'line'])

# And now let us include all measurements that have a form of periodicity (and filter them):
periodic_measurements = ['acc_x', 'acc_y', 'acc_z',
                         'lin_acc_x', 'lin_acc_y', 'lin_acc_z',
                         'gyr_x', 'gyr_y', 'gyr_z',
                         'mag_x', 'mag_y', 'mag_z']

for col in periodic_measurements:
    dataset = LowPass.low_pass_filter(dataset, col, fs, cutoff, order=10)
    dataset[col] = dataset[col + '_lowpass']
    del dataset[col + '_lowpass']

# Determine the PC's for all but our target columns (the labels).
#     .apply(lambda group: group.reindex(full_idx, method='nearest'))
#     .reset_index(level=0, drop=True)
#     .sort_index()
# )

dataset = dataset.drop(columns=['label'])

NumAbs = NumericalAbstraction()
dataset_copy = copy.deepcopy(dataset)
for ws in window_sizes:
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_x'], ws, 'mean')
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_x'], ws, 'std')

DataViz.plot_dataset(dataset_copy,
                     ['acc_x', 'acc_x_temp_mean', 'acc_x_temp_std', 'label'],
                     ['like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'points'])

ws = int(float(0.5 * 60000) / milliseconds_per_instance)
selected_predictor_cols = [c for c in dataset.columns if not 'label' in c]
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'mean')
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'std')

DataViz.plot_dataset(dataset,
                     ['acc_x', 'acc_y', 'acc_z', 'pca_1', 'label'],
                     ['like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'points'])

CatAbs = CategoricalAbstraction()
dataset = CatAbs.abstract_categorical(dataset, ['label'], ['like'], 0.03,
                                      int(float(5 * 60000) / milliseconds_per_instance), 2)

# Now we move to the frequency domain, with the same window size.
FreqAbs = FourierTransformation()
pc_values = PCA.determine_pc_explained_variance(dataset, selected_predictor_cols)

# Plot the variance explained.
plot.plot(range(1, len(selected_predictor_cols) + 1), pc_values, 'b-')
plot.xlabel('principal component number')
plot.ylabel('explained variance')
plot.show(block=False)

# We select 4 as the number of PC's, as these explain most of the variance.
n_pcs = 4

dataset = PCA.apply_pca(copy.deepcopy(dataset), selected_predictor_cols, n_pcs)

# And we visualize the result of the PC's.
DataViz.plot_dataset(dataset, ['pca_', 'label'], ['like', 'like'], ['line', 'points'])

# And the overall final dataset:
DataViz.plot_dataset(dataset,
                     ['acc_', 'gyr_', 'mag_', 'press_phone_', 'label'],
                     ['like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'points'])

# Store the outcome.
dataset.to_csv(dataset_path + 'mydata_chapter3_result_final.csv')
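# ---------------------------------------------------------------------------
# Illustration (not part of the original script): choosing the number of
# principal components from the explained variance, sketched directly with
# scikit-learn on synthetic data. All names here are example assumptions;
# the alias SkPCA avoids shadowing the PCA object used above.
import numpy as np
from sklearn.decomposition import PCA as SkPCA
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(3)
X = rng.normal(size=(200, 6))
X[:, 3] = 2 * X[:, 0] + 0.1 * rng.normal(size=200)   # a correlated column

X_std = StandardScaler().fit_transform(X)   # PCA is sensitive to scale
pca = SkPCA().fit(X_std)
print(pca.explained_variance_ratio_)        # basis for picking n_pcs
scores = pca.transform(X_std)[:, :4]        # keep the first n_pcs components
# ---------------------------------------------------------------------------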
                           'gyr_Gyroscope y (rad/s)', 'gyr_Gyroscope z (rad/s)',
                           'mag_Magnetic field x (muT)', 'mag_Magnetic field y (muT)',
                           'mag_Magnetic field z (muT)',
                           'linacc_Linear Acceleration x (m/s^2)',
                           'linacc_Linear Acceleration y (m/s^2)',
                           'linacc_Linear Acceleration z (m/s^2)',
                           'hr_Heart Rate']

data_table = FreqAbs.abstract_frequency(copy.deepcopy(dataset),
                                        ['linacc_Linear Acceleration x (m/s^2)'],
                                        int(float(10000) / milliseconds_per_instance), fs)

# Spectral analysis.
DataViz.plot_dataset(data_table,
                     ['linacc_Linear Acceleration x (m/s^2)_max_freq',
                      'linacc_Linear Acceleration x (m/s^2)_freq_weighted',
                      'linacc_Linear Acceleration x (m/s^2)_pse', 'label'],
                     ['like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'points'])
print('no plot')

dataset = FreqAbs.abstract_frequency(dataset, periodic_predictor_cols,
                                     int(float(10000) / milliseconds_per_instance), fs)

# Now we only take a certain percentage of overlap in the windows; otherwise
# our training examples will be too much alike.
# window_overlap = 0.9
# skip_points = int((1 - window_overlap) * ws)
# dataset = dataset.iloc[::skip_points, :]
def main():
    # Set a granularity (the discrete step size of our time series data) and choose whether all resulting datasets
    # should be saved. A coarse-grained granularity of one instance per minute and a fine-grained one with four
    # instances per second are used.
    GRANULARITIES = [60000, 250]
    SAVE_VERSIONS = False

    # We can call Path.mkdir(exist_ok=True) to make any required directories if they don't already exist.
    [path.mkdir(exist_ok=True, parents=True) for path in [DATASET_PATH, RESULT_PATH]]

    # Create object to visualize the data and save figures.
    DataViz = VisualizeDataset(module_path=__file__)

    datasets = []
    for milliseconds_per_instance in GRANULARITIES:
        print(f'Creating numerical datasets from files in {DATASET_PATH} using granularity {milliseconds_per_instance}.')

        # Create an initial dataset object with the base directory for our data and a granularity, and add selected
        # measurements to it.
        data_engineer = CreateDataset(base_dir=DATASET_PATH, granularity=milliseconds_per_instance)

        # Add the accelerometer data (continuous numerical measurements) of the phone and the smartwatch
        # and aggregate the values per timestep by averaging the values.
        data_engineer.add_numerical_dataset(file='accelerometer_phone.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='acc_phone_')
        data_engineer.add_numerical_dataset(file='accelerometer_smartwatch.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='acc_watch_')

        # Add the gyroscope data (continuous numerical measurements) of the phone and the smartwatch
        # and aggregate the values per timestep by averaging the values.
        data_engineer.add_numerical_dataset(file='gyroscope_phone.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='gyr_phone_')
        data_engineer.add_numerical_dataset(file='gyroscope_smartwatch.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='gyr_watch_')

        # Add the heart rate (continuous numerical measurements) and aggregate by averaging the values.
        data_engineer.add_numerical_dataset(file='heart_rate_smartwatch.csv', timestamp_col='timestamps',
                                            value_cols=['rate'], aggregation='avg', prefix='hr_watch_')

        # Add the labels provided by the users as binary attributes (i.e. add a one to the attribute representing the
        # specific value for a label if it occurs within an interval). These are categorical events that might overlap.
        data_engineer.add_event_dataset(file='labels.csv', start_timestamp_col='label_start',
                                        end_timestamp_col='label_end', value_col='label',
                                        aggregation='binary')

        # Add the amount of light sensed by the phone (continuous numerical measurements) and aggregate by averaging.
        data_engineer.add_numerical_dataset(file='light_phone.csv', timestamp_col='timestamps',
                                            value_cols=['lux'], aggregation='avg', prefix='light_phone_')

        # Add the magnetometer data (continuous numerical measurements) of the phone and the smartwatch
        # and aggregate the values per timestep by averaging the values.
        data_engineer.add_numerical_dataset(file='magnetometer_phone.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='mag_phone_')
        data_engineer.add_numerical_dataset(file='magnetometer_smartwatch.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='mag_watch_')

        # Add the pressure sensed by the phone (continuous numerical measurements) and aggregate by averaging again.
        data_engineer.add_numerical_dataset(file='pressure_phone.csv', timestamp_col='timestamps',
                                            value_cols=['pressure'], aggregation='avg', prefix='press_phone_')

        # Get the resulting pandas data table.
        dataset = data_engineer.data_table

        # Create boxplots.
        DataViz.plot_dataset_boxplot(dataset=dataset,
                                     cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z',
                                           'acc_watch_x', 'acc_watch_y', 'acc_watch_z'])

        # Plot all data (one match and one display entry per column pattern).
        DataViz.plot_dataset(data_table=dataset,
                             columns=['acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux',
                                      'mag_', 'press_phone_', 'label'],
                             match=['like', 'like', 'like', 'like', 'like', 'like', 'like'],
                             display=['line', 'line', 'line', 'line', 'line', 'line', 'points'])

        # Print a summary of the dataset.
        util.print_statistics(dataset=dataset)
        datasets.append(copy.deepcopy(dataset))

        # Save the various versions of the created datasets with logical filenames if needed.
        if SAVE_VERSIONS:
            dataset.to_csv(RESULT_PATH / f'chapter2_result_{milliseconds_per_instance}')

    # Make a table like the one shown in the book, comparing the two datasets produced.
    util.print_latex_table_statistics_two_datasets(dataset1=datasets[0], dataset2=datasets[1])

    # Finally, store the last dataset we generated (250 ms).
    dataset.to_csv(RESULT_PATH / RESULT_FNAME)
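# ---------------------------------------------------------------------------
# Illustration (not part of the original script): the CreateDataset class
# above bins raw timestamped measurements into fixed-width intervals. A
# minimal sketch of the same 'avg' aggregation with plain pandas resampling,
# on synthetic data (all names here are made up for this example):
import numpy as np
import pandas as pd

idx = pd.date_range('2024-01-01', periods=10_000, freq='10ms')
raw = pd.DataFrame({'x': np.random.randn(len(idx))}, index=idx)

# Average all raw samples falling into each 250 ms bucket.
dataset_250ms = raw.resample('250ms').mean()
# ---------------------------------------------------------------------------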
# We simplify by ignoring both; we could also ignore one first and apply a PC to the remainder.
PCA = PrincipalComponentAnalysis()
selected_predictor_cols = [c for c in dataset.columns
                           if (not ('label' in c)) and (not (c == 'hr_watch_rate'))]
pc_values = PCA.determine_pc_explained_variance(dataset, selected_predictor_cols)

# Plot the variance explained.
DataViz.plot_xy(x=[range(1, len(selected_predictor_cols) + 1)], y=[pc_values],
                xlabel='principal component number', ylabel='explained variance',
                ylim=[0, 1], line_styles=['b-'])

# We select 4 as the number of PC's, as these explain most of the variance.
n_pcs = 4

dataset = PCA.apply_pca(copy.deepcopy(dataset), selected_predictor_cols, n_pcs)

# And we visualize the result of the PC's.
DataViz.plot_dataset(dataset, ['pca_', 'label'], ['like', 'like'], ['line', 'points'])

# And the overall final dataset (one match and one display entry per column pattern):
DataViz.plot_dataset(dataset,
                     ['attitude', 'gravity', 'rotationRate', 'userAcceleration', 'label'],
                     ['like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'points'])

# Store the outcome.
dataset.to_csv(DATA_PATH / RESULT_FNAME)
def main():
    # Set up file names and locations.
    DATA_PATH = Path('./intermediate_datafiles/')
    DATASET_FNAME = sys.argv[1] if len(sys.argv) > 1 else 'chapter2_result.csv'
    RESULT_FNAME = sys.argv[2] if len(sys.argv) > 2 else 'chapter3_result_outliers.csv'

    # Next, import the data from the specified location and parse the date index.
    try:
        dataset = pd.read_csv(Path(DATA_PATH / DATASET_FNAME), index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print('File not found, try to run the preceding crowdsignals scripts first!')
        raise e

    # We'll create an instance of our visualization class to plot the results.
    DataViz = VisualizeDataset(__file__)

    # Compute the number of milliseconds covered by an instance using the first two rows.
    milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

    # Step 1: Let us see whether we have some outliers we would prefer to remove.

    # Determine the columns we want to experiment on.
    outlier_columns = ['acc_phone_x', 'light_phone_lux']

    # Create the outlier classes.
    OutlierDistr = DistributionBasedOutlierDetection()
    OutlierDist = DistanceBasedOutlierDetection()

    # And investigate the approaches for all relevant attributes.
    for col in outlier_columns:
        print(f"Applying outlier criteria for column {col}")

        # And try out all different approaches. Note that we have done some optimization
        # of the parameter values for each of the approaches by visual inspection.
        dataset = OutlierDistr.chauvenet(dataset, col)
        DataViz.plot_binary_outliers(dataset, col, col + '_outlier')

        dataset = OutlierDistr.mixture_model(dataset, col)
        DataViz.plot_dataset(dataset, [col, col + '_mixture'],
                             ['exact', 'exact'], ['line', 'points'])

        # The simple distance-based approach requires:
        # n_data_points * n_data_points * point_size =
        # 31839 * 31839 * 32 bits = ~4GB of available memory.
        try:
            dataset = OutlierDist.simple_distance_based(dataset, [col], 'euclidean', 0.10, 0.99)
            DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier')
        except MemoryError as e:
            print('Not enough memory available for simple distance-based outlier detection...')
            print('Skipping.')

        try:
            dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', 5)
            DataViz.plot_dataset(dataset, [col, 'lof'],
                                 ['exact', 'exact'], ['line', 'points'])
        except MemoryError as e:
            print('Not enough memory available for lof...')
            print('Skipping.')

        # Remove all the stuff from the dataset again.
        cols_to_remove = [col + '_outlier', col + '_mixture', 'simple_dist_outlier', 'lof']
        for to_remove in cols_to_remove:
            if to_remove in dataset:
                del dataset[to_remove]

    # We take Chauvenet's criterion and apply it to all but the label data...
    for col in [c for c in dataset.columns if not 'label' in c]:
        print(f'Measurement is now: {col}')
        dataset = OutlierDistr.chauvenet(dataset, col)
        dataset.loc[dataset[f'{col}_outlier'] == True, col] = np.nan
        del dataset[col + '_outlier']

    dataset.to_csv(DATA_PATH / RESULT_FNAME)
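# ---------------------------------------------------------------------------
# Illustration (not part of the original script): the local outlier factor
# used above can also be computed with scikit-learn. A minimal sketch on
# synthetic one-dimensional data; k=5 mirrors the call above, everything
# else is an assumption for this example.
import numpy as np
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.default_rng(2)
X = np.concatenate([rng.normal(0, 1, (500, 1)), [[6.0], [-7.0]]])

lof = LocalOutlierFactor(n_neighbors=5, metric='euclidean')
labels = lof.fit_predict(X)                # -1 marks points flagged as outliers
scores = -lof.negative_outlier_factor_     # higher score = more outlying
# ---------------------------------------------------------------------------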
# First we focus on the time domain.

# Set the window sizes to the number of instances representing 5 seconds, 30 seconds and 5 minutes.
window_sizes = [int(float(5000) / milliseconds_per_instance),
                int(float(0.5 * 60000) / milliseconds_per_instance),
                int(float(5 * 60000) / milliseconds_per_instance)]
print('total window sizes', window_sizes)

NumAbs = NumericalAbstraction()
dataset_copy = copy.deepcopy(dataset)
for ws in window_sizes:
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_phone_x'], ws, 'mean')
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_phone_x'], ws, 'std')
    print('window size', ws)

DataViz.plot_dataset(dataset_copy,
                     ['acc_phone_x', 'acc_phone_x_temp_mean', 'acc_phone_x_temp_std', 'label'],
                     ['exact', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'points'])

ws = int(float(0.5 * 60000) / milliseconds_per_instance)
selected_predictor_cols = [c for c in dataset.columns if not 'label' in c]
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'mean')
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'std')

DataViz.plot_dataset(dataset,
                     ['acc_phone_x', 'gyr_phone_x', 'hr_watch_rate', 'light_phone_lux',
                      'mag_phone_x', 'press_phone_', 'pca_1', 'label'],
                     ['like', 'like', 'like', 'like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'line', 'line', 'line', 'points'])

CatAbs = CategoricalAbstraction()
dataset = CatAbs.abstract_categorical(dataset, ['label'], ['like'], 0.03,
                                      int(float(5 * 60000) / milliseconds_per_instance), 2)

print('attributes frequency domain')
# Now we move to the frequency domain, with the same window size.
for col in [c for c in dataset.columns if not 'label' in c]:
    dataset = MisVal.impute_interpolate(dataset, col)

# Using the result from Chapter 2, let us try the Kalman filter on the acc_mobile_x attribute and study the result.
original_dataset = pd.read_csv(DATA_PATH / ORIG_DATASET_FNAME, index_col=0)
original_dataset.index = pd.to_datetime(original_dataset.index)
KalFilter = KalmanFilters()
kalman_dataset = KalFilter.apply_kalman_filter(original_dataset, 'acc_mobile_x')
DataViz.plot_imputed_values(kalman_dataset, ['original', 'kalman'], 'acc_mobile_x',
                            kalman_dataset['acc_mobile_x_kalman'])
DataViz.plot_dataset(kalman_dataset, ['acc_mobile_x', 'acc_mobile_x_kalman'],
                     ['exact', 'exact'], ['line', 'line'])

# Determine the PC's for all but our target columns (the labels and the heart rate).
# We simplify by ignoring both; we could also ignore one first and apply a PC to the remainder.
PCA = PrincipalComponentAnalysis()
selected_predictor_cols = [c for c in dataset.columns if (not ('label' in c))]
pc_values = PCA.determine_pc_explained_variance(dataset, selected_predictor_cols)

# Plot the variance explained.
DataViz.plot_xy(x=[range(1, len(selected_predictor_cols) + 1)], y=[pc_values],
                xlabel='principal component number',
                ylabel='explained variance',
# And we impute for all columns except for the label in the selected way (interpolation).
for col in [c for c in dataset.columns if not 'label' in c]:
    dataset = MisVal.impute_interpolate(dataset, col)

# Let us try the Kalman filter on the acc_phone_x attribute and study the result.
original_dataset = pd.read_csv(dataset_path + 'chapter2_result.csv', index_col=0)
original_dataset.index = pd.to_datetime(original_dataset.index)
KalFilter = KalmanFilters()
kalman_dataset = KalFilter.apply_kalman_filter(original_dataset, 'acc_phone_x')
DataViz.plot_imputed_values(kalman_dataset, ['original', 'kalman'], 'acc_phone_x',
                            kalman_dataset['acc_phone_x_kalman'])
DataViz.plot_dataset(kalman_dataset, ['acc_phone_x', 'acc_phone_x_kalman'],
                     ['exact', 'exact'], ['line', 'line'])
# We ignore the Kalman filter output for now...

# Let us apply a lowpass filter and reduce the importance of the data above 1.5 Hz.
LowPass = LowPassFilter()

# Determine the sampling frequency.
fs = float(1000) / milliseconds_per_instance
cutoff = 1.5

# Let us study acc_phone_x:
new_dataset = LowPass.low_pass_filter(copy.deepcopy(dataset), 'acc_phone_x', fs,
                int(float(2000) / milliseconds_per_instance),
                int(float(4000) / milliseconds_per_instance),
                int(float(10000) / milliseconds_per_instance)]

NumAbs = NumericalAbstraction()
dataset_copy = copy.deepcopy(dataset)
for ws in window_sizes:
    dataset_copy = NumAbs.abstract_numerical(dataset_copy,
                                             ['linacc_Linear Acceleration x (m/s^2)'], ws, 'mean')
    dataset_copy = NumAbs.abstract_numerical(dataset_copy,
                                             ['linacc_Linear Acceleration x (m/s^2)'], ws, 'std')

DataViz.plot_dataset(dataset_copy,
                     ['linacc_Linear Acceleration x (m/s^2)',
                      'linacc_Linear Acceleration x (m/s^2)_temp_mean',
                      'linacc_Linear Acceleration x (m/s^2)_temp_std', 'label'],
                     ['exact', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'points'])

ws = int(float(4000) / milliseconds_per_instance)
selected_predictor_cols = ['acc_Acceleration x (m/s^2)', 'acc_Acceleration y (m/s^2)',
                           'acc_Acceleration z (m/s^2)', 'press_Pressure (hPa)',
                           'gyr_Gyroscope x (rad/s)', 'gyr_Gyroscope y (rad/s)',
                           'gyr_Gyroscope z (rad/s)', 'mag_Magnetic field x (muT)',
                           'mag_Magnetic field y (muT)', 'mag_Magnetic field z (muT)',
                           'linacc_Linear Acceleration x (m/s^2)',
                           'linacc_Linear Acceleration y (m/s^2)',
                           'linacc_Linear Acceleration z (m/s^2)', 'hr_Heart Rate']
# Chapter 4: Identifying aggregate attributes.

# First we focus on the time domain.

# Set the window sizes to the number of instances representing 5 seconds, 30 seconds and 5 minutes.
window_sizes = [int(float(5000) / milliseconds_per_instance),
                int(float(0.5 * 60000) / milliseconds_per_instance),
                int(float(5 * 60000) / milliseconds_per_instance)]

NumAbs = NumericalAbstraction()
dataset_copy = copy.deepcopy(dataset)
for ws in window_sizes:
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_phone_x'], ws, 'mean')
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_phone_x'], ws, 'std')

DataViz.plot_dataset(dataset_copy,
                     ['acc_phone_x', 'acc_phone_x_temp_mean', 'acc_phone_x_temp_std', 'label'],
                     ['exact', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'points'])

ws = int(float(0.5 * 60000) / milliseconds_per_instance)
selected_predictor_cols = [c for c in dataset.columns if not 'label' in c]
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'mean')
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'std')

CatAbs = CategoricalAbstraction()
dataset = CatAbs.abstract_categorical(dataset, ['label'], ['like'], 0.03,
                                      int(float(5 * 60000) / milliseconds_per_instance), 2)

# Now we move to the frequency domain, with the same window size.
FreqAbs = FourierTransformation()
fs = float(1000) / milliseconds_per_instance
# Determine the PC's for all but our target columns (the labels and the id).
# We simplify by ignoring both; we could also ignore one first and apply a PC to the remainder.
PCA = PrincipalComponentAnalysis()
selected_predictor_cols = [c for c in dataset.columns
                           if (not ('label' in c) and (not 'id' in c))]
pc_values = PCA.determine_pc_explained_variance(dataset, selected_predictor_cols)

# Plot the variance explained.
DataViz.plot_xy(x=[range(1, len(selected_predictor_cols) + 1)], y=[pc_values],
                xlabel='principal component number', ylabel='explained variance',
                ylim=[0, 1], line_styles=['b-'])

# We select 2 as the number of PC's, as these explain most of the variance.
n_pcs = 2

dataset = PCA.apply_pca(copy.deepcopy(dataset), selected_predictor_cols, n_pcs)

# And we visualize the result of the PC's.
DataViz.plot_dataset(dataset, ['pca_', 'label'], ['like', 'like'], ['line', 'points'])

# And the overall final dataset:
DataViz.plot_dataset(dataset,
                     ['acc_x', 'acc_y', 'acc_z', 'heartrate', 'pca_', 'label'],
                     ['like', 'like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'line', 'points'])

# Store the outcome.
dataset.to_csv(DATA_PATH / RESULT_FNAME)
                int(float(1000) / milliseconds_per_instance),
                int(float(5000) / milliseconds_per_instance),
                int(float(0.3 * 60000) / milliseconds_per_instance)]

NumAbs = NumericalAbstraction()
dataset_copy = copy.deepcopy(dataset)
for ws in window_sizes:
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['userAcceleration.x'], ws, 'mean')
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['userAcceleration.x'], ws, 'std')

DataViz.plot_dataset(dataset_copy,
                     ['userAcceleration.x', 'userAcceleration.x_temp_mean',
                      'userAcceleration.x_temp_std', 'label'],
                     ['exact', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'points'])

ws = int(float(0.3 * 60000) / milliseconds_per_instance)
selected_predictor_cols = [c for c in dataset.columns if not 'label' in c]
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'mean')
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'std')

DataViz.plot_dataset(dataset,
                     ['gravity.x', 'gravity.y', 'gravity.z',
                      'userAcceleration.x', 'userAcceleration.y', 'userAcceleration.z', 'label'],
                     ['like', 'like', 'like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'line', 'line', 'points'])
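# ---------------------------------------------------------------------------
# Illustration (not part of the original script): abstract_numerical computes
# windowed summary statistics; the core of the 'mean' and 'std' aggregations
# can be sketched with pandas rolling windows. Names and the window size are
# assumptions for this example only.
import numpy as np
import pandas as pd

s = pd.Series(np.random.randn(1000))
ws_demo = 120   # e.g. 30 seconds at one instance per 250 ms

# min_periods=ws_demo yields NaN until a full window of history is available,
# matching an abstraction that only fires on complete windows.
temp_mean = s.rolling(window=ws_demo, min_periods=ws_demo).mean()
temp_std = s.rolling(window=ws_demo, min_periods=ws_demo).std()
# ---------------------------------------------------------------------------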
from util.VisualizeDataset import VisualizeDataset
import pandas as pd

person_id = 1455390
DataViz = VisualizeDataset('assignment3.')

# CHAPTER 2
path = r'C:/Users/MICK/Desktop/ML4QS/ML4QS/Python3Code/intermediate_datafiles/Assignment3/'
df = pd.read_csv(path + 'chapter2_result.csv')
df.index = pd.to_datetime(df['time'])
df = df[df['personid'] == person_id]
DataViz.plot_dataset(df,
                     ['acc_x', 'acc_y', 'acc_z', 'heartrate', 'heartrate_std', 'label'],
                     ['like', 'like', 'like', 'exact', 'exact', 'like'],
                     ['line', 'line', 'line', 'line', 'line', 'points'])

# CHAPTER 3
path = r'C:/Users/MICK/Desktop/ML4QS/ML4QS/Python3Code/intermediate_datafiles/Assignment3/'
df = pd.read_csv(path + 'chapter3_result_final.csv')
df.index = pd.to_datetime(df['time'])
df = df[df['personid'] == person_id]
DataViz.plot_dataset(df,
                     ['acc_x', 'acc_y', 'acc_z', 'heartrate', 'pca_', 'label'],
                     ['like', 'like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'line', 'points'])
print(df.columns)
                          'labelSittingDown', 'labelSitting', 'labelStandingFromLying',
                          'labelOnAllFours', 'labelSittingOnTheGround',
                          'labelStandingFromSitting', 'labelStandingFromSittingOnTheGround'],
                         'max', '')

# Get the resulting pandas data table.
dataset = DataSet.data_table

# Plot the data
DataViz = VisualizeDataset()

# Boxplot
DataViz.plot_dataset_boxplot(dataset,
                             ['ankle_l_x', 'ankle_l_y', 'ankle_l_z',
                              'ankle_r_x', 'ankle_r_y', 'ankle_r_z',
                              'belt_x', 'belt_y', 'belt_z',
                              'chest_x', 'chest_y', 'chest_z'])

# Plot all data
DataViz.plot_dataset(dataset,
                     ['ankle_l_', 'ankle_r_', 'belt_', 'chest_', 'label'],
                     ['like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'points'])

# And print a summary of the dataset.
util.print_statistics(dataset)
datasets.append(copy.deepcopy(dataset))

# And print the table that has been included in the book.
util.print_latex_table_statistics_two_datasets(datasets[0], datasets[1])

# Finally, store the last dataset we have generated (250 ms).
# dataset.to_csv(result_dataset_path + 'chapter2_result.csv')
""" for c in periodic_predictor_cols: data_table = FreqAbs.abstract_frequency(copy.deepcopy(dataset), [c], int(float(10000) / milliseconds_per_instance), fs) DataViz.plot_dataset(data_table, [c+'_max_freq', c+'_freq_weighted', c+'_pse', c+'_freq_skewness', c+'_freq_kurtosis', 'label'], ['like', 'like', 'like', 'like', 'like', 'like'], ['line', 'line', 'line', 'line', 'line', 'points']) """ # Compute and add frequency domain features to dataset dataset = FreqAbs.abstract_frequency( dataset, periodic_predictor_cols, int(float(10000) / milliseconds_per_instance), fs) # ------------------------------------------------------------------------------------ # REDUCE OVERLAP print 'reducing overlap.' # The percentage of overlap we allow window_overlap = 0.95 skip_points = int((1 - window_overlap) * ws) dataset = dataset.iloc[::skip_points, :] dataset.to_csv(dataset_path + 'domain_features_result_95.csv') DataViz.plot_dataset( dataset, [ 'acc_x', 'gyr_x', 'lin_acc_x', 'light_illuminance', 'mag_x', 'loc_height', 'pca_1', 'label' ], ['like', 'like', 'like', 'like', 'like', 'like', 'like', 'like'], ['line', 'line', 'line', 'line', 'line', 'line', 'line', 'points'])
                           'acc_z', 'gyr_x', 'gyr_y', 'gyr_z']

for ws in window_sizes:
    dataset = NumAbs.abstract_numerical(dataset, periodic_predictor_cols, ws, 'mean')
    dataset = NumAbs.abstract_numerical(dataset, periodic_predictor_cols, ws, 'std')
    print('window size', ws)
    print(dataset.columns)

DataViz.plot_dataset(dataset, ['acc_x', 'acc_y', 'acc_z', 'label'],
                     ['exact', 'exact', 'exact', 'like'],
                     ['line', 'line', 'line', 'points'])

# ws = int(float(0.5 * 60000) / milliseconds_per_instance)
# dataset = NumAbs.abstract_numerical(dataset, periodic_predictor_cols, ws, 'mean')
# dataset = NumAbs.abstract_numerical(dataset, periodic_predictor_cols, ws, 'std')

print('temporal', dataset.shape)
print('attributes frequency domain')

# Now we move to the frequency domain, with the same window size.
FreqAbs = FourierTransformation()
fs = float(1000) / milliseconds_per_instance
# DataSetCS.add_numerical_dataset('pressure_phone.csv', 'timestamps', ['pressure'], 'avg', 'press_phone_')

# Get the resulting pandas data table.
dataset_own = DataSetOwn.data_table
# dataset_cs = DataSetCS.data_table

# Plot the data
DataViz = VisualizeDataset()

# Boxplot
DataViz.plot_dataset_boxplot(dataset_own, ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'])
# DataViz.plot_dataset_boxplot(dataset_cs, ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'])

# Plot all data
DataViz.plot_dataset(dataset_own,
                     ['acc_', 'gyr_', 'mag_', 'press_', 'pedom_phone_', 'label'],
                     ['like', 'like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'points', 'points'])
# DataViz.plot_dataset(dataset_cs, ['acc_phone', 'gyr_phone', 'mag_phone', 'press_phone_', 'label'],
#                      ['like', 'like', 'like', 'like', 'like'],
#                      ['line', 'line', 'line', 'line', 'points'])

# And print a summary of the dataset.
util.print_statistics(dataset_own)
datasets_own.append(copy.deepcopy(dataset_own))
# util.print_statistics(dataset_cs)
# datasets_cs.append(copy.deepcopy(dataset_cs))

# And print the table that has been included in the book.
util.print_latex_table_statistics_two_datasets(datasets_own[0], datasets_own[1])
# The crowdsignals dataset is commented out above, so datasets_cs is never
# populated; comparing it here would raise an IndexError.
# util.print_latex_table_statistics_two_datasets(datasets_cs[0], datasets_cs[1])