예제 #1
0
    # TODO add the other datasets

    # We add the labels provided by the users. These are categorical events that might overlap. We add them
    # as binary attributes (i.e. add a one to the attribute representing the specific value for the label if it
    # occurs within an interval).
    dataset.add_event_dataset('labels_phone.csv', 'label_start', 'label_end',
                              'label', 'binary')

    # Get the resulting pandas data table
    dataset = dataset.data_table

    # Plot the data
    DataViz = VisualizeDataset(__file__)

    # Boxplot
    DataViz.plot_dataset_boxplot(
        dataset, ['acc_mobile_x', 'acc_mobile_y', 'acc_mobile_z'])
    #DataViz.plot_dataset_boxplot(dataset, ['gyr_mobile_x', 'gyr_mobile_y', 'gyr_mobile_z'])

    # Plot all data
    # DataViz.plot_dataset(dataset, ['acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux', 'mag_', 'press_phone_', 'label'],
    #                               ['like', 'like', 'like', 'like', 'like', 'like', 'like','like'],
    #                               ['line', 'line', 'line', 'line', 'line', 'line', 'points', 'points'])

    DataViz.plot_dataset(dataset, [
        'acc_mobile_', 'gyr_mobile_', 'mag_mobile_', 'prox_mobile_distance',
        'loc_mobile_', 'label'
    ], ['like', 'like', 'like', 'like', 'like', 'like'],
                         ['line', 'line', 'line', 'points', 'line', 'points'])

    # And print a summary of the dataset.
    util.print_statistics(dataset)
예제 #2
0
    DataSet.add_binary_labels_dataset('A01_parsed_raw_data.csv', 'timestamp',
                                      ['labelWalking', 'labelFalling', 'labelLyingDown', 'labelLying',
                                       'labelSittingDown', 'labelSitting', 'labelStandingFromLying', 'labelOnAllFours',
                                       'labelSittingOnTheGround', 'labelStandingFromSitting',
                                       'labelStandingFromSittingOnTheGround'], 'max', '')

    # Get the resulting pandas data table

    dataset = DataSet.data_table

    # Plot the data

    DataViz = VisualizeDataset()

    # Boxplot
    DataViz.plot_dataset_boxplot(dataset, ['ankle_l_x', 'ankle_l_y', 'ankle_l_z', 'ankle_r_x', 'ankle_r_y', 'ankle_r_z',
                                           'belt_x', 'belt_y', 'belt_z', 'chest_x', 'chest_y', 'chest_z'])

    # Plot all data
    DataViz.plot_dataset(dataset, ['ankle_l_', 'ankle_r_', 'belt_', 'chest_', 'label'], ['like', 'like', 'like', 'like', 'like'], ['line', 'line', 'line', 'line', 'points'])

    # And print a summary of the dataset

    util.print_statistics(dataset)
    datasets.append(copy.deepcopy(dataset))

# And print the table that has been included in the book

util.print_latex_table_statistics_two_datasets(datasets[0], datasets[1])

# Finally, store the last dataset we have generated (250 ms).
#dataset.to_csv(result_dataset_path + 'chapter2_result.csv')
예제 #3
0
def main():
    """Create the chapter 2 datasets at two granularities, visualize them and store the last one.

    For each granularity in ``GRANULARITIES`` a ``CreateDataset`` object is filled with the phone and
    smartwatch measurements and the user-provided labels, the resulting table is plotted and summarized,
    and a deep copy of it is kept. Afterwards a LaTeX table comparing the two tables is printed and the
    table produced by the final loop iteration is written to ``RESULT_PATH / RESULT_FNAME``.
    """
    # Set a granularity (the discrete step size of our time series data) and choose if all resulting datasets should
    # be saved. A coarse-grained granularity of one instance per minute, and a fine-grained one with four instances
    # per second are used.
    GRANULARITIES = [60000, 250]
    SAVE_VERSIONS = False

    # Make any required directories if they don't already exist. A plain loop is used because mkdir is called
    # purely for its side effect.
    for path in (DATASET_PATH, RESULT_PATH):
        path.mkdir(exist_ok=True, parents=True)

    # Create object to visualize the data and save figures
    DataViz = VisualizeDataset(module_path=__file__)

    datasets = []
    for milliseconds_per_instance in GRANULARITIES:
        print(
            f'Creating numerical datasets from files in {DATASET_PATH} using granularity {milliseconds_per_instance}.')

        # Create an initial dataset object with the base directory for our data and a granularity and add selected
        # measurements to it
        data_engineer = CreateDataset(base_dir=DATASET_PATH, granularity=milliseconds_per_instance)

        # Add the accelerometer data (continuous numerical measurements) of the phone and the smartwatch
        # and aggregate the values per timestep by averaging the values
        data_engineer.add_numerical_dataset(file='accelerometer_phone.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='acc_phone_')
        data_engineer.add_numerical_dataset(file='accelerometer_smartwatch.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='acc_watch_')

        # Add the gyroscope data (continuous numerical measurements) of the phone and the smartwatch
        # and aggregate the values per timestep by averaging the values
        data_engineer.add_numerical_dataset(file='gyroscope_phone.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='gyr_phone_')
        data_engineer.add_numerical_dataset(file='gyroscope_smartwatch.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='gyr_watch_')

        # Add the heart rate (continuous numerical measurements) and aggregate by averaging the values
        data_engineer.add_numerical_dataset(file='heart_rate_smartwatch.csv', timestamp_col='timestamps',
                                            value_cols=['rate'], aggregation='avg', prefix='hr_watch_')

        # Add the labels provided by the users as binary attributes (i.e. add a one to the attribute representing the
        # specific value for a label if it occurs within an interval). These are categorical events that might overlap.
        data_engineer.add_event_dataset(file='labels.csv', start_timestamp_col='label_start',
                                        end_timestamp_col='label_end',
                                        value_col='label', aggregation='binary')

        # Add the amount of light sensed by the phone (continuous numerical measurements) and aggregate by averaging
        data_engineer.add_numerical_dataset(file='light_phone.csv', timestamp_col='timestamps', value_cols=['lux'],
                                            aggregation='avg', prefix='light_phone_')

        # Add the magnetometer data (continuous numerical measurements) of the phone and the smartwatch
        # and aggregate the values per timestep by averaging the values
        data_engineer.add_numerical_dataset(file='magnetometer_phone.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='mag_phone_')
        data_engineer.add_numerical_dataset(file='magnetometer_smartwatch.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='mag_watch_')

        # Add the pressure sensed by the phone (continuous numerical measurements) and aggregate by averaging again
        data_engineer.add_numerical_dataset(file='pressure_phone.csv', timestamp_col='timestamps',
                                            value_cols=['pressure'],
                                            aggregation='avg', prefix='press_phone_')

        # Get the resulting pandas data table
        dataset = data_engineer.data_table

        # Create boxplots
        DataViz.plot_dataset_boxplot(dataset=dataset, cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'acc_watch_x',
                                                            'acc_watch_y', 'acc_watch_z'])

        # Plot all data. `match` and `display` carry exactly one entry per entry in `columns`; only the label
        # columns are drawn as points.
        DataViz.plot_dataset(data_table=dataset,
                             columns=['acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux', 'mag_', 'press_phone_',
                                      'label'],
                             match=['like', 'like', 'like', 'like', 'like', 'like', 'like'],
                             display=['line', 'line', 'line', 'line', 'line', 'line', 'points'])

        # Print a summary of the dataset
        util.print_statistics(dataset=dataset)
        datasets.append(copy.deepcopy(dataset))

        # Save the various versions of the created datasets with logical filenames if needed
        # NOTE(review): these per-granularity files are written without a '.csv' suffix - confirm this is intended.
        if SAVE_VERSIONS:
            dataset.to_csv(RESULT_PATH / f'chapter2_result_{milliseconds_per_instance}')

    # Make a table like the one shown in the book, comparing the two datasets produced
    util.print_latex_table_statistics_two_datasets(dataset1=datasets[0], dataset2=datasets[1])

    # Finally, store the last dataset we generated (250 ms)
    dataset.to_csv(RESULT_PATH / RESULT_FNAME)
예제 #4
0
    # We add the pressure sensed by the phone (continuous numerical measurements) and aggregate by averaging again
    DataSet.add_numerical_dataset('pressure_phone.csv', 'timestamps',
                                  ['pressure'], 'avg', 'press_phone_')

    # Get the resulting pandas data table

    dataset = DataSet.data_table

    # Plot the data

    DataViz = VisualizeDataset()

    # Boxplot
    DataViz.plot_dataset_boxplot(dataset, [
        'acc_phone_x', 'acc_phone_y', 'acc_phone_z', 'acc_watch_x',
        'acc_watch_y', 'acc_watch_z'
    ])

    # Plot all data
    DataViz.plot_dataset(
        dataset, [
            'acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux', 'mag_',
            'press_phone_', 'label'
        ], ['like', 'like', 'like', 'like', 'like', 'like', 'like', 'like'],
        ['line', 'line', 'line', 'line', 'line', 'line', 'points', 'points'])

    # And print a summary of the dataset

    util.print_statistics(dataset)
    datasets.append(copy.deepcopy(dataset))
예제 #5
0
    # dataset.add_numerical_dataset('magnetometer_phone.csv', 'timestamps', ['x','y','z'], 'avg', 'mag_phone_')
    # dataset.add_numerical_dataset('magnetometer_smartwatch.csv', 'timestamps', ['x','y','z'], 'avg', 'mag_watch_')

    # We add the pressure sensed by the phone (continuous numerical measurements) and aggregate by averaging again
    # dataset.add_numerical_dataset('pressure_phone.csv', 'timestamps', ['pressure'], 'avg', 'press_phone_')

    # Get the resulting pandas data table
    dataset = dataset.data_table

    print(dataset)

    # Plot the data
    DataViz = VisualizeDataset(__file__)

    # Boxplot
    DataViz.plot_dataset_boxplot(dataset, ['userAcceleration.x', 'userAcceleration.y',
                                   'userAcceleration.z'])

    # Plot all data
    DataViz.plot_dataset(dataset, ['attitude.', 'gravity.', 'rotationRate.', 'userAcceleration.', 'label'],
                         ['like', 'like', 'like', 'like', 'like'],
                         ['line', 'line', 'line', 'line', 'points'])

    # And print a summary of the dataset.
    util.print_statistics(dataset)
    datasets.append(copy.deepcopy(dataset))

    # If needed, we could save the various versions of the dataset we create in the loop with logical filenames:
    # dataset.to_csv(RESULT_PATH / f'chapter2_result_{milliseconds_per_instance}')

# Make a table like the one shown in the book, comparing the two datasets produced.
util.print_latex_table_statistics_two_datasets(datasets[0], datasets[1])
예제 #6
0
#    DataSetCS.add_numerical_dataset('magnetometer_smartwatch.csv', 'timestamps', ['x','y','z'], 'avg', 'mag_watch_')

    # We add the custom pedometer data (steps and distance; continuous numerical measurements) and aggregate by averaging
    DataSetOwn.add_numerical_dataset('pedom_custom.csv', 'timestamps', ['steps', 'distance'], 'avg', 'pedom_phone_')
#    DataSetCS.add_numerical_dataset('pressure_phone.csv', 'timestamps', ['pressure'], 'avg', 'press_phone_')

    # Get the resulting pandas data table

    dataset_own = DataSetOwn.data_table
#    dataset_cs = DataSetCS.data_table

    # Plot the data
    DataViz = VisualizeDataset()

    # Boxplot
    DataViz.plot_dataset_boxplot(dataset_own, ['acc_phone_x','acc_phone_y','acc_phone_z'])
#    DataViz.plot_dataset_boxplot(dataset_cs, ['acc_phone_x','acc_phone_y','acc_phone_z'])

    # Plot all data
    DataViz.plot_dataset(dataset_own, ['acc_', 'gyr_', 'mag_', 'press_' ,'pedom_phone_', 'label'], ['like', 'like', 'like', 'like', 'like', 'like'], ['line', 'line', 'line','line', 'points', 'points'])
#    DataViz.plot_dataset(dataset_cs, ['acc_phone', 'gyr_phone', 'mag_phone', 'press_phone_', 'label'], ['like', 'like', 'like', 'like', 'like'], ['line', 'line', 'line', 'line', 'points'])

    # And print a summary of the dataset

    util.print_statistics(dataset_own)
    datasets_own.append(copy.deepcopy(dataset_own))
    
#    util.print_statistics(dataset_cs)
#    datasets_cs.append(copy.deepcopy(dataset_cs))

# And print the table that has been included in the book
예제 #7
0
# Add the labels from 'labels.csv' as binary attributes, using the 'label_start' / 'label_end' columns as
# the interval and 'label' as the value column.
# NOTE(review): `dataset` is created before the visible part of this script - presumably a dataset builder
# exposing `add_event_dataset` and `data_table`; confirm against the preceding code.
dataset.add_event_dataset('labels.csv', 'label_start', 'label_end', 'label',
                          'binary')

# From here on `dataset` is rebound to the resulting data table.
dataset = dataset.data_table
# Select the rows where the respective activity label equals 1.
dataset_walking = dataset[dataset['labelWalking'] == 1]
dataset_sitting = dataset[dataset['labelSitting'] == 1]
dataset_running = dataset[dataset['labelRunning'] == 1]
# Print the walking label column for inspection.
print(dataset['labelWalking'])
# Create the visualization object used for the plots below.
DataViz = VisualizeDataset(__file__)

# Boxplots of the three phone accelerometer columns, one plot per activity subset
# (walking, sitting, running).
DataViz.plot_dataset_boxplot(dataset_walking, [
    'acc_phone_x',
    'acc_phone_y',
    'acc_phone_z',
])
DataViz.plot_dataset_boxplot(dataset_sitting, [
    'acc_phone_x',
    'acc_phone_y',
    'acc_phone_z',
])
DataViz.plot_dataset_boxplot(dataset_running, [
    'acc_phone_x',
    'acc_phone_y',
    'acc_phone_z',
])

# # Plot all data
# DataViz.plot_dataset(dataset, ['acc_', 'gyr_',  'label'],
예제 #8
0
    # as binary attributes (i.e. add a one to the attribute representing the specific value for the label if it
    # occurs within an interval).

    DataSet.add_event_dataset('labels.csv', 'label_start', 'label_end',
                              'label', 'binary')

    # Get the resulting pandas data table

    dataset = DataSet.data_table

    # Plot the data

    DataViz = VisualizeDataset()

    # Boxplot
    DataViz.plot_dataset_boxplot(dataset, ['acc_x', 'acc_y', 'acc_z'])
    DataViz.plot_dataset_boxplot(dataset, ['gyr_x', 'gyr_y', 'gyr_z'])

    # Plot all data
    DataViz.plot_dataset(
        dataset,
        ['acc_', 'mag_', 'gyr_', 'light_', 'loc_', 'lin_acc_', 'label'],
        ['like', 'like', 'like', 'like', 'like', 'like', 'like'],
        ['line', 'line', 'line', 'line', 'line', 'line', 'points'])

    # print a summary of the dataset
    util.print_statistics(dataset)
    datasets.append(copy.deepcopy(dataset))

# And print the table that has been included in the book
# util.print_latex_table_statistics_two_datasets(datasets[0], datasets[1])
예제 #9
0
def main():
    """Explore several outlier detection approaches, then remove outliers and store the result.

    The input CSV is read from ``./intermediate_datafiles/``; its file name is taken from ``sys.argv[1]``
    (default ``chapter2_result.csv``) and the output file name from ``sys.argv[2]``
    (default ``chapter3_result_outliers.csv``).

    Step 1 plots the result of each detection approach for a fixed list of columns and removes the helper
    columns again. Step 2 replaces detected outliers by NaN in every non-label column and writes the
    resulting table to the output file.

    Raises:
        IOError: re-raised, after printing a hint, when the input file cannot be read.
    """
    # Set up file names and locations.
    DATA_PATH = Path('./intermediate_datafiles/')
    DATASET_FNAME = sys.argv[1] if len(sys.argv) > 1 else 'chapter2_result.csv'
    RESULT_FNAME = sys.argv[2] if len(sys.argv) > 2 else 'chapter3_result_outliers.csv'

    # Next, import the data from the specified location. Only the read itself is guarded, and a bare
    # `raise` keeps the original traceback intact.
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
    except IOError:
        print(
            'File not found, try to run the preceding crowdsignals scripts first!'
        )
        raise

    # Parse the date index.
    dataset.index = pd.to_datetime(dataset.index)

    # We'll create an instance of our visualization class to plot the results.
    DataViz = VisualizeDataset(__file__)

    # Step 1: Let us see whether we have some outliers we would prefer to remove.

    # Determine the columns we want to experiment on.
    outlier_columns = [
        'acc_phone_X',
        'acc_phone_Y',
        'acc_phone_Z',
        'gyr_phone_X',
        'gyr_phone_Y',
        'gyr_phone_Z',
        'mag_phone_X',
        'mag_phone_Y',
        'mag_phone_Z',
    ]

    # Create the outlier classes.
    OutlierDistr = DistributionBasedOutlierDetection()
    OutlierDist = DistanceBasedOutlierDetection()

    # And investigate the approaches for all relevant attributes.
    for col in outlier_columns:

        print(f"Applying outlier criteria for column {col}")

        # And try out all different approaches. Note that we have done some optimization
        # of the parameter values for each of the approaches by visual inspection.
        dataset = OutlierDistr.chauvenet(dataset, col)
        DataViz.plot_binary_outliers(dataset, col, col + '_outlier')
        dataset = OutlierDistr.mixture_model(dataset, col)
        DataViz.plot_dataset(dataset, [col, col + '_mixture'],
                             ['exact', 'exact'], ['line', 'points'])

        # This requires:
        # n_data_points * n_data_points * point_size =
        # 31839 * 31839 * 32 bits = ~4GB available memory
        try:
            dataset = OutlierDist.simple_distance_based(
                dataset, [col], 'euclidean', 0.10, 0.99)
            DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier')
        except MemoryError:
            print(
                'Not enough memory available for simple distance-based outlier detection...'
            )
            print('Skipping.')

        try:
            dataset = OutlierDist.local_outlier_factor(dataset, [col],
                                                       'euclidean', 5)
            DataViz.plot_dataset(dataset, [col, 'lof'], ['exact', 'exact'],
                                 ['line', 'points'])
            DataViz.plot_dataset_boxplot(dataset, ['lof'])

            # Flag every point whose local outlier factor exceeds the 99th percentile.
            lof_threshold = dataset['lof'].quantile(0.99)
            dataset['lof_outliers'] = dataset['lof'] > lof_threshold

            DataViz.plot_binary_outliers(dataset, col, 'lof_outliers')
        except MemoryError:
            print('Not enough memory available for lof...')
            print('Skipping.')

        # Remove all the stuff from the dataset again. Columns that were never created (because an
        # approach was skipped) are silently ignored.
        cols_to_remove = [
            col + '_outlier', col + '_mixture', 'simple_dist_outlier', 'lof',
            'lof_outliers'
        ]
        dataset = dataset.drop(columns=cols_to_remove, errors='ignore')

    # Step 2: remove outliers from all but the label data. The simple distance-based approach is used for
    # columns starting with 'mag' and Chauvenet's criterion for every other column. The column list is
    # materialized up front because helper columns are added and removed inside the loop.
    for col in [c for c in dataset.columns if 'label' not in c]:
        print(f'Measurement is now: {col}')
        outlier_col = f'{col}_outlier'
        if col.startswith('mag'):
            # Rename the generic result column so both branches yield '<col>_outlier'.
            dataset = OutlierDist.simple_distance_based(
                dataset, [col], 'euclidean', 0.10,
                0.99).rename(columns={'simple_dist_outlier': outlier_col})
        else:
            dataset = OutlierDistr.chauvenet(dataset, col)

        # Replace the flagged values by NaN, plot the flags and drop the helper column again.
        dataset.loc[dataset[outlier_col] == True, col] = np.nan
        DataViz.plot_binary_outliers(dataset, col, outlier_col)
        del dataset[outlier_col]

    dataset.to_csv(DATA_PATH / RESULT_FNAME)
예제 #10
0
                              for x in list(sensors.keys())[:-1]] + ['points'],
                             save_path='concatenated_250')

    if task != 'create_plots':
        exit(2)

    for granularity in granularities:
        for i, activity_path in enumerate(activity_paths):
            print('Activity: ', activity_path)
            dataset = CreateDataset(activity_path, granularity)

            for sensor_name, sensor_axis in sensors.items():
                dataset.add_numerical_dataset(sensor_name, time_column_name,
                                              sensor_axis, 'avg',
                                              axis_abbreviations[sensor_name])
            dataset = dataset.data_table

            DataViz = VisualizeDataset(__file__)
            for sensor_name, sensor_axis in sensors.items():
                DataViz.plot_dataset_boxplot(dataset, [
                    axis_abbreviations[sensor_name] + x
                    for x in sensors[sensor_name]
                ],
                                             save_path=str(granularity) + '/' +
                                             activities[i])
            DataViz.plot_dataset(
                dataset, [x for x in axis_abbreviations.values()],
                ['like' for x in axis_abbreviations.keys()],
                ['line' for x in axis_abbreviations.keys()],
                save_path=str(granularity) + '/' + activities[i])
예제 #11
0
    #Add location
    ##dataset.add_numerical_dataset('Location.csv', 'timestamps', ['lat', 'lon', 'height', 'velocity', 'direction', 'horizontal', 'vertical'], 'avg', 'loc_')
    ##dataset.add_numerical_dataset('Location.csv', 'timestamps', ['height', 'velocity', 'horizontal', 'vertical'], 'avg', 'loc_')
    dataset.add_numerical_dataset('Location.csv', 'timestamps',
                                  ['height', 'velocity'], 'avg', 'loc_')

    # Get the resulting pandas data table
    dataset = dataset.data_table
    print(dataset)

    # Plot the data
    DataViz = VisualizeDataset(__file__)

    # Boxplot
    DataViz.plot_dataset_boxplot(dataset, ['acc_x', 'acc_y', 'acc_z'])
    DataViz.plot_dataset_boxplot(dataset, ['gyr_x', 'gyr_y', 'gyr_z'])
    DataViz.plot_dataset_boxplot(dataset, ['bar_x'])
    DataViz.plot_dataset_boxplot(dataset, ['mag_x', 'mag_y', 'mag_z'])
    ##DataViz.plot_dataset_boxplot(dataset, ['acc_x','acc_y','acc_z',])

    # Plot all data
    DataViz.plot_dataset(
        dataset, ['acc_', 'gyr_', 'bar_', 'mag_', 'lin_acc_', 'loc_', 'label'],
        ['like', 'like', 'like', 'like', 'like', 'like', 'like'],
        ['line', 'line', 'line', 'line', 'line', 'line', 'points'])

    # And print a summary of the dataset.
    #util.print_statistics(dataset)
    datasets.append(copy.deepcopy(dataset))