def main():
    DATA_PATH = Path('./intermediate_datafiles/')
    DATASET_FNAME = 'chapter2_result.csv'
    RESULT_FNAME = 'chapter3_heart_rate.csv'

    try:
        dataset = pd.read_csv(Path(DATA_PATH / DATASET_FNAME), index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print('File not found, try to run the preceding crowdsignals scripts first!')
        raise e

    DataViz = VisualizeDataset(__file__)

    # Original heart rate values
    # DataViz.plot_imputed_values(dataset, ['original'], 'hr_watch_rate')

    Kalman = KalmanFilters()
    dataset = Kalman.apply_kalman_filter(dataset, 'hr_watch_rate')

    DataViz.plot_dataset(dataset, ['hr_watch_rate', 'hr_watch_rate_kalman'],
                         ['exact', 'exact'], ['line', 'line'])
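# ---------------------------------------------------------------------------
# Illustration (not part of the original script): a minimal sketch of the
# smoothing idea behind the KalmanFilters class, written directly against the
# pykalman package on synthetic data. The series name `noisy_hr` and the
# parameter choices are assumptions made for this example only.
import numpy as np
from pykalman import KalmanFilter

rng = np.random.default_rng(0)
# A slowly drifting "heart rate" with heavy measurement noise.
noisy_hr = 70 + np.cumsum(rng.normal(0, 0.1, 500)) + rng.normal(0, 2.0, 500)

# One-dimensional random-walk model: state and observation are both scalar.
kf = KalmanFilter(transition_matrices=[[1]], observation_matrices=[[1]])
kf = kf.em(noisy_hr, n_iter=5)            # estimate the noise covariances
smoothed_means, _ = kf.smooth(noisy_hr)   # smoothed estimate of the series
# ---------------------------------------------------------------------------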
def main():
    DataViz = VisualizeDataset()
    dataset_path = './intermediate_datafiles/'

    try:
        dataset = pd.read_csv(dataset_path + 'chapter3_result_final.csv', index_col=0)
    except IOError as e:
        print('File not found, try to run previous crowdsignals scripts first!')
        raise e

    dataset.index = pd.to_datetime(dataset.index)
    milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

    # Now we move to the frequency domain, with a 10-second window.
    FreqAbs = FourierTransformation()
    fs = float(1000) / milliseconds_per_instance
    ws = int(float(10000) / milliseconds_per_instance)

    periodic_predictor_cols = ['acc_phone_x', 'acc_phone_y', 'acc_phone_z',
                               'acc_watch_x', 'acc_watch_y', 'acc_watch_z',
                               'gyr_phone_x', 'gyr_phone_y', 'gyr_phone_z',
                               'gyr_watch_x', 'gyr_watch_y', 'gyr_watch_z',
                               'mag_phone_x', 'mag_phone_y', 'mag_phone_z',
                               'mag_watch_x', 'mag_watch_y', 'mag_watch_z']

    data_table = FreqAbs.abstract_frequency(copy.deepcopy(dataset), ['acc_phone_x'], ws, fs)

    # Spectral analysis.
    DataViz.plot_dataset(data_table,
                         ['acc_phone_x_max_freq', 'acc_phone_x_freq_weighted', 'acc_phone_x_pse', 'label'],
                         ['like', 'like', 'like', 'like'],
                         ['line', 'line', 'line', 'points'])

    dataset = FreqAbs.abstract_frequency(dataset, periodic_predictor_cols, ws, fs)

    # Now we only take a certain percentage of overlap in the windows; otherwise
    # our training examples will be too much alike. The percentage of overlap we allow:
    window_overlap = 0.9
    skip_points = int((1 - window_overlap) * ws)
    dataset = dataset.iloc[::skip_points, :]

    DataViz.plot_dataset(dataset,
                         ['acc_phone_x', 'gyr_phone_x', 'hr_watch_rate', 'light_phone_lux',
                          'mag_phone_x', 'press_phone_', 'pca_1', 'label'],
                         ['like', 'like', 'like', 'like', 'like', 'like', 'like', 'like'],
                         ['line', 'line', 'line', 'line', 'line', 'line', 'line', 'points'])
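# ---------------------------------------------------------------------------
# Illustration (not part of the original script): the frequency features that
# abstract_frequency produces (dominant frequency, frequency-weighted average,
# power spectral entropy) can be sketched for one window with plain numpy.
# The window below is synthetic; all names are made up for this example.
import numpy as np

fs_demo = 10.0                                   # assumed sampling rate (Hz)
t = np.arange(0, 10, 1 / fs_demo)
window = np.sin(2 * np.pi * 1.5 * t) + 0.3 * np.random.randn(len(t))

amp = np.abs(np.fft.rfft(window))                # one-sided amplitude spectrum
freqs = np.fft.rfftfreq(len(window), d=1 / fs_demo)

max_freq = freqs[np.argmax(amp[1:]) + 1]         # dominant non-DC frequency
freq_weighted = np.sum(freqs * amp) / np.sum(amp)
psd = amp ** 2 / np.sum(amp ** 2)                # normalised power spectrum
pse = -np.sum(psd * np.log(psd + 1e-12))         # power spectral entropy

print(max_freq, freq_weighted, pse)
# ---------------------------------------------------------------------------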
def main():
    dataset_path = './intermediate_datafiles/'
    dataset = pd.read_csv(dataset_path + 'chapter2_result.csv', index_col=0)
    outlier_columns = ['acc_phone_x', 'light_phone_lux']

    DataViz = VisualizeDataset()
    OutlierDistr = DistributionBasedOutlierDetection()
    OutlierDist = DistanceBasedOutlierDetection()

    dataset.index = pd.to_datetime(dataset.index)

    # input() returns a string in Python 3, so compare against string values.
    start = input("choose method: [1],[2],[3],[4] ")

    if start == '1':
        param = float(input("Chauvenet\ninput parameters: c "))
        for col in outlier_columns:
            dataset = OutlierDistr.chauvenet(dataset, col, param)
            DataViz.plot_binary_outliers(dataset, col, col + '_outlier')
    elif start == '2':
        components, n_iter = input("Mixture model\ninput parameters: components, iter ").split(',')
        components = int(components)
        n_iter = int(n_iter)
        for col in outlier_columns:
            dataset = OutlierDistr.mixture_model(dataset, col, components, n_iter)
            DataViz.plot_dataset(dataset, [col, col + '_mixture'],
                                 ['exact', 'exact'], ['line', 'points'])
    elif start == '3':
        d_min, f_min = input("Simple distance-based\ninput parameters: d_min, f_min ").split()
        d_min = float(d_min)
        f_min = float(f_min)
        for col in outlier_columns:
            dataset = OutlierDist.simple_distance_based(dataset, [col], 'euclidean', d_min, f_min)
            DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier')
    elif start == '4':
        k = int(input("Local outlier factor\ninput parameters: k "))
        for col in outlier_columns:
            dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', k)
            DataViz.plot_dataset(dataset, [col, 'lof'],
                                 ['exact', 'exact'], ['line', 'points'])
    else:
        print("no method selected")
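# ---------------------------------------------------------------------------
# Illustration (not part of the original script): a minimal sketch of
# Chauvenet's criterion, the idea behind OutlierDistr.chauvenet, written with
# scipy on synthetic data. The criterion value c and the data are assumptions.
import numpy as np
from scipy import stats

def chauvenet_mask(values, c=0.5):
    # Flag points whose expected count this extreme, under a normal
    # distribution fitted to the data, falls below the criterion value c.
    values = np.asarray(values, dtype=float)
    n = len(values)
    mean, std = values.mean(), values.std()
    # Two-sided tail probability of a deviation at least this large.
    prob = 2 * stats.norm.sf(np.abs(values - mean) / std)
    return n * prob < c

data = np.concatenate([np.random.normal(0, 1, 1000), [8.0, -9.0]])
print(np.where(chauvenet_mask(data))[0])   # indices of flagged outliers
# ---------------------------------------------------------------------------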
# Optional: inspect normality of each measurement with a grid of Q-Q plots.
# The grid setup below (i, j and the 5x5 subplot figure) is implied by the loop.
# i, j = 0, 0
# fig, ax = plt.subplots(5, 5)
# for col in [c for c in dataset.columns if not 'label' in c]:
#     sm.qqplot(dataset[col].values, line='s', ax=ax[i, j])
#     ax[i, j].set_xticks([])
#     ax[i, j].set_yticks([])
#     ax[i, j].set_xticklabels([])
#     ax[i, j].set_yticklabels([])
#     ax[i, j].set_title(col)
#     i += 1
#     if i == 5:
#         i = 0
#         j += 1
# plt.show()

DataViz.plot_dataset(
    dataset,
    ['acc_', 'press_', 'gyr_', 'mag_', 'linacc_', 'hr_', 'label'],
    ['like', 'like', 'like', 'like', 'like', 'like', 'like'],
    ['line', 'line', 'line', 'line', 'line', 'points', 'points'])

# Compute the number of milliseconds covered by an instance based on the first two rows.
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

# Step 1: Let us see whether we have some outliers we would prefer to remove.

# Determine the columns we want to experiment on: all non-label columns.
# outlier_columns = ['acc_phone_x', 'light_phone_lux']
outlier_columns = [c for c in dataset.columns if not 'label' in c]

# Create the outlier classes.
OutlierDistr = DistributionBasedOutlierDetection()
OutlierDist = DistanceBasedOutlierDetection()
# Plot the data
DataViz = VisualizeDataset(__file__)

# Boxplot
DataViz.plot_dataset_boxplot(dataset, ['acc_mobile_x', 'acc_mobile_y', 'acc_mobile_z'])
# DataViz.plot_dataset_boxplot(dataset, ['gyr_mobile_x', 'gyr_mobile_y', 'gyr_mobile_z'])

# Plot all data
# DataViz.plot_dataset(dataset,
#                      ['acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux', 'mag_', 'press_phone_', 'label'],
#                      ['like', 'like', 'like', 'like', 'like', 'like', 'like'],
#                      ['line', 'line', 'line', 'line', 'line', 'line', 'points'])
DataViz.plot_dataset(dataset,
                     ['acc_mobile_', 'gyr_mobile_', 'mag_mobile_', 'prox_mobile_distance',
                      'loc_mobile_', 'label'],
                     ['like', 'like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'points', 'line', 'points'])

# And print a summary of the dataset.
util.print_statistics(dataset)
datasets.append(copy.deepcopy(dataset))

# If needed, we could save the various versions of the dataset we create in the loop with logical filenames:
# dataset.to_csv(RESULT_PATH / f'chapter2_result_{milliseconds_per_instance}')

# Make a table like the one shown in the book, comparing the two datasets produced.
util.print_latex_table_statistics_two_datasets(datasets[0], datasets[1])

# Finally, store the last dataset we generated (250 ms).
dataset.to_csv(RESULT_PATH / RESULT_FNAME)
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

# Chapter 4: Identifying aggregate attributes.

# First we focus on the time domain.

# Set the window sizes to the number of instances representing 5 seconds, 30 seconds and 5 minutes.
window_sizes = [int(float(5000) / milliseconds_per_instance),
                int(float(0.5 * 60000) / milliseconds_per_instance),
                int(float(5 * 60000) / milliseconds_per_instance)]

NumAbs = NumericalAbstraction()
dataset_copy = copy.deepcopy(dataset)
for ws in window_sizes:
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['ax'], ws, 'mean')
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['ax'], ws, 'std')

DataViz.plot_dataset(dataset_copy, ['ax', 'ax_temp_mean', 'ax_temp_std', 'label'],
                     ['exact', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'points'])

ws = int(float(0.5 * 60000) / milliseconds_per_instance)
selected_predictor_cols = [c for c in dataset.columns if not 'label' in c]
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'mean')
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'std')

CatAbs = CategoricalAbstraction()
dataset = CatAbs.abstract_categorical(dataset, ['label'], ['like'], 0.03,
                                      int(float(5 * 60000) / milliseconds_per_instance), 2)

# Now we move to the frequency domain, with the same window size.
FreqAbs = FourierTransformation()
fs = float(1000) / milliseconds_per_instance

periodic_predictor_cols = ['gFx', 'gFy', 'gFz', 'ax', 'ay', 'az',
                           'wx', 'wy', 'wz', 'p', 'Bx', 'By', 'Bz', 'Azimuth',
                           ]

# data_table = FreqAbs.abstract_frequency(copy.deepcopy(dataset), ['acc_phone_Y'],
#                                         int(float(4000) / milliseconds_per_instance), fs)
# Spectral analysis.
# DataViz.plot_dataset(data_table,
#                      ['acc_phone_Y_max_freq', 'acc_phone_Y_freq_weighted', 'acc_phone_Y_pse', 'label'],
#                      ['like', 'like', 'like', 'like'], ['line', 'line', 'line', 'points'])

# We use a 4-second window.
ws_freq = int(float(4000) / milliseconds_per_instance)
dataset = FreqAbs.abstract_frequency(dataset, periodic_predictor_cols, ws_freq, fs)

# Now we only take a certain percentage of overlap in the windows; otherwise
# our training examples will be too much alike.
ws = int(float(4000) / milliseconds_per_instance)

# The percentage of overlap we allow.
window_overlap = 0.9
skip_points = int((1 - window_overlap) * ws)
dataset = dataset.iloc[::skip_points, :]

dataset.to_csv(DATA_PATH / RESULT_FNAME)

DataViz.plot_dataset(dataset,
                     ['acc_phone_X', 'gyr_phone_X', 'mag_phone_X', 'pca_1', 'label'],
                     ['like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'points'])
# DataViz.plot_dataset(kalman_dataset, ['acc_phone_X', 'acc_phone_X_kalman'],
#                      ['exact', 'exact'], ['line', 'line'])
# We ignore the Kalman filter output for now...

# Let us apply a lowpass filter and reduce the importance of the data above the cutoff frequency.
LowPass = LowPassFilter()

# Determine the sampling frequency.
print("milliseconds_per_instance ", milliseconds_per_instance)
fs = float(4000) / milliseconds_per_instance  # old value: 1000
cutoff = 4

# Let us study acc_phone_X:
new_dataset = LowPass.low_pass_filter(copy.deepcopy(dataset), 'acc_phone_X', fs, cutoff, order=10)
DataViz.plot_dataset(new_dataset.iloc[int(0 * len(new_dataset.index)):int(0.8 * len(new_dataset.index)), :],
                     ['acc_phone_X', 'acc_phone_X_lowpass'],
                     ['exact', 'exact'], ['line', 'line'])

# And now let us include all measurements that have a form of periodicity (and filter them):
periodic_measurements = ['acc_phone_X', 'acc_phone_Y', 'acc_phone_Z',
                         'gyr_phone_X', 'gyr_phone_Y', 'gyr_phone_Z',
                         'mag_phone_X', 'mag_phone_Y', 'mag_phone_Z']

for col in periodic_measurements:
    dataset = LowPass.low_pass_filter(dataset, col, fs, cutoff, order=10)
    DataViz.plot_dataset(dataset.iloc[int(0.5 * len(new_dataset.index)):int(0.8 * len(new_dataset.index))],
                         [col, col + '_lowpass'], ['exact', 'exact'], ['line', 'line'])
    dataset[col] = dataset[col + '_lowpass']
    del dataset[col + '_lowpass']

# Determine the PC's for all but our target columns (the labels and the heart rate).
# We simplify by ignoring both; we could also ignore one first and apply a PC to the remainder.
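# ---------------------------------------------------------------------------
# Illustration (not part of the original script): the LowPassFilter class
# wraps a Butterworth low-pass filter; a minimal sketch with scipy on a
# synthetic signal might look like this. The sampling rate, cutoff and the
# lower filter order are assumptions chosen for this example.
import numpy as np
from scipy.signal import butter, filtfilt

fs_demo = 100.0          # assumed sampling rate (Hz)
cutoff_demo = 1.5        # pass everything below 1.5 Hz
t = np.arange(0, 5, 1 / fs_demo)
signal = np.sin(2 * np.pi * 0.5 * t) + 0.5 * np.sin(2 * np.pi * 10 * t)

# scipy expects the cutoff normalised by the Nyquist frequency.
b, a = butter(N=5, Wn=cutoff_demo / (0.5 * fs_demo), btype='low')
filtered = filtfilt(b, a, signal)   # zero-phase filtering, no time shift
# ---------------------------------------------------------------------------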
dataset = DataSet.data_table

# Plot the data
DataViz = VisualizeDataset()

# Boxplot
DataViz.plot_dataset_boxplot(dataset, ['acc_phone_x', 'acc_phone_y', 'acc_phone_z',
                                       'acc_watch_x', 'acc_watch_y', 'acc_watch_z'])

# Plot all data (one match and one display entry per column pattern).
DataViz.plot_dataset(dataset,
                     ['acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux',
                      'mag_', 'press_phone_', 'label'],
                     ['like', 'like', 'like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'line', 'line', 'points'])

# And print a summary of the dataset.
util.print_statistics(dataset)
datasets.append(copy.deepcopy(dataset))

# And print the table that has been included in the book.
util.print_latex_table_statistics_two_datasets(datasets[0], datasets[1])

# Finally, store the last dataset we have generated (250 ms).
dataset.to_csv(result_dataset_path + 'chapter2_result.csv')
# Determine the columns we want to experiment on.
outlier_columns = ['acc_y', 'lin_acc_x']

# Create the outlier classes.
OutlierDistr = DistributionBasedOutlierDetection()
OutlierDist = DistanceBasedOutlierDetection()

# And investigate the approaches for all relevant attributes.
for col in outlier_columns:
    # And try out all different approaches. Note that we have done some optimization
    # of the parameter values for each of the approaches by visual inspection.
    dataset = OutlierDistr.chauvenet(dataset, col)
    DataViz.plot_binary_outliers(dataset, col, col + '_outlier', "Chauvenet's criterion")

    dataset = OutlierDistr.mixture_model(dataset, col)
    DataViz.plot_dataset(dataset, [col, col + '_mixture'], 'Mixture models',
                         ['exact', 'exact'], ['line', 'points'])

    # The simple distance-based approach requires:
    # n_data_points * n_data_points * point_size =
    # 31839 * 31839 * 64 bits = ~8GB of available memory.
    # try:
    #     dataset = OutlierDist.simple_distance_based(dataset, [col], 'euclidean', 0.10, 0.99)
    #     DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier', 'Simple distance-based approach')
    # except MemoryError as e:
    #     print('Not enough memory available for simple distance-based outlier detection...')
    #     print('Skipping.')

    # try:
    #     dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', 5)
    #     DataViz.plot_dataset(dataset, [col, 'lof'], 'Local outlier factor',
    #                          ['exact', 'exact'], ['line', 'points'])
    # except MemoryError as e:
    #     print('Not enough memory available for lof...')
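# ---------------------------------------------------------------------------
# Illustration (not part of the original script): a sketch of the mixture
# model idea behind OutlierDistr.mixture_model, built on scikit-learn's
# GaussianMixture with synthetic data. The component count and the data are
# assumptions for this example only.
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(1)
values = np.concatenate([rng.normal(0, 1, 900),
                         rng.normal(8, 0.5, 100)]).reshape(-1, 1)

gmm = GaussianMixture(n_components=3, max_iter=100).fit(values)
log_density = gmm.score_samples(values)   # low log-likelihood = candidate outlier
# ---------------------------------------------------------------------------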
                         'label', 'binary')

# Get the resulting pandas data table.
dataset = DataSet.data_table

# Plot the data
DataViz = VisualizeDataset()

# Boxplot
DataViz.plot_dataset_boxplot(dataset, ['acc_x', 'acc_y', 'acc_z'])
DataViz.plot_dataset_boxplot(dataset, ['gyr_x', 'gyr_y', 'gyr_z'])

# Plot all data
DataViz.plot_dataset(dataset,
                     ['acc_', 'mag_', 'gyr_', 'light_', 'loc_', 'lin_acc_', 'label'],
                     ['like', 'like', 'like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'line', 'line', 'points'])

# Print a summary of the dataset.
util.print_statistics(dataset)
datasets.append(copy.deepcopy(dataset))

# And print the table that has been included in the book.
# util.print_latex_table_statistics_two_datasets(datasets[0], datasets[1])

# Finally, store the last dataset we have generated (250 ms).
dataset.to_csv(result_dataset_path + 'aggregation_result.csv')
# Let us apply a lowpass filter and reduce the importance of the data above 1.5 Hz.
LowPass = LowPassFilter()

# Determine the sampling frequency.
fs = float(1000) / milliseconds_per_instance
cutoff = 1.5

# Let us study acc_x. Note: .ix is deprecated, so we slice with .iloc instead.
new_dataset = LowPass.low_pass_filter(copy.deepcopy(dataset), 'acc_x', fs, cutoff, order=10)
DataViz.plot_dataset(new_dataset.iloc[int(0.4 * len(new_dataset.index)):int(0.43 * len(new_dataset.index)), :],
                     ['acc_x', 'acc_x_lowpass'], ['exact', 'exact'], ['line', 'line'])

# And now let us include all measurements that have a form of periodicity (and filter them):
periodic_measurements = ['acc_x', 'acc_y', 'acc_z',
                         'lin_acc_x', 'lin_acc_y', 'lin_acc_z',
                         'gyr_x', 'gyr_y', 'gyr_z',
                         'mag_x', 'mag_y', 'mag_z']

for col in periodic_measurements:
    dataset = LowPass.low_pass_filter(dataset, col, fs, cutoff, order=10)
    dataset[col] = dataset[col + '_lowpass']
    del dataset[col + '_lowpass']

# Determine the PC's for all but our target columns (the labels).
#     .apply(lambda group: group.reindex(full_idx, method='nearest'))
#     .reset_index(level=0, drop=True)
#     .sort_index()
# )

dataset = dataset.drop(columns=['label'])

NumAbs = NumericalAbstraction()
dataset_copy = copy.deepcopy(dataset)
for ws in window_sizes:
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_x'], ws, 'mean')
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_x'], ws, 'std')

DataViz.plot_dataset(dataset_copy,
                     ['acc_x', 'acc_x_temp_mean', 'acc_x_temp_std', 'label'],
                     ['like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'points'])

ws = int(float(0.5 * 60000) / milliseconds_per_instance)
selected_predictor_cols = [c for c in dataset.columns if not 'label' in c]
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'mean')
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'std')

DataViz.plot_dataset(dataset,
                     ['acc_x', 'acc_y', 'acc_z', 'pca_1', 'label'],
                     ['like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'points'])

CatAbs = CategoricalAbstraction()
dataset = CatAbs.abstract_categorical(dataset, ['label'], ['like'], 0.03,
                                      int(float(5 * 60000) / milliseconds_per_instance), 2)

# Now we move to the frequency domain, with the same window size.
FreqAbs = FourierTransformation()
pc_values = PCA.determine_pc_explained_variance(dataset, selected_predictor_cols)

# Plot the variance explained.
plot.plot(range(1, len(selected_predictor_cols) + 1), pc_values, 'b-')
plot.xlabel('principal component number')
plot.ylabel('explained variance')
plot.show(block=False)

# We select 4 as the number of PC's, as these explain most of the variance.
n_pcs = 4

dataset = PCA.apply_pca(copy.deepcopy(dataset), selected_predictor_cols, n_pcs)

# And we visualize the result of the PC's.
DataViz.plot_dataset(dataset, ['pca_', 'label'], ['like', 'like'], ['line', 'points'])

# And the overall final dataset:
DataViz.plot_dataset(dataset,
                     ['acc_', 'gyr_', 'mag_', 'press_phone_', 'label'],
                     ['like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'points'])

# Store the outcome.
dataset.to_csv(dataset_path + 'mydata_chapter3_result_final.csv')
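# ---------------------------------------------------------------------------
# Illustration (not part of the original script): choosing the number of
# principal components from the explained variance, sketched directly with
# scikit-learn on synthetic data. All names here are example assumptions;
# the alias SkPCA avoids shadowing the PCA object used above.
import numpy as np
from sklearn.decomposition import PCA as SkPCA
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(3)
X = rng.normal(size=(200, 6))
X[:, 3] = 2 * X[:, 0] + 0.1 * rng.normal(size=200)   # a correlated column

X_std = StandardScaler().fit_transform(X)   # PCA is sensitive to scale
pca = SkPCA().fit(X_std)
print(pca.explained_variance_ratio_)        # basis for picking n_pcs
scores = pca.transform(X_std)[:, :4]        # keep the first n_pcs components
# ---------------------------------------------------------------------------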
                           'gyr_Gyroscope y (rad/s)', 'gyr_Gyroscope z (rad/s)',
                           'mag_Magnetic field x (muT)', 'mag_Magnetic field y (muT)',
                           'mag_Magnetic field z (muT)',
                           'linacc_Linear Acceleration x (m/s^2)',
                           'linacc_Linear Acceleration y (m/s^2)',
                           'linacc_Linear Acceleration z (m/s^2)',
                           'hr_Heart Rate']

data_table = FreqAbs.abstract_frequency(copy.deepcopy(dataset),
                                        ['linacc_Linear Acceleration x (m/s^2)'],
                                        int(float(10000) / milliseconds_per_instance), fs)

# Spectral analysis.
DataViz.plot_dataset(data_table,
                     ['linacc_Linear Acceleration x (m/s^2)_max_freq',
                      'linacc_Linear Acceleration x (m/s^2)_freq_weighted',
                      'linacc_Linear Acceleration x (m/s^2)_pse', 'label'],
                     ['like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'points'])
print('no plot')

dataset = FreqAbs.abstract_frequency(dataset, periodic_predictor_cols,
                                     int(float(10000) / milliseconds_per_instance), fs)

# Now we only take a certain percentage of overlap in the windows; otherwise
# our training examples will be too much alike.
# window_overlap = 0.9
# skip_points = int((1 - window_overlap) * ws)
# dataset = dataset.iloc[::skip_points, :]
def main():
    # Set a granularity (the discrete step size of our time series data) and choose whether all resulting datasets
    # should be saved. A coarse-grained granularity of one instance per minute and a fine-grained one with four
    # instances per second are used.
    GRANULARITIES = [60000, 250]
    SAVE_VERSIONS = False

    # We can call Path.mkdir(exist_ok=True) to make any required directories if they don't already exist.
    [path.mkdir(exist_ok=True, parents=True) for path in [DATASET_PATH, RESULT_PATH]]

    # Create object to visualize the data and save figures.
    DataViz = VisualizeDataset(module_path=__file__)

    datasets = []
    for milliseconds_per_instance in GRANULARITIES:
        print(f'Creating numerical datasets from files in {DATASET_PATH} using granularity {milliseconds_per_instance}.')

        # Create an initial dataset object with the base directory for our data and a granularity, and add selected
        # measurements to it.
        data_engineer = CreateDataset(base_dir=DATASET_PATH, granularity=milliseconds_per_instance)

        # Add the accelerometer data (continuous numerical measurements) of the phone and the smartwatch
        # and aggregate the values per timestep by averaging the values.
        data_engineer.add_numerical_dataset(file='accelerometer_phone.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='acc_phone_')
        data_engineer.add_numerical_dataset(file='accelerometer_smartwatch.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='acc_watch_')

        # Add the gyroscope data (continuous numerical measurements) of the phone and the smartwatch
        # and aggregate the values per timestep by averaging the values.
        data_engineer.add_numerical_dataset(file='gyroscope_phone.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='gyr_phone_')
        data_engineer.add_numerical_dataset(file='gyroscope_smartwatch.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='gyr_watch_')

        # Add the heart rate (continuous numerical measurements) and aggregate by averaging the values.
        data_engineer.add_numerical_dataset(file='heart_rate_smartwatch.csv', timestamp_col='timestamps',
                                            value_cols=['rate'], aggregation='avg', prefix='hr_watch_')

        # Add the labels provided by the users as binary attributes (i.e. add a one to the attribute representing the
        # specific value for a label if it occurs within an interval). These are categorical events that might overlap.
        data_engineer.add_event_dataset(file='labels.csv', start_timestamp_col='label_start',
                                        end_timestamp_col='label_end', value_col='label',
                                        aggregation='binary')

        # Add the amount of light sensed by the phone (continuous numerical measurements) and aggregate by averaging.
        data_engineer.add_numerical_dataset(file='light_phone.csv', timestamp_col='timestamps',
                                            value_cols=['lux'], aggregation='avg', prefix='light_phone_')

        # Add the magnetometer data (continuous numerical measurements) of the phone and the smartwatch
        # and aggregate the values per timestep by averaging the values.
        data_engineer.add_numerical_dataset(file='magnetometer_phone.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='mag_phone_')
        data_engineer.add_numerical_dataset(file='magnetometer_smartwatch.csv', timestamp_col='timestamps',
                                            value_cols=['x', 'y', 'z'], aggregation='avg', prefix='mag_watch_')

        # Add the pressure sensed by the phone (continuous numerical measurements) and aggregate by averaging again.
        data_engineer.add_numerical_dataset(file='pressure_phone.csv', timestamp_col='timestamps',
                                            value_cols=['pressure'], aggregation='avg', prefix='press_phone_')

        # Get the resulting pandas data table.
        dataset = data_engineer.data_table

        # Create boxplots.
        DataViz.plot_dataset_boxplot(dataset=dataset,
                                     cols=['acc_phone_x', 'acc_phone_y', 'acc_phone_z',
                                           'acc_watch_x', 'acc_watch_y', 'acc_watch_z'])

        # Plot all data (one match and one display entry per column pattern).
        DataViz.plot_dataset(data_table=dataset,
                             columns=['acc_', 'gyr_', 'hr_watch_rate', 'light_phone_lux',
                                      'mag_', 'press_phone_', 'label'],
                             match=['like', 'like', 'like', 'like', 'like', 'like', 'like'],
                             display=['line', 'line', 'line', 'line', 'line', 'line', 'points'])

        # Print a summary of the dataset.
        util.print_statistics(dataset=dataset)
        datasets.append(copy.deepcopy(dataset))

        # Save the various versions of the created datasets with logical filenames if needed.
        if SAVE_VERSIONS:
            dataset.to_csv(RESULT_PATH / f'chapter2_result_{milliseconds_per_instance}')

    # Make a table like the one shown in the book, comparing the two datasets produced.
    util.print_latex_table_statistics_two_datasets(dataset1=datasets[0], dataset2=datasets[1])

    # Finally, store the last dataset we generated (250 ms).
    dataset.to_csv(RESULT_PATH / RESULT_FNAME)
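# ---------------------------------------------------------------------------
# Illustration (not part of the original script): the CreateDataset class
# above bins raw timestamped measurements into fixed-width intervals. A
# minimal sketch of the same 'avg' aggregation with plain pandas resampling,
# on synthetic data (all names here are made up for this example):
import numpy as np
import pandas as pd

idx = pd.date_range('2024-01-01', periods=10_000, freq='10ms')
raw = pd.DataFrame({'x': np.random.randn(len(idx))}, index=idx)

# Average all raw samples falling into each 250 ms bucket.
dataset_250ms = raw.resample('250ms').mean()
# ---------------------------------------------------------------------------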
# We simplify by ignoring both; we could also ignore one first and apply a PC to the remainder.
PCA = PrincipalComponentAnalysis()
selected_predictor_cols = [c for c in dataset.columns
                           if (not ('label' in c)) and (not (c == 'hr_watch_rate'))]
pc_values = PCA.determine_pc_explained_variance(dataset, selected_predictor_cols)

# Plot the variance explained.
DataViz.plot_xy(x=[range(1, len(selected_predictor_cols) + 1)], y=[pc_values],
                xlabel='principal component number', ylabel='explained variance',
                ylim=[0, 1], line_styles=['b-'])

# We select 4 as the number of PC's, as these explain most of the variance.
n_pcs = 4

dataset = PCA.apply_pca(copy.deepcopy(dataset), selected_predictor_cols, n_pcs)

# And we visualize the result of the PC's.
DataViz.plot_dataset(dataset, ['pca_', 'label'], ['like', 'like'], ['line', 'points'])

# And the overall final dataset (one match and one display entry per column pattern):
DataViz.plot_dataset(dataset,
                     ['attitude', 'gravity', 'rotationRate', 'userAcceleration', 'label'],
                     ['like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'points'])

# Store the outcome.
dataset.to_csv(DATA_PATH / RESULT_FNAME)
def main():
    # Set up file names and locations.
    DATA_PATH = Path('./intermediate_datafiles/')
    DATASET_FNAME = sys.argv[1] if len(sys.argv) > 1 else 'chapter2_result.csv'
    RESULT_FNAME = sys.argv[2] if len(sys.argv) > 2 else 'chapter3_result_outliers.csv'

    # Next, import the data from the specified location and parse the date index.
    try:
        dataset = pd.read_csv(Path(DATA_PATH / DATASET_FNAME), index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError as e:
        print('File not found, try to run the preceding crowdsignals scripts first!')
        raise e

    # We'll create an instance of our visualization class to plot the results.
    DataViz = VisualizeDataset(__file__)

    # Compute the number of milliseconds covered by an instance using the first two rows.
    milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).microseconds / 1000

    # Step 1: Let us see whether we have some outliers we would prefer to remove.

    # Determine the columns we want to experiment on.
    outlier_columns = ['acc_phone_x', 'light_phone_lux']

    # Create the outlier classes.
    OutlierDistr = DistributionBasedOutlierDetection()
    OutlierDist = DistanceBasedOutlierDetection()

    # And investigate the approaches for all relevant attributes.
    for col in outlier_columns:
        print(f"Applying outlier criteria for column {col}")

        # And try out all different approaches. Note that we have done some optimization
        # of the parameter values for each of the approaches by visual inspection.
        dataset = OutlierDistr.chauvenet(dataset, col)
        DataViz.plot_binary_outliers(dataset, col, col + '_outlier')

        dataset = OutlierDistr.mixture_model(dataset, col)
        DataViz.plot_dataset(dataset, [col, col + '_mixture'],
                             ['exact', 'exact'], ['line', 'points'])

        # The simple distance-based approach requires:
        # n_data_points * n_data_points * point_size =
        # 31839 * 31839 * 32 bits = ~4GB of available memory.
        try:
            dataset = OutlierDist.simple_distance_based(dataset, [col], 'euclidean', 0.10, 0.99)
            DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier')
        except MemoryError as e:
            print('Not enough memory available for simple distance-based outlier detection...')
            print('Skipping.')

        try:
            dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', 5)
            DataViz.plot_dataset(dataset, [col, 'lof'],
                                 ['exact', 'exact'], ['line', 'points'])
        except MemoryError as e:
            print('Not enough memory available for lof...')
            print('Skipping.')

        # Remove all the stuff from the dataset again.
        cols_to_remove = [col + '_outlier', col + '_mixture', 'simple_dist_outlier', 'lof']
        for to_remove in cols_to_remove:
            if to_remove in dataset:
                del dataset[to_remove]

    # We take Chauvenet's criterion and apply it to all but the label data...
    for col in [c for c in dataset.columns if not 'label' in c]:
        print(f'Measurement is now: {col}')
        dataset = OutlierDistr.chauvenet(dataset, col)
        dataset.loc[dataset[f'{col}_outlier'] == True, col] = np.nan
        del dataset[col + '_outlier']

    dataset.to_csv(DATA_PATH / RESULT_FNAME)
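# ---------------------------------------------------------------------------
# Illustration (not part of the original script): the local outlier factor
# used above can also be computed with scikit-learn. A minimal sketch on
# synthetic one-dimensional data; k=5 mirrors the call above, everything
# else is an assumption for this example.
import numpy as np
from sklearn.neighbors import LocalOutlierFactor

rng = np.random.default_rng(2)
X = np.concatenate([rng.normal(0, 1, (500, 1)), [[6.0], [-7.0]]])

lof = LocalOutlierFactor(n_neighbors=5, metric='euclidean')
labels = lof.fit_predict(X)                # -1 marks points flagged as outliers
scores = -lof.negative_outlier_factor_     # higher score = more outlying
# ---------------------------------------------------------------------------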
# First we focus on the time domain.

# Set the window sizes to the number of instances representing 5 seconds, 30 seconds and 5 minutes.
window_sizes = [int(float(5000) / milliseconds_per_instance),
                int(float(0.5 * 60000) / milliseconds_per_instance),
                int(float(5 * 60000) / milliseconds_per_instance)]
print('total window sizes', window_sizes)

NumAbs = NumericalAbstraction()
dataset_copy = copy.deepcopy(dataset)
for ws in window_sizes:
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_phone_x'], ws, 'mean')
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_phone_x'], ws, 'std')
    print('window size', ws)

DataViz.plot_dataset(dataset_copy,
                     ['acc_phone_x', 'acc_phone_x_temp_mean', 'acc_phone_x_temp_std', 'label'],
                     ['exact', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'points'])

ws = int(float(0.5 * 60000) / milliseconds_per_instance)
selected_predictor_cols = [c for c in dataset.columns if not 'label' in c]
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'mean')
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'std')

DataViz.plot_dataset(dataset,
                     ['acc_phone_x', 'gyr_phone_x', 'hr_watch_rate', 'light_phone_lux',
                      'mag_phone_x', 'press_phone_', 'pca_1', 'label'],
                     ['like', 'like', 'like', 'like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'line', 'line', 'line', 'points'])

CatAbs = CategoricalAbstraction()
dataset = CatAbs.abstract_categorical(dataset, ['label'], ['like'], 0.03,
                                      int(float(5 * 60000) / milliseconds_per_instance), 2)

print('attributes frequency domain')
# Now we move to the frequency domain, with the same window size.
for col in [c for c in dataset.columns if not 'label' in c]:
    dataset = MisVal.impute_interpolate(dataset, col)

# Using the result from Chapter 2, let us try the Kalman filter on the acc_mobile_x attribute and study the result.
original_dataset = pd.read_csv(DATA_PATH / ORIG_DATASET_FNAME, index_col=0)
original_dataset.index = pd.to_datetime(original_dataset.index)
KalFilter = KalmanFilters()
kalman_dataset = KalFilter.apply_kalman_filter(original_dataset, 'acc_mobile_x')
DataViz.plot_imputed_values(kalman_dataset, ['original', 'kalman'], 'acc_mobile_x',
                            kalman_dataset['acc_mobile_x_kalman'])
DataViz.plot_dataset(kalman_dataset, ['acc_mobile_x', 'acc_mobile_x_kalman'],
                     ['exact', 'exact'], ['line', 'line'])

# Determine the PC's for all but our target columns (the labels and the heart rate).
# We simplify by ignoring both; we could also ignore one first and apply a PC to the remainder.
PCA = PrincipalComponentAnalysis()
selected_predictor_cols = [c for c in dataset.columns if (not ('label' in c))]
pc_values = PCA.determine_pc_explained_variance(dataset, selected_predictor_cols)

# Plot the variance explained.
DataViz.plot_xy(x=[range(1, len(selected_predictor_cols) + 1)], y=[pc_values],
                xlabel='principal component number',
                ylabel='explained variance',
# And we impute for all columns except for the label in the selected way (interpolation).
for col in [c for c in dataset.columns if not 'label' in c]:
    dataset = MisVal.impute_interpolate(dataset, col)

# Let us try the Kalman filter on the acc_phone_x attribute and study the result.
original_dataset = pd.read_csv(dataset_path + 'chapter2_result.csv', index_col=0)
original_dataset.index = pd.to_datetime(original_dataset.index)
KalFilter = KalmanFilters()
kalman_dataset = KalFilter.apply_kalman_filter(original_dataset, 'acc_phone_x')
DataViz.plot_imputed_values(kalman_dataset, ['original', 'kalman'], 'acc_phone_x',
                            kalman_dataset['acc_phone_x_kalman'])
DataViz.plot_dataset(kalman_dataset, ['acc_phone_x', 'acc_phone_x_kalman'],
                     ['exact', 'exact'], ['line', 'line'])
# We ignore the Kalman filter output for now...

# Let us apply a lowpass filter and reduce the importance of the data above 1.5 Hz.
LowPass = LowPassFilter()

# Determine the sampling frequency.
fs = float(1000) / milliseconds_per_instance
cutoff = 1.5

# Let us study acc_phone_x:
new_dataset = LowPass.low_pass_filter(copy.deepcopy(dataset), 'acc_phone_x', fs,
                int(float(2000) / milliseconds_per_instance),
                int(float(4000) / milliseconds_per_instance),
                int(float(10000) / milliseconds_per_instance)]

NumAbs = NumericalAbstraction()
dataset_copy = copy.deepcopy(dataset)
for ws in window_sizes:
    dataset_copy = NumAbs.abstract_numerical(dataset_copy,
                                             ['linacc_Linear Acceleration x (m/s^2)'], ws, 'mean')
    dataset_copy = NumAbs.abstract_numerical(dataset_copy,
                                             ['linacc_Linear Acceleration x (m/s^2)'], ws, 'std')

DataViz.plot_dataset(dataset_copy,
                     ['linacc_Linear Acceleration x (m/s^2)',
                      'linacc_Linear Acceleration x (m/s^2)_temp_mean',
                      'linacc_Linear Acceleration x (m/s^2)_temp_std', 'label'],
                     ['exact', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'points'])

ws = int(float(4000) / milliseconds_per_instance)
selected_predictor_cols = ['acc_Acceleration x (m/s^2)', 'acc_Acceleration y (m/s^2)',
                           'acc_Acceleration z (m/s^2)', 'press_Pressure (hPa)',
                           'gyr_Gyroscope x (rad/s)', 'gyr_Gyroscope y (rad/s)',
                           'gyr_Gyroscope z (rad/s)', 'mag_Magnetic field x (muT)',
                           'mag_Magnetic field y (muT)', 'mag_Magnetic field z (muT)',
                           'linacc_Linear Acceleration x (m/s^2)',
                           'linacc_Linear Acceleration y (m/s^2)',
                           'linacc_Linear Acceleration z (m/s^2)', 'hr_Heart Rate']
# Chapter 4: Identifying aggregate attributes.

# First we focus on the time domain.

# Set the window sizes to the number of instances representing 5 seconds, 30 seconds and 5 minutes.
window_sizes = [int(float(5000) / milliseconds_per_instance),
                int(float(0.5 * 60000) / milliseconds_per_instance),
                int(float(5 * 60000) / milliseconds_per_instance)]

NumAbs = NumericalAbstraction()
dataset_copy = copy.deepcopy(dataset)
for ws in window_sizes:
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_phone_x'], ws, 'mean')
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['acc_phone_x'], ws, 'std')

DataViz.plot_dataset(dataset_copy,
                     ['acc_phone_x', 'acc_phone_x_temp_mean', 'acc_phone_x_temp_std', 'label'],
                     ['exact', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'points'])

ws = int(float(0.5 * 60000) / milliseconds_per_instance)
selected_predictor_cols = [c for c in dataset.columns if not 'label' in c]
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'mean')
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'std')

CatAbs = CategoricalAbstraction()
dataset = CatAbs.abstract_categorical(dataset, ['label'], ['like'], 0.03,
                                      int(float(5 * 60000) / milliseconds_per_instance), 2)

# Now we move to the frequency domain, with the same window size.
FreqAbs = FourierTransformation()
fs = float(1000) / milliseconds_per_instance
# Determine the PC's for all but our target columns (the labels and the id).
# We simplify by ignoring both; we could also ignore one first and apply a PC to the remainder.
PCA = PrincipalComponentAnalysis()
selected_predictor_cols = [c for c in dataset.columns
                           if (not ('label' in c) and (not 'id' in c))]
pc_values = PCA.determine_pc_explained_variance(dataset, selected_predictor_cols)

# Plot the variance explained.
DataViz.plot_xy(x=[range(1, len(selected_predictor_cols) + 1)], y=[pc_values],
                xlabel='principal component number', ylabel='explained variance',
                ylim=[0, 1], line_styles=['b-'])

# We select 2 as the number of PC's, as these explain most of the variance.
n_pcs = 2

dataset = PCA.apply_pca(copy.deepcopy(dataset), selected_predictor_cols, n_pcs)

# And we visualize the result of the PC's.
DataViz.plot_dataset(dataset, ['pca_', 'label'], ['like', 'like'], ['line', 'points'])

# And the overall final dataset:
DataViz.plot_dataset(dataset,
                     ['acc_x', 'acc_y', 'acc_z', 'heartrate', 'pca_', 'label'],
                     ['like', 'like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'line', 'points'])

# Store the outcome.
dataset.to_csv(DATA_PATH / RESULT_FNAME)
                int(float(1000) / milliseconds_per_instance),
                int(float(5000) / milliseconds_per_instance),
                int(float(0.3 * 60000) / milliseconds_per_instance)]

NumAbs = NumericalAbstraction()
dataset_copy = copy.deepcopy(dataset)
for ws in window_sizes:
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['userAcceleration.x'], ws, 'mean')
    dataset_copy = NumAbs.abstract_numerical(dataset_copy, ['userAcceleration.x'], ws, 'std')

DataViz.plot_dataset(dataset_copy,
                     ['userAcceleration.x', 'userAcceleration.x_temp_mean',
                      'userAcceleration.x_temp_std', 'label'],
                     ['exact', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'points'])

ws = int(float(0.3 * 60000) / milliseconds_per_instance)
selected_predictor_cols = [c for c in dataset.columns if not 'label' in c]
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'mean')
dataset = NumAbs.abstract_numerical(dataset, selected_predictor_cols, ws, 'std')

DataViz.plot_dataset(dataset,
                     ['gravity.x', 'gravity.y', 'gravity.z',
                      'userAcceleration.x', 'userAcceleration.y', 'userAcceleration.z', 'label'],
                     ['like', 'like', 'like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'line', 'line', 'points'])
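# ---------------------------------------------------------------------------
# Illustration (not part of the original script): abstract_numerical computes
# windowed summary statistics; the core of the 'mean' and 'std' aggregations
# can be sketched with pandas rolling windows. Names and the window size are
# assumptions for this example only.
import numpy as np
import pandas as pd

s = pd.Series(np.random.randn(1000))
ws_demo = 120   # e.g. 30 seconds at one instance per 250 ms

# min_periods=ws_demo yields NaN until a full window of history is available,
# matching an abstraction that only fires on complete windows.
temp_mean = s.rolling(window=ws_demo, min_periods=ws_demo).mean()
temp_std = s.rolling(window=ws_demo, min_periods=ws_demo).std()
# ---------------------------------------------------------------------------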
from util.VisualizeDataset import VisualizeDataset
import pandas as pd

person_id = 1455390
DataViz = VisualizeDataset('assignment3.')

# CHAPTER 2
path = r'C:/Users/MICK/Desktop/ML4QS/ML4QS/Python3Code/intermediate_datafiles/Assignment3/'
df = pd.read_csv(path + 'chapter2_result.csv')
df.index = pd.to_datetime(df['time'])
df = df[df['personid'] == person_id]
DataViz.plot_dataset(df,
                     ['acc_x', 'acc_y', 'acc_z', 'heartrate', 'heartrate_std', 'label'],
                     ['like', 'like', 'like', 'exact', 'exact', 'like'],
                     ['line', 'line', 'line', 'line', 'line', 'points'])

# CHAPTER 3
path = r'C:/Users/MICK/Desktop/ML4QS/ML4QS/Python3Code/intermediate_datafiles/Assignment3/'
df = pd.read_csv(path + 'chapter3_result_final.csv')
df.index = pd.to_datetime(df['time'])
df = df[df['personid'] == person_id]
DataViz.plot_dataset(df,
                     ['acc_x', 'acc_y', 'acc_z', 'heartrate', 'pca_', 'label'],
                     ['like', 'like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'line', 'points'])
print(df.columns)
                          'labelSittingDown', 'labelSitting', 'labelStandingFromLying',
                          'labelOnAllFours', 'labelSittingOnTheGround',
                          'labelStandingFromSitting', 'labelStandingFromSittingOnTheGround'],
                         'max', '')

# Get the resulting pandas data table.
dataset = DataSet.data_table

# Plot the data
DataViz = VisualizeDataset()

# Boxplot
DataViz.plot_dataset_boxplot(dataset,
                             ['ankle_l_x', 'ankle_l_y', 'ankle_l_z',
                              'ankle_r_x', 'ankle_r_y', 'ankle_r_z',
                              'belt_x', 'belt_y', 'belt_z',
                              'chest_x', 'chest_y', 'chest_z'])

# Plot all data
DataViz.plot_dataset(dataset,
                     ['ankle_l_', 'ankle_r_', 'belt_', 'chest_', 'label'],
                     ['like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'points'])

# And print a summary of the dataset.
util.print_statistics(dataset)
datasets.append(copy.deepcopy(dataset))

# And print the table that has been included in the book.
util.print_latex_table_statistics_two_datasets(datasets[0], datasets[1])

# Finally, store the last dataset we have generated (250 ms).
# dataset.to_csv(result_dataset_path + 'chapter2_result.csv')
""" for c in periodic_predictor_cols: data_table = FreqAbs.abstract_frequency(copy.deepcopy(dataset), [c], int(float(10000) / milliseconds_per_instance), fs) DataViz.plot_dataset(data_table, [c+'_max_freq', c+'_freq_weighted', c+'_pse', c+'_freq_skewness', c+'_freq_kurtosis', 'label'], ['like', 'like', 'like', 'like', 'like', 'like'], ['line', 'line', 'line', 'line', 'line', 'points']) """ # Compute and add frequency domain features to dataset dataset = FreqAbs.abstract_frequency( dataset, periodic_predictor_cols, int(float(10000) / milliseconds_per_instance), fs) # ------------------------------------------------------------------------------------ # REDUCE OVERLAP print 'reducing overlap.' # The percentage of overlap we allow window_overlap = 0.95 skip_points = int((1 - window_overlap) * ws) dataset = dataset.iloc[::skip_points, :] dataset.to_csv(dataset_path + 'domain_features_result_95.csv') DataViz.plot_dataset( dataset, [ 'acc_x', 'gyr_x', 'lin_acc_x', 'light_illuminance', 'mag_x', 'loc_height', 'pca_1', 'label' ], ['like', 'like', 'like', 'like', 'like', 'like', 'like', 'like'], ['line', 'line', 'line', 'line', 'line', 'line', 'line', 'points'])
                           'acc_z', 'gyr_x', 'gyr_y', 'gyr_z']

for ws in window_sizes:
    dataset = NumAbs.abstract_numerical(dataset, periodic_predictor_cols, ws, 'mean')
    dataset = NumAbs.abstract_numerical(dataset, periodic_predictor_cols, ws, 'std')
    print('window size', ws)
    print(dataset.columns)

DataViz.plot_dataset(dataset, ['acc_x', 'acc_y', 'acc_z', 'label'],
                     ['exact', 'exact', 'exact', 'like'],
                     ['line', 'line', 'line', 'points'])

# ws = int(float(0.5 * 60000) / milliseconds_per_instance)
# dataset = NumAbs.abstract_numerical(dataset, periodic_predictor_cols, ws, 'mean')
# dataset = NumAbs.abstract_numerical(dataset, periodic_predictor_cols, ws, 'std')

print('temporal', dataset.shape)
print('attributes frequency domain')

# Now we move to the frequency domain, with the same window size.
FreqAbs = FourierTransformation()
fs = float(1000) / milliseconds_per_instance
# DataSetCS.add_numerical_dataset('pressure_phone.csv', 'timestamps', ['pressure'], 'avg', 'press_phone_')

# Get the resulting pandas data table.
dataset_own = DataSetOwn.data_table
# dataset_cs = DataSetCS.data_table

# Plot the data
DataViz = VisualizeDataset()

# Boxplot
DataViz.plot_dataset_boxplot(dataset_own, ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'])
# DataViz.plot_dataset_boxplot(dataset_cs, ['acc_phone_x', 'acc_phone_y', 'acc_phone_z'])

# Plot all data
DataViz.plot_dataset(dataset_own,
                     ['acc_', 'gyr_', 'mag_', 'press_', 'pedom_phone_', 'label'],
                     ['like', 'like', 'like', 'like', 'like', 'like'],
                     ['line', 'line', 'line', 'line', 'points', 'points'])
# DataViz.plot_dataset(dataset_cs, ['acc_phone', 'gyr_phone', 'mag_phone', 'press_phone_', 'label'],
#                      ['like', 'like', 'like', 'like', 'like'],
#                      ['line', 'line', 'line', 'line', 'points'])

# And print a summary of the dataset.
util.print_statistics(dataset_own)
datasets_own.append(copy.deepcopy(dataset_own))
# util.print_statistics(dataset_cs)
# datasets_cs.append(copy.deepcopy(dataset_cs))

# And print the table that has been included in the book.
util.print_latex_table_statistics_two_datasets(datasets_own[0], datasets_own[1])
# The crowdsignals dataset is commented out above, so datasets_cs is never
# populated; comparing it here would raise an IndexError.
# util.print_latex_table_statistics_two_datasets(datasets_cs[0], datasets_cs[1])