Exemplo n.º 1
 def __init__(self, data_path, data_file):
     """Load the dataset and create the plotting and outlier-detection helpers.

     Args:
         data_path: Directory containing ``data_file`` (``str`` or ``Path``).
         data_file: Name of the CSV file to read. Its first column becomes
             the index and is parsed as datetimes.
     """
     # Wrapping data_path in Path lets callers pass a plain string as well.
     self.dataset = pd.read_csv(Path(data_path) / data_file, index_col=0)
     self.dataset.index = pd.to_datetime(self.dataset.index)
     self.DataViz = VisualizeDataset(__file__, show=False)
     self.outlier_columns = ['acc_phone_x', 'light_phone_lux']
     self.OutlierDistr = DistributionBasedOutlierDetection()
     self.OutlierDist = DistanceBasedOutlierDetection()
     # Remember the columns as loaded so columns added later can be told apart.
     self.original_columns = self.dataset.columns
     # One counter per experiment column, kept in sync with outlier_columns.
     self.num_outliers = {col: 0 for col in self.outlier_columns}
Exemplo n.º 2
def main():
    """Run the outlier experiments for users 1..33 and save the cleaned data.

    For every user the chapter 2 result is loaded, Chauvenet's criterion, the
    mixture model and the simple distance-based detector are tried on a fixed
    set of columns (their helper columns are removed again afterwards), and
    finally Chauvenet's criterion is applied to every non-label column, with
    flagged values replaced by NaN before the dataset is written to CSV.

    Optional command-line arguments:
        sys.argv[1]: dataset file name, relative to ``./intermediate_datafiles/``.
        sys.argv[2]: result file name, relative to ``./intermediate_datafiles/``.

    Raises:
        IOError: if a dataset cannot be read (re-raised after printing a hint).
    """
    # Set up file names and locations.
    DATA_PATH = Path('./intermediate_datafiles/')
    RESULT_FNAME = (sys.argv[2] if len(sys.argv) > 2
                    else 'chapter3_result_outliers.csv')

    # Determine the columns we want to experiment on.
    # NOTE(review): this list may have had further entries after 'scrvalue'
    # that are missing from this copy of the script -- confirm.
    outlier_columns = [
        'actvalue', 'builtvalue', 'commvalue', 'entvalue', 'accvalue',
        'offvalue', 'othervalue', 'socialvalue', 'travelvalue', 'unkvalue',
        'utilvalue', 'callvalue', 'arovalue', 'valvalue', 'scrvalue',
    ]

    for user in range(1, 34):
        DATASET_FNAME = (sys.argv[1] if len(sys.argv) > 1
                         else f'AS14_{user:02d}/chapter2_result.csv')

        # Next, import the data from the specified location and parse the date index.
        try:
            dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
            dataset.index = pd.to_datetime(dataset.index)
        except IOError:
            print('File not found, try to run the preceding crowdsignals scripts first!')
            raise

        # We'll create an instance of our visualization class to plot the results.
        DataViz = VisualizeDataset(__file__)

        # Step 1: Let us see whether we have some outliers we would prefer to remove.

        # Create the outlier classes.
        OutlierDistr = DistributionBasedOutlierDetection()
        OutlierDist = DistanceBasedOutlierDetection()

        # And investigate the approaches for all relevant attributes.
        for col in outlier_columns:
            print(f"Applying outlier criteria for column {col}")

            # Try out the different approaches. The parameter values were
            # tuned by visual inspection.
            dataset = OutlierDistr.chauvenet(dataset, col)
            dataset = OutlierDistr.mixture_model(dataset, col)

            # The distance-based approach requires
            # n_data_points * n_data_points * point_size =
            # 31839 * 31839 * 32 bits = ~4GB available memory.
            try:
                dataset = OutlierDist.simple_distance_based(
                    dataset, [col], 'euclidean', 0.10, 0.99)
            except MemoryError:
                print('Not enough memory available for simple distance-based outlier detection...')

            # Remove all the helper columns from the dataset again.
            cols_to_remove = [
                col + '_outlier', col + '_mixture', 'simple_dist_outlier',
            ]
            for to_remove in cols_to_remove:
                if to_remove in dataset:
                    del dataset[to_remove]

        # We take Chauvenet's criterion and apply it to all but the label data...
        for col in [c for c in dataset.columns if 'label' not in c]:
            print(f'Measurement is now: {col}')
            dataset = OutlierDistr.chauvenet(dataset, col)
            # Explicit comparison keeps rows whose flag is not literally True untouched.
            dataset.loc[dataset[f'{col}_outlier'] == True, col] = np.nan  # noqa: E712
            del dataset[col + '_outlier']

        # NOTE(review): the result path does not depend on `user`, so every
        # iteration overwrites the previous user's output -- confirm whether a
        # per-user result file is intended.
        dataset.to_csv(DATA_PATH / RESULT_FNAME)
Exemplo n.º 3
    ['like', 'like', 'like', 'like', 'like', 'like', 'like'],
    ['line', 'line', 'line', 'line', 'line', 'points', 'points'])

# Compute the number of milliseconds covered by an instance based on the first two rows.
# total_seconds() is used because `.microseconds` only holds the sub-second
# component of the difference and is 0 for whole-second intervals.
milliseconds_per_instance = (dataset.index[1] -
                             dataset.index[0]).total_seconds() * 1000

# Step 1: Let us see whether we have some outliers we would prefer to remove.

# Experiment on every column that is not a label column.
outlier_columns = [c for c in dataset.columns if 'label' not in c]

# Create the outlier classes.
OutlierDistr = DistributionBasedOutlierDetection()
OutlierDist = DistanceBasedOutlierDetection()

# #And investigate the approaches for all relevant attributes.
# for col in outlier_columns:
#     # And try out all different approaches. Note that we have done some optimization
#     # of the parameter values for each of the approaches by visual inspection.
#     # dataset = OutlierDistr.chauvenet(dataset, col)
#     # print(col, sum(dataset[col+'_outlier']))
#     # plot = DataViz.plot_binary_outliers(dataset, col, col + '_outlier', ax[i,j])
#     dataset = OutlierDistr.mixture_model(dataset, col)
#     DataViz.plot_dataset(dataset, [col, col + '_mixture'], ['exact','exact'], ['line', 'points'])
#     # plot.plot(ax=ax[i,j])
#     # i += 1
#     # if i == 7:
#     #     i = 0
Exemplo n.º 4
    raise e

# Index.to_datetime() has been removed from pandas; pd.to_datetime is the
# supported way to parse the index.
dataset.index = pd.to_datetime(dataset.index)

# Compute the number of milliseconds covered by an instance based on the first two rows.
# total_seconds() is used because `.microseconds` only holds the sub-second
# component of the difference and is 0 for whole-second intervals.
milliseconds_per_instance = (dataset.index[1] - dataset.index[0]).total_seconds() * 1000

# Step 1: Let us see whether we have some outliers we would prefer to remove.

# Determine the columns we want to experiment on.
#outlier_columns = ['acc_phone_x','acc_phone_y','acc_phone_z','gyr_phone_x','gyr_phone_y','gyr_phone_z','light_phone_lux','mag_phone_x','mag_phone_y','mag_phone_z']
outlier_columns = ['light_phone_lux']

# Create the outlier classes.
OutlierDistr = DistributionBasedOutlierDetection()
OutlierDist = DistanceBasedOutlierDetection()

# And investigate the approaches for all relevant attributes.
for col in outlier_columns:
    # Try out the different approaches. The parameter values were tuned by
    # visual inspection.
    dataset = OutlierDistr.chauvenet(dataset, col)
    #DataViz.plot_binary_outliers(dataset, col, col + '_outlier')
    dataset = OutlierDistr.mixture_model(dataset, col)
    #DataViz.plot_dataset(dataset, [col, col + '_mixture'], ['exact','exact'], ['line', 'points'])
    # This requires:
    # n_data_points * n_data_points * point_size =
    # 31839 * 31839 * 64 bits = ~8GB available memory
Exemplo n.º 5
def main():
    """Run the outlier experiments on the pickled, labelled dataset.

    The dataset is loaded from ``datasets/dataframes/df_concat_with_labels.pkl``.
    Chauvenet's criterion, the mixture model, the simple distance-based
    detector and the local outlier factor are tried on a fixed set of sensor
    columns (their helper columns are removed again afterwards). Finally
    Chauvenet's criterion is applied to every non-label column, flagged values
    are replaced by NaN, and the result is pickled to ``concat_outliers.pkl``.
    """
    # Import the data. Forward slashes work on every platform and avoid the
    # invalid '\d' escape sequence of a backslash-separated literal.
    # NOTE(review): pickle must only be used on trusted files.
    with open('datasets/dataframes/df_concat_with_labels.pkl', 'rb') as source:
        dataset = pickle.load(source)

    # We'll create an instance of our visualization class to plot the results.
    DataViz = VisualizeDataset(__file__, show=False)

    # Step 1: Let us see whether we have some outliers we would prefer to remove.

    # Determine the columns we want to experiment on.
    # NOTE(review): this list may have had further entries after the linear
    # acceleration columns that are missing from this copy -- confirm.
    outlier_columns = [
        "Acceleration x (m/s^2)", "Acceleration y (m/s^2)", "Acceleration z (m/s^2)",
        "Magnetic field x (µT)", "Magnetic field y (µT)", "Magnetic field z (µT)",
        "Gyroscope x (rad/s)", "Gyroscope y (rad/s)", "Gyroscope z (rad/s)",
        "Linear Acceleration x (m/s^2)", "Linear Acceleration y (m/s^2)", "Linear Acceleration z (m/s^2)",
    ]

    # Create the outlier classes.
    OutlierDistr = DistributionBasedOutlierDetection()
    OutlierDist = DistanceBasedOutlierDetection()

    # And investigate the approaches for all relevant attributes.
    for col in outlier_columns:
        print(f"Applying outlier criteria for column {col}")

        # Try out the different approaches. The parameter values were tuned by
        # visual inspection.
        dataset = OutlierDistr.chauvenet(dataset, col)
        dataset = OutlierDistr.mixture_model(dataset, col)

        # The distance-based approaches require
        # n_data_points * n_data_points * point_size =
        # 31839 * 31839 * 32 bits = ~4GB available memory.
        try:
            dataset = OutlierDist.simple_distance_based(dataset, [col], 'euclidean', 0.10, 0.99)
        except MemoryError:
            print('Not enough memory available for simple distance-based outlier detection...')

        try:
            dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', 5)
        except MemoryError:
            print('Not enough memory available for lof...')

        # Remove all the helper columns from the dataset again.
        cols_to_remove = [col + '_outlier', col + '_mixture', 'simple_dist_outlier', 'lof']
        for to_remove in cols_to_remove:
            if to_remove in dataset:
                del dataset[to_remove]

    # We take Chauvenet's criterion and apply it to all but the label data...
    for col in [c for c in dataset.columns if 'label' not in c]:
        print(f'Measurement is now: {col}')
        dataset = OutlierDistr.chauvenet(dataset, col)
        # Explicit comparison keeps rows whose flag is not literally True untouched.
        dataset.loc[dataset[f'{col}_outlier'] == True, col] = np.nan  # noqa: E712
        del dataset[col + '_outlier']

    with open('concat_outliers.pkl', 'wb') as target:
        pickle.dump(dataset, target)
Exemplo n.º 6
def main():
    """Run the outlier step selected by ``FLAGS.mode`` on the chapter 2 dataset.

    Modes:
        'chauvenet': apply and plot Chauvenet's criterion on the experiment columns.
        'mixture':   apply and plot a 3-component mixture model.
        'distance':  apply and plot simple distance-based detection
                     (``FLAGS.dmin``, ``FLAGS.fmin``).
        'LOF':       apply and plot the local outlier factor.
        'final':     apply Chauvenet's criterion to every non-label column,
                     replace flagged values by NaN and write the result CSV.

    Reads the module-level ``DATA_PATH``, ``DATASET_FILENAME``,
    ``RESULT_FILENAME`` and ``FLAGS``.

    Raises:
        IOError: if the dataset cannot be read (re-raised after printing a hint).
    """
    # Import the data from the specified location and parse the date index
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FILENAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError:
        print('File not found, try to run the preceding crowdsignals scripts first!')
        raise

    # Create an instance of visualization class to plot the results
    DataViz = VisualizeDataset(module_path=__file__)
    # Create the outlier classes
    OutlierDistribution = DistributionBasedOutlierDetection()
    OutlierDistance = DistanceBasedOutlierDetection()

    # Step 1: If requested, see whether there are some outliers that need to be preferably removed
    # Set the columns to experiment on
    outlier_columns = ['acc_phone_x', 'light_phone_lux']

    if FLAGS.mode == 'chauvenet':
        # Investigate the outlier columns using Chauvenet's criterion
        for col in outlier_columns:
            print(f"Applying chauvenet outlier criteria for column {col}")
            dataset = OutlierDistribution.chauvenet(data_table=dataset, col=col)
            DataViz.plot_binary_outliers(data_table=dataset, col=col, outlier_col=f'{col}_outlier')

    elif FLAGS.mode == 'mixture':
        # Investigate the outlier columns using mixture models
        for col in outlier_columns:
            print(f"Applying mixture model for column {col}")
            dataset = OutlierDistribution.mixture_model(data_table=dataset, col=col, components=3)
            DataViz.plot_dataset(data_table=dataset, columns=[col, f'{col}_mixture'], match=['exact', 'exact'],
                                 display=['line', 'points'])

    elif FLAGS.mode == 'distance':
        for col in outlier_columns:
            print(f"Applying distance based outlier detection for column {col}")
            # This step requires:
            # n_data_points * n_data_points * point_size = 31839 * 31839 * 32 bits = ~4GB available memory
            try:
                dataset = OutlierDistance.simple_distance_based(data_table=dataset, cols=[col], d_function='euclidean',
                                                                d_min=FLAGS.dmin, f_min=FLAGS.fmin)
                DataViz.plot_binary_outliers(data_table=dataset, col=col, outlier_col='simple_dist_outlier')
            except MemoryError:
                print('Not enough memory available for simple distance-based outlier detection...')

    elif FLAGS.mode == 'LOF':
        for col in outlier_columns:
            print(f"Applying Local outlier factor for column {col}")
            try:
                # NOTE(review): the last argument of this call was lost from
                # this copy of the script; `k=FLAGS.K` is assumed -- confirm
                # both the parameter name and the flag name.
                dataset = OutlierDistance.local_outlier_factor(data_table=dataset, cols=[col], d_function='euclidean',
                                                               k=FLAGS.K)
                DataViz.plot_dataset(data_table=dataset, columns=[col, 'lof'], match=['exact', 'exact'],
                                     display=['line', 'points'])
            except MemoryError:
                print('Not enough memory available for local outlier factor...')

    elif FLAGS.mode == 'final':
        # Take Chauvenet's criterion and apply it to all but the label column in the main dataset
        for col in [c for c in dataset.columns if 'label' not in c]:
            print(f'Measurement is now: {col}')
            dataset = OutlierDistribution.chauvenet(data_table=dataset, col=col)
            dataset.loc[dataset[f'{col}_outlier'], col] = np.nan
            del dataset[col + '_outlier']

        dataset.to_csv(DATA_PATH / RESULT_FILENAME)
Exemplo n.º 7
class OutlierExperiment:
    """Apply the outlier detectors to a fixed set of columns and record the results.

    Each experiment method runs one detector on every column in
    ``outlier_columns``, plots the outcome, and stores a per-column figure in
    ``num_outliers``. Detector output columns accumulate on ``dataset`` until
    ``remove_columns`` is called.
    """

    def __init__(self, data_path, data_file):
        """Load the dataset and create the plotting and detection helpers.

        Args:
            data_path: Directory containing ``data_file`` (``str`` or ``Path``).
            data_file: Name of the CSV file to read. Its first column becomes
                the index and is parsed as datetimes.
        """
        # Wrapping data_path in Path lets callers pass a plain string as well.
        self.dataset = pd.read_csv(Path(data_path) / data_file, index_col=0)
        self.dataset.index = pd.to_datetime(self.dataset.index)
        self.DataViz = VisualizeDataset(__file__, show=False)
        self.outlier_columns = ['acc_phone_x', 'light_phone_lux']
        self.OutlierDistr = DistributionBasedOutlierDetection()
        self.OutlierDist = DistanceBasedOutlierDetection()
        # Remember the columns as loaded so columns added later can be told apart.
        self.original_columns = self.dataset.columns
        # One entry per experiment column, kept in sync with outlier_columns.
        self.num_outliers = {col: 0 for col in self.outlier_columns}

    def _fraction_equal_to_one(self, flag_col):
        """Return the fraction of rows whose ``flag_col`` value equals 1."""
        flagged = int((self.dataset[flag_col] == 1).sum())
        return flagged / len(self.dataset)

    def remove_columns(self):
        """Delete every column that was not present when the dataset was loaded."""
        # Snapshot the names first so the frame is not changed while it is scanned.
        added = [name for name in self.dataset.columns
                 if name not in self.original_columns]
        for name in added:
            del self.dataset[name]

    def chauvenet(self, C):
        """Apply Chauvenet's criterion with constant ``C`` and plot the result.

        Stores, per column, the fraction of rows flagged in ``<col>_outlier``.
        """
        for col in self.outlier_columns:
            print(f"Applying outlier criteria for column {col}")
            self.dataset = self.OutlierDistr.chauvenet(self.dataset, col, C)
            self.DataViz.plot_binary_outliers(self.dataset, col,
                                              col + '_outlier')
            self.num_outliers[col] = self._fraction_equal_to_one(
                col + '_outlier')

    def mixture_model(self, n):
        """Fit a mixture model (detector argument ``n``) and plot the result.

        Stores, per column, the mean of the ``<col>_mixture`` values.
        """
        for col in self.outlier_columns:
            print(f"Applying outlier criteria for column {col}")
            self.dataset = self.OutlierDistr.mixture_model(
                self.dataset, col, n)
            mixture = self.dataset[col + '_mixture']
            self.DataViz.plot_dataset(self.dataset, [col, col + '_mixture'],
                                      ['exact', 'exact'], ['line', 'points'])
            self.num_outliers[col] = mixture.sum() / mixture.size
            print(mixture.max())
            # A mean above 1 is unexpected, so dump the values for inspection.
            if self.num_outliers[col] > 1:
                print(mixture)

    def simple_distance_based(self, d_min, f_min):
        """Apply simple distance-based detection and plot the result.

        Args:
            d_min: Passed to the detector as its fourth argument.
            f_min: Passed to the detector as its fifth argument.

        Stores, per column, the fraction of rows flagged in
        ``simple_dist_outlier``.
        """
        for col in self.outlier_columns:
            print(f"Applying outlier criteria for column {col}")
            self.dataset = self.OutlierDist.simple_distance_based(
                self.dataset, [col], 'euclidean', d_min, f_min)
            self.DataViz.plot_binary_outliers(self.dataset, col,
                                              'simple_dist_outlier')
            self.num_outliers[col] = self._fraction_equal_to_one(
                'simple_dist_outlier')

    def local_outlier_factor(self, k):
        """Compute the local outlier factor with parameter ``k`` and plot it.

        Stores, per column, the fraction of rows whose ``lof`` value equals 1.
        """
        for col in self.outlier_columns:
            print(f"Applying outlier criteria for column {col}")
            self.dataset = self.OutlierDist.local_outlier_factor(
                self.dataset, [col], 'euclidean', k)
            self.DataViz.plot_dataset(self.dataset, [col, 'lof'],
                                      ['exact', 'exact'], ['line', 'points'])
            # NOTE(review): this counts rows whose lof is exactly 1 -- confirm
            # that an equality test (rather than a threshold) is intended.
            self.num_outliers[col] = self._fraction_equal_to_one('lof')
Exemplo n.º 8
def main():
    """Plot distance-based outliers per column, then clean the data with Chauvenet.

    The dataset is loaded from ``./intermediate_datafiles/``. For every
    experiment column the simple distance-based detector is run on a deep copy
    of the dataset and plotted. Afterwards Chauvenet's criterion is applied to
    every non-label column, flagged values are replaced by NaN, and the result
    is written to CSV.

    Optional command-line arguments:
        sys.argv[1]: dataset file name (default ``phoneSensorsA3_ch2.csv``).
        sys.argv[2]: result file name (default ``phoneSensorsA3_outliers_ch3.csv``).

    Raises:
        IOError: if the dataset cannot be read (re-raised after printing a hint).
    """
    # Set up file names and locations.
    DATA_PATH = Path('./intermediate_datafiles/')
    DATASET_FNAME = sys.argv[1] if len(
        sys.argv) > 1 else 'phoneSensorsA3_ch2.csv'
    RESULT_FNAME = sys.argv[2] if len(
        sys.argv) > 2 else 'phoneSensorsA3_outliers_ch3.csv'

    # Next, import the data from the specified location and parse the date index.
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError:
        print('File not found, try to run the preceding crowdsignals scripts first!')
        raise

    # We'll create an instance of our visualization class to plot the results.
    DataViz = VisualizeDataset()

    # Step 1: Let us see whether we have some outliers we would prefer to remove.

    # Determine the columns we want to experiment on.
    # NOTE(review): this list may have had further entries after
    # 'loc_mobile_horizontalAccuracy' that are missing from this copy -- confirm.
    outlier_columns = [
        'acc_mobile_x', 'acc_mobile_y', 'acc_mobile_z', 'gyr_mobile_x',
        'gyr_mobile_y', 'gyr_mobile_z', 'mag_mobile_x', 'mag_mobile_y',
        'mag_mobile_z', 'prox_mobile_distance', 'loc_mobile_latitude',
        'loc_mobile_longitude', 'loc_mobile_height', 'loc_mobile_velocity',
        'loc_mobile_direction', 'loc_mobile_horizontalAccuracy',
    ]

    # Create the outlier classes.
    OutlierDistr = DistributionBasedOutlierDetection()
    OutlierDist = DistanceBasedOutlierDetection()

    # And investigate the approaches for all relevant attributes.
    for col in outlier_columns:
        # The detector gets a deep copy, so `dataset` itself is not modified here.
        dataset_outliers_sdb = OutlierDist.simple_distance_based(
            copy.deepcopy(dataset), [col], 'euclidean', 0.10, 0.99)
        DataViz.plot_binary_outliers(dataset_outliers_sdb, col,
                                     'simple_dist_outlier')

        print(f"Applying outlier criteria for column {col}")

        # Remove any helper columns from the dataset again.
        cols_to_remove = [
            col + '_outlier', col + '_mixture', 'simple_dist_outlier', 'lof'
        ]
        for to_remove in cols_to_remove:
            if to_remove in dataset:
                del dataset[to_remove]

    # We take Chauvenet's criterion and apply it to all but the label data...
    for col in [c for c in dataset.columns if 'label' not in c]:
        print(f'Measurement is now: {col}')
        dataset = OutlierDistr.chauvenet(dataset, col)
        # Explicit comparison keeps rows whose flag is not literally True untouched.
        dataset.loc[dataset[f'{col}_outlier'] == True, col] = np.nan  # noqa: E712
        del dataset[col + '_outlier']

    dataset.to_csv(DATA_PATH / RESULT_FNAME)
Exemplo n.º 9
def main():
    """Explore the outlier detectors, then clean the chapter 2 dataset.

    The dataset is loaded from ``./intermediate_datafiles/``. For every
    experiment column Chauvenet's criterion, the mixture model, the simple
    distance-based detector and the local outlier factor are applied and
    plotted; their helper columns are removed again afterwards. In the final
    pass columns whose name starts with ``'mag'`` are cleaned with the simple
    distance-based detector and all other non-label columns with Chauvenet's
    criterion. Flagged values become NaN and the result is written to CSV.

    Optional command-line arguments:
        sys.argv[1]: dataset file name (default ``chapter2_result.csv``).
        sys.argv[2]: result file name (default ``chapter3_result_outliers.csv``).

    Raises:
        IOError: if the dataset cannot be read (re-raised after printing a hint).
    """
    # Set up file names and locations.
    DATA_PATH = Path('./intermediate_datafiles/')
    DATASET_FNAME = sys.argv[1] if len(sys.argv) > 1 else 'chapter2_result.csv'
    RESULT_FNAME = sys.argv[2] if len(
        sys.argv) > 2 else 'chapter3_result_outliers.csv'

    # Next, import the data from the specified location and parse the date index.
    try:
        dataset = pd.read_csv(DATA_PATH / DATASET_FNAME, index_col=0)
        dataset.index = pd.to_datetime(dataset.index)
    except IOError:
        print('File not found, try to run the preceding crowdsignals scripts first!')
        raise

    # We'll create an instance of our visualization class to plot the results.
    DataViz = VisualizeDataset(__file__)

    # Step 1: Let us see whether we have some outliers we would prefer to remove.

    # Determine the columns we want to experiment on.
    # NOTE(review): the contents of this list were lost from this copy of the
    # script; these are the columns other variants of this experiment use on
    # the same dataset -- confirm before relying on them.
    outlier_columns = ['acc_phone_x', 'light_phone_lux']

    # Create the outlier classes.
    OutlierDistr = DistributionBasedOutlierDetection()
    OutlierDist = DistanceBasedOutlierDetection()

    # And investigate the approaches for all relevant attributes.
    for col in outlier_columns:
        print(f"Applying outlier criteria for column {col}")

        # Try out the different approaches. The parameter values were tuned by
        # visual inspection.
        dataset = OutlierDistr.chauvenet(dataset, col)
        DataViz.plot_binary_outliers(dataset, col, col + '_outlier')
        dataset = OutlierDistr.mixture_model(dataset, col)
        DataViz.plot_dataset(dataset, [col, col + '_mixture'],
                             ['exact', 'exact'], ['line', 'points'])

        # The distance-based approaches require
        # n_data_points * n_data_points * point_size =
        # 31839 * 31839 * 32 bits = ~4GB available memory.
        try:
            dataset = OutlierDist.simple_distance_based(
                dataset, [col], 'euclidean', 0.10, 0.99)
            DataViz.plot_binary_outliers(dataset, col, 'simple_dist_outlier')
        except MemoryError:
            print('Not enough memory available for simple distance-based outlier detection...')

        try:
            dataset = OutlierDist.local_outlier_factor(dataset, [col],
                                                       'euclidean', 5)
            DataViz.plot_dataset(dataset, [col, 'lof'], ['exact', 'exact'],
                                 ['line', 'points'])
            DataViz.plot_dataset_boxplot(dataset, ['lof'])

            # Flag the points whose lof lies above the 99th percentile.
            lof_threshold = dataset['lof'].quantile(0.99)
            dataset['lof_outliers'] = dataset['lof'] > lof_threshold

            DataViz.plot_binary_outliers(dataset, col, 'lof_outliers')
        except MemoryError:
            print('Not enough memory available for lof...')

        # Remove all the helper columns from the dataset again. 'lof_outliers'
        # must go too, otherwise the final pass would treat it as a measurement.
        cols_to_remove = [
            col + '_outlier', col + '_mixture', 'simple_dist_outlier', 'lof',
            'lof_outliers',
        ]
        for to_remove in cols_to_remove:
            if to_remove in dataset:
                del dataset[to_remove]

    # Final pass: distance-based detection for the 'mag' columns, Chauvenet's
    # criterion for all other non-label columns.
    for col in [c for c in dataset.columns if 'label' not in c]:
        print(f'Measurement is now: {col}')
        if col.startswith('mag'):
            # Rename the detector output so both branches yield '<col>_outlier'.
            dataset = OutlierDist.simple_distance_based(
                dataset, [col], 'euclidean', 0.10,
                0.99).rename(columns={'simple_dist_outlier': f'{col}_outlier'})
        else:
            dataset = OutlierDistr.chauvenet(dataset, col)

        # Explicit comparison keeps rows whose flag is not literally True untouched.
        dataset.loc[dataset[f'{col}_outlier'] == True, col] = np.nan  # noqa: E712
        DataViz.plot_binary_outliers(dataset, col, f'{col}_outlier')
        del dataset[col + '_outlier']

    dataset.to_csv(DATA_PATH / RESULT_FNAME)
Exemplo n.º 10
# Index.to_datetime() has been removed from pandas; pd.to_datetime is the
# supported way to parse the index.
dataset_own.index = pd.to_datetime(dataset_own.index)
dataset_cs.index = pd.to_datetime(dataset_cs.index)

# Compute the number of milliseconds covered by an instance based on the first two rows.
# total_seconds() is used because `.microseconds` only holds the sub-second
# component of the difference and is 0 for whole-second intervals.
milliseconds_per_instance = (dataset_own.index[1] -
                             dataset_own.index[0]).total_seconds() * 1000

# Step 1: Let us see whether we have some outliers we would prefer to remove.

# Determine the columns we want to experiment on.
outlier_columns = ['acc_phone_x', 'acc_phone_y']

# Create the outlier classes.
OutlierDistr = DistributionBasedOutlierDetection()
OutlierDist = DistanceBasedOutlierDetection()

# Parameters that can be played around with for different outlier detection methods
# Chauvenet
constant = 2  # given was 2
# Mixture models
NumDist = 3  # given was 3
# Simple Distance
dmin = 0.10  # given was 0.10
fmin = 0.99  # given was 0.99
# Local outlier factor
k = 5  # given was 5

##### Outlier filtering for the CS dataset #####

#And investigate the approaches for all relevant attributes.
Exemplo n.º 11
from Chapter3.OutlierDetection import DistributionBasedOutlierDetection
from Chapter3.OutlierDetection import DistanceBasedOutlierDetection
import sys
import copy
import pandas as pd
import numpy as np

# Load the chapter 2 result and parse its index as datetimes.
DATA_PATH = './intermediate_datafiles/'
dataset = pd.read_csv(DATA_PATH + 'chapter2_result.csv', index_col=0)
dataset.index = pd.to_datetime(dataset.index)

DataViz = VisualizeDataset()

# Milliseconds covered by one instance, based on the first two rows.
# total_seconds() is used because `.microseconds` only holds the sub-second
# component of the difference and is 0 for whole-second intervals.
milliseconds_per_instance = (dataset.index[1] -
                             dataset.index[0]).total_seconds() * 1000

outlier_columns = ['acc_phone_x']

OutlierDist = DistanceBasedOutlierDetection()

# Run the last two methods here and change parameters to get different figures
for col in outlier_columns:
    ## Distance-based approach, tried 0.11,0.99 and 0.50,0.99
    # dataset_outliers_sdb = OutlierDist.simple_distance_based(copy.deepcopy(dataset), [col], 'euclidean', 0.50, 0.99)
    # DataViz.plot_binary_outliers(dataset_outliers_sdb, col, 'simple_dist_outlier')

    # LOF approach, tried k=2 and k=9
    dataset = OutlierDist.local_outlier_factor(dataset, [col], 'euclidean', 9)
    DataViz.plot_dataset(dataset, [col, 'lof'], ['exact', 'exact'],
                         ['line', 'points'])