Example #1

# Shared imports assumed by every example on this page. Helpers such as
# get_argument, get_files, read_ds_lvm, resample,
# resample_with_feature_extractor, transform_data, apply_transform,
# prepare_data, get_sliding_window_matrix, predict_anomaly,
# create_triplet_time_series, get_classification_report_record,
# get_model and get_deep_model come from the surrounding project.
import itertools
import os
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.cluster import (AgglomerativeClustering, KMeans,
                             SpectralClustering)
from sklearn.metrics import (adjusted_mutual_info_score,
                             adjusted_rand_score, classification_report,
                             homogeneity_score)
from sklearn.model_selection import ParameterGrid
from tabulate import tabulate

def main():
    params = get_argument()
    all_state_folder = params['all_state_folder'][:]
    features_list = params['features_list']
    model_type = params['model_type']  # note: overwritten by the per-model loop below
    resample_rate = 6400

    stride = 1
    epochs = 300

    save_result = True
    output_dir = './results'

    params_grid = {
        'kernel': [40, 80, 120, 200, 240, 360],
        'transform_type': ['minmax'],
        'with_lazy': [0.00],  # , 0.01, 0.015, 0.02],
        # 'loss': ['mae', 'mse'],
        # 'activation': [layers.LeakyReLU(alpha=0.3), 'relu', 'tanh']
    }
    # if model_type == 'bilstm':
    #     params_grid['activation'] = ['relu', 'tanh']

    for model_type in [
            'pca', 'svm', 'cluster', 'cnn', 'deep', 'lstm', 'bilstm'
    ]:

        skip_list = [0]   # skip the first file of every state
        train_list = [1]  # train on the second file of every state

        combs = []
        states = list(range(len(all_state_folder)))
        # every non-empty proper subset of states can play the anomaly role
        for r in range(1, len(all_state_folder)):
            combs += [list(c) for c in itertools.combinations(states, r=r)]

        for selected_states in combs:
            ds_train_list = []
            y_train_list = []
            ds_test_list = []
            y_test_list = []

            # Read train and test files
            print('Evaluation state: {}'.format(selected_states))
            for state_id, folder in enumerate(all_state_folder):
                print('Read state: ', os.path.basename(folder))
                files = get_files(folder, ext='lvm')
                for i, filename in enumerate(files):
                    if i in skip_list:
                        print('Skip: ', filename)
                        continue

                    ds = read_ds_lvm(filename, get_header=False)
                    ds = ds[features_list]
                    ds = resample(ds, resample_rate)

                    if i in train_list and state_id not in selected_states:
                        ds_train_list.append(ds)
                        print('Train state {} file: {}'.format(
                            state_id, filename))
                        y_train_list.append(state_id)
                    else:
                        ds_test_list.append(ds)
                        print('Test state {} file: {}'.format(
                            state_id, filename))
                        y_test_list.append(state_id)

            ds_res = []
            for grid in ParameterGrid(params_grid):

                print('\n Params:')
                print(grid)

                kernel = grid['kernel']
                transform_type = grid['transform_type']

                model_params = dict(grid)
                model_params.pop('kernel', None)
                model_params.pop('transform_type', None)
                if 'skernel' in model_params:
                    # 'skernel' in the grid maps onto the model's own kernel size
                    model_params['kernel'] = model_params.pop('skernel')

                # Apply transform
                transformer = None
                if transform_type:
                    print('Apply transform: ', transform_type)
                    x_train_list, transformer = transform_data(
                        ds_train_list, transform_type)
                    x_test_list = [
                        apply_transform(ds, transformer) for ds in ds_test_list
                    ]

                else:
                    print('No transform selected')
                    x_train_list = ds_train_list
                    x_test_list = ds_test_list

                # Create train and test matrix set
                x_train, y_train = prepare_data(x_train_list,
                                                labels=y_train_list,
                                                kernel=kernel,
                                                stride=stride)
                x_test, y_test = prepare_data(x_test_list,
                                              labels=y_test_list,
                                              kernel=kernel,
                                              stride=stride)

                print('Train size:       ', x_train.shape)
                print('Train label size: ', y_train.shape)
                print('Test size:        ', x_test.shape)
                print('Test label size:  ', y_test.shape)

                # Shuffle the training windows; the models below fit without
                # labels, so y_new is unused in this example.
                order = np.random.permutation(len(x_train))
                x_new = x_train[order]
                y_new = y_train[order]

                # Model initialization
                print("Model initialization: {}".format(model_type))
                model = get_deep_model(model_type, model_params=model_params)

                # Training
                print("Training...")
                model.fit(x=x_new, epochs=epochs, batch_size=64, verbose=2)

                if model_type in ['cnn', 'deep', 'lstm', 'bilstm']:
                    original_threshold = model.threshold
                    print('Anomaly threshold: ', original_threshold)
                    thresholds = [
                        0.00, 0.01, 0.015, 0.02, 0.03, 0.05, 0.07, 0.1, 0.125,
                        0.15
                    ]
                    for th in thresholds:
                        model.threshold = original_threshold + th
                        print('\n Lazy threshold: ', model.threshold)

                        print("Anomaly accuracy")
                        y_pred = model.predict(x_test, classifier=False)
                        y_true = np.zeros(len(y_test))

                        for selected_state_id in selected_states:
                            y_true[y_test == selected_state_id] = 1

                        print(classification_report(y_true, y_pred))

                        report_dict = classification_report(y_true,
                                                            y_pred,
                                                            output_dict=True)
                        record = get_classification_report_record(report_dict)
                        record.update(grid)
                        record['with_lazy'] = th

                        ds_res.append(record)
                else:
                    print("Anomaly accuracy")
                    y_pred = model.predict(x_test)
                    y_true = np.zeros(len(y_test))

                    for selected_state_id in selected_states:
                        y_true[y_test == selected_state_id] = 1

                    print(classification_report(y_true, y_pred))

                    report_dict = classification_report(y_true,
                                                        y_pred,
                                                        output_dict=True)
                    record = get_classification_report_record(report_dict)
                    record.update(grid)
                    ds_res.append(record)

            ds_res = pd.DataFrame(ds_res)

            if save_result:
                if not os.path.isdir(output_dir):
                    os.makedirs(output_dir, exist_ok=True)

                name = [str(x) for x in selected_states]
                name = '_'.join(name)

                filename = os.path.join(
                    output_dir, 'results_grid_anomaly__{}__{}.csv'.format(
                        name, model_type))
                ds_res.to_csv(filename, index=True)
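
Note: get_classification_report_record is a project helper that is not shown
on this page. A minimal sketch of what it could look like, assuming it only
flattens the nested dict returned by sklearn's
classification_report(..., output_dict=True) into a single flat row:

def get_classification_report_record(report_dict):
    # Hypothetical sketch: flatten {'0.0': {'precision': ...}, 'accuracy': ...}
    # into {'0.0_precision': ..., 'accuracy': ...}.
    record = {}
    for label, metrics in report_dict.items():
        if isinstance(metrics, dict):
            for metric_name, value in metrics.items():
                record['{}_{}'.format(label, metric_name)] = value
        else:
            record[label] = metrics  # 'accuracy' is a plain float
    return record
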
Example #2
def main():
    params = get_argument()

    # model input
    train_file = params['train']
    test_file = params['test']

    # feature params
    features_list = params['features_list']
    kernel = params['kernel']
    stride = params['stride']

    # feature extraction
    resample_rate = params.get('resample_rate', 6400)
    custom_resample = params.get('custom_resample', False)

    # model params
    model_type = params['model_type']
    params_file = params['model_params']

    # Read train file
    print("Read Train File: ", os.path.basename(train_file))
    ds_train = pd.read_csv(train_file)

    # Select features
    if features_list:
        ds_train = ds_train[features_list]

    # Resample
    train_len = len(ds_train)
    if custom_resample:
        ds_train = resample_with_feature_extractor(ds_train, resample_rate)
    elif resample_rate > 1:
        ds_train = resample(ds_train, resample_rate)

    print('Train Original File Length: ', train_len)
    print('New File Length {} {:.02f}'.format(len(ds_train),
                                              100 * len(ds_train) / train_len))

    # Create training set
    print("Create training set")
    x_train = get_sliding_window_matrix(ds_train.values, kernel, stride)
    print('Train shape ', x_train.shape)

    # Model initialization
    print("Model initialization: {}".format(model_type))
    model = get_model(model_type, params_file=params_file)

    # Training
    print("Training...")
    model.fit(x_train)

    print("Read Test File: ", os.path.basename(test_file))
    ds_test = pd.read_csv(test_file)

    # Select features
    if features_list:
        ds_test = ds_test[features_list]

    # Resample
    test_len = len(ds_test)
    if custom_resample:
        ds_test = resample_with_feature_extractor(ds_test, resample_rate)
    elif resample_rate > 1:
        ds_test = resample(ds_test, resample_rate)

    print('Test Original File Length: ', test_len)
    print('New File Length {} {:.02f}'.format(len(ds_test),
                                              100 * len(ds_test) / test_len))

    print('Testing...')
    y_pred = predict_anomaly(ds_test, model, kernel, with_skip=False)

    # Encoding results into triplet formats
    results = create_triplet_time_series(y_pred, with_support=True)

    # Show results
    print("Results:")
    results = pd.DataFrame(results)
    print(tabulate(results, headers='keys', tablefmt='psql'))
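
Note: every example on this page windows its series with
get_sliding_window_matrix. A minimal sketch, assuming it returns an array of
shape (num_windows, kernel, num_features) with
num_windows = (length - kernel) // stride + 1 (the same count formula
Example #5 below uses for its test samples):

def get_sliding_window_matrix(data, kernel, stride):
    # Hypothetical sketch: stack windows of `kernel` consecutive rows,
    # advancing `stride` rows between windows.
    num_windows = (len(data) - kernel) // stride + 1
    return np.stack([data[i * stride:i * stride + kernel]
                     for i in range(num_windows)])
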
Example #3
def main():
    params = get_argument()
    all_state_folder = params['all_state_folder']
    features_list = params['features_list']
    resample_rate = 6400

    stride = 1
    epochs = 500

    transform_type = 'minmax'

    save_result = True
    output_dir = './results'

    cluster_models = {
        'agglomerative': AgglomerativeClustering,
        'kmeans': KMeans,
        'spectral': SpectralClustering
    }

    for train_id in [1, 2]:
        skip_list = []
        train_list = [train_id]
        ds_train_list = []
        y_train_list = []
        ds_test_list = []
        y_test_list = []

        # Read train and test files
        print('Read all datasets')
        for state_id, folder in enumerate(all_state_folder):
            print('\nRead state: ', os.path.basename(folder))
            files = get_files(folder, ext='lvm')

            selected_train_id = [
                x for x in range(len(files)) if x in train_list
            ]
            if not selected_train_id:
                # fall back to the second file when train_id is out of range
                selected_train_id = [1]

            for i, filename in enumerate(files):
                if i in skip_list:
                    print('Skip:               {}'.format(filename))
                    continue

                # ds = None
                ds = read_ds_lvm(filename, get_header=False)
                ds = ds[features_list]
                ds = resample(ds, resample_rate)

                if i in selected_train_id:
                    print('Train state {} file: {}'.format(state_id, filename))
                    ds_train_list.append(ds)
                    y_train_list.append(state_id)
                else:
                    print('Test state {} file:  {}'.format(state_id, filename))
                    ds_test_list.append(ds)
                    y_test_list.append(state_id)

        # Apply transform
        transformer = None
        if transform_type:
            print('Apply transform: ', transform_type)
            x_train_list, transformer = transform_data(ds_train_list,
                                                       transform_type)
            x_test_list = [
                apply_transform(ds, transformer) for ds in ds_test_list
            ]

        else:
            print('No transform selected')
            x_train_list = ds_train_list
            x_test_list = ds_test_list

        for kernel in [40, 80, 120, 200, 240, 360]:

            # Create train and test matrix set
            x_train, y_train = prepare_data(x_train_list,
                                            labels=y_train_list,
                                            kernel=kernel,
                                            stride=stride)
            x_test, y_test = prepare_data(x_test_list,
                                          labels=y_test_list,
                                          kernel=kernel,
                                          stride=stride)

            print('Train size:       ', x_train.shape)
            print('Train label size: ', y_train.shape)
            print('Test size:        ', x_test.shape)
            print('Test label size:  ', y_test.shape)

            order = np.random.permutation(len(x_train))
            x_new = x_train[order]
            y_new = y_train[order]

            record = {}
            for cluster_name, cluster_model in cluster_models.items():
                print('\n', cluster_name)

                # ToDo: remove n_clusters params
                cls = cluster_model(n_clusters=4)

                # Baseline: cluster the raw flattened windows (no encoder)
                enc_pred = x_test.reshape(len(x_test), -1)

                print(enc_pred.shape)

                y_pred = cls.fit_predict(enc_pred)

                ami = adjusted_mutual_info_score(y_test, y_pred)
                r_score = adjusted_rand_score(y_test, y_pred)
                hom_score = homogeneity_score(y_test, y_pred)

                record[cluster_name] = {
                    'adjusted_mutual_info_score': ami,
                    'adjusted_rand_score': r_score,
                    'homogeneity_score': hom_score
                }

                print(record[cluster_name])

            ds_res = pd.DataFrame(record)
            if save_result:
                if not os.path.isdir(output_dir):
                    os.makedirs(output_dir, exist_ok=True)

                filename = os.path.join(
                    output_dir, 'results_{}_cluster_{}_{}.csv'.format(
                        train_id, 'raw', kernel))
                ds_res.to_csv(filename, index=True)

            for model_type in ['cnn', 'deep', 'lstm', 'bilstm']:

                # Model initialization
                print("Model initialization: {}".format(model_type))
                model = get_model(model_type)

                # Training
                print("Training...")
                model.fit(x=x_new, epochs=epochs, verbose=2)

                enc_pred = model.encoder.predict(x_test)
                enc_pred = enc_pred.reshape((len(x_test), -1))

                record = {}
                for cluster_name, cluster_model in cluster_models.items():
                    print('\n', cluster_name)
                    print(enc_pred.shape)

                    # ToDo: remove n_clusters params
                    cls = cluster_model(n_clusters=4)
                    y_pred = cls.fit_predict(enc_pred)

                    ami = adjusted_mutual_info_score(y_test, y_pred)
                    r_score = adjusted_rand_score(y_test, y_pred)
                    hom_score = homogeneity_score(y_test, y_pred)

                    record[cluster_name] = {
                        'adjusted_mutual_info_score': ami,
                        'adjusted_rand_score': r_score,
                        'homogeneity_score': hom_score
                    }
                    print(record[cluster_name])

                ds_res = pd.DataFrame(record)
                if save_result:
                    if not os.path.isdir(output_dir):
                        os.makedirs(output_dir, exist_ok=True)

                    filename = os.path.join(
                        output_dir, 'results_{}_cluster_{}_{}.csv'.format(
                            train_id, model_type, kernel))
                    ds_res.to_csv(filename, index=True)
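
Note: prepare_data is also project code. Judging from its use above, it
windows every series and pairs each window with its state label; a sketch
under that assumption, reusing the get_sliding_window_matrix sketch above:

def prepare_data(ds_list, labels, kernel, stride):
    # Hypothetical sketch: window each series and repeat its label once
    # per window so that x and y stay aligned.
    x_parts, y_parts = [], []
    for ds, label in zip(ds_list, labels):
        windows = get_sliding_window_matrix(np.asarray(ds), kernel, stride)
        x_parts.append(windows)
        y_parts.append(np.full(len(windows), label))
    return np.vstack(x_parts), np.concatenate(y_parts)
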
Example #4

def main():
    params = get_argument()
    all_state_folder = params['all_state_folder']
    features_list = params['features_list']
    kernel = params['kernel']
    stride = params['stride']
    model_type = params['model_type']
    resample_rate = params.get('resample_rate', 6400)
    with_decision_score = params.get('with_decision_score', False)
    custom_resample = params.get('custom_resample', False)

    # resample_rate = 12800  # 12800 sample are 1 second
    # num_sample = 1000000
    with_skip = False

    params_file = './params/params_{}.json'.format(model_type)
    save_result = True
    overwrite = True
    output_dir = './results'

    result_array = []

    # Get list of list of files, where for each state we have a list of file
    curr_files = []

    # Get list of test files
    test_files = []

    for folder in all_state_folder:
        files = get_files(folder, ext='lvm')
        curr_files.append(files)
        test_files += files

    # only the first three state folders contribute training packs
    max_size = min(len(files) for files in curr_files[:3])

    # Get train files where each element is a list of files for a single train
    train_files = []
    for i in range(max_size):
        train_pack = [files[i] for files in curr_files[:3]]

        for j in range(1, len(train_pack)):
            train_files.append(train_pack[:j + 1])

    for train_pack in train_files:
        # only full packs (one file from each of the three states) are used
        if len(train_pack) < 3:
            continue

        print('\n' + '\\\\//' * 20)

        selected_files = []
        train_states = []
        x_states = []

        print('\n Train Pack')
        for train_file in train_pack:
            train_state = os.path.split(os.path.dirname(train_file))[-1]
            print("State: ", train_state)
            print("Read File: ", os.path.basename(train_file))
            ds_train = read_ds_lvm(train_file, get_header=False)

            # Check train
            if ds_train is None or ds_train.empty:
                print('Impossible to read train file')
                continue

            # Select features
            ds_train = ds_train[features_list]

            # Resample
            train_len = len(ds_train)
            if custom_resample:
                ds_train = resample_with_feature_extractor(
                    ds_train, resample_rate)
            else:
                ds_train = resample(ds_train, resample_rate)

            # ds_train = ds_train[:num_sample]
            print('Original File Length: ', train_len)
            print('New File Length {} {:.02f}'.format(
                len(ds_train), 100 * len(ds_train) / train_len))

            # Create training set
            print("Create set")
            x_train = get_sliding_window_matrix(ds_train.values, kernel,
                                                stride)
            print('Shape ', x_train.shape)

            selected_files.append(train_file)
            train_states.append(train_state)
            x_states.append(x_train)

        x_states = np.vstack(x_states)
        print('\n Train Size: ', x_states.shape)
        print('Train state: ', train_states)

        # Model initialization
        print("Model initialization: {}".format(model_type))
        model = get_model(model_type, params_file=params_file)

        # Training
        print("Training...")
        model.fit(x_states)

        for test_file in test_files:

            test_state = os.path.split(os.path.dirname(test_file))[-1]

            if test_file in selected_files:
                continue

            # if test_state in train_states:
            #     continue

            print("\n State Test: ", test_state)
            print("Read Test File: ", os.path.basename(test_file))
            ds_test = read_ds_lvm(test_file, get_header=False)

            # t1 = datetime.now()

            # Check test
            if ds_test is None or ds_test.empty:
                print('Impossible to read test file')
                continue

            # Select features
            ds_test = ds_test[features_list]

            # Resample
            test_len = len(ds_test)
            if custom_resample:
                ds_test = resample_with_feature_extractor(
                    ds_test, resample_rate)
            else:
                ds_test = resample(ds_test, resample_rate)
            # ds_test = ds_test[:num_sample]
            print('Test Original File Length: ', test_len)
            print('New File Length {} {:.02f}'.format(
                len(ds_test), 100 * len(ds_test) / test_len))

            if with_skip:
                test_stride = kernel
            else:
                test_stride = 1

            # Create set
            print("Create testing set")
            x_test = get_sliding_window_matrix(ds_test.values, kernel,
                                               test_stride)
            print('Test shape ', x_test.shape)

            # Testing
            print('Testing...')
            if with_decision_score:
                y_pred = model.decision_score(x_test)
            else:
                y_pred = model.predict(x_test)

            num_error = np.sum(y_pred > 0)
            mean_error = np.mean(y_pred)
            if num_error > 0:
                mean_only_error = np.mean(y_pred[y_pred > 0])
            else:
                mean_only_error = 0

            if num_error == 0:
                print("Results: no anomalies found")
            else:
                print("Results: {} anomalies "
                      "({:.05f} total {})".format(num_error, mean_error,
                                                  len(x_test)))

            result_record = {
                'MODEL': model_type,
                'KERNEL': kernel,
                'STRIDE': stride,
                'TRAIN_STATE': train_states,
                'TRAIN': [
                    os.path.basename(train_file)
                    for train_file in selected_files
                ],
                'TEST_STATE': test_state,
                'TEST': os.path.basename(test_file),
                'NUM_SINGLE_ANOMALY': num_error,
                'PCT_ANOMALY': mean_error,
                'NUM_SAMPLE_ANOMALY': mean_only_error,
                'NUM_SAMPLE': len(x_test),
                'LABEL': test_state not in train_states
            }

            result_array.append(result_record)

    if save_result:
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir, exist_ok=True)

        filename = os.path.join(output_dir,
                                'results_multi_' + model_type + '.csv')

        result_ds = pd.DataFrame(result_array)

        if os.path.isfile(filename) and not overwrite:
            prev_result_ds = pd.read_csv(filename)
            result_ds = pd.concat([prev_result_ds, result_ds],
                                  axis=0,
                                  ignore_index=True)

        result_ds.to_csv(filename, index=False)
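
Note: get_model(model_type, params_file=...) is project code; the snippets
only show that it reads hyper-parameters from a JSON file named after the
model type. A hedged factory sketch, with sklearn estimators standing in
for the project's own wrappers (whose predict() is assumed to return 1 for
anomalous windows and 0 otherwise):

import json

from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM

def get_model(model_type, params_file=None):
    # Hypothetical sketch: load optional JSON hyper-parameters and
    # dispatch on the model_type string.
    model_params = {}
    if params_file and os.path.isfile(params_file):
        with open(params_file) as fh:
            model_params = json.load(fh)
    registry = {
        'isolation_forest': IsolationForest,
        'svm': OneClassSVM,
        # 'pca', 'lof', 'setup_clustering', ... are project-specific
    }
    return registry[model_type](**model_params)
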
Example #5

def main():
    # This snippet used train_file, test_file, kernel, etc. without defining
    # them; the preamble below reconstructs them from the argument pattern
    # of Examples #2 and #4.
    params = get_argument()
    train_file = params['train']
    test_file = params['test']
    features_list = params['features_list']
    kernel = params['kernel']
    stride = params['stride']
    model_type = params['model_type']
    resample_rate = params.get('resample_rate', 6400)
    with_decision_score = params.get('with_decision_score', False)
    custom_resample = params.get('custom_resample', False)
    with_skip = False
    params_file = './params/params_{}.json'.format(model_type)

    train_state = os.path.split(os.path.dirname(train_file))[-1]
    print("\n State Train: ", train_state)
    print("Read Train File: ", os.path.basename(train_file))
    ds_train = read_ds_lvm(train_file, get_header=False)

    # Check train
    if ds_train is None or ds_train.empty:
        print('Impossible to read train file')
        return

    # Select features
    ds_train = ds_train[features_list]

    # Resample
    train_len = len(ds_train)
    if custom_resample:
        ds_train = resample_with_feature_extractor(ds_train, resample_rate)
    else:
        ds_train = resample(ds_train, resample_rate)
    # ds_train = ds_train[:num_sample]
    print('Train Original File Length: ', train_len)
    print('New File Length {} {:.02f}'.format(len(ds_train),
                                              100 * len(ds_train) / train_len))

    # Create training set
    print("Create training set")
    x_train = get_sliding_window_matrix(ds_train.values, kernel, stride)
    print('Train shape ', x_train.shape)

    # Model initialization
    print("Model initialization: {}".format(model_type))
    model = get_model(model_type, params_file=params_file)

    # Training
    print("Training...")
    model.fit(x_train)

    test_state = os.path.split(os.path.dirname(test_file))[-1]
    print("\n State Test: ", test_state)
    print("Read Test File: ", os.path.basename(test_file))
    ds_test = read_ds_lvm(test_file, get_header=False)

    # Check test
    if ds_test is None or ds_test.empty:
        print('Impossible to read test file')
        return

    # Select features
    ds_test = ds_test[features_list]

    # Resample
    test_len = len(ds_test)
    if custom_resample:
        ds_test = resample_with_feature_extractor(ds_test, resample_rate)
    else:
        ds_test = resample(ds_test, resample_rate)
    # ds_test = ds_test[:num_sample]
    print('Test Original File Length: ', test_len)
    print('New File Length {} {:.02f}'.format(len(ds_test),
                                              100 * len(ds_test) / test_len))

    # Testing
    # y_pred = predict_anomaly(ds_test, model, kernel, with_skip=with_skip)

    if with_skip:
        test_stride = kernel
    else:
        test_stride = 1

    # Create set
    print("Create testing set")
    x_test = get_sliding_window_matrix(ds_test.values, kernel, test_stride)
    print('Test shape ', x_test.shape)

    # Testing
    print('Testing...')
    if with_decision_score:
        y_pred = model.decision_score(x_test)
    else:
        y_pred = model.predict(x_test)

    num_error = np.sum(y_pred > 0)
    mean_error = np.mean(y_pred)
    if num_error > 0:
        mean_only_error = np.mean(y_pred[y_pred > 0])
    else:
        mean_only_error = 0

    if num_error == 0:
        print("Results: no anomalies found")
    else:
        print("Results: {} anomalies "
              "({:.05f} {:.05f} total {})".format(num_error,
                                                  mean_error, mean_only_error,
                                                  len(x_test)))

    # Encoding results into triplet formats
    results = create_triplet_time_series(y_pred, with_support=True)

    # Show results
    results = pd.DataFrame(results)
    if results.empty:
        print("Results: NO Anomaly founded")
    else:
        # print(tabulate(results, headers='keys', tablefmt='psql'))

        # test_stride was already set above: kernel if with_skip else 1

        # Number of test samples of kernel length
        test_sample = int((len(ds_test) - kernel) / test_stride) + 1

        # Number of single anomaly point
        tot = results['support'].sum()
        pct_tot = 100 * tot / (test_sample * test_stride)

        print("Results: {} (record {:.02f})".format(tot, pct_tot))

        if with_skip:
            # Number of anomalous windows
            tot_sample = tot // test_stride
            print("Anomaly Sample: {} (test sample {})".format(
                tot_sample, test_sample))
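
Note: create_triplet_time_series compresses the per-window prediction vector
into interval records; the 'support' column summed above suggests each record
carries its run length. A sketch under that reading:

def create_triplet_time_series(y_pred, with_support=True):
    # Hypothetical run-length encoder: collapse consecutive equal
    # predictions into {'start', 'stop', 'value'} records, keeping only
    # anomalous runs (value > 0); 'support' is the run length.
    triplets, start = [], 0
    for i in range(1, len(y_pred) + 1):
        if i == len(y_pred) or y_pred[i] != y_pred[start]:
            if y_pred[start] > 0:
                record = {'start': start, 'stop': i - 1,
                          'value': y_pred[start]}
                if with_support:
                    record['support'] = i - start
                triplets.append(record)
            start = i
    return triplets
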
Example #6
def main():
    params = get_argument()
    all_state_folder = params['all_state_folder']
    features_list = params['features_list']
    # model_type = params['model_type']
    resample_rate = 6400

    kernel = 120  # 40, 80, 120, 200
    stride = 1
    # model_type = 'cnn'        # 'cnn', 'deep', 'lstm'
    transform_type = 'minmax'  # 'std', 'minmax', None

    epochs = 200

    save_result = True
    output_dir = './results'

    model_params = {
        'with_lazy': 0.02,  # 0.00, 0.01, 0.015, 0.02
        # 'loss': 'mae'  # 'mae', 'mse'
    }

    skip_list = [0]   # skip the first file of every state
    train_list = [1]  # train on the second file of every state

    for selected_state_id in range(len(all_state_folder)):
        ds_train_list = []
        y_train_list = []
        ds_test_list = []
        y_test_list = []

        # Read train and test files
        print('Evaluation state: {}'.format(selected_state_id))
        for state_id, folder in enumerate(all_state_folder):
            print('Read state: ', os.path.basename(folder))
            files = get_files(folder, ext='lvm')
            for i, filename in enumerate(files):
                if i in skip_list:
                    continue

                ds = read_ds_lvm(filename, get_header=False)
                ds = ds[features_list]
                ds = resample(ds, resample_rate)

                if i in train_list and state_id != selected_state_id:
                    ds_train_list.append(ds)
                    print('Train state {} file: {}'.format(state_id, filename))
                    y_train_list.append(state_id)
                else:
                    ds_test_list.append(ds)
                    print('Test state {} file: {}'.format(state_id, filename))
                    y_test_list.append(state_id)

        # Apply transform
        transformer = None
        if transform_type:
            print('Apply transform: ', transform_type)
            x_train_list, transformer = transform_data(ds_train_list,
                                                       transform_type)
            x_test_list = [
                apply_transform(ds, transformer) for ds in ds_test_list
            ]

        else:
            print('No transform selected')
            x_train_list = ds_train_list
            x_test_list = ds_test_list

        # Create train and test matrix set
        x_train, y_train = prepare_data(x_train_list,
                                        labels=y_train_list,
                                        kernel=kernel,
                                        stride=stride)
        x_test, y_test = prepare_data(x_test_list,
                                      labels=y_test_list,
                                      kernel=kernel,
                                      stride=stride)

        print('Train size:       ', x_train.shape)
        print('Train label size: ', y_train.shape)
        print('Test size:        ', x_test.shape)
        print('Test label size:  ', y_test.shape)

        order = np.random.permutation(len(x_train))
        x_new = x_train[order]
        y_new = y_train[order]

        for model_type in ['cnn', 'deep', 'lstm', 'bilstm']:

            # Model initialization
            print("Model initialization: {}".format(model_type))
            model = get_deep_model(model_type, model_params=model_params)

            # Training
            print("Training...")
            model.fit(x=x_new, epochs=epochs, verbose=2)

            print("Anomaly accuracy")
            y_pred = model.predict(x_test, classifier=False)
            y_true = np.zeros(len(y_test))
            y_true[y_test == selected_state_id] = 1
            print(classification_report(y_true, y_pred))

            ds_res = pd.DataFrame(
                classification_report(y_true, y_pred, output_dict=True))

            if save_result:
                if not os.path.isdir(output_dir):
                    os.makedirs(output_dir, exist_ok=True)

                filename = os.path.join(
                    output_dir, 'results_anomaly_{}_{}_.csv'.format(
                        selected_state_id, model_type))
                ds_res.to_csv(filename, index=True)

            print("Locate Anomaly")
            x_selected = x_test[y_test == selected_state_id]
            y_selected = y_test[y_test == selected_state_id]
            x_reconstructed = model.model.predict(x_selected)

            ds_res = []
            num_records = len(x_selected)
            for i in range(num_records):
                x_true = x_selected[i]
                x_pred = x_reconstructed[i]
                if transformer is not None:
                    x_true = transformer.inverse_transform(x_true)
                    x_pred = transformer.inverse_transform(x_pred)

                # mean absolute reconstruction error per feature
                diff = np.mean(np.abs(x_true - x_pred), axis=0)
                res = dict(zip(features_list, diff))
                res['threshold'] = model.threshold
                res['score'] = y_selected[i]
                ds_res.append(res)

            ds_res = pd.DataFrame(ds_res)

            if save_result:
                if not os.path.isdir(output_dir):
                    os.makedirs(output_dir, exist_ok=True)

                filename = os.path.join(
                    output_dir, 'results_locate__{}__{}__{}.csv'.format(
                        kernel, selected_state_id, model_type))
                ds_res.to_csv(filename, index=False)
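
Note: transform_data and apply_transform appear to wrap sklearn scalers:
fit one scaler on the concatenated training series, then reuse it on the
test series. A minimal sketch of that assumption:

from sklearn.preprocessing import MinMaxScaler, StandardScaler

def apply_transform(ds, transformer):
    # Scale one series while keeping its DataFrame layout.
    return pd.DataFrame(transformer.transform(ds),
                        columns=ds.columns, index=ds.index)

def transform_data(ds_list, transform_type):
    # Hypothetical sketch: fit a single scaler on all training series and
    # return the transformed series together with the fitted scaler.
    scaler = MinMaxScaler() if transform_type == 'minmax' else StandardScaler()
    scaler.fit(pd.concat(ds_list))
    return [apply_transform(ds, scaler) for ds in ds_list], scaler
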
Example #7
def main():
    params = get_argument()
    all_state_folder = params['all_state_folder']
    features_list = params['features_list']
    resample_rate = 6400

    stride = 1
    epochs = 300

    transform_type = 'minmax'  # 'std', 'minmax', None

    save_result = True
    output_dir = './results'

    for train_id in [1, 2, 0]:
        skip_list = []
        train_list = [train_id]
        ds_train_list = []
        y_train_list = []
        ds_test_list = []
        y_test_list = []

        # Read train and test files
        print('Read all datasets')
        for state_id, folder in enumerate(all_state_folder):
            print('\nRead state: ', os.path.basename(folder))
            files = get_files(folder, ext='lvm')

            selected_train_id = [
                x for x in range(len(files)) if x in train_list
            ]
            if not selected_train_id:
                # fall back to the second file when train_id is out of range
                selected_train_id = [1]

            for i, filename in enumerate(files):
                if i in skip_list:
                    print('Skip:               {}'.format(filename))
                    continue

                # ds = None
                ds = read_ds_lvm(filename, get_header=False)
                ds = ds[features_list]
                ds = resample(ds, resample_rate)

                if i in selected_train_id:
                    print('Train state {} file: {}'.format(state_id, filename))
                    ds_train_list.append(ds)
                    y_train_list.append(state_id)
                else:
                    print('Test state {} file:  {}'.format(state_id, filename))
                    ds_test_list.append(ds)
                    y_test_list.append(state_id)

        # Apply transform
        transformer = None
        if transform_type:
            print('Apply transform: ', transform_type)
            x_train_list, transformer = transform_data(ds_train_list,
                                                       transform_type)
            x_test_list = [
                apply_transform(ds, transformer) for ds in ds_test_list
            ]

        else:
            print('No transform selected')
            x_train_list = ds_train_list
            x_test_list = ds_test_list

        for kernel in [40, 80, 120, 200, 240, 360]:

            # Create train and test matrix set
            x_train, y_train = prepare_data(x_train_list,
                                            labels=y_train_list,
                                            kernel=kernel,
                                            stride=stride)
            x_test, y_test = prepare_data(x_test_list,
                                          labels=y_test_list,
                                          kernel=kernel,
                                          stride=stride)

            print('Train size:       ', x_train.shape)
            print('Train label size: ', y_train.shape)
            print('Test size:        ', x_test.shape)
            print('Test label size:  ', y_test.shape)

            order = np.random.permutation(len(x_train))
            x_new = x_train[order]
            y_new = y_train[order]

            for model_type in [
                    'classifier', 'linear', 'cnn', 'deep', 'lstm', 'bilstm'
            ]:

                # Model initialization
                print("Model initialization: {}".format(model_type))
                model = get_model(model_type)

                # Training
                print("Training...")
                model.fit(x=x_new,
                          y=y_new,
                          epochs=epochs,
                          batch_size=32,
                          verbose=2)

                y_pred = model.predict(x_test, classifier=True)

                print(classification_report(y_test, y_pred))
                ds_res = pd.DataFrame(
                    classification_report(y_test, y_pred, output_dict=True))

                if save_result:
                    if not os.path.isdir(output_dir):
                        os.makedirs(output_dir, exist_ok=True)

                    filename = os.path.join(
                        output_dir, 'results_{}_accuracy_{}_{}.csv'.format(
                            train_id, model_type, kernel))
                    ds_res.to_csv(filename, index=True)
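
Note: get_files is assumed to list a folder's files with a given extension
in a stable order, e.g.:

import glob

def get_files(folder, ext='lvm'):
    # Hypothetical sketch: sorted paths of '*.<ext>' inside `folder`.
    return sorted(glob.glob(os.path.join(folder, '*.' + ext)))
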
Example #8
def main():
    output_dir = './results'

    selected_files = [
        "/export/static/pub/softlab/dataset_sbdio/Anomaly Detection/TEST 2/testaccelerometri.lvm",
        "/export/static/pub/softlab/dataset_sbdio/Anomaly Detection/TEST 2/testaccelerometri.lvm",
        "/export/static/pub/softlab/dataset_sbdio/Anomaly Detection/TEST 3/testaccelerometri_1.lvm",
        "/export/static/pub/softlab/dataset_sbdio/Anomaly Detection/TEST 4/testaccelerometri.lvm",
    ]

    features_list = [
        "Acceleration_X1",
        "Acceleration_Y1",
        "Acceleration_Z1",
        "Acceleration_X2",
        "Acceleration_Y2",
        "Acceleration_Z2",
        "Acceleration_X3",
        "Acceleration_Y3",
        "Acceleration_Z3"
    ]

    stride = 1

    model_list = [
        'cnn',
        'lstm',
        'deep',
        'isolation_forest',
        'setup_clustering',
        'pca',
        'lof',
        'svm',
    ]

    kernel_list = [
        180 if model_type in ['cnn', 'lstm', 'deep'] else 10
        for model_type in model_list
    ]

    resample_rate = 6400

    # Initialize result array to memorize performance result
    result_array = []

    # Model cycle
    for model_type, kernel in zip(model_list, kernel_list):

        print('\n\n')
        print('\nModel: {}\n'.format(model_type))

        params_file = './params/params_{}.json'.format(model_type)

        # Train cycle
        for i in range(len(selected_files)):
            x_train = []

            # Get train
            for pos, train_file in enumerate(selected_files[:i + 1]):
                # after the first cycle, the baseline file (pos 0) is skipped
                if i > 0 and pos == 0:
                    continue

                ds_train = read_ds_lvm(train_file, get_header=False)

                if ds_train is None or ds_train.empty:
                    raise ValueError('Impossible to read train file')

                ds_train = ds_train[features_list]
                ds_train = resample(ds_train, resample_rate)
                x = get_sliding_window_matrix(ds_train.values, kernel, stride)

                if pos == 0:
                    # only the first half of the baseline file is used
                    x = x[:len(x) // 2]

                x_train.append(x)

            # Train set
            x_train = np.vstack(x_train)

            print('\nTrain size: {}\n'.format(len(x_train)))

            # Model init
            model = get_model(model_type, params_file=params_file)

            # Model training
            train_start = datetime.now()
            model.fit(x_train)
            train_end = datetime.now()

            # Test cycle
            for j in range(len(selected_files)):

                x_test = []

                # Get test
                for pos, test_file in enumerate(selected_files[:j + 1]):
                    if j > 0 and pos == 0:
                        continue

                    ds_test = read_ds_lvm(test_file, get_header=False)

                    if ds_test is None or ds_test.empty:
                        raise ValueError('Impossible to read test file')

                    ds_test = ds_test[features_list]
                    ds_test = resample(ds_test, resample_rate)
                    x = get_sliding_window_matrix(ds_test.values, kernel,
                                                  stride)

                    if pos == 0:
                        # keep only a single window from the baseline file
                        x = x[:1]

                    x_test.append(x)

                # Test set
                x_test = np.vstack(x_test)

                print('\nTest size: {}\n'.format(len(x_test)))

                # Model predict (output discarded; only timing is recorded)
                test_start = datetime.now()
                model.predict(x_test)
                test_end = datetime.now()

                result_record = {
                    'model': model_type,
                    'train_size': len(x_train),
                    'train_time': train_end - train_start,
                    'test_size': len(x_test),
                    'test_time': test_end - test_start,
                }

                result_array.append(result_record)

    # Save results
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir, exist_ok=True)
    filename = os.path.join(output_dir, 'performance.csv')
    result_ds = pd.DataFrame(result_array)
    result_ds.to_csv(filename, index=False)
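
Note: the train_time and test_time columns above hold datetime.timedelta
objects, which serialize to strings in the CSV. A small post-processing
sketch (assuming the performance.csv written above):

perf = pd.read_csv('./results/performance.csv')
# Parse the serialized timedeltas and derive a per-sample test cost.
for col in ['train_time', 'test_time']:
    perf[col] = pd.to_timedelta(perf[col])
perf['test_sec_per_sample'] = (perf['test_time'].dt.total_seconds()
                               / perf['test_size'])
print(perf[['model', 'train_size', 'test_size', 'test_sec_per_sample']])
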
Example #9

def main():
    params = get_argument()
    all_state_folder = params['all_state_folder']

    size = 3

    features_list = [
        "Acceleration_X1", "Acceleration_Y1", "Acceleration_Z1",
        "Acceleration_X2", "Acceleration_Y2", "Acceleration_Z2",
        "Acceleration_X3", "Acceleration_Y3", "Acceleration_Z3"
    ]

    stride = 1

    model_list = [
        'cnn',
        'lstm',
        'deep',
        'isolation_forest',
        'setup_clustering',
        'pca',
        'lof',
        'svm',
    ]

    kernel_list = [
        180 if model_type in ['cnn', 'lstm', 'deep'] else 10
        for model_type in model_list
    ]

    resample_rate = 6400

    save_result = True
    output_dir = './results'

    # Initialize result array to memorize result
    # for each train and test step
    result_array = []

    # Get files from selected folder to use for training and testing
    curr_files = []
    for folder in all_state_folder:
        curr_files += get_files(folder, ext='lvm')

    test_files = curr_files

    for model_type, kernel in zip(model_list, kernel_list):
        print('\n' + '\\\\//' * 20)
        print('\n Model: {}\n'.format(model_type))

        params_file = './params/params_{}.json'.format(model_type)

        for pos, train_file in enumerate(curr_files):
            skip_step = False
            train_state = os.path.split(os.path.dirname(train_file))[-1]

            x_train = []

            print("\n State Train: ", train_state)

            for i in range(size):
                if pos + i >= len(curr_files):
                    print('Not enough files')
                    skip_step = True
                    break

                tmp_file = curr_files[pos + i]
                tmp_state = os.path.split(os.path.dirname(tmp_file))[-1]

                if tmp_state != train_state:
                    print('Different state and skip current train')
                    skip_step = True
                    break

                print("Read {} Train File: {}".format(
                    i, os.path.basename(tmp_file)))

                ds_tmp = read_ds_lvm(tmp_file, get_header=False)

                # Check train
                if ds_tmp is None or ds_tmp.empty:
                    print('Impossible to read train file')
                    skip_step = True
                    break

                # Select features
                ds_tmp = ds_tmp[features_list]
                # Resample
                ds_tmp = resample(ds_tmp, resample_rate)
                # Create training set
                x_tmp = get_sliding_window_matrix(ds_tmp.values, kernel,
                                                  stride)

                x_train.append(x_tmp)

            if skip_step:
                print('Skip current train')
                continue

            # Train set
            x_train = np.vstack(x_train)
            train_len = len(x_train)
            print('\nTrain size: {}\n'.format(x_train.shape))

            # Model initialization
            print("Model initialization: {}".format(model_type))
            model = get_model(model_type, params_file=params_file)

            # Training
            print("Training...")
            model.fit(x_train)

            for test_file in test_files:

                test_state = os.path.split(os.path.dirname(test_file))[-1]

                # skip the exact file the model was trained on
                if test_file == train_file:
                    continue

                print("\n State Test: ", test_state)
                print("Read Test File: ", os.path.basename(test_file))
                ds_test = read_ds_lvm(test_file, get_header=False)

                # t1 = datetime.now()

                # Check test
                if ds_test is None or ds_test.empty:
                    print('Impossible to read test file')
                    continue

                # Select features
                ds_test = ds_test[features_list]

                # Resample
                test_len = len(ds_test)
                ds_test = resample(ds_test, resample_rate)
                # ds_test = ds_test[:num_sample]
                print('Test Original File Length: ', test_len)
                print('New File Length {} {:.02f}'.format(
                    len(ds_test), 100 * len(ds_test) / test_len))

                test_stride = 1

                # Create set
                print("Create testing set")
                x_test = get_sliding_window_matrix(ds_test.values, kernel,
                                                   test_stride)
                print('Test shape ', x_test.shape)

                # Testing
                print('Testing...')
                y_pred = model.predict(x_test)

                num_error = np.sum(y_pred > 0)
                mean_error = np.mean(y_pred)
                if num_error > 0:
                    mean_only_error = np.mean(y_pred[y_pred > 0])
                else:
                    mean_only_error = 0

                if num_error == 0:
                    print("Results: no anomalies found")
                else:
                    print("Results: {} anomalies "
                          "({:.05f} total {})".format(num_error, mean_error,
                                                      len(x_test)))

                result_record = {
                    'MODEL': model_type,
                    'KERNEL': kernel,
                    'STRIDE': stride,
                    'TRAIN_STATE': train_state,
                    'TRAIN': os.path.basename(train_file),
                    'TRAIN_SIZE': train_len,
                    'TEST_STATE': test_state,
                    'TEST': os.path.basename(test_file),
                    'TEST_LEN': test_len,
                    'NUM_SINGLE_ANOMALY': num_error,
                    'PCT_ANOMALY': mean_error,
                    'NUM_SAMPLE_ANOMALY': mean_only_error,
                    'NUM_SAMPLE': len(x_test),
                    'LABEL': train_state != test_state
                }

                result_array.append(result_record)

        if save_result:
            if not os.path.isdir(output_dir):
                os.makedirs(output_dir, exist_ok=True)

            filename = os.path.join(
                output_dir,
                'results_single_{}_{}.csv'.format(size, model_type))

            result_ds = pd.DataFrame(result_array)

            result_ds.to_csv(filename, index=False)
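
Note: read_ds_lvm parses LabVIEW .lvm measurement files, which are typically
tab-separated text whose data section starts after an ***End_of_Header***
marker. A hedged reader sketch under that assumption:

from io import StringIO

def read_ds_lvm(filename, get_header=False):
    # Hypothetical sketch: return the data block as a DataFrame,
    # optionally together with the raw header lines; None on failure.
    try:
        with open(filename) as fh:
            lines = fh.readlines()
        end = max(i for i, line in enumerate(lines)
                  if line.startswith('***End_of_Header***'))
        ds = pd.read_csv(StringIO(''.join(lines[end + 1:])), sep='\t')
        return (ds, lines[:end + 1]) if get_header else ds
    except (OSError, ValueError, pd.errors.ParserError):
        return None
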