示例#1
0
def main():
    """Fit one anomaly model per state and column, then evaluate on all files.

    The function depends on names defined outside this block: ``global_dir``,
    ``settings_file``, ``model_type``, ``kernel``, ``stride``,
    ``constant_normal_files``, ``normal_file_id_list`` and the helper
    functions / model classes it calls.

    Workflow:
        1. List the ``.CSV`` files in ``global_dir`` and load the settings
           dataset from ``settings_file``.
        2. Label the settings and map every data file to the label returned
           by ``get_settings`` (``settings_map``).  Files for which
           ``check_setup`` or ``lazy_check_setup`` is truthy are collected in
           ``setup_files``.
        3. For each state that has an entry in ``constant_normal_files``, fit
           a model of kind ``model_type`` on every column of that state's
           normal file and, for every file of the state, store how many
           predictions are equal to 1.
        4. Build a result table, overwrite ``y_true`` with 0 for the files
           named after ``normal_file_id_list`` and 1 for all others, save it
           to ``../results/<model_type>_evaluation.CSV`` and print accuracy,
           precision, recall and F-score (a file counts as predicted positive
           when its stored count is greater than zero).

    Returns:
        None.  The function returns early (without saving anything) when a
        normal file entry is ``None``, when a train or test file cannot be
        read, or when ``model_type`` is not one of the supported names.
    """

    def _ratio(numerator, denominator):
        # A metric whose denominator is zero (e.g. precision when nothing was
        # predicted positive) is reported as 0.0 instead of raising
        # ZeroDivisionError after all the training work has been done.
        return numerator / denominator if denominator else 0.0

    # Get all .CSV files in global folder
    files = get_files(global_dir, ext='.CSV')
    print('Found {} files'.format(len(files)))

    # Get settings dataset, where each row represents a new setting entry
    ds_settings = get_time_series_dataset(settings_file, sep=';', col='DT')
    print('Found {} settings'.format(len(ds_settings)))

    # Identify settings label
    label_settings(ds_settings, ds_settings.columns[:13])
    ds_settings.ltime = pd.to_datetime(ds_settings.ltime)
    ds_settings.rtime = pd.to_datetime(ds_settings.rtime)
    print('Found {} unique settings'.format(len(np.unique(ds_settings.label))))

    settings_map = {}
    setup_files = []

    # Create settings map that associates a setting to each file
    print('\nSettings File identification')
    for file in files:
        # Read dataset
        ds = get_time_series_dataset(file, sep=';', col='DT')

        # Get nearest left setting
        setting = get_settings(ds, ds_settings)

        # Update settings_map
        settings_map.setdefault(str(setting.label), []).append(file)

        # Check if the setting start overlaps with the file time interval
        if check_setup(ds, setting):
            print('Found setup {}: {} - {} in ds {} - {}'.format(
                setting.label, setting.ltime, setting.rtime, ds.index.min(),
                ds.index.max()))
            setup_files.append(file)

        elif lazy_check_setup(ds, setting):
            print('Found lazy setup {}: {} - {} in ds {} - {}'.format(
                setting.label, setting.ltime, setting.rtime, ds.index.min(),
                ds.index.max()))
            setup_files.append(file)

    print('Number of timely series with setup: {}'.format(len(setup_files)))

    y_pred_single = {}
    y_true_single = {}

    normal_files = {}

    # Save settings_map and setup_files list
    # with open('../results/settings_map.json', 'w') as outfile:
    #     json.dump(settings_map, outfile)
    #
    # with open('../results/setup_files.json', 'w') as outfile:
    #     json.dump(setup_files, outfile)

    # For each state we train a model with a "normal" file and predict
    # anomalies on every file of that state
    print('\nTraining and Testing - {}'.format(model_type))
    for k, state_files in settings_map.items():
        print('\nState {} has {} files'.format(k, len(state_files)))

        # Get normal file from constant_normal_files dictionary; states
        # without an entry cannot be trained and are skipped.
        if k not in constant_normal_files:
            print('Skip, normal file not found')
            continue

        normal_file = constant_normal_files[k]
        normal_files[k] = normal_file

        if normal_file is None:
            print('Impossible get normal file')
            return

        # Training
        ds_train = get_time_series_dataset(filename=normal_file,
                                           sep=';',
                                           col='DT')
        # Check train
        if ds_train is None:
            print('Impossible read train file')
            return

        y_pred_single[k] = {}
        y_true_single[k] = {}

        for col in ds_train.columns:
            x_train = ds_train[[col]]
            x_train = get_sliding_window_matrix(x_train.values, kernel, stride)

            # Selected models
            if model_type == 'pca':
                model = PCA(n_components=0.95, threshold=100, c_alpha=3.2905)
            elif model_type == 'clustering':
                model = SetupClustering(distance="cosine",
                                        max_dist=0.001,
                                        anomaly_threshold=0.0001)
            elif model_type == 'svm':
                model = OneClassSVM(nu=0.001,
                                    tol=0.001,
                                    kernel="rbf",
                                    gamma="scale")
            elif model_type == 'lof':
                model = LOF(n_neighbors=50,
                            algorithm='auto',
                            metric='minkowski',
                            contamination='auto')
            elif model_type == 'if':
                model = IsolationForest(n_estimators=200,
                                        max_samples=512,
                                        contamination=0.0003,
                                        max_features=0.8)
            else:
                print("Select the wrong models")
                return

            # Training
            print("Training... state {} col {}".format(k, col))
            model.fit(x_train)

            y_pred_single[k][col] = []
            y_true_single[k][col] = []

            print("Testing...")
            for file in state_files:
                # y_true_single is only a placeholder: the real ground truth
                # is rebuilt from normal_file_id_list after the loop.
                y_true_single[k][col].append(1 if file in setup_files else 0)

                x_test = get_time_series_dataset(filename=file,
                                                 sep=';',
                                                 col='DT')

                # Check test
                if x_test is None:
                    print('Impossible read test file')
                    return

                # Create testing values; the stride equals the kernel, so the
                # test windows are requested with step == window length.
                x_test = x_test[[col]]
                x_test = get_sliding_window_matrix(x_test.values, kernel,
                                                   kernel)

                # Testing
                y_pred = model.predict(x_test)

                # Save number of detected anomalies
                y_pred_single[k][col].append(len(y_pred[y_pred == 1]))

    print('\nSelected normal files:')
    for k, file in normal_files.items():
        print("State {} -> {}".format(k, file))

    # Create result dataset: one row per (state, column, file)
    y_pred = []
    y_true = []
    cols = []
    files = []
    states = []

    for k, preds_by_col in y_pred_single.items():
        for col, preds in preds_by_col.items():
            # Predictions were appended in the order of settings_map[k], so
            # the three sequences line up element by element.
            for file, pred, true in zip(settings_map[k], preds,
                                        y_true_single[k][col]):
                y_pred.append(pred)
                y_true.append(true)
                cols.append(col)
                files.append(file)
                states.append(k)

    res_ds = pd.DataFrame({
        'file': files,
        'cols': cols,
        'states': states,
        'y_pred': y_pred,
        'y_true': y_true
    })

    # Create real ground truth: everything is anomalous except the files
    # listed in normal_file_id_list
    res_ds['file'] = res_ds['file'].apply(lambda x: x.split('\\')[-1])
    normal_file_list = ["File ({}).CSV".format(x) for x in normal_file_id_list]
    res_ds['y_true'] = 1
    res_ds.loc[res_ds['file'].isin(normal_file_list), 'y_true'] = 0

    # Save results
    res_ds.to_csv('../results/{}_evaluation.CSV'.format(model_type),
                  sep=';',
                  index=False)

    # Evaluation
    print("\nEvaluation")
    pred_pos = res_ds['y_pred'] > 0
    pred_neg = res_ds['y_pred'] <= 0
    true_pos = res_ds['y_true'] > 0
    true_neg = res_ds['y_true'] == 0

    true_positive = len(res_ds[pred_pos & true_pos])
    false_positive = len(res_ds[pred_pos & true_neg])
    true_negative = len(res_ds[pred_neg & true_neg])
    false_negative = len(res_ds[pred_neg & true_pos])

    acc = _ratio(100 * (true_positive + true_negative), len(res_ds))
    print("Accuracy: {}".format(acc))

    precision = _ratio(100 * true_positive, true_positive + false_positive)
    print("Precision: {}".format(precision))
    recall = _ratio(100 * true_positive, true_positive + false_negative)
    print("Recall: {}".format(recall))
    f_score = _ratio(2 * precision * recall, precision + recall)
    print("F-score: {}".format(f_score))
示例#2
0
def main():
    """Grid-search anomaly detectors over every subset of held-out states.

    For each model type in a fixed list and for each non-empty, proper
    combination of state indices (the "selected" states), the function:

    * reads every ``lvm`` file of every state folder, skipping file index 0;
      file index 1 of a non-selected state goes to the training set, every
      other file goes to the test set;
    * for each entry of the parameter grid, transforms the data, builds the
      train/test matrices with ``prepare_data``, fits a model obtained from
      ``get_deep_model`` and records a classification report in which the
      samples of the selected states are the positive class;
    * for the model types 'cnn', 'deep', 'lstm' and 'bilstm' the report is
      repeated for several offsets added to the model threshold;
    * writes the collected records to
      ``./results/results_grid_anomaly__<states>__<model_type>.csv``.

    Returns:
        None.
    """

    def anomaly_ground_truth(labels, anomalous_states):
        # 1 for samples whose state label is one of the held-out states,
        # 0 for everything else.
        truth = np.zeros(len(labels))
        for anomalous_state in anomalous_states:
            truth[labels == anomalous_state] = 1
        return truth

    def report_record(truth, predicted, grid_point):
        # Print the textual report, then flatten the dict report into one
        # record enriched with the grid parameters.
        print(classification_report(truth, predicted))
        as_dict = classification_report(truth, predicted, output_dict=True)
        flat = get_classification_report_record(as_dict)
        flat.update(grid_point)
        return flat

    params = get_argument()
    all_state_folder = params['all_state_folder'][:]
    features_list = params['features_list']
    # Looked up here, but rebound by the loop over model types below.
    model_type = params['model_type']
    resample_rate = 6400

    stride = 1
    epochs = 300

    save_result = True
    output_dir = './results'

    params_grid = {
        'kernel': [40, 80, 120, 200, 240, 360],
        'transform_type': ['minmax'],
        'with_lazy': [0.00],  # , 0.01, 0.015, 0.02],
        # 'loss': ['mae', 'mse'],
        # 'activation': [layers.LeakyReLU(alpha=0.3), 'relu', 'tanh']
    }
    # if model_type == 'bilstm':
    #     params_grid['activation'] = ['relu', 'tanh']

    threshold_models = ('cnn', 'deep', 'lstm', 'bilstm')
    threshold_offsets = (0.00, 0.01, 0.015, 0.02, 0.03, 0.05, 0.07, 0.1,
                         0.125, 0.15)

    for model_type in ('pca', 'svm', 'cluster', 'cnn', 'deep', 'lstm',
                       'bilstm'):

        skip_list = [0]
        train_list = [1]

        # All combinations of 1 .. (number of states - 1) state indices
        state_ids = list(range(len(all_state_folder)))
        combs = [
            list(combo)
            for size in range(1, len(all_state_folder))
            for combo in itertools.combinations(state_ids, r=size)
        ]

        for selected_states in combs:
            ds_train_list, y_train_list = [], []
            ds_test_list, y_test_list = [], []

            # Read train and test files
            print('Evaluation state: {}'.format(selected_states))
            for state_id, folder in enumerate(all_state_folder):
                print('Read state: ', os.path.basename(folder))
                for i, filename in enumerate(get_files(folder, ext='lvm')):
                    if i in skip_list:
                        print('Skip: ', filename)
                        continue

                    ds = resample(
                        read_ds_lvm(filename, get_header=False)[features_list],
                        resample_rate)

                    use_for_training = (i in train_list
                                        and state_id not in selected_states)
                    if use_for_training:
                        ds_train_list.append(ds)
                        print('Train state {} file: {}'.format(
                            state_id, filename))
                        y_train_list.append(state_id)
                    else:
                        ds_test_list.append(ds)
                        print('Test state {} file: {}'.format(
                            state_id, filename))
                        y_test_list.append(state_id)

            ds_res = []
            for grid in ParameterGrid(params_grid):

                print('\n Params:')
                print(grid)

                kernel = grid['kernel']
                transform_type = grid['transform_type']

                # Everything except the window/transform settings is passed
                # on to the model; 'skernel' is renamed to 'kernel'.
                model_params = {
                    name: value
                    for name, value in grid.items()
                    if name not in ('kernel', 'transform_type')
                }
                if 'skernel' in model_params:
                    model_params['kernel'] = model_params.pop('skernel')

                # Apply transform
                if not transform_type:
                    print('No transform selected')
                    x_train_list = ds_train_list
                    x_test_list = ds_test_list
                else:
                    print('Apply transform: ', transform_type)
                    x_train_list, transformer = transform_data(
                        ds_train_list, transform_type)
                    x_test_list = [
                        apply_transform(test_ds, transformer)
                        for test_ds in ds_test_list
                    ]

                # Create train and test matrix set
                x_train, y_train = prepare_data(x_train_list,
                                                labels=y_train_list,
                                                kernel=kernel,
                                                stride=stride)
                x_test, y_test = prepare_data(x_test_list,
                                              labels=y_test_list,
                                              kernel=kernel,
                                              stride=stride)

                print('Train size:       ', x_train.shape)
                print('Train label size: ', y_train.shape)
                print('Test size:        ', x_test.shape)
                print('Test label size:  ', y_test.shape)

                # Shuffle the training windows (labels are shuffled alongside
                # but only the samples are passed to fit)
                order = np.random.permutation(len(x_train))
                x_new = x_train[order]
                y_new = y_train[order]

                # Model initialization
                print("Model initialization: {}".format(model_type))
                model = get_deep_model(model_type, model_params=model_params)

                # Training
                print("Training...")
                model.fit(x=x_new, epochs=epochs, batch_size=64, verbose=2)

                if model_type not in threshold_models:
                    print("Anomaly accuracy")
                    y_pred = model.predict(x_test)
                    y_true = anomaly_ground_truth(y_test, selected_states)
                    ds_res.append(report_record(y_true, y_pred, grid))
                    continue

                original_threshold = model.threshold
                print('Anomaly threshold: ', original_threshold)
                for th in threshold_offsets:
                    model.threshold = original_threshold + th
                    print('\n Lazy ', model.threshold)

                    print("Anomaly accuracy")
                    y_pred = model.predict(x_test, classifier=False)
                    y_true = anomaly_ground_truth(y_test, selected_states)

                    record = report_record(y_true, y_pred, grid)
                    record['with_lazy'] = th
                    ds_res.append(record)

            ds_res = pd.DataFrame(ds_res)

            if save_result:
                os.makedirs(output_dir, exist_ok=True)

                name = '_'.join(str(state) for state in selected_states)
                filename = os.path.join(
                    output_dir, 'results_grid_anomaly__{}__{}.csv'.format(
                        name, model_type))
                ds_res.to_csv(filename, index=True)
def main():
    """Train a model on packs of state files and test it on all other files.

    Parameters come from ``get_argument()``: the state folders, the feature
    columns, the window ``kernel`` / ``stride``, the ``model_type`` and the
    optional ``resample_rate`` (default 6400), ``with_decision_score``
    (default False) and ``custom_resample`` (default False).

    Workflow:
        1. List the ``lvm`` files of every state folder; all of them are test
           candidates.
        2. Build train packs from the first three state folders: pack ``i``
           holds the ``i``-th file of each of those folders.  Only packs that
           contain three files are used, so nothing is trained when fewer
           than three folders are given.
        3. For every pack: read, resample and window each file, stack the
           windows, fit a model from ``get_model`` and score every file that
           is not part of the pack.
        4. Save one record per (pack, test file) to
           ``./results/results_multi_<model_type>.csv``.

    Returns:
        None.
    """
    params = get_argument()
    all_state_folder = params['all_state_folder']
    features_list = params['features_list']
    kernel = params['kernel']
    stride = params['stride']
    model_type = params['model_type']
    resample_rate = params.get('resample_rate', 6400)
    with_decision_score = params.get('with_decision_score', False)
    custom_resample = params.get('custom_resample', False)

    # resample_rate = 12800  # 12800 sample are 1 second
    # num_sample = 1000000
    with_skip = False

    params_file = './params/params_{}.json'.format(model_type)
    save_result = True
    overwrite = True
    output_dir = './results'

    def _resample(ds):
        # Single place for the resampling choice shared by train and test.
        if custom_resample:
            return resample_with_feature_extractor(ds, resample_rate)
        return resample(ds, resample_rate)

    # Test windows use stride 1 unless with_skip asks for one window per
    # kernel length.
    test_stride = kernel if with_skip else 1

    result_array = []

    # Get list of list of files, where for each state we have a list of file
    curr_files = []

    # Get list of test files
    test_files = []

    for folder in all_state_folder:
        files = get_files(folder, ext='lvm')
        curr_files.append(files)
        test_files += files

    max_size = min([len(files) for files in curr_files[:3]])

    # Get train files where each element is a list of files for a single train
    train_files = []
    for i in range(max_size):
        train_pack = [files[i] for files in curr_files[:3]]

        for j in range(1, len(train_pack)):
            train_files.append(train_pack[:j + 1])

    for train_pack in train_files:
        if len(train_pack) < 3:
            continue

        print('\n' + '\\\\//' * 20)

        selected_files = []
        train_states = []
        x_states = []

        print('\n Train Pack')
        for train_file in train_pack:
            train_state = os.path.split(os.path.dirname(train_file))[-1]
            print("State: ", train_state)
            print("Read File: ", os.path.basename(train_file))
            ds_train = read_ds_lvm(train_file, get_header=False)

            # Check train
            if ds_train is None or ds_train.empty:
                print('Impossible read train file')
                continue

            # Select features
            ds_train = ds_train[features_list]

            # Resample
            train_len = len(ds_train)
            ds_train = _resample(ds_train)

            # ds_train = ds_train[:num_sample]
            print('Original File Length: ', train_len)
            print('New File Length {} {:.02f}'.format(
                len(ds_train), 100 * len(ds_train) / train_len))

            # Create training set
            print("Create set")
            x_train = get_sliding_window_matrix(ds_train.values, kernel,
                                                stride)
            print('Shape ', x_train.shape)

            selected_files.append(train_file)
            train_states.append(train_state)
            x_states.append(x_train)

        # Unreadable train files are skipped above; if every file of the pack
        # was skipped there is nothing to stack (np.vstack raises ValueError
        # on an empty list), so the whole pack is skipped as well.
        if not x_states:
            print('No readable train file in this pack, skip')
            continue

        x_states = np.vstack(x_states)
        print('\n Train Size: ', x_states.shape)
        print('Train state: ', train_states)

        # Model initialization
        print("Model initialization: {}".format(model_type))
        model = get_model(model_type, params_file=params_file)

        # Training
        print("Training...")
        model.fit(x_states)

        train_names = [os.path.basename(path) for path in selected_files]

        for test_file in test_files:

            test_state = os.path.split(os.path.dirname(test_file))[-1]

            # Never test on a file the model was trained on
            if test_file in selected_files:
                continue

            # if test_state in train_states:
            #     continue

            print("\n State Test: ", test_state)
            print("Read Test File: ", os.path.basename(test_file))
            ds_test = read_ds_lvm(test_file, get_header=False)

            # t1 = datetime.now()

            # Check test
            if ds_test is None or ds_test.empty:
                print('Impossible read test file')
                continue

            # Select features
            ds_test = ds_test[features_list]

            # Resample
            test_len = len(ds_test)
            ds_test = _resample(ds_test)
            # ds_test = ds_test[:num_sample]
            print('Test Original File Length: ', test_len)
            print('New File Length {} {:.02f}'.format(
                len(ds_test), 100 * len(ds_test) / test_len))

            # Create set
            print("Create testing set")
            x_test = get_sliding_window_matrix(ds_test.values, kernel,
                                               test_stride)
            print('Test shape ', x_test.shape)

            # Testing
            print('Testing...')
            if with_decision_score:
                y_pred = model.decision_score(x_test)
            else:
                y_pred = model.predict(x_test)

            # Entries greater than zero are counted as anomalies
            num_error = np.sum(y_pred > 0)
            mean_error = np.mean(y_pred)
            if num_error > 0:
                mean_only_error = np.mean(y_pred[y_pred > 0])
            else:
                mean_only_error = 0

            if not num_error:
                print("Results: NO Anomaly founded")
            else:
                print("Results: {} anomalies "
                      "({:.05f} total {})".format(num_error, mean_error,
                                                  len(x_test)))

            result_record = {
                'MODEL': model_type,
                'KERNEL': kernel,
                'STRIDE': stride,
                'TRAIN_STATE': train_states,
                'TRAIN': list(train_names),
                'TEST_STATE': test_state,
                'TEST': os.path.basename(test_file),
                'NUM_SINGLE_ANOMALY': num_error,
                'PCT_ANOMALY': mean_error,
                'NUM_SAMPLE_ANOMALY': mean_only_error,
                'NUM_SAMPLE': len(x_test),
                # True when the test file belongs to a state that was not
                # part of the training pack
                'LABEL': test_state not in train_states
            }

            result_array.append(result_record)

    if save_result:
        os.makedirs(output_dir, exist_ok=True)

        filename = os.path.join(output_dir,
                                'results_multi_' + model_type + '.csv')

        result_ds = pd.DataFrame(result_array)

        # Append to the previous results unless overwriting was requested
        if os.path.isfile(filename) and not overwrite:
            prev_result_ds = pd.read_csv(filename)
            result_ds = pd.concat([prev_result_ds, result_ds],
                                  axis=0,
                                  ignore_index=True)

        result_ds.to_csv(filename, index=False)
示例#4
0
def main():
    """Cluster test windows, raw and encoded, and score them against states.

    For each training file index in ``[1, 2]`` the function reads every
    ``lvm`` file of every state folder: the file at the training index goes
    to the training set (index 1 is used when a folder has no file at that
    index) and all other files go to the test set.  After the 'minmax'
    transform, for every kernel size it:

    * clusters the flattened raw test windows with each clustering model and
      stores the scores in ``results_<train_id>_cluster_raw_<kernel>.csv``;
    * for each of 'cnn', 'deep', 'lstm' and 'bilstm', fits a model from
      ``get_model``, clusters the output of ``model.encoder.predict`` on the
      test windows and stores the scores in
      ``results_<train_id>_cluster_<model_type>_<kernel>.csv``.

    The scores are adjusted mutual information, adjusted Rand index and
    homogeneity, computed against the state id of each test window.

    Returns:
        None.
    """
    params = get_argument()
    all_state_folder = params['all_state_folder']
    features_list = params['features_list']
    resample_rate = 6400

    stride = 1
    epochs = 500

    transform_type = 'minmax'

    save_result = True
    output_dir = './results'

    cluster_models = {
        'agglomerative': AgglomerativeClustering,
        'kmeans': KMeans,
        'spectral': SpectralClustering
    }

    def score_clusterings(samples, labels):
        # Run every clustering model on the flattened samples and collect
        # the three agreement scores per model.
        scores = {}
        for cluster_name, cluster_model in cluster_models.items():
            print('\n', cluster_name)

            flat = samples.reshape(len(samples), -1)
            print(flat.shape)

            # ToDo: remove n_clusters params
            cls = cluster_model(n_clusters=4)
            y_pred = cls.fit_predict(flat)

            scores[cluster_name] = {
                'adjusted_mutual_info_score':
                adjusted_mutual_info_score(labels, y_pred),
                'adjusted_rand_score':
                adjusted_rand_score(labels, y_pred),
                'homogenity_score':
                homogeneity_score(labels, y_pred)
            }
            print(scores[cluster_name])
        return scores

    def store_scores(scores, train_index, tag, window):
        # One CSV per (training index, representation, kernel size).
        ds_res = pd.DataFrame(scores)
        if not save_result:
            return
        os.makedirs(output_dir, exist_ok=True)
        filename = os.path.join(
            output_dir, 'results_{}_cluster_{}_{}.csv'.format(
                train_index, tag, window))
        ds_res.to_csv(filename, index=True)

    for train_id in [1, 2]:
        skip_list = []
        train_list = [train_id]
        ds_train_list, y_train_list = [], []
        ds_test_list, y_test_list = [], []

        # Read train and test files
        print('Read all datasets')
        for state_id, folder in enumerate(all_state_folder):
            print('\nRead state: ', os.path.basename(folder))
            files = get_files(folder, ext='lvm')

            # Fall back to file index 1 when the folder has no file at any
            # of the requested training indices
            selected_train_id = [
                idx for idx in range(len(files)) if idx in train_list
            ]
            if not selected_train_id:
                selected_train_id = [1]

            for i, filename in enumerate(files):
                if i in skip_list:
                    print('Skip:               {}'.format(filename))
                    continue

                # ds = None
                ds = resample(
                    read_ds_lvm(filename, get_header=False)[features_list],
                    resample_rate)

                if i not in selected_train_id:
                    print('Test state {} file:  {}'.format(state_id, filename))
                    ds_test_list.append(ds)
                    y_test_list.append(state_id)
                else:
                    print('Train state {} file: {}'.format(state_id, filename))
                    ds_train_list.append(ds)
                    y_train_list.append(state_id)

        # Apply transform
        if not transform_type:
            print('No transform selected')
            x_train_list = ds_train_list
            x_test_list = ds_test_list
        else:
            print('Apply transform: ', transform_type)
            x_train_list, transformer = transform_data(ds_train_list,
                                                       transform_type)
            x_test_list = [
                apply_transform(test_ds, transformer)
                for test_ds in ds_test_list
            ]

        for kernel in [40, 80, 120, 200, 240, 360]:

            # Create train and test matrix set
            x_train, y_train = prepare_data(x_train_list,
                                            labels=y_train_list,
                                            kernel=kernel,
                                            stride=stride)
            x_test, y_test = prepare_data(x_test_list,
                                          labels=y_test_list,
                                          kernel=kernel,
                                          stride=stride)

            print('Train size:       ', x_train.shape)
            print('Train label size: ', y_train.shape)
            print('Test size:        ', x_test.shape)
            print('Test label size:  ', y_test.shape)

            # Shuffle the training windows
            order = np.random.permutation(len(x_train))
            x_new = x_train[order]
            y_new = y_train[order]

            # Baseline: cluster the raw (flattened) test windows
            store_scores(score_clusterings(x_test, y_test), train_id, 'raw',
                         kernel)

            for model_type in ['cnn', 'deep', 'lstm', 'bilstm']:

                # Model initialization
                print("Model initialization: {}".format(model_type))
                model = get_model(model_type)

                # Training
                print("Training...")
                model.fit(x=x_new, epochs=epochs, verbose=2)

                # Cluster the encoder output instead of the raw windows
                enc_pred = model.encoder.predict(x_test)
                enc_pred = enc_pred.reshape((len(x_test), -1))

                store_scores(score_clusterings(enc_pred, y_test), train_id,
                             model_type, kernel)
示例#5
0
def main():
    """Leave one state out, detect it as an anomaly and locate it by feature.

    Every state in turn is the "selected" (held-out) state.  File index 0 of
    each folder is skipped; file index 1 of every other state is used for
    training and all remaining files form the test set.  For each of the
    model types 'cnn', 'deep', 'lstm' and 'bilstm' the function:

    * fits a model from ``get_deep_model`` on the shuffled training windows;
    * saves a classification report in which the windows of the selected
      state are the positive class
      (``results_anomaly_<state>_<model_type>_.csv``);
    * reconstructs the windows of the selected state with
      ``model.model.predict`` and saves, per window, the mean absolute
      difference of each feature between input and reconstruction
      (``results_locate__<kernel>__<state>__<model_type>.csv``).

    Returns:
        None.
    """
    params = get_argument()
    all_state_folder = params['all_state_folder']
    features_list = params['features_list']
    # model_type = params['model_type']
    resample_rate = 6400

    kernel = 120  # 40, 80, 120, 200
    stride = 1
    # model_type = 'cnn'        # 'cnn', 'deep', 'lstm'
    transform_type = 'minmax'  # 'std', 'minmax', None

    epochs = 200

    save_result = True
    output_dir = './results'

    model_params = {
        'with_lazy': 0.02,  # 0.00, 0.01, 0.015, 0.02
        # 'loss': 'mae'  # 'mae', 'mse'
    }

    skip_list = [0]
    train_list = [1]

    for selected_state_id, _ in enumerate(all_state_folder):
        ds_train_list, y_train_list = [], []
        ds_test_list, y_test_list = [], []

        # Read train and test files
        print('Evaluation state: {}'.format(selected_state_id))
        for state_id, folder in enumerate(all_state_folder):
            print('Read state: ', os.path.basename(folder))
            for i, filename in enumerate(get_files(folder, ext='lvm')):
                if i in skip_list:
                    continue

                ds = resample(
                    read_ds_lvm(filename, get_header=False)[features_list],
                    resample_rate)

                held_out = state_id == selected_state_id
                if i in train_list and not held_out:
                    ds_train_list.append(ds)
                    print('Train state {} file: {}'.format(state_id, filename))
                    y_train_list.append(state_id)
                else:
                    ds_test_list.append(ds)
                    print('Test state {} file: {}'.format(state_id, filename))
                    y_test_list.append(state_id)

        # Apply transform
        transformer = None
        if not transform_type:
            print('No transform selected')
            x_train_list = ds_train_list
            x_test_list = ds_test_list
        else:
            print('Apply transform: ', transform_type)
            x_train_list, transformer = transform_data(ds_train_list,
                                                       transform_type)
            x_test_list = [
                apply_transform(test_ds, transformer)
                for test_ds in ds_test_list
            ]

        # Create train and test matrix set
        x_train, y_train = prepare_data(x_train_list,
                                        labels=y_train_list,
                                        kernel=kernel,
                                        stride=stride)
        x_test, y_test = prepare_data(x_test_list,
                                      labels=y_test_list,
                                      kernel=kernel,
                                      stride=stride)

        print('Train size:       ', x_train.shape)
        print('Train label size: ', y_train.shape)
        print('Test size:        ', x_test.shape)
        print('Test label size:  ', y_test.shape)

        # Shuffle the training windows
        order = np.random.permutation(len(x_train))
        x_new = x_train[order]
        y_new = y_train[order]

        # Windows of the held-out state, used for both ground truth and
        # anomaly location
        is_selected = y_test == selected_state_id

        for model_type in ['cnn', 'deep', 'lstm', 'bilstm']:

            # Model initialization
            print("Model initialization: {}".format(model_type))
            model = get_deep_model(model_type, model_params=model_params)

            # Training
            print("Training...")
            model.fit(x=x_new, epochs=epochs, verbose=2)

            print("Anomaly accuracy")
            y_pred = model.predict(x_test, classifier=False)
            y_true = np.zeros(len(y_test))
            y_true[is_selected] = 1
            print(classification_report(y_true, y_pred))

            ds_res = pd.DataFrame(
                classification_report(y_true, y_pred, output_dict=True))

            if save_result:
                os.makedirs(output_dir, exist_ok=True)

                filename = os.path.join(
                    output_dir, 'results_anomaly_{}_{}_.csv'.format(
                        selected_state_id, model_type))
                ds_res.to_csv(filename, index=True)

            print("Locate Anomaly")
            x_selected = x_test[is_selected]
            y_selected = y_test[is_selected]
            x_reconstructed = model.model.predict(x_selected)

            rows = []
            for idx, x_true in enumerate(x_selected):
                x_pred = x_reconstructed[idx]
                # Compare in the original value range when a transform was
                # applied
                if transformer is not None:
                    x_true = transformer.inverse_transform(x_true)
                    x_pred = transformer.inverse_transform(x_pred)

                # Mean absolute reconstruction error per feature
                diff = np.mean(np.abs(x_true - x_pred), axis=0)
                res = dict(zip(features_list, diff))
                res['threshold'] = model.threshold
                res['score'] = y_selected[idx]
                rows.append(res)

            ds_res = pd.DataFrame(rows)

            if save_result:
                os.makedirs(output_dir, exist_ok=True)

                filename = os.path.join(
                    output_dir, 'results_locate__{}__{}__{}.csv'.format(
                        kernel, selected_state_id, model_type))
                ds_res.to_csv(filename, index=False)
示例#6
0
def main():
    """Run the classification experiment over train files, kernels and models.

    For each training index, one file per state folder is used for training
    (falling back to index 1 when the requested index does not exist in the
    folder) and the remaining files are used for testing. The data is
    optionally transformed, windowed with several kernel sizes, and every
    model type is fitted and evaluated; each classification report is printed
    and, when saving is enabled, written to a CSV file in the output folder.
    """
    args = get_argument()
    state_folders = args['all_state_folder']
    feature_names = args['features_list']

    # Fixed experiment configuration
    target_rate = 6400
    window_stride = 1
    num_epochs = 300
    scaling = 'minmax'
    write_results = True
    results_dir = './results'

    for train_id in (1, 2, 0):
        ignored_positions = []
        wanted_positions = [train_id]
        train_frames, train_labels = [], []
        test_frames, test_labels = [], []

        # Load every file of every state and route it to train or test
        print('Read all datasets')
        for state_id, folder in enumerate(state_folders):
            print('\nRead state: ', os.path.basename(folder))
            files = get_files(folder, ext='lvm')

            # Keep only requested positions that exist in this folder;
            # default to position 1 when none of them does.
            train_positions = [
                idx for idx in wanted_positions if idx in range(len(files))
            ] or [1]

            for position, path in enumerate(files):
                if position in ignored_positions:
                    print('Skip:               {}'.format(path))
                    continue

                frame = read_ds_lvm(path, get_header=False)
                frame = resample(frame[feature_names], target_rate)

                if position in train_positions:
                    print('Train state {} file: {}'.format(state_id, path))
                    frames, labels = train_frames, train_labels
                else:
                    print('Test state {} file:  {}'.format(state_id, path))
                    frames, labels = test_frames, test_labels

                frames.append(frame)
                labels.append(state_id)

        # Fit the transform on the training data and reuse it on the test data
        scaler = None
        if not scaling:
            print('No transform selected')
            train_inputs, test_inputs = train_frames, test_frames
        else:
            print('Apply transform: ', scaling)
            train_inputs, scaler = transform_data(train_frames, scaling)
            test_inputs = [
                apply_transform(frame, scaler) for frame in test_frames
            ]

        for kernel in (40, 80, 120, 200, 240, 360):

            # Build the windowed train and test matrices for this kernel
            x_train, y_train = prepare_data(
                train_inputs, labels=train_labels, kernel=kernel,
                stride=window_stride)
            x_test, y_test = prepare_data(
                test_inputs, labels=test_labels, kernel=kernel,
                stride=window_stride)

            for title, array in (
                    ('Train size:       ', x_train),
                    ('Train label size: ', y_train),
                    ('Test size:        ', x_test),
                    ('Test label size:  ', y_test),
            ):
                print(title, array.shape)

            # Shuffle the training samples once per kernel
            shuffle_idx = np.random.permutation(len(x_train))
            x_shuffled = x_train[shuffle_idx]
            y_shuffled = y_train[shuffle_idx]

            for model_type in ('classifier', 'linear', 'cnn', 'deep', 'lstm',
                               'bilstm'):

                print("Model initialization: {}".format(model_type))
                model = get_model(model_type)

                print("Training...")
                model.fit(x=x_shuffled,
                          y=y_shuffled,
                          epochs=num_epochs,
                          batch_size=32,
                          verbose=2)

                predictions = model.predict(x_test, classifier=True)

                print(classification_report(y_test, predictions))
                report = classification_report(
                    y_test, predictions, output_dict=True)
                report_frame = pd.DataFrame(report)

                if not write_results:
                    continue

                # One report file per (train_id, model, kernel) combination
                os.makedirs(results_dir, exist_ok=True)
                report_name = 'results_{}_accuracy_{}_{}.csv'.format(
                    train_id, model_type, kernel)
                report_frame.to_csv(
                    os.path.join(results_dir, report_name), index=True)
def main():
    """Fit each anomaly model on groups of files and score the other files.

    For every model type in ``model_list`` and every starting position in the
    list of files returned by ``get_files(folder, ext='lvm')``, a training
    set is built from ``size`` consecutive files that live in the same state
    folder. The fitted model is then run on every file that was not part of
    that training group, and one summary record is collected per
    (training group, test file) pair. When ``save_result`` is set, the
    records collected so far are written to a CSV file after each model type.
    """
    params = get_argument()
    all_state_folder = params['all_state_folder']

    # Number of consecutive same-state files merged into one training set
    size = 3

    features_list = [
        "Acceleration_X1", "Acceleration_Y1", "Acceleration_Z1",
        "Acceleration_X2", "Acceleration_Y2", "Acceleration_Z2",
        "Acceleration_X3", "Acceleration_Y3", "Acceleration_Z3"
    ]

    stride = 1

    model_list = [
        'cnn',
        'lstm',
        'deep',
        'isolation_forest',
        'setup_clustering',
        'pca',
        'lof',
        'svm',
    ]

    # Window length per model: 180 for cnn/lstm/deep, 10 for the others
    kernel_list = [
        180 if model_type in ['cnn', 'lstm', 'deep'] else 10
        for model_type in model_list
    ]

    resample_rate = 6400

    save_result = True
    output_dir = './results'

    def state_of(path):
        # The state name is the name of the folder that contains the file
        return os.path.split(os.path.dirname(path))[-1]

    # Initialize result array to memorize result
    # for each train and test step
    result_array = []

    # Get files from selected folder to use for training and testing
    curr_files = []
    for folder in all_state_folder:
        curr_files += get_files(folder, ext='lvm')

    test_files = curr_files

    for model_type, kernel in zip(model_list, kernel_list):
        print('\n' + '\\\\//' * 20)
        print('\n Model: {}\n'.format(model_type))

        params_file = './params/params_{}.json'.format(model_type)

        for pos, train_file in enumerate(curr_files):
            skip_step = False
            train_state = state_of(train_file)

            x_train = []

            print("\n State Train: ", train_state)

            # Collect ``size`` consecutive files starting at ``pos``; the
            # whole step is skipped if the list ends, the state changes or a
            # file cannot be read.
            for i in range(size):
                if pos + i >= len(curr_files):
                    print('Not enough files')
                    skip_step = True
                    break

                tmp_file = curr_files[pos + i]

                if state_of(tmp_file) != train_state:
                    print('Different state and skip current train')
                    skip_step = True
                    break

                print("Read {} Train File: {}".format(
                    i, os.path.basename(tmp_file)))

                ds_tmp = read_ds_lvm(tmp_file, get_header=False)

                # Check train
                if ds_tmp is None or ds_tmp.empty:
                    print('Impossible read train file')
                    skip_step = True
                    break

                # Select features
                ds_tmp = ds_tmp[features_list]
                # Resample
                ds_tmp = resample(ds_tmp, resample_rate)
                # Create training set
                x_train.append(
                    get_sliding_window_matrix(ds_tmp.values, kernel, stride))

            if skip_step:
                print('Skip current train')
                continue

            # Every file that contributed to the training set of this step
            train_files = set(curr_files[pos:pos + size])

            # Train set
            x_train = np.vstack(x_train)
            train_len = len(x_train)
            print('\nTrain size: {}\n'.format(x_train.shape))

            # Model initialization
            print("Model initialization: {}".format(model_type))
            model = get_model(model_type, params_file=params_file)

            # Training
            print("Training...")
            model.fit(x_train)

            for test_file in test_files:
                # Never evaluate on a file the model was fitted on: the
                # training set spans ``size`` files, not only ``train_file``,
                # so all of them must be excluded to avoid train/test leakage.
                if test_file in train_files:
                    continue

                test_state = state_of(test_file)

                print("\n State Test: ", test_state)
                print("Read Test File: ", os.path.basename(test_file))
                ds_test = read_ds_lvm(test_file, get_header=False)

                # Check test
                if ds_test is None or ds_test.empty:
                    print('Impossible read test file')
                    continue

                # Select features
                ds_test = ds_test[features_list]

                # Resample (keep the original length for the report)
                test_len = len(ds_test)
                ds_test = resample(ds_test, resample_rate)
                print('Test Original File Length: ', test_len)
                print('New File Length {} {:.02f}'.format(
                    len(ds_test), 100 * len(ds_test) / test_len))

                test_stride = 1

                # Create set
                print("Create testing set")
                x_test = get_sliding_window_matrix(ds_test.values, kernel,
                                                   test_stride)
                print('Test shape ', x_test.shape)

                # Testing
                print('Testing...')
                y_pred = model.predict(x_test)

                # Positive predictions are counted as anomalies
                num_error = np.sum(y_pred > 0)
                mean_error = np.mean(y_pred)
                if num_error > 0:
                    mean_only_error = np.mean(y_pred[y_pred > 0])
                else:
                    mean_only_error = 0

                if not num_error:
                    print("Results: NO Anomaly founded")
                else:
                    print("Results: {} anomalies "
                          "({:.05f} total {})".format(num_error, mean_error,
                                                      len(x_test)))

                result_array.append({
                    'MODEL': model_type,
                    'KERNEL': kernel,
                    'STRIDE': stride,
                    'TRAIN_STATE': train_state,
                    'TRAIN': os.path.basename(train_file),
                    'TRAIN_SIZE': train_len,
                    'TEST_STATE': test_state,
                    'TEST': os.path.basename(test_file),
                    'TEST_LEN': test_len,
                    'NUM_SINGLE_ANOMALY': num_error,
                    'PCT_ANOMALY': mean_error,
                    'NUM_SAMPLE_ANOMALY': mean_only_error,
                    'NUM_SAMPLE': len(x_test),
                    'LABEL': train_state != test_state
                })

        if save_result:
            os.makedirs(output_dir, exist_ok=True)

            filename = os.path.join(
                output_dir,
                'results_single_{}'.format(size) + model_type + '.csv')

            # NOTE: result_array is never reset between models, so each file
            # also contains the records of the models processed before it.
            pd.DataFrame(result_array).to_csv(filename, index=False)