def main():
    """Train one anomaly detector per machine state and score every file.

    The run has four stages:

    1. Associate every ``.CSV`` file found in ``global_dir`` with a setting
       read from ``settings_file`` and remember the files that contain a
       setup (strict or lazy check).
    2. For every state that has an entry in ``constant_normal_files``, fit
       one ``model_type`` model per column of that "normal" file and count
       the anomalies predicted on each file of the state.
    3. Collect the counts in a dataframe, label every file that is not
       listed in ``normal_file_id_list`` as anomalous and write the table
       to ``../results/<model_type>_evaluation.CSV``.
    4. Print accuracy, precision, recall and F-score.

    Returns early (``None``) when a normal or test file cannot be read or
    when ``model_type`` is not one of the supported names.
    """
    # Local import: ntpath splits on both '\\' and '/' on every platform.
    import ntpath

    def percentage(numerator, denominator):
        """Return ``100 * numerator / denominator`` (0.0 if it is empty)."""
        if not denominator:
            return 0.0
        return 100 * numerator / denominator

    def build_model():
        """Return a fresh detector for ``model_type`` or None if unknown."""
        if model_type == 'pca':
            return PCA(n_components=0.95, threshold=100, c_alpha=3.2905)
        if model_type == 'clustering':
            return SetupClustering(distance="cosine", max_dist=0.001,
                                   anomaly_threshold=0.0001)
        if model_type == 'svm':
            return OneClassSVM(nu=0.001, tol=0.001, kernel="rbf",
                               gamma="scale")
        if model_type == 'lof':
            return LOF(n_neighbors=50, algorithm='auto', metric='minkowski',
                       contamination='auto')
        if model_type == 'if':
            return IsolationForest(n_estimators=200, max_samples=512,
                                   contamination=0.0003, max_features=0.8)
        return None

    # Get all .CSV files in global folder
    files = get_files(global_dir, ext='.CSV')
    print('Found {} files'.format(len(files)))

    # Get settings dataset, where each row represents a new setting entry
    ds_settings = get_time_series_dataset(settings_file, sep=';', col='DT')
    print('Found {} settings'.format(len(ds_settings)))

    # Identify settings label
    label_settings(ds_settings, ds_settings.columns[:13])
    ds_settings.ltime = pd.to_datetime(ds_settings.ltime)
    ds_settings.rtime = pd.to_datetime(ds_settings.rtime)
    print('Found {} unique settings'.format(
        len(np.unique(ds_settings.label))))

    settings_map = {}
    setup_files = []

    # Create settings map that associates a setting to each file
    print('\nSettings File identification')
    for file in files:
        ds = get_time_series_dataset(file, sep=';', col='DT')

        # Get nearest left setting
        setting = get_settings(ds, ds_settings)
        settings_map.setdefault(str(setting.label), []).append(file)

        # Check if the setting start overlaps with the file time interval;
        # the lazy check is only evaluated when the strict one fails.
        if check_setup(ds, setting):
            found = 'Found setup {}: {} - {} in ds {} - {}'
        elif lazy_check_setup(ds, setting):
            found = 'Found lazy setup {}: {} - {} in ds {} - {}'
        else:
            continue
        print(found.format(setting.label, setting.ltime, setting.rtime,
                           ds.index.min(), ds.index.max()))
        setup_files.append(file)

    print('Number of timely series with setup: {}'.format(len(setup_files)))
    setup_lookup = set(setup_files)

    y_pred_single = {}
    y_true_single = {}
    normal_files = {}

    # For each state we train a model with a "normal" file and predict
    # anomalies on every file of that state
    print('\nTraining and Testing - {}'.format(model_type))
    for k, val in settings_map.items():
        print('\nState {} has {} files'.format(k, len(val)))

        # Get normal file from constant_normal_files dictionary
        if k not in constant_normal_files:
            print('Skip, normal file not found')
            continue

        normal_file = constant_normal_files[k]
        normal_files[k] = normal_file
        if normal_file is None:
            print('Impossible get normal file')
            return

        # Training
        ds_train = get_time_series_dataset(filename=normal_file, sep=';',
                                           col='DT')
        if ds_train is None:
            print('Impossible read train file')
            return

        y_pred_single[k] = {}
        y_true_single[k] = {}
        for col in ds_train.columns:
            x_train = get_sliding_window_matrix(ds_train[[col]].values,
                                                kernel, stride)

            model = build_model()
            if model is None:
                print("Select the wrong models")
                return

            print("Training... state {} col {}".format(k, col))
            model.fit(x_train)

            y_pred_single[k][col] = []
            y_true_single[k][col] = []
            print("Testing...")
            for file in val:
                # Provisional label only: the real ground truth is rebuilt
                # from normal_file_id_list once the result table exists.
                y_true_single[k][col].append(
                    1 if file in setup_lookup else 0)

                x_test = get_time_series_dataset(filename=file, sep=';',
                                                 col='DT')
                if x_test is None:
                    print('Impossible read test file')
                    return

                # Test windows do not overlap (stride == kernel)
                x_test = get_sliding_window_matrix(x_test[[col]].values,
                                                   kernel, kernel)
                y_pred = model.predict(x_test)

                # Save number of detected anomalies
                y_pred_single[k][col].append(len(y_pred[y_pred == 1]))

    print('\nSelected normal files:')
    for k, file in normal_files.items():
        print("State {} -> {}".format(k, file))

    # Create result dataset: one row per (state, column, file)
    res_pred = []
    res_true = []
    res_cols = []
    res_files = []
    res_states = []
    for k, pred_by_col in y_pred_single.items():
        for col, preds in pred_by_col.items():
            rows = zip(settings_map[k], preds, y_true_single[k][col])
            for file, pred, true in rows:
                res_pred.append(pred)
                res_true.append(true)
                res_cols.append(col)
                res_files.append(file)
                res_states.append(k)

    res_ds = pd.DataFrame({
        'file': res_files,
        'cols': res_cols,
        'states': res_states,
        'y_pred': res_pred,
        'y_true': res_true
    })

    # Create real ground truth. The base name is taken with ntpath so the
    # comparison also works when paths use '/' instead of '\\'.
    res_ds['file'] = res_ds['file'].apply(ntpath.basename)
    normal_file_list = ["File ({}).CSV".format(x)
                        for x in normal_file_id_list]
    res_ds['y_true'] = 1
    res_ds.loc[res_ds['file'].isin(normal_file_list), 'y_true'] = 0

    # Save results
    res_ds.to_csv('../results/{}_evaluation.CSV'.format(model_type),
                  sep=';', index=False)

    # Evaluation: a file/column pair counts as predicted positive when at
    # least one anomaly was detected in it.
    print("\nEvaluation")
    predicted = res_ds['y_pred'] > 0
    actual = res_ds['y_true'] > 0
    true_positive = int((predicted & actual).sum())
    false_positive = int((predicted & ~actual).sum())
    true_negative = int((~predicted & ~actual).sum())
    false_negative = int((~predicted & actual).sum())

    # Every ratio falls back to 0.0 instead of raising ZeroDivisionError
    # when the table is empty or a class is never predicted/present.
    acc = percentage(true_positive + true_negative, len(res_ds))
    print("Accuracy: {}".format(acc))

    precision = percentage(true_positive, true_positive + false_positive)
    print("Precision: {}".format(precision))

    recall = percentage(true_positive, true_positive + false_negative)
    print("Recall: {}".format(recall))

    if precision + recall:
        f_score = 2 * precision * recall / (precision + recall)
    else:
        f_score = 0.0
    print("F-score: {}".format(f_score))
def main():
    """Grid-search anomaly detectors over held-out state combinations.

    For every model type and every non-empty proper subset of the state
    folders (the "selected" states), the file at index 1 of each
    non-selected state is used for training and all remaining non-skipped
    files are used for testing. Windows whose state is selected are
    labelled 1 (anomalous), the others 0. One CSV per
    (selected states, model type) pair is written to ``./results`` with a
    row per grid point (and, for the deep models, per threshold offset).

    The ``model_type`` command line parameter is not used here: all model
    types are evaluated.
    """
    params = get_argument()
    all_state_folder = params['all_state_folder'][:]
    features_list = params['features_list']

    resample_rate = 6400
    stride = 1
    epochs = 300
    save_result = True
    output_dir = './results'

    # Other values tried for 'with_lazy': 0.01, 0.015, 0.02
    params_grid = {
        'kernel': [40, 80, 120, 200, 240, 360],
        'transform_type': ['minmax'],
        'with_lazy': [0.00],
    }

    skip_list = [0]
    train_list = [1]
    deep_models = ['cnn', 'deep', 'lstm', 'bilstm']
    thresholds = [
        0.00, 0.01, 0.015, 0.02, 0.03, 0.05, 0.07, 0.1, 0.125, 0.15
    ]

    # Read and resample every file once: the result depends neither on the
    # model type nor on the selected states, so it is shared by all runs.
    datasets = []
    for state_id, folder in enumerate(all_state_folder):
        print('Read state: ', os.path.basename(folder))
        for i, filename in enumerate(get_files(folder, ext='lvm')):
            if i in skip_list:
                print('Skip: ', filename)
                continue
            ds = read_ds_lvm(filename, get_header=False)
            ds = ds[features_list]
            ds = resample(ds, resample_rate)
            datasets.append((state_id, i, filename, ds))

    # All combinations of 1 .. n-1 states (at least one state must remain
    # available for training)
    states = list(range(len(all_state_folder)))
    combs = []
    for r in range(1, len(all_state_folder)):
        combs += [list(c) for c in itertools.combinations(states, r=r)]

    for model_type in [
            'pca', 'svm', 'cluster', 'cnn', 'deep', 'lstm', 'bilstm'
    ]:
        for selected_states in combs:
            ds_train_list = []
            y_train_list = []
            ds_test_list = []
            y_test_list = []

            # Split the cached datasets in train and test
            print('Evaluation state: {}'.format(selected_states))
            for state_id, i, filename, ds in datasets:
                if i in train_list and state_id not in selected_states:
                    ds_train_list.append(ds)
                    print('Train state {} file: {}'.format(
                        state_id, filename))
                    y_train_list.append(state_id)
                else:
                    ds_test_list.append(ds)
                    print('Test state {} file: {}'.format(
                        state_id, filename))
                    y_test_list.append(state_id)

            ds_res = []
            for grid in ParameterGrid(params_grid):
                print('\n Params:')
                print(grid)
                kernel = grid['kernel']
                transform_type = grid['transform_type']

                # 'kernel' and 'transform_type' drive the data preparation;
                # a grid entry named 'skernel' is forwarded to the model
                # under the name 'kernel'.
                model_params = dict(grid)
                model_params.pop('kernel', None)
                model_params.pop('transform_type', None)
                if 'skernel' in model_params:
                    model_params['kernel'] = model_params.pop('skernel')

                # Apply transform
                if transform_type:
                    print('Apply transform: ', transform_type)
                    x_train_list, transformer = transform_data(
                        ds_train_list, transform_type)
                    x_test_list = [
                        apply_transform(ds, transformer)
                        for ds in ds_test_list
                    ]
                else:
                    print('No transform selected')
                    x_train_list = ds_train_list
                    x_test_list = ds_test_list

                # Create train and test matrix set
                x_train, y_train = prepare_data(x_train_list,
                                                labels=y_train_list,
                                                kernel=kernel,
                                                stride=stride)
                x_test, y_test = prepare_data(x_test_list,
                                              labels=y_test_list,
                                              kernel=kernel,
                                              stride=stride)

                print('Train size: ', x_train.shape)
                print('Train label size: ', y_train.shape)
                print('Test size: ', x_test.shape)
                print('Test label size: ', y_test.shape)

                # Shuffle the training windows
                x_new = x_train[np.random.permutation(len(x_train))]

                # Ground truth: 1 for windows of a selected state
                y_true = np.zeros(len(y_test))
                for selected_state_id in selected_states:
                    y_true[y_test == selected_state_id] = 1

                # Model initialization
                print("Model initialization: {}".format(model_type))
                model = get_deep_model(model_type,
                                       model_params=model_params)

                # Training
                print("Training...")
                model.fit(x=x_new, epochs=epochs, batch_size=64, verbose=2)

                if model_type in deep_models:
                    # Train once, then evaluate with increasing offsets
                    # added to the threshold found by the model.
                    original_threshold = model.threshold
                    print('Anomaly threshold: ', original_threshold)
                    for th in thresholds:
                        model.threshold = original_threshold + th
                        print('\n Lazy ', model.threshold)
                        print("Anomaly accuracy")
                        y_pred = model.predict(x_test, classifier=False)

                        print(classification_report(y_true, y_pred))
                        report_dict = classification_report(
                            y_true, y_pred, output_dict=True)
                        record = get_classification_report_record(
                            report_dict)
                        record.update(grid)
                        record['with_lazy'] = th
                        ds_res.append(record)
                else:
                    print("Anomaly accuracy")
                    y_pred = model.predict(x_test)

                    print(classification_report(y_true, y_pred))
                    report_dict = classification_report(
                        y_true, y_pred, output_dict=True)
                    record = get_classification_report_record(report_dict)
                    record.update(grid)
                    ds_res.append(record)

            ds_res = pd.DataFrame(ds_res)
            if save_result:
                os.makedirs(output_dir, exist_ok=True)
                name = '_'.join(str(x) for x in selected_states)
                filename = os.path.join(
                    output_dir,
                    'results_grid_anomaly__{}__{}.csv'.format(
                        name, model_type))
                ds_res.to_csv(filename, index=True)
def main():
    """Train on packs of files from several states and test on the rest.

    A train pack holds the i-th file of each of the first three state
    folders. For every pack a ``model_type`` model is fitted on the
    stacked sliding windows of the readable pack files and then evaluated
    on every file that is not part of the pack. One record per
    (pack, test file) is collected and the table is written to
    ``./results/results_multi_<model_type>.csv``.

    A pack in which no file can be read is skipped instead of failing in
    ``np.vstack``; an empty list of state folders yields no pack.
    """
    params = get_argument()
    all_state_folder = params['all_state_folder']
    features_list = params['features_list']
    kernel = params['kernel']
    stride = params['stride']
    model_type = params['model_type']
    resample_rate = params.get('resample_rate', 6400)
    with_decision_score = params.get('with_decision_score', False)
    custom_resample = params.get('custom_resample', False)

    with_skip = False
    pack_size = 3
    params_file = './params/params_{}.json'.format(model_type)
    save_result = True
    overwrite = True
    output_dir = './results'
    result_array = []

    def read_resampled(path):
        """Return ``(dataset, original_length)`` or ``(None, 0)``."""
        ds = read_ds_lvm(path, get_header=False)
        if ds is None or ds.empty:
            return None, 0
        ds = ds[features_list]
        original_len = len(ds)
        if custom_resample:
            ds = resample_with_feature_extractor(ds, resample_rate)
        else:
            ds = resample(ds, resample_rate)
        return ds, original_len

    # One list of files per state, plus the flat list of all test files
    curr_files = []
    test_files = []
    for folder in all_state_folder:
        files = get_files(folder, ext='lvm')
        curr_files.append(files)
        test_files += files

    # Only complete packs (one file from each of the first `pack_size`
    # states) are trained; with fewer states there is nothing to train.
    train_folders = curr_files[:pack_size]
    train_files = []
    if len(train_folders) == pack_size:
        max_size = min(len(files) for files in train_folders)
        train_files = [[files[i] for files in train_folders]
                       for i in range(max_size)]

    for train_pack in train_files:
        print('\n' + '\\\\//' * 20)
        selected_files = []
        train_states = []
        x_states = []

        print('\n Train Pack')
        for train_file in train_pack:
            train_state = os.path.split(os.path.dirname(train_file))[-1]
            print("State: ", train_state)
            print("Read File: ", os.path.basename(train_file))

            ds_train, train_len = read_resampled(train_file)
            if ds_train is None:
                print('Impossible read train file')
                continue

            print('Original File Length: ', train_len)
            print('New File Length {} {:.02f}'.format(
                len(ds_train), 100 * len(ds_train) / train_len))

            # Create training set
            print("Create set")
            x_train = get_sliding_window_matrix(ds_train.values, kernel,
                                                stride)
            print('Shape ', x_train.shape)

            selected_files.append(train_file)
            train_states.append(train_state)
            x_states.append(x_train)

        if not x_states:
            # np.vstack raises on an empty sequence
            print('Skip train pack: no readable train file')
            continue

        x_states = np.vstack(x_states)
        print('\n Train Size: ', x_states.shape)
        print('Train state: ', train_states)

        # Model initialization
        print("Model initialization: {}".format(model_type))
        model = get_model(model_type, params_file=params_file)

        # Training
        print("Training...")
        model.fit(x_states)

        for test_file in test_files:
            # Files used for training are never tested
            if test_file in selected_files:
                continue

            test_state = os.path.split(os.path.dirname(test_file))[-1]
            print("\n State Test: ", test_state)
            print("Read Test File: ", os.path.basename(test_file))

            ds_test, test_len = read_resampled(test_file)
            if ds_test is None:
                print('Impossible read test file')
                continue

            print('Test Original File Length: ', test_len)
            print('New File Length {} {:.02f}'.format(
                len(ds_test), 100 * len(ds_test) / test_len))

            # with_skip selects non-overlapping test windows
            test_stride = kernel if with_skip else 1

            # Create set
            print("Create testing set")
            x_test = get_sliding_window_matrix(ds_test.values, kernel,
                                               test_stride)
            print('Test shape ', x_test.shape)

            # Testing
            print('Testing...')
            if with_decision_score:
                y_pred = model.decision_score(x_test)
            else:
                y_pred = model.predict(x_test)

            num_error = np.sum(y_pred > 0)
            mean_error = np.mean(y_pred)
            if num_error > 0:
                mean_only_error = np.mean(y_pred[y_pred > 0])
            else:
                mean_only_error = 0

            if not num_error:
                print("Results: NO Anomaly founded")
            else:
                print("Results: {} anomalies "
                      "({:.05f} total {})".format(num_error, mean_error,
                                                  len(x_test)))

            result_array.append({
                'MODEL': model_type,
                'KERNEL': kernel,
                'STRIDE': stride,
                'TRAIN_STATE': train_states,
                'TRAIN': [
                    os.path.basename(train_file)
                    for train_file in selected_files
                ],
                'TEST_STATE': test_state,
                'TEST': os.path.basename(test_file),
                'NUM_SINGLE_ANOMALY': num_error,
                'PCT_ANOMALY': mean_error,
                'NUM_SAMPLE_ANOMALY': mean_only_error,
                'NUM_SAMPLE': len(x_test),
                'LABEL': test_state not in train_states
            })

    # Written once, after all packs, so that appending to a previous file
    # (overwrite=False) cannot duplicate records.
    if save_result:
        os.makedirs(output_dir, exist_ok=True)
        filename = os.path.join(output_dir,
                                'results_multi_' + model_type + '.csv')
        result_ds = pd.DataFrame(result_array)
        if os.path.isfile(filename) and not overwrite:
            prev_result_ds = pd.read_csv(filename)
            result_ds = pd.concat([prev_result_ds, result_ds], axis=0,
                                  ignore_index=True)
        result_ds.to_csv(filename, index=False)
def main():
    """Compare clusterings of raw and model-encoded sliding windows.

    For each training file index (1, then 2) the file at that index of
    every state is used for training and the other files for testing. For
    each kernel size the test windows are clustered with three algorithms,
    first on the flattened raw windows and then on the output of the
    encoder of each deep model trained on the shuffled training windows.
    The three agreement scores against the state labels are written to
    ``./results/results_<train_id>_cluster_<source>_<kernel>.csv``.
    """
    params = get_argument()
    state_folders = params['all_state_folder']
    features_list = params['features_list']

    resample_rate = 6400
    stride = 1
    epochs = 500
    transform_type = 'minmax'
    save_result = True
    output_dir = './results'

    cluster_models = {
        'agglomerative': AgglomerativeClustering,
        'kmeans': KMeans,
        'spectral': SpectralClustering
    }

    def score_clusterings(samples, labels):
        """Cluster `samples` with every algorithm and score vs `labels`."""
        scores = {}
        for cluster_name, cluster_model in cluster_models.items():
            print('\n', cluster_name)
            print(samples.shape)
            # ToDo: remove n_clusters params
            y_pred = cluster_model(n_clusters=4).fit_predict(samples)
            scores[cluster_name] = {
                'adjusted_mutual_info_score':
                    adjusted_mutual_info_score(labels, y_pred),
                'adjusted_rand_score': adjusted_rand_score(labels, y_pred),
                'homogenity_score': homogeneity_score(labels, y_pred)
            }
            print(scores[cluster_name])
        return scores

    def store(scores, train_id, source, kernel):
        """Turn the scores into a table and save it when enabled."""
        table = pd.DataFrame(scores)
        if not save_result:
            return
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir, exist_ok=True)
        target = os.path.join(
            output_dir,
            'results_{}_cluster_{}_{}.csv'.format(train_id, source, kernel))
        table.to_csv(target, index=True)

    for train_id in (1, 2):
        skip_list = []
        train_list = [train_id]

        train_sets, train_labels = [], []
        test_sets, test_labels = [], []

        # Read train and test files
        print('Read all datasets')
        for state_id, folder in enumerate(state_folders):
            print('\nRead state: ', os.path.basename(folder))
            files = get_files(folder, ext='lvm')

            # Fall back to the file at index 1 when the requested index
            # does not exist in this folder
            selected_train_id = [
                x for x in range(len(files)) if x in train_list
            ] or [1]

            for i, filename in enumerate(files):
                if i in skip_list:
                    print('Skip: {}'.format(filename))
                    continue

                ds = resample(
                    read_ds_lvm(filename, get_header=False)[features_list],
                    resample_rate)

                is_train = i in selected_train_id
                print('{} state {} file: {}'.format(
                    'Train' if is_train else 'Test', state_id, filename))
                if is_train:
                    train_sets.append(ds)
                    train_labels.append(state_id)
                else:
                    test_sets.append(ds)
                    test_labels.append(state_id)

        # Apply transform
        transformer = None
        if not transform_type:
            print('No transform selected')
            x_train_list, x_test_list = train_sets, test_sets
        else:
            print('Apply transform: ', transform_type)
            x_train_list, transformer = transform_data(train_sets,
                                                       transform_type)
            x_test_list = [
                apply_transform(ds, transformer) for ds in test_sets
            ]

        for kernel in (40, 80, 120, 200, 240, 360):
            # Create train and test matrix set
            x_train, y_train = prepare_data(x_train_list,
                                            labels=train_labels,
                                            kernel=kernel,
                                            stride=stride)
            x_test, y_test = prepare_data(x_test_list,
                                          labels=test_labels,
                                          kernel=kernel,
                                          stride=stride)

            print('Train size: ', x_train.shape)
            print('Train label size: ', y_train.shape)
            print('Test size: ', x_test.shape)
            print('Test label size: ', y_test.shape)

            shuffled = np.random.permutation(len(x_train))
            x_new = x_train[shuffled]
            y_new = y_train[shuffled]

            # Baseline: cluster the flattened raw windows
            raw_scores = score_clusterings(
                x_test.reshape(len(x_test), -1), y_test)
            store(raw_scores, train_id, 'raw', kernel)

            for model_type in ('cnn', 'deep', 'lstm', 'bilstm'):
                # Model initialization
                print("Model initialization: {}".format(model_type))
                model = get_model(model_type)

                # Training
                print("Training...")
                model.fit(x=x_new, epochs=epochs, verbose=2)

                # Cluster the flattened encoder output of the test windows
                enc_pred = model.encoder.predict(x_test)
                enc_pred = enc_pred.reshape((len(x_test), -1))

                store(score_clusterings(enc_pred, y_test), train_id,
                      model_type, kernel)
def main():
    """Detect a held-out state as anomaly and report per-feature errors.

    Each state in turn is held out: the file at index 1 of every other
    state is used for training, all other non-skipped files for testing.
    For each deep model the classification report (held-out state = 1) is
    saved to ``results_anomaly_<state>_<model>_.csv``. The windows of the
    held-out state are then reconstructed with ``model.model`` and the
    mean absolute error per feature of every window is saved to
    ``results_locate__<kernel>__<state>__<model>.csv``.
    """
    params = get_argument()
    state_folders = params['all_state_folder']
    features_list = params['features_list']

    resample_rate = 6400
    kernel = 120  # 40, 80, 120, 200
    stride = 1
    transform_type = 'minmax'  # 'std', 'minmax', None
    epochs = 200
    save_result = True
    output_dir = './results'

    model_params = {
        'with_lazy': 0.02,  # 0.00, 0.01, 0.015, 0.02
    }

    skip_list = [0]
    train_list = [1]

    def save_table(table, file_name, with_index):
        """Write `table` into the output directory when saving is on."""
        if not save_result:
            return
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir, exist_ok=True)
        table.to_csv(os.path.join(output_dir, file_name), index=with_index)

    for selected_state_id, _ in enumerate(state_folders):
        train_sets, train_labels = [], []
        test_sets, test_labels = [], []

        # Read train and test files
        print('Evaluation state: {}'.format(selected_state_id))
        for state_id, folder in enumerate(state_folders):
            print('Read state: ', os.path.basename(folder))

            for i, filename in enumerate(get_files(folder, ext='lvm')):
                if i in skip_list:
                    continue

                ds = read_ds_lvm(filename, get_header=False)
                ds = resample(ds[features_list], resample_rate)

                # The held-out state never contributes to training
                held_out = state_id == selected_state_id
                if i in train_list and not held_out:
                    train_sets.append(ds)
                    print('Train state {} file: {}'.format(state_id,
                                                           filename))
                    train_labels.append(state_id)
                else:
                    test_sets.append(ds)
                    print('Test state {} file: {}'.format(state_id,
                                                          filename))
                    test_labels.append(state_id)

        # Apply transform
        transformer = None
        if not transform_type:
            print('No transform selected')
            x_train_list, x_test_list = train_sets, test_sets
        else:
            print('Apply transform: ', transform_type)
            x_train_list, transformer = transform_data(train_sets,
                                                       transform_type)
            x_test_list = [
                apply_transform(ds, transformer) for ds in test_sets
            ]

        # Create train and test matrix set
        x_train, y_train = prepare_data(x_train_list, labels=train_labels,
                                        kernel=kernel, stride=stride)
        x_test, y_test = prepare_data(x_test_list, labels=test_labels,
                                      kernel=kernel, stride=stride)

        print('Train size: ', x_train.shape)
        print('Train label size: ', y_train.shape)
        print('Test size: ', x_test.shape)
        print('Test label size: ', y_test.shape)

        shuffled = np.random.permutation(len(x_train))
        x_new = x_train[shuffled]
        y_new = y_train[shuffled]

        for model_type in ('cnn', 'deep', 'lstm', 'bilstm'):
            # Model initialization
            print("Model initialization: {}".format(model_type))
            model = get_deep_model(model_type, model_params=model_params)

            # Training
            print("Training...")
            model.fit(x=x_new, epochs=epochs, verbose=2)

            print("Anomaly accuracy")
            y_pred = model.predict(x_test, classifier=False)

            y_true = np.zeros(len(y_test))
            y_true[y_test == selected_state_id] = 1
            print(classification_report(y_true, y_pred))

            report = pd.DataFrame(
                classification_report(y_true, y_pred, output_dict=True))
            save_table(
                report,
                'results_anomaly_{}_{}_.csv'.format(selected_state_id,
                                                    model_type),
                True)

            print("Locate Anomaly")
            x_selected = x_test[y_test == selected_state_id]
            y_selected = y_test[y_test == selected_state_id]
            x_reconstructed = model.model.predict(x_selected)

            rows = []
            for i, x_true in enumerate(x_selected):
                x_pred = x_reconstructed[i]
                # Compare in the original units when a transform is used
                if transformer is not None:
                    x_true = transformer.inverse_transform(x_true)
                    x_pred = transformer.inverse_transform(x_pred)

                diff = np.mean(np.abs(x_true - x_pred), axis=0)
                row = dict(zip(features_list, diff))
                row['threshold'] = model.threshold
                row['score'] = y_selected[i]
                rows.append(row)

            save_table(
                pd.DataFrame(rows),
                'results_locate__{}__{}__{}.csv'.format(
                    kernel, selected_state_id, model_type),
                False)
def main():
    """Train state classifiers and save their classification reports.

    For each training file index (1, 2, then 0) the file at that index of
    every state is used for training and the other files for testing. For
    each kernel size and each model type the model is fitted on the
    shuffled training windows with their state labels, and the
    classification report on the test windows is written to
    ``./results/results_<train_id>_accuracy_<model>_<kernel>.csv``.
    """
    params = get_argument()
    state_folders = params['all_state_folder']
    features_list = params['features_list']

    resample_rate = 6400
    stride = 1
    epochs = 300
    transform_type = 'minmax'
    save_result = True
    output_dir = './results'

    kernels = (40, 80, 120, 200, 240, 360)
    model_types = ('classifier', 'linear', 'cnn', 'deep', 'lstm', 'bilstm')

    for train_id in (1, 2, 0):
        skip_list = []
        train_list = [train_id]

        # (datasets, labels) per split
        split = {'Train': ([], []), 'Test': ([], [])}

        # Read train and test files
        print('Read all datasets')
        for state_id, folder in enumerate(state_folders):
            print('\nRead state: ', os.path.basename(folder))
            files = get_files(folder, ext='lvm')

            selected_train_id = [
                x for x in range(len(files)) if x in train_list
            ]
            if len(selected_train_id) == 0:
                # Requested index missing in this folder: use index 1
                selected_train_id = [1]

            for i, filename in enumerate(files):
                if i in skip_list:
                    print('Skip: {}'.format(filename))
                    continue

                ds = read_ds_lvm(filename, get_header=False)
                ds = ds[features_list]
                ds = resample(ds, resample_rate)

                role = 'Train' if i in selected_train_id else 'Test'
                print('{} state {} file: {}'.format(role, state_id,
                                                    filename))
                datasets, labels = split[role]
                datasets.append(ds)
                labels.append(state_id)

        ds_train_list, y_train_list = split['Train']
        ds_test_list, y_test_list = split['Test']

        # Apply transform
        transformer = None
        if transform_type:
            print('Apply transform: ', transform_type)
            x_train_list, transformer = transform_data(ds_train_list,
                                                       transform_type)
            x_test_list = [
                apply_transform(ds, transformer) for ds in ds_test_list
            ]
        else:
            print('No transform selected')
            x_train_list = ds_train_list
            x_test_list = ds_test_list

        for kernel in kernels:
            # Create train and test matrix set
            x_train, y_train = prepare_data(x_train_list,
                                            labels=y_train_list,
                                            kernel=kernel,
                                            stride=stride)
            x_test, y_test = prepare_data(x_test_list,
                                          labels=y_test_list,
                                          kernel=kernel,
                                          stride=stride)

            for title, array in (('Train size: ', x_train),
                                 ('Train label size: ', y_train),
                                 ('Test size: ', x_test),
                                 ('Test label size: ', y_test)):
                print(title, array.shape)

            # Shuffle windows and labels with the same permutation
            shuffled = np.random.permutation(len(x_train))
            x_new = x_train[shuffled]
            y_new = y_train[shuffled]

            for model_type in model_types:
                # Model initialization
                print("Model initialization: {}".format(model_type))
                model = get_model(model_type)

                # Training
                print("Training...")
                model.fit(x=x_new, y=y_new, epochs=epochs, batch_size=32,
                          verbose=2)

                y_pred = model.predict(x_test, classifier=True)
                print(classification_report(y_test, y_pred))

                report = classification_report(y_test, y_pred,
                                               output_dict=True)
                ds_res = pd.DataFrame(report)

                if not save_result:
                    continue
                if not os.path.isdir(output_dir):
                    os.makedirs(output_dir, exist_ok=True)
                target = os.path.join(
                    output_dir,
                    'results_{}_accuracy_{}_{}.csv'.format(
                        train_id, model_type, kernel))
                ds_res.to_csv(target, index=True)
def main():
    """Train on runs of consecutive same-state files, test on all files.

    For every model type and every start position in the flat file list,
    the next ``size`` files are stacked into one training set; the start
    position is skipped when the run leaves the list, crosses into another
    state or hits an unreadable file. The fitted model is evaluated on
    every file except the first file of the run, and one record per test
    file is appended to a shared result list that is written to
    ``./results/results_single_<size><model_type>.csv`` after each model.
    """
    params = get_argument()
    state_folders = params['all_state_folder']

    size = 3
    features_list = [
        "Acceleration_X1", "Acceleration_Y1", "Acceleration_Z1",
        "Acceleration_X2", "Acceleration_Y2", "Acceleration_Z2",
        "Acceleration_X3", "Acceleration_Y3", "Acceleration_Z3"
    ]
    stride = 1

    model_list = [
        'cnn',
        'lstm',
        'deep',
        'isolation_forest',
        'setup_clustering',
        'pca',
        'lof',
        'svm',
    ]
    # Neural models use long windows, the other detectors short ones
    kernel_list = [
        180 if name in ['cnn', 'lstm', 'deep'] else 10
        for name in model_list
    ]

    resample_rate = 6400
    save_result = True
    output_dir = './results'

    # Result records of every train and test step.
    # NOTE(review): this list is never reset between models, so the file
    # saved for a model also holds the rows of the models before it.
    result_array = []

    # Files from the selected folders are used both to train and to test
    curr_files = []
    for folder in state_folders:
        curr_files += get_files(folder, ext='lvm')[:]
    test_files = curr_files

    def state_of(path):
        """Return the name of the folder that contains `path`."""
        return os.path.split(os.path.dirname(path))[-1]

    def collect_train_windows(pos, train_state, kernel):
        """Return the window matrices of the run at `pos`, or None."""
        windows = []
        for i in range(size):
            if pos + i >= len(curr_files):
                print('Not enough files')
                return None

            tmp_file = curr_files[pos + i]
            if state_of(tmp_file) != train_state:
                print('Different state and skip current train')
                return None

            print("Read {} Train File: {}".format(
                i, os.path.basename(tmp_file)))
            ds_tmp = read_ds_lvm(tmp_file, get_header=False)

            # Check train
            if ds_tmp is None or ds_tmp.empty:
                print('Impossible read train file')
                return None

            # Select features, resample and create the training windows
            ds_tmp = resample(ds_tmp[features_list], resample_rate)
            windows.append(
                get_sliding_window_matrix(ds_tmp.values, kernel, stride))
        return windows

    for model_type, kernel in zip(model_list, kernel_list):
        print('\n' + '\\\\//' * 20)
        print('\n Model: {}\n'.format(model_type))
        params_file = './params/params_{}.json'.format(model_type)

        for pos, train_file in enumerate(curr_files):
            train_state = state_of(train_file)
            print("\n State Train: ", train_state)

            windows = collect_train_windows(pos, train_state, kernel)
            if windows is None:
                print('Skip current train')
                continue

            # Train set
            x_train = np.vstack(windows)
            train_len = len(x_train)
            print('\nTrain size: {}\n'.format(x_train.shape))

            # Model initialization
            print("Model initialization: {}".format(model_type))
            model = get_model(model_type, params_file=params_file)

            # Training
            print("Training...")
            model.fit(x_train)

            for test_file in test_files:
                test_state = state_of(test_file)
                # Only the first file of the training run is excluded
                if test_state == train_state and test_file == train_file:
                    continue

                print("\n State Test: ", test_state)
                print("Read Test File: ", os.path.basename(test_file))
                ds_test = read_ds_lvm(test_file, get_header=False)

                # Check test
                if ds_test is None or ds_test.empty:
                    print('Impossible read test file')
                    continue

                # Select features
                ds_test = ds_test[features_list]

                # Resample
                test_len = len(ds_test)
                ds_test = resample(ds_test, resample_rate)

                print('Test Original File Length: ', test_len)
                print('New File Length {} {:.02f}'.format(
                    len(ds_test), 100 * len(ds_test) / test_len))

                test_stride = 1

                # Create set
                print("Create testing set")
                x_test = get_sliding_window_matrix(ds_test.values, kernel,
                                                   test_stride)
                print('Test shape ', x_test.shape)

                # Testing
                print('Testing...')
                y_pred = model.predict(x_test)

                anomalous = y_pred > 0
                num_error = np.sum(anomalous)
                mean_error = np.mean(y_pred)
                mean_only_error = 0
                if num_error > 0:
                    mean_only_error = np.mean(y_pred[anomalous])

                if np.sum(y_pred > 0):
                    print("Results: {} anomalies "
                          "({:.05f} total {})".format(num_error,
                                                      mean_error,
                                                      len(x_test)))
                else:
                    print("Results: NO Anomaly founded")

                result_array.append({
                    'MODEL': model_type,
                    'KERNEL': kernel,
                    'STRIDE': stride,
                    'TRAIN_STATE': train_state,
                    'TRAIN': os.path.basename(train_file),
                    'TRAIN_SIZE': train_len,
                    'TEST_STATE': test_state,
                    'TEST': os.path.basename(test_file),
                    'TEST_LEN': test_len,
                    'NUM_SINGLE_ANOMALY': num_error,
                    'PCT_ANOMALY': mean_error,
                    'NUM_SAMPLE_ANOMALY': mean_only_error,
                    'NUM_SAMPLE': len(x_test),
                    'LABEL': train_state != test_state
                })

        if save_result:
            if not os.path.isdir(output_dir):
                os.makedirs(output_dir, exist_ok=True)

            filename = os.path.join(
                output_dir,
                'results_single_{}'.format(size) + model_type + '.csv')

            result_ds = pd.DataFrame(result_array)
            result_ds.to_csv(filename, index=False)