def filterData(self, series=None, unwanted_columns=None):
    """Train an autoencoder on *series* and compute per-row reconstruction error.

    Splits the frame into train/validation partitions, trains an H2O
    autoencoder on the non-excluded columns, and derives the reconstruction
    error used downstream to filter anomalous rows.

    :param series: pandas DataFrame of input data (defaults to an empty frame).
    :param unwanted_columns: column names to exclude from training (defaults to none).
    """
    # BUG FIX: the original defaults were mutable objects
    # (pd.DataFrame() and []) — evaluated once and shared across calls.
    # Use None sentinels instead.
    if series is None:
        series = pd.DataFrame()
    if unwanted_columns is None:
        unwanted_columns = []

    # Setup autoencoder model
    anomaly_model = H2OAutoEncoderEstimator(
        activation=self.activation,
        hidden=self.layers,
        l1=1e-4,
        epochs=self.epochs,
    )

    # Split data frame into validation / training partitions
    # (fixed random_state keeps the split reproducible)
    pValidate = series.sample(frac=self.validation_ratio, random_state=200)
    pTrain = series.drop(pValidate.index)

    # Convert pandas to h2o frame - for anomaly detection
    hValidate = h2o.H2OFrame(pValidate)
    hValidate.set_names(list(pValidate.columns))
    hTrain = h2o.H2OFrame(pTrain)
    hTrain.set_names(list(pTrain.columns))

    # Select columns used for training
    train_columns = [x for x in list(series.columns) if x not in unwanted_columns]

    # Train model
    anomaly_model.train(x=train_columns, training_frame=hTrain, validation_frame=hValidate)

    # Get reconstruction error. get_frame_data() returns the frame as CSV
    # text: skip the header row and the trailing empty line.
    reconstruction_error = anomaly_model.anomaly(test_data=hTrain, per_feature=False)
    error_str = reconstruction_error.get_frame_data()
    # BUG FIX: in Python 3 map() returns an iterator and np.array(iterator)
    # produces a useless 0-d object array — materialize the list first.
    err_list = np.array(list(map(float, error_str.split("\n")[1:-1])))

    if self.anomaly_remove_function == 'iqr':
        # BUG FIX: Python 2 `print ""` statement -> print() function.
        # NOTE(review): the chunk appears truncated here in the original;
        # the IQR branch only emitted an empty line — confirm against the
        # full file.
        print("")
h_test.set_names(list(p_test.columns)) # Select columns for AutoEncoder ac_train_columns = list(p_data.columns) # Define autoencoder train columns rm_columns = ['RUL', 'UnitNumber', 'Time', 'Setting1', 'Setting2', 'Setting3'] # Columns need to be removed ''' Because we are using auto encoders to remove noises in sensor readings. So we have to select only sensor readings ''' for column in rm_columns: ac_train_columns.remove(column) # Define AutoEncoder model auto_encoder_model = H2OAutoEncoderEstimator(activation="Tanh", hidden=18, epochs=150, loss='Quadratic', distribution='gaussian') # Train AutoEncoder model auto_encoder_model.train(x=ac_train_columns, training_frame=h_train, validation_frame=h_validate) # Get reconstruction error reconstruction_error = auto_encoder_model.anomaly(test_data=h_train, per_feature=False) error_str = reconstruction_error.get_frame_data() err_list = list(map(float, error_str.split("\n")[1:-1])) # Filter anomalies in reconstruction error
# --- AutoEncoder anomaly-removal stage -------------------------------------
# Load train/test sets with binary classification labels attached.
p_train = ProcessData.trainData(bin_classification=True)
p_test = ProcessData.testData(bin_classification=True)

# Mirror the pandas frames into H2O frames, preserving column names.
h_train = h2o.H2OFrame(p_train)
h_train.set_names(list(p_train.columns))
h_test = h2o.H2OFrame(p_test)
h_test.set_names(list(p_test.columns))

# Autoencoder used purely for anomaly scoring.
anomaly_model = H2OAutoEncoderEstimator(activation="Rectifier", hidden=[25, 12, 25], sparse=True, l1=1e-4, epochs=100)

# Train only on sensor readings: strip the label, identifiers and settings.
anomaly_train_columns = list(p_train.columns)
print(anomaly_train_columns)
for excluded in ('RUL', 'BIN', 'UnitNumber', 'Time', 'Setting1', 'Setting2', 'Setting3'):
    anomaly_train_columns.remove(excluded)

# Train model
# Persist the training partition for inspection / reuse
pTrain.to_csv("Auto-Train.csv", index=False)

# Select relevant features: drop the response and identifier columns
anomaly_train_columns = list(hTrain.columns)
anomaly_train_columns.remove(response_column)
anomaly_train_columns.remove('UnitNumber')
anomaly_train_columns.remove('Time')

# NOTE(review): column_count is not used within this fragment — presumably
# consumed later in the file; kept for compatibility.
column_count = len(anomaly_train_columns)
layers = [20, 6, 20]
# BUG FIX: Python 2 print statement -> print() function (same output)
print("Layers:", layers)

# Define model
anomaly_model = H2OAutoEncoderEstimator(
    activation="Rectifier",
    hidden=layers,
    l1=1e-4,
    epochs=100,
)

# Train model
anomaly_model.train(x=anomaly_train_columns, training_frame=hTrain, validation_frame=hValidate)

# Get reconstruction error. get_frame_data() returns CSV text: skip the
# header row and the trailing empty line.
reconstruction_error = anomaly_model.anomaly(test_data=hTrain, per_feature=False)
error_str = reconstruction_error.get_frame_data()
# BUG FIX: in Python 3 map() returns an iterator and np.array(iterator)
# yields a 0-d object array — materialize the list first.
err_list = np.array(list(map(float, error_str.split("\n")[1:-1])))
# NOTE(review): this fragment begins mid-function — the enclosing `def`
# (which reads parameters `data_column` and `data_type`) is outside this
# view. The leading `return` appears to close a guard branch above the
# visible code — confirm indentation against the full file.
        return
    # get_frame_data() returns the column as CSV text: header row first,
    # trailing empty line last — keep only the data rows.
    value_str = data_column.get_frame_data()
    splitter_list = value_str.split("\n")[1:-1]
    if data_type == "real":
        # Numeric column: parse every row as float
        return list(map(float, splitter_list))
    elif data_type == "enum":
        # Categorical column: return the raw string labels
        return splitter_list


# ---- script section: append reconstruction error as a feature ----
h2o.init()

# Train an autoencoder on the training data and attach the per-row
# reconstruction error as a new column.
pd_train = pd.read_csv('na_filled_random_forest.csv')
training_frame = h2o.H2OFrame(pd_train)
columns = list(pd_train.columns)
anomaly_model = H2OAutoEncoderEstimator()
anomaly_model.train(x=columns, training_frame=training_frame)
reconstruction_error = anomaly_model.anomaly(test_data=training_frame, per_feature=False)
reconstruction_error = list(map(float, h2OColumnToList(reconstruction_error)))
pd_train['reconstruction_error'] = reconstruction_error

# Same for the test set.
# NOTE(review): a *new* autoencoder is trained on the test data itself
# instead of reusing the training model (unlike the SJ/IQ section later
# in the file, which reuses one model) — verify this is intentional.
pd_test = pd.read_csv('dataset/dengue_features_test.csv')
testing_frame = h2o.H2OFrame(pd_test)
columns = list(pd_test.columns)
anomaly_model = H2OAutoEncoderEstimator()
anomaly_model.train(x=columns, training_frame=testing_frame)
reconstruction_error = anomaly_model.anomaly(test_data=testing_frame, per_feature=False)
reconstruction_error = list(map(float, h2OColumnToList(reconstruction_error)))
pd_test['reconstruction_error'] = reconstruction_error
def function():
    """AutoEncoder-based anomaly removal followed by binary classification.

    Pipeline: load engineered train/test data, train an autoencoder to
    score per-row reconstruction error, drop rows above a threshold,
    remove low-value sensor columns, then train a deep-learning binary
    classifier on the filtered data and report a confusion matrix.
    """
    # AutoEncoder anomaly removal process
    p_train = ProcessData.trainData(moving_median_centered_average=True, standard_deviation=True, probability_distribution=True, bin_classification=True)
    p_test = ProcessData.testData(moving_median_centered_average=True, standard_deviation=True, probability_from_file=True, bin_classification=True)

    # Converting to h2o frane
    h_test = h2o.H2OFrame(p_test)
    h_test.set_names(list(p_test.columns))
    h_train = h2o.H2OFrame(p_train)
    h_train.set_names(list(p_train.columns))

    # Define autoencoder
    anomaly_model = H2OAutoEncoderEstimator(activation="Rectifier", hidden=[25, 12, 25], sparse=True, l1=1e-4, epochs=100)

    # Select relevant features: train only on sensor readings, excluding
    # the labels (RUL, BIN), identifiers and operational settings
    anomaly_train_columns = list(p_train.columns)
    print(anomaly_train_columns)
    anomaly_train_columns.remove('RUL')
    anomaly_train_columns.remove('BIN')
    anomaly_train_columns.remove('UnitNumber')
    anomaly_train_columns.remove('Time')
    anomaly_train_columns.remove('Setting1')
    anomaly_train_columns.remove('Setting2')
    anomaly_train_columns.remove('Setting3')

    # Train model
    anomaly_model.train(x=anomaly_train_columns, training_frame=h_train)

    # Get reconstruction error. get_frame_data() returns CSV text:
    # drop the header row and the trailing empty line.
    reconstruction_error = anomaly_model.anomaly(test_data=h_train, per_feature=False)
    error_str = reconstruction_error.get_frame_data()
    err_list = list(map(float, error_str.split("\n")[1:-1]))
    err_list = np.array(err_list)

    # Threshold: keep rows whose error is below 97% of the maximum
    threshold = np.amax(err_list) * 0.97

    print("Max Reconstruction Error :", reconstruction_error.max())
    print("Threshold Reconstruction Error :", threshold)

    # Filter anomalies based on reconstruction error
    p_filter = Filter.filterDataAutoEncoder(panda_frame=p_train, reconstruction_error=err_list, threshold=threshold)

    # Drop features judged uninformative for classification
    del p_filter['Setting3']
    del p_filter['Sensor1']
    del p_filter['Sensor5']
    del p_filter['Sensor10']
    del p_filter['Sensor16']
    del p_filter['Sensor18']
    del p_filter['Sensor19']

    h_filter = h2o.H2OFrame(p_filter)
    h_filter.set_names(list(p_filter.columns))

    # NOTE(review): h_test is re-created here from the unchanged p_test —
    # appears redundant with the frame built above; confirm.
    h_test = h2o.H2OFrame(p_test)
    h_test.set_names(list(p_test.columns))

    # Classifier features: everything except identifiers and labels
    training_columns = list(p_filter.columns)
    training_columns.remove('UnitNumber')
    training_columns.remove('Time')
    training_columns.remove('RUL')
    training_columns.remove('BIN')

    # Mark the target as categorical so H2O treats this as classification
    h_filter['BIN'] = h_filter['BIN'].asfactor()
    h_test['BIN'] = h_test['BIN'].asfactor()

    # Train deep-learning classifier with 10-fold cross validation
    model = H2ODeepLearningEstimator(variable_importances=True)
    model.train(x=training_columns, y='BIN', training_frame=h_filter, nfolds=10)

    # Evaluate on the held-out test frame
    predict = model.predict(test_data=h_test)
    predict = DataFrameParser.h2oToList(predict['predict'])
    actual = DataFrameParser.h2oToList(h_test['BIN'])

    Measures.confusion_matrix(actual, predict)
    print(predict)
    print(actual)
# BUG FIX: this chunk used Python 2 print statements, which are syntax
# errors under Python 3 (used elsewhere in the file, e.g. print(...) calls).
# Converted to print() calls; output is unchanged.
print(list(iq_train.columns))
print("")

h2o.init()

print("Adding Reconstruction Error")
print("---------------------------")
print("Applying to SJ Train")
print("---------------------------")

# Train an autoencoder on the SJ training features (label excluded)
columns = list(sj_train.columns)
columns.remove('total_cases')
sj_training_frame = h2o.H2OFrame(sj_train)
sj_training_frame.set_names(list(sj_train.columns))
sj_testing_frame = h2o.H2OFrame(sj_test)
sj_testing_frame.set_names(list(sj_test.columns))
sj_model = H2OAutoEncoderEstimator()
sj_model.train(x=columns, training_frame=sj_training_frame)

# Score both train and test frames with the SAME model and attach the
# per-row reconstruction error as a new feature column.
sj_reconstruction_error = sj_model.anomaly(test_data=sj_training_frame, per_feature=False)
sj_reconstruction_error = list(
    map(float, h2OColumnToList(sj_reconstruction_error)))
sj_reconstruction_error_test = sj_model.anomaly(test_data=sj_testing_frame, per_feature=False)
sj_reconstruction_error_test = list(
    map(float, h2OColumnToList(sj_reconstruction_error_test)))
sj_train['reconstruction_error'] = sj_reconstruction_error
sj_test['reconstruction_error'] = sj_reconstruction_error_test

print("")
print("Applying to IQ Train")