def test_make_counting_grid(self):
    """Test counting grid for a base "well-known" scenario.

    Loads a saved SEPP prediction to reuse its grid geometry, builds a
    counting grid from the SIEDCO points, and pins offsets, cell size,
    matrix shape and min/max counts to previously validated values.
    """
    ## Get grid prediction, to use size and region params
    # Fixed: use a context manager so the pickle file is closed even if
    # loading raises (original open/load/close leaked on error).
    pkl_path = ('/Users/anamaria/Desktop/dev/security_project/'
                'aggressive_behavior_model/pkl/'
                'experiment_seppexp_10_2_siedco_prediction.pkl')
    with open(pkl_path, 'rb') as infile:
        loaded_siedco = pickle.load(infile)
    grid = loaded_siedco['prediction'].values[0]

    ## Select points to represent on counting matrix
    # Single source of truth for the CSV path (was duplicated inline).
    csv_path = ('/Users/anamaria/Desktop/dev/security_project/datasets/'
                'deduplicate_siedco_09062020.csv')
    df = pd.read_csv(csv_path)
    data = ProcessData("SIEDCO", csv_path)
    df_input = data.add_timestamp(df)
    timed_pts, _ = ProcessData.get_time_space_points(df_input,
                                                     data.dataset_dict)
    counting_matrix = prediction_metrics.make_counting_grid(grid, timed_pts)

    # Grid geometry must match the loaded prediction's region/cell size.
    self.assertEqual(counting_matrix.xoffset, 958645.8182116301)
    self.assertEqual(counting_matrix.yoffset, 904338.0678953262)
    self.assertEqual(counting_matrix.xsize, 150)
    self.assertEqual(counting_matrix.ysize, 150)
    self.assertEqual(counting_matrix._matrix.shape, (816, 343))
    self.assertEqual(counting_matrix._matrix.max(), 357)
    self.assertEqual(counting_matrix._matrix.min(), 0)
def test_hit_rate_1(self):
    """Hit rate must be 1.0 for every coverage when all real events
    fall on hotspots (events evaluated are the same ones used to
    build the prediction)."""
    csv_path = ("/Users/anamaria/Desktop/dev/security_project/datasets/"
                "deduplicate_siedco_09062020.csv")
    frame = pd.read_csv(csv_path)
    processor = ProcessData("SIEDCO", csv_path)
    stamped = processor.add_timestamp(frame)

    day = '2018-01-01'
    fields = processor.dataset_dict
    same_day = ProcessData.filter_by_date(stamped, fields, day, day)
    timed_pts, region = ProcessData.get_time_space_points(
        same_day, processor.dataset_dict)

    kernel = open_cp.naive.CountingGridKernel(grid_width=150, region=region)
    kernel.data = timed_pts
    grid_prediction = kernel.predict()

    coverages = [2, 4, 6, 8, 10]
    rates_default = prediction_metrics.measure_hit_rates(
        grid_prediction, timed_pts, coverages, 'default')
    rates_truth = prediction_metrics.measure_hit_rates(
        grid_prediction, timed_pts, coverages, 'ground_truth_coverage')

    self.assertEqual(rates_default,
                     {2: 1.0, 4: 1.0, 6: 1.0, 8: 1.0, 10: 1.0})
    self.assertEqual(rates_truth, {0.46187915216703573: 1.0})
def test_mse_1(self):
    """MSE must be 0 when prediction and ground truth are built from
    the same point set (identical matrices)."""
    csv_path = ("/Users/anamaria/Desktop/dev/security_project/datasets/"
                "deduplicate_siedco_09062020.csv")
    frame = pd.read_csv(csv_path)
    processor = ProcessData("SIEDCO", csv_path)
    stamped = processor.add_timestamp(frame)

    timed_pts, region = ProcessData.get_time_space_points(
        stamped, processor.dataset_dict)

    kernel = open_cp.naive.CountingGridKernel(grid_width=150, region=region)
    kernel.data = timed_pts
    grid_prediction = kernel.predict()

    # Prediction and ground truth derive from the same events, so the
    # squared-error sum must vanish.
    self.assertEqual(prediction_metrics.mse(grid_prediction, timed_pts), 0)
def test_hit_rate_2(self):
    """Hit rate must be 0.0 when no evaluated event falls on a hotspot:
    the prediction is built from SUBA events but evaluated on BOSA
    events (disjoint localities)."""
    csv_path = ("/Users/anamaria/Desktop/dev/security_project/datasets/"
                "deduplicate_siedco_09062020.csv")
    frame = pd.read_csv(csv_path)
    processor = ProcessData("SIEDCO", csv_path)
    stamped = processor.add_timestamp(frame)

    day = '2018-01-01'
    fields = processor.dataset_dict
    stamped = ProcessData.filter_by_date(stamped, fields, day, day)
    suba_events = ProcessData.filter_by_field(stamped, 'LOCALIDAD', 'SUBA')
    bosa_events = ProcessData.filter_by_field(stamped, 'LOCALIDAD', 'BOSA')

    # Train the naive counting prediction on SUBA only.
    timed_pts, region = ProcessData.get_time_space_points(
        suba_events, processor.dataset_dict)
    kernel = open_cp.naive.CountingGridKernel(grid_width=150, region=region)
    kernel.data = timed_pts
    grid_prediction = kernel.predict()

    # Evaluate on BOSA only: zero overlap with the hotspots.
    coverages = [2, 4, 6, 8, 10]
    eval_pts, _ = ProcessData.get_time_space_points(
        bosa_events, processor.dataset_dict)
    rates_default = prediction_metrics.measure_hit_rates(
        grid_prediction, eval_pts, coverages, 'default')
    rates_truth = prediction_metrics.measure_hit_rates(
        grid_prediction, eval_pts, coverages, 'ground_truth_coverage')

    self.assertEqual(rates_default,
                     {2: 0.0, 4: 0.0, 6: 0.0, 8: 0.0, 10: 0.0})
    self.assertEqual(rates_truth, {0.6632653061224489: 0.0})
class TestCase(unittest.TestCase):
    """Tests for ProcessData: dictionary setup, date/field filtering and
    matrix normalization, against local SIEDCO/NUSE/RNMC CSV snapshots.
    """

    def setUp(self):
        # Default fixture: the SIEDCO deduplicated snapshot.
        dataset_name = 'SIEDCO'
        head_path = '/Users/anamaria/Desktop/dev/security_project/datasets/'
        file = 'deduplicate_siedco_10032020.csv'
        dataset_path = head_path + file
        self.my_data = ProcessData(dataset_name, dataset_path)

    def test_set_up(self):
        """Constructor stores name/path and resolves the SIEDCO dict."""
        self.assertEqual(self.my_data.dataset_name, 'SIEDCO')
        self.assertEqual(
            self.my_data.dataset_path,
            '/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_10032020.csv'
        )
        self.assertEqual(self.my_data.dataset_dict, siedco_dict)

    def test_filter_by_date_case1(self):
        #case 1: date on interval, siedco
        df = pd.read_csv(self.my_data.dataset_path)
        df = self.my_data.add_timestamp(df)
        initial_date = '2018-01-01'
        final_date = '2018-01-01'
        dataset_dict = self.my_data.dataset_dict
        df_filtered = ProcessData.filter_by_date(df, dataset_dict,
                                                 initial_date, final_date)
        # Cross-check against a direct filter on the raw date column.
        df_expected = df.loc[df['FECHA_HECHO'] == "2018-01-01"]
        self.assertEqual(len(df_filtered), len(df_expected))

    def test_filter_by_date_case2(self):
        #case 2: initial date out of available data, siedco
        df = pd.read_csv(self.my_data.dataset_path)
        df = self.my_data.add_timestamp(df)
        initial_date = '2021-01-01'
        final_date = '2021-01-02'
        dataset_dict = self.my_data.dataset_dict
        # Out-of-range interval must warn rather than silently return.
        self.assertWarns(
            UserWarning, lambda: ProcessData.filter_by_date(
                df, dataset_dict, initial_date, final_date))

    def test_filter_by_date_case3(self):
        #case 3: date on interval, nuse sample
        # Fixed: removed unused local `dataset` dict.
        self.my_data.dataset_name = 'NUSE'
        self.my_data.dataset_dict = self.my_data.set_dictionary()
        head_path = '/Users/anamaria/Desktop/dev/security_project/datasets/'
        file = 'verify_enrich_nuse_29112019.csv'
        df = pd.read_csv(head_path + file)
        df = self.my_data.add_timestamp(df)
        initial_date = '2018-01-01'
        final_date = '2018-01-01'
        dataset_dict = self.my_data.dataset_dict
        df_filtered = ProcessData.filter_by_date(df, dataset_dict,
                                                 initial_date, final_date)
        df_expected = df.loc[df['FECHA'] == "2018-01-01"]
        self.assertEqual(len(df_filtered), len(df_expected))

    def test_filter_by_date_case4(self):
        #case 4: date on interval, nuse full data
        # Fixed: removed unused local `dataset` dict.
        # NOTE(review): this test is currently identical to case3 (same
        # CSV, same dates) — presumably one of them was meant to read a
        # different "full" file; confirm intended fixture.
        self.my_data.dataset_name = 'NUSE'
        self.my_data.dataset_dict = self.my_data.set_dictionary()
        head_path = '/Users/anamaria/Desktop/dev/security_project/datasets/'
        file = 'verify_enrich_nuse_29112019.csv'
        df = pd.read_csv(head_path + file)
        df = self.my_data.add_timestamp(df)
        initial_date = '2018-01-01'
        final_date = '2018-01-01'
        dataset_dict = self.my_data.dataset_dict
        df_filtered = ProcessData.filter_by_date(df, dataset_dict,
                                                 initial_date, final_date)
        df_expected = df.loc[df['FECHA'] == "2018-01-01"]
        self.assertEqual(len(df_filtered), len(df_expected))

    def test_filter_by_date_case5(self):
        #case 5: date on interval, rnmc
        # Fixed: removed unused local `dataset` dict.
        self.my_data.dataset_name = 'RNMC'
        self.my_data.dataset_dict = self.my_data.set_dictionary()
        head_path = '/Users/anamaria/Desktop/dev/security_project/datasets/'
        file = '06. verify_enrich_rnmc_12022020.csv'
        df = pd.read_csv(head_path + file)
        df = self.my_data.add_timestamp(df)
        initial_date = '2018-01-01'
        final_date = '2018-01-01'
        dataset_dict = self.my_data.dataset_dict
        df_filtered = ProcessData.filter_by_date(df, dataset_dict,
                                                 initial_date, final_date)
        df_expected = df.loc[df['FECHA'] == "2018-01-01"]
        self.assertEqual(len(df_filtered), len(df_expected))

    def test_filter_by_field_case1(self):
        #case 1: filter successful
        df = pd.read_csv(self.my_data.dataset_path)
        df_filtered = ProcessData.filter_by_field(df, 'LOCALIDAD', 'BOSA')
        self.assertEqual(df_filtered.LOCALIDAD.unique()[0], 'BOSA')

    def test_filter_by_field_case2(self):
        #case 2: filter successful, without field value
        # Empty field/value means "no filtering": output equals input.
        df = pd.read_csv(self.my_data.dataset_path)
        df_filtered = ProcessData.filter_by_field(df, '', '')
        assertion_proxy = df_filtered.equals(df)
        self.assertEqual(assertion_proxy, True)

    def test_filter_by_field_case3(self):
        #case 3: error, field doesn't exist
        df = pd.read_csv(self.my_data.dataset_path)
        self.assertRaises(
            ValueError,
            lambda: ProcessData.filter_by_field(df, 'nombre', 'Pedro'))

    def test_filter_by_field_case4(self):
        #case 4: error, value doesn't exist
        df = pd.read_csv(self.my_data.dataset_path)
        self.assertRaises(
            ValueError,
            lambda: ProcessData.filter_by_field(df, 'LOCALIDAD', 'NORMANDIA'))

    def test_normalize_matrix(self):
        """normalize_matrix divides every entry by the matrix maximum."""
        matrix = np.arange(1, 5)
        matrix = matrix.reshape(2, 2)
        matrix_expected = np.array([[0.25, 0.5], [0.75, 1]])
        matrix_normalized = ProcessData.normalize_matrix(matrix)
        self.assertTrue((matrix_normalized == matrix_expected).all())