Exemplo n.º 1
0
    def test_make_counting_grid(self):
        """Test the counting grid for a base "well-known" scenario.

        Loads a pickled prediction to obtain the reference grid, builds the
        timed points from the SIEDCO CSV and checks the ``xoffset``/``yoffset``,
        ``xsize``/``ysize``, matrix shape and min/max of the counting grid
        returned by ``prediction_metrics.make_counting_grid``.
        """
        # Get grid prediction, to use size and region params.
        # The context manager guarantees the handle is closed even when
        # unpickling fails.
        # NOTE: pickle.load must only ever be used on trusted files; this
        # path points at a local fixture.
        with open(
                '/Users/anamaria/Desktop/dev/security_project/aggressive_behavior_model/pkl/experiment_seppexp_10_2_siedco_prediction.pkl',
                'rb') as infile:
            loaded_siedco = pickle.load(infile)
        grid = loaded_siedco['prediction'].values[0]

        # Select points to represent on counting matrix.
        csv_path = "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
        df = pd.read_csv(csv_path)
        data = ProcessData("SIEDCO", csv_path)
        df_input = data.add_timestamp(df)
        timed_pts, _ = ProcessData.get_time_space_points(
            df_input, data.dataset_dict)

        counting_matrix = prediction_metrics.make_counting_grid(
            grid, timed_pts)
        self.assertEqual(counting_matrix.xoffset, 958645.8182116301)
        self.assertEqual(counting_matrix.yoffset, 904338.0678953262)
        self.assertEqual(counting_matrix.xsize, 150)
        self.assertEqual(counting_matrix.ysize, 150)
        self.assertEqual(counting_matrix._matrix.shape, (816, 343))
        self.assertEqual(counting_matrix._matrix.max(), 357)
        self.assertEqual(counting_matrix._matrix.min(), 0)
Exemplo n.º 2
0
    def test_hit_rate_1(self):
        """Hit rate must be 1 when every real event falls on a hotspot.

        The prediction is built from the same single-day points that are
        later used for evaluation.
        """
        csv_path = "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
        raw_frame = pd.read_csv(csv_path)
        processor = ProcessData("SIEDCO", csv_path)
        stamped_frame = processor.add_timestamp(raw_frame)

        # Restrict the data to a single day (same initial and final date).
        day = '2018-01-01'
        day_frame = ProcessData.filter_by_date(
            stamped_frame, processor.dataset_dict, day, day)
        points, area = ProcessData.get_time_space_points(
            day_frame, processor.dataset_dict)

        # Build the prediction from the very points it is evaluated against.
        kernel = open_cp.naive.CountingGridKernel(grid_width=150, region=area)
        kernel.data = points
        prediction = kernel.predict()

        coverages = [2, 4, 6, 8, 10]
        expected_default = {level: 1.0 for level in (2, 4, 6, 8, 10)}
        expected_ground_truth = {0.46187915216703573: 1.0}

        default_rates = prediction_metrics.measure_hit_rates(
            prediction, points, coverages, 'default')
        ground_truth_rates = prediction_metrics.measure_hit_rates(
            prediction, points, coverages, 'ground_truth_coverage')

        self.assertEqual(default_rates, expected_default)
        self.assertEqual(ground_truth_rates, expected_ground_truth)
Exemplo n.º 3
0
    def test_mse_1(self):
        """MSE must be 0 when prediction and ground truth matrices are equal.

        The prediction is produced by a counting kernel fed with the same
        points that are passed to ``prediction_metrics.mse``.
        """
        csv_path = "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
        raw_frame = pd.read_csv(csv_path)
        processor = ProcessData("SIEDCO", csv_path)
        stamped_frame = processor.add_timestamp(raw_frame)
        points, area = ProcessData.get_time_space_points(
            stamped_frame, processor.dataset_dict)

        kernel = open_cp.naive.CountingGridKernel(grid_width=150, region=area)
        kernel.data = points
        prediction = kernel.predict()

        self.assertEqual(prediction_metrics.mse(prediction, points), 0)
Exemplo n.º 4
0
    def test_hit_rate_2(self):
        """Hit rate must be 0 when no event falls on a hotspot.

        The prediction is built from the 'SUBA' rows of one day and is
        evaluated against the 'BOSA' rows of the same day.
        """
        csv_path = "/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_09062020.csv"
        raw_frame = pd.read_csv(csv_path)
        processor = ProcessData("SIEDCO", csv_path)
        stamped_frame = processor.add_timestamp(raw_frame)

        # Restrict the data to a single day, then split it by locality.
        day = '2018-01-01'
        day_frame = ProcessData.filter_by_date(
            stamped_frame, processor.dataset_dict, day, day)
        training_frame = ProcessData.filter_by_field(
            day_frame, 'LOCALIDAD', 'SUBA')
        evaluation_frame = ProcessData.filter_by_field(
            day_frame, 'LOCALIDAD', 'BOSA')

        # Prediction comes from the first locality only.
        training_pts, area = ProcessData.get_time_space_points(
            training_frame, processor.dataset_dict)
        kernel = open_cp.naive.CountingGridKernel(grid_width=150, region=area)
        kernel.data = training_pts
        prediction = kernel.predict()

        coverages = [2, 4, 6, 8, 10]
        expected_default = {level: 0.0 for level in (2, 4, 6, 8, 10)}
        expected_ground_truth = {0.6632653061224489: 0.0}

        # Evaluation points come from the second locality.
        evaluation_pts, _ = ProcessData.get_time_space_points(
            evaluation_frame, processor.dataset_dict)
        default_rates = prediction_metrics.measure_hit_rates(
            prediction, evaluation_pts, coverages, 'default')
        ground_truth_rates = prediction_metrics.measure_hit_rates(
            prediction, evaluation_pts, coverages, 'ground_truth_coverage')

        self.assertEqual(default_rates, expected_default)
        self.assertEqual(ground_truth_rates, expected_ground_truth)
class TestCase(unittest.TestCase):
    """Unit tests for the ``ProcessData`` filtering and matrix helpers.

    The tests read CSV files from a hard-coded local directory, so they can
    only run on a machine where those files are present.
    """

    # Directory that holds every CSV dataset read by these tests.
    DATASETS_DIR = '/Users/anamaria/Desktop/dev/security_project/datasets/'

    def setUp(self):
        """Create the SIEDCO ``ProcessData`` instance used by each test."""
        dataset_path = self.DATASETS_DIR + 'deduplicate_siedco_10032020.csv'
        self.my_data = ProcessData('SIEDCO', dataset_path)

    def _switch_dataset(self, dataset_name):
        """Point ``self.my_data`` at another dataset and refresh its dict."""
        self.my_data.dataset_name = dataset_name
        self.my_data.dataset_dict = self.my_data.set_dictionary()

    def _assert_single_day_filter(self, df, date_column):
        """Check ``filter_by_date`` against a direct selection on one day.

        Adds the timestamp to ``df``, filters it to 2018-01-01 with
        ``ProcessData.filter_by_date`` and asserts the result has as many
        rows as selecting ``date_column == "2018-01-01"`` directly.
        """
        day = '2018-01-01'
        df = self.my_data.add_timestamp(df)
        df_filtered = ProcessData.filter_by_date(
            df, self.my_data.dataset_dict, day, day)
        df_expected = df.loc[df[date_column] == day]
        self.assertEqual(len(df_filtered), len(df_expected))

    def test_set_up(self):
        """The fixture exposes the name, path and dictionary it was given."""
        self.assertEqual(self.my_data.dataset_name, 'SIEDCO')
        self.assertEqual(
            self.my_data.dataset_path,
            '/Users/anamaria/Desktop/dev/security_project/datasets/deduplicate_siedco_10032020.csv'
        )
        self.assertEqual(self.my_data.dataset_dict, siedco_dict)

    def test_filter_by_date_case1(self):
        """Case 1: date on interval, siedco."""
        df = pd.read_csv(self.my_data.dataset_path)
        self._assert_single_day_filter(df, 'FECHA_HECHO')

    def test_filter_by_date_case2(self):
        """Case 2: initial date out of available data, siedco."""
        df = pd.read_csv(self.my_data.dataset_path)
        df = self.my_data.add_timestamp(df)
        with self.assertWarns(UserWarning):
            ProcessData.filter_by_date(df, self.my_data.dataset_dict,
                                       '2021-01-01', '2021-01-02')

    def test_filter_by_date_case3(self):
        """Case 3: date on interval, nuse sample."""
        self._switch_dataset('NUSE')
        df = pd.read_csv(self.DATASETS_DIR + 'verify_enrich_nuse_29112019.csv')
        self._assert_single_day_filter(df, 'FECHA')

    def test_filter_by_date_case4(self):
        """Case 4: date on interval, nuse full data."""
        # NOTE(review): this reads the same file as case 3, so both tests
        # currently exercise identical data -- confirm which file holds the
        # "full" NUSE dataset.
        self._switch_dataset('NUSE')
        df = pd.read_csv(self.DATASETS_DIR + 'verify_enrich_nuse_29112019.csv')
        self._assert_single_day_filter(df, 'FECHA')

    def test_filter_by_date_case5(self):
        """Case 5: date on interval, rnmc."""
        self._switch_dataset('RNMC')
        df = pd.read_csv(
            self.DATASETS_DIR + '06. verify_enrich_rnmc_12022020.csv')
        self._assert_single_day_filter(df, 'FECHA')

    def test_filter_by_field_case1(self):
        """Case 1: filter successful."""
        df = pd.read_csv(self.my_data.dataset_path)
        df_filtered = ProcessData.filter_by_field(df, 'LOCALIDAD', 'BOSA')
        # Compare every distinct value, not just the first one, so a stray
        # row from another locality makes the test fail.
        self.assertEqual(list(df_filtered.LOCALIDAD.unique()), ['BOSA'])

    def test_filter_by_field_case2(self):
        """Case 2: filter successful, without field value."""
        df = pd.read_csv(self.my_data.dataset_path)
        df_filtered = ProcessData.filter_by_field(df, '', '')
        self.assertTrue(df_filtered.equals(df))

    def test_filter_by_field_case3(self):
        """Case 3: error, field doesn't exist."""
        df = pd.read_csv(self.my_data.dataset_path)
        with self.assertRaises(ValueError):
            ProcessData.filter_by_field(df, 'nombre', 'Pedro')

    def test_filter_by_field_case4(self):
        """Case 4: error, value doesn't exist."""
        df = pd.read_csv(self.my_data.dataset_path)
        with self.assertRaises(ValueError):
            ProcessData.filter_by_field(df, 'LOCALIDAD', 'NORMANDIA')

    def test_normalize_matrix(self):
        """A 2x2 matrix holding 1..4 normalizes to 0.25, 0.5, 0.75, 1."""
        matrix = np.arange(1, 5).reshape(2, 2)
        matrix_expected = np.array([[0.25, 0.5], [0.75, 1]])
        matrix_normalized = ProcessData.normalize_matrix(matrix)
        # array_equal also compares shapes, so a wrongly shaped result
        # cannot pass through broadcasting.
        self.assertTrue(np.array_equal(matrix_normalized, matrix_expected))