Example #1
 def test_get_best_entropic_attribute_empty_candidates_and_dataset(self):
     empty_dataset = DummyEmptyDataset()
     result = _get_best_conditional_entropic_attribute(
         empty_dataset, current_attributes=AttributeSet({ATTRIBUTES[0],
                                                         ATTRIBUTES[1]}),
         candidate_attributes=AttributeSet())
     self.assertIsNone(result)
Example #2
 def test_candidate_attributes_property(self):
     self.check_candidate_attributes_property(self._dummy_fp_dataset,
                                              AttributeSet(ATTRIBUTES))
     self.check_candidate_attributes_property(self._empty_dataset,
                                              AttributeSet())
     self.check_candidate_attributes_property(self._clean_dataset,
                                              AttributeSet(ATTRIBUTES))
Example #3
 def setUp(self):
     self._user_agent = ATTRIBUTES[0]
     self._timezone = ATTRIBUTES[1]
     self._do_not_track = ATTRIBUTES[2]
     self._empty_attr_set = AttributeSet()
     self._single_attribute_set = AttributeSet({self._timezone})
     self._attribute_set = AttributeSet(
         {self._user_agent, self._timezone, self._do_not_track})
Example #4
 def test_best_conditional_entropic_attribute_empty_parameters(self):
     self._dataset = DummyEmptyDataset()
     self._df_w_one_fp_per_browser = (
         self._dataset.get_df_w_one_fp_per_browser())
     best_cond_ent_attr = _best_conditional_entropic_attribute(
         self._df_w_one_fp_per_browser,
         current_attributes=AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1]}),
         candidate_attributes=AttributeSet())
     self.assertIsNone(best_cond_ent_attr[0])
Example #5
 def test_run(self):
     # Run the exploration
     self._exploration.run()
     expected_solution = AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1]})
     expected_satisfying_attribute_sets = {
         AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1]}),
         AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1], ATTRIBUTES[2]})
     }
     expected_explored_attribute_sets = (
         DummyExploration.EXPLORED_ATTRIBUTE_SETS)
     self.check_run(expected_solution, expected_satisfying_attribute_sets,
                    expected_explored_attribute_sets)
Example #6
 def test_run_asynchronous(self):
     # Run the exploration
     process = self._exploration.run_asynchronous()
     process.join()  # Wait for the process to end
     expected_solution = AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1]})
     expected_satisfying_attribute_sets = {
         AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1]}),
         AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1], ATTRIBUTES[2]})
     }
     expected_explored_attribute_sets = (
         DummyExploration.EXPLORED_ATTRIBUTE_SETS)
     self.check_run(expected_solution, expected_satisfying_attribute_sets,
                    expected_explored_attribute_sets)
Example #7
    def _search_for_solution(self):
        # Set the final solution found
        self._update_solution(AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1]}))

        # Update the set of the attribute sets that satisfy the sensitivity
        self._add_satisfying_attribute_set(
            AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1]}))
        self._add_satisfying_attribute_set(AttributeSet(ATTRIBUTES))

        # Update the list of the explored attributes. We ignore the first
        # attribute set (composed of all the attributes), which is
        # automatically added when checking that the sensitivity threshold is
        # reachable.
        for explored_attribute_set in self.EXPLORED_ATTRIBUTE_SETS[1:]:
            self._add_explored_attribute_set(explored_attribute_set)
Example #8
 def test_best_conditional_entropic_attribute(self):
     # This will just take the attribute that has unique values, which is
     # sufficient on its own to distinguish every browser
     best_cond_ent_attr = _best_conditional_entropic_attribute(
         self._df_w_one_fp_per_browser, current_attributes=AttributeSet(),
         candidate_attributes=self._attribute_set)
     self.assertEqual(best_cond_ent_attr[0], ATTRIBUTES[1])
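
For intuition, here is a self-contained toy computation (plain pandas and
math, not the library's helper) showing why the attribute with unique values
wins when the current attribute set is empty: its entropy is maximal.

from math import log2

import pandas as pd

# Toy illustration: the attribute with unique values has maximal entropy,
# so it also maximizes the conditional entropy given an empty attribute set
df = pd.DataFrame({'constant': ['a', 'a', 'a', 'a'],
                   'colliding': ['x', 'x', 'y', 'y'],
                   'unique': ['p', 'q', 'r', 's']})
for column in df.columns:
    probabilities = df[column].value_counts(normalize=True)
    entropy = sum(-p * log2(p) for p in probabilities)
    print(column, entropy)  # constant: 0.0, colliding: 1.0, unique: 2.0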
Example #9
 def test_evaluate_sequential_only(self):
     attribute_set = AttributeSet(ATTRIBUTES)
     cost, cost_explanation = (
         self._memory_instability_time_measure.evaluate(attribute_set))
     expected_memory_cost = sum(SIZES.values())
     expected_weighted_memory_cost = (
         expected_memory_cost * WEIGHTS[CostDimension.MEMORY])
     expected_instability_cost = sum(INSTABILITIES.values())
     expected_weighted_instability_cost = (
         expected_instability_cost * WEIGHTS[CostDimension.INSTABILITY])
     # Sequential test: there are only sequential attributes.
     expected_time_cost = sum(
         avg_col_time for avg_col_time, _ in COLLECTION_TIMES.values())
     expected_weighted_time_cost = (
         expected_time_cost * WEIGHTS[CostDimension.TIME])
     expected_cost = sum((expected_weighted_memory_cost,
                          expected_weighted_instability_cost,
                          expected_weighted_time_cost))
     expected_cost_explanation = {
         CostDimension.MEMORY: expected_memory_cost,
         f'weighted_{CostDimension.MEMORY}': expected_weighted_memory_cost,
         CostDimension.INSTABILITY: expected_instability_cost,
         f'weighted_{CostDimension.INSTABILITY}': (
             expected_weighted_instability_cost),
         CostDimension.TIME: expected_time_cost,
         f'weighted_{CostDimension.TIME}': expected_weighted_time_cost
     }
     self.assertEqual(cost, expected_cost)
     self.assertDictEqual(cost_explanation, expected_cost_explanation)
Example #10
 def test_empty_attributes(self):
     self._attributes = AttributeSet()
     grouped_by_browser = self._get_grouped_by_browser()
     attributes_instability = _compute_attributes_instability(
         grouped_by_browser, self._attributes)
     expected_result = {}
     self.assertDictEqual(expected_result, attributes_instability)
Example #11
 def test_empty_dataset_and_empty_attribute_set(self):
     self._dataset = DummyEmptyDataset()
     self._df_one_fp_per_browser = (
         self._dataset.get_df_w_one_fp_per_browser())
     self._attribute_set = AttributeSet()
     with self.assertRaises(ValueError):
         self.check_entropy_result(WONT_COMPUTE)
Example #12
 def test_two_attributes(self):
     self._attribute_set = AttributeSet([ATTRIBUTES[0], ATTRIBUTES[1]])
     first_attribute_values = set(self._dataset.DATAS[ATTRIBUTES[0].name])
     second_attribute_values = set(self._dataset.DATAS[ATTRIBUTES[1].name])
     possible_values = set(
         product(first_attribute_values, second_attribute_values))
     self.check_sample_result(possible_values)
Example #13
    def _set_candidate_attributes(self):
        """Set the candidate attributes.

        The default behavior is to generate the candidate attributes from the
        columns of the DataFrame, ignoring the browser_id and time_of_collect
        fields.
        """
        self._candidate_attributes = AttributeSet(self.attributes)
Example #14
 def test_get_best_entropic_attribute_empty_dataset(self):
     empty_dataset = DummyEmptyDataset()
     with self.assertRaises(ValueError):
         _get_best_conditional_entropic_attribute(
             empty_dataset,
             current_attributes=AttributeSet({ATTRIBUTES[0],
                                              ATTRIBUTES[1]}),
             candidate_attributes=self._attribute_set)
Example #15
 def test_best_conditional_entropic_attribute_unexistent_attribute(self):
     self._attribute_set.add(UNEXISTENT_ATTRIBUTE)
     with self.assertRaises(KeyError):
         _best_conditional_entropic_attribute(
             self._df_w_one_fp_per_browser,
             current_attributes=AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1]}),
             candidate_attributes=self._attribute_set)
Example #16
 def test_get_best_entropic_attribute_unexistent_attribute(self):
     self._attribute_set.add(UNEXISTENT_ATTRIBUTE)
     with self.assertRaises(KeyError):
         _get_best_conditional_entropic_attribute(
             self._dataset,
             current_attributes=AttributeSet({ATTRIBUTES[0],
                                              ATTRIBUTES[1]}),
             candidate_attributes=self._attribute_set)
Example #17
 def test_add_new_attribute(self):
     new_attr_set = AttributeSet({self._user_agent})
     self.assertEqual(1, len(new_attr_set))
     new_attribute = ATTRIBUTES[2]
     new_attr_set.add(new_attribute)
     self.assertEqual(2, len(new_attr_set))
     self.assertIn(self._user_agent, new_attr_set)
     self.assertIn(new_attribute, new_attr_set)
Example #18
 def test_empty_dataset_and_attributes(self):
     self._dataset = DummyEmptyDataset()
     self._dataframe = self._dataset.dataframe
     self._attributes = AttributeSet()
     attributes_avg_size = _compute_attribute_avg_size(
         self._dataset, self._attributes)
     expected_result = self._get_expected_result()
     self.assertDictEqual(expected_result, attributes_avg_size)
Example #19
def _expand_attribute_sets(attr_sets_to_expand: List[AttributeSet],
                           candidate_attributes: AttributeSet,
                           satisfying_attribute_sets: Set[AttributeSet],
                           attribute_sets_ignored_supersets: Set[AttributeSet],
                           use_pruning_methods: bool) -> Set[AttributeSet]:
    """Expand a subset of the attribute sets to expand.

    Args:
        attr_sets_to_expand: The attribute sets to expand.
        candidate_attributes: The complete set of the candidate attributes.
        satisfying_attribute_sets: The attribute sets that satisfy the
                                   sensitivity threshold.
        attribute_sets_ignored_supersets: The attribute sets whose supersets
                                          are ignored.
        use_pruning_methods: Whether we use the pruning methods or not.

    Returns:
        The set of the next attribute sets to explore.
    """
    next_attr_sets_to_explore = set()

    # Generate the attr. sets composed of S_i with one more attr.
    # For all S_i in S
    for set_to_expand in attr_sets_to_expand:
        # For all a in A \ S_i
        for attribute in candidate_attributes:
            if attribute in set_to_expand:
                continue

            # The attr. set C with one more attribute (S_i union {a})
            new_attr_set = AttributeSet(set_to_expand)
            new_attr_set.add(attribute)
            add_new_attr_set = True

            # Ignore C if it is a superset of an attr. set of T
            for attr_set_sat in satisfying_attribute_sets:
                if new_attr_set.issuperset(attr_set_sat):
                    add_new_attr_set = False
                    break

            # Ignore C if we use the pruning methods and C is a superset of
            # an attr. set whose supersets are to be ignored
            if use_pruning_methods and add_new_attr_set:
                for attr_set_to_ign in attribute_sets_ignored_supersets:
                    if new_attr_set.issuperset(attr_set_to_ign):
                        add_new_attr_set = False
                        break

            # If C is fine, it is added to the attr. sets to explore
            if add_new_attr_set:
                next_attr_sets_to_explore.add(new_attr_set)

    return next_attr_sets_to_explore
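
To make the pruning rules concrete, here is a self-contained toy run of the
same lattice-expansion idea, with plain frozensets standing in for
AttributeSet (an illustrative sketch, not the library's code):

candidates = frozenset({'ua', 'tz', 'dnt'})
attr_sets_to_expand = [frozenset({'ua'})]
satisfying = {frozenset({'ua', 'tz'})}  # supersets of these are pruned

next_to_explore = set()
for set_to_expand in attr_sets_to_expand:
    for attribute in candidates - set_to_expand:
        new_attr_set = set_to_expand | {attribute}
        # Same rule as above: ignore supersets of satisfying attribute sets
        if any(new_attr_set >= sat for sat in satisfying):
            continue
        next_to_explore.add(new_attr_set)

print(next_to_explore)  # {frozenset({'ua', 'dnt'})}: {'ua', 'tz'} was pruned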
Example #20
 def test_all_the_attributes(self):
     self._attribute_set = AttributeSet(ATTRIBUTES)
     first_attribute_values = set(self._dataset.DATAS[ATTRIBUTES[0].name])
     second_attribute_values = set(self._dataset.DATAS[ATTRIBUTES[1].name])
     third_attribute_values = set(self._dataset.DATAS[ATTRIBUTES[2].name])
     possible_values = set(
         product(first_attribute_values, second_attribute_values,
                 third_attribute_values))
     self.check_sample_result(possible_values)
Example #21
 def test_remove(self):
     new_attr_set = AttributeSet({self._user_agent, self._timezone})
     self.assertEqual(2, len(new_attr_set))
     self.assertIn(self._user_agent, new_attr_set)
     self.assertIn(self._timezone, new_attr_set)
     new_attr_set.remove(self._user_agent)
     self.assertEqual(1, len(new_attr_set))
     self.assertIn(self._timezone, new_attr_set)
     self.assertNotIn(self._user_agent, new_attr_set)
Example #22
 def test_top_42_fingerprints(self):
     self._most_common_fps = 42
     top_k_fps_per_attribute = {
         ATTRIBUTES[0]: 3 / 3,
         ATTRIBUTES[1]: 3 / 3,
         ATTRIBUTES[2]: 3 / 3
     }
     for attribute in self._candidate_attributes:
         self._attribute_set = AttributeSet([attribute])
         self.check_top_k_fingerprints(top_k_fps_per_attribute[attribute])
Example #23
    def test_run(self):
        # Run the exploration
        self._exploration.run()

        # Load the comparison file as a json dictionary
        tests_module_path = PurePath(path.abspath(__file__)).parents[1]
        comparison_trace_path = tests_module_path.joinpath(
            self._expected_trace_path)
        with open(comparison_trace_path, 'r') as comparison_file:
            comparison_dict = json.load(comparison_file)
        expected_explored_attribute_sets = comparison_dict[
            TraceData.EXPLORATION]

        expected_solution = AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1]})
        expected_satisfying_attribute_sets = {
            AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1]}),
            AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1], ATTRIBUTES[2]})}
        self.check_run(expected_solution, expected_satisfying_attribute_sets,
                       expected_explored_attribute_sets)
Example #24
    def _set_candidate_attributes(self):
        """Set the candidate attributes.

        This implementation generates the candidate attributes from the columns
        of the DataFrame, ignoring the browser_id and time_of_collect fields.
        """
        self._candidate_attributes = AttributeSet()
        for column_id, column in enumerate(self._dataframe.columns, 1):
            attribute = Attribute(column_id, column)
            self._candidate_attributes.add(attribute)
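
Note that this only skips browser_id and time_of_collect if they are index
levels of the DataFrame rather than columns, which the following minimal
pandas sketch assumes (with a namedtuple standing in for the library's
Attribute class):

from collections import namedtuple

import pandas as pd

# Illustrative stand-in for the library's Attribute class
Attribute = namedtuple('Attribute', ['attribute_id', 'name'])

fingerprints = pd.DataFrame({
    'browser_id': [1, 1, 2],
    'time_of_collect': [0, 1, 0],
    'user_agent': ['ua-1', 'ua-1', 'ua-2'],
    'timezone': ['tz-1', 'tz-1', 'tz-2'],
}).set_index(['browser_id', 'time_of_collect'])

# The index levels are not listed among the columns, so enumerating the
# columns yields only the fingerprint attributes
candidate_attributes = {Attribute(column_id, column)
                        for column_id, column
                        in enumerate(fingerprints.columns, 1)}
print(candidate_attributes)  # user_agent and timezone (set order may vary)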
Example #25
 def test_best_conditional_entropic_attribute_empty_dataset(self):
     self._dataset = DummyEmptyDataset()
     self._df_w_one_fp_per_browser = (
         self._dataset.get_df_w_one_fp_per_browser())
     with self.assertRaises(ValueError):
         _best_conditional_entropic_attribute(
             self._df_w_one_fp_per_browser,
             current_attributes=AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1]}),
             candidate_attributes=self._attribute_set)
Example #26
 def test_remove_attribute_not_present(self):
     new_attr_set = AttributeSet({self._user_agent, self._timezone})
     self.assertEqual(2, len(new_attr_set))
     self.assertIn(self._user_agent, new_attr_set)
     self.assertIn(self._timezone, new_attr_set)
     non_present_attribute = Attribute(42, 'unknown')
     with self.assertRaises(KeyError):
         new_attr_set.remove(non_present_attribute)
     self.assertEqual(2, len(new_attr_set))
     self.assertIn(self._user_agent, new_attr_set)
     self.assertIn(self._timezone, new_attr_set)
Example #27
    def _execute_using_multiprocessing(self):
        """Measure the average fingerprint size using multiprocessing."""
        # The list of pairs of (Attribute, attribute average size)
        attributes_avg_size = SortedDict()

        # Infer the number of cores to use
        free_cores = params.getint('Multiprocessing', 'free_cores')
        nb_cores = max(cpu_count() - free_cores, 1)
        nb_attributes = len(self._dataset.candidate_attributes)
        attributes_per_core = int(ceil(nb_attributes / nb_cores))
        logger.debug(f'Sharing {nb_attributes} attributes over '
                     f'{nb_cores}(+{free_cores}) cores, hence '
                     f'{attributes_per_core} attributes per core.')

        def update_attributes_average_size(attrs_size: Dict[Attribute, float]):
            """Update the complete dictionary attributes_avg_size.

            Args:
                attrs_size: The dictionary containing the subset of the results
                            computed by a process.

            Note: This is executed by the main thread and does not pose any
                  concurrency or synchronization problem.
            """
            for attribute, attribute_average_size in attrs_size.items():
                attributes_avg_size[attribute] = attribute_average_size

        # Spawn a number of processes equal to the number of cores
        attributes_list = list(self._dataset.candidate_attributes)
        async_results = []
        with Pool(processes=nb_cores) as pool:
            for process_id in range(nb_cores):
                # Generate the candidate attributes for this process
                start_id = process_id * attributes_per_core
                end_id = (process_id + 1) * attributes_per_core
                attributes_subset = AttributeSet(
                    attributes_list[start_id:end_id])

                async_result = pool.apply_async(
                    _compute_attribute_avg_size,
                    args=(self._dataset.dataframe, attributes_subset),
                    callback=update_attributes_average_size)
                async_results.append(async_result)

            # Wait for all the processes to finish (otherwise we would exit
            # before collecting their result)
            for async_result in async_results:
                async_result.wait()

        self._result = attributes_avg_size
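
The attribute-sharing arithmetic above is simple but easy to get wrong at the
boundaries; a standalone sketch of the same slicing (assuming, say, 42
attributes over 8 cores):

from math import ceil

nb_cores, nb_attributes = 8, 42
attributes_per_core = int(ceil(nb_attributes / nb_cores))  # 6

attributes = list(range(nb_attributes))
chunks = [attributes[process_id * attributes_per_core:
                     (process_id + 1) * attributes_per_core]
          for process_id in range(nb_cores)]

# Every attribute lands in exactly one chunk; the last chunks may be smaller
# or even empty when nb_attributes is not a multiple of nb_cores
assert sum(len(chunk) for chunk in chunks) == nb_attributes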
Example #28
    def test_get_best_entropic_attribute(self):
        # The order is 1 (unique values), then 0 (some collisions), then
        # 2 (the same value for each browser)
        first_best = _get_best_conditional_entropic_attribute(
            self._dataset, current_attributes=AttributeSet(),
            candidate_attributes=self._attribute_set)
        self.assertEqual(first_best, ATTRIBUTES[1])

        second_best = _get_best_conditional_entropic_attribute(
            self._dataset, current_attributes=AttributeSet({ATTRIBUTES[1]}),
            candidate_attributes=self._attribute_set)
        self.assertEqual(second_best, ATTRIBUTES[0])

        third_best = _get_best_conditional_entropic_attribute(
            self._dataset, current_attributes=AttributeSet({ATTRIBUTES[1],
                                                            ATTRIBUTES[0]}),
            candidate_attributes=self._attribute_set)
        self.assertEqual(third_best, ATTRIBUTES[2])

        no_more_available = _get_best_conditional_entropic_attribute(
            self._dataset, current_attributes=AttributeSet({
                ATTRIBUTES[1], ATTRIBUTES[0], ATTRIBUTES[2]}),
            candidate_attributes=self._attribute_set)
        self.assertIsNone(no_more_available)
Example #29
    def get_solution(self) -> AttributeSet:
        """Provide the solution found by the algorithm after the exploration.

        Returns:
            The AttributeSet that satisfies the sensitivity threshold at the
            lowest cost according to the exploration method (no optimality
            guaranteed).

        Raises:
            ExplorationNotRun: The exploration was not run.
            SensitivityThresholdUnreachable: In an asynchronous run, when the
                                             sensitivity threshold is
                                             unreachable.
        """
        self._check_exploration_state()
        # Create a new AttributeSet to exit the shared memory space
        return AttributeSet(self._solution[0])
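
The copy matters because, in an asynchronous run, the solution lives in a
shared structure populated by another process. The sketch below shows the
same detach-by-copy idea; the use of multiprocessing.Manager is an assumption
about the sharing mechanism, made for illustration only:

from multiprocessing import Manager

if __name__ == '__main__':
    # Illustrative: a manager-backed list hands out proxy-managed objects;
    # building a fresh local object detaches the result from the shared space
    with Manager() as manager:
        shared_solution = manager.list()
        shared_solution.append({'user_agent', 'timezone'})
        # Analogous to AttributeSet(self._solution[0]) above
        local_copy = set(shared_solution[0])
    print(local_copy)  # {'user_agent', 'timezone'}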
Example #30
    def __init__(self, sensitivity_measure: SensitivityMeasure,
                 usability_cost_measure: UsabilityCostMeasure,
                 dataset: FingerprintDataset, sensitivity_threshold: float,
                 explored_paths: int, pruning: bool):
        """Initialize the FPSelect exploration algorithm.

        Args:
            sensitivity_measure: The sensitivity measure.
            usability_cost_measure: The usability cost.
            dataset: The fingerprint dataset.
            sensitivity_threshold: The sensitivity threshold.
            explored_paths: The number of paths explored by FPSelect.
            pruning: Use the pruning methods.
        """
        # Initialize using the __init__ function of Exploration
        super().__init__(sensitivity_measure, usability_cost_measure, dataset,
                         sensitivity_threshold)

        # Check the number of explored paths
        if explored_paths < 1:
            raise AttributeError('The number of explored paths is required to '
                                 'be a positive number.')

        # Initialize the specific parameters of FPSelect
        self._explored_paths = explored_paths
        self._pruning = pruning
        logger.info(f'Initialized FPSelect with {explored_paths} paths to '
                    'explore.')
        if pruning:
            logger.info('Pruning methods are activated.')
        else:
            logger.info('Pruning methods are ignored.')

        # Initialize the minimum cost currently found
        self._solution.append(float('inf'))  # Stored in self._solution[1]

        # The set S of the attribute sets to expand at each step, initialized
        # to a single empty attribute set
        self._attribute_sets_to_expand = set({AttributeSet()})

        # The set I of the attribute sets whose supersets are to be ignored
        self._attribute_sets_ignored_supersets = set()
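
Since the constructor rejects non-positive path counts, here is a quick
runnable sketch of that contract; the class below is an illustrative
re-creation of just the validation step, not the library's FPSelect:

# Minimal sketch of the explored_paths validation contract shown above
class _SketchFPSelect:
    def __init__(self, explored_paths: int, pruning: bool):
        if explored_paths < 1:
            raise AttributeError('The number of explored paths is required '
                                 'to be a positive number.')
        self._explored_paths = explored_paths
        self._pruning = pruning

_SketchFPSelect(explored_paths=3, pruning=True)   # accepted
try:
    _SketchFPSelect(explored_paths=0, pruning=False)
except AttributeError as error:
    print(error)  # non-positive path counts are rejected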