class FingerprintDatasetFromCSVInMemory(FingerprintDataset):
    """A fingerprint dataset read from an in-memory CSV file."""

    def __init__(self, file_handle: TextIOWrapper):
        """Initialize with the file handle of the in-memory CSV file.

        Args:
            file_handle: The file handle of the in-memory CSV file.
        """
        self._file_handle = file_handle
        super().__init__()

    def _process_dataset(self):
        """Process the dataset to obtain a DataFrame from the file.

        - The resulting fingerprint dataset is stored in self._dataframe.
        - The fingerprint dataset has to be a DataFrame with the two indices
          being browser_id (int64) and time_of_collect (datetime64).
        - The columns are named after the attributes and hold the value
          collected for the browser browser_id at the time time_of_collect.
        - The name of each column should correspond to the attribute.name
          property of an attribute of the candidate attributes.

        This implementation generates a DataFrame from the CSV stored in
        memory with the two indices set.
        """
        # Read the dataset from the in-memory CSV file
        self._dataframe = pd.read_csv(self._file_handle, index_col=False)

        # Check that the required metadata fields are present
        for required_metadata in MetadataField.ALL:
            if required_metadata not in self._dataframe:
                raise MissingMetadatasFields(
                    f'The required metadata field {required_metadata} is '
                    'missing from the dataset.')

        # Parse the 'time_of_collect' column as a datetime64
        self._dataframe[MetadataField.TIME_OF_COLLECT] = pd.to_datetime(
            self._dataframe[MetadataField.TIME_OF_COLLECT])

        # Set the indices as 'browser_id' and 'time_of_collect'
        self._dataframe.set_index(
            [MetadataField.BROWSER_ID, MetadataField.TIME_OF_COLLECT],
            inplace=True)

        # Remove the file handle as it is not needed anymore and cannot be
        # pickled
        del self._file_handle

    def _set_candidate_attributes(self):
        """Set the candidate attributes.

        This implementation generates the candidate attributes from the
        columns of the DataFrame, ignoring the browser_id and
        time_of_collect fields.
        """
        self._candidate_attributes = AttributeSet()
        for column_id, column in enumerate(self._dataframe.columns, 1):
            attribute = Attribute(column_id, column)
            self._candidate_attributes.add(attribute)
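# A minimal usage sketch of the class above (the column names and values are
# illustrative, not from the library, and we assume the base class processes
# the dataset on construction): io.StringIO provides the in-memory text
# handle that pd.read_csv accepts, and the two metadata columns are required.
def _demo_in_memory_dataset():
    from io import StringIO

    csv_content = ('browser_id,time_of_collect,user_agent,timezone\n'
                   '1,2021-05-03 14:00:00,Firefox,UTC+1\n'
                   '2,2021-05-03 14:05:00,Chrome,UTC+2\n')
    dataset = FingerprintDatasetFromCSVInMemory(StringIO(csv_content))
    # The candidate attributes are derived from the non-metadata columns,
    # here user_agent (id 1) and timezone (id 2)
    print(dataset.candidate_attributes)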
class FingerprintDatasetFromFile(FingerprintDataset):
    """A fingerprint dataset read from a file."""

    def __init__(self, dataset_path: str):
        """Initialize the FingerprintDataset with the path to the dataset.

        Args:
            dataset_path: The path to the fingerprint dataset.

        Raises:
            FileNotFoundError: There is no file at the given dataset path.
        """
        if not path.isfile(dataset_path):
            raise FileNotFoundError(f'The dataset file at {dataset_path} is '
                                    'not found.')
        self._dataset_path = dataset_path
        super().__init__()

    def __repr__(self) -> str:
        """Provide a string representation of this fingerprint dataset.

        Returns:
            A string representation of this fingerprint dataset.
        """
        return f'{self.__class__.__name__}({self._dataset_path})'

    def _process_dataset(self):
        """Process the dataset to obtain a DataFrame from the file.

        - The resulting fingerprint dataset is stored in self._dataframe.
        - The fingerprint dataset has to be a DataFrame with the two indices
          being browser_id (int64) and time_of_collect (datetime64).
        - The columns are named after the attributes and hold the value
          collected for the browser browser_id at the time time_of_collect.
        - The name of each column should correspond to the attribute.name
          property of an attribute of the candidate attributes.

        Raises:
            NotImplementedError: This abstract method is not defined.
        """
        raise NotImplementedError

    def _set_candidate_attributes(self):
        """Set the candidate attributes.

        This implementation generates the candidate attributes from the
        columns of the DataFrame, ignoring the browser_id and
        time_of_collect fields.
        """
        self._candidate_attributes = AttributeSet()
        for column_id, column in enumerate(self._dataframe.columns, 1):
            attribute = Attribute(column_id, column)
            self._candidate_attributes.add(attribute)

    @property
    def dataset_path(self) -> str:
        """Give the path to the fingerprint dataset.

        Returns:
            The path to the fingerprint dataset.
        """
        return self._dataset_path
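# A minimal subclassing sketch: a concrete file-based dataset only has to
# implement _process_dataset(), since _set_candidate_attributes() is
# inherited. The class below is hypothetical (the library may ship its own
# CSV reader) and assumes a plain CSV file on disk holding the two metadata
# columns.
class _DemoCSVFileDataset(FingerprintDatasetFromFile):

    def _process_dataset(self):
        # Read the CSV file, parse the time of collection, and set the two
        # indices required by the FingerprintDataset interface
        self._dataframe = pd.read_csv(self._dataset_path, index_col=False)
        self._dataframe[MetadataField.TIME_OF_COLLECT] = pd.to_datetime(
            self._dataframe[MetadataField.TIME_OF_COLLECT])
        self._dataframe.set_index(
            [MetadataField.BROWSER_ID, MetadataField.TIME_OF_COLLECT],
            inplace=True)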
class TestBestConditionalEntropic(unittest.TestCase):

    def setUp(self):
        self._attribute_set = AttributeSet(ATTRIBUTES)
        self._dataset = DummyCleanDataset()
        self._df_w_one_fp_per_browser = (
            self._dataset.get_df_w_one_fp_per_browser())

    def test_best_conditional_entropic_attribute(self):
        # The attribute with unique values (ATTRIBUTES[1]) is sufficient on
        # its own, hence it is picked first
        best_cond_ent_attr = _best_conditional_entropic_attribute(
            self._df_w_one_fp_per_browser,
            current_attributes=AttributeSet(),
            candidate_attributes=self._attribute_set)
        self.assertEqual(best_cond_ent_attr[0], ATTRIBUTES[1])

    def test_best_conditional_entropic_attribute_all_taken(self):
        best_cond_ent_attr = _best_conditional_entropic_attribute(
            self._df_w_one_fp_per_browser,
            current_attributes=self._attribute_set,
            candidate_attributes=self._attribute_set)
        self.assertIsNone(best_cond_ent_attr[0])

    def test_best_conditional_entropic_attribute_empty_attribute_set(self):
        best_cond_ent_attr = _best_conditional_entropic_attribute(
            self._df_w_one_fp_per_browser,
            current_attributes=AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1]}),
            candidate_attributes=AttributeSet())
        self.assertIsNone(best_cond_ent_attr[0])

    def test_best_conditional_entropic_attribute_empty_dataset(self):
        self._dataset = DummyEmptyDataset()
        self._df_w_one_fp_per_browser = (
            self._dataset.get_df_w_one_fp_per_browser())
        with self.assertRaises(ValueError):
            _best_conditional_entropic_attribute(
                self._df_w_one_fp_per_browser,
                current_attributes=AttributeSet({ATTRIBUTES[0],
                                                 ATTRIBUTES[1]}),
                candidate_attributes=self._attribute_set)

    def test_best_conditional_entropic_attribute_empty_parameters(self):
        self._dataset = DummyEmptyDataset()
        self._df_w_one_fp_per_browser = (
            self._dataset.get_df_w_one_fp_per_browser())
        best_cond_ent_attr = _best_conditional_entropic_attribute(
            self._df_w_one_fp_per_browser,
            current_attributes=AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1]}),
            candidate_attributes=AttributeSet())
        self.assertIsNone(best_cond_ent_attr[0])

    def test_best_conditional_entropic_attribute_unexistent_attribute(self):
        self._attribute_set.add(UNEXISTENT_ATTRIBUTE)
        with self.assertRaises(KeyError):
            _best_conditional_entropic_attribute(
                self._df_w_one_fp_per_browser,
                current_attributes=AttributeSet({ATTRIBUTES[0],
                                                 ATTRIBUTES[1]}),
                candidate_attributes=self._attribute_set)
def test_add_new_attribute(self):
    new_attr_set = AttributeSet({self._user_agent})
    self.assertEqual(1, len(new_attr_set))
    new_attribute = ATTRIBUTES[2]
    new_attr_set.add(new_attribute)
    self.assertEqual(2, len(new_attr_set))
    self.assertIn(self._user_agent, new_attr_set)
    self.assertIn(new_attribute, new_attr_set)
def _expand_attribute_sets(attr_sets_to_expand: List[AttributeSet],
                           candidate_attributes: AttributeSet,
                           satisfying_attribute_sets: Set[AttributeSet],
                           attribute_sets_ignored_supersets: Set[AttributeSet],
                           use_pruning_methods: bool) -> Set[AttributeSet]:
    """Expand a subset of the attribute sets to expand.

    Args:
        attr_sets_to_expand: The attribute sets to expand.
        candidate_attributes: The complete set of the candidate attributes.
        satisfying_attribute_sets: The attribute sets that satisfy the
            sensitivity threshold.
        attribute_sets_ignored_supersets: The attribute sets for which to
            ignore their supersets.
        use_pruning_methods: Whether we use the pruning methods or not.

    Returns:
        The set of the next attribute sets to explore.
    """
    next_attr_sets_to_explore = set()

    # Generate the attr. sets composed of S_i with one more attr.
    # For all S_i in S
    for set_to_expand in attr_sets_to_expand:

        # For all a in A \ S_i
        for attribute in candidate_attributes:

            # Ignore C if the attr. a is already in the attr. set S_i
            if attribute in set_to_expand:
                continue

            # The attr. set C with one more attribute (S_i union {a})
            new_attr_set = AttributeSet(set_to_expand)
            new_attr_set.add(attribute)
            add_new_attr_set = True

            # Ignore C if it is a superset of an attr. set of T
            for attr_set_sat in satisfying_attribute_sets:
                if new_attr_set.issuperset(attr_set_sat):
                    add_new_attr_set = False
                    break

            # Ignore C if we use the pruning methods and it is a superset
            # of an attr. set whose supersets are to be ignored
            if use_pruning_methods and add_new_attr_set:
                for attr_set_to_ign in attribute_sets_ignored_supersets:
                    if new_attr_set.issuperset(attr_set_to_ign):
                        add_new_attr_set = False
                        break

            # If C is fine, it is added to the attr. sets to explore
            if add_new_attr_set:
                next_attr_sets_to_explore.add(new_attr_set)

    return next_attr_sets_to_explore
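# A tiny worked example of the expansion step above (attribute names are
# hypothetical): starting from {a1} with candidates {a1, a2, a3}, the
# one-attribute-larger supersets {a1, a2} and {a1, a3} are generated, and
# {a1, a2} is discarded because it supersets the already-satisfying set {a2}.
def _demo_expand_attribute_sets():
    a1, a2, a3 = Attribute(1, 'a1'), Attribute(2, 'a2'), Attribute(3, 'a3')
    next_sets = _expand_attribute_sets(
        attr_sets_to_expand=[AttributeSet({a1})],
        candidate_attributes=AttributeSet({a1, a2, a3}),
        satisfying_attribute_sets={AttributeSet({a2})},
        attribute_sets_ignored_supersets=set(),
        use_pruning_methods=True)
    # Expected: only AttributeSet({a1, a3}) remains to be explored
    print(next_sets)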
class TestAttributeSetEntropyFunction(unittest.TestCase):

    def setUp(self):
        self._dataset = DummyCleanDataset()
        self._df_one_fp_per_browser = (
            self._dataset.get_df_w_one_fp_per_browser())
        self._attribute_set = AttributeSet(ATTRIBUTES)

    def check_entropy_result(self, expected_entropy: float):
        computed_entropy = attribute_set_entropy(self._df_one_fp_per_browser,
                                                 self._attribute_set)
        self.assertAlmostEqual(expected_entropy, computed_entropy)

    def test_empty_dataset_and_empty_attribute_set(self):
        self._dataset = DummyEmptyDataset()
        self._df_one_fp_per_browser = (
            self._dataset.get_df_w_one_fp_per_browser())
        self._attribute_set = AttributeSet()
        with self.assertRaises(ValueError):
            self.check_entropy_result(WONT_COMPUTE)

    def test_empty_dataset(self):
        self._dataset = DummyEmptyDataset()
        self._df_one_fp_per_browser = (
            self._dataset.get_df_w_one_fp_per_browser())
        with self.assertRaises(ValueError):
            self.check_entropy_result(WONT_COMPUTE)

    def test_empty_attribute_set(self):
        self._attribute_set = AttributeSet()
        with self.assertRaises(ValueError):
            self.check_entropy_result(WONT_COMPUTE)

    def test_unexistent_attribute(self):
        self._attribute_set.add(UNEXISTENT_ATTRIBUTE)
        with self.assertRaises(KeyError):
            self.check_entropy_result(WONT_COMPUTE)

    def test_always_the_same_value(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[2]])
        self.check_entropy_result(0.0)

    def test_in_between_entropy(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[0]])
        expected_entropy = -1 * ((1/5)*log2(1/5) + (2/5)*log2(2/5)
                                 + (2/5)*log2(2/5))
        self.check_entropy_result(expected_entropy)

    def test_unique_values(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[1]])
        expected_entropy = log2(len(self._dataset.dataframe))
        self.check_entropy_result(expected_entropy)
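# For reference, a self-contained sketch of the entropy that the tests above
# expect attribute_set_entropy to compute: the Shannon entropy of the
# fingerprints projected on the given attributes, over the dataframe holding
# one fingerprint per browser. This is an illustration inferred from the
# tests, not the library's implementation.
def _demo_shannon_entropy(df_one_fp_per_browser, attribute_names):
    from math import log2

    # Share of the browsers holding each distinct projected fingerprint
    fingerprint_shares = (df_one_fp_per_browser[list(attribute_names)]
                          .value_counts(normalize=True))
    return -sum(share * log2(share) for share in fingerprint_shares)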
def _best_conditional_entropic_attribute(df_one_fp_per_browser: pd.DataFrame,
                                         current_attributes: AttributeSet,
                                         candidate_attributes: AttributeSet
                                         ) -> Tuple[Attribute, float]:
    """Get the best conditional entropic attribute among the candidates.

    Args:
        df_one_fp_per_browser: The dataframe with only one fingerprint per
            browser.
        current_attributes: The attributes that compose the current solution.
        candidate_attributes: The candidate attributes for this process to
            check.

    Returns:
        A tuple with the best attribute for this process and the total
        entropy when adding this attribute to the current attributes.
    """
    best_local_attribute, best_local_total_entropy = None, -float('inf')
    for attribute in candidate_attributes:
        # Ignore the attributes that are already in the current attribute set
        if attribute in current_attributes:
            continue

        # Generate a new attribute set with this attribute
        attribute_set = AttributeSet(current_attributes)
        attribute_set.add(attribute)

        # Evaluate the entropy of this new attribute set and keep it if it
        # is the best one found so far
        attr_set_entropy = attribute_set_entropy(df_one_fp_per_browser,
                                                 attribute_set)
        if attr_set_entropy > best_local_total_entropy:
            best_local_attribute = attribute
            best_local_total_entropy = attr_set_entropy

    return (best_local_attribute, best_local_total_entropy)
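# A minimal usage sketch of the greedy step above, assuming a dataset object
# exposing get_df_w_one_fp_per_browser() and candidate_attributes as in the
# tests of this section: starting from an empty solution, the call returns
# the single attribute whose addition maximizes the entropy, together with
# the resulting total entropy.
def _demo_best_conditional_entropic_attribute(dataset):
    best_attribute, total_entropy = _best_conditional_entropic_attribute(
        dataset.get_df_w_one_fp_per_browser(),
        current_attributes=AttributeSet(),
        candidate_attributes=dataset.candidate_attributes)
    # best_attribute is None (and total_entropy is -inf) when every
    # candidate is already part of the current attributes
    print(best_attribute, total_entropy)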
def test_add_new_attribute_already_present(self):
    new_attr_set = AttributeSet({self._user_agent, self._timezone})
    with self.assertRaises(DuplicateAttributeId):
        new_attr_set.add(self._timezone)
class TestAttributeSetEntropy(unittest.TestCase):

    def setUp(self):
        self._dataset = DummyCleanDataset()
        self._attribute_set = AttributeSet(ATTRIBUTES)
        self._csv_result_path = CSV_RESULT_PATH

    def check_entropy_result(self, expected_entropy: float):
        maximum_entropy = log2(len(self._dataset.dataframe))
        attribute_set_entropy_analysis = AttributeSetEntropy(
            self._dataset, self._attribute_set)
        attribute_set_entropy_analysis.execute()
        analysis_result = attribute_set_entropy_analysis.result
        expected_result = {
            ENTROPY_RESULT: expected_entropy,
            MAXIMUM_ENTROPY_RESULT: maximum_entropy,
            NORMALIZED_ENTROPY_RESULT: expected_entropy/maximum_entropy
        }
        for result_name, expected_value in expected_result.items():
            self.assertAlmostEqual(analysis_result[result_name],
                                   expected_value)

    def test_empty_dataset_and_empty_attribute_set(self):
        self._dataset = DummyEmptyDataset()
        self._attribute_set = AttributeSet()
        with self.assertRaises(ValueError):
            self.check_entropy_result(WONT_COMPUTE)

    def test_empty_dataset(self):
        self._dataset = DummyEmptyDataset()
        with self.assertRaises(ValueError):
            self.check_entropy_result(WONT_COMPUTE)

    def test_empty_attribute_set(self):
        self._attribute_set = AttributeSet()
        with self.assertRaises(ValueError):
            self.check_entropy_result(WONT_COMPUTE)

    def test_unexistent_attribute(self):
        self._attribute_set.add(UNEXISTENT_ATTRIBUTE)
        with self.assertRaises(KeyError):
            self.check_entropy_result(WONT_COMPUTE)

    def test_in_between_entropy(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[0]])
        expected_entropy = -1 * ((1/5)*log2(1/5) + (2/5)*log2(2/5)
                                 + (2/5)*log2(2/5))
        self.check_entropy_result(expected_entropy)

    def test_always_the_same_value(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[2]])
        self.check_entropy_result(0.0)

    def test_unique_values(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[1]])
        maximum_entropy = log2(len(self._dataset.dataframe))
        self.check_entropy_result(maximum_entropy)

    def test_save_csv_result(self):
        attribute_set_entropy_analysis = AttributeSetEntropy(
            self._dataset, self._attribute_set)
        attribute_set_entropy_analysis.execute()
        attribute_set_entropy_analysis.save_csv_result(self._csv_result_path)
        remove(self._csv_result_path)
class TestGetBestConditionalEntropicAttribute(unittest.TestCase):

    def setUp(self):
        self._attribute_set = AttributeSet(ATTRIBUTES)
        self._dataset = DummyCleanDataset()

    def test_get_best_entropic_attribute(self):
        # The order is 1 (unique values), then 0 (some collisions), then
        # 2 (the same value for each browser)
        first_best = _get_best_conditional_entropic_attribute(
            self._dataset,
            current_attributes=AttributeSet(),
            candidate_attributes=self._attribute_set)
        self.assertEqual(first_best, ATTRIBUTES[1])

        second_best = _get_best_conditional_entropic_attribute(
            self._dataset,
            current_attributes=AttributeSet({ATTRIBUTES[1]}),
            candidate_attributes=self._attribute_set)
        self.assertEqual(second_best, ATTRIBUTES[0])

        third_best = _get_best_conditional_entropic_attribute(
            self._dataset,
            current_attributes=AttributeSet({ATTRIBUTES[1], ATTRIBUTES[0]}),
            candidate_attributes=self._attribute_set)
        self.assertEqual(third_best, ATTRIBUTES[2])

        no_more_available = _get_best_conditional_entropic_attribute(
            self._dataset,
            current_attributes=AttributeSet({
                ATTRIBUTES[1], ATTRIBUTES[0], ATTRIBUTES[2]}),
            candidate_attributes=self._attribute_set)
        self.assertIsNone(no_more_available)

    def test_get_best_entropic_attribute_every_attribute_already_taken(self):
        result = _get_best_conditional_entropic_attribute(
            self._dataset,
            current_attributes=self._attribute_set,
            candidate_attributes=self._attribute_set)
        self.assertIsNone(result)

    def test_get_best_entropic_attribute_empty_attribute_set(self):
        result = _get_best_conditional_entropic_attribute(
            self._dataset,
            current_attributes=AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1]}),
            candidate_attributes=AttributeSet())
        self.assertIsNone(result)

    def test_get_best_entropic_attribute_empty_dataset(self):
        empty_dataset = DummyEmptyDataset()
        with self.assertRaises(ValueError):
            _get_best_conditional_entropic_attribute(
                empty_dataset,
                current_attributes=AttributeSet({ATTRIBUTES[0],
                                                 ATTRIBUTES[1]}),
                candidate_attributes=self._attribute_set)

    def test_get_best_entropic_attribute_empty_candidates_and_dataset(self):
        empty_dataset = DummyEmptyDataset()
        result = _get_best_conditional_entropic_attribute(
            empty_dataset,
            current_attributes=AttributeSet({ATTRIBUTES[0], ATTRIBUTES[1]}),
            candidate_attributes=AttributeSet())
        self.assertIsNone(result)

    def test_get_best_entropic_attribute_unexistent_attribute(self):
        self._attribute_set.add(UNEXISTENT_ATTRIBUTE)
        with self.assertRaises(KeyError):
            _get_best_conditional_entropic_attribute(
                self._dataset,
                current_attributes=AttributeSet({ATTRIBUTES[0],
                                                 ATTRIBUTES[1]}),
                candidate_attributes=self._attribute_set)
def _search_for_solution(self):
    """Search for a solution using the entropy-based exploration algorithm.

    This function has to
    - Set the best solution currently found (AttributeSet).
    - Update the set of the attribute sets that satisfy the sensitivity
      threshold (Set[AttributeSet]).
    - Update the list of the explored attribute sets which is the trace of
      the execution. The information regarding an explored attribute set is
      stored as a dictionary with the following key/values:
      * time (float): The time spent since the start of the exploration in
        seconds (use timedelta.total_seconds()).
      * attributes (Set[int]): The set of the ids of the attributes.
      * sensitivity (float): The sensitivity of the attribute set.
      * usability_cost (float): The usability cost of the attribute set.
      * cost_explanation (Dict[str, float]): The explanation of the cost of
        the attribute set.
      * state (State): The state of this attribute set (see State class).
    - Log the explored attribute sets for debugging purposes using loguru.

    Note:
        We use the ids of the attributes instead of their name to reduce the
        size of the trace in memory and when saved in json format.
    """
    # The temporary solution (empty set) and the current sensitivity (1.0
    # as it is equivalent to no browser fingerprinting used at all)
    temp_solution, sensitivity = AttributeSet(), 1.0

    # We already checked that the sensitivity threshold is reachable, hence
    # we always reach it when processing the exploration
    while sensitivity > self._sensitivity_threshold:

        # Find the attribute that has the highest conditional entropy
        best_cond_ent_attr = _get_best_conditional_entropic_attribute(
            self._dataset, temp_solution,
            self._dataset.candidate_attributes)

        # NOTE Removed as we already check that a solution exists before
        #      running the exploration. As a result, we always reach an
        #      attribute set that satisfies the sensitivity threshold, the
        #      complete set of the candidate attributes in the worst case.
        # If no more solution is proposed, end the exploration
        # if not best_cond_ent_attr:
        #     break

        # Add this attribute to the temporary solution
        temp_solution.add(best_cond_ent_attr)

        # Compute its sensitivity and its cost
        logger.debug(f'Exploring {temp_solution}...')
        sensitivity = self._sensitivity.evaluate(temp_solution)
        cost, cost_explanation = (
            self._usability_cost.evaluate(temp_solution))
        logger.debug(f'  Sensitivity ({sensitivity}), '
                     f'usability cost ({cost})')

        # If it satisfies the sensitivity threshold, save it as the current
        # solution (the while condition then ends the loop)
        if sensitivity <= self._sensitivity_threshold:
            self._update_solution(temp_solution)
            attribute_set_state = State.SATISFYING
            self._add_satisfying_attribute_set(temp_solution)
        else:
            attribute_set_state = State.EXPLORED

        # Store this attribute set in the explored sets
        compute_time = (datetime.now() - self._start_time).total_seconds()
        self._add_explored_attribute_set({
            TraceData.TIME: compute_time,
            TraceData.ATTRIBUTES: temp_solution.attribute_ids,
            TraceData.SENSITIVITY: sensitivity,
            TraceData.USABILITY_COST: cost,
            TraceData.COST_EXPLANATION: cost_explanation,
            TraceData.STATE: attribute_set_state
        })
def _search_for_solution(self):
    """Search for a solution using the entropy-based exploration algorithm.

    This function has to
    - Set the best solution currently found (AttributeSet).
    - Update the set of the attribute sets that satisfy the sensitivity
      threshold (Set[AttributeSet]).
    - Update the list of the explored attribute sets which is the trace of
      the execution. The information regarding an explored attribute set is
      stored as a dictionary with the following key/values:
      * time (float): The time spent since the start of the exploration in
        seconds (use timedelta.total_seconds()).
      * attributes (Set[int]): The set of the ids of the attributes.
      * sensitivity (float): The sensitivity of the attribute set.
      * usability_cost (float): The usability cost of the attribute set.
      * cost_explanation (Dict[str, float]): The explanation of the cost of
        the attribute set.
      * state (State): The state of this attribute set (see State class).
    - Log the explored attribute sets for debugging purposes using loguru.

    Note:
        We use the ids of the attributes instead of their name to reduce the
        size of the trace in memory and when saved in json format.
    """
    # Get a dictionary of the entropy of each attribute
    logger.info('Computing the entropy of each attribute...')
    attributes_entropy = _get_attributes_entropy(
        self._dataset, self._dataset.candidate_attributes)
    entropy_compute_time = datetime.now() - self._start_time
    logger.info('Entropy of the attributes computed after '
                f'{entropy_compute_time}.')

    # Take the attributes in descending order of their entropy
    attribute_set = AttributeSet()
    for attribute, _ in sort_dict_by_value(attributes_entropy,
                                           reverse=True):
        # Check the new attribute set that is obtained
        attribute_set.add(attribute)
        logger.debug(f'Exploring {attribute_set}...')

        # Compute its sensitivity and its cost
        sensitivity = self._sensitivity.evaluate(attribute_set)
        cost, cost_explanation = (
            self._usability_cost.evaluate(attribute_set))
        logger.debug(f'  Sensitivity ({sensitivity}), '
                     f'usability cost ({cost})')

        # If it satisfies the sensitivity threshold, save the solution and
        # quit the loop
        if sensitivity <= self._sensitivity_threshold:
            self._update_solution(attribute_set)
            self._add_satisfying_attribute_set(attribute_set)

            # Store this attribute set in the explored sets
            compute_time = (datetime.now()
                            - self._start_time).total_seconds()
            self._add_explored_attribute_set({
                TraceData.TIME: compute_time,
                TraceData.ATTRIBUTES: attribute_set.attribute_ids,
                TraceData.SENSITIVITY: sensitivity,
                TraceData.USABILITY_COST: cost,
                TraceData.COST_EXPLANATION: cost_explanation,
                TraceData.STATE: State.SATISFYING
            })

            # Quit the loop as we have found a solution
            break

        # If it does not satisfy the sensitivity threshold, we continue
        compute_time = (datetime.now() - self._start_time).total_seconds()
        self._add_explored_attribute_set({
            TraceData.TIME: compute_time,
            TraceData.ATTRIBUTES: attribute_set.attribute_ids,
            TraceData.SENSITIVITY: sensitivity,
            TraceData.USABILITY_COST: cost,
            TraceData.COST_EXPLANATION: cost_explanation,
            TraceData.STATE: State.EXPLORED
        })
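# For reference, the shape of one entry of the exploration trace built by the
# two _search_for_solution variants above (the values, including the cost
# explanation keys, are illustrative only):
_EXAMPLE_TRACE_ENTRY = {
    TraceData.TIME: 0.042,          # seconds since the exploration start
    TraceData.ATTRIBUTES: {1, 3},   # ids of the attributes of the set
    TraceData.SENSITIVITY: 0.15,
    TraceData.USABILITY_COST: 12.5,
    TraceData.COST_EXPLANATION: {'memory': 7.5, 'instability': 5.0},
    TraceData.STATE: State.EXPLORED,
}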
class TestAttributeSetUnicity(unittest.TestCase):

    def setUp(self):
        self._dataset = DummyCleanDataset()
        self._attribute_set = AttributeSet(ATTRIBUTES)
        self._csv_result_path = CSV_RESULT_PATH

    def check_unicity_result(self, expected_unique_fps: int):
        total_browsers = len(self._dataset.dataframe)
        attribute_set_unicity_analysis = AttributeSetUnicity(
            self._dataset, self._attribute_set)
        attribute_set_unicity_analysis.execute()
        analysis_result = attribute_set_unicity_analysis.result
        expected_result = {
            UNIQUE_FPS_RESULT: expected_unique_fps,
            TOTAL_BROWSERS_RESULT: total_browsers,
            UNICITY_RATE_RESULT: expected_unique_fps / total_browsers
        }
        for result_name, expected_value in expected_result.items():
            self.assertAlmostEqual(analysis_result[result_name],
                                   expected_value)

    def test_empty_dataset_and_empty_attribute_set(self):
        self._dataset = DummyEmptyDataset()
        self._attribute_set = AttributeSet()
        with self.assertRaises(ValueError):
            self.check_unicity_result(WONT_COMPUTE)

    def test_empty_dataset(self):
        self._dataset = DummyEmptyDataset()
        with self.assertRaises(ValueError):
            self.check_unicity_result(WONT_COMPUTE)

    def test_empty_attribute_set(self):
        self._attribute_set = AttributeSet()
        with self.assertRaises(ValueError):
            self.check_unicity_result(WONT_COMPUTE)

    def test_unexistent_attribute(self):
        self._attribute_set.add(UNEXISTENT_ATTRIBUTE)
        with self.assertRaises(KeyError):
            self.check_unicity_result(WONT_COMPUTE)

    def test_in_between_unicity(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[0]])
        self.check_unicity_result(1)

    def test_always_the_same_value(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[2]])
        self.check_unicity_result(0)

    def test_unique_values(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[1]])
        total_browsers = len(self._dataset.dataframe)
        self.check_unicity_result(total_browsers)

    def test_save_csv_result(self):
        attribute_set_unicity_analysis = AttributeSetUnicity(
            self._dataset, self._attribute_set)
        attribute_set_unicity_analysis.execute()
        attribute_set_unicity_analysis.save_csv_result(self._csv_result_path)
        remove(self._csv_result_path)
class TestComputeAttributesInstability(unittest.TestCase):

    def setUp(self):
        self._dataset = DummyCleanDataset()
        self._attributes = AttributeSet(ATTRIBUTES)

    def _get_grouped_by_browser(self):
        # 1. Group by the browser id (no sort for performance, no group keys
        #    to not add an additional column with the group key)
        # 2. Sort by the time of collection for each group (gives a
        #    DataFrame)
        # 3. Regroup by the browser id, here each group has the fingerprints
        #    sorted by the time of collection
        return (self._dataset.dataframe
                .groupby(MetadataField.BROWSER_ID, sort=False,
                         group_keys=False)
                .apply(lambda group_df: group_df.sort_values(
                    MetadataField.TIME_OF_COLLECT))
                .groupby(MetadataField.BROWSER_ID, sort=False,
                         group_keys=False))

    def test_empty_dataset(self):
        self._dataset = DummyEmptyDataset()
        grouped_by_browser = self._get_grouped_by_browser()
        attributes_instability = _compute_attributes_instability(
            grouped_by_browser, self._attributes)
        expected_result = {
            ATTRIBUTES[0]: 0.0, ATTRIBUTES[1]: 0.0, ATTRIBUTES[2]: 0.0
        }
        self.assertDictEqual(expected_result, attributes_instability)

    def test_unexistent_attribute(self):
        self._attributes.add(UNEXISTENT_ATTRIBUTE)
        grouped_by_browser = self._get_grouped_by_browser()
        with self.assertRaises(KeyError):
            _compute_attributes_instability(grouped_by_browser,
                                            self._attributes)

    def test_empty_attributes(self):
        self._attributes = AttributeSet()
        grouped_by_browser = self._get_grouped_by_browser()
        attributes_instability = _compute_attributes_instability(
            grouped_by_browser, self._attributes)
        expected_result = {}
        self.assertDictEqual(expected_result, attributes_instability)

    def test_empty_dataset_and_attributes(self):
        self._dataset = DummyEmptyDataset()
        self._attributes = AttributeSet()
        grouped_by_browser = self._get_grouped_by_browser()
        attributes_instability = _compute_attributes_instability(
            grouped_by_browser, self._attributes)
        expected_result = {}
        self.assertDictEqual(expected_result, attributes_instability)

    def test_clean_dataset(self):
        grouped_by_browser = self._get_grouped_by_browser()
        attributes_instability = _compute_attributes_instability(
            grouped_by_browser, self._attributes)
        expected_result = {
            ATTRIBUTES[0]: 0.0, ATTRIBUTES[1]: 0.0, ATTRIBUTES[2]: 0.0
        }
        self.assertDictEqual(expected_result, attributes_instability)

    def test_dummy_fingerprint_dataset(self):
        self._dataset = DummyFingerprintDataset()
        grouped_by_browser = self._get_grouped_by_browser()
        attributes_instability = _compute_attributes_instability(
            grouped_by_browser, self._attributes)
        expected_result = {
            ATTRIBUTES[0]: 0.0, ATTRIBUTES[1]: 0.0, ATTRIBUTES[2]: 0.0
        }
        self.assertDictEqual(expected_result, attributes_instability)

    def test_dummy_dataset_with_changes(self):
        self._dataset = DummyDatasetWithChanges()
        grouped_by_browser = self._get_grouped_by_browser()
        attributes_instability = _compute_attributes_instability(
            grouped_by_browser, self._attributes)
        expected_result = {
            ATTRIBUTES[0]: 1 / 2, ATTRIBUTES[1]: 1.0, ATTRIBUTES[2]: 0.0
        }
        self.assertDictEqual(expected_result, attributes_instability)
class TestAttributeSetSample(unittest.TestCase):

    def setUp(self):
        self._dataset = DummyCleanDataset()
        self._attribute_set = AttributeSet(ATTRIBUTES)
        self._sample_size = SAMPLE_SIZE
        self._csv_result_path = CSV_RESULT_PATH

    def check_sample_result(self, possible_values: Set[tuple]):
        attribute_set_sample_analysis = AttributeSetSample(
            self._dataset, self._attribute_set, self._sample_size)
        attribute_set_sample_analysis.execute()
        analysis_result = attribute_set_sample_analysis.result
        first_fingerprint = next(iter(analysis_result.values()))
        expected_sample_size = min(self._sample_size,
                                   len(self._dataset.dataframe))
        self.assertEqual(len(first_fingerprint), len(self._attribute_set))
        self.assertEqual(expected_sample_size, len(analysis_result))
        for sample_fingerprint in analysis_result.values():
            self.assertIn(sample_fingerprint, possible_values)

    def test_wrong_sample_size(self):
        with self.assertRaises(AttributeError):
            wrong_sample_size = 0
            AttributeSetSample(self._dataset, self._attribute_set,
                               wrong_sample_size)
        with self.assertRaises(AttributeError):
            wrong_sample_size = -3
            AttributeSetSample(self._dataset, self._attribute_set,
                               wrong_sample_size)

    def test_empty_dataset_and_empty_attribute_set(self):
        self._dataset = DummyEmptyDataset()
        self._attribute_set = AttributeSet()
        with self.assertRaises(ValueError):
            self.check_sample_result(WONT_COMPUTE)

    def test_empty_dataset(self):
        self._dataset = DummyEmptyDataset()
        with self.assertRaises(ValueError):
            self.check_sample_result(WONT_COMPUTE)

    def test_empty_attribute_set(self):
        self._attribute_set = AttributeSet()
        with self.assertRaises(ValueError):
            self.check_sample_result(WONT_COMPUTE)

    def test_unexistent_attribute(self):
        self._attribute_set.add(UNEXISTENT_ATTRIBUTE)
        with self.assertRaises(KeyError):
            self.check_sample_result(WONT_COMPUTE)

    def test_first_attribute_only(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[0]])
        possible_values = set(product(
            self._dataset.DATAS[ATTRIBUTES[0].name]))
        self.check_sample_result(possible_values)

    def test_second_attribute_only(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[1]])
        possible_values = set(product(
            self._dataset.DATAS[ATTRIBUTES[1].name]))
        self.check_sample_result(possible_values)

    def test_third_attribute_only(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[2]])
        possible_values = set(product(
            self._dataset.DATAS[ATTRIBUTES[2].name]))
        self.check_sample_result(possible_values)

    def test_two_attributes(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[0], ATTRIBUTES[1]])
        first_attribute_values = set(self._dataset.DATAS[ATTRIBUTES[0].name])
        second_attribute_values = set(self._dataset.DATAS[ATTRIBUTES[1].name])
        possible_values = set(
            product(first_attribute_values, second_attribute_values))
        self.check_sample_result(possible_values)

    def test_all_the_attributes(self):
        self._attribute_set = AttributeSet(ATTRIBUTES)
        first_attribute_values = set(self._dataset.DATAS[ATTRIBUTES[0].name])
        second_attribute_values = set(self._dataset.DATAS[ATTRIBUTES[1].name])
        third_attribute_values = set(self._dataset.DATAS[ATTRIBUTES[2].name])
        possible_values = set(
            product(first_attribute_values, second_attribute_values,
                    third_attribute_values))
        self.check_sample_result(possible_values)

    def test_first_attribute_only_higher_sample_size(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[0]])
        self._sample_size = (len(self._dataset.dataframe)
                             + SAMPLE_SIZE_INCREASE)
        with self.assertRaises(ValueError):
            self.check_sample_result(WONT_COMPUTE)

    def test_two_attributes_higher_sample_size(self):
        self._attribute_set = AttributeSet([ATTRIBUTES[0], ATTRIBUTES[1]])
        self._sample_size = (len(self._dataset.dataframe)
                             + SAMPLE_SIZE_INCREASE)
        with self.assertRaises(ValueError):
            self.check_sample_result(WONT_COMPUTE)

    def test_all_the_attributes_higher_sample_size(self):
        self._attribute_set = AttributeSet(ATTRIBUTES)
        self._sample_size = (len(self._dataset.dataframe)
                             + SAMPLE_SIZE_INCREASE)
        with self.assertRaises(ValueError):
            self.check_sample_result(WONT_COMPUTE)

    def test_save_csv_result(self):
        attribute_set_sample_analysis = AttributeSetSample(
            self._dataset, self._attribute_set, self._sample_size)
        attribute_set_sample_analysis.execute()
        attribute_set_sample_analysis.save_csv_result(self._csv_result_path)
        remove(self._csv_result_path)