Example No. 1
    def _default_parameters(self) -> Dict[str, Any]:
        """Give the parameters that are always present in an exploration.

        Returns:
            A dictionary with the default parameters of an exploration.
        """
        analysis_engine = params.get('DataAnalysis', 'engine')
        if analysis_engine == ANALYSIS_ENGINES[1]:
            modin_engine = params.get('DataAnalysis', 'modin_engine')
            analysis_engine += f"[{modin_engine}]"
        return {
            ExplorationParameters.METHOD:
            self.__class__.__name__,
            ExplorationParameters.SENSITIVITY_MEASURE:
            str(self._sensitivity),
            ExplorationParameters.USABILITY_COST_MEASURE:
            str(self._usability_cost),
            ExplorationParameters.DATASET:
            str(self._dataset),
            ExplorationParameters.SENSITIVITY_THRESHOLD:
            (self._sensitivity_threshold),
            ExplorationParameters.ANALYSIS_ENGINE:
            analysis_engine,
            ExplorationParameters.MULTIPROCESSING:
            params.getboolean('Multiprocessing', 'explorations'),
            ExplorationParameters.FREE_CORES:
            params.getint('Multiprocessing', 'free_cores')
        }
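
A minimal sketch of the engine-name composition above, assuming ANALYSIS_ENGINES is a sequence whose second entry is the Modin engine (only index 1 is visible in this excerpt, so the concrete values are illustrative):

ANALYSIS_ENGINES = ('pandas', 'modin')  # hypothetical values

analysis_engine = 'modin'  # stand-in for params.get('DataAnalysis', 'engine')
modin_engine = 'ray'       # stand-in for params.get('DataAnalysis', 'modin_engine')
if analysis_engine == ANALYSIS_ENGINES[1]:
    # Append the Modin backend, e.g. 'modin' becomes 'modin[ray]'
    analysis_engine += f'[{modin_engine}]'
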
Example No. 2
    def _execute_using_multiprocessing(self):
        """Measure the average fingerprint size using multiprocessing."""
        # The sorted mapping from each Attribute to its average size
        attributes_avg_size = SortedDict()

        # Infer the number of cores to use
        free_cores = params.getint('Multiprocessing', 'free_cores')
        nb_cores = max(cpu_count() - free_cores, 1)
        nb_attributes = len(self._dataset.candidate_attributes)
        attributes_per_core = int(ceil(nb_attributes / nb_cores))
        logger.debug(f'Sharing {nb_attributes} attributes over '
                     f'{nb_cores}(+{free_cores}) cores, hence '
                     f'{attributes_per_core} attributes per core.')

        def update_attributes_average_size(attrs_size: Dict[Attribute, float]):
            """Update the complete dictionary attributes_avg_size.

            Args:
                attrs_size: The dictionary containing the subset of the results
                            computed by a process.

            Note: This is executed in the main process, by the pool's
                  result-handler thread, so it does not pose any concurrency
                  or synchronization problem.
            """
            for attribute, attribute_average_size in attrs_size.items():
                attributes_avg_size[attribute] = attribute_average_size

        # Spawn a number of processes equal to the number of cores
        attributes_list = list(self._dataset.candidate_attributes)
        async_results = []
        with Pool(processes=nb_cores) as pool:
            for process_id in range(nb_cores):
                # Generate the candidate attributes for this process
                start_id = process_id * attributes_per_core
                end_id = (process_id + 1) * attributes_per_core
                attributes_subset = AttributeSet(
                    attributes_list[start_id:end_id])

                async_result = pool.apply_async(
                    _compute_attribute_avg_size,
                    args=(self._dataset.dataframe, attributes_subset),
                    callback=update_attributes_average_size)
                async_results.append(async_result)

            # Wait for all the processes to finish (otherwise we would exit
            # before collecting their results)
            for async_result in async_results:
                async_result.wait()

        self._result = attributes_avg_size
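
The chunking arithmetic above splits the attribute list into at most nb_cores contiguous slices; the last slice may be shorter (or empty) when the division is not exact, which is harmless since an empty subset simply yields an empty partial result. A self-contained sketch with illustrative numbers:

from math import ceil

nb_cores = 4
attributes = list(range(10))  # stand-in for the candidate attributes
attributes_per_core = int(ceil(len(attributes) / nb_cores))  # 3
chunks = [attributes[i * attributes_per_core:(i + 1) * attributes_per_core]
          for i in range(nb_cores)]
print(chunks)  # [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
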
Example No. 3
    def _expand_s(self) -> Set[AttributeSet]:
        """Expand the set S to obtain the attribute sets to explore.

        For each S_i of S, we generate the attribute sets to explore that are
        composed of each S_i with one more attribute that is not in S_i.
        E <-- {C = S_i Union {a} :
               For all S_i in S, For all a in A, a Not in S_i}

        We do not keep C in E if:
        - It is a superset of an attr. set of T (i.e., a superset of an
          attr. set that satisfies the sensitivity threshold).
        - The pruning methods are used and C is a superset of one of the
          attr. sets whose supersets are to be ignored.

        Returns:
            The set E of the next attribute sets to explore.
        """
        # The set E of the next attribute sets to explore
        next_attr_sets_to_explore = set()

        # If we execute on a single process
        if not params.getboolean('Multiprocessing', 'explorations'):
            logger.debug('Expanding the next attribute sets to explore on a '
                         'single process...')
            return _expand_attribute_sets(
                self._attribute_sets_to_expand,
                self._dataset.candidate_attributes,
                self.get_satisfying_attribute_sets(),
                self._attribute_sets_ignored_supersets, self._pruning)

        # Infer the number of cores to use
        free_cores = params.getint('Multiprocessing', 'free_cores')
        nb_cores = max(cpu_count() - free_cores, 1)
        attribute_sets_per_core = int(ceil(len(self._attribute_sets_to_expand)
                                           / nb_cores))
        logger.debug(f'Sharing {len(self._attribute_sets_to_expand)} attribute'
                     f' sets to expand over {nb_cores}(+{free_cores}) cores, '
                     f'hence {attribute_sets_per_core} attribute sets per '
                     'core.')

        def update_next_attribute_sets_to_explore(result: Set[AttributeSet]):
            """Update the complete set of the next attribute sets to explore.

            Args:
                result: The next attribute sets to explore given by a process.

            Note: This is executed in the main process, by the pool's
                  result-handler thread, so it does not pose any concurrency
                  or synchronization problem.
            """
            next_attr_sets_to_explore.update(result)

        # Spawn a number of processes equal to the number of cores
        satisfying_attribute_sets = self.get_satisfying_attribute_sets()
        attribute_sets_to_expand_as_list = list(self._attribute_sets_to_expand)
        async_results = []
        with Pool(processes=nb_cores) as pool:
            for process_id in range(nb_cores):
                # Generate the attribute sets to expand for this process
                start_id = process_id * attribute_sets_per_core
                end_id = (process_id + 1) * attribute_sets_per_core
                process_attr_sets_to_expand = (
                    attribute_sets_to_expand_as_list[start_id:end_id])

                async_result = pool.apply_async(
                    _expand_attribute_sets,
                    args=(process_attr_sets_to_expand,
                          self._dataset.candidate_attributes,
                          satisfying_attribute_sets,
                          self._attribute_sets_ignored_supersets,
                          self._pruning),
                    callback=update_next_attribute_sets_to_explore)
                async_results.append(async_result)

            # Wait for all the processes to finish (otherwise we would exit
            # before collecting their results)
            for async_result in async_results:
                async_result.wait()

        return next_attr_sets_to_explore
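
The body of _expand_attribute_sets is not shown in this excerpt; a simplified sketch that follows the docstring above (an illustration, not the project's implementation) could look like:

def expand_attribute_sets_sketch(sets_to_expand, candidate_attributes,
                                 satisfying_sets, ignored_supersets, pruning):
    """Generate each S_i plus one attribute, filtering as described above."""
    expanded = set()
    for s_i in sets_to_expand:
        for attribute in candidate_attributes:
            if attribute in s_i:
                continue
            candidate = frozenset(s_i) | {attribute}
            # Do not keep supersets of attribute sets that already satisfy
            # the sensitivity threshold.
            if any(candidate >= frozenset(t) for t in satisfying_sets):
                continue
            # When pruning, also skip the supersets that were marked to be
            # ignored.
            if pruning and any(candidate >= frozenset(p)
                               for p in ignored_supersets):
                continue
            expanded.add(candidate)
    return expanded
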
Example No. 4
    def _explore_level(self, attribute_sets_to_explore: Set[AttributeSet]
                       ) -> Dict[AttributeSet, float]:
        """Explore the attribute sets of a level (i.e., having the same size).

        Args:
            attribute_sets_to_explore: The attribute sets of this level to
                                       explore.

        Returns:
            The resulting attribute sets that can be explored with their
            efficiency. Only a subset of these will actually be explored.
        """
        # Dictionary of { AttributeSet => efficiency } to build the list of the
        # next attribute sets to explore that will be sorted at the end
        attribute_sets_efficiency = {}

        def update_after_exploration(result: Tuple[Dict[AttributeSet, float],
                                                   Set[AttributeSet],
                                                   Set[AttributeSet]]):
            """Update the informations after exploring a level.

            Args:
                result: A triplet with
                    - The attribute sets that could be explored afterwards and
                      their efficiency.
                    - The attribute sets that satisfy the threshold.
                    - The attribute sets whose supersets are to be ignored.

            Note: This is executed in the main process, by the pool's
                  result-handler thread, so it does not pose any concurrency
                  or synchronization problem.
            """
            attr_sets_eff, satisf_attr_sets, attr_sets_ign_sups = result
            attribute_sets_efficiency.update(attr_sets_eff)
            self._attribute_sets_ignored_supersets.update(attr_sets_ign_sups)
            self._satisfying_attribute_sets.extend(satisf_attr_sets)

        # If we execute on a single process
        if not params.getboolean('Multiprocessing', 'explorations'):
            logger.debug('Exploring the attribute sets of this level on a '
                         'single process...')
            update_after_exploration(
                _explore_attribute_sets(
                    attribute_sets_to_explore, self._sensitivity,
                    self._usability_cost, self._sensitivity_threshold,
                    self._max_cost, self._solution, self._explored_attr_sets,
                    self._start_time, self._pruning))
            return attribute_sets_efficiency

        # Infer the number of cores to use
        free_cores = params.getint('Multiprocessing', 'free_cores')
        nb_cores = max(cpu_count() - free_cores, 1)
        attribute_sets_per_core = int(ceil(len(attribute_sets_to_explore)
                                           / nb_cores))
        logger.debug(f'Sharing {len(attribute_sets_to_explore)} attribute sets'
                     f' to explore over {nb_cores}(+{free_cores}) cores, hence'
                     f' {attribute_sets_per_core} attribute sets per core.')

        # Spawn a number of processes equal to the number of cores
        attribute_sets_to_explore_list = list(attribute_sets_to_explore)
        async_results = []
        with Pool(processes=nb_cores) as pool:
            for process_id in range(nb_cores):

                # Generate the attribute sets to explore for this process
                start_id = process_id * attribute_sets_per_core
                end_id = (process_id + 1) * attribute_sets_per_core
                subset_attr_sets_to_explore = (
                    attribute_sets_to_explore_list[start_id:end_id])

                async_result = pool.apply_async(
                    _explore_attribute_sets,
                    args=(subset_attr_sets_to_explore, self._sensitivity,
                          self._usability_cost, self._sensitivity_threshold,
                          self._max_cost, self._solution,
                          self._explored_attr_sets, self._start_time,
                          self._pruning),
                    callback=update_after_exploration)
                async_results.append(async_result)

            # Wait for all the processes to finish (otherwise we would exit
            # before collecting their results)
            for async_result in async_results:
                async_result.wait()

        return attribute_sets_efficiency
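
The returned dictionary maps each explorable attribute set to its efficiency; as the docstring notes, only a subset of them is actually explored. Assuming the caller keeps the k most efficient sets (k being FPSelect's explored-paths parameter; this selection step is not shown here), it could be as simple as:

from operator import itemgetter

def select_best_paths_sketch(attribute_sets_efficiency, explored_paths):
    """Keep the explored_paths attribute sets with the highest efficiency."""
    ranked = sorted(attribute_sets_efficiency.items(),
                    key=itemgetter(1), reverse=True)
    return [attribute_set for attribute_set, _ in ranked[:explored_paths]]
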
Example No. 5
def attribute_set_information(attribute_set_id: int):
    """Show information about an attribute set.

    Args:
        attribute_set_id: The id of the attribute set to show.
    """
    global TRACE_DATA
    global FINGERPRINT_DATASET
    global REAL_TIME_EXPLORATION
    logger.info('Getting the information about the attribute set '
                f'{attribute_set_id}.')

    # Check that there is an explored attribute set with this id in the
    # trace
    attribute_set_infos = None
    if attribute_set_id == -1:
        attribute_set_infos = EMPTY_NODE
    elif REAL_TIME_EXPLORATION:
        attribute_set_infos_list = (
            REAL_TIME_EXPLORATION.get_explored_attribute_sets(
                attribute_set_id, attribute_set_id + 1))
        if attribute_set_infos_list:
            attribute_set_infos = attribute_set_infos_list[0]
            attribute_set_infos['id'] = attribute_set_id
    elif TRACE_DATA:
        for explored_attr_set in TRACE_DATA['exploration']:
            if explored_attr_set['id'] == attribute_set_id:
                attribute_set_infos = explored_attr_set
                break
    else:
        error_message = ('Accessing the attribute set information page '
                         'requires a trace or a real time exploration to be '
                         'set.')
        logger.error(error_message)
        abort(HTTPStatus.NOT_FOUND, description=error_message)

    if not attribute_set_infos:
        error_message = (f'The attribute set id {attribute_set_id} was not'
                         ' found.')
        logger.error(error_message)
        abort(HTTPStatus.NOT_FOUND, description=error_message)

    # Generate the attribute set object and get the names of these attributes
    if REAL_TIME_EXPLORATION:
        attributes = AttributeSet(
            FINGERPRINT_DATASET.candidate_attributes.get_attribute_by_id(
                attribute_id)
            for attribute_id in attribute_set_infos['attributes'])
    elif TRACE_DATA:
        attributes = AttributeSet(
            Attribute(attribute_id, TRACE_DATA['attributes'][str(
                attribute_id)])
            for attribute_id in attribute_set_infos['attributes'])
    attribute_names = [attribute.name for attribute in attributes]

    # If there is a fingerprint dataset, compute the additional/optional
    # results from it (the subset for now)
    fingerprint_sample = None
    if attribute_set_id == -1:
        pass  # Avoid trying to get the subset with an empty attribute set
    elif FINGERPRINT_DATASET:
        # Collect a sample of the resulting fingerprints
        attr_subset_sample = AttributeSetSample(
            FINGERPRINT_DATASET, attributes,
            params.getint('WebServer', 'fingerprint_sample_size'))
        attr_subset_sample.execute()
        fingerprint_sample = attr_subset_sample.result
    else:
        flash(
            'Please provide a fingerprint dataset to obtain more insight on '
            'the selected attributes',
            params.get('WebServer', 'flash_info_class'))

    # Compute the textual representation of the state of this attribute set
    attribute_set_state = None
    if attribute_set_infos['state'] == State.EXPLORED:
        attribute_set_state = 'Explored'
    elif attribute_set_infos['state'] == State.PRUNED:
        attribute_set_state = 'Pruned'
    elif attribute_set_infos['state'] == State.SATISFYING:
        attribute_set_state = 'Satisfying the threshold'
    elif attribute_set_infos['state'] == State.EMPTY_NODE:
        attribute_set_state = 'Starting empty node'

    # Prepare a dictionary with the cost percentage of each dimension
    # { cost dimension => (bootstrap progress bar class,  # for pretty display
    #                      percentage of the cost of the candidate attributes)
    # }
    usability_cost_ratio = {}
    if REAL_TIME_EXPLORATION:
        candidate_attributes_infos = (
            REAL_TIME_EXPLORATION.get_explored_attribute_sets(0, 1)[0])
    elif TRACE_DATA:
        candidate_attributes_infos = TRACE_DATA['exploration'][0]
    bootstrap_progess_bars = (params.get(
        'WebServer', 'bootstrap_progess_bars').splitlines())

    # The total usability cost
    cost_percentage = (100 * attribute_set_infos['usability_cost'] /
                       candidate_attributes_infos['usability_cost'])
    usability_cost_ratio['usability'] = (bootstrap_progess_bars[0],
                                         '%.2f' % cost_percentage)

    if attribute_set_id > -1:
        # For each cost dimension except the "weighted" ones
        can_attrs_cost_explanation = candidate_attributes_infos[
            'cost_explanation']
        progress_bar_class_id = 1  # 0 already taken
        for cost_dimension, cost_value in can_attrs_cost_explanation.items():
            if cost_dimension.startswith('weighted'):
                continue
            cost_percentage = (
                100 * attribute_set_infos['cost_explanation'][cost_dimension] /
                cost_value)
            usability_cost_ratio[cost_dimension] = (
                bootstrap_progess_bars[progress_bar_class_id %
                                       len(bootstrap_progess_bars)],
                '%.2f' % cost_percentage)
            progress_bar_class_id += 1

    # Display the attribute information page
    return render_template('attribute-set-information.html',
                           attribute_set_infos=attribute_set_infos,
                           attribute_names=attribute_names,
                           attribute_set_state=attribute_set_state,
                           usability_cost_ratio=usability_cost_ratio,
                           fingerprint_sample=fingerprint_sample,
                           javascript_parameters=params)
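
The route decorator does not appear in this excerpt; wiring the view up might look like the following (the URL pattern is an assumption):

# Hypothetical registration; the actual URL pattern is not shown here.
app.add_url_rule('/attribute-set/<int:attribute_set_id>',
                 view_func=attribute_set_information)
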
Example No. 6
def real_time_exploration_configuration():
    """Configure the assets for a real time exploration."""
    global TRACE_DATA
    global FINGERPRINT_DATASET
    global REAL_TIME_EXPLORATION
    global EXPLORATION_PROCESS

    # The exploration methods, sensitivity and usability cost measures
    exploration_methods = list(EXPLORATION_METHODS.keys())
    sensitivity_measures = list(SENSITIVITY_MEASURES.keys())
    usability_cost_measures = list(USABILITY_COST_MEASURES.keys())

    # We store a dictionary mapping each form field to an error message if the
    # field is invalid
    errors = {}

    # -------------------------- POST request handle --------------------------
    if request.method == 'POST':
        # Clear the previous data if there were some
        TRACE_DATA, FINGERPRINT_DATASET = None, None
        if EXPLORATION_PROCESS:
            EXPLORATION_PROCESS.terminate()
            EXPLORATION_PROCESS = None
        REAL_TIME_EXPLORATION = None

        # ------------ Manage the required fingerprint dataset file -----------
        # Check that the dataset file is in the received POST request
        fp_dataset_error_message = erroneous_post_file(
            request, 'fingerprint-dataset', expected_extension='csv')
        if fp_dataset_error_message:
            errors['fingerprint-dataset'] = fp_dataset_error_message
        # ---- End of the management of the required fingerprint dataset file -

        # ------------------ Handle the sensitivity threshold -----------------
        sens_thresh_error_message = erroneous_field(
            request, 'sensitivity-threshold',
            lambda v: v and is_str_float(v) and float(v) >= 0.0,
            'The sensitivity threshold should be a positive float.')
        if sens_thresh_error_message:
            errors['sensitivity-threshold'] = sens_thresh_error_message
        else:
            sensitivity_threshold = float(
                request.form['sensitivity-threshold'])
        # -------------- End of handle the sensitivity threshold --------------

        # ------------------- Handle the exploration method -------------------
        exploration_method_error_message = erroneous_field(
            request, 'exploration-method', lambda v: v in exploration_methods,
            'The exploration method is unknown.')
        if exploration_method_error_message:
            errors['exploration-method'] = exploration_method_error_message
        else:
            exploration_method = request.form['exploration-method']
        # --------------- End of handle the exploration method ----------------

        # ------------------ Handle the FPSelect parameters -------------------
        fpselect_method_name = exploration_methods[0]
        if exploration_method == fpselect_method_name:
            use_pruning_methods = 'use-pruning-methods' in request.form

            # Check that it is a strictly positive integer comprised in the
            # expected range
            minimum_explored_paths = params.getint(
                'WebServer', 'fpselect_minimum_explored_paths')
            maximum_explored_paths = params.getint(
                'WebServer', 'fpselect_maximum_explored_paths')
            explored_paths_error_message = erroneous_field(
                request, 'explored-paths',
                lambda v: v.isdigit() and (0 < minimum_explored_paths <= int(v)
                                           <= maximum_explored_paths),
                'The number of explored paths is required to be a strictly '
                'positive integer comprised in '
                f'[{minimum_explored_paths}; {maximum_explored_paths}].')
            if explored_paths_error_message:
                errors['explored-paths'] = explored_paths_error_message
            else:
                explored_paths = int(request.form['explored-paths'])
        # -------------- End of handle the FPSelect parameters ----------------

        # ------------------ Handle the sensitivity measure -------------------
        sensitivity_measure_error_message = erroneous_field(
            request, 'sensitivity-measure',
            lambda v: v in sensitivity_measures,
            'Unknown sensitivity measure.')
        if sensitivity_measure_error_message:
            errors['sensitivity-measure'] = sensitivity_measure_error_message
        else:
            sensitivity_measure = request.form['sensitivity-measure']
        # -------------- End of handle the sensitivity measure ----------------

        # ------------- Handle the most common fingerprints (=k) --------------
        top_k_fps_sens_meas = sensitivity_measures[0]
        if sensitivity_measure == top_k_fps_sens_meas:
            minimum_common_fps = params.getint(
                'WebServer', 'top_k_fingerprints_sensitivity_measure_min_k')
            maximum_common_fps = params.getint(
                'WebServer', 'top_k_fingerprints_sensitivity_measure_max_k')

            # Check that it is a strictly positive integer comprised in the
            # expected range
            top_k_fps_error_message = erroneous_field(
                request, 'most-common-fingerprints', lambda v: v.isdigit() and
                (0 < minimum_common_fps <= int(v) <= maximum_common_fps),
                'The number of most common fingerprints is required to be a '
                'strictly positive integer comprised in the range '
                f'[{minimum_common_fps}; {maximum_common_fps}].')
            if top_k_fps_error_message:
                errors['most-common-fingerprints'] = top_k_fps_error_message
            else:
                most_common_fingerprints = int(
                    request.form['most-common-fingerprints'])
        # --------- End of handle the most common fingerprints (=k) -----------

        # --- Initialize the dataset (needed to process the usability costs)
        candidate_attributes = None
        try:
            FINGERPRINT_DATASET = FingerprintDatasetFromCSVInMemory(
                request.files['fingerprint-dataset'])

            # We will need the candidate attributes afterwards
            candidate_attributes = FINGERPRINT_DATASET.candidate_attributes
        except MissingMetadatasFields as mmf_error:
            error_message = str(mmf_error)
            flash(error_message, params.get('WebServer', 'flash_error_class'))
            logger.error(error_message)
            errors['fingerprint-dataset'] = error_message

        logger.debug('The fingerprint dataset is set.')

        # ----------------- Handle the usability cost measure -----------------
        # The weights of the cost dimensions
        cost_dim_weights = {}

        # Check the chosen usability cost measure
        usab_cost_meas_error_message = erroneous_field(
            request, 'usability-cost-measure',
            lambda v: v in usability_cost_measures,
            'Unknown usability cost measure.')
        if usab_cost_meas_error_message:
            errors['usability-cost-measure'] = usab_cost_meas_error_message
        else:
            usability_cost_measure = request.form['usability-cost-measure']
            # All the usability cost measures currently include the memory
            # and the instability costs; check these two

            # The memory cost results
            memory_file_error_message = erroneous_post_file(
                request, 'memory-cost-results', expected_extension='csv')
            if memory_file_error_message:
                errors['memory-cost-results'] = memory_file_error_message

            # The memory cost weight
            memory_weight_error_message = erroneous_field(
                request, 'memory-cost-weight',
                lambda v: v and is_str_float(v) and float(v) >= 0.0,
                'The memory cost weight should be a positive float.')
            if memory_weight_error_message:
                errors['memory-cost-weight'] = memory_weight_error_message
            else:
                cost_dim_weights[CostDimension.MEMORY] = float(
                    request.form['memory-cost-weight'])

            # Read the memory cost results
            if candidate_attributes:
                memory_cost_content = (request.files['memory-cost-results'].
                                       read().decode().splitlines())
                memory_costs = {}
                mem_file_reader = DictReader(memory_cost_content)
                for row in mem_file_reader:
                    try:
                        attribute = candidate_attributes.get_attribute_by_name(
                            row['attribute'])
                        memory_costs[attribute] = float(row['average_size'])
                    except KeyError as key_error:
                        error_message = (
                            f'The {key_error.args[0]} field is missing from '
                            'the memory cost results file.')
                        flash(error_message,
                              params.get('WebServer', 'flash_error_class'))
                        logger.error(error_message)
                        errors['memory-cost-results'] = error_message
                        break  # Exit the for loop

            # The instability cost results
            instab_file_error_message = erroneous_post_file(
                request, 'instability-cost-results', expected_extension='csv')
            if instab_file_error_message:
                errors['instability-cost-results'] = instab_file_error_message

            # The instability cost weight
            instab_weight_error_message = erroneous_field(
                request, 'instability-cost-weight',
                lambda v: v and is_str_float(v) and float(v) >= 0.0,
                'The instability cost weight should be a positive float.')
            if instab_weight_error_message:
                errors['instability-cost-weight'] = instab_weight_error_message
            else:
                cost_dim_weights[CostDimension.INSTABILITY] = float(
                    request.form['instability-cost-weight'])

            # Read the instability cost results
            if candidate_attributes:
                instability_cost_content = (
                    request.files['instability-cost-results'].read().decode(
                    ).splitlines())
                instability_costs = {}
                instability_file_reader = DictReader(instability_cost_content)
                for row in instability_file_reader:
                    try:
                        attribute = candidate_attributes.get_attribute_by_name(
                            row['attribute'])
                        instability_costs[attribute] = float(
                            row['proportion_of_changes'])
                    except KeyError as key_error:
                        error_message = (
                            f'The {key_error.args[0]} field is missing from '
                            'the instability cost results file.')
                        flash(error_message,
                              params.get('WebServer', 'flash_error_class'))
                        logger.error(error_message)
                        errors['instability-cost-results'] = error_message
                        break  # Exit the for loop

            # If there is also the collection time to consider
            mem_inst_time_usab_cost = usability_cost_measures[1]
            if usability_cost_measure == mem_inst_time_usab_cost:
                # The collection time cost results
                ct_file_err_mess = erroneous_post_file(
                    request,
                    'collection-time-cost-results',
                    expected_extension='csv')
                if ct_file_err_mess:
                    errors['collection-time-cost-results'] = ct_file_err_mess

                # The collection time cost weight
                col_time_weight_error_message = erroneous_field(
                    request, 'collection-time-cost-weight',
                    lambda v: v and is_str_float(v) and float(v) >= 0.0,
                    'The weight of the collection time cost should be a '
                    'positive float.')
                if col_time_weight_error_message:
                    errors['collection-time-cost-weight'] = (
                        col_time_weight_error_message)
                else:
                    cost_dim_weights[CostDimension.TIME] = float(
                        request.form['collection-time-cost-weight'])

                # Read the content of the collection time results
                if candidate_attributes:
                    collection_time_content = (
                        request.files['collection-time-cost-results'].read(
                        ).decode().splitlines())
                    collection_time_costs = {}
                    coll_time_file_reader = DictReader(collection_time_content)
                    for row in coll_time_file_reader:
                        try:
                            attribute = (
                                candidate_attributes.get_attribute_by_name(
                                    row['attribute']))
                            collection_time_costs[attribute] = (
                                float(row['average_collection_time']),
                                bool(row['is_asynchronous']))
                        except KeyError as key_error:
                            err_mess = (
                                f'The {key_error.args[0]} field is missing '
                                'from the collection time cost results file.')
                            flash(err_mess,
                                  params.get('WebServer', 'flash_error_class'))
                            logger.error(err_mess)
                            errors['collection-time-cost-results'] = err_mess
                            break  # Exit the for loop
        # ------------- End of handle the usability cost measure --------------

        # At the end, redirect to the real time exploration page if there are
        # no errors, otherwise redirect to the configuration page.
        if not errors:
            # --- Initialize the sensitivity measure
            sens_meas_class = SENSITIVITY_MEASURES[sensitivity_measure]
            # For now, the only available measure is TopKFingerprints
            actual_sens_meas = sens_meas_class(FINGERPRINT_DATASET,
                                               most_common_fingerprints)
            logger.debug('Initialized the sensitivity measure '
                         f'{actual_sens_meas}.')

            # --- Initialize the usability cost measure
            usab_cost_meas_class = USABILITY_COST_MEASURES[
                usability_cost_measure]

            if usability_cost_measure == mem_inst_time_usab_cost:
                # Initialize the memory, instability, and collection time
                actual_usab_cost_meas = usab_cost_meas_class(
                    memory_costs, instability_costs, collection_time_costs,
                    cost_dim_weights)
            else:
                actual_usab_cost_meas = usab_cost_meas_class(
                    memory_costs, instability_costs, cost_dim_weights)
            logger.debug('Initialized the usability cost measure '
                         f'{actual_usab_cost_meas}.')

            # --- Initialize the exploration class
            exploration_class = EXPLORATION_METHODS[exploration_method]

            # If FPSelect
            if exploration_method == fpselect_method_name:
                exploration = exploration_class(actual_sens_meas,
                                                actual_usab_cost_meas,
                                                FINGERPRINT_DATASET,
                                                sensitivity_threshold,
                                                explored_paths,
                                                use_pruning_methods)
            else:
                exploration = exploration_class(actual_sens_meas,
                                                actual_usab_cost_meas,
                                                FINGERPRINT_DATASET,
                                                sensitivity_threshold)
            logger.debug(f'Initialized the exploration {exploration}.')

            # Execute the exploration asynchronously before redirecting
            REAL_TIME_EXPLORATION = exploration
            EXPLORATION_PROCESS = REAL_TIME_EXPLORATION.run_asynchronous()

            logger.debug('Redirecting to the real time exploration page')
            return redirect(url_for('real_time_exploration'))
        # -------------------- End of POST request handle ---------------------

    # Show the real time exploration configuration page
    return render_template('real-time-exploration-configuration.html',
                           params=params,
                           errors=errors,
                           exploration_methods=exploration_methods,
                           sensitivity_measures=sensitivity_measures,
                           usability_cost_measures=usability_cost_measures)
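
Judging from its call sites, erroneous_field(request, field_name, is_valid, error_message) returns the error message when the form field is missing or fails the validation predicate, and a falsy value otherwise. A minimal sketch consistent with that contract (an inference, not the project's code):

from typing import Callable, Optional

def erroneous_field_sketch(request, field_name: str,
                           is_valid: Callable[[str], bool],
                           error_message: str) -> Optional[str]:
    """Return error_message if the form field is missing or invalid."""
    field_value = request.form.get(field_name)
    if field_value is None or not is_valid(field_value):
        return error_message
    return None
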
Example No. 7
                                                        UNIQUE_FPS_RESULT,
                                                        TOTAL_BROWSERS_RESULT)
from brfast.measures.sensitivity.fpselect import TopKFingerprints
from brfast.measures.usability_cost.fpselect import (CostDimension,
                                                     MemoryInstability,
                                                     MemoryInstabilityTime)
from brfast.utils.conversion import is_str_float
from brfast.webserver.files_verification import trace_file_errors
from brfast.webserver.form_validation import (erroneous_field,
                                              erroneous_post_file)

# The Flask application
app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = params.get('WebServer', 'upload_folder')
app.secret_key = secrets.token_bytes(
    params.getint('WebServer', 'secret_key_size'))

# Set the exploration methods, the sensitivity measures, and the usability cost
# measures below
EXPLORATION_METHODS = {
    'FPSelect': FPSelect,
    'Entropy': Entropy,
    'Conditional entropy': ConditionalEntropy
}
SENSITIVITY_MEASURES = {'Top-k fingerprints': TopKFingerprints}
USABILITY_COST_MEASURES = {
    'Memory and instability': MemoryInstability,
    'Memory, instability, and collection time': MemoryInstabilityTime
}

# The empty node that is virtually added to the explored attribute sets
Example No. 8
def _get_best_conditional_entropic_attribute(dataset: FingerprintDataset,
                                             current_attributes: AttributeSet,
                                             candidate_attributes: AttributeSet
                                             ) -> Attribute:
    """Get the attribute that provides the highest total entropy.

    When several attributes provide the same total entropy, the attribute of
    the lowest id is given. If no attribute increases the total entropy, we
    still provide the attribute of the lowest id.

    Args:
        dataset: The dataset used to compute the conditional entropy.
        current_attributes: The attributes that compose the current solution.
        candidate_attributes: The candidate attributes (i.e., those available).

    Raises:
        ValueError: There are candidate attributes and the fingerprint dataset
                    is empty.
        KeyError: One of the candidate attributes is not in the fingerprint
                  dataset.

    Returns:
        The attribute that has the highest conditional entropy among the
        candidate attributes and that is not part of the current attributes.
    """
    logger.debug('Getting the best conditional entropic attribute from '
                 f'{current_attributes}...')

    # Some checks before starting the exploration
    if candidate_attributes and dataset.dataframe.empty:
        raise ValueError('Cannot compute the conditional entropy on an empty '
                         'dataset.')
    for attribute in candidate_attributes:
        if attribute not in dataset.candidate_attributes:
            raise KeyError(f'The attribute {attribute} is not in the dataset.')

    # We will work on a dataset with only a fingerprint per browser to avoid
    # overcounting effects
    df_one_fp_per_browser = dataset.get_df_w_one_fp_per_browser()

    # If we execute on a single process
    if not params.getboolean('Multiprocessing', 'explorations'):
        logger.debug('Measuring the attributes entropy on a single process...')
        best_attribute, best_total_ent = _best_conditional_entropic_attribute(
            df_one_fp_per_browser, current_attributes, candidate_attributes)
        logger.debug(f'  The best attribute is {best_attribute} for a total '
                     f'entropy of {best_total_ent}.')
        return best_attribute

    # The values to update through the search for the best attribute
    best_attribute_informations = {}
    logger.debug('Measuring the attributes conditional entropy using '
                 'multiprocessing...')

    # Infer the number of cores to use
    free_cores = params.getint('Multiprocessing', 'free_cores')
    nb_cores = max(cpu_count() - free_cores, 1)
    attributes_per_core = int(ceil(len(candidate_attributes)/nb_cores))
    logger.debug(f'Sharing {len(candidate_attributes)} candidate attributes '
                 f'over {nb_cores}(+{free_cores}) cores, hence '
                 f'{attributes_per_core} attributes per core.')

    def update_best_conditional_entropy_attribute(result: Tuple[Attribute,
                                                                float]):
        """Update the best conditional entropy attribute.

        Args:
            result: A tuple with the best attribute and the best total entropy.

        Note: This is executed in the main process, by the pool's
              result-handler thread, so it does not pose any concurrency or
              synchronization problem.
        """
        best_attribute, best_total_entropy = result
        if best_attribute:  # To avoid the empty results which are None
            best_attribute_informations[best_attribute] = best_total_entropy

    # Spawn a number of processes equal to the number of cores
    candidate_attributes_list = list(candidate_attributes)
    async_results = []
    with Pool(processes=nb_cores) as pool:
        for process_id in range(nb_cores):
            # Generate the candidate attributes for this process
            start_id = process_id * attributes_per_core
            end_id = (process_id + 1) * attributes_per_core
            candidate_attributes_subset = AttributeSet(
                candidate_attributes_list[start_id:end_id])

            async_result = pool.apply_async(
                _best_conditional_entropic_attribute,
                args=(df_one_fp_per_browser, current_attributes,
                      candidate_attributes_subset),
                callback=update_best_conditional_entropy_attribute)
            async_results.append(async_result)

        # Wait for all the processes to finish (otherwise we would exit
        # before collecting their results)
        for async_result in async_results:
            async_result.wait()

    # Search for the best attribute in the local results. If several provide
    # the same total entropy, we provide the attribute having the lowest id.
    best_attribute, best_total_entropy = None, -float('inf')
    for attribute in sorted(best_attribute_informations):
        attribute_total_entropy = best_attribute_informations[attribute]
        if attribute_total_entropy > best_total_entropy:
            best_total_entropy = attribute_total_entropy
            best_attribute = attribute

    logger.debug(f'  The best attribute is {best_attribute} for a total '
                 f'entropy of {best_total_entropy}.')

    return best_attribute
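
The per-worker helper _best_conditional_entropic_attribute is not shown; measuring the total (joint) entropy of the current attributes plus one candidate could be sketched with pandas as follows (an illustration based on the docstring, not the project's code):

from math import log2

import pandas as pd

def joint_entropy_sketch(df: pd.DataFrame, columns: list) -> float:
    """Shannon entropy (in bits) of the joint distribution of columns."""
    probabilities = df.groupby(columns).size() / len(df)
    return float(-(probabilities * probabilities.map(log2)).sum())

# Each worker would evaluate the candidates of its slice and report the
# (attribute, total entropy) pair that maximizes the joint entropy.
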
Example No. 9
def _get_attributes_entropy(
        dataset: FingerprintDataset,
        attributes: AttributeSet) -> Dict[Attribute, float]:
    """Give a dictionary with the entropy of each attribute.

    Args:
        dataset: The fingerprint dataset used to compute the entropy.
        attributes: The attributes for which we compute the entropy.

    Raises:
        ValueError: There are attributes and the fingerprint dataset is empty.
        KeyError: An attribute is not in the fingerprint dataset.

    Returns:
        A dictionary with each attribute (Attribute) and its entropy.
    """
    # Some checks before starting the exploration
    if attributes and dataset.dataframe.empty:
        raise ValueError('Cannot compute the entropy on an empty dataset.')
    for attribute in attributes:
        if attribute not in dataset.candidate_attributes:
            raise KeyError(f'The attribute {attribute} is not in the dataset.')

    # We will work on a dataset with only a fingerprint per browser to avoid
    # overcounting effects
    df_one_fp_per_browser = dataset.get_df_w_one_fp_per_browser()

    # If we execute on a single process
    if not params.getboolean('Multiprocessing', 'explorations'):
        logger.debug('Measuring the attributes entropy on a single process...')
        return _compute_attribute_entropy(df_one_fp_per_browser, attributes)

    # The dictionary to update when using multiprocessing
    logger.debug('Measuring the attributes entropy using multiprocessing...')
    attributes_entropy = {}

    # Infer the number of cores to use
    free_cores = params.getint('Multiprocessing', 'free_cores')
    nb_cores = max(cpu_count() - free_cores, 1)
    attributes_per_core = int(ceil(len(attributes) / nb_cores))
    logger.debug(f'Sharing {len(attributes)} attributes over '
                 f'{nb_cores}(+{free_cores}) cores, hence '
                 f'{attributes_per_core} attributes per core.')

    def update_attributes_entropy(attrs_entropy: Dict[Attribute, float]):
        """Update the complete dictionary attributes_entropy.

        Args:
            attrs_entropy: The dictionary containing the subset of the
                           results computed by a process.

        Note: This is executed in the main process, by the pool's
              result-handler thread, so it does not pose any concurrency or
              synchronization problem.
        """
        for attribute, attribute_entropy in attrs_entropy.items():
            attributes_entropy[attribute] = attribute_entropy

    # Spawn a number of processes equal to the number of cores
    attributes_list = list(attributes)
    async_results = []
    with Pool(processes=nb_cores) as pool:
        for process_id in range(nb_cores):
            # Generate the candidate attributes for this process
            start_id = process_id * attributes_per_core
            end_id = (process_id + 1) * attributes_per_core
            attributes_subset = AttributeSet(attributes_list[start_id:end_id])

            async_result = pool.apply_async(_compute_attribute_entropy,
                                            args=(df_one_fp_per_browser,
                                                  attributes_subset),
                                            callback=update_attributes_entropy)
            async_results.append(async_result)

        # Wait for all the processes to finish (otherwise we would exit
        # before collecting their results)
        for async_result in async_results:
            async_result.wait()

    return attributes_entropy
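
Similarly, _compute_attribute_entropy is only referenced here; a per-attribute entropy consistent with the docstring could be sketched as follows (illustrative, not the project's code):

from math import log2

import pandas as pd

def attribute_entropy_sketch(df: pd.DataFrame, column: str) -> float:
    """Shannon entropy (in bits) of one attribute's value distribution."""
    probabilities = df[column].value_counts(normalize=True)
    return float(-(probabilities * probabilities.map(log2)).sum())
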