Example No. 1
    def evaluate(self, recommender, metric=None, **kwargs):
        """
        Evaluates the predictor

        Parameters
        ----------
        recommender: BaseRecommender
            The recommender instance to be evaluated.

        metric: [None|'rmse'|'f1score'|'precision'|'recall'|'nmae'|'mae']
            If metric is None, all available metrics will be evaluated.
            Otherwise only the specified metric is evaluated and returned.

        sampling_users: float or sampling, optional, default = None
            If a float is passed, it is the fraction of users to evaluate.
            If sampling_users is None, all users are used in the evaluation.
            Specific sampling objects can be passed; see the
            scikits.crab.metrics.sampling module for the list of possible
            objects.

        sampling_ratings: float or sampling, optional, default = 0.7
            If a float is passed, it is the fraction of ratings to evaluate;
            by default 70% of each user's ratings go to the training set and
            30% to the test set. Specific sampling objects can be passed; see
            the scikits.crab.metrics.sampling module for the list of possible
            objects.

        at: integer, optional, default = 3
            The 'at' value, as in 'precision at 5'. For example, this means
            precision or recall is evaluated by removing the top 5 preferences
            for a user and then finding the percentage of those 5 items
            included in the top 5 recommendations for that user. If at is not
            given, the top 3 elements are considered.

        Returns
        -------
        Returns a dictionary containing the evaluation results:
        (NMAE, MAE, RMSE, Precision, Recall, F1-Score)

        """
        sampling_users = kwargs.pop('sampling_users', None)
        sampling_ratings = kwargs.pop('sampling_ratings', 0.7)
        permutation = kwargs.pop('permutation', True)
        at = kwargs.pop('at', 3)

        if metric not in evaluation_metrics and metric is not None:
            raise ValueError('metric %s is not recognized. Valid keywords '
                             'are %s' % (metric, evaluation_metrics.keys()))

        n_users = recommender.model.users_count()
        sampling_users = check_sampling(sampling_users, n_users)
        users_set, _ = sampling_users.split(permutation=permutation)

        training_set = {}
        testing_set = {}

        #Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            #Select the ratings to be evaluated.
            preferences = recommender.model.preferences_from_user(user_id)

            sampling_eval = check_sampling(sampling_ratings, \
                                             len(preferences))
            train_set, test_set = sampling_eval.split(indices=True,
                                                      permutation=permutation)

            preferences = list(preferences)
            if recommender.model.has_preference_values():
                training_set[user_id] = dict(
                    (preferences[idx]
                     for idx in train_set)) if preferences else {}
                testing_set[user_id] = [preferences[idx] for idx in test_set
                                        ] if preferences else []
            else:
                training_set[user_id] = dict(
                    ((preferences[idx], 1.0)
                     for idx in train_set)) if preferences else {}
                testing_set[user_id] = [
                    (preferences[idx], 1.0) for idx in test_set
                ] if preferences else []

        #Evaluate the recommender.
        recommender_training = self._build_recommender(training_set, \
                                recommender)

        real_preferences = []
        estimated_preferences = []

        for user_id, preferences in testing_set.iteritems():
            for item_id, preference in preferences:
                #Estimate the preferences
                try:
                    estimated = recommender_training.estimate_preference(
                        user_id, item_id)
                    real_preferences.append(preference)
                except ItemNotFoundError:
                    # It is possible that an item exists in the test data but
                    # not in the training data, in which case an exception
                    # will be thrown. Just ignore it and move on.
                    continue
                estimated_preferences.append(estimated)

        #Return the error results.
        if metric in ['rmse', 'mae', 'nmae']:
            eval_function = evaluation_metrics[metric]
            if metric == 'nmae':
                return {
                    metric:
                    eval_function(real_preferences, estimated_preferences,
                                  recommender.model.maximum_preference_value(),
                                  recommender.model.minimum_preference_value())
                }
            return {
                metric: eval_function(real_preferences, estimated_preferences)
            }

        #IR_Statistics
        relevant_arrays = []
        real_arrays = []

        #Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            preferences = recommender.model.preferences_from_user(user_id)
            preferences = list(preferences)
            if len(preferences) < 2 * at:
                # Really not enough prefs to meaningfully evaluate the user
                continue

            # Take the user's most-preferred items; these count as the
            # relevant items for the IR metrics.
            if not recommender.model.has_preference_values():
                preferences = [(preference, 1.0) for preference in preferences]

            preferences = sorted(preferences, key=lambda x: x[1], reverse=True)
            relevant_item_ids = [
                item_id for item_id, preference in preferences[:at]
            ]

            if len(relevant_item_ids) == 0:
                continue

            training_set = {}
            for other_user_id in recommender.model.user_ids():
                preferences_other_user = \
                    recommender.model.preferences_from_user(other_user_id)

                if not recommender.model.has_preference_values():
                    preferences_other_user = [
                        (preference, 1.0)
                        for preference in preferences_other_user
                    ]
                if other_user_id == user_id:
                    preferences_other_user = \
                        [pref for pref in preferences_other_user \
                            if pref[0] not in relevant_item_ids]

                    if preferences_other_user:
                        training_set[other_user_id] = \
                            dict(preferences_other_user)
                else:
                    training_set[other_user_id] = dict(preferences_other_user)

            #Evaluate the recommender
            recommender_training = self._build_recommender(training_set, \
                        recommender)

            try:
                preferences = \
                    recommender_training.model.preferences_from_user(user_id)
                preferences = list(preferences)
                if not preferences:
                    continue
            except:
                # All prefs for this user were excluded; move on.
                continue

            recommended_items = recommender_training.recommend(user_id, at)
            relevant_arrays.append(list(relevant_item_ids))
            real_arrays.append(list(recommended_items))

        relevant_arrays = np.array(relevant_arrays)
        real_arrays = np.array(real_arrays)

        #Return the IR results.
        if metric in ['precision', 'recall', 'f1score']:
            eval_function = evaluation_metrics[metric]
            return {metric: eval_function(real_arrays, relevant_arrays)}

        if metric is None:
            #Return all
            mae, nmae, rmse = evaluation_error(
                real_preferences, estimated_preferences,
                recommender.model.maximum_preference_value(),
                recommender.model.minimum_preference_value())
            f = f1_score(real_arrays, relevant_arrays)
            r = recall_score(real_arrays, relevant_arrays)
            p = precision_score(real_arrays, relevant_arrays)

            return {
                'mae': mae,
                'nmae': nmae,
                'rmse': rmse,
                'precision': p,
                'recall': r,
                'f1score': f
            }
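For reference, here is a minimal usage sketch for an evaluate method like the one above. The import paths, class names and constructor signatures below follow the scikits.crab tutorial layout but are assumptions, so adjust them to the crab version actually in use:

# Hypothetical usage sketch (assumed imports and signatures; not authoritative).
from scikits.crab.models import MatrixPreferenceDataModel
from scikits.crab.metrics import pearson_correlation
from scikits.crab.similarities import UserSimilarity
from scikits.crab.recommenders.knn import UserBasedRecommender
from scikits.crab.metrics.classes import CfEvaluator

# Toy data: each user rates 7 items so that the IR metrics (which need at
# least 2 * at = 6 preferences per user) have something to work with.
ratings = {
    'alice': {'item1': 4.0, 'item2': 3.0, 'item3': 5.0, 'item4': 2.0,
              'item5': 4.5, 'item6': 1.0, 'item7': 3.5},
    'bob':   {'item1': 3.0, 'item2': 4.0, 'item3': 2.0, 'item4': 5.0,
              'item5': 3.5, 'item6': 2.5, 'item7': 4.0},
    'carol': {'item1': 5.0, 'item2': 2.5, 'item3': 4.5, 'item4': 3.0,
              'item5': 2.0, 'item6': 4.0, 'item7': 1.5},
}
model = MatrixPreferenceDataModel(ratings)
similarity = UserSimilarity(model, pearson_correlation)
recommender = UserBasedRecommender(model, similarity, with_preference=True)

evaluator = CfEvaluator()
# A single error metric; sampling_ratings controls the per-user train/test split.
print(evaluator.evaluate(recommender, 'rmse', sampling_ratings=0.7))
# metric=None computes everything: MAE, NMAE, RMSE, precision, recall and
# F1-score at the default cutoff of 3.
print(evaluator.evaluate(recommender))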
Example No. 2
    def evaluate_on_split(self, recommender, metric=None, cv=None, **kwargs):
        """
        Evaluate on the folds of a dataset split

        Parameters
        ----------
        recommender: BaseRecommender
            The recommender instance to be evaluated.

        metric: [None|'rmse'|'f1score'|'precision'|'recall'|'nmae'|'mae']
            If metric is None, all available metrics will be evaluated.
            Otherwise only the specified metric is evaluated and returned.

        sampling_users: float or sampling, optional, default = 0.7
            If a float is passed, it is the fraction of users to evaluate.
            If sampling_users is None, all users are used in the evaluation.
            Specific sampling objects can be passed; see the
            scikits.crab.metrics.sampling module for the list of possible
            objects.

        cv: integer or crossvalidation, optional, default = None
            If an integer is passed, it is the number of folds (default 3).
            Specific cross-validation objects can be passed; see the
            scikits.crab.metrics.cross_validation module for the list of
            possible objects.

        at: integer, optional, default = 3
            The 'at' value, as in 'precision at 5'. For example, this means
            precision or recall is evaluated by removing the top 5 preferences
            for a user and then finding the percentage of those 5 items
            included in the top 5 recommendations for that user. If at is not
            given, the top 3 elements are considered.

        Returns
        -------
        score: dict
            A dictionary containing the results averaged over the different
            folds of the split.

        permutation_scores: dict
            The scores obtained for each individual fold.

        """
        sampling_users = kwargs.pop('sampling_users', 0.7)
        permutation = kwargs.pop('permutation', True)
        at = kwargs.pop('at', 3)

        if metric not in evaluation_metrics and metric is not None:
            raise ValueError('metric %s is not recognized. Valid keywords '
                             'are %s' % (metric, evaluation_metrics.keys()))

        permutation_scores_error = []
        permutation_scores_ir = []
        final_score_error = {'avg': {}, 'stdev': {}}
        final_score_ir = {'avg': {}, 'stdev': {}}

        n_users = recommender.model.users_count()
        sampling_users = check_sampling(sampling_users, n_users)
        users_set, _ = sampling_users.split(permutation=permutation)

        total_ratings = []
        #Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            #Select the ratings to be evaluated.
            preferences = recommender.model.preferences_from_user(user_id)
            preferences = list(preferences)
            total_ratings.extend([(user_id, preference)
                                  for preference in preferences])

        n_ratings = len(total_ratings)
        cross_val = check_cv(cv, n_ratings)
        #Defining the splits and run on the splits.
        for train_set, test_set in cross_val:

            training_set = {}
            testing_set = {}

            for idx in train_set:
                user_id, pref = total_ratings[idx]
                if recommender.model.has_preference_values():
                    training_set.setdefault(user_id, {})
                    training_set[user_id][pref[0]] = pref[1]
                else:
                    training_set.setdefault(user_id, {})
                    training_set[user_id][pref] = 1.0

            for idx in test_set:
                user_id, pref = total_ratings[idx]
                if recommender.model.has_preference_values():
                    testing_set.setdefault(user_id, [])
                    testing_set[user_id].append(pref)
                else:
                    testing_set.setdefault(user_id, [])
                    testing_set[user_id].append((pref, 1.0))

            #Evaluate the recommender.
            recommender_training = self._build_recommender(training_set, \
                                    recommender)

            real_preferences = []
            estimated_preferences = []

            for user_id, preferences in testing_set.iteritems():
                for item_id, preference in preferences:
                    #Estimate the preferences
                    try:
                        estimated = recommender_training.estimate_preference(
                            user_id, item_id)
                        real_preferences.append(preference)
                    except:
                        # It is possible that an item exists in the test data
                        # but not in the training data, in which case an
                        # exception will be thrown. Just ignore it and move on.
                        continue
                    estimated_preferences.append(estimated)

            #Return the error results.
            if metric in ['rmse', 'mae', 'nmae']:
                eval_function = evaluation_metrics[metric]
                if metric == 'nmae':
                    permutation_scores_error.append({
                        metric:
                        eval_function(
                            real_preferences, estimated_preferences,
                            recommender.model.maximum_preference_value(),
                            recommender.model.minimum_preference_value())
                    })
                else:
                    permutation_scores_error.append({
                        metric:
                        eval_function(real_preferences, estimated_preferences)
                    })
            elif metric is None:
                #Return all
                mae, nmae, rmse = evaluation_error(
                    real_preferences, estimated_preferences,
                    recommender.model.maximum_preference_value(),
                    recommender.model.minimum_preference_value())
                permutation_scores_error.append({
                    'mae': mae,
                    'nmae': nmae,
                    'rmse': rmse
                })

        #IR_Statistics (Precision, Recall and F1-Score)
        n_users = recommender.model.users_count()
        cross_val = check_cv(cv, n_users)

        for train_idx, test_idx in cross_val:
            relevant_arrays = []
            real_arrays = []
            for user_id in user_ids[train_idx]:
                preferences = recommender.model.preferences_from_user(user_id)
                preferences = list(preferences)
                if len(preferences) < 2 * at:
                    # Really not enough prefs to meaningfully evaluate the user
                    continue

                # Take the user's most-preferred items; these count as the
                # relevant items for the IR metrics.
                if not recommender.model.has_preference_values():
                    preferences = [(preference, 1.0)
                                   for preference in preferences]

                preferences = sorted(preferences,
                                     key=lambda x: x[1],
                                     reverse=True)
                relevant_item_ids = [
                    item_id for item_id, preference in preferences[:at]
                ]

                if len(relevant_item_ids) == 0:
                    continue

                #Build the training set.
                training_set = {}
                for other_user_id in recommender.model.user_ids():
                    preferences_other_user = \
                        recommender.model.preferences_from_user(other_user_id)

                    if not recommender.model.has_preference_values():
                        preferences_other_user = [
                            (preference, 1.0)
                            for preference in preferences_other_user
                        ]
                    if other_user_id == user_id:
                        preferences_other_user = \
                            [pref for pref in preferences_other_user \
                                if pref[0] not in relevant_item_ids]

                        if preferences_other_user:
                            training_set[other_user_id] = \
                                dict(preferences_other_user)
                    else:
                        training_set[other_user_id] = dict(
                            preferences_other_user)

                #Evaluate the recommender
                recommender_training = self._build_recommender(training_set, \
                            recommender)

                try:
                    preferences = \
                        recommender_training.model.preferences_from_user(user_id)
                    preferences = list(preferences)
                    if not preferences:
                        continue
                except:
                    # All prefs for this user were excluded; move on.
                    continue

                recommended_items = recommender_training.recommend(user_id, at)
                relevant_arrays.append(list(relevant_item_ids))
                real_arrays.append(list(recommended_items))

            relevant_arrays = np.array(relevant_arrays)
            real_arrays = np.array(real_arrays)

            #Return the IR results.
            if metric in ['precision', 'recall', 'f1score']:
                eval_function = evaluation_metrics[metric]
                permutation_scores_ir.append(
                    {metric: eval_function(real_arrays, relevant_arrays)})
            elif metric is None:
                f = f1_score(real_arrays, relevant_arrays)
                r = recall_score(real_arrays, relevant_arrays)
                p = precision_score(real_arrays, relevant_arrays)
                permutation_scores_ir.append({
                    'precision': p,
                    'recall': r,
                    'f1score': f
                })

        #Compute the final score for Error Statistics
        for result in permutation_scores_error:
            for key in result:
                final_score_error['avg'].setdefault(key, [])
                final_score_error['avg'][key].append(result[key])
        for key in final_score_error['avg']:
            final_score_error['stdev'][key] = np.std(
                final_score_error['avg'][key])
            final_score_error['avg'][key] = np.average(
                final_score_error['avg'][key])

        #Compute the final score for IR statistics
        for result in permutation_scores_ir:
            for key in result:
                final_score_ir['avg'].setdefault(key, [])
                final_score_ir['avg'][key].append(result[key])
        for key in final_score_ir['avg']:
            final_score_ir['stdev'][key] = np.std(final_score_ir['avg'][key])
            final_score_ir['avg'][key] = np.average(final_score_ir['avg'][key])

        permutation_scores = {}
        scores = {}
        if permutation_scores_error:
            permutation_scores['error'] = permutation_scores_error
            scores['final_error'] = final_score_error
        if permutation_scores_ir:
            permutation_scores['ir'] = permutation_scores_ir
            scores.setdefault('final_error', {})
            scores['final_error'].setdefault('avg', {})
            scores['final_error'].setdefault('stdev', {})
            scores['final_error']['avg'].update(final_score_ir['avg'])
            scores['final_error']['stdev'].update(final_score_ir['stdev'])

        return permutation_scores, scores
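Continuing the hypothetical evaluator and recommender from the earlier sketch, the (permutation_scores, scores) pair returned above can be consumed as shown below; the per-fold dictionaries live under permutation_scores['error'] / permutation_scores['ir'] and the averaged results under scores['final_error']:

# Sketch: unpacking the return value of evaluate_on_split (names assumed from
# the earlier usage sketch).
permutation_scores, scores = evaluator.evaluate_on_split(recommender, cv=3)

for fold_result in permutation_scores.get('error', []):
    print(fold_result)   # e.g. {'mae': ..., 'nmae': ..., 'rmse': ...} for one fold
for fold_result in permutation_scores.get('ir', []):
    print(fold_result)   # e.g. {'precision': ..., 'recall': ..., 'f1score': ...}

print(scores['final_error']['avg'])     # metrics averaged over the folds
print(scores['final_error']['stdev'])   # standard deviation across the folds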
Example No. 3
    def evaluate_on_split(self, recommender, metric=None, cv=None, **kwargs):
        """
        Evaluate on the folds of a dataset split

        Parameters
        ----------
        recommender: BaseRecommender
            The recommender instance to be evaluated.

        metric: [None|'rmse'|'f1score'|'precision'|'recall'|'nmae'|'mae']
            If metric is None, all available metrics will be evaluated.
            Otherwise only the specified metric is evaluated and returned.

        sampling_users: float or sampling, optional, default = 0.7
            If a float is passed, it is the fraction of users to evaluate.
            If sampling_users is None, all users are used in the evaluation.
            Specific sampling objects can be passed; see the
            scikits.crab.metrics.sampling module for the list of possible
            objects.

        cv: integer or crossvalidation, optional, default = None
            If an integer is passed, it is the number of folds (default 3).
            Specific cross-validation objects can be passed; see the
            scikits.crab.metrics.cross_validation module for the list of
            possible objects.

        at: integer, optional, default = 3
            The 'at' value, as in 'precision at 5'. For example, this means
            precision or recall is evaluated by removing the top 5 preferences
            for a user and then finding the percentage of those 5 items
            included in the top 5 recommendations for that user. If at is not
            given, the top 3 elements are considered.

        Returns
        -------
        score: dict
            A dictionary containing the results averaged over the different
            folds of the split.

        permutation_scores: dict
            The scores obtained for each individual fold.

        """
        sampling_users = kwargs.pop('sampling_users', 0.7)
        permutation = kwargs.pop('permutation', True)
        at = kwargs.pop('at', 3)

        if metric not in evaluation_metrics and metric is not None:
            raise ValueError('metric %s is not recognized. Valid keywords '
                             'are %s' % (metric, evaluation_metrics.keys()))

        permutation_scores_error = []
        permutation_scores_ir = []
        final_score_error = {'avg': {}, 'stdev': {}}
        final_score_ir = {'avg': {}, 'stdev': {}}

        n_users = recommender.model.users_count()
        sampling_users = check_sampling(sampling_users, n_users)
        users_set, _ = sampling_users.split(permutation=permutation)

        total_ratings = []
        #Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            #Select the ratings to be evaluated.
            preferences = recommender.model.preferences_from_user(user_id)
            preferences = list(preferences)
            total_ratings.extend([(user_id, preference)
                                 for preference in preferences])

        n_ratings = len(total_ratings)
        cross_val = check_cv(cv, n_ratings)
        #Defining the splits and run on the splits.
        for train_set, test_set in cross_val:

            training_set = {}
            testing_set = {}

            for idx in train_set:
                user_id, pref = total_ratings[idx]
                if recommender.model.has_preference_values():
                    training_set.setdefault(user_id, {})
                    training_set[user_id][pref[0]] = pref[1]
                else:
                    training_set.setdefault(user_id, {})
                    training_set[user_id][pref] = 1.0

            for idx in test_set:
                user_id, pref = total_ratings[idx]
                if recommender.model.has_preference_values():
                    testing_set.setdefault(user_id, [])
                    testing_set[user_id].append(pref)
                else:
                    testing_set.setdefault(user_id, [])
                    testing_set[user_id].append((pref, 1.0))

            #Evaluate the recommender.
            recommender_training = self._build_recommender(training_set, \
                                    recommender)

            real_preferences = []
            estimated_preferences = []

            for user_id, preferences in testing_set.iteritems():
                for item_id, preference in preferences:
                    #Estimate the preferences
                    try:
                        estimated = recommender_training.estimate_preference(
                                    user_id, item_id)
                        real_preferences.append(preference)
                    except:
                        # It is possible that an item exists in the test data
                        # but not in the training data, in which case an
                        # exception will be thrown. Just ignore it and move on.
                        continue
                    estimated_preferences.append(estimated)

            #Return the error results.
            if metric in ['rmse', 'mae', 'nmae']:
                eval_function = evaluation_metrics[metric]
                if metric == 'nmae':
                    permutation_scores_error.append({
                                metric: eval_function(real_preferences,
                                                 estimated_preferences,
                                recommender.model.maximum_preference_value(),
                                recommender.model.minimum_preference_value())})
                else:
                    permutation_scores_error.append(
                    {metric: eval_function(real_preferences,
                                       estimated_preferences)})
            elif metric is None:
                #Return all
                mae, nmae, rmse = evaluation_error(real_preferences,
                        estimated_preferences,
                        recommender.model.maximum_preference_value(),
                        recommender.model.minimum_preference_value())
                permutation_scores_error.append({'mae': mae, 'nmae': nmae,
                                                  'rmse': rmse})

        #IR_Statistics (Precision, Recall and F1-Score)
        n_users = recommender.model.users_count()
        cross_val = check_cv(cv, n_users)

        for train_idx, test_idx in cross_val:
            relevant_arrays = []
            real_arrays = []
            for user_id in user_ids[train_idx]:
                preferences = recommender.model.preferences_from_user(user_id)
                preferences = list(preferences)
                if len(preferences) < 2 * at:
                    # Really not enough prefs to meaningfully evaluate the user
                    continue

                # Take the user's most-preferred items; these count as the
                # relevant items for the IR metrics.
                if not recommender.model.has_preference_values():
                    preferences = [(preference, 1.0) for preference in preferences]

                preferences = sorted(preferences, key=lambda x: x[1], reverse=True)
                relevant_item_ids = [item_id for item_id, preference
                                        in preferences[:at]]

                if len(relevant_item_ids) == 0:
                    continue

                #Build the training set.
                training_set = {}
                for other_user_id in recommender.model.user_ids():
                    preferences_other_user = \
                        recommender.model.preferences_from_user(other_user_id)

                    if not recommender.model.has_preference_values():
                        preferences_other_user = [(preference, 1.0)
                                         for preference in preferences_other_user]
                    if other_user_id == user_id:
                        preferences_other_user = \
                            [pref for pref in preferences_other_user \
                                if pref[0] not in relevant_item_ids]

                        if preferences_other_user:
                            training_set[other_user_id] = \
                                dict(preferences_other_user)
                    else:
                        training_set[other_user_id] = dict(preferences_other_user)

                #Evaluate the recommender
                recommender_training = self._build_recommender(training_set, \
                            recommender)

                try:
                    preferences = \
                        recommender_training.model.preferences_from_user(user_id)
                    preferences = list(preferences)
                    if not preferences:
                        continue
                except:
                    # All prefs for this user were excluded; move on.
                    continue

                recommended_items = recommender_training.recommend(user_id, at)
                relevant_arrays.append(list(relevant_item_ids))
                real_arrays.append(list(recommended_items))

            relevant_arrays = np.array(relevant_arrays)
            real_arrays = np.array(real_arrays)

            #Return the IR results.
            if metric in ['precision', 'recall', 'f1score']:
                eval_function = evaluation_metrics[metric]
                permutation_scores_ir.append({metric: eval_function(real_arrays,
                                          relevant_arrays)})
            elif metric is None:
                f = f1_score(real_arrays, relevant_arrays)
                r = recall_score(real_arrays, relevant_arrays)
                p = precision_score(real_arrays, relevant_arrays)
                permutation_scores_ir.append({'precision': p, 'recall': r, 'f1score': f})

        #Compute the final score for Error Statistics
        for result in permutation_scores_error:
            for key in result:
                final_score_error['avg'].setdefault(key, [])
                final_score_error['avg'][key].append(result[key])
        for key in final_score_error['avg']:
            final_score_error['stdev'][key] = np.std(final_score_error['avg'][key])
            final_score_error['avg'][key] = np.average(final_score_error['avg'][key])

        #Compute the final score for IR statistics
        for result in permutation_scores_ir:
            for key in result:
                final_score_ir['avg'].setdefault(key, [])
                final_score_ir['avg'][key].append(result[key])
        for key in final_score_ir['avg']:
            final_score_ir['stdev'][key] = np.std(final_score_ir['avg'][key])
            final_score_ir['avg'][key] = np.average(final_score_ir['avg'][key])

        permutation_scores = {}
        scores = {}
        if permutation_scores_error:
            permutation_scores['error'] = permutation_scores_error
            scores['final_error'] = final_score_error
        if permutation_scores_ir:
            permutation_scores['ir'] = permutation_scores_ir
            scores.setdefault('final_error', {})
            scores['final_error'].setdefault('avg', {})
            scores['final_error'].setdefault('stdev', {})
            scores['final_error']['avg'].update(final_score_ir['avg'])
            scores['final_error']['stdev'].update(final_score_ir['stdev'])

        return permutation_scores, scores
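To make the fold handling in evaluate_on_split concrete, here is a small standalone sketch of how a flat list of (user_id, (item_id, value)) ratings is split into folds and regrouped into the per-user training dict and testing list shapes the code above builds. A plain numpy permutation split stands in for whatever check_cv returns:

# Standalone sketch of the per-fold regrouping performed inside evaluate_on_split.
import numpy as np

total_ratings = [
    ('alice', ('item1', 4.0)), ('alice', ('item2', 3.0)),
    ('bob',   ('item1', 3.0)), ('bob',   ('item3', 5.0)),
    ('carol', ('item2', 5.0)), ('carol', ('item3', 4.0)),
]

# A plain permutation split stands in for check_cv(cv, n_ratings).
rng = np.random.RandomState(0)
folds = np.array_split(rng.permutation(len(total_ratings)), 3)

for k, test_idx in enumerate(folds):
    train_idx = np.concatenate([fold for i, fold in enumerate(folds) if i != k])

    training_set = {}   # {user_id: {item_id: value}} -- fed to _build_recommender
    testing_set = {}    # {user_id: [(item_id, value), ...]} -- scored via estimate_preference
    for idx in train_idx:
        user_id, (item_id, value) = total_ratings[idx]
        training_set.setdefault(user_id, {})[item_id] = value
    for idx in test_idx:
        user_id, pref = total_ratings[idx]
        testing_set.setdefault(user_id, []).append(pref)

    print(training_set)
    print(testing_set)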
    def evaluate(self, recommender, metric=None, **kwargs):
        sampling_users = kwargs.pop('sampling_users', None)
        sampling_ratings = kwargs.pop('sampling_ratings', 0.7)
        permutation = kwargs.pop('permutation', True)
        at = kwargs.pop('at', 3)

        if metric not in evaluation_metrics and metric is not None:
            raise ValueError('metric %s is not recognized. Valid keywords '
                             'are %s' % (metric, evaluation_metrics.keys()))

        n_users = recommender.model.users_count()
        sampling_users = check_sampling(sampling_users, n_users)
        users_set, _ = sampling_users.split(permutation=permutation)

        training_set = {}
        testing_set = {}

        #Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            #Select the ratings to be evaluated.
            preferences = recommender.model.preferences_from_user(user_id)
            sampling_eval = check_sampling(sampling_ratings, len(preferences))
            train_set, test_set = sampling_eval.split(indices=True,
                                                      permutation=permutation)

            preferences = list(preferences)
            if recommender.model.has_preference_values():
                training_set[user_id] = dict(
                    (preferences[idx]
                     for idx in train_set)) if preferences else {}
                testing_set[user_id] = [preferences[idx] for idx in test_set
                                        ] if preferences else []
            else:
                training_set[user_id] = dict(
                    ((preferences[idx], 1.0)
                     for idx in train_set)) if preferences else {}
                testing_set[user_id] = [
                    (preferences[idx], 1.0) for idx in test_set
                ] if preferences else []

        #Evaluate the recommender.
        recommender_training = self._build_recommender(training_set,
                                                       recommender)

        real_preferences = []
        estimated_preferences = []

        if isinstance(recommender, UserBasedRecommender):
            for user_id, preferences in testing_set.iteritems():
                print 'Evaluating user {0}'.format(user_id)
                nearest_neighbors, similarities, user_pref_mean, neighbors_mean = recommender_training.user_sim_neighbors(
                    user_id)
                for item_id, preference in preferences:
                    #Estimate the preferences
                    try:
                        #estimated = recommender_training._estimate_preference(user_id, item_id)
                        estimated = recommender_training.estimate_preference(
                            user_id, item_id, nearest_neighbors, similarities,
                            user_pref_mean, neighbors_mean)
                        real_preferences.append(preference)
                        print 'Estimated:{0}\tReal:{1}'.format(
                            estimated, preference)
                    except ItemNotFoundError:
                        # It is possible that an item exists in the test data
                        # but not in the training data, in which case an
                        # exception will be thrown. Just ignore it and move on.
                        continue
                    estimated_preferences.append(estimated)
        elif isinstance(recommender, ItemBasedRecommender):
            for user_id, preferences in testing_set.iteritems():
                print 'Evaluating user {0}'.format(user_id)
                for item_id, preference in preferences:
                    #Estimate the preferences
                    try:
                        estimated = recommender_training.estimate_preference(
                            user_id, item_id)
                        real_preferences.append(preference)
                    except:
                        # Estimation failed (e.g. the item is not in the
                        # training data); skip this pair.
                        continue
                    estimated_preferences.append(estimated)
        print 'estimated_preferences:{0}\nReal preferences:{1}'.format(
            estimated_preferences, real_preferences)
        real_preferences = np.array(real_preferences)
        estimated_preferences = np.array(estimated_preferences)
        real_preferences = real_preferences[~np.isnan(estimated_preferences)]
        estimated_preferences = estimated_preferences[
            ~np.isnan(estimated_preferences)]

        #Return the error results.
        if metric in ['rmse', 'mae', 'nmae']:
            eval_function = evaluation_metrics[metric]
            if metric == 'nmae':
                return {
                    metric:
                    eval_function(real_preferences, estimated_preferences,
                                  recommender.model.maximum_preference_value(),
                                  recommender.model.minimum_preference_value())
                }
            return {
                metric: eval_function(real_preferences, estimated_preferences)
            }
        return evaluation_error(real_preferences, estimated_preferences,
                                recommender.model.maximum_preference_value(),
                                recommender.model.minimum_preference_value())
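This variant differs from the earlier evaluate implementations mainly in its NaN handling: estimates that come back as NaN (for example when no neighbour rated the item) are dropped from both arrays before the error metrics are computed. A minimal illustration of that masking step:

# Minimal illustration of the NaN masking used above.
import numpy as np

real_preferences = np.array([4.0, 3.0, 5.0, 2.0])
estimated_preferences = np.array([3.8, np.nan, 4.6, np.nan])

mask = ~np.isnan(estimated_preferences)
real_preferences = real_preferences[mask]            # -> [4.0, 5.0]
estimated_preferences = estimated_preferences[mask]  # -> [3.8, 4.6]
print(real_preferences, estimated_preferences)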
Example No. 5
    def evaluate(self, recommender, metric=None, **kwargs):
        """
        Evaluates the predictor

        Parameters
        ----------
        recommender: BaseRecommender
            The recommender instance to be evaluated.

        metric: [None|'rmse'|'f1score'|'precision'|'recall'|'nmae'|'mae']
            If metric is None, all available metrics will be evaluated.
            Otherwise only the specified metric is evaluated and returned.

        sampling_users: float or sampling, optional, default = None
            If a float is passed, it is the fraction of users to evaluate.
            If sampling_users is None, all users are used in the evaluation.
            Specific sampling objects can be passed; see the
            scikits.crab.metrics.sampling module for the list of possible
            objects.

        sampling_ratings: float or sampling, optional, default = 0.7
            If a float is passed, it is the fraction of ratings to evaluate;
            by default 70% of each user's ratings go to the training set and
            30% to the test set. Specific sampling objects can be passed; see
            the scikits.crab.metrics.sampling module for the list of possible
            objects.

        at: integer, optional, default = 3
            The 'at' value, as in 'precision at 5'. For example, this means
            precision or recall is evaluated by removing the top 5 preferences
            for a user and then finding the percentage of those 5 items
            included in the top 5 recommendations for that user. If at is not
            given, the top 3 elements are considered.

        Returns
        -------
        Returns a dictionary containing the evaluation results:
        (NMAE, MAE, RMSE, Precision, Recall, F1-Score)

        """
        sampling_users = kwargs.pop('sampling_users', None)
        sampling_ratings = kwargs.pop('sampling_ratings', 0.7)
        permutation = kwargs.pop('permutation', True)
        at = kwargs.pop('at', 3)

        if metric not in evaluation_metrics and metric is not None:
            raise ValueError('metric %s is not recognized. Valid keywords '
                             'are %s' % (metric, evaluation_metrics.keys()))

        n_users = recommender.model.users_count()
        sampling_users = check_sampling(sampling_users, n_users)
        users_set, _ = sampling_users.split(permutation=permutation)

        training_set = {}
        testing_set = {}

        #Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            #Select the ratings to be evaluated.
            preferences = recommender.model.preferences_from_user(user_id)

            sampling_eval = check_sampling(sampling_ratings, \
                                             len(preferences))
            train_set, test_set = sampling_eval.split(indices=True,
                                        permutation=permutation)

            preferences = list(preferences)
            if recommender.model.has_preference_values():
                training_set[user_id] = dict((preferences[idx]
                             for idx in train_set)) if preferences else {}
                testing_set[user_id] = [preferences[idx]
                             for idx in test_set] if preferences else []
            else:
                training_set[user_id] = dict(((preferences[idx], 1.0)
                             for idx in train_set)) if preferences else {}
                testing_set[user_id] = [(preferences[idx], 1.0)
                             for idx in test_set] if preferences else []

        #Evaluate the recommender.
        recommender_training = self._build_recommender(training_set, \
                                recommender)

        real_preferences = []
        estimated_preferences = []

        for user_id, preferences in testing_set.iteritems():
            for item_id, preference in preferences:
                # Estimate the preferences
                try:
                    estimated = recommender_training.estimate_preference(
                                user_id, item_id)
                    real_preferences.append(preference)
                except ItemNotFoundError:
                    # It is possible that an item exists in the test data but
                    # not in the training data, in which case an exception
                    # will be thrown. Just ignore it and move on.
                    continue
                estimated_preferences.append(estimated)

        #Return the error results.
        if metric in ['rmse', 'mae', 'nmae']:
            eval_function = evaluation_metrics[metric]
            if metric == 'nmae':
                return {metric: eval_function(real_preferences,
                                          estimated_preferences,
                                recommender.model.maximum_preference_value(),
                                recommender.model.minimum_preference_value())}
            return {metric: eval_function(real_preferences,
                                          estimated_preferences)}

        #IR_Statistics
        relevant_arrays = []
        real_arrays = []

        #Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            preferences = recommender.model.preferences_from_user(user_id)
            preferences = list(preferences)
            if len(preferences) < 2 * at:
                # Really not enough prefs to meaningfully evaluate the user
                continue

            # Take the user's most-preferred items; these count as the
            # relevant items for the IR metrics.
            if not recommender.model.has_preference_values():
                preferences = [(preference, 1.0) for preference in preferences]

            preferences = sorted(preferences, key=lambda x: x[1], reverse=True)
            relevant_item_ids = [item_id for item_id, preference
                                    in preferences[:at]]

            if len(relevant_item_ids) == 0:
                continue

            training_set = {}
            for other_user_id in recommender.model.user_ids():
                preferences_other_user = \
                    recommender.model.preferences_from_user(other_user_id)

                if not recommender.model.has_preference_values():
                    preferences_other_user = [(preference, 1.0)
                                     for preference in preferences_other_user]
                if other_user_id == user_id:
                    preferences_other_user = \
                        [pref for pref in preferences_other_user \
                            if pref[0] not in relevant_item_ids]

                    if preferences_other_user:
                        training_set[other_user_id] = \
                            dict(preferences_other_user)
                else:
                    training_set[other_user_id] = dict(preferences_other_user)

            #Evaluate the recommender
            recommender_training = self._build_recommender(training_set, \
                        recommender)

            try:
                preferences = \
                    recommender_training.model.preferences_from_user(user_id)
                preferences = list(preferences)
                if not preferences:
                    continue
            except:
                # All prefs for this user were excluded; move on.
                continue

            recommended_items = recommender_training.recommend(user_id, at)
            relevant_arrays.append(list(relevant_item_ids))
            real_arrays.append(list(recommended_items))

        relevant_arrays = np.array(relevant_arrays)
        real_arrays = np.array(real_arrays)

        #Return the IR results.
        if metric in ['precision', 'recall', 'f1score']:
            eval_function = evaluation_metrics[metric]
            return {metric: eval_function(real_arrays, relevant_arrays)}

        if metric is None:
            #Return all
            mae, nmae, rmse = evaluation_error(real_preferences,
                        estimated_preferences,
                        recommender.model.maximum_preference_value(),
                        recommender.model.minimum_preference_value())
            f = f1_score(real_arrays, relevant_arrays)
            r = recall_score(real_arrays, relevant_arrays)
            p = precision_score(real_arrays, relevant_arrays)

            return {'mae': mae, 'nmae': nmae, 'rmse': rmse,
                    'precision': p, 'recall': r, 'f1score': f}
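For reference, one common set-based reading of the precision/recall-at-N numbers computed from real_arrays (the recommended items) and relevant_arrays (the withheld top-at items) above. This is a sketch of the usual per-user definition; the precision_score/recall_score helpers used in the snippets may aggregate differently:

# Hedged sketch: one common per-user definition of precision/recall at N.
def precision_recall_at_n(recommended_lists, relevant_lists):
    precisions, recalls = [], []
    for recommended, relevant in zip(recommended_lists, relevant_lists):
        hits = len(set(recommended) & set(relevant))
        precisions.append(hits / float(len(recommended)) if recommended else 0.0)
        recalls.append(hits / float(len(relevant)) if relevant else 0.0)
    n = float(len(precisions))
    return sum(precisions) / n, sum(recalls) / n

# Example: 2 users, top-3 recommendations vs. 3 withheld relevant items each.
p, r = precision_recall_at_n([['a', 'b', 'c'], ['x', 'y', 'z']],
                             [['a', 'c', 'd'], ['q', 'y', 'z']])
print(p, r)   # 0.666..., 0.666...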
    def evaluate(self, recommender, metric=None, **kwargs):
        sampling_users = kwargs.pop('sampling_users', None)
        sampling_ratings = kwargs.pop('sampling_ratings', 0.7)
        permutation = kwargs.pop('permutation', True)
        at = kwargs.pop('at', 3)

        if metric not in evaluation_metrics and metric is not None:
            raise ValueError('metric %s is not recognized. Valid keywords '
                             'are %s' % (metric, evaluation_metrics.keys()))

        n_users = recommender.model.users_count()
        sampling_users = check_sampling(sampling_users, n_users)
        users_set, _ = sampling_users.split(permutation=permutation)

        training_set = {}
        testing_set = {}

        #Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            #Select the ratings to be evaluated.
            preferences = recommender.model.preferences_from_user(user_id)
            sampling_eval = check_sampling(sampling_ratings, len(preferences))
            train_set, test_set = sampling_eval.split(indices=True,permutation=permutation)

            preferences = list(preferences)
            if recommender.model.has_preference_values():
                training_set[user_id] = dict((preferences[idx]
                             for idx in train_set)) if preferences else {}
                testing_set[user_id] = [preferences[idx]
                             for idx in test_set] if preferences else []
            else:
                training_set[user_id] = dict(((preferences[idx], 1.0)
                             for idx in train_set)) if preferences else {}
                testing_set[user_id] = [(preferences[idx], 1.0)
                             for idx in test_set] if preferences else []

        #Evaluate the recommender.
        recommender_training = self._build_recommender(training_set, recommender)

        real_preferences = []
        estimated_preferences = []

        if isinstance(recommender, UserBasedRecommender):
            for user_id, preferences in testing_set.iteritems():
                print 'Evaluating user {0}'.format(user_id)
                nearest_neighbors, similarities, user_pref_mean, neighbors_mean = recommender_training.user_sim_neighbors(user_id)
                for item_id, preference in preferences:
                    #Estimate the preferences
                    try:
                        #estimated = recommender_training._estimate_preference(user_id, item_id)
                        estimated = recommender_training.estimate_preference(user_id, item_id, nearest_neighbors, similarities, user_pref_mean, neighbors_mean)
                        real_preferences.append(preference)
                        print 'Estimated:{0}\tReal:{1}'.format(estimated,preference)
                    except ItemNotFoundError:
                        # It is possible that an item exists in the test data
                        # but not in the training data, in which case an
                        # exception will be thrown. Just ignore it and move on.
                        continue
                    estimated_preferences.append(estimated)
        elif isinstance(recommender, ItemBasedRecommender):
            for user_id, preferences in testing_set.iteritems():
                print 'Evaluating user {0}'.format(user_id)
                for item_id, preference in preferences:
                    #Estimate the preferences
                    try:
                        estimated = recommender_training.estimate_preference(user_id, item_id)
                        real_preferences.append(preference)
                    except:
                        # Estimation failed (e.g. the item is not in the
                        # training data); skip this pair.
                        continue
                    estimated_preferences.append(estimated)
        print 'estimated_preferences:{0}\nReal preferences:{1}'.format(estimated_preferences, real_preferences)
        real_preferences = np.array(real_preferences)
        estimated_preferences = np.array(estimated_preferences)
        real_preferences = real_preferences[~np.isnan(estimated_preferences)]
        estimated_preferences = estimated_preferences[~np.isnan(estimated_preferences)]

        #Return the error results.
        if metric in ['rmse', 'mae', 'nmae']:
            eval_function = evaluation_metrics[metric]
            if metric == 'nmae':
                return {metric: eval_function(real_preferences,
                                          estimated_preferences,
                                recommender.model.maximum_preference_value(),
                                recommender.model.minimum_preference_value())}
            return {metric: eval_function(real_preferences,
                                          estimated_preferences)}
        return evaluation_error(real_preferences,
                    estimated_preferences,
                    recommender.model.maximum_preference_value(),
                    recommender.model.minimum_preference_value())
Example No. 7
            if metric in ['rmse', 'mae', 'nmae']:
                eval_function = evaluation_metrics[metric]
                if metric == 'nmae':
                    permutation_scores_error.append({
                        metric: eval_function(real_preferences,
                            estimated_preferences,
                            recommender.model.maximum_preference_value(),
                            recommender.model.minimum_preference_value())})
                else:
                    permutation_scores_error.append(
                        {metric: eval_function(real_preferences,
                            estimated_preferences)})
            elif metric is None:
                #Return all
                mae, nmae, rmse = evaluation_error(real_preferences,
                    estimated_preferences,
                    recommender.model.maximum_preference_value(),
                    recommender.model.minimum_preference_value())
                permutation_scores_error.append({'mae': mae, 'nmae': nmae,
                                                 'rmse': rmse})

        #IR_Statistics (Precision, Recall and F1-Score)
        n_users = recommender.model.users_count()
        cross_val = check_cv(cv, n_users)

        for train_idx, test_idx in cross_val:
            relevant_arrays = []
            real_arrays = []
            for user_id in user_ids[train_idx]:
                preferences = recommender.model.preferences_from_user(user_id)
                preferences = list(preferences)
                if len(preferences) < 2 * at:
Example No. 8
    def evaluate(self, recommender, metric=None, **kwargs):
        sampling_users = kwargs.pop('sampling_users', None)
        sampling_ratings = kwargs.pop('sampling_ratings', 0.7)
        permutation = kwargs.pop('permutation', True)
        at = kwargs.pop('at', 3)

        if metric not in evaluation_metrics and metric is not None:
            raise ValueError('metric %s is not recognized. Valid keywords '
                             'are %s' % (metric, evaluation_metrics.keys()))

        n_users = recommender.model.users_count()
        sampling_users = check_sampling(sampling_users, n_users)
        users_set, _ = sampling_users.split(permutation=permutation)

        training_set = {}
        testing_set = {}

        # Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            # Select the ratings to be evaluated.
            preferences = recommender.model.preferences_from_user(user_id)

            sampling_eval = check_sampling(sampling_ratings, \
                                           len(preferences))
            train_set, test_set = sampling_eval.split(indices=True,
                                                      permutation=permutation)

            preferences = list(preferences)
            if recommender.model.has_preference_values():
                training_set[user_id] = dict((preferences[idx]
                                              for idx in train_set)) if preferences else {}
                testing_set[user_id] = [preferences[idx]
                                        for idx in test_set] if preferences else []
            else:
                training_set[user_id] = dict(((preferences[idx], 1.0)
                                              for idx in train_set)) if preferences else {}
                testing_set[user_id] = [(preferences[idx], 1.0)
                                        for idx in test_set] if preferences else []

        # Evaluate the recommender.
        recommender_training = self._build_recommender(training_set, \
                                                       recommender)

        real_preferences = []
        estimated_preferences = []

        for user_id, preferences in testing_set.iteritems():
            for item_id, preference in preferences:
                # Estimate the preferences
                try:
                    estimated = recommender_training.estimate_preference(
                        user_id, item_id)
                    real_preferences.append(preference)
                except ItemNotFoundError:
                    # It is possible that an item exists in the test data but
                    # not in the training data, in which case an exception
                    # will be thrown. Just ignore it and move on.
                    continue
                estimated_preferences.append(estimated)

        # Return the error results.
        if metric in ['rmse', 'mae', 'nmae']:
            eval_function = evaluation_metrics[metric]
            if metric == 'nmae':
                return {metric: eval_function(real_preferences,
                                              estimated_preferences,
                                              recommender.model.maximum_preference_value(),
                                              recommender.model.minimum_preference_value())}
            return {metric: eval_function(real_preferences,
                                          estimated_preferences)}

        # IR_Statistics
        relevant_arrays = []
        real_arrays = []

        # Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            preferences = recommender.model.preferences_from_user(user_id)
            preferences = list(preferences)
            if len(preferences) < 2 * at:
                # Really not enough prefs to meaningfully evaluate the user
                continue

            # Take the user's most-preferred items; these count as the
            # relevant items for the IR metrics.
            if not recommender.model.has_preference_values():
                preferences = [(preference, 1.0) for preference in preferences]

            preferences = sorted(preferences, key=lambda x: x[1], reverse=True)
            relevant_item_ids = [item_id for item_id, preference
                                 in preferences[:at]]

            if len(relevant_item_ids) == 0:
                continue

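            # Rebuild the training data with this user's top-`at` items
            # withheld; if the recommender re-surfaces them among its top-`at`
            # recommendations, they count as hits for the IR metrics.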
            training_set = {}
            for other_user_id in recommender.model.user_ids():
                preferences_other_user = \
                    recommender.model.preferences_from_user(other_user_id)

                if not recommender.model.has_preference_values():
                    preferences_other_user = [(preference, 1.0)
                                              for preference in preferences_other_user]
                if other_user_id == user_id:
                    preferences_other_user = \
                        [pref for pref in preferences_other_user \
                         if pref[0] not in relevant_item_ids]

                    if preferences_other_user:
                        training_set[other_user_id] = \
                            dict(preferences_other_user)
                else:
                    training_set[other_user_id] = dict(preferences_other_user)

            # Evaluate the recommender
            recommender_training = self._build_recommender(training_set, \
                                                           recommender)

            try:
                preferences = \
                    recommender_training.model.preferences_from_user(user_id)
                preferences = list(preferences)
                if not preferences:
                    continue
            except Exception:
                # Every preference of this user was withheld from the
                # training data, so the user is missing from the model; move on.
                continue

            recommended_items = recommender_training.recommend(user_id, at)
            relevant_arrays.append(list(relevant_item_ids))
            real_arrays.append(list(recommended_items))

        relevant_arrays = np.array(relevant_arrays)
        real_arrays = np.array(real_arrays)

        # Return the IR results.
        if metric in ['precision', 'recall', 'f1score']:
            eval_function = evaluation_metrics[metric]
            return {metric: eval_function(real_arrays, relevant_arrays)}

        if metric is None:
            # Return all
            mae, nmae, rmse = evaluation_error(real_preferences,
                                               estimated_preferences,
                                               recommender.model.maximum_preference_value(),
                                               recommender.model.minimum_preference_value())
            f = f1_score(real_arrays, relevant_arrays)
            r = recall_score(real_arrays, relevant_arrays)
            p = precision_score(real_arrays, relevant_arrays)

            return {'mae': mae, 'nmae': nmae, 'rmse': rmse,
                    'precision': p, 'recall': r, 'f1score': f}
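
For orientation, here is a minimal sketch of how the evaluate method above is
typically driven. Every import path, the CfEvaluator class name and the
load_sample_movies dataset are assumptions taken from the upstream
scikits.crab tutorial and may differ between versions; only the
evaluate(recommender, metric, **kwargs) call itself comes from the code above.

# Hedged usage sketch -- the import paths below are assumptions from the
# scikits.crab tutorial, not something shown in the snippet above.
from scikits.crab import datasets
from scikits.crab.models import MatrixPreferenceDataModel
from scikits.crab.metrics import pearson_correlation
from scikits.crab.similarities import UserSimilarity
from scikits.crab.recommenders.knn import UserBasedRecommender
from scikits.crab.metrics.classes import CfEvaluator

movies = datasets.load_sample_movies()           # small bundled dataset
model = MatrixPreferenceDataModel(movies.data)   # user -> {item: rating}
similarity = UserSimilarity(model, pearson_correlation)
recommender = UserBasedRecommender(model, similarity, with_preference=True)

evaluator = CfEvaluator()
# A single metric...
print(evaluator.evaluate(recommender, 'rmse'))
# ...or all metrics (MAE, NMAE, RMSE, precision, recall, F1-score), training
# on 70% of each user's ratings and scoring 'precision/recall at 3'.
print(evaluator.evaluate(recommender, None, sampling_ratings=0.7, at=3))
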
Example No. 9
0
    def evaluate_on_split(self, recommender, metric=None, cv=None, **kwargs):
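        """
        Evaluate the recommender with cross-validation.

        The sampled users' ratings are flattened into (user_id, preference)
        pairs and split into folds via check_cv; the error metrics
        (MAE, NMAE, RMSE) are computed on each ratings fold, and the IR
        metrics (precision, recall, F1-score at `at`) on the folds of a
        second cross-validation over the users.

        Returns a (permutation_scores, scores) tuple holding the per-fold
        results and their average / standard deviation.
        """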
        sampling_users = kwargs.pop('sampling_users', 0.7)
        permutation = kwargs.pop('permutation', True)
        at = kwargs.pop('at', 3)

        if metric not in evaluation_metrics and metric is not None:
            raise ValueError('metric %s is not recognized. valid keywords '
                             'are %s' % (metric, evaluation_metrics.keys()))

        permutation_scores_error = []
        permutation_scores_ir = []
        final_score_error = {'avg': {}, 'stdev': {}}
        final_score_ir = {'avg': {}, 'stdev': {}}

        n_users = recommender.model.users_count()
        sampling_users = check_sampling(sampling_users, n_users)
        users_set, _ = sampling_users.split(permutation=permutation)

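        # Flatten the sampled users' preferences into (user_id, preference)
        # pairs so that the cross-validation folds are defined over
        # individual ratings rather than over users.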
        total_ratings = []
        # Select the users to be evaluated.
        user_ids = recommender.model.user_ids()
        for user_id in user_ids[users_set]:
            # Select the ratings to be evaluated.
            preferences = recommender.model.preferences_from_user(user_id)
            preferences = list(preferences)
            total_ratings.extend([(user_id, preference)
                                  for preference in preferences])

        n_ratings = len(total_ratings)
        cross_val = check_cv(cv, n_ratings)
        # Define the cross-validation splits and run the evaluation on each one.
        for train_set, test_set in cross_val:

            training_set = {}
            testing_set = {}

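            # Rebuild per-user structures for this fold: a {item: value}
            # dict per user for training and a list of (item, value)
            # pairs per user for testing.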
            for idx in train_set:
                user_id, pref = total_ratings[idx]
                if recommender.model.has_preference_values():
                    training_set.setdefault(user_id, {})
                    training_set[user_id][pref[0]] = pref[1]
                else:
                    training_set.setdefault(user_id, {})
                    training_set[user_id][pref] = 1.0

            for idx in test_set:
                user_id, pref = total_ratings[idx]
                if recommender.model.has_preference_values():
                    testing_set.setdefault(user_id, [])
                    testing_set[user_id].append(pref)
                else:
                    testing_set.setdefault(user_id, [])
                    testing_set[user_id].append((pref, 1.0))

            # Evaluate the recommender.
            recommender_training = self._build_recommender(training_set, \
                                                           recommender)

            real_preferences = []
            estimated_preferences = []

            for user_id, preferences in testing_set.items():
                for item_id, preference in preferences:
                    # Estimate the preferences
                    try:
                        estimated = recommender_training.estimate_preference(
                            user_id, item_id)
                        real_preferences.append(preference)
                    except Exception:
                        # An item (or user) may exist in the test fold but not
                        # in the training fold, in which case an exception is
                        # thrown. Just ignore it and move on.
                        continue
                    estimated_preferences.append(estimated)

            # Accumulate the error results for this fold.
            if metric in ['rmse', 'mae', 'nmae']:
                eval_function = evaluation_metrics[metric]
                if metric == 'nmae':
                    permutation_scores_error.append({
                        metric: eval_function(real_preferences,
                                              estimated_preferences,
                                              recommender.model.maximum_preference_value(),
                                              recommender.model.minimum_preference_value())})
                else:
                    permutation_scores_error.append(
                        {metric: eval_function(real_preferences,
                                               estimated_preferences)})
            elif metric is None:
                # Return all
                mae, nmae, rmse = evaluation_error(real_preferences,
                                                   estimated_preferences,
                                                   recommender.model.maximum_preference_value(),
                                                   recommender.model.minimum_preference_value())
                permutation_scores_error.append({'mae': mae, 'nmae': nmae,
                                                 'rmse': rmse})

        # IR_Statistics (Precision, Recall and F1-Score)
        n_users = recommender.model.users_count()
        cross_val = check_cv(cv, n_users)

        for train_idx, test_idx in cross_val:
            relevant_arrays = []
            real_arrays = []
            for user_id in user_ids[train_idx]:
                preferences = recommender.model.preferences_from_user(user_id)
                preferences = list(preferences)
                if len(preferences) < 2 * at:
                    # Really not enough prefs to meaningfully evaluate the user
                    continue

                # Take the user's `at` most-preferred items; these are treated
                # as the 'relevant' results for the IR metrics.
                if not recommender.model.has_preference_values():
                    preferences = [(preference, 1.0) for preference in preferences]

                preferences = sorted(preferences, key=lambda x: x[1], reverse=True)
                relevant_item_ids = [item_id for item_id, preference
                                     in preferences[:at]]

                if len(relevant_item_ids) == 0:
                    continue

                # Build the training set.
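                # As in evaluate() above: withhold this user's top-`at` items
                # from the training data so that re-recommending them counts
                # as a hit for the IR metrics.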
                training_set = {}
                for other_user_id in recommender.model.user_ids():
                    preferences_other_user = recommender.model.preferences_from_user(other_user_id)

                    if not recommender.model.has_preference_values():
                        preferences_other_user = [(preference, 1.0) for preference in preferences_other_user]
                    if other_user_id == user_id:
                        preferences_other_user = [pref for pref in preferences_other_user if
                                                  pref[0] not in relevant_item_ids]

                        if preferences_other_user:
                            training_set[other_user_id] = dict(preferences_other_user)
                    else:
                        training_set[other_user_id] = dict(preferences_other_user)

                # Evaluate the recommender
                recommender_training = self._build_recommender(training_set, recommender)

                try:
                    preferences = recommender_training.model.preferences_from_user(user_id)
                    preferences = list(preferences)
                    if not preferences:
                        continue
                except Exception:
                    # Every preference of this user was withheld from the
                    # training data, so the user is missing from the model; move on.
                    continue

                recommended_items = recommender_training.recommend(user_id, at)
                relevant_arrays.append(list(relevant_item_ids))
                real_arrays.append(list(recommended_items))

            relevant_arrays = np.array(relevant_arrays)
            real_arrays = np.array(real_arrays)

            # Accumulate the IR results for this fold.
            if metric in ['precision', 'recall', 'f1score']:
                eval_function = evaluation_metrics[metric]
                permutation_scores_ir.append({metric: eval_function(real_arrays, relevant_arrays)})
            elif metric is None:
                f = f1_score(real_arrays, relevant_arrays)
                r = recall_score(real_arrays, relevant_arrays)
                p = precision_score(real_arrays, relevant_arrays)
                permutation_scores_ir.append({'precision': p, 'recall': r, 'f1score': f})

        # Compute the final score for Error Statistics
        for result in permutation_scores_error:
            for key in result:
                final_score_error['avg'].setdefault(key, [])
                final_score_error['avg'][key].append(result[key])
        for key in final_score_error['avg']:
            final_score_error['stdev'][key] = np.std(final_score_error['avg'][key])
            final_score_error['avg'][key] = np.average(final_score_error['avg'][key])

        # Compute the final score for IR statistics
        for result in permutation_scores_ir:
            for key in result:
                final_score_ir['avg'].setdefault(key, [])
                final_score_ir['avg'][key].append(result[key])
        for key in final_score_ir['avg']:
            final_score_ir['stdev'][key] = np.std(final_score_ir['avg'][key])
            final_score_ir['avg'][key] = np.average(final_score_ir['avg'][key])

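        # Package the per-fold scores and their aggregates. Note that the
        # IR averages/stdevs are merged into the same 'final_error' entry
        # rather than being returned under a key of their own.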
        permutation_scores = {}
        scores = {}
        if permutation_scores_error:
            permutation_scores['error'] = permutation_scores_error
            scores['final_error'] = final_score_error
        if permutation_scores_ir:
            permutation_scores['ir'] = permutation_scores_ir
            scores.setdefault('final_error', {})
            scores['final_error'].setdefault('avg', {})
            scores['final_error'].setdefault('stdev', {})
            scores['final_error']['avg'].update(final_score_ir['avg'])
            scores['final_error']['stdev'].update(final_score_ir['stdev'])

        return permutation_scores, scores
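
A correspondingly hedged sketch for the cross-validated variant above. The
setup mirrors the previous sketch (and shares its assumptions about import
paths and class names); only the evaluate_on_split call and the shape of its
return value come from the code itself.

# Hedged usage sketch -- setup and import paths are assumptions carried
# over from the previous sketch; only the call below is from the code.
from scikits.crab import datasets
from scikits.crab.models import MatrixPreferenceDataModel
from scikits.crab.metrics import pearson_correlation
from scikits.crab.similarities import UserSimilarity
from scikits.crab.recommenders.knn import UserBasedRecommender
from scikits.crab.metrics.classes import CfEvaluator

movies = datasets.load_sample_movies()
model = MatrixPreferenceDataModel(movies.data)
recommender = UserBasedRecommender(
    model, UserSimilarity(model, pearson_correlation), with_preference=True)

evaluator = CfEvaluator()
# evaluate_on_split returns (per-fold scores, aggregated scores).
folds, summary = evaluator.evaluate_on_split(recommender, metric='rmse')
print(folds['error'])                           # one {'rmse': ...} dict per fold
print(summary['final_error']['avg']['rmse'])    # average across folds
print(summary['final_error']['stdev']['rmse'])  # standard deviation across folds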