def test_prmse_sparse_matrix_array_as_input(self):
    """Check PRMSE on the sparse data when inputs are raw numpy arrays."""
    rater_columns = self.human_score_columns
    rater_scores = self.data_sparse[rater_columns].to_numpy()
    machine_scores = np.array(self.data_sparse['system'])
    expected = 0.538748
    computed = prmse_true(machine_scores, rater_scores)
    assert_almost_equal(computed, expected, 7)
def test_prmse_sparse_matrix_computed_ve(self):
    """Check PRMSE on the sparse data with the error variance computed internally."""
    rater_columns = self.human_score_columns
    rater_scores = self.data_sparse[rater_columns]
    machine_scores = self.data_sparse['system']
    expected = 0.538748
    computed = prmse_true(machine_scores, rater_scores)
    assert_almost_equal(computed, expected, 7)
def test_prmse_full_matrix_computed_ve(self):
    """Check PRMSE on the fully-scored data with the error variance computed internally."""
    rater_columns = self.human_score_columns
    rater_scores = self.data_full[rater_columns]
    machine_scores = self.data_full['system']
    expected = 0.5409673
    computed = prmse_true(machine_scores, rater_scores)
    assert_almost_equal(computed, expected, 7)
def test_prmse_sparse_matrix_given_ve(self):
    """Check PRMSE on the sparse data with a pre-computed error variance."""
    rater_columns = self.human_score_columns
    rater_scores = self.data_sparse[rater_columns]
    machine_scores = self.data_sparse['system']
    given_variance = 0.5150882
    expected = 0.538748
    computed = prmse_true(machine_scores, rater_scores, given_variance)
    assert_almost_equal(computed, expected, 7)
def test_prmse_full_matrix_given_ve(self):
    """Check PRMSE on the fully-scored data with a pre-computed error variance."""
    rater_columns = self.human_score_columns
    rater_scores = self.data_full[rater_columns]
    machine_scores = self.data_full['system']
    given_variance = 0.509375
    expected = 0.5409673
    computed = prmse_true(machine_scores, rater_scores, given_variance)
    assert_almost_equal(computed, expected, 7)
def test_prmse_all_single_scored():
    """
    Check PRMSE behavior when every response has only a single human score.

    In this case ``prmse_true`` cannot estimate the rater error variance,
    so it should return ``None`` and raise a ``UserWarning``.
    """
    system_scores = [1, 2, 3, 4, 5]
    sc1 = [1, 2, 3, None, None]
    sc2 = [None, None, None, 2, 3]
    df = pd.DataFrame({'sc1': sc1, 'sc2': sc2, 'system': system_scores})
    with warnings.catch_warnings(record=True) as warning_list:
        # force all warnings to be recorded; without this, the default
        # "once"-style filtering (or an earlier test having already
        # triggered the same warning) can leave `warning_list` empty,
        # making the `[-1]` index below raise IndexError
        warnings.simplefilter('always')
        prmse = prmse_true(df['system'], df[['sc1', 'sc2']])
    ok_(prmse is None)
    assert issubclass(warning_list[-1].category, UserWarning)
def compute_prmse_one_system_multiple_rater_pairs(df_scores, system_id, rater_pairs):
    """
    Compute the PRMSE score for the system against all given rater pairs.

    For each pair of raters in ``rater_pairs``, this function computes the
    value of the PRMSE metric between the scores of the given system
    (``system_id``) and the scores assigned by the two raters in that pair.

    Parameters
    ----------
    df_scores : pandas.DataFrame
        The data frame containing the simulated scores. This is usually
        one of the data frames returned by the
        ``simulation.dataset.Dataset.to_frame()`` method.
    system_id : str
        The ID for the simulated system to be evaluated. This must be
        a column in ``df_scores``.
    rater_pairs : list of lists of str
        A list containing rater pairs against which the system is to
        be evaluated. Each rater pair is a list of two rater IDs, e.g.,
        ``['h_1', 'h_33']``, each of which must be a column in
        ``df_scores``.

    Returns
    -------
    prmse_values : list of float
        A list containing the values for the PRMSE metric for each of
        the given rater pairs, in the same order as ``rater_pairs``.
    """
    # compute the per-pair PRMSE value for each rater pair in order
    return [prmse_true(df_scores[system_id], df_scores[[rater_id1, rater_id2]])
            for rater_id1, rater_id2 in rater_pairs]
def test_prmse_single_human_ve_array_as_input():
    """Check PRMSE with a single human score per response, passed as numpy arrays."""
    machine_scores = np.array([1, 2, 5])
    rater_scores = np.array([2, 3, 5])
    computed = prmse_true(machine_scores, rater_scores, 0.5)
    eq_(computed, 0.9090909090909091)
def test_prmse_single_human_ve():
    """Check PRMSE with a single human score per response, passed as pandas series."""
    scores = pd.DataFrame({'system': [1, 2, 5], 'sc1': [2, 3, 5]})
    computed = prmse_true(scores['system'], scores['sc1'], 0.5)
    eq_(computed, 0.9090909090909091)