def test_observed_agreement(self):
    """Check the per-class observed agreement frequencies of two annotators.

    The expected vector is built by hand from the two short annotation
    arrays below and compared exactly against
    ``pmh.observed_agreement_frequency``.
    """
    first = np.array([0, 0, 1, 1, MV, 3])
    second = np.array([0, MV, 1, 1, MV, 2])

    # Normalise by the number of items that both annotators rated
    # (assuming MV is the missing-value marker recognised by is_valid).
    rated_by_both = is_valid(first) & is_valid(second)
    n_rated_by_both = np.sum(rated_by_both)

    # Agreements in the data above: one on class 0, two on class 1,
    # none on classes 2 and 3.
    agreement_counts = np.array([1., 2., 0., 0.])
    expected = agreement_counts / n_rated_by_both

    observed = pmh.observed_agreement_frequency(first, second, 4)
    np.testing.assert_array_equal(observed, expected)
def add_markings(self, mark_classes, mark_name, marker_shape,
                 delta_x, delta_y, marker_size=5, line_width=1.,
                 marker_color='white'):
    """Overlay one scatter marker per valid entry of `mark_classes`.

    Arguments
    ---------
    mark_classes : ndarray
        One class index per row of the plotted data; entries rejected by
        ``is_valid`` get no marker.
    mark_name : str
        Name of the scatter renderer; also the prefix of the two data
        series (``<mark_name>_x`` and ``<mark_name>_y``) stored in the
        plot data.
    marker_shape
        Passed through as the ``marker`` argument of the plot call.
    delta_x, delta_y : float
        Offsets added to the marker coordinates, on top of the fixed 0.5
        shift applied to both axes.
    marker_size, line_width, marker_color
        Styling of the marker outline; the marker fill is transparent.
    """
    posterior_plot = self.plot_posterior
    plot_data = posterior_plot.data
    n_rows = plot_data.arrays['values'].shape[0]

    keep = is_valid(mark_classes)

    # y is the row index, x is the class index; both are shifted by the
    # requested delta and then by 0.5.
    row_positions = np.arange(n_rows)[keep] + delta_y + 0.5
    class_positions = mark_classes[keep].astype(float) + delta_x + 0.5

    y_name = mark_name + '_y'
    x_name = mark_name + '_x'
    plot_data.set_data(y_name, row_positions)
    plot_data.set_data(x_name, class_positions)

    scatter_options = dict(
        type='scatter',
        name=mark_name,
        marker=marker_shape,
        marker_size=marker_size,
        color='transparent',
        outline_color=marker_color,
        line_width=line_width,
    )
    posterior_plot.plot((x_name, y_name), **scatter_options)
def _compute_accuracy(self, category, annotations, use_prior):
    """Return accuracy, P(annotation_j = k' | category=k)

    Helper function to compute an estimate of the accuracy parameters
    theta, given labels and annotations.

    Arguments
    ---------
    category : ndarray
        category[i, :] holds the weights of item i over the true classes;
        the row is added to the counts of every annotator that labelled
        item i.

    annotations : ndarray, shape = (n_items, n_annotators)
        annotations[i, j] is the annotation of annotator j for item i.
        Entries rejected by ``is_valid`` are skipped.

    use_prior : bool
        If true, the counts start from the pseudo-counts ``self.alpha - 1``
        instead of zero.

    Returns
    -------
    accuracy : ndarray, shape = (n_annotators, n_classes, n_classes)
        accuracy[j,k,k'] = P(annotation_j = k' | category=k).
    """
    nitems, nannotators = annotations.shape

    # alpha - 1 : the mode of a Dirichlet is (alpha_i - 1) / (alpha_0 - K)
    alpha_prior_count = self.alpha - 1.

    valid_mask = is_valid(annotations)
    annotators = np.arange(nannotators)[None, :]

    if use_prior:
        accuracy = np.tile(alpha_prior_count, (nannotators, 1, 1))
    else:
        accuracy = np.zeros((nannotators, self.nclasses, self.nclasses))

    # `range` rather than `xrange`: the latter does not exist on Python 3.
    for i in range(nitems):
        valid = valid_mask[i, :]
        # For each annotator j that labelled item i with class k', add
        # category[i, k] to accuracy[j, k, k'] for every class k.
        accuracy[annotators[:, valid], :, annotations[i, valid]] += category[i, :]

    # Normalise over k' so that each accuracy[j, k, :] sums to one.
    accuracy /= accuracy.sum(2)[:, :, None]
    return accuracy
def _compute_accuracy(self, category, annotations, use_prior):
    """Estimate the accuracy parameters theta from labels and annotations.

    Every valid annotation ``annotations[i, j] = k'`` contributes the row
    ``category[i, :]`` to the counts ``accuracy[j, :, k']``. The counts
    optionally start from the pseudo-counts ``self.alpha - 1``, and are
    finally normalised over the last axis.

    Returns
    -------
    accuracy : ndarray, shape = (n_annotators, n_classes, n_classes)
        accuracy[j,k,k'] = P(annotation_j = k' | category=k).
    """
    n_items, n_annotators = annotations.shape

    # The mode of a Dirichlet is (alpha_i - 1) / (alpha_0 - K), hence the
    # pseudo-counts are alpha - 1.
    prior_counts = self.alpha - 1.

    observed = is_valid(annotations)
    annotator_ids = np.arange(n_annotators)[None, :]

    if not use_prior:
        accuracy = np.zeros((n_annotators, self.nclasses, self.nclasses))
    else:
        accuracy = np.tile(prior_counts, (n_annotators, 1, 1))

    for item in range(n_items):
        row_mask = observed[item, :]
        who = annotator_ids[:, row_mask]
        what = annotations[item, row_mask]
        accuracy[who, :, what] += category[item, :]

    # Turn counts into conditional probabilities over the annotated class.
    totals = accuracy.sum(2)
    accuracy /= totals[:, :, None]
    return accuracy
def test_generate_annotations(self):
    """Every generated item must carry exactly three valid annotations."""
    # The item count is deliberately chosen so that it is not divisible by
    # the number of annotators, to exercise the masking of annotations.
    nclasses = 5
    nitems = 8 * 30 + 3

    model = ModelA.create_initial_state(nclasses)
    annotations = model.generate_annotations(nitems)

    # count the valid annotations on every row
    annotations_per_item = is_valid(annotations).sum(1)
    self.assertTrue(np.all(annotations_per_item == 3))
def spearmans_rho(annotations1, annotations2, nclasses=None):
    """Compute Spearman's rank correlation coefficient.

    Only the items for which both annotators gave a valid annotation are
    used in the computation.

    See also :func:`~pyanno.measures.helpers.pairwise_matrix`.

    **References:**

    * `Wikipedia entry
      <http://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_

    Arguments
    ---------
    annotations1 : ndarray, shape = (n_items, )
        Array of annotations for a single annotator. Missing values should be
        indicated by :attr:`pyanno.util.MISSING_VALUE`

    annotations2 : ndarray, shape = (n_items, )
        Array of annotations for a single annotator. Missing values should be
        indicated by :attr:`pyanno.util.MISSING_VALUE`

    nclasses : int
        Number of annotation classes. Not used by this function; it is
        accepted so that the signature matches the other measures.

    Returns
    -------
    stat : float
        The value of the statistics, or NaN if there is no item that both
        annotators have annotated.
    """
    valid = is_valid(annotations1) & is_valid(annotations2)

    # Vectorised check instead of the builtin `all`, which would iterate
    # over the array one element at a time in Python.
    if not np.any(valid):
        logger.debug('No valid annotations')
        return np.nan

    # spearmanr also returns a p-value, which is not needed here.
    rho, _ = scipy.stats.spearmanr(annotations1[valid], annotations2[valid])
    return rho
def coincidence_matrix(annotations, nclasses):
    """Build coincidence matrix.

    The element c,k of the coincidence matrix contains the number of c-k
    pairs in the data (across annotators), where the pairs of each item are
    weighted by one over the number of annotations of that item minus one.
    Items with fewer than two valid annotations are ignored.

    **Reference**

    * `Wikipedia entry
      <http://en.wikipedia.org/wiki/Krippendorff%27s_Alpha#Coincidence_matrices>`_

    Arguments
    ---------
    annotations : ndarray, shape = (n_items, n_annotators)
        Array of annotations for multiple annotators. Missing values should be
        indicated by :attr:`pyanno.util.MISSING_VALUE`

    nclasses : int
        Number of annotation classes. It is used directly as an array
        dimension, so it must be an integer.

    Returns
    -------
    coinc_mat : ndarray, shape = (n_classes, n_classes)
        Coincidence matrix
    """
    # number of valid annotations per item; only items with at least two
    # annotations can form pairs
    per_item = is_valid(annotations).sum(1).astype(float)
    pairable = per_item > 1
    per_item = per_item[pairable]
    annotations = annotations[pairable, :]

    # class_counts[i, c]: how many annotations of class c item i received
    class_counts = np.empty((per_item.shape[0], nclasses), dtype=int)
    for label in range(nclasses):
        class_counts[:, label] = (annotations == label).sum(1)

    coincidences = np.empty((nclasses, nclasses), dtype=float)
    for c in range(nclasses):
        count_c = class_counts[:, c]
        for k in range(nclasses):
            # an annotation cannot be paired with itself, hence the -1 on
            # the diagonal
            partner = count_c - 1 if k == c else class_counts[:, k]
            pairs = count_c * partner
            coincidences[c, k] = (pairs / (per_item - 1.0)).sum()

    return coincidences
def coincidence_matrix(annotations, nclasses):
    """Build coincidence matrix.

    Entry (c, k) accumulates, over all items with at least two valid
    annotations, the number of ordered c-k annotation pairs of the item
    divided by the number of annotations of the item minus one.

    **Reference**

    * `Wikipedia entry
      <http://en.wikipedia.org/wiki/Krippendorff%27s_Alpha#Coincidence_matrices>`_

    Arguments
    ---------
    annotations : ndarray, shape = (n_items, n_annotators)
        Array of annotations for multiple annotators. Missing values should be
        indicated by :attr:`pyanno.util.MISSING_VALUE`

    nclasses : int
        Number of annotation classes; classes are the integers
        0, ..., nclasses - 1.

    Returns
    -------
    coinc_mat : ndarray, shape = (n_classes, n_classes)
        Coincidence matrix
    """
    n_valid = is_valid(annotations).sum(1).astype(float)

    # rows with a single annotation (or none) contain no pairs
    has_pairs = n_valid > 1
    n_valid = n_valid[has_pairs]
    annotations = annotations[has_pairs, :]

    # counts[i, c] = number of annotations of class c in row i
    labels = np.arange(nclasses)
    counts = (annotations[:, :, np.newaxis] == labels).sum(1)

    weights_denominator = n_valid - 1.
    coincidences = np.empty((nclasses, nclasses), dtype=float)
    for c, k in np.ndindex(nclasses, nclasses):
        n_pairs = counts[:, c] * counts[:, k]
        if c == k:
            # n * (n - 1): remove the pairing of each annotation with itself
            n_pairs = n_pairs - counts[:, c]
        coincidences[c, k] = (n_pairs / weights_denominator).sum()

    return coincidences
def test_generate_annotations(self):
    """Generated annotations have the right shape, mask and frequencies."""
    nclasses = 3
    nitems = 2000 * 8

    # random model and random data drawn from it
    model = ModelA.create_initial_state(nclasses)
    annotations = model.generate_annotations(nitems)

    expected_shape = (nitems, model.nannotators)
    self.assertEqual(annotations.shape, expected_shape)

    # exactly three valid annotations per item
    valid_per_item = is_valid(annotations).sum(1)
    self.assertTrue(np.all(valid_per_item == 3))

    # the empirical class frequencies should be close to model.omega
    n_annotations = float(nitems * 3)
    class_counts = [(annotations == psi).sum() for psi in range(nclasses)]
    freqs = np.array(class_counts) / n_annotations
    testing.assert_allclose(model.omega, freqs, atol=1e-1, rtol=0.)
def infer_labels(self, annotations):
    r"""Infer posterior distribution over label classes.

    Compute the posterior distribution over label classes given observed
    annotations, :math:`P( \mathbf{y} | \mathbf{x}, \theta, \gamma)`.

    Arguments
    ----------
    annotations : ndarray, shape = (n_items, n_annotators)
        annotations[i,j] is the annotation of annotator j for item i.
        Every item must have exactly ``self.nannotators_per_item`` valid
        annotations.

    Returns
    -------
    posterior : ndarray, shape = (n_items, n_classes)
        posterior[i,k] is the posterior probability of class k given the
        annotation observed in item i.
    """
    self._raise_if_incompatible(annotations)

    nitems = annotations.shape[0]
    gamma = self.gamma
    nclasses = self.nclasses

    # get indices of annotators active in each row; nonzero() lists the
    # valid entries row by row, so they can be reshaped to one row per item
    valid_entries = is_valid(annotations).nonzero()
    annotator_indices = np.reshape(valid_entries[1],
                                   (nitems, self.nannotators_per_item))
    valid_annotations = annotations[valid_entries]
    valid_annotations = np.reshape(valid_annotations,
                                   (nitems, self.nannotators_per_item))

    # thetas of active annotators; the probability of a wrong annotation
    # is spread uniformly over the remaining classes
    theta_equal = self.theta[annotator_indices]
    theta_not_equal = (1. - theta_equal) / (nclasses - 1.)

    # compute posterior over psi
    psi_distr = np.zeros((nitems, nclasses))
    # `range` rather than `xrange`: the latter does not exist on Python 3
    for psi in range(nclasses):
        tmp = np.where(valid_annotations == psi,
                       theta_equal, theta_not_equal)
        psi_distr[:, psi] = gamma[psi] * tmp.prod(1)

    # normalize distribution
    psi_distr /= psi_distr.sum(1)[:, np.newaxis]
    return psi_distr
def infer_labels(self, annotations):
    r"""Infer posterior distribution over label classes.

    Compute the posterior distribution over label classes given observed
    annotations, :math:`P( \mathbf{y} | \mathbf{x}, \theta, \gamma)`.

    Arguments
    ----------
    annotations : ndarray, shape = (n_items, n_annotators)
        annotations[i,j] is the annotation of annotator j for item i.
        The valid entries are reshaped to
        ``(n_items, self.nannotators_per_item)``, so each item needs
        exactly that many valid annotations.

    Returns
    -------
    posterior : ndarray, shape = (n_items, n_classes)
        posterior[i,k] is the posterior probability of class k given the
        annotation observed in item i.
    """
    self._raise_if_incompatible(annotations)

    nitems = annotations.shape[0]
    gamma = self.gamma
    nclasses = self.nclasses
    per_item_shape = (nitems, self.nannotators_per_item)

    # get indices of annotators active in each row
    valid_entries = is_valid(annotations).nonzero()
    annotator_indices = np.reshape(valid_entries[1], per_item_shape)
    valid_annotations = np.reshape(annotations[valid_entries],
                                   per_item_shape)

    # thetas of active annotators: probability of the correct annotation,
    # and of each one of the (nclasses - 1) incorrect ones
    theta_equal = self.theta[annotator_indices]
    theta_not_equal = (1. - theta_equal) / (nclasses - 1.)

    # compute posterior over psi (xrange is not available on Python 3)
    psi_distr = np.zeros((nitems, nclasses))
    for psi in range(nclasses):
        tmp = np.where(valid_annotations == psi,
                       theta_equal, theta_not_equal)
        # class prior times the likelihood of the annotations of each item
        psi_distr[:, psi] = gamma[psi] * tmp.prod(1)

    # normalize distribution
    psi_distr /= psi_distr.sum(1)[:, np.newaxis]
    return psi_distr
def test_generate_annotations(self):
    """Generated annotations are well-formed and follow the class prior."""
    # use a number of items that is not divisible by the number of
    # annotators
    nclasses, nannotators, nitems = 5, 7, 201

    model = ModelBt.create_initial_state(nclasses, nannotators)
    annotations = model.generate_annotations(nitems)

    self.assertEqual(annotations.shape, (nitems, nannotators))
    # The result of the compatibility check used to be discarded, so the
    # check could never fail.
    # NOTE(review): assumes are_annotations_compatible returns a boolean
    # -- confirm against the model class.
    self.assertTrue(model.are_annotations_compatible(annotations))

    # perfect annotators, annotations correspond to prior
    nitems = 20000
    model.theta[:] = 1.
    annotations = model.generate_annotations(nitems)
    freq = labels_frequency(annotations, nclasses)
    np.testing.assert_almost_equal(freq, model.gamma, 2)
def _missing_mask(self, annotations):
    """Return the mask of missing annotations, replicated for every class.

    The complement of ``is_valid(annotations)`` is given a trailing axis
    and tiled ``self.nclasses`` times along it, so that for 2-D
    annotations the result has shape ``annotations.shape + (nclasses,)``.
    """
    is_missing = ~is_valid(annotations)
    expanded = is_missing[:, :, np.newaxis]
    return np.tile(expanded, (1, 1, self.nclasses))
def all_invalid(*annotations):
    """Return True if all annotations are invalid.

    Each positional argument is checked with ``is_valid``; the scan stops
    at the first argument that contains at least one valid entry. With no
    arguments the result is True.
    """
    return not any(np.any(is_valid(anno)) for anno in annotations)
def _missing_mask(self, annotations):
    """Build a per-class copy of the missing-annotation mask.

    An entry of the result is True where ``is_valid`` rejects the
    corresponding annotation; the mask is repeated ``self.nclasses`` times
    along a new last axis.
    """
    absent = ~is_valid(annotations)
    with_class_axis = absent[:, :, None]
    repetitions = (1, 1, self.nclasses)
    return np.tile(with_class_axis, repetitions)