def test_check_indices(self):
    """Exercise `check_indices` with default, valid and invalid inputs."""
    # Passing None is expected to expand to every index up to max_index.
    all_ids = check_indices(None, max_index=4)
    np.testing.assert_array_equal(list(range(5)), all_ids)

    # A valid subset is handed back unchanged.
    subset_ids = check_indices([2, 3], max_index=4)
    np.testing.assert_array_equal([2, 3], subset_ids)

    # Duplicated entries and entries above max_index must be rejected.
    for invalid_ids in ([3, 3], [2, 6]):
        self.assertRaises(ValueError, check_indices, indices=invalid_ids,
                          max_index=5)
def confidence_scores(self, X, annotator_ids=None, **kwargs):
    """Look up the stored confidence scores for the given samples.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)
        Samples whose confidence scores are requested.
    annotator_ids: array-like, shape (n_queried_annotators)
        Indices of the annotators whose confidence scores are requested.

    Returns
    -------
    C: numpy.ndarray, shape (n_samples, n_annotators)
        Confidence scores of the requested annotators. Entries of annotators
        that were not requested, and of samples that are not found in
        `self.X_`, stay np.nan.
    """
    # validate the requested annotators first, then the samples
    annotator_ids = check_indices(annotator_ids, self.n_annotators() - 1,
                                  'annotator_ids')
    X = check_array(X)

    # position of every row of X inside self.X_ (-1 marks "not found")
    stored_rows = indices(self.X_, X, missing=-1)
    is_known = stored_rows >= 0

    # column vector so that it broadcasts against the selected rows
    annotator_cols = annotator_ids[:, None]

    # start with "nothing known" and copy over the available scores
    C = np.full((np.size(X, 0), self.n_annotators()), np.nan)
    C[is_known, annotator_cols] = self.C_[stored_rows[is_known],
                                          annotator_cols]
    return C
def class_labels(self, X, annotator_ids=None, query_value=1, **kwargs):
    """Method returning the class labels of the given samples.

    The labels are first obtained from the wrapped `annotator_model_` and
    are then modified per annotator with a probability that depends on the
    annotator's learning rate and on the number of queries it had processed
    before this call.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)
        Samples whose class labels are queried.
    y_true: array-like, shape (n_samples)
        The true class label of each given sample. It is read from
        `kwargs` (key 'y_true').
    annotator_ids: array-like, shape (n_queried_annotators)
        The indices of the annotators whose class labels are queried.
    query_value: int
        The query value represents the increment of the query statistics
        of the queried annotators.

    Returns
    -------
    Y: numpy.ndarray, shape (n_samples, n_annotators)
        Class labels of the given samples which were provided by the
        queried annotators. The non queried annotators return np.nan values.
    """
    # check parameters
    X, y_true = check_X_y(X, kwargs.get('y_true'))
    annotator_ids = check_indices(annotator_ids, self.n_annotators() - 1, 'annotator_ids')
    query_value = check_positive_integer(query_value, 'query_value')

    # remember the query counts before the wrapped model is queried, because
    # the call below is handed query_value and may update the statistics
    prev_n_queries = np.copy(self.n_queries())
    Y = self.annotator_model_.class_labels(X=X, annotator_ids=annotator_ids, query_value=query_value, **kwargs)

    # ensures constant accuracy for given number of queries: every annotator
    # gets its own generator seeded with its previous number of queries
    random_states = [np.random.RandomState(prev_n_queries[a_idx]) for a_idx in range(self.n_annotators())]

    # labels are only modified once at least one query has been processed
    if np.sum(prev_n_queries) > 0:
        # obtain class labels
        # NOTE(review): the inner loop covers every annotator, not only
        # annotator_ids. If the wrapped model returns np.nan for non queried
        # annotators, the comparisons with y_true treat np.nan as a wrong
        # label, so such entries may be overwritten -- confirm this is
        # intended.
        for x_idx in range(len(X)):
            Y_x = Y[x_idx, :]
            for a_idx in range(self.annotator_model_.n_annotators()):
                # probability of changing the label grows linearly with the
                # number of previous queries and is capped at 1
                flip_p = min(abs(self.learning_rates_[a_idx]) * prev_n_queries[a_idx], 1)
                flip = random_states[a_idx].binomial(1, flip_p)
                if self.learning_rates_[a_idx] < 0:
                    # negative learning rate: labels get worse over time
                    if y_true[x_idx] == Y_x[a_idx]:
                        if flip and not self.adversarial_[a_idx]:
                            # non-adversarial: draw any known class label
                            Y_x[a_idx] = random_states[a_idx].choice(self.y_unique_)
                        if flip and self.adversarial_[a_idx]:
                            # adversarial: draw among the false labels only
                            false_labels = self.y_unique_[self.y_unique_ != y_true[x_idx]]
                            Y_x[a_idx] = random_states[a_idx].choice(false_labels)
                    else:
                        # label is already wrong: only non-adversarial
                        # annotators redraw from all known class labels
                        if flip and not self.adversarial_[a_idx]:
                            Y_x[a_idx] = random_states[a_idx].choice(self.y_unique_)
                elif self.learning_rates_[a_idx] > 0 and y_true[x_idx] != Y_x[a_idx]:
                    # positive learning rate: a wrong label is corrected
                    if flip:
                        Y_x[a_idx] = y_true[x_idx]
            Y[x_idx, :] = Y_x
    return Y
def plot_labelling_accuracy(self, X, y_true, annotator_ids=None,
                            figsize=(4, 4), dpi=150, fontsize=12):
    """Method plotting the labelling accuracy of each desired annotator.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)
        Samples on which the labelling accuracies of the annotators is
        evaluated.
    y_true: array-like, shape (n_samples)
        True class labels of the given samples.
    annotator_ids: array-like, shape (n_queried_annotators)
        The indices of the annotators whose labelling accuracies are
        plotted.
    figsize: 2-tuple of floats, default: (4, 4)
        Figure dimension (width, height) in inches.
    dpi: float, default: 150
        Dots per inch.
    fontsize: int
        Font size of plotted text.

    Returns
    -------
    fig: matplotlib.figure.Figure object
        Created figure.
    ax: :py:class:`matplotlib.axes.Axes` object
        Created axes.
    """
    annotator_ids = check_indices(annotator_ids, self.n_annotators() - 1,
                                  'annotator_ids')

    # accuracies of all annotators; undefined accuracies are drawn as zero
    acc = np.asarray(self.labelling_performance(X, y_true))
    acc[np.isnan(acc)] = 0

    # One bar per *plotted* annotator: positions, heights and tick labels
    # must all have len(annotator_ids) entries, otherwise plotting a subset
    # of the annotators fails with a shape mismatch.
    x = np.arange(len(annotator_ids))
    annot_names = [
        r'annotator $a_' + str(a_idx) + '$' for a_idx in annotator_ids
    ]

    fig, ax = plt.subplots(1, 1, figsize=figsize, dpi=dpi)
    ax.bar(x, acc[annotator_ids])
    plt.xticks(x, annot_names, fontsize=fontsize)
    plt.yticks(fontsize=fontsize)
    plt.ylabel('labelling accuracy', fontsize=fontsize)
    plt.title('labelling accuracy of annotators', fontsize=fontsize)
    return fig, ax
def class_labels(self, X, annotator_ids=None, query_value=1, **kwargs):
    """Collect the class labels of the given samples from all annotator types.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)
        Samples whose class labels are queried.
    annotator_ids: array-like, shape (n_queried_annotators)
        Indices of the annotators whose class labels are queried. If None,
        every annotator type is queried with `annotator_ids=None`.
    query_value: int
        Increment of the query statistics of the queried annotators; it is
        forwarded to the annotator types.

    Returns
    -------
    Y: numpy.ndarray, shape (n_samples, n_annotators)
        Horizontally stacked class labels of all annotator types. Annotator
        types without any queried annotator contribute np.nan columns.
    """
    X = check_array(X)

    # no selection: simply ask every annotator type for all of its labels
    if annotator_ids is None:
        return np.hstack([
            annot_type.class_labels(X, None, query_value, **kwargs)
            for annot_type in self.annotator_types_
        ])

    # map the global annotator indices to one index collection per type
    ids_per_type = self._transform_ids(
        check_indices(annotator_ids, self.n_annotators() - 1,
                      'annotator_ids'))

    label_blocks = []
    for type_idx, annot_type in enumerate(self.annotator_types_):
        type_ids = ids_per_type[type_idx]
        if len(type_ids) > 0:
            block = annot_type.class_labels(X=X, annotator_ids=type_ids,
                                            query_value=query_value,
                                            **kwargs)
        else:
            # nothing queried for this type: placeholder block of np.nan
            block = np.full((len(X), annot_type.n_annotators()), np.nan)
        label_blocks.append(block)
    return np.hstack(label_blocks)
def class_labels(self, X, annotator_ids=None, query_value=1, **kwargs):
    """Method returning the class labels of the given samples.

    If the query value is greater than zero, it updates the n_queries and
    queried sample statistics.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)
        Samples whose class labels are queried.
    annotator_ids: array-like, shape (n_queried_annotators)
        The indices of the annotators whose class labels are queried.
    query_value: int
        The query value represents the increment of the query statistics
        of the queried annotators.

    Returns
    -------
    Y: numpy.ndarray, shape (n_samples, n_annotators)
        Class labels of the given samples which were provided by the
        queried annotators. The non queried annotators return np.nan
        values; the same holds for samples that are not found in `self.X_`.
    """
    # check annotator_ids
    annotator_ids = check_indices(annotator_ids, self.n_annotators() - 1,
                                  'annotator_ids')

    # obtain ids of queried samples (-1 marks samples missing in self.X_)
    X = check_array(X)
    sample_ids = indices(self.X_, X, missing=-1)
    sample_ids_flag = sample_ids >= 0
    known_sample_ids = sample_ids[sample_ids_flag]

    # column vector so that it broadcasts against the selected rows
    annotator_cols = annotator_ids[:, None]

    # class labels provided by queried annotators
    Y = np.full((np.size(X, 0), self.n_annotators()), np.nan)
    Y[sample_ids_flag, annotator_cols] = self.Y_[known_sample_ids,
                                                 annotator_cols]

    # update query statistics
    if query_value > 0:
        # Only flag samples that were actually found. Indexing with the
        # unfiltered ids would use -1 for unknown samples, which NumPy
        # interprets as "last row" and would wrongly mark the last stored
        # sample as queried.
        self.queried_flags_[known_sample_ids, annotator_cols] = True
        self.n_queries_[annotator_ids] += query_value
    return Y
def confidence_scores(self, X, annotator_ids=None, **kwargs):
    """Collect the confidence scores of all annotator types for the samples.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)
        Samples whose confidence scores are queried.
    annotator_ids: array-like, shape (n_queried_annotators)
        Indices of the annotators whose confidence scores are queried. If
        None, every annotator type is asked for all of its scores.

    Returns
    -------
    C: numpy.ndarray, shape (n_samples, n_annotators)
        Horizontally stacked confidence scores of all annotator types.
        Annotator types without any queried annotator contribute np.nan
        columns.
    """
    X = check_array(X)

    # no selection: concatenate the full score matrices of all types
    if annotator_ids is None:
        return np.hstack([
            annot_type.confidence_scores(X)
            for annot_type in self.annotator_types_
        ])

    # map the global annotator indices to one index collection per type
    ids_per_type = self._transform_ids(
        check_indices(annotator_ids, self.n_annotators() - 1,
                      'annotator_ids'))

    score_blocks = []
    for type_idx, annot_type in enumerate(self.annotator_types_):
        type_ids = ids_per_type[type_idx]
        if len(type_ids) > 0:
            block = annot_type.confidence_scores(X, type_ids)
        else:
            # nothing queried for this type: placeholder block of np.nan
            block = np.full((len(X), annot_type.n_annotators()), np.nan)
        score_blocks.append(block)
    return np.hstack(score_blocks)
def plot_class_labels(self, X, features_ids=None, annotator_ids=None,
                      plot_confidences=5, y_true=None, figsize=(5, 3),
                      dpi=150, fontsize=7, **kwargs):
    """Method creating scatter plots of the given samples for each annotator.

    In each scatter plot, the samples are colored according the class labels
    provided by the corresponding annotator.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)
        Samples which are plotted.
    features_ids: array-like, shape (2)
        The feature indices to be plotted. Only the first two indices are
        used; each index must be a valid column of X. If None, the first
        (and, if available, the second) feature is plotted.
    annotator_ids: array-like, shape (n_queried_annotators)
        The indices of the annotators whose class label distributions are
        plotted.
    plot_confidences: boolean
        If true (any truthy value, including the default), the size of the
        markers is plotted according to the given confidence scores.
    y_true: array-like, shape (n_samples)
        This is a optional parameter. If the true class labels are given,
        the samples are marked according to correctness of their predicted
        class label.
    figsize: 2-tuple of floats, default: (5, 3)
        Figure dimension (width, height) in inches; the height is
        multiplied by the number of plotted annotators.
    dpi: float, default: 150
        Dots per inch.
    fontsize: int
        Font size of plotted text.
    kwargs: dict
        Further keyword arguments forwarded to `self.class_labels`.

    Returns
    -------
    fig: matplotlib.figure.Figure object
        Created figure.
    ax: array-like, shape (n_annotator_ids)
        The array ax is a collection of :py:class:`matplotlib.axes.Axes`
        instances representing the plots of the annotators.
    """
    # check annotator_ids
    annotator_ids = check_indices(annotator_ids, self.n_annotators() - 1,
                                  'annotator_ids')

    # check features_ids
    X = check_array(X)
    if features_ids is None:
        # one feature for one-dimensional data, otherwise the first two
        features_ids = np.arange(1 + (np.size(X, 1) > 1), dtype=int)
    n_features = len(features_ids)
    # The largest valid feature index is given by the number of columns of
    # X, not by the number of requested features; otherwise any feature
    # index above 1 would be rejected.
    features_ids = check_indices(features_ids, np.size(X, 1) - 1,
                                 'feature_ids')
    x_0 = X[:, features_ids[0]]
    if n_features > 1:
        x_1 = X[:, features_ids[1]]
    else:
        # a single feature is drawn along a horizontal line
        x_1 = np.zeros(np.size(X, 0))

    # check true class labels
    if y_true is not None:
        X, y_true = check_X_y(X, y_true)
    m_true = 'o'
    m_false = 'x'

    # query class labels without touching the query statistics
    Y = self.class_labels(X=X, annotator_ids=annotator_ids, query_value=0,
                          y_true=y_true, **kwargs)
    C = self.confidence_scores(X=X, annotator_ids=annotator_ids)
    y_unique = np.unique(Y[~np.isnan(Y)])
    n_classes = len(y_unique)

    # setup the scatter plots (one row per plotted annotator)
    n_plots = len(annotator_ids)
    fig, ax = plt.subplots(n_plots, 1,
                           figsize=(figsize[0], n_plots * figsize[1]),
                           dpi=dpi)
    ax = [ax] if n_plots == 1 else ax
    # one color per class plus a final color for missing predictions
    colors = cm.rainbow(np.linspace(0, 1, n_classes + 1))

    for a, a_idx in enumerate(annotator_ids):
        y_a = Y[:, a_idx]

        # marker sizes
        if plot_confidences:
            c_a = C[:, a_idx]
            c_a = (.015 * dpi
                   + .0175 * dpi * c_a / np.nanmax(C)) ** (350 / dpi)
            c_a[np.isnan(c_a)] = 1
        else:
            c_a = np.full(len(y_a), .1 * dpi)

        if y_true is None:
            for i, y in enumerate(y_unique):
                flag = y_a == y
                ax[a].scatter(x_0[flag], x_1[flag],
                              color=colors[i].reshape(1, -1),
                              label='prediction is ' + str(int(y)),
                              s=c_a[flag])
        else:
            for i, y in enumerate(y_unique):
                true_predictions = np.logical_and(y_a == y, y_a == y_true)
                false_predictions = np.logical_and(y_a == y, y_a != y_true)
                ax[a].scatter(x_0[true_predictions], x_1[true_predictions],
                              color=colors[i].reshape(1, -1),
                              label='prediction ' + str(int(y)) + ' is true',
                              marker=m_true, s=c_a[true_predictions])
                ax[a].scatter(x_0[false_predictions],
                              x_1[false_predictions],
                              color=colors[i].reshape(1, -1),
                              label='prediction ' + str(int(y))
                                    + ' is false',
                              marker=m_false, s=c_a[false_predictions])

        # samples without a prediction of this annotator
        missing = np.isnan(y_a)
        n_missing = np.sum(missing)
        if n_missing > 0:
            ax[a].scatter(x_0[missing], x_1[missing],
                          color=colors[n_classes].reshape(1, -1),
                          label='prediction is NA', marker=m_false,
                          s=np.full(n_missing, 5))

        lgnd = ax[a].legend(loc='best', fancybox=False, framealpha=0.5,
                            prop={'size': fontsize})
        # Newer Matplotlib versions expose the handles as `legend_handles`
        # and no longer provide `legendHandles`; support both.
        handles = getattr(lgnd, 'legend_handles', None)
        if handles is None:
            handles = lgnd.legendHandles
        for handle in handles:
            # uniform marker size inside the legend
            handle.set_sizes([fontsize])
        ax[a].tick_params(labelsize=fontsize)
        ax[a].set_title(r'class labels predicted by annotator $a_'
                        + str(a_idx) + '$', fontsize=fontsize)
    return fig, ax
def plot_labelling_confusion_matrices(self, X, y_true, y_unique,
                                      annotator_ids=None, figsize=(4, 4),
                                      dpi=150, fontsize=12):
    """Draw one confusion-matrix heatmap per selected annotator.

    Parameters
    ----------
    X: array-like, shape (n_samples, n_features)
        Samples on which the labelling confusion matrices of the annotators
        are evaluated.
    y_true: array-like, shape (n_samples)
        True class labels of the given samples.
    y_unique: array-like, shape (n_classes)
        Class labels; their number fixes the size of the plotted matrices
        and their sorted values are used as tick labels.
    annotator_ids: array-like, shape (n_queried_annotators)
        The indices of the annotators whose labelling confusion matrices
        are plotted.
    figsize: 2-tuple of floats, default: (4, 4)
        Figure dimension (width, height) in inches; the height is
        multiplied by the number of plotted annotators.
    dpi: float, default: 150
        Dots per inch.
    fontsize: int
        Font size of plotted text.

    Returns
    -------
    fig: matplotlib.figure.Figure object
        Created figure.
    ax: array-like, shape (n_annotator_ids)
        Collection of :py:class:`matplotlib.axes.Axes` instances, one per
        plotted annotator.
    """
    annotator_ids = check_indices(annotator_ids, self.n_annotators() - 1,
                                  'annotator_ids')
    conf_matrices = self.labelling_performance(X, y_true,
                                               perf_func=confusion_matrix)
    n_classes = len(y_unique)

    # entries that are not arrays are replaced (in place) by empty matrices
    for matrix_idx, matrix in enumerate(conf_matrices):
        if not isinstance(matrix, np.ndarray):
            conf_matrices[matrix_idx] = np.zeros((n_classes, n_classes),
                                                 dtype=int)

    y_unique = np.sort(y_unique)
    class_positions = range(n_classes)

    # one subplot row per plotted annotator
    n_plots = len(annotator_ids)
    fig, ax = plt.subplots(n_plots, 1,
                           figsize=(figsize[0], figsize[1] * n_plots),
                           dpi=dpi)
    ax = [ax] if n_plots == 1 else ax

    for axis, a_idx in zip(ax, annotator_ids):
        df_cm = pd.DataFrame(conf_matrices[a_idx], index=class_positions,
                             columns=class_positions)
        sn.heatmap(df_cm, annot=True, annot_kws={"size": fontsize},
                   xticklabels=y_unique, yticklabels=y_unique,
                   cmap="YlGnBu", fmt="d", ax=axis, cbar=False)
        axis.tick_params(labelsize=fontsize)
        axis.set_xlabel('predicted class labels', fontsize=fontsize)
        axis.set_ylabel('true class labels', fontsize=fontsize)
        axis.set_title('confusion matrix of annotator $a_' + str(a_idx)
                       + '$', fontsize=fontsize)
    return fig, ax