def __main__(argv):
    if len(argv) != 2:
        print("Specify cmd arg")
        sys.exit(2)
    else:
        arg = argv[1]

    if arg == 'img':
        reliability_mat = getReliabilityMatImg("../data/imageGID_job_map_expt2_corrected.csv")
    else:
        reliability_mat = getReliabilityMatTurker()

    t = AnnotationTask(data=reliability_mat)
    print("Calculating the agreement scores")
    alpha = t.alpha()
    print("Alpha = %f" % alpha)
    s = t.S()
    print("S = %f" % s)
    pi = t.pi()
    print("Pi = %f" % pi)
    kappa = t.kappa()
    print("kappa = %f" % kappa)
def agree_tags(delta, column):
    """
    Computes agreement for single-token tagging tasks.
    :param delta: the compared (merged) annotation data
    :param column: the column for which agreement is to be computed
    :return:
    """
    by_field = reverse_tags(delta, column)
    task = AnnotationTask(data=by_field)
    oa = task.avg_Ao()              # observed agreement
    s = task.S()                    # Bennett, Albert and Goldstein S (1954): all categories are equally likely
    pi = task.pi()                  # Scott pi (1955): single distribution
    kappa = task.kappa()            # Cohen kappa (1960): individual coder distribution
    w_kappa = task.weighted_kappa()
    alpha = task.alpha()            # Krippendorff alpha (1980)
    return oa, s, pi, kappa, w_kappa, alpha
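# A minimal, self-contained sketch (not taken from any of the snippets here) of the
# input format that nltk.metrics.agreement.AnnotationTask expects: an iterable of
# (coder, item, label) triples. The coder names, item ids and labels below are made up.
from nltk.metrics.agreement import AnnotationTask

toy_data = [
    ('coder_1', 'item_1', 'NOUN'),
    ('coder_2', 'item_1', 'NOUN'),
    ('coder_1', 'item_2', 'VERB'),
    ('coder_2', 'item_2', 'NOUN'),
    ('coder_1', 'item_3', 'ADJ'),
    ('coder_2', 'item_3', 'ADJ'),
]

toy_task = AnnotationTask(data=toy_data)
print("Average observed agreement:", toy_task.avg_Ao())
print("Krippendorff's alpha:", toy_task.alpha())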
def compute_annotator_agreement_nltkmetrics(data_array):
    '''See http://nltk.org/api/nltk.metrics.html#nltk.metrics.agreement'''
    print "####### Agreement coefficients according to NLTK metrics.agreement #######"
    t = AnnotationTask(data=data_array)
    print "Average observed agreement across all coders and items: " + str(t.avg_Ao())
    print "Cohen's Kappa (Cohen 1960): " + str(t.kappa())
    print "Weighted kappa (Cohen 1968): " + str(t.weighted_kappa())
    print "Scott's pi (Scott 1955): " + str(t.pi())
    #print "pi_avg: " + str(t.pi_avg())
    print "alpha (Krippendorff 1980): " + str(t.alpha())
    print "Observed disagreement for the alpha coefficient: " + str(t.Do_alpha())
    print "S (Bennett, Albert and Goldstein 1954): " + str(t.S())
    #print "n-notation used in Artstein and Poesio (2007): " + str(t.N(k=, ic???))
    print "Observed disagreement for the weighted kappa coefficient averaged over all labelers: " + str(t.Do_Kw())
experts = ['KEY', 'MG', 'MS', 'TM']
novices = ['KEY', 'CK', 'GK', 'RM']
cols = novices

# Total values
taskdata = []
for coder in cols:
    for i in data[coder].index:
        taskdata.append([coder, i, data[coder][i]])

ratingtask = AnnotationTask(data=taskdata)
print("kappa " + str(ratingtask.kappa()))
print("fleiss " + str(ratingtask.multi_kappa()))
print("alpha " + str(ratingtask.alpha()))
print("scotts " + str(ratingtask.pi()))

# Pairwise values
similarities = []
for coders in itertools.product(cols, repeat=2):
    if coders[0] == coders[1]:
        similarities.append(1)
    else:
        taskdata = []
        for coder in coders:
            for i in data[coder].index:
                taskdata.append([coder, i, data[coder][i]])
        ratingtask = AnnotationTask(data=taskdata)
        k = ratingtask.kappa()
        f = ratingtask.multi_kappa()
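# The snippet above is truncated after computing the pairwise kappa values. Below is a
# self-contained sketch of the same pairwise pattern; the coder names, the toy labels,
# and the reshaping into a coder-by-coder matrix are illustrative assumptions, not the
# original code.
import itertools
import numpy as np
import pandas as pd
from nltk.metrics.agreement import AnnotationTask

toy = {
    'A': pd.Series(['x', 'y', 'x', 'x']),
    'B': pd.Series(['x', 'y', 'y', 'x']),
    'C': pd.Series(['x', 'x', 'y', 'x']),
}
coders = list(toy)

pairwise = []
for c1, c2 in itertools.product(coders, repeat=2):
    if c1 == c2:
        pairwise.append(1.0)  # perfect agreement with oneself
    else:
        triples = [(c, i, toy[c][i]) for c in (c1, c2) for i in toy[c].index]
        pairwise.append(AnnotationTask(data=triples).kappa())

# Reshape the flat list into a coder-by-coder agreement matrix.
matrix = np.array(pairwise).reshape(len(coders), len(coders))
print(pd.DataFrame(matrix, index=coders, columns=coders))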
def status_view(request, task_id=None):
    """
    Renders the evaluation tasks status page for staff users.
    """
    LOGGER.info('Rendering evaluation task overview for user "{0}".'.format(
        request.user.username))

    # Check if user is member in WMT13 group.  If so, redirect to wmt13 app.
    if request.user.groups.filter(name="WMT13").exists():
        LOGGER.info('Redirecting user "{0}" to WMT13 overview.'.format(
            request.user.username))
        return redirect('appraise.wmt13.views.overview')

    if task_id:
        task = get_object_or_404(EvaluationTask, task_id=task_id)

        headers = task.get_status_header()
        status = []
        for user in task.users.all():
            status.append((user.username, task.get_status_for_user(user)))

        scores = None
        result_data = []
        raw_result_data = Counter()
        users = list(task.users.all())

        for item in EvaluationItem.objects.filter(task=task):
            results = []
            for user in users:
                qset = EvaluationResult.objects.filter(user=user, item=item)
                if qset.exists():
                    category = str(qset[0].results)
                    results.append((user.id, item.id, category))
                    raw_result_data[qset[0].raw_result] += 1

            if len(results) == len(users):
                result_data.extend(results)

        # TODO for gisting: calculate, somehow, the percentage of answers
        # against the number of different answers in that same gap, and also
        # regroup them for readability.
        _raw_results = []
        _keys = raw_result_data.keys()
        _total_results = float(sum(raw_result_data.values()))
        for key in sorted(_keys):
            value = raw_result_data[key]
            _raw_results.append((key, value, 100 * value / _total_results))

        try:
            # Computing inter-annotator agreement only makes sense for more
            # than one coder -- otherwise, we only display result_data...
            if len(users) > 1:
                # Check if we can safely use NLTK's AnnotationTask class.
                try:
                    from nltk.metrics.agreement import AnnotationTask
                    chk = AnnotationTask(data=[('b', '1', 'k'), ('a', '1', 'k')])
                    assert(chk == 1.0)

                except AssertionError:
                    LOGGER.debug('Fixing outdated version of AnnotationTask.')
                    from appraise.utils import AnnotationTask

                # We have to sort annotation data to prevent StopIteration errors.
                result_data.sort()
                annotation_task = AnnotationTask(result_data)

                scores = (
                    annotation_task.alpha(),
                    annotation_task.kappa(),
                    annotation_task.S(),
                    annotation_task.pi()
                )

        except ZeroDivisionError:
            scores = None

        except ImportError:
            scores = None

        dictionary = {
            'combined': task.get_status_for_users(),
            'commit_tag': COMMIT_TAG,
            'headers': headers,
            'scores': scores,
            'raw_results': _raw_results,
            'status': status,
            'task_id': task.task_id,
            'task_name': task.task_name,
            'title': 'Evaluation Task Status',
        }

        return render(request, 'evaluation/status_task.html', dictionary)

    else:
        evaluation_tasks = {}
        for task_type_id, task_type in APPRAISE_TASK_TYPE_CHOICES:
            # We collect a list of task descriptions for this task_type.
            evaluation_tasks[task_type] = []

            # Super users see all EvaluationTask items, even non-active ones.
            if request.user.is_superuser:
                _tasks = EvaluationTask.objects.filter(task_type=task_type_id)
            else:
                _tasks = EvaluationTask.objects.filter(task_type=task_type_id,
                    active=True)

            # Loop over the QuerySet and compute task description data.
            for _task in _tasks:
                if not APPRAISE_TASK_CACHE.has_key(_task.task_id):
                    APPRAISE_TASK_CACHE[_task.task_id] = {}

                _cache = APPRAISE_TASK_CACHE[_task.task_id]
                if not _cache.has_key(request.user.username):
                    _update_task_cache(_task, request.user)
                _task_data = _cache[request.user.username]

                # Append new task description to current task_type list.
                evaluation_tasks[task_type].append(_task_data)

            # If there are no task descriptions for this task_type, we skip it.
            if len(evaluation_tasks[task_type]) == 0:
                evaluation_tasks.pop(task_type)

        dictionary = {
            'active_page': "STATUS",
            'commit_tag': COMMIT_TAG,
            'evaluation_tasks': evaluation_tasks,
            'title': 'Evaluation Task Status',
        }

        return render(request, 'evaluation/status.html', dictionary)
def calculate_iaa_label(number, data_dict):
    data = []
    y_true = []
    y_pred = []
    i = 0
    if number == 1:
        for key, value in data_dict.items():
            i += 1
            if value['label1'] in ['', ' ']:
                data.append(('Annotator1', str(i), '0'))
                y_pred.append('0')
            else:
                data.append(('Annotator1', str(i), value['label1']))
                y_pred.append(value['label1'])
            if value['label1_2'] in ['', ' ']:
                data.append(('Annotator2', str(i), '0'))
                y_true.append('0')
            else:
                data.append(('Annotator2', str(i), value['label1_2']))
                y_true.append(value['label1_2'])

        t = AnnotationTask(data)
        print("Scott's Pi for Label {}: {}".format(number, t.pi()))

        matrix = confusion_matrix(y_true, y_pred)
        disp = ConfusionMatrixDisplay(matrix, display_labels=[
            "0", "1", "1+", "1-", "2", "2+", "2-", "3", "3+", "3-",
            "4", "4+", "4-", "5", "5+", "5-"
        ])
        disp = disp.plot(include_values=True, values_format="d")
        fig = plt.gcf()
        fig.set_size_inches(6.5, 6.5)
        plt.xlabel('Annotator2')
        plt.ylabel('Annotator1')
        plt.title('Agreement Label 1')
        # Save the figure before plt.show(), otherwise the saved file is blank.
        plt.savefig('agreement_label1')
        plt.show()
    else:
        for key, value in data_dict.items():
            i += 1
            if value['label2'] in ['', ' ']:
                data.append(('Annotator1', str(i), '0'))
                y_pred.append('0')
            else:
                data.append(('Annotator1', str(i), value['label2']))
                y_pred.append(value['label2'])
            if value['label2_2'] in ['', ' ']:
                data.append(('Annotator2', str(i), '0'))
                y_true.append('0')
            else:
                data.append(('Annotator2', str(i), value['label2_2']))
                y_true.append(value['label2_2'])

        t = AnnotationTask(data)
        print("Scott's Pi for Label {}: {}".format(number, t.pi()))

        matrix = confusion_matrix(y_true, y_pred)
        disp = ConfusionMatrixDisplay(matrix, display_labels=[
            "0", "1", "1+", "1-", "2", "2+", "2-", "3", "3+", "3-",
            "4", "4+", "4-", "5", "5+", "5-"
        ])
        disp = disp.plot(include_values=True, values_format="d")
        fig = plt.gcf()
        fig.set_size_inches(6.5, 6.5)
        plt.xlabel('Annotator2')
        plt.ylabel('Annotator1')
        plt.title('Agreement Label 2')
        # Save the figure before plt.show(), otherwise the saved file is blank.
        plt.savefig('agreement_label2')
        plt.show()
def test_agreement_statistics():
    """Tests agreement statistics functions against those found in NLTK:
    https://www.nltk.org/api/nltk.metrics.html#module-nltk.metrics.agreement

    Compares the values of agreement statistics with those found in:
    Artstein, R. and Poesio, M. (2005) Kappa 3 = Alpha (or Beta),
    University of Essex NLE Technote.

    Data is in: artstein_poesio_example.txt
    """
    file_path = os.path.join("label_data", "artstein_poesio_example.txt")

    # Distance function for weighted agreement stats
    def test_distance_func(label_a, label_b):
        if label_a == label_b:
            return 0
        elif (label_a == 'ireq' and label_b == 'stat') or (label_b == 'ireq' and label_a == 'stat'):
            return 1
        else:
            return 0.5

    # Gets individual user labels
    def get_user_labels(path):
        with open(path, 'r') as file:
            a_stat = [0] * 100
            a_ireq = [0] * 100
            a_chck = [0] * 100
            b_stat = [0] * 100
            b_ireq = [0] * 100
            b_chck = [0] * 100

            for line in file:
                usr = line.split()[0]
                ind = int(line.split()[1])
                lbl = line.split()[2]

                if usr == 'a':
                    if lbl == 'chck':
                        a_chck[ind - 1] += 1
                    elif lbl == 'stat':
                        a_stat[ind - 1] += 1
                    elif lbl == 'ireq':
                        a_ireq[ind - 1] += 1
                elif usr == 'b':
                    if lbl == 'chck':
                        b_chck[ind - 1] += 1
                    elif lbl == 'stat':
                        b_stat[ind - 1] += 1
                    elif lbl == 'ireq':
                        b_ireq[ind - 1] += 1

            a_data = {'stat': a_stat, 'ireq': a_ireq, 'chck': a_chck}
            a_frame = pd.DataFrame(a_data)
            b_data = {'stat': b_stat, 'ireq': b_ireq, 'chck': b_chck}
            b_frame = pd.DataFrame(b_data)

            example_users_dict = {'a': a_frame, 'b': b_frame}
            return example_users_dict

    # NLTK stats
    nltk_stats = AnnotationTask(data=[x.split() for x in open(file_path)])
    print("nltk:")
    print("multi-Pi - " + str(nltk_stats.pi()))
    print("multi-kappa - " + str(nltk_stats.multi_kappa()))
    print("alpha - " + str(nltk_stats.alpha()))

    # Stats from my functions
    example_users = get_user_labels(file_path)
    print("Mine:")
    print("Multi-Pi - {0:.4f}".format(multi_pi(example_users)))
    print("multi-kappa - {0:.4f}".format(multi_kappa(example_users)))
    print("alpha - {0:.4f}".format(alpha(example_users, test_distance_func)))
    print("alpha prime - {0:.4f}".format(alpha_prime(example_users, test_distance_func)))
    print("beta - {0:.4f}".format(beta(example_users, test_distance_func)))

    # Expected values from Artstein and Poesio
    print("Expected:")
    print("multi-Pi - " + str(0.7995))
    print("multi-kappa - " + str(0.8013))
    print("alpha - " + str(0.8156))
    print("alpha prime - " + str(0.8146))
    print("beta - " + str(0.8163))

    # Test bias
    uniform_path = os.path.join("label_data", "bias_uniform.txt")
    unequal_path = os.path.join("label_data", "bias_unequal.txt")
    b_uniform = get_user_labels(uniform_path)
    b_unequal = get_user_labels(unequal_path)

    print("Bias with example_users:")
    print("alpha - {0:.4f}".format(alpha(example_users, test_distance_func)))
    print("beta - {0:.4f}".format(beta(example_users, test_distance_func)))
    print("Bias - {0:.4f}".format(bias(example_users, test_distance_func)))

    # Test uniform first
    print("Bias with uniform:")
    print("alpha - {0:.4f}".format(alpha(b_uniform, test_distance_func)))
    print("beta - {0:.4f}".format(beta(b_uniform, test_distance_func)))
    print("Bias - {0:.4f}".format(bias(b_uniform, test_distance_func)))

    print("Bias with unequal:")
    print("alpha - {0:.4f}".format(alpha(b_unequal, test_distance_func)))
    print("beta - {0:.4f}".format(beta(b_unequal, test_distance_func)))
    print("Bias - {0:.4f}".format(bias(b_unequal, test_distance_func)))
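# NLTK's AnnotationTask can also apply a custom distance function directly: its
# constructor accepts a `distance` argument that the weighted coefficients
# (e.g. alpha and weighted_kappa) use. Below is a minimal sketch with made-up
# (coder, item, label) triples and a distance in the spirit of test_distance_func
# above; it is illustrative only, not part of the test function.
from nltk.metrics.agreement import AnnotationTask

def label_distance(label_a, label_b):
    # 0 for identical labels, 1 for the ireq/stat confusion, 0.5 otherwise.
    if label_a == label_b:
        return 0.0
    if {label_a, label_b} == {'ireq', 'stat'}:
        return 1.0
    return 0.5

triples = [
    ('a', '1', 'stat'), ('b', '1', 'stat'),
    ('a', '2', 'ireq'), ('b', '2', 'stat'),
    ('a', '3', 'chck'), ('b', '3', 'ireq'),
    ('a', '4', 'chck'), ('b', '4', 'chck'),
]

weighted_task = AnnotationTask(data=triples, distance=label_distance)
print("weighted alpha -", weighted_task.alpha())
print("weighted kappa -", weighted_task.weighted_kappa())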