if len(set(X)) >= 1 and len(set(Y)) >= 1:
                        spearmans.append(correlation.spearman_rho_tr(X, Y))
            write('\t%s, %s: bootstrapped spearman: %.3f (dev=%.3f)' %
                (condition, condition, utils.mean(spearmans), utils.dev(spearmans)))

# For every question type, compare the per-pair consensus values of each
# ordered pair of distinct conditions (ignoring the aggregate 'all' condition):
# report the Pearson and Spearman correlations over the pair ids present in
# both, then list the ten pair ids whose values differ the most.
for ktype in pair_means.keys():
    write('question type: %s' % ktype)
    for (condition1, consensus1) in pair_means[ktype].items():
        for (condition2, consensus2) in pair_means[ktype].items():
            # Nothing to compare if either side is empty.
            if not (consensus1 and consensus2):
                continue
            # Skip self-comparisons and anything involving 'all'.
            if condition1 == condition2 or 'all' in (condition1, condition2):
                continue
            write('\tcondition %s, %s' % (condition1, condition2))

            # Restrict the comparison to pair ids present in both conditions.
            ids1 = set(consensus1.keys())
            common_ids = list(ids1.intersection(consensus2.keys()))
            X = []
            Y = []
            for pid in common_ids:
                X.append(consensus1[pid])
                Y.append(consensus2[pid])

            # Largest absolute difference first; the remaining tuple fields
            # act as tie-breakers exactly as in a plain tuple sort.
            diffs = []
            for (pid, x, y) in zip(common_ids, X, Y):
                diffs.append((abs(x - y), x, y, pid))
            diffs = sorted(diffs)
            diffs.reverse()

            pearson = correlation.pearson_rho(X, Y)
            write('\t\tpearson: %.3f' % pearson)
            spearman = correlation.spearman_rho_tr(X, Y)
            write('\t\tspearman: %.3f' % spearman)

            # Detail the ten most divergent pairs, with a two-sample t-test
            # over the individual ratings collected under each condition.
            for (d, x, y, pid) in diffs[:10]:
                pair = s.id_to_phrases(pid)
                ratings_by_condition = pair_ratings[ktype]
                ratings1 = ratings_by_condition[condition1][pid]
                ratings2 = ratings_by_condition[condition2][pid]
                t, p = stats.ttest_ind(ratings1, ratings2)

                detail = (pair[0], pair[1], x, y, t, p)
                write('\t\t\t%s, %s: %.2f to %.2f (ttest t=%.5f, p=%.5f)' % detail)
示例#2
0
#!/usr/bin/python -O

import sys

import correlation
import utils

survey = utils.Survey()

# Load the reference scores from dat/general.txt. Each non-blank line is
# expected to hold at least three tab-separated fields: two phrases followed
# by a numeric score. The two phrases are stripped and sorted so a pair can be
# looked up regardless of the order in which its phrases are given.
wordsim = {}
# Use a context manager so the file handle is closed deterministically, even
# if a malformed line raises part-way through.
with open('dat/general.txt') as wordsim_file:
    for line in wordsim_file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing with an IndexError on tokens[2].
            continue
        tokens = line.split('\t')
        pair = tuple(sorted([t.strip() for t in tokens[:2]]))
        wordsim[pair] = float(tokens[2])


# For each rater condition, correlate the mean survey response of every pair
# with its reference score in `wordsim`, and print the Pearson and Spearman
# correlations on one line.
for condition in ('all', 'mturk', 'scholar'):
    ratings = survey.get_ratings_by_condition('general', condition)
    X = []  # mean survey response per pair
    Y = []  # reference score for the same pair
    for (pair_id, pair_ratings) in ratings.items():
        # Sort the phrases so the key matches the way `wordsim` was built.
        pair = tuple(sorted(survey.id_to_phrases(pair_id)))
        if pair not in wordsim:
            print('unknown pair: %s' % (pair,))
            continue
        if len(pair_ratings) <= 10:
            # Previously reported as "unknown pair", which was misleading:
            # the pair is known but has too few ratings to be included.
            print('too few ratings: %s' % (pair,))
            continue
        # Only ratings that actually carry a response contribute to the mean.
        responses = [r.response for r in pair_ratings if r.has_response()]
        X.append(utils.mean(responses))
        Y.append(wordsim[pair])
    # A single pre-formatted argument prints the same text as the original
    # space-separated print statement.
    print('%s %s %s' % (condition,
                        correlation.pearson_rho(X, Y),
                        correlation.spearman_rho_tr(X, Y)))
示例#3
0
                   utils.dev(spearmans)))

# Per question type, walk every ordered pair of distinct conditions (the
# aggregate 'all' condition is excluded), report how well their per-pair
# consensus values correlate, and list the ten largest disagreements.
for ktype in pair_means.keys():
    write('question type: %s' % ktype)
    for (condition1, consensus1) in pair_means[ktype].items():
        for (condition2, consensus2) in pair_means[ktype].items():
            # Skip empty conditions.
            if not (consensus1 and consensus2):
                continue
            # Skip self-comparisons and anything involving 'all'.
            if condition1 == condition2 or 'all' in (condition1, condition2):
                continue
            write('\tcondition %s, %s' % (condition1, condition2))

            # Only pair ids present under both conditions are comparable.
            ids1 = set(consensus1.keys())
            common_ids = list(ids1.intersection(consensus2.keys()))
            X = []
            Y = []
            for pid in common_ids:
                X.append(consensus1[pid])
                Y.append(consensus2[pid])

            # Order by absolute difference, largest first (full tuple order
            # breaks ties, as with a plain sort followed by a reverse).
            diffs = []
            for (pid, x, y) in zip(common_ids, X, Y):
                diffs.append((abs(x - y), x, y, pid))
            diffs = sorted(diffs)
            diffs.reverse()

            pearson = correlation.pearson_rho(X, Y)
            write('\t\tpearson: %.3f' % pearson)
            spearman = correlation.spearman_rho_tr(X, Y)
            write('\t\tspearman: %.3f' % spearman)

            # Show the ten most divergent pairs together with a two-sample
            # t-test over the individual ratings of each condition.
            for (d, x, y, pid) in diffs[:10]:
                pair = s.id_to_phrases(pid)
                ratings_by_condition = pair_ratings[ktype]
                ratings1 = ratings_by_condition[condition1][pid]
                ratings2 = ratings_by_condition[condition2][pid]
                t, p = stats.ttest_ind(ratings1, ratings2)

                detail = (pair[0], pair[1], x, y, t, p)
                write('\t\t\t%s, %s: %.2f to %.2f (ttest t=%.5f, p=%.5f)' %
                      detail)
            # Estimate agreement between the users of two conditions: for every
            # two distinct users with enough pair ids in common, average the
            # Pearson and Spearman correlations over several random samples of
            # those shared ids, then report mean and deviation across users.
            write('\tcondition %s, %s' % (condition1, condition2))
            pearsons = []
            spearmans = []
            samples_per_user = 10
            sample_size = 5  # pair ids drawn (without replacement) per sample
            for u1 in users1:
                for u2 in users2:
                    # Never compare a user object with itself.
                    if u1 is u2:
                        continue
                    common_pair_ids = list(
                        set(u1.keys()).intersection(u2.keys()))
                    if len(common_pair_ids) < sample_size:
                        continue
                    up = []  # user pearson samples
                    us = []  # user spearman samples
                    for i in xrange(samples_per_user):
                        sample_pair_ids = random.sample(common_pair_ids,
                                                        sample_size)
                        X = [u1[pid] for pid in sample_pair_ids]
                        Y = [u2[pid] for pid in sample_pair_ids]
                        # Skip samples where either user gave one single value
                        # for every sampled id (presumably because the
                        # correlation is undefined without any variance).
                        if len(set(X)) != 1 and len(set(Y)) != 1:
                            # Do not bind these results to `p` / `s`: `s` is
                            # used elsewhere in this file as the survey object
                            # (s.id_to_phrases), and rebinding it to a number
                            # breaks that code.
                            pearson_sample = correlation.pearson_rho(X, Y)
                            spearman_sample = correlation.spearman_rho_tr(X, Y)
                            up.append(pearson_sample)
                            us.append(spearman_sample)
                    if up and us:
                        pearsons.append(utils.mean(up))
                        spearmans.append(utils.mean(us))
            write('\t\td unique pairs of users: %d' % len(spearmans))
            write('\t\tspearman: mean=%.3f, dev=%s' %
                  (utils.mean(spearmans), utils.dev(spearmans)))
            write('\t\tpearson: mean=%.3f, dev=%s' %
                  (utils.mean(pearsons), utils.dev(pearsons)))
        # Compare the users of the current condition (condition1 / users1, bound
        # by an enclosing loop) with the users of every condition of this
        # question type: for each two distinct users with enough pair ids in
        # common, average the Pearson and Spearman correlations over several
        # random samples of the shared ids, then report mean and deviation.
        for (condition2, users2) in users[ktype].items():
            if not users1 or not users2:
                continue
            write('\tcondition %s, %s' % (condition1, condition2))
            pearsons = []
            spearmans = []
            samples_per_user = 10
            sample_size = 5  # pair ids drawn (without replacement) per sample
            for u1 in users1:
                for u2 in users2:
                    # Never compare a user object with itself.
                    if u1 is u2:
                        continue
                    common_pair_ids = list(set(u1.keys()).intersection(u2.keys()))
                    if len(common_pair_ids) < sample_size:
                        continue
                    up = []  # user pearson samples
                    us = []  # user spearman samples
                    for i in xrange(samples_per_user):
                        sample_pair_ids = random.sample(common_pair_ids, sample_size)
                        X = [u1[pid] for pid in sample_pair_ids]
                        Y = [u2[pid] for pid in sample_pair_ids]
                        # Skip samples where either user gave one single value
                        # for every sampled id (presumably because the
                        # correlation is undefined without any variance).
                        if len(set(X)) != 1 and len(set(Y)) != 1:
                            # Do not bind these results to `p` / `s`: `s` is
                            # used elsewhere in this file as the survey object
                            # (s.id_to_phrases), and rebinding it to a number
                            # breaks that code.
                            pearson_sample = correlation.pearson_rho(X, Y)
                            spearman_sample = correlation.spearman_rho_tr(X, Y)
                            up.append(pearson_sample)
                            us.append(spearman_sample)
                    if up and us:
                        pearsons.append(utils.mean(up))
                        spearmans.append(utils.mean(us))
            write('\t\td unique pairs of users: %d' % len(spearmans))
            write('\t\tspearman: mean=%.3f, dev=%s' % (utils.mean(spearmans), utils.dev(spearmans)))
            write('\t\tpearson: mean=%.3f, dev=%s' % (utils.mean(pearsons), utils.dev(pearsons)))