import re

from Annotation import Annotation as Annotation
from operator import itemgetter
from itertools import groupby
from nltk.tokenize import word_tokenize

import seaborn as sns
sns.set(style="white", color_codes=True)


def preprocessing(raw_document):
    urls = r'(http.+?(\s|$))'
    specialchar = r'|[^A-Za-z\s]'
    doc = raw_document.lower()
    tokens = word_tokenize(re.sub(urls + specialchar, ' ', doc))
    return tokens
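
# Illustrative behaviour on a made-up string (not from the dataset): the text is
# lower-cased, URLs and non-letter characters are replaced by spaces, and NLTK's
# word_tokenize splits the remainder, e.g.
#   preprocessing('Check http://example.com NOW!!')  ->  ['check', 'now']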


A = Annotation('data/exportMedium.json')

docAnnos = A.perDocument(1, ['document', 'annotations'])

# This blob produces a dictionary whose keys are the proposal flags
# ['proposal', 'no proposal', 'wrong proposal']. Each value is a list of tuples
# (number of words in the document, proposalFlag, duration); the shape is
# sketched right after the expression below.
blob = dict((k, list(v))
            for k, v in groupby(sorted([(len(preprocessing(document)),
                                         annotation['proposalFlag'],
                                         annotation['duration'])
                                        for (document, annotations) in docAnnos
                                        for annotation in annotations],
                                       key=itemgetter(1)),
                                key=itemgetter(1)))
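# Hypothetical illustration of blob's shape (word counts and durations are
# invented placeholders, including the duration format):
# {'proposal':       [(42, 'proposal', '0:00:07'), ...],
#  'no proposal':    [(13, 'no proposal', '0:01:02'), ...],
#  'wrong proposal': [(87, 'wrong proposal', '0:00:31'), ...]}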

plotContent = dict((
    f,
    map(lambda (count, flag, duration): (count, A.durationToSec(duration)), v))
    for f, v in blob.items())
Example no. 2
from __future__ import division
from Annotation import Annotation as Annotation
from operator import itemgetter
from itertools import groupby

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
sns.set(color_codes=True, style="whitegrid")

A = Annotation('data/exportMedium.json')

documents, allAnnotations = zip(*A.perDocument(2, ['document', 'annotations']))

totalByAggrement = [(len(annotations),
                     len(set(map(lambda a: a['labels'][0], annotations))))
                    for annotations in allAnnotations]

rlt = [(length, [(num, len(list(v2)))
                 for num, v2 in groupby(v1, key=itemgetter(1))])
       for length, v1 in groupby(sorted(totalByAggrement), key=itemgetter(0))]
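# Hypothetical illustration of rlt's shape (counts invented): each entry pairs
# an annotations-per-document count with a histogram of distinct-label counts,
# e.g. [(2, [(1, 5), (2, 3)]),
#       (3, [(1, 4), (2, 7), (3, 2)])]
# read as: of the documents with 3 annotations, 4 were labelled unanimously,
# 7 received two distinct labels and 2 received three.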

# upper limit for the number of annotations per document
# rlt = rlt[:2]

numOfAnno, x = zip(*rlt)
maxDiff = max(i[0] for l in x for i in l)

fig = plt.figure()

index = np.arange(len(rlt))
Example no. 3
    kappa = (Pd - Pe) / (1 - Pe)
    return kappa
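

# For reference, a minimal self-contained sketch of the full Fleiss' kappa
# computation that the final step above belongs to (illustrative name and
# layout, not the project's own): `matrix` has one row per document and one
# column per label, each cell counting how many of the `annotators` raters
# chose that label.
def fleissKappaSketch(matrix, annotators):
    N = len(matrix)                            # number of documents
    total = float(N * annotators)              # total number of ratings
    # proportion of all ratings that fell into each label
    pj = [sum(row[j] for row in matrix) / total
          for j in range(len(matrix[0]))]
    # observed agreement within each document
    Pi = [(sum(c * c for c in row) - annotators)
          / float(annotators * (annotators - 1)) for row in matrix]
    Pd = sum(Pi) / N                           # mean observed agreement
    Pe = sum(p * p for p in pj)                # agreement expected by chance
    return (Pd - Pe) / (1 - Pe)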


def duplicates(lst):
    return [(item, count) for item, count in collections.Counter(lst).items()
            if count > 1]


A = Annotation('data/exportMedium.json')

maxAnno = 3

labelsPerDocDist = [
    FreqDist(map(lambda x: x['labels'][0], pD[0]))
    for pD in A.perDocument(3, ['annotations']) if len(pD[0]) == maxAnno
]

labels = ['Neg', 'Neut', 'Pos', 'No Sent', 'Undecided', 'Irrelevant']

matrix = np.array([
    map(lambda l: fd[l] if fd.get(l) else 0, labels) for fd in labelsPerDocDist
])

fKappa = fleissKappa(matrix, maxAnno)
print " Fleiss' Kappa: {}".format(fKappa)

path = '/home/kai/Dropbox/MA/thesis/const/fleissKappa.tex'
with open(path, 'w+') as file:
    file.write(str(round(fKappa * 1000) / 1000))
Example no. 4
from Annotation import Annotation as Annotation
from operator import itemgetter
from itertools import groupby

import matplotlib.pyplot as plt
import numpy as np

import seaborn as sns; sns.set(color_codes=True, style="whitegrid")

def save(name, value):
    path = '/home/kai/Dropbox/MA/thesis/const/{}.tex'.format(name)
    with open(path, 'w+') as file:
        file.write(value)


A = Annotation('data/exportMedium.json')

annotations = A.ofAll(['labels', 'proposalFlag'])

perDocument = A.perDocument(3, ['dateTime'])

save('numberOfAnnotations', str(len(annotations)))
#

save('numberOfDocuments', str(len(perDocument)))
#

flags = dict((k, list(v)) for k, v in
             groupby(sorted(annotations, key=itemgetter(1)), key=itemgetter(1)))

save('numberOfProposals', str(len(flags['proposal'])))
save('numberOfNoProposals', str(len(flags['no proposal'])))
save('numberOfWrongProposals', str(len(flags['wrong proposal'])))
Example no. 5

def groupAnnotatorAgreement(allAnnotations):
    numDiffLabels = [(len(set(map(lambda a: a['labels'][0],
                                  annotations))), annotations)
                     for annotations in allAnnotations]
    k = itemgetter(0)
    return dict((k, map(itemgetter(1), v))
                for k, v in groupby(sorted(numDiffLabels, key=k), key=k))
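
# Hypothetical shape of the result (data invented): keys are the number of
# distinct labels the annotators produced for a document, values collect the
# annotation groups with that level of agreement, e.g.
# {1: [fully agreeing groups, ...], 2: [groups with two labels, ...], 3: [...]}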


A = Annotation('data/exportMedium.json')

sortedAnnotations = [
    sortByTime(annotations[0])
    for annotations in A.perDocument(3, ['annotations'])
    if len(annotations[0]) == 3
]

lastPropAnnotations = filter(lastHasProposal, sortedAnnotations)

annotatorAgreement = groupAnnotatorAgreement(lastPropAnnotations)

print """
The goal is to predict the third annotation based on the two
previously seen annotations. This might enable to reduce the number of
annotations from 3 to 2.

In case all annotators agree, this might indicate that the annotation
for this document is obvious. Let's say we want to reduce the number
of annotations from 3 to 2. Then, if the first two annotators agree
Example no. 6
from Annotation import Annotation as Annotation
from collections import Counter
from itertools import chain
from nltk import FreqDist

import matplotlib.pyplot as plt
import numpy as np
import math

import seaborn as sns; sns.set(color_codes=True, style="whitegrid")

def autolabel(rect, label):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2., 1.05*height,
            label,
            ha='center', va='bottom')
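
# Hypothetical usage once the bars exist (ax and the bar rects are created
# further down, outside this excerpt):
#     for rect, n in zip(ax.bar(index, annoCount), annoCount):
#         autolabel(rect, str(n))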


A = Annotation('data/exportMedium.json')

annoPerDocument = map(lambda annos: map(lambda a: a['labels'][0], annos[0]),
                      A.perDocument(3, ['annotations']) )

keys = [u'Neg', u'Neut', u'Pos', u'No Sent', u'Undecided',  u'Irrelevant']

annotations = list(chain(*annoPerDocument))

annoDist = FreqDist(annotations)

labelDist = FreqDist(map(lambda a: Counter(a).most_common()[0][0]
                          if len(set(a))<3 else None,
                         annoPerDocument))
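# Majority vote per document, illustrated on made-up label lists:
#   ['Pos', 'Pos', 'Neg']  -> 'Pos'  (fewer than 3 distinct labels: most common wins)
#   ['Pos', 'Neg', 'Neut'] -> None   (3 distinct labels: no document label assigned)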

annoCount = np.array(map(lambda k: annoDist[k], keys))
labelCount = np.array(map(lambda k: labelDist[k], keys))

index = np.arange(len(keys))
Example no. 7
from __future__ import division
from Annotation import Annotation as Annotation
from operator import itemgetter
from itertools import groupby

A = Annotation('data/exportMedium.json')

totalByAggrement = [
    (len(annotations), len(set(map(lambda a: a['labels'][0], annotations))),
     '\t'.join([document] + map(lambda a: a['labels'][0], annotations)))
    for document, annotations in A.perDocument(2, ['document', 'annotations'])
]

rlt = dict(
    (length,
     dict((num, map(itemgetter(2), v2))
          for num, v2 in groupby(v1, key=itemgetter(1))))
    for length, v1 in groupby(sorted(totalByAggrement), key=itemgetter(0)))

c = [rlt[3][1][:20], rlt[3][2][:20], rlt[3][3][:20]]

for i in range(1, 4):
    with open(str(i) + '.tsv', 'w') as f:
        f.write('\n'.join(c[i - 1]).encode('utf-8'))
Example no. 8
from Annotation import Annotation as Annotation
from operator import itemgetter
from itertools import groupby

import matplotlib.pyplot as plt
import numpy as np

A = Annotation('data/exportMedium.json')

numAnnos = 3

threeAnnoPerDoc = filter(
    lambda d: len(d) == numAnnos,
    map(
        lambda annos: map(
            lambda a:
            (a['labels'][0], a['proposalFlag'], a['user']), annos[0]),
        A.perDocument(numAnnos, ['annotations'])))


def wrongProposal(pD):
    # after sorting by proposalFlag ('no proposal' < 'proposal' < 'wrong proposal'),
    # this keeps documents where exactly one annotation, necessarily the last,
    # is flagged 'wrong proposal'
    return (pD[0][1] != 'wrong proposal'
            and pD[1][1] != 'wrong proposal'
            and pD[2][1] == 'wrong proposal')


wP = filter(wrongProposal,
            map(lambda pD: sorted(pD, key=itemgetter(1)), threeAnnoPerDoc))

# def match(pD):
#     match = []
#     annotations = pD[0]
#     for annotation in annotations: