Example #1
def generateBayes(w, data, senseLocation, tokenLocation, contextSize=2):
    allSenses = indexBy(senseLocation, data)

    #get word bags: for each sense, the context windows around w
    wordBags = dict([(s, []) for s in allSenses])
    for each in allSenses:
        sentences = [x[tokenLocation] for x in allSenses[each]]
        sentences = filter(lambda x: w in x, sentences)
        contexts = [extract(prepSentence(x), contextSize, w, contextSize)
                for x in sentences]
        wordBags[each] = contexts

    #bind each word to its position relative to w
    # originally C(v_j, s_k) -- marking locations folds positional
    # information into every vocabulary item
    locWords = dict([(s, []) for s in allSenses])
    for each in wordBags:
        wbs = wordBags[each]
        if not wbs:
            #drop senses with no attested contexts for w
            del locWords[each]
            continue
        locWords[each] = markLocations(wbs)
        

    #first loop from the training algorithm
    #calculates P(v_j|s_k), represented in python as Pbayes[s_k][v_j]
    # ie, the bayesian probability, given sense k, of vocab item j
    rawWords = sum(locWords.values(), [])  #every located word, all senses
    words = histogram(rawWords)
    smooth = 0.5
    #loop variable is v, not w, so the Python 2 comprehension leak
    # can't clobber the target word
    Pbayes = dict([(s, dict([(v, smooth) for v in words])) for s in locWords])
    for s_k in locWords:
        localCounts = histogram(locWords[s_k])
        for each in localCounts:
            Pbayes[s_k][each] += localCounts[each]
            Pbayes[s_k][each] /= words[each]
        #words never seen with s_k keep the raw smoothing constant
    
    #prior P(s_k): each sense's share of the training instances
    Psense = {}
    totalInstances = sum(len(locWords[s]) for s in locWords)
    for s_k in locWords:
        Psense[s_k] = float(len(locWords[s_k])) / totalInstances

    return lambda c: bayesDisambiguator(Pbayes, Psense, smooth, c)
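
The bayesDisambiguator that the returned closure wraps isn't shown in this example. A minimal sketch of what it plausibly computes, assuming contexts arrive as the same position-marked word lists that markLocations produces, and that smooth doubles as the fallback probability for unseen words:

import math

def bayesDisambiguator(Pbayes, Psense, smooth, context):
    #hypothetical reconstruction: score each sense as
    # log P(s_k) + sum_j log P(v_j|s_k), falling back to `smooth`
    # for context words never seen in training
    best, bestScore = None, None
    for s_k in Pbayes:
        score = math.log(Psense[s_k])
        for v_j in context:
            score += math.log(Pbayes[s_k].get(v_j, smooth))
        if bestScore is None or score > bestScore:
            best, bestScore = s_k, score
    return best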
Example #2
def confMatrix(data, parm, target):
    #for each corpus position, map each record's parm value to the
    # target value it carries
    dataPairs = []
    dataByPos = indexBy("corpus_pos", data)
    for each in dataByPos:
        dataPairs.append(dict(map(lambda d: (d[parm], d[target]),
                dataByPos[each])))

    #grab the possible values for parm; sort so that row/column
    # assignment doesn't depend on Python 2's arbitrary dict ordering
    parms = sorted(dataPairs[0])
    if len(parms) != 2:
        raise ValueError("parm must take exactly two values")
    
    #grab all target values, deduplicated and sorted
    targetValues = sorted(set(map(lambda d: d[target], data)))

    #total up: matrix[(a, b)] counts the positions labeled a under the
    # first parm value and b under the second
    matrix = dict([((v, w), 0) for v in targetValues for w in targetValues])
    for each in dataPairs:
        vals = tuple(map(lambda p: each[p], parms))
        matrix[vals] += 1
    
    return matrix
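
A quick smoke test for confMatrix with hand-rolled records; the field names corpus_pos, annotator, and value mirror Example #3, and indexBy is assumed to group records by the named field, as in Example #1:

#toy input: two annotators, three shared items
toy = [
    {"corpus_pos": 1, "annotator": "a1", "value": "yes"},
    {"corpus_pos": 1, "annotator": "a2", "value": "yes"},
    {"corpus_pos": 2, "annotator": "a1", "value": "no"},
    {"corpus_pos": 2, "annotator": "a2", "value": "yes"},
    {"corpos_pos": 3, "annotator": "a1", "value": "no"}
        if False else {"corpus_pos": 3, "annotator": "a1", "value": "no"},
    {"corpus_pos": 3, "annotator": "a2", "value": "no"},
]
#expected: ('yes','yes'): 1, ('no','yes'): 1, ('no','no'): 1, ('yes','no'): 0
print confMatrix(toy, "annotator", "value")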
Example #3
###################

data = loadTurkData(["partyNN.csv"])

#strip angle brackets, quotes, and stray whitespace from tokens
cleanData = []
for each in data:
    each["token"] = each["token"].strip("<> \'\"")
    cleanData.append(each)

#Fleiss' kappa across the whole annotator pool
pivot = pivotize("corpus_pos", "value", cleanData)
print fKappa(pivot)
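
fKappa itself is defined elsewhere in the project. A rough sketch of Fleiss' kappa, under the assumption (not confirmed by the source) that pivotize returns one row per corpus position mapping each answer category to its rating count, with the same number of ratings n on every item:

def fKappa(pivot):
    #hypothetical sketch: `pivot` assumed to be a list of per-item
    # category-count dicts, n ratings per item
    cats = list(pivot[0])
    N = len(pivot)                      #number of items
    n = sum(pivot[0].values())          #ratings per item
    #mean per-item agreement: P_i = (sum_j n_ij^2 - n) / (n * (n - 1))
    pBar = sum(sum(row[c] ** 2 for c in cats) - n
               for row in pivot) / float(N * n * (n - 1))
    #chance agreement from the pooled category proportions
    pE = sum((sum(row[c] for row in pivot) / float(N * n)) ** 2 for c in cats)
    return (pBar - pE) / (1 - pE)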

################
#Cohen's kappas#
################
items = indexBy("corpus_pos", cleanData)

#find pairs of annotators and pool the items each pair judged
annotatorPairs = {}
for each in items:
    pair = tuple(sorted(map(lambda x: x["annotator"], items[each])))
    #copy via concatenation rather than aliasing items[each] directly,
    # so the lists inside items are never mutated
    annotatorPairs[pair] = annotatorPairs.get(pair, []) + items[each]

confusionMatrices = dict(map(lambda p: (p, confMatrix(annotatorPairs[p],
        "annotator", "value")), annotatorPairs))

#bind the results to a fresh name so the cKappa function isn't shadowed
cKappas = [(p, cKappa(confusionMatrices[p])) for p in confusionMatrices]
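
cKappa is likewise defined elsewhere. A minimal sketch of Cohen's kappa over the (value, value)-keyed count dict that confMatrix returns:

def cKappa(matrix):
    #hypothetical sketch: matrix maps (first annotator's value,
    # second annotator's value) -> count
    labels = sorted(set(k[0] for k in matrix))
    total = float(sum(matrix.values()))
    #observed agreement: mass on the diagonal
    pO = sum(matrix[(v, v)] for v in labels) / total
    #expected agreement: product of the two annotators' marginals
    pE = 0.0
    for v in labels:
        rowMarginal = sum(matrix[(v, u)] for u in labels) / total
        colMarginal = sum(matrix[(u, v)] for u in labels) / total
        pE += rowMarginal * colMarginal
    return (pO - pE) / (1 - pE)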