Exemplo n.º 1
def _first_noun_synsets(words):
    """Return the first WordNet noun synset of every word that has one.

    Each word is passed through ``sw.sub`` before the lookup; words with no
    noun synset are skipped.
    """
    synsets = []
    for word in words:
        # Look the word up once instead of once for the filter and once
        # for the value.
        candidates = wn.synsets(sw.sub(word), wn.NOUN)
        if candidates:
            synsets.append(candidates[0])
    return synsets


def average_wup(fromWords, toWords):
    """Return the average Wu-Palmer similarity between two word lists.

    Every word is mapped to its first WordNet noun synset (words without one
    are ignored).  For each source synset the similarity to all target
    synsets is averaged, and the result is the mean of those per-source
    averages.

    Args:
        fromWords: iterable of source words.
        toWords: iterable of target words.

    Returns:
        The mean similarity, or 0 when either list yields no noun synsets
        or when the accumulated similarity is zero.
    """
    fromSynsets = _first_noun_synsets(fromWords)
    toSynsets = _first_noun_synsets(toWords)
    # Nothing to compare: the original guard for this case sat inside the
    # inner loop and never fired, so an empty target list divided by zero.
    if not fromSynsets or not toSynsets:
        return 0

    total_wup = 0
    for fromSynset in fromSynsets:
        total_from = 0
        for toSynset in toSynsets:
            similarity = fromSynset.wup_similarity(toSynset)
            # wup_similarity may return None when no common ancestor is
            # found; count that pair as zero similarity instead of failing.
            total_from += similarity or 0
        total_wup += total_from / len(toSynsets)

    if total_wup == 0:
        return 0
    return total_wup / len(fromSynsets)
Exemplo n.º 2
def classify(tag, labels = ['SPATIAL_ENTITY', 'PLACE', 'PATH']):
    """Score the tag's text against every candidate label and pick the best.

    The tag's ``attrib['text']`` is split on whitespace and each token is
    passed through ``sw.sub``.  Every label is then scored by calling
    ``ws.avg_wup`` with those tokens and ``LABEL_PHRASES[label]``.

    Returns:
        The largest ``(score, label)`` tuple; equal scores are ordered by
        the label string.  ``max`` raises ``ValueError`` if ``labels`` is
        empty.
    """
    tokens = []
    for raw_word in tag.attrib['text'].split():
        tokens.append(sw.sub(raw_word))

    scored = []
    for label in labels:
        scored.append((ws.avg_wup(tokens, LABEL_PHRASES[label]), label))
    return max(scored)
Exemplo n.º 3
# -*- coding: utf-8 -*-

"""Code to classify an SPRL tag as an ISO-Space one."""

import re
import os
import tagdoc as td
import wordsimilarity as ws
import stopwords as sw
from nltk.corpus import wordnet as wn

# The tagDict attribute of a td.TagDir built from td.ISO_GOLD_DIR.  The code
# below indexes it by key and iterates the value, reading attrib['text'] from
# each item.
LABEL_DICT = td.TagDir(td.ISO_GOLD_DIR).tagDict
# For every key of LABEL_DICT that is also in LABELS: the de-duplicated
# 'text' attribute values of that key's items (order follows set iteration).
# NOTE(review): LABELS is not defined or imported in the visible source --
# confirm where it is supposed to come from.
LABEL_TEXT = {key : list(set([w.attrib['text'] for w in LABEL_DICT[key]])) for key in LABEL_DICT.keys() if key in LABELS}
# For every key: each text is split on whitespace, each word is passed through
# sw.sub, and the per-text word lists are handed to td.flatten (presumably
# producing one flat word list per key -- verify against tagdoc).
LABEL_PHRASES = {key : td.flatten([[sw.sub(x) for x in w.split()] for w in LABEL_TEXT[key]]) for key in LABEL_TEXT.keys()}

# The 'LANDMARK' entry of a default-constructed td.TagDoc's tagDict.
lm = td.TagDoc().tagDict['LANDMARK']

# TODO: this does not return the same results as the original metric
# used in sprl_to_iso-space.py; debug it if possible (the old one was good!).
def classify(tag, labels = ['SPATIAL_ENTITY', 'PLACE', 'PATH']):
    """Return the best ``(score, label)`` pair for the text of ``tag``.

    The words of ``tag.attrib['text']`` (whitespace-separated) are each
    passed through ``sw.sub``; the resulting list is compared with
    ``LABEL_PHRASES[label]`` via ``ws.avg_wup`` for every label in
    ``labels``.

    Returns:
        The maximum ``(score, label)`` tuple; ties on score fall back to
        comparing the label strings.  An empty ``labels`` makes ``max``
        raise ``ValueError``.
    """
    words = tag.attrib['text'].split()
    cleaned_words = []
    for word in words:
        cleaned_words.append(sw.sub(word))

    candidates = []
    for label in labels:
        score = ws.avg_wup(cleaned_words, LABEL_PHRASES[label])
        candidates.append((score, label))
    return max(candidates)
Exemplo n.º 4
def extent2List(extent):
    """Split ``extent`` on whitespace and pass every token through ``sw.sub``.

    Returns:
        A list with one ``sw.sub`` result per whitespace-separated token,
        in the original order; an empty list when ``extent`` has no tokens.
    """
    substituted = []
    for token in extent.split():
        substituted.append(sw.sub(token))
    return substituted