Example #1
File: features.py  Project: fbuijs/educe
def read_corpus_inputs(args):
    """
    Read and filter the part of the corpus we want features for
    """
    reader = educe.stac.Reader(args.corpus)
    anno_files = reader.filter(reader.files(),
                               mk_is_interesting(args, args.single))
    corpus = reader.slurp(anno_files, verbose=True)

    if not args.ignore_cdus:
        strip_cdus(corpus, mode=args.strip_mode)
    postags = postag.read_tags(corpus, args.corpus)
    parses = corenlp.read_results(corpus, args.corpus)
    _fuse_corpus(corpus, postags)

    for lex in LEXICONS:
        lex.read(args.resources)
    pdtb_lex = read_pdtb_lexicon(args)
    inq_lex = {}  # _read_inquirer_lexicon(args)

    verbnet_entries = [
        VerbNetEntry(x, frozenset(vnet.lemmas(x))) for x in VERBNET_CLASSES
    ]

    return FeatureInput(corpus=corpus,
                        postags=postags,
                        parses=parses,
                        lexicons=LEXICONS,
                        pdtb_lex=pdtb_lex,
                        verbnet_entries=verbnet_entries,
                        inquirer_lex=inq_lex)
Example #2
def in_verblist(lem):
    """Return true if the given lemma is found in the verbnet verb list"""
    verblist = verbnet.lemmas()
    if lem in verblist:
        return True
    else:
        return False
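
A minimal usage sketch for the helper above, assuming NLTK and its verbnet corpus data are installed; caching the lemma list in a set keeps repeated membership checks cheap, since verbnet.lemmas() returns a plain list:

from nltk.corpus import verbnet

VERB_SET = frozenset(verbnet.lemmas())  # cache once instead of scanning the list per lookup

def in_verblist(lem):
    """Return True if the given lemma is found in the VerbNet verb list."""
    return lem in VERB_SET

print(in_verblist('abandon'))  # True with the stock NLTK VerbNet data (see example #6)
print(in_verblist('xyzzy'))    # False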
Example #3
File: features.py  Project: kowey/educe
def read_corpus_inputs(args):
    """
    Read and filter the part of the corpus we want features for
    """
    reader = educe.stac.Reader(args.corpus)
    anno_files = reader.filter(reader.files(),
                               mk_is_interesting(args, args.single))
    corpus = reader.slurp(anno_files, verbose=True)

    if not args.ignore_cdus:
        strip_cdus(corpus)
    postags = postag.read_tags(corpus, args.corpus)
    parses = corenlp.read_results(corpus, args.corpus)
    _fuse_corpus(corpus, postags)

    for lex in LEXICONS:
        lex.read(args.resources)
    pdtb_lex = read_pdtb_lexicon(args)
    inq_lex = {} #_read_inquirer_lexicon(args)

    verbnet_entries = [VerbNetEntry(x, frozenset(vnet.lemmas(x)))
                       for x in VERBNET_CLASSES]

    return FeatureInput(corpus=corpus,
                        postags=postags,
                        parses=parses,
                        lexicons=LEXICONS,
                        pdtb_lex=pdtb_lex,
                        verbnet_entries=verbnet_entries,
                        inquirer_lex=inq_lex)
Example #4
def in_verblist(lem):
    """Return true if the given lemma is found in the verbnet verb list"""
    verblist = verbnet.lemmas()
    if lem in verblist:
        return True
    else:
        return False
Example #5
    def filterOut(self, L):
        L = list(set(L))
        lemma = WordNetLemmatizer()
        # taking the verb lemmas of the words in L
        L = [lemma.lemmatize(i, pos='v') for i in L]
        # keeping the lemmas that are in VerbNet's verb list and are not stopwords
        L = [i for i in L if i in verbnet.lemmas() and i not in stopWords]
        L = list(set(L))
        return L
Example #6
    def test_corpus_verbnet_method_returns_correct_result(self):
        self.assertEqual(
            verbnet.classids('accept'),
            ['approve-77', 'characterize-29.2-1-1', 'obtain-13.5.2'])
        self.assertEqual(verbnet.longid('77'), 'approve-77')
        self.assertEqual(verbnet.lemmas()[0:10], [
            'December', 'FedEx', 'UPS', 'abandon', 'abase', 'abash', 'abate',
            'abbreviate', 'abduct', 'abet'
        ])
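
The assertions above pin down exact corpus contents, which depend on the VerbNet release bundled with NLTK; a looser sketch of the same checks, using only facts asserted in the test itself, should survive version changes:

from nltk.corpus import verbnet

assert 'approve-77' in verbnet.classids('accept')  # 'accept' belongs to the approve-77 class
assert verbnet.longid('77') == 'approve-77'        # a short id resolves to its long form
assert 'abandon' in verbnet.lemmas()               # one of the lemmas listed above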
Example #7
def verbs():
    """Return a list of verbs from verbnet that can be used to define a set of activities"""
    try_import('nltk')
    import nltk
    nltkdir = remkdir(os.path.join(
        os.environ['VIPY_CACHE'],
        'nltk')) if 'VIPY_CACHE' in os.environ else tempfile.gettempdir()
    os.environ['NLTK_DATA'] = nltkdir
    print('[vipy.annotation.verbs]: Downloading verbnet to "%s"' %
          tempfile.gettempdir())
    nltk.download('verbnet', tempfile.gettempdir())
    from nltk.corpus import verbnet
    return verbnet.lemmas()
Example #8
    def filterOutVerbs(self, A):
        '''
            Return the words that:
            1 - are found in VerbNet's verb list
            2 - are not stopwords
        '''
        L = A[:]
        lemma = WordNetLemmatizer()
        L = [lemma.lemmatize(i, pos='v') for i in L]
        L = [i for i in L if i in verbnet.lemmas() and i not in stopWords]
        L = list(set(L))
        return L
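
Both filterOut (example #5) and filterOutVerbs above call verbnet.lemmas() inside the list comprehension, so the lemma list is rebuilt for every word. A minimal sketch of the same filter, assuming the same WordNetLemmatizer and stopWords names, that hoists the lookup into a set first:

from nltk.corpus import verbnet
from nltk.stem import WordNetLemmatizer

def filter_out_verbs(words, stop_words):
    """Keep lemmatized verbs that appear in VerbNet and are not stopwords (sketch)."""
    lemmatizer = WordNetLemmatizer()
    verb_set = set(verbnet.lemmas())  # build the lookup set once
    lemmas = (lemmatizer.lemmatize(w, pos='v') for w in words)
    return list({w for w in lemmas if w in verb_set and w not in stop_words})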
Example #9
    def get_all_infinitives():
        import docassemble_pattern.en as pattern

        import nltk
        from nltk.corpus import wordnet as wn
        try:
            wn.all_synsets
        except LookupError:
            print("Downloading wordnet...")
            nltk.download("wordnet")

        from nltk.corpus import verbnet
        try:
            verbnet.lemmas
        except LookupError:
            print("Downloading verbnet...")
            nltk.download("verbnet")

        # from pattern - 8.5K
        all_pattern_verbs = []
        """
        The original pattern package has bugs in its verbs.infinitives class.
        For this reason, the annotation pipeline did not use its verb seed.
        Here pattern is installed from the docassemble_pattern project, so the bug is fixed.
        If you want consistency with the original QANom dataset, leave the next line commented out.
        If you only care about better coverage, uncomment the following line:
        """
        all_pattern_verbs = list(pattern.verbs.infinitives)

        # from wordnet - 8.7K verbs
        all_wn_verbs = sorted(
            set(l.name() for v_syn in wn.all_synsets(pos="v")
                for l in v_syn.lemmas() if "_" not in l.name()))
        # from verbnet - 3.6K verbs
        all_verbnet_verbs = list(verbnet.lemmas())
        # together - 10.6K verbs
        infinitives = sorted(
            set(all_wn_verbs + all_pattern_verbs + all_verbnet_verbs))
        return infinitives
Example #10
    def get_all(self):
        '''
            This function returns the intent, action_on and attributes of the sentence
        '''

        chunkGramForNounVerb = r'''NV: {<VB.?>+<.*>*<NN.?>+ | <NN.?>+<.*>*<VB.?>+}  #chunking noun verbs
                            }<[^(VB|NN)].*|VBZ|>{       #chinking all except POS starting with VB or NN and VBZ'''

        chunkGramForOnlyVerb = r'''VR: {<VB.?>}
                            }<VBP|VBZ>+{'''

        tree = self.get_POS_tree(chunkGramForNounVerb)
        obj = []
        series = []
        L = []

        for subtree in tree.subtrees(filter=lambda t: t.label() == 'NV'):
            if len(subtree.leaves()) > 1:
                series = subtree.leaves()
            obj = obj + [s for s in subtree.leaves()]

        # the code below runs if the tree produced noun-verb pairs as subtrees
        if len(series) and (self.intent is False or self.action_upon is False):
            #tree.draw()
            verbs = [w for w, pos in series if re.match(r'VB.?', pos) is not None]
            nouns = [w for w, pos in series if re.match(r'NN.?', pos) is not None]
            if len(verbs):
                L = self.filterOutVerbs(verbs)
                #discarding the unwanted verbs
                L = [i for i in L if i not in discardedVerbs]
                if len(L): self.intent = L

            if len(nouns):
                lemma = WordNetLemmatizer()
                # considering those nouns which do not act as a verb
                L = [i for i in nouns if i not in stopWords and lemma.lemmatize(i, pos='v') not in verbnet.lemmas()]
                if len(L): self.action_upon = L

        # the code below runs if intent or action_upon has not been found yet
        if len(obj) and (self.intent is False or self.action_upon is False):
            #tree.draw()
            verbs = [w for w, pos in obj if re.match(r'VB.?', pos) is not None]
            nouns = [w for w, pos in obj if re.match(r'NN.?', pos) is not None]
            if len(verbs) and self.intent is False:
                L = self.filterOutVerbs(verbs)
                #again removing the unwanted verbs
                L = [i for i in L if i not in discardedVerbs]
                if len(L): self.intent = L

            if len(nouns) and self.intent is False:
                #filtering out nouns which are verbs
                L = self.filterOutVerbs(nouns)
                if len(L):
                    # add nouns to discardedNouns that are verbs and will be set as intent
                    for i in L: self.discardedNouns.append(i)
                    self.intent = L

            if len(nouns) and self.action_upon is False:
                self.action_upon = nouns

        # if no intent is found yet then below
        if self.intent is False:
            print('3')
            tree = self.get_POS_tree(chunkGramForOnlyVerb)
            #tree.draw()
            verbs = []
            verbs.clear()
            for subtree in tree.subtrees(filter=lambda t: t.label() == 'VR'):
                verbs = verbs + [s for s in subtree.leaves()]

            if len(verbs):
                L = [w for w, pos in verbs]
                L = self.filterOutVerbs(L)
                if len(L):
                    self.intent = L

        if self.action_upon is not False:
            #again discarding nouns which are present as intent
            self.action_upon = [i for i in self.action_upon if i not in self.discardedNouns]

        L = self.get_attributes()

        if len(L):
            self.attributes = L

        if self.intent is False or len(self.intent) == 0:
            self.intent = ['No Intent Found']

        if self.action_upon is False:
            self.action_upon = ['No Action Found']
        if self.attributes is False:
            self.attributes = ['No Attribute(s) Found']

        return {
            'intent': ' , '.join(self.intent),
            'action_on': ' , '.join(self.action_upon),
            'attributes': ' , '.join(self.attributes)
        }
Example #11
all_pattern_verbs = []
"""
The original pattern package has bugs in its verbs.infinitives class.
For this reason, the annotation pipeline did not use its verb seed.
Here pattern is installed from git, so the bug is fixed.
If you want consistency with the original QANom dataset, leave the next line commented out.
If you only care about better coverage, uncomment the following line:
"""
# all_pattern_verbs = list(pattern.verbs.infinitives)

# from wordnet - 8.7K verbs
all_wn_verbs = sorted(
    set(l.name() for v_syn in wn.all_synsets(pos="v") for l in v_syn.lemmas()
        if "_" not in l.name()))
# from verbnet - 3.6K verbs
all_verbnet_verbs = list(verbnet.lemmas())
# together - 10.6K verbs
infinitives = sorted(set(all_wn_verbs + all_pattern_verbs + all_verbnet_verbs))


def as_gerund(verb):
    try:  # pattern's lazy-loaders have a bug (first-time failure)
        v_prog = pattern.conjugate(verb, aspect=pattern.PROGRESSIVE)
    except:
        v_prog = pattern.conjugate(verb, aspect=pattern.PROGRESSIVE)

    return v_prog
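
A quick check of the helper above, assuming docassemble_pattern.en is importable as pattern (as in the imports of example #9):

print(as_gerund('give'))  # expected 'giving' once pattern's conjugation tables have loaded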


class SuffixBasedNominalizationCandidates:
    def __init__(self, verb_seed=infinitives):
Example #12
generalThing = datum.thing
verbnetRoot = generalThing.get("verbnet")
wordnetRoot = generalThing.find("wordnet")
class_ = verbnetRoot.get("class")
verbclassID = verbnetRoot.get("verb class id")
verbroot = verbnetRoot.get("verbroot")
example = verbnetRoot.get("example")
semantics = verbnetRoot.get("semantics")
syntax = verbnetRoot.get("syntax")
verbclass_ = verbnetRoot.get("verb class")
description = verbnetRoot.get("description")
semanticsArguments = verbnetRoot.get("semantics argument")
syntaxArguments = verbnetRoot.get("syntax argument")
syntaxFramesKatum = verbnetRoot.get("syntactic argument")
semanticsFramesKatum = verbnetRoot.get("semantics predicate")
predicateValue = verbnetRoot.get("predicate value")
themroles = verbnetRoot.get("thematic role")
roleType = verbnetRoot.get("role")
listOfAllLemmas = vn.lemmas()
uniqueClassIDs = []
for lemma in listOfAllLemmas:
    uniqueClassIDs.extend(vn.classids(lemma))
uniqueClassIDs = list(set(uniqueClassIDs))
processClassID(uniqueClassIDs)
for v in vn.lemmas():
    verbRootInstance = verbroot.get(v)
    for verbclass in vn.classids(v):
        verbRootInstance._is(classToKatumDict[verbclass], False)

generalThing.save('wordnet-verbnet.datum')
Example #13
    if loops == 0:
        sent = sent.replace("loops", "")
    elif loops == 1:
        sent = sent.replace("loops", "loops-1")

    sent = sent.replace("object", "relation")
    return " ".join(sent.split())


grammar = {
    # should be pluralizable
    # should be able to say "<relation-singular> of the guy"
    "relation-singular": relations,
    "relation": ["relation-singular", "relation-plural"],
    # should be able to say "they <verb-plural> me"
    "verb-plural": vn.lemmas("admire-31.2") + vn.lemmas("amuse-31.1"),
    "verb": ["verb-singular", "verb-plural"],
    "time": time,
    "beginning": ["time the", "the"],
    "loops": ["", "relation of the loops"],
    "loops-1": ["relation of the loops"],
}

grammar["relation-plural"] = [
    pluralize(relation) for relation in grammar["relation-singular"]
]
grammar["verb-singular"] = [pluralize(verb) for verb in grammar["verb-plural"]]


def generate(tpl):
    toks = []
    maxCount = 0
Example #14
avg = total / l
# print bagOfWords["elizabeth"]
# print avg

allverbs = []

# Creating training set
fr = open(extfile, 'r')
for line in fr:
    token = line.strip("\n")
    extList[token] = avg
    words = verbnet.classids(token)
    for w in words:
        finalWord = w.decode("UTF-8", "ignore")
        allverbs += verbnet.lemmas(finalWord)

for v in allverbs:
    extList[v] = avg / 2
# print len(extList)

allverbs = []

fr = open(intfile, 'r')
for line in fr:
    token = line.strip("\n")
    intList[token] = avg
    words = verbnet.classids(token)
    for w in words:
        finalWord = w.decode("UTF-8", "ignore")
        allverbs += verbnet.lemmas(finalWord)
Example #15
#!/usr/bin/python
# -*- coding: utf-8 -*-

from nltk.corpus import verbnet
verbnet.lemmas()[20:25]
verbnet.classids()[:5]
verbnet.classids('accept')
verbnet.vnclass('remove-10.1')  # doctest: +ELLIPSIS
verbnet.vnclass('10.1')  # doctest: +ELLIPSIS
vn_31_2 = verbnet.vnclass('admire-31.2')
for themrole in vn_31_2.findall('THEMROLES/THEMROLE'):
    print(themrole.attrib['type'])
    for selrestr in themrole.findall('SELRESTRS/SELRESTR'):
        print('[%(Value)s%(type)s]' % selrestr.attrib)
    print()

print(verbnet.pprint('57'))
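
The walkthrough above assumes the VerbNet corpus data is already installed; if it is missing, NLTK raises a LookupError. A minimal setup sketch to run before the snippet:

import nltk

nltk.download('verbnet')  # fetch the VerbNet corpus into the default NLTK data directory

from nltk.corpus import verbnet
print(len(verbnet.lemmas()))  # roughly 3.6K lemmas, per the counts quoted in examples #9 and #11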
Example #16
print(twitter_samples.tokenized('tweets.20150430-223406.json'))
# rte
# nltk.download('rte')
print(rte.fileids())  # doctest: +ELLIPSIS
rtepairs = rte.pairs(['rte2_test.xml', 'rte3_test.xml'])
print(rtepairs)  # doctest: +ELLIPSIS
print(rtepairs[5])
print(rtepairs[5].text)  # doctest: +NORMALIZE_WHITESPACE
print(rtepairs[5].hyp)
print(rtepairs[5].value)
xmltree = rte.xml('rte3_dev.xml')
print(xmltree)  # doctest: +SKIP
print(xmltree[7].findtext('t'))  # doctest: +NORMALIZE_WHITESPACE
# verbnet
# nltk.download('verbnet')
print(verbnet.lemmas()[20:25])
print(verbnet.classids()[:5])
print(verbnet.classids('accept'))
print(verbnet.vnclass('remove-10.1'))  # doctest: +ELLIPSIS
print(verbnet.vnclass('10.1'))  # doctest: +ELLIPSIS
vn_31_2 = verbnet.vnclass('admire-31.2')
for themrole in vn_31_2.findall('THEMROLES/THEMROLE'):
    print(themrole.attrib['type'])
    for selrestr in themrole.findall('SELRESTRS/SELRESTR'):
        print('[%(Value)s%(type)s]' % selrestr.attrib)
    print()
print(verbnet.pprint('57'))
# nps_chat
# nltk.download('nps_chat')
print(nltk.corpus.nps_chat.words())
print(nltk.corpus.nps_chat.tagged_words())