示例#1
0
 def __get_folia_doc__(self, tokens):
     """Build a FoLiA document from an iterable of ``(token, pos)`` pairs.

     A ``folia.Document`` with id ``'nltk-sentence'`` is created and a
     ``folia.Text`` element is added to it. For every pair, a ``folia.Word``
     holding the token is added to that text element, and a
     ``folia.PosAnnotation`` in set ``'custom'`` with the PoS tag as its
     class is attached to the word.

     :param tokens: iterable of 2-tuples ``(token, pos)``.
     :return: the populated ``folia.Document``.
     """
     document = folia.Document(id='nltk-sentence')
     # NOTE(review): relies on add() returning the newly added element.
     text_node = document.add(folia.Text)
     for pair in tokens:
         token, pos_tag = pair
         word_node = text_node.add(folia.Word, token)
         annotation = folia.PosAnnotation(None, set='custom', cls=pos_tag)
         word_node.add(annotation)
     return document
示例#2
0
def parse_cgn_postag(rawtag, raisefeatureexceptions=False):
    """Decode a CGN PoS tag into a ``folia.PosAnnotation``.

    A tag looks like ``"N(soort,ev,basis,onz,stan)"``: a head followed by a
    parenthesised, comma-separated list of feature values. The head is stored
    as a ``folia.Feature`` with subset ``'head'``; every non-empty feature
    value is looked up in the module-level ``subsets`` mapping
    (subset name -> collection of feature values) and stored as a
    ``folia.Feature`` under the first subset that contains it. If a subset
    also appears in the module-level ``constraints`` mapping
    (subset name -> collection of allowed heads), it is only used when the
    tag's head is listed there.

    Based on the CGN tag overview compiled by Matje van de Camp.

    :param rawtag: the raw CGN tag string.
    :param raisefeatureexceptions: when true, an unknown feature value raises
        ``InvalidFeatureException``; otherwise it is reported on stderr and
        skipped.
    :return: the ``folia.PosAnnotation`` carrying the head and features.
    :raises InvalidTagException: if ``rawtag`` does not have the form
        ``head(...)`` with a non-empty head (this includes the empty string).
    :raises InvalidFeatureException: for an unknown feature value when
        ``raisefeatureexceptions`` is true.
    """
    begin = rawtag.find('(')
    # endswith() instead of rawtag[-1]: an empty string must be rejected as an
    # invalid tag rather than blowing up with an IndexError.
    if begin <= 0 or not rawtag.endswith(')'):
        raise InvalidTagException("Not a valid CGN tag")

    tag = folia.PosAnnotation(None,
                              cls=rawtag,
                              set='http://ilk.uvt.nl/folia/sets/cgn')

    head = rawtag[:begin]
    tag.append(folia.Feature, subset='head', cls=head)

    for rawfeature in rawtag[begin + 1:-1].split(','):
        if not rawfeature:
            # Empty slots (e.g. "N()" or a doubled comma) carry no feature.
            continue
        for subset, classes in subsets.items():
            if rawfeature not in classes:
                continue
            if subset in constraints and head not in constraints[subset]:
                # Constraint not met: the same value may belong to another
                # subset, so keep searching.
                continue
            tag.append(folia.Feature, subset=subset, cls=rawfeature)
            break
        else:
            # No subset accepted this feature value.
            print("\t\tUnknown feature value: " + rawfeature + " in " +
                  rawtag,
                  file=stderr)
            if raisefeatureexceptions:
                raise InvalidFeatureException(
                    "Unknown feature value: " + rawfeature + " in " +
                    rawtag)
    return tag
示例#3
0
 for s in p.sentences():
     found_w = False
     for w in s.words():
         found_w = True
     found_s = True
     if found_w:
         #pass tokenised sentence
         words = s.words()
         response = frogclient.process(" ".join([unicode(w) for w in words]))
         for i, (word, lemma, morph, pos) in enumerate(response):
             if legacy: legacyout(i,word,lemma,morph,pos)                    
             if unicode(words[i]) == word:
                 if lemma:
                     words[i].append( folia.LemmaAnnotation(foliadoc, cls=lemma) )
                 if pos:
                     words[i].append( folia.PosAnnotation(foliadoc, cls=pos) )  
             else:
                 print >>sys.stderr,"WARNING: Out of sync after calling Frog! ", i, word
             
     else:
         #pass untokenised sentence
         try:
             sentext = s.text()
         except folia.NoSuchText:
             continue
         response = frogclient.process(sentext)
         for i, (word, lemma, morph, pos) in enumerate(response):
             if legacy: legacyout(i,word,lemma,morph,pos)                             
             if word:
                 w = folia.Word(foliadoc, text=word, generate_id_in=s)                                                
                 if lemma: