def __get_folia_doc__(self, tokens):
    """Wrap PoS-tagged tokens in a new FoLiA document.

    A ``folia.Document`` with id ``nltk-sentence`` is created and given a
    single ``folia.Text`` element. Every token becomes a ``folia.Word``
    inside that text element, carrying one ``folia.PosAnnotation`` in the
    ``custom`` set whose class is the token's tag.

    Args:
        tokens: Iterable of ``(token, pos)`` pairs.

    Returns:
        The newly built ``folia.Document``.
    """
    document = folia.Document(id='nltk-sentence')
    text_element = document.add(folia.Text)

    for token_text, pos_tag in tokens:
        # Words are attached directly to the text element; the annotation is
        # built without a document reference (first argument None).
        word_element = text_element.add(folia.Word, token_text)
        pos_annotation = folia.PosAnnotation(None, set='custom', cls=pos_tag)
        word_element.add(pos_annotation)

    return document
def parse_cgn_postag(rawtag, raisefeatureexceptions=False):
    """Decode a CGN PoS tag into a ``folia.PosAnnotation`` with features.

    Decodes PoS features like "N(soort,ev,basis,onz,stan)" into a
    PosAnnotation data structure, based on the CGN tag overview compiled by
    Matje van de Camp.

    The part before the opening parenthesis is stored as a ``head`` feature.
    Each non-empty comma-separated value between the parentheses is looked up
    in the module-level ``subsets`` mapping (subset name -> classes). If the
    module-level ``constraints`` mapping has an entry for a matching subset,
    the head must be listed in that entry for the match to count; otherwise
    the search continues with the remaining subsets. The first accepted
    subset wins.

    Args:
        rawtag: The raw tag string, e.g. ``N(soort,ev,basis,onz,stan)``.
        raisefeatureexceptions: An unknown feature value is always reported
            on ``stderr``; when this flag is true it additionally raises
            ``InvalidFeatureException`` instead of being skipped.

    Returns:
        The populated ``folia.PosAnnotation`` (set
        ``http://ilk.uvt.nl/folia/sets/cgn``, class ``rawtag``).

    Raises:
        InvalidTagException: If ``rawtag`` does not have the form
            ``HEAD(...)`` with a non-empty head; this includes the empty
            string.
        InvalidFeatureException: On an unknown feature value when
            ``raisefeatureexceptions`` is true.
    """
    begin = rawtag.find('(')
    # endswith() rather than rawtag[-1]: an empty tag must be rejected with
    # InvalidTagException, not crash with IndexError.
    if begin <= 0 or not rawtag.endswith(')'):
        raise InvalidTagException("Not a valid CGN tag")

    tag = folia.PosAnnotation(None, cls=rawtag, set='http://ilk.uvt.nl/folia/sets/cgn')

    head = rawtag[:begin]
    tag.append(folia.Feature, subset='head', cls=head)

    for rawfeature in rawtag[begin + 1:-1].split(','):
        if not rawfeature:
            # Empty slots such as "N(soort,,ev)" or "N()" carry no feature.
            continue

        for subset, classes in subsets.items():
            if rawfeature not in classes:
                continue
            if subset in constraints and head not in constraints[subset]:
                continue  # constraint not met, try the next subset
            tag.append(folia.Feature, subset=subset, cls=rawfeature)
            break
        else:
            # No subset accepted this value.
            print("\t\tUnknown feature value: " + rawfeature + " in " + rawtag, file=stderr)
            if raisefeatureexceptions:
                raise InvalidFeatureException(
                    "Unknown feature value: " + rawfeature + " in " + rawtag)

    return tag
for s in p.sentences(): found_w = False for w in s.words(): found_w = True found_s = True if found_w: #pass tokenised sentence words = s.words() response = frogclient.process(" ".join([unicode(w) for w in words])) for i, (word, lemma, morph, pos) in enumerate(response): if legacy: legacyout(i,word,lemma,morph,pos) if unicode(words[i]) == word: if lemma: words[i].append( folia.LemmaAnnotation(foliadoc, cls=lemma) ) if pos: words[i].append( folia.PosAnnotation(foliadoc, cls=pos) ) else: print >>sys.stderr,"WARNING: Out of sync after calling Frog! ", i, word else: #pass untokenised sentence try: sentext = s.text() except folia.NoSuchText: continue response = frogclient.process(sentext) for i, (word, lemma, morph, pos) in enumerate(response): if legacy: legacyout(i,word,lemma,morph,pos) if word: w = folia.Word(foliadoc, text=word, generate_id_in=s) if lemma: