class HunposPosTagger(PosTaggerWrapper):
    """
    Wraps NltkTools, which wraps HunPos as a POS tagger :).

    In order for NLTK to find the hunpos executable, the $HUNPOS environment
    variable must point to the directory with the hunpos-tag executable in it.

    The following parameters are used:
    - hunpos_model: the hunpos model file. Default is $HUNPOS/english.model;
    - hunpos_encoding: the encoding used by the hunpos model file. Default is
      iso-8859-1.
    """

    def __init__(self, params):
        """
        Sets up the wrapped NltkTools instance from the params mapping.

        Raises KeyError if hunpos_model is not in params and the $HUNPOS
        environment variable is not set either.
        """
        # Imported locally so that the class does not depend on the module
        # header already importing os.
        import os

        try:
            model = params['hunpos_model']
        except KeyError:
            # The documented default: english.model in the $HUNPOS directory.
            model = os.path.join(os.environ['HUNPOS'], 'english.model')
        self.nt = NltkTools(pos=True, pos_model=model)
        self.encoding = params.get('hunpos_encoding', 'iso-8859-1')

    def pos_tag(self, tokens):
        """
        POS tags tokens in place.

        tokens is a list of sentences, where each sentence is a list of
        tokens and each token is a list whose first element is the word form.
        The word forms are encoded with the configured encoding before they
        are handed to the tagger, and the decoded tag is appended to the
        corresponding token list. Nothing is returned.
        """
        for sen in tokens:
            words = [tok[0].encode(self.encoding) for tok in sen]
            tagged_sen = self.nt.pos_tag(words)
            for tok_i, tagged_tok in enumerate(tagged_sen):
                try:
                    _word, pos = [x.decode(self.encoding) for x in tagged_tok]
                except ValueError:
                    # Best effort: an item that is not a (word, tag) pair or
                    # that cannot be decoded (UnicodeDecodeError is a
                    # ValueError) leaves its token without a POS tag.
                    continue
                sen[tok_i].append(pos)
class HunposPosTagger(PosTaggerWrapper):
    """
    Wraps NltkTools, which wraps HunPos as a POS tagger :).

    In order for NLTK to find the hunpos executable, the $HUNPOS environment
    variable must point to the directory with the hunpos-tag executable in it.

    The following parameters are used:
    - hunpos_model: the hunpos model file. Default is $HUNPOS/english.model;
    - hunpos_encoding: the encoding used by the hunpos model file. Default is
      iso-8859-1.
    """

    def __init__(self, params):
        """
        Sets up the wrapped NltkTools instance from the params mapping.

        Raises KeyError if hunpos_model is not in params and the $HUNPOS
        environment variable is not set either.
        """
        # Imported locally so that the class does not depend on the module
        # header already importing os.
        import os

        try:
            model = params['hunpos_model']
        except KeyError:
            # The documented default: english.model in the $HUNPOS directory.
            model = os.path.join(os.environ['HUNPOS'], 'english.model')
        self.nt = NltkTools(pos=True, pos_model=model)
        self.encoding = params.get('hunpos_encoding', 'iso-8859-1')

    def pos_tag(self, tokens):
        """
        POS tags tokens in place.

        tokens is a list of sentences, where each sentence is a list of
        tokens and each token is a list whose first element is the word form.
        The word forms are encoded with the configured encoding before they
        are handed to the tagger, and the decoded tag is appended to the
        corresponding token list. Nothing is returned.
        """
        for sen in tokens:
            words = [tok[0].encode(self.encoding) for tok in sen]
            tagged_sen = self.nt.pos_tag(words)
            for tok_i, tagged_tok in enumerate(tagged_sen):
                try:
                    _word, pos = [x.decode(self.encoding) for x in tagged_tok]
                except ValueError:
                    # Best effort: an item that is not a (word, tag) pair or
                    # that cannot be decoded (UnicodeDecodeError is a
                    # ValueError) leaves its token without a POS tag.
                    continue
                sen[tok_i].append(pos)
pageSep = "%%#PAGE" actPage = None starter = False for line in sys.stdin: l = line.strip().decode("utf-8") if l.startswith(pageSep): if actPage is not None: print actPage = l.split(" ", 1)[1] starter = True print l.encode("utf-8").replace(" ", "\t", 1) print "%%#Field\tTitle" titleTokens = nt.word_tokenize(actPage) titleTokensWithPos = list(nt.pos_tag(titleTokens)) stemmedTitleTokens = nt.stem(titleTokensWithPos) hardStemmedTitleTokens = list(nt.stem(((x[0][0].lower() + x[0][1:] if x[0][0].isupper() and x[0][1:].islower() else x[0]), x[1]) for x in titleTokensWithPos)) for i, (tok, pos, stem) in enumerate(stemmedTitleTokens): print u"{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(tok, "word", "0", pos, stem, hardStemmedTitleTokens[i][2]).encode("utf-8") print elif starter and l.startswith("Templates:"): try: templates = l.split("\t", 1)[1] print u"%%#Templates\t{0}".format(templates).encode("utf-8") except IndexError: pass elif starter and l.startswith("REDIRECT"): print "%%#Redirect" else: if starter: