Пример #1
0
class HunposPosTagger(PosTaggerWrapper):
    """
    Wraps NltkTools, which wraps HunPos as a POS tagger :).

    In order for NLTK to find the hunpos executable, the $HUNPOS environment
    variable must point to the directory with the hunpos-tag executable in it.

    The following parameters are used:
    - hunpos_model: the hunpos model file. Default is $HUNPOS/english.model;
    - hunpos_encoding: the encoding used by the hunpos model file. Default is
      iso-8859-1.
    """
    def __init__(self, params):
        """
        Creates the underlying NltkTools tagger.

        params is a dict-like object; the keys that are read are listed in
        the class docstring, and both of them are optional.

        Raises KeyError if hunpos_model is not given and the $HUNPOS
        environment variable is not set either.
        """
        if 'hunpos_model' in params:
            model = params['hunpos_model']
        else:
            # Fall back to the documented default instead of failing with a
            # KeyError on the missing parameter.
            import os
            model = os.path.join(os.environ['HUNPOS'], 'english.model')
        self.nt = NltkTools(pos=True, pos_model=model)
        self.encoding = params.get('hunpos_encoding', 'iso-8859-1')

    def pos_tag(self, tokens):
        """
        POS tags the tokens in place; nothing is returned.

        tokens is a sequence of sentences, each sentence is a sequence of
        tokens, and each token is a list-like object whose first element is
        the word form. The tag returned by the tagger is appended to the
        token.

        A token is left untouched if its tagger output cannot be unpacked
        into a (word, tag) pair or cannot be decoded.
        """
        encoding = self.encoding
        for sen in tokens:
            # The tagger is fed byte strings in the encoding of the model.
            words = [tok[0].encode(encoding) for tok in sen]
            for tok_i, tagged_tok in enumerate(self.nt.pos_tag(words)):
                try:
                    _word, pos = [x.decode(encoding) for x in tagged_tok]
                except ValueError:
                    # Wrong number of fields; UnicodeDecodeError is a
                    # subclass of ValueError, so undecodable output is
                    # skipped here as well.
                    continue
                sen[tok_i].append(pos)
Пример #2
0
class HunposPosTagger(PosTaggerWrapper):
    """
    Wraps NltkTools, which wraps HunPos as a POS tagger :).

    In order for NLTK to find the hunpos executable, the $HUNPOS environment
    variable must point to the directory with the hunpos-tag executable in it.

    The following parameters are used:
    - hunpos_model: the hunpos model file. Default is $HUNPOS/english.model;
    - hunpos_encoding: the encoding used by the hunpos model file. Default is
      iso-8859-1.
    """
    def __init__(self, params):
        """
        Creates the underlying NltkTools tagger.

        params is a dict-like object; the keys that are read are listed in
        the class docstring, and both of them are optional.

        Raises KeyError if hunpos_model is not given and the $HUNPOS
        environment variable is not set either.
        """
        if 'hunpos_model' in params:
            model = params['hunpos_model']
        else:
            # Fall back to the documented default instead of failing with a
            # KeyError on the missing parameter.
            import os
            model = os.path.join(os.environ['HUNPOS'], 'english.model')
        self.nt = NltkTools(pos=True, pos_model=model)
        self.encoding = params.get('hunpos_encoding', 'iso-8859-1')

    def pos_tag(self, tokens):
        """
        POS tags the tokens in place; nothing is returned.

        tokens is a sequence of sentences, each sentence is a sequence of
        tokens, and each token is a list-like object whose first element is
        the word form. The tag returned by the tagger is appended to the
        token.

        A token is left untouched if its tagger output cannot be unpacked
        into a (word, tag) pair or cannot be decoded.
        """
        encoding = self.encoding
        for sen in tokens:
            # The tagger is fed byte strings in the encoding of the model.
            words = [tok[0].encode(encoding) for tok in sen]
            for tok_i, tagged_tok in enumerate(self.nt.pos_tag(words)):
                try:
                    _word, pos = [x.decode(encoding) for x in tagged_tok]
                except ValueError:
                    # Wrong number of fields; UnicodeDecodeError is a
                    # subclass of ValueError, so undecodable output is
                    # skipped here as well.
                    continue
                sen[tok_i].append(pos)
Пример #3
0
pageSep = "%%#PAGE"
actPage = None
starter = False
for line in sys.stdin:
    l = line.strip().decode("utf-8")
    if l.startswith(pageSep):
        if actPage is not None:
            print
        
        actPage = l.split(" ", 1)[1]
        starter = True
        print l.encode("utf-8").replace(" ", "\t", 1)
        print "%%#Field\tTitle"
        titleTokens = nt.word_tokenize(actPage)
        titleTokensWithPos = list(nt.pos_tag(titleTokens))
        stemmedTitleTokens = nt.stem(titleTokensWithPos)
        hardStemmedTitleTokens = list(nt.stem(((x[0][0].lower() + x[0][1:] if x[0][0].isupper() and x[0][1:].islower() else x[0]), x[1]) for x in titleTokensWithPos))
        for i, (tok, pos, stem) in enumerate(stemmedTitleTokens):
            print u"{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(tok, "word", "0", pos, stem, hardStemmedTitleTokens[i][2]).encode("utf-8")
        print
    elif starter and l.startswith("Templates:"):
        try:
            templates = l.split("\t", 1)[1]
            print u"%%#Templates\t{0}".format(templates).encode("utf-8")
        except IndexError:
            pass
    elif starter and l.startswith("REDIRECT"):
        print "%%#Redirect"
    else:
        if starter: