class NltkToolsStemmer(LemmatizerWrapper):
    """
    Wraps the NltkTools stemmer. It currently uses WordnetLemmatizer, which is
    English only.

    @warning This is the original implementation as used in our English
             Wikipedia parser. No effort has been made to clean up the code,
             or to fix the hardwired indexing, etc. The data must already be
             POS tagged, and the POS field must be the last one.
    """
    def __init__(self, params):
        self.nt = NltkTools(stem=True)

    def lemmatize(self, tokens):
        # HACK
        for sen_i, sen in enumerate(tokens):
            # Stem each (surface form, POS) pair as-is...
            stemmed = self.nt.stem((tok[0], tok[-1]) for tok in sen)
            # ... and a "hard" variant, in which capitalized words (first
            # letter upper, rest lower) are lowercased before stemming.
            hard_stemmed = self.nt.stem(
                (((tok[0][0].lower() + tok[0][1:]
                   if tok[0][0].isupper() and tok[0][1:].islower()
                   else tok[0]), tok[-1]) for tok in sen))
            # Append both lemmata to the token record in place.
            for tok_i, (tok_stemmed, tok_hard_stemmed) in enumerate(
                    zip(stemmed, hard_stemmed)):
                tokens[sen_i][tok_i].append(tok_stemmed[2])
                tokens[sen_i][tok_i].append(tok_hard_stemmed[2])
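A minimal usage sketch (not from the original source): it assumes the token layout implied by the hardwired indexing above, i.e. a list of sentences, each a list of mutable token records whose first field is the surface form and whose last field is the POS tag; the params argument is accepted but ignored by __init__.

    stemmer = NltkToolsStemmer(params=None)
    # One POS-tagged sentence; each token record is [word, POS].
    tokens = [[[u"Cats", u"NNS"], [u"sleep", u"VBP"]]]
    stemmer.lemmatize(tokens)
    # Each token record now has two extra fields appended in place: the
    # WordNet lemma of the surface form and the lemma of its lowercased
    # ("hard-stemmed") variant.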
pageSep = "%%#PAGE" actPage = None starter = False for line in sys.stdin: l = line.strip().decode("utf-8") if l.startswith(pageSep): if actPage is not None: print actPage = l.split(" ", 1)[1] starter = True print l.encode("utf-8").replace(" ", "\t", 1) print "%%#Field\tTitle" titleTokens = nt.word_tokenize(actPage) titleTokensWithPos = list(nt.pos_tag(titleTokens)) stemmedTitleTokens = nt.stem(titleTokensWithPos) hardStemmedTitleTokens = list(nt.stem(((x[0][0].lower() + x[0][1:] if x[0][0].isupper() and x[0][1:].islower() else x[0]), x[1]) for x in titleTokensWithPos)) for i, (tok, pos, stem) in enumerate(stemmedTitleTokens): print u"{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(tok, "word", "0", pos, stem, hardStemmedTitleTokens[i][2]).encode("utf-8") print elif starter and l.startswith("Templates:"): try: templates = l.split("\t", 1)[1] print u"%%#Templates\t{0}".format(templates).encode("utf-8") except IndexError: pass elif starter and l.startswith("REDIRECT"): print "%%#Redirect" else: if starter: print "%%#Field\tBody"