class NltkToolsStemmer(LemmatizerWrapper):
    """
    Wraps the NltkTools stemmer. It currently uses WordnetLemmatizer, which is
    English only.

    @warning This is the original implementation as used in our English
             Wikipedia parser. No effort has been made to clean up the code,
             or to fix the hardwired indexing, etc. The data must already be
             POS tagged, and the POS field must be the last one.
    """
    def __init__(self, params):
        self.nt = NltkTools(stem=True)

    def lemmatize(self, tokens):
        # HACK
        for sen_i, sen in enumerate(tokens):
            # Stem each (surface form, POS) pair as-is...
            stemmed = self.nt.stem((tok[0], tok[-1]) for tok in sen)
            # ... and a "hard" variant, in which capitalized words (first
            # letter upper, rest lower) are lowercased before stemming.
            hard_stemmed = self.nt.stem(
                (((tok[0][0].lower() + tok[0][1:]
                   if tok[0][0].isupper() and tok[0][1:].islower()
                   else tok[0]), tok[-1]) for tok in sen))
            # Append both lemmata to the token record in place.
            for tok_i, (tok_stemmed, tok_hard_stemmed) in enumerate(
                    zip(stemmed, hard_stemmed)):
                tokens[sen_i][tok_i].append(tok_stemmed[2])
                tokens[sen_i][tok_i].append(tok_hard_stemmed[2])
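A minimal usage sketch (not from the original source): it assumes the token layout implied by the hardwired indexing above, i.e. a list of sentences, each a list of mutable token records whose first field is the surface form and whose last field is the POS tag; the params argument is accepted but ignored by __init__.

    stemmer = NltkToolsStemmer(params=None)
    # One POS-tagged sentence; each token record is [word, POS].
    tokens = [[[u"Cats", u"NNS"], [u"sleep", u"VBP"]]]
    stemmer.lemmatize(tokens)
    # Each token record now has two extra fields appended in place: the
    # WordNet lemma of the surface form and the lemma of its lowercased
    # ("hard-stemmed") variant.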
pageSep = "%%#PAGE" actPage = None starter = False for line in sys.stdin: l = line.strip().decode("utf-8") if l.startswith(pageSep): if actPage is not None: print actPage = l.split(" ", 1)[1] starter = True print l.encode("utf-8").replace(" ", "\t", 1) print "%%#Field\tTitle" titleTokens = nt.word_tokenize(actPage) titleTokensWithPos = list(nt.pos_tag(titleTokens)) stemmedTitleTokens = nt.stem(titleTokensWithPos) hardStemmedTitleTokens = list(nt.stem(((x[0][0].lower() + x[0][1:] if x[0][0].isupper() and x[0][1:].islower() else x[0]), x[1]) for x in titleTokensWithPos)) for i, (tok, pos, stem) in enumerate(stemmedTitleTokens): print u"{0}\t{1}\t{2}\t{3}\t{4}\t{5}".format(tok, "word", "0", pos, stem, hardStemmedTitleTokens[i][2]).encode("utf-8") print elif starter and l.startswith("Templates:"): try: templates = l.split("\t", 1)[1] print u"%%#Templates\t{0}".format(templates).encode("utf-8") except IndexError: pass elif starter and l.startswith("REDIRECT"): print "%%#Redirect" else: if starter: print "%%#Field\tBody"