def getStemclasses(filename='good-1000-words.txt', stemmer=None):
    """Group the words of a one-word-per-line file by their stem.

    Each line is stripped of surrounding whitespace and passed to the
    stemmer; words that share a stem end up in the same list, in file
    order. A blank line is not skipped: it contributes the empty string
    like any other entry.

    Args:
        filename: Path of the word list to read. Defaults to the
            original hard-coded 'good-1000-words.txt'.
        stemmer: Callable mapping a word to its stem. Defaults to
            ``PorterStemmer.useStemer``, resolved at call time.

    Returns:
        dict mapping each stem to the list of words that produced it.
    """
    if stemmer is None:
        stemmer = PorterStemmer.useStemer

    stemClasses = {}
    # The context manager closes the file even if reading or stemming raises.
    with open(filename, 'r') as infile:
        for line in infile:
            voc = line.strip()
            stemClasses.setdefault(stemmer(voc), []).append(voc)
    return stemClasses
def getStemclasses():
    """Build stem classes for the wiki-small vocabulary and save them.

    Loads the vocabulary from 'wiki-small-vocab.json' via
    ``getDictFromFile``, groups its keys by the stem returned from
    ``PorterStemmer.useStemer``, and writes the grouping to
    'wiki-small-vocab-stem-classes.json' via ``dumpJsonToFile``.
    A progress line is printed for every 10000th term, starting with
    the first one.

    Returns:
        dict mapping each stem to the list of vocabulary terms that
        share it. (The original returned nothing; the mapping is now
        returned as well as written to disk.)
    """
    stemClasses = {}
    vocabDict = getDictFromFile('wiki-small-vocab.json')

    # Only the terms are needed; the value stored per term is unused.
    for counter, voc in enumerate(vocabDict):
        stem = PorterStemmer.useStemer(voc)
        stemClasses.setdefault(stem, []).append(voc)

        if counter % 10000 == 0:
            print('\t', counter, voc)

    # NOTE(review): the meaning of the trailing False flag is defined by
    # dumpJsonToFile, which is not visible here -- confirm against the helper.
    dumpJsonToFile('wiki-small-vocab-stem-classes.json', stemClasses, False)
    return stemClasses
from Porter import PorterStemmer from krovetzstemmer import Stemmer from common import readTextFromFile from common import getTextFromHTML krov = Stemmer() f = 'en/articles/d/o/r/Dorothy_Block_a8f8.html' text = getTextFromHTML(readTextFromFile(f)) print 'ori:\n', text, '\n' print 'porter:\n', PorterStemmer.useStemer(text), '\n' print 'krov:\n', krov.stem(text), '\n'
__author__ = 'liupeng' import os import re import optparse import sys from Porter import PorterStemmer LEN = 2665 dicts = {} p = PorterStemmer() start = 1 def getword(filename): global start print start, filename start += 1 r = open(rfc_path + filename, 'r') string = r.read() s = re.findall("[a-z]{2,}", str.lower(string)) word_num = len(s) for word in s: word = p.stem(word, 0, len(word) - 1) if dicts.has_key(word): if dicts[word].has_key(filename): dicts[word][filename][0] += 1 else: dicts[word][filename] = [1, word_num]