Пример #1
0
def getStemclasses():
    """Group the terms listed in 'good-1000-words.txt' by their stem.

    The file is read one term per line. Surrounding whitespace is stripped
    from each line, blank lines are skipped, and every remaining term is
    passed to ``PorterStemmer.useStemer``. Terms that yield the same stem
    end up in the same list, in file order.

    Returns:
        dict: stem -> list of the terms that produced that stem.
    """
    stemClasses = {}

    # The with-block closes the file even if reading raises part-way through.
    with open('good-1000-words.txt', 'r') as infile:
        terms = [line.strip() for line in infile]

    for voc in terms:
        # A blank line (e.g. a trailing newline at end of file) is not a
        # term; do not hand an empty string to the stemmer.
        if not voc:
            continue

        stem = PorterStemmer.useStemer(voc)
        stemClasses.setdefault(stem, []).append(voc)

    return stemClasses
Пример #2
0
def getStemclasses():
    """Build stem classes for the vocabulary in 'wiki-small-vocab.json'.

    Every key of the loaded vocabulary is stemmed with
    ``PorterStemmer.useStemer`` and appended to the list kept for that stem.
    A progress line is printed for every 10000th term (starting with the
    first), and the finished mapping is written to
    'wiki-small-vocab-stem-classes.json' via ``dumpJsonToFile``.
    Nothing is returned.
    """
    vocabulary = getDictFromFile('wiki-small-vocab.json')
    classes_by_stem = {}

    # Only the keys (the terms) are used; the per-term values are ignored.
    for position, (term, _) in enumerate(vocabulary.items()):
        term_stem = PorterStemmer.useStemer(term)
        classes_by_stem.setdefault(term_stem, []).append(term)

        # Progress trace so long runs show signs of life.
        if position % 10000 == 0:
            print('\t', position, term)

    dumpJsonToFile('wiki-small-vocab-stem-classes.json', classes_by_stem, False)
Пример #3
0
from Porter import PorterStemmer
from krovetzstemmer import Stemmer

from common import readTextFromFile
from common import getTextFromHTML

# Stemmer instance from the krovetzstemmer package, compared below against
# PorterStemmer on the same input.
krov = Stemmer()

# Path of the sample HTML article; the file content is read and passed
# through getTextFromHTML to obtain the text both stemmers are run on.
f = 'en/articles/d/o/r/Dorothy_Block_a8f8.html'
text = getTextFromHTML(readTextFromFile(f))

# Print the input text, then the result of each stemmer, one after another.
# NOTE(review): each stemmer is given the whole text in a single call rather
# than individual tokens - confirm both accept multi-word input as intended.
print 'ori:\n', text, '\n'
print 'porter:\n', PorterStemmer.useStemer(text), '\n'
print 'krov:\n', krov.stem(text), '\n'
Пример #4
0
__author__ = 'liupeng'

import os
import re
import optparse
import sys
from Porter import PorterStemmer

# NOTE(review): LEN is not referenced in the visible part of this file;
# presumably the number of documents in the corpus - confirm against its uses.
LEN = 2665

# Index filled in by getword(): stemmed word -> filename ->
# [occurrences of the word in that file, total number of words matched in
# that file].
dicts = {}

# Shared PorterStemmer instance; getword() calls p.stem() on every word.
p = PorterStemmer()
# Progress counter starting at 1; getword() prints it next to the filename
# and then increments it on every call.
start = 1


def getword(filename):
    global start
    print start, filename
    start += 1
    r = open(rfc_path + filename, 'r')
    string = r.read()
    s = re.findall("[a-z]{2,}", str.lower(string))
    word_num = len(s)
    for word in s:
        word = p.stem(word, 0, len(word) - 1)
        if dicts.has_key(word):
            if dicts[word].has_key(filename):
                dicts[word][filename][0] += 1
            else:
                dicts[word][filename] = [1, word_num]