Пример #1
0
 def test_has_my_stoplists(self):
     """Loading the 'english' and 'patents' stoplists yields a non-None value."""
     # assertIsNotNone checks identity (`is not None`) instead of relying on
     # the loaded object's `!=`, and reports the offending value on failure.
     self.assertIsNotNone(stoplists.load_stoplist('english'))
     self.assertIsNotNone(stoplists.load_stoplist('patents'))
Пример #2
0
# A configurable word tokenizer: optional hyphen splitting, stopword removal, and stemming.
from alife.txtmine import util as txtutil
from alife.txtmine import stoplists

# Load each stoplist once, at import time, so every Tokenizer can share them.
english_stops = stoplists.load_stoplist('english')
patent_stops = stoplists.load_stoplist('patents')

# De-duplicated union of both stoplists, exposed as a list. Because it is
# built from a set, the order of the entries is unspecified.
all_stops = list(set(english_stops) | set(patent_stops))

class Tokenizer(object):
    """Turn a raw string into a sequence of lowercase word tokens.

    The processing steps, in order, are: optional hyphen-to-space
    replacement, ``txtutil.rmv_punc`` followed by ``txtutil.rmv_numbers``,
    splitting with ``txtutil.mysplit``, lowercasing, optional stopword
    removal, and optional stemming.
    """

    def __init__(self, replace_hyphens=True, stemming=False, stopwords=all_stops):
        """Store the tokenizer configuration.

        Args:
            replace_hyphens: When true, every '-' in the input is replaced
                by a single space before any other processing.
            stemming: When true, the tokens are passed through
                ``txtutil.stem_words_in_list`` as the last step.
            stopwords: Collection of words handed to
                ``txtutil.rmv_elems_from_list``. Pass ``None`` to skip
                stopword removal entirely. Defaults to the module-level
                ``all_stops``; the default object is shared between
                instances and is not copied here.
        """
        self.replace_hyphens = replace_hyphens
        self.stemming = stemming
        self.stopwords = stopwords

    def tokenize(self, inString):
        """Tokenize ``inString`` according to this instance's settings.

        Args:
            inString: The text to tokenize.

        Returns:
            The lowercase tokens. This is a list when neither stopword
            removal nor stemming is enabled; otherwise it is whatever the
            corresponding ``txtutil`` helper returns (presumably also a
            list; confirm against ``txtutil``).
        """
        if self.replace_hyphens:
            inString = inString.replace('-', ' ')
        wo_numpunc = txtutil.rmv_numbers(txtutil.rmv_punc(inString))
        # Build a real list: on Python 3 ``map`` yields a one-shot iterator,
        # which would leak out of this method and into the list helpers below.
        wordlist = [word.lower() for word in txtutil.mysplit(wo_numpunc)]
        if self.stopwords is not None:
            wordlist = txtutil.rmv_elems_from_list(wordlist, self.stopwords)
        if self.stemming:
            wordlist = txtutil.stem_words_in_list(wordlist)
        return wordlist
Пример #3
0
 def test_not_found_behavior(self):
     """Loading the name 'Banasns' must raise RuntimeError."""
     # 'Banasns' is presumably not a known stoplist name; the loader is
     # expected to signal that with RuntimeError.
     self.assertRaises(RuntimeError, stoplists.load_stoplist, 'Banasns')
Пример #4
0
 def test_has_my_stoplists(self):
     """Loading the 'english' and 'patents' stoplists yields a non-None value."""
     # assertIsNotNone checks identity (`is not None`) instead of relying on
     # the loaded object's `!=`, and reports the offending value on failure.
     self.assertIsNotNone(stoplists.load_stoplist('english'))
     self.assertIsNotNone(stoplists.load_stoplist('patents'))
Пример #5
0
 def test_not_found_behavior(self):
     """Loading the name 'Banasns' must raise RuntimeError."""
     # 'Banasns' is presumably not a known stoplist name; the loader is
     # expected to signal that with RuntimeError.
     self.assertRaises(RuntimeError, stoplists.load_stoplist, 'Banasns')
Пример #6
0
 def setUp(self):
     """Build the tokenizer fixture and the raw sample strings."""
     # Tokenizer restricted to the 'english' stoplist rather than the
     # Tokenizer default.
     english_only = load_stoplist('english')
     self.tokenizer = tokenizer.Tokenizer(stopwords=english_only)

     # Sample inputs mixing letters with digits, punctuation, hyphens and
     # runs of whitespace.
     self.short = 'Bi1203498g    D9wg!!!!!!!'
     self.short2 = 'b21234897&*(ig-dog112-34098    play3r'
     self.sentence = 'Hello, the my name is @invention    abstract-patenter, with title and Sh234FS()8((#*t!!    I has a bag---words-bagger. bag-er#fjkl  '
Пример #7
0
 def setUp(self):
     """Prepare the raw sample strings and the tokenizer fixture."""
     # Inputs containing digits, punctuation, hyphens and whitespace runs.
     self.short = 'Bi1203498g    D9wg!!!!!!!'
     self.short2 = 'b21234897&*(ig-dog112-34098    play3r'
     self.sentence = 'Hello, the my name is @invention    abstract-patenter, with title and Sh234FS()8((#*t!!    I has a bag---words-bagger. bag-er#fjkl  '

     # Only the 'english' stoplist is used for this tokenizer.
     stops = load_stoplist('english')
     self.tokenizer = tokenizer.Tokenizer(stopwords=stops)