def test_has_my_stoplists(self):
    """Check that the 'english' and 'patents' stoplists both load.

    Each call to ``stoplists.load_stoplist`` must return something other
    than ``None``.
    """
    # assertIsNotNone performs an identity check, so it cannot be fooled by
    # (or crash on) a custom __ne__ on the returned object, and it reports
    # a clearer message than assertTrue(x != None) on failure.
    self.assertIsNotNone(stoplists.load_stoplist('english'))
    self.assertIsNotNone(stoplists.load_stoplist('patents'))
# A tokenizer.
from alife.txtmine import util as txtutil
from alife.txtmine import stoplists

# Default stopword list: the union of the 'english' and 'patents' stoplists.
english_stops = stoplists.load_stoplist('english')
patent_stops = stoplists.load_stoplist('patents')
all_stops = list(set(english_stops).union(set(patent_stops)))


class Tokenizer(object):
    """Split a string into lowercased word tokens.

    The pipeline applied by :meth:`tokenize` is, in order: optional
    hyphen-to-space replacement, ``txtutil.rmv_punc``, ``txtutil.rmv_numbers``,
    ``txtutil.mysplit``, lowercasing, optional stopword removal and optional
    stemming.
    """

    def __init__(self, replace_hyphens=True, stemming=False,
                 stopwords=all_stops):
        """Store the tokenizer configuration.

        Args:
            replace_hyphens: If true, every '-' in the input is replaced by
                a space before any other processing.
            stemming: If true, the word list is passed through
                ``txtutil.stem_words_in_list`` as the final step.
            stopwords: Collection handed to ``txtutil.rmv_elems_from_list``
                to filter the word list, or ``None`` to skip filtering.
                Defaults to the module-level ``all_stops``; that default
                object is shared between instances and is not modified by
                this class.
        """
        self.replace_hyphens = replace_hyphens
        self.stemming = stemming
        self.stopwords = stopwords

    def tokenize(self, inString):
        """Return the tokens of ``inString``.

        Args:
            inString: The text to tokenize.

        Returns:
            The word list produced by the configured pipeline. When neither
            stopword removal nor stemming is enabled this is a plain
            ``list`` of lowercased strings; otherwise it is whatever the
            last ``txtutil`` helper applied returns.
        """
        if self.replace_hyphens:
            inString = inString.replace('-', ' ')
        wo_numpunc = txtutil.rmv_numbers(txtutil.rmv_punc(inString))
        # Build a concrete list rather than using map(): on Python 3 map()
        # is a lazy, single-use iterator, which would leak out of this
        # method whenever stopwords is None and stemming is off.
        wordlist = [word.lower() for word in txtutil.mysplit(wo_numpunc)]
        if self.stopwords is not None:
            wordlist = txtutil.rmv_elems_from_list(wordlist, self.stopwords)
        if self.stemming:
            wordlist = txtutil.stem_words_in_list(wordlist)
        return wordlist
def test_not_found_behavior(self):
    """Loading the unknown stoplist name 'Banasns' must raise RuntimeError."""
    # Callable form of assertRaises: it invokes load_stoplist('Banasns')
    # itself and fails the test unless RuntimeError is raised.
    self.assertRaises(RuntimeError, stoplists.load_stoplist, 'Banasns')
def setUp(self):
    """Create the sample strings and the tokenizer used by the tests.

    The samples mix letters with digits, punctuation and hyphens. The
    tokenizer is built with only the 'english' stoplist as its stopwords.
    """
    self.short = 'Bi1203498g D9wg!!!!!!!'
    self.short2 = 'b21234897&*(ig-dog112-34098 play3r'
    # Adjacent literals are joined at compile time; the resulting string
    # (including its trailing space) is a single sentence fixture.
    self.sentence = (
        'Hello, the my name is @invention abstract-patenter, '
        'with title and Sh234FS()8((#*t!! '
        'I has a bag---words-bagger. bag-er#fjkl '
    )
    english_stopwords = load_stoplist('english')
    self.tokenizer = tokenizer.Tokenizer(stopwords=english_stopwords)
def setUp(self):
    """Prepare per-test fixtures: three raw input strings and a tokenizer.

    The tokenizer is constructed with the 'english' stoplist passed as its
    ``stopwords`` argument.
    """
    self.short = 'Bi1203498g D9wg!!!!!!!'
    self.short2 = 'b21234897&*(ig-dog112-34098 play3r'
    # One sentence fixture, written as adjacent literals that the compiler
    # concatenates; the trailing space is part of the value.
    self.sentence = (
        'Hello, the my name is @invention abstract-patenter, '
        'with title and Sh234FS()8((#*t!! '
        'I has a bag---words-bagger. bag-er#fjkl '
    )
    tokenizer_options = {'stopwords': load_stoplist('english')}
    self.tokenizer = tokenizer.Tokenizer(**tokenizer_options)