def test_instantiation(self):
    """
    Tests that a Tokenizer handle can be instantiated, both via the
    default constructor and with an explicit Java classname.
    """
    # default constructor, then an explicit WordTokenizer
    for kwargs in ({}, {"classname": "weka.core.tokenizers.WordTokenizer"}):
        tokenizer = Tokenizer(**kwargs)
        self.assertIsNotNone(tokenizer, msg="Tokenizer should not be None")
def test_tokenize(self):
    """
    Tests calling the "tokenize" method.
    """
    tokenizer = Tokenizer(classname="weka.core.tokenizers.WordTokenizer")
    self.assertIsNotNone(tokenizer, msg="Tokenizer should not be None")
    sentence = "The quick brown fox jumps over the lazy dog"
    # WordTokenizer splits on whitespace, so the expected token list is
    # simply the space-separated words of the sentence
    expected = sentence.split(" ")
    self.assertEqual(expected, list(tokenizer.tokenize(sentence)), msg="Tokens differ")
def test_tokenize(self):
    """
    Tests calling the "tokenize" method.
    """
    tok = Tokenizer(classname="weka.core.tokenizers.WordTokenizer")
    self.assertIsNotNone(tok, msg="Tokenizer should not be None")
    sentence = "The quick brown fox jumps over the lazy dog"
    tokens = list(tok.tokenize(sentence))
    self.assertEqual(
        [u'The', u'quick', u'brown', u'fox', u'jumps',
         u'over', u'the', u'lazy', u'dog'],
        tokens,
        msg="Tokens differ")
def main():
    """
    Just runs some example code.
    """
    # --- load the iris dataset ---
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(iris_file)

    # --- strip the class attribute with a single Remove filter ---
    helper.print_info("Removing class attribute")
    remove_last = Filter(
        classname="weka.filters.unsupervised.attribute.Remove",
        options=["-R", "last"])
    remove_last.inputformat(dataset)
    no_class = remove_last.filter(dataset)

    # --- chain two filters through a MultiFilter ---
    helper.print_info("Use MultiFilter")
    remove_first = Filter(
        classname="weka.filters.unsupervised.attribute.Remove",
        options=["-R", "first"])
    standardize = Filter(
        classname="weka.filters.unsupervised.attribute.Standardize")
    chain = MultiFilter()
    chain.filters = [remove_first, standardize]
    chain.inputformat(dataset)
    chained = chain.filter(dataset)

    # --- show the datasets before/after filtering ---
    helper.print_title("Input")
    print(dataset)
    helper.print_title("Output")
    print(no_class)
    helper.print_title("Output (MultiFilter)")
    print(chained)

    # --- load a text dataset for the StringToWordVector example ---
    text_file = helper.get_data_dir() + os.sep + "reutersTop10Randomized_1perc_shortened.arff"
    helper.print_info("Loading dataset: " + text_file)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    text_data = arff_loader.load_file(text_file)
    text_data.class_is_last()

    # --- turn the string attribute into a word vector, with custom
    #     stemmer/stopwords/tokenizer handlers plugged in ---
    s2wv = StringToWordVector(options=["-W", "10", "-L", "-C"])
    s2wv.stemmer = Stemmer(classname="weka.core.stemmers.IteratedLovinsStemmer")
    s2wv.stopwords = Stopwords(classname="weka.core.stopwords.Rainbow")
    s2wv.tokenizer = Tokenizer(classname="weka.core.tokenizers.WordTokenizer")
    s2wv.inputformat(text_data)
    vectorized = s2wv.filter(text_data)

    helper.print_title("Input (StringToWordVector)")
    print(text_data)
    helper.print_title("Output (StringToWordVector)")
    print(vectorized)
def tokenizer(self):
    """
    Returns the tokenizer.

    :return: the tokenizer
    :rtype: Tokenizer
    """
    # fetch the underlying Java tokenizer object, then wrap it
    jtokenizer = javabridge.call(
        self.jobject, "getTokenizer", "()Lweka/core/tokenizers/Tokenizer;")
    return Tokenizer(jobject=jtokenizer)
#data.attribute_by_name('text').values = print(data) loader = Loader(classname="weka.core.converters.ArffLoader") iris_inc = loader.load_file("C:/Users/Esteb/Desktop/prueba.arff") iris_inc.class_is_last() #nom2str = Filter(classname=("weka.filters.unsupervised.attribute.StringToNominal"), options=["-R", "2-last"]) #nom2str.inputformat(data) #filtered1 = nom2str.filter(data) stemmer = Stemmer(classname="weka.core.stemmers.IteratedLovinsStemmer") stopwords = Stopwords(classname="weka.core.stopwords.Rainbow") tokenizer = Tokenizer(classname="weka.core.tokenizers.WordTokenizer") s2wv = StringToWordVector(options=["-W", "10", "-L", "-C"]) s2wv.stemmer = stemmer s2wv.stopwords = stopwords s2wv.tokenizer = tokenizer s2wv.inputformat(iris_inc) filtered = s2wv.filter(iris_inc) print(filtered) cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes") # cls.build_classifier(filtered1) # print(cls) except ValueError: print("An exception occurred" + ValueError)