Пример #1
0
 def test_instantiation(self):
     """
     Checks that a tokenizer handle can be created, both without arguments
     and with an explicit WordTokenizer class name.
     """
     keyword_sets = (
         {},
         {"classname": "weka.core.tokenizers.WordTokenizer"},
     )
     for keywords in keyword_sets:
         handle = Tokenizer(**keywords)
         self.assertIsNotNone(handle, msg="Tokenizer should not be None")
Пример #2
0
 def test_tokenize(self):
     """
     Checks that the "tokenize" method of a WordTokenizer yields the words
     of a sample sentence in order.
     """
     word_tokenizer = Tokenizer(classname="weka.core.tokenizers.WordTokenizer")
     self.assertIsNotNone(word_tokenizer, msg="Tokenizer should not be None")
     expected = [u'The', u'quick', u'brown', u'fox', u'jumps', u'over', u'the', u'lazy', u'dog']
     # materialize the tokens so they can be compared against a plain list
     actual = [token for token in word_tokenizer.tokenize("The quick brown fox jumps over the lazy dog")]
     self.assertEqual(expected, actual, msg="Tokens differ")
Пример #3
0
 def test_tokenize(self):
     """
     Verifies the tokens produced by the "tokenize" method of a
     WordTokenizer for a fixed sentence.
     """
     classname = "weka.core.tokenizers.WordTokenizer"
     tokenizer = Tokenizer(classname=classname)
     self.assertIsNotNone(tokenizer, msg="Tokenizer should not be None")
     text = "The quick brown fox jumps over the lazy dog"
     expected_tokens = [
         u'The', u'quick', u'brown', u'fox', u'jumps',
         u'over', u'the', u'lazy', u'dog',
     ]
     # collect everything the tokenizer yields, preserving order
     collected = []
     for token in tokenizer.tokenize(text):
         collected.append(token)
     self.assertEqual(expected_tokens, collected, msg="Tokens differ")
def main():
    """
    Runs the filter examples.

    Loads "iris.arff" from the helper data directory, applies a single
    Remove filter and a MultiFilter (Remove + Standardize) to it and prints
    the input and both outputs. Then loads the shortened Reuters text
    dataset, runs it through a StringToWordVector filter configured with a
    stemmer, a stopwords handler and a tokenizer, and prints input and output.
    """

    # load a dataset
    iris_path = os.sep.join([helper.get_data_dir(), "iris.arff"])
    helper.print_info("Loading dataset: " + iris_path)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    iris_data = arff_loader.load_file(iris_path)

    # remove class attribute
    helper.print_info("Removing class attribute")
    remove_last = Filter(
        classname="weka.filters.unsupervised.attribute.Remove",
        options=["-R", "last"])
    remove_last.inputformat(iris_data)
    iris_removed = remove_last.filter(iris_data)

    # use MultiFilter
    helper.print_info("Use MultiFilter")
    remove_first = Filter(
        classname="weka.filters.unsupervised.attribute.Remove",
        options=["-R", "first"])
    standardize = Filter(
        classname="weka.filters.unsupervised.attribute.Standardize")
    chain = MultiFilter()
    chain.filters = [remove_first, standardize]
    chain.inputformat(iris_data)
    iris_chained = chain.filter(iris_data)

    # output datasets
    for title, dataset in (
            ("Input", iris_data),
            ("Output", iris_removed),
            ("Output (MultiFilter)", iris_chained)):
        helper.print_title(title)
        print(dataset)

    # load text dataset
    text_path = os.sep.join(
        [helper.get_data_dir(), "reutersTop10Randomized_1perc_shortened.arff"])
    helper.print_info("Loading dataset: " + text_path)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    text_data = arff_loader.load_file(text_path)
    text_data.class_is_last()

    # apply StringToWordVector
    lovins_stemmer = Stemmer(classname="weka.core.stemmers.IteratedLovinsStemmer")
    rainbow_stopwords = Stopwords(classname="weka.core.stopwords.Rainbow")
    word_tokenizer = Tokenizer(classname="weka.core.tokenizers.WordTokenizer")
    to_word_vector = StringToWordVector(options=["-W", "10", "-L", "-C"])
    to_word_vector.stemmer = lovins_stemmer
    to_word_vector.stopwords = rainbow_stopwords
    to_word_vector.tokenizer = word_tokenizer
    to_word_vector.inputformat(text_data)
    text_vectorized = to_word_vector.filter(text_data)

    helper.print_title("Input (StringToWordVector)")
    print(text_data)
    helper.print_title("Output (StringToWordVector)")
    print(text_vectorized)
Пример #5
0
 def tokenizer(self):
     """
     Returns the tokenizer obtained by calling "getTokenizer" on the
     wrapped Java object.

     :return: the tokenizer
     :rtype: Tokenizer
     """
     java_tokenizer = javabridge.call(
         self.jobject,
         "getTokenizer",
         "()Lweka/core/tokenizers/Tokenizer;")
     return Tokenizer(jobject=java_tokenizer)
Пример #6
0
    #data.attribute_by_name('text').values =

    print(data)

    loader = Loader(classname="weka.core.converters.ArffLoader")
    iris_inc = loader.load_file("C:/Users/Esteb/Desktop/prueba.arff")
    iris_inc.class_is_last()

    #nom2str = Filter(classname=("weka.filters.unsupervised.attribute.StringToNominal"), options=["-R", "2-last"])
    #nom2str.inputformat(data)
    #filtered1 = nom2str.filter(data)

    stemmer = Stemmer(classname="weka.core.stemmers.IteratedLovinsStemmer")
    stopwords = Stopwords(classname="weka.core.stopwords.Rainbow")
    tokenizer = Tokenizer(classname="weka.core.tokenizers.WordTokenizer")
    s2wv = StringToWordVector(options=["-W", "10", "-L", "-C"])
    s2wv.stemmer = stemmer
    s2wv.stopwords = stopwords
    s2wv.tokenizer = tokenizer
    s2wv.inputformat(iris_inc)
    filtered = s2wv.filter(iris_inc)

    print(filtered)

    cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    # cls.build_classifier(filtered1)

    # print(cls)
except ValueError:
    print("An exception occurred" + ValueError)