# Load the test dataset; its last attribute is used as the class.
fname = data_dir + os.sep + "ReutersGrain-test.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
test.set_class_index(test.num_attributes() - 1)

# Each entry pairs a base classifier with the option list handed to the
# StringToWordVector filter that is wrapped around it.
multinomial_nb = "weka.classifiers.bayes.NaiveBayesMultinomial"
setups = (
    ("weka.classifiers.trees.J48", []),
    ("weka.classifiers.bayes.NaiveBayes", []),
    (multinomial_nb, []),
    (multinomial_nb, ["-C"]),
    (multinomial_nb, ["-C", "-L", "-S"]),
)

# Build every setup on `data` and evaluate it on the test set.
# NOTE(review): `data` is defined before this excerpt (presumably the
# training set) -- confirm against the preceding code.
for classname, filter_opts in setups:
    print("\n--> %s (filter options: %s)\n" % (classname, " ".join(filter_opts)))

    # Wrap the base classifier so the string-to-word-vector filter is
    # applied as part of the classifier itself.
    filtered = FilteredClassifier()
    base = Classifier(classname=classname)
    filtered.set_classifier(base)
    vectorizer = Filter(
        classname="weka.filters.unsupervised.attribute.StringToWordVector",
        options=filter_opts)
    filtered.set_filter(vectorizer)
    filtered.build_classifier(data)

    # Score the built model on the test set and report the results.
    evaluation = Evaluation(test)
    evaluation.test_model(filtered, test)
    print("Accuracy: %0.0f%%" % evaluation.percent_correct())
    # AUC is taken from the threshold curve generated for class label index 0.
    print("AUC: %0.3f" % plc.get_auc(plc.generate_thresholdcurve_data(evaluation, 0)))
    print(evaluation.to_matrix("Matrix:"))

jvm.stop()
# NOTE(review): the statements up to the "output the results" section read
# `repetition` as the current item of an enclosing per-repetition loop whose
# header is not part of this excerpt (presumably
# `for repetition in repetitions:`) -- confirm, and re-indent this section to
# that loop's body level when splicing it back into the file.

# progress info: announce which repetition count is being evaluated
sys.stdout.write("Repetitions=" + str(repetition))

# initialize curve: one accumulator per sample-size percentage
curve = dict.fromkeys(percentages, 0)
curves[repetition] = curve

# run and add up percentage correct from repetition; the seeds are
# 1..repetition (range() instead of xrange() so this also runs on Python 3)
for seed in range(1, repetition + 1):
    sys.stdout.write(".")
    # flush so the progress dots show up while the run is still in progress
    sys.stdout.flush()
    for percentage in percentages:
        cls = Classifier(classname="weka.classifiers.trees.J48")
        flt = Filter(
            classname="weka.filters.unsupervised.instance.Resample",
            options=["-Z", str(percentage), "-no-replacement"])
        fc = FilteredClassifier()
        fc.set_classifier(cls)
        fc.set_filter(flt)
        evl = Evaluation(data)
        # 10-fold cross-validation, seeded per run
        evl.crossvalidate_model(fc, data, 10, Random(seed))
        # each run contributes 1/repetition of its accuracy, so the
        # accumulated value ends up as the mean over all runs
        curve[percentage] += evl.percent_correct() / repetition

# progress info: terminate the line of dots
sys.stdout.write("\n")

# output the results as text when matplotlib is not available
if not plot.matplotlib_available:
    print("ZeroR: " + str(baseline))
    for repetition in repetitions:
        y = [curves[repetition][percentage] for percentage in percentages]
        print("Repetitions = " + str(repetition) + ":\n" + str(y))