def vote_classifier_train(dicrectory, nameOfDataSet, flag):
    """Evaluate a Vote ensemble (J48, RandomTree, 2x Bagging/REPTree,
    AdaBoostM1/DecisionStump, NaiveBayes, averaged) on a CSV dataset.

    dicrectory -- path to the CSV file (class attribute must be last)
    nameOfDataSet -- label passed through to print_and_save()
    flag -- True: 10-fold cross-validation; False: 80/20 train/test split
    """
    csv_loader = Loader(classname="weka.core.converters.CSVLoader")
    dataset = csv_loader.load_file(dicrectory)
    dataset.class_is_last()

    ensemble = MultipleClassifiersCombiner(
        classname="weka.classifiers.meta.Vote",
        options=[
            '-S', '1',
            '-B', 'weka.classifiers.trees.J48 -C 0.25 -M 2',
            '-B', 'weka.classifiers.trees.RandomTree -K 6 -M 1.0 -V 0.001 -S 1',
            '-B', 'weka.classifiers.meta.Bagging -P 100 -S 1 -num-slots 1 -I 10 -W weka.classifiers.trees.REPTree -- '
                  '-M 2 -V 0.001 -N 3 -S 1 -L -1 -I 0.0',
            '-B', 'weka.classifiers.meta.AdaBoostM1 -P 100 -S 1 -I 10 -W weka.classifiers.trees.DecisionStump',
            '-B', 'weka.classifiers.meta.Bagging -P 100 -S 1 -num-slots 1 -I 10 -W weka.classifiers.trees.REPTree -- '
                  '-M 2 -V 0.001 -N 3 -S 1 -L -1 -I 0.0',
            '-B', 'weka.classifiers.bayes.NaiveBayes ',
            '-R', 'AVG',
        ])

    evaluator = Evaluation(dataset)
    plain_out = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    if flag:
        evaluator.crossvalidate_model(ensemble, dataset, 10, Random(1), plain_out)
    else:
        evaluator.evaluate_train_test_split(ensemble, dataset, 80.0, Random(1), plain_out)
    gc.collect()
    print_and_save('Proposed model', flag, nameOfDataSet, evaluator)
def exposed_evaluate(self, X, d, task, i_model, i_evl):
    """Reshape the serialized feature matrix X, run a 5-fold CV with the
    requested model, and return the requested metric.

    X -- string holding a Python literal of the flat data; it is passed to
         eval() (SECURITY: never feed untrusted input here -- consider
         ast.literal_eval instead)
    d -- number of rows for the reshape
    task -- 'regression' or 'classification'
    i_model -- 'LR'/'RF' for regression; 'RF'/'LR'/'SVM' for classification
    i_evl -- metric key: 'mae', 'mse', '1-rae' or 'f_score'
    """
    # NOTE(review): eval() executes arbitrary code from X -- trusted callers only.
    data = np.reshape(eval(X), [d, -1], order='C')
    if task == 'regression':
        if i_model == 'LR':
            data = converters.ndarray_to_instances(data, relation='tmp')
            data.class_is_last()
            model = Classifier(
                classname='weka.classifiers.functions.LinearRegression')
            evl = Evaluation(data)
            evl.crossvalidate_model(model, data, 5, Random(0))
        elif i_model == 'RF':
            data = converters.ndarray_to_instances(data, relation='tmp')
            data.class_is_last()
            model = Classifier(
                classname='weka.classifiers.trees.RandomForest')
            evl = Evaluation(data)
            evl.crossvalidate_model(model, data, 5, Random(0))
        if i_evl == 'mae':
            r_mae = evl.mean_absolute_error
            return r_mae
        elif i_evl == 'mse':
            # BUG FIX: the metric used to be stored in r_mae while the
            # undefined name r_mse was returned, raising NameError.
            r_mse = evl.mean_square_error
            return r_mse
        elif i_evl == '1-rae':
            r_one_minus_rae = 1 - evl.relative_absolute_error / 100
            del evl, model, data
            return r_one_minus_rae
    elif task == 'classification':
        # labels must be numeric before handing the ndarray to weka
        le = LabelEncoder()
        data[:, -1] = le.fit_transform(data[:, -1])
        if i_model == 'RF':
            dataRaw = converters.ndarray_to_instances(data, relation='tmp')
            weka_filter = Filter(
                classname="weka.filters.unsupervised.attribute.NumericToNominal",
                options=["-R", "last"])
            weka_filter.inputformat(dataRaw)
            data = weka_filter.filter(dataRaw)
            data.class_is_last()
            model = Classifier(
                classname='weka.classifiers.trees.RandomForest')
            evl = Evaluation(data)
            evl.crossvalidate_model(model, data, 5, Random(0))
        elif i_model == 'LR':
            # NOTE(review): these sklearn models are created but never fitted
            # or evaluated -- 'f_score' below only works after the RF branch
            # (evl/dataRaw are undefined otherwise). Confirm intent.
            model = LogisticRegression(multi_class='ovr')
        elif i_model == 'SVM':
            model = svm.SVC()
        if i_evl == 'f_score':
            fscore = evl.weighted_f_measure
            del evl, model, data, dataRaw
            # clamp NaN / out-of-range scores to a small positive floor
            if not (fscore >= 0.01 and fscore < 1.01):
                fscore = 0.01
            return fscore
def naive_bayse(dicrectory, nameOfDataSet, flag):
    """Evaluate a plain NaiveBayes classifier on a CSV dataset.

    flag True -> 10-fold cross-validation, otherwise an 80/20 split.
    Results are reported through print_and_save().
    """
    dataset = Loader(classname="weka.core.converters.CSVLoader").load_file(dicrectory)
    dataset.class_is_last()

    bayes = Classifier(classname='weka.classifiers.bayes.NaiveBayes')
    evaluator = Evaluation(dataset)
    plain_out = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")

    if flag:
        evaluator.crossvalidate_model(bayes, dataset, 10, Random(1), plain_out)
    else:
        evaluator.evaluate_train_test_split(bayes, dataset, 80.0, Random(1), plain_out)

    print_and_save('Naive Bayes model', flag, nameOfDataSet, evaluator)
    gc.collect()
def RandomTree(data, rnm):
    """10-fold CV plus final build of a RandomTree (first attribute removed
    via FilteredClassifier); writes tree/prediction/evaluation reports and an
    ROC plot, returning percent-correct as a string.

    data -- weka Instances (class attribute is set to the last one here)
    rnm  -- base name used for all output files
    """
    data.class_is_last()
    fc = FilteredClassifier()
    fc.classifier = Classifier(classname="weka.classifiers.trees.RandomTree",
                               options=["-K", "0", "-M", "1.0", "-V", "0.001", "-S", "1"])
    fc.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                       options=["-R", "first"])
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-p", "1"])
    folds = 10
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, folds, Random(1), pred_output)
    # build the final model on the full data (the CV above used copies)
    fc.build_classifier(data)

    # FIX: replaced Python-2-only "print >> fh" statements with print(file=)
    # and context managers (the rest of this file is Python 3).
    with open(rnm + '_RT_Tree.txt', 'w') as f0:
        print("Filename: ", rnm, file=f0)
        print('\n\n', file=f0)
        print(str(fc), file=f0)
    with open(rnm + '_RT_Prediction.txt', 'w') as f1:
        print('Filename:', rnm, file=f1)
        print('Prediction Summary:', (pred_output.buffer_content()), file=f1)
    with open(rnm + '_RT_Evaluation.txt', 'w') as f2:
        print('Filename:', rnm, file=f2)
        print('Evaluation Summary:', (evl.summary()), file=f2)
        print('\n\n\n', file=f2)
        print((evl.class_details()), file=f2)

    plot_roc(evl, class_index=[0, 1], title=rnm, key_loc='best',
             outfile=rnm + '_RT_ROC.png', wait=False)
    value_RT = str(evl.percent_correct)
    return value_RT
def runSMO(file, bound):
    """Cross-validate an SMO classifier (polynomial kernel) on a CSV file
    after removing the attribute range `bound`; prints percent-correct and
    returns the per-class details string.

    file  -- path to the CSV data (class attribute is the first one)
    bound -- attribute range string for the Remove filter (-R option)
    """
    data = Loader(classname="weka.core.converters.CSVLoader").load_file(file)
    data.class_is_first()

    attribute_remover = Filter(
        classname="weka.filters.unsupervised.attribute.Remove",
        options=["-R", bound])

    smo = KernelClassifier(
        classname="weka.classifiers.functions.SMO",
        options=["-C", "1.0", "-L", "0.001", "-P", "1.0E-12", "-N", "0"])
    smo.kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.PolyKernel",
        options=["-C", "250007", "-E", "1.0"])

    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")

    attribute_remover.inputformat(data)
    filtered = attribute_remover.filter(data)

    evl = Evaluation(filtered)
    evl.crossvalidate_model(smo, filtered, 10, Random(1), pout)
    print(evl.percent_correct)
    result = evl.class_details()
    print(result)
    return result
def create_model(input_file, output_file):
    """Discretize the input data, train a RandomForest on it, serialize the
    model plus filter to output_file, and report 10-fold CV statistics."""
    data = converters.load_any_file(input_file)
    data.class_is_last()  # class attribute is the last column

    # filter data
    print_title("Filtering Data")
    discretize = Filter(
        classname="weka.filters.unsupervised.attribute.Discretize",
        options=["-B", "10", "-M", "-1.0", "-R", "first-last"])
    # the filter needs to know the input format before it can filter
    discretize.inputformat(data)
    filtered_data = discretize.filter(data)
    print("Done! (believe it or not)")

    print_title("Build Classifier")
    classifier = Classifier(classname="weka.classifiers.trees.RandomForest",
                            options=["-I", "100", "-K", "0", "-S", "1"])
    classifier.build_classifier(filtered_data)
    print("Done! (believe it or not)")

    # persist model and filter together so they can be reloaded as a pair
    serialization.write_all(output_file, [classifier, discretize])
    print("Model and filter saved to ", output_file)

    evaluation = Evaluation(data)  # initialize with priors
    evaluation.crossvalidate_model(classifier, filtered_data, 10, Random(42))  # 10-fold CV
    print(evaluation.summary())
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
def train_and_eval_weka_classifier(clf, train, valid, n_instances):
    """Fit `clf` on (a subsample of) `train` and score it on `valid`.

    n_instances -- number of training instances to use; when it amounts to
    100% the full set is used, otherwise a split of that percentage is taken.
    Returns a dict with loss (mean scheme entropy), accuracy, auc and err.
    """
    total_train_inst = train.num_instances
    percentage = (n_instances * 100) / total_train_inst
    if percentage == 100:
        subset = train
    else:
        subset, _residual = train.train_test_split(percentage, Random(1))
    print('total_train_inst: ', total_train_inst, '| percentage: ', percentage,
          '| used_inst: ', subset.num_instances)

    clf.build_classifier(subset)
    evl = Evaluation(subset)
    evl.test_model(clf, valid)

    acc = evl.percent_correct
    auc = evl.weighted_area_under_roc
    err = evl.error_rate
    log = evl.sf_mean_scheme_entropy
    print(
        "# validating | loss: {:.2}, accuracy: {:.4}, AUC: {:.2}, error: {:.2}"
        .format(log, acc, auc, err))
    return {'loss': log, 'accuracy': acc, 'auc': auc, 'err': err}
def Boost_J48(data, rnm):
    """AdaBoostM1-boosted J48 (first attribute removed) evaluated with
    10-fold CV; writes tree/prediction/evaluation reports plus an ROC plot
    and returns percent-correct as a string.

    data -- weka Instances (class attribute is set to the last one here)
    rnm  -- base name used for all output files
    """
    data.class_is_last()
    fc1 = FilteredClassifier()
    fc1.classifier = Classifier(classname="weka.classifiers.trees.J48",
                                options=["-C", "0.25", "-M", "2"])
    fc1.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                        options=["-R", "first"])
    fc2 = SingleClassifierEnhancer(classname="weka.classifiers.meta.AdaBoostM1",
                                   options=["-P", "100", "-S", "1", "-I", "10"])
    fc2.classifier = fc1
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-p", "1"])
    folds = 10
    fc2.build_classifier(data)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(fc2, data, folds, Random(1), pred_output)

    # FIX: replaced Python-2-only "print >> fh" statements with print(file=)
    # and context managers (the rest of this file is Python 3).
    with open(rnm + '_Boost_J48_Tree.txt', 'w') as f0:
        print("Filename: ", rnm, file=f0)
        print('\n\n', file=f0)
        print(str(fc2), file=f0)
    with open(rnm + '_Boost_J48_Prediction.txt', 'w') as f1:
        print('Filename:', rnm, file=f1)
        print('Prediction Summary:', (pred_output.buffer_content()), file=f1)
    with open(rnm + '_Boost_j48_Evaluation.txt', 'w') as f2:
        print('Filename:', rnm, file=f2)
        print('Evaluation Summary:', (evaluation.summary()), file=f2)
        print('\n\n\n', file=f2)
        print((evaluation.class_details()), file=f2)

    plot_roc(evaluation, class_index=[0, 1], title=rnm, key_loc='best',
             outfile=rnm + '_Boost_J48_ROC.png', wait=False)
    value_Boost_J48 = str(evaluation.percent_correct)
    return value_Boost_J48
def run_naive_bayes_crossval(self, output_directory):
    """Build a NaiveBayes classifier on self.training_data, 10-fold
    cross-validate it on the same data, and save the accumulated report
    into output_directory."""
    # build classifier
    print("\nBuilding Classifier on training data.")
    buildTimeStart = time.time()
    cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    cls.build_classifier(self.training_data)
    resultsString = self.print_both(str(cls), "")
    resultsString = self.print_both(
        "NB Cross Eval Classifier Built in " + str(time.time() - buildTimeStart) + " secs.\n",
        resultsString)

    # evaluate classifier via cross-validation
    resultsString = self.print_both("\nCross Evaluating on test data.",
                                    resultsString)
    buildTimeStart = time.time()
    evl = Evaluation(self.training_data)
    evl.crossvalidate_model(cls, self.training_data, 10, Random(1))
    for section in (evl.summary(), evl.class_details(), evl.confusion_matrix):
        resultsString = self.print_both(str(section), resultsString)
    resultsString = self.print_both(
        "\nNB Cross Eval Classifier Evaluated in " + str(time.time() - buildTimeStart) + " secs.\n",
        resultsString)

    # save results and clean up
    self.save_results("Naive_Bayes_Crossval", resultsString, output_directory)
def experiment_file_random(path_features, path_folder_save_results, options,
                           classifier, fold, random, name):
    """Cross-validate `classifier` (class name, configured via the CLI-style
    `options` string) on the features file; dumps accuracy and the confusion
    matrix to <name>.csv and the per-instance predictions to
    prediction/<name>.csv under path_folder_save_results."""
    print("start weka")
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))

    dataset = converters.load_any_file(path_features)
    dataset.class_is_last()

    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV")
    evl = Evaluation(dataset)
    evl.crossvalidate_model(cls, dataset, fold, Random(random), pout)

    d_results = {
        'percent_correct': [evl.percent_correct],
        'percent_incorrect': [evl.percent_incorrect],
        'confusion_matrix': [evl.matrix()],  # confusion matrix as text
    }
    pd.DataFrame(data=d_results).to_csv(
        path_folder_save_results + '/' + str(name) + '.csv', index=False)

    save = pout.buffer_content()
    with open(
            path_folder_save_results + '/' + 'prediction/' + str(name) + '.csv',
            'w') as f:
        f.write(save)
def naiveBayes(data):
    """13-fold cross-validate NaiveBayes (-D: supervised discretization) on
    `data`; prints the statistics, writes them to naiveeval.txt, plots an
    ROC curve, and returns the percent-correct value."""
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes",
                            options=["-D"])
    nfolds = 13
    rnd = Random(0)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, nfolds, rnd)
    print(" Naive Bayes Cross-validation information")
    print(evaluation.summary())
    print("precision: " + str(evaluation.precision(1)))
    print("recall: " + str(evaluation.recall(1)))
    print("F-measure: " + str(evaluation.f_measure(1)))
    print("==confusion matrix==")
    print(" a b")
    print(evaluation.confusion_matrix)
    # FIX: a bare "print" is a no-op expression in Python 3; print() emits
    # the blank line that was intended.
    print()
    # write the same report to file; context manager guarantees the close
    with open("naiveeval.txt", "w") as f:
        f.write(evaluation.summary())
        f.write("\n")
        f.write("==confusion matrix==\n")
        f.write(" a b\n")
        for item in evaluation.confusion_matrix:
            f.write("%s\n" % item)
    # plot roc graph
    plcls.plot_roc(evaluation, title="Naive Bayes ROC", outfile="NBROC",
                   wait=True)
    return evaluation.percent_correct
def main():
    """
    Shows how to use the CostSensitiveClassifier.
    """
    # load a dataset
    data_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + data_file)
    data = Loader("weka.core.converters.ArffLoader").load_file(data_file)
    data.class_is_last()

    # cost-sensitive meta classifier wrapping a J48 base learner
    classifier = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.CostSensitiveClassifier",
        options=["-cost-matrix", "[0 1; 2 0]", "-S", "2"])
    base = Classifier(classname="weka.classifiers.trees.J48",
                      options=["-C", "0.3"])
    classifier.classifier = base

    # cross-validate
    folds = 10
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, folds, Random(1))

    print("")
    print("=== Setup ===")
    print("Classifier: " + classifier.to_commandline())
    print("Dataset: " + data.relationname)
    print("")
    print(
        evaluation.summary("=== " + str(folds) + " -fold Cross-Validation ==="))
def split_train_test_valid(self):
    """Split self.data into train (60%), valid (20%) and test (20%) subsets
    and return the three Instances objects (also stored on the instance).

    Note: on failure, previously stored splits (if any) are returned; an
    AttributeError propagates when none exist yet -- same as before, but the
    cause is now printed instead of being silently swallowed.
    """
    import traceback
    try:
        self.data = self.return_data()
        total_inst = self.data.num_instances
        # 80/20 -> (train+valid)/test, then 75/25 of the 80% -> train/valid
        train_, self.test = self.data.train_test_split(80.0, Random(1))
        self.train, self.valid = train_.train_test_split(75.0, Random(1))
        print('total_inst: ', total_inst,
              '| train_inst: ', self.train.num_instances,
              '| valid_inst: ', self.valid.num_instances,
              '| test_inst: ', self.test.num_instances)
    except Exception:
        # FIX: the exception used to be swallowed with a bare `pass`; keep
        # the best-effort behavior but surface the cause for debugging.
        traceback.print_exc()
    return self.train, self.valid, self.test
def main():
    """Load the adult CSV dataset, stratify-randomize it, and run the
    NaiveBayes and DecisionTree experiments with k-fold cross-validation.
    The JVM is started here and always stopped on exit."""
    try:
        jvm.start()
        loader = Loader(classname="weka.core.converters.CSVLoader")
        data = loader.load_file("./data/adult.csv")
        data.class_is_last()  # set class attribute
        # randomize data
        # NOTE(review): `k` is not defined in this function -- presumably a
        # module-level constant holding the fold count; confirm it exists,
        # otherwise this raises NameError.
        folds = k
        seed = 1
        rnd = Random(seed)
        rand_data = Instances.copy_instances(data)
        rand_data.randomize(rnd)
        # stratification keeps class proportions per fold (nominal class only)
        if rand_data.class_attribute.is_nominal:
            rand_data.stratify(folds)
        NaiveBayes(rand_data, folds, seed, data)
        DecisionTree(rand_data, folds, seed, data)
    except Exception as e:
        raise e
    finally:
        jvm.stop()
def run():
    """Convert predict_data.csv to ARFF, 10-fold cross-validate a J48 tree
    on it, and return the percent-correct as a string (also stored in the
    module-level global `j48`)."""
    jvm.start()
    csv_loader = Loader("weka.core.converters.CSVLoader")
    data_csv = csv_loader.load_file(
        "/Users/imeiliasantoso/web_graduate_project4/predict_page/predict_data.csv"
    )
    arff_saver = Saver("weka.core.converters.ArffSaver")
    arff_saver.save_file(
        data_csv,
        "/Users/imeiliasantoso/web_graduate_project4/predict_page/predict_data.arff"
    )
    arff_loader = Loader("weka.core.converters.ArffLoader")
    data_arff = arff_loader.load_file(
        "/Users/imeiliasantoso/web_graduate_project4/predict_page/predict_data.arff"
    )
    data_arff.class_is_last()

    global j48
    tree = Classifier(classname="weka.classifiers.trees.J48",
                      options=["-C", "0.25", "-M", "2"])
    tree.build_classifier(data_arff)
    evaluator = Evaluation(data_arff)
    evaluator.crossvalidate_model(tree, data_arff, 10, Random(100))
    j48 = str(evaluator.percent_correct)
    jvm.stop()
    return j48
def main():
    """Run 10 random 75/25 train/test splits of the ARFF dataset named on
    the command line and write per-split train/test results as CSV files
    under a fresh resultados_<name> directory."""
    dataset = sys.argv[1]
    # load a dataset
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file("./data/" + dataset + ".arff")
    data.class_is_last()
    num_classes = data.class_attribute.num_values
    os.mkdir('resultados_' + sys.argv[1])
    for random_cv in range(10):  # 10 randomized rounds
        # generate train/test split of randomized data
        train, test = data.train_test_split(75.0, Random(random_cv))
        results_train, results_test = classification(data, train, test,
                                                     num_classes)
        # FIX: np.str was deprecated in NumPy 1.20 and removed in 1.24; the
        # builtin str() is the documented replacement.
        train_name = "./resultados_" + sys.argv[1] + "/resultados_" + sys.argv[
            1] + "_" + "E" + str(random_cv) + ".csv"
        test_name = "./resultados_" + sys.argv[1] + "/resultados_" + sys.argv[
            1] + "_" + "T" + str(random_cv) + ".csv"
        results_train.to_csv(train_name)
        results_test.to_csv(test_name)
def run(dataset_path):
    """Load a dataset, convert its class attribute to nominal, and 10-fold
    cross-validate NaiveBayesMultinomial on it; prints the elapsed time."""
    start = time.time()
    ### load a dataset ###
    train_data = model.load_dataset_weka(dataset_path)

    to_nomial_class_filter = Filter(
        classname="weka.filters.unsupervised.attribute.NumericToNominal",
        options=["-R", "last"])
    to_nomial_class_filter.inputformat(train_data)
    # FIX: the filter used to be applied twice (once for Evaluation, once
    # for crossvalidate_model); filter once and reuse the result.
    filtered_data = to_nomial_class_filter.filter(train_data)

    ### Naive Bayes ### Choose what you want
    classifier = Classifier("weka.classifiers.bayes.NaiveBayesMultinomial")
    # classifier = Classifier("weka.classifiers.bayes.NaiveBayes")

    evaluation = Evaluation(filtered_data)
    evaluation.crossvalidate_model(classifier, filtered_data, 10, Random(42))
    # print(evaluation.summary())
    # print(evaluation.class_details())
    # print(evaluation.matrix())

    print(time.time() - start)
def test_classifier(dataset: Instances, classifier: Classifier, params: dict):
    """Grid-evaluate `classifier` over every combination of option values in
    `params` ({option-name: [values]}).

    Returns a dict of parallel result columns: dataset sizes, the option
    values, training-set accuracy, tree size, and 10-fold CV accuracy.
    """
    option_names = params.keys()   # renamed from `vars` (shadowed builtin)
    option_values = params.values()
    results = defaultdict(list)
    for val_combo in itertools.product(*option_values):
        results["numInstances"].append(dataset.num_instances)
        results["numAttributes"].append(dataset.num_attributes)
        opts = dict(zip(option_names, val_combo))
        for opt in opts:
            results[opt].append(opts[opt])
            # Java setters need a Java float for Python floats, not a double
            classifier.set_property(
                opt, opts[opt] if not isinstance(opts[opt], float) else
                typeconv.double_to_float(opts[opt]))
        evl = Evaluation(dataset)
        classifier.build_classifier(dataset)
        evl.test_model(classifier, dataset)
        results["Training_Accuracy"].append(evl.percent_correct)
        results["size"].append(
            int(javabridge.call(classifier.jobject, "measureTreeSize", "()D")))
        # FIX: the same Evaluation object was reused for the training-set
        # evaluation and the cross-validation, so the training statistics
        # accumulated into the CV accuracy; use a fresh Evaluation for CV.
        cv_evl = Evaluation(dataset)
        cv_evl.crossvalidate_model(classifier, dataset, 10, Random(1))
        results["CV_Accuracy"].append(cv_evl.percent_correct)
    return results
def evaluation(self, classifier, trainingData, testingData=None):
    """Evaluate `classifier`: 10-fold CV on trainingData when no test set is
    given, otherwise build on trainingData and test on testingData (which
    must have the same attributes). Returns the Evaluation object, or None
    on an attribute-count mismatch.

    Uses the old method-style wrapper API (set_class_index(),
    num_attributes()); kept as-is to match the rest of this class.
    """
    trainingData.set_class_index(trainingData.num_attributes() - 1)
    if testingData == None:
        evaluation = Evaluation(trainingData)  # initialize with priors
        evaluation.crossvalidate_model(classifier, trainingData, 10,
                                       Random(42))  # 10-fold CV
        return evaluation
    else:
        # FIX: converted Python-2 print statements to print() calls (the
        # single-argument form is valid on both Python 2 and 3).
        print("testing data exists")
        if testingData.num_attributes() == trainingData.num_attributes():
            testingData.set_class_index(testingData.num_attributes() - 1)
            evaluation = Evaluation(trainingData)
            classifier.build_classifier(trainingData)
            evaluation.test_model(classifier, testingData)
            return evaluation
        else:
            print("testing Data doesn't have same attribute with training data")
            for attribute in trainingData.attributes():
                print("train:" + str(attribute))
            for attribute in testingData.attributes():
                print("test:" + str(attribute))
def runCV(this, arffFile, classifier, folds):
    """Cross-validate `classifier` (a weka class name) on the ARFF file and
    store one metrics row on `this`: overall accuracy followed by TP rate,
    FP rate and ROC AUC per class; matching names go into this.header."""
    data = Loader(classname="weka.core.converters.ArffLoader").load_file(arffFile)
    data.class_is_last()

    classes = [str(code) for code in data.class_attribute.values]
    header = ["Accuracy"]
    for name in classes:
        header.extend([name + " TP", name + " FP", name + " AUC ROC"])

    cls = Classifier(classname=classifier)
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, folds, Random(1))

    values = [evl.percent_correct]
    for name in classes:
        index = classes.index(name)
        values.extend([
            evl.true_positive_rate(index) * 100,
            evl.false_positive_rate(index) * 100,
            evl.area_under_roc(index),
        ])
    this.values = values
    this.header = header
def test_generate_thresholdcurve_data(self):
    """
    Tests the generate_thresholdcurve_data method.
    """
    loader = converters.Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(self.datafile("diabetes.arff"))
    data.class_is_last()
    remove = filters.Filter(
        classname="weka.filters.unsupervised.attribute.Remove",
        options=["-R", "1-3"])
    cls = classifiers.Classifier(
        classname="weka.classifiers.bayes.NaiveBayes")
    fc = classifiers.FilteredClassifier()
    fc.filter = remove
    fc.classifier = cls
    # NOTE(review): `fc` is assembled but never used -- the cross-validation
    # below runs the bare `cls`; possibly crossvalidate_model was meant to
    # receive `fc`. Confirm before changing: the assertion values below
    # (13 attributes, 769 rows) are pinned to the current behavior.
    evl = classifiers.Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    # build the threshold-curve dataset for class index 0
    data = plot.generate_thresholdcurve_data(evl, 0)
    self.assertEqual(13, data.num_attributes,
                     msg="number of attributes differs")
    self.assertEqual(769, data.num_instances, msg="number of rows differs")
    attname = "True Positives"
    self.assertIsNotNone(data.attribute_by_name(attname),
                         msg="Failed to locate attribute: " + attname)
    attname = "False Positive Rate"
    self.assertIsNotNone(data.attribute_by_name(attname),
                         msg="Failed to locate attribute: " + attname)
    attname = "Lift"
    self.assertIsNotNone(data.attribute_by_name(attname),
                         msg="Failed to locate attribute: " + attname)
def main(args):
    """
    Loads a dataset, shuffles it, splits it into train/test set. Trains J48
    with training set and evaluates the built model on the test set.

    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """
    # dataset: the one named on the command line or the bundled vote data
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    data = Loader(classname="weka.core.converters.ArffLoader").load_file(data_file)
    data.class_is_last()

    # 66/34 split of the randomized data
    train, test = data.train_test_split(66.0, Random(1))

    # train the tree and show it
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)
    print(cls)

    # score the held-out test set
    evl = Evaluation(train)
    evl.test_model(cls, test)
    print(evl.summary())
def naivebay_classifier_weka(data):
    """10-fold cross-validate NaiveBayes on `data`, print the evaluation
    summary and confusion matrix, and return the classifier object."""
    bayes = Classifier("weka.classifiers.bayes.NaiveBayes")
    evaluator = Evaluation(data)
    evaluator.crossvalidate_model(bayes, data, 10, Random(42))
    print(evaluator.summary())
    print(evaluator.confusion_matrix)
    return bayes
def splitTrainSet(data, m_numLabledData=10):
    """Split `data` into a labeled and an unlabeled subset.

    m_numLabledData -- percentage of instances that go to the labeled set.
    The data is shuffled first; both returned Instances objects get their
    class attribute set to the last one.
    """
    total = data.num_instances
    labeled_amount = int(m_numLabledData * total / 100)
    unlabeled_amount = total - labeled_amount

    rand = Random(1)
    data.randomize(rand)

    labeled_set = Instances.create_instances(
        data.relationname, data.attributes(), labeled_amount)
    unlabeled_set = Instances.create_instances(
        data.relationname, data.attributes(), unlabeled_amount)

    # first slice of the shuffled data -> labeled pool, the rest -> unlabeled
    for idx in range(labeled_amount):
        labeled_set.add_instance(data.get_instance(idx))
    labeled_set.randomize(rand)
    for idx in range(unlabeled_amount):
        unlabeled_set.add_instance(data.get_instance(labeled_amount + idx))

    labeled_set.class_is_last()
    unlabeled_set.class_is_last()
    return labeled_set, unlabeled_set
def fitness(toeval: Individual):
    """Fitness of an Individual: 10-fold CV accuracy of a
    MultilayerPerceptron configured with the individual's settings
    (relies on the module-level `remove` filter and `data`)."""
    perceptron = Classifier(
        classname="weka.classifiers.functions.MultilayerPerceptron",
        options=toeval.settings())
    wrapped = FilteredClassifier()
    wrapped.filter = remove
    wrapped.classifier = perceptron
    scorer = Evaluation(data)
    scorer.crossvalidate_model(wrapped, data, 10, Random(1))
    return scorer.percent_correct
def f_smote():
    """SMOTE-oversample the (first 10k rows of the) training data, train a
    weka LMT tree on the balanced set, report 5-fold CV results on it, then
    evaluate on the untouched test set; ROC plots are saved for both."""
    jvm.start()
    train_data, test_data = b_i_impute_data()
    # keep the experiment small: only the first 10k training rows
    train_data = train_data[:10000]
    y_train = train_data["class"]
    x_train = train_data.drop("class", axis=1)
    # oversample the minority class to balance the training set
    sm = SMOTE(ratio="minority")
    x_train_sm, y_train_sm = sm.fit_sample(x_train, y_train)
    x_train_sm_df = pd.DataFrame(x_train_sm, columns=x_train.columns)
    y_train_sm_df = pd.DataFrame(y_train_sm, columns=["class"])
    train_data_sm_df = pd.concat([y_train_sm_df, x_train_sm_df], axis=1)
    print_f("smote train data shape", train_data_sm_df.shape)
    # round-trip through CSV so the weka loader can ingest the data
    train_data_sm_df.to_csv("./train_data_sm.csv", index=False)
    train_data_sm = converters.load_any_file("train_data_sm.csv")
    train_data_sm.class_is_first()
    # NOTE(review): test_data (a DataFrame above) is rebound here from a CSV
    # that is assumed to exist on disk already -- confirm it is written by an
    # earlier step.
    test_data = converters.load_any_file("test_data.csv")
    test_data.class_is_first()
    print_f("1")
    cls = Classifier(classname="weka.classifiers.trees.LMT")
    print_f("bulding classifier")
    cls.build_classifier(train_data_sm)
    print_f("Evaluating")
    # 5-fold cross-validation on the SMOTE'd training data
    evl = Evaluation(train_data_sm)
    evl.crossvalidate_model(cls, train_data_sm, 5, Random(1))
    print_f("Train Accuracy:", evl.percent_correct)
    print_f("Train summary")
    print_f(evl.summary())
    print_f("Train class details")
    print_f(evl.class_details())
    print_f("Train confusion matrix")
    print_f(evl.confusion_matrix)
    plcls.plot_roc(evl,
                   class_index=[0, 1],
                   wait=True,
                   outfile="./plots/2_f_smote_10k.png")
    plt.suptitle("Train ROC Curve", fontsize=20, y=0.95)
    # final hold-out evaluation on the untouched test set
    evl = Evaluation(test_data)
    print_f("testing model")
    evl.test_model(cls, test_data)
    print_f("Test Accuracy:", evl.percent_correct)
    print_f("Test summary")
    print_f(evl.summary())
    print_f(" Testclass details")
    print_f(evl.class_details())
    print_f("Testconfusion matrix")
    print_f(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Test ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/f_test_roc_curve.png")
def run_bayesNet(file):
    """Run BayesNet (TAN search, SimpleEstimator) with 10-fold CV on an ARFF
    file; writes evaluation and prediction reports into a bayesNet_results
    subdirectory next to the input file.

    file -- pathlib.Path to the input ARFF file (class attribute is first)
    """
    # Get filename from Pathlib object
    filename = file.parts[-1]
    # FIX: renamed the local `dir` -- it shadowed the dir() builtin.
    out_dir = file.parents[0]
    print("Running BayesNet on %s" % filename)
    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return
    # Removes '.arff' from filename
    filename_base = filename[:-5]
    # Load data with class as first attr
    data = load_Arff_file(file)
    data.class_is_first()
    # Use BayesNet and set options
    cls = Classifier(
        classname="weka.classifiers.bayes.BayesNet",
        options=[
            "-D", "-Q", "weka.classifiers.bayes.net.search.local.TAN", "--",
            "-P", "1", "-S", "BAYES", "-E",
            "weka.classifiers.bayes.net.estimate.SimpleEstimator", "--", "-A",
            "0.5"
        ])
    # Predictions stored in pout
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    # Evaluate data
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(cls, data, 10, Random(1), output=pout)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.confusion_matrix)
    # mk dirs for output
    out_dir = out_dir / "bayesNet_results"
    out_dir.mkdir(parents=True, exist_ok=True)
    # Save summary, class details and confusion matrix to file
    result_output = filename_base + "_bayesNet_eval_results_TAN.txt"
    output_eval(evaluation, out_dir / result_output)
    # Save the predicited results to file
    prediction_output = filename_base + "_bayesNet_pred_results_TAN.txt"
    output_pred(pout, out_dir / prediction_output)
    print("BayesNet complete")
def load_data_split(self, filename, validation_split, filter=False):
    """Load an ARFF file (class = first attribute), optionally pass it
    through self.filter_data, and store a train/test split on the instance.

    validation_split -- percentage of instances that go to the training side.
    filter -- when True, run the data through self.filter_data first.
    """
    self.validation_split = validation_split
    self.filename = filename
    print("\nLoading dataset: " + filename)
    dataset = Loader(classname="weka.core.converters.ArffLoader").load_file(filename)
    dataset.class_is_first()
    if filter:
        dataset = self.filter_data(dataset)
    self.training_data, self.testing_data = dataset.train_test_split(
        self.validation_split, Random(1))
def get_weka_training_data(self):
    """Load the breast-cancer CSV and return a dict holding a train/test
    split (the train percentage is derived from self.test_size) plus the
    class attribute's labels."""
    train_pct = 100 - self.test_size * 100
    loader = Loader(classname="weka.core.converters.CSVLoader")
    dataset = loader.load_file(
        os.path.join(constants.BASE_DIR, constants.BREAST_CANCER_FILE_NAME))
    dataset.class_is_last()
    train_split, test_split = dataset.train_test_split(train_pct, Random(1))
    return {
        'train_set': train_split,
        'test_set': test_split,
        'labels': dataset.class_attribute.values,
    }
def main(args):
    """
    Loads a dataset, shuffles it, splits it into train/test set. Trains J48
    with training set and evaluates the built model on the test set.
    The predictions get recorded in two different ways:
    1. in-memory via the test_model method
    2. directly to file (more memory efficient), but a separate run of
       making predictions

    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """
    # dataset: the one named on the command line or the bundled vote data
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    data = Loader(classname="weka.core.converters.ArffLoader").load_file(data_file)
    data.class_is_last()

    # 66/34 split of the randomized data
    train, test = data.train_test_split(66.0, Random(1))

    # train J48 and show the tree
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)
    print(cls)

    # evaluate and record predictions in memory
    helper.print_title("recording predictions in-memory")
    output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-distribution"])
    evl = Evaluation(train)
    evl.test_model(cls, test, output=output)
    print(evl.summary())
    helper.print_info("Predictions:")
    print(output.buffer_content())

    # record/output predictions separately
    helper.print_title("recording/outputting predictions separately")
    outputfile = helper.get_tmp_dir() + "/j48_vote.csv"
    output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-distribution", "-suppress", "-file", outputfile])
    output.header = test
    output.print_all(cls, test)
    helper.print_info("Predictions stored in:" + outputfile)
    # by using "-suppress" we don't store the output in memory, the
    # following statement won't output anything
    print(output.buffer_content())