def naiveBayes(data):
    """Cross-validate a NaiveBayes classifier on `data` and report results.

    Runs 13-fold cross-validation of NaiveBayes (with supervised
    discretization, option -D), prints the summary / per-class metrics /
    confusion matrix, writes a report to naiveeval.txt, plots the ROC
    curve, and returns the percent of correctly classified instances.
    """
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes",
                            options=["-D"])  # -D: supervised discretization
    nfolds = 13
    rnd = Random(0)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, nfolds, rnd)
    print(" Naive Bayes Cross-validation information")
    print(evaluation.summary())
    print("precision: " + str(evaluation.precision(1)))
    print("recall: " + str(evaluation.recall(1)))
    print("F-measure: " + str(evaluation.f_measure(1)))
    print("==confusion matrix==")
    print(" a b")
    print(evaluation.confusion_matrix)
    print()  # fixed: bare `print` is a no-op expression in Python 3

    # write to file -- `with` guarantees the handle is closed even on error
    with open("naiveeval.txt", "w") as f:
        f.write(evaluation.summary())
        f.write("\n")
        f.write("==confusion matrix==\n")
        f.write(" a b\n")
        for item in evaluation.confusion_matrix:
            f.write("%s\n" % item)

    # plot roc graph
    plcls.plot_roc(evaluation, title="Naive Bayes ROC", outfile="NBROC", wait=True)
    return evaluation.percent_correct
def f_smote():
    """SMOTE-balance the training data, then train and evaluate an LMT tree.

    Loads imputed train/test data, oversamples the minority class of the
    first 10k training rows with SMOTE, round-trips the balanced frame
    through CSV so WEKA can load it, 5-fold cross-validates LMT on the
    balanced training set, then evaluates on the held-out test set,
    plotting ROC curves for both phases.
    """
    jvm.start()
    train_data, test_data = b_i_impute_data()
    train_data = train_data[:10000]  # cap rows to keep SMOTE/LMT tractable
    y_train = train_data["class"]
    x_train = train_data.drop("class", axis=1)
    sm = SMOTE(ratio="minority")  # oversample only the minority class
    x_train_sm, y_train_sm = sm.fit_sample(x_train, y_train)
    x_train_sm_df = pd.DataFrame(x_train_sm, columns=x_train.columns)
    y_train_sm_df = pd.DataFrame(y_train_sm, columns=["class"])
    # class column first so class_is_first() below picks it up
    train_data_sm_df = pd.concat([y_train_sm_df, x_train_sm_df], axis=1)
    print_f("smote train data shape", train_data_sm_df.shape)
    train_data_sm_df.to_csv("./train_data_sm.csv", index=False)
    train_data_sm = converters.load_any_file("train_data_sm.csv")
    train_data_sm.class_is_first()
    test_data = converters.load_any_file("test_data.csv")
    test_data.class_is_first()
    print_f("1")
    cls = Classifier(classname="weka.classifiers.trees.LMT")
    print_f("bulding classifier")
    cls.build_classifier(train_data_sm)
    print_f("Evaluating")
    # 5-fold cross-validation on the SMOTE-balanced training set
    evl = Evaluation(train_data_sm)
    evl.crossvalidate_model(cls, train_data_sm, 5, Random(1))
    print_f("Train Accuracy:", evl.percent_correct)
    print_f("Train summary")
    print_f(evl.summary())
    print_f("Train class details")
    print_f(evl.class_details())
    print_f("Train confusion matrix")
    print_f(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True,
                   outfile="./plots/2_f_smote_10k.png")
    plt.suptitle("Train ROC Curve", fontsize=20, y=0.95)
    # fresh Evaluation object for the held-out test set
    evl = Evaluation(test_data)
    print_f("testing model")
    evl.test_model(cls, test_data)
    print_f("Test Accuracy:", evl.percent_correct)
    print_f("Test summary")
    print_f(evl.summary())
    print_f(" Testclass details")
    print_f(evl.class_details())
    print_f("Testconfusion matrix")
    print_f(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Test ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/f_test_roc_curve.png")
def ClassifyParam(mode, binWidths):
    # Evaluate a Multinomial Naive Bayes text classifier for each bin width,
    # timing training, single-sample classification and batch evaluation,
    # and recording accuracy in classificationResults/AllVsAll.csv.
    # NOTE: Python 2 code (print statements).
    # NOTE(review): indentation reconstructed from a collapsed source line;
    # the placement of the timing block and file.close() should be confirmed
    # against the original repository.
    if not os.path.exists("classificationResults"):
        os.makedirs("classificationResults")
    if("normal" in mode):
        file = open("classificationResults/AllVsAll.csv","w")
        file.write("BinWidth, Accuracy\n")
        for binWidth in binWidths:
            train_set = "Data/arff/TrainSet_%s.arff"%(binWidth)
            test_set = "Data/arff/TestSet_%s.arff"%(binWidth)
            print "Loading Datasets..."
            train_data = converters.load_any_file(train_set)
            test_data = converters.load_any_file(test_set)
            # Set class attribute
            train_data.class_is_last()
            test_data.class_is_last()
            print "Dataset Loaded!"
            # StringToWordVector (top 1000 words, TF/IDF weighting) wrapped
            # around Multinomial Naive Bayes via FilteredClassifier
            classifier_name = "weka.classifiers.meta.FilteredClassifier"
            classifier = Classifier(classname=classifier_name, options=[
                "-F", "weka.filters.unsupervised.attribute.StringToWordVector -R first-last -W 1000 -C -T -N 1 -stemmer weka.core.stemmers.NullStemmer -M 1 -tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\\\r\\\\n\\\\t.,;:\\\\\\\'\\\\\\\"()?!\\\"\"",
                "-W", "weka.classifiers.bayes.NaiveBayesMultinomial"])
            start_train = time.time()
            classifier.build_classifier(train_data)
            end_train = time.time()
            print "Train\t%s\t%s"%(binWidth, end_train-start_train)
            # time classification of a single (the first) test instance
            for index, inst in enumerate(test_data):
                if(index == 0):
                    start_sample = time.time()
                    classifier.classify_instance(inst)
                    end_sample = time.time()
                    print "Sample\t%s\t%s"%(binWidth, end_sample-start_sample)
            print "Evaluating w/ Multinomial Naive Bayes classifier. BinWidth = %s"%(binWidth)
            evaluation = Evaluation(test_data)
            start_batch = time.time()
            evaluation.test_model(classifier, test_data)
            end_batch = time.time()
            print "Batch\t%s\t%s"%(binWidth,end_batch-start_batch)
            print evaluation.summary()
            acc = evaluation.percent_correct/100.0
            print "Percent correct: " + str(acc)
            file.write("%s, %s\n"%(binWidth, acc))
        file.close()
def create_model(input_file, output_file):
    """Discretize a dataset, train a RandomForest on it, persist the model
    together with the filter, and report 10-fold cross-validation results.
    """
    # Load data; last attribute is the class
    dataset = converters.load_any_file(input_file)
    dataset.class_is_last()

    # Discretize all attributes into 10 equal-width bins
    print_title("Filtering Data")
    disc = Filter(
        classname="weka.filters.unsupervised.attribute.Discretize",
        options=["-B", "10", "-M", "-1.0", "-R", "first-last"])
    disc.inputformat(dataset)  # let the filter know about the type of data to filter
    prepared = disc.filter(dataset)
    print("Done! (believe it or not)")

    print_title("Build Classifier")
    forest = Classifier(classname="weka.classifiers.trees.RandomForest",
                        options=["-I", "100", "-K", "0", "-S", "1"])
    forest.build_classifier(prepared)
    print("Done! (believe it or not)")

    # persist both the model and the filter so scoring can reproduce the prep
    serialization.write_all(output_file, [forest, disc])
    print("Model and filter saved to ", output_file)

    # 10-fold cross-validation with a fixed seed for reproducibility
    evaluation = Evaluation(dataset)  # initialize with priors
    evaluation.crossvalidate_model(forest, prepared, 10, Random(42))
    print(evaluation.summary())
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
def run_naive_bayes_crossval(self, output_directory):
    """Build a NaiveBayes classifier on the training data, 10-fold
    cross-validate it, and save the collected textual output."""
    # build classifier
    print("\nBuilding Classifier on training data.")
    started = time.time()
    nb = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    nb.build_classifier(self.training_data)
    report = ""
    report = self.print_both(str(nb), report)
    report = self.print_both(
        "NB Cross Eval Classifier Built in " + str(time.time() - started) + " secs.\n",
        report)

    # Evaluate Classifier
    report = self.print_both("\nCross Evaluating on test data.", report)
    started = time.time()
    evaluation = Evaluation(self.training_data)
    evaluation.crossvalidate_model(nb, self.training_data, 10, Random(1))
    for section in (evaluation.summary(), evaluation.class_details(),
                    evaluation.confusion_matrix):
        report = self.print_both(str(section), report)
    report = self.print_both(
        "\nNB Cross Eval Classifier Evaluated in " + str(time.time() - started) + " secs.\n",
        report)

    # Save Results and Cleanup
    self.save_results("Naive_Bayes_Crossval", report, output_directory)
def testNB(training_data, testing_data):
    """Train NaiveBayes on a copy of the training set, evaluate on a copy of
    the test set, and print accuracy plus per-class precision/recall/F1."""
    train_data = Instances.copy_instances(training_data)
    test_data = Instances.copy_instances(testing_data)
    evaluation = Evaluation(train_data)
    nb = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    nb.build_classifier(train_data)  # build classifier on the training data
    evaluation.test_model(nb, test_data)  # evaluate model on the test set

    # hoist the per-class metrics so each is computed once
    prec0, prec1 = evaluation.precision(0), evaluation.precision(1)
    rec0, rec1 = evaluation.recall(0), evaluation.recall(1)
    fm0, fm1 = evaluation.f_measure(0), evaluation.f_measure(1)

    print("")
    print("")
    print(evaluation.summary("--------------Naive Bayes Evaluation--------------"))
    print("Accuracy: " + str(evaluation.percent_correct))
    print("")
    print("Label\tPrecision\t\tRecall\t\t\tF-Measure")
    print("<=50K\t" + str(prec0) + "\t" + str(rec0) + "\t" + str(fm0))
    print(">50K\t" + str(prec1) + "\t" + str(rec1) + "\t" + str(fm1))
    print("Mean\t" + str((prec1 + prec0) / 2) + "\t" + str((rec1 + rec0) / 2)
          + "\t" + str((fm1 + fm0) / 2))
def naivebay_classifier_weka(data):
    """10-fold cross-validate NaiveBayes on `data` (seed 42), print the
    summary and confusion matrix, and return the classifier object."""
    nb = Classifier("weka.classifiers.bayes.NaiveBayes")
    crossval = Evaluation(data)
    crossval.crossvalidate_model(nb, data, 10, Random(42))
    print(crossval.summary())
    print(crossval.confusion_matrix)
    return nb
def RandomTree(data, rnm):
    # 10-fold cross-validate a RandomTree wrapped in a FilteredClassifier
    # that removes the first attribute, write tree/prediction/evaluation
    # reports to '<rnm>_RT_*.txt', plot the ROC curve, and return the
    # percent-correct as a string.
    # NOTE: Python 2 code (`print >> f` syntax).
    data.class_is_last()
    fc = FilteredClassifier()
    fc.classifier = Classifier(classname="weka.classifiers.trees.RandomTree",
                               options=["-K", "0", "-M", "1.0", "-V", "0.001", "-S", "1"])
    # drop the first attribute before training (presumably an ID column -- confirm)
    fc.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                       options=["-R", "first"])
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV",
                                   options=["-p", "1"])
    folds = 10
    evl = Evaluation(data)
    evl.crossvalidate_model(fc, data, folds, Random(1), pred_output)
    # rebuild on the full dataset so the printed tree reflects all data
    fc.build_classifier(data)
    f0 = open(rnm + '_RT_Tree.txt', 'w')
    print >> f0, "Filename: ", rnm
    print >> f0, '\n\n'
    print >> f0, str(fc)
    f0.close()
    f1 = open(rnm + '_RT_Prediction.txt', 'w')
    print >> f1, 'Filename:', rnm
    print >> f1, 'Prediction Summary:', (pred_output.buffer_content())
    f1.close()
    f2 = open(rnm + '_RT_Evaluation.txt', 'w')
    print >> f2, 'Filename:', rnm
    print >> f2, 'Evaluation Summary:', (evl.summary())
    print >> f2, '\n\n\n'
    print >> f2, (evl.class_details())
    f2.close()
    plot_roc(evl, class_index=[0,1], title=rnm, key_loc='best',
             outfile=rnm+'_RT_ROC.png', wait=False)
    value_RT = str(evl.percent_correct)
    return value_RT
def Boost_J48(data, rnm):
    # 10-fold cross-validate an AdaBoostM1-boosted J48 (wrapped in a
    # FilteredClassifier that removes the first attribute), write
    # tree/prediction/evaluation reports to '<rnm>_Boost_*.txt', plot the
    # ROC curve, and return the percent-correct as a string.
    # NOTE: Python 2 code (`print >> f` syntax).
    data.class_is_last()
    fc1 = FilteredClassifier()
    fc1.classifier = Classifier(classname="weka.classifiers.trees.J48",
                                options=["-C", "0.25", "-M", "2"])
    # drop the first attribute before training (presumably an ID column -- confirm)
    fc1.filter = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                        options=["-R", "first"])
    # AdaBoostM1 with 10 boosting iterations over the filtered J48
    fc2 = SingleClassifierEnhancer(classname="weka.classifiers.meta.AdaBoostM1",
                                   options=["-P", "100", "-S", "1", "-I", "10"])
    fc2.classifier = fc1
    pred_output = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV",
                                   options=["-p", "1"])
    folds = 10
    # build on the full dataset (for the printed model), then cross-validate
    fc2.build_classifier(data)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(fc2, data, folds, Random(1), pred_output)
    f0 = open(rnm + '_Boost_J48_Tree.txt', 'w')
    print >> f0, "Filename: ", rnm
    print >> f0, '\n\n'
    print >> f0, str(fc2)
    f0.close()
    f1 = open(rnm + '_Boost_J48_Prediction.txt', 'w')
    print >> f1, 'Filename:', rnm
    print >> f1, 'Prediction Summary:', (pred_output.buffer_content())
    f1.close()
    f2 = open(rnm + '_Boost_j48_Evaluation.txt', 'w')
    print >> f2, 'Filename:', rnm
    print >> f2, 'Evaluation Summary:', (evaluation.summary())
    print >> f2, '\n\n\n'
    print >> f2, (evaluation.class_details())
    f2.close()
    plot_roc(evaluation, class_index=[0,1], title=rnm, key_loc='best',
             outfile=rnm + '_Boost_J48_ROC.png', wait=False)
    value_Boost_J48 = str(evaluation.percent_correct)
    return value_Boost_J48
def main(args):
    """
    Loads a dataset, shuffles it, splits it into train/test set. Trains J48 with training
    set and evaluates the built model on the test set.

    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """
    # dataset: first CLI argument if supplied, else the bundled vote.arff
    data_file = args[1] if len(args) > 1 else helper.get_data_dir() + os.sep + "vote.arff"
    helper.print_info("Loading dataset: " + data_file)
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(data_file)
    dataset.class_is_last()

    # generate train/test split of randomized data
    train, test = dataset.train_test_split(66.0, Random(1))

    # build classifier
    j48 = Classifier(classname="weka.classifiers.trees.J48")
    j48.build_classifier(train)
    print(j48)

    # evaluate
    evaluation = Evaluation(train)
    evaluation.test_model(j48, test)
    print(evaluation.summary())
def main():
    """
    Shows how to use the CostSensitiveClassifier.
    """
    # load a dataset
    data_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + data_file)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(data_file)
    dataset.class_is_last()

    # cost-sensitive meta-classifier with a custom 2:1 cost matrix,
    # wrapping a J48 base learner
    meta = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.CostSensitiveClassifier",
        options=["-cost-matrix", "[0 1; 2 0]", "-S", "2"])
    meta.classifier = Classifier(classname="weka.classifiers.trees.J48",
                                 options=["-C", "0.3"])

    folds = 10
    evaluation = Evaluation(dataset)
    evaluation.crossvalidate_model(meta, dataset, folds, Random(1))
    print("")
    print("=== Setup ===")
    print("Classifier: " + meta.to_commandline())
    print("Dataset: " + dataset.relationname)
    print("")
    print(evaluation.summary("=== " + str(folds) + " -fold Cross-Validation ==="))
def main():
    """
    Shows how to use the CostSensitiveClassifier.
    """
    # load a dataset (note: this function duplicates the previous example)
    path = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + path)
    loader = Loader("weka.core.converters.ArffLoader")
    instances = loader.load_file(path)
    instances.class_is_last()

    # CostSensitiveClassifier with a 2:1 cost matrix around a J48 base learner
    cs_classifier = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.CostSensitiveClassifier",
        options=["-cost-matrix", "[0 1; 2 0]", "-S", "2"])
    cs_classifier.classifier = Classifier(
        classname="weka.classifiers.trees.J48", options=["-C", "0.3"])

    folds = 10
    evl = Evaluation(instances)
    evl.crossvalidate_model(cs_classifier, instances, folds, Random(1))

    print("")
    print("=== Setup ===")
    print("Classifier: " + cs_classifier.to_commandline())
    print("Dataset: " + instances.relationname)
    print("")
    print(evl.summary("=== " + str(folds) + " -fold Cross-Validation ==="))
def train_weka_model(self, training_data_dir, save_model_dir, log_file, mimic_env=None):
    """Train an M5P model tree on a CSV dataset and score it on its own
    training data.

    Loads the CSV (last column is the class/target), builds the M5P tree
    with `self.options`, estimates the leaf count from the model's
    dot-graph text, serializes the model to `save_model_dir`, groups the
    training predictions by predicted value, asks `mimic_env` for return
    metrics on that grouping, and parses regression errors out of WEKA's
    textual summary.
    NOTE(review): `mimic_env` defaults to None but is dereferenced
    unconditionally below -- callers appear to always pass one.
    """
    loader = Loader(classname="weka.core.converters.CSVLoader")
    training_data = loader.load_file(training_data_dir)
    training_data.class_is_last()
    self.classifier = Classifier(classname="weka.classifiers.trees.M5P",
                                 options=self.options)
    # classifier help, check https://weka.sourceforge.io/doc.dev/weka/classifiers/trees/M5P.html
    self.classifier.build_classifier(training_data)
    # print(classifier)
    graph = self.classifier.graph
    # assumes the third-to-last dot-graph line starts with the highest node
    # id 'N<k>'; leaves are approximated as k/2 -- confirm against output
    node_number = float(graph.split('\n')[-3].split()[0].replace('N', ''))
    leaves_number = node_number / 2
    serialization.write(save_model_dir, self.classifier)
    # print('Leaves number is {0}'.format(leave_number), file=log_file)
    evaluation = Evaluation(training_data)
    predicts = evaluation.test_model(self.classifier, training_data)
    # return_value = None
    # if mimic_env is not None:
    # group instance indices by their predicted value (one group per leaf)
    predict_dictionary = {}
    for predict_index in range(len(predicts)):
        predict_value = predicts[predict_index]
        if predict_value in predict_dictionary.keys():
            predict_dictionary[predict_value].append(predict_index)
        else:
            predict_dictionary.update({predict_value: [predict_index]})
    # return_value = mimic_env.get_return(state=list(predict_dictionary.values()))
    return_value_log = mimic_env.get_return(
        state=list(predict_dictionary.values()))
    return_value_log_struct = mimic_env.get_return(
        state=list(predict_dictionary.values()), apply_structure_cost=True)
    return_value_var_reduction = mimic_env.get_return(
        state=list(predict_dictionary.values()), apply_variance_reduction=True)
    # print("Training return is {0}".format(return_value), file=log_file)
    # parse correlation/MAE/RMSE/RAE/RRSE from WEKA's fixed summary layout
    summary = evaluation.summary()
    numbers = summary.split('\n')
    corr = float(numbers[1].split()[-1])
    mae = float(numbers[2].split()[-1])
    rmse = float(numbers[3].split()[-1])
    rae = float(numbers[4].split()[-2]) / 100
    rrse = float(numbers[5].split()[-2]) / 100
    # print(evl)
    # print("Training summary is "+summary, file=log_file)
    return return_value_log, return_value_log_struct, \
        return_value_var_reduction, mae, rmse, leaves_number
def run_bayesNet(file):
    """Run a TAN-structured BayesNet over an ARFF file with 10-fold CV.

    Prints the evaluation summary, class details and confusion matrix, and
    saves evaluation/prediction reports into a 'bayesNet_results' folder
    next to the input file. `file` is a pathlib.Path; non-ARFF files are
    skipped.
    """
    # Get filename from Pathlib object
    filename = file.parts[-1]
    out_dir = file.parents[0]  # renamed from `dir`: avoid shadowing the builtin
    print("Running BayesNet on %s" % filename)
    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return

    # Removes '.arff' from filename
    filename_base = filename[:-5]

    # Load data with class as first attr
    data = load_Arff_file(file)
    data.class_is_first()

    # Use BayesNet and set options: TAN search, SimpleEstimator with alpha=0.5
    cls = Classifier(classname="weka.classifiers.bayes.BayesNet", options=[
        "-D", "-Q", "weka.classifiers.bayes.net.search.local.TAN", "--",
        "-P", "1", "-S", "BAYES", "-E",
        "weka.classifiers.bayes.net.estimate.SimpleEstimator", "--", "-A", "0.5"
    ])

    # Predictions stored in pout
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")

    # Evaluate data
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(cls, data, 10, Random(1), output=pout)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.confusion_matrix)

    # Generate grid for ROC
    # plcls.plot_roc(evaluation, class_index=[0,1], wait=True)

    # mk dirs for output
    out_dir = out_dir / "bayesNet_results"
    out_dir.mkdir(parents=True, exist_ok=True)

    # Save summary, class details and confusion matrix to file
    result_output = filename_base + "_bayesNet_eval_results_TAN.txt"
    output_eval(evaluation, out_dir / result_output)

    # Save the predicited results to file
    prediction_output = filename_base + "_bayesNet_pred_results_TAN.txt"
    output_pred(pout, out_dir / prediction_output)

    print("BayesNet complete")
def e_model_tree():
    """Train and evaluate an LMT (logistic model tree) classifier.

    Loads pre-saved train/test CSVs (class column first), 5-fold
    cross-validates on the training set, then evaluates on the held-out
    test set, plotting and saving ROC curves for both phases.
    """
    # train_data, test_data = b_i_impute_data()
    # train_data.to_csv("./train_data.csv", index=False)
    # test_data.to_csv("./test_data.csv",index=False)
    jvm.start()
    train_data = converters.load_any_file("train_data.csv")
    train_data.class_is_first()
    test_data = converters.load_any_file("test_data.csv")
    test_data.class_is_first()
    print("1")
    cls = Classifier(classname="weka.classifiers.trees.LMT")
    print("2")
    cls.build_classifier(train_data)
    print("3")
    # 5-fold cross-validation on the training data
    evl = Evaluation(train_data)
    evl.crossvalidate_model(cls, train_data, 5, Random(1))
    print("Train Accuracy:", evl.percent_correct)
    print("Train summary")
    print(evl.summary())
    print("Train class details")
    print(evl.class_details())
    print("Train confusion matrix")
    print(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Train ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/e_train_roc_curve.png")
    # fresh Evaluation object for the held-out test set
    evl = Evaluation(test_data)
    evl.test_model(cls, test_data)
    print("Test Accuracy:", evl.percent_correct)
    print("Test summary")
    print(evl.summary())
    print(" Testclass details")
    print(evl.class_details())
    print("Testconfusion matrix")
    print(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Test ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/e_test_roc_curve.png")
def main(args):
    """
    Loads a dataset, shuffles it, splits it into train/test set. Trains J48 with training
    set and evaluates the built model on the test set.
    The predictions get recorded in two different ways:
    1. in-memory via the test_model method
    2. directly to file (more memory efficient), but a separate run of making predictions

    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """
    # dataset: first CLI argument if supplied, else the bundled vote.arff
    data_file = args[1] if len(args) > 1 else helper.get_data_dir() + os.sep + "vote.arff"
    helper.print_info("Loading dataset: " + data_file)
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(data_file)
    dataset.class_is_last()

    # generate train/test split of randomized data
    train, test = dataset.train_test_split(66.0, Random(1))

    # build classifier
    j48 = Classifier(classname="weka.classifiers.trees.J48")
    j48.build_classifier(train)
    print(j48)

    # evaluate and record predictions in memory
    helper.print_title("recording predictions in-memory")
    mem_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-distribution"])
    evaluation = Evaluation(train)
    evaluation.test_model(j48, test, output=mem_output)
    print(evaluation.summary())
    helper.print_info("Predictions:")
    print(mem_output.buffer_content())

    # record/output predictions separately
    helper.print_title("recording/outputting predictions separately")
    outputfile = helper.get_tmp_dir() + "/j48_vote.csv"
    file_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV",
        options=["-distribution", "-suppress", "-file", outputfile])
    file_output.header = test
    file_output.print_all(j48, test)
    helper.print_info("Predictions stored in:" + outputfile)
    # by using "-suppress" we don't store the output in memory, the following statement won't output anything
    print(file_output.buffer_content())
def test_weka_model(self, testing_data_dir, save_model_dir, log_file, mimic_env=None):
    """Load a serialized WEKA tree model and score it on a CSV test set.

    Mirrors train_weka_model's evaluation: estimates the leaf count from
    the model's dot-graph text, groups test predictions by predicted value,
    asks `mimic_env` for return metrics on that grouping, and parses
    regression errors from WEKA's textual summary.
    NOTE(review): `mimic_env` defaults to None but is dereferenced
    unconditionally below -- callers appear to always pass one.
    """
    self.classifier = Classifier(
        jobject=serialization.read(save_model_dir))
    graph = self.classifier.graph
    # assumes the third-to-last dot-graph line starts with the highest node
    # id 'N<k>'; leaves are approximated as k/2 -- confirm against output
    node_number = float(graph.split('\n')[-3].split()[0].replace('N', ''))
    leaves_number = node_number / 2
    # re-serializes the (unchanged) model back to the same path
    serialization.write(save_model_dir, self.classifier)
    # print('Leaves number is {0}'.format(leave_number), file=log_file)
    loader = Loader(classname="weka.core.converters.CSVLoader")
    testing_data = loader.load_file(testing_data_dir)
    testing_data.class_is_last()
    evaluation = Evaluation(testing_data)
    predicts = evaluation.test_model(self.classifier, testing_data)
    # group instance indices by their predicted value (one group per leaf)
    predict_dictionary = {}
    for predict_index in range(len(predicts)):
        predict_value = predicts[predict_index]
        if predict_value in predict_dictionary.keys():
            predict_dictionary[predict_value].append(predict_index)
        else:
            predict_dictionary.update({predict_value: [predict_index]})
    return_value_log = mimic_env.get_return(
        state=list(predict_dictionary.values()))
    return_value_log_struct = mimic_env.get_return(
        state=list(predict_dictionary.values()), apply_structure_cost=True)
    return_value_var_reduction = mimic_env.get_return(
        state=list(predict_dictionary.values()), apply_variance_reduction=True)
    # parse correlation/MAE/RMSE/RAE/RRSE from WEKA's fixed summary layout
    summary = evaluation.summary()
    numbers = summary.split('\n')
    corr = float(numbers[1].split()[-1])
    mae = float(numbers[2].split()[-1])
    rmse = float(numbers[3].split()[-1])
    rae = float(numbers[4].split()[-2]) / 100
    rrse = float(numbers[5].split()[-2]) / 100
    # print(evl)
    # print("Testing summary is "+summary, file=log_file)
    return return_value_log, return_value_log_struct, \
        return_value_var_reduction, mae, rmse, leaves_number
def trainAndMakePred(train, test):
    """Train IBk (k=5) and NaiveBayes (-D) on `train`, print training
    summaries, predict on `test`, and write both prediction columns to
    predict.csv with IDs starting at 901."""
    # IBK test and prediction
    classifierIBK = Classifier(classname="weka.classifiers.lazy.IBk", options=["-K", "5"])
    classifierIBK.build_classifier(train)
    evaluationIBK = Evaluation(train)
    evaluationIBK.test_model(classifierIBK, train)
    print(" IBKTraining information ")
    print(evaluationIBK.summary())
    pred_outputIBK = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV")
    evaluationIBK = Evaluation(test)
    predicted_indicesIBK = evaluationIBK.test_model(classifierIBK, test, pred_outputIBK)
    print(" IBK Prediction information ")
    print(pred_outputIBK)

    # Naive bayes and prediction
    classifierNB = Classifier(classname="weka.classifiers.bayes.NaiveBayes", options=["-D"])
    classifierNB.build_classifier(train)
    evaluationNB = Evaluation(train)
    evaluationNB.test_model(classifierNB, train)
    print(" Naive Bayes Training information ")
    print(evaluationNB.summary())
    pred_outputNB = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.CSV")
    evaluationNB = Evaluation(test)
    predicted_indicesNB = evaluationNB.test_model(classifierNB, test, pred_outputNB)
    print(" Naive Bayes Prediction information ")
    print(pred_outputNB)

    # output predictions to file; `with` closes the handle even on error
    # (removed the unused local `a` and the hand-rolled ID counter)
    with open("predict.csv", "w") as f:
        f.write("ID,Predict 1,Predict 2\n")
        for ID, (pred1, pred2) in enumerate(
                zip(predicted_indicesIBK, predicted_indicesNB), start=901):
            f.write("%s,%s,%s\n" % (ID, pred1, pred2))
def CV5x2(dataset, algo, num_datasets):
    """2-fold cross-validate classifier `algo` on the ARFF `dataset`
    (seed 5), print summary/confusion matrix/AUC, and return the AUC for
    class index 1. `num_datasets` is accepted for interface compatibility
    but unused."""
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")
    instances = arff_loader.load_file(dataset)
    instances.class_is_last()
    model = Classifier(classname=algo)
    evaluator = Evaluation(instances)
    evaluator.crossvalidate_model(model, instances, 2, Random(5))
    print(evaluator.summary("=== " + str(algo) + " on" + str(dataset) + " ===", False))
    print(evaluator.matrix("=== on click prediction(confusion matrix) ==="))
    auc = evaluator.area_under_roc(1)
    print("For Algo" + str(algo) + "areaUnderROC/1: for CV5x2 " + str(auc))
    return auc
def crossValidate(self, arrfFile = None, classname="weka.classifiers.trees.J48", options=["-C", "0.3"]):
    # 10-fold cross-validate the given WEKA classifier on self.data,
    # printing percent-correct, summary and per-class details.
    # NOTE: Python 2 code (print statement).
    # NOTE(review): the mutable default `options=[...]` is shared across
    # calls -- safe only while callers never mutate it.
    if arrfFile is not None:
        self.initData( arrfFile )
    if self.data is None:
        return
    print 'Classificador ' + str(classname) + ' ' + ' '.join(options)
    cls = Classifier(classname=classname, options=options)
    evl = Evaluation(self.data)
    evl.crossvalidate_model(cls, self.data, 10, Random(1))
    print(evl.percent_correct)
    print(evl.summary())
    print(evl.class_details())
def SimpleLogistic():
    """10-fold cross-validate SimpleLogistic (seed 486) on the first-trial
    classification ARFF, print results, and serialize the classifier."""
    # load a dataset
    arff = Loader(classname="weka.core.converters.ArffLoader")
    data = arff.load_file("First_trial_classification.arff")
    data.class_is_last()  # set class attribute

    model = Classifier(classname="weka.classifiers.functions.SimpleLogistic")
    predictions = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")

    crossval = Evaluation(data)
    crossval.crossvalidate_model(model, data, 10, Random(486), predictions)
    print(crossval.summary())
    print(predictions.buffer_content())

    # save model
    serialization.write_all("SimpleLogistic2.model", model)
def cross_validate(self, detail = True): """Perform cross validation using trained data. Parameters ---------- detail : boolean, optional, default = True If true return a detailed information of cross validation. Returns ------- info : string Info with results of cross validation. """ #print 'cross_validation' start_time = TimeUtils.get_time() info = "Scheme:\t%s %s\n" % (str(self.classifier.classname) , " ".join([str(option) for option in self.classifier.options])) if detail == True: info += "Relation:\t%s\n" % (self.data.relationname) info += "Instances:\t%d\n" % (self.data.num_instances) info += "Attributes:\t%d\n\n" % (self.data.num_attributes) evl = WEvaluation(self.data) evl.crossvalidate_model(self.classifier, self.data, 10, WRandom(1)) if detail == False: info += "Correctly Classified Instances: %0.4f%%\n" % (evl.percent_correct) info += "Time taken to build model: %0.5f seconds\n\n" % (TimeUtils.get_time() - start_time) #info += str(evl.percent_correct) + "\n\n" if detail == True: info += "=== Stratified cross-validation ===\n" info += evl.summary() + "\n\n" info += str(evl.class_details()) + "\n\n" classes = [str(self.data.class_attribute.value(i)) for i in range(0, self.data.class_attribute.num_values)] cm = evl.confusion_matrix info += Classifier.confusion_matrix(classes, cm) return info
def run_bayes_hill_split(self, output_directory, parents=1):
    """Train a BayesNet (HillClimber search, up to `parents` parents per
    node) on the training split, evaluate it on the test split, and save
    both the textual results and the network graph."""
    # build classifier
    print("\nBuilding Bayes Classifier on training data. Parents = " + str(parents) + "\n")
    started = time.time()
    bayes = Classifier(
        classname="weka.classifiers.bayes.BayesNet",
        options=[
            "-D", "-Q",
            "weka.classifiers.bayes.net.search.local.HillClimber", "--",
            "-P", "" + str(parents), "-S", "BAYES", "-E",
            "weka.classifiers.bayes.net.estimate.SimpleEstimator", "--",
            "-A", "0.5"
        ])
    bayes.build_classifier(self.training_data)
    report = ""
    report = self.print_both(str(bayes), report)
    report = self.print_both(
        "Bayes Split Classifier Built in " + str(time.time() - started) + " secs.\n",
        report)

    # Evaluate Classifier
    report = self.print_both("\nEvaluating on test data.", report)
    started = time.time()
    evaluation = Evaluation(self.training_data)
    evaluation.test_model(bayes, self.testing_data)
    for section in (evaluation.summary(), evaluation.class_details(),
                    evaluation.confusion_matrix):
        report = self.print_both(str(section), report)
    report = self.print_both(
        "\nBayes Split Classifier Evaluated in " + str(time.time() - started) + " secs.\n",
        report)

    # Save Results and Cleanup
    self.save_results("Bayes_Hill_P" + str(parents) + "_", report, output_directory)
    self.save_results("Bayes_Hill_P" + str(parents) + "_Graph", bayes.graph,
                      output_directory, True)
def run_crossval(self, output_directory, classifier_name, classifier_weka_spec, options_list):
    """Build the given WEKA classifier, 10-fold cross-validate it on the
    training data, and save the collected output under a name derived from
    the classifier and its options."""
    # build classifier
    print("\nBuilding " + classifier_name + " Classifier on training data.")
    started = time.time()
    model = Classifier(classname=classifier_weka_spec, options=options_list)
    model.build_classifier(self.training_data)
    report = ""
    report = self.print_both(str(model), report)
    report = self.print_both(
        classifier_name + " Cross Eval Classifier Built in " + str(time.time() - started) + " secs.\n",
        report)

    # Evaluate Classifier
    report = self.print_both("\nCross Evaluating on test data.", report)
    started = time.time()
    evaluation = Evaluation(self.training_data)
    evaluation.crossvalidate_model(model, self.training_data, 10, Random(1))
    report = self.print_both(str(evaluation.summary()), report)
    report += "\n"
    report = self.print_both(str(evaluation.class_details()), report)
    report += "\n"
    report = self.print_both(str(evaluation.confusion_matrix), report)
    report = self.print_both(
        "\n\n" + classifier_name + " Cross Eval Classifier Evaluated in " + str(time.time() - started) + " secs.\n",
        report)

    # option tokens joined; '.' and '-' both end up as '_' in the file name
    options_string = "".join(str(option) for option in options_list)
    options_string = options_string.replace(".", "-").replace("-", "_")

    # Save Results and Cleanup
    self.save_results(classifier_name + options_string + "_Crossval", report, output_directory)
def use_classifier(data):
    """
    Uses the meta-classifier AttributeSelectedClassifier for attribute selection.

    :param data: the dataset to use
    :type data: Instances
    """
    print("\n1. Meta-classifier")
    meta = Classifier(classname="weka.classifiers.meta.AttributeSelectedClassifier")
    subset_eval = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval")
    greedy_search = ASSearch(classname="weka.attributeSelection.GreedyStepwise",
                             options=["-B"])
    base = Classifier(classname="weka.classifiers.trees.J48")
    # setting nested options is always a bit tricky, getting all the escaped double quotes right
    # simply using the bean property for setting Java objects is often easier and less error prone
    meta.set_property("classifier", base.jobject)
    meta.set_property("evaluator", subset_eval.jobject)
    meta.set_property("search", greedy_search.jobject)
    crossval = Evaluation(data)
    crossval.crossvalidate_model(meta, data, 10, Random(1))
    print(crossval.summary())
def HOV(dataset, algo, num_datasets):
    """70/30 hold-out validation of classifier `algo` on the ARFF `dataset`
    (split seed 10); prints summary/confusion matrix/AUC and returns the
    AUC for class index 1. `num_datasets` is accepted for interface
    compatibility but unused."""
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")
    instances = arff_loader.load_file(dataset)
    instances.class_is_last()
    train, test = instances.train_test_split(70.0, Random(10))
    model = Classifier(classname=algo)
    model.build_classifier(train)
    evaluator = Evaluation(train)
    evaluator.test_model(model, test)
    print(evaluator.summary("=== " + str(algo) + " on" + str(dataset) + " ===", False))
    print(evaluator.matrix("=== on click prediction(confusion matrix) ==="))
    auc = evaluator.area_under_roc(1)
    print("For Algo" + str(algo) + "areaUnderROC/1: for HOV " + str(auc))
    return auc
def crossValidate(self, arrfFile=None, classname="weka.classifiers.trees.J48", options=["-C", "0.3"]):
    # 10-fold cross-validate the given WEKA classifier on self.data,
    # printing percent-correct, summary and per-class details.
    # (Duplicate of the earlier crossValidate with different spacing.)
    # NOTE: Python 2 code (print statement).
    # NOTE(review): the mutable default `options=[...]` is shared across
    # calls -- safe only while callers never mutate it.
    if arrfFile is not None:
        self.initData(arrfFile)
    if self.data is None:
        return
    print 'Classificador ' + str(classname) + ' ' + ' '.join(options)
    cls = Classifier(classname=classname, options=options)
    evl = Evaluation(self.data)
    evl.crossvalidate_model(cls, self.data, 10, Random(1))
    print(evl.percent_correct)
    print(evl.summary())
    print(evl.class_details())
def TrainingModel(arff, modelOutput, clsfier):
    # Train a classifier on an ARFF training set (class attribute first),
    # 10-fold cross-validate it, print sensitivity/specificity/PPV/NPV
    # derived from the confusion matrix, and serialize the model.
    # Start the Java virtual machine
    jvm.start()
    # Load the training set
    loader = Loader(classname="weka.core.converters.ArffLoader")
    train = loader.load_file(arff)
    train.class_is_first()
    # Train with the chosen algorithm; RandomForest gave the best TPR/TNR
    # among the methods tried in the WEKA GUI (per the original author)
    cls_name = "weka.classifiers." + clsfier
    clsf = Classifier(classname=cls_name)
    clsf.build_classifier(train)
    print(clsf)
    # Build the model wrapper and run 10-fold cross-validation
    fc = FilteredClassifier()
    fc.classifier = clsf
    evl = Evaluation(train)
    evl.crossvalidate_model(fc, train, 10, Random(1))
    print(evl.percent_correct)
    print(evl.summary())
    print(evl.class_details())
    print(evl.matrix())
    # Aggregate results; assumes row/column 0 is the negative class -- confirm
    matrixResults = evl.confusion_matrix
    TN = float(matrixResults[0][0])
    FP = float(matrixResults[0][1])
    FN = float(matrixResults[1][0])
    TP = float(matrixResults[1][1])
    TPR = TP / (TP + FN)  # sensitivity
    TNR = TN / (FP + TN)  # specificity
    PPV = TP / (TP + FP)
    NPV = TN / (TN + FN)
    print("算法: " + clsfier)
    print("敏感度 TPR: " + str(TPR))
    print("特异度 TNR: " + str(TNR))
    print("PPV: " + str(PPV))
    print("NPV: " + str(NPV))
    # Save the model together with the training header
    clsf.serialize(modelOutput, header=train)
    # Shut down the JVM
    jvm.stop()
    print("分析模型建立完成")
def SMOreg():
    """Cross-validate an SMOreg regressor with an RBF kernel and save it.

    Loads "First_trial_regression.arff", runs 10-fold cross-validation with
    seed 486, prints the summary plus the per-instance predictions, and
    serializes the classifier to "SMOreg.model2".
    """
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file("First_trial_regression.arff")
    dataset.class_is_last()
    # SMOreg with epsilon parameter -N 0 and an RBF kernel (gamma 0.2)
    regressor = KernelClassifier(classname="weka.classifiers.functions.SMOreg",
                                 options=["-N", "0"])
    rbf = Kernel(
        classname="weka.classifiers.functions.supportVector.RBFKernel",
        options=["-G", "0.2"])
    regressor.kernel = rbf
    predictions = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    evaluator = Evaluation(dataset)
    evaluator.crossvalidate_model(regressor, dataset, 10, Random(486), predictions)
    print(evaluator.summary())
    print(predictions.buffer_content())
    # save model
    serialization.write_all("SMOreg.model2", regressor)
def DecisionTree(rnd_data, folds, seed, data):
    """Manually cross-validate a J48 decision tree over *rnd_data*.

    rnd_data: the (already randomized) Instances to evaluate on.
    folds: number of cross-validation folds.
    seed: randomization seed — only reported in the output here; the
        shuffling is assumed to have happened before this call.
    data: original Instances, used only for its relation name in the report.

    Prints a summary of the accumulated fold-by-fold evaluation.
    """
    data_size = rnd_data.num_instances
    # base fold size; the last fold absorbs any remainder (see below)
    fold_size = math.floor(data_size / folds)
    # cross-validation
    evaluation = Evaluation(rnd_data)
    for i in range(folds):
        this_fold = fold_size
        test_start = i * fold_size
        test_end = (test_start + fold_size)
        # if fewer than fold_size instances remain after this fold,
        # extend the current test fold to the end of the dataset
        if ((data_size - test_end) / fold_size < 1):
            this_fold = data_size - test_start
        test = Instances.copy_instances(rnd_data, test_start, this_fold)  # generate validation fold
        if i == 0:
            # first fold: training set is everything after the test slice
            train = Instances.copy_instances(rnd_data, test_end, data_size - test_end)
        else:
            # otherwise: concatenate the parts before and after the test slice
            train_1 = Instances.copy_instances(rnd_data, 0, test_start)
            train_2 = Instances.copy_instances(rnd_data, test_end, data_size - test_end)
            train = Instances.append_instances(
                train_1, train_2)  # generate training fold
        # build and evaluate classifier
        cls = Classifier(classname="weka.classifiers.trees.J48")
        cls.build_classifier(train)  # build classifier on training set
        evaluation.test_model(cls, test)  # test classifier on validation/test set
    print("")
    print("=== Decision Tree ===")
    print("Classifier: " + cls.to_commandline())
    print("Dataset: " + data.relationname)
    print("Folds: " + str(folds))
    print("Seed: " + str(seed))
    print("")
    print(
        evaluation.summary("=== " + str(folds) + "-fold Cross-Validation ==="))
def run_ibk_crossval(self, output_directory):
    """Build an IBk (k=3) classifier, 10-fold cross-validate it on the
    training data, and persist the captured output via save_results().

    output_directory: directory handed to save_results() for the report.
    """
    # --- build phase ---
    print("\nBuilding Classifier on training data.")
    started = time.time()
    learner = Classifier(
        classname="weka.classifiers.lazy.IBk",
        options=[
            "-K", "3", "-W", "0", "-A",
            "weka.core.neighboursearch.LinearNNSearch -A \"weka.core.EuclideanDistance -R first-last\""
        ])
    learner.build_classifier(self.training_data)
    report = self.print_both(str(learner), "")
    report = self.print_both(
        "IBK Cross Eval Classifier Built in " + str(time.time() - started) + " secs.\n",
        report)
    # --- evaluation phase (10-fold CV on the training data) ---
    report = self.print_both("\nCross Evaluating on test data.", report)
    started = time.time()
    evaluator = Evaluation(self.training_data)
    evaluator.crossvalidate_model(learner, self.training_data, 10, Random(1))
    for section in (evaluator.summary(), evaluator.class_details(),
                    evaluator.confusion_matrix):
        report = self.print_both(str(section), report)
    report = self.print_both(
        "\nIBK Cross Eval Classifier Evaluated in " + str(time.time() - started) + " secs.\n",
        report)
    # --- persist results ---
    self.save_results("IBK_Crossval", report, output_directory)
from weka.classifiers import Classifier, Evaluation

# Script: compare OneR on the weather.nominal dataset before and after
# removing the "outlook" attribute.
# NOTE(review): data_dir, os, Loader and Random come from earlier in the file.
jvm.start()

# load weather.nominal
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.class_is_last()

# perform 10-fold cross-validation
cls = Classifier(classname="weka.classifiers.rules.OneR")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("10-fold cross-validation (full):\n" + evl.summary())
cls.build_classifier(data)
print("Model:\n\n" + str(cls))

# remove attribute "outlook"
print("Removing attribute 'outlook'")
data.delete_attribute(data.attribute_by_name("outlook").index)

# perform 10-fold cross-validation (reduced dataset)
cls = Classifier(classname="weka.classifiers.rules.OneR")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("10-fold cross-validation (without 'outlook'):\n" + evl.summary())
cls.build_classifier(data)
print("Model:\n\n" + str(cls))
def runner(self, cdat, heap_size = 16384, seed = None, verbose = True):
    """Execute the configured ML pipeline end to end.

    Stages: mark input/preprocess stages, write the dataset as ARFF, start
    the JVM, split into train/test folds, run the configured
    feature-selection combinations, then build/evaluate/serialize each
    configured model, collecting summaries and base64-encoded plots into
    self.gist, which is finally written to disk as JSON.

    cdat: parsed input dataset object (currently only used by the disabled
        toARFF call below).
    heap_size: JVM maximum heap size in megabytes.
    seed: RNG seed for the stratified fold split; random when None.
    verbose: forwarded flag (unused while toARFF is commented out).
    """
    self.set_status(Pipeline.RUNNING)
    self.logs.append('Initializing Pipeline')
    para = self.config
    self.logs.append('Reading Pipeline Configuration')
    head = ''
    name = get_rand_uuid_str()
    self.logs.append('Reading Input File')
    # mark the data-file / preprocessing stages as running and pick up the
    # input file's directory and base name
    for i, stage in enumerate(self.stages):
        if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
            self.stages[i].status = Pipeline.RUNNING
        if stage.code == 'dat.fle':
            head = os.path.abspath(stage.value.path)
            name, _ = os.path.splitext(stage.value.name)
    self.logs.append('Parsing to ARFF')
    path = os.path.join(head, '{name}.arff'.format(name = name))
    # This bug, I don't know why, using Config.schema instead.
    # cdat.toARFF(path, express_config = para.Preprocess.schema, verbose = verbose)
    for i, stage in enumerate(self.stages):
        if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
            self.stages[i].status = Pipeline.COMPLETE
    self.logs.append('Saved ARFF at {path}'.format(path = path))
    self.logs.append('Splitting to Training and Testing Sets')
    JVM.start(max_heap_size = '{size}m'.format(size = heap_size))
    load = Loader(classname = 'weka.core.converters.ArffLoader')
    # data = load.load_file(path)
    # save = Saver(classname = 'weka.core.converters.ArffSaver')
    data = load.load_file(os.path.join(head, 'iris.arff')) # For Debugging Purposes Only
    data.class_is_last() # For Debugging Purposes Only
    # data.class_index = cdat.iclss
    for i, stage in enumerate(self.stages):
        if stage.code == 'prp.kcv':
            self.stages[i].status = Pipeline.RUNNING
    self.logs.append('Splitting Training Set')
    # TODO - Check if this seed is worth it.
    seed = assign_if_none(seed, random.randint(0, 1000))
    opts = ['-S', str(seed), '-N', str(para.Preprocess.FOLDS)]
    # StratifiedRemoveFolds with -V keeps the inverse selection: training set
    wobj = Filter(classname = 'weka.filters.supervised.instance.StratifiedRemoveFolds', options = opts + ['-V'])
    wobj.inputformat(data)
    tran = wobj.filter(data)
    self.logs.append('Splitting Testing Set')
    # same filter without -V: the selected fold becomes the testing set
    wobj.options = opts
    test = wobj.filter(data)
    for i, stage in enumerate(self.stages):
        if stage.code == 'prp.kcv':
            self.stages[i].status = Pipeline.COMPLETE
    self.logs.append('Performing Feature Selection')
    feat = [ ]
    for comb in para.FEATURE_SELECTION:
        if comb.USE:
            for i, stage in enumerate(self.stages):
                if stage.code == 'ats':
                    search = stage.value.search.name
                    evaluator = stage.value.evaluator.name
                    if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                        self.stages[i].status = Pipeline.RUNNING
            # NOTE(review): the options= keyword below is passed to
            # str.format(), not to ASSearch/ASEvaluation — the configured
            # search/evaluator options are silently ignored. Confirm intent.
            srch = ASSearch(classname = 'weka.attributeSelection.{classname}'.format(
                classname = comb.Search.NAME,
                options = assign_if_none(comb.Search.OPTIONS, [ ])
            ))
            ewal = ASEvaluation(classname = 'weka.attributeSelection.{classname}'.format(
                classname = comb.Evaluator.NAME,
                options = assign_if_none(comb.Evaluator.OPTIONS, [ ])
            ))
            attr = AttributeSelection()
            attr.search(srch)
            attr.evaluator(ewal)
            attr.select_attributes(tran)
            meta = addict.Dict()
            meta.search = comb.Search.NAME
            meta.evaluator = comb.Evaluator.NAME
            meta.features = [tran.attribute(index).name for index in attr.selected_attributes]
            feat.append(meta)
            for i, stage in enumerate(self.stages):
                if stage.code == 'ats':
                    search = stage.value.search.name
                    evaluator = stage.value.evaluator.name
                    if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                        self.stages[i].status = Pipeline.COMPLETE
    models = [ ]
    for model in para.MODEL:
        if model.USE:
            summary = addict.Dict()
            self.logs.append('Modelling {model}'.format(model = model.LABEL))
            summary.label = model.LABEL
            summary.name = model.NAME
            summary.options = assign_if_none(model.OPTIONS, [ ])
            for i, stage in enumerate(self.stages):
                if stage.code == 'lrn' and stage.value.name == model.NAME:
                    self.stages[i].status = Pipeline.RUNNING
            # NOTE(review): this loop only needs the class count once; it
            # recomputes iclass for every instance.
            for i, instance in enumerate(data):
                iclass = list(range(instance.num_classes))
            options = assign_if_none(model.OPTIONS, [ ])
            classifier = Classifier(classname = 'weka.classifiers.{classname}'.format(classname = model.NAME), options = options)
            classifier.build_classifier(tran)
            serializer.write(os.path.join(head, '{name}.{classname}.model'.format(
                name = name,
                classname = model.NAME
            )), classifier)
            self.logs.append('Testing model {model}'.format(model = model.LABEL))
            evaluation = Evaluation(tran)
            evaluation.test_model(classifier, test)
            summary.summary = evaluation.summary()
            frame = pd.DataFrame(data = evaluation.confusion_matrix)
            axes = sns.heatmap(frame, cbar = False, annot = True)
            b64str = get_b64_plot(axes)
            summary.confusion_matrix = addict.Dict({
                'value': evaluation.confusion_matrix.tolist(),
                'plot': b64str
            })
            self.logs.append('Plotting Learning Curve for {model}'.format(model = model.LABEL))
            buffer = io.BytesIO()
            plot_classifier_errors(evaluation.predictions, tran, test, outfile = buffer, wait = False)
            b64str = buffer_to_b64(buffer)
            summary.learning_curve = b64str
            buffer = io.BytesIO()
            plot_roc(evaluation, class_index = iclass, outfile = buffer, wait = False)
            b64str = buffer_to_b64(buffer)
            summary.roc_curve = b64str
            buffer = io.BytesIO()
            plot_prc(evaluation, class_index = iclass, outfile = buffer, wait = False)
            b64str = buffer_to_b64(buffer)
            summary.prc_curve = b64str
            if classifier.graph:
                summary.graph = classifier.graph
            # NOTE(review): per-instance predictions are computed but the
            # result is discarded each iteration — confirm whether this is
            # intentional (e.g. a leftover debugging loop).
            for i, instance in enumerate(test):
                prediction = classifier.classify_instance(instance)
            for i, stage in enumerate(self.stages):
                if stage.code == 'lrn' and stage.value.name == model.NAME:
                    self.stages[i].status = Pipeline.COMPLETE
            models.append(summary)
    self.gist.models = models
    JVM.stop()
    JSON.write(os.path.join(head, '{name}.cgist'.format(name = name)), self.gist)
    self.logs.append('Pipeline Complete')
    self.set_status(Pipeline.COMPLETE)
# Script: 10-fold cross-validation of NaiveBayes on weather.nominal with
# per-instance prediction output and an ROC plot.
# Dataset directory comes from $WEKAMOOC_DATA, falling back to ./data.
data_dir = os.environ.get("WEKAMOOC_DATA")
if data_dir is None:
    data_dir = "." + os.sep + "data"

import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.classifiers import Classifier, Evaluation, PredictionOutput
from weka.core.classes import Random
import weka.plot.classifiers as plc

jvm.start()

# load weather.nominal
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.class_is_last()

# cross-validate NaiveBayes; -distribution records full class distributions
cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
pout = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1), pout)
print(evl.summary())
print(evl.matrix())
print(pout)
# blocks until the plot window is closed
plc.plot_roc(evl, wait=True)
jvm.stop()
from weka.core.classes import Random
from weka.classifiers import Classifier, Evaluation

# Script: compare pruned vs. unpruned J48 on two datasets.
# NOTE(review): data_dir, os and Loader come from earlier in the file.
jvm.start()

for dataset in ["diabetes.arff", "breast-cancer.arff"]:
    # load dataset
    loader = Loader(classname="weka.core.converters.ArffLoader")
    fname = data_dir + os.sep + dataset
    print("\nLoading dataset: " + fname + "\n")
    data = loader.load_file(fname)
    data.class_is_last()

    # cross-validate default J48, display model
    cls = Classifier(classname="weka.classifiers.trees.J48")
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    print("10-fold cross-validation (default):\n" + evl.summary())
    cls.build_classifier(data)
    print("Model (default):\n\n" + str(cls))

    # cross-validate unpruned J48 (-U), display model
    cls = Classifier(classname="weka.classifiers.trees.J48", options=["-U"])
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    print("10-fold cross-validation (unpruned):\n" + evl.summary())
    cls.build_classifier(data)
    print("Model (unpruned):\n\n" + str(cls))

jvm.stop()
def process_classifier(runType, cls, occ, devList, fewCats, label, subtract):
    """Train and evaluate classifier *cls* over device data from MySQL.

    runType selects one of three modes:
      * 'unseen' — leave-one-device-out: for each device, train on every
        other device and test on the held-out one; accumulate a confusion
        matrix and per-device prediction streams, then derive Bayesian
        confidence statistics from them.
      * 'seen'   — train on all devices and 10-fold cross-validate.
      * other    — pairwise device-vs-device identification experiments.

    The final score is stored in the module-level total_results[label].
    NOTE(review): relies on many module globals defined elsewhere in the
    file (aws_c, total_conf, loader, item_start, totalDevs, total_results,
    master_saveDir, arff_idcol, initial_confidence, final_accuracy, ...).
    This is Python-2 code (print statements appear near the end).
    """
    global devCount
    global save_orig
    global save_subtract
    conf_matrix = {}
    # pick the source table depending on whether occupancy features are used
    if occ:
        table = 'temp_dat_occ_vector_occ'
    else:
        table = 'temp_dat_occ_vector_2'
    writeStr = '=========================================================================================\n' + \
        'Running ' + runType + ' classifier for \'' + label + '\''
    sys.stdout.write(writeStr + '\r')
    total_conf.write(writeStr + '\n')
    sys.stdout.flush()
    if runType == 'unseen':
        i = 0
        indiv_results = {}
        # leave-one-device-out loop
        for dev in devList:
            devCount += 1
            # crude ETA based on elapsed time and devices processed so far
            remaining = chop_microseconds(((datetime.utcnow() - item_start)*totalDevs/devCount)-(datetime.utcnow() - item_start))
            sys.stdout.write('Running ' + runType + ' classifier for \'' + label + '\' - ' + \
                str(round(100*float(devCount)/totalDevs,2)) + ' pct complete (' + str(remaining) + ' remaining) \r')
            sys.stdout.flush()
            # training rows: every device except the held-out one
            if fewCats:
                aws_c.execute('select * from ' + table + ' ' \
                    'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
                    'and deviceMAC in (select * from id_fewcats_mac) ' 'and deviceMAC!=\'' + dev + '\';')
            else:
                aws_c.execute('select * from ' + table + ' ' \
                    'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
                    'and deviceMAC!=\'' + dev + '\';')
            results = aws_c.fetchall()
            # Generate type list (ARFF nominal class spec, e.g. {"a","b"})
            total_types = ['{']
            for data in results:
                if(data[-1] not in total_types):
                    total_types.append('\"')
                    total_types.append(data[-1])
                    total_types.append('\"')
                    total_types.append(',')
            total_types[-1] = '}'
            typeStr = ''.join(total_types)
            arff_train = label + '_' + dev + '_train'
            arff_test = label + '_' + dev + '_test'
            gen_arff(arff_train, typeStr, results, occ, arff_idcol)
            # test rows: only the held-out device
            if fewCats:
                aws_c.execute('select * from ' + table + ' ' \
                    'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
                    'and deviceMAC in (select * from id_fewcats_mac) ' 'and deviceMAC=\'' + dev + '\';')
            else:
                aws_c.execute('select * from ' + table + ' ' \
                    'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
                    'and deviceMAC=\'' + dev + '\';')
            gen_arff(arff_test, typeStr, aws_c.fetchall(), occ, arff_idcol)
            train = loader.load_file(arff_train + '.arff')
            train.class_is_last()
            mv(arff_train + '.arff', master_saveDir)
            test = loader.load_file(arff_test + '.arff')
            test.class_is_last()
            mv(arff_test + '.arff', master_saveDir)
            cls.build_classifier(train)
            # output predictions
            testName = ''
            predictions = []
            for index, inst in enumerate(test):
                # all test instances must share one class label (one device);
                # bail out hard if they do not
                if testName != '':
                    if testName != inst.get_string_value(inst.class_index):
                        print(str(testName) + ' ' + str(inst.get_string_value(inst.class_index)))
                        exit()
                    else:
                        testName = inst.get_string_value(inst.class_index)
                else:
                    testName = inst.get_string_value(inst.class_index)
                if testName not in conf_matrix:
                    conf_matrix[testName] = {}
                pred = cls.classify_instance(inst)
                # dist = cls.distribution_for_instance(inst)
                # if(pred == inst.get_value(inst.class_index)):
                predName = inst.class_attribute.value(int(pred))
                if predName not in conf_matrix[testName]:
                    conf_matrix[testName][predName] = 0
                conf_matrix[testName][predName] += 1
                predictions.append(predName)
            total = 0
            if testName != '':
                for predName in conf_matrix[testName]:
                    if predName == testName:
                        correct = conf_matrix[testName][predName]
                        total += correct
                    else:
                        total += conf_matrix[testName][predName]
            # while (len(predictions) * 2) <= 100:
            #     predictions += pyrandom.sample(predictions, len(predictions))
            # if len(predictions) < 100:
            #     predictions += pyrandom.sample(predictions, 100 - len(predictions))
            # bootstrap the prediction stream up to 10000 samples
            lots_predictions = []
            while len(lots_predictions) < 10000:
                lots_predictions += pyrandom.sample(predictions, 1)
            #indiv_results[dev] = [testName, pyrandom.sample(predictions, 100)]
            indiv_results[dev] = [testName, lots_predictions]
            # while len(predictions) < 100:
            #     predictions += pyrandom.sample(predictions, 1)
            # indiv_results[dev] = [testName, predictions]
            # indiv_results[dev] = [testName, predictions]
            # Prep to print the how-many-days graph
            # days_output.write('\n\n\"' + dev + '\"\n')
            #print(str(testName) + ' ' + str(correct) + ' ' + str(total) + ' ' + str(float(correct)/total))
            # i += 1
            # if i == 10:
            #     break
        correct, total = print_conf_matrix(conf_matrix, sys.stdout, False, False, False)
        correct, total = print_conf_matrix(conf_matrix, total_conf, False, False, False)
        if subtract == 'orig':
            save_orig = copy.deepcopy(conf_matrix)
        elif subtract == 'subtract':
            save_subtract = copy.deepcopy(conf_matrix)
        final_result = round(100*float(correct)/total,2)
        writeStr = '\nCorrectly Classified Instances\t\t' + str(correct) + '\t\t' + str(final_result) + '\n' + \
            'Incorrectly Classified Instances\t' + str(total-correct) + '\t\t' + str(round(100*float(total-correct)/total,2)) + '\n' + \
            'Total Number of Instances\t\t' + str(total) + '\n'
        print(writeStr)
        total_conf.write(writeStr + '\n')
        # Bayesian confidence statistics derived from the confusion matrix:
        # p_d = prior of each device class, p_e = prob of each prediction,
        # p_e_given_d = likelihood of a prediction given the true device.
        conf_interval = 10
        total_instances = float(sum([sum([conf_matrix[test][pred] for pred in conf_matrix[test]]) for test in conf_matrix]))
        p_d = {}
        p_e = {}
        p_e_given_d = {}
        for testName in conf_matrix:
            count_d = float(sum([conf_matrix[testName][label] for label in conf_matrix[testName]]))
            p_d[testName] = count_d / total_instances
            p_e[testName] = float(sum([conf_matrix[label][testName] for label in conf_matrix if testName in conf_matrix[label]]) / total_instances)
            p_e_given_d[testName] = {}
            for predName in conf_matrix:
                if predName in conf_matrix[testName]:
                    p_e_given_d[testName][predName] = conf_matrix[testName][predName] / count_d
                else:
                    p_e_given_d[testName][predName] = 0
        # NOTE(review): file is never explicitly closed; 'demoninator' below
        # is a (consistent) misspelling of 'denominator'.
        confidence = open('confidence.dat', 'w')
        for testName in conf_matrix:
            confidence.write('\n\n\"' + testName + '\"\n')
            print(testName)
            # posterior after observing classEvents repeated identical labels
            for classEvents in range(1, (conf_interval+1)):
                numerator = math.pow(p_e_given_d[testName][testName], classEvents) * p_d[testName]
                demoninator = 0
                for otherName in conf_matrix:
                    demoninator += math.pow(p_e_given_d[otherName][testName], classEvents) * p_d[otherName]
                confidence.write(str(classEvents) + '\t' + str(numerator/demoninator) + '\n')
                print(str(classEvents) + '\t' + str(numerator/demoninator))
            print('')
        for predName in p_e_given_d['Router/Modem']:
            print('P( ' + predName + ' | Router/Modem ):\t' + str(p_e_given_d['Router/Modem'][predName]))
        for predName in p_e_given_d['Cable Box']:
            print('P( ' + predName + ' | Cable Box ):\t' + str(p_e_given_d['Cable Box'][predName]))
        #router = open('router', 'w')
        print('Router Stuff:')
        routerDev = 'Router/Modem'
        lampDev = 'Lamp'
        cableDev = 'Cable Box'
        origClassList = ['Router/Modem', 'Cable Box', 'Lamp', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem', 'Lamp', 'Router/Modem']
        classListList = [['Router/Modem'] + list(listItem) for listItem in set(itertools.permutations(origClassList))]
        # hand-picked observation sequences (overrides the permutations above)
        classListList = [
            ['Router/Modem', 'Router/Modem', 'Lamp', 'Lamp', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem'],
            ['Router/Modem', 'Cable Box', 'Lamp', 'Lamp', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem'],
            ['Router/Modem', 'Router/Modem', 'Lamp', 'Lamp', 'Cable Box', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem'],
            ['Router/Modem', 'Cable Box', 'Lamp', 'Lamp', 'Cable Box', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem'],
            ['Router/Modem', 'Cable Box', 'Router/Modem', 'Lamp', 'Cable Box', 'Router/Modem', 'Router/Modem', 'Lamp', 'Router/Modem', 'Router/Modem'],
            ['Router/Modem', 'Router/Modem', 'Router/Modem', 'Lamp', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Lamp', 'Router/Modem', 'Router/Modem'],
            ['Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Lamp']
        ]
        for idClass, classList in enumerate(classListList):
            print(idClass)
            # posterior for router/lamp/cable after the first classEvents
            # observations of the sequence
            for classEvents in range(1, (conf_interval+1)):
                numerator_router = p_d[routerDev]
                numerator_lamp = p_d[lampDev]
                numerator_cable = p_d[cableDev]
                for idx, classInst in enumerate(classList):
                    if idx < classEvents:
                        numerator_router *= p_e_given_d[routerDev][classInst]
                        numerator_lamp *= p_e_given_d[lampDev][classInst]
                        numerator_cable *= p_e_given_d[cableDev][classInst]
                demoninator = 0
                for otherName in conf_matrix:
                    obsValue = p_d[otherName]
                    for idx, classInst in enumerate(classList):
                        if idx < classEvents:
                            obsValue *= p_e_given_d[otherName][classInst]
                    demoninator += obsValue
                # NOTE(review): the trailing + '\"' sits OUTSIDE the parens;
                # under the Python-2 print statement the whole expression
                # (including the closing quote) is printed — under Python 3
                # this line would raise. Confirm before porting.
                print(str(classEvents) + '\t' + str(numerator_router/demoninator) + '\t' + str(numerator_lamp/demoninator) + '\t' + str(numerator_cable/demoninator) + '\t\"' + classList[classEvents-1]) + '\"'
            print('')
        numberDevList(indiv_results)
        eachDev = open('indiv_results.dat', 'w')
        newIDStream = open('new_id.dat', 'w')
        for devItem in indiv_results:
            print_obsResults(conf_matrix, conf_interval, p_d, p_e, p_e_given_d, indiv_results[devItem], eachDev, devItem, newIDStream)
        print('')
        print('total devices: ' + str(len(indiv_results)))
        # print('total devices: ' + str(total_devices))
        # print('total correct: ' + str(total_correct))
        # print(' pct correct: ' + str(round(100*float(total_correct)/total_devices,2)) + '\n')
        print('initial confidence: ' + str(round(100*float(sum(initial_confidence))/len(initial_confidence),2)))
        print('initial accuracy: ' + str(round(100*float(sum(initial_accuracy))/len(initial_accuracy),2)) + '\n')
        # print('final confidence (correct): ' + str(round(100*float(sum(final_confidence_correct))/len(final_confidence_correct),2)))
        # print('final confidence (correct): ' + str(round(100*float(sum(final_confidence_incorrect))/len(final_confidence_incorrect),2)))
        # print('final accuracy: ' + str(round(100*float(total_correct)/total_devices,2)))
        for devType in final_accuracy:
            print('final accuracy ' + devType + ' : ' + str(round(float(sum(final_accuracy[devType]))/len(final_accuracy[devType]),6)))
            print('final confidence (correct) ' + devType + ' : ' + str(round(float(sum(final_confidence_correct[devType]))/len(final_confidence_correct[devType]),6)))
            if len(final_confidence_incorrect[devType]) > 0:
                print('final confidence (incorrect) ' + devType + ' : ' + str(round(float(sum(final_confidence_incorrect[devType]))/len(final_confidence_incorrect[devType]),6)))
            else:
                print('final confidence (incorrect) ' + devType + ' : ' + str(0))
            print('final confidence ' + devType + ' : ' + str(round(float(sum(final_confidence_correct[devType])+sum(final_confidence_incorrect[devType]))/(len(final_confidence_correct[devType])+len(final_confidence_incorrect[devType])),2)))
        print_conf_matrix(new_conf_matrix, sys.stdout, False, False, False)
        # collapse the per-cell confidence sample lists into their means
        for topType in actual_confidence_matrix:
            for botType in actual_confidence_matrix[topType]:
                storeArray = actual_confidence_matrix[topType][botType]
                if len(storeArray) > 0:
                    actual_confidence_matrix[topType][botType] = round(sum(storeArray)/len(storeArray),2)
                else:
                    actual_confidence_matrix[topType][botType] = 0
        print_conf_matrix(conf_matrix, sys.stdout, False, False, False)
        print_conf_matrix(actual_confidence_matrix, sys.stdout, False, False, False)
        print_conf_matrix(actual_confidence_matrix, sys.stdout, True, False, True)
        for devType in acc_over_time_dev:
            printOverTime(devType, acc_over_time_dev[devType], conf_over_time_dev[devType])
        printOverTime('total', acc_over_time, conf_over_time)
    elif runType == 'seen':
        # single model over all devices, scored by 10-fold cross-validation
        if fewCats:
            aws_c.execute('select * from ' + table + ' ' \
                'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
                'and deviceMAC in (select * from id_fewcats_mac);')
        else:
            aws_c.execute('select * from ' + table + ' ' \
                'where duty!=0 and deviceMAC not in (select * from vector_reject);')
        results = aws_c.fetchall()
        devCount += 1
        remaining = chop_microseconds(((datetime.utcnow() - item_start)*totalDevs/devCount)-(datetime.utcnow() - item_start))
        sys.stdout.write('Running ' + runType + ' classifier for \'' + label + '\' - ' + \
            str(round(100*float(devCount)/totalDevs,2)) + ' pct complete (' + str(remaining) + ' remaining) \r')
        sys.stdout.flush()
        # Generate type list
        total_types = ['{']
        for data in results:
            if(data[-1] not in total_types):
                total_types.append('\"')
                total_types.append(data[-1])
                total_types.append('\"')
                total_types.append(',')
        total_types[-1] = '}'
        typeStr = ''.join(total_types)
        arff_file = label + '_train'
        gen_arff(arff_file, typeStr, results, occ, arff_idcol)
        train = loader.load_file(arff_file + '.arff')
        train.class_is_last()
        mv(arff_file + '.arff', master_saveDir)
        cls.build_classifier(train)
        evl = Evaluation(train)
        evl.crossvalidate_model(cls, train, 10, Random(1))
        print('\n')
        #print(evl.percent_correct)
        #print(evl.class_details())
        print(evl.matrix())
        total_conf.write('\n' + evl.matrix())
        print(evl.summary())
        total_conf.write(evl.summary() + '\n')
        final_result = round(evl.percent_correct, 2)
    else:
        # pairwise device identification: can the classifier tell startDev
        # from changeToDev using the deviceMAC as the class label?
        success = []
        for startDev in devList:
            for changeToDev in devList:
                if startDev != changeToDev:
                    devCount += 1
                    remaining = chop_microseconds(((datetime.utcnow() - item_start)*totalDevs/devCount)-(datetime.utcnow() - item_start))
                    sys.stdout.write('Running ' + runType + ' classifier for \'' + label + '\' - ' + \
                        str(round(100*float(devCount)/totalDevs,2)) + ' pct complete (' + str(remaining) + ' remaining) \r')
                    sys.stdout.flush()
                    aws_c.execute('select * from temp_dat_occ_vector_2 ' \
                        'where duty!=0 and deviceMAC in (\'' + startDev + '\',\'' + changeToDev + '\');')
                    results = [x[:-1] + (x[1],) for x in aws_c.fetchall()] # Class label is just the deviceMAC
                    if len(results) > 10:
                        # Generate type list
                        typeStr = '{' + startDev + ',' + changeToDev + '}'
                        arff_file = label + '_' + startDev + '_' + changeToDev + '_train'
                        gen_arff(arff_file, typeStr, results, occ, arff_idcol)
                        train = loader.load_file(arff_file + '.arff')
                        train.class_is_last()
                        mv(arff_file + '.arff', master_saveDir)
                        cls.build_classifier(train)
                        evl = Evaluation(train)
                        evl.crossvalidate_model(cls, train, 10, Random(1))
                        print('\n')
                        #print(evl.percent_correct)
                        #print(evl.class_details())
                        print(evl.matrix())
                        total_conf.write('\n' + evl.matrix())
                        print(evl.summary())
                        total_conf.write(evl.summary() + '\n')
                        success.append(evl.percent_correct)
        # summarize pairwise accuracies: mean plus 5th/10th/95th percentiles
        if len(success) > 0:
            final_result = [sum(success)/len(success), percentile(success, 5), percentile(success, 10), percentile(success, 95)]
        else:
            final_result = False
    if label in total_results:
        print('Warning label ' + label + ' exists twice, overwriting...')
    if final_result != False:
        total_results[label] = final_result
# Script fragment: NominalToBinary preprocessing + LinearRegression.
# NOTE(review): fname, loader, Filter, Classifier, PredictionOutput,
# Evaluation and Random are defined earlier in the file.
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
# we'll set the class attribute after filtering

# apply NominalToBinary filter and set class attribute
fltr = Filter("weka.filters.unsupervised.attribute.NominalToBinary")
fltr.inputformat(data)
filtered = fltr.filter(data)
filtered.class_is_last()

# cross-validate LinearRegression on filtered data, display model
cls = Classifier(classname="weka.classifiers.functions.LinearRegression")
pout = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText")
evl = Evaluation(filtered)
evl.crossvalidate_model(cls, filtered, 10, Random(1), pout)
print("10-fold cross-validation:\n" + evl.summary())
print("Predictions:\n\n" + str(pout))
cls.build_classifier(filtered)
print("Model:\n\n" + str(cls))

# use AddClassification filter with LinearRegression on filtered data
print("Applying AddClassification to filtered data:\n")
fltr = Filter(
    classname="weka.filters.supervised.attribute.AddClassification",
    options=["-W", "weka.classifiers.functions.LinearRegression", "-classification"])
fltr.inputformat(filtered)
classified = fltr.filter(filtered)
print(classified)

# convert class back to nominal (attribute 9 — presumably the class column;
# confirm against the dataset)
fltr = Filter(classname="weka.filters.unsupervised.attribute.NumericToNominal",
              options=["-R", "9"])
jvm.start() # load diabetes loader = Loader(classname="weka.core.converters.ArffLoader") fname = data_dir + os.sep + "diabetes.arff" print("\nLoading dataset: " + fname + "\n") data = loader.load_file(fname) data.class_is_last() for classifier in ["weka.classifiers.bayes.NaiveBayes", "weka.classifiers.rules.ZeroR", "weka.classifiers.trees.J48"]: # train/test split 90% using classifier cls = Classifier(classname=classifier) evl = Evaluation(data) evl.evaluate_train_test_split(cls, data, 90.0, Random(1)) print("\n" + classifier + " train/test split (90%):\n" + evl.summary()) cls.build_classifier(data) print(classifier + " model:\n\n" + str(cls)) # calculate mean/stdev over 10 cross-validations for classifier in [ "weka.classifiers.meta.ClassificationViaRegression", "weka.classifiers.bayes.NaiveBayes", "weka.classifiers.rules.ZeroR", "weka.classifiers.trees.J48", "weka.classifiers.functions.Logistic"]: accuracy = [] for i in xrange(1,11): cls = Classifier(classname=classifier) evl = Evaluation(data) evl.crossvalidate_model(cls, data, 10, Random(i)) accuracy.append(evl.percent_correct) nacc = numpy.array(accuracy) print("%s: %0.2f +/-%0.2f" % (classifier, numpy.mean(nacc), numpy.std(nacc)))
# Script: evaluate NaiveBayes on trainGrid.arff with a 66% train/test split.
# NOTE(review): Python 2 code (print statement near the end).
from utilities import *
import weka.core.jvm as jvm
from weka.core.converters import Loader, Saver
from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random

jvm.start(max_heap_size="3072m")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file("./Dataset/trainGrid.arff")
data.class_is_last()
#classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"])
classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
evaluation = Evaluation(data)
#evaluation.crossvalidate_model(classifier, data, 10, Random(42))
# 66% train / 34% test, fixed seed
evaluation.evaluate_train_test_split(classifier, data, 66, Random(42))
res = evaluation.summary()
res += "\n" + evaluation.matrix()
#f = open('./Dataset/resultsGrid.txt', 'w')
#f.write(res)
print res
jvm.stop()
# Build classifier on training data cls.build_classifier(train) # print(cls) #import weka.plot.graph as graph #graph.plot_dot_graph(cls.graph) from weka.classifiers import Evaluation from weka.core.classes import Random evl = Evaluation(train) evl.crossvalidate_model(cls, train, 10, Random(1)) print ("Kappa Score") print (evl.kappa) # 0.50 - Not bad print ("Evaluation Summary") print (evl.summary()) # Accuracy: 83% ## Test model on new data ## evl = Evaluation(test) from weka.classifiers import PredictionOutput pred_output = PredictionOutput( classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"]) evl.crossvalidate_model(cls, test, 10, Random(1), pred_output) # View complete summary of the selected model on test data print(evl.summary()) # The kappa statistic is 45% in this case. Not surprising given the low number of instances. # The accuracy is 84.3%, which is fair.
jvm.start() # load cpu loader = Loader(classname="weka.core.converters.ArffLoader") fname = data_dir + os.sep + "cpu.arff" print("\nLoading dataset: " + fname + "\n") data = loader.load_file(fname) data.class_is_last() # cross-validate LinearRegression, display model print("\n--> LinearRegression\n") cls = Classifier(classname="weka.classifiers.functions.LinearRegression") evl = Evaluation(data) evl.crossvalidate_model(cls, data, 10, Random(1)) print("10-fold cross-validation:\n" + evl.summary()) cls.build_classifier(data) print("Model:\n\n" + str(cls)) # cross-validate M5P, display model print("\n--> M5P\n") cls = Classifier(classname="weka.classifiers.trees.M5P") evl = Evaluation(data) evl.crossvalidate_model(cls, data, 10, Random(1)) print("10-fold cross-validation:\n" + evl.summary()) cls.build_classifier(data) print("Model:\n\n" + str(cls)) plg.plot_dot_graph(cls.graph) jvm.stop()
def classify_and_save(classifier, name, outfile):
    """Evaluate *classifier* on 500 game columns and write a CSV report.

    classifier: a built-or-buildable Weka classifier object.
    name: human-readable algorithm name, recorded in each CSV row.
    outfile: path of the CSV file to write.

    Uses the first 50 games plus a fixed-seed random sample of 450 more as
    target (class) columns; for each, trains on final_train.arff and tests
    on final_test.arff, recording accuracy/precision/recall/F1.
    NOTE(review): Python 2 code (f.next(), xrange, print statements).
    """
    # fixed seed so the same 450-game sample is drawn on every run
    random.seed("ML349")
    csv_header = [
        "Game Name",
        "SteamID",
        "Algorithm",
        "Number Players",
        "%Players of Training Set",
        "Accuracy",
        "Precision (0)",
        "Recall (0)",
        "F1 (0)",
        "Precision (1)",
        "Recall (1)",
        "F1 (1)"
    ]
    game_results = []
    # header row of the usernames CSV holds the game column names
    with open("data/games_by_username_all.csv", "r") as f:
        game_list = f.next().rstrip().split(",")
    loader = Loader(classname="weka.core.converters.ArffLoader")
    train = loader.load_file("data/final_train.arff")
    test = loader.load_file("data/final_test.arff")
    count = 0
    # first 50 columns always, plus 450 sampled from the rest (500 total)
    for i in itertools.chain(xrange(0, 50), random.sample(xrange(50, len(game_list)), 450)):
        # make column i the class attribute for this round
        train.class_index = i
        test.class_index = i
        count += 1
        classifier.build_classifier(train)
        evaluation = Evaluation(train)
        evaluation.test_model(classifier, test)
        confusion = evaluation.confusion_matrix
        # row 1 of the confusion matrix = instances whose true class is 1
        # (players of this game)
        num_players = sum(confusion[1])
        # NOTE(review): extracts the SteamID from the attribute's repr —
        # fragile; 1955 is presumably the training-set size. Confirm both.
        steam_id = repr(train.class_attribute).split(" ")[1]
        result = [
            game_list[i],
            steam_id,
            name,
            int(num_players),
            num_players/1955,
            evaluation.percent_correct,
            evaluation.precision(0),
            evaluation.recall(0),
            evaluation.f_measure(0),
            evaluation.precision(1),
            evaluation.recall(1),
            evaluation.f_measure(1)
        ]
        game_results.append(result)
        print "\nResult #{2}/500 for {0} (SteamID {1}):".format(game_list[i], steam_id, count),
        print evaluation.summary()
    with open(outfile, "wb") as f:
        csv_writer = csv.writer(f, delimiter=",")
        csv_writer.writerow(csv_header)
        for r in game_results:
            csv_writer.writerow(r)
from weka.filters import Filter

# The reviews CSV must first be converted into ARFF (Weka-compatible) format
# with convertcsvtoarff.py; here we only load the resulting ARFF file.
loader = Loader("weka.core.converters.ArffLoader")
iris_data = loader.load_file("reviewsinformation_task2.arff")
iris_data.class_is_last()

# SMO with an RBF kernel (gamma = 0.001).
helper.print_title("Creating SMO as KernelClassifier")
kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel",
                options=["-G", "0.001"])
classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
classifier.kernel = kernel
classifier.build_classifier(iris_data)
print("classifier: " + classifier.to_commandline())
print("model:\n" + str(classifier))

# BUG FIX: Evaluation() takes an Instances object, not a filename string, and
# the original cross-validated undefined names (diabetes_data, pred_output)
# after a dead duplicate load from an undefined `iris_file`. Cross-validate
# the dataset that was actually loaded above instead.
evaluation = Evaluation(iris_data)
evaluation.crossvalidate_model(classifier, iris_data, 10, Random(42))
print(evaluation.summary())
print(evaluation.class_details())
print(evaluation.matrix())
def main():
    """
    Just runs some example code.
    """
    # load the iris dataset; the class is the last attribute
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # show the built-in help text of a classifier
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # a partial classname gets expanded to the full Weka classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # instantiate a classifier from a full command-line string
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline, classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # SMO with an explicitly constructed RBF kernel
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel", options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # train J48 and dump the model in several representations
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of 'options=["-C", "0.3"]' in the constructor, the
    # "confidenceFactor" property can be set directly; it is a Java float
    # (not double), hence the typeconv.double_to_float conversion.
    classifier.set_property("confidenceFactor", typeconv.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate the trained model on a test set (here: the training data itself)
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate on a random 66%/34% train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0, Random(1))
    print(evaluation.summary())

    # load the dataset incrementally and train an updateable classifier
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # three ways of constructing meta-classifiers
    helper.print_title("Meta classifiers")

    # 1) generic SingleClassifierEnhancer wrapper around FilteredClassifier
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    # setting the java object directly avoids quoting issues in the option array
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())

    # 2) the dedicated FilteredClassifier wrapper class
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())

    # 3) generic MultipleClassifiersCombiner wrapper around Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate a nominal classifier and walk through the statistics API
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " + str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " + str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " + str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " + str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " + str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " + str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " + str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " + str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " + str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " + str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " + str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " + str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " + str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    # ROC/PRC curves for every class label
    plot_cls.plot_roc(
        evaluation, title="ROC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)
    plot_cls.plot_prc(
        evaluation, title="PRC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)

    # numeric regression examples on the bolts dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # train and print a LinearRegression model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate the numeric classifier and inspect per-instance predictions
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(str(index+1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # learning curve over two classifiers on the diabetes data
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")]
    plot_cls.plot_learning_curve(
        cls, diabetes_data, increments=0.05, label_template="[#] !", metric="percent_correct", wait=True)

    # drop down to the classifier's underlying Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()
    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
# NOTE(review): this chunk starts mid-script -- `loader` and `dataSet20x20`
# are defined above the visible region, and it ends before evaluation3 is printed.
# FIX: converted Python-2-only print statements to print() calls (output is
# identical for single-string prints) to match the Python 3 code elsewhere in
# this file, and fixed the "NUERAL" typo in the banner strings.
dataSet20x50.class_is_last()

dataSet50x20 = loader.load_file("trainingSet/dataSet50x20.arff")
dataSet50x20.class_is_last()

# Three multilayer perceptrons that differ only in hidden-layer size (-H).
classifier1 = Classifier(
    classname="weka.classifiers.functions.MultilayerPerceptron",
    options=["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "9"])
classifier2 = Classifier(
    classname="weka.classifiers.functions.MultilayerPerceptron",
    options=["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "11"])
classifier3 = Classifier(
    classname="weka.classifiers.functions.MultilayerPerceptron",
    options=["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "9"])

print("\n\nTraining neural network 1")
evaluation1 = Evaluation(dataSet20x20)
evaluation1.crossvalidate_model(classifier1, dataSet20x20, 10, Random(42))
classifier1.build_classifier(dataSet20x20)
serialization.write("trainingSet/nn1.model", classifier1)
print("\n\n====================================================== NEURAL NETWORK 1 ======================================================")
print(evaluation1.summary())
print(evaluation1.class_details())

print("Training neural network 2")
evaluation2 = Evaluation(dataSet20x50)
evaluation2.crossvalidate_model(classifier2, dataSet20x50, 10, Random(42))
classifier2.build_classifier(dataSet20x50)
serialization.write("trainingSet/nn2.model", classifier2)
print("\n\n====================================================== NEURAL NETWORK 2 ======================================================")
print(evaluation2.summary())
print(evaluation2.class_details())

print("Training neural network 3")
evaluation3 = Evaluation(dataSet50x20)
evaluation3.crossvalidate_model(classifier3, dataSet50x20, 10, Random(42))
classifier3.build_classifier(dataSet50x20)
def main():
    """
    Just runs some example code.

    Performs a manual, stratified 10-fold cross-validation of J48 on the
    "vote" dataset, while also collecting the per-fold predictions into a
    new dataset via the AddClassification filter.

    FIX: the original mixed Python 3 print() calls with Python-2-only
    xrange(), so it ran under neither interpreter; replaced with range().
    """
    # load a dataset; class is the last attribute
    data_file = helper.get_data_dir() + os.sep + "vote.arff"
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # classifier under evaluation
    classifier = Classifier(classname="weka.classifiers.trees.J48")

    # randomize a copy of the data so the original order is untouched
    folds = 10
    seed = 1
    rnd = Random(seed)
    rand_data = Instances.copy_instances(data)
    rand_data.randomize(rnd)
    if rand_data.class_attribute.is_nominal:
        rand_data.stratify(folds)

    # perform cross-validation and add predictions
    predicted_data = None
    evaluation = Evaluation(rand_data)
    for i in range(folds):
        train = rand_data.train_cv(folds, i)
        # the above code is used by the StratifiedRemoveFolds filter,
        # the following code is used by the Explorer/Experimenter
        # train = rand_data.train_cv(folds, i, rnd)
        test = rand_data.test_cv(folds, i)

        # build and evaluate classifier
        cls = Classifier.make_copy(classifier)
        cls.build_classifier(train)
        evaluation.test_model(cls, test)

        # add predictions
        addcls = Filter(
            classname="weka.filters.supervised.attribute.AddClassification",
            options=["-classification", "-distribution", "-error"])
        # setting the java object directly avoids issues with correct quoting in option array
        addcls.set_property("classifier", Classifier.make_copy(classifier))
        addcls.inputformat(train)
        addcls.filter(train)  # trains the classifier
        pred = addcls.filter(test)
        if predicted_data is None:
            predicted_data = Instances.template_instances(pred, 0)
        for n in range(pred.num_instances):
            predicted_data.add_instance(pred.get_instance(n))

    print("")
    print("=== Setup ===")
    print("Classifier: " + classifier.to_commandline())
    print("Dataset: " + data.relationname)
    print("Folds: " + str(folds))
    print("Seed: " + str(seed))
    print("")
    print(evaluation.summary("=== " + str(folds) + " -fold Cross-Validation ==="))
    print("")
    print(predicted_data)