def experiment_file_random(path_features, path_folder_save_results, options,
                           classifier, fold, random, name):
    """Cross-validate one classifier on one feature file and save the outcome.

    The data set at ``path_features`` is loaded with its last attribute as the
    class and cross-validated with ``fold`` folds, seeded by ``Random(random)``.
    Two files are written below ``path_folder_save_results``:

    * ``<name>.csv`` - one row with percent correct, percent incorrect and
      the confusion matrix;
    * ``prediction/<name>.csv`` - the buffered content of the CSV prediction
      output collected during cross-validation.

    ``classifier`` is the Weka class name; ``options`` is its option string,
    split with ``weka.core.classes.split_options``.
    """
    print("start weka")
    learner = Classifier(
        classname=classifier,
        options=weka.core.classes.split_options(options))

    dataset = converters.load_any_file(path_features)
    dataset.class_is_last()

    prediction_sink = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV")
    evaluation = Evaluation(dataset)
    evaluation.crossvalidate_model(
        learner, dataset, fold, Random(random), prediction_sink)

    # One-row table: every column is a single-element list.
    summary = pd.DataFrame(data={
        'percent_correct': [evaluation.percent_correct],
        'percent_incorrect': [evaluation.percent_incorrect],
        'confusion_matrix': [evaluation.matrix()],  # the confusion matrix
    })
    summary_path = path_folder_save_results + '/' + str(name) + '.csv'
    summary.to_csv(summary_path, index=False)

    predictions_csv = prediction_sink.buffer_content()
    prediction_path = (
        path_folder_save_results + '/' + 'prediction/' + str(name) + '.csv')
    with open(prediction_path, 'w') as out:
        out.write(predictions_csv)
def case2():
    """Evaluate the classifier ``cls`` on an ARFF file chosen by the user.

    Prompts for the test file name, loads it with the last attribute as the
    class, runs ``cls`` over it and prints the confusion matrix.

    ``cls`` is not defined in this function; presumably it is a module-level
    classifier that has already been trained - verify against the caller.
    """
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")
    test_path = input("Enter the name of the test file:")

    test_set = arff_loader.load_file(test_path)
    test_set.class_is_last()

    evaluator = Evaluation(test_set)
    # The return value of test_model is not needed; the evaluator keeps the
    # statistics used for the matrix below.
    evaluator.test_model(cls, test_set)
    print(evaluator.matrix("=== (confusion matrix) ==="))
def case2():
    """Evaluate a serialized model on an ARFF file, both chosen by the user.

    Prompts first for the model file and then for the test file. The model is
    deserialized into a ``Classifier``, the test data is loaded with the last
    attribute as the class, and the resulting confusion matrix is printed.
    """
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")

    model_path = input("Enter the name of the model file:")
    model = Classifier(jobject=serialization.read(model_path))

    test_path = input("Enter the name of the test file:")
    test_set = arff_loader.load_file(test_path)
    test_set.class_is_last()

    evaluator = Evaluation(test_set)
    # The return value of test_model is not needed; the evaluator keeps the
    # statistics used for the matrix below.
    evaluator.test_model(model, test_set)
    print(evaluator.matrix("=== (confusion matrix) ==="))
def experiment_more_file(path_files, path_folder_save_results, fold, options,
                         classifier, random, name):
    """Cross-validate one classifier on every CSV file found in a folder.

    For each entry of ``path_files`` whose name contains ``".csv"`` the data
    is loaded (last attribute is the class) and cross-validated with ``fold``
    folds seeded by ``Random(random)``. Per file, the CSV prediction output is
    written to
    ``<path_folder_save_results>/prediction/<name><file without last 4 chars>pred_data.csv``.
    After all files, one summary table (file name, percent correct, percent
    incorrect, confusion matrix) is written to
    ``<path_folder_save_results>/<name>.csv``.

    ``classifier`` is the Weka class name; ``options`` is its option string,
    split with ``weka.core.classes.split_options``.
    """
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))

    # Build a filtered list rather than calling remove() on the list that is
    # being iterated: removing during iteration skips the entry right after
    # each removed one, so non-CSV entries could slip through the filter.
    file_list = [entry for entry in os.listdir(path_files) if ".csv" in entry]

    d_results = {
        'name_file': [],
        'percent_correct': [],
        'percent_incorrect': [],
        'confusion_matrix': []
    }
    print(file_list)
    for file in file_list:
        print(str(file))
        data = converters.load_any_file(path_files + "/" + file)
        data.class_is_last()
        pout = PredictionOutput(
            classname="weka.classifiers.evaluation.output.prediction.CSV")
        evl = Evaluation(data)
        evl.crossvalidate_model(cls, data, fold, Random(random), pout)

        d_results['name_file'].append(str(file))
        d_results['percent_correct'].append(evl.percent_correct)
        d_results['percent_incorrect'].append(evl.percent_incorrect)
        d_results['confusion_matrix'].append(
            evl.matrix())  # Generates the confusion matrix.

        # Per-file predictions; [:-4] drops the last four characters of the
        # file name (the ".csv" suffix for ordinary CSV names).
        save = pout.buffer_content()
        with open(
                path_folder_save_results + '/' + 'prediction/' + str(name) +
                str(file)[:-4] + 'pred_data.csv', 'w') as f:
            f.write(save)

    d_results = pd.DataFrame(data=d_results)
    d_results.to_csv(path_folder_save_results + '/' + str(name) + ".csv",
                     index=False)
def CV5x2(dataset, algo, num_datasets):
    """Cross-validate ``algo`` on an ARFF data set and return the ROC area.

    Loads ``dataset`` with the last attribute as the class, runs one 2-fold
    cross-validation seeded with ``Random(5)``, prints the summary and the
    confusion matrix, and returns the area under the ROC curve for class
    index 1. ``num_datasets`` is accepted but not used here.
    """
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")
    instances = arff_loader.load_file(dataset)
    instances.class_is_last()

    learner = Classifier(classname=algo)
    evaluation = Evaluation(instances)
    evaluation.crossvalidate_model(learner, instances, 2, Random(5))

    summary_title = "=== {!s} on{!s} ===".format(algo, dataset)
    print(evaluation.summary(summary_title, False))
    print(evaluation.matrix("=== on click prediction(confusion matrix) ==="))
    print("For Algo{!s}areaUnderROC/1: for CV5x2 {!s}".format(
        algo, evaluation.area_under_roc(1)))
    return evaluation.area_under_roc(1)
def HOV(dataset, algo, num_datasets):
    """Hold-out validation of ``algo`` on an ARFF data set.

    Loads ``dataset`` with the last attribute as the class, splits it 70/30
    using ``Random(10)``, trains ``algo`` on the first part and tests it on
    the second. Prints the summary and confusion matrix and returns the area
    under the ROC curve for class index 1. ``num_datasets`` is accepted but
    not used here.
    """
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")
    instances = arff_loader.load_file(dataset)
    instances.class_is_last()

    training_part, testing_part = instances.train_test_split(70.0, Random(10))

    learner = Classifier(classname=algo)
    learner.build_classifier(training_part)

    # The evaluation is initialised from the training part and then fed the
    # held-out part.
    evaluation = Evaluation(training_part)
    evaluation.test_model(learner, testing_part)

    summary_title = "=== {!s} on{!s} ===".format(algo, dataset)
    print(evaluation.summary(summary_title, False))
    print(evaluation.matrix("=== on click prediction(confusion matrix) ==="))
    print("For Algo{!s}areaUnderROC/1: for HOV {!s}".format(
        algo, evaluation.area_under_roc(1)))
    return evaluation.area_under_roc(1)
def TrainingModel(arff, modelOutput, clsfier):
    """Train a Weka classifier on an ARFF file, report CV metrics, save it.

    Parameters:
        arff: path of the ARFF training file; its FIRST attribute is used as
            the class.
        modelOutput: path the trained classifier is serialized to (together
            with the training header).
        clsfier: class name relative to ``weka.classifiers.``; the full name
            is built as ``"weka.classifiers." + clsfier``.

    The classifier is built on the full training set and printed. A
    ``FilteredClassifier`` wrapping it is then evaluated with 10-fold
    cross-validation (``Random(1)``), and accuracy, summary, class details
    and confusion matrix are printed. TPR, TNR, PPV and NPV are derived from
    the 2x2 confusion matrix, treating class index 1 as the positive class.

    The JVM is started at the beginning and is always stopped on the way out,
    even when loading, training or evaluation raises.
    """

    def _ratio(numerator, denominator):
        # A zero denominator (e.g. nothing predicted positive) yields NaN
        # instead of ZeroDivisionError, so the model below still gets saved.
        return numerator / denominator if denominator else float('nan')

    # Start the Java virtual machine
    jvm.start()
    try:
        # Load the training set
        loader = Loader(classname="weka.core.converters.ArffLoader")
        train = loader.load_file(arff)
        train.class_is_first()

        # Original author's note: train with the RandomForest algorithm,
        # because after trying several approaches in the Weka GUI this one
        # gave the higher TPR and TNR. (The algorithm actually used is
        # whatever the caller passes in `clsfier`.)
        cls_name = "weka.classifiers." + clsfier
        clsf = Classifier(classname=cls_name)
        clsf.build_classifier(train)
        print(clsf)

        # Build the model used for evaluation
        fc = FilteredClassifier()
        fc.classifier = clsf
        evl = Evaluation(train)
        evl.crossvalidate_model(fc, train, 10, Random(1))
        print(evl.percent_correct)
        print(evl.summary())
        print(evl.class_details())
        print(evl.matrix())

        # Result statistics: rows are indexed first, columns second, with
        # index 0 read as "negative" and index 1 as "positive".
        matrixResults = evl.confusion_matrix
        TN = float(matrixResults[0][0])
        FP = float(matrixResults[0][1])
        FN = float(matrixResults[1][0])
        TP = float(matrixResults[1][1])
        TPR = _ratio(TP, TP + FN)
        TNR = _ratio(TN, FP + TN)
        PPV = _ratio(TP, TP + FP)
        NPV = _ratio(TN, TN + FN)
        print("算法: " + clsfier)
        print("敏感度 TPR: " + str(TPR))
        print("特异度 TNR: " + str(TNR))
        print("PPV: " + str(PPV))
        print("NPV: " + str(NPV))

        # Save the model
        clsf.serialize(modelOutput, header=train)
    finally:
        # Shut the virtual machine down, whether or not the run succeeded
        jvm.stop()
    print("分析模型建立完成")
def main():
    """
    Just runs some example code.

    Walks through a series of classifier demonstrations in order: loading the
    iris data, printing classifier help, instantiating classifiers from a
    partial class name and from a command line, configuring a kernel
    classifier, training/evaluating J48, incremental training, building
    meta-classifiers, cross-validating NaiveBayes on diabetes (printing every
    available statistic and plotting ROC/PRC), training and cross-validating
    LinearRegression on bolts, plotting a learning curve, and finally reading
    JRip's rules through the Java API. The data files are looked up in
    ``helper.get_data_dir()``.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline, classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel", options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
    # property of the J48 classifier itself. However, being of type float rather than double, we need
    # to convert it to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", typeconv.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    # (the "test set" here is the same iris data the model was trained on)
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0, Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    # the loader itself is iterated to obtain the instances one at a time
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    # the generic wrapper has no "filter" attribute, so the underlying Java
    # object is set through the property interface
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())
    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())
    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    # dump the individual statistics; the ones called with an argument are
    # per-class (class index given), the others are aggregate properties
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " + str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " + str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " + str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " + str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " + str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " + str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " + str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " + str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " + str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " + str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " + str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " + str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " + str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    # one curve per class value; wait=False is passed so the call is not
    # asked to wait (presumably non-blocking - confirm in plot_cls)
    plot_cls.plot_roc(
        evaluation, title="ROC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)
    plot_cls.plot_prc(
        evaluation, title="PRC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(str(index+1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # learning curve
    # (this last plot is the only one called with wait=True)
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")]
    plot_cls.plot_learning_curve(
        cls, diabetes_data, increments=0.05, label_template="[#] !", metric="percent_correct", wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()

    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    # jwrapper exposes the Java object's own methods (getRuleset, size, get)
    rset = jrip.jwrapper.getRuleset()
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
def experiment_sequential_file(path_indices, path_features, path_folder_save_results, options, classifier, name, indicator_col, images):
    """Leave-one-segment-out evaluation over contiguous row ranges of a file.

    The boundary indices loaded from ``path_indices`` cut the data set at
    ``path_features`` into consecutive row segments. Each segment in turn is
    used as the test set while all remaining rows form the training set.
    Per-segment statistics are written to
    ``<path_folder_save_results>/<name>results.csv``; the buffered prediction
    output is written to ``.../prediction/<name>pred_data.csv``, read back and
    handed to ``create_prediction`` together with ``indicator_col`` and
    ``images``.

    NOTE(review): the nesting below was reconstructed from a source whose
    indentation was lost - confirm the loop boundaries against the original
    file.
    """
    # presumably numpy.load on an .npz archive (the result exposes `.files`
    # and is indexed by name) - TODO confirm
    ind_f = load(path_indices)
    lst = ind_f.files
    # Only the array of the LAST entry survives this loop. The +1 presumably
    # converts 0-based indices to the 1-based rows used in row_range - verify.
    for item in lst:
        ind = ind_f[item] + 1
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))
    data = converters.load_any_file(path_features)
    # Append the total row count so the final segment has an end boundary.
    ind = np.append(ind, len(data))
    data.class_is_last()
    # One PredictionOutput object is shared by every test_model call below;
    # its buffer is read once, after the loop.
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV")
    d_results = {
        'index': [],
        'percent_correct': [],
        'percent_incorrect': [],
        'precision': [],
        'recall': [],
        'f-score': [],
        'confusion_matrix': []
    }
    for j in range(len(ind) - 1):
        first = ind[j]
        # The last segment ends on the appended row count itself; every other
        # segment ends one row before the next boundary.
        if j == len(ind) - 2:
            last = ind[j + 1]
        else:
            last = ind[j + 1] - 1
        d_test = data.subset(row_range=str(first) + '-' + str(last))
        # NOTE(review): when there is a single segment, j == 0 is also the
        # last segment and the range built below is "<N+1>-<N>" - confirm
        # this case cannot occur.
        if j == 0:  # first
            d_train = data.subset(row_range=str(last + 1) + '-' +
                                  str(ind[-1]))  # last element
            print(str(last + 1) + '-' + str(ind[-1]))
        elif j == len(ind) - 2:  # last
            d_train = data.subset(row_range='1-' +
                                  str(first - 1))  # last element
            print('1-' + str(first - 1))
        else:  # central
            # rows before the segment plus rows after it
            s = '1-' + str(first - 1) + ',' + str(last + 1) + '-' + str(
                ind[-1])
            print(s)
            d_train = data.subset(row_range=s)

        cls.build_classifier(d_train)

        evl = Evaluation(data)
        evl.test_model(cls, d_test, pout)

        # print(type(d_train))
        # print(type(d_test))

        d_results['index'].append(str(ind[j]))
        d_results['percent_correct'].append(evl.percent_correct)
        d_results['percent_incorrect'].append(evl.percent_incorrect)
        # precision / recall / f-measure are taken for class index 1
        d_results['precision'].append(evl.precision(1))
        d_results['recall'].append(evl.recall(1))
        d_results['f-score'].append(evl.f_measure(1))
        d_results['confusion_matrix'].append(
            evl.matrix())  # Generates the confusion matrix.

    save = pout.buffer_content()

    check_folder_or_create(path_folder_save_results + '/' + 'prediction')

    with open(
            path_folder_save_results + '/' + 'prediction/' + name +
            'pred_data.csv', 'w') as f:
        f.write(save)

    # Read the predictions back without a header row; columns 1, 2 and 3 are
    # used as label, prediction and difference flag respectively.
    buffer_save = pd.read_csv(path_folder_save_results + '/' + 'prediction/' +
                              name + 'pred_data.csv',
                              index_col=False,
                              header=None)

    col_label = buffer_save[1]
    col_prediction = buffer_save[2]
    col_different = buffer_save[3]

    create_prediction(col_label, col_prediction, col_different, indicator_col,
                      images, name, path_folder_save_results + '/prediction/')

    d_results = pd.DataFrame(data=d_results)
    d_results.to_csv(path_folder_save_results + '/' + name + 'results.csv',
                     index=False)
def experiment_more_file(path_files, path_folder_save_results, fold, options,
                         classifier, random, name, voting=False):
    """Cross-validate one classifier on every CSV file found in a folder.

    For each entry of ``path_files`` whose name contains ``".csv"``:

    * ``<path_files>/indicator/<first character of the file name>_indicator.csv``
      is read and its ``indicator`` and ``image`` columns are extracted;
    * the data is loaded (last attribute is the class) and cross-validated
      with ``fold`` folds seeded by ``Random(random)``;
    * the CSV prediction output is written to
      ``<path_folder_save_results>/<name>/prediction/pred_data.csv`` (this
      file is overwritten for every input file), read back, and its
      ``actual``, ``predicted`` and ``error`` columns are passed to
      ``create_prediction`` along with the indicator data.

    After all files, one summary table is written to
    ``<path_folder_save_results>/<name>.csv``. Precision, recall and
    F-measure are reported for class index 1. ``voting`` is accepted but not
    used here.
    """
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))

    # Build a filtered list rather than calling remove() on the list that is
    # being iterated: removing during iteration skips the entry right after
    # each removed one, so a non-CSV entry (such as the "indicator"
    # sub-directory read below) could slip through the filter.
    file_list = [entry for entry in os.listdir(path_files) if ".csv" in entry]

    d_results = {
        'name_file': [],
        'percent_correct': [],
        'percent_incorrect': [],
        'precision': [],
        'recall': [],
        'f-score': [],
        'confusion_matrix': []
    }

    prediction_dir = path_folder_save_results + '/' + name + '/' + 'prediction'
    prediction_file = (path_folder_save_results + '/' + name + '/' +
                       'prediction/pred_data.csv')

    for file in file_list:
        indicator_table = pd.read_csv(path_files + '/indicator/' + file[0] +
                                      '_indicator.csv')
        indicator = list(indicator_table['indicator'])
        images = list(indicator_table['image'])

        data = converters.load_any_file(path_files + "/" + file)
        data.class_is_last()
        pout = PredictionOutput(
            classname="weka.classifiers.evaluation.output.prediction.CSV")
        evl = Evaluation(data)
        evl.crossvalidate_model(cls, data, fold, Random(random), pout)

        d_results['name_file'].append(str(file))
        d_results['percent_correct'].append(evl.percent_correct)
        d_results['percent_incorrect'].append(evl.percent_incorrect)
        d_results['precision'].append(evl.precision(1))
        d_results['recall'].append(evl.recall(1))
        d_results['f-score'].append(evl.f_measure(1))
        d_results['confusion_matrix'].append(
            evl.matrix())  # Generates the confusion matrix.

        # Round-trip the predictions through a CSV file so they can be read
        # back as a table with named columns.
        save = pout.buffer_content()
        check_folder_or_create(prediction_dir)
        with open(prediction_file, 'w') as f:
            f.write(save)
        buffer_save = pd.read_csv(prediction_file, index_col=False)

        col_label = buffer_save['actual']
        col_prediction = buffer_save['predicted']
        col_different = buffer_save['error']

        # file[:-4] drops the last four characters of the file name (the
        # ".csv" suffix for ordinary CSV names).
        create_prediction(
            col_label, col_prediction, col_different, indicator, images,
            file[:-4], path_folder_save_results + '/' + name + '/prediction/')

    d_results = pd.DataFrame(data=d_results)
    d_results.to_csv(path_folder_save_results + '/' + str(name) + ".csv")
def process_classifier(runType, cls, occ, devList, fewCats, label, subtract):
    """Run one classifier experiment and record its result under ``label``.

    Parameters (as used by the code below):
        runType: ``'unseen'`` trains on all devices except one and tests on
            the held-out device, for every device in ``devList``; ``'seen'``
            runs a 10-fold cross-validation over all rows; any other value
            runs pairwise cross-validations between every two distinct
            devices of ``devList``.
        cls: classifier object (``build_classifier`` / ``classify_instance``).
        occ: truthy selects table ``temp_dat_occ_vector_occ``, otherwise
            ``temp_dat_occ_vector_2``; it is also forwarded to ``gen_arff``.
        devList: device identifiers, compared against ``deviceMAC`` in SQL.
        fewCats: truthy restricts the queries to ``id_fewcats_mac``.
        label: used in progress messages, ARFF file names and as the key in
            ``total_results``.
        subtract: ``'orig'`` / ``'subtract'`` store a deep copy of the
            confusion matrix in ``save_orig`` / ``save_subtract``
            ('unseen' runs only).

    Relies on many module-level names that are not defined in this block
    (``aws_c``, ``loader``, ``total_conf``, ``item_start``, ``totalDevs``,
    ``total_results``, ``arff_idcol``, ``master_saveDir``, the
    ``final_*`` / ``initial_*`` / ``*_over_time`` collections, ...).

    NOTE(review): the nesting below was reconstructed from a source whose
    indentation was lost - confirm loop boundaries against the original file.
    NOTE(review): device identifiers are concatenated straight into the SQL
    text; this is only safe if ``devList`` is trusted.
    """
    global devCount
    global save_orig
    global save_subtract

    conf_matrix = {}

    if occ:
        table = 'temp_dat_occ_vector_occ'
    else:
        table = 'temp_dat_occ_vector_2'

    # Banner goes to the console (overwritable via '\r') and to the log.
    writeStr = '=========================================================================================\n' + \
               'Running ' + runType + ' classifier for \'' + label + '\''
    sys.stdout.write(writeStr + '\r')
    total_conf.write(writeStr + '\n')
    sys.stdout.flush()

    if runType == 'unseen':
        i = 0
        indiv_results = {}
        for dev in devList:
            devCount += 1
            # Linear extrapolation of the remaining time from elapsed time
            # and the fraction of devices processed so far.
            remaining = chop_microseconds(((datetime.utcnow() - item_start)*totalDevs/devCount)-(datetime.utcnow() - item_start))
            sys.stdout.write('Running ' + runType + ' classifier for \'' + label + '\' - ' + \
                             str(round(100*float(devCount)/totalDevs,2)) + ' pct complete (' + str(remaining) + ' remaining) \r')
            sys.stdout.flush()

            # Training rows: every accepted device except `dev`.
            if fewCats:
                aws_c.execute('select * from ' + table + ' ' \
                              'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
                              'and deviceMAC in (select * from id_fewcats_mac) '
                              'and deviceMAC!=\'' + dev + '\';')
            else:
                aws_c.execute('select * from ' + table + ' ' \
                              'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
                              'and deviceMAC!=\'' + dev + '\';')
            results = aws_c.fetchall()

            # Generate type list
            # Builds the text '{"a","b",...}' from the distinct values of the
            # last column; the trailing ',' is overwritten with '}'.
            total_types = ['{']
            for data in results:
                if(data[-1] not in total_types):
                    total_types.append('\"')
                    total_types.append(data[-1])
                    total_types.append('\"')
                    total_types.append(',')
            total_types[-1] = '}'
            typeStr = ''.join(total_types)

            arff_train = label + '_' + dev + '_train'
            arff_test = label + '_' + dev + '_test'

            gen_arff(arff_train, typeStr, results, occ, arff_idcol)

            # Test rows: only the held-out device `dev`.
            if fewCats:
                aws_c.execute('select * from ' + table + ' ' \
                              'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
                              'and deviceMAC in (select * from id_fewcats_mac) '
                              'and deviceMAC=\'' + dev + '\';')
            else:
                aws_c.execute('select * from ' + table + ' ' \
                              'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
                              'and deviceMAC=\'' + dev + '\';')
            gen_arff(arff_test, typeStr, aws_c.fetchall(), occ, arff_idcol)

            train = loader.load_file(arff_train + '.arff')
            train.class_is_last()
            mv(arff_train + '.arff', master_saveDir)
            test = loader.load_file(arff_test + '.arff')
            test.class_is_last()
            mv(arff_test + '.arff', master_saveDir)

            cls.build_classifier(train)

            # output predictions
            testName = ''
            predictions = []
            for index, inst in enumerate(test):
                # Every instance of one device must carry the same class
                # label; on a mismatch both labels are printed and the whole
                # program exits.
                if testName != '':
                    if testName != inst.get_string_value(inst.class_index):
                        print(str(testName) + ' ' + str(inst.get_string_value(inst.class_index)))
                        exit()
                    else:
                        testName = inst.get_string_value(inst.class_index)
                else:
                    testName = inst.get_string_value(inst.class_index)

                if testName not in conf_matrix:
                    conf_matrix[testName] = {}

                pred = cls.classify_instance(inst)
                # dist = cls.distribution_for_instance(inst)
                # if(pred == inst.get_value(inst.class_index)):
                predName = inst.class_attribute.value(int(pred))
                if predName not in conf_matrix[testName]:
                    conf_matrix[testName][predName] = 0
                conf_matrix[testName][predName] += 1

                predictions.append(predName)

            # Running totals for this device's true class. These values are
            # overwritten by print_conf_matrix after the device loop.
            total = 0
            if testName != '':
                for predName in conf_matrix[testName]:
                    if predName == testName:
                        correct = conf_matrix[testName][predName]
                        total += correct
                    else:
                        total += conf_matrix[testName][predName]

            # while (len(predictions) * 2) <= 100:
            #     predictions += pyrandom.sample(predictions, len(predictions))
            # if len(predictions) < 100:
            #     predictions += pyrandom.sample(predictions, 100 - len(predictions))

            # Draw 10000 predictions, one at a time, from this device's
            # predictions. NOTE(review): if `predictions` is empty (no test
            # rows) sampling one element fails - confirm `pyrandom` is the
            # stdlib random module and that this case cannot occur.
            lots_predictions = []
            while len(lots_predictions) < 10000:
                lots_predictions += pyrandom.sample(predictions, 1)

            #indiv_results[dev] = [testName, pyrandom.sample(predictions, 100)]
            indiv_results[dev] = [testName, lots_predictions]
            # while len(predictions) < 100:
            #     predictions += pyrandom.sample(predictions, 1)
            # indiv_results[dev] = [testName, predictions]

            # indiv_results[dev] = [testName, predictions]

            # Prep to print the how-many-days graph
            # days_output.write('\n\n\"' + dev + '\"\n')

            #print(str(testName) + ' ' + str(correct) + ' ' + str(total) + ' ' + str(float(correct)/total))

            # i += 1
            # if i == 10:
            #     break

        # Same matrix printed twice: once to the console, once to the log.
        correct, total = print_conf_matrix(conf_matrix, sys.stdout, False, False, False)
        correct, total = print_conf_matrix(conf_matrix, total_conf, False, False, False)

        if subtract == 'orig':
            save_orig = copy.deepcopy(conf_matrix)
        elif subtract == 'subtract':
            save_subtract = copy.deepcopy(conf_matrix)

        final_result = round(100*float(correct)/total,2)

        writeStr = '\nCorrectly Classified Instances\t\t' + str(correct) + '\t\t' + str(final_result) + '\n' + \
                   'Incorrectly Classified Instances\t' + str(total-correct) + '\t\t' + str(round(100*float(total-correct)/total,2)) + '\n' + \
                   'Total Number of Instances\t\t' + str(total) + '\n'
        print(writeStr)
        total_conf.write(writeStr + '\n')

        conf_interval = 10

        # NOTE(review): under Python 2 list-comprehension variables leak into
        # the enclosing scope, so the comprehensions below would overwrite the
        # local `test`, and - more importantly - the `label` parameter that is
        # used as the results key at the end of this function. Confirm the
        # interpreter version.
        total_instances = float(sum([sum([conf_matrix[test][pred] for pred in conf_matrix[test]]) for test in conf_matrix]))

        # p_d: share of instances whose true class is the key.
        # p_e: share of instances predicted as the key.
        # p_e_given_d[d][e]: share of class-d instances predicted as e.
        p_d = {}
        p_e = {}
        p_e_given_d = {}
        for testName in conf_matrix:
            count_d = float(sum([conf_matrix[testName][label] for label in conf_matrix[testName]]))
            p_d[testName] = count_d / total_instances
            p_e[testName] = float(sum([conf_matrix[label][testName] for label in conf_matrix if testName in conf_matrix[label]]) / total_instances)
            p_e_given_d[testName] = {}
            # Only predicted names that also occur as a true class get an
            # entry here.
            for predName in conf_matrix:
                if predName in conf_matrix[testName]:
                    p_e_given_d[testName][predName] = conf_matrix[testName][predName] / count_d
                else:
                    p_e_given_d[testName][predName] = 0

        # NOTE(review): this file handle is never closed in the visible code.
        confidence = open('confidence.dat', 'w')
        for testName in conf_matrix:
            confidence.write('\n\n\"' + testName + '\"\n')
            print(testName)
            # Posterior of the true class after 1..conf_interval identical
            # observations of that class, with p_d as the prior.
            for classEvents in range(1, (conf_interval+1)):
                numerator = math.pow(p_e_given_d[testName][testName], classEvents) * p_d[testName]
                demoninator = 0
                for otherName in conf_matrix:
                    demoninator += math.pow(p_e_given_d[otherName][testName], classEvents) * p_d[otherName]
                confidence.write(str(classEvents) + '\t' + str(numerator/demoninator) + '\n')
                print(str(classEvents) + '\t' + str(numerator/demoninator))
            print('')

        # NOTE(review): the three class names below are hard-coded; a data
        # set without any of them fails with KeyError here.
        for predName in p_e_given_d['Router/Modem']:
            print('P( ' + predName + ' | Router/Modem ):\t' + str(p_e_given_d['Router/Modem'][predName]))

        for predName in p_e_given_d['Cable Box']:
            print('P( ' + predName + ' | Cable Box ):\t' + str(p_e_given_d['Cable Box'][predName]))

        #router = open('router', 'w')
        print('Router Stuff:')
        routerDev = 'Router/Modem'
        lampDev = 'Lamp'
        cableDev = 'Cable Box'
        origClassList = ['Router/Modem', 'Cable Box', 'Lamp', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem', 'Lamp', 'Router/Modem']
        # NOTE(review): this permutation-derived list is computed and then
        # immediately replaced by the literal list that follows.
        classListList = [['Router/Modem'] + list(listItem) for listItem in set(itertools.permutations(origClassList))]
        classListList = [
            ['Router/Modem', 'Router/Modem', 'Lamp', 'Lamp', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem'],
            ['Router/Modem', 'Cable Box', 'Lamp', 'Lamp', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem'],
            ['Router/Modem', 'Router/Modem', 'Lamp', 'Lamp', 'Cable Box', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem'],
            ['Router/Modem', 'Cable Box', 'Lamp', 'Lamp', 'Cable Box', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Cable Box', 'Router/Modem'],
            ['Router/Modem', 'Cable Box', 'Router/Modem', 'Lamp', 'Cable Box', 'Router/Modem', 'Router/Modem', 'Lamp', 'Router/Modem', 'Router/Modem'],
            ['Router/Modem', 'Router/Modem', 'Router/Modem', 'Lamp', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Lamp', 'Router/Modem', 'Router/Modem'],
            ['Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Router/Modem', 'Lamp']
        ]
        for idClass, classList in enumerate(classListList):
            print(idClass)
            # Posterior of each of the three device types after seeing the
            # first `classEvents` observations of the sequence.
            for classEvents in range(1, (conf_interval+1)):
                numerator_router = p_d[routerDev]
                numerator_lamp = p_d[lampDev]
                numerator_cable = p_d[cableDev]
                for idx, classInst in enumerate(classList):
                    if idx < classEvents:
                        numerator_router *= p_e_given_d[routerDev][classInst]
                        numerator_lamp *= p_e_given_d[lampDev][classInst]
                        numerator_cable *= p_e_given_d[cableDev][classInst]
                demoninator = 0
                for otherName in conf_matrix:
                    obsValue = p_d[otherName]
                    for idx, classInst in enumerate(classList):
                        if idx < classEvents:
                            obsValue *= p_e_given_d[otherName][classInst]
                    demoninator += obsValue
                # NOTE(review): the closing quote is added OUTSIDE the print
                # parentheses. As a Python 2 print statement this prints the
                # whole concatenation; as a Python 3 call it evaluates
                # print(...) + '"' and raises TypeError.
                print(str(classEvents) + '\t' + str(numerator_router/demoninator) + '\t' + str(numerator_lamp/demoninator) + '\t' + str(numerator_cable/demoninator) + '\t\"' + classList[classEvents-1]) + '\"'
            print('')

        numberDevList(indiv_results)

        # NOTE(review): neither of these two files is closed in the visible
        # code.
        eachDev = open('indiv_results.dat', 'w')
        newIDStream = open('new_id.dat', 'w')
        for devItem in indiv_results:
            print_obsResults(conf_matrix, conf_interval, p_d, p_e, p_e_given_d, indiv_results[devItem], eachDev, devItem, newIDStream)

        print('')
        print('total devices: ' + str(len(indiv_results)))
        # print('total devices: ' + str(total_devices))
        # print('total correct: ' + str(total_correct))
        # print('  pct correct: ' + str(round(100*float(total_correct)/total_devices,2)) + '\n')

        # The collections read from here on are module-level; presumably they
        # are filled by print_obsResults - verify.
        print('initial confidence: ' + str(round(100*float(sum(initial_confidence))/len(initial_confidence),2)))
        print('initial accuracy: ' + str(round(100*float(sum(initial_accuracy))/len(initial_accuracy),2)) + '\n')

        # print('final confidence (correct): ' + str(round(100*float(sum(final_confidence_correct))/len(final_confidence_correct),2)))
        # print('final confidence (correct): ' + str(round(100*float(sum(final_confidence_incorrect))/len(final_confidence_incorrect),2)))
        # print('final accuracy: ' + str(round(100*float(total_correct)/total_devices,2)))

        for devType in final_accuracy:
            print('final accuracy ' + devType + ' : ' + str(round(float(sum(final_accuracy[devType]))/len(final_accuracy[devType]),6)))
            print('final confidence (correct) ' + devType + ' : ' + str(round(float(sum(final_confidence_correct[devType]))/len(final_confidence_correct[devType]),6)))
            # Only the "incorrect" list is guarded against being empty.
            if len(final_confidence_incorrect[devType]) > 0:
                print('final confidence (incorrect) ' + devType + ' : ' + str(round(float(sum(final_confidence_incorrect[devType]))/len(final_confidence_incorrect[devType]),6)))
            else:
                print('final confidence (incorrect) ' + devType + ' : ' + str(0))
            print('final confidence ' + devType + ' : ' + str(round(float(sum(final_confidence_correct[devType])+sum(final_confidence_incorrect[devType]))/(len(final_confidence_correct[devType])+len(final_confidence_incorrect[devType])),2)))

        print_conf_matrix(new_conf_matrix, sys.stdout, False, False, False)

        # Replace each stored list by its mean (rounded to 2 places), in
        # place, so the matrix can be printed like a confusion matrix.
        for topType in actual_confidence_matrix:
            for botType in actual_confidence_matrix[topType]:
                storeArray = actual_confidence_matrix[topType][botType]
                if len(storeArray) > 0:
                    actual_confidence_matrix[topType][botType] = round(sum(storeArray)/len(storeArray),2)
                else:
                    actual_confidence_matrix[topType][botType] = 0

        print_conf_matrix(conf_matrix, sys.stdout, False, False, False)
        print_conf_matrix(actual_confidence_matrix, sys.stdout, False, False, False)
        print_conf_matrix(actual_confidence_matrix, sys.stdout, True, False, True)

        for devType in acc_over_time_dev:
            printOverTime(devType, acc_over_time_dev[devType], conf_over_time_dev[devType])

        printOverTime('total', acc_over_time, conf_over_time)

    elif runType == 'seen':
        # All accepted rows in one data set, evaluated by cross-validation.
        if fewCats:
            aws_c.execute('select * from ' + table + ' ' \
                          'where duty!=0 and deviceMAC not in (select * from vector_reject) ' \
                          'and deviceMAC in (select * from id_fewcats_mac);')
        else:
            aws_c.execute('select * from ' + table + ' ' \
                          'where duty!=0 and deviceMAC not in (select * from vector_reject);')
        results = aws_c.fetchall()

        devCount += 1
        remaining = chop_microseconds(((datetime.utcnow() - item_start)*totalDevs/devCount)-(datetime.utcnow() - item_start))
        sys.stdout.write('Running ' + runType + ' classifier for \'' + label + '\' - ' + \
                         str(round(100*float(devCount)/totalDevs,2)) + ' pct complete (' + str(remaining) + ' remaining) \r')
        sys.stdout.flush()

        # Generate type list
        total_types = ['{']
        for data in results:
            if(data[-1] not in total_types):
                total_types.append('\"')
                total_types.append(data[-1])
                total_types.append('\"')
                total_types.append(',')
        total_types[-1] = '}'
        typeStr = ''.join(total_types)

        arff_file = label + '_train'

        gen_arff(arff_file, typeStr, results, occ, arff_idcol)

        train = loader.load_file(arff_file + '.arff')
        train.class_is_last()
        mv(arff_file + '.arff', master_saveDir)

        cls.build_classifier(train)

        evl = Evaluation(train)
        evl.crossvalidate_model(cls, train, 10, Random(1))

        print('\n')
        #print(evl.percent_correct)
        #print(evl.class_details())
        print(evl.matrix())
        total_conf.write('\n' + evl.matrix())
        print(evl.summary())
        total_conf.write(evl.summary() + '\n')

        final_result = round(evl.percent_correct, 2)

    else:
        # Pairwise run: for every ordered pair of distinct devices, check how
        # well the classifier separates the two devices from each other.
        success = []
        for startDev in devList:
            for changeToDev in devList:
                if startDev != changeToDev:
                    devCount += 1
                    remaining = chop_microseconds(((datetime.utcnow() - item_start)*totalDevs/devCount)-(datetime.utcnow() - item_start))
                    sys.stdout.write('Running ' + runType + ' classifier for \'' + label + '\' - ' + \
                                     str(round(100*float(devCount)/totalDevs,2)) + ' pct complete (' + str(remaining) + ' remaining) \r')
                    sys.stdout.flush()

                    # This branch always queries temp_dat_occ_vector_2,
                    # regardless of `occ`.
                    aws_c.execute('select * from temp_dat_occ_vector_2 ' \
                                  'where duty!=0 and deviceMAC in (\'' + startDev + '\',\'' + changeToDev + '\');')
                    # Replace the last column of each row with column 1.
                    results = [x[:-1] + (x[1],) for x in aws_c.fetchall()]  # Class label is just the deviceMAC

                    # Pairs with 10 or fewer rows are skipped.
                    if len(results) > 10:

                        # Generate type list
                        typeStr = '{' + startDev + ',' + changeToDev + '}'

                        arff_file = label + '_' + startDev + '_' + changeToDev + '_train'

                        gen_arff(arff_file, typeStr, results, occ, arff_idcol)

                        train = loader.load_file(arff_file + '.arff')
                        train.class_is_last()
                        mv(arff_file + '.arff', master_saveDir)

                        cls.build_classifier(train)

                        evl = Evaluation(train)
                        evl.crossvalidate_model(cls, train, 10, Random(1))

                        print('\n')
                        #print(evl.percent_correct)
                        #print(evl.class_details())
                        print(evl.matrix())
                        total_conf.write('\n' + evl.matrix())
                        print(evl.summary())
                        total_conf.write(evl.summary() + '\n')

                        success.append(evl.percent_correct)

        # Mean plus the 5th, 10th and 95th percentiles of pairwise accuracy.
        if len(success) > 0:
            final_result = [sum(success)/len(success), percentile(success, 5), percentile(success, 10), percentile(success, 95)]
        else:
            final_result = False

    if label in total_results:
        print('Warning label ' + label + ' exists twice, overwriting...')
    # NOTE(review): `0 != False` is False in Python, so a numeric result of
    # exactly 0 is not stored either.
    if final_result != False:
        total_results[label] = final_result
data.class_is_last()

# One plot curve per class label of the class attribute.
class_indices = range(0, data.class_attribute.num_values)

# Naive Bayes classifier for bug prediction: 10-fold cross-validation that
# also routes the per-instance predictions (with class distributions) into
# ``pred_output``.
pred_output = PredictionOutput(
    classname="weka.classifiers.evaluation.output.prediction.PlainText",
    options=["-distribution"])
classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
evaluation = Evaluation(data)
evaluation.crossvalidate_model(classifier, data, 10, Random(42), output=pred_output)
for draw, plot_title in ((plot_cls.plot_roc, "ROC bugs"),
                         (plot_cls.plot_prc, "PRC bugs - NaiveBayes")):
    draw(evaluation, title=plot_title, class_index=class_indices, wait=False)

# Performance metrics - Naive Bayes classifier
for render in (evaluation.summary, evaluation.class_details, evaluation.matrix):
    print(render())
print("confusionMatrix: " + str(evaluation.confusion_matrix))
# The per-class metrics below are reported for class index 1.
print("fMeasure: " + str(evaluation.f_measure(1)))
print("precision: " + str(evaluation.precision(1)))
print("recall: " + str(evaluation.recall(1)))

# Random Forest classifier, cross-validated with the same folds and seed.
classifier2 = Classifier(classname="weka.classifiers.trees.RandomForest")
evaluation2 = Evaluation(data)
evaluation2.crossvalidate_model(classifier2, data, 10, Random(42))
for draw, plot_title in ((plot_cls.plot_roc, "ROC bugs"),
                         (plot_cls.plot_prc, "PRC bugs - RandomForest")):
    draw(evaluation2, title=plot_title, class_index=class_indices, wait=False)

# Performance evaluation metrics - Random Forest
from utilities import *
import weka.core.jvm as jvm
from weka.core.converters import Loader, Saver
from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random

# Evaluates NaiveBayes on ./Dataset/trainGrid.arff with a 66% train/test
# split and prints the summary followed by the confusion matrix.
jvm.start(max_heap_size="3072m")
try:
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file("./Dataset/trainGrid.arff")
    data.class_is_last()

    # Alternative classifier kept for quick switching:
    #classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"])
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")

    evaluation = Evaluation(data)
    # Alternative evaluation protocol (10-fold cross-validation):
    #evaluation.crossvalidate_model(classifier, data, 10, Random(42))
    evaluation.evaluate_train_test_split(classifier, data, 66, Random(42))

    res = evaluation.summary()
    res += "\n" + evaluation.matrix()

    #f = open('./Dataset/resultsGrid.txt', 'w')
    #f.write(res)

    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(res)
finally:
    # Shut the JVM down even when loading or evaluation fails.
    jvm.stop()
def CrossValidateFullDataset():
    """Run 10-fold cross-validation of the configured classifiers.

    Loads ``test/full_dataset.csv``, takes the last attribute as the class,
    filters out the attributes matching the identifier regex and then, for
    every subset of ``att_set`` that still leaves at least one attribute
    family, filters out that subset and prints each classifier's summary and
    confusion matrix.

    Returns nothing; all results are printed.
    """
    data_dir = "test/"

    print("Loading Dataset...")
    data = converters.load_any_file(data_dir + "full_dataset.csv")
    print("Dataset Loaded!")

    # Set class attribute
    data.class_is_last()

    cls_classes = [
        # "weka.classifiers.trees.J48",
        "weka.classifiers.trees.RandomForest",
        # "weka.classifiers.lazy.IBk",
    ]
    classifiers = [Classifier(classname=name) for name in cls_classes]

    # Regex for attribute selection
    # (Useful for testing different combinations of attributes)
    identifier_att = ".*id.*"
    #timeseries_att = "raw.*"
    rmNoise_att = "rmNoise.*"
    #doppler_att = "doppler.*"
    #phase_att = "phase.*"
    #music_att = "music.*"
    #beamform_att = "beamform.*"
    #music_sliding_att = "music_sliding.*"
    #music_agg_att = "music_agg.*"
    #music_angles_att = "music_angles.*"

    att_set = [rmNoise_att]

    # Remove instances identifier attribute
    data = FilterAttribute(identifier_att, data)

    for att_comb in powerset(att_set):
        remaining = list(set(att_set) - set(att_comb))
        # Skip the combination that removes every attribute family. Checking
        # this before filtering replaces the old per-attribute guard, which
        # compared the length of the regex string with the number of families.
        if not remaining:
            continue

        data_filtered = data
        for att in att_comb:
            data_filtered = FilterAttribute(att, data_filtered)

        print(att_set)
        print(att_comb)

        print(colored("======================================================", 'green'))
        print(colored("Full attribute set: " + str(att_set), 'green'))
        print(colored("Removed attributes: " + str(att_comb), 'red'))
        if len(att_comb) > 0:
            print(colored("Using attributes: " + str(remaining), 'green'))
        print(colored("======================================================", 'green'))
        print(data_dir)

        for i, cls in enumerate(classifiers):
            evl = Evaluation(data_filtered)
            evl.crossvalidate_model(cls, data_filtered, 10, Random(1))

            print(colored("=> 10x cross-validation for " + cls_classes[i], 'red'))
            print(evl.summary())
            print(evl.matrix())
def ClassifyTestSet():
    """Evaluate several classifiers on a dedicated test set.

    Loads ``Testbed/training_dataset.csv`` and ``Testbed/testing_dataset.csv``
    (class attribute last), filters out the attributes matching the identifier
    regex and then, for every subset of the attribute families in ``att_set``,
    filters that subset out of both datasets, trains each classifier on the
    training data and prints its summary and confusion matrix on the test
    data.

    Returns nothing; all results are printed.
    """
    # Load Datasets
    data_dir = "Testbed/"

    # Print the raw line count of each CSV; ``with`` closes the handles,
    # which the previous bare ``open`` calls never did.
    for csv_name in ("training_dataset.csv", "testing_dataset.csv"):
        with open(data_dir + csv_name, "r") as handle:
            print(len(handle.readlines()))

    training = converters.load_any_file(data_dir + "training_dataset.csv")
    training.class_is_last()
    testing = converters.load_any_file(data_dir + "testing_dataset.csv")
    testing.class_is_last()  # set class attribute to be the last one listed

    # Choose classifiers to use
    cls_classes = [
        "weka.classifiers.trees.RandomForest",
        "weka.classifiers.trees.J48",
        "weka.classifiers.lazy.IBk",
    ]
    classifiers = [Classifier(classname=name) for name in cls_classes]

    # Regex for attribute selection
    # (Useful for testing different combinations of attributes)
    identifier_att = ".*id.*"
    timeseries_att = "Mic.*"
    doppler_att = "doppler.*"
    phase_att = "phase.*"
    music_att = "music.*"
    beamform_att = "beamform.*"

    att_set = [timeseries_att, doppler_att, phase_att, music_att, beamform_att]

    # Remove instances identifier attribute
    training = FilterAttribute(identifier_att, training)
    testing = FilterAttribute(identifier_att, testing)

    for att_comb in powerset(att_set):
        training_filtered = training
        testing_filtered = testing

        # Leave the data untouched when the combination lists every family.
        # The guard used to compare len(att) (length of the regex string)
        # with len(att_set); "Mic.*" has 5 characters and there are 5
        # families, so that family was silently never removed.
        if len(att_comb) != len(att_set):
            for att in att_comb:
                training_filtered = FilterAttribute(att, training_filtered)
                testing_filtered = FilterAttribute(att, testing_filtered)

        print(colored("======================================================", 'green'))
        print(colored("Full attribute set: " + str(att_set), 'green'))
        print(colored("Removed attributes: " + str(att_comb), 'green'))
        print(colored("======================================================", 'green'))

        for i, cls in enumerate(classifiers):
            cls.build_classifier(training_filtered)
            # Build the evaluation from the same filtered training data the
            # model was trained on, not from the unfiltered set.
            evl = Evaluation(training_filtered)
            evl.test_model(cls, testing_filtered)

            print(colored("=> Testing for " + cls_classes[i], 'red'))
            print(evl.summary())
            print(evl.matrix())
jvm.start()


def _cross_validate_and_report(classifier, dataset):
    """Cross-validate (10 folds, seed 1), print summary and matrix, return the evaluation."""
    result = Evaluation(dataset)
    result.crossvalidate_model(classifier, dataset, 10, Random(1))
    print(result.summary())
    print(result.matrix())
    return result


# Read the glass dataset; the class is its last attribute.
fname = os.sep.join([data_dir, "glass.arff"])
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.class_is_last()

# J48 with default options: cross-validate first ...
print("\nDefault J48")
cls = Classifier(classname="weka.classifiers.trees.J48")
evl = _cross_validate_and_report(cls, data)

# ... then train on the full dataset and show the resulting tree.
cls.build_classifier(data)
plg.plot_dot_graph(cls.graph)

# Unpruned J48 (-U) with a minimum of 15 instances per leaf (-M 15).
print("\nUnpruned J48 (minNumObj=15)")
cls = Classifier(classname="weka.classifiers.trees.J48", options=["-U", "-M", "15"])
evl = _cross_validate_and_report(cls, data)

# build and plot model
# NOTE(review): this first load is overwritten by the ``iris_file`` load right
# below, so "reviewsinformation_task2.arff" is read but never used - confirm
# which dataset is intended.
loader = Loader("weka.core.converters.ArffLoader")
iris_data = loader.load_file("reviewsinformation_task2.arff")
iris_data.class_is_last()

loader = Loader("weka.core.converters.ArffLoader")
iris_data = loader.load_file(iris_file)
iris_data.class_is_last()

# kernel classifier: SMO with an RBF kernel (gamma 0.001)
helper.print_title("Creating SMO as KernelClassifier")
kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel", options=["-G", "0.001"])
classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
classifier.kernel = kernel
classifier.build_classifier(iris_data)
print("classifier: " + classifier.to_commandline())
print("model:\n" + str(classifier))
#print("model:\n" + str(classifier))

# Evaluation must be constructed from an Instances object, not a file name;
# use the same dataset that is cross-validated on the next line.
# NOTE(review): ``diabetes_data`` and ``pred_output`` are not defined in this
# excerpt - presumably created elsewhere in the script; verify.
evaluation = Evaluation(diabetes_data)
evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output)
print(evaluation.summary())
print(evaluation.class_details())
print(evaluation.matrix())
sep=',', index=False) smote_test_data = convert.load_any_file( filename=GENERATED_SMOTE_TEST_DATA_FILE_PATH) smote_test_data.class_is_first() # load logistic model tree algorithm log_tree = Classifier(classname="weka.classifiers.trees.LMT") eval_smote_test_obj = Evaluation(smote_test_data) eval_smote_test_obj.crossvalidate_model(classifier=log_tree, data=smote_test_data, num_folds=5, rnd=Random(1)) print("SMOTE Test CV (5-folds) Error = %.2f%%" % (eval_smote_test_obj.percent_incorrect)) print(eval_smote_test_obj.matrix()) print("=================\"Summary\"====================") print(eval_smote_test_obj.summary()) log_tree.build_classifier(smote_test_data) y_predict = eval_smote_test_obj.test_model(log_tree, smote_test_data) y_test = to_binary_numeric(y_test.head(500), classNeg="neg") falsePositiveRate, truePositiveRate, thresholds = roc_curve( y_test, y_predict) # compute Area Under the Curve (AUC) using the trapezoidal rule area = auc(falsePositiveRate, truePositiveRate) plt.plot(falsePositiveRate, truePositiveRate,
export_test_data.to_csv(GENERATED_TEST_DATA_FILE_PATH, sep=',', index=False) test_data = convert.load_any_file(filename=GENERATED_TEST_DATA_FILE_PATH) test_data.class_is_first() # load logistic model tree algorithm log_tree = Classifier(classname="weka.classifiers.trees.LMT") eval_test_obj = Evaluation(test_data) eval_test_obj.crossvalidate_model(classifier=log_tree, data=test_data, num_folds=5, rnd=Random(1)) print("Test CV (10-folds) Error = %.2f%%" % (eval_test_obj.percent_incorrect)) print(eval_test_obj.matrix()) print("=================\"Summary\"====================") print(eval_test_obj.summary()) log_tree.build_classifier(test_data) y_predict = eval_test_obj.test_model(log_tree, test_data) y_test = to_binary_numeric(y_test.head(500), classNeg="neg") falsePositiveRate, truePositiveRate, thresholds = roc_curve(y_test, y_predict, pos_label=0) # compute Area Under the Curve (AUC) using the trapezoidal rule area = auc(falsePositiveRate, truePositiveRate) plt.plot(falsePositiveRate,
def ClassifyParam(website, mode, binWidths, truncation_modes=("full", "truncated")):
    """Train and evaluate a multinomial Naive Bayes model per bin width.

    For every truncation mode and bin width, loads
    ``Data/<website>/arff/TrainSet_<truncation>_<binWidth>.arff`` and the
    matching ``TestSet`` file, trains a FilteredClassifier
    (StringToWordVector + NaiveBayesMultinomial), prints training, sample and
    batch timings plus the evaluation summary and confusion matrix, and
    appends one CSV row to
    ``classificationResults/SingleWebsite_<truncation>_<website>.csv``.

    Args:
        website: name used in the data directory and the result file name.
        mode: nothing is classified unless it contains ``"normal"``.
        binWidths: iterable of bin widths to evaluate.
        truncation_modes: iterable of truncation labels; an immutable tuple
            is used as the default instead of a shared mutable list.
    """
    if not os.path.exists("classificationResults"):
        os.makedirs("classificationResults")

    if "normal" not in mode:
        return

    classifier_name = "weka.classifiers.meta.FilteredClassifier"

    for truncation in truncation_modes:
        results_path = "classificationResults/SingleWebsite_%s_%s.csv" % (truncation, website)
        # ``with`` closes the CSV even when loading or evaluation raises.
        with open(results_path, "w") as results_file:
            results_file.write("BinWidth, Accuracy, FalsePositiveRate, FalseNegativeRate\n")

            for binWidth in binWidths:
                train_set_file = "TrainSet_%s_%s.arff" % (truncation, binWidth)
                train_set = "Data/%s/arff/%s" % (website, train_set_file)
                test_set = "Data/%s/arff/%s" % (website, train_set_file.replace("TrainSet", "TestSet"))

                print("Loading Datasets...")
                print("Train: " + train_set)
                train_data = converters.load_any_file(train_set)
                print("Test: " + test_set)
                test_data = converters.load_any_file(test_set)

                # Set class attribute
                train_data.class_is_last()
                test_data.class_is_last()
                print("Dataset Loaded!")

                classifier = Classifier(classname=classifier_name, options=[
                    "-F", "weka.filters.unsupervised.attribute.StringToWordVector -R first-last -W 1000 -C -T -N 1 -stemmer weka.core.stemmers.NullStemmer -M 1 -tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\\\r\\\\n\\\\t.,;:\\\\\\\'\\\\\\\"()?!\\\"\"",
                    "-W", "weka.classifiers.bayes.NaiveBayesMultinomial"])

                start_train = time.time()
                classifier.build_classifier(train_data)
                end_train = time.time()
                print("Train\t%s\t%s" % (binWidth, end_train - start_train))

                # Time the classification of a single instance (the first
                # one of the test set); the other instances are skipped.
                for index, inst in enumerate(test_data):
                    if index == 0:
                        start_sample = time.time()
                        classifier.classify_instance(inst)
                        end_sample = time.time()
                        print("Sample\t%s\t%s" % (binWidth, end_sample - start_sample))

                print("Evaluating w/ Multinomial Naive Bayes classifier. BinWidth = %s" % (binWidth))
                evaluation = Evaluation(test_data)
                start_batch = time.time()
                evaluation.test_model(classifier, test_data)
                end_batch = time.time()
                print("Batch\t%s\t%s" % (binWidth, end_batch - start_batch))

                print(evaluation.summary())
                print(evaluation.matrix())

                # Just as an example, we're measuring the fpr and fnr of the
                # website indexed as class 1
                tp = evaluation.num_true_positives(1)
                tn = evaluation.num_true_negatives(1)
                fp = evaluation.num_false_positives(1)
                fn = evaluation.num_false_negatives(1)
                acc = (tp + tn) / float(tp + tn + fp + fn)
                fpr = evaluation.false_positive_rate(1)
                fnr = evaluation.false_negative_rate(1)

                print("Accuracy: %s" % (acc))
                print("False Positive Rate: %s" % (fpr))
                print("False Negative Rate: %s" % (fnr))

                results_file.write("%s, %s, %s, %s\n" % (binWidth, acc, fpr, fnr))
import weka.core.jvm as jvm
import weka.core.converters as conv
from weka.classifiers import Evaluation, Classifier
from weka.core.classes import Random
import weka.plot.classifiers as plcls  # NB: matplotlib is required
import os
from weka.core.converters import Loader

data_dir = "/home/suruchi/Desktop/BTECH Pro/new/click_prediction/"

# Cross-validates NaiveBayes (2 folds, seed 5) on the click-prediction
# dataset, prints the statistics and confusion matrix and plots the ROC
# curves of classes 0 and 1.
jvm.start(packages=True)
try:
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_dir + "click_prediction.arff")
    data.class_is_last()

    #print(data)

    cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 2, Random(5))

    print(evl.summary("=== NaiveBayes on click prediction (stats) ===", False))
    print(evl.matrix("=== NaiveBayes on click prediction(confusion matrix) ==="))
    #plcls.plot_classifier_errors(evl.predictions, absolute=False,wait = True)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    print("areaUnderROC/1: " + str(evl.area_under_roc(1)))
finally:
    # Shut the JVM down even when loading, evaluation or plotting fails.
    jvm.stop()
X_train, y_train = load_data(ROOT_PATH + APS_TRAIN,
                             skip_first_row=21,
                             y_column_index=0,
                             assignedColumnNames=APS_FULL_COLUMNS,
                             missingSymbol='na',
                             needImpute=True,
                             dropOrNot=True)
# Only the first 500 rows are exported; the label column goes first so that
# class_is_first() below selects it as the class attribute.
export_train_data = pd.concat([y_train.head(500), X_train.head(500)], axis=1)

# export data to csv
export_train_data.to_csv(GENERATED_TRAIN_DATA_FILE_PATH, sep=',', index=False)
train_data = convert.load_any_file(filename=GENERATED_TRAIN_DATA_FILE_PATH)
train_data.class_is_first()

# load logistic model tree algorithm
log_tree = Classifier(classname="weka.classifiers.trees.LMT")
eval_train_obj = Evaluation(train_data)
# Single source for the fold count so the printed label cannot drift from
# the value actually used (the message used to claim 10 folds while 5 ran).
train_cv_folds = 5
eval_train_obj.crossvalidate_model(classifier=log_tree,
                                   data=train_data,
                                   num_folds=train_cv_folds,
                                   rnd=Random(1))
print("Train CV (%d-folds) Error = %.2f%%" %
      (train_cv_folds, eval_train_obj.percent_incorrect))
print(eval_train_obj.matrix())
print("=================\"Summary\"====================")
print(eval_train_obj.summary())

# Fit on the full exported training sample and predict that same sample.
log_tree.build_classifier(train_data)
y_predict = eval_train_obj.test_model(log_tree, train_data)

# y_train = np.array(np.where(y_train.head(500).to_numpy() == 'neg', 0, 1))
y_train = to_binary_numeric(y_train.head(500), classNeg="neg")
falsePositiveRate, truePositiveRate, thresholds = roc_curve(y_train,
                                                            y_predict,
                                                            pos_label=0)
# compute Area Under the Curve (AUC) using the trapezoidal rule
area = auc(falsePositiveRate, truePositiveRate)
plt.plot(falsePositiveRate,
         truePositiveRate,
         color='red',
         label='ROC = ' + str(area))
# Dotted diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], linestyle='dotted')
def call_weka(file_dir, ml_opt, ofile_dir):
    """Cross-validate one Weka classifier on a CSV file and save the results.

    The classifier is selected by ``ml_opt``; it is evaluated with 10-fold
    cross-validation (seed 42). The summary, per-class details and confusion
    matrix are written to ``<ofile_dir><id>_results.txt`` and the classifier
    object is serialized to ``<ofile_dir><id>.model``.

    Args:
        file_dir: path of the CSV dataset; its last column is the class.
        ml_opt: string code '1'..'9' selecting the classifier; '0' means
            "no classifier" and returns right after loading the data.
        ofile_dir: prefix (directory plus separator) for the output files.

    Raises:
        ValueError: if ``ml_opt`` is not a known code. Previously this
            surfaced as an unbound ``classifier`` variable.
    """
    # ml_opt code -> (output id, Weka class name, options or None)
    models = {
        '1': ('SVM', "weka.classifiers.functions.LibSVM", [
            "-S", "0", "-K", "2", "-D", "3", "-G", "0.0", "-R", "0.0",
            "-N", "0.5", "-M", "40.0", "-C", "1.0", "-E", "0.001", "-P",
            "0.1", "-seed", "1"
        ]),
        '2': ('BagM5P', "weka.classifiers.meta.Bagging", [
            "-P", "100", "-S", "1", "-I", "10", "-W",
            "weka.classifiers.trees.M5P", "--", "-M", "4.0"
        ]),
        '3': ('MLPC', "weka.classifiers.functions.MLPClassifier", [
            '-N', '2', '-R', '0.01', '-O', '1.0E-6', '-P', '1', '-E', '1',
            '-S', '1'
        ]),
        '4': ('RF', "weka.classifiers.trees.RandomForest",
              ["-I", "100", "-K", "0", "-S", "1", "-num-slots", "1"]),
        '5': ('J48', "weka.classifiers.trees.J48", ["-C", "0.25", "-M", "2"]),
        '6': ('NaiveBayes', "weka.classifiers.bayes.NaiveBayes", None),
        '7': ('RBFNet', "weka.classifiers.functions.RBFNetwork", [
            "-B", "2", "-S", "1", "-R", "1.0E-8", "-M", "-1", "-W", "0.1"
        ]),
        '8': ('BayesNet', "weka.classifiers.bayes.BayesNet", [
            "-D", "-Q", "weka.classifiers.bayes.net.search.local.K2", "--",
            "-P", "1", "-S", "BAYES", "-E",
            "weka.classifiers.bayes.net.estimate.SimpleEstimator", "--",
            "-A", "0.5"
        ]),
        '9': ('LogReg', "weka.classifiers.functions.SimpleLogistic",
              ["-I", "0", "-M", "500", "-H", "50", "-W", "0.0"]),
    }

    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(file_dir)
    data.class_is_last()
    filtered = data

    if ml_opt == '0':
        return
    if ml_opt not in models:
        raise ValueError("unsupported ml_opt: %r" % (ml_opt,))

    ml_id, classname, options = models[ml_opt]
    classifier = Classifier(classname=classname, options=options)

    filtered.class_is_last()
    evaluation = Evaluation(filtered)
    evaluation.crossvalidate_model(classifier, filtered, 10, Random(42))
    print("Evaluation: Done.")

    # The file is opened in binary mode, so each report is encoded to ASCII
    # (dropping other characters) and written with a trailing newline.
    # ``with`` closes the file even if a write fails.
    with open(ofile_dir + ml_id + "_results.txt", 'wb') as ofile:
        for render in (evaluation.summary, evaluation.class_details,
                       evaluation.matrix):
            ofile.write(render().encode('ascii', 'ignore') + b"\n")

    serialization.write(ofile_dir + ml_id + ".model", classifier)
    print("Saving " + ml_id + " Model: Done.")
def experiment_sequential_file(path_indices, path_features,
                               path_folder_save_results, options, classifier,
                               name):
    """Evaluate a classifier on consecutive row segments of one dataset.

    The boundaries stored in ``path_indices`` split the rows of
    ``path_features`` into consecutive segments. Each segment is used once as
    the test set while the rows outside it form the training set. Per-segment
    predictions are written to
    ``<path_folder_save_results>//prediction/<name><j>pred_data.csv`` and the
    accuracy figures and confusion matrices to
    ``<path_folder_save_results>/<name>results.csv``.

    Args:
        path_indices: archive read with ``load``; exposes ``.files`` and
            item access (presumably a NumPy ``.npz`` file - verify).
        path_features: dataset file readable by ``load_any_file``; the last
            attribute is the class.
        path_folder_save_results: output folder; its ``prediction``
            sub-folder must already exist.
        options: Weka option string for the classifier.
        classifier: Weka classifier class name.
        name: prefix of the output file names.
    """
    ind_f = load(path_indices)
    # Only the last array of the archive is kept; adding 1 presumably turns
    # 0-based indices into the 1-based rows used by ``row_range`` - verify.
    for item in ind_f.files:
        ind = ind_f[item] + 1

    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))

    data = converters.load_any_file(path_features)
    # Close the final segment with the last row of the dataset.
    ind = np.append(ind, len(data))
    data.class_is_last()

    d_results = {
        'index': [],
        'percent_correct': [],
        'percent_incorrect': [],
        'confusion_matrix': []
    }

    last_row = ind[-1]
    for j in range(len(ind) - 1):
        start, stop = ind[j], ind[j + 1]
        print(j)
        print(str(start) + '-' + str(stop))
        d_test = data.subset(row_range=str(start) + '-' + str(stop))

        if j == 0:  # first segment: train on everything after it
            train_range = str(stop + 1) + '-' + str(last_row)
        elif j == len(ind) - 2:  # last segment: train on everything before it
            train_range = '1-' + str(start - 1)
        else:  # central segment: train on the rows on both sides
            train_range = ('1-' + str(start - 1) + ',' + str(stop + 1) + '-' +
                           str(last_row))
        d_train = data.subset(row_range=train_range)

        cls.build_classifier(d_train)

        # A fresh output collector per segment, so a saved prediction file
        # cannot contain text buffered for earlier segments.
        pout = PredictionOutput(
            classname="weka.classifiers.evaluation.output.prediction.CSV")
        evl = Evaluation(data)
        evl.test_model(cls, d_test, pout)

        save = pout.buffer_content()
        with open(
                path_folder_save_results + '/' + '/prediction/' + name +
                str(j) + 'pred_data.csv', 'w') as f:
            f.write(save)

        d_results['index'].append(str(start))
        d_results['percent_correct'].append(evl.percent_correct)
        d_results['percent_incorrect'].append(evl.percent_incorrect)
        d_results['confusion_matrix'].append(
            evl.matrix())  # Generates the confusion matrix.

    d_results = pd.DataFrame(data=d_results)
    d_results.to_csv(path_folder_save_results + '/' + name + 'results.csv',
                     index=False)
def main():
    """
    Just runs some example code.

    Walks through the classifier API on the iris, diabetes, bolts and labor
    datasets found in ``helper.get_data_dir()``: building classifiers in
    several ways, evaluating them, plotting the results and reading JRip's
    rules through the Java API. Everything is printed or plotted; nothing is
    returned.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline,
                                  classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.RBFKernel",
        options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO",
                                  options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
    # property of the J48 classifier itself. However, being of type float rather than double, we need
    # to convert it to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", types.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48",
                            options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0,
                                         Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(
        classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())
    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())
    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText",
        options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier,
                                   diabetes_data,
                                   10,
                                   Random(42),
                                   output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " +
          str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " +
          str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " +
          str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " +
          str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " +
          str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " +
          str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " +
          str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " +
          str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " +
          str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " +
          str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " +
          str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " +
          str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " +
          str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    plot_cls.plot_roc(evaluation,
                      title="ROC diabetes",
                      class_index=range(
                          0, diabetes_data.class_attribute.num_values),
                      wait=False)
    plot_cls.plot_prc(evaluation,
                      title="PRC diabetes",
                      class_index=range(
                          0, diabetes_data.class_attribute.num_values),
                      wait=False)

    # train 2nd classifier on diabetes dataset
    classifier2 = Classifier(classname="weka.classifiers.trees.RandomForest")
    evaluation2 = Evaluation(diabetes_data)
    evaluation2.crossvalidate_model(classifier2, diabetes_data, 10, Random(42))
    plot_cls.plot_rocs({
        "NB": evaluation,
        "RF": evaluation2
    },
                       title="ROC diabetes",
                       class_index=0,
                       wait=False)
    plot_cls.plot_prcs({
        "NB": evaluation,
        "RF": evaluation2
    },
                       title="PRC diabetes",
                       class_index=0,
                       wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(
            str(index + 1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # train 2nd classifier and show errors in same plot
    classifier2 = Classifier(classname="weka.classifiers.functions.SMOreg")
    evaluation2 = Evaluation(bolts_data)
    evaluation2.crossvalidate_model(classifier2, bolts_data, 10, Random(42))
    plot_cls.plot_classifier_errors(
        {
            "LR": evaluation.predictions,
            "SMOreg": evaluation2.predictions
        },
        wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    ]
    plot_cls.plot_learning_curve(cls,
                                 diabetes_data,
                                 increments=0.05,
                                 label_template="[#] !",
                                 metric="percent_correct",
                                 wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()

    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    # range() instead of the Python-2-only xrange(), so this loop also runs
    # on Python 3.
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
# Train and evaluate a logistic model tree (LMT) on the imbalanced dataset:
# 5-fold cross-validation on the training split, then a run on the test split.

# Load each split and mark its last attribute as the class.
train = converters.load_any_file("imbalanced_train.arff")
train.class_is_last()
test = converters.load_any_file("imbalanced_test.arff")
test.class_is_last()

# "-I 10" sets the number of iterations performed by LogitBoost.
lmt_options = ["-B", "-I", "10"]
cls = Classifier(classname="weka.classifiers.trees.LMT", options=lmt_options)

# 5-fold cross-validation on the training split, seeded with Random(1).
evl = Evaluation(train)
evl.crossvalidate_model(cls, train, 5, Random(1))

# Report the cross-validation error rate, then the confusion matrix.
cv_error = evl.percent_incorrect
print("LMT (imbalanced classes) CV = 5 Error: %.2f%%" % cv_error)
print(evl.matrix())

# ROC plot for class indices 0 and 1.
plcls.plot_roc(evl, class_index=[0, 1], wait=True)

# Extra summary statistics and per-class details.
print(evl.summary())
print(evl.class_details())

# Build the model on the training split, then evaluate it on the test split.
cls.build_classifier(train)
tevl = Evaluation(test)
tevl.test_model(cls, test)
test_error = tevl.percent_incorrect
print("LMT (imbalanced classes) Test Error: %.2f%%" % test_error)
# Train a J48 (C4.5) decision tree on the segment-challenge data and report
# its recognition rate on both the training data and the separate test set.

# Load both datasets; the class attribute is the last one in each.
trainData = loader.load_file('segment-challenge.arff')
trainData.class_is_last()
testData = loader.load_file('segment-test.arff')
testData.class_is_last()

# J48 is created without explicit options, i.e. with its defaults, and
# trained once on the training data.
classifier = Classifier(classname="weka.classifiers.trees.J48")
classifier.build_classifier(trainData)

print("\n\n=========== Classifier information ================\n\n")
print(classifier.options)
print(classifier)

# The same report is produced for each dataset: banner, command line of the
# classifier, confusion matrix, and percentage of correctly classified cases.
reports = (
    ("\n\n=========== Train results ================\n\n",
     trainData,
     "Train recognition: %0.2f%%"),
    ("\n\n=========== Test results ================\n\n",
     testData,
     "Test recognition: %0.2f%%"),
)
for banner, dataset, recognition_fmt in reports:
    print(banner)
    evaluation = Evaluation(dataset)
    evaluation.test_model(classifier, dataset)
    print(classifier.to_commandline())
    print(evaluation.matrix())
    print(recognition_fmt % evaluation.percent_correct)

jvm.stop()
# Evaluate several classifier / StringToWordVector configurations on the
# ReutersGrain test set. Each model is trained on `data` (loaded earlier in
# the script) and then tested on `test`.

# Load the test set; the class attribute is the last one.
fname = os.sep.join((data_dir, "ReutersGrain-test.arff"))
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
test.class_is_last()

# Each setup pairs a base classifier class name with the option list that is
# passed to the StringToWordVector filter.
setups = (
    ("weka.classifiers.trees.J48", []),
    ("weka.classifiers.bayes.NaiveBayes", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", []),
    ("weka.classifiers.bayes.NaiveBayesMultinomial", ["-C"]),
    ("weka.classifiers.bayes.NaiveBayesMultinomial",
     ["-C", "-L", "-stopwords-handler", "weka.core.stopwords.Rainbow"]),
)

# Train on `data`, test on `test`, and report accuracy, AUC and the
# confusion matrix for every setup.
for classifier, opt in setups:
    print("\n--> %s (filter options: %s)\n" % (classifier, " ".join(opt)))

    # Wrap the base classifier so the filter is applied as part of the model.
    cls = FilteredClassifier()
    cls.classifier = Classifier(classname=classifier)
    cls.filter = Filter(
        classname="weka.filters.unsupervised.attribute.StringToWordVector",
        options=opt)
    cls.build_classifier(data)

    evl = Evaluation(test)
    evl.test_model(cls, test)
    print("Accuracy: %0.0f%%" % evl.percent_correct)

    # Threshold-curve data for class index 0 feeds the AUC computation.
    tcdata = plc.generate_thresholdcurve_data(evl, 0)
    print("AUC: %0.3f" % plc.get_auc(tcdata))
    print(evl.matrix("Matrix:"))

jvm.stop()
# Cross-validate NaiveBayes on the weather.nominal dataset, print the summary,
# confusion matrix and per-instance predictions, then show the ROC curve.

# Dataset directory: taken from the WEKAMOOC_DATA environment variable,
# falling back to ./data when the variable is not set.
data_dir = os.environ.get("WEKAMOOC_DATA", "." + os.sep + "data")

import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.classifiers import Classifier, Evaluation, PredictionOutput
from weka.core.classes import Random
import weka.plot.classifiers as plc

jvm.start()
try:
    # load weather.nominal; the class attribute is the last one
    fname = data_dir + os.sep + "weather.nominal.arff"
    print("\nLoading dataset: " + fname + "\n")
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(fname)
    data.class_is_last()

    # cross-validate NaiveBayes (10 folds, seed 1), collecting the
    # predictions as plain text with the "-distribution" option enabled
    cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText",
        options=["-distribution"])
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1), pout)

    print(evl.summary())
    print(evl.matrix())
    print(pout)

    plc.plot_roc(evl, wait=True)
finally:
    # Always shut the JVM down, even when loading, evaluation or plotting
    # raises; the exception itself still propagates to the caller.
    jvm.stop()
sep=',', index=False) smote_train_data = convert.load_any_file( filename=GENERATED_SMOTE_TRAIN_DATA_FILE_PATH) smote_train_data.class_is_first() # load logistic model tree algorithm log_tree = Classifier(classname="weka.classifiers.trees.LMT") eval_smote_train_obj = Evaluation(smote_train_data) eval_smote_train_obj.crossvalidate_model(classifier=log_tree, data=smote_train_data, num_folds=5, rnd=Random(1)) print("SMOTE Train CV (5-folds) Error = %.2f%%" % (eval_smote_train_obj.percent_incorrect)) print(eval_smote_train_obj.matrix()) print("=================\"Summary\"====================") print(eval_smote_train_obj.summary()) log_tree.build_classifier(smote_train_data) y_predict = eval_smote_train_obj.test_model(log_tree, smote_train_data) y_train_smote = to_binary_numeric(y_train_smote, classNeg="neg") falsePositiveRate, truePositiveRate, thresholds = roc_curve(y_train_smote, y_predict, pos_label=0) # compute Area Under the Curve (AUC) using the trapezoidal rule area = auc(falsePositiveRate, truePositiveRate) plt.plot(falsePositiveRate,