def main():
    """
    Demonstrate the ARFF, CSV and text-directory loaders.

    The iris dataset is loaded once per file format and printed; a directory
    of text files is loaded and printed as well, provided the configured
    directory exists.
    """
    # the two file-based loaders only differ in title, converter and file name
    file_examples = (
        ("Loading ARFF file", "weka.core.converters.ArffLoader", "iris.arff"),
        ("Loading CSV file", "weka.core.converters.CSVLoader", "iris.csv"),
    )
    for title, converter, filename in file_examples:
        helper.print_title(title)
        loader = Loader(classname=converter)
        data = loader.load_file(helper.get_data_dir() + os.sep + filename)
        print(str(data))

    # placeholder location: point this at a real directory of text files
    text_dir = "/some/where"
    if not (os.path.exists(text_dir) and os.path.isdir(text_dir)):
        return
    helper.print_title("Loading directory: " + text_dir)
    loader = TextDirectoryLoader(
        options=["-dir", text_dir, "-F", "-charset", "UTF-8"])
    data = loader.load()
    print(unicode(data))
def run():
    """
    Convert the prediction CSV to ARFF and cross-validate J48 on it.

    The CSV file is loaded, saved as ARFF, re-loaded from the ARFF file and
    used to build and cross-validate (10 folds, Random(100)) a J48 classifier.
    The JVM is started on entry and is always stopped on the way out, also
    when one of the steps fails.

    The result is additionally stored in the module-level global ``j48``.

    :return: the evaluation's ``percent_correct`` value converted to a string
    :rtype: str
    """
    global j48

    csv_path = "/Users/imeiliasantoso/web_graduate_project4/predict_page/predict_data.csv"
    arff_path = "/Users/imeiliasantoso/web_graduate_project4/predict_page/predict_data.arff"

    jvm.start()
    try:
        # CSV -> ARFF conversion
        load_csv = Loader("weka.core.converters.CSVLoader")
        data_csv = load_csv.load_file(csv_path)
        saver = Saver("weka.core.converters.ArffSaver")
        saver.save_file(data_csv, arff_path)

        # work on the freshly written ARFF file
        load_arff = Loader("weka.core.converters.ArffLoader")
        data_arff = load_arff.load_file(arff_path)
        data_arff.class_is_last()

        J48_class = Classifier(classname="weka.classifiers.trees.J48",
                               options=["-C", "0.25", "-M", "2"])
        J48_class.build_classifier(data_arff)

        evaluationj48 = Evaluation(data_arff)
        evaluationj48.crossvalidate_model(J48_class, data_arff, 10, Random(100))
        j48 = str(evaluationj48.percent_correct)
    finally:
        # never leave the JVM running, even if loading/training/evaluation fails
        jvm.stop()
    return j48
def main():
    """
    Demonstrate filters: a single Remove filter, a MultiFilter chain and
    StringToWordVector on a text dataset.
    """
    # iris dataset
    iris_path = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_path)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    iris_data = arff_loader.load_file(iris_path)

    # single filter: Remove with "-R last" (drops the class attribute)
    helper.print_info("Removing class attribute")
    drop_last = Filter(
        classname="weka.filters.unsupervised.attribute.Remove",
        options=["-R", "last"])
    drop_last.inputformat(iris_data)
    without_class = drop_last.filter(iris_data)

    # filter chain: Remove with "-R first", followed by Standardize
    helper.print_info("Use MultiFilter")
    drop_first = Filter(
        classname="weka.filters.unsupervised.attribute.Remove",
        options=["-R", "first"])
    standardize = Filter(
        classname="weka.filters.unsupervised.attribute.Standardize")
    chain = MultiFilter()
    chain.filters = [drop_first, standardize]
    chain.inputformat(iris_data)
    chained = chain.filter(iris_data)

    # show the input next to both filtered versions
    for title, dataset in (("Input", iris_data),
                           ("Output", without_class),
                           ("Output (MultiFilter)", chained)):
        helper.print_title(title)
        print(dataset)

    # text dataset
    text_path = (helper.get_data_dir() + os.sep
                 + "reutersTop10Randomized_1perc_shortened.arff")
    helper.print_info("Loading dataset: " + text_path)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    text_data = arff_loader.load_file(text_path)
    text_data.class_is_last()

    # StringToWordVector with explicit stemmer, stopwords and tokenizer
    lovins = Stemmer(classname="weka.core.stemmers.IteratedLovinsStemmer")
    rainbow = Stopwords(classname="weka.core.stopwords.Rainbow")
    words = Tokenizer(classname="weka.core.tokenizers.WordTokenizer")
    to_vector = StringToWordVector(options=["-W", "10", "-L", "-C"])
    to_vector.stemmer = lovins
    to_vector.stopwords = rainbow
    to_vector.tokenizer = words
    to_vector.inputformat(text_data)
    vectorized = to_vector.filter(text_data)

    helper.print_title("Input (StringToWordVector)")
    print(text_data)
    helper.print_title("Output (StringToWordVector)")
    print(vectorized)
def main():
    """
    Demonstrate clusterers: a plain SimpleKMeans with evaluation and plot,
    a FilteredClusterer, and an incrementally trained Cobweb clusterer.
    """
    kmeans_class = "weka.clusterers.SimpleKMeans"
    remove_class = "weka.filters.unsupervised.attribute.Remove"

    # load the iris dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris = loader.load_file(iris_file)

    # the last attribute is the class; clustering works without it
    iris.delete_last_attribute()

    # --- plain clusterer -------------------------------------------------
    helper.print_title("Training SimpleKMeans clusterer")
    kmeans = Clusterer(classname=kmeans_class, options=["-N", "3"])
    kmeans.build_clusterer(iris)
    print(kmeans)

    helper.print_info("Evaluating on data")
    cluster_eval = ClusterEvaluation()
    cluster_eval.set_model(kmeans)
    cluster_eval.test_model(iris)
    print("# clusters: " + str(cluster_eval.num_clusters))
    print("log likelihood: " + str(cluster_eval.log_likelihood))
    print("cluster assignments:\n" + str(cluster_eval.cluster_assignments))
    plc.plot_cluster_assignments(cluster_eval, iris, inst_no=True)

    # --- filtered clusterer ----------------------------------------------
    # the dataset is re-loaded so that the Remove filter handles the class
    helper.print_title("Filtered clusterer")
    loader = Loader("weka.core.converters.ArffLoader")
    iris = loader.load_file(iris_file)
    base = Clusterer(classname=kmeans_class, options=["-N", "3"])
    drop_last = Filter(classname=remove_class, options=["-R", "last"])
    filtered = FilteredClusterer()
    filtered.clusterer = base
    filtered.filter = drop_last
    filtered.build_clusterer(iris)
    print(filtered)

    # --- incremental clusterer -------------------------------------------
    helper.print_title("Incremental clusterer")
    loader = Loader("weka.core.converters.ArffLoader")
    structure = loader.load_file(iris_file, incremental=True)
    cobweb = Clusterer("weka.clusterers.Cobweb")
    drop_last = Filter(classname=remove_class, options=["-R", "last"])
    drop_last.inputformat(structure)
    filtered_structure = drop_last.outputformat()
    cobweb.build_clusterer(filtered_structure)
    # push every instance through the filter before handing it to the clusterer
    for instance in loader:
        drop_last.input(instance)
        cobweb.update_clusterer(drop_last.output())
    cobweb.update_finished()

    print(cobweb.to_commandline())
    print(cobweb)
    print(cobweb.graph)
    plg.plot_dot_graph(cobweb.graph)
def create_weka_dataset(self, X, y=None):
    """Create a weka dataset by writing X and y to a temporary CSV file.

    Arguments:
        X {array like} -- non target class instances
        y {array like} -- target class instances; when omitted, a column
            filled with "?" (one entry per row of X) is used instead

    Returns:
        java object wrapped -- weka dataset, with the last column set as
            the class attribute
    """
    if y is None:
        y = pd.DataFrame(["?"] * X.shape[0],
                         columns=self.experiment_configuration["target"])

    # NOTE(review): the indices are reset in place, so the caller's frames are
    # modified as well -- confirm that callers do not rely on their old index.
    X.reset_index(drop=True, inplace=True)
    y.reset_index(drop=True, inplace=True)
    dataframe = pd.concat([X, y], axis=1, ignore_index=True)

    # Loader options; the non-classification setting replaces the label spec.
    options = None
    if self.y_uniques is not None:
        options = ["-L", "{}:{}".format(dataframe.shape[1],
                                        ",".join(map(str, self.y_uniques)))]
    if not self.is_classification():
        options = ["-R", "last"]

    # The context manager closes (and thereby removes) the temporary file on
    # every path, and cannot trip over an unassigned name when the file could
    # not be created in the first place.
    with tempfile.NamedTemporaryFile() as temp:
        dataframe.to_csv(temp.name, index=None)
        loader = Loader(classname="weka.core.converters.CSVLoader",
                        options=options)
        data = loader.load_file(temp.name)
        # Last column of data is target
        data.class_is_last()
    return data
def main():
    """
    Run the classification experiment on ten random train/test splits.

    The dataset name is taken from the first command-line argument and loaded
    from ``./data/<name>.arff`` with the last attribute as class. For each seed
    0..9 a 75% train/test split is generated, passed to ``classification`` and
    the two returned result tables are written as CSV files into
    ``resultados_<name>/`` (``E`` files hold the first returned table, ``T``
    files the second).
    """
    dataset = sys.argv[1]

    # load a dataset
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file("./data/" + dataset + ".arff")
    data.class_is_last()
    num_classes = data.class_attribute.num_values

    # only create the output directory when it is missing, so that a second
    # run for the same dataset does not fail
    out_dir = 'resultados_' + dataset
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    for random_cv in range(10):
        # generate train/test split of randomized data
        train, test = data.train_test_split(75.0, Random(random_cv))
        results_train, results_test = classification(data, train, test,
                                                     num_classes)

        # builtin str replaces the numpy alias, which newer numpy no longer has
        prefix = "./" + out_dir + "/resultados_" + dataset + "_"
        train_name = prefix + "E" + str(random_cv) + ".csv"
        test_name = prefix + "T" + str(random_cv) + ".csv"
        results_train.to_csv(train_name)
        results_test.to_csv(test_name)
def train_and_predict_instances(self, trainingFile, classifier):
    """
    Train a classifier on an ARFF file and predict every training instance.

    The last attribute is used as class. One line per instance is printed
    with the predicted label index and the class distribution.

    :param trainingFile: path of the ARFF file to train and predict on
    :param classifier: class name of the classifier to instantiate
    :return: list of four items: the per-instance class distributions, the
        per-instance hit flags (1.0 when prediction equals the actual class
        value, else 0.0), the column headings, and the actual class labels
    """
    loader = Loader(classname="weka.core.converters.ArffLoader")
    dataset = loader.load_file(trainingFile)
    dataset.class_is_last()

    class_names = [str(code) for code in dataset.class_attribute.values]
    head = [name + " probability" for name in class_names] + ["Guess"]

    model = Classifier(classname=classifier)
    model.build_classifier(dataset)

    predictions = []
    guess = []
    realLabels = []
    for position, inst in enumerate(dataset, start=1):
        pred = model.classify_instance(inst)
        actual = inst.get_value(inst.class_index)
        guess.append(1.0 if actual == pred else 0.0)

        dist = model.distribution_for_instance(inst)
        predictions.append(list(dist))
        realLabels.append(class_names[int(actual)])
        print(str(position) + ": label index=" + str(pred)
              + ", class distribution=" + str(dist))

    return [predictions, guess, head, realLabels]
def main(path, num_trees):
    """
    Train and test a model on ten seeded splits and report the accuracy.

    For every seed 0..9 the dataset is split with ``split_data``, a model is
    trained with ``train`` (``n_estimators=num_trees``) and evaluated with
    ``predict``; mean and standard deviation of the ten accuracies are printed.

    :param path: path of the ARFF file to load (last attribute is the class)
    :param num_trees: value passed to ``train`` as ``n_estimators``
    """
    arff_loader = Loader(classname='weka.core.converters.ArffLoader')
    dataset = arff_loader.load_file(path)
    dataset.class_is_last()

    accuracy = []
    for seed in range(10):
        random.seed(seed)
        data_train, data_test = split_data(dataset, 1.0 / 3)
        labels_test = [inst.values[inst.class_index] for inst in data_test]

        model = train(data_train, n_estimators=num_trees)
        predicted = predict(model, data_test)

        # fraction of test instances whose prediction matches the ground truth
        num_correct = sum(
            1.0 for guess, truth in zip(predicted, labels_test)
            if guess == truth)
        accuracy.append(num_correct / len(labels_test))

    # averaged result over all runs
    runs = len(accuracy)
    acc_mean = 1.0 * sum(accuracy) / runs
    acc_std = (sum((value - acc_mean)**2 for value in accuracy) / runs)**0.5
    per_run = ', '.join('{:.2%}'.format(value) for value in accuracy)

    print('Maximum number of trees: {}\n'.format(num_trees))
    print('Accuracy (mean={:.2%}, std={:.2%}):\n{}'.format(
        acc_mean, acc_std, per_run))
    print('-' * 120)
def load_data_from_arff(self):
    """
    Load the target dataset and all source datasets from ``self.arff_data_path``.

    The target ARFF file is split into ``self.target`` and ``self.eval``
    using ``num_games_target / num_games_source * 100`` as the split
    percentage. Every other file in the directory is loaded, in shuffled
    order, and appended to ``self.source``. The last attribute is the class
    in all datasets.
    """
    print("loading data from raw")
    loader = Loader(classname="weka.core.converters.ArffLoader")
    target_file = self.target_name + ".arff"

    # target
    print("Loading target data")
    all_target = loader.load_file(self.arff_data_path + target_file)
    all_target.class_is_last()
    train_vs_test_percent = (self.num_games_target
                             / self.num_games_source) * 100
    self.target, self.eval = all_target.train_test_split(
        train_vs_test_percent)
    print("target size:", self.target.num_instances)
    print("Eval size:", self.eval.num_instances)

    # source
    print("Loading source data")
    candidates = os.listdir(self.arff_data_path)
    random.shuffle(candidates)
    for filename in candidates:
        # the target file itself is not a source
        if filename == target_file:
            continue
        print("Loading", filename)
        source = loader.load_file(self.arff_data_path + filename)
        source.class_is_last()
        print("Size:", source.num_instances)
        self.source.append(source)
def main():
    """
    Demonstrate a classes-to-clusters evaluation on the iris dataset.

    SimpleKMeans is trained on a copy of the data without the class
    attribute and then evaluated against the full dataset that still
    carries the class.
    """
    # full dataset, class attribute set
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    with_class = arff_loader.load_file(iris_file)
    with_class.class_is_last()

    # training copy: unset the class, then drop the attribute
    training = Instances.copy_instances(with_class)
    training.no_class()
    training.delete_last_attribute()

    # build the clusterer on the class-less copy
    helper.print_title("Training SimpleKMeans clusterer")
    kmeans = Clusterer(classname="weka.clusterers.SimpleKMeans",
                       options=["-N", "3"])
    kmeans.build_clusterer(training)
    print("done")

    # evaluate against the dataset that still has the class
    cluster_eval = ClusterEvaluation()
    cluster_eval.set_model(kmeans)
    cluster_eval.test_model(with_class)
    helper.print_title("Cluster results")
    print(cluster_eval.cluster_results)
    helper.print_title("Classes to clusters")
    print(cluster_eval.classes_to_clusters)
def evaluation_data(self, model):
    """
    Evaluate a clusterer on the test ARFF file and label every test instance.

    The file named by ``self.dataTestName`` is loaded and evaluated with the
    given model. One line per instance is written to ``result_data.txt`` in
    the form ``<instance>,<label>``, where the label is "anomaly" when the
    instance was assigned to cluster 0 and "normal" otherwise.

    :param model: the clusterer to evaluate
    :return: the ``cluster_results`` of the evaluation
    :raises Exception: any failure is re-raised after its traceback has been
        printed
    """
    try:
        loader = Loader(classname="weka.core.converters.ArffLoader")
        data_test = loader.load_file(self.dataTestName)

        evaluation = ClusterEvaluation()
        evaluation.set_model(model)
        evaluation.test_model(data_test)
        cluster_ass = evaluation.cluster_assignments

        # the context manager closes the file even if a write fails
        with open("result_data.txt", "w+") as f:
            for i, ins in enumerate(data_test):
                stt = "anomaly" if cluster_ass[i] == 0 else "normal"
                f.write(str(ins) + "," + stt + "\n")
        return evaluation.cluster_results
    except Exception:
        # print first, then re-raise with the original traceback intact;
        # a print placed after the raise would never run
        print(traceback.format_exc())
        raise
def main(args):
    """
    Train J48 on a training set and print, for each instance of a test set,
    the actual class, the predicted class, an error flag and the class
    distribution. The class attribute is assumed to be the last attribute.

    :param args: the commandline arguments (train and test datasets)
    :type args: list
    """
    train_file, test_file = args[1], args[2]
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")

    # training data
    helper.print_info("Loading train: " + train_file)
    train = arff_loader.load_file(train_file)
    train.class_index = train.num_attributes - 1

    # test data
    helper.print_info("Loading test: " + test_file)
    test = arff_loader.load_file(test_file)
    test.class_is_last()

    # build the tree
    tree = Classifier(classname="weka.classifiers.trees.J48")
    tree.build_classifier(train)

    # one output row per test instance
    print("# - actual - predicted - error - distribution")
    for row_no, inst in enumerate(test, start=1):
        pred = tree.classify_instance(inst)
        dist = tree.distribution_for_instance(inst)
        actual_label = inst.get_string_value(inst.class_index)
        predicted_label = inst.class_attribute.value(int(pred))
        if pred != inst.get_value(inst.class_index):
            error_flag = "yes"
        else:
            error_flag = "no"
        print("%d - %s - %s - %s - %s"
              % (row_no, actual_label, predicted_label, error_flag,
                 str(dist.tolist())))
def main():
    """
    Demonstrate the CostSensitiveClassifier: J48 wrapped with a cost matrix,
    cross-validated on the diabetes dataset.
    """
    # dataset
    data_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + data_file)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    diabetes = arff_loader.load_file(data_file)
    diabetes.class_is_last()

    # meta classifier with J48 as its base classifier
    cost_sensitive = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.CostSensitiveClassifier",
        options=["-cost-matrix", "[0 1; 2 0]", "-S", "2"])
    cost_sensitive.classifier = Classifier(
        classname="weka.classifiers.trees.J48", options=["-C", "0.3"])

    # cross-validation
    folds = 10
    evaluation = Evaluation(diabetes)
    evaluation.crossvalidate_model(cost_sensitive, diabetes, folds, Random(1))

    # report
    summary_title = "=== " + str(folds) + " -fold Cross-Validation ==="
    print("")
    print("=== Setup ===")
    print("Classifier: " + cost_sensitive.to_commandline())
    print("Dataset: " + diabetes.relationname)
    print("")
    print(evaluation.summary(summary_title))
def main():
    """
    Run the NaiveBayes and DecisionTree experiments on ``./data/adult.csv``.

    The CSV file is loaded with the last attribute as class; a randomized
    copy (seed 1) is created and stratified when the class is nominal. Both
    experiment functions receive the randomized copy, the fold count, the
    seed and the original data. The JVM is stopped on every exit path.
    """
    try:
        jvm.start()

        loader = Loader(classname="weka.core.converters.CSVLoader")
        data = loader.load_file("./data/adult.csv")
        data.class_is_last()  # set class attribute

        # NOTE(review): ``k`` is not defined in this function; it is
        # presumably a module-level fold count -- confirm.
        folds = k
        seed = 1

        # randomize data
        rnd = Random(seed)
        rand_data = Instances.copy_instances(data)
        rand_data.randomize(rnd)
        if rand_data.class_attribute.is_nominal:
            rand_data.stratify(folds)

        NaiveBayes(rand_data, folds, seed, data)
        DecisionTree(rand_data, folds, seed, data)
    finally:
        # errors simply propagate; no handler is needed just to re-raise them
        jvm.stop()
def runSMO(file, bound):
    """
    Cross-validate an SMO classifier with a PolyKernel on a CSV dataset.

    The first attribute is used as class and the attributes given by
    ``bound`` are removed before the 10-fold cross-validation (Random(1)).
    The percentage of correct predictions and the class details are printed.

    :param file: path of the CSV file to load
    :param bound: attribute range passed to the Remove filter's ``-R`` option
    :return: the class details of the evaluation
    """
    csv_loader = Loader(classname="weka.core.converters.CSVLoader")
    dataset = csv_loader.load_file(file)
    dataset.class_is_first()

    attribute_filter = Filter(
        classname="weka.filters.unsupervised.attribute.Remove",
        options=["-R", bound])

    # SMO with a polynomial kernel
    smo = KernelClassifier(
        classname="weka.classifiers.functions.SMO",
        options=["-C", "1.0", "-L", "0.001", "-P", "1.0E-12", "-N", "0"])
    smo.kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.PolyKernel",
        options=["-C", "250007", "-E", "1.0"])

    prediction_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")

    # drop the unwanted attributes, then evaluate on what is left
    attribute_filter.inputformat(dataset)
    reduced = attribute_filter.filter(dataset)
    evaluation = Evaluation(reduced)
    evaluation.crossvalidate_model(smo, reduced, 10, Random(1),
                                   prediction_output)

    print(evaluation.percent_correct)
    result = evaluation.class_details()
    print(result)
    return result
def score(self, testExamples, labels):
    """
    Evaluate ``self.classifier`` on the given examples.

    The examples are written to ``testingweka.arff`` (one real-valued
    attribute per feature plus a TRUE/FALSE class, TRUE for label 1), loaded
    into ``self.testingData`` and used to test the classifier.

    :param testExamples: sequence of feature sequences; the first example
        determines the number of attributes
    :param labels: one label per example
    :return: the result of the evaluation's ``percent_correct()``
    """
    # the context manager closes the file even if a write fails half-way
    with open("testingweka.arff", "w") as f:
        f.write("@relation randomset\n")
        for j in range(len(testExamples[0])):
            f.write("@attribute feature%d real\n" % j)
        f.write("@attribute class {TRUE, FALSE}\n")
        f.write("@data\n")
        for (example, label) in zip(testExamples, labels):
            for feature in example:
                f.write("%f," % feature)
            f.write("TRUE\n" if label == 1 else "FALSE\n")

    loader = Loader(classname="weka.core.converters.ArffLoader")
    self.testingData = loader.load_file("testingweka.arff")
    # the class is the last attribute written above
    self.testingData.set_class_index(self.testingData.num_attributes() - 1)

    evaluation = Evaluation(self.trainingData)
    evaluation.test_model(self.classifier, self.testingData)
    return evaluation.percent_correct()
def retrain(self, examples, labels):
    """
    Train a new Logistic classifier on the given examples.

    The examples are written to ``trainingweka.arff`` (one real-valued
    attribute per feature plus a TRUE/FALSE class, TRUE for label 1), loaded
    into ``self.trainingData`` and used to build ``self.classifier``. The
    classifier's ``-R`` option is set to ``1.0 / self.C``.

    :param examples: sequence of feature sequences; the first example
        determines the number of attributes
    :param labels: one label per example
    """
    # the context manager closes the file even if a write fails half-way
    with open("trainingweka.arff", "w") as f:
        f.write("@relation randomset\n")
        for j in range(len(examples[0])):
            f.write("@attribute feature%d real\n" % j)
        f.write("@attribute class {TRUE, FALSE}\n")
        f.write("@data\n")
        for (example, label) in zip(examples, labels):
            for feature in example:
                f.write("%f," % feature)
            f.write("TRUE\n" if label == 1 else "FALSE\n")

    loader = Loader(classname="weka.core.converters.ArffLoader")
    self.trainingData = loader.load_file("trainingweka.arff")
    # the class is the last attribute written above
    self.trainingData.set_class_index(self.trainingData.num_attributes() - 1)

    self.classifier = Classifier(
        classname="weka.classifiers.functions.Logistic",
        options=["-R", "%f" % (1.0 / self.C)])
    self.classifier.build_classifier(self.trainingData)
def PredecirUnaTemporada(path):
    """
    Predict the season label for the instance derived from ``path``.

    The instance line produced by ``CrearInstanciaParaPredecir`` is appended
    to the stored ARFF header and written to
    ``ModelData/predictionFiles/inst.arff``; that file is then classified
    with the serialized model. The JVM is started on entry and always stopped
    before returning or raising.

    :param path: argument handed to ``CrearInstanciaParaPredecir``
    :return: the label ("invierno", "verano", "otono" or "primavera")
        predicted for the last instance of the file, or "" when the file
        holds no instance
    :rtype: str
    """
    jvm.start()
    try:
        insta = CrearInstanciaParaPredecir(path)

        # header + single data row = the ARFF file to classify
        with open('ModelData/wekaHeader.arff', 'r') as header_file:
            atributos = header_file.readlines()
        with open('ModelData/predictionFiles/inst.arff', 'w') as inst_file:
            inst_file.writelines(atributos)
            inst_file.write("\n" + insta + '\n')

        objects = serialization.read_all(
            "ModelData/77PercentModelPaisajes.model")
        classifier = Classifier(jobject=objects[0])

        loader = Loader()
        data = loader.load_file("ModelData/predictionFiles/inst.arff")
        data.class_is_last()

        # predicted class index -> label
        clases = ["invierno", "verano", "otono", "primavera"]
        prediccion = ""
        for inst in data:
            pred = classifier.classify_instance(inst)
            prediccion = clases[int(pred)]
    finally:
        # do not leave the JVM running when anything above fails
        jvm.stop()
    return prediccion
def TestClassification(arff, modelInput, results):
    """
    Classify a test set with a serialized model and write a result table.

    The first attribute of the test set is used as class. For every instance
    a tab-separated row (running number, original class label, predicted
    class label and the first two entries of the class distribution) is
    written to the results file and printed. The JVM is started on entry and
    always stopped afterwards.

    :param arff: path of the ARFF test set
    :param modelInput: path of the serialized model; its first object is
        used as the classifier
    :param results: path of the results file to write
    """
    # start the Java virtual machine
    jvm.start()
    try:
        # load the trained model
        objects = serialization.read_all(modelInput)
        clsf = Classifier(jobject=objects[0])
        print(clsf)

        # load the test set
        loader = Loader(classname="weka.core.converters.ArffLoader")
        test = loader.load_file(arff)
        test.class_is_first()

        # write the results
        # NOTE(review): the file is opened with the platform default
        # encoding although non-ASCII text is written -- confirm whether an
        # explicit encoding is wanted.
        with open(results, "w") as resultsFile:
            resultsFile.write("序号\t原判断\t预测\t良性概率\t恶性概率\n")
            print("序号\t原判断\t预测\t良性概率\t恶性概率")
            for index, inst in enumerate(test):
                pred = clsf.classify_instance(inst)
                dist = clsf.distribution_for_instance(inst)
                sampleID = index + 1
                origin = inst.get_string_value(inst.class_index)
                prediction = inst.class_attribute.value(int(pred))
                probabilities = dist.tolist()
                NRate = probabilities[0]
                PRate = probabilities[1]
                row = "%d\t%s\t%s\t%s\t%s" % (sampleID, origin, prediction,
                                              str(NRate), str(PRate))
                resultsFile.write(row + "\n")
                print(row)
    finally:
        # shut the Java virtual machine down, also on failure
        jvm.stop()
    print("检测完成")
def main(args):
    """
    Load a dataset, split it into a randomized train/test set, train J48 on
    the training part and evaluate the model on the test part.

    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """
    # dataset from the command line, falling back to the bundled vote data
    if len(args) > 1:
        data_file = args[1]
    else:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    helper.print_info("Loading dataset: " + data_file)
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(data_file)
    dataset.class_is_last()

    # 66% split of the randomized data
    train_set, test_set = dataset.train_test_split(66.0, Random(1))

    # train and show the model
    tree = Classifier(classname="weka.classifiers.trees.J48")
    tree.build_classifier(train_set)
    print(tree)

    # evaluate on the held-out part
    evaluation = Evaluation(train_set)
    evaluation.test_model(tree, test_set)
    print(evaluation.summary())
def predict_proba(self, X):
    """
    Return the class distributions predicted for the rows of X.

    X temporarily receives a ``class`` column (set to None) so that it can be
    exported through ``self.to_arff``; the column is removed again right after
    the export, also when the export fails.

    :param X: the data to predict on; modified temporarily as described above
    :return: None when the evaluation holds no predictions, the lone
        distribution when it holds exactly one, otherwise all distributions
        stacked row-wise
    """
    evaluation = Evaluation(self.train_data)

    # X is a large object that is not copied: the class column is added in
    # place and removed again, whatever happens during the export.
    X['class'] = None
    try:
        filename = self.to_arff(X, True)
    finally:
        del X['class']

    loader = Loader("weka.core.converters.ArffLoader")
    test_data = loader.load_file(filename)
    test_data.class_is_last()
    evaluation.test_model(self.classifier, test_data)

    # collect first and stack once instead of re-stacking for every prediction
    distributions = [pred.distribution for pred in evaluation.predictions]
    if not distributions:
        return None
    if len(distributions) == 1:
        # a lone distribution is handed back unstacked
        return distributions[0]
    return np.vstack(distributions)
def save_all_scores_on_validate(): for user in user_list: user_validate_dir = os.listdir("../data/arff_files/" + str(user) + "/validate/") user_validate_dir.sort() n = len(user_validate_dir) for expression_index in range(n): print expression_index, "=>", str( expression_list[expression_index]), ':', str( user_validate_dir[expression_index]) id = str(expression_list[expression_index]) + '_' + str(user) target_dir = '../results/' + str( expression_list[expression_index]) + '/' + str(user) + '/' model_dir = '../models/' + str( expression_list[expression_index]) + '/' + str(user) + '/' validate_data_file = "../data/arff_files/" + str( user) + "/validate/" + str(user_validate_dir[expression_index]) print validate_data_file, "=>", model_dir, "all algos", "=>", target_dir, "\n" loader = Loader(classname="weka.core.converters.ArffLoader") validate_data = loader.load_file(validate_data_file) for algo in algo_func_dict.keys(): trained_model = Classifier( jobject=serialization.read(model_dir + algo + ".model")) scores_matrix = get_classifier_score(trained_model, validate_data) out_file = target_dir + algo + "_scores.csv" #writing scores to target file np.savetxt(out_file, scores_matrix, delimiter=",")
def gridsearch():
    """
    Applies GridSearch to a dataset. The GridSearch package must not be
    installed, as the monolithic weka.jar already contains this package.
    """
    helper.print_title("GridSearch")

    # training data
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading train: " + bolts_file)
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")
    train = arff_loader.load_file(bolts_file)
    train.class_is_last()

    # both grid axes share range, step, base and expression
    def axis(prop):
        return {"property": prop, "min": -3.0, "max": 3.0, "step": 1.0,
                "base": 10.0, "expression": "pow(BASE,I)"}

    # grid search setup
    grid = GridSearch(options=["-sample-size", "100.0",
                               "-traversal", "ROW-WISE",
                               "-num-slots", "1",
                               "-S", "1"])
    grid.evaluation = "CC"
    grid.y = axis("kernel.gamma")
    grid.x = axis("C")

    # base classifier: SMOreg with an RBF kernel
    grid.classifier = Classifier(
        classname="weka.classifiers.functions.SMOreg",
        options=["-K", "weka.classifiers.functions.supportVector.RBFKernel"])

    grid.build_classifier(train)
    print("Model:\n" + str(grid))
    print("\nBest setup:\n" + grid.best.to_commandline())
def vote_classifier_train(dicrectory, nameOfDataSet, flag):
    """
    Evaluate a Vote ensemble on a CSV dataset and hand the result to
    ``print_and_save``.

    :param dicrectory: path of the CSV file to load (last attribute is class)
    :param nameOfDataSet: dataset name passed on to ``print_and_save``
    :param flag: when true, 10-fold cross-validation is used; otherwise an
        80% train/test split
    """
    csv_loader = Loader(classname="weka.core.converters.CSVLoader")
    dataset = csv_loader.load_file(dicrectory)
    dataset.class_is_last()

    # the Bagging member appears twice in the ensemble
    bagging_spec = (
        'weka.classifiers.meta.Bagging -P 100 -S 1 -num-slots 1 -I 10 -W weka.classifiers.trees.REPTree -- '
        '-M 2 -V 0.001 -N 3 -S 1 -L -1 -I 0.0')
    base_specs = [
        'weka.classifiers.trees.J48 -C 0.25 -M 2',
        'weka.classifiers.trees.RandomTree -K 6 -M 1.0 -V 0.001 -S 1',
        bagging_spec,
        'weka.classifiers.meta.AdaBoostM1 -P 100 -S 1 -I 10 -W weka.classifiers.trees.DecisionStump',
        bagging_spec,
        'weka.classifiers.bayes.NaiveBayes ',
    ]

    # "-S 1", one "-B <spec>" pair per member, then the combination rule
    vote_options = ['-S', '1']
    for spec in base_specs:
        vote_options += ['-B', spec]
    vote_options += ['-R', 'AVG']

    ensemble = MultipleClassifiersCombiner(
        classname="weka.classifiers.meta.Vote", options=vote_options)

    evaluation = Evaluation(dataset)
    prediction_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText")
    if flag:
        evaluation.crossvalidate_model(ensemble, dataset, 10, Random(1),
                                       prediction_output)
    else:
        evaluation.evaluate_train_test_split(ensemble, dataset, 80.0,
                                             Random(1), prediction_output)

    gc.collect()
    print_and_save('Proposed model', flag, nameOfDataSet, evaluation)
def initData(self, arrfFile): loader = Loader(classname="weka.core.converters.ArffLoader") print self.dataDir + '/' + arrfFile self.data = loader.load_file(self.dataDir + '/' + arrfFile) self.data.class_is_last() print 'Carregando arquivo ' + self.dataDir + '/' + arrfFile
def main():
    """
    Train SimpleKMeans on the iris dataset (without its class attribute) and
    print cluster and cluster distribution for every instance.
    """
    # dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    iris = arff_loader.load_file(iris_file)

    # the last attribute is the class; it is dropped before clustering
    iris.delete_last_attribute()

    # build the clusterer and show the model
    helper.print_title("Training SimpleKMeans clusterer")
    kmeans = Clusterer(classname="weka.clusterers.SimpleKMeans",
                       options=["-N", "3"])
    kmeans.build_clusterer(iris)
    print(kmeans)

    # assign every instance to a cluster
    helper.print_info("Clustering data")
    for row_no, instance in enumerate(iris, start=1):
        cluster = kmeans.cluster_instance(instance)
        distribution = kmeans.distribution_for_instance(instance)
        print(str(row_no) + ": cluster=" + str(cluster)
              + ", distribution=" + str(distribution))
def main(args):
    """
    Train a NaiveBayesUpdateable classifier incrementally on a dataset.
    The dataset can be supplied as parameter.

    :param args: the commandline arguments
    :type args: list
    """
    # dataset from the command line, falling back to the bundled vote data
    if len(args) > 1:
        data_file = args[1]
    else:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    helper.print_info("Loading dataset: " + data_file)

    # incremental loading: instances are pulled from the loader further down
    arff_loader = Loader(classname="weka.core.converters.ArffLoader")
    structure = arff_loader.load_file(data_file, incremental=True)
    structure.class_is_last()

    # initialize the classifier with what load_file returned
    naive_bayes = Classifier(
        classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    naive_bayes.build_classifier(structure)

    # feed the instances one at a time
    for instance in arff_loader:
        naive_bayes.update_classifier(instance)

    print(naive_bayes)
def load_custom_loader():
    """
    Build and run a flow that loads the iris CSV file with a custom loader
    and sends the result to the console.
    """
    helper.print_title("Load dataset (custom loader)")
    iris = helper.get_data_dir() + os.sep + "iris.csv"

    # flow: file supplier -> dataset loader -> console
    flow = Flow(name="load dataset")

    supplier = FileSupplier()
    supplier.config["files"] = [iris]
    flow.actors.append(supplier)

    dataset_loader = LoadDataset()
    dataset_loader.config["incremental"] = False
    dataset_loader.config["use_custom_loader"] = True
    dataset_loader.config["custom_loader"] = Loader(
        classname="weka.core.converters.CSVLoader")
    flow.actors.append(dataset_loader)

    flow.actors.append(Console())

    # setup and execution each hand back None on success, else a message
    setup_error = flow.setup()
    if setup_error is not None:
        print("Error setting up flow:\n" + setup_error)
    else:
        run_error = flow.execute()
        if run_error is not None:
            print("Error executing flow:\n" + run_error)

    flow.wrapup()
    flow.cleanup()
def main():
    """
    Demonstrate capabilities: print those of J48, compare them with the
    capabilities derived from the iris dataset and modify them.
    """
    # capabilities of the classifier
    j48 = Classifier("weka.classifiers.trees.J48")
    helper.print_title("Capabilities")
    classifier_caps = j48.capabilities
    print(classifier_caps)

    # capabilities required by the dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    iris = arff_loader.load_file(iris_file)
    iris.class_is_last()
    dataset_caps = Capabilities.for_instances(iris)
    print(dataset_caps)
    handles = classifier_caps.supports(dataset_caps)
    print("classifier handles dataset: " + str(handles))

    # change the classifier's capabilities and show the result
    helper.print_title("Disable/Enable")
    unary = Capability(member="UNARY_ATTRIBUTES")
    classifier_caps.disable(unary)
    classifier_caps.min_instances = 10
    print("Removing: " + str(unary))
    print(classifier_caps)
def crossTest(this, trainingFile, classifier, testFile):
    """
    Train a classifier on one ARFF file and evaluate it on another.

    The last attribute is the class in both files. The results are stored on
    the instance: ``this.header`` holds "Accuracy" followed by three column
    names per class label ("<label> TP", "<label> FP", "<label> AUC ROC") and
    ``this.values`` holds the matching numbers (percent correct, then true
    positive rate * 100, false positive rate * 100 and area under ROC for
    each class index).

    :param trainingFile: path of the ARFF file to train on
    :param classifier: class name of the classifier to instantiate
    :param testFile: path of the ARFF file to evaluate on
    """
    loader = Loader(classname="weka.core.converters.ArffLoader")

    data1 = loader.load_file(trainingFile)
    data1.class_is_last()
    cls = Classifier(classname=classifier)
    cls.build_classifier(data1)

    data2 = loader.load_file(testFile)
    data2.class_is_last()
    classes = [str(code) for code in data2.class_attribute.values]

    evl = Evaluation(data2)
    evl.test_model(cls, data2)

    header = ["Accuracy"]
    values = [evl.percent_correct]
    # enumerate supplies the class index directly, so no list search is
    # needed for every label
    for index, name in enumerate(classes):
        header += [name + " TP", name + " FP", name + " AUC ROC"]
        values += [
            evl.true_positive_rate(index) * 100,
            evl.false_positive_rate(index) * 100,
            evl.area_under_roc(index)
        ]

    this.values = values
    this.header = header