def train_and_predict_instances(self, trainingFile, classifier):
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(trainingFile)
    data.class_is_last()
    classes = [str(code) for code in data.class_attribute.values]
    head = [className + " probability" for className in classes]
    head.append("Guess")
    cls = Classifier(classname=classifier)
    cls.build_classifier(data)
    predictions = [[0, 0]] * len(data)
    realLabels = [""] * len(data)
    guess = [0] * len(data)
    for index, inst in enumerate(data):
        pred = cls.classify_instance(inst)
        if inst.get_value(inst.class_index) == pred:
            guess[index] = 1.0
        else:
            guess[index] = 0.0
        dist = cls.distribution_for_instance(inst)
        predictions[index] = [p for p in dist]
        realLabels[index] = classes[int(inst.get_value(inst.class_index))]
        print(str(index + 1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))
    return [predictions, guess, head, realLabels]
def train(self, dataset, training_data, force=False):
    """Perform the training of classifier.

    Parameters
    ----------
    dataset : string
        Path to image dataset.
    training_data : string
        Name of ARFF training file.
    force : boolean, optional, default = False
        If False, don't perform new training if there is trained data.
    """
    if self.data is not None and not force:
        return

    if self.data is not None:
        self.reset()

    loader = WLoader(classname="weka.core.converters.ArffLoader")

    training_file = File.make_path(dataset, training_data + ".arff")
    self.data = loader.load_file(training_file)
    self.data.class_is_last()

    options = None if self.options.value == 'default' else self.options.value.split()
    self.classifier = WClassifier(classname=self.classname.value, options=options)
    self.classifier.build_classifier(self.data)
def crossTest(this, trainingFile, classifier, testFile):
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data1 = loader.load_file(trainingFile)
    data1.class_is_last()
    cls = Classifier(classname=classifier)
    cls.build_classifier(data1)

    data2 = loader.load_file(testFile)
    data2.class_is_last()
    classes = [str(code) for code in data2.class_attribute.values]
    header = ["Accuracy"]
    for name in classes:
        header += [name + " TP", name + " FP", name + " AUC ROC"]

    values = []
    evl = Evaluation(data2)
    evl.test_model(cls, data2)
    values.append(evl.percent_correct)
    for name in classes:
        index = classes.index(name)
        values += [
            evl.true_positive_rate(index) * 100,
            evl.false_positive_rate(index) * 100,
            evl.area_under_roc(index)
        ]

    this.values = values
    this.header = header
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # train classifier
    classifier = Classifier("weka.classifiers.trees.J48")
    classifier.build_classifier(iris_data)

    # save and read object
    helper.print_title("I/O: single object")
    outfile = tempfile.gettempdir() + os.sep + "j48.model"
    serialization.write(outfile, classifier)
    model = Classifier(jobject=serialization.read(outfile))
    print(model)

    # save classifier and dataset header (multiple objects)
    helper.print_title("I/O: multiple objects")
    serialization.write_all(outfile, [classifier, Instances.template_instances(iris_data)])
    objects = serialization.read_all(outfile)
    for i, obj in enumerate(objects):
        helper.print_info("Object #" + str(i+1) + ":")
        if javabridge.get_env().is_instance_of(obj, javabridge.get_env().find_class("weka/core/Instances")):
            obj = Instances(jobject=obj)
        elif javabridge.get_env().is_instance_of(obj, javabridge.get_env().find_class("weka/classifiers/Classifier")):
            obj = Classifier(jobject=obj)
        print(obj)
class Classifier():

    def __init__(self):
        jvm.start(class_path=['/vol/customopt/machine-learning/src/weka/weka-3-6-8/weka.jar'])
        self.loader = Loader(classname="weka.core.converters.ArffLoader")

    def train(self, classifier, trainfile):
        if classifier == "ripper":
            # note: the original passed "O" without a leading dash; "-O" (optimization runs) is intended
            self.cls = classifiers.Classifier(
                classname="weka.classifiers.rules.JRip",
                options=["-P", "false", "-E", "false", "-O", "5"])
        data = self.loader.load_file(trainfile)
        data.set_class_index(data.num_attributes() - 1)
        self.cls.build_classifier(data)
        return self.cls.__str__()

    def test(self, testfile):
        predictions = []
        testdata = self.loader.load_file(testfile, incremental=True)
        testdata.set_class_index(testdata.num_attributes() - 1)
        while True:
            inst = self.loader.next_instance(testdata)
            if inst is None:
                break
            predictions.append([
                self.cls.classify_instance(inst),
                " ".join([str(round(x, 2)) for x in self.cls.distribution_for_instance(inst)])
            ])
        return predictions

    def stop(self):
        jvm.stop()
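# Hedged usage sketch (not part of the original source): one way to drive the Classifier
# wrapper above. The ARFF file names are assumptions; train() currently only configures
# the "ripper" (JRip) setting.
if __name__ == '__main__':
    clf = Classifier()
    print(clf.train("ripper", "train.arff"))
    for label, dist in clf.test("test.arff"):
        print(label, dist)
    clf.stop()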
def runSMO(file, bound):
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(file)
    data.class_is_first()

    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", bound])

    cls = KernelClassifier(
        classname="weka.classifiers.functions.SMO",
        options=["-C", "1.0", "-L", "0.001", "-P", "1.0E-12", "-N", "0"])
    kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.PolyKernel",
        options=["-C", "250007", "-E", "1.0"])
    cls.kernel = kernel
    pout = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText")

    remove.inputformat(data)
    filtered = remove.filter(data)

    evl = Evaluation(filtered)
    evl.crossvalidate_model(cls, filtered, 10, Random(1), pout)
    #print(pout.buffer_content())
    print(evl.percent_correct)
    #print(evl.summary())
    result = evl.class_details()
    print(result)
    return result
def retrain(self, examples, labels):
    f = open("trainingweka.arff", "w")
    f.write("@relation randomset\n")
    for j in range(len(examples[0])):
        f.write("@attribute feature%d real\n" % j)
    f.write("@attribute class {TRUE, FALSE}\n")
    f.write("@data\n")
    for (example, label) in zip(examples, labels):
        for feature in example:
            f.write("%f," % feature)
        if label == 1:
            f.write("TRUE\n")
        else:
            f.write("FALSE\n")
    f.close()

    loader = Loader(classname="weka.core.converters.ArffLoader")  # options=["-H", "-B", "10000"])
    self.trainingData = loader.load_file("trainingweka.arff")
    self.trainingData.set_class_index(self.trainingData.num_attributes() - 1)
    self.classifier = Classifier(
        classname="weka.classifiers.functions.Logistic",
        options=["-R", "%f" % (1.0 / self.C)])
    self.classifier.build_classifier(self.trainingData)
def score(self, testExamples, labels):
    f = open("testingweka.arff", "w")
    f.write("@relation randomset\n")
    for j in range(len(testExamples[0])):
        f.write("@attribute feature%d real\n" % j)
    f.write("@attribute class {TRUE, FALSE}\n")
    f.write("@data\n")
    for (example, label) in zip(testExamples, labels):
        for feature in example:
            f.write("%f," % feature)
        if label == 1:
            f.write("TRUE\n")
        else:
            f.write("FALSE\n")
    f.close()

    loader = Loader(classname="weka.core.converters.ArffLoader")  # options=["-H", "-B", "10000"])
    self.testingData = loader.load_file("testingweka.arff")
    self.testingData.set_class_index(self.testingData.num_attributes() - 1)

    evaluation = Evaluation(self.trainingData)
    evaluation.test_model(self.classifier, self.testingData)
    #print evaluation.percent_correct()
    #jvm.stop()
    return evaluation.percent_correct()
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    full = loader.load_file(iris_file)
    full.class_is_last()

    # remove class attribute
    data = Instances.copy_instances(full)
    data.no_class()
    data.delete_last_attribute()

    # build a clusterer and output model
    helper.print_title("Training SimpleKMeans clusterer")
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    clusterer.build_clusterer(data)
    print("done")

    # classes to clusters
    evl = ClusterEvaluation()
    evl.set_model(clusterer)
    evl.test_model(full)
    helper.print_title("Cluster results")
    print(evl.cluster_results)
    helper.print_title("Classes to clusters")
    print(evl.classes_to_clusters)
def PredecirUnaTemporada(path):
    jvm.start()
    insta = CrearInstanciaParaPredecir(path)

    file = open('ModelData/wekaHeader.arff', 'r')
    atributos = file.readlines()
    file.close()

    file = open('ModelData/predictionFiles/inst.arff', 'w')
    file.writelines(atributos)
    file.write("\n" + insta + '\n')
    file.close()

    objects = serialization.read_all("ModelData/77PercentModelPaisajes.model")
    classifier = Classifier(jobject=objects[0])

    loader = Loader()
    data = loader.load_file("ModelData/predictionFiles/inst.arff")
    data.class_is_last()

    clases = ["invierno", "verano", "otono", "primavera"]
    prediccion = ""
    for index, inst in enumerate(data):
        pred = classifier.classify_instance(inst)
        dist = classifier.distribution_for_instance(inst)
        prediccion = clases[int(pred)]

    jvm.stop()
    return prediccion
def main(path, num_trees):
    loader = Loader(classname='weka.core.converters.ArffLoader')
    # load the data
    ds = loader.load_file(path)
    ds.class_is_last()

    accuracy = list()
    for i in range(10):
        random.seed(i)
        data_train, data_test = split_data(ds, 1.0 / 3)  # split it
        labels_test = [inst.values[inst.class_index] for inst in data_test]
        rf = train(data_train, n_estimators=num_trees)  # train the model
        predicted = predict(rf, data_test)  # test the model
        # compute the accuracy of correctly classified instances
        num_correct = sum(
            [1.0 for y, gt in zip(predicted, labels_test) if y == gt])
        accuracy.append(num_correct / len(labels_test))

    # compute the percentage of correctly classified instances (averaged result)
    acc_mean = 1.0 * sum(accuracy) / len(accuracy)
    acc_std = (sum((x - acc_mean)**2 for x in accuracy) / len(accuracy))**0.5
    print('Maximum number of trees: {}\n'.format(num_trees))
    print('Accuracy (mean={:.2%}, std={:.2%}):\n{}'.format(
        acc_mean, acc_std, ', '.join('{:.2%}'.format(acc) for acc in accuracy)))
    print('-' * 120)
def main(args):
    """
    Trains a J48 classifier on a training set and outputs the predicted class and class distribution
    alongside the actual class from a test set. Class attribute is assumed to be the last attribute.

    :param args: the commandline arguments (train and test datasets)
    :type args: list
    """
    # load a dataset
    helper.print_info("Loading train: " + args[1])
    loader = Loader(classname="weka.core.converters.ArffLoader")
    train = loader.load_file(args[1])
    train.class_index = train.num_attributes - 1
    helper.print_info("Loading test: " + args[2])
    test = loader.load_file(args[2])
    test.class_is_last()

    # classifier
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)

    # output predictions
    print("# - actual - predicted - error - distribution")
    for index, inst in enumerate(test):
        pred = cls.classify_instance(inst)
        dist = cls.distribution_for_instance(inst)
        print(
            "%d - %s - %s - %s - %s" %
            (index+1,
             inst.get_string_value(inst.class_index),
             inst.class_attribute.value(int(pred)),
             "yes" if pred != inst.get_value(inst.class_index) else "no",
             str(dist.tolist())))
def vote_classifier_train(dicrectory, nameOfDataSet, flag):
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(dicrectory)
    data.class_is_last()
    meta = MultipleClassifiersCombiner(
        classname="weka.classifiers.meta.Vote",
        options=[
            '-S', '1',
            '-B', 'weka.classifiers.trees.J48 -C 0.25 -M 2',
            '-B', 'weka.classifiers.trees.RandomTree -K 6 -M 1.0 -V 0.001 -S 1',
            '-B', 'weka.classifiers.meta.Bagging -P 100 -S 1 -num-slots 1 -I 10 -W weka.classifiers.trees.REPTree -- '
                  '-M 2 -V 0.001 -N 3 -S 1 -L -1 -I 0.0',
            '-B', 'weka.classifiers.meta.AdaBoostM1 -P 100 -S 1 -I 10 -W weka.classifiers.trees.DecisionStump',
            '-B', 'weka.classifiers.meta.Bagging -P 100 -S 1 -num-slots 1 -I 10 -W weka.classifiers.trees.REPTree -- '
                  '-M 2 -V 0.001 -N 3 -S 1 -L -1 -I 0.0',
            '-B', 'weka.classifiers.bayes.NaiveBayes ',
            '-R', 'AVG'
        ])
    eval = Evaluation(data)
    pout = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText")
    if flag:
        eval.crossvalidate_model(meta, data, 10, Random(1), pout)
    else:
        eval.evaluate_train_test_split(meta, data, 80.0, Random(1), pout)
    gc.collect()
    print_and_save('Proposed model', flag, nameOfDataSet, eval)
def Feature_Selection(infile):
    directory = os.getcwd() + '/'
    csvpath = directory + infile

    jvm.start(packages=True, max_heap_size="4g")
    print "\n\n"
    print "Loaded file: ", infile
    csvloader = Loader(classname="weka.core.converters.CSVLoader")
    csvdata = csvloader.load_file(csvpath)

    remover = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", " 1"])
    remover.inputformat(csvdata)
    filtered_data = remover.filter(csvdata)
    filtered_data.class_is_last()

    search = ASSearch(classname="weka.attributeSelection.BestFirst", options=["-D", "1", "-N", "5"])
    evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval", options=["-P", "1", "-E", "1"])
    attribs = AttributeSelection()
    attribs.search(search)
    attribs.evaluator(evaluator)
    attribs.select_attributes(filtered_data)
    print "Summary of Attribute Selection: "
    print attribs.results_string
    jvm.stop()
    return
def main(args):
    """
    Loads a dataset, shuffles it, splits it into train/test set. Trains J48 with training set and
    evaluates the built model on the test set.

    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """
    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # generate train/test split of randomized data
    train, test = data.train_test_split(66.0, Random(1))

    # build classifier
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)
    print(cls)

    # evaluate
    evl = Evaluation(train)
    evl.test_model(cls, test)
    print(evl.summary())
def create_weka_dataset(self, X, y=None):
    """Create weka dataset using a temporary file.

    Arguments:
        X {array like} -- non target class instances
        y {array like} -- target class instances

    Returns:
        java object wrapped -- weka dataset
    """
    try:
        # Create new temporary file
        temp = tempfile.NamedTemporaryFile()
        # Concat X and y. Write csv to the temporary file.
        if y is None:
            y = pd.DataFrame(["?"]*X.shape[0], columns=self.experiment_configuration["target"])
        X.reset_index(drop=True, inplace=True)
        y.reset_index(drop=True, inplace=True)
        dataframe = pd.concat([X, y], axis=1, ignore_index=True)
        dataframe.to_csv(temp.name, index=None)
        options = None
        if self.y_uniques is not None:
            options = ["-L", "{}:{}".format(dataframe.shape[1], ",".join(map(str, self.y_uniques)))]
        if not self.is_classification():
            options = ["-R", "last"]
        loader = Loader(classname="weka.core.converters.CSVLoader", options=options)
        data = loader.load_file(temp.name)
        # Last column of data is target
        data.class_is_last()
    finally:
        temp.close()
    return data
def main(args):
    """
    Trains a NaiveBayesUpdateable classifier incrementally on a dataset. The dataset can be supplied as parameter.

    :param args: the commandline arguments
    :type args: list
    """
    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_file, incremental=True)
    data.class_is_last()

    # classifier
    nb = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    nb.build_classifier(data)

    # train incrementally
    for inst in loader:
        nb.update_classifier(inst)

    print(nb)
def main():
    try:
        jvm.start()
        loader = Loader(classname="weka.core.converters.CSVLoader")
        data = loader.load_file("./data/adult.csv")
        data.class_is_last()  # set class attribute

        # randomize data
        folds = k
        seed = 1
        rnd = Random(seed)
        rand_data = Instances.copy_instances(data)
        rand_data.randomize(rnd)
        if rand_data.class_attribute.is_nominal:
            rand_data.stratify(folds)

        NaiveBayes(rand_data, folds, seed, data)
        DecisionTree(rand_data, folds, seed, data)
    except Exception as e:
        raise e
    finally:
        jvm.stop()
def load_data_from_arff(self):
    print("loading data from raw")
    loader = Loader(classname="weka.core.converters.ArffLoader")

    # target
    print("Loading target data")
    all_target = loader.load_file(self.arff_data_path + self.target_name + ".arff")
    all_target.class_is_last()
    train_vs_test_percent = (self.num_games_target / self.num_games_source) * 100
    self.target, self.eval = all_target.train_test_split(train_vs_test_percent)
    print("target size:", self.target.num_instances)
    print("Eval size:", self.eval.num_instances)

    # source
    print("Loading source data")
    i = 0
    allFiles = os.listdir(self.arff_data_path)
    random.shuffle(allFiles)
    while i < len(allFiles):
        filename = allFiles[i]
        if filename != self.target_name + ".arff":
            print("Loading", filename)
            source = loader.load_file(self.arff_data_path + filename)
            source.class_is_last()
            print("Size:", source.num_instances)
            self.source.append(source)
        i += 1
def evaluation_data(self, model):
    try:
        loader = Loader(classname="weka.core.converters.ArffLoader")
        data_test = loader.load_file(self.dataTestName)
        #helper.print_info("Evaluating on data:")
        evaluation = ClusterEvaluation()
        evaluation.set_model(model)
        evaluation.test_model(data_test)
        #print("# clusters: " + str(evaluation.num_clusters))
        #print("# log likelihood: " + str(evaluation.log_likelihood))
        cluster_ass = evaluation.cluster_assignments
        #print("# cluster assignments:\n" + str(cluster_ass))

        f = open("result_data.txt", "w+")
        i = 0
        for ins in data_test:
            stt = "normal"
            if cluster_ass[i] == 0:
                stt = "anomaly"
            statement = str(ins) + "," + stt
            #print statement
            f.write(statement + "\n")
            i = i + 1
        f.close()
        return evaluation.cluster_results
    except Exception, e:
        # print the traceback before re-raising (originally placed after the raise and never reached)
        print(traceback.format_exc())
        raise e
def main():
    """
    Shows how to use the CostSensitiveClassifier.
    """
    # load a dataset
    data_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # classifier
    classifier = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.CostSensitiveClassifier",
        options=["-cost-matrix", "[0 1; 2 0]", "-S", "2"])
    base = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    classifier.classifier = base

    folds = 10
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, folds, Random(1))

    print("")
    print("=== Setup ===")
    print("Classifier: " + classifier.to_commandline())
    print("Dataset: " + data.relationname)
    print("")
    print(evaluation.summary("=== " + str(folds) + " -fold Cross-Validation ==="))
def save_all_scores_on_validate():
    for user in user_list:
        user_validate_dir = os.listdir("../data/arff_files/" + str(user) + "/validate/")
        user_validate_dir.sort()
        n = len(user_validate_dir)
        for expression_index in range(n):
            print expression_index, "=>", str(expression_list[expression_index]), ':', str(user_validate_dir[expression_index])
            id = str(expression_list[expression_index]) + '_' + str(user)
            target_dir = '../results/' + str(expression_list[expression_index]) + '/' + str(user) + '/'
            model_dir = '../models/' + str(expression_list[expression_index]) + '/' + str(user) + '/'
            validate_data_file = "../data/arff_files/" + str(user) + "/validate/" + str(user_validate_dir[expression_index])
            print validate_data_file, "=>", model_dir, "all algos", "=>", target_dir, "\n"

            loader = Loader(classname="weka.core.converters.ArffLoader")
            validate_data = loader.load_file(validate_data_file)

            for algo in algo_func_dict.keys():
                trained_model = Classifier(jobject=serialization.read(model_dir + algo + ".model"))
                scores_matrix = get_classifier_score(trained_model, validate_data)
                out_file = target_dir + algo + "_scores.csv"
                # writing scores to target file
                np.savetxt(out_file, scores_matrix, delimiter=",")
def predict_proba(self, X):
    evaluation = Evaluation(self.train_data)
    # Add class column (we can't copy X, because this is a large object, so we add the column and remove it later)
    X['class'] = None
    filename = self.to_arff(X, True)
    # Remove class column
    del X['class']

    loader = Loader("weka.core.converters.ArffLoader")
    test_data = loader.load_file(filename)
    test_data.class_is_last()
    evaluation.test_model(self.classifier, test_data)

    probas = None
    # Return probabilities
    for pred in evaluation.predictions:
        if probas is None:
            probas = pred.distribution
        else:
            probas = np.vstack([probas, pred.distribution])
    return probas
def main():
    dataset = sys.argv[1]

    # load a dataset
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file("./data/" + dataset + ".arff")
    data.class_is_last()
    num_classes = data.class_attribute.num_values

    os.mkdir('resultados_' + sys.argv[1])
    for random_cv in range(10):  # 10 CV
        # generate train/test split of randomized data
        train, test = data.train_test_split(75.0, Random(random_cv))
        results_train, results_test = classification(data, train, test, num_classes)
        # results_test = classification(test, num_classes)

        # Write results as CSV files
        train_name = "./resultados_" + sys.argv[1] + "/resultados_" + sys.argv[1] + "_" + "E" + np.str(random_cv) + ".csv"
        test_name = "./resultados_" + sys.argv[1] + "/resultados_" + sys.argv[1] + "_" + "T" + np.str(random_cv) + ".csv"
        results_train.to_csv(train_name)
        results_test.to_csv(test_name)
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris_file)

    # remove class attribute
    data.delete_last_attribute()

    # build a clusterer and output model
    helper.print_title("Training SimpleKMeans clusterer")
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    clusterer.build_clusterer(data)
    print(clusterer)

    # cluster data
    helper.print_info("Clustering data")
    for index, inst in enumerate(data):
        cl = clusterer.cluster_instance(inst)
        dist = clusterer.distribution_for_instance(inst)
        print(str(index+1) + ": cluster=" + str(cl) + ", distribution=" + str(dist))
def main():
    """
    Just runs some example code.
    """
    classifier = Classifier("weka.classifiers.trees.J48")

    helper.print_title("Capabilities")
    capabilities = classifier.capabilities
    print(capabilities)

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()
    data_capabilities = Capabilities.for_instances(iris_data)
    print(data_capabilities)
    print("classifier handles dataset: " + str(capabilities.supports(data_capabilities)))

    # disable/enable
    helper.print_title("Disable/Enable")
    capability = Capability(member="UNARY_ATTRIBUTES")
    capabilities.disable(capability)
    capabilities.min_instances = 10
    print("Removing: " + str(capability))
    print(capabilities)
def main():
    """
    Shows how to use the CostSensitiveClassifier.
    """
    # load a dataset
    data_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # classifier
    classifier = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.CostSensitiveClassifier",
        options=["-cost-matrix", "[0 1; 2 0]", "-S", "2"])
    base = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    classifier.classifier = base

    folds = 10
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, folds, Random(1))

    print("")
    print("=== Setup ===")
    print("Classifier: " + classifier.to_commandline())
    print("Dataset: " + data.relationname)
    print("")
    print(
        evaluation.summary("=== " + str(folds) + " -fold Cross-Validation ==="))
def initData(self, arrfFile):
    loader = Loader(classname="weka.core.converters.ArffLoader")
    print self.dataDir + '/' + arrfFile
    self.data = loader.load_file(self.dataDir + '/' + arrfFile)
    self.data.class_is_last()
    print 'Loading file ' + self.dataDir + '/' + arrfFile
def runCV(this, arffFile, classifier, folds):
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(arffFile)
    data.class_is_last()
    classes = [str(code) for code in data.class_attribute.values]
    header = ["Accuracy"]
    for name in classes:
        header += [name + " TP", name + " FP", name + " AUC ROC"]

    values = []
    cls = Classifier(classname=classifier)
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, folds, Random(1))
    values.append(evl.percent_correct)
    for name in classes:
        index = classes.index(name)
        values += [
            evl.true_positive_rate(index) * 100,
            evl.false_positive_rate(index) * 100,
            evl.area_under_roc(index)
        ]

    this.values = values
    this.header = header
def gridsearch():
    """
    Applies GridSearch to a dataset. The GridSearch package must not be installed, as the monolithic
    weka.jar already contains this package.
    """
    helper.print_title("GridSearch")

    # load a dataset
    fname = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading train: " + fname)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    train = loader.load_file(fname)
    train.class_is_last()

    # classifier
    grid = GridSearch(options=["-sample-size", "100.0", "-traversal", "ROW-WISE", "-num-slots", "1", "-S", "1"])
    grid.evaluation = "CC"
    grid.y = {"property": "kernel.gamma", "min": -3.0, "max": 3.0, "step": 1.0, "base": 10.0, "expression": "pow(BASE,I)"}
    grid.x = {"property": "C", "min": -3.0, "max": 3.0, "step": 1.0, "base": 10.0, "expression": "pow(BASE,I)"}
    cls = Classifier(
        classname="weka.classifiers.functions.SMOreg",
        options=["-K", "weka.classifiers.functions.supportVector.RBFKernel"])
    grid.classifier = cls
    grid.build_classifier(train)
    print("Model:\n" + str(grid))
    print("\nBest setup:\n" + grid.best.to_commandline())
def main(args):
    """
    Trains a J48 classifier on a training set and outputs the predicted class and class distribution
    alongside the actual class from a test set. Class attribute is assumed to be the last attribute.

    :param args: the commandline arguments (train and test datasets)
    :type args: list
    """
    # load a dataset
    helper.print_info("Loading train: " + args[1])
    loader = Loader(classname="weka.core.converters.ArffLoader")
    train = loader.load_file(args[1])
    train.class_index = train.num_attributes - 1
    helper.print_info("Loading test: " + args[2])
    test = loader.load_file(args[2])
    test.class_is_last()

    # classifier
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)

    # output predictions
    print("# - actual - predicted - error - distribution")
    for index, inst in enumerate(test):
        pred = cls.classify_instance(inst)
        dist = cls.distribution_for_instance(inst)
        print("%d - %s - %s - %s - %s" % (
            index + 1,
            inst.get_string_value(inst.class_index),
            inst.class_attribute.value(int(pred)),
            "yes" if pred != inst.get_value(inst.class_index) else "no",
            str(dist.tolist())))
def TestClassification(arff, modelInput, results):
    # start the JVM
    jvm.start()
    # load the trained model
    objects = serialization.read_all(modelInput)
    clsf = Classifier(jobject=objects[0])
    print(clsf)
    # load the test set
    loader = Loader(classname="weka.core.converters.ArffLoader")
    test = loader.load_file(arff)
    test.class_is_first()
    # write out the results
    resultsFile = open(results, "w")
    resultsFile.write("ID\tActual\tPredicted\tBenign probability\tMalignant probability\n")
    print("ID\tActual\tPredicted\tBenign probability\tMalignant probability")
    for index, inst in enumerate(test):
        pred = clsf.classify_instance(inst)
        dist = clsf.distribution_for_instance(inst)
        sampleID = index + 1
        origin = inst.get_string_value(inst.class_index)
        prediction = inst.class_attribute.value(int(pred))
        sameAsOrigin = "yes" if pred != inst.get_value(inst.class_index) else "no"
        NRate = dist.tolist()[0]
        PRate = dist.tolist()[1]
        resultsFile.write("%d\t%s\t%s\t%s\t%s" % (sampleID, origin, prediction, str(NRate), str(PRate)) + "\n")
        print("%d\t%s\t%s\t%s\t%s" % (sampleID, origin, prediction, str(NRate), str(PRate)))
    resultsFile.close()
    # shut down the JVM
    jvm.stop()
    print("Testing complete")
def case2():
    loader1 = Loader(classname="weka.core.converters.ArffLoader")
    test_file = input("Enter the name of the test file:")
    data1 = loader1.load_file(test_file)
    data1.class_is_last()
    evaluation = Evaluation(data1)
    evaluation.test_model(cls, data1)
    print(evaluation.matrix("=== (confusion matrix) ==="))
def return_data(self):
    loader = Loader(classname="weka.core.converters.ArffLoader")
    self.data = loader.load_file(self.directory)
    self.data.class_is_last()
    return self.data
def _load_data(self, dfile, index=None):
    loader = Loader(classname='weka.core.converters.CSVLoader')
    data = loader.load_file(dfile=dfile)
    if index is None:
        data.set_class_index(data.num_attributes() - 1)
    else:
        data.set_class_index(index)
    return data
def main(args):
    """
    Trains Apriori on the specified dataset (uses vote UCI dataset if no dataset specified).

    :param args: the commandline arguments
    :type args: list
    """
    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # build Apriori, using last attribute as class attribute
    apriori = Associator(classname="weka.associations.Apriori", options=["-c", "-1"])
    apriori.build_associations(data)
    print(str(apriori))

    # iterate association rules (low-level)
    helper.print_info("Rules (low-level)")
    # make the underlying rules list object iterable in Python
    rules = javabridge.iterate_collection(apriori.jwrapper.getAssociationRules().getRules().o)
    for i, r in enumerate(rules):
        # wrap the Java object to make its methods accessible
        rule = JWrapper(r)
        print(str(i+1) + ". " + str(rule))
        # output some details on rule
        print(" - consequence support: " + str(rule.getConsequenceSupport()))
        print(" - premise support: " + str(rule.getPremiseSupport()))
        print(" - total support: " + str(rule.getTotalSupport()))
        print(" - total transactions: " + str(rule.getTotalTransactions()))

    # iterate association rules (high-level)
    helper.print_info("Rules (high-level)")
    print("can produce rules? " + str(apriori.can_produce_rules()))
    print("rule metric names: " + str(apriori.rule_metric_names))
    rules = apriori.association_rules()
    if rules is not None:
        print("producer: " + rules.producer)
        print("# rules: " + str(len(rules)))
        for i, rule in enumerate(rules):
            print(str(i+1) + ". " + str(rule))
            # output some details on rule
            print(" - consequence support: " + str(rule.consequence_support))
            print(" - consequence: " + str(rule.consequence))
            print(" - premise support: " + str(rule.premise_support))
            print(" - premise: " + str(rule.premise))
            print(" - total support: " + str(rule.total_support))
            print(" - total transactions: " + str(rule.total_transactions))
            print(" - metric names: " + str(rule.metric_names))
            print(" - metric values: " + str(rule.metric_values))
            print(" - metric value 'Confidence': " + str(rule.metric_value('Confidence')))
            print(" - primary metric name: " + str(rule.primary_metric_name))
            print(" - primary metric value: " + str(rule.primary_metric_value))
def load_Arff(self, inputPath):
    # Loading input file
    #print inputPath
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(inputPath)
    return data
def use_classifier(data_filename, cli):
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_filename)
    data.class_is_last()
    cls = from_commandline(cli, classname="weka.classifiers.Classifier")
    cls.build_classifier(data)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(cls, data, 10, Random(1))
    return cls, evaluation
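# Hedged usage sketch (not part of the original source): running use_classifier() above
# end to end. The ARFF path and the J48 command line are assumptions.
import weka.core.jvm as jvm

def demo_use_classifier():
    jvm.start()
    try:
        cls, evl = use_classifier("iris.arff", "weka.classifiers.trees.J48 -C 0.25 -M 2")
        print(cls)
        print(evl.summary())
        print("Accuracy: %0.2f%%" % evl.percent_correct)
    finally:
        jvm.stop()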
def train(self):
    filename = "train.arff"
    self.write_arff(filename, "train", 0, self.input_x, self.input_y)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(filename)
    data.class_is_last()
    self.cls = Classifier(classname="weka.classifiers.meta.Bagging", options=["-S", "5"])
    self.cls.build_classifier(data)
    os.remove(filename)
def convertCsvtoArff(indata, outdata):
    '''
    :param indata: -> input csv file
    :param outdata: -> output file
    :return:
    '''
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(indata)
    saver = Saver(classname="weka.core.converters.ArffSaver")
    saver.save_file(data, outdata)
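# Hedged usage sketch (not part of the original source): converting a CSV file to ARFF
# with convertCsvtoArff() above; the file names are assumptions.
import weka.core.jvm as jvm

if __name__ == '__main__':
    jvm.start()
    try:
        convertCsvtoArff('measurements.csv', 'measurements.arff')
    finally:
        jvm.stop()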
def generate_folds(dataset_path, output_folder, n_folds=10, random_state=None):
    """
    Given a dataset df, generate n_folds for it and store them in <output_folder>/<dataset_name>.

    :type dataset_path: str
    :param dataset_path: Path to dataset with .arff file extension (i.e my_dataset.arff)
    :type output_folder: str
    :param output_folder: Path to store both index file with folds and fold files.
    :type n_folds: int
    :param n_folds: Optional - Number of folds to split the dataset into. Defaults to 10.
    :type random_state: int
    :param random_state: Optional - Seed to use in the splitting process. Defaults to None (no seed).
    """
    import warnings
    warnings.filterwarnings('error')

    dataset_name = dataset_path.split('/')[-1].split('.')[0]

    af = load_arff(dataset_path)
    df = load_dataframe(af)

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    fold_iter = skf.split(df[df.columns[:-1]], df[df.columns[-1]])

    fold_index = dict()

    jvm.start()

    csv_loader = Loader(classname="weka.core.converters.CSVLoader")
    arff_saver = Saver(classname='weka.core.converters.ArffSaver')

    for i, (arg_rest, arg_test) in enumerate(fold_iter):
        fold_index[i] = list(arg_test)

        _temp_path = 'temp_%s_%d.csv' % (dataset_name, i)

        fold_data = df.loc[arg_test]  # type: pd.DataFrame
        fold_data.to_csv(_temp_path, sep=',', index=False)

        java_arff_dataset = csv_loader.load_file(_temp_path)
        java_arff_dataset.relationname = af['relation']
        java_arff_dataset.class_is_last()
        arff_saver.save_file(java_arff_dataset, os.path.join(output_folder, '%s_fold_%d.arff' % (dataset_name, i)))

        os.remove(_temp_path)

    json.dump(
        fold_index,
        open(os.path.join(output_folder, dataset_name + '.json'), 'w'),
        indent=2
    )

    jvm.stop()
    warnings.filterwarnings('default')
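# Hedged usage sketch (not part of the original source): generating 10 stratified folds
# with a fixed seed via generate_folds() above; the paths are assumptions. The function
# starts and stops the JVM itself, so no extra setup is needed here.
if __name__ == '__main__':
    generate_folds('datasets/iris.arff', 'folds/iris', n_folds=10, random_state=42)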
def run(arff_path, model_out):
    jvm.start()
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(arff_path)
    data.class_is_last()
    cls = Logistic()
    cls.build_classifier(data)
    cls.save_model(model_out)
    coefficients = cls.coefficients
    for coeff in coefficients:
        print str(coeff)
    return coefficients
def predict(attributes):
    jvm.start()
    file_path = print_to_file(attributes)
    # load the saved model
    objects = serialization.read_all("/Users/hosyvietanh/Desktop/data_mining/trained_model.model")
    classifier = Classifier(jobject=objects[0])
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(file_path)
    data.class_is_last()
    # classify the first instance (originally the function returned inside the loop,
    # which made the jvm.stop() call unreachable)
    prediction = None
    for index, inst in enumerate(data):
        pred = classifier.classify_instance(inst)
        dist = classifier.distribution_for_instance(inst)
        prediction = int(pred)
        break
    jvm.stop()
    return prediction
def playback_speed_checker(inputFile, dirRef):
    TRAINING_ARFF = 'dataset_playback.arff'
    inputRef = ""

    # Start JVM
    jvm.start()
    jvm.start(system_cp=True, packages=True)
    jvm.start(max_heap_size="512m")

    # Find reference file
    for file in os.listdir(dirRef):
        if str(file).find(str(os.path.basename(inputFile))) != -1:
            inputRef = os.path.join(dirRef, file)
            break

    # Calculate distance
    (result, distance) = dtw_checker(inputFile, inputRef)

    # Loading data
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()  # set class attribute

    # Train the classifier
    #cls = Classifier(classname="weka.classifiers.functions.SMO")
    cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Build the instance to classify
    speed_instance = Instance.create_instance(numpy.ndarray(distance), classname='weka.core.DenseInstance', weight=1.0)
    speed_instance.dataset = data

    # Classify instance
    speed_flag = cls.classify_instance(speed_instance)

    if distance == 0:
        speed_class = 'nominal'
    else:
        # originally both branches tested speed_flag == 0; the second was clearly meant for the other class
        if speed_flag == 0:
            speed_class = 'down_speed'
        else:
            speed_class = 'up_speed'

    # print os.path.basename(inputFile) + ' --- ' + speed_class

    # Stop JVM
    jvm.stop()

    print "SPEED IS: " + speed_class
    return speed_class
def calculate_amino_type(self, model, pro):
    if pro:
        return [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
    i = Instance.create_instance(values=[1.0, self.a, self.b])
    if self.a == -1 and self.b == -1:
        return [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
    elif self.a == -1:
        i.set_missing(1)
    elif self.b == -1:
        i.set_missing(2)
    from weka.core.converters import Loader
    loader = Loader("weka.core.converters.ArffLoader")
    myDataset = loader.load_file("weka/testingthisthingout.arff")
    myDataset.set_class_index(0)
    i.set_dataset(myDataset)
    return model.distribution_for_instance(i)
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    vote_file = helper.get_data_dir() + os.sep + "vote.arff"
    helper.print_info("Loading dataset: " + vote_file)
    loader = Loader("weka.core.converters.ArffLoader")
    vote_data = loader.load_file(vote_file)
    vote_data.class_is_last()

    # train and output associator
    associator = Associator(classname="weka.associations.Apriori", options=["-N", "9", "-I"])
    associator.build_associations(vote_data)
    print(associator)
def build_and_classify(classifier, classifier_name, approach_name, infile, percentage='10'):
    """
    Creates model and classifies against input data. Returns accuracy statistics
    """
    # set seed so results are consistent
    random.seed('iot')

    # load data
    loader = Loader(classname='weka.core.converters.CSVLoader')
    data = loader.load_file(infile)
    data.class_is_last()

    # convert all numeric attributes to nominal
    to_nominal = Filter(classname='weka.filters.unsupervised.attribute.NumericToNominal',
                        options=['-R', 'first-last'])
    to_nominal.inputformat(data)
    data = to_nominal.filter(data)

    # randomize data with constant seed
    randomize = Filter(classname='weka.filters.unsupervised.instance.Randomize',
                       options=['-S', '42'])
    randomize.inputformat(data)
    data = randomize.filter(data)

    # create training set and testing set
    train_percent_filter = Filter(classname='weka.filters.unsupervised.instance.RemovePercentage',
                                  options=['-P', percentage, '-V'])
    train_percent_filter.inputformat(data)
    train = train_percent_filter.filter(data)
    test = data

    # build and test classifier
    classifier.build_classifier(train)
    evaluation = Evaluation(train)
    evaluation.test_model(classifier, test)

    # return results as array
    results = [
        approach_name,
        classifier_name,
        percentage,
        evaluation.percent_correct,
        evaluation.weighted_f_measure
    ]

    return results
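# Hedged usage sketch (not part of the original source): scoring a J48 model on a CSV
# dataset via build_and_classify() above. The file name and label strings are assumptions.
from weka.classifiers import Classifier

def demo_build_and_classify():
    j48 = Classifier(classname='weka.classifiers.trees.J48')
    results = build_and_classify(j48, 'J48', 'baseline', 'sensor_data.csv', percentage='10')
    print(results)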
def main():
    """
    Just runs some example code.
    """
    # load a dataset
    bodyfat_file = helper.get_data_dir() + os.sep + "bodyfat.arff"
    helper.print_info("Loading dataset: " + bodyfat_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bodyfat_data = loader.load_file(bodyfat_file)
    bodyfat_data.class_is_last()

    # build and output M5P model
    helper.print_title("Building M5P model")
    classifier = Classifier(classname="weka.classifiers.trees.M5P")
    classifier.build_classifier(bodyfat_data)
    print(classifier)
def predict(self, test_data):
    filename = "test.arff"
    self.write_arff(filename, "test", 0, test_data)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(filename)
    data.class_is_last()
    # evl = Evaluation(data)
    # evl.evaluate_model(self.cls, data)
    # data.set_class_label(data.numAttributes() - 1)
    # data.setClassIndex(data.numAttributes() - 1)
    result = []
    for index, inst in enumerate(data):
        pred = self.cls.classify_instance(inst)
        dist = self.cls.distribution_for_instance(inst)
        result.append(dist[0])
        # print(str(index+1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))
        # print str(index+1) + 'dist:' + str(dist)
    os.remove(filename)
    return result
def riaa_checker(inputFile):
    TRAINING_ARFF = 'C:\Users\ASUS\Desktop\IGNASI\SMC\Workspace\dataset_riaa.arff'

    # Start JVM
    jvm.start()
    jvm.start(system_cp=True, packages=True)
    jvm.start(max_heap_size="512m")

    # Calculation of bark bands information
    (absolute_bark, relative_bark, bark_ratios) = compute_bark_spectrum(inputFile)

    # Loading data
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()  # set class attribute

    # Train the classifier
    cls = Classifier(classname="weka.classifiers.functions.SMO")
    #cls = Classifier(classname="weka.classifiers.trees.J48", options = ["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Build the instance to classify
    bark_instance = Instance.create_instance(bark_ratios, classname='weka.core.DenseInstance', weight=1.0)
    bark_instance.dataset = data

    # Classify instance
    riaa_flag = cls.classify_instance(bark_instance)

    if riaa_flag == 0:
        riaa_class = 'riaa_ok'
    else:
        riaa_class = 'riaa_ko'

    # print os.path.basename(inputFile) + ' --- ' + riaa_class

    # Stop JVM
    jvm.stop()

    print "RIAA FILTERING?: " + riaa_class
    return riaa_class
def main(args):
    """
    Performs attribute selection on the specified dataset (uses vote UCI dataset if no dataset specified).
    Last attribute is assumed to be the class attribute. Used: CfsSubsetEval, GreedyStepwise, J48

    :param args: the commandline arguments
    :type args: list
    """
    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    use_classifier(data)
    use_filter(data)
    use_low_level(data)
def main():
    """
    Just runs some example code.
    """
    # load ARFF file
    helper.print_title("Loading ARFF file")
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(helper.get_data_dir() + os.sep + "iris.arff")
    print(str(data))

    # load CSV file
    helper.print_title("Loading CSV file")
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(helper.get_data_dir() + os.sep + "iris.csv")
    print(str(data))

    # load directory
    # change this to something sensible
    text_dir = "/some/where"
    if os.path.exists(text_dir) and os.path.isdir(text_dir):
        helper.print_title("Loading directory: " + text_dir)
        loader = TextDirectoryLoader(options=["-dir", text_dir, "-F", "-charset", "UTF-8"])
        data = loader.load()
        print(unicode(data))
def classify(self, predictFile):
    if self.data is None or self.classifier is None:
        return [-1]

    loader = Loader(classname="weka.core.converters.ArffLoader")
    predict_data = loader.load_file(self.dataDir + '/' + predictFile)
    predict_data.class_is_last()

    values = str(predict_data.class_attribute)[19:-1].split(',')
    classes = []

    for index, inst in enumerate(predict_data):
        #pred = self.classifier.classify_instance(inst)
        prediction = self.classifier.distribution_for_instance(inst)
        cl = int(values[prediction.argmax()][7:])
        print 'Class:', cl
        classes.append(cl)

    return classes