def crossTest(this, trainingFile, classifier, testFile):
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data1 = loader.load_file(trainingFile)
    data1.class_is_last()

    cls = Classifier(classname=classifier)
    cls.build_classifier(data1)

    data2 = loader.load_file(testFile)
    data2.class_is_last()

    classes = [str(code) for code in data2.class_attribute.values]
    header = ["Accuracy"]
    for name in classes:
        header += [name + " TP", name + " FP", name + " AUC ROC"]

    values = []
    evl = Evaluation(data2)
    evl.test_model(cls, data2)
    values.append(evl.percent_correct)
    for index, name in enumerate(classes):
        values += [
            evl.true_positive_rate(index) * 100,
            evl.false_positive_rate(index) * 100,
            evl.area_under_roc(index)
        ]

    this.values = values
    this.header = header
def logit_PC(df_train, df_test, attr_label): ''' logistic regression with PC members only :param df_train: training data, pandas data frame :param df_test: testing data, pandas data frame :param attr_label: label attribute, string :return: PC members, logistic regression model and AUC ''' pcs = RF.learnPC_R(df_train, attr_label) if pcs: # model = LogisticRegression().fit(df_train[pcs], df_train[attr_label]) # pred = model.predict_proba(df_test[pcs]) # pred = [x[1] for x in pred] # auc = evaluate_auc(df_test[attr_label].values.tolist(), pred) df2Instances = DF2Instances(df_train[pcs+[attr_label]], 'train', attr_label) data_train = df2Instances.df_to_instances() data_train.class_is_last() # set class attribute model = Classifier(classname="weka.classifiers.functions.Logistic") model.build_classifier(data_train) df2Instances = DF2Instances(df_test[pcs+[attr_label]], 'test', attr_label) data_test = df2Instances.df_to_instances() data_test.class_is_last() # set class attribute preds = [] for index, inst in enumerate(data_test): preds.append(model.distribution_for_instance(inst)[1]) auc = evaluate_auc(df_test[attr_label].values.tolist(), preds) return pcs, model, auc else: return pcs, None, None
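# evaluate_auc is called above (and again in the LMT snippet further down) but is not
# defined anywhere in this collection. A minimal sketch of such a helper, assuming
# binary labels and scikit-learn's roc_auc_score (an assumption, not the original code):
from sklearn.metrics import roc_auc_score

def evaluate_auc(y_true, y_score):
    """Return the ROC AUC for true labels and positive-class probabilities."""
    return roc_auc_score(y_true, y_score)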
def predBtn_clicked(self): gender = self.gender_entry.get() age = int(self.age_entry.get()) height = int(self.height_entry.get()) weight = int(self.weight_entry.get()) sociability = self.sociability_entry.get() stability = self.stability_entry.get() '''Create the model''' objects = serialization.read_all("J48.model") cls = Classifier(jobject=objects[0]) data = Instances(jobject=objects[1]) '''Create the test set to be classified''' gender_values = ["Man", "Woman"] sociability_values = ["Introvert", "Extrovert"] stability_values = ["Stable", "Unstable"] values = [ gender_values.index(gender), age, height, weight, self.BMI(weight, height), stability_values.index(stability), sociability_values.index(sociability), Instance.missing_value() ] inst = Instance.create_instance(values) inst.dataset = data '''Classification''' prediction = int(cls.classify_instance(inst)) self.controller.show_frame("Result").show(prediction) self.clear()
def testNB(training_data, testing_data): train_data = Instances.copy_instances(training_data) test_data = Instances.copy_instances(testing_data) evaluation = Evaluation(train_data) classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes") classifier.build_classifier( train_data) # build classifier on the training data evaluation.test_model(classifier, test_data) # test and evaluate model on the test set print("") print("") print( evaluation.summary( "--------------Naive Bayes Evaluation--------------")) print("Accuracy: " + str(evaluation.percent_correct)) print("") print("Label\tPrecision\t\tRecall\t\t\tF-Measure") print("<=50K\t" + str(evaluation.precision(0)) + "\t" + str(evaluation.recall(0)) + "\t" + str(evaluation.f_measure(0))) print(">50K\t" + str(evaluation.precision(1)) + "\t" + str(evaluation.recall(1)) + "\t" + str(evaluation.f_measure(1))) print("Mean\t" + str(((evaluation.precision(1)) + (evaluation.precision(0))) / 2) + "\t" + str(((evaluation.recall(1)) + (evaluation.recall(0))) / 2) + "\t" + str(((evaluation.f_measure(1)) + (evaluation.f_measure(0))) / 2))
def DecisionTree(data):
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    classifier.build_classifier(data)
    print("")
    print("=== Decision Tree ===")
    print(classifier)

    count_class1 = 0
    count_class0 = 0
    print("Labeling income status of each instance. Please wait..")
    for index, inst in enumerate(data):
        pred = classifier.classify_instance(inst)
        # count instances classified into class 1 and class 0
        if str(pred) == "1.0":
            count_class1 += 1
        else:
            count_class0 += 1
        if index % 5000 == 0:
            print(".")
    print("No of instances in class '>50K' = " + str(count_class1))
    print("No of instances in class '<=50K' = " + str(count_class0))
def build(self): """ Build J48 classifier using data loaded from ARFF :param storeModel: Store model after built :return: """ try: dataLoaded = self.loadClassifierData() if dataLoaded is True: # Decision tree options if self.unpruned is True: self.dtOptions = ['-U'] else: self.dtOptions = ['-C', str(self.confidenceValue)] # Decision tree classificator print '[Building J48 DT from training]' self.classifierInstance = Classifier(classname="weka.classifiers.trees.J48", options=self.dtOptions) self.classifierInstance.build_classifier(self.classificationData) return True except: return False return False
def __init__(self, classifier_name): # Defaults class_name = 'weka.classifiers.trees.RandomForest' options = None self.proba = None if classifier_name == 'wrf': class_name = 'weka.classifiers.trees.RandomForest' options = None elif classifier_name == 'wj48': class_name = 'weka.classifiers.trees.J48' options = None elif classifier_name == 'wnb': class_name = 'weka.classifiers.bayes.NaiveBayes' options = '-D' elif classifier_name == 'wbn': class_name = 'weka.classifiers.bayes.BayesNet' options = '-D -Q weka.classifiers.bayes.net.search.local.TAN -- -S BAYES -E weka.classifiers.bayes.net.estimate.SimpleEstimator -- -A 0.5' elif classifier_name == 'wsv': # Implementation of one-class SVM used in Anomaly Detection mode class_name = 'weka.classifiers.functions.LibSVM' options = '-S 2' if options is not None: self._classifier = Classifier(classname=class_name, options=[option for option in options.split()]) else: self._classifier = Classifier(classname=class_name) self.model_ = None
def main(args): """ Trains a NaiveBayesUpdateable classifier incrementally on a dataset. The dataset can be supplied as parameter. :param args: the commandline arguments :type args: list """ # load a dataset if len(args) <= 1: data_file = helper.get_data_dir() + os.sep + "vote.arff" else: data_file = args[1] helper.print_info("Loading dataset: " + data_file) loader = Loader(classname="weka.core.converters.ArffLoader") data = loader.load_file(data_file, incremental=True) data.class_is_last() # classifier nb = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable") nb.build_classifier(data) # train incrementally for inst in loader: nb.update_classifier(inst) print(nb)
def test_classifier(dataset: Instances, classifier: Classifier, params: dict): vars = params.keys() vals = params.values() results = defaultdict(list) for val_combo in itertools.product(*vals): results["numInstances"].append(dataset.num_instances) results["numAttributes"].append(dataset.num_attributes) opts = dict(zip(vars, val_combo)) for opt in opts: results[opt].append(opts[opt]) classifier.set_property( opt, opts[opt] if not isinstance(opts[opt], float) else typeconv.double_to_float(opts[opt])) evl = Evaluation(dataset) classifier.build_classifier(dataset) evl.test_model(classifier, dataset) results["Training_Accuracy"].append(evl.percent_correct) results["size"].append( int(javabridge.call(classifier.jobject, "measureTreeSize", "()D"))) evl.crossvalidate_model(classifier, dataset, 10, Random(1)) results["CV_Accuracy"].append(evl.percent_correct) return results
class ClassifierNaiveBayes(ClassifierAbstract): """ Naive Bayes classifier algorithm in Weka """ def build(self): """ Build J48 classifier using data loaded from ARFF :param storeModel: Store model after built :return: """ try: dataLoaded = self.loadClassifierData() if dataLoaded is True: # Naive Bayes classificator print '[Building Naive Bayes from training]' self.classifierInstance = Classifier( classname="weka.classifiers.bayes.NaiveBayes") self.classifierInstance.build_classifier( self.classificationData) return True except: return False return False
def main(args): """ Trains a J48 classifier on a training set and outputs the predicted class and class distribution alongside the actual class from a test set. Class attribute is assumed to be the last attribute. :param args: the commandline arguments (train and test datasets) :type args: list """ # load a dataset helper.print_info("Loading train: " + args[1]) loader = Loader(classname="weka.core.converters.ArffLoader") train = loader.load_file(args[1]) train.class_index = train.num_attributes - 1 helper.print_info("Loading test: " + args[2]) test = loader.load_file(args[2]) test.class_is_last() # classifier cls = Classifier(classname="weka.classifiers.trees.J48") cls.build_classifier(train) # output predictions print("# - actual - predicted - error - distribution") for index, inst in enumerate(test): pred = cls.classify_instance(inst) dist = cls.distribution_for_instance(inst) print( "%d - %s - %s - %s - %s" % (index+1, inst.get_string_value(inst.class_index), inst.class_attribute.value(int(pred)), "yes" if pred != inst.get_value(inst.class_index) else "no", str(dist.tolist())))
def main(args): """ Loads a dataset, shuffles it, splits it into train/test set. Trains J48 with training set and evaluates the built model on the test set. :param args: the commandline arguments (optional, can be dataset filename) :type args: list """ # load a dataset if len(args) <= 1: data_file = helper.get_data_dir() + os.sep + "vote.arff" else: data_file = args[1] helper.print_info("Loading dataset: " + data_file) loader = Loader(classname="weka.core.converters.ArffLoader") data = loader.load_file(data_file) data.class_is_last() # generate train/test split of randomized data train, test = data.train_test_split(66.0, Random(1)) # build classifier cls = Classifier(classname="weka.classifiers.trees.J48") cls.build_classifier(train) print(cls) # evaluate evl = Evaluation(train) evl.test_model(cls, test) print(evl.summary())
def create_model(input_file, output_file): # Load data data = converters.load_any_file(input_file) data.class_is_last() # set class attribute # filter data print_title("Filtering Data") discretize = Filter( classname="weka.filters.unsupervised.attribute.Discretize", options=["-B", "10", "-M", "-1.0", "-R", "first-last"]) discretize.inputformat( data) # let the filter know about the type of data to filter filtered_data = discretize.filter(data) print("Done! (believe it or not)") print_title("Build Classifier") classifier = Classifier(classname="weka.classifiers.trees.RandomForest", options=["-I", "100", "-K", "0", "-S", "1"]) classifier.build_classifier(filtered_data) print("Done! (believe it or not)") serialization.write_all(output_file, [classifier, discretize]) print("Model and filter saved to ", output_file) evaluation = Evaluation(data) # initialize with priors evaluation.crossvalidate_model(classifier, filtered_data, 10, Random(42)) # 10-fold CV print(evaluation.summary()) print("pctCorrect: " + str(evaluation.percent_correct)) print("incorrect: " + str(evaluation.incorrect))
def retrain(self, examples, labels): f = open("trainingweka.arff", "w") f.write("@relation randomset\n") for j in range(len(examples[0])): f.write("@attribute feature%d real\n" % j) f.write("@attribute class {TRUE, FALSE}\n") f.write("@data\n") for (example, label) in zip(examples, labels): for feature in example: f.write("%f," % feature) if label == 1: f.write("TRUE\n") else: f.write("FALSE\n") f.close() loader = Loader(classname="weka.core.converters.ArffLoader") # options=["-H", "-B", "10000"]) self.trainingData = loader.load_file("trainingweka.arff") self.trainingData.set_class_index(self.trainingData.num_attributes() - 1) self.classifier = Classifier( classname="weka.classifiers.functions.Logistic", options=["-R", "%f" % (1.0 / self.C)]) self.classifier.build_classifier(self.trainingData)
def PredecirUnaTemporada(path): jvm.start() insta = CrearInstanciaParaPredecir(path) atributos = "" file = open('ModelData/wekaHeader.arff', 'r') atributos = file.readlines() file.close() file = open('ModelData/predictionFiles/inst.arff', 'w') file.writelines(atributos) file.write("\n" + insta + '\n') file.close() objects = serialization.read_all("ModelData/77PercentModelPaisajes.model") classifier = Classifier(jobject=objects[0]) loader = Loader() data = loader.load_file("ModelData/predictionFiles/inst.arff") data.class_is_last() clases = ["invierno", "verano", "otono", "primavera"] prediccion = "" for index, inst in enumerate(data): pred = classifier.classify_instance(inst) dist = classifier.distribution_for_instance(inst) prediccion = clases[int(pred)] jvm.stop() return prediccion
def run_naive_bayes_crossval(self, output_directory): # build classifier print("\nBuilding Classifier on training data.") buildTimeStart = time.time() cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes") cls.build_classifier(self.training_data) resultsString = "" resultsString = self.print_both(str(cls), resultsString) buildTimeString = "NB Cross Eval Classifier Built in " + str( time.time() - buildTimeStart) + " secs.\n" resultsString = self.print_both(buildTimeString, resultsString) #Evaluate Classifier resultsString = self.print_both("\nCross Evaluating on test data.", resultsString) buildTimeStart = time.time() evl = Evaluation(self.training_data) evl.crossvalidate_model(cls, self.training_data, 10, Random(1)) resultsString = self.print_both(str(evl.summary()), resultsString) resultsString = self.print_both(str(evl.class_details()), resultsString) resultsString = self.print_both(str(evl.confusion_matrix), resultsString) buildTimeString = "\nNB Cross Eval Classifier Evaluated in " + str( time.time() - buildTimeStart) + " secs.\n" resultsString = self.print_both(buildTimeString, resultsString) #Save Results and Cleanup self.save_results("Naive_Bayes_Crossval", resultsString, output_directory)
def TestClassification(arff, modelInput, results):
    # start the Java VM
    jvm.start()
    # load the saved classification model
    objects = serialization.read_all(modelInput)
    clsf = Classifier(jobject=objects[0])
    print(clsf)
    # load the test set
    loader = Loader(classname="weka.core.converters.ArffLoader")
    test = loader.load_file(arff)
    test.class_is_first()
    # write out the predictions
    resultsFile = open(results, "w")
    resultsFile.write("No.\tActual\tPredicted\tBenign prob.\tMalignant prob.\n")
    print("No.\tActual\tPredicted\tBenign prob.\tMalignant prob.")
    for index, inst in enumerate(test):
        pred = clsf.classify_instance(inst)
        dist = clsf.distribution_for_instance(inst)
        sampleID = index + 1
        origin = inst.get_string_value(inst.class_index)
        prediction = inst.class_attribute.value(int(pred))
        sameAsOrigin = "yes" if pred != inst.get_value(inst.class_index) else "no"
        NRate = dist.tolist()[0]
        PRate = dist.tolist()[1]
        resultsFile.write("%d\t%s\t%s\t%s\t%s" % (sampleID, origin, prediction, str(NRate), str(PRate)) + "\n")
        print("%d\t%s\t%s\t%s\t%s" % (sampleID, origin, prediction, str(NRate), str(PRate)))
    resultsFile.close()
    # stop the Java VM
    jvm.stop()
    print("Testing complete")
def set_params(self, **params): """ Sets the options for the classifier, expects 'classname' and 'options'. :param params: the parameter dictionary :type params: dict """ if len(params) == 0: return if "classname" not in params: raise Exception("Cannot find 'classname' in parameters!") if "options" not in params: raise Exception("Cannot find 'options' in parameters!") self._classname = params["classname"] self._options = params["options"] self._classifier = Classifier(classname=self._classname, options=self._options) self._nominal_input_vars = None if "nominal_input_vars" in params: self._nominal_input_vars = params["nominal_input_vars"] self._nominal_output_var = None if "nominal_output_var" in params: self._nominal_output_var = params["nominal_output_var"] self._num_nominal_input_labels = None if "num_nominal_input_labels" in params: self._num_nominal_input_labels = params["num_nominal_input_labels"] self._num_nominal_output_labels = None if "num_nominal_output_labels" in params: self._num_nominal_output_labels = params["num_nominal_output_labels"]
def run(): jvm.start() load_csv = Loader("weka.core.converters.CSVLoader") data_csv = load_csv.load_file( "/Users/imeiliasantoso/web_graduate_project4/predict_page/predict_data.csv" ) saver = Saver("weka.core.converters.ArffSaver") saver.save_file( data_csv, "/Users/imeiliasantoso/web_graduate_project4/predict_page/predict_data.arff" ) load_arff = Loader("weka.core.converters.ArffLoader") data_arff = load_arff.load_file( "/Users/imeiliasantoso/web_graduate_project4/predict_page/predict_data.arff" ) data_arff.class_is_last() global j48 J48_class = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"]) J48_class.build_classifier(data_arff) evaluationj48 = Evaluation(data_arff) evaluationj48.crossvalidate_model(J48_class, data_arff, 10, Random(100)) j48 = str(evaluationj48.percent_correct) jvm.stop() return j48
def main(): """ Just runs some example code. """ # load a dataset iris_file = helper.get_data_dir() + os.sep + "iris.arff" helper.print_info("Loading dataset: " + iris_file) loader = Loader("weka.core.converters.ArffLoader") iris_data = loader.load_file(iris_file) iris_data.class_is_last() # train classifier classifier = Classifier("weka.classifiers.trees.J48") classifier.build_classifier(iris_data) # save and read object helper.print_title("I/O: single object") outfile = tempfile.gettempdir() + os.sep + "j48.model" serialization.write(outfile, classifier) model = Classifier(jobject=serialization.read(outfile)) print(model) # save classifier and dataset header (multiple objects) helper.print_title("I/O: single object") serialization.write_all(outfile, [classifier, Instances.template_instances(iris_data)]) objects = serialization.read_all(outfile) for i, obj in enumerate(objects): helper.print_info("Object #" + str(i+1) + ":") if javabridge.get_env().is_instance_of(obj, javabridge.get_env().find_class("weka/core/Instances")): obj = Instances(jobject=obj) elif javabridge.get_env().is_instance_of(obj, javabridge.get_env().find_class("weka/classifiers/Classifier")): obj = Classifier(jobject=obj) print(obj)
def __init__(self, class_name, options=None):
    if options is not None:
        self._classifier = Classifier(classname=class_name,
                                      options=[option for option in options.split()])
    else:
        self._classifier = Classifier(classname=class_name)
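# Usage sketch (hedged): the enclosing class is not shown above, so "WekaModel" below is a
# hypothetical name. The option string is split on whitespace before being handed to
# weka.classifiers.Classifier.
# model = WekaModel("weka.classifiers.bayes.NaiveBayes", options="-D")
# which is equivalent to Classifier(classname="weka.classifiers.bayes.NaiveBayes", options=["-D"])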
def autoweka(data, duration, metric, nb_folds):
    # Weka expects all options as strings, so numeric arguments are converted explicitly.
    classifier = Classifier(
        classname="weka.classifiers.meta.AutoWEKAClassifier",
        options=["-x", str(nb_folds), "-timeLimit", str(duration), "-metric", metric]
    )
    # classname="weka.classifiers.functions.Logistic", options=["-R", "1.0E-2"]
    classifier.build_classifier(data)
    print(classifier)
def f_smote(): jvm.start() train_data, test_data = b_i_impute_data() train_data = train_data[:10000] y_train = train_data["class"] x_train = train_data.drop("class", axis=1) sm = SMOTE(ratio="minority") x_train_sm, y_train_sm = sm.fit_sample(x_train, y_train) x_train_sm_df = pd.DataFrame(x_train_sm, columns=x_train.columns) y_train_sm_df = pd.DataFrame(y_train_sm, columns=["class"]) train_data_sm_df = pd.concat([y_train_sm_df, x_train_sm_df], axis=1) print_f("smote train data shape", train_data_sm_df.shape) train_data_sm_df.to_csv("./train_data_sm.csv", index=False) train_data_sm = converters.load_any_file("train_data_sm.csv") train_data_sm.class_is_first() test_data = converters.load_any_file("test_data.csv") test_data.class_is_first() print_f("1") cls = Classifier(classname="weka.classifiers.trees.LMT") print_f("bulding classifier") cls.build_classifier(train_data_sm) print_f("Evaluating") evl = Evaluation(train_data_sm) evl.crossvalidate_model(cls, train_data_sm, 5, Random(1)) print_f("Train Accuracy:", evl.percent_correct) print_f("Train summary") print_f(evl.summary()) print_f("Train class details") print_f(evl.class_details()) print_f("Train confusion matrix") print_f(evl.confusion_matrix) plcls.plot_roc(evl, class_index=[0, 1], wait=True, outfile="./plots/2_f_smote_10k.png") plt.suptitle("Train ROC Curve", fontsize=20, y=0.95) evl = Evaluation(test_data) print_f("testing model") evl.test_model(cls, test_data) print_f("Test Accuracy:", evl.percent_correct) print_f("Test summary") print_f(evl.summary()) print_f(" Testclass details") print_f(evl.class_details()) print_f("Testconfusion matrix") print_f(evl.confusion_matrix) plcls.plot_roc(evl, class_index=[0, 1], wait=True) plt.suptitle("Test ROC Curve", fontsize=20, y=0.95) savefig("./plots/f_test_roc_curve.png")
def train_weka_model(self, training_data_dir, save_model_dir, log_file, mimic_env=None): """ Just runs some example code. """ loader = Loader(classname="weka.core.converters.CSVLoader") training_data = loader.load_file(training_data_dir) training_data.class_is_last() self.classifier = Classifier(classname="weka.classifiers.trees.M5P", options=self.options) # classifier help, check https://weka.sourceforge.io/doc.dev/weka/classifiers/trees/M5P.html self.classifier.build_classifier(training_data) # print(classifier) graph = self.classifier.graph node_number = float(graph.split('\n')[-3].split()[0].replace('N', '')) leaves_number = node_number / 2 serialization.write(save_model_dir, self.classifier) # print('Leaves number is {0}'.format(leave_number), file=log_file) evaluation = Evaluation(training_data) predicts = evaluation.test_model(self.classifier, training_data) # return_value = None # if mimic_env is not None: predict_dictionary = {} for predict_index in range(len(predicts)): predict_value = predicts[predict_index] if predict_value in predict_dictionary.keys(): predict_dictionary[predict_value].append(predict_index) else: predict_dictionary.update({predict_value: [predict_index]}) # return_value = mimic_env.get_return(state=list(predict_dictionary.values())) return_value_log = mimic_env.get_return( state=list(predict_dictionary.values())) return_value_log_struct = mimic_env.get_return( state=list(predict_dictionary.values()), apply_structure_cost=True) return_value_var_reduction = mimic_env.get_return( state=list(predict_dictionary.values()), apply_variance_reduction=True) # print("Training return is {0}".format(return_value), file=log_file) summary = evaluation.summary() numbers = summary.split('\n') corr = float(numbers[1].split()[-1]) mae = float(numbers[2].split()[-1]) rmse = float(numbers[3].split()[-1]) rae = float(numbers[4].split()[-2]) / 100 rrse = float(numbers[5].split()[-2]) / 100 # print(evl) # print("Training summary is "+summary, file=log_file) return return_value_log, return_value_log_struct, \ return_value_var_reduction, mae, rmse, leaves_number
def __init__(self, index = 0, inference = "ExactInference", ghostAgents = None): BustersAgent.__init__(self, index, inference, ghostAgents) jvm.start(max_heap_size="512m") self.loader = Loader(classname="weka.core.converters.ArffLoader") self.data = self.loader.load_file("data/training-fase3.arff") self.data.class_is_last() self.cls = Classifier(classname="weka.classifiers.trees.REPTree", options=["-M", "2","-V", "0.001","-N", "3", "-S", "1", "-L", "-1"]) self.cls.build_classifier(self.data) serialization.write("data/out.model", self.cls)
def train(data_train, n_estimators):
    # train the model
    # create `Classifier` object
    rf = Classifier(classname="weka.classifiers.trees.RandomForest",
                    options=['-num-slots', '0', '-I', str(n_estimators)])
    # train classifier on the train split
    rf.build_classifier(data_train)
    return rf
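# Usage sketch (hedged): "train.arff" is a placeholder file name; assumes an ARFF file
# whose last attribute is the class.
def _demo_train_random_forest():
    from weka.core.converters import Loader
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data_train = loader.load_file("train.arff")  # placeholder path
    data_train.class_is_last()
    return train(data_train, n_estimators=100)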
def test_single(): #['long', 'nK', 'nK_delta', 'nT', 'nT_delta', 'str', 'strsum']: objs = ['olsize', 'ylsize'] for obj in objs: c = Classifier(jobject=serialization.read(model_file('hash', obj))) values = [3.0, 192.0, 124.0, 192.0, 124.0, 6.0, 144.0] values.append(0) # should be obj ins = Instance.create_instance(values) prediction = c.classify_instance(ins) print obj, prediction
class python_weka(object): def __init__(self, input_x, input_y, labels): self.input_x = input_x self.input_y = input_y self.labels = labels def write_arff(self, filename, relation, train_or_predict, input_x, input_y=None): f = open(filename, "w") f.write("@relation " + relation + "\n") for i in self.labels: train_or_predict += 1 if train_or_predict == len(self.labels): break f.write("@attribute " + i + " " + self.labels[i] + "\n") f.write("\n") f.write("@data" + "\n") for i in range(len(input_x)): for j in input_x[i]: f.write(str(j) + " ") if train_or_predict == 0: f.write(str(input_y[i])) else: f.write(str(0)) f.write("\n") f.close() def train(self): filename = "train.arff" self.write_arff(filename, "train", 0, self.input_x, self.input_y) loader = Loader(classname="weka.core.converters.ArffLoader") data = loader.load_file(filename) data.class_is_last() self.cls = Classifier(classname="weka.classifiers.meta.Bagging", options=["-S", "5"]) self.cls.build_classifier(data) os.remove(filename) def predict(self, test_data): filename = "test.arff" self.write_arff(filename, "test", 0, test_data) loader = Loader(classname="weka.core.converters.ArffLoader") data = loader.load_file(filename) data.class_is_last() # evl = Evaluation(data) # evl.evaluate_model(self.cls,data) # data.set_class_label(data.numAttributes() - 1) # data.setClassIndex(data.numAttributes() - 1) result = [] for index, inst in enumerate(data): pred = self.cls.classify_instance(inst) dist = self.cls.distribution_for_instance(inst) result.append(dist[0]) # print(str(index+1) + ": label index=" + str(pred) + ", class distribution=" + str(dist)) # print str(index+1) + 'dist:'+ str(dist) os.remove(filename) return result
class ClassifierDecisionTreeJ48(ClassifierAbstract): """ Decision tree using J48 algorithm in Weka """ def __init__(self, arffFileName, confidenceValue=0.25): """ Class constructor (overridden) :param arffFileName: ARFF file name :param confidenceValue: Confidence value for classifier """ ClassifierAbstract.__init__(self, arffFileName) # Store confidence value if 0 <= confidenceValue <= 1: self.confidenceValue = confidenceValue else: # Set default confidence value self.confidencevalue = 0.25 self.unpruned = False def setUnprunedTree(self, unpruned): """ Set unpruned tree option :param unpruned: If tree result is unpruned or not (TRUE or FALSE) :return: """ self.unpruned = unpruned def build(self): """ Build J48 classifier using data loaded from ARFF :param storeModel: Store model after built :return: """ try: dataLoaded = self.loadClassifierData() if dataLoaded is True: # Decision tree options if self.unpruned is True: self.dtOptions = ['-U'] else: self.dtOptions = ['-C', str(self.confidenceValue)] # Decision tree classificator print '[Building J48 DT from training]' self.classifierInstance = Classifier(classname="weka.classifiers.trees.J48", options=self.dtOptions) self.classifierInstance.build_classifier(self.classificationData) return True except: return False return False
def predictWithWeka(csvFilenameWithInputToPredict, modelFilename):
    """
    # Note: to use this without knowing the class, a dummy class can be supplied
    # and the 'actual' and 'error' values of @return results can be ignored.
    #
    # Note: the file named @csvFilenameWithInputToPredict must contain
    # instances of both classes (spam and ham).
    #
    # @csvFilenameWithInputToPredict : name of the CSV file with the instances
    # to predict.
    #
    # @modelFilename : name of the model file generated by Weka, compatible
    # with the input CSV file.
    #
    # @return results : list of dictionaries with the keys
    # index, actual, predicted, error and distribution
    """
    loader = Loader(classname="weka.core.converters.CSVLoader")
    cls = Classifier(jobject=serialization.read(modelFilename))
    # print(cls)
    data = loader.load_file(csvFilenameWithInputToPredict)
    data.class_is_last()

    multi = MultiFilter()
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "first"])
    numericToNom = Filter(classname="weka.filters.unsupervised.attribute.NumericToNominal",
                          options=["-R", "8,11"])
    normalize = Filter(classname="weka.filters.unsupervised.attribute.Normalize",
                       options=["-S", "1.0", "-T", "0.0"])
    multi.filters = [remove, numericToNom, normalize]
    multi.inputformat(data)
    test = multi.filter(data)

    results = []
    for index, inst in enumerate(test):
        result = dict()
        pred = cls.classify_instance(inst)
        dist = cls.distribution_for_instance(inst)
        result["index"] = index + 1
        result["actual"] = inst.get_string_value(inst.class_index)
        result["predicted"] = inst.class_attribute.value(int(pred))
        result["error"] = "yes" if pred != inst.get_value(inst.class_index) else "no"
        result["distribution"] = str(dist.tolist())
        results.append(result)
        # print result
    return results
def exposed_evaluate(self, X, d, task, i_model, i_evl):
    data = np.reshape(eval(X), [d, -1], order='C')
    if task == 'regression':
        if i_model == 'LR':
            data = converters.ndarray_to_instances(data, relation='tmp')
            data.class_is_last()
            model = Classifier(classname='weka.classifiers.functions.LinearRegression')
            evl = Evaluation(data)
            evl.crossvalidate_model(model, data, 5, Random(0))
        elif i_model == 'RF':
            data = converters.ndarray_to_instances(data, relation='tmp')
            data.class_is_last()
            model = Classifier(classname='weka.classifiers.trees.RandomForest')
            evl = Evaluation(data)
            evl.crossvalidate_model(model, data, 5, Random(0))
        if i_evl == 'mae':
            r_mae = evl.mean_absolute_error
            return r_mae
        elif i_evl == 'mse':
            r_mse = evl.mean_square_error
            return r_mse
        elif i_evl == '1-rae':
            r_one_minus_rae = 1 - evl.relative_absolute_error / 100
            del evl, model, data
            return r_one_minus_rae
    elif task == 'classification':
        le = LabelEncoder()
        data[:, -1] = le.fit_transform(data[:, -1])
        if i_model == 'RF':
            dataRaw = converters.ndarray_to_instances(data, relation='tmp')
            weka_filter = Filter(
                classname="weka.filters.unsupervised.attribute.NumericToNominal",
                options=["-R", "last"])
            weka_filter.inputformat(dataRaw)
            data = weka_filter.filter(dataRaw)
            data.class_is_last()
            model = Classifier(classname='weka.classifiers.trees.RandomForest')
            evl = Evaluation(data)
            evl.crossvalidate_model(model, data, 5, Random(0))
        elif i_model == 'LR':
            model = LogisticRegression(multi_class='ovr')
        elif i_model == 'SVM':
            model = svm.SVC()
        if i_evl == 'f_score':
            # note: only the 'RF' branch above produces a Weka Evaluation (evl) and dataRaw,
            # so this block assumes i_model == 'RF'
            fscore = evl.weighted_f_measure
            del evl, model, data, dataRaw
            if not (fscore >= 0.01 and fscore < 1.01):
                fscore = 0.01
            return fscore
def LMT(self):
    model = Classifier(classname="weka.classifiers.trees.LMT")
    model.build_classifier(self.data_train)
    print(model)
    preds = []
    for index, inst in enumerate(self.data_test):
        preds.append(model.distribution_for_instance(inst)[1])
    auc = evaluate_auc(self.df_test[self.attr_label].values.tolist(), preds)
    return auc
def __init__(self): jvm.start() data_dir = "./DataSet/" self.data = converters.load_any_file(data_dir + "chatbot2.arff") self.data.class_is_last() self.cls = Classifier(classname="weka.classifiers.trees.J48") self.cls.build_classifier(self.data) self.intens = self.data.attribute_by_name("intent")
def get_classifier(min_no, seed):
    cls = Classifier(classname="weka.classifiers.rules.JRip")
    # options = ["-N", "25.0"]
    # -N: minNo, -F: folds, -O: num optimizations, -batch-size, -S: seed
    options = list()
    options.append("-N")
    options.append(str(min_no))
    options.append("-S")
    options.append(str(seed))
    cls.options = options
    return cls
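# Usage sketch (hedged): the parameter values are placeholders; "data" stands for an
# Instances object with its class attribute already set.
def _demo_jrip_crossval(data):
    from weka.classifiers import Evaluation
    from weka.core.classes import Random
    cls = get_classifier(min_no=25, seed=1)
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, 10, Random(1))
    print(evl.summary())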
def setOptions(self, options): """ Parses a given list of options. Parameter(s): 'options' -- the list of options as an array of strings """ Classifier.setOptions(self, options) return
def assign_classify(file_location, output="classified.out", model="naivebayes.model"):
    data = read_csv_file(file_location)
    jvm.start()
    # load the serialized classifier model
    obj = serialization.read(model)
    classifier = Classifier(jobject=obj)
    # create file with the predicted label for each row
    with open(output, 'w') as cluster_file:
        for index, attrs in enumerate(data):
            inst = Instance.create_instance(attrs[1:])
            # note: Weka normally requires inst.dataset to reference a dataset header
            # matching the model before classify_instance() can be called
            pred = classifier.classify_instance(inst)
            cluster_file.write(str(index + 1) + ": label index=" + str(pred) + "\n")
            print(str(index + 1) + ": label index=" + str(pred))
    jvm.stop()
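# Hedged sketch of one way to supply the dataset header mentioned above; "header.arff"
# and the helper name are placeholders, not part of the original code.
def _demo_classify_row(classifier, attrs):
    from weka.core.converters import Loader
    from weka.core.dataset import Instance
    header = Loader(classname="weka.core.converters.ArffLoader").load_file("header.arff")  # placeholder
    header.class_is_last()
    inst = Instance.create_instance(attrs)
    inst.dataset = header
    return classifier.classify_instance(inst)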
def predict(attributes):
    jvm.start()
    file_path = print_to_file(attributes)
    # load the saved model
    objects = serialization.read_all("/Users/hosyvietanh/Desktop/data_mining/trained_model.model")
    classifier = Classifier(jobject=objects[0])
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(file_path)
    data.class_is_last()
    # classify the first (and only) instance; the original returned from inside the
    # loop, which left jvm.stop() unreachable
    prediction = None
    for index, inst in enumerate(data):
        pred = classifier.classify_instance(inst)
        dist = classifier.distribution_for_instance(inst)
        prediction = int(pred)
        break
    jvm.stop()
    return prediction
def playback_speed_checker(inputFile, dirRef):
    TRAINING_ARFF = 'dataset_playback.arff'
    inputRef = ""

    # Start JVM (only the first call has an effect; the later calls are no-ops)
    jvm.start()
    jvm.start(system_cp=True, packages=True)
    jvm.start(max_heap_size="512m")

    # Find reference file
    for file in os.listdir(dirRef):
        if str(file).find(str(os.path.basename(inputFile))) != -1:
            inputRef = os.path.join(dirRef, file)
            break

    # Calculate distance
    (result, distance) = dtw_checker(inputFile, inputRef)

    # Load data
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()  # set class attribute

    # Train the classifier
    # cls = Classifier(classname="weka.classifiers.functions.SMO")
    cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Classify instance
    speed_instance = Instance.create_instance(numpy.ndarray(distance), classname='weka.core.DenseInstance', weight=1.0)
    speed_instance.dataset = data
    speed_flag = cls.classify_instance(speed_instance)

    if (distance == 0):
        speed_class = 'nominal'
    else:
        if speed_flag == 0:
            speed_class = 'down_speed'
        else:
            speed_class = 'up_speed'

    # print os.path.basename(inputFile) + ' --- ' + speed_class

    # Stop JVM
    jvm.stop()

    print "SPEED IS: " + speed_class
    return speed_class
def getCapabilities(self): """ returns the capabilities of this classifier Return: the capabilities of this classifier """ result = Classifier.getCapabilities(self) # attributes result.enable(Capability.NOMINAL_ATTRIBUTES) result.enable(Capability.NUMERIC_ATTRIBUTES) result.enable(Capability.DATE_ATTRIBUTES) result.enable(Capability.STRING_ATTRIBUTES) result.enable(Capability.RELATIONAL_ATTRIBUTES) result.enable(Capability.MISSING_VALUES) # class result.enable(Capability.NOMINAL_CLASS) result.enable(Capability.NUMERIC_CLASS) result.enable(Capability.DATE_CLASS) result.enable(Capability.MISSING_CLASS_VALUES) # instances result.setMinimumNumberInstances(0) return result
def main(): """ Just runs some example code. """ # load a dataset bodyfat_file = helper.get_data_dir() + os.sep + "bodyfat.arff" helper.print_info("Loading dataset: " + bodyfat_file) loader = Loader("weka.core.converters.ArffLoader") bodyfat_data = loader.load_file(bodyfat_file) bodyfat_data.class_is_last() # classifier help helper.print_title("Creating help string") classifier = Classifier(classname="weka.classifiers.trees.M5P") classifier.build_classifier(bodyfat_data) print(classifier)
def train_J48(self, min_per_rule=20): params = [ '-C','0.3', '-M',str(min_per_rule), # '-N',str(folds), # '-R', ] self.base_classifier = Classifier(classname='weka.classifiers.trees.J48', options=params) self._train()
def train(self):
    filename = "train.arff"
    self.write_arff(filename, "train", 0, self.input_x, self.input_y)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(filename)
    data.class_is_last()
    self.cls = Classifier(classname="weka.classifiers.meta.Bagging", options=["-S", "5"])
    self.cls.build_classifier(data)
    os.remove(filename)
def train_model(self, training_data): model_weka = None if os.path.isfile(self.model_file): print 'Model ' + self.name + ' already trained.' else: print 'Starting to train_model model ' + self.name + '.' model_weka = Classifier(classname = self.classname, options = self.options) model_weka.build_classifier(data = training_data) serialization.write(filename = self.model_file, jobject = model_weka) print 'Model ' + self.name + ' trained and saved.' if os.path.isfile(self.parameter_file): print 'Parameters of the model ' + self.name + ' already saved.' else: if model_weka == None: model_weka = Classifier(jobject = serialization.read(self.model_file)) save_file(file_name = self.parameter_file, content = str(model_weka)) print 'Parameters of the model ' + self.name + ' saved.'
def train_JRip(self, min_per_rule=20, optimizations=2, folds=3, seed=42): params = [ '-F', str(folds), # folds '-N', str(min_per_rule), # min elements per rule '-O', str(optimizations), # optimizations '-S', str(seed) #seed ] self.base_classifier = Classifier(classname='weka.classifiers.rules.JRip', options=params) self._train()
def listOptions(self): """ Returns an enumeration describing the available options. Return: an enumeration of all the available options. """ return Classifier.listOptions(self)
def getOptions(self): """ Gets the current settings of the Classifier as string array. Return: an array of strings suitable for passing to setOptions """ return Classifier.getOptions(self)
def trainData(self, arrfFile = None, classname="weka.classifiers.trees.J48", options=["-C", "0.3"]): if arrfFile is not None: self.initData( arrfFile ) if self.data is None: return print 'Contruindo classificador ' + str(classname) + ' ' + ' '.join(options) self.classifier = Classifier(classname=classname, options=options) self.classifier.build_classifier(self.data)
def riaa_checker(inputFile): TRAINING_ARFF = 'C:\Users\ASUS\Desktop\IGNASI\SMC\Workspace\dataset_riaa.arff' # Start JVM jvm.start() jvm.start(system_cp=True, packages=True) jvm.start(max_heap_size="512m") # Calculation of bark bands information (absolute_bark, relative_bark, bark_ratios) = compute_bark_spectrum(inputFile) # Loading data loader = Loader(classname="weka.core.converters.ArffLoader") data = loader.load_file(TRAINING_ARFF) data.class_is_last() # set class attribute # Train the classifier cls = Classifier(classname="weka.classifiers.functions.SMO") #cls = Classifier(classname="weka.classifiers.trees.J48", options = ["-C", "0.3", "-M", "10"]) cls.build_classifier(data) # Classify instance bark_instance = Instance.create_instance(bark_ratios, classname='weka.core.DenseInstance', weight=1.0) bark_instance.dataset = data # Classify instance riaa_flag = cls.classify_instance(bark_instance) if riaa_flag == 0: riaa_class = 'riaa_ok' else: riaa_class = 'riaa_ko' # print os.path.basename(inputFile) + ' --- ' + riaa_class # Stop JVM jvm.stop() print "RIAA FILTERING?: " + riaa_class return riaa_class
def getDecisionTree(self, inputPath): #load arff data = self.load_Arff(inputPath) #classifier data.set_class_index(data.num_attributes() - 1) # set class attribute classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"]) data.set_class_index(data.num_attributes() - 1) classifier.build_classifier(data) classifierStr = str(classifier) for index in range(0,data.num_instances()): instance = data.get_instance(index) #print instance result = classifier.distribution_for_instance(instance) #print result graph = classifier.graph() return graph
def classify(train, test, name="RF", tuning=False): jvm.start() if isinstance(train, list) and isinstance(test, list): train = weka_instance(train) trn_data = converters.load_any_file(train) test = weka_instance(test) tst_data = converters.load_any_file(test) elif os.path.isfile(train) and os.path.isfile(test): trn_data = converters.load_any_file(train) tst_data = converters.load_any_file(test) else: trn = csv_as_ndarray(train) tst = csv_as_ndarray(test) trn_data = converters.ndarray_to_instances(trn, relation="Train") tst_data = converters.ndarray_to_instances(tst, relation="Test") trn_data.class_is_last() tst_data.class_is_last() # t = time() if tuning: opt = tune(train) else: opt = default_opt # print("Time to tune: {} seconds".format(time() - t)) cls = Classifier(classname=classifiers[name.lower()], options=opt) cls.build_classifier(trn_data) distr = [cls.distribution_for_instance(inst)[1] for inst in tst_data] preds = [cls.classify_instance(inst) for inst in tst_data] jvm.stop() return preds, distr
def _train(self): params = [ '-F','weka.filters.unsupervised.attribute.RemoveByName -E ^('+'|'.join(self.attrs)+')$ -V', '-W', self.base_classifier.classname, '--', ] params.extend(self.base_classifier.options) # self.classifier = Classifier(classname='weka.classifiers.meta.FilteredClassifier', options=params) self.classifier = FilteredClassifier(options=params) # self.classifier.filter(Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=['-E','^('+'|'.join(self.attrs)+')$','-V'])) self.classifier.build_classifier(self.data) self.out(self.classifier.__str__().encode('ascii', 'ignore').split("\n")[-2])
def __init__(self, jobject=None, options=None): """ Initializes the specified classifier using either the classname or the supplied JB_Object. :param classname: the classname of the classifier :type classname: str :param jobject: the JB_Object to use :type jobject: JB_Object :param options: the list of commandline options to set :type options: list """ classname = "weka.classifiers.functions.Logistic" if jobject is None: jobject = Classifier.new_instance(classname) self.enforce_type(jobject, "weka.classifiers.functions.Logistic") super(Logistic, self).__init__(classname=classname, jobject=jobject, options=options)
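# Usage sketch (hedged) for the Logistic convenience wrapper defined above; -R (ridge)
# and -M (max iterations) are standard weka.classifiers.functions.Logistic options, and
# "data" stands for an Instances object with its class attribute already set.
# cls = Logistic(options=["-R", "1.0E-8", "-M", "-1"])
# cls.build_classifier(data)
# print(cls)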
def fit(self, X, y): # Check params self.n_features_ = X.shape[1] random_state = check_random_state(self.random_state) if isinstance(self.max_features, str): if self.max_features == "auto": max_features = max(1, int(np.sqrt(self.n_features_))) elif self.max_features == "sqrt": max_features = max(1, int(np.sqrt(self.n_features_))) elif self.max_features == "log2": max_features = max(1, int(np.log2(self.n_features_))) else: raise ValueError( 'Invalid value for max_features. Allowed string ' 'values are "auto", "sqrt" or "log2".') elif self.max_features is None: max_features = self.n_features_ elif isinstance(self.max_features, (numbers.Integral, np.integer)): max_features = self.max_features else: # float max_features = int(self.max_features * self.n_features_) params = {} params["-I"] = self.n_estimators params["-K"] = max_features params["-depth"] = 0 if self.max_depth is None else self.max_depth params["-no-cv"] = None params["-s"] = random_state.randint(1000000) # Convert data self.classes_ = np.unique(y) self.n_classes_ = len(self.classes_) y = np.searchsorted(self.classes_, y) tf = tempfile.NamedTemporaryFile(mode="w", suffix=".arff", dir="/dev/shm", delete=False) to_arff(X, y, self.n_classes_, tf) tf.close() # Run self.model_ = Classifier(name="weka.classifiers.trees.RandomForest", ckargs=params) self.model_.train(tf.name) os.remove(tf.name) return self
def use_classifier(data): """ Uses the meta-classifier AttributeSelectedClassifier for attribute selection. :param data: the dataset to use :type data: Instances """ print("\n1. Meta-classifier") classifier = Classifier(classname="weka.classifiers.meta.AttributeSelectedClassifier") aseval = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval") assearch = ASSearch(classname="weka.attributeSelection.GreedyStepwise", options=["-B"]) base = Classifier(classname="weka.classifiers.trees.J48") # setting nested options is always a bit tricky, getting all the escaped double quotes right # simply using the bean property for setting Java objects is often easier and less error prone classifier.set_property("classifier", base.jobject) classifier.set_property("evaluator", aseval.jobject) classifier.set_property("search", assearch.jobject) evaluation = Evaluation(data) evaluation.crossvalidate_model(classifier, data, 10, Random(1)) print(evaluation.summary())
import weka.core.jvm as jvm jvm.start() jvm.start(system_cp=True, packages=True) jvm.start(packages="/usr/local/lib/python2.7/dist-packages/weka") jvm.start(max_heap_size="512m") data_dir="CSDMC2010_SPAM/CSDMC2010_SPAM/TRAINING" from weka.classifiers import Classifier cls = Classifier(classname="weka.classifiers.trees.J48") cls.options = ["-C", "0.3"] print(cls.options) jvm.stop()
if data_dir is None: data_dir = "." + os.sep + "data" import os import weka.core.jvm as jvm from weka.core.converters import Loader from weka.core.classes import Random from weka.classifiers import Classifier, Evaluation from weka.filters import Filter jvm.start() # load weather.nominal loader = Loader(classname="weka.core.converters.ArffLoader") fname = data_dir + os.sep + "weather.nominal.arff" print("\nLoading dataset: " + fname + "\n") data = loader.load_file(fname) data.set_class_index(data.num_attributes() - 1) # perform 10-fold cross-validation cls = Classifier(classname="weka.classifiers.rules.OneR") evl = Evaluation(data) evl.crossvalidate_model(cls, data, 10, Random(1)) print("10-fold cross-validation:\n" + evl.to_summary()) # build model on full dataset and output it cls.build_classifier(data) print("Model:\n\n" + str(cls)) jvm.stop()
class WekaWrapper: def __init__(self, questionID, algorithm, classifier, parameters, modelParams, optimizer, predict = 0): self.questionID = questionID self.algorithm = algorithm self.classifier = classifier self.parameters = parameters self.modelParams = modelParams self.api = nemoApi() self.config = nemoConfig() self.optimizer = optimizer self.predict = predict self.prediction = None def retrieveData(self, id, dataset): query = self.api.getDataQuery(id, dataset) iquery = InstanceQuery() iquery.db_url = "jdbc:mysql://" + self.config.HOST + ":" + str(self.config.PORT) + "/" + self.config.DB iquery.user = self.config.USER iquery.password = self.config.PASS iquery.query = query data = iquery.retrieve_instances() data.class_is_last() return data def uploadData(self): # Upload file to database self.api.addModel(self.questionID, '?', self.acc, self.model, self.algorithm, False, self.matrix, self.optimizer) info = self.api.fetchQuestionInfo(self.questionID) modelID = info['ID'] for mParam in self.modelParams: mParam.AIModel = modelID self.api.addAIModelParam(mParam) def uploadPrediction(self): # Upload best classifier prediction to database if self.prediction is not None: # Convert prediction to string predStr = 'No prediction' if (self.prediction == 1.0): predStr = "True" elif (self.prediction == 0.0): predStr = "False" print 'Writing ' + predStr self.api.updatePrediction(self.questionID, predStr) def addInstancesToDataset(self, source, dest): # Align the instances of a source dataset to destination's header and add them to the destination dataset i = 0 while i < source.num_instances: values = source.get_instance(i).values it = np.nditer(values, flags=['f_index'], op_flags=['readwrite']) while not it.finished: (it[0], it.index), if (source.attribute(it.index).is_nominal): stringVal = source.get_instance(i).get_string_value(it.index) # print stringVal if(stringVal != '?'): values[it.index] = dest.attribute(it.index).values.index(stringVal) it.iternext() dest.add_instance(Instance.create_instance(values)) i = i + 1 def buildPatientObject(self): # Build a patient to classify patient = self.api.fetchPatientJSON(self.questionID) if patient is not None: newPatient = {} demographics = ['race_cd', 'sex_cd', 'age_in_years_num'] observation_fact_features = ['tval_char', 'nval_num'] for demo in demographics: if demo not in patient: print "Patient definition missing" + demo + "." 
newPatient[demo] = float('nan') else: if patient[demo] is not None and patient[demo] != '': newPatient[demo] = patient[demo] else: print "Demographic " + demo + " for patient is empty" newPatient[demo] = float('nan') for obs in patient['observation_facts']: concept_cd = obs['concept_cd'] for feat in observation_fact_features: if feat in obs: if obs[feat] is not None: newPatient[(concept_cd + feat)] = obs[feat] else: newPatient[(concept_cd + feat)] = float('nan') else: print "Feature " + concept_cd + feat + " missing from Patient definition, marking it None" newPatient[(concept_cd + feat)] = float('nan') return newPatient else: return None def addPatientNominals(self, patient, dataset): # Add the nominal values for the patient to the master header, in case they aren't already there # Loop and add patient's nominal values in case they aren't in masterDataset # newDataset will be the new master header # Waiting on prediction patient to be defined # Should be like {sex_cd: "m", ...} ignoreAttributes = ['readmitted'] atts = [] for a in dataset.attributes(): if (not (a.is_nominal)) or (a.name in ignoreAttributes) : atts.append(a) else: newValues = list(a.values) #print a.name pvalue = patient[a.name] if(pvalue not in newValues): newValues.append(pvalue) atts.append(Attribute.create_nominal(a.name, newValues)) newDataset = Instances.create_instances("Dataset", atts, 0) newDataset.class_is_last() return newDataset def addNominals(self, dataset): # Add the nominal values for all columns, in case a column has none ignoreAttributes = ['readmitted'] atts = [] for a in dataset.attributes(): if (not (a.is_nominal)) or (a.name in ignoreAttributes) : atts.append(a) else: newValues = list(a.values) pvalue = 'DefaultNominal' if(pvalue not in newValues): newValues.append(pvalue) atts.append(Attribute.create_nominal(a.name, newValues)) newDataset = Instances.create_instances("Dataset", atts, 0) newDataset.class_is_last() return newDataset def createPatientInstance(self, patient, dataset): # Create a patient instance to classify ignoreAttributes = ['readmitted'] values = [] for a in dataset.attributes(): if not a.is_nominal: values.append(patient[a.name]) elif a.name in ignoreAttributes: values.append(0) else: values.append(a.values.index(patient[a.name])) #print values newInst = Instance.create_instance(values) return newInst def run(self): # Attach JVM javabridge.attach() # Debug print "Classifier" print self.classifier print "Params" print self.parameters print "Model Params" print self.modelParams # Get data for testing and learning learnerData = self.retrieveData(self.questionID, "learner") testData = self.retrieveData(self.questionID, 'test') masterData = self.retrieveData(self.questionID, 'all') masterData = self.addNominals(masterData) # Check if there is enough correct data to run if (learnerData.num_instances < 1 or testData.num_instances < 1): self.status = self.config.NOT_ENOUGH_DATA return False # If this is a prediction and there is a valid patient, change masterData header patientObj = self.buildPatientObject() patientInstance = None if ((patientObj is not None) and (self.predict == 1)): masterData = self.addPatientNominals(patientObj, masterData) patientInstance = self.createPatientInstance(patientObj, masterData) masterData.add_instance(patientInstance) elif (patientObj is None) and (self.predict == 1): print 'No patient defined for prediction. 
Exiting' return True # Fix dataset headers up to match and fix instances to match headers masterData.delete() learner = masterData.copy_instances(masterData, 0, 0) test = masterData.copy_instances(masterData, 0, 0) self.addInstancesToDataset(learnerData, learner) self.addInstancesToDataset(testData, test) # Comparison of data for testing purposes # print 'learnerData' # print learnerData # print 'learner' # print learner # print 'testData' # print testData # print 'test' # print test # pdb.set_trace() # Instantiate classifier self.cls = Classifier(classname=self.classifier, options=self.parameters) # Run classifier self.cls.build_classifier(learner) # for index, inst in enumerate(learnerData): # prediction = self.cls.classify_instance(inst) # distribution = self.cls.distribution_for_instance(inst) # Test classifier evl = Evaluation(learner) evl.test_model(self.cls, test) # Store information about matrix self.acc = evl.percent_correct self.val = None # Convert numpy array into simple array confusionMatrix = [] confusionMatrix.append([evl.confusion_matrix[0][0], evl.confusion_matrix[0][1]]) confusionMatrix.append([evl.confusion_matrix[1][0], evl.confusion_matrix[1][1]]) # Convert matrix into json format self.matrix = json.dumps(confusionMatrix) # print 'Classifier: ', self.classifier # print 'ID: ', self.questionID # print 'ACC: ', self.acc # print(evl.summary()) # If this is a prediction... make the prediction if ((patientObj is not None) and (self.predict == 1)): masterData.add_instance(patientInstance) print "Running prediction on patient: " print masterData.get_instance(0) self.prediction = self.cls.classify_instance(masterData.get_instance(0)) #self.uploadPrediction() # Temporarily store file to serialize to fileName = str(self.questionID) + self.algorithm + ".model" serialization.write(fileName, self.cls) # Open that file and store it self.model = None with open(fileName, 'rb') as f: self.model = f.read() # Remove temporary file os.remove(fileName) # Set status to awaiting feedback self.status = self.config.AWAITING_FEEDBACK_STATUS return True
def run(self): # Attach JVM javabridge.attach() # Debug print "Classifier" print self.classifier print "Params" print self.parameters print "Model Params" print self.modelParams # Get data for testing and learning learnerData = self.retrieveData(self.questionID, "learner") testData = self.retrieveData(self.questionID, 'test') masterData = self.retrieveData(self.questionID, 'all') masterData = self.addNominals(masterData) # Check if there is enough correct data to run if (learnerData.num_instances < 1 or testData.num_instances < 1): self.status = self.config.NOT_ENOUGH_DATA return False # If this is a prediction and there is a valid patient, change masterData header patientObj = self.buildPatientObject() patientInstance = None if ((patientObj is not None) and (self.predict == 1)): masterData = self.addPatientNominals(patientObj, masterData) patientInstance = self.createPatientInstance(patientObj, masterData) masterData.add_instance(patientInstance) elif (patientObj is None) and (self.predict == 1): print 'No patient defined for prediction. Exiting' return True # Fix dataset headers up to match and fix instances to match headers masterData.delete() learner = masterData.copy_instances(masterData, 0, 0) test = masterData.copy_instances(masterData, 0, 0) self.addInstancesToDataset(learnerData, learner) self.addInstancesToDataset(testData, test) # Comparison of data for testing purposes # print 'learnerData' # print learnerData # print 'learner' # print learner # print 'testData' # print testData # print 'test' # print test # pdb.set_trace() # Instantiate classifier self.cls = Classifier(classname=self.classifier, options=self.parameters) # Run classifier self.cls.build_classifier(learner) # for index, inst in enumerate(learnerData): # prediction = self.cls.classify_instance(inst) # distribution = self.cls.distribution_for_instance(inst) # Test classifier evl = Evaluation(learner) evl.test_model(self.cls, test) # Store information about matrix self.acc = evl.percent_correct self.val = None # Convert numpy array into simple array confusionMatrix = [] confusionMatrix.append([evl.confusion_matrix[0][0], evl.confusion_matrix[0][1]]) confusionMatrix.append([evl.confusion_matrix[1][0], evl.confusion_matrix[1][1]]) # Convert matrix into json format self.matrix = json.dumps(confusionMatrix) # print 'Classifier: ', self.classifier # print 'ID: ', self.questionID # print 'ACC: ', self.acc # print(evl.summary()) # If this is a prediction... make the prediction if ((patientObj is not None) and (self.predict == 1)): masterData.add_instance(patientInstance) print "Running prediction on patient: " print masterData.get_instance(0) self.prediction = self.cls.classify_instance(masterData.get_instance(0)) #self.uploadPrediction() # Temporarily store file to serialize to fileName = str(self.questionID) + self.algorithm + ".model" serialization.write(fileName, self.cls) # Open that file and store it self.model = None with open(fileName, 'rb') as f: self.model = f.read() # Remove temporary file os.remove(fileName) # Set status to awaiting feedback self.status = self.config.AWAITING_FEEDBACK_STATUS return True
fname = data_dir + os.sep + "simpletext-train.arff" print("\nLoading dataset: " + fname + "\n") loader = Loader(classname="weka.core.converters.ArffLoader") data = loader.load_file(fname) data.set_class_index(data.num_attributes() - 1) # 1a filter data print("Filtering data...") fltr = Filter("weka.filters.unsupervised.attribute.StringToWordVector") fltr.set_inputformat(data) filtered = fltr.filter(data) filtered.set_class_index(0) # 1b build classifier print("Building/evaluating classifier...") cls = Classifier(classname="weka.classifiers.trees.J48") cls.build_classifier(filtered) evl = Evaluation(filtered) evl.test_model(cls, filtered) print(evl.to_summary()) print(str(cls)) plg.plot_dot_graph(cls.graph()) # 2. filtered classifier fname = data_dir + os.sep + "simpletext-test.arff" print("\nLoading dataset: " + fname + "\n") loader = Loader(classname="weka.core.converters.ArffLoader") test = loader.load_file(fname) test.set_class_index(test.num_attributes() - 1) print("Building/evaluating filtered classifier...") cls = FilteredClassifier()