def f_smote():
    jvm.start()
    train_data, test_data = b_i_impute_data()
    train_data = train_data[:10000]
    y_train = train_data["class"]
    x_train = train_data.drop("class", axis=1)
    # Note: imbalanced-learn >= 0.4 renamed ratio= to sampling_strategy= and
    # fit_sample() to fit_resample(); the original used the older spelling.
    sm = SMOTE(sampling_strategy="minority")
    x_train_sm, y_train_sm = sm.fit_resample(x_train, y_train)
    x_train_sm_df = pd.DataFrame(x_train_sm, columns=x_train.columns)
    y_train_sm_df = pd.DataFrame(y_train_sm, columns=["class"])
    train_data_sm_df = pd.concat([y_train_sm_df, x_train_sm_df], axis=1)
    print_f("smote train data shape", train_data_sm_df.shape)
    train_data_sm_df.to_csv("./train_data_sm.csv", index=False)
    train_data_sm = converters.load_any_file("train_data_sm.csv")
    train_data_sm.class_is_first()
    test_data = converters.load_any_file("test_data.csv")
    test_data.class_is_first()
    cls = Classifier(classname="weka.classifiers.trees.LMT")
    print_f("building classifier")
    cls.build_classifier(train_data_sm)
    print_f("evaluating")
    evl = Evaluation(train_data_sm)
    evl.crossvalidate_model(cls, train_data_sm, 5, Random(1))
    print_f("Train Accuracy:", evl.percent_correct)
    print_f("Train summary")
    print_f(evl.summary())
    print_f("Train class details")
    print_f(evl.class_details())
    print_f("Train confusion matrix")
    print_f(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True,
                   outfile="./plots/2_f_smote_10k.png")
    plt.suptitle("Train ROC Curve", fontsize=20, y=0.95)
    evl = Evaluation(test_data)
    print_f("testing model")
    evl.test_model(cls, test_data)
    print_f("Test Accuracy:", evl.percent_correct)
    print_f("Test summary")
    print_f(evl.summary())
    print_f("Test class details")
    print_f(evl.class_details())
    print_f("Test confusion matrix")
    print_f(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Test ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/f_test_roc_curve.png")
def ClassifyParam(mode, binWidths):
    if not os.path.exists("classificationResults"):
        os.makedirs("classificationResults")
    if "normal" in mode:
        results_file = open("classificationResults/AllVsAll.csv", "w")
        results_file.write("BinWidth, Accuracy\n")
        for binWidth in binWidths:
            train_set = "Data/arff/TrainSet_%s.arff" % binWidth
            test_set = "Data/arff/TestSet_%s.arff" % binWidth
            print("Loading Datasets...")
            train_data = converters.load_any_file(train_set)
            test_data = converters.load_any_file(test_set)
            # Set class attribute
            train_data.class_is_last()
            test_data.class_is_last()
            print("Dataset Loaded!")
            classifier_name = "weka.classifiers.meta.FilteredClassifier"
            classifier = Classifier(classname=classifier_name, options=[
                "-F", "weka.filters.unsupervised.attribute.StringToWordVector -R first-last -W 1000 -C -T -N 1 -stemmer weka.core.stemmers.NullStemmer -M 1 -tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\\\r\\\\n\\\\t.,;:\\\\\\\'\\\\\\\"()?!\\\"\"",
                "-W", "weka.classifiers.bayes.NaiveBayesMultinomial"])
            start_train = time.time()
            classifier.build_classifier(train_data)
            end_train = time.time()
            print("Train\t%s\t%s" % (binWidth, end_train - start_train))
            # Time the classification of a single instance
            for index, inst in enumerate(test_data):
                if index == 0:
                    start_sample = time.time()
                    classifier.classify_instance(inst)
                    end_sample = time.time()
                    print("Sample\t%s\t%s" % (binWidth, end_sample - start_sample))
            print("Evaluating w/ Multinomial Naive Bayes classifier. BinWidth = %s" % binWidth)
            evaluation = Evaluation(test_data)
            start_batch = time.time()
            evaluation.test_model(classifier, test_data)
            end_batch = time.time()
            print("Batch\t%s\t%s" % (binWidth, end_batch - start_batch))
            print(evaluation.summary())
            acc = evaluation.percent_correct / 100.0
            print("Percent correct: " + str(acc))
            results_file.write("%s, %s\n" % (binWidth, acc))
        results_file.close()
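# Hypothetical usage sketch: the bin widths below are invented and just need
# to match the Data/arff/TrainSet_<w>.arff / TestSet_<w>.arff files on disk:
ClassifyParam("normal", ["0.05", "0.1", "0.5"])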
def command():
    jvm.start()
    import weka.core.converters as converters
    clusters = request.form['clusternum']
    a1 = request.form['firstcol']
    a2 = request.form['secondcol']
    # Pick the pre-built dataset matching the requested column pair
    if a1 == 'B' and a2 == 'C':
        data = converters.load_any_file("Data.csv")
    elif a1 == 'B' and a2 == 'D':
        data = converters.load_any_file("Data1.csv")
    elif a1 == 'C' and a2 == 'D':
        data = converters.load_any_file("Data2.csv")
    elif a1 == 'C' and a2 == 'E':
        data = converters.load_any_file("Data3.csv")
    elif a1 == 'D' and a2 == 'E':
        data = converters.load_any_file("Data4.csv")
    # data.class_is_last()
    print(data)
    # from weka.attribute_selection import ASSearch, ASEvaluation, AttributeSelection
    # search = ASSearch(classname="weka.attributeSelection.BestFirst", options=["-D", "1", "-N", "5"])
    # evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval", options=["-P", "2", "-E", "1"])
    # attsel = AttributeSelection()
    # attsel.search(search)
    # attsel.evaluator(evaluator)
    # attsel.select_attributes(data)
    from weka.clusterers import Clusterer
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                          options=["-N", "{}".format(clusters)])
    clusterer.build_clusterer(data)
    print(clusterer)
    # A with-block closes the file reliably; the original called f.close()
    # after the return statement, so it never executed.
    with open("filename.txt", "w") as f:
        f.write(str(clusterer))
        # cluster the data
        for inst in data:
            cl = clusterer.cluster_instance(inst)  # 0-based cluster index
            dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
            print("cluster=" + str(cl) + ", distribution=" + str(dist))
            f.write("cluster=" + str(cl) + ", distribution=" + str(dist))
    return render_template("output.html")
def train(objs, paras, outfiles):
    outfile = preprocess(outfiles)
    print('train', objs, paras, outfile)
    data = converters.load_any_file(outfile)
    preds = {}
    reals = {}
    for obj in objs:
        preds[obj] = []
        reals[obj] = []
    # Assign each instance a random fold label for the 10-fold split
    label = []
    testidxes = []
    for idx, ins in enumerate(data):
        label.append(random.randint(0, 9))
    for i in range(10):
        trainfile, testfile, testidx = split(data, istest_10fold, label, i)
        for obj in objs:
            traindata = cleanup(trainfile, paras, obj)
            testdata = cleanup(testfile, paras, obj)
            pred, real = eval_one_split(traindata, testdata, obj)
            preds[obj].extend(pred)
            reals[obj].extend(real)
        testidxes.extend(testidx)
        subprocess.call('rm %s %s' % (trainfile, testfile), shell=True)
    subprocess.call('rm %s' % outfile, shell=True)
    print('num ins', data.num_instances)
    for obj in objs:
        print(obj, metric(preds[obj], reals[obj]))
    return data, preds, reals, testidxes
def experiment_file_random(path_features, path_folder_save_results, options,
                           classifier, fold, random, name):
    print("start weka")
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))
    d_results = {
        'percent_correct': [],
        'percent_incorrect': [],
        'confusion_matrix': []
    }
    data = converters.load_any_file(path_features)
    data.class_is_last()
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV")
    evl = Evaluation(data)
    evl.crossvalidate_model(cls, data, fold, Random(random), pout)
    d_results['percent_correct'].append(evl.percent_correct)
    d_results['percent_incorrect'].append(evl.percent_incorrect)
    d_results['confusion_matrix'].append(evl.matrix())  # confusion matrix as text
    d_results = pd.DataFrame(data=d_results)
    d_results.to_csv(path_folder_save_results + '/' + str(name) + '.csv',
                     index=False)
    save = pout.buffer_content()
    with open(path_folder_save_results + '/' + 'prediction/' + str(name) +
              '.csv', 'w') as f:
        f.write(save)
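# Hypothetical usage sketch; the paths, options string, and seed are invented:
experiment_file_random("features.csv", "results", "-C 0.25 -M 2",
                       "weka.classifiers.trees.J48", 10, 1, "j48_run1")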
def predict(exp, arff_path, dst_folder):
    """The function to generate a detailed prediction sequence of the experiment.

    Args:
        exp(obj): An util.runtime.Observation object.
        arff_path(str): The string that represents the path of the input arff file.
        dst_folder(str): The path of the folder to put the result.

    Returns:
        None
    """
    global __predictors
    import util.runtime as runtime
    import weka.core.converters as converters
    data = converters.load_any_file(arff_path)
    data.class_is_last()
    for cls_name, cls in __predictors.items():
        f_path = os.path.join(dst_folder, cls_name + '.txt')
        with open(f_path, 'w') as f:
            lines = []
            for index, inst in enumerate(data):
                prediction = cls.classify_instance(inst)
                print("Predictions file:", f_path, "Prediction:", prediction,
                      "[", int(prediction), "]",
                      runtime.all_classes[int(prediction)])
                # print("runtime.all_classes:", runtime.all_classes)
                lines.append(runtime.all_classes[int(prediction)])
            f.writelines('\n'.join(lines))
def create_model(input_file, output_file):
    # Load data
    data = converters.load_any_file(input_file)
    data.class_is_last()  # set class attribute

    # Filter data
    print_title("Filtering Data")
    discretize = Filter(
        classname="weka.filters.unsupervised.attribute.Discretize",
        options=["-B", "10", "-M", "-1.0", "-R", "first-last"])
    discretize.inputformat(data)  # let the filter know about the type of data to filter
    filtered_data = discretize.filter(data)
    print("Done! (believe it or not)")

    print_title("Build Classifier")
    classifier = Classifier(classname="weka.classifiers.trees.RandomForest",
                            options=["-I", "100", "-K", "0", "-S", "1"])
    classifier.build_classifier(filtered_data)
    print("Done! (believe it or not)")
    serialization.write_all(output_file, [classifier, discretize])
    print("Model and filter saved to ", output_file)

    evaluation = Evaluation(data)  # initialize with priors
    evaluation.crossvalidate_model(classifier, filtered_data, 10,
                                   Random(42))  # 10-fold CV
    print(evaluation.summary())
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
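# Counterpart sketch for loading what create_model() saved: read_all returns
# the raw Java objects in the order they were written, so each one must be
# re-wrapped in its Python class. The model file name is hypothetical.
import weka.core.serialization as serialization
from weka.classifiers import Classifier
from weka.filters import Filter

objects = serialization.read_all("rf.model")
classifier = Classifier(jobject=objects[0])
discretize = Filter(jobject=objects[1])
# New data must pass through the same discretize filter before classification.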
def OnlineClassification():
    # Classifies instances in an online way.
    # TODO: This is just a simple example of how online learning can be automated w/ WEKA.
    #       May be useful to later stages of the project.
    data_dir = "Testbed/"
    training = converters.load_any_file(data_dir + "training_dataset.csv")
    training.class_is_last()  # set class attribute
    testing = converters.load_any_file(data_dir + "testing_dataset.csv")
    testing.class_is_last()
    a = open(data_dir + "testing_dataset.csv", "r")
    print(len(a.readlines()))
    cls_classes = ["weka.classifiers.trees.J48",
                   "weka.classifiers.trees.RandomForest",
                   "weka.classifiers.lazy.IBk"]
    classifiers = []
    for cls in cls_classes:
        classifiers.append(Classifier(classname=cls))
    print(colored("======================================================", 'green'))
    print(colored("Experiment for dataset", 'green'))
    print(colored("======================================================", 'green'))
    for i, cls in enumerate(classifiers):
        cls.build_classifier(training)
        print("# - actual - predicted - right - distribution")
        for index, inst in enumerate(testing):
            pred = cls.classify_instance(inst)
            dist = cls.distribution_for_instance(inst)
            print("%d - %s - %s - %s - %s" % (
                index + 1,
                inst.get_string_value(inst.class_index),
                inst.class_attribute.value(int(pred)),
                "yes" if pred == inst.get_value(inst.class_index) else "no",
                str(dist.tolist())))
def e_model_tree():
    # train_data, test_data = b_i_impute_data()
    # train_data.to_csv("./train_data.csv", index=False)
    # test_data.to_csv("./test_data.csv", index=False)
    jvm.start()
    train_data = converters.load_any_file("train_data.csv")
    train_data.class_is_first()
    test_data = converters.load_any_file("test_data.csv")
    test_data.class_is_first()
    cls = Classifier(classname="weka.classifiers.trees.LMT")
    print("building classifier")
    cls.build_classifier(train_data)
    print("evaluating")
    evl = Evaluation(train_data)
    evl.crossvalidate_model(cls, train_data, 5, Random(1))
    print("Train Accuracy:", evl.percent_correct)
    print("Train summary")
    print(evl.summary())
    print("Train class details")
    print(evl.class_details())
    print("Train confusion matrix")
    print(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Train ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/e_train_roc_curve.png")
    evl = Evaluation(test_data)
    evl.test_model(cls, test_data)
    print("Test Accuracy:", evl.percent_correct)
    print("Test summary")
    print(evl.summary())
    print("Test class details")
    print(evl.class_details())
    print("Test confusion matrix")
    print(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Test ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/e_test_roc_curve.png")
def __init__(self):
    jvm.start()
    data_dir = "./DataSet/"
    self.data = converters.load_any_file(data_dir + "chatbot2.arff")
    self.data.class_is_last()
    self.cls = Classifier(classname="weka.classifiers.trees.J48")
    self.cls.build_classifier(self.data)
    self.intens = self.data.attribute_by_name("intent")
def cleanup(f, attrs, obj):
    data = converters.load_any_file(f)
    n = data.num_attributes
    # Drop instances that have a missing value in any attribute
    for idx in range(n):
        data.delete_with_missing(idx)
    # Remove attributes that are neither requested features nor the target;
    # iterate in reverse so deletions don't shift the pending indices
    for idx in reversed(range(n)):
        if data.attribute(idx).name not in attrs and data.attribute(idx).name != obj:
            data.delete_attribute(idx)
    # Mark the target attribute as the class
    for idx in range(data.num_attributes):
        if data.attribute(idx).name == obj:
            data.class_index = idx
    return data
def classify(train, test, name="RF", tuning=False):
    jvm.start()
    if isinstance(train, list) and isinstance(test, list):
        train = weka_instance(train)
        trn_data = converters.load_any_file(train)
        test = weka_instance(test)
        tst_data = converters.load_any_file(test)
    elif os.path.isfile(train) and os.path.isfile(test):
        trn_data = converters.load_any_file(train)
        tst_data = converters.load_any_file(test)
    else:
        trn = csv_as_ndarray(train)
        tst = csv_as_ndarray(test)
        trn_data = converters.ndarray_to_instances(trn, relation="Train")
        tst_data = converters.ndarray_to_instances(tst, relation="Test")
    trn_data.class_is_last()
    tst_data.class_is_last()
    # t = time()
    if tuning:
        opt = tune(train)
    else:
        opt = default_opt
    # print("Time to tune: {} seconds".format(time() - t))
    cls = Classifier(classname=classifiers[name.lower()], options=opt)
    cls.build_classifier(trn_data)
    distr = [cls.distribution_for_instance(inst)[1] for inst in tst_data]
    preds = [cls.classify_instance(inst) for inst in tst_data]
    jvm.stop()
    return preds, distr
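# Hypothetical usage sketch (file paths invented); assumes the module-level
# `classifiers` dict maps "rf" to a Weka classname and `default_opt` is set:
preds, distr = classify("train.csv", "test.csv", name="RF", tuning=False)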
def experiment_more_file(path_files, path_folder_save_results, fold, options,
                         classifier, random, name):
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))
    # Build the list of CSV files up front; removing items from a list while
    # iterating over it (as the original did) skips entries.
    file_list = [f for f in os.listdir(path_files) if ".csv" in f]
    d_results = {
        'name_file': [],
        'percent_correct': [],
        'percent_incorrect': [],
        'confusion_matrix': []
    }
    print(file_list)
    for file in file_list:
        print(str(file))
        data = converters.load_any_file(path_files + "/" + file)
        data.class_is_last()
        pout = PredictionOutput(
            classname="weka.classifiers.evaluation.output.prediction.CSV")
        evl = Evaluation(data)
        evl.crossvalidate_model(cls, data, fold, Random(random), pout)
        d_results['name_file'].append(str(file))
        d_results['percent_correct'].append(evl.percent_correct)
        d_results['percent_incorrect'].append(evl.percent_incorrect)
        d_results['confusion_matrix'].append(evl.matrix())  # confusion matrix as text
        save = pout.buffer_content()
        with open(path_folder_save_results + '/' + 'prediction/' + str(name) +
                  str(file)[:-4] + 'pred_data.csv', 'w') as f:
            f.write(save)
    d_results = pd.DataFrame(data=d_results)
    d_results.to_csv(path_folder_save_results + '/' + str(name) + ".csv",
                     index=False)
def obtainSVM(file):
    data = converters.load_any_file(folderPathOfArffFiles + file + ".arff")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "1-2"])
    remove.inputformat(data)
    data = remove.filter(data)
    data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.functions.LibSVM")
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, kFold, Random(42))
    # Brittle: extracts the ROC area by fixed character offsets in the
    # class-details report (see the note after obtainBayesNet below).
    info = evaluation.class_details()
    roc_area = float(info[406:411])
    return roc_area
def associateRule(request):
    jvm.start()
    data_dir = os.path.dirname(os.path.abspath(__file__))
    data = converters.load_any_file(data_dir +
                                    "/templates/upload_files/export.csv")
    data.class_is_last()
    associator = Associator(classname="weka.associations.Apriori",
                            options=["-C", "-1", "-I"])
    # associator = Associator(classname="weka.associations.Apriori", options=["-N", "9", "-I"])
    associator.build_associations(data)
    rules = str(associator)
    jvm.stop()
    return HttpResponse(rules)
def all_feature(file):
    jvm.start(packages=True)
    data = converters.load_any_file(file)
    data.class_is_last()
    search = ASSearch(classname="weka.attributeSelection.Ranker",
                      options=["-T", "-1.7976931348623157E308", "-N", "-1"])
    attsel = AttributeSelection()
    attsel.search(search)

    def ranked_indices(evaluator_classname):
        # Rank all attributes with the given evaluator and return their
        # 0-based indices, most relevant first.
        evaluator = ASEvaluation(classname=evaluator_classname)
        attsel.evaluator(evaluator)
        attsel.select_attributes(data)
        return attsel.ranked_attributes[:, 0].astype(int)

    chi = ranked_indices("weka.attributeSelection.ChiSquaredAttributeEval")
    info_gain = ranked_indices("weka.attributeSelection.InfoGainAttributeEval")
    gain_ratio = ranked_indices("weka.attributeSelection.GainRatioAttributeEval")
    symmetric_uncertainty = ranked_indices(
        "weka.attributeSelection.SymmetricalUncertAttributeEval")
    jvm.stop()
    return chi, info_gain, gain_ratio, symmetric_uncertainty
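# Hypothetical usage sketch; the arff path is invented. Each returned array
# holds attribute indices ordered from most to least relevant under its metric:
chi, info_gain, gain_ratio, sym_unc = all_feature("features.arff")
print("Top 5 attributes by information gain:", info_gain[:5])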
def bayes_classifier(features):
    # Load the dataset
    instancias = load_any_file("caracteristicas.arff")
    # Flag the last attribute as the class
    instancias.class_is_last()
    # Load the Naive Bayes classifier and train it on the image features
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    classifier.build_classifier(instancias)
    # Create a new instance from the extracted features
    new_instance = Instance.create_instance(features)
    # Add the new instance to the dataset
    instancias.add_instance(new_instance)
    # Link the new instance to the dataset
    new_instance.dataset = instancias
    # Classify the new instance, obtaining the probability of each class.
    # (The original printed the second probability without the *100 scaling.)
    classification = classifier.distribution_for_instance(new_instance)
    print("Classification", " - Apu: ", round(classification[0] * 100, 2),
          " Nelson: ", round(classification[1] * 100, 2))
    return classification
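# Hypothetical usage sketch: `features` must supply one numeric value per
# attribute expected by caracteristicas.arff (the values below are invented):
probs = bayes_classifier([0.41, 0.12, 0.87, 0.05])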
def convert_file(from_x, to_y):
    # Create nominals for emotion attr
    value_list = []
    for i in range(2):
        value_list.append(str(i))
    # Check if more nominals needed
    if not from_x.parent.name.endswith("happy_data"):
        for i in range(2, 7):
            value_list.append(str(i))
    if type(from_x) is not str:
        from_x = str(from_x)
    if type(to_y) is not str:
        to_y = str(to_y)
    # Loads data based on file type
    data = converters.load_any_file(from_x)
    # emotion attribute located at index 0
    emotion_atr = data.attribute(0)
    # need emotion attr to be nominal
    if not emotion_atr.is_nominal:
        # Build a nominal replacement for the emotion attr
        emotion_atr = emotion_atr.create_nominal(emotion_atr.name, value_list)
        # Store all emotion values before swapping to the modified emotion_atr
        emotion_vals = []
        for inst in data:
            emotion_vals.append(int(inst.get_value(0)))
        # Replace emotion attr
        data.delete_attribute(0)
        data.insert_attribute(emotion_atr, 0)
        # Set the values in the new emotion attr
        for inst in data:
            inst.set_string_value(0, str(emotion_vals.pop(0)))
    converters.save_any_file(data, to_y)
def obtainBayesNet(file):
    # The path of the arff extension file must be put.
    data = converters.load_any_file(folderPathOfArffFiles + file + ".arff")
    # In the case of this specific data set, the first two attributes were
    # removed since they represent the name and ranking, which are unique
    # values that would affect the classification. Depending on the data set,
    # certain attributes must be removed.
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "1-2"])
    remove.inputformat(data)
    data = remove.filter(data)
    # It is specified that the class value is the last attribute.
    data.class_is_last()
    # Define the classifier to be used.
    classifier = Classifier(classname="weka.classifiers.bayes.BayesNet")
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, kFold, Random(42))
    # The ROC-AUC is extracted from the string that is received from Weka.
    info = evaluation.class_details()
    roc_area = float(info[406:411])
    return roc_area
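# The fixed-offset slice info[406:411] breaks as soon as the class-details
# report changes shape. A more robust sketch, assuming the standard
# python-weka-wrapper Evaluation API, reads the ROC area directly:
def obtain_roc_area(evaluation, class_index=0):
    # area_under_roc() returns the AUC for the given 0-based class index
    return evaluation.area_under_roc(class_index)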
def loadData(self, fName, temp=True):
    if temp:
        data = converters.load_any_file(fName)
    else:
        data = converters.load_any_file(os.path.join(self.dataDir, fName))
    return data
from dataformatter import DataFormatter
import weka.core.packages as packages

dataDir = os.path.join(os.path.dirname(os.path.abspath('')), 'data')
modelDir = os.path.join(os.path.dirname(os.path.abspath('')), 'models')

dformat = DataFormatter(dataDir)
dformat.dict2arff(os.path.join(dataDir, 'System.csv'),
                  os.path.join(dataDir, 'System.arff'))
# Arff_file = os.path.join(dataDir, 'System.arff')

jvm.start(packages=True)
data = converters.load_any_file(os.path.join(dataDir, 'System.arff'))
clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                      options=["-N", "10", "-S", "10"])
clusterer.build_clusterer(data)
# print clusterer

# cluster the data
# for inst in data:
#     cl = clusterer.cluster_instance(inst)  # 0-based cluster index
#     dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
#     print("cluster=" + str(cl) + ", distribution=" + str(dist))
#     print inst
# serialization.write(os.path.join(modelDir, 'SKM.model'), clusterer)

clusterEM = Clusterer(classname="weka.clusterers.EM")
# (The snippet was truncated mid-call here; any EM options the original
# passed, e.g. -N for the cluster count, are unknown.)
def convertArff2Csv(infile, outfile):
    jvm.start()
    data = converters.load_any_file(infile)
    converters.save_any_file(data, outfile)
    jvm.stop()
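# Because the python-weka-wrapper JVM cannot be restarted once stopped,
# calling convertArff2Csv() twice in one process fails. A sketch that keeps
# one JVM session alive for several conversions (file names hypothetical):
jvm.start()
for src, dst in [("a.arff", "a.csv"), ("b.arff", "b.csv")]:
    data = converters.load_any_file(src)
    converters.save_any_file(data, dst)
jvm.stop()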
def ClassifyTestSet():
    # Tests a classifier's performance with a dedicated test set.
    # Models are evaluated for different combinations of features.
    # Several classifiers may be used.

    # Load datasets
    data_dir = "Testbed/"
    # h = open(data_dir + "training_dataset.csv", "rb")
    # print(h)
    a = open(data_dir + "training_dataset.csv", "r")
    print(len(a.readlines()))
    a = open(data_dir + "testing_dataset.csv", "r")
    print(len(a.readlines()))
    training = converters.load_any_file(data_dir + "training_dataset.csv")
    training.class_is_last()
    testing = converters.load_any_file(data_dir + "testing_dataset.csv")
    testing.class_is_last()  # set class attribute to be the last one listed

    # Choose classifiers to use
    cls_classes = ["weka.classifiers.trees.RandomForest",
                   "weka.classifiers.trees.J48",
                   "weka.classifiers.lazy.IBk"]
    classifiers = []
    for cls in cls_classes:
        classifiers.append(Classifier(classname=cls))

    # Regex for attribute selection
    # (Useful for testing different combinations of attributes)
    identifier_att = ".*id.*"
    timeseries_att = "Mic.*"
    doppler_att = "doppler.*"
    phase_att = "phase.*"
    music_att = "music.*"
    beamform_att = "beamform.*"
    att_set = [timeseries_att, doppler_att, phase_att, music_att, beamform_att]

    # Remove the instance-identifier attribute
    training = FilterAttribute(identifier_att, training)
    testing = FilterAttribute(identifier_att, testing)

    for att_comb in powerset(att_set):
        training_filtered = training
        testing_filtered = testing
        for att in att_comb:
            if len(att) != len(att_set):
                training_filtered = FilterAttribute(att, training_filtered)
                testing_filtered = FilterAttribute(att, testing_filtered)
        print(colored("======================================================", 'green'))
        print(colored("Full attribute set: " + str(att_set), 'green'))
        print(colored("Removed attributes: " + str(att_comb), 'green'))
        print(colored("======================================================", 'green'))
        for i, cls in enumerate(classifiers):
            cls.build_classifier(training_filtered)
            evl = Evaluation(training)
            evl.test_model(cls, testing_filtered)
            print(colored("=> Testing for " + cls_classes[i], 'red'))
            print(evl.summary())
            print(evl.matrix())
def experiment_sequential_file(path_indices, path_features,
                               path_folder_save_results, options, classifier,
                               name, indicator_col, images):
    ind_f = load(path_indices)
    lst = ind_f.files
    for item in lst:
        ind = ind_f[item] + 1
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))
    data = converters.load_any_file(path_features)
    ind = np.append(ind, len(data))
    data.class_is_last()
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV")
    d_results = {
        'index': [],
        'percent_correct': [],
        'percent_incorrect': [],
        'precision': [],
        'recall': [],
        'f-score': [],
        'confusion_matrix': []
    }
    for j in range(len(ind) - 1):
        first = ind[j]
        if j == len(ind) - 2:
            last = ind[j + 1]
        else:
            last = ind[j + 1] - 1
        d_test = data.subset(row_range=str(first) + '-' + str(last))
        if j == 0:  # first block: train on everything after the test block
            d_train = data.subset(row_range=str(last + 1) + '-' + str(ind[-1]))
            print(str(last + 1) + '-' + str(ind[-1]))
        elif j == len(ind) - 2:  # last block: train on everything before it
            d_train = data.subset(row_range='1-' + str(first - 1))
            print('1-' + str(first - 1))
        else:  # central block: train on both sides of the test block
            s = '1-' + str(first - 1) + ',' + str(last + 1) + '-' + str(ind[-1])
            print(s)
            d_train = data.subset(row_range=s)
        cls.build_classifier(d_train)
        evl = Evaluation(data)
        evl.test_model(cls, d_test, pout)
        # print(type(d_train))
        # print(type(d_test))
        d_results['index'].append(str(ind[j]))
        d_results['percent_correct'].append(evl.percent_correct)
        d_results['percent_incorrect'].append(evl.percent_incorrect)
        d_results['precision'].append(evl.precision(1))
        d_results['recall'].append(evl.recall(1))
        d_results['f-score'].append(evl.f_measure(1))
        d_results['confusion_matrix'].append(evl.matrix())  # confusion matrix as text
    save = pout.buffer_content()
    check_folder_or_create(path_folder_save_results + '/' + 'prediction')
    with open(path_folder_save_results + '/' + 'prediction/' + name +
              'pred_data.csv', 'w') as f:
        f.write(save)
    buffer_save = pd.read_csv(path_folder_save_results + '/' + 'prediction/' +
                              name + 'pred_data.csv',
                              index_col=False, header=None)
    col_label = buffer_save[1]
    col_prediction = buffer_save[2]
    col_different = buffer_save[3]
    create_prediction(col_label, col_prediction, col_different, indicator_col,
                      images, name,
                      path_folder_save_results + '/prediction/')
    d_results = pd.DataFrame(data=d_results)
    d_results.to_csv(path_folder_save_results + '/' + name + 'results.csv',
                     index=False)
def experiment_more_file(path_files, path_folder_save_results, fold, options,
                         classifier, random, name, voting=False):
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))
    # Collect the CSV files up front; removing items from a list while
    # iterating over it (as the original did) skips entries.
    file_list = [f for f in os.listdir(path_files) if ".csv" in f]
    d_results = {
        'name_file': [],
        'percent_correct': [],
        'percent_incorrect': [],
        'precision': [],
        'recall': [],
        'f-score': [],
        'confusion_matrix': []
    }
    for file in file_list:
        indicator_table = pd.read_csv(path_files + '/indicator/' + file[0] +
                                      '_indicator.csv')
        indicator = list(indicator_table['indicator'])
        images = list(indicator_table['image'])
        data = converters.load_any_file(path_files + "/" + file)
        data.class_is_last()
        pout = PredictionOutput(
            classname="weka.classifiers.evaluation.output.prediction.CSV")
        evl = Evaluation(data)
        evl.crossvalidate_model(cls, data, fold, Random(random), pout)
        d_results['name_file'].append(str(file))
        d_results['percent_correct'].append(evl.percent_correct)
        d_results['percent_incorrect'].append(evl.percent_incorrect)
        d_results['precision'].append(evl.precision(1))
        d_results['recall'].append(evl.recall(1))
        d_results['f-score'].append(evl.f_measure(1))
        d_results['confusion_matrix'].append(evl.matrix())  # confusion matrix as text
        save = pout.buffer_content()
        check_folder_or_create(path_folder_save_results + '/' + name + '/' +
                               'prediction')
        with open(path_folder_save_results + '/' + name +
                  '/' + 'prediction/pred_data.csv', 'w') as f:
            f.write(save)
        buffer_save = pd.read_csv(path_folder_save_results + '/' + name +
                                  '/' + 'prediction/pred_data.csv',
                                  index_col=False)
        col_label = buffer_save['actual']
        col_prediction = buffer_save['predicted']
        col_different = buffer_save['error']
        create_prediction(col_label, col_prediction, col_different, indicator,
                          images, file[:-4],
                          path_folder_save_results + '/' + name +
                          '/prediction/')
    d_results = pd.DataFrame(data=d_results)
    d_results.to_csv(path_folder_save_results + '/' + str(name) + ".csv")
def train(training_dataset_path, model_cache_file_name, evaluation_is_on,
          summary_file_path):
    """Model Training function

    The function uses the WEKA machine learning library, via the
    python-weka-wrapper Python library. It divides the data into the given
    folds and does the training and evaluation. The trained model is copied
    to the __predictors global variable and also saved (together with the
    training data set) to the model_cache_file_name file. The evaluation
    summary is written to the summary_file_path file.

    Args:
        :param training_dataset_path: the path of the input arff file.
        :param model_cache_file_name:
        :param evaluation_is_on: run evaluation after training (true / false)
        :param summary_file_path: the path of the model evaluation summary file.

    Returns:
        None
    """
    global __classifiers
    global __predictors
    training_data = converters.load_any_file(training_dataset_path)
    training_data.class_is_last()
    lines = []
    summaries = []
    summary_line = [
        'Model'.ljust(16),
        'Precision'.ljust(12),
        'Recall'.ljust(12),
        'F-measure'.ljust(12),
        'Accuracy'.ljust(12),
        'FPR'.ljust(12)
    ]
    summaries.append('\t'.join(summary_line))
    for classifier, option_str in __classifiers.items():
        # Split the option string on spaces, keeping quoted chunks intact
        option_list = re.findall(r'"(?:[^"]+)"|(?:[^ ]+)', option_str)
        option_list = [s.replace('"', '') for s in option_list]
        classifier_name = classifier.split('.')[-1]
        info_str = "Using classifier: {classifier}, options: {options}".format(
            classifier=classifier_name, options=str(option_list))
        localizer_log.msg(info_str)
        lines.append(info_str)

        # Train
        cls = Classifier(classname=classifier, options=option_list)
        localizer_log.msg("Start building classifier")
        cls.build_classifier(training_data)
        localizer_log.msg("Completed building classifier")
        localizer_log.msg("Saving trained model to {model_cache_name}".format(
            model_cache_name=model_cache_file_name))
        # localizer_config.save_model(cls, training_data, model_cache_file_name)
        path = os.path.join('caches', 'model')
        if not os.path.exists(path):
            os.makedirs(path, exist_ok=True)
        path = os.path.join(path, model_cache_file_name + '.cache')
        cls.serialize(path)
        localizer_log.msg("Trained model saved")
        classifier2, _ = Classifier.deserialize(path)
        print(classifier2)
        __predictors[classifier_name] = cls

        if evaluation_is_on:
            # Model evaluation
            localizer_log.msg("Start evaluation classifier")
            evl = Evaluation(training_data)
            localizer_log.msg("Complete evaluation classifier")
            localizer_log.msg("Start cross-validating classifier")
            evl.crossvalidate_model(cls, training_data, 10, Random(1))
            localizer_log.msg("Complete cross-validating classifier")
            # print(evl.percent_correct)
            # print(evl.summary())
            # print(evl.class_details())
            lines.append(evl.summary())
            lines.append(evl.class_details())
            summary_line = []
            summary_line.append(classifier_name.ljust(16))
            summary_line.append("{:.3f}".format(evl.weighted_precision * 100).ljust(12))
            summary_line.append("{:.3f}".format(evl.weighted_recall * 100).ljust(12))
            summary_line.append("{:.3f}".format(evl.weighted_f_measure * 100).ljust(12))
            summary_line.append("{:.3f}".format(evl.percent_correct).ljust(12))
            summary_line.append("{:.3f}".format(evl.weighted_false_positive_rate * 100).ljust(12))
            summaries.append('\t'.join(summary_line))

    # Save evaluation summary to file
    with open(summary_file_path, 'w') as f:
        f.writelines('\n'.join(lines))
        f.writelines('\n' * 5)
        f.writelines('\n'.join(summaries))
""" Naive Bayes in Weka Created on Sun Jul 03 15:49:46 2016 @author: SkYe """ import weka.core.jvm as jvm jvm.start(max_heap_size="2500m") # Load data: Must be a weka-derived object # Dataset has nominal and numeric variables import weka.core.converters as converters data_dir = "data/" data = converters.load_any_file(data_dir + "adult.csv") data.class_is_last() # Create train and test sets from weka.core.classes import Random test, train = data.train_test_split(0.90, Random(1)) # Check data in datasets print(train.num_instances) print(test.num_instances) # Check data in datasets print(train.num_attributes) print(test.num_attributes)
import weka.core.jvm as jvm
import weka.core.converters as conv
from weka.classifiers import Evaluation, Classifier
from weka.core.classes import Random
import weka.plot.classifiers as plcls
import os

jvm.start(packages=True)
data = conv.load_any_file("Dataset/test.arff")
# print(data)
data.class_is_last()
cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 15, Random(1))
# print(evl.summary("=== J48 on anneal (stats) === Rafael Manja", False))
# print(evl.matrix("Matriz do Rafael"))
plcls.plot_classifier_errors(evl.predictions, absolute=False, wait=True)
jvm.stop()
# (Snippet begins mid-loop over column names.)
        NewListColumns.append(ListColumns[i])
print("NewListColumns :")
print(NewListColumns)
for l in NewListColumns:
    data_file = 'C:/PythonProjects/AgentsTurboFan/Test/Test4/Agent' + l + '.csv'
    test = []
    test.append(l)
    print("test :")
    print(test)
    print("\n--> loading:\n")
    print(data_file)
    dataA = load_any_file(data_file)
    dataA.class_is_last()
    DFC = pd.read_csv('C:/PythonProjects/AgentsTurboFan/Test/Test4/Agent' + l +
                      '.csv', delimiter=",")
    for a in range(len(DFC)):
        classvar = DFC.iloc[a, len(DFC.columns) - 1]
        classvarStr = str(classvar)
        print('classvarStr :', classvarStr)
        print('isreal(classvarStr) :', isreal(classvarStr))
        if isreal(classvarStr):
            classifier = Classifier(classname="weka.classifiers.trees.M5P")
            # (The snippet was truncated mid-call here; the original M5P
            # options are unknown.)
def CrossValidateFullDataset():
    # Tests a classifier's performance with 10x cross-validation
    data_dir = "test/"
    print("Loading Dataset...")
    data = converters.load_any_file(data_dir + "full_dataset.csv")
    print("Dataset Loaded!")
    # Set class attribute
    data.class_is_last()
    cls_classes = [  # "weka.classifiers.trees.J48",
                   "weka.classifiers.trees.RandomForest",
                   # "weka.classifiers.lazy.IBk"
                   ]
    classifiers = []
    for cls in cls_classes:
        classifiers.append(Classifier(classname=cls))

    # Regex for attribute selection
    # (Useful for testing different combinations of attributes)
    identifier_att = ".*id.*"
    # timeseries_att = "raw.*"
    rmNoise_att = "rmNoise.*"
    # doppler_att = "doppler.*"
    # phase_att = "phase.*"
    # music_att = "music.*"
    # beamform_att = "beamform.*"
    # music_sliding_att = "music_sliding.*"
    # music_agg_att = "music_agg.*"
    # music_angles_att = "music_angles.*"
    att_set = [rmNoise_att]

    # Remove the instance-identifier attribute
    data = FilterAttribute(identifier_att, data)

    for att_comb in powerset(att_set):
        data_filtered = data
        for att in att_comb:
            if len(att) != len(att_set):
                data_filtered = FilterAttribute(att, data_filtered)
        if str(list(set(att_set) - set(att_comb))) == '[]':
            continue
        print(att_set)
        print(att_comb)
        print(colored("======================================================", 'green'))
        print(colored("Full attribute set: " + str(att_set), 'green'))
        print(colored("Removed attributes: " + str(att_comb), 'red'))
        if len(att_comb) > 0:
            print(colored("Using attributes: " +
                          str(list(set(att_set) - set(att_comb))), 'green'))
        print(colored("======================================================", 'green'))
        print(data_dir)
        for i, cls in enumerate(classifiers):
            evl = Evaluation(data_filtered)
            evl.crossvalidate_model(cls, data_filtered, 10, Random(1))
            print(colored("=> 10x cross-validation for " + cls_classes[i], 'red'))
            print(evl.summary())
            print(evl.matrix())
# Package install: make sure the EvolutionarySearch package is available.
# (The original left a stray closing triple-quote after this block and named
# the variables after a chi-squared snippet it was copied from.)
pkg_name = "EvolutionarySearch"
pkg_installed = False
for p in pkg.installed_packages():
    if p.name == pkg_name:
        pkg_installed = True
if not pkg_installed:
    pkg.install_package(pkg_name)
    print("pkg %s installed, please restart" % pkg_name)
    jvm.stop()
    sys.exit(1)

data_dir = "\\\\egr-1l11qd2\\CLS_lab\\Junya Zhao\\Data driven model _paper [June 25_2018\\FeatureSelection\\EvlSearch\\"
globbed_files = glob.glob(data_dir + "*.csv")
for csv in globbed_files:
    data = converters.load_any_file(csv)
    data.class_is_last()
    search = ASSearch(classname="weka.attributeSelection.EvolutionarySearch",
                      options=["-population-size", "200",
                               "-generations", "500",
                               "-crossover-probability", "0.6"])
    # Note: the original passed "E" without the leading dash, which Weka
    # rejects; CfsSubsetEval expects -E.
    evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval",
                             options=["-P", "1", "-E", "1"])
    attsel = AttributeSelection()
    attsel.folds(10)
    attsel.crossvalidation(True)
    attsel.seed(1)
    attsel.search(search)
    attsel.evaluator(evaluator)
    attsel.select_attributes(data)
def experiment_sequential_file(path_indices, path_features,
                               path_folder_save_results, options, classifier,
                               name):
    ind_f = load(path_indices)
    lst = ind_f.files
    for item in lst:
        ind = ind_f[item] + 1
    cls = Classifier(classname=classifier,
                     options=weka.core.classes.split_options(options))
    data = converters.load_any_file(path_features)
    ind = np.append(ind, len(data))
    data.class_is_last()
    pout = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.CSV")
    d_results = {
        'index': [],
        'percent_correct': [],
        'percent_incorrect': [],
        'confusion_matrix': []
    }
    for j in range(len(ind) - 1):
        print(j)
        print(str(ind[j]) + '-' + str(ind[j + 1]))
        d_test = data.subset(row_range=str(ind[j]) + '-' + str(ind[j + 1]))
        if j == 0:  # first block: train on everything after it
            d_train = data.subset(row_range=str(ind[j + 1] + 1) + '-' +
                                  str(ind[-1]))
        elif j == len(ind) - 2:  # last block: train on everything before it
            d_train = data.subset(row_range='1-' + str(ind[j] - 1))
        else:  # central block: train on both sides of the test block
            s = '1-' + str(ind[j] - 1) + ',' + str(ind[j + 1] + 1) + '-' + \
                str(ind[-1])
            d_train = data.subset(row_range=s)
        cls.build_classifier(d_train)
        evl = Evaluation(data)
        evl.test_model(cls, d_test, pout)
        save = pout.buffer_content()
        with open(path_folder_save_results + '/' + '/prediction/' + name +
                  str(j) + 'pred_data.csv', 'w') as f:
            f.write(save)
        d_results['index'].append(str(ind[j]))
        d_results['percent_correct'].append(evl.percent_correct)
        d_results['percent_incorrect'].append(evl.percent_incorrect)
        d_results['confusion_matrix'].append(evl.matrix())  # confusion matrix as text
    d_results = pd.DataFrame(data=d_results)
    d_results.to_csv(path_folder_save_results + '/' + name + 'results.csv',
                     index=False)