def train_and_predict_instances(self, trainingFile, classifier):
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(trainingFile)
    data.class_is_last()
    classes = [str(code) for code in data.class_attribute.values]
    head = [className + " probability" for className in classes]
    head.append("Guess")
    cls = Classifier(classname=classifier)
    cls.build_classifier(data)
    predictions = [[0, 0]] * len(data)
    realLabels = [""] * len(data)
    guess = [0] * len(data)
    for index, inst in enumerate(data):
        pred = cls.classify_instance(inst)
        if inst.get_value(inst.class_index) == pred:
            guess[index] = 1.0
        else:
            guess[index] = 0.0
        dist = cls.distribution_for_instance(inst)
        predictions[index] = [p for p in dist]
        realLabels[index] = classes[int(inst.get_value(inst.class_index))]
        print(str(index + 1) + ": label index=" + str(pred) +
              ", class distribution=" + str(dist))
    return [predictions, guess, head, realLabels]
def main(args):
    """
    Trains a J48 classifier on a training set and outputs the predicted class
    and class distribution alongside the actual class from a test set.
    Class attribute is assumed to be the last attribute.

    :param args: the commandline arguments (train and test datasets)
    :type args: list
    """
    # load a dataset
    helper.print_info("Loading train: " + args[1])
    loader = Loader(classname="weka.core.converters.ArffLoader")
    train = loader.load_file(args[1])
    train.class_index = train.num_attributes - 1
    helper.print_info("Loading test: " + args[2])
    test = loader.load_file(args[2])
    test.class_is_last()

    # classifier
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)

    # output predictions
    print("# - actual - predicted - error - distribution")
    for index, inst in enumerate(test):
        pred = cls.classify_instance(inst)
        dist = cls.distribution_for_instance(inst)
        print("%d - %s - %s - %s - %s" %
              (index + 1,
               inst.get_string_value(inst.class_index),
               inst.class_attribute.value(int(pred)),
               "yes" if pred != inst.get_value(inst.class_index) else "no",
               str(dist.tolist())))
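# The snippets in this collection assume a running JVM. A typical entry point,
# following the python-weka-wrapper examples, wraps main() like this (a
# sketch, not part of the original snippet):
import sys
import traceback
import weka.core.jvm as jvm

if __name__ == "__main__":
    try:
        jvm.start()
        main(sys.argv)
    except Exception:
        print(traceback.format_exc())
    finally:
        jvm.stop()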
def predBtn_clicked(self):
    gender = self.gender_entry.get()
    age = int(self.age_entry.get())
    height = int(self.height_entry.get())
    weight = int(self.weight_entry.get())
    sociability = self.sociability_entry.get()
    stability = self.stability_entry.get()

    # Load the serialized model and its dataset header
    objects = serialization.read_all("J48.model")
    cls = Classifier(jobject=objects[0])
    data = Instances(jobject=objects[1])

    # Create the test instance to be classified
    gender_values = ["Man", "Woman"]
    sociability_values = ["Introvert", "Extrovert"]
    stability_values = ["Stable", "Unstable"]
    values = [
        gender_values.index(gender),
        age,
        height,
        weight,
        self.BMI(weight, height),
        stability_values.index(stability),
        sociability_values.index(sociability),
        Instance.missing_value()
    ]
    inst = Instance.create_instance(values)
    inst.dataset = data

    # Classification
    prediction = int(cls.classify_instance(inst))
    self.controller.show_frame("Result").show(prediction)
    self.clear()
def TestClassification(arff, modelInput, results):
    # Start the Java virtual machine
    jvm.start()
    # Load the trained model
    objects = serialization.read_all(modelInput)
    clsf = Classifier(jobject=objects[0])
    print(clsf)
    # Load the test set
    loader = Loader(classname="weka.core.converters.ArffLoader")
    test = loader.load_file(arff)
    test.class_is_first()
    # Write out the results
    resultsFile = open(results, "w")
    resultsFile.write("No.\tActual\tPredicted\tBenign prob.\tMalignant prob.\n")
    print("No.\tActual\tPredicted\tBenign prob.\tMalignant prob.")
    for index, inst in enumerate(test):
        pred = clsf.classify_instance(inst)
        dist = clsf.distribution_for_instance(inst)
        sampleID = index + 1
        origin = inst.get_string_value(inst.class_index)
        prediction = inst.class_attribute.value(int(pred))
        # "yes" when the prediction matches the original label
        sameAsOrigin = "no" if pred != inst.get_value(inst.class_index) else "yes"
        NRate = dist.tolist()[0]
        PRate = dist.tolist()[1]
        resultsFile.write("%d\t%s\t%s\t%s\t%s\n" %
                          (sampleID, origin, prediction, str(NRate), str(PRate)))
        print("%d\t%s\t%s\t%s\t%s" %
              (sampleID, origin, prediction, str(NRate), str(PRate)))
    resultsFile.close()
    # Shut down the Java virtual machine
    jvm.stop()
    print("Testing complete")
def DecisionTree(data):
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    classifier.build_classifier(data)
    print("")
    print("=== Decision Tree ===")
    print(classifier)
    count_class1 = 0
    count_class0 = 0
    print("Labeling income status of each instance. Please wait..")
    for index, inst in enumerate(data):
        pred = classifier.classify_instance(inst)
        # tally the instances classified as class 1 and class 0
        if str(pred) == "1.0":
            count_class1 += 1
        else:
            count_class0 += 1
        if index % 5000 == 0:
            print(".")
    print("No of instances in class '>50K' = " + str(count_class1))
    print("No of instances in class '<=50K' = " + str(count_class0))
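# A minimal driver for DecisionTree() above, assuming no JVM is active yet and
# that "income.arff" (a hypothetical file name) holds the census data:
import weka.core.jvm as jvm
from weka.core.converters import Loader

jvm.start()
try:
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file("income.arff")  # hypothetical dataset path
    data.class_is_last()
    DecisionTree(data)
finally:
    jvm.stop()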
class DecisionTreeBot(MinimaxBot):
    def __init__(self, number, name=None):
        """
        Minimax bot which uses a decision tree to score board states.
        The decision tree outputs one of ten classes; the higher the class
        number, the better the board state for X.

        :param number: Board.X for player1 or Board.O for player2
        :param name: A descriptive name for the Bot
        """
        if name is None:
            name = "DTree Bot"
        MinimaxBot.__init__(self, number, name=name)
        self.player_type = 'dtree minimax'
        objects = serialization.read_all(
            "models/game/bots/weka_models/j48_default.model")
        self.classifier = Classifier(jobject=objects[0])

    def compute_score(self, board):
        data_model = BoardDataModel(board)
        weka_instance = data_model.get_weka_instance(categorical=True)
        # For some reason, the J48 model occasionally throws
        # IndexOutOfBounds exceptions when classifying new instances.
        # This try/except block is a hacky way of handling those, so we
        # can at least get some results.
        try:
            # category will be one of the ten classes
            category = self.classifier.classify_instance(weka_instance)
        except Exception:
            print("Error in Decision-Tree Classifier!!")
            category = 5
        # Converts the class value into a numeric score between -1 and 1,
        # e.g. class 1 becomes -0.90, class 3 becomes -0.50, class 10
        # becomes 0.90, etc.
        score = ((category - 5.0) / 5.0) - 0.1
        return score
def PredecirUnaTemporada(path):
    jvm.start()
    insta = CrearInstanciaParaPredecir(path)
    file = open('ModelData/wekaHeader.arff', 'r')
    atributos = file.readlines()
    file.close()
    file = open('ModelData/predictionFiles/inst.arff', 'w')
    file.writelines(atributos)
    file.write("\n" + insta + '\n')
    file.close()
    objects = serialization.read_all("ModelData/77PercentModelPaisajes.model")
    classifier = Classifier(jobject=objects[0])
    loader = Loader()
    data = loader.load_file("ModelData/predictionFiles/inst.arff")
    data.class_is_last()
    clases = ["invierno", "verano", "otono", "primavera"]
    prediccion = ""
    for index, inst in enumerate(data):
        pred = classifier.classify_instance(inst)
        dist = classifier.distribution_for_instance(inst)
        prediccion = clases[int(pred)]
    jvm.stop()
    return prediccion
def test_single():
    # features: ['long', 'nK', 'nK_delta', 'nT', 'nT_delta', 'str', 'strsum']
    objs = ['olsize', 'ylsize']
    for obj in objs:
        c = Classifier(jobject=serialization.read(model_file('hash', obj)))
        values = [3.0, 192.0, 124.0, 192.0, 124.0, 6.0, 144.0]
        values.append(0)  # placeholder; should be obj
        ins = Instance.create_instance(values)
        prediction = c.classify_instance(ins)
        print(obj, prediction)
class python_weka(object):
    def __init__(self, input_x, input_y, labels):
        self.input_x = input_x
        self.input_y = input_y
        self.labels = labels

    def write_arff(self, filename, relation, train_or_predict, input_x, input_y=None):
        f = open(filename, "w")
        f.write("@relation " + relation + "\n")
        for i in self.labels:
            # note: the counter is reused below to decide whether a real
            # label or a dummy 0 is written for the class column
            train_or_predict += 1
            if train_or_predict == len(self.labels):
                break
            f.write("@attribute " + i + " " + self.labels[i] + "\n")
        f.write("\n")
        f.write("@data" + "\n")
        for i in range(len(input_x)):
            for j in input_x[i]:
                f.write(str(j) + ",")  # ARFF data rows are comma-separated
            if train_or_predict == 0:
                f.write(str(input_y[i]))
            else:
                f.write(str(0))
            f.write("\n")
        f.close()

    def train(self):
        filename = "train.arff"
        self.write_arff(filename, "train", 0, self.input_x, self.input_y)
        loader = Loader(classname="weka.core.converters.ArffLoader")
        data = loader.load_file(filename)
        data.class_is_last()
        self.cls = Classifier(classname="weka.classifiers.meta.Bagging",
                              options=["-S", "5"])
        self.cls.build_classifier(data)
        os.remove(filename)

    def predict(self, test_data):
        filename = "test.arff"
        self.write_arff(filename, "test", 0, test_data)
        loader = Loader(classname="weka.core.converters.ArffLoader")
        data = loader.load_file(filename)
        data.class_is_last()
        result = []
        for index, inst in enumerate(data):
            pred = self.cls.classify_instance(inst)
            dist = self.cls.distribution_for_instance(inst)
            result.append(dist[0])
        os.remove(filename)
        return result
def predictWithWeka(csvFilenameWithInputToPredict, modelFilename):
    """
    Note: to use this without knowing the class, a dummy class can be
    supplied; ignore the 'actual' and 'error' values in the returned results.

    Note: the file named by csvFilenameWithInputToPredict must contain
    instances of both classes (spam and clean).

    :param csvFilenameWithInputToPredict: name of the CSV file with the
        instances to predict
    :param modelFilename: name of the model file generated by Weka,
        compatible with the input CSV file
    :return results: list of dictionaries with the keys
        index, actual, predicted, error and distribution
    """
    loader = Loader(classname="weka.core.converters.CSVLoader")
    cls = Classifier(jobject=serialization.read(modelFilename))
    data = loader.load_file(csvFilenameWithInputToPredict)
    data.class_is_last()
    multi = MultiFilter()
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "first"])
    numericToNom = Filter(
        classname="weka.filters.unsupervised.attribute.NumericToNominal",
        options=["-R", "8,11"])
    normalize = Filter(
        classname="weka.filters.unsupervised.attribute.Normalize",
        options=["-S", "1.0", "-T", "0.0"])
    multi.filters = [remove, numericToNom, normalize]
    multi.inputformat(data)
    test = multi.filter(data)
    results = []
    for index, inst in enumerate(test):
        result = dict()
        pred = cls.classify_instance(inst)
        dist = cls.distribution_for_instance(inst)
        result["index"] = index + 1
        result["actual"] = inst.get_string_value(inst.class_index)
        result["predicted"] = inst.class_attribute.value(int(pred))
        result["error"] = "yes" if pred != inst.get_value(inst.class_index) else "no"
        result["distribution"] = str(dist.tolist())
        results.append(result)
    return results
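# A hedged usage sketch for predictWithWeka() above: assumes a running JVM and
# a CSV/model pair that match the filter chain ("emails.csv" and "spam.model"
# are hypothetical file names).
import weka.core.jvm as jvm

jvm.start(packages=True)
try:
    results = predictWithWeka("emails.csv", "spam.model")
    for r in results:
        print(r["index"], r["predicted"], r["error"], r["distribution"])
finally:
    jvm.stop()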
def assign_classify(file_location, output="classified.out", model="naivebayes.model"):
    data = read_csv_file(file_location)
    jvm.start()
    # load the serialized classifier
    obj = serialization.read(model)
    classifier = Classifier(jobject=obj)
    # create a file with the predicted label per row
    with open(output, 'w') as cluster_file:
        for index, attrs in enumerate(data):
            # note: the instance carries no dataset header here;
            # see the sketch below for attaching one
            inst = Instance.create_instance(attrs[1:])
            pred = classifier.classify_instance(inst)
            cluster_file.write(str(index + 1) + ": label index=" + str(pred) + "\n")
            print(str(index + 1) + ": label index=" + str(pred))
    jvm.stop()
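# assign_classify() above builds instances without a dataset header; many Weka
# classifiers need one to resolve attribute and class metadata. A minimal
# sketch, assuming the training ARFF ("train.arff" is a hypothetical name)
# declares the same attributes:
from weka.core.converters import Loader
from weka.core.dataset import Instance

loader = Loader(classname="weka.core.converters.ArffLoader")
header = loader.load_file("train.arff")  # hypothetical training file
header.class_is_last()
inst = Instance.create_instance([5.1, 3.5, 1.4, 0.2, 0])  # illustrative values
inst.dataset = header  # attach the header so classify_instance can resolve labels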
def playback_speed_checker(inputFile, dirRef):
    TRAINING_ARFF = 'dataset_playback.arff'
    inputRef = ""

    # Start JVM (a single call; the JVM can only be started once per process)
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")

    # Find reference file
    for file in os.listdir(dirRef):
        if str(file).find(str(os.path.basename(inputFile))) != -1:
            inputRef = os.path.join(dirRef, file)
            break

    # Calculate distance
    (result, distance) = dtw_checker(inputFile, inputRef)

    # Load data
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()  # set class attribute

    # Train the classifier
    # cls = Classifier(classname="weka.classifiers.functions.SMO")
    cls = Classifier(classname="weka.classifiers.trees.J48",
                     options=["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Build the instance to classify (the DTW distance is the single feature)
    speed_instance = Instance.create_instance([distance],
                                              classname='weka.core.DenseInstance',
                                              weight=1.0)
    speed_instance.dataset = data

    # Classify instance
    speed_flag = cls.classify_instance(speed_instance)
    if distance == 0:
        speed_class = 'nominal'
    else:
        if speed_flag == 0:
            speed_class = 'down_speed'
        else:
            speed_class = 'up_speed'

    # Stop JVM
    jvm.stop()

    print("SPEED IS: " + speed_class)
    return speed_class
def predict(attributes):
    jvm.start()
    file_path = print_to_file(attributes)
    # load the saved model
    objects = serialization.read_all(
        "/Users/hosyvietanh/Desktop/data_mining/trained_model.model")
    classifier = Classifier(jobject=objects[0])
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(file_path)
    data.class_is_last()
    prediction = 0
    for index, inst in enumerate(data):
        pred = classifier.classify_instance(inst)
        dist = classifier.distribution_for_instance(inst)
        prediction = int(pred)
        break  # only the first instance is needed
    jvm.stop()
    return prediction
def weka_predict(self):
    # grab WEKA model
    objects = serialization.read_all(self.weka_model)
    classifier = Classifier(jobject=objects[0])

    # load the dataset, i.e. the .arff file generated for the supplied url
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(self.dataset)
    data.class_is_last()

    # for each url tested, predict whether "Phishy" or not using the Random Tree model
    for item in data:
        self.prediction = classifier.classify_instance(item)
def eval_one_split(traindata, testdata, obj):
    c = Classifier(classname="weka.classifiers.trees.M5P")
    c.build_classifier(traindata)
    objidx = traindata.attribute_by_name(obj).index
    preds = []
    reals = []
    for idx, ins in enumerate(testdata):
        pred = c.classify_instance(ins)
        real = ins.get_value(objidx)
        preds.append(pred)
        reals.append(real)
    header = ''
    for h in traindata.attributes():
        header += ' ' + h.name
    return preds, reals
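# A hedged driver for eval_one_split() above, assuming a running JVM and
# pre-split ARFF files ("train.arff"/"test.arff" and the target attribute
# name "score" are hypothetical):
from weka.core.converters import Loader

loader = Loader(classname="weka.core.converters.ArffLoader")
traindata = loader.load_file("train.arff")
traindata.class_is_last()
testdata = loader.load_file("test.arff")
testdata.class_is_last()
preds, reals = eval_one_split(traindata, testdata, "score")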
def functionProcessamento(self, ca1_r, ca1_l, ca2_ca3_r, ca2_ca3_l, sub_r, sub_l, sexo, id):
    jvm.start()
    path = os.path.dirname(os.path.abspath(__file__))
    # TODO: check the individual's sex so the correct model is loaded
    modelo = path + "\\naive_bayes_feminino_novo.model"
    if sexo == "Male":
        print("Individual is male")
        # TODO: swap in the male model here once it is available
        modelo = path + "\\naive_bayes_feminino_novo.model"
    objects = serialization.read_all(modelo)
    classifier = Classifier(jobject=objects[0])
    loader = Loader(classname="weka.core.converters.ArffLoader")
    arquivo = open(path + "\\novo_individuo.arff", "w")
    conteudo = list()
    conteudo.append("@relation alzheimer \n\n")
    conteudo.append("@attribute doente {SIM, NAO} \n")
    conteudo.append("@attribute ca1_right real \n")
    conteudo.append("@attribute ca1_left real \n")
    conteudo.append("@attribute ca2_ca3_right real\n")
    conteudo.append("@attribute ca2_ca3_left real \n")
    conteudo.append("@attribute subic_right real \n")
    conteudo.append("@attribute subic_left real \n\n")
    conteudo.append("@data \n")
    # pass the variables here
    conteudo.append("SIM," + str(ca1_r) + "," + str(ca1_l) + "," +
                    str(ca2_ca3_r) + "," + str(ca2_ca3_l) + "," +
                    str(sub_r) + "," + str(sub_l))
    print(conteudo)
    arquivo.writelines(conteudo)
    arquivo.close()
    data = loader.load_file(path + "\\novo_individuo.arff")
    # the class attribute ('doente') is declared first in the ARFF above
    data.class_is_first()
    for index, inst in enumerate(data):
        pred = classifier.classify_instance(inst)
        dist = classifier.distribution_for_instance(inst)
        # use the class distribution (probability of 'SIM') for a percentage
        pc_doenca = round(dist[0] * 100, 2)
        pc_saudavel = round(100 - pc_doenca, 2)
        print("Alzheimer's percentage=" + str(pc_doenca) +
              "%, healthy percentage=" + str(pc_saudavel) + "%")
        alzheimer = Alzheimer.objects.get(id=id)
        alzheimer.resultado_ad = pc_doenca
        alzheimer.resultado_cn = pc_saudavel
        alzheimer.status_seg = 2
        alzheimer.save()
    jvm.stop()
def run():
    jvm.start()
    load_csv = Loader("weka.core.converters.CSVLoader")
    data_csv = load_csv.load_file(
        "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.csv")
    saver = Saver("weka.core.converters.ArffSaver")
    saver.save_file(
        data_csv,
        "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.arff")

    load_arff = Loader("weka.core.converters.ArffLoader")
    data_arff = load_arff.load_file(
        "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.arff")
    data_arff.class_is_last()

    cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.5"])
    cls.build_classifier(data_arff)
    for index, inst in enumerate(data_arff):
        pred = cls.classify_instance(inst)
        dist = cls.distribution_for_instance(inst)

    # save the pruned tree in a txt file
    saveFile = open(
        "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.txt",
        "w")
    saveFile.write(str(cls))
    saveFile.close()

    global j48
    J48_class = Classifier(classname="weka.classifiers.trees.J48",
                           options=["-C", "0.25", "-M", "2"])
    J48_class.build_classifier(data_arff)
    evaluationj48 = Evaluation(data_arff)
    evaluationj48.crossvalidate_model(J48_class, data_arff, 10, Random(100))
    j48 = str(evaluationj48.percent_correct)
    jvm.stop()
    return j48
def predict(self, modelName, x, arffName, debug=False):
    # Load the ARFF to get the instance structure
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(arffName)
    # The class is assumed to be the last attribute
    data.class_is_last()
    # Load the model generated in Weka
    objects = serialization.read_all(modelName)
    cls = Classifier(jobject=objects[0])
    if debug:
        print("Loaded model...")
        print(cls)
    # Create the instance for the input and classify it
    if debug:
        print("Input", x)
    # Append a dummy value for the instance's class
    if data.class_attribute.is_nominal:
        x.append('a')
    else:
        x.append(0)
    # Convert nominal values to their integer position within the value list
    for i in range(0, data.num_attributes):
        attribute = data.attribute(i)
        if attribute.is_nominal:
            x[i] = attribute.index_of(x[i])
    # Make the prediction
    inst = Instance.create_instance(x)
    inst.dataset = data
    pred = cls.classify_instance(inst)
    if data.class_attribute.is_nominal:
        # value() expects an integer index
        pred = data.class_attribute.value(int(pred))
    if debug:
        print("Prediction", pred)
    return pred
class TweetClassifier2:
    def __init__(self, twitter_user='******'):
        self.__dir = dirname(__file__)
        self.twitter_user = twitter_user
        self.training_data_file = join(
            dirname(__file__),
            '../{}_classified_data.arff'.format(twitter_user))
        self.loader = Loader(classname="weka.core.converters.ArffLoader")
        data = self.loader.load_file(self.training_data_file)
        data.class_is_last()
        # the classes / categories we are classifying tweets as
        self.categories = data.attribute(data.class_index).values
        self.classifier = Classifier(classname='weka.classifiers.trees.J48',
                                     options=['-C', '0.3'])
        self.classifier.build_classifier(data)

    def classify(self, document):
        """Use the trained classifier to determine the class of this new document

        :param document: <str> body of text to classify
        :return: <str> name of the class / category of the document
        """
        document = {
            'text': document,
            'category': ['?'],
        }
        # save as json
        fp = join(self.__dir, '../{}_new_doc.json'.format(self.twitter_user))
        json.dump([document], open(fp, 'w'))
        # transform to arff conforming to attributes from trained data set
        arff_file = transform(fp, 'dont_care', 80, self.training_data_file)
        data = self.loader.load_file(arff_file)
        data.class_is_last()
        predicted = self.classifier.classify_instance(data.get_instance(0))
        return self.categories[int(predicted)]
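# A hedged usage sketch for TweetClassifier2 above (assumes a running JVM and
# the '<user>_classified_data.arff' training file in the expected location;
# "example_user" is a hypothetical account name):
import weka.core.jvm as jvm

jvm.start(packages=True)
try:
    clf = TweetClassifier2(twitter_user="example_user")
    print(clf.classify("Just watched the launch, incredible engineering!"))
finally:
    jvm.stop()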
def train_and_separate_validation(self, trainingSet, validationSet,
                                  validationInstancesNames, classifier):
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(trainingSet)
    data.class_is_last()
    data2 = loader.load_file(validationSet)
    if not len(data2) == len(validationInstancesNames):
        print("There's a mismatch between the number of instances in the "
              "arff file and the list of instance names.")
        raise LookupError
    data2.class_is_last()
    classes = [str(code) for code in data.class_attribute.values]
    header = [[classifier, trainingSet, "", "", ""],
              ["Instance"] +
              [className + " probability" for className in classes] +
              ["Real Class", "Guess"]]
    cls = Classifier(classname=classifier)
    print("Training.")
    cls.build_classifier(data)
    print("Model done!")
    dataMatrix = [["", 0, 0, 0, ""] for i in range(len(data2))]
    print("Validating.")
    for index, inst in enumerate(data2):
        print("Instance: " + str(index + 1) + "/" + str(len(data2)))
        pred = cls.classify_instance(inst)
        if inst.get_value(inst.class_index) == pred:
            guessValue = 1.0
        else:
            guessValue = 0.0
        dist = cls.distribution_for_instance(inst)
        dataMatrix[index][0] = validationInstancesNames[index]
        dataMatrix[index][1:3] = [round(p, 2) for p in dist]
        dataMatrix[index][3] = classes[int(inst.get_value(inst.class_index))]
        dataMatrix[index][4] = guessValue
    print("Done\n")
    return [header, dataMatrix]
def riaa_checker(inputFile):
    TRAINING_ARFF = r'C:\Users\ASUS\Desktop\IGNASI\SMC\Workspace\dataset_riaa.arff'

    # Start JVM (a single call; the JVM can only be started once per process)
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")

    # Calculate bark band information
    (absolute_bark, relative_bark, bark_ratios) = compute_bark_spectrum(inputFile)

    # Load data
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()  # set class attribute

    # Train the classifier
    cls = Classifier(classname="weka.classifiers.functions.SMO")
    # cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Build the instance to classify
    bark_instance = Instance.create_instance(bark_ratios,
                                             classname='weka.core.DenseInstance',
                                             weight=1.0)
    bark_instance.dataset = data

    # Classify instance
    riaa_flag = cls.classify_instance(bark_instance)
    if riaa_flag == 0:
        riaa_class = 'riaa_ok'
    else:
        riaa_class = 'riaa_ko'

    # Stop JVM
    jvm.stop()

    print("RIAA FILTERING?: " + riaa_class)
    return riaa_class
def classify(train, test, name="RF", tuning=False):
    jvm.start()

    if isinstance(train, list) and isinstance(test, list):
        train = weka_instance(train)
        trn_data = converters.load_any_file(train)
        test = weka_instance(test)
        tst_data = converters.load_any_file(test)
    elif os.path.isfile(train) and os.path.isfile(test):
        trn_data = converters.load_any_file(train)
        tst_data = converters.load_any_file(test)
    else:
        trn = csv_as_ndarray(train)
        tst = csv_as_ndarray(test)
        trn_data = converters.ndarray_to_instances(trn, relation="Train")
        tst_data = converters.ndarray_to_instances(tst, relation="Test")
    trn_data.class_is_last()
    tst_data.class_is_last()

    if tuning:
        opt = tune(train)
    else:
        opt = default_opt

    cls = Classifier(classname=classifiers[name.lower()], options=opt)
    cls.build_classifier(trn_data)

    distr = [cls.distribution_for_instance(inst)[1] for inst in tst_data]
    preds = [cls.classify_instance(inst) for inst in tst_data]

    jvm.stop()
    return preds, distr
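# classify() above manages the JVM itself (jvm.start()/jvm.stop() inside), so
# a caller only supplies the data; a sketch with hypothetical file names:
preds, distr = classify("train.csv", "test.csv", name="RF", tuning=False)
print(preds[:10])
print(distr[:10])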
class ObjectiveClassifier:
    def __init__(self, model_path, senti_path, stop_words, ngrams_path):
        self.loader = Loader(classname="weka.core.converters.ArffLoader")
        self.features_calculator = FeaturesCalculator(ngrams_path)
        self.classifier = Classifier(jobject=serialization.read(model_path))
        self.normalizer = Preprocessor(senti_path)
        self.stop_words = stop_words

    def classify_tweet(self, tweet, polarity='"positive"'):
        tweet_normalized = self.normalizer.preprocess(tweet, self.stop_words, "")
        self.features_calculator.calculateFeatures(
            tweet_normalized, "output/tweet_features_objective.arff", polarity)
        tweet_features = self.loader.load_file(
            "output/tweet_features_objective.arff")
        tweet_features.class_is_last()
        for index, inst in enumerate(tweet_features):
            pred = self.classifier.classify_instance(inst)
            dist = self.classifier.distribution_for_instance(inst)
            print("%d - %s - %s" % (index + 1,
                                    inst.class_attribute.value(int(pred)),
                                    str(dist.tolist())))
def python_wrapper(mImage, prefix, file_name, pre_prefix, dir, permanent_dir, model):
    # Initialization of the weka machine learning library
    weka_machine_learning = WML.WekaMachineLearning()

    # tokenization of images
    token = re.split('RGB_|.png', mImage)
    ir_directory = token[0] + 'IR_' + token[1] + '.pgm'
    mat_directory = token[0] + 'Mat_' + token[1]

    # get mat and ir image
    image = segmentor.getImage(ir_directory)
    mat = segmentor.readMatFile(mat_directory)

    # image processing
    edges = segmentor.edgeDetector(image)
    type = segmentor.getTypeOfFruit(image)
    segmentation = segmentor.segmentation(image, type)
    filter = segmentor.filterImageFromSegmentation(image, segmentation)
    output_seg = segmentor.imageMapping(filter, mat['IR'])

    # ------------------ Anomaly Detection via INFLO ------------------
    # file prefix creation for the csv file to save
    prefix_csv = prefix + "\\" + file_name

    # if the folder is not there, create it, then write the csv into it
    if not os.path.exists(prefix):
        os.mkdir(prefix)
        csv = segmentor.writeToCSV(output_seg, prefix_csv)
        print("file is written")
    # else simply write the csv into the folder
    else:
        csv = segmentor.writeToCSV(output_seg, prefix_csv)
        print("file is written")

    # call INFLO.bat after segmenting the image, for anomaly detection
    run_batch_file("rapid_miner_pro_ifruitlfy.bat")

    # --------------------------- Clustering ---------------------------
    # image file directory is stored in ir_directory
    # mat file directory is stored in mat_directory
    # the INFLO file directory is prefix_csv
    anomaly_file = prefix_csv + '.csv_INFLO.csv'
    # a directory for temporary files is made so some results can be stored
    # and processed automatically by RapidMiner 5
    demo_printing_picture(permanent_dir, prefix, mImage, pre_prefix, dir, file_name)
    print("END OF ANOMALY DETECTION CLICK TRAIN AND SHOW RESULT FOR PROCESSING")
    write_temp_dir = permanent_dir + "\\"
    print(prefix)
    print(file_name)

    # Clean up output files from previous runs
    if os.path.exists(permanent_dir + "//output.csv"):
        os.remove(permanent_dir + "//output.csv")
    features = iFruitFly_clustering.cluster_analysis.cluster_analysis(
        ir_directory, permanent_dir + "\\output_INFLO.csv", mat_directory,
        dir + "\\" + file_name, prefix, file_name, permanent_dir)
    if features is None:
        print("Image can't be segmented due to poor calibration")
    # other files are stored for the user in the junk
    else:
        print("printing images->>>>>>> ", prefix + file_name)
        image_plotter(features, ir_directory, prefix + file_name)

    import csv
    # Weka Machine Learning inclusion on 5/30/2017: add one extra column
    with open(permanent_dir + "\\output.csv", 'r') as csvinput:
        with open(permanent_dir + "\\output_n.csv", 'w') as csvoutput:
            writer = csv.writer(csvoutput, lineterminator='\n')
            reader = csv.reader(csvinput)
            all = []
            row = next(reader)
            row.append('result')
            all.append(row)
            for row in reader:
                row.append(0)
                all.append(row)
            writer.writerows(all)

    # model = "J:\iFruitFly\Python Scripts\Model 1\\model.model"
    data_dir = permanent_dir + "\\output_n.csv"
    jvm.start()
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(data_dir)
    # open the model with the serialization library
    objects = serialization.read_all(model)
    classifier = Classifier(jobject=objects[0])
    print("Model Classified")
    print(classifier)
    data.class_is_last()
    for index, inst in enumerate(data):
        pred = classifier.classify_instance(inst)
        dist = classifier.distribution_for_instance(inst)
        print(pred)
def run_classifier(path, prot, sel, cols, prot_vals, beta):
    DIs = dict()
    jvm.start()

    for i in range(len(cols) - 1):
        loader = Loader(classname="weka.core.converters.CSVLoader")
        data = loader.load_file(path)

        # remove the selected attribute from the data
        # NOTE: options are ONE indexed, not ZERO indexed
        remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                        options=["-R", str(sel[2] + 1)])
        remove.inputformat(data)
        data = remove.filter(data)

        # if running for only one attribute, remove all others (except protected)
        if i > 0:
            for j in range(1, prot[2] + 1):
                if i != j:
                    remove = Filter(
                        classname="weka.filters.unsupervised.attribute.Remove",
                        options=["-R", ("1" if i > j else "2")])
                    remove.inputformat(data)
                    data = remove.filter(data)

        # set prot attribute as Class attribute
        data.class_is_last()

        # run classifier
        cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
        cls.build_classifier(data)

        # count the number of each combination
        pos_and_pred = float(0.0)
        pos_and_not_pred = float(0.0)
        neg_and_pred = float(0.0)
        neg_and_not_pred = float(0.0)
        for ind, inst in enumerate(data):
            if cls.classify_instance(inst):
                if prot_vals[ind] == prot[1]:
                    pos_and_pred += 1
                else:
                    neg_and_pred += 1
            else:
                if prot_vals[ind] == prot[1]:
                    pos_and_not_pred += 1
                else:
                    neg_and_not_pred += 1

        # calculate DI
        BER = ((pos_and_not_pred / (pos_and_pred + pos_and_not_pred)) +
               (neg_and_pred / (neg_and_pred + neg_and_not_pred))) * 0.5
        if BER > 0.5:
            BER = 1 - BER
        DI = 1 - ((1 - 2 * BER) / (beta + 1 - 2 * BER))

        if i == 0:  # consider changing this to a 'code word' instead of 'all'
            DIs["all"] = DI
        else:
            DIs[cols[i - 1]] = DI

    jvm.stop()
    return DIs
for classifier in classifiers:
    print("~~~~~~~~~~~~~~~~~~~")
    print(classifier)

data_test = loader.load_file(test_csv)
data_test.class_is_last()

actual_scores = []
with open(test_csv) as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        actual_scores.append(row['Score'])

# truncate the result file before the scores are appended
open(result_csv, 'wb').close()

print(data_test)
for index, inst in enumerate(data_test):
    print(index)
    knn_score = int(knn_classifier.classify_instance(inst))
    lin_score = int(lin_classifier.classify_instance(inst))
    svm_score = int(svm_classifier.classify_instance(inst))
    classifiers_scores = [["KNN", knn_score],
                          ["Linear Regression", lin_score],
                          ["SVM", svm_score],
                          ["Actual Score", actual_scores[index]]]
    write_to_csv(classifiers_scores, result_csv)

jvm.stop()
class WekaWrapper:
    def __init__(self, questionID, algorithm, classifier, parameters,
                 modelParams, optimizer, predict=0):
        self.questionID = questionID
        self.algorithm = algorithm
        self.classifier = classifier
        self.parameters = parameters
        self.modelParams = modelParams
        self.api = nemoApi()
        self.config = nemoConfig()
        self.optimizer = optimizer
        self.predict = predict
        self.prediction = None

    def retrieveData(self, id, dataset):
        query = self.api.getDataQuery(id, dataset)
        iquery = InstanceQuery()
        iquery.db_url = "jdbc:mysql://" + self.config.HOST + ":" + \
            str(self.config.PORT) + "/" + self.config.DB
        iquery.user = self.config.USER
        iquery.password = self.config.PASS
        iquery.query = query
        data = iquery.retrieve_instances()
        data.class_is_last()
        return data

    def uploadData(self):
        # Upload file to database
        self.api.addModel(self.questionID, '?', self.acc, self.model,
                          self.algorithm, False, self.matrix, self.optimizer)
        info = self.api.fetchQuestionInfo(self.questionID)
        modelID = info['ID']
        for mParam in self.modelParams:
            mParam.AIModel = modelID
            self.api.addAIModelParam(mParam)

    def uploadPrediction(self):
        # Upload best classifier prediction to database
        if self.prediction is not None:
            # Convert prediction to string
            predStr = 'No prediction'
            if self.prediction == 1.0:
                predStr = "True"
            elif self.prediction == 0.0:
                predStr = "False"
            print('Writing ' + predStr)
            self.api.updatePrediction(self.questionID, predStr)

    def addInstancesToDataset(self, source, dest):
        # Align the instances of a source dataset to the destination's header
        # and add them to the destination dataset
        i = 0
        while i < source.num_instances:
            values = source.get_instance(i).values
            it = np.nditer(values, flags=['f_index'], op_flags=['readwrite'])
            while not it.finished:
                if source.attribute(it.index).is_nominal:
                    stringVal = source.get_instance(i).get_string_value(it.index)
                    if stringVal != '?':
                        values[it.index] = dest.attribute(it.index).values.index(stringVal)
                it.iternext()
            dest.add_instance(Instance.create_instance(values))
            i = i + 1

    def buildPatientObject(self):
        # Build a patient to classify
        patient = self.api.fetchPatientJSON(self.questionID)
        if patient is not None:
            newPatient = {}
            demographics = ['race_cd', 'sex_cd', 'age_in_years_num']
            observation_fact_features = ['tval_char', 'nval_num']
            for demo in demographics:
                if demo not in patient:
                    print("Patient definition missing " + demo + ".")
                    newPatient[demo] = float('nan')
                else:
                    if patient[demo] is not None and patient[demo] != '':
                        newPatient[demo] = patient[demo]
                    else:
                        print("Demographic " + demo + " for patient is empty")
                        newPatient[demo] = float('nan')
            for obs in patient['observation_facts']:
                concept_cd = obs['concept_cd']
                for feat in observation_fact_features:
                    if feat in obs:
                        if obs[feat] is not None:
                            newPatient[(concept_cd + feat)] = obs[feat]
                        else:
                            newPatient[(concept_cd + feat)] = float('nan')
                    else:
                        print("Feature " + concept_cd + feat +
                              " missing from Patient definition, marking it None")
                        newPatient[(concept_cd + feat)] = float('nan')
            return newPatient
        else:
            return None

    def addPatientNominals(self, patient, dataset):
        # Add the patient's nominal values to the master header, in case they
        # aren't already there; newDataset becomes the new master header.
        # Patient should look like {sex_cd: "m", ...}
        ignoreAttributes = ['readmitted']
        atts = []
        for a in dataset.attributes():
            if (not a.is_nominal) or (a.name in ignoreAttributes):
                atts.append(a)
            else:
                newValues = list(a.values)
                pvalue = patient[a.name]
                if pvalue not in newValues:
                    newValues.append(pvalue)
                atts.append(Attribute.create_nominal(a.name, newValues))
        newDataset = Instances.create_instances("Dataset", atts, 0)
        newDataset.class_is_last()
        return newDataset

    def addNominals(self, dataset):
        # Add nominal values for all columns, in case a column has none
        ignoreAttributes = ['readmitted']
        atts = []
        for a in dataset.attributes():
            if (not a.is_nominal) or (a.name in ignoreAttributes):
                atts.append(a)
            else:
                newValues = list(a.values)
                pvalue = 'DefaultNominal'
                if pvalue not in newValues:
                    newValues.append(pvalue)
                atts.append(Attribute.create_nominal(a.name, newValues))
        newDataset = Instances.create_instances("Dataset", atts, 0)
        newDataset.class_is_last()
        return newDataset

    def createPatientInstance(self, patient, dataset):
        # Create a patient instance to classify
        ignoreAttributes = ['readmitted']
        values = []
        for a in dataset.attributes():
            if not a.is_nominal:
                values.append(patient[a.name])
            elif a.name in ignoreAttributes:
                values.append(0)
            else:
                values.append(a.values.index(patient[a.name]))
        newInst = Instance.create_instance(values)
        return newInst

    def run(self):
        # Attach JVM
        javabridge.attach()
        # Debug
        print("Classifier")
        print(self.classifier)
        print("Params")
        print(self.parameters)
        print("Model Params")
        print(self.modelParams)
        # Get data for testing and learning
        learnerData = self.retrieveData(self.questionID, "learner")
        testData = self.retrieveData(self.questionID, 'test')
        masterData = self.retrieveData(self.questionID, 'all')
        masterData = self.addNominals(masterData)
        # Check if there is enough correct data to run
        if learnerData.num_instances < 1 or testData.num_instances < 1:
            self.status = self.config.NOT_ENOUGH_DATA
            return False
        # If this is a prediction and there is a valid patient, change the masterData header
        patientObj = self.buildPatientObject()
        patientInstance = None
        if (patientObj is not None) and (self.predict == 1):
            masterData = self.addPatientNominals(patientObj, masterData)
            patientInstance = self.createPatientInstance(patientObj, masterData)
            masterData.add_instance(patientInstance)
        elif (patientObj is None) and (self.predict == 1):
            print('No patient defined for prediction. Exiting')
            return True
        # Fix dataset headers to match and fix instances to match headers
        masterData.delete()
        learner = masterData.copy_instances(masterData, 0, 0)
        test = masterData.copy_instances(masterData, 0, 0)
        self.addInstancesToDataset(learnerData, learner)
        self.addInstancesToDataset(testData, test)
        # Instantiate classifier
        self.cls = Classifier(classname=self.classifier, options=self.parameters)
        # Run classifier
        self.cls.build_classifier(learner)
        # Test classifier
        evl = Evaluation(learner)
        evl.test_model(self.cls, test)
        # Store information about the confusion matrix
        self.acc = evl.percent_correct
        self.val = None
        # Convert numpy array into a simple array
        confusionMatrix = []
        confusionMatrix.append([evl.confusion_matrix[0][0], evl.confusion_matrix[0][1]])
        confusionMatrix.append([evl.confusion_matrix[1][0], evl.confusion_matrix[1][1]])
        # Convert matrix into json format
        self.matrix = json.dumps(confusionMatrix)
        # If this is a prediction, make the prediction
        if (patientObj is not None) and (self.predict == 1):
            masterData.add_instance(patientInstance)
            print("Running prediction on patient: ")
            print(masterData.get_instance(0))
            self.prediction = self.cls.classify_instance(masterData.get_instance(0))
            # self.uploadPrediction()
        # Temporarily store file to serialize to
        fileName = str(self.questionID) + self.algorithm + ".model"
        serialization.write(fileName, self.cls)
        # Open that file and store it
        self.model = None
        with open(fileName, 'rb') as f:
            self.model = f.read()
        # Remove temporary file
        os.remove(fileName)
        # Set status to awaiting feedback
        self.status = self.config.AWAITING_FEEDBACK_STATUS
        return True
def evaluate_j48(datasets_path, intermediary_path):
    # for examples on how to use this function, refer to
    # http://pythonhosted.org/python-weka-wrapper/examples.html#build-classifier-on-dataset-output-predictions
    import weka.core.jvm as jvm
    from weka.core.converters import Loader
    from weka.classifiers import Classifier

    from sklearn.metrics import precision_score, accuracy_score, f1_score, confusion_matrix

    from networkx.drawing.nx_agraph import graphviz_layout

    jvm.start()

    json_results = {
        'runs': {
            '1': dict()
        }
    }

    try:
        for dataset in os.listdir(datasets_path):
            dataset_name = dataset.split('.')[0]
            json_results['runs']['1'][dataset_name] = dict()

            loader = Loader(classname="weka.core.converters.ArffLoader")

            y_pred_all = []
            y_true_all = []
            heights = []
            n_nodes = []

            for n_fold in it.count():
                try:
                    train_s = loader.load_file(os.path.join(
                        intermediary_path, '%s_fold_%d_train.arff' % (dataset_name, n_fold)))
                    val_s = loader.load_file(os.path.join(
                        intermediary_path, '%s_fold_%d_val.arff' % (dataset_name, n_fold)))
                    test_s = loader.load_file(os.path.join(
                        intermediary_path, '%s_fold_%d_test.arff' % (dataset_name, n_fold)))

                    train_s.relationname = dataset_name
                    val_s.relationname = dataset_name
                    test_s.relationname = dataset_name

                    train_s.class_is_last()
                    val_s.class_is_last()
                    test_s.class_is_last()

                    warnings.warn('WARNING: appending validation set in training set.')
                    for inst in val_s:
                        train_s.add_instance(inst)

                    cls = Classifier(classname="weka.classifiers.trees.J48",
                                     options=["-C", "0.25", "-M", "2"])
                    # cls = Classifier(classname="weka.classifiers.trees.REPTree",
                    #                  options=["-M", "2", "-V", "0.001", "-N", "3",
                    #                           "-S", "1", "-L", "-1", "-I", "0.0"])
                    cls.build_classifier(train_s)

                    warnings.warn('WARNING: will only work for binary splits!')
                    graph = cls.graph.encode('ascii')
                    out = StringIO.StringIO(graph)
                    G = nx.Graph(nx.nx_pydot.read_dot(out))

                    # TODO plotting of the tree graph (via graphviz_layout and
                    # nx.draw_networkx_*) was left out here

                    heights += [max(map(len, nx.shortest_path(G, source='N0').itervalues()))]
                    n_nodes += [len(G.node)]

                    y_test_true = []
                    y_test_pred = []
                    for index, inst in enumerate(test_s):
                        y_test_true += [inst.get_value(inst.class_index)]
                        y_test_pred += [cls.classify_instance(inst)]

                    y_true_all += y_test_true
                    y_pred_all += y_test_pred

                except Exception as e:
                    break

            json_results['runs']['1'][dataset_name] = {
                'confusion_matrix': confusion_matrix(y_true_all, y_pred_all).tolist(),
                'height': heights,
                'n_nodes': n_nodes,
            }

        # interpret results
        json_results = json.load(open('/home/henry/Desktop/j48/j48_results.json', 'r'))

        n_runs = len(json_results['runs'].keys())
        some_run = json_results['runs'].keys()[0]
        n_datasets = len(json_results['runs'][some_run].keys())

        df = pd.DataFrame(
            columns=['run', 'dataset', 'test_acc', 'height mean', 'height std',
                     'n_nodes mean', 'n_nodes std'],
            index=np.arange(n_runs * n_datasets),
            dtype=np.float32
        )
        df['dataset'] = df['dataset'].astype(np.object)

        count_row = 0
        for n_run, run in json_results['runs'].iteritems():
            for dataset_name, dataset in run.iteritems():
                conf_matrix = np.array(dataset['confusion_matrix'], dtype=np.float32)
                test_acc = np.diag(conf_matrix).sum() / conf_matrix.sum()
                height_mean = np.mean(dataset['height'])
                height_std = np.std(dataset['height'])
                n_nodes_mean = np.mean(dataset['n_nodes'])
                n_nodes_std = np.std(dataset['n_nodes'])

                df.loc[count_row] = [
                    int(n_run), str(dataset_name), float(test_acc),
                    float(height_mean), float(height_std),
                    float(n_nodes_mean), float(n_nodes_std)
                ]
                count_row += 1

        print df

        json.dump(json_results, open('j48_results.json', 'w'), indent=2)
        df.to_csv('j48_results.csv', sep=',', quotechar='\"', index=False)
    finally:
        jvm.stop()
def predictionFromModel():
    import weka.core.serialization as serialization
    from weka.classifiers import Classifier

    predictionsPath = outputPrediction
    models_dir = inputModel
    modelsList = os.listdir(inputModel)
    data_dir = input
    folderList = os.listdir(inputModel)
    loader = Loader(classname="weka.core.converters.ArffLoader")

    data = loader.load_file(os.path.join(inputModel, "genderTest.arff"))
    data.class_is_last()
    modelName = "GenderModel.model"
    objects = serialization.read_all(os.path.join(inputModel, modelName))
    trainedModel = Classifier(jobject=objects[0])
    genderFile = open(os.path.join(outputPrediction, 'Gender_Predictions.csv'), 'w')
    with genderFile:
        j = -1
        fieldnames = ['Test_Author_Profile_Id', 'Gender']
        writer = csv.DictWriter(genderFile, fieldnames=fieldnames)
        writer.writeheader()
        for index, inst in enumerate(data):
            j = j + 1
            pred = trainedModel.classify_instance(inst)
            dist = trainedModel.distribution_for_instance(inst)
            print(str(index + 1) + ": label index=" + str(pred) +
                  ", class distribution=" + str(dist))
            if str(pred) == '0.0':
                writer.writerow({'Test_Author_Profile_Id': my_list[j],
                                 'Gender': 'male'})
            if str(pred) == '1.0':
                writer.writerow({'Test_Author_Profile_Id': my_list[j],
                                 'Gender': 'female'})

    data = loader.load_file(os.path.join(inputModel, "ageTest.arff"))
    data.class_is_last()
    modelName = "AgeModel.model"
    objects = serialization.read_all(os.path.join(inputModel, modelName))
    trainedModel = Classifier(jobject=objects[0])
    ageFile = open(os.path.join(outputPrediction, 'Age_Predictions.csv'), 'w')
    with ageFile:
        j = -1
        fieldnames = ['Test_Author_Profile_Id', 'Age']
        writer = csv.DictWriter(ageFile, fieldnames=fieldnames)
        writer.writeheader()
        for index, inst in enumerate(data):
            j = j + 1
            pred = trainedModel.classify_instance(inst)
            dist = trainedModel.distribution_for_instance(inst)
            print(str(index + 1) + ": label index=" + str(pred) +
                  ", class distribution=" + str(dist))
            if str(pred) == '0.0':
                writer.writerow({'Test_Author_Profile_Id': my_list[j],
                                 'Age': '15-19'})
            if str(pred) == '1.0':
                writer.writerow({'Test_Author_Profile_Id': my_list[j],
                                 'Age': '20-24'})
            if str(pred) == '2.0':
                writer.writerow({'Test_Author_Profile_Id': my_list[j],
                                 'Age': '25-xx'})

    os._exit(0)
def runner(self, cdat, heap_size=16384, seed=None, verbose=True):
    self.set_status(Pipeline.RUNNING)
    self.logs.append('Initializing Pipeline')
    para = self.config

    self.logs.append('Reading Pipeline Configuration')
    head = ''
    name = get_rand_uuid_str()

    self.logs.append('Reading Input File')
    for i, stage in enumerate(self.stages):
        if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
            self.stages[i].status = Pipeline.RUNNING
        if stage.code == 'dat.fle':
            head = os.path.abspath(stage.value.path)
            name, _ = os.path.splitext(stage.value.name)

    self.logs.append('Parsing to ARFF')
    path = os.path.join(head, '{name}.arff'.format(name=name))
    # This bug, I don't know why, using Config.schema instead.
    # cdat.toARFF(path, express_config=para.Preprocess.schema, verbose=verbose)

    for i, stage in enumerate(self.stages):
        if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
            self.stages[i].status = Pipeline.COMPLETE

    self.logs.append('Saved ARFF at {path}'.format(path=path))
    self.logs.append('Splitting to Training and Testing Sets')

    JVM.start(max_heap_size='{size}m'.format(size=heap_size))

    load = Loader(classname='weka.core.converters.ArffLoader')
    # data = load.load_file(path)
    # save = Saver(classname='weka.core.converters.ArffSaver')
    data = load.load_file(os.path.join(head, 'iris.arff'))  # For Debugging Purposes Only
    data.class_is_last()  # For Debugging Purposes Only
    # data.class_index = cdat.iclss

    for i, stage in enumerate(self.stages):
        if stage.code == 'prp.kcv':
            self.stages[i].status = Pipeline.RUNNING

    self.logs.append('Splitting Training Set')

    # TODO - Check if this seed is worth it.
    seed = assign_if_none(seed, random.randint(0, 1000))
    opts = ['-S', str(seed), '-N', str(para.Preprocess.FOLDS)]
    wobj = Filter(classname='weka.filters.supervised.instance.StratifiedRemoveFolds',
                  options=opts + ['-V'])
    wobj.inputformat(data)

    tran = wobj.filter(data)

    self.logs.append('Splitting Testing Set')

    wobj.options = opts
    test = wobj.filter(data)

    for i, stage in enumerate(self.stages):
        if stage.code == 'prp.kcv':
            self.stages[i].status = Pipeline.COMPLETE

    self.logs.append('Performing Feature Selection')

    feat = []
    for comb in para.FEATURE_SELECTION:
        if comb.USE:
            for i, stage in enumerate(self.stages):
                if stage.code == 'ats':
                    search = stage.value.search.name
                    evaluator = stage.value.evaluator.name
                    if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                        self.stages[i].status = Pipeline.RUNNING

            srch = ASSearch(classname='weka.attributeSelection.{classname}'.format(
                classname=comb.Search.NAME,
                options=assign_if_none(comb.Search.OPTIONS, [])
            ))
            ewal = ASEvaluation(classname='weka.attributeSelection.{classname}'.format(
                classname=comb.Evaluator.NAME,
                options=assign_if_none(comb.Evaluator.OPTIONS, [])
            ))

            attr = AttributeSelection()
            attr.search(srch)
            attr.evaluator(ewal)
            attr.select_attributes(tran)

            meta = addict.Dict()
            meta.search = comb.Search.NAME
            meta.evaluator = comb.Evaluator.NAME
            meta.features = [tran.attribute(index).name
                             for index in attr.selected_attributes]

            feat.append(meta)

            for i, stage in enumerate(self.stages):
                if stage.code == 'ats':
                    search = stage.value.search.name
                    evaluator = stage.value.evaluator.name
                    if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                        self.stages[i].status = Pipeline.COMPLETE

    models = []
    for model in para.MODEL:
        if model.USE:
            summary = addict.Dict()

            self.logs.append('Modelling {model}'.format(model=model.LABEL))

            summary.label = model.LABEL
            summary.name = model.NAME
            summary.options = assign_if_none(model.OPTIONS, [])

            for i, stage in enumerate(self.stages):
                if stage.code == 'lrn' and stage.value.name == model.NAME:
                    self.stages[i].status = Pipeline.RUNNING

            for i, instance in enumerate(data):
                iclass = list(range(instance.num_classes))

            options = assign_if_none(model.OPTIONS, [])
            classifier = Classifier(classname='weka.classifiers.{classname}'.format(
                classname=model.NAME), options=options)
            classifier.build_classifier(tran)

            serializer.write(os.path.join(head, '{name}.{classname}.model'.format(
                name=name,
                classname=model.NAME
            )), classifier)

            self.logs.append('Testing model {model}'.format(model=model.LABEL))

            evaluation = Evaluation(tran)
            evaluation.test_model(classifier, test)

            summary.summary = evaluation.summary()

            frame = pd.DataFrame(data=evaluation.confusion_matrix)
            axes = sns.heatmap(frame, cbar=False, annot=True)
            b64str = get_b64_plot(axes)
            summary.confusion_matrix = addict.Dict({
                'value': evaluation.confusion_matrix.tolist(),
                'plot': b64str
            })

            self.logs.append('Plotting Learning Curve for {model}'.format(model=model.LABEL))
            buffer = io.BytesIO()
            plot_classifier_errors(evaluation.predictions, tran, test,
                                   outfile=buffer, wait=False)
            b64str = buffer_to_b64(buffer)
            summary.learning_curve = b64str

            buffer = io.BytesIO()
            plot_roc(evaluation, class_index=iclass, outfile=buffer, wait=False)
            b64str = buffer_to_b64(buffer)
            summary.roc_curve = b64str

            buffer = io.BytesIO()
            plot_prc(evaluation, class_index=iclass, outfile=buffer, wait=False)
            b64str = buffer_to_b64(buffer)
            summary.prc_curve = b64str

            if classifier.graph:
                summary.graph = classifier.graph

            for i, instance in enumerate(test):
                prediction = classifier.classify_instance(instance)

            for i, stage in enumerate(self.stages):
                if stage.code == 'lrn' and stage.value.name == model.NAME:
                    self.stages[i].status = Pipeline.COMPLETE

            models.append(summary)

    self.gist.models = models

    JVM.stop()

    JSON.write(os.path.join(head, '{name}.cgist'.format(name=name)), self.gist)

    self.logs.append('Pipeline Complete')
    self.set_status(Pipeline.COMPLETE)
predictions = [0] * len(sensors)
stepCount = 0
try:
    while True:
        for i in range(len(sensors)):
            # acquire data
            # df = sensors[i].getFrame()
            df = dataManager.getNextWindow()

            # run the data frame (df) through nathan's code; the output is a
            # single-row arff instance (inst)
            # nathan: looks like the iotdata class has some capabilities for
            # handling streaming data, i'll let you do this

            # classify
            pred = classifier.classify_instance(inst)

            # estimate step count
            stepCount += countSteps(df)

            # save prediction
            predictions[i] = pred

        # output predictions
        print(predictions, stepCount)
except KeyboardInterrupt:
    for sensor in sensors:
        sensor.closeDevice()
    jvm.stop()
tempList = list()
jvm.start()

data_dir = r"C:\Users\Softmints\Desktop\Diss\Code\WEKA"

from weka.core.converters import Loader

# Prepare ARFF loader
loader = Loader(classname="weka.core.converters.ArffLoader")
# Assign and load ARFF data file
data = loader.load_file(data_dir + r"\TestDataEleventoTwentyTwo.arff")
data.class_is_last()

from weka.classifiers import Classifier

# Classify data using the J48 classifier
cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
cls.build_classifier(data)

for index, inst in enumerate(data):
    # Output prediction and distribution
    pred = cls.classify_instance(inst)
    dist = cls.distribution_for_instance(inst)
    print(str(index) + ": label index=" + str(pred) +
          ", class distribution=" + str(dist))
    if str(pred) == "0.0":
        tempList.append(str(index))

print(tempList)
jvm.stop()
X_test[:, -1] = classes[0]  # make sure test classes are removed
y_test = Y[test_index]

write_to_weka('train.arff', 'training_data', data.columns, X_train, classes)
write_to_weka('test.arff', 'testing_data', data.columns, X_test, classes)

loader = Loader(classname="weka.core.converters.ArffLoader")
trdata = loader.load_file("train.arff")
trdata.class_is_last()

classifier = Classifier(classname="weka.classifiers.lazy.IBk")
classifier.options = ["-K", "10", "-W", "0", "-I", "-A",
                      "weka.core.neighboursearch.LinearNNSearch -A "
                      "\"weka.core.ManhattanDistance -R first-last\""]
classifier.build_classifier(trdata)

tedata = loader.load_file("test.arff")
tedata.class_is_last()

for index, inst in enumerate(tedata):
    result = classifier.classify_instance(inst)
    Ypred[test_index[index]] = classes[int(result)]

accuracy = float(np.sum(y_test == Ypred[test_index])) / float(y_test.shape[0])
print(" => Accuracy = ", accuracy)
itr += 1

accuracy = float(np.sum(Y == Ypred)) / float(Y.shape[0])
print("Total accuracy = ", accuracy)

os.remove('train.arff')
os.remove('test.arff')
jvm.stop()
class WekaEstimator(BaseEstimator, OptionHandler, RegressorMixin, ClassifierMixin):
    """
    Wraps a Weka classifier (classifier/regressor) within the scikit-learn framework.
    """

    def __init__(self, jobject=None, classifier=None, classname=None, options=None,
                 nominal_input_vars=None, nominal_output_var=None,
                 num_nominal_input_labels=None, num_nominal_output_labels=None):
        """
        Initializes the estimator. Can be instantiated via the following priority of parameters:

        1. JB_Object representing a Java Classifier object
        2. Classifier pww3 wrapper
        3. classname/options

        :param jobject: the JB_Object representing a Weka classifier to use
        :type jobject: JB_Object
        :param classifier: the classifier wrapper to use
        :type classifier: Classifier
        :param classname: the classname of the Weka classifier to instantiate
        :type classname: str
        :param options: the command-line options of the Weka classifier to instantiate
        :type options: list
        :param nominal_input_vars: the list of 0-based indices of attributes to convert to
            nominal, or a range string with 1-based indices
        :type nominal_input_vars: list or str
        :param nominal_output_var: whether to convert the output variable to a nominal one
        :type nominal_output_var: bool
        :param num_nominal_input_labels: the dictionary with the number of labels for the
            nominal input variables (key is 0-based attribute index)
        :type num_nominal_input_labels: dict
        :param num_nominal_output_labels: the number of labels for the output variable
        :type num_nominal_output_labels: int
        """
        if jobject is not None:
            _jobject = jobject
        elif classifier is not None:
            _jobject = classifier.jobject
        elif classname is not None:
            if options is None:
                options = []
            classifier = Classifier(classname=classname, options=options)
            _jobject = classifier.jobject
        else:
            raise Exception("At least the Java classname must be provided!")

        if not is_instance_of(_jobject, "weka.classifiers.Classifier"):
            raise Exception("Java object does not implement weka.classifiers.Classifier!")

        super(WekaEstimator, self).__init__(_jobject)
        self._classifier = Classifier(jobject=_jobject)
        self.header_ = None
        self.classes_ = None

        # the following references are required for get_params/set_params
        self._classname = classname
        self._options = options
        self._nominal_input_vars = nominal_input_vars
        self._nominal_output_var = nominal_output_var
        self._num_nominal_input_labels = num_nominal_input_labels
        self._num_nominal_output_labels = num_nominal_output_labels

    @property
    def classifier(self):
        """
        Returns the underlying classifier object, if any.

        :return: the classifier object
        :rtype: Classifier
        """
        return self._classifier

    @property
    def header(self):
        """
        Returns the underlying dataset header, if any.

        :return: the dataset structure
        :rtype: Instances
        """
        return self.header_

    def fit(self, data, targets):
        """
        Trains the estimator.

        :param data: the input variables as matrix, array-like of shape (n_samples, n_features)
        :type data: ndarray
        :param targets: the class attribute column, array-like of shape (n_samples,)
        :type targets: ndarray
        :return: itself
        :rtype: WekaEstimator
        """
        data, targets = check_X_y(data, y=targets, dtype=None)
        if self._nominal_input_vars is not None:
            data = to_nominal_attributes(data, self._nominal_input_vars)
        if self._nominal_output_var is not None:
            targets = to_nominal_labels(targets)
        d = to_instances(data, targets,
                         num_nominal_labels=self._num_nominal_input_labels,
                         num_class_labels=self._num_nominal_output_labels)
        self._classifier.build_classifier(d)
        self.header_ = d.template_instances(d, 0)
        if d.class_attribute.is_nominal:
            self.classes_ = d.class_attribute.values
        else:
            self.classes_ = None
        return self

    def predict(self, data):
        """
        Performs predictions with the trained classifier.

        :param data: the data matrix to generate predictions for, array-like of shape
            (n_samples, n_features)
        :type data: ndarray
        :return: the score (or scores)
        :rtype: ndarray
        """
        check_is_fitted(self)
        if self._nominal_input_vars is not None:
            data = to_nominal_attributes(data, self._nominal_input_vars)
        data = check_array(data, dtype=None)
        result = []
        for d in data:
            inst = to_instance(self.header_, d, missing_value())
            if self.header_.class_attribute.is_nominal:
                result.append(self.header_.class_attribute.value(
                    int(self._classifier.classify_instance(inst))))
            else:
                result.append(self._classifier.classify_instance(inst))
        return np.array(result)

    def predict_proba(self, data):
        """
        Performs predictions and returns class probabilities.

        :param data: the data matrix to generate predictions for, array-like of shape
            (n_samples, n_features)
        :type data: ndarray
        :return: the probabilities
        """
        check_is_fitted(self)
        if self._nominal_input_vars is not None:
            data = to_nominal_attributes(data, self._nominal_input_vars)
        data = check_array(data, dtype=None)
        result = []
        for d in data:
            inst = to_instance(self.header_, d, missing_value())
            result.append(self._classifier.distribution_for_instance(inst))
        return np.array(result)

    def get_params(self, deep=True):
        """
        Returns the parameters for this classifier, basically classname and options list.

        :param deep: ignored
        :type deep: bool
        :return: the dictionary with options
        :rtype: dict
        """
        result = dict()
        result["classname"] = self._classname
        result["options"] = self._options
        if self._nominal_input_vars is not None:
            result["nominal_input_vars"] = self._nominal_input_vars
        if self._nominal_output_var is not None:
            result["nominal_output_var"] = self._nominal_output_var
        if self._num_nominal_input_labels is not None:
            result["num_nominal_input_labels"] = self._num_nominal_input_labels
        if self._num_nominal_output_labels is not None:
            result["num_nominal_output_labels"] = self._num_nominal_output_labels
        return result

    def set_params(self, **params):
        """
        Sets the options for the classifier, expects 'classname' and 'options'.

        :param params: the parameter dictionary
        :type params: dict
        """
        if len(params) == 0:
            return
        if "classname" not in params:
            raise Exception("Cannot find 'classname' in parameters!")
        if "options" not in params:
            raise Exception("Cannot find 'options' in parameters!")
        self._classname = params["classname"]
        self._options = params["options"]
        self._classifier = Classifier(classname=self._classname, options=self._options)
        self._nominal_input_vars = None
        if "nominal_input_vars" in params:
            self._nominal_input_vars = params["nominal_input_vars"]
        self._nominal_output_var = None
        if "nominal_output_var" in params:
            self._nominal_output_var = params["nominal_output_var"]
        self._num_nominal_input_labels = None
        if "num_nominal_input_labels" in params:
            self._num_nominal_input_labels = params["num_nominal_input_labels"]
        self._num_nominal_output_labels = None
        if "num_nominal_output_labels" in params:
            self._num_nominal_output_labels = params["num_nominal_output_labels"]

    def __str__(self):
        """
        For printing the model.

        :return: the model representation, if any
        :rtype: str
        """
        if self._classifier is None:
            return self._classname + ": No model built yet"
        else:
            return str(self._classifier)

    def __copy__(self):
        """
        Creates a copy of itself.

        :return: the copy
        :rtype: WekaEstimator
        """
        result = WekaEstimator(jobject=deepcopy(self.jobject))
        result._classname = self._classname
        result._options = self._options[:]
        result._nominal_input_vars = None if (self._nominal_input_vars is None) \
            else self._nominal_input_vars[:]
        result._nominal_output_var = self._nominal_output_var
        return result

    def __repr__(self, N_CHAR_MAX=700):
        """
        Returns a valid Python string using its classname and options.

        :param N_CHAR_MAX: ignored
        :type N_CHAR_MAX: int
        :return: the representation
        :rtype: str
        """
        if isinstance(self._nominal_input_vars, str):
            return "WekaEstimator(classname='%s', options=%s, nominal_input_vars='%s', nominal_output_var=%s)" \
                   % (self._classifier.classname, str(self._classifier.options),
                      str(self._nominal_input_vars), str(self._nominal_output_var))
        else:
            return "WekaEstimator(classname='%s', options=%s, nominal_input_vars=%s, nominal_output_var=%s)" \
                   % (self._classifier.classname, str(self._classifier.options),
                      str(self._nominal_input_vars), str(self._nominal_output_var))
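# A hedged usage sketch for WekaEstimator above: wraps J48 as a scikit-learn
# classifier. Assumes a running JVM and that the helper functions it uses
# (to_instances etc.) are importable; X and y values are purely illustrative.
import weka.core.jvm as jvm
import numpy as np

jvm.start(packages=True)
try:
    X = np.array([[1.0, 2.0], [2.0, 1.0], [1.5, 1.8], [3.0, 3.2]])
    y = np.array(["a", "b", "a", "b"])
    est = WekaEstimator(classname="weka.classifiers.trees.J48",
                        options=["-C", "0.25", "-M", "2"],
                        nominal_output_var=True)
    est.fit(X, y)
    print(est.predict(X))
    print(est.predict_proba(X))
finally:
    jvm.stop()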
def test(objs, paras, testfile1, pred, real):
    testfile = preprocess(testfile1, True)
    xref = {'x_nT': 1, 'x_nT_delta': 0, 'x_nK': 1, 'x_nK_delta': 0,
            'x_long': 1, 'x_str': 0, 'x_strsum': 0}
    add_features(xref, 'x')
    zeroref = []
    for k in ['long', 'nK', 'nK_delta', 'nT', 'nT_delta', 'str', 'strsum']:
        zeroref.append(xref['x_%s' % k])
    zeroref.append(0)  # placeholder; should be obj
    for k in addf():
        zeroref.append(xref['x_%s' % k])
    with open(testfile) as fin:
        reader = csv.DictReader(fin)
        linecount = 0
        for line in reader:
            ops = []
            for h in line:
                if h.startswith('op'):
                    ops.append(h[:h.find('_')])
            for op in ops:
                add_features(line, op)
            stats = {}
            valid = True
            real_line = {}
            for h in line:
                if h.startswith('op'):
                    k = h[:h.find('_')]
                    v = h[h.find('_') + 1:]
                    if k not in stats:
                        stats[k] = {}
                    stats[k][v] = pfloat(line[h])
                    if stats[k][v] is None:
                        valid = False
                elif h in objs:
                    real_line[h] = pfloat(line[h])
                    if real_line[h] is None:
                        valid = False
            if not valid:
                continue
            linecount += 1
            if linecount > 250:
                continue
            for obj in objs:
                c = Classifier(jobject=serialization.read(model_file('hash', obj)))
                zerovalue = c.classify_instance(Instance.create_instance(zeroref))
                s = zerovalue
                for op in stats:
                    values = []
                    for k in ['long', 'nK', 'nK_delta', 'nT', 'nT_delta', 'str', 'strsum']:
                        values.append(stats[op][k])
                    values.append(0)  # placeholder; should be obj
                    for k in addf():
                        values.append(stats[op][k])
                    ins = Instance.create_instance(values)
                    prediction = c.classify_instance(ins)
                    s = s + max(prediction - zerovalue, 0)
                pred[obj].append(s)
                real[obj].append(real_line[obj])
    print('test', testfile, 'linecount', linecount)
    subprocess.call('rm %s' % testfile, shell=True)