def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # train classifier
    classifier = Classifier("weka.classifiers.trees.J48")
    classifier.build_classifier(iris_data)

    # save and read object
    helper.print_title("I/O: single object")
    outfile = tempfile.gettempdir() + os.sep + "j48.model"
    serialization.write(outfile, classifier)
    model = Classifier(jobject=serialization.read(outfile))
    print(model)

    # save classifier and dataset header (multiple objects)
    helper.print_title("I/O: single object")
    serialization.write_all(outfile, [classifier, Instances.template_instances(iris_data)])
    objects = serialization.read_all(outfile)
    for i, obj in enumerate(objects):
        helper.print_info("Object #" + str(i+1) + ":")
        if javabridge.get_env().is_instance_of(obj, javabridge.get_env().find_class("weka/core/Instances")):
            obj = Instances(jobject=obj)
        elif javabridge.get_env().is_instance_of(obj, javabridge.get_env().find_class("weka/classifiers/Classifier")):
            obj = Classifier(jobject=obj)
        print(obj)
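# Note: the example above assumes a running JVM. A minimal entry-point sketch
# (an assumption, not part of the original snippet):
import weka.core.jvm as jvm

if __name__ == "__main__":
    try:
        jvm.start()
        main()
    finally:
        jvm.stop()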
Example #2
    def serialize(self, model, path):
        """Serialize the java model in specific path

        Arguments:
            model {java object} -- the model to serialize
            path {str} -- path to save the model serialized
        """
        serialization.write(path, model)
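# Hypothetical usage of the method above (names are illustrative assumptions);
# serialization.write accepts wrapper objects such as a trained Classifier:
#   cls = Classifier(classname="weka.classifiers.trees.J48")
#   cls.build_classifier(data)
#   obj.serialize(cls, "/tmp/j48.model")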
Example #3
    def train_weka_model(self,
                         training_data_dir,
                         save_model_dir,
                         log_file,
                         mimic_env=None):
        """
        Just runs some example code.
        """
        loader = Loader(classname="weka.core.converters.CSVLoader")
        training_data = loader.load_file(training_data_dir)
        training_data.class_is_last()

        self.classifier = Classifier(classname="weka.classifiers.trees.M5P",
                                     options=self.options)
        # classifier help, check https://weka.sourceforge.io/doc.dev/weka/classifiers/trees/M5P.html
        self.classifier.build_classifier(training_data)
        # print(classifier)
        graph = self.classifier.graph
        node_number = float(graph.split('\n')[-3].split()[0].replace('N', ''))
        leaves_number = node_number / 2
        serialization.write(save_model_dir, self.classifier)
        # print('Leaves number is {0}'.format(leaves_number), file=log_file)

        evaluation = Evaluation(training_data)
        predicts = evaluation.test_model(self.classifier, training_data)
        # return_value = None
        # if mimic_env is not None:
        predict_dictionary = {}
        for predict_index in range(len(predicts)):
            predict_value = predicts[predict_index]
            if predict_value in predict_dictionary.keys():
                predict_dictionary[predict_value].append(predict_index)
            else:
                predict_dictionary.update({predict_value: [predict_index]})

        # return_value = mimic_env.get_return(state=list(predict_dictionary.values()))
        return_value_log = mimic_env.get_return(
            state=list(predict_dictionary.values()))
        return_value_log_struct = mimic_env.get_return(
            state=list(predict_dictionary.values()), apply_structure_cost=True)
        return_value_var_reduction = mimic_env.get_return(
            state=list(predict_dictionary.values()),
            apply_variance_reduction=True)
        # print("Training return is {0}".format(return_value), file=log_file)

        summary = evaluation.summary()
        numbers = summary.split('\n')
        corr = float(numbers[1].split()[-1])
        mae = float(numbers[2].split()[-1])
        rmse = float(numbers[3].split()[-1])
        rae = float(numbers[4].split()[-2]) / 100
        rrse = float(numbers[5].split()[-2]) / 100
        # print(evl)
        # print("Training summary is "+summary, file=log_file)

        return return_value_log, return_value_log_struct, \
               return_value_var_reduction, mae, rmse, leaves_number
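        # Less brittle alternative (an assumption, not in the original):
        # Evaluation exposes these metrics directly as properties, e.g.
        #   corr = evaluation.correlation_coefficient
        #   mae = evaluation.mean_absolute_error
        #   rmse = evaluation.root_mean_squared_error
        #   rae = evaluation.relative_absolute_error / 100
        #   rrse = evaluation.root_relative_squared_error / 100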
Example #4
    def serialize(self, ser_file):
        """
        Serializes the filter to the specified file.

        :param ser_file: the file to save the filter to
        :type ser_file: str
        """

        serialization.write(ser_file, self)
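    # Hypothetical counterpart (an assumption, not in the original): the filter
    # can be restored with
    #   flt = Filter(jobject=serialization.read(ser_file))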
Example #5
    def __init__(self, index=0, inference="ExactInference", ghostAgents=None):
        BustersAgent.__init__(self, index, inference, ghostAgents)
        jvm.start(max_heap_size="512m")
        self.loader = Loader(classname="weka.core.converters.ArffLoader")
        self.data = self.loader.load_file("data/training-fase3.arff")
        self.data.class_is_last()
        self.cls = Classifier(classname="weka.classifiers.trees.REPTree",
                              options=["-M", "2", "-V", "0.001", "-N", "3", "-S", "1", "-L", "-1"])
        self.cls.build_classifier(self.data)
        serialization.write("data/out.model", self.cls)
Example #6
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # train classifier
    classifier = Classifier("weka.classifiers.trees.J48")
    classifier.build_classifier(iris_data)

    # save and read object
    helper.print_title("I/O: model (using serialization module)")
    outfile = tempfile.gettempdir() + os.sep + "j48.model"
    serialization.write(outfile, classifier)
    model = Classifier(jobject=serialization.read(outfile))
    print(model)

    # save classifier and dataset header (multiple objects)
    helper.print_title("I/O: model and header (using serialization module)")
    serialization.write_all(
        outfile,
        [classifier, Instances.template_instances(iris_data)])
    objects = serialization.read_all(outfile)
    for i, obj in enumerate(objects):
        helper.print_info("Object #" + str(i + 1) + ":")
        if javabridge.get_env().is_instance_of(
                obj,
                javabridge.get_env().find_class("weka/core/Instances")):
            obj = Instances(jobject=obj)
        elif javabridge.get_env().is_instance_of(
                obj,
                javabridge.get_env().find_class(
                    "weka/classifiers/Classifier")):
            obj = Classifier(jobject=obj)
        print(obj)

    # save and read object
    helper.print_title("I/O: just model (using Classifier class)")
    outfile = tempfile.gettempdir() + os.sep + "j48.model"
    classifier.serialize(outfile)
    model, _ = Classifier.deserialize(outfile)
    print(model)

    # save classifier and dataset header (multiple objects)
    helper.print_title("I/O: model and header (using Classifier class)")
    classifier.serialize(outfile, header=iris_data)
    model, header = Classifier.deserialize(outfile)
    print(model)
    if header is not None:
        print(header)
Example #7
def save_all_models(results_dir_str, train_data):
    c = 0
    for algo in algo_func_dict.keys():
        gc.collect()
        print("Training: " + str(algo))
        model = algo_func_dict[algo](train_data)
        out_file = results_dir_str + '/' + algo + ".model"
        serialization.write(out_file, model)
        c += 1
        print(str(c) + ": Model Saved => " + str(out_file))
Example #8
def save_all_models():
    id = str(expression) + '_' + str(user)
    target_dir = '../results/' + str(expression) + '/' + str(user) + '/'
    train_data_file = "../data/arff_files/" + str(user) + "/"
    loader = Loader(classname="weka.core.converters.ArffLoader")
    train_data = loader.load_file(train_data_file)
    for algo in algo_func_dict.keys():
        model = algo_func_dict[algo](train_data)
        out_file = target_dir + algo + ".model"
        serialization.write(out_file, model)
Example #9
def case1():
    loader = Loader(classname="weka.core.converters.ArffLoader")
    file = input("Enter the name of the file without the extension:")
    data = loader.load_file(file + ".arff", incremental=True)
    data.class_is_last()
    # print(data)
    # print(str(data1))
    cls = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    cls.build_classifier(data)
    for inst in loader:
        cls.update_classifier(inst)
    serialization.write(file + ".bin", cls)
    print("Model created with name:", file, ".bin")
Example #10
def create_cluster_model(arff_file, n=10, loader_type="csv", model="kmeans.model"):
    """ create cluster model """
    check_jvm()
    if loader_type == "csv":
        loader = converters.Loader(classname="weka.core.converters.CSVLoader")
    else:
        loader = converters.Loader(classname="weka.core.converters.ArffLoader")

    data = loader.load_file(arff_file)
    clusterer = Clusterer(
        classname="weka.clusterers.SimpleKMeans", options=["-N", str(n)])
    clusterer.build_clusterer(data)
    serialization.write(model, clusterer)
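# Hypothetical usage of the saved clusterer (file name as above):
#   clusterer = Clusterer(jobject=serialization.read("kmeans.model"))
#   for inst in data:
#       print(clusterer.cluster_instance(inst))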
Example #11
    def test_weka_model(self,
                        testing_data_dir,
                        save_model_dir,
                        log_file,
                        mimic_env=None):
        self.classifier = Classifier(
            jobject=serialization.read(save_model_dir))

        graph = self.classifier.graph
        node_number = float(graph.split('\n')[-3].split()[0].replace('N', ''))
        leaves_number = node_number / 2
        serialization.write(save_model_dir, self.classifier)
        # print('Leaves number is {0}'.format(leaves_number), file=log_file)

        loader = Loader(classname="weka.core.converters.CSVLoader")
        testing_data = loader.load_file(testing_data_dir)
        testing_data.class_is_last()

        evaluation = Evaluation(testing_data)
        predicts = evaluation.test_model(self.classifier, testing_data)

        predict_dictionary = {}
        for predict_index in range(len(predicts)):
            predict_value = predicts[predict_index]
            if predict_value in predict_dictionary.keys():
                predict_dictionary[predict_value].append(predict_index)
            else:
                predict_dictionary.update({predict_value: [predict_index]})

        return_value_log = mimic_env.get_return(
            state=list(predict_dictionary.values()))
        return_value_log_struct = mimic_env.get_return(
            state=list(predict_dictionary.values()), apply_structure_cost=True)
        return_value_var_reduction = mimic_env.get_return(
            state=list(predict_dictionary.values()),
            apply_variance_reduction=True)

        summary = evaluation.summary()
        numbers = summary.split('\n')
        corr = float(numbers[1].split()[-1])
        mae = float(numbers[2].split()[-1])
        rmse = float(numbers[3].split()[-1])
        rae = float(numbers[4].split()[-2]) / 100
        rrse = float(numbers[5].split()[-2]) / 100
        # print(evl)
        # print("Testing summary is "+summary, file=log_file)

        return return_value_log, return_value_log_struct, \
               return_value_var_reduction, mae, rmse, leaves_number
Example #12
    def serialize(self, ser_file, header=None):
        """
        Serializes the clusterer to the specified file.

        :param ser_file: the file to save the model to
        :type ser_file: str
        :param header: the (optional) dataset header to store alongside; recommended
        :type header: Instances
        """

        if (header is not None) and header.num_instances > 0:
            header = Instances.template_instances(header)

        if header is not None:
            serialization.write_all(ser_file, [self, header])
        else:
            serialization.write(ser_file, self)
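        # Hypothetical counterpart (an assumption): the pair can be read back with
        #   objects = serialization.read_all(ser_file)
        #   clusterer = Clusterer(jobject=objects[0])
        #   header = Instances(jobject=objects[1]) if len(objects) > 1 else None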
Example #13
def output_model(objs, paras, outfiles):
  global op
  outfile = preprocess(outfiles)
  c = Classifier(classname="weka.classifiers.trees.M5P")
  for obj in objs:
    data = cleanup(outfile, paras, obj)
    print('output_model', op, obj, paras, data.num_instances, outfile)
    header = []
    for a in data.attributes():
      header.append(a.name)
    c.build_classifier(data)
    #print c
    serialization.write(model_file(op, obj), c)
    #e = Evaluation(data)
    #e.test_model(c, data)
    #print e.summary()
  subprocess.call('rm %s' % outfile, shell=True)
Example #14
    def train_model(self, training_data):
        model_weka = None
        if os.path.isfile(self.model_file):
            print('Model ' + self.name + ' already trained.')
        else:
            print('Starting to train model ' + self.name + '.')
            model_weka = Classifier(classname=self.classname, options=self.options)

            model_weka.build_classifier(data=training_data)
            serialization.write(filename=self.model_file, jobject=model_weka)
            print('Model ' + self.name + ' trained and saved.')
        if os.path.isfile(self.parameter_file):
            print('Parameters of the model ' + self.name + ' already saved.')
        else:
            if model_weka is None:
                model_weka = Classifier(jobject=serialization.read(self.model_file))
            save_file(file_name=self.parameter_file, content=str(model_weka))
            print('Parameters of the model ' + self.name + ' saved.')
Example #15
    def test_read_write(self):
        """
        Tests methods read and write.
        """
        fname = self.tempfile("readwrite.ser")
        self.delfile(fname)

        lin = ["A", "B", "C", "D"]
        vin = javabridge.make_instance("java/util/Vector", "()V")
        for element in lin:
            javabridge.call(vin, "add", "(Ljava/lang/Object;)Z", element)
        serialization.write(fname, vin)
        self.assertTrue(os.path.exists(fname), msg="Failed to write to " + fname + "?")

        vout = serialization.read(fname)
        self.assertIsNotNone(vout, msg="Failed to read from " + fname + "?")
        enm = javabridge.call(vout, "elements", "()Ljava/util/Enumeration;")
        lout = typeconv.enumeration_to_list(enm)
        self.delfile(fname)
        self.assertEqual(lin, lout, msg="Input/output differ")
Example #17
def main():
    # load a dataset
    loader = Loader(classname="weka.core.converters.ArffLoader")
    train_data = loader.load_file(
        "/home/atos-user/project/Logs/arff/logs_training.arff")

    train_data.class_is_last()  # set class attribute

    classifier = Classifier(classname="weka.classifiers.rules.PART",
                            options=["-C", "0.25", "-M", "2", "-Q", "1"])
    # classifier = Classifier(classname="weka.classifiers.rules.JRip", options=["-F", "3", "-N", "2.0", "-O", "2", "-S", "1"])
    # classifier = Classifier(classname="weka.classifiers.trees.RandomForest",  options=["-P", "100", "-I", "100", "-num-slots", "1", "-S", "1", "-K", "0", "-M", "1.0", "-V", "0.001"])
    # classifier = Classifier(classname="weka.classifiers.functions.MultilayerPerceptron",  options=["-L", "0.3", "-M", "0.2", "-N", "500", "-S", "0", "-V", "0", "-E", "20", "-H", "a"])
    classifier.build_classifier(train_data)

    # The original snippet uses `evaluation` below without defining it; the
    # next two lines are an assumed reconstruction, evaluating on the training data.
    evaluation = Evaluation(train_data)
    evaluation.test_model(classifier, train_data)

    index_predicted = 0
    index_notpredicted = 0
    fauxpositif1 = 0
    fauxpositif2 = 0
    for index, pred in enumerate(evaluation.predictions):

        if pred.predicted == 1.0 and pred.actual == 1.0:
            index_predicted += 1
        if pred.predicted == 0.0 and pred.actual == 0.0:
            index_notpredicted += 1
        if pred.predicted == 1.0 and pred.actual == 0.0:
            fauxpositif1 += 1
        if pred.predicted == 0.0 and pred.actual == 1.0:
            fauxpositif2 += 1

    print("index_predicted = ", index_predicted)
    print("index_notpredicted = ", index_notpredicted)
    print("fauxpositif1 = ", fauxpositif1)
    print("fauxpositif2 = ", fauxpositif2)
    print(evaluation.summary())

    serialization.write("/home/atos-user/project/project_files/PART.model",
                        classifier)  # save model
Example #18
def save_all_models():
    for user in user_list:
        user_train_dir = os.listdir("../data/arff_files/" + str(user) + "/train/")
        user_train_dir.sort()
        n = len(user_train_dir)
        c = 0
        for expression_index in range(n):
            print("\n", expression_index, "=>", str(expression_list[expression_index]), ':', str(user_train_dir[expression_index]))
            id = str(expression_list[expression_index]) + '_' + str(user)
            target_dir = '../models/' + str(expression_list[expression_index]) + '/' + str(user) + '/'
            train_data_file = "../data/arff_files/" + str(user) + "/train/" + str(user_train_dir[expression_index])
            print(train_data_file, "=>", target_dir, "\n")
            loader = Loader(classname="weka.core.converters.ArffLoader")
            train_data = loader.load_file(train_data_file)
            for algo in algo_func_dict.keys():
                print("training - " + str(algo))
                model = algo_func_dict[algo](train_data)
                out_file = target_dir + algo + ".model"
                serialization.write(out_file, model)
                c += 1
                print(str(c) + ": Model Saved => " + str(out_file))
Example #19
def stream(file, option):
    jvm.start(packages=True)

    if option == 1:
        print(
            "Hi! This is a protected command, please insert the password to proceed!"
        )
        for x in range(3):
            password = input('')
            if password.strip() == 'DMMLproject':
                print("All good!")
                break
            else:
                if x == 2:
                    print(
                        "This command is protected and can be used only by an administrator, please use another command."
                    )
                    return
                else:
                    print(
                        "Wrong password, please provide the correct password")

    hoeffding = loadModel('models/HoeffdingTree.model')
    f = open(file, 'r')
    while True:
        line = f.readline()
        if not line:
            break
        if option == 0:
            classifyOne(line.strip(), hoeffding)
        else:
            print('Stream update start at: ', datetime.now().time())
            hoeffding = retrainOneInc(line.strip(), hoeffding)
            print('Stream update end at: ', datetime.now().time())
    f.close()
    sr.write('models/HoeffdingTree.model', hoeffding)
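# `loadModel` is not shown above; a plausible sketch (an assumption), matching
# the serialization calls used elsewhere in these examples:
def loadModel(path):
    # wrap a classifier deserialized from disk
    return Classifier(jobject=sr.read(path))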
Example #20
def classify(data,
             classifier,
             cv,
             modelPath,
             folds=10,
             splitPerc=70,
             randomSeed=10):
    # cross validate the model
    if cv:
        print('CV start at: ', datetime.now().time())
        evaluation = Evaluation(data)
        evaluation.crossvalidate_model(classifier, data, folds,
                                       Random(randomSeed))
        print('CV end at: ', datetime.now().time())
        displayResults("Cross Validation", evaluation)

    else:
        # split data into train and test
        print('Split start training at: ', datetime.now().time())
        train, test = data.train_test_split(splitPerc, Random(randomSeed))
        # build classifier with training set
        classifier.build_classifier(train)

        print(classifier)

        print('Split end training at: ', datetime.now().time())
        evaluation = Evaluation(train)

        print('Split start at: ', datetime.now().time())
        evaluation.test_model(classifier, test)
        print('Split end at: ', datetime.now().time())

        # evaluation.evaluate_model(classifier, ["-t", test])

        displayResults("TrainTestSplit", evaluation)
        sr.write(modelPath, classifier)
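# Hypothetical invocation (an assumption; data loading as in the other examples):
#   data = converters.load_any_file("dataset.csv")
#   data.class_is_last()
#   classify(data, Classifier(classname="weka.classifiers.trees.J48"),
#            cv=False, modelPath="models/J48.model")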
Example #21
	def run(self):
		# Attach JVM
		javabridge.attach()

		# Debug

		print "Classifier"
		print self.classifier
		print "Params"
		print self.parameters
		print "Model Params"
		print self.modelParams

		# Get data for testing and learning
		learnerData = self.retrieveData(self.questionID, "learner")
		testData = self.retrieveData(self.questionID, 'test')
		masterData = self.retrieveData(self.questionID, 'all')
		masterData = self.addNominals(masterData)

		# Check if there is enough correct data to run
		if (learnerData.num_instances < 1 or testData.num_instances < 1):
			self.status = self.config.NOT_ENOUGH_DATA
			return False

		# If this is a prediction and there is a valid patient, change masterData header
		patientObj = self.buildPatientObject()
		patientInstance = None
		if ((patientObj is not None) and (self.predict == 1)):
			masterData = self.addPatientNominals(patientObj, masterData)
			patientInstance = self.createPatientInstance(patientObj, masterData)
			masterData.add_instance(patientInstance)

		elif (patientObj is None) and (self.predict == 1):
			print('No patient defined for prediction. Exiting')
			return True
		# Fix dataset headers up to match and fix instances to match headers
		masterData.delete()
		learner = masterData.copy_instances(masterData, 0, 0)
		test = masterData.copy_instances(masterData, 0, 0)
		self.addInstancesToDataset(learnerData, learner)
		self.addInstancesToDataset(testData, test)

		# Comparison of data for testing purposes
		# print 'learnerData'
		# print learnerData

		# print 'learner'
		# print learner

		# print 'testData'
		# print testData

		# print 'test'
		# print test

		# pdb.set_trace()
		# Instantiate classifier
		self.cls = Classifier(classname=self.classifier, options=self.parameters)

		# Run classifier
		self.cls.build_classifier(learner)
		# for index, inst in enumerate(learnerData):
			# prediction = self.cls.classify_instance(inst)
			# distribution = self.cls.distribution_for_instance(inst)

		# Test classifier
		evl = Evaluation(learner)
		evl.test_model(self.cls, test)

		# Store information about matrix
		self.acc = evl.percent_correct
		self.val = None

		# Convert numpy array into simple array
		confusionMatrix = []
		confusionMatrix.append([evl.confusion_matrix[0][0], evl.confusion_matrix[0][1]])
		confusionMatrix.append([evl.confusion_matrix[1][0], evl.confusion_matrix[1][1]])

		# Convert matrix into json format
		self.matrix = json.dumps(confusionMatrix)

		
		# print 'Classifier: ', self.classifier
		# print 'ID: ', self.questionID
		# print 'ACC: ', self.acc
		# print(evl.summary())

		# If this is a prediction... make the prediction
		if ((patientObj is not None) and (self.predict == 1)):
			masterData.add_instance(patientInstance)
			print "Running prediction on patient: "
			print masterData.get_instance(0)
			self.prediction = self.cls.classify_instance(masterData.get_instance(0))
			#self.uploadPrediction()

		# Temporarily store file to serialize to
		fileName = str(self.questionID) + self.algorithm + ".model"
		serialization.write(fileName, self.cls)

		# Open that file and store it
		self.model = None
		with open(fileName, 'rb') as f:
			self.model = f.read()

		# Remove temporary file
		os.remove(fileName)

		# Set status to awaiting feedback
		self.status = self.config.AWAITING_FEEDBACK_STATUS
		return True
Example #22
    def saveModel(self, model, method, mname):
        finalname = "%s_%s.model" % (method, mname)
        serialization.write(os.path.join(self.modelDir, finalname), model)
        logger.info('[%s] : [INFO] Saved model %s ',
                    datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), finalname)
Example #23
def call_weka(file_dir, ml_opt, ofile_dir):

    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(file_dir)
    data.class_is_last()
    filtered = data

    ml_id = ''
    if ml_opt != '0':
        if ml_opt == '1':
            classifier = Classifier(
                classname="weka.classifiers.functions.LibSVM",
                options=[
                    "-S", "0", "-K", "2", "-D", "3", "-G", "0.0", "-R", "0.0",
                    "-N", "0.5", "-M", "40.0", "-C", "1.0", "-E", "0.001",
                    "-P", "0.1", "-seed", "1"
                ])
            ml_id = 'SVM'
        elif ml_opt == '3':
            classifier = Classifier(
                classname="weka.classifiers.functions.MLPClassifier",
                options=[
                    '-N', '2', '-R', '0.01', '-O', '1.0E-6', '-P', '1', '-E',
                    '1', '-S', '1'
                ])
            ml_id = 'MLPC'
        elif ml_opt == '4':
            classifier = Classifier(
                classname="weka.classifiers.trees.RandomForest",
                options=["-I", "100", "-K", "0", "-S", "1", "-num-slots", "1"])
            ml_id = 'RF'
        elif ml_opt == '2':
            classifier = Classifier(classname="weka.classifiers.meta.Bagging",
                                    options=[
                                        "-P", "100", "-S", "1", "-I", "10",
                                        "-W", "weka.classifiers.trees.M5P",
                                        "--", "-M", "4.0"
                                    ])
            ml_id = 'BagM5P'
        elif ml_opt == '5':
            classifier = Classifier(classname="weka.classifiers.trees.J48",
                                    options=["-C", "0.25", "-M", "2"])
            ml_id = 'J48'
        elif ml_opt == '7':
            classifier = Classifier(
                classname="weka.classifiers.functions.RBFNetwork",
                options=[
                    "-B", "2", "-S", "1", "-R", "1.0E-8", "-M", "-1", "-W",
                    "0.1"
                ])
            ml_id = 'RBFNet'
        elif ml_opt == '8':
            classifier = Classifier(
                classname="weka.classifiers.bayes.BayesNet",
                options=[
                    "-D", "-Q", "weka.classifiers.bayes.net.search.local.K2",
                    "--", "-P", "1", "-S", "BAYES", "-E",
                    "weka.classifiers.bayes.net.estimate.SimpleEstimator",
                    "--", "-A", "0.5"
                ])
            ml_id = 'BayesNet'
        elif ml_opt == '6':
            classifier = Classifier(
                classname="weka.classifiers.bayes.NaiveBayes")
            ml_id = 'NaiveBayes'
        elif ml_opt == '9':
            classifier = Classifier(
                classname="weka.classifiers.functions.SimpleLogistic",
                options=["-I", "0", "-M", "500", "-H", "50", "-W", "0.0"])
            ml_id = 'LogReg'
        filtered.class_is_last()
        evaluation = Evaluation(filtered)
        evaluation.crossvalidate_model(classifier, filtered, 10, Random(42))
        print "Evaluation: Done."

        ofile = open(ofile_dir + ml_id + "_results.txt", 'wb')

        print >> ofile, evaluation.summary()
        print >> ofile, evaluation.class_details().encode('ascii', 'ignore')
        print >> ofile, evaluation.matrix().encode('ascii', 'ignore')
        serialization.write(ofile_dir + ml_id + ".model", classifier)
        print "Saving " + ml_id + " Model: Done."

        ofile.close()
Example #24
#     cl2 = clusterEM.cluster_instance(inst)
#     dist2 = clusterEM.distribution_for_instance(inst)
#     print ("cluster=" + str(cl2) + ", distribution=" + str(dist2))
#     print inst
#
clusterDBSCAN = Clusterer(
    classname="weka.clusterers.DBSCAN",
    options=[
        "-E", "0.9", "-M", "6", "-I",
        "weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase",
        "-D",
        "weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject"
    ])
clusterDBSCAN.build_clusterer(data)

serialization.write(os.path.join(modelDir, "dbscan.model"), clusterDBSCAN)
cluster = Clusterer(
    jobject=serialization.read(os.path.join(modelDir, "dbscan.model")))
# print clusterDBSCAN
# print clusterDBSCAN.number_of_clusters
for inst in data:
    cl3 = cluster.cluster_instance(inst)
    dist3 = cluster.distribution_for_instance(inst)
    print(("cluster=" + str(cl3) + ", distribution=" + str(dist3)))

# for inst in data:
#     cl3 = clusterDBSCAN.cluster_instance(inst)
#     dist3 = clusterDBSCAN.distribution_for_instance(inst)
#     print ("cluster=" + str(cl3) + ", distribution=" + str(dist3))
jvm.stop()
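# Note (an assumption): weka.clusterers.DBSCAN comes from the optics_dbScan
# Weka package, so the JVM must be started with package support, e.g.
# jvm.start(packages=True).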
Example #25
jvm.start(max_heap_size="4g",packages=True)


Wtrain = converters.load_any_file("train.csv")
Wtest = converters.load_any_file("test.csv")
Wtrain.class_is_last()
Wtest.class_is_last()



if Path('lmt.model').exists():
    lmt = Classifier(jobject=serialization.read("lmt.model"))
else:
    lmt = Classifier(classname="weka.classifiers.trees.LMT")
    lmt.build_classifier(Wtrain)
    serialization.write("lmt.model", lmt)

evlmt = Evaluation(Wtrain)
evlmt.crossvalidate_model(lmt, Wtrain, 5, Random(1))



print("Error is",evlmt.error_rate)
cm2e = evlmt.confusion_matrix
cm2E = pd.DataFrame(cm2e, index = ["neg","pos"],columns = ["neg","pos"])
plt.figure(figsize = (7,7))
axis = sns.heatmap(cm2E, annot=True, cbar=False, cmap="Reds")
plcls.plot_roc(evlmt,class_index=[1])


tevlmt = Evaluation(Wtrain)
Example #26
def save_model_weka(classifier, path):
    serialization.write(path, classifier)
Example #27
    def runner(self, cdat, heap_size = 16384, seed = None, verbose = True):
        self.set_status(Pipeline.RUNNING)

        self.logs.append('Initializing Pipeline')

        para = self.config

        self.logs.append('Reading Pipeline Configuration')

        head = ''
        name = get_rand_uuid_str()

        self.logs.append('Reading Input File')

        for i, stage in enumerate(self.stages):
            if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
                self.stages[i].status = Pipeline.RUNNING
            if stage.code ==  'dat.fle':
                head    = os.path.abspath(stage.value.path)
                name, _ = os.path.splitext(stage.value.name)

        self.logs.append('Parsing to ARFF')

        path = os.path.join(head, '{name}.arff'.format(name = name))
        # Workaround for an unexplained bug: using Config.schema instead.
        # cdat.toARFF(path, express_config = para.Preprocess.schema, verbose = verbose)

        for i, stage in enumerate(self.stages):
            if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
                self.stages[i].status = Pipeline.COMPLETE

        self.logs.append('Saved ARFF at {path}'.format(path = path))
        self.logs.append('Splitting to Training and Testing Sets')

        JVM.start(max_heap_size = '{size}m'.format(size = heap_size))

        load = Loader(classname = 'weka.core.converters.ArffLoader')
        # data = load.load_file(path)
        # save =  Saver(classname = 'weka.core.converters.ArffSaver')
        data = load.load_file(os.path.join(head, 'iris.arff')) # For Debugging Purposes Only
        data.class_is_last() # For Debugging Purposes Only
        # data.class_index = cdat.iclss

        for i, stage in enumerate(self.stages):
            if stage.code == 'prp.kcv':
                self.stages[i].status = Pipeline.RUNNING

        self.logs.append('Splitting Training Set')

        # TODO - Check if this seed is worth it.
        seed = assign_if_none(seed, random.randint(0, 1000))
        opts = ['-S', str(seed), '-N', str(para.Preprocess.FOLDS)]
        wobj = Filter(classname = 'weka.filters.supervised.instance.StratifiedRemoveFolds', options = opts + ['-V'])
        wobj.inputformat(data)

        tran = wobj.filter(data)

        self.logs.append('Splitting Testing Set')

        wobj.options = opts
        test = wobj.filter(data)

        for i, stage in enumerate(self.stages):
            if stage.code == 'prp.kcv':
                self.stages[i].status = Pipeline.COMPLETE

        self.logs.append('Performing Feature Selection')

        feat = [ ]
        for comb in para.FEATURE_SELECTION:
            if comb.USE:
                for i, stage in enumerate(self.stages):
                    if stage.code == 'ats':
                        search    = stage.value.search.name
                        evaluator = stage.value.evaluator.name

                        if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                            self.stages[i].status = Pipeline.RUNNING

                srch = ASSearch(classname = 'weka.attributeSelection.{classname}'.format(
                    classname = comb.Search.NAME,
                    options   = assign_if_none(comb.Search.OPTIONS, [ ])
                ))
                ewal = ASEvaluation(classname = 'weka.attributeSelection.{classname}'.format(
                    classname = comb.Evaluator.NAME,
                    options   = assign_if_none(comb.Evaluator.OPTIONS, [ ])
                ))

                attr = AttributeSelection()
                attr.search(srch)
                attr.evaluator(ewal)
                attr.select_attributes(tran)

                meta = addict.Dict()
                meta.search    = comb.Search.NAME
                meta.evaluator = comb.Evaluator.NAME
                meta.features  = [tran.attribute(index).name for index in attr.selected_attributes]

                feat.append(meta)

                for i, stage in enumerate(self.stages):
                    if stage.code == 'ats':
                        search    = stage.value.search.name
                        evaluator = stage.value.evaluator.name

                        if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                            self.stages[i].status = Pipeline.COMPLETE

        models = [ ]
        for model in para.MODEL:
            if model.USE:
                summary         = addict.Dict()

                self.logs.append('Modelling {model}'.format(model = model.LABEL))

                summary.label   = model.LABEL
                summary.name    = model.NAME
                summary.options = assign_if_none(model.OPTIONS, [ ])

                for i, stage in enumerate(self.stages):
                    if stage.code == 'lrn' and stage.value.name == model.NAME:
                        self.stages[i].status = Pipeline.RUNNING

                for i, instance in enumerate(data):
                    iclass = list(range(instance.num_classes))
                
                options    = assign_if_none(model.OPTIONS, [ ])
                classifier = Classifier(classname = 'weka.classifiers.{classname}'.format(classname = model.NAME), options = options)
                classifier.build_classifier(tran)
        
                serializer.write(os.path.join(head, '{name}.{classname}.model'.format(
                        name = name,
                    classname = model.NAME
                )), classifier)

                self.logs.append('Testing model {model}'.format(model = model.LABEL))

                evaluation       = Evaluation(tran)
                evaluation.test_model(classifier, test)

                summary.summary  = evaluation.summary()

                frame  = pd.DataFrame(data = evaluation.confusion_matrix)
                axes   = sns.heatmap(frame, cbar = False, annot = True)
                b64str = get_b64_plot(axes)
                
                summary.confusion_matrix = addict.Dict({
                    'value': evaluation.confusion_matrix.tolist(),
                     'plot': b64str
                })

                self.logs.append('Plotting Learning Curve for {model}'.format(model = model.LABEL))

                buffer = io.BytesIO()
                plot_classifier_errors(evaluation.predictions, tran, test, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.learning_curve   = b64str

                buffer = io.BytesIO()
                plot_roc(evaluation, class_index = iclass, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.roc_curve        = b64str

                buffer = io.BytesIO()
                plot_prc(evaluation, class_index = iclass, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.prc_curve        = b64str

                if classifier.graph:
                    summary.graph = classifier.graph

                for i, instance in enumerate(test):
                    prediction = classifier.classify_instance(instance)

                for i, stage in enumerate(self.stages):
                    if stage.code == 'lrn' and stage.value.name == model.NAME:
                        self.stages[i].status = Pipeline.COMPLETE

                models.append(summary)

        self.gist.models = models

        JVM.stop()

        JSON.write(os.path.join(head, '{name}.cgist'.format(name = name)), self.gist)

        self.logs.append('Pipeline Complete')

        self.set_status(Pipeline.COMPLETE)
Example #28
def save_model(model, fname, dir_name):
    outfile = realpath(join(dir_name, fname))  # "fname.model"
    serialization.write(outfile, model)
Example #29
def train(request):

    jvm.start()

    d_att1 = Attribute.create_numeric("bodydearword.feature")
    d_att2 = Attribute.create_numeric("bodyform.feature")
    d_att3 = Attribute.create_numeric("bodyhtml.feature")
    d_att4 = Attribute.create_numeric("bodymultipart.feature")
    d_att5 = Attribute.create_numeric("bodynumchars.feature")
    d_att6 = Attribute.create_numeric("bodynumfunctionwords.feature")
    d_att7 = Attribute.create_numeric("bodynumuniqwords.feature")
    d_att8 = Attribute.create_numeric("bodynumwords.feature")
    d_att9 = Attribute.create_numeric("bodyrichness.feature")
    d_att10 = Attribute.create_numeric("bodysuspensionword.feature")
    d_att11 = Attribute.create_numeric("bodyverifyyouraccountphrase.feature")
    d_att12 = Attribute.create_numeric("externalsabinary.feature")
    d_att13 = Attribute.create_numeric("externalsascore.feature")
    d_att14 = Attribute.create_numeric("scriptjavascript.feature")
    d_att15 = Attribute.create_numeric("scriptonclick.feature")
    d_att16 = Attribute.create_numeric("scriptpopup.feature")
    d_att17 = Attribute.create_numeric("scriptstatuschange.feature")
    d_att18 = Attribute.create_numeric("scriptunmodalload.feature")
    d_att19 = Attribute.create_numeric("senddiffreplyto.feature")
    d_att20 = Attribute.create_numeric("sendnumwords.feature")
    d_att21 = Attribute.create_numeric("sendunmodaldomain.feature")
    d_att22 = Attribute.create_numeric("subjectbankword.feature")
    d_att23 = Attribute.create_numeric("subjectdebitword.feature")
    d_att24 = Attribute.create_numeric("subjectfwdword.feature")
    d_att25 = Attribute.create_numeric("subjectnumchars.feature")
    d_att26 = Attribute.create_numeric("subjectnumwords.feature")
    d_att27 = Attribute.create_numeric("subjectreplyword.feature")
    d_att28 = Attribute.create_numeric("subjectrichness.feature")
    d_att29 = Attribute.create_numeric("subjectverifyword.feature")
    d_att30 = Attribute.create_numeric("urlatchar.feature")
    d_att31 = Attribute.create_numeric("urlbaglink.feature")
    d_att32 = Attribute.create_numeric("urlip.feature")
    d_att33 = Attribute.create_numeric("urlnumdomains.feature")
    d_att34 = Attribute.create_numeric("urlnumexternallink.feature")
    d_att35 = Attribute.create_numeric("urlnumimagelink.feature")
    d_att36 = Attribute.create_numeric("urlnuminternallink.feature")
    d_att37 = Attribute.create_numeric("urlnumip.feature")
    d_att38 = Attribute.create_numeric("urlnumlink.feature")
    d_att39 = Attribute.create_numeric("urlnumperiods.feature")
    d_att40 = Attribute.create_numeric("urlnumport.feature")
    d_att41 = Attribute.create_numeric("urlport.feature")
    d_att42 = Attribute.create_numeric("urltwodoains.feature")
    d_att43 = Attribute.create_numeric("urlunmodalbaglink.feature")
    d_att44 = Attribute.create_numeric("urlwordclicklink.feature")
    d_att45 = Attribute.create_numeric("urlwordherelink.feature")
    d_att46 = Attribute.create_numeric("urlwordloginlink.feature")
    d_att47 = Attribute.create_numeric("urlwordupdatelink.feature")
    d_att48 = Attribute.create_nominal("class", {'phish', 'ham'})
    #
    data_dir = settings.BASE_DIR + "/phishing/public/datasets/"
    #
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_dir + "dataset.arff")
    data.class_is_last()
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.options = ["-C", "0.3"]
    cls.build_classifier(data)

    serialization.write(data_dir + "out.model", cls)
    classifier = Classifier(jobject=serialization.read(data_dir + "out.model"))

    dataset = Instances.create_instances("test", [
        d_att1, d_att2, d_att3, d_att4, d_att5, d_att6, d_att7, d_att8, d_att9,
        d_att10, d_att11, d_att12, d_att13, d_att14, d_att15, d_att16, d_att17,
        d_att18, d_att19, d_att20, d_att21, d_att22, d_att23, d_att24, d_att25,
        d_att26, d_att27, d_att28, d_att29, d_att30, d_att31, d_att32, d_att33,
        d_att34, d_att35, d_att36, d_att37, d_att38, d_att39, d_att40, d_att41,
        d_att42, d_att43, d_att44, d_att45, d_att46, d_att47, d_att48
    ], 0)
    values = [
        0, 0, 0, 0, 890, 1, 124, 198, 0.22247191011236, 0, 0, 0, 0.0, 0, 0, 0,
        0, 0, 1, 4, 0, 0, 0, 0, 21, 4, 1, 0.19047619047619, 0, 0, 0, 0, 2, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        Instance.missing_value()
    ]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    dataset.class_is_last()
    # print(str(dataset))
    var = ''
    for inst1 in dataset:
        pred = classifier.classify_instance(inst1)
        var = inst1.class_attribute.value(int(pred))
        if var == 'ham':
            print('Not phishing')
            # do something
        else:
            print('Phishing')
            # do something

        print(var)

    jvm.stop()

    return HttpResponse(str(var))
Example #30
jvm.start()

# `eca` below is the dataset loaded earlier (not shown in this extract)
# run the clustering, varying from 1 to 9 clusters
for i in range(1, 10):
    print('************** Number of clusters: ' + str(i))
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                          options=["-N", str(i)])
    clusterer.build_clusterer(eca)
    print(clusterer)

clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                      options=["-N", "4"])
clusterer.build_clusterer(eca)
print(clusterer)
serialization.write("model/kmeans_eca_reprovacao.model", clusterer)

# read the model back
'''objects = serialization.read_all("cluster.model")
clusterer = Clusterer(jobject=objects[0])

data_aluno = loader.load_file("aluno_temp.csv")
for instancia in data_aluno:
    resultado = clusterer.cluster_instance(instancia)
    print('The student belongs to cluster: ' + str(resultado))'''
"""
for inst in data:
    cl = clusterer.cluster_instance(inst)  # 0-based cluster index
    dist = clusterer.distribution_for_instance(inst)   # cluster membership distribution

    print("cluster=" + str(cl) + ", distribution=" + str(dist))
# assumed preamble (cut off in the extract): `loader` and `dataSet20x20` are
# used below but not defined in the snippet
loader = Loader(classname="weka.core.converters.ArffLoader")
dataSet20x20 = loader.load_file("trainingSet/dataSet20x20.arff")
dataSet20x20.class_is_last()

dataSet20x50 = loader.load_file("trainingSet/dataSet20x50.arff")
dataSet20x50.class_is_last()

dataSet50x20 = loader.load_file("trainingSet/dataSet50x20.arff")
dataSet50x20.class_is_last()

classifier1 = Classifier(classname="weka.classifiers.functions.MultilayerPerceptron", options=["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "9"])
classifier2 = Classifier(classname="weka.classifiers.functions.MultilayerPerceptron", options=["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "11"])
classifier3 = Classifier(classname="weka.classifiers.functions.MultilayerPerceptron", options=["-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E", "20", "-H", "9"])

print "\n\nTraining neural network 1"
evaluation1 = Evaluation(dataSet20x20)                   
evaluation1.crossvalidate_model(classifier1, dataSet20x20, 10, Random(42))
classifier1.build_classifier(dataSet20x20)
serialization.write("trainingSet/nn1.model", classifier1)
print "\n\n====================================================== NUERAL NETWORK 1 ======================================================"
print(evaluation1.summary())
print(evaluation1.class_details())

print "Training neural network 2"
evaluation2 = Evaluation(dataSet20x50) 
evaluation2.crossvalidate_model(classifier2, dataSet20x50, 10, Random(42))
classifier2.build_classifier(dataSet20x50)
serialization.write("trainingSet/nn2.model", classifier2)
print "\n\n====================================================== NUERAL NETWORK 2 ======================================================"
print(evaluation2.summary())
print(evaluation2.class_details())

print "Training neural network 3"
evaluation3 = Evaluation(dataSet50x20) 