Example #1
def __build_kernel_classifier(algorithm_name,
                              kernel_name,
                              data,
                              result_dest=None):
    """
    Function for building kernel clasifier based on arguments we send to function.
    algorithm_name is for example JRip or Logistic or RandomForest...
    algorithm_path is for example weka.classifiers.rules.JRip, or 
    weka.classifiers.trees.RandomForest, ... 
    
    Kernel name for now will be the same as algorithm name. Later when we 
    will want to use different kernels that needs to be changed.

    :param algorithm_name: string
    :param kernel_name: string
    :param algorithm_path: string
    :param data: weka arff data
    :param result_dest: results destination
    :return: None
    """
    args_cls, _sufix_cls = parsers_dict[algorithm_name]()
    args_ker, _sufix_ker = kernel_parsers_dict[kernel_name]()
    kernel = Kernel(classname=kernel_path_dict[kernel_name],
                    options=args_to_weka_options(args_ker, _sufix_ker))
    classifier = Classifier(classname=algorithms_path_dict[algorithm_name],
                            options=args_to_weka_options(args_cls, _sufix_cls))
    classifier.kernel = kernel
    classifier.build_classifier(data)
    evaluation = evaluate(classifier, data)

    if result_dest:
        with open(result_dest, 'a') as file:
            file.write(
                __print_algorithm_header(classifier.to_commandline(),
                                         __get_header_of_data(data),
                                         algorithm_name))
            file.write(str(classifier))
            file.write(evaluation.summary())


    else:
        print(
            __print_algorithm_header(classifier.to_commandline(),
                                     __get_header_of_data(data),
                                     algorithm_name))
        print(classifier)
        print(evaluation.summary())
    return evaluation.percent_correct
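This helper leans on module-level registries (parsers_dict, kernel_parsers_dict, algorithms_path_dict, kernel_path_dict) and helpers (evaluate, __print_algorithm_header, __get_header_of_data) that the snippet does not show. A hypothetical call from the same module, assuming those registries are populated, "SMO" is a registered key, and the JVM is already running:

from weka.core.converters import Loader

loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file("dataset.arff")  # path is an assumption
data.class_is_last()
accuracy = __build_kernel_classifier("SMO", "SMO", data, result_dest="results.txt")
print("accuracy: %0.2f%%" % accuracy)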
Example #2
def __build_classifier(algorithm_name, data, result_dest=None):
    """
    Function for building clasifier based on arguments we send to function.
    algorithm_name is for example JRip or Logistic or RandomForest...
    algorithm_path is for example weka.classifiers.rules.JRip, or 
    weka.classifiers.trees.RandomForest, ...

    :param algorithm_name: string
    :param algorithm_path: string
    :param data: weka arff data
    :param result_dest: results destination
    :return: None
    """
    args, _sufix = parsers_dict[algorithm_name]()
    classifier = Classifier(classname=algorithms_path_dict[algorithm_name],
                            options=args_to_weka_options(args, _sufix))
    classifier.build_classifier(data)
    evaluation = evaluate(classifier, data)

    if result_dest:
        with open(result_dest, 'a') as file:
            file.write(
                __print_algorithm_header(classifier.to_commandline(),
                                         __get_header_of_data(data),
                                         algorithm_name))
            file.write(str(classifier))
            file.write(evaluation.summary())


    else:
        print(
            __print_algorithm_header(classifier.to_commandline(),
                                     __get_header_of_data(data),
                                     algorithm_name))
        print(classifier)
        print(evaluation.summary())
    return evaluation.percent_correct
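Both builders call an evaluate() helper that is not included in the snippets. A minimal sketch of what it is assumed to do, cross-validating with python-weka-wrapper's Evaluation (the fold count and seed are assumptions):

from weka.classifiers import Evaluation
from weka.core.classes import Random

def evaluate(classifier, data, folds=10, seed=1):
    """Cross-validate the classifier on the data and return the Evaluation."""
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, folds, Random(seed))
    return evaluation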
Example #3
def DecisionTree(rnd_data, folds, seed, data):

    data_size = rnd_data.num_instances
    fold_size = math.floor(data_size / folds)

    # cross-validation
    evaluation = Evaluation(rnd_data)
    for i in range(folds):
        this_fold = fold_size
        test_start = i * fold_size
        test_end = (test_start + fold_size)
        if ((data_size - test_end) / fold_size < 1):
            this_fold = data_size - test_start
        test = Instances.copy_instances(rnd_data, test_start,
                                        this_fold)  # generate validation fold
        if i == 0:
            train = Instances.copy_instances(rnd_data, test_end,
                                             data_size - test_end)
        else:
            train_1 = Instances.copy_instances(rnd_data, 0, test_start)
            train_2 = Instances.copy_instances(rnd_data, test_end,
                                               data_size - test_end)
            train = Instances.append_instances(
                train_1, train_2)  # generate training fold

        # build and evaluate classifier
        cls = Classifier(classname="weka.classifiers.trees.J48")
        cls.build_classifier(train)  # build classifier on training set
        evaluation.test_model(cls,
                              test)  # test classifier on validation/test set

    print("")
    print("=== Decision Tree ===")
    print("Classifier: " + cls.to_commandline())
    print("Dataset: " + data.relationname)
    print("Folds: " + str(folds))
    print("Seed: " + str(seed))
    print("")
    print(
        evaluation.summary("=== " + str(folds) + "-fold Cross-Validation ==="))
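DecisionTree expects rnd_data to be randomized up front (and stratified for nominal classes). A hypothetical driver, with the dataset path as an assumption:

import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.core.classes import Random
from weka.core.dataset import Instances

jvm.start()
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file("vote.arff")  # path is an assumption
data.class_is_last()
rnd_data = Instances.copy_instances(data)
rnd_data.randomize(Random(1))
if rnd_data.class_attribute.is_nominal:
    rnd_data.stratify(10)
DecisionTree(rnd_data, 10, 1, data)
jvm.stop()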
Example #4
# setup required by the snippet below
import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.classifiers import Classifier, Evaluation

jvm.start()
loader = Loader(classname="weka.core.converters.ArffLoader")

trainData = loader.load_file('segment-challenge.arff')
trainData.class_is_last()
testData = loader.load_file('segment-test.arff')
testData.class_is_last()

# Default C4.5 tree
classifier = Classifier(classname="weka.classifiers.trees.J48")

# Build the classifier on the training data
classifier.build_classifier(trainData)

print("\n\n=========== Classifier information ================\n\n")
print(classifier.options)
print(classifier)

print("\n\n=========== Train results ================\n\n")
evaluation = Evaluation(trainData)
evaluation.test_model(classifier, trainData)
print(classifier.to_commandline())
print(evaluation.matrix())
print("Train recognition: %0.2f%%" % evaluation.percent_correct)

print("\n\n=========== Test results ================\n\n")
evaluation = Evaluation(testData)
evaluation.test_model(classifier, testData)
print(classifier.to_commandline())
print(evaluation.matrix())
print("Test recognition: %0.2f%%" % evaluation.percent_correct)

jvm.stop()
Example #5
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline,
                                  classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.RBFKernel",
        options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO",
                                  options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
    # property of the J48 classifier itself. However, being of type float rather than double, we need
    # to convert it to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", types.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48",
                            options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0,
                                         Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(
        classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())
    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())
    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText",
        options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier,
                                   diabetes_data,
                                   10,
                                   Random(42),
                                   output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " +
          str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " +
          str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " +
          str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " +
          str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " +
          str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " +
          str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " +
          str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " +
          str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " +
          str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " +
          str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " +
          str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " +
          str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " +
          str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    plot_cls.plot_roc(evaluation,
                      title="ROC diabetes",
                      class_index=range(
                          0, diabetes_data.class_attribute.num_values),
                      wait=False)
    plot_cls.plot_prc(evaluation,
                      title="PRC diabetes",
                      class_index=range(
                          0, diabetes_data.class_attribute.num_values),
                      wait=False)

    # train 2nd classifier on diabetes dataset
    classifier2 = Classifier(classname="weka.classifiers.trees.RandomForest")
    evaluation2 = Evaluation(diabetes_data)
    evaluation2.crossvalidate_model(classifier2, diabetes_data, 10, Random(42))
    plot_cls.plot_rocs({
        "NB": evaluation,
        "RF": evaluation2
    },
                       title="ROC diabetes",
                       class_index=0,
                       wait=False)
    plot_cls.plot_prcs({
        "NB": evaluation,
        "RF": evaluation2
    },
                       title="PRC diabetes",
                       class_index=0,
                       wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(
            str(index + 1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # train 2nd classifier and show errors in same plot
    classifier2 = Classifier(classname="weka.classifiers.functions.SMOreg")
    evaluation2 = Evaluation(bolts_data)
    evaluation2.crossvalidate_model(classifier2, bolts_data, 10, Random(42))
    plot_cls.plot_classifier_errors(
        {
            "LR": evaluation.predictions,
            "SMOreg": evaluation2.predictions
        },
        wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    ]
    plot_cls.plot_learning_curve(cls,
                                 diabetes_data,
                                 increments=0.05,
                                 label_template="[#] !",
                                 metric="percent_correct",
                                 wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()

    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
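main() assumes a running JVM; python-weka-wrapper example scripts conventionally wrap it like this:

import traceback
import weka.core.jvm as jvm

if __name__ == "__main__":
    try:
        jvm.start()
        main()
    except Exception:
        print(traceback.format_exc())
    finally:
        jvm.stop()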
Example #6
    def perceptron_classifier(cls, features, settings):
        # load the dataset
        loader = Loader("weka.core.converters.ArffLoader")
        instancias = loader.load_file(
            "./src/results/caracteristicas_sounds.arff")
        # mark the last attribute as the class
        instancias.class_is_last()
        # define the parameters
        learning_rate = str(settings['learningRate'])
        training_time = str(settings['trainingTime'])
        momentum = "0.2"
        hidden_layers = "a"
        seed = 2
        cross_validation = 20
        print('Learning Rate', learning_rate)
        print('Training Time', training_time)
        # load the Multilayer Perceptron classifier with the defined parameters
        classifier = Classifier(
            classname="weka.classifiers.functions.MultilayerPerceptron",
            options=[
                "-L", learning_rate, "-M", momentum, "-N", training_time, "-V",
                "0", "-S",
                str(seed), "-E", "20", "-H", hidden_layers
            ])
        # build the classifier and set up the evaluation
        classifier.build_classifier(instancias)
        evaluation = Evaluation(instancias)
        # apply cross-validation
        rnd = Random(seed)
        rand_data = Instances.copy_instances(instancias)
        rand_data.randomize(rnd)
        if rand_data.class_attribute.is_nominal:
            rand_data.stratify(cross_validation)
        for i in range(cross_validation):
            # training instances for this fold (use the randomized, stratified copy)
            train = rand_data.train_cv(cross_validation, i)
            # test instances for this fold
            test = rand_data.test_cv(cross_validation, i)

            # build and evaluate the classifier
            cls = Classifier.make_copy(classifier)
            cls.build_classifier(train)
            evaluation.test_model(cls, test)
        # create a new instance from the extracted features
        new_instance = Instance.create_instance(features)
        # add the new instance to the dataset
        instancias.add_instance(new_instance)
        # attach the new instance to a compatible dataset header so the classifier can interpret it
        new_instance.dataset = train
        # classify the new instance, yielding the probability of each class
        classification = classifier.distribution_for_instance(new_instance)
        result = {
            'cat': round(classification[0] * 100, 2),
            'dog': round(classification[1] * 100, 2)
        }
        print("=== Setup ===")
        print("Classifier: " + classifier.to_commandline())
        print("Dataset: " + instancias.relationname)
        print("Cross Validation: " + str(cross_validation) + "folds")
        print("Seed: " + str(seed))
        print("")
        print(
            evaluation.summary("=== " + str(cross_validation) +
                               " -fold Cross-Validation ==="))
        print("Classificação", " - Gato: ", result['cat'], "  Cachorro: ",
              result['dog'])

        return result
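A hypothetical invocation, assuming perceptron_classifier is a classmethod of some enclosing class (here called SoundClassifier, a made-up name) and that features matches the attributes of caracteristicas_sounds.arff minus the class:

settings = {'learningRate': 0.3, 'trainingTime': 500}  # values are assumptions
features = [0.12, 0.34, 0.56]  # placeholder vector; length must match the dataset
result = SoundClassifier.perceptron_classifier(features, settings)
print(result)  # dict with 'cat' and 'dog' percentages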
Example #7
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline, classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel", options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
    # property of the J48 classifier itself. However, being of type float rather than double, we need
    # to convert it to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", typeconv.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0, Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())
    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())
    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " + str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " + str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " + str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " + str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " + str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " + str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " + str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " + str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " + str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " + str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " + str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " + str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " + str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    plot_cls.plot_roc(
        evaluation, title="ROC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)
    plot_cls.plot_prc(
        evaluation, title="PRC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(str(index+1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")]
    plot_cls.plot_learning_curve(
        cls, diabetes_data, increments=0.05, label_template="[#] !", metric="percent_correct", wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()

    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
Example #8
        DFC = pd.read_csv('C:/PythonProjects/AgentsTurboFan/Test/Test4/Agent' +
                          l + '.csv',
                          delimiter=",")
        for a in range(len(DFC)):
            classvar = DFC.iloc[a, len(DFC.columns) - 1]
            classvarStr = str(classvar)  # ends up holding the last row's class value

        print('classvarStr :', classvarStr)
        print('isreal(classvarStr) :', isreal(classvarStr))

        if isreal(classvarStr):

            classifier = Classifier(classname="weka.classifiers.trees.M5P",
                                    options=["-U", "-M", "500.0"])
            print("\n--> building:")
            print(classifier.to_commandline())
            classifier.build_classifier(dataA)
            print("\n--> classifier:\n")
            print(classifier)
            print("\n--> graph:\n")
            print(classifier.graph)

            outputfile = helper.get_tmp_dir() + "/result.csv"
            output = PredictionOutput(
                classname='weka.classifiers.evaluation.output.prediction.CSV',
                options=["-distribution", "-suppress", "-file", outputfile])
            print("\n--> Output:\n")
            output.header = dataA
            output.print_all(classifier, dataA)
            helper.print_info("Predictions stored in:" + outputfile)
            print(output.buffer_content())
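The isreal() helper is referenced but not shown; a plausible sketch that checks whether the class value parses as a number:

def isreal(value):
    """Return True if the string can be parsed as a float."""
    try:
        float(value)
        return True
    except ValueError:
        return False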
Example #9
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    data_file = helper.get_data_dir() + os.sep + "vote.arff"
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # classifier
    classifier = Classifier(classname="weka.classifiers.trees.J48")

    # randomize data
    folds = 10
    seed = 1
    rnd = Random(seed)
    rand_data = Instances.copy_instances(data)
    rand_data.randomize(rnd)
    if rand_data.class_attribute.is_nominal:
        rand_data.stratify(folds)

    # perform cross-validation and add predictions
    predicted_data = None
    evaluation = Evaluation(rand_data)
    for i in range(folds):
        train = rand_data.train_cv(folds, i)
        # the above code is used by the StratifiedRemoveFolds filter,
        # the following code is used by the Explorer/Experimenter
        # train = rand_data.train_cv(folds, i, rnd)
        test = rand_data.test_cv(folds, i)

        # build and evaluate classifier
        cls = Classifier.make_copy(classifier)
        cls.build_classifier(train)
        evaluation.test_model(cls, test)

        # add predictions
        addcls = Filter(
            classname="weka.filters.supervised.attribute.AddClassification",
            options=["-classification", "-distribution", "-error"])
        # setting the classifier as a Java object directly avoids issues with correct quoting in the option array
        addcls.set_property("classifier", Classifier.make_copy(classifier))
        addcls.inputformat(train)
        addcls.filter(train)  # trains the classifier
        pred = addcls.filter(test)
        if predicted_data is None:
            predicted_data = Instances.template_instances(pred, 0)
        for n in range(pred.num_instances):
            predicted_data.add_instance(pred.get_instance(n))

    print("")
    print("=== Setup ===")
    print("Classifier: " + classifier.to_commandline())
    print("Dataset: " + data.relationname)
    print("Folds: " + str(folds))
    print("Seed: " + str(seed))
    print("")
    print(evaluation.summary("=== " + str(folds) + "-fold Cross-Validation ==="))
    print("")
    print(predicted_data)
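The collected predictions are a regular Instances object, so they can be saved like any dataset; a hypothetical follow-up (output path is an assumption):

from weka.core.converters import Saver

saver = Saver(classname="weka.core.converters.ArffSaver")
saver.save_file(predicted_data, "vote_predictions.arff")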
Example #10
    test = rand_data.test_cv(folds, i)

    # build and evaluate classifier
    cls = Classifier.make_copy(classifier)
    cls.build_classifier(train)
    evaluation.test_model(cls, test)

    # add predictions
    addcls = Filter(
        classname="weka.filters.supervised.attribute.AddClassification",
        options=["-classification", "-distribution", "-error"])
    # setting the classifier as a Java object directly avoids issues with correct quoting in the option array
    addcls.set_property("classifier", Classifier.make_copy(classifier))
    addcls.inputformat(train)
    addcls.filter(train)  # trains the classifier
    pred = addcls.filter(test)
    if predicted_data is None:
        predicted_data = Instances.template_instances(pred, 0)
    for n in range(pred.num_instances):
        predicted_data.add_instance(pred.get_instance(n))

print("")
print("=== Setup ===")
print("Classifier: " + classifier.to_commandline())
print("Dataset: " + data.relationname)
print("Folds: " + str(folds))
print("Seed: " + str(seed))
print("")
print(evaluation.summary("=== " + str(folds) + "-fold Cross-Validation ==="))
print("")
print(predicted_data)
Example #11
class Experiment:
	data = None
	class_index = -1
	classifier = None
	attrs = []

	def __init__(self):
#		jvm.start(max_heap_size="2500M")
		pass

	def out(self, x):
		print(str(x).encode('ascii', 'ignore').decode('ascii'))

	def loadCSV(self, filename, path='/home/sbiastoch/Schreibtisch/csv_files/'):
		weka_loader = Loader(classname="weka.core.converters.CSVLoader")
		self.data = weka_loader.load_file(path+filename)

	def setClassIndex(self, index):
		if index < 0:
			self.data.class_index = self.data.num_attributes + index
		else:
			self.data.class_index = index

	def train_J48(self, min_per_rule=20):
		params = [
			'-C','0.3',
			'-M',str(min_per_rule),
	#		'-N',str(folds),
	#		'-R',
		]
		self.base_classifier = Classifier(classname='weka.classifiers.trees.J48', options=params)
		self._train()

	def train_JRip(self, min_per_rule=20, optimizations=2, folds=3, seed=42):
		params = [
			'-F', str(folds), # folds
			'-N', str(min_per_rule), # min elements per rule
			'-O', str(optimizations), # optimizations
			'-S', str(seed) #seed
		] 
		self.base_classifier = Classifier(classname='weka.classifiers.rules.JRip', options=params)
		self._train()

	def _train(self):
		params = [
			'-F','weka.filters.unsupervised.attribute.RemoveByName -E ^('+'|'.join(self.attrs)+')$ -V',
			'-W', self.base_classifier.classname, '--',
			]
		params.extend(self.base_classifier.options)


#		self.classifier = Classifier(classname='weka.classifiers.meta.FilteredClassifier', options=params)
		self.classifier = FilteredClassifier(options=params)
	#	self.classifier.filter(Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=['-E','^('+'|'.join(self.attrs)+')$','-V']))
		self.classifier.build_classifier(self.data)
		self.out(str(self.classifier).split("\n")[-2])

	def test(self, folds = 10):
		evaluation = Evaluation(self.data)                     # initialize with priors
		evaluation.crossvalidate_model(self.classifier, self.data, folds, Random(42))  # 10-fold CV
		print('Total number of instances: '+str(evaluation.num_instances)+'.')
		print(str(round(evaluation.percent_correct,2))+'% / '+str(round(evaluation.correct, 2))+' correct.')
		print(str(round(evaluation.percent_incorrect,2))+'% / '+str(round(evaluation.incorrect, 2))+' incorrect.')
		
	def saveCSV(self, filename, path='/home/sbiastoch/Schreibtisch/csv_files/'):
		saver = Saver(classname="weka.core.converters.CSVSaver")
		saver.save_file(self.data, path+filename)

	def loadClassifier(self, filename, path='/home/sbiastoch/Schreibtisch/classifiers/'):
		objects = serialization.read_all(path+filename)
		self.classifier = Classifier(jobject=objects[0])
		#self.data = Instances(jobject=objects[1])

	def saveClassifier(self, filename, path='/home/sbiastoch/Schreibtisch/classifiers/'):
		serialization.write_all(path+filename, [self.classifier, Instances.template_instances(self.data)])


	def remove_correct_classified(self, invert=False):
		options = [
			'-W', self.classifier.to_commandline(),
			'-C', str(self.class_index), # class index
	#		'-F','0', # folds
	#		'-T','0.1', # threshold for numeric classes
			'-I','0', # max iterations
		]
		if not invert:
			options.append('-V') # invert selection; appended conditionally so no empty-string option reaches Weka
		classname = "weka.filters.unsupervised.instance.RemoveMisclassified"
		remove = Filter(classname=classname, options=options)
		remove.inputformat(self.data)
		self.data = remove.filter(self.data)

	def remove_incorrect_classified(self):
		self.remove_correct_classified(True)

	def set_attributes(self, attrs):
		self.attrs = attrs

	def select_missclassified(self):
		remove = Filter(classname="weka.filters.supervised.attribute.AddClassification", options=['-classification' ,'-error' ,'-W' ,self.base_classifier.to_commandline()])
		remove.inputformat(self.data)
		self.data = remove.filter(self.data)

		remove = Filter(classname="weka.filters.unsupervised.instance.RemoveWithValues", options=['-S','0.0','-C','last','-L','last','-V'])
		remove.inputformat(self.data)

		remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=['-R',str(self.data.num_attributes-2)+',last'])
		remove.inputformat(self.data)
		self.data = remove.filter(self.data)

	def merge_nominal_attributes(self, significance=0.01):
		remove = Filter(classname="weka.filters.supervised.attribute.MergeNominalValues", options=['-L',str(significance),'-R','first-last'])
		remove.inputformat(self.data)
		self.data = remove.filter(self.data)
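A hypothetical end-to-end session with Experiment; the file names, paths and attribute list are assumptions:

import weka.core.jvm as jvm

jvm.start(max_heap_size="2500M")
exp = Experiment()
exp.loadCSV("mydata.csv", path="./")        # hypothetical CSV file
exp.setClassIndex(-1)                       # last attribute becomes the class
exp.set_attributes(["attr1", "attr2"])      # attributes kept by the RemoveByName filter
exp.train_JRip(min_per_rule=20)
exp.test(folds=10)
exp.saveClassifier("jrip.model", path="./")
jvm.stop()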