Example #1
    def crossTest(self, trainingFile, classifier, testFile):

        loader = Loader(classname="weka.core.converters.ArffLoader")
        data1 = loader.load_file(trainingFile)
        data1.class_is_last()

        cls = Classifier(classname=classifier)
        cls.build_classifier(data1)

        data2 = loader.load_file(testFile)
        data2.class_is_last()

        classes = [str(code) for code in data2.class_attribute.values]
        header = ["Accuracy"]
        for name in classes:
            header += [name + " TP", name + " FP", name + " AUC ROC"]
        values = []

        evl = Evaluation(data2)
        evl.test_model(cls, data2)

        values.append(evl.percent_correct)
        for index, name in enumerate(classes):
            values += [
                evl.true_positive_rate(index) * 100,
                evl.false_positive_rate(index) * 100,
                evl.area_under_roc(index)
            ]

        self.values = values
        self.header = header
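A minimal driver sketch for the method above, assuming the enclosing class is called Runner (a hypothetical name) and the ARFF paths exist:

import weka.core.jvm as jvm

jvm.start()
runner = Runner()
runner.crossTest("train.arff", "weka.classifiers.trees.J48", "test.arff")
print(runner.header)
print(runner.values)
jvm.stop()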
Example #2
def logit_PC(df_train, df_test, attr_label):
    '''
    logistic regression with PC members only
    :param df_train:        training data, pandas data frame
    :param df_test:         testing data, pandas data frame
    :param attr_label:      label attribute, string
    :return:                PC members, logistic regression model and AUC
    '''
    pcs = RF.learnPC_R(df_train, attr_label)
    if pcs:
        # model = LogisticRegression().fit(df_train[pcs], df_train[attr_label])
        # pred = model.predict_proba(df_test[pcs])
        # pred = [x[1] for x in pred]
        # auc = evaluate_auc(df_test[attr_label].values.tolist(), pred)

        df2Instances = DF2Instances(df_train[pcs+[attr_label]], 'train', attr_label)
        data_train = df2Instances.df_to_instances()
        data_train.class_is_last()  # set class attribute
        model = Classifier(classname="weka.classifiers.functions.Logistic")
        model.build_classifier(data_train)

        df2Instances = DF2Instances(df_test[pcs+[attr_label]], 'test', attr_label)
        data_test = df2Instances.df_to_instances()
        data_test.class_is_last()  # set class attribute

        preds = []
        for index, inst in enumerate(data_test):
            preds.append(model.distribution_for_instance(inst)[1])
        auc = evaluate_auc(df_test[attr_label].values.tolist(), preds)

        return pcs, model, auc
    else:
        return pcs, None, None
Example #3
    def predBtn_clicked(self):

        gender = self.gender_entry.get()
        age = int(self.age_entry.get())
        height = int(self.height_entry.get())
        weight = int(self.weight_entry.get())
        sociability = self.sociability_entry.get()
        stability = self.stability_entry.get()
        # Create the model
        objects = serialization.read_all("J48.model")

        cls = Classifier(jobject=objects[0])
        data = Instances(jobject=objects[1])
        # Create the test set to be classified
        gender_values = ["Man", "Woman"]
        sociability_values = ["Introvert", "Extrovert"]
        stability_values = ["Stable", "Unstable"]

        values = [
            gender_values.index(gender), age, height, weight,
            self.BMI(weight, height),
            stability_values.index(stability),
            sociability_values.index(sociability),
            Instance.missing_value()
        ]

        inst = Instance.create_instance(values)
        inst.dataset = data
        # Classification
        prediction = int(cls.classify_instance(inst))
        self.controller.show_frame("Result").show(prediction)
        self.clear()
Example #4
def testNB(training_data, testing_data):

    train_data = Instances.copy_instances(training_data)
    test_data = Instances.copy_instances(testing_data)

    evaluation = Evaluation(train_data)
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    classifier.build_classifier(
        train_data)  # build classifier on the training data
    evaluation.test_model(classifier,
                          test_data)  # test and evaluate model on the test set
    print("")
    print("")
    print(
        evaluation.summary(
            "--------------Naive Bayes Evaluation--------------"))
    print("Accuracy: " + str(evaluation.percent_correct))
    print("")
    print("Label\tPrecision\t\tRecall\t\t\tF-Measure")
    print("<=50K\t" + str(evaluation.precision(0)) + "\t" +
          str(evaluation.recall(0)) + "\t" + str(evaluation.f_measure(0)))
    print(">50K\t" + str(evaluation.precision(1)) + "\t" +
          str(evaluation.recall(1)) + "\t" + str(evaluation.f_measure(1)))
    print("Mean\t" + str(((evaluation.precision(1)) +
                          (evaluation.precision(0))) / 2) + "\t" +
          str(((evaluation.recall(1)) + (evaluation.recall(0))) / 2) + "\t" +
          str(((evaluation.f_measure(1)) + (evaluation.f_measure(0))) / 2))
def DecisionTree(data):

    classifier = Classifier(classname="weka.classifiers.trees.J48")
    classifier.build_classifier(data)

    print("")
    print("=== Decision Tree ===")
    print(classifier)

    count_class1 = 0
    count_class0 = 0
    print("Labeling income status of each instance. Please wait..")
    for index, inst in enumerate(data):
        pred = classifier.classify_instance(inst)
        # calculate no. of instances classified in class 1 and 0
        if str(pred) == "1.0":
            count_class1 += 1
        else:
            count_class0 += 1

        if index % 5000 == 0:
            print(".")

    print("No of instances in class '>50K' = " + str(count_class1))
    print("No of instances in class '<=50K' = " + str(count_class0))
    def build(self):
        """
        Build J48 classifier using data loaded from ARFF
        :param storeModel: Store model after built
        :return:
        """
        try:
            dataLoaded = self.loadClassifierData()

            if dataLoaded is True:
                # Decision tree options
                if self.unpruned is True:
                    self.dtOptions = ['-U']
                else:
                    self.dtOptions = ['-C', str(self.confidenceValue)]

                # Decision tree classificator
                print('[Building J48 DT from training]')
                self.classifierInstance = Classifier(classname="weka.classifiers.trees.J48", options=self.dtOptions)
                self.classifierInstance.build_classifier(self.classificationData)
                return True
        except Exception:
            return False

        return False
Example #7
    def __init__(self, classifier_name):
        # Defaults
        class_name = 'weka.classifiers.trees.RandomForest'
        options = None
        self.proba = None

        if classifier_name == 'wrf':
            class_name = 'weka.classifiers.trees.RandomForest'
            options = None
        elif classifier_name == 'wj48':
            class_name = 'weka.classifiers.trees.J48'
            options = None
        elif classifier_name == 'wnb':
            class_name = 'weka.classifiers.bayes.NaiveBayes'
            options = '-D'
        elif classifier_name == 'wbn':
            class_name = 'weka.classifiers.bayes.BayesNet'
            options = '-D -Q weka.classifiers.bayes.net.search.local.TAN -- -S BAYES -E weka.classifiers.bayes.net.estimate.SimpleEstimator -- -A 0.5'
        elif classifier_name == 'wsv':
            # Implementation of one-class SVM used in Anomaly Detection mode
            class_name = 'weka.classifiers.functions.LibSVM'
            options = '-S 2'

        if options is not None:
            self._classifier = Classifier(classname=class_name, options=[option for option in options.split()])
        else:
            self._classifier = Classifier(classname=class_name)

        self.model_ = None
def main(args):
    """
    Trains a NaiveBayesUpdateable classifier incrementally on a dataset. The dataset can be supplied as parameter.
    :param args: the commandline arguments
    :type args: list
    """

    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_file, incremental=True)
    data.class_is_last()

    # classifier
    nb = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    nb.build_classifier(data)

    # train incrementally
    for inst in loader:
        nb.update_classifier(inst)

    print(nb)
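Once the incremental pass is finished, the model can be evaluated like any batch-trained classifier; a sketch, assuming the usual Evaluation import:

from weka.classifiers import Evaluation

data_full = loader.load_file(data_file)  # reload non-incrementally
data_full.class_is_last()
evl = Evaluation(data_full)
evl.test_model(nb, data_full)
print(evl.summary())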
Example #9
def test_classifier(dataset: Instances, classifier: Classifier, params: dict):
    param_names = list(params.keys())
    param_values = list(params.values())

    results = defaultdict(list)

    for val_combo in itertools.product(*param_values):
        results["numInstances"].append(dataset.num_instances)
        results["numAttributes"].append(dataset.num_attributes)
        opts = dict(zip(param_names, val_combo))

        for opt in opts:
            results[opt].append(opts[opt])
            classifier.set_property(
                opt, opts[opt] if not isinstance(opts[opt], float) else
                typeconv.double_to_float(opts[opt]))

        evl = Evaluation(dataset)
        classifier.build_classifier(dataset)
        evl.test_model(classifier, dataset)
        results["Training_Accuracy"].append(evl.percent_correct)
        results["size"].append(
            int(javabridge.call(classifier.jobject, "measureTreeSize", "()D")))
        evl.crossvalidate_model(classifier, dataset, 10, Random(1))
        results["CV_Accuracy"].append(evl.percent_correct)

    return results
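A call sketch for test_classifier, assuming a loaded nominal-class dataset; the "size" column relies on measureTreeSize, so the classifier should be a tree learner such as J48, whose standard bean properties include confidenceFactor and minNumObj:

cls = Classifier(classname="weka.classifiers.trees.J48")
grid = {"confidenceFactor": [0.1, 0.25, 0.5], "minNumObj": [2, 5, 10]}
results = test_classifier(data, cls, grid)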
class ClassifierNaiveBayes(ClassifierAbstract):
    """
    Naive Bayes classifier algorithm in Weka
    """
    def build(self):
        """
        Build J48 classifier using data loaded from ARFF
        :param storeModel: Store model after built
        :return:
        """
        try:
            dataLoaded = self.loadClassifierData()

            if dataLoaded is True:
                # Naive Bayes classificator
                print('[Building Naive Bayes from training]')
                self.classifierInstance = Classifier(
                    classname="weka.classifiers.bayes.NaiveBayes")
                self.classifierInstance.build_classifier(
                    self.classificationData)
                return True
        except Exception:
            return False

        return False
def main(args):
    """
    Trains a J48 classifier on a training set and outputs the predicted class and class distribution alongside the
    actual class from a test set. Class attribute is assumed to be the last attribute.
    :param args: the commandline arguments (train and test datasets)
    :type args: list
    """

    # load a dataset
    helper.print_info("Loading train: " + args[1])
    loader = Loader(classname="weka.core.converters.ArffLoader")
    train = loader.load_file(args[1])
    train.class_index = train.num_attributes - 1
    helper.print_info("Loading test: " + args[2])
    test = loader.load_file(args[2])
    test.class_is_last()

    # classifier
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)

    # output predictions
    print("# - actual - predicted - error - distribution")
    for index, inst in enumerate(test):
        pred = cls.classify_instance(inst)
        dist = cls.distribution_for_instance(inst)
        print(
            "%d - %s - %s - %s  - %s" %
            (index+1,
             inst.get_string_value(inst.class_index),
             inst.class_attribute.value(int(pred)),
             "yes" if pred != inst.get_value(inst.class_index) else "no",
             str(dist.tolist())))
def main(args):
    """
    Loads a dataset, shuffles it, splits it into train/test set. Trains J48 with training set and
    evaluates the built model on the test set.
    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """

    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # generate train/test split of randomized data
    train, test = data.train_test_split(66.0, Random(1))

    # build classifier
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)
    print(cls)

    # evaluate
    evl = Evaluation(train)
    evl.test_model(cls, test)
    print(evl.summary())
Example #13
def create_model(input_file, output_file):
    # Load data
    data = converters.load_any_file(input_file)
    data.class_is_last()  # set class attribute

    # filter data
    print_title("Filtering Data")
    discretize = Filter(
        classname="weka.filters.unsupervised.attribute.Discretize",
        options=["-B", "10", "-M", "-1.0", "-R", "first-last"])
    discretize.inputformat(
        data)  # let the filter know about the type of data to filter
    filtered_data = discretize.filter(data)
    print("Done! (believe it or not)")

    print_title("Build Classifier")
    classifier = Classifier(classname="weka.classifiers.trees.RandomForest",
                            options=["-I", "100", "-K", "0", "-S", "1"])
    classifier.build_classifier(filtered_data)
    print("Done! (believe it or not)")
    serialization.write_all(output_file, [classifier, discretize])
    print("Model and filter saved to ", output_file)

    evaluation = Evaluation(data)  # initialize with priors
    evaluation.crossvalidate_model(classifier, filtered_data, 10,
                                   Random(42))  # 10-fold CV
    print(evaluation.summary())
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
Example #14
    def retrain(self, examples, labels):

        f = open("trainingweka.arff", "w")
        f.write("@relation randomset\n")
        for j in range(len(examples[0])):
            f.write("@attribute feature%d real\n" % j)
        f.write("@attribute class {TRUE, FALSE}\n")
        f.write("@data\n")

        for (example, label) in zip(examples, labels):
            for feature in example:
                f.write("%f," % feature)
            if label == 1:
                f.write("TRUE\n")
            else:
                f.write("FALSE\n")
        f.close()

        loader = Loader(classname="weka.core.converters.ArffLoader")
        # options=["-H", "-B", "10000"])
        self.trainingData = loader.load_file("trainingweka.arff")
        self.trainingData.set_class_index(self.trainingData.num_attributes() -
                                          1)
        self.classifier = Classifier(
            classname="weka.classifiers.functions.Logistic",
            options=["-R", "%f" % (1.0 / self.C)])
        self.classifier.build_classifier(self.trainingData)
def PredecirUnaTemporada(path):
    jvm.start()
    insta = CrearInstanciaParaPredecir(path)
    with open('ModelData/wekaHeader.arff', 'r') as file:
        atributos = file.readlines()

    with open('ModelData/predictionFiles/inst.arff', 'w') as file:
        file.writelines(atributos)
        file.write("\n" + insta + '\n')

    objects = serialization.read_all("ModelData/77PercentModelPaisajes.model")
    classifier = Classifier(jobject=objects[0])

    loader = Loader()
    data = loader.load_file("ModelData/predictionFiles/inst.arff")
    data.class_is_last()

    clases = ["invierno", "verano", "otono", "primavera"]
    prediccion = ""
    for index, inst in enumerate(data):
        pred = classifier.classify_instance(inst)
        dist = classifier.distribution_for_instance(inst)
        prediccion = clases[int(pred)]
    jvm.stop()
    return prediccion
Example #16
    def run_naive_bayes_crossval(self, output_directory):
        # build classifier
        print("\nBuilding Classifier on training data.")
        buildTimeStart = time.time()
        cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
        cls.build_classifier(self.training_data)

        resultsString = ""
        resultsString = self.print_both(str(cls), resultsString)

        buildTimeString = "NB Cross Eval Classifier Built in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Evaluate Classifier
        resultsString = self.print_both("\nCross Evaluating on test data.",
                                        resultsString)

        buildTimeStart = time.time()
        evl = Evaluation(self.training_data)
        evl.crossvalidate_model(cls, self.training_data, 10, Random(1))

        resultsString = self.print_both(str(evl.summary()), resultsString)
        resultsString = self.print_both(str(evl.class_details()),
                                        resultsString)
        resultsString = self.print_both(str(evl.confusion_matrix),
                                        resultsString)
        buildTimeString = "\nNB Cross Eval Classifier Evaluated in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Save Results and Cleanup
        self.save_results("Naive_Bayes_Crossval", resultsString,
                          output_directory)
Example #17
def TestClassification(arff, modelInput, results):
    # start the JVM
    jvm.start()
    # load the saved model
    objects = serialization.read_all(modelInput)
    clsf = Classifier(jobject=objects[0])
    print(clsf)
    # load the test set
    loader = Loader(classname="weka.core.converters.ArffLoader")
    test = loader.load_file(arff)
    test.class_is_first()
    # write out the results
    resultsFile = open(results, "w")
    resultsFile.write("序号\t原判断\t预测\t良性概率\t恶性概率\n")
    print("序号\t原判断\t预测\t良性概率\t恶性概率")
    for index, inst in enumerate(test):
        pred = clsf.classify_instance(inst)
        dist = clsf.distribution_for_instance(inst)
        sampleID = index + 1
        origin = inst.get_string_value(inst.class_index)
        prediction = inst.class_attribute.value(int(pred))
        sameAsOrigin = "yes" if pred != inst.get_value(
            inst.class_index) else "no"
        NRate = dist.tolist()[0]
        PRate = dist.tolist()[1]
        resultsFile.write(
            "%d\t%s\t%s\t%s\t%s" %
            (sampleID, origin, prediction, str(NRate), str(PRate)) + "\n")
        print("%d\t%s\t%s\t%s\t%s" %
              (sampleID, origin, prediction, str(NRate), str(PRate)))
    resultsFile.close()
    # stop the JVM
    jvm.stop()
    print("检测完成")
Example #18
    def set_params(self, **params):
        """
        Sets the options for the classifier, expects 'classname' and 'options'.

        :param params: the parameter dictionary
        :type params: dict
        """
        if len(params) == 0:
            return
        if "classname" not in params:
            raise Exception("Cannot find 'classname' in parameters!")
        if "options" not in params:
            raise Exception("Cannot find 'options' in parameters!")
        self._classname = params["classname"]
        self._options = params["options"]
        self._classifier = Classifier(classname=self._classname, options=self._options)
        self._nominal_input_vars = None
        if "nominal_input_vars" in params:
            self._nominal_input_vars = params["nominal_input_vars"]
        self._nominal_output_var = None
        if "nominal_output_var" in params:
            self._nominal_output_var = params["nominal_output_var"]
        self._num_nominal_input_labels = None
        if "num_nominal_input_labels" in params:
            self._num_nominal_input_labels = params["num_nominal_input_labels"]
        self._num_nominal_output_labels = None
        if "num_nominal_output_labels" in params:
            self._num_nominal_output_labels = params["num_nominal_output_labels"]
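A call sketch for set_params with illustrative values (`est`, the surrounding estimator instance, is a hypothetical name; the wrapper class itself is not shown in this snippet):

est.set_params(classname="weka.classifiers.trees.J48",
               options=["-C", "0.25", "-M", "2"])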
Example #19
def run():
    jvm.start()
    load_csv = Loader("weka.core.converters.CSVLoader")
    data_csv = load_csv.load_file(
        "/Users/imeiliasantoso/web_graduate_project4/predict_page/predict_data.csv"
    )

    saver = Saver("weka.core.converters.ArffSaver")
    saver.save_file(
        data_csv,
        "/Users/imeiliasantoso/web_graduate_project4/predict_page/predict_data.arff"
    )

    load_arff = Loader("weka.core.converters.ArffLoader")
    data_arff = load_arff.load_file(
        "/Users/imeiliasantoso/web_graduate_project4/predict_page/predict_data.arff"
    )
    data_arff.class_is_last()

    global j48
    J48_class = Classifier(classname="weka.classifiers.trees.J48",
                           options=["-C", "0.25", "-M", "2"])
    J48_class.build_classifier(data_arff)
    evaluationj48 = Evaluation(data_arff)
    evaluationj48.crossvalidate_model(J48_class, data_arff, 10, Random(100))
    j48 = str(evaluationj48.percent_correct)
    jvm.stop()
    return j48
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # train classifier
    classifier = Classifier("weka.classifiers.trees.J48")
    classifier.build_classifier(iris_data)

    # save and read object
    helper.print_title("I/O: single object")
    outfile = tempfile.gettempdir() + os.sep + "j48.model"
    serialization.write(outfile, classifier)
    model = Classifier(jobject=serialization.read(outfile))
    print(model)

    # save classifier and dataset header (multiple objects)
    helper.print_title("I/O: single object")
    serialization.write_all(outfile, [classifier, Instances.template_instances(iris_data)])
    objects = serialization.read_all(outfile)
    for i, obj in enumerate(objects):
        helper.print_info("Object #" + str(i+1) + ":")
        if javabridge.get_env().is_instance_of(obj, javabridge.get_env().find_class("weka/core/Instances")):
            obj = Instances(jobject=obj)
        elif javabridge.get_env().is_instance_of(obj, javabridge.get_env().find_class("weka/classifiers/Classifier")):
            obj = Classifier(jobject=obj)
        print(obj)
Example #21
	def __init__(self, class_name, options=None):

		if options is not None:
			self._classifier = Classifier(classname=class_name, options=[
									  option for option in options.split()])
		else:
			self._classifier = Classifier(classname=class_name)
Example #22
def main(args):
    """
    Loads a dataset, shuffles it, splits it into train/test set. Trains J48 with training set and
    evaluates the built model on the test set.
    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """

    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # generate train/test split of randomized data
    train, test = data.train_test_split(66.0, Random(1))

    # build classifier
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)
    print(cls)

    # evaluate
    evl = Evaluation(train)
    evl.test_model(cls, test)
    print(evl.summary())
def autoweka(data, duration, metric, nb_folds):
    classifier = Classifier(
        classname="weka.classifiers.meta.AutoWEKAClassifier",
        options=["-x", nb_folds, "-timeLimit", duration, "-metric", metric]
    )  #classname="weka.classifiers.functions.Logistic", options=["-R", "1.0E-2"]
    classifier.build_classifier(data)
    print(classifier)
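A call sketch; Auto-WEKA is a separate Weka package that must be installed first, and with the str() coercion above plain numbers can be passed:

autoweka(data, duration=15, metric="errorRate", nb_folds=10)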
Example #24
def f_smote():
    jvm.start()

    train_data, test_data = b_i_impute_data()

    train_data = train_data[:10000]
    y_train = train_data["class"]
    x_train = train_data.drop("class", axis=1)

    sm = SMOTE(ratio="minority")
    x_train_sm, y_train_sm = sm.fit_sample(x_train, y_train)

    x_train_sm_df = pd.DataFrame(x_train_sm, columns=x_train.columns)
    y_train_sm_df = pd.DataFrame(y_train_sm, columns=["class"])
    train_data_sm_df = pd.concat([y_train_sm_df, x_train_sm_df], axis=1)
    print_f("smote train data shape", train_data_sm_df.shape)
    train_data_sm_df.to_csv("./train_data_sm.csv", index=False)

    train_data_sm = converters.load_any_file("train_data_sm.csv")
    train_data_sm.class_is_first()

    test_data = converters.load_any_file("test_data.csv")
    test_data.class_is_first()

    print_f("1")
    cls = Classifier(classname="weka.classifiers.trees.LMT")
    print_f("bulding classifier")
    cls.build_classifier(train_data_sm)
    print_f("Evaluating")
    evl = Evaluation(train_data_sm)

    evl.crossvalidate_model(cls, train_data_sm, 5, Random(1))
    print_f("Train Accuracy:", evl.percent_correct)
    print_f("Train summary")
    print_f(evl.summary())
    print_f("Train class details")
    print_f(evl.class_details())
    print_f("Train confusion matrix")
    print_f(evl.confusion_matrix)
    plcls.plot_roc(evl,
                   class_index=[0, 1],
                   wait=True,
                   outfile="./plots/2_f_smote_10k.png")
    plt.suptitle("Train ROC Curve", fontsize=20, y=0.95)

    evl = Evaluation(test_data)
    print_f("testing model")
    evl.test_model(cls, test_data)
    print_f("Test Accuracy:", evl.percent_correct)
    print_f("Test summary")
    print_f(evl.summary())
    print_f(" Testclass details")
    print_f(evl.class_details())
    print_f("Testconfusion matrix")
    print_f(evl.confusion_matrix)
    plcls.plot_roc(evl, class_index=[0, 1], wait=True)
    plt.suptitle("Test ROC Curve", fontsize=20, y=0.95)
    savefig("./plots/f_test_roc_curve.png")
Example #25
    def train_weka_model(self,
                         training_data_dir,
                         save_model_dir,
                         log_file,
                         mimic_env=None):
        """
        Just runs some example code.
        """
        loader = Loader(classname="weka.core.converters.CSVLoader")
        training_data = loader.load_file(training_data_dir)
        training_data.class_is_last()

        self.classifier = Classifier(classname="weka.classifiers.trees.M5P",
                                     options=self.options)
        # classifier help, check https://weka.sourceforge.io/doc.dev/weka/classifiers/trees/M5P.html
        self.classifier.build_classifier(training_data)
        # print(classifier)
        graph = self.classifier.graph
        node_number = float(graph.split('\n')[-3].split()[0].replace('N', ''))
        leaves_number = node_number / 2
        serialization.write(save_model_dir, self.classifier)
        # print('Leaves number is {0}'.format(leave_number), file=log_file)

        evaluation = Evaluation(training_data)
        predicts = evaluation.test_model(self.classifier, training_data)
        # return_value = None
        # if mimic_env is not None:
        predict_dictionary = {}
        for predict_index in range(len(predicts)):
            predict_value = predicts[predict_index]
            if predict_value in predict_dictionary.keys():
                predict_dictionary[predict_value].append(predict_index)
            else:
                predict_dictionary.update({predict_value: [predict_index]})

        # return_value = mimic_env.get_return(state=list(predict_dictionary.values()))
        return_value_log = mimic_env.get_return(
            state=list(predict_dictionary.values()))
        return_value_log_struct = mimic_env.get_return(
            state=list(predict_dictionary.values()), apply_structure_cost=True)
        return_value_var_reduction = mimic_env.get_return(
            state=list(predict_dictionary.values()),
            apply_variance_reduction=True)
        # print("Training return is {0}".format(return_value), file=log_file)

        summary = evaluation.summary()
        numbers = summary.split('\n')
        corr = float(numbers[1].split()[-1])
        mae = float(numbers[2].split()[-1])
        rmse = float(numbers[3].split()[-1])
        rae = float(numbers[4].split()[-2]) / 100
        rrse = float(numbers[5].split()[-2]) / 100
        # print(evl)
        # print("Training summary is "+summary, file=log_file)

        return return_value_log, return_value_log_struct, \
               return_value_var_reduction, mae, rmse, leaves_number
Example #26
 def __init__(self, index = 0, inference = "ExactInference", ghostAgents = None):
     BustersAgent.__init__(self, index, inference, ghostAgents)
     jvm.start(max_heap_size="512m")
     self.loader = Loader(classname="weka.core.converters.ArffLoader")
     self.data = self.loader.load_file("data/training-fase3.arff")
     self.data.class_is_last()
     self.cls = Classifier(classname="weka.classifiers.trees.REPTree", options=["-M", "2","-V", "0.001","-N", "3", "-S", "1", "-L", "-1"])
     self.cls.build_classifier(self.data)
     serialization.write("data/out.model", self.cls)
def train(data_train, n_estimators):  # train the model
    # create `Classifier` object
    rf = Classifier(classname="weka.classifiers.trees.RandomForest",
                    options=['-num-slots', '0', '-I',
                             str(n_estimators)])

    # train classifier on the train split
    rf.build_classifier(data_train)

    return rf
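A usage sketch for train, assuming the usual Loader import and an ARFF training file with the class attribute last (the path is hypothetical):

loader = Loader(classname="weka.core.converters.ArffLoader")
data_train = loader.load_file("train.arff")
data_train.class_is_last()
rf = train(data_train, n_estimators=100)
print(rf)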
Example #28
def test_single():
  #['long', 'nK', 'nK_delta', 'nT', 'nT_delta', 'str', 'strsum']:
  objs = ['olsize', 'ylsize']
  for obj in objs:
    c = Classifier(jobject=serialization.read(model_file('hash', obj)))
    values = [3.0, 192.0, 124.0, 192.0, 124.0, 6.0, 144.0]
    values.append(0) # should be obj
    ins = Instance.create_instance(values)
    prediction = c.classify_instance(ins)
    print(obj, prediction)
Example #29
class python_weka(object):
    def __init__(self, input_x, input_y, labels):
        self.input_x = input_x
        self.input_y = input_y
        self.labels = labels

    def write_arff(self, filename, relation, train_or_predict, input_x, input_y=None):
        f = open(filename, "w")
        f.write("@relation " + relation + "\n")
        for i in self.labels:
            train_or_predict += 1
            if train_or_predict == len(self.labels):
                break
            f.write("@attribute " + i + " " + self.labels[i] + "\n")
        f.write("\n")
        f.write("@data" + "\n")
        for i in range(len(input_x)):
            for j in input_x[i]:
                f.write(str(j) + "  ")
            if train_or_predict == 0:
                f.write(str(input_y[i]))
            else:
                f.write(str(0))
            f.write("\n")
        f.close()

    def train(self):
        filename = "train.arff"
        self.write_arff(filename, "train", 0, self.input_x, self.input_y)
        loader = Loader(classname="weka.core.converters.ArffLoader")
        data = loader.load_file(filename)
        data.class_is_last()
        self.cls = Classifier(classname="weka.classifiers.meta.Bagging", options=["-S", "5"])
        self.cls.build_classifier(data)
        os.remove(filename)

    def predict(self, test_data):
        filename = "test.arff"
        self.write_arff(filename, "test", 0, test_data)
        loader = Loader(classname="weka.core.converters.ArffLoader")
        data = loader.load_file(filename)
        data.class_is_last()
        # evl = Evaluation(data)
        # evl.evaluate_model(self.cls,data)
        # data.set_class_label(data.numAttributes() - 1)
        # data.setClassIndex(data.numAttributes() - 1)
        result = []
        for index, inst in enumerate(data):
            pred = self.cls.classify_instance(inst)
            dist = self.cls.distribution_for_instance(inst)
            result.append(dist[0])
            # print(str(index+1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))
            # print str(index+1) + 'dist:'+ str(dist)
        os.remove(filename)
        return result
class ClassifierDecisionTreeJ48(ClassifierAbstract):
    """
    Decision tree using J48 algorithm in Weka
    """

    def __init__(self, arffFileName, confidenceValue=0.25):
        """
        Class constructor (overridden)
        :param arffFileName: ARFF file name
        :param confidenceValue: Confidence value for classifier
        """
        ClassifierAbstract.__init__(self, arffFileName)

        # Store confidence value
        if 0 <= confidenceValue <= 1:
            self.confidenceValue = confidenceValue
        else:
            # Set default confidence value
            self.confidenceValue = 0.25

        self.unpruned = False

    def setUnprunedTree(self, unpruned):
        """
        Set unpruned tree option
        :param unpruned: If tree result is unpruned or not (TRUE or FALSE)
        :return:
        """
        self.unpruned = unpruned

    def build(self):
        """
        Build J48 classifier using data loaded from ARFF
        :param storeModel: Store model after built
        :return:
        """
        try:
            dataLoaded = self.loadClassifierData()

            if dataLoaded is True:
                # Decision tree options
                if self.unpruned is True:
                    self.dtOptions = ['-U']
                else:
                    self.dtOptions = ['-C', str(self.confidenceValue)]

                # Decision tree classificator
                print('[Building J48 DT from training]')
                self.classifierInstance = Classifier(classname="weka.classifiers.trees.J48", options=self.dtOptions)
                self.classifierInstance.build_classifier(self.classificationData)
                return True
        except Exception:
            return False

        return False
def predictWithWeka(csvFilenameWithInputToPredict, modelFilename):
    """
    #   Nota: para usar sin conocer la clase, se puede colocar una clase dummy
    #   e ignorar los valores actual y error de @return results.
    #
    #   Nota: es necesario que el archivo de nombre @csvFilenameWithInputToPredict
    #   contenga instancias de ambas clases (spam y sanas)
    #
    #   @csvFilenameWithInputToPredict : nombre del archivo csv con las instancias
    #                                   a predecir.
    #
    #   @modelFilename : nombre del archivo de modelo generado por weka y 
    #                    compatible con el archivo csv de entrada
    #
    #   @return results : lista de diccionarios con los siguientes indices
    #                      index, actual, predicted, error y distribution
    """
    loader = Loader(classname="weka.core.converters.CSVLoader")
    cls = Classifier(jobject=serialization.read(modelFilename))
    #print(cls)

    data = loader.load_file(csvFilenameWithInputToPredict)
    data.class_is_last()

    multi = MultiFilter()
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "first"])
    numericToNom = Filter(
        classname="weka.filters.unsupervised.attribute.NumericToNominal",
        options=["-R", "8,11"])
    normalize = Filter(
        classname="weka.filters.unsupervised.attribute.Normalize",
        options=["-S", "1.0", "-T", "0.0"])
    multi.filters = [remove, numericToNom, normalize]
    multi.inputformat(data)
    test = multi.filter(data)

    results = []
    for index, inst in enumerate(test):
        result = dict()

        pred = cls.classify_instance(inst)
        dist = cls.distribution_for_instance(inst)

        result["index"] = index + 1
        result["actual"] = inst.get_string_value(inst.class_index)
        result["predicted"] = inst.class_attribute.value(int(pred))
        result["error"] = "yes" if pred != inst.get_value(
            inst.class_index) else "no"
        result["distribution"] = str(dist.tolist())

        results.append(result)
        #print result

    return results
Example #32
File: utils.py Project: Unkrible/NFS
    def exposed_evaluate(self, X, d, task, i_model, i_evl):
        data = np.reshape(eval(X), [d, -1], order='C')
        if task == 'regression':
            if i_model == 'LR':
                data = converters.ndarray_to_instances(data, relation='tmp')
                data.class_is_last()
                model = Classifier(
                    classname='weka.classifiers.functions.LinearRegression')
                evl = Evaluation(data)
                evl.crossvalidate_model(model, data, 5, Random(0))
            elif i_model == 'RF':
                data = converters.ndarray_to_instances(data, relation='tmp')
                data.class_is_last()
                model = Classifier(
                    classname='weka.classifiers.trees.RandomForest')
                evl = Evaluation(data)
                evl.crossvalidate_model(model, data, 5, Random(0))
            if i_evl == 'mae':
                r_mae = evl.mean_absolute_error
                return r_mae
            elif i_evl == 'mse':
                r_mse = evl.mean_square_error
                return r_mse
            elif i_evl == '1-rae':
                r_one_minus_rae = 1 - evl.relative_absolute_error / 100
                del evl, model, data
                return r_one_minus_rae

        elif task == 'classification':
            le = LabelEncoder()
            data[:, -1] = le.fit_transform(data[:, -1])
            if i_model == 'RF':
                dataRaw = converters.ndarray_to_instances(data, relation='tmp')
                weka_filter = Filter(
                    classname=
                    "weka.filters.unsupervised.attribute.NumericToNominal",
                    options=["-R", "last"])
                weka_filter.inputformat(dataRaw)
                data = weka_filter.filter(dataRaw)
                data.class_is_last()
                model = Classifier(
                    classname='weka.classifiers.trees.RandomForest')
                evl = Evaluation(data)
                evl.crossvalidate_model(model, data, 5, Random(0))
            elif i_model == 'LR':
                model = LogisticRegression(multi_class='ovr')
            elif i_model == 'SVM':
                model = svm.SVC()
            if i_evl == 'f_score':
                fscore = evl.weighted_f_measure
                del evl, model, data, dataRaw
                if not (fscore >= 0.01 and fscore < 1.01):
                    fscore = 0.01
                return fscore
Example #33
    def LMT(self):
        model = Classifier(classname="weka.classifiers.trees.LMT")
        model.build_classifier(self.data_train)
        print(model)

        preds = []
        for index, inst in enumerate(self.data_test):
            preds.append(model.distribution_for_instance(inst)[1])
        auc = evaluate_auc(self.df_test[self.attr_label].values.tolist(),
                           preds)
        return auc
Example #34
    def __init__(self):
        jvm.start()

        data_dir = "./DataSet/"
        self.data = converters.load_any_file(data_dir + "chatbot2.arff")
        self.data.class_is_last()

        self.cls = Classifier(classname="weka.classifiers.trees.J48")
        self.cls.build_classifier(self.data)

        self.intens = self.data.attribute_by_name("intent")
Example #35
def get_classifier(min_no, seed):
    cls = Classifier(classname="weka.classifiers.rules.JRip")
    # options = ["-N", "25.0"] #-N: minNo, -F folds, -O num optimizations, -batch-size, -S: seed
    options = list()
    options.append("-N")
    options.append(str(min_no))
    options.append("-S")
    options.append(str(seed))

    cls.options = options
    return cls
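A sketch that cross-validates the returned JRip model, assuming a loaded nominal-class dataset and the usual Evaluation/Random imports:

cls = get_classifier(min_no=25, seed=1)
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print(evl.summary())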
Example #36
File: JeroR.py Project: duyvk/weka-spmf
 def setOptions(self, options):
     """
     Parses a given list of options.
      
     Parameter(s):
     
         'options' -- the list of options as an array of strings
     """
     
     Classifier.setOptions(self, options)
     
     return
Example #37
def assign_classify(file_location, output="classified.out", model="naivebayes.model"):
    data = read_csv_file(file_location)
    jvm.start()
    # load the saved classifier model
    obj = serialization.read(model)
    classifier = Classifier(jobject=obj)
    # create file with cluster group
    with open(output, 'w') as cluster_file:
        for index, attrs in enumerate(data):
            inst = Instance.create_instance(attrs[1:])
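            # note: classify_instance normally requires the instance to be tied
            # to a dataset header first (inst.dataset = ...), as the other
            # examples here do, so that attribute types can be resolved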
            pred = classifier.classify_instance(inst)
            print(str(index + 1) + ": label index=" + str(pred))
    jvm.stop()
Example #38
def predict(attributes):
    jvm.start()
    file_path = print_to_file(attributes)
    # load the saved model
    objects = serialization.read_all("/Users/hosyvietanh/Desktop/data_mining/trained_model.model")
    classifier = Classifier(jobject=objects[0])
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(file_path)
    data.class_is_last()
    prediction = None
    for index, inst in enumerate(data):
        pred = classifier.classify_instance(inst)
        dist = classifier.distribution_for_instance(inst)
        prediction = int(pred)
        break  # only the first instance is needed
    jvm.stop()  # previously unreachable because of the in-loop return
    return prediction
def playback_speed_checker(inputFile, dirRef):
    
    TRAINING_ARFF = 'dataset_playback.arff'
    inputRef = ""

    # Start JVM once; the original called jvm.start() three times in a row,
    # but only the first call's options take effect
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")
    
    # Find reference file
    for file in os.listdir(dirRef):
        if str(file).find(str(os.path.basename(inputFile))) != -1:
            inputRef = os.path.join(dirRef, file)
            break

    # Calculation distance
    (result, distance) = dtw_checker(inputFile, inputRef)

    # Loading data
    loader = Loader(classname="weka.core.converters.ArffLoader")    
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()                    # set class attribute

    # Train the classifier
    #cls = Classifier(classname="weka.classifiers.functions.SMO")
    cls = Classifier(classname="weka.classifiers.trees.J48", options = ["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Classify instance
    speed_instance = Instance.create_instance(numpy.array([distance]), classname='weka.core.DenseInstance', weight=1.0)
    speed_instance.dataset = data
    
    # Classify instance
    speed_flag = cls.classify_instance(speed_instance)
    
    if (distance == 0):
        speed_class = 'nominal'
    else:
        if speed_flag == 0:
            speed_class = 'down_speed'
        else:
            speed_class = 'up_speed'
#    print os.path.basename(inputFile) + ' --- ' + speed_class
    
    # Stop JVM
    jvm.stop()    

    print "SPEED IS: " + speed_class

    return speed_class
Example #40
File: JeroR.py Project: duyvk/weka-spmf
    def getCapabilities(self):
        """
        returns the capabilities of this classifier
        
        Return:
        
            the capabilities of this classifier
        """

        result = Classifier.getCapabilities(self)
    
        # attributes
        result.enable(Capability.NOMINAL_ATTRIBUTES)
        result.enable(Capability.NUMERIC_ATTRIBUTES)
        result.enable(Capability.DATE_ATTRIBUTES)
        result.enable(Capability.STRING_ATTRIBUTES)
        result.enable(Capability.RELATIONAL_ATTRIBUTES)
        result.enable(Capability.MISSING_VALUES)
    
        # class
        result.enable(Capability.NOMINAL_CLASS)
        result.enable(Capability.NUMERIC_CLASS)
        result.enable(Capability.DATE_CLASS)
        result.enable(Capability.MISSING_CLASS_VALUES)
    
        # instances
        result.setMinimumNumberInstances(0)
        
        return result
Example #41
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    bodyfat_file = helper.get_data_dir() + os.sep + "bodyfat.arff"
    helper.print_info("Loading dataset: " + bodyfat_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bodyfat_data = loader.load_file(bodyfat_file)
    bodyfat_data.class_is_last()

    # build and print classifier
    helper.print_title("Building M5P model tree")
    classifier = Classifier(classname="weka.classifiers.trees.M5P")
    classifier.build_classifier(bodyfat_data)
    print(classifier)
Example #42
	def train_J48(self, min_per_rule=20):
		params = [
			'-C','0.3',
			'-M',str(min_per_rule),
	#		'-N',str(folds),
	#		'-R',
		]
		self.base_classifier = Classifier(classname='weka.classifiers.trees.J48', options=params)
		self._train()
Example #43
 def train(self):
     filename = "train.arff"
     self.write_arff(filename, "train", 0, self.input_x, self.input_y)
     loader = Loader(classname="weka.core.converters.ArffLoader")
     data = loader.load_file(filename)
     data.class_is_last()
     self.cls = Classifier(classname="weka.classifiers.meta.Bagging", options=["-S", "5"])
     self.cls.build_classifier(data)
     os.remove(filename)
Example #44
 def train_model(self, training_data):
     model_weka = None
     if os.path.isfile(self.model_file):
         print('Model ' + self.name + ' already trained.')
     else:
         print('Starting to train model ' + self.name + '.')
         model_weka = Classifier(classname=self.classname, options=self.options)

         model_weka.build_classifier(data=training_data)
         serialization.write(filename=self.model_file, jobject=model_weka)
         print('Model ' + self.name + ' trained and saved.')
     if os.path.isfile(self.parameter_file):
         print('Parameters of the model ' + self.name + ' already saved.')
     else:
         if model_weka is None:
             model_weka = Classifier(jobject=serialization.read(self.model_file))
         save_file(file_name=self.parameter_file, content=str(model_weka))
         print('Parameters of the model ' + self.name + ' saved.')
Example #45
	def train_JRip(self, min_per_rule=20, optimizations=2, folds=3, seed=42):
		params = [
			'-F', str(folds), # folds
			'-N', str(min_per_rule), # min elements per rule
			'-O', str(optimizations), # optimizations
			'-S', str(seed) #seed
		] 
		self.base_classifier = Classifier(classname='weka.classifiers.rules.JRip', options=params)
		self._train()
Example #46
File: JeroR.py Project: duyvk/weka-spmf
 def listOptions(self):
     """
     Returns an enumeration describing the available options.
     
     Return:
      
         an enumeration of all the available options.
     """
      
     return Classifier.listOptions(self)
Example #47
File: JeroR.py Project: duyvk/weka-spmf
 def getOptions(self):
     """
     Gets the current settings of the Classifier as string array.
      
     Return:
      
         an array of strings suitable for passing to setOptions
     """
      
     return Classifier.getOptions(self)
Example #48
 def trainData(self, arrfFile = None, classname="weka.classifiers.trees.J48", options=["-C", "0.3"]):
     if arrfFile is not None:
         self.initData( arrfFile )
         
     if self.data is None:
         return 
         
     print('Building classifier ' + str(classname) + ' ' + ' '.join(options))
     self.classifier = Classifier(classname=classname, options=options)
     self.classifier.build_classifier(self.data)
Example #49
def riaa_checker(inputFile):
    
    TRAINING_ARFF = r'C:\Users\ASUS\Desktop\IGNASI\SMC\Workspace\dataset_riaa.arff'

    # Start JVM once; the original called jvm.start() three times in a row,
    # but only the first call's options take effect
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")

    # Calculation of bark bands information
    (absolute_bark, relative_bark, bark_ratios) = compute_bark_spectrum(inputFile)

    # Loading data
    loader = Loader(classname="weka.core.converters.ArffLoader")    
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()                    # set class attribute

    # Train the classifier
    cls = Classifier(classname="weka.classifiers.functions.SMO")
    #cls = Classifier(classname="weka.classifiers.trees.J48", options = ["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Classify instance
    bark_instance = Instance.create_instance(bark_ratios, classname='weka.core.DenseInstance', weight=1.0)
    bark_instance.dataset = data
    
    # Classify instance
    riaa_flag = cls.classify_instance(bark_instance)
    
    if riaa_flag == 0:
        riaa_class = 'riaa_ok'
    else:
        riaa_class = 'riaa_ko'
        
#    print os.path.basename(inputFile) + ' --- ' + riaa_class
    
    # Stop JVM
    jvm.stop()   

    print "RIAA FILTERING?: " + riaa_class

    return riaa_class
 def getDecisionTree(self, inputPath):   
     #load arff  
     data = self.load_Arff(inputPath)  
         
     #classifier
     data.set_class_index(data.num_attributes() - 1)   # set class attribute
     classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
     classifier.build_classifier(data)
     
     
     classifierStr = str(classifier)
     for index in range(0,data.num_instances()):
         instance = data.get_instance(index)
         #print instance
         result = classifier.distribution_for_instance(instance)
         
         #print result
     graph = classifier.graph()
     return graph
Example #51
def classify(train, test, name="RF", tuning=False):
    jvm.start()

    if isinstance(train, list) and isinstance(test, list):
        train = weka_instance(train)
        trn_data = converters.load_any_file(train)
        test = weka_instance(test)
        tst_data = converters.load_any_file(test)

    elif os.path.isfile(train) and os.path.isfile(test):
        trn_data = converters.load_any_file(train)
        tst_data = converters.load_any_file(test)

    else:
        trn = csv_as_ndarray(train)
        tst = csv_as_ndarray(test)

        trn_data = converters.ndarray_to_instances(trn, relation="Train")
        tst_data = converters.ndarray_to_instances(tst, relation="Test")

    trn_data.class_is_last()
    tst_data.class_is_last()

    # t = time()
    if tuning:
        opt = tune(train)
    else:
        opt = default_opt
    # print("Time to tune: {} seconds".format(time() - t))

    cls = Classifier(classname=classifiers[name.lower()], options=opt)

    cls.build_classifier(trn_data)

    distr = [cls.distribution_for_instance(inst)[1] for inst in tst_data]
    preds = [cls.classify_instance(inst) for inst in tst_data]

    jvm.stop()

    return preds, distr
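A call sketch, assuming the module-level classifiers map and default_opt referenced above are defined elsewhere (e.g. classifiers = {"rf": "weka.classifiers.trees.RandomForest"}):

preds, distr = classify("train.csv", "test.csv", name="RF")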
Example #52
	def _train(self):
		params = [
			'-F','weka.filters.unsupervised.attribute.RemoveByName -E ^('+'|'.join(self.attrs)+')$ -V',
			'-W', self.base_classifier.classname, '--',
			]
		params.extend(self.base_classifier.options)


#		self.classifier = Classifier(classname='weka.classifiers.meta.FilteredClassifier', options=params)
		self.classifier = FilteredClassifier(options=params)
	#	self.classifier.filter(Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=['-E','^('+'|'.join(self.attrs)+')$','-V']))
		self.classifier.build_classifier(self.data)
		self.out(self.classifier.__str__().encode('ascii', 'ignore').split("\n")[-2])
Example #53
    def __init__(self, jobject=None, options=None):
        """
        Initializes the specified classifier using either the classname or the supplied JB_Object.

        :param classname: the classname of the classifier
        :type classname: str
        :param jobject: the JB_Object to use
        :type jobject: JB_Object
        :param options: the list of commandline options to set
        :type options: list
        """
        classname = "weka.classifiers.functions.Logistic"

        if jobject is None:
            jobject = Classifier.new_instance(classname)
        self.enforce_type(jobject, "weka.classifiers.functions.Logistic")
        super(Logistic, self).__init__(classname=classname, jobject=jobject, options=options)
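A usage sketch for this convenience subclass; -R (ridge) and -M (max iterations) are standard options of weka.classifiers.functions.Logistic:

cls = Logistic(options=["-R", "1.0E-8", "-M", "-1"])
cls.build_classifier(data)  # data: a loaded dataset with a nominal class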
Example #54
    def fit(self, X, y):
        # Check params
        self.n_features_ = X.shape[1]
        random_state = check_random_state(self.random_state)

        if isinstance(self.max_features, str):
            if self.max_features == "auto":
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "sqrt":
                max_features = max(1, int(np.sqrt(self.n_features_)))
            elif self.max_features == "log2":
                max_features = max(1, int(np.log2(self.n_features_)))
            else:
                raise ValueError(
                    'Invalid value for max_features. Allowed string '
                    'values are "auto", "sqrt" or "log2".')
        elif self.max_features is None:
            max_features = self.n_features_
        elif isinstance(self.max_features, (numbers.Integral, np.integer)):
            max_features = self.max_features
        else:  # float
            max_features = int(self.max_features * self.n_features_)

        params = {}
        params["-I"] = self.n_estimators
        params["-K"] = max_features
        params["-depth"] = 0 if self.max_depth is None else self.max_depth
        params["-no-cv"] = None
        params["-s"] = random_state.randint(1000000)

        # Convert data
        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)
        y = np.searchsorted(self.classes_, y)

        tf = tempfile.NamedTemporaryFile(mode="w", suffix=".arff", dir="/dev/shm", delete=False)
        to_arff(X, y, self.n_classes_, tf)
        tf.close()

        # Run
        self.model_ = Classifier(name="weka.classifiers.trees.RandomForest", ckargs=params)
        self.model_.train(tf.name)
        os.remove(tf.name)

        return self
def use_classifier(data):
    """
    Uses the meta-classifier AttributeSelectedClassifier for attribute selection.
    :param data: the dataset to use
    :type data: Instances
    """
    print("\n1. Meta-classifier")
    classifier = Classifier(classname="weka.classifiers.meta.AttributeSelectedClassifier")
    aseval = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval")
    assearch = ASSearch(classname="weka.attributeSelection.GreedyStepwise", options=["-B"])
    base = Classifier(classname="weka.classifiers.trees.J48")
    # setting nested options is always a bit tricky, getting all the escaped double quotes right
    # simply using the bean property for setting Java objects is often easier and less error prone
    classifier.set_property("classifier", base.jobject)
    classifier.set_property("evaluator", aseval.jobject)
    classifier.set_property("search", assearch.jobject)
    evaluation = Evaluation(data)
    evaluation.crossvalidate_model(classifier, data, 10, Random(1))
    print(evaluation.summary())
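A driver sketch for use_classifier, assuming the standard Loader import (the dataset path is hypothetical):

loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file("vote.arff")
data.class_is_last()
use_classifier(data)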
Example #56
import weka.core.jvm as jvm
jvm.start()

# alternative start variants (use only one jvm.start() per process):
# jvm.start(system_cp=True, packages=True)
# jvm.start(packages="/usr/local/lib/python2.7/dist-packages/weka")
# jvm.start(max_heap_size="512m")

data_dir="CSDMC2010_SPAM/CSDMC2010_SPAM/TRAINING"


from weka.classifiers import Classifier
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.options = ["-C", "0.3"]
print(cls.options)


jvm.stop()
Example #57
import os
import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.core.classes import Random
from weka.classifiers import Classifier, Evaluation
from weka.filters import Filter

if data_dir is None:
  data_dir = "." + os.sep + "data"

jvm.start()

# load weather.nominal
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

# perform 10-fold cross-validation
cls = Classifier(classname="weka.classifiers.rules.OneR")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("10-fold cross-validation:\n" + evl.to_summary())

# build model on full dataset and output it
cls.build_classifier(data)
print("Model:\n\n" + str(cls))

jvm.stop()
Example #58
class WekaWrapper:

	def __init__(self, questionID, algorithm, classifier, parameters, modelParams, optimizer, predict = 0):
		self.questionID = questionID
		self.algorithm = algorithm
		self.classifier = classifier
		self.parameters = parameters
		self.modelParams = modelParams
		self.api = nemoApi()
		self.config = nemoConfig()
		self.optimizer = optimizer
		self.predict = predict
		self.prediction = None


	def retrieveData(self, id, dataset):
		query = self.api.getDataQuery(id, dataset)
		iquery = InstanceQuery()
		iquery.db_url = "jdbc:mysql://" + self.config.HOST + ":" + str(self.config.PORT) + "/" + self.config.DB
		iquery.user = self.config.USER
		iquery.password = self.config.PASS
		iquery.query = query
		data = iquery.retrieve_instances()
		data.class_is_last()
		return data

	def uploadData(self):
		# Upload the trained model and its metadata to the database
		self.api.addModel(self.questionID, '?', self.acc, self.model, self.algorithm, False, self.matrix, self.optimizer)
		info = self.api.fetchQuestionInfo(self.questionID)
		modelID = info['ID']
		for mParam in self.modelParams:
			mParam.AIModel = modelID
			self.api.addAIModelParam(mParam)

	def uploadPrediction(self):
		# Upload the best classifier's prediction to the database
		if self.prediction is not None:
			# Convert the numeric prediction to a readable string
			predStr = 'No prediction'
			if self.prediction == 1.0:
				predStr = "True"
			elif self.prediction == 0.0:
				predStr = "False"
			print('Writing ' + predStr)
			self.api.updatePrediction(self.questionID, predStr)

	def addInstancesToDataset(self, source, dest):
		# Align each instance of the source dataset to the destination's header and append it
		for i in range(source.num_instances):
			values = source.get_instance(i).values
			it = np.nditer(values, flags=['f_index'], op_flags=['readwrite'])
			while not it.finished:
				# Nominal values are stored as indices into the attribute's value list,
				# so remap them to the destination header's value list
				if source.attribute(it.index).is_nominal:
					stringVal = source.get_instance(i).get_string_value(it.index)
					if stringVal != '?':
						values[it.index] = dest.attribute(it.index).values.index(stringVal)
				it.iternext()
			dest.add_instance(Instance.create_instance(values))

	def buildPatientObject(self):
		# Build a patient dict to classify; missing values become float('nan')
		patient = self.api.fetchPatientJSON(self.questionID)
		if patient is None:
			return None
		newPatient = {}
		demographics = ['race_cd', 'sex_cd', 'age_in_years_num']
		observation_fact_features = ['tval_char', 'nval_num']
		for demo in demographics:
			if demo not in patient:
				print("Patient definition missing " + demo + ".")
				newPatient[demo] = float('nan')
			elif patient[demo] is not None and patient[demo] != '':
				newPatient[demo] = patient[demo]
			else:
				print("Demographic " + demo + " for patient is empty")
				newPatient[demo] = float('nan')
		for obs in patient['observation_facts']:
			concept_cd = obs['concept_cd']
			for feat in observation_fact_features:
				if feat not in obs:
					print("Feature " + concept_cd + feat + " missing from patient definition, marking it missing")
					newPatient[concept_cd + feat] = float('nan')
				elif obs[feat] is not None:
					newPatient[concept_cd + feat] = obs[feat]
				else:
					newPatient[concept_cd + feat] = float('nan')
		return newPatient
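	# For illustration (hypothetical values), buildPatientObject() returns a flat dict such as
	#   {'race_cd': 'white', 'sex_cd': 'm', 'age_in_years_num': 63,
	#    'ICD9:250.00tval_char': 'positive', 'ICD9:250.00nval_num': float('nan'), ...}
	# where missing demographics and features are filled with float('nan') so Weka
	# treats them as missing values.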

	def addPatientNominals(self, patient, dataset):
		# Extend the master header's nominal attributes with the patient's values,
		# in case they aren't already present; the returned dataset is the new master header.
		# The patient is a flat dict, e.g. {'sex_cd': 'm', ...}
		ignoreAttributes = ['readmitted']
		atts = []
		for a in dataset.attributes():
			if (not a.is_nominal) or (a.name in ignoreAttributes):
				atts.append(a)
			else:
				newValues = list(a.values)
				pvalue = patient[a.name]
				if pvalue not in newValues:
					newValues.append(pvalue)
				atts.append(Attribute.create_nominal(a.name, newValues))
		newDataset = Instances.create_instances("Dataset", atts, 0)
		newDataset.class_is_last()
		return newDataset

	def addNominals(self, dataset):
		# Add a placeholder value to every nominal attribute, so that no column
		# ends up with an empty value set
		ignoreAttributes = ['readmitted']
		atts = []
		for a in dataset.attributes():
			if (not a.is_nominal) or (a.name in ignoreAttributes):
				atts.append(a)
			else:
				newValues = list(a.values)
				pvalue = 'DefaultNominal'
				if pvalue not in newValues:
					newValues.append(pvalue)
				atts.append(Attribute.create_nominal(a.name, newValues))
		newDataset = Instances.create_instances("Dataset", atts, 0)
		newDataset.class_is_last()
		return newDataset
		
	def createPatientInstance(self, patient, dataset):
		# Create a single patient instance matching the dataset's header
		ignoreAttributes = ['readmitted']
		values = []
		for a in dataset.attributes():
			if not a.is_nominal:
				values.append(patient[a.name])
			elif a.name in ignoreAttributes:
				values.append(0)
			else:
				values.append(a.values.index(patient[a.name]))
		return Instance.create_instance(values)
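	# For illustration: if the dataset has a nominal attribute sex_cd with values
	# ['f', 'm'], a patient {'sex_cd': 'm'} contributes values.append(1), since Weka
	# represents a nominal value internally as its index in the attribute's value list.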



	def run(self):
		# Attach this thread to the JVM
		javabridge.attach()

		# Debug output
		print("Classifier")
		print(self.classifier)
		print("Params")
		print(self.parameters)
		print("Model Params")
		print(self.modelParams)

		# Get data for testing and learning
		learnerData = self.retrieveData(self.questionID, "learner")
		testData = self.retrieveData(self.questionID, 'test')
		masterData = self.retrieveData(self.questionID, 'all')
		masterData = self.addNominals(masterData)

		# Check if there is enough correct data to run
		if (learnerData.num_instances < 1 or testData.num_instances < 1):
			self.status = self.config.NOT_ENOUGH_DATA
			return False

		# If this is a prediction and there is a valid patient, extend the masterData header
		patientObj = self.buildPatientObject()
		patientInstance = None
		if (patientObj is not None) and (self.predict == 1):
			masterData = self.addPatientNominals(patientObj, masterData)
			patientInstance = self.createPatientInstance(patientObj, masterData)
			masterData.add_instance(patientInstance)
		elif (patientObj is None) and (self.predict == 1):
			print('No patient defined for prediction. Exiting')
			return True

		# Rebuild the dataset headers to match, then remap the instances onto the new headers
		masterData.delete()
		learner = masterData.copy_instances(masterData, 0, 0)
		test = masterData.copy_instances(masterData, 0, 0)
		self.addInstancesToDataset(learnerData, learner)
		self.addInstancesToDataset(testData, test)

		# Instantiate classifier
		self.cls = Classifier(classname=self.classifier, options=self.parameters)

		# Train the classifier
		self.cls.build_classifier(learner)

		# Test classifier
		evl = Evaluation(learner)
		evl.test_model(self.cls, test)

		# Store evaluation results
		self.acc = evl.percent_correct
		self.val = None

		# Convert the 2x2 numpy confusion matrix into a plain nested list (assumes binary classification)
		confusionMatrix = []
		confusionMatrix.append([evl.confusion_matrix[0][0], evl.confusion_matrix[0][1]])
		confusionMatrix.append([evl.confusion_matrix[1][0], evl.confusion_matrix[1][1]])

		# Serialize the matrix as JSON for storage
		self.matrix = json.dumps(confusionMatrix)

		# If this is a prediction, make the prediction
		if (patientObj is not None) and (self.predict == 1):
			masterData.add_instance(patientInstance)
			print("Running prediction on patient: ")
			print(masterData.get_instance(0))
			self.prediction = self.cls.classify_instance(masterData.get_instance(0))
			#self.uploadPrediction()

		# Serialize the model to a temporary file
		fileName = str(self.questionID) + self.algorithm + ".model"
		serialization.write(fileName, self.cls)

		# Read the serialized model back in as bytes for storage
		self.model = None
		with open(fileName, 'rb') as f:
			self.model = f.read()

		# Remove temporary file
		os.remove(fileName)

		# Set status to awaiting feedback
		self.status = self.config.AWAITING_FEEDBACK_STATUS
		return True
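
The inverse of the store-model step above: the stored bytes can be written back to a .model file and deserialized. A sketch, assuming the same serialization module; model_bytes stands in for the bytes saved in self.model by run():

from weka.core import serialization
from weka.classifiers import Classifier

with open("restored.model", 'wb') as f:
    f.write(model_bytes)  # model_bytes: hypothetical, the bytes stored in self.model
restored = Classifier(jobject=serialization.read("restored.model"))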
Example #60
fname = data_dir + os.sep + "simpletext-train.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

# 1a. filter data
print("Filtering data...")
fltr = Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector")
fltr.set_inputformat(data)
filtered = fltr.filter(data)
filtered.set_class_index(0)

# 1b build classifier
print("Building/evaluating classifier...")
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.build_classifier(filtered)
evl = Evaluation(filtered)
evl.test_model(cls, filtered)
print(evl.to_summary())
print(str(cls))
plg.plot_dot_graph(cls.graph())  # assumes weka.plot.graph imported as plg

# 2. filtered classifier
fname = data_dir + os.sep + "simpletext-test.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
test.set_class_index(test.num_attributes() - 1)
print("Building/evaluating filtered classifier...")
cls = FilteredClassifier()