def main(args):
    """
    Trains a J48 classifier on a training set and outputs the predicted class and class distribution alongside the
    actual class from a test set. Class attribute is assumed to be the last attribute.
    :param args: the commandline arguments (train and test datasets)
    :type args: list
    """

    # load a dataset
    helper.print_info("Loading train: " + args[1])
    loader = Loader(classname="weka.core.converters.ArffLoader")
    train = loader.load_file(args[1])
    train.class_index = train.num_attributes - 1
    helper.print_info("Loading test: " + args[2])
    test = loader.load_file(args[2])
    test.class_is_last()

    # classifier
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)

    # output predictions
    print("# - actual - predicted - error - distribution")
    for index, inst in enumerate(test):
        pred = cls.classify_instance(inst)
        dist = cls.distribution_for_instance(inst)
        print(
            "%d - %s - %s - %s  - %s" %
            (index+1,
             inst.get_string_value(inst.class_index),
             inst.class_attribute.value(int(pred)),
             "yes" if pred != inst.get_value(inst.class_index) else "no",
             str(dist.tolist())))
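
# A minimal sketch of how such a main() is typically invoked from the command
# line, assuming the usual example imports (sys, traceback, weka.core.jvm as jvm):
if __name__ == "__main__":
    try:
        jvm.start()
        main(sys.argv)
    except Exception:
        print(traceback.format_exc())
    finally:
        jvm.stop()
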
def main(args):
    """
    Loads a dataset, shuffles it, splits it into train/test set. Trains J48 with training set and
    evaluates the built model on the test set.
    :param args: the commandline arguments (optional, can be dataset filename)
    :type args: list
    """

    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_file)
    data.class_is_last()

    # generate train/test split of randomized data
    train, test = data.train_test_split(66.0, Random(1))

    # build classifier
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)
    print(cls)

    # evaluate
    evl = Evaluation(train)
    evl.test_model(cls, test)
    print(evl.summary())
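
    # A possible variant (sketch): evaluate J48 with 10-fold cross-validation
    # on the full, randomized dataset instead of the single split above.
    cv = Evaluation(data)
    cv.crossvalidate_model(Classifier(classname="weka.classifiers.trees.J48"), data, 10, Random(1))
    print(cv.summary())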
def main(args):
    """
    Trains a NaiveBayesUpdateable classifier incrementally on a dataset. The dataset can be supplied as parameter.
    :param args: the commandline arguments
    :type args: list
    """

    # load a dataset
    if len(args) <= 1:
        data_file = helper.get_data_dir() + os.sep + "vote.arff"
    else:
        data_file = args[1]
    helper.print_info("Loading dataset: " + data_file)
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_file, incremental=True)
    data.class_is_last()

    # classifier
    nb = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    nb.build_classifier(data)

    # train incrementally
    for inst in loader:
        nb.update_classifier(inst)

    print(nb)
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # train classifier
    classifier = Classifier("weka.classifiers.trees.J48")
    classifier.build_classifier(iris_data)

    # save and read object
    helper.print_title("I/O: single object")
    outfile = tempfile.gettempdir() + os.sep + "j48.model"
    serialization.write(outfile, classifier)
    model = Classifier(jobject=serialization.read(outfile))
    print(model)

    # save classifier and dataset header (multiple objects)
    helper.print_title("I/O: single object")
    serialization.write_all(outfile, [classifier, Instances.template_instances(iris_data)])
    objects = serialization.read_all(outfile)
    for i, obj in enumerate(objects):
        helper.print_info("Object #" + str(i+1) + ":")
        if javabridge.get_env().is_instance_of(obj, javabridge.get_env().find_class("weka/core/Instances")):
            obj = Instances(jobject=obj)
        elif javabridge.get_env().is_instance_of(obj, javabridge.get_env().find_class("weka/classifiers/Classifier")):
            obj = Classifier(jobject=obj)
        print(obj)
Example #5
class python_weka(object):
    def __init__(self, input_x, input_y, labels):
        self.input_x = input_x
        self.input_y = input_y
        self.labels = labels

    def write_arff(self, filename, relation, train_or_predict, input_x, input_y=None):
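        # Writes an ARFF file: one "@attribute" line per entry of self.labels
        # (the loop stops once the running counter reaches len(self.labels)),
        # followed by the feature rows; input_y supplies the class value when
        # train_or_predict is 0, otherwise a dummy 0 is written.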
        f = open(filename, "w")
        f.write("@relation " + relation + "\n")
        for i in self.labels:
            train_or_predict += 1
            if train_or_predict == len(self.labels):
                break
            f.write("@attribute " + i + " " + self.labels[i] + "\n")
        f.write("\n")
        f.write("@data" + "\n")
        for i in range(len(input_x)):
            for j in input_x[i]:
                f.write(str(j) + "  ")
            if train_or_predict == 0:
                f.write(str(input_y[i]))
            else:
                f.write(str(0))
            f.write("\n")
        f.close()

    def train(self):
        filename = "train.arff"
        self.write_arff(filename, "train", 0, self.input_x, self.input_y)
        loader = Loader(classname="weka.core.converters.ArffLoader")
        data = loader.load_file(filename)
        data.class_is_last()
        self.cls = Classifier(classname="weka.classifiers.meta.Bagging", options=["-S", "5"])
        self.cls.build_classifier(data)
        os.remove(filename)

    def predict(self, test_data):
        filename = "test.arff"
        self.write_arff(filename, "test", 0, test_data)
        loader = Loader(classname="weka.core.converters.ArffLoader")
        data = loader.load_file(filename)
        data.class_is_last()
        # evl = Evaluation(data)
        # evl.evaluate_model(self.cls,data)
        # data.set_class_label(data.numAttributes() - 1)
        # data.setClassIndex(data.numAttributes() - 1)
        result = []
        for index, inst in enumerate(data):
            pred = self.cls.classify_instance(inst)
            dist = self.cls.distribution_for_instance(inst)
            result.append(dist[0])
            # print(str(index+1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))
            # print str(index+1) + 'dist:'+ str(dist)
        os.remove(filename)
        return result
def playback_speed_checker(inputFile, dirRef):
    
    TRAINING_ARFF = 'dataset_playback.arff'
    inputRef = ""

    # Start JVM (a single call suffices; the options can be combined)
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")
    
    # Find reference file
    for file in os.listdir(dirRef):
        if str(file).find(str(os.path.basename(inputFile))) != -1:
            inputRef = os.path.join(dirRef, file)
            break

    # Calculation distance
    (result, distance) = dtw_checker(inputFile, inputRef)

    # Loading data
    loader = Loader(classname="weka.core.converters.ArffLoader")    
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()                    # set class attribute

    # Train the classifier
    #cls = Classifier(classname="weka.classifiers.functions.SMO")
    cls = Classifier(classname="weka.classifiers.trees.J48", options = ["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Build the instance to classify (assumes the dataset has a single numeric
    # feature holding the DTW distance)
    speed_instance = Instance.create_instance([distance], classname='weka.core.DenseInstance', weight=1.0)
    speed_instance.dataset = data
    
    # Classify instance
    speed_flag = cls.classify_instance(speed_instance)
    
    if distance == 0:
        speed_class = 'nominal'
    else:
        # assumes class index 0 maps to 'down_speed' and index 1 to 'up_speed'
        speed_class = 'down_speed' if speed_flag == 0 else 'up_speed'
        
#    print os.path.basename(inputFile) + ' --- ' + speed_class
    
    # Stop JVM
    jvm.stop()    

    print "SPEED IS: " + speed_class

    return speed_class
Example #7
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    bodyfat_file = helper.get_data_dir() + os.sep + "bodyfat.arff"
    helper.print_info("Loading dataset: " + bodyfat_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bodyfat_data = loader.load_file(bodyfat_file)
    bodyfat_data.class_is_last()

    # build classifier and output the model
    helper.print_title("Building M5P")
    classifier = Classifier(classname="weka.classifiers.trees.M5P")
    classifier.build_classifier(bodyfat_data)
    print(classifier)
Example #8
 def train_model(self, training_data):
     model_weka = None
     if os.path.isfile(self.model_file):
         print('Model ' + self.name + ' already trained.')
     else:
         print('Starting to train model ' + self.name + '.')
         model_weka = Classifier(classname=self.classname, options=self.options)

         model_weka.build_classifier(data=training_data)
         serialization.write(filename=self.model_file, jobject=model_weka)
         print('Model ' + self.name + ' trained and saved.')
     if os.path.isfile(self.parameter_file):
         print('Parameters of the model ' + self.name + ' already saved.')
     else:
         if model_weka is None:
             model_weka = Classifier(jobject=serialization.read(self.model_file))
         save_file(file_name=self.parameter_file, content=str(model_weka))
         print('Parameters of the model ' + self.name + ' saved.')
def riaa_checker(inputFile):
    
    TRAINING_ARFF = r'C:\Users\ASUS\Desktop\IGNASI\SMC\Workspace\dataset_riaa.arff'

    # Start JVM (a single call suffices; the options can be combined)
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")

    # Calculation of bark bands information
    (absolute_bark, relative_bark, bark_ratios) = compute_bark_spectrum(inputFile)

    # Loading data
    loader = Loader(classname="weka.core.converters.ArffLoader")    
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()                    # set class attribute

    # Train the classifier
    cls = Classifier(classname="weka.classifiers.functions.SMO")
    #cls = Classifier(classname="weka.classifiers.trees.J48", options = ["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Build the instance to classify
    bark_instance = Instance.create_instance(bark_ratios, classname='weka.core.DenseInstance', weight=1.0)
    bark_instance.dataset = data
    
    # Classify instance
    riaa_flag = cls.classify_instance(bark_instance)
    
    if riaa_flag == 0:
        riaa_class = 'riaa_ok'
    else:
        riaa_class = 'riaa_ko'
        
#    print os.path.basename(inputFile) + ' --- ' + riaa_class
    
    # Stop JVM
    jvm.stop()   

    print "RIAA FILTERING?: " + riaa_class

    return riaa_class
 def getDecisionTree(self, inputPath):   
     #load arff  
     data = self.load_Arff(inputPath)  
         
     #classifier
     data.set_class_index(data.num_attributes() - 1)   # set class attribute
     classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
     
     classifier.build_classifier(data)
     
     
     classifierStr = str(classifier)
     for index in range(0,data.num_instances()):
         instance = data.get_instance(index)
         #print instance
         result = classifier.distribution_for_instance(instance)
         
         #print result
     graph = classifier.graph()
     return graph
Example #11
def classify(train, test, name="RF", tuning=False):
    jvm.start()

    if isinstance(train, list) and isinstance(test, list):
        train = weka_instance(train)
        trn_data = converters.load_any_file(train)
        test = weka_instance(test)
        tst_data = converters.load_any_file(test)

    elif os.path.isfile(train) and os.path.isfile(test):
        trn_data = converters.load_any_file(train)
        tst_data = converters.load_any_file(test)

    else:
        trn = csv_as_ndarray(train)
        tst = csv_as_ndarray(test)

        trn_data = converters.ndarray_to_instances(trn, relation="Train")
        tst_data = converters.ndarray_to_instances(tst, relation="Test")

    trn_data.class_is_last()
    tst_data.class_is_last()

    # t = time()
    if tuning:
        opt = tune(train)
    else:
        opt = default_opt
    # print("Time to tune: {} seconds".format(time() - t))

    cls = Classifier(classname=classifiers[name.lower()], options=opt)

    cls.build_classifier(trn_data)

    distr = [cls.distribution_for_instance(inst)[1] for inst in tst_data]
    preds = [cls.classify_instance(inst) for inst in tst_data]

    jvm.stop()

    return preds, distr
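
# Hypothetical usage of classify() above (file names are placeholders; the
# module-level 'classifiers' dict and 'default_opt' are assumed to be defined):
preds, distr = classify("train.arff", "test.arff", name="RF", tuning=False)
print(preds[:10], distr[:10])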
Example #12
def run_classifier(path, prot, sel, cols, prot_vals, beta):
        
    DIs = dict()
    jvm.start()

    for i in range(len(cols)-1):
        loader = Loader(classname="weka.core.converters.CSVLoader")
        data = loader.load_file(path)
    
        # remove selected attribute from the data
        # NOTE: options are ONE indexed, not ZERO indexed
        remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", \
                        options=["-R", str(sel[2]+1)])
        remove.inputformat(data)
        data = remove.filter(data)

        # if running for only one attribute, remove all others (except protected)
        if i > 0:
            for j in range(1, prot[2]+1):
                if i != j:
                    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", \
                                    options=["-R", ("1" if i>j else "2")])
                    remove.inputformat(data)
                    data = remove.filter(data)

        # set prot attribute as Class attribute
        data.class_is_last()
        
        # run classifier
        cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
        cls.build_classifier(data)
    
        # count the number of each combination
        pos_and_pred = float(0.0)
        pos_and_not_pred = float(0.0)
        neg_and_pred = float(0.0)
        neg_and_not_pred = float(0.0)
        for ind, inst in enumerate(data):
            if cls.classify_instance(inst):
                if prot_vals[ind] == prot[1]:
                    pos_and_pred += 1
                else:
                    neg_and_pred += 1
            else:
                if prot_vals[ind] == prot[1]:
                    pos_and_not_pred += 1
                else:
                    neg_and_not_pred += 1

        # calculate DI
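        # BER below is the balanced error rate averaged over the protected and
        # unprotected groups; DI rises toward 1 as BER approaches 0.5.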
        BER = ((pos_and_not_pred / (pos_and_pred + pos_and_not_pred)) + \
               (neg_and_pred / (neg_and_pred + neg_and_not_pred))) * 0.5
        if BER > 0.5:
            BER = 1 - BER
        DI = 1 - ((1 - 2 * BER) / (beta + 1 - 2 * BER))

        if i == 0: # consider changing this to a 'code word' instead of 'all'
            DIs["all"] = DI
        else:
            DIs[cols[i-1]] = DI

    jvm.stop()

    return DIs
Example #13
def random_forest(train_data):
    cls = Classifier(classname="weka.classifiers.trees.RandomForest")
    cls.build_classifier(train_data)
    return cls
Example #14
from weka.core.converters import Loader
from weka.core.classes import Random
from weka.classifiers import Classifier, Evaluation

jvm.start()

# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.class_is_last()

# determine baseline with ZeroR
zeror = Classifier(classname="weka.classifiers.rules.ZeroR")
zeror.build_classifier(data)
evl = Evaluation(data)
evl.test_model(zeror, data)
print("Baseline accuracy (ZeroR): %0.1f%%" % evl.percent_correct)

print("\nHoldout 10%...")
# use seed 1-10 and perform random split with 90%
perc = []
for i in range(1, 11):
    evl = Evaluation(data)
    evl.evaluate_train_test_split(
        Classifier(classname="weka.classifiers.trees.J48"), data, 90.0,
        Random(i))
    perc.append(round(evl.percent_correct, 1))
    print("Accuracy with seed %i: %0.1f%%" % (i, evl.percent_correct))
Example #15

jvm.start()
loader = Loader(classname="weka.core.converters.CSVLoader")
data = loader.load_file("data_train.csv")
data.class_is_last()

knn_classifier = Classifier(classname="weka.classifiers.lazy.IBk",
                            options=["-K", "3"])
lin_classifier = Classifier(
    classname="weka.classifiers.functions.LinearRegression",
    options=["-S", "0"])
svm_classifier = Classifier(classname="weka.classifiers.functions.SMOreg",
                            options=["-C", "1.0"])

knn_classifier.build_classifier(data)
lin_classifier.build_classifier(data)
svm_classifier.build_classifier(data)

classifiers = [knn_classifier, lin_classifier, svm_classifier]

print("###################### Classifiers ######################")
for classifier in classifiers:
    print("~~~~~~~~~~~~~~~~~~~")
    print(classifier)

classifier_names = [
    "KNN Classifier", "LinearRegression Classifier", "SVM Classifier"
]

documents = get_docs(sys.argv[1])
Example #16
jvm.start()

# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.class_is_last()

for classifier in ["weka.classifiers.bayes.NaiveBayes", "weka.classifiers.rules.ZeroR", "weka.classifiers.trees.J48"]:
    # train/test split 90% using classifier
    cls = Classifier(classname=classifier)
    evl = Evaluation(data)
    evl.evaluate_train_test_split(cls, data, 90.0, Random(1))
    print("\n" + classifier + " train/test split (90%):\n" + evl.summary())
    cls.build_classifier(data)
    print(classifier + " model:\n\n" + str(cls))

# calculate mean/stdev over 10 cross-validations
for classifier in [
    "weka.classifiers.meta.ClassificationViaRegression", "weka.classifiers.bayes.NaiveBayes",
        "weka.classifiers.rules.ZeroR", "weka.classifiers.trees.J48", "weka.classifiers.functions.Logistic"]:
    accuracy = []
    for i in range(1, 11):
        cls = Classifier(classname=classifier)
        evl = Evaluation(data)
        evl.crossvalidate_model(cls, data, 10, Random(i))
        accuracy.append(evl.percent_correct)
    nacc = numpy.array(accuracy)
    print("%s: %0.2f +/-%0.2f" % (classifier, numpy.mean(nacc), numpy.std(nacc)))
def bayes_net(train_data):
    train_data.class_is_last()
    cls = Classifier(classname="weka.classifiers.bayes.BayesNet")
    cls.build_classifier(train_data)
    return cls
def bagging(train_data):
    train_data.class_is_last()
    cls = Classifier(classname="weka.classifiers.meta.Bagging")
    cls.build_classifier(train_data)
    return cls
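
# A small usage sketch for the helper functions above (the ARFF path is a
# placeholder; Loader from weka.core.converters and a running JVM are assumed):
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file("train.arff")
print(bagging(data))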
Example #19
def classify(fileToClassify,
             fileToCompare,
             predictionYear=None,
             pastResultYears=None,
             prefix="NFL",
             classifierFunction=[
                 "LinearRegression",
                 ["-S", "0", "-R", "1.0E-8", "-num-decimal-places", "4"]
             ]):
    # Start Java VM
    jvm.start(max_heap_size="1024m")
    # Load CSV files into weka loader
    loader = Loader(classname="weka.core.converters.CSVLoader")
    fileToClassifyData = loader.load_file(fileToClassify)
    fileToClassifyData.class_is_last()
    fileToCompareData = loader.load_file(fileToCompare)
    fileToCompareData.class_is_last()
    predictionYear = "".join(map(str, predictionYear))
    pastResultYears = "-".join(map(str, pastResultYears))

    # Generate Classifier based on data
    classifier = Classifier(classname="weka.classifiers.functions.{}".format(
        classifierFunction[0]),
                            options=classifierFunction[1])
    classifier.build_classifier(fileToClassifyData)
    print(classifier)
    # Var builder for graph
    count = 0.0
    countPred = 0.0
    graphDetails = [
        ['TITLE'],
        [
            '{1} Data Ratings (Official) {0}'.format(pastResultYears, prefix),
            [], []
        ],
        [
            '{1} Data Ratings (Predicted) {0}'.format(predictionYear, prefix),
            [], []
        ]
    ]

    # Time to predict results based on classifier
    for index, inst in enumerate(fileToCompareData):
        pred = classifier.classify_instance(inst)
        temp = list(enumerate(inst))[-1][1]
        countPred += pred
        count += temp
        # index=list(enumerate(inst))[3+1][1]
        index += 1
        print('YOLO', list(enumerate(inst))[3][1])
        print("{0:.3f} accurate compared to results.".format(countPred /
                                                             count))

        dist = classifier.distribution_for_instance(inst)
        # NFL Results
        graphDetails[1][1].append(index)
        graphDetails[1][2].append(temp)

        # Predicted Results
        graphDetails[2][1].append(index)
        graphDetails[2][2].append(pred)
        print(
            str(index + 1) + ": label index=" + str(pred) +
            ", class distribution=" + str(dist) + " , original: " + str(temp))
    graphDetails[0][
        0] = 'Player Rating Predictions For {0} ({1:.3f} Accurate)'.format(
            predictionYear, 100 - (countPred / count))
    jvm.stop()
    BuildGraph(graphDetails)
Example #20
def classifyTest(fileToClassify,
                 fileToCompare,
                 predictionYear=None,
                 pastResultYears=None,
                 classifier=None):
    # Start Java VM
    jvm.start(max_heap_size="1024m")
    # Load CSV files into weka loader
    loader = Loader(classname="weka.core.converters.CSVLoader")
    fileToClassifyData = loader.load_file(fileToClassify)
    fileToClassifyData.class_is_last()
    fileToCompareData = loader.load_file(fileToCompare)
    fileToCompareData.class_is_last()

    # Generate Classifier based on data
    classifier = Classifier(
        classname="weka.classifiers.functions.MultilayerPerceptron",
        options=[
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "a"
        ])
    classifier.build_classifier(fileToClassifyData)
    print(classifier)
    # Var builder for graph
    count = 0.0
    countPred = 0.0
    graphDetails = [
        ['TITLE'],
        ['NFL Data Ratings (Official) {0}'.format(pastResultYears), [], []],
        ['NFL Data Ratings (Predicted) {0}'.format(predictionYear), [], []]
    ]

    # Time to predict results based on classifier
    for index, inst in enumerate(fileToCompareData):
        pred = classifier.classify_instance(inst)
        temp = list(enumerate(inst))[-1][1]
        countPred += pred
        count += temp
        # index=list(enumerate(inst))[3+1][1]
        index += 1
        print('YOLO', list(enumerate(inst))[3][1])
        print("{0:.3f} accurate compared to results.".format(countPred /
                                                             count))

        dist = classifier.distribution_for_instance(inst)
        # NFL Results
        graphDetails[1][1].append(index)
        graphDetails[1][2].append(temp)

        # Predicted Results
        graphDetails[2][1].append(index)
        graphDetails[2][2].append(pred)
        print(
            str(index + 1) + ": label index=" + str(pred) +
            ", class distribution=" + str(dist) + " , original: " + str(temp))
    graphDetails[0][
        0] = 'Player Rating Predictions For {0} ({1:.3f} Accurate)'.format(
            predictionYear, 100 - (countPred / count))
    jvm.stop()
    print(graphDetails)
    BuildGraph(graphDetails)
Example #21
    smote_test_data.class_is_first()

    # load logistic model tree algorithm
    log_tree = Classifier(classname="weka.classifiers.trees.LMT")
    eval_smote_test_obj = Evaluation(smote_test_data)
    eval_smote_test_obj.crossvalidate_model(classifier=log_tree,
                                            data=smote_test_data,
                                            num_folds=5,
                                            rnd=Random(1))
    print("SMOTE Test CV (5-folds) Error = %.2f%%" %
          (eval_smote_test_obj.percent_incorrect))
    print(eval_smote_test_obj.matrix())
    print("=================\"Summary\"====================")
    print(eval_smote_test_obj.summary())

    log_tree.build_classifier(smote_test_data)
    y_predict = eval_smote_test_obj.test_model(log_tree, smote_test_data)

    y_test = to_binary_numeric(y_test.head(500), classNeg="neg")

    falsePositiveRate, truePositiveRate, thresholds = roc_curve(
        y_test, y_predict)
    # compute Area Under the Curve (AUC) using the trapezoidal rule
    area = auc(falsePositiveRate, truePositiveRate)

    plt.plot(falsePositiveRate,
             truePositiveRate,
             color='red',
             label='ROC = ' + str(area))
    plt.plot([0, 1], [0, 1], linestyle='dotted')
    plt.xlabel('False Positive Rate')
Example #22
    def predict(self, to_test_file: str, trained_file: str):
        if to_test_file == "" or trained_file == "":
            raise Exception ("Please fill all the fields")
        # if
        if not(os.path.isfile(to_test_file) or os.path.isfile(trained_file)):
            raise Exception ("The file to test and the trained file must be paths to existing files")
        # if
        if not (to_test_file.endswith(".arff") and trained_file.endswith(".arff")):
            raise Exception("The file to test and the trained one must be ARFF files")
        # if

        # Checking files headers, they must be the same to do predictions
        trained_header = ""
        to_test_header = ""
        try:
            with open(trained_file, "r") as tf:
                read_file = True
                while read_file:
                    line = tf.readline()
                    if not "@data" in line:
                        trained_header += line
                    else:
                        read_file = False
                    # if
                # while
            # with

            with open(to_test_file, "r") as tt:
                read_file = True
                while read_file:
                    line = tt.readline()
                    if not "@data" in line:
                        to_test_header += line
                    else:
                        read_file = False
                    # if
                # while
            # with
        except Exception:
            raise Exception("Error opening the reference arff file")
        # except

        if not trained_header == to_test_header:
            raise Exception("Files header must be the same")
        # if

        # Loading trained and test data from arff files
        loader = Loader(classname="weka.core.converters.ArffLoader")
        trained_data = loader.load_file(trained_file)
        trained_data.class_index = trained_data.num_attributes - 1
        to_test_data = loader.load_file(to_test_file)
        to_test_data.class_is_last()

        # Building classifier from trained data
        cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"])
        cls.build_classifier(trained_data)

        # Evaluating predictions
        for i, inst in enumerate(to_test_data):
            pred = cls.classify_instance(inst)
            # dist = cls.distribution_for_instance(inst)
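            # store the 1-based instance number and the 1-based predicted label index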
            row = [int(i+1), int(pred+1)]
            self._predicted.append(row)
Example #23
# output arff files
processDataToArff("train.arff", False)
processDataToArff("test.arff", True)

# setup training model
loader = Loader(classname="weka.core.converters.ArffLoader")
train = loader.load_file("train.arff")
train.class_is_last()
test = loader.load_file("test.arff")
test.class_is_last()
# print(train)

cls = Classifier(
    classname="weka.classifiers.trees.LMT")  #use LMT as our algorithm
cls.build_classifier(train)  #train the model using train.arff

pout = PredictionOutput(
    classname="weka.classifiers.evaluation.output.prediction.PlainText")
evl = Evaluation(train)
evl.test_model(cls, test, pout)

# print the result
result = pout.buffer_content()
#print(result)

# split the result and only print the gesture
resultLines = result.splitlines()
for i in range(len(resultLines)):
    if (resultLines[i].find("upDown") != -1):
        print("%d upDown" % (i + 1))
Example #24
# %%
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(y)
y = le.transform(y)

# %%
nominaldata = nominal.filter(dataset)
nominaldata.class_is_last()

# %%
from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random
cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
cls.build_classifier(nominaldata)

print(cls)

import weka.plot.graph as graph  # NB: pygraphviz and PIL are required
graph.plot_dot_graph(cls.graph)

evaluation = Evaluation(nominaldata)
evaluation.crossvalidate_model(cls, nominaldata, 10, Random(42))  # 10-fold CV
print(evaluation.summary())
print("pctCorrect: " + str(evaluation.percent_correct))
print("incorrect: " + str(evaluation.incorrect))

# %%
jvm.stop()

# %%
Example #25
class SklearnWekaWrapper(object):

	def __init__(self, class_name, options=None):

		if options is not None:
			self._classifier = Classifier(classname=class_name, options=[
									  option for option in options.split()])
		else:
			self._classifier = Classifier(classname=class_name)

	def fit(self, training_set, ground_through):

		self.ground_through = ground_through

		training_set = self._sklearn2weka(training_set, self.ground_through)
		training_set.class_is_last()

		self._classifier.build_classifier(training_set)

	def predict(self, testing_set):

		testing_set = self._sklearn2weka(testing_set, self.ground_through)
		testing_set.class_is_last()

		preds = []
		for index, inst in enumerate(testing_set):
			pred = self._classifier.classify_instance(inst)
			preds.append(pred)

		preds = np.vectorize(self._dict.get)(preds)

		return np.array(preds)

	def predict_proba(self, testing_set):

		testing_set = self._sklearn2weka(testing_set, self.ground_through)
		testing_set.class_is_last()

		dists = []
		for index, inst in enumerate(testing_set):
			dist = self._classifier.distribution_for_instance(inst)
			dists.append(dist)

		return np.array(dists)

	def _sklearn2weka(self, features, labels=None):

		encoder = CategoricalEncoder(encoding='ordinal')
		labels_nominal = encoder.fit_transform(np.array(labels).reshape(-1, 1))

		if not hasattr(self, 'dict') and labels is not None:

			dict = {}

			for label, nominal in zip(labels, labels_nominal):
				if nominal.item(0) not in dict:
					dict[nominal.item(0)] = label

			self._dict = dict

		labels_column = np.reshape(labels_nominal,[labels_nominal.shape[0], 1])

		weka_dataset = ndarray_to_instances(np.ascontiguousarray(features, dtype=np.float_), 'weka_dataset')
		weka_dataset.insert_attribute(Attribute.create_nominal('tag', [str(float(i)) for i in range(len(self._dict))]), features.shape[1])

		if labels is not None:
			for index, inst in enumerate(weka_dataset):
				inst.set_value(features.shape[1], labels_column[index])
				weka_dataset.set_instance(index,inst)

		return weka_dataset
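
# A hypothetical usage of the wrapper above (labels are mapped to a nominal
# 'tag' attribute internally; numpy as np and a running JVM are assumed):
X = np.array([[0.1, 1.2], [0.3, 3.4], [2.2, 0.5]])
y = np.array(['a', 'b', 'a'])
clf = SklearnWekaWrapper('weka.classifiers.trees.RandomForest')
clf.fit(X, y)
print(clf.predict(X))
print(clf.predict_proba(X))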
Example #26
def ClassifyParam(website, mode, binWidths, truncation_modes=["full", "truncated"]):
	if not os.path.exists("classificationResults"):
		os.makedirs("classificationResults")

	
	if("normal" in mode):
		for truncation in truncation_modes:
			file = open("classificationResults/SingleWebsite_%s_%s.csv"%(truncation, website),"w")
			file.write("BinWidth, Accuracy, FalsePositiveRate, FalseNegativeRate\n")

			for binWidth in binWidths:

				train_set_file = "TrainSet_%s_%s.arff"%(truncation, binWidth)
				train_set = "Data/%s/arff/%s"%(website, train_set_file)
				test_set = "Data/%s/arff/%s"%(website, train_set_file.replace("TrainSet", "TestSet"))

				print "Loading Datasets..."
				print "Train: " + train_set
				train_data = converters.load_any_file(train_set)
				print "Test: " + test_set
				test_data = converters.load_any_file(test_set)
				
				#Set class attribute
				train_data.class_is_last()
				test_data.class_is_last()
				print "Dataset Loaded!"


				classifier_name = "weka.classifiers.meta.FilteredClassifier"

				classifier = Classifier(classname=classifier_name, options=[
					"-F", "weka.filters.unsupervised.attribute.StringToWordVector -R first-last -W 1000 -C -T -N 1 -stemmer weka.core.stemmers.NullStemmer -M 1 -tokenizer \"weka.core.tokenizers.WordTokenizer -delimiters \\\" \\\\r\\\\n\\\\t.,;:\\\\\\\'\\\\\\\"()?!\\\"\"",
					"-W", "weka.classifiers.bayes.NaiveBayesMultinomial"])

				start_train = time.time()
				classifier.build_classifier(train_data)
				end_train = time.time()
				print "Train\t%s\t%s"%(binWidth, end_train-start_train)

				for index, inst in enumerate(test_data):
					if(index == 0):
						start_sample = time.time()
						classifier.classify_instance(inst)
						end_sample = time.time()
						print "Sample\t%s\t%s"%(binWidth, end_sample-start_sample)

				print "Evaluating w/ Multinomial Naive Bayes classifier. BinWidth = %s"%(binWidth)
				evaluation = Evaluation(test_data)
				start_batch = time.time()
				evaluation.test_model(classifier, test_data)
				end_batch = time.time()
				print "Batch\t%s\t%s"%(binWidth,end_batch-start_batch)
				

				print evaluation.summary()
				print evaluation.matrix()
				#Just as an example, we're measuring the fpr and fnr of the website indexed as class 1

				tp = evaluation.num_true_positives(1)
				tn = evaluation.num_true_negatives(1)
				fp = evaluation.num_false_positives(1)
				fn = evaluation.num_false_negatives(1)

				acc = (tp+tn)/float(tp+tn+fp+fn)
				fpr = evaluation.false_positive_rate(1)
				fnr = evaluation.false_negative_rate(1)
				
				print "Accuracy: %s"%(acc)
				print "False Positive Rate: %s"%(fpr)
				print "False Negative Rate: %s"%(fnr)

				file.write("%s, %s, %s, %s\n"%(binWidth, acc, fpr, fnr))
			file.close()
Example #27
class SklearnWekaWrapper(object):

	def __init__(self, classifier_name):

		if classifier_name == 'wrf':
			class_name='weka.classifiers.trees.RandomForest'
			options=None
		elif classifier_name == 'wj48':
			class_name='weka.classifiers.trees.J48'
			options=None
		elif classifier_name == 'wnb':
			class_name='weka.classifiers.bayes.NaiveBayes'
			options='-D'
		elif classifier_name == 'wbn':
			class_name='weka.classifiers.bayes.BayesNet'
			options='-D -Q weka.classifiers.bayes.net.search.local.TAN -- -S BAYES -E weka.classifiers.bayes.net.estimate.SimpleEstimator -- -A 0.5'

		if options is not None:
			self._classifier = Classifier(classname=class_name, options=[option for option in options.split()])
		else:
			self._classifier = Classifier(classname=class_name)

	def fit(self, training_set, ground_truth):

		self.ground_truth = ground_truth

		training_set = self._sklearn2weka(training_set, self.ground_truth)
		training_set.class_is_last()

		self._classifier.build_classifier(training_set)

	def predict(self, testing_set):

		testing_set = self._sklearn2weka(testing_set, self.ground_truth)
		testing_set.class_is_last()

		preds = []
		for index, inst in enumerate(testing_set):
			pred = self._classifier.classify_instance(inst)
			preds.append(pred)

		preds = np.vectorize(self._dict.get)(preds)

		return np.array(preds)

	def predict_proba(self, testing_set):

		testing_set = self._sklearn2weka(testing_set, self.ground_truth)
		testing_set.class_is_last()

		dists = []
		for index, inst in enumerate(testing_set):
			dist = self._classifier.distribution_for_instance(inst)
			dists.append(dist)

		return np.array(dists)

	def set_oracle(self, oracle):

		pass

	def _sklearn2weka(self, features, labels=None):

		features_encoder = OrdinalEncoder()
		labels_nominal = features_encoder.fit_transform(np.array(labels).reshape(-1, 1))

		if not hasattr(self, 'dict') and labels is not None:

			dict = {}

			for label, nominal in zip(labels, labels_nominal):
				if nominal.item(0) not in dict:
					dict[nominal.item(0)] = label

			self._dict = dict

		labels_column = np.reshape(labels_nominal,[labels_nominal.shape[0], 1])

		weka_dataset = ndarray_to_instances(np.ascontiguousarray(features, dtype=np.float_), 'weka_dataset')
		weka_dataset.insert_attribute(Attribute.create_nominal('tag', [str(float(i)) for i in range(len(self._dict))]), features.shape[1])

		if labels is not None:
			for index, inst in enumerate(weka_dataset):
				inst.set_value(features.shape[1], labels_column[index])
				weka_dataset.set_instance(index,inst)

		return weka_dataset
Example #28
class Experiment:
	data = None
	class_index = -1
	classifier = None
	attrs = []

	def __init__(self):
#		jvm.start(max_heap_size="2500M")
		pass

	def out(self, x):
		print(str(x).encode('ascii', 'ignore').decode('ascii'))

	def loadCSV(self, filename, path='/home/sbiastoch/Schreibtisch/csv_files/'):
		weka_loader = Loader(classname="weka.core.converters.CSVLoader")
		self.data = weka_loader.load_file(path+filename)

	def setClassIndex(self, index):
		if index < 0:
			self.data.class_index = self.data.num_attributes + index
		else:
			self.data.class_index = index

	def train_J48(self, min_per_rule=20):
		params = [
			'-C','0.3',
			'-M',str(min_per_rule),
	#		'-N',str(folds),
	#		'-R',
		]
		self.base_classifier = Classifier(classname='weka.classifiers.trees.J48', options=params)
		self._train()

	def train_JRip(self, min_per_rule=20, optimizations=2, folds=3, seed=42):
		params = [
			'-F', str(folds), # folds
			'-N', str(min_per_rule), # min elements per rule
			'-O', str(optimizations), # optimizations
			'-S', str(seed) #seed
		] 
		self.base_classifier = Classifier(classname='weka.classifiers.rules.JRip', options=params)
		self._train()

	def _train(self):
		params = [
			'-F','weka.filters.unsupervised.attribute.RemoveByName -E ^('+'|'.join(self.attrs)+')$ -V',
			'-W', self.base_classifier.classname, '--',
			]
		params.extend(self.base_classifier.options)


#		self.classifier = Classifier(classname='weka.classifiers.meta.FilteredClassifier', options=params)
		self.classifier = FilteredClassifier(options=params)
	#	self.classifier.filter(Filter(classname="weka.filters.unsupervised.attribute.RemoveByName", options=['-E','^('+'|'.join(self.attrs)+')$','-V']))
		self.classifier.build_classifier(self.data)
		self.out(self.classifier.__str__().encode('ascii', 'ignore').split("\n")[-2])

	def test(self, folds = 10):
		evaluation = Evaluation(self.data)                     # initialize with priors
		evaluation.crossvalidate_model(self.classifier, self.data, folds, Random(42))  # 10-fold CV
		print('Total number of instances: '+str(evaluation.num_instances)+'.')
		print(str(round(evaluation.percent_correct,2))+'% / '+str(round(evaluation.correct, 2))+' correct.')
		print(str(round(evaluation.percent_incorrect,2))+'% / '+str(round(evaluation.incorrect, 2))+' incorrect.')
		
	def saveCSV(self, filename, path='/home/sbiastoch/Schreibtisch/csv_files/'):
		saver = Saver(classname="weka.core.converters.CSVSaver")
		saver.save_file(self.data, path+filename)

	def loadClassifier(self, filename, path='/home/sbiastoch/Schreibtisch/classifiers/'):
		objects = serialization.read_all(path+filename)
		self.classifier = Classifier(jobject=objects[0])
		#self.data = Instances(jobject=objects[1])

	def saveClassifier(self, filename, path='/home/sbiastoch/Schreibtisch/classifiers/'):
		serialization.write_all(path+filename, [self.classifier, Instances.template_instances(self.data)])


	def remove_correct_classified(self, invert = False):
		options=[
			'-W', self.classifier.to_commandline(), 
			'-C', str(self.class_index), #classindex
	#		'-F','0', # folds
	#		'-T','0.1', #threshold by numeric classes
			'-I','0', # max iterations
			'-V' if not invert else '' 
		] # invert
		classname = "weka.filters.unsupervised.instance.RemoveMisclassified"
		remove = Filter(classname=classname, options=options)
		remove.inputformat(self.data)
		self.data = remove.filter(self.data)

	def remove_incorrect_classified(self):
		self.remove_correct_classified(True)

	def set_attributes(self, attrs):
		self.attrs = attrs

	def select_missclassified(self):
		remove = Filter(classname="weka.filters.supervised.attribute.AddClassification", options=['-classification' ,'-error' ,'-W' ,self.base_classifier.to_commandline()])
		remove.inputformat(self.data)
		self.data = remove.filter(self.data)

		remove = Filter(classname="weka.filters.unsupervised.instance.RemoveWithValues", options=['-S','0.0','-C','last','-L','last','-V'])
		remove.inputformat(self.data)

		remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=['-R',str(self.data.num_attributes-2)+',last'])
		remove.inputformat(self.data)
		self.data = remove.filter(self.data)

	def merge_nominal_attributes(self, significance=0.01):
		remove = Filter(classname="weka.filters.supervised.attribute.MergeNominalValues", options=['-L',str(significance),'-R','first-last'])
		remove.inputformat(self.data)
		self.data = remove.filter(self.data)
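
# A hypothetical session with the Experiment helper above (file name and
# attribute names are placeholders; the CSV is looked up under the default path):
exp = Experiment()
exp.loadCSV("mydata.csv")
exp.setClassIndex(-1)
exp.set_attributes(["attr1", "attr2"])
exp.train_J48(min_per_rule=10)
exp.test(folds=10)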
Example #29
def evaluate_j48(datasets_path, intermediary_path):
    # for examples on how to use this function, refer to
    # http://pythonhosted.org/python-weka-wrapper/examples.html#build-classifier-on-dataset-output-predictions
    import weka.core.jvm as jvm
    from weka.core.converters import Loader
    from weka.classifiers import Classifier
    from sklearn.metrics import precision_score, accuracy_score, f1_score

    from networkx.drawing.nx_agraph import graphviz_layout

    jvm.start()

    json_results = {
        'runs': {
            '1': dict()
        }
    }

    try:
        for dataset in os.listdir(datasets_path):
            dataset_name = dataset.split('.')[0]

            json_results['runs']['1'][dataset_name] = dict()

            loader = Loader(classname="weka.core.converters.ArffLoader")

            y_pred_all = []
            y_true_all = []
            heights = []
            n_nodes = []

            for n_fold in it.count():
                try:
                    train_s = loader.load_file(
                        os.path.join(intermediary_path, '%s_fold_%d_train.arff' % (dataset_name, n_fold)))
                    val_s = loader.load_file(
                        os.path.join(intermediary_path, '%s_fold_%d_val.arff' % (dataset_name, n_fold)))
                    test_s = loader.load_file(
                        os.path.join(intermediary_path, '%s_fold_%d_test.arff' % (dataset_name, n_fold)))

                    train_s.relationname = dataset_name
                    val_s.relationname = dataset_name
                    test_s.relationname = dataset_name

                    train_s.class_is_last()
                    val_s.class_is_last()
                    test_s.class_is_last()

                    warnings.warn('WARNING: appending validation set in training set.')
                    for inst in val_s:
                        train_s.add_instance(inst)

                    cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"])
                    # cls = Classifier(classname="weka.classifiers.trees.REPTree",
                    # options=["-M", "2", "-V", "0.001", "-N", "3", "-S", "1", "-L", "-1", "-I", "0.0"])
                    cls.build_classifier(train_s)

                    warnings.warn('WARNING: will only work for binary splits!')
                    graph = cls.graph.encode('ascii')
                    out = StringIO.StringIO(graph)
                    G = nx.Graph(nx.nx_pydot.read_dot(out))

                    # TODO plotting!
                    # fig = plt.figure(figsize=(40, 30))
                    # pos = graphviz_layout(G, root='N0', prog='dot')
                    #
                    # edgelist = G.edges(data=True)
                    # nodelist = G.nodes(data=True)
                    #
                    # edge_labels = {(x1, x2): v['label'] for x1, x2, v in edgelist}
                    # node_colors = {node_id: ('#98FB98' if 'shape' in _dict else '#0099FF') for node_id, _dict in nodelist}
                    # node_colors['N0'] = '#FFFFFF'
                    # node_colors = node_colors.values()
                    #
                    # nx.draw_networkx_nodes(G, pos, node_color=node_colors)
                    # nx.draw_networkx_edges(G, pos, style='dashed', arrows=False)
                    # nx.draw_networkx_labels(G, pos, {k: v['label'] for k, v in G.node.iteritems()})
                    # nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
                    # plt.axis('off')
                    # plt.show()
                    # exit(0)
                    # TODO plotting!

                    heights += [max(map(len, nx.shortest_path(G, source='N0').itervalues()))]
                    n_nodes += [len(G.node)]

                    y_test_true = []
                    y_test_pred = []

                    # y_train_true = []
                    # y_train_pred = []

                    # for index, inst in enumerate(train_s):
                    #     y_train_true += [inst.get_value(inst.class_index)]
                    #     y_train_pred += [cls.classify_instance(inst)]

                    for index, inst in enumerate(test_s):
                        y_test_true += [inst.get_value(inst.class_index)]
                        y_test_pred += [cls.classify_instance(inst)]

                    y_true_all += y_test_true
                    y_pred_all += y_test_pred

                except Exception as e:
                    break

            json_results['runs']['1'][dataset_name] = {
                'confusion_matrix': confusion_matrix(y_true_all, y_pred_all).tolist(),
                'height': heights,
                'n_nodes': n_nodes,
            }

        # interprets
        json_results = json.load(open('/home/henry/Desktop/j48/j48_results.json', 'r'))

        n_runs = len(json_results['runs'].keys())
        some_run = json_results['runs'].keys()[0]
        n_datasets = len(json_results['runs'][some_run].keys())

        df = pd.DataFrame(
            columns=['run', 'dataset', 'test_acc', 'height mean', 'height std', 'n_nodes mean', 'n_nodes std'],
            index=np.arange(n_runs * n_datasets),
            dtype=np.float32
        )

        df['dataset'] = df['dataset'].astype(np.object)

        count_row = 0
        for n_run, run in json_results['runs'].iteritems():
            for dataset_name, dataset in run.iteritems():
                conf_matrix = np.array(dataset['confusion_matrix'], dtype=np.float32)

                test_acc = np.diag(conf_matrix).sum() / conf_matrix.sum()

                height_mean = np.mean(dataset['height'])
                height_std = np.std(dataset['height'])
                n_nodes_mean = np.mean(dataset['n_nodes'])
                n_nodes_std = np.std(dataset['n_nodes'])

                df.loc[count_row] = [
                    int(n_run), str(dataset_name), float(test_acc),
                    float(height_mean), float(height_std), float(n_nodes_mean), float(n_nodes_std)
                ]
                count_row += 1

        print df
        json.dump(json_results, open('j48_results.json', 'w'), indent=2)
        df.to_csv('j48_results.csv', sep=',', quotechar='\"', index=False)

    finally:
        jvm.stop()
def rbfc(train_data):
    train_data.class_is_last()
    cls = Classifier(classname="weka.classifiers.functions.RBFClassifier")
    cls.build_classifier(train_data)
    return cls
Example #31
            
            fileOut.write("##################### y =>  " + str(y)+"\n");
            print("##################### y =>  " + str(y)+"\n");

            # f1 = loader.load_file(PathToData + dataset + "/train.arff")
            train.class_is_last()
            test.class_is_last()
            # f1.class_is_last()

            labledDataSet, UnlabledDataSet = splitTrainSet(train)

            
            tree = Classifier(classname="weka.classifiers.trees.J48", options=["-A"])
            # tree = Classifier(classname="weka.classifiers.trees.J48")

            tree.build_classifier(labledDataSet)
            
            eval = Evaluation(labledDataSet)
            eval.test_model(tree, test)
            
            fileOut.write("Labeled data======== " + str((1.0 - eval.error_rate )* 100) + " number of instances== " + str(labledDataSet.num_instances) + "\n")
            
            Newtrainpool = LabeledUnlabeldata(labledDataSet, UnlabledDataSet, tree, y )
            # Newtrainpool = LabeledUnlabeldata(labledDataSet, UnlabledDataSet, tree, y , cal_method=Method)

            fileOut.write("\n\nLabeled data======== " + str((1.0 - eval.error_rate )* 100) + " number of instances== " + str(labledDataSet.num_instances) + "\n")

            fileOut.write("           Decision Tree                       \n")
            fileOut.write("\n      precision   recall     areaUnderROC            \n\n")

            for i in range(test.get_instance(0).num_classes) :
def adaboost(train_data):
    train_data.class_is_last()
    cls = Classifier(classname="weka.classifiers.meta.AdaBoostM1")
    cls.build_classifier(train_data)
    return cls
Example #33
    def runner(self, cdat, heap_size = 16384, seed = None, verbose = True):
        self.set_status(Pipeline.RUNNING)

        self.logs.append('Initializing Pipeline')

        para = self.config

        self.logs.append('Reading Pipeline Configuration')

        head = ''
        name = get_rand_uuid_str()

        self.logs.append('Reading Input File')

        for i, stage in enumerate(self.stages):
            if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
                self.stages[i].status = Pipeline.RUNNING
            if stage.code ==  'dat.fle':
                head    = os.path.abspath(stage.value.path)
                name, _ = os.path.splitext(stage.value.name)

        self.logs.append('Parsing to ARFF')

        path = os.path.join(head, '{name}.arff'.format(name = name))
        # There is a bug here (cause unknown), so Config.schema is used instead.
        # cdat.toARFF(path, express_config = para.Preprocess.schema, verbose = verbose)

        for i, stage in enumerate(self.stages):
            if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
                self.stages[i].status = Pipeline.COMPLETE

        self.logs.append('Saved ARFF at {path}'.format(path = path))
        self.logs.append('Splitting to Training and Testing Sets')

        JVM.start(max_heap_size = '{size}m'.format(size = heap_size))

        load = Loader(classname = 'weka.core.converters.ArffLoader')
        # data = load.load_file(path)
        # save =  Saver(classname = 'weka.core.converters.ArffSaver')
        data = load.load_file(os.path.join(head, 'iris.arff')) # For Debugging Purposes Only
        data.class_is_last() # For Debugging Purposes Only
        # data.class_index = cdat.iclss

        for i, stage in enumerate(self.stages):
            if stage.code == 'prp.kcv':
                self.stages[i].status = Pipeline.RUNNING

        self.logs.append('Splitting Training Set')

        # TODO - Check if this seed is worth it.
        seed = assign_if_none(seed, random.randint(0, 1000))
        opts = ['-S', str(seed), '-N', str(para.Preprocess.FOLDS)]
        wobj = Filter(classname = 'weka.filters.supervised.instance.StratifiedRemoveFolds', options = opts + ['-V'])
        wobj.inputformat(data)

        tran = wobj.filter(data)

        self.logs.append('Splitting Testing Set')

        wobj.options = opts
        test = wobj.filter(data)

        for i, stage in enumerate(self.stages):
            if stage.code == 'prp.kcv':
                self.stages[i].status = Pipeline.COMPLETE

        self.logs.append('Performing Feature Selection')

        feat = [ ]
        for comb in para.FEATURE_SELECTION:
            if comb.USE:
                for i, stage in enumerate(self.stages):
                    if stage.code == 'ats':
                        search    = stage.value.search.name
                        evaluator = stage.value.evaluator.name

                        if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                            self.stages[i].status = Pipeline.RUNNING

                srch = ASSearch(classname = 'weka.attributeSelection.{classname}'.format(
                    classname = comb.Search.NAME,
                    options   = assign_if_none(comb.Search.OPTIONS, [ ])
                ))
                ewal = ASEvaluation(classname = 'weka.attributeSelection.{classname}'.format(
                    classname = comb.Evaluator.NAME,
                    options   = assign_if_none(comb.Evaluator.OPTIONS, [ ])
                ))

                attr = AttributeSelection()
                attr.search(srch)
                attr.evaluator(ewal)
                attr.select_attributes(tran)

                meta = addict.Dict()
                meta.search    = comb.Search.NAME
                meta.evaluator = comb.Evaluator.NAME
                meta.features  = [tran.attribute(index).name for index in attr.selected_attributes]

                feat.append(meta)

                for i, stage in enumerate(self.stages):
                    if stage.code == 'ats':
                        search    = stage.value.search.name
                        evaluator = stage.value.evaluator.name

                        if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                            self.stages[i].status = Pipeline.COMPLETE

        models = [ ]
        for model in para.MODEL:
            if model.USE:
                summary         = addict.Dict()

                self.logs.append('Modelling {model}'.format(model = model.LABEL))

                summary.label   = model.LABEL
                summary.name    = model.NAME
                summary.options = assign_if_none(model.OPTIONS, [ ])

                for i, stage in enumerate(self.stages):
                    if stage.code == 'lrn' and stage.value.name == model.NAME:
                        self.stages[i].status = Pipeline.RUNNING

                for i, instance in enumerate(data):
                    iclass = list(range(instance.num_classes))
                
                options    = assign_if_none(model.OPTIONS, [ ])
                classifier = Classifier(classname = 'weka.classifiers.{classname}'.format(classname = model.NAME), options = options)
                classifier.build_classifier(tran)
        
                serializer.write(os.path.join(head, '{name}.{classname}.model'.format(
                        name = name,
                    classname = model.NAME
                )), classifier)

                self.logs.append('Testing model {model}'.format(model = model.LABEL))

                evaluation       = Evaluation(tran)
                evaluation.test_model(classifier, test)

                summary.summary  = evaluation.summary()

                frame  = pd.DataFrame(data = evaluation.confusion_matrix)
                axes   = sns.heatmap(frame, cbar = False, annot = True)
                b64str = get_b64_plot(axes)
                
                summary.confusion_matrix = addict.Dict({
                    'value': evaluation.confusion_matrix.tolist(),
                     'plot': b64str
                })

                self.logs.append('Plotting Learning Curve for {model}'.format(model = model.LABEL))

                buffer = io.BytesIO()
                plot_classifier_errors(evaluation.predictions, tran, test, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.learning_curve   = b64str

                buffer = io.BytesIO()
                plot_roc(evaluation, class_index = iclass, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.roc_curve        = b64str

                buffer = io.BytesIO()
                plot_prc(evaluation, class_index = iclass, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.prc_curve        = b64str

                if classifier.graph:
                    summary.graph = classifier.graph

                for i, instance in enumerate(test):
                    prediction = classifier.classify_instance(instance)

                for i, stage in enumerate(self.stages):
                    if stage.code == 'lrn' and stage.value.name == model.NAME:
                        self.stages[i].status = Pipeline.COMPLETE

                models.append(summary)

        self.gist.models = models

        JVM.stop()

        JSON.write(os.path.join(head, '{name}.cgist'.format(name = name)), self.gist)

        self.logs.append('Pipeline Complete')

        self.set_status(Pipeline.COMPLETE)
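
# The helpers used above (assign_if_none, get_b64_plot, buffer_to_b64) are not
# shown in this snippet; a minimal sketch of what the two plotting helpers are
# assumed to do -- turn a matplotlib figure into a base64 string for embedding
# in the gist -- could look like this:
import base64
import io

def buffer_to_b64(buffer):
    # rewind the in-memory PNG written by the weka plotting functions and encode it
    buffer.seek(0)
    return base64.b64encode(buffer.read()).decode('ascii')

def get_b64_plot(axes):
    # save the seaborn/matplotlib axes to an in-memory buffer and encode it
    buffer = io.BytesIO()
    axes.get_figure().savefig(buffer, format='png')
    return buffer_to_b64(buffer)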
Example #34
                    from weka.classifiers import Evaluation
                    from weka.core.classes import Random
                    from weka.classifiers import Classifier
                    if classifier == 0:
                        for kernel in range(0, 2):
                            if kernel == 0:
                                mapper = Classifier(
                                    classname=
                                    "weka.classifiers.misc.InputMappedClassifier",
                                    options=[
                                        "-M", "-W",
                                        "weka.classifiers.bayes.NaiveBayes"
                                    ])
                                Class = 'NaiveBayes'
                                mapper.build_classifier(dataTrain)
                                evaluation = Evaluation(dataTrain)
                                evaluation.test_model(mapper, dataTest)
                                roc_NB.append(
                                    evaluation.area_under_roc(1) * 100)
                                recall_NB.append(evaluation.recall(1) * 100)
                                precision_NB.append(
                                    evaluation.precision(1) * 100)

                                mapper.build_classifier(dataLastTrain)
                                evaluation = Evaluation(dataLastTrain)
                                evaluation.test_model(mapper, dataLastTest)

                                roc_NB_Last.append(
                                    evaluation.area_under_roc(1) * 100)
                                recall_NB_Last.append(
Example #35
def train(request):

    jvm.start()

    d_att1 = Attribute.create_numeric("bodydearword.feature")
    d_att2 = Attribute.create_numeric("bodyform.feature")
    d_att3 = Attribute.create_numeric("bodyhtml.feature")
    d_att4 = Attribute.create_numeric("bodymultipart.feature")
    d_att5 = Attribute.create_numeric("bodynumchars.feature")
    d_att6 = Attribute.create_numeric("bodynumfunctionwords.feature")
    d_att7 = Attribute.create_numeric("bodynumuniqwords.feature")
    d_att8 = Attribute.create_numeric("bodynumwords.feature")
    d_att9 = Attribute.create_numeric("bodyrichness.feature")
    d_att10 = Attribute.create_numeric("bodysuspensionword.feature")
    d_att11 = Attribute.create_numeric("bodyverifyyouraccountphrase.feature")
    d_att12 = Attribute.create_numeric("externalsabinary.feature")
    d_att13 = Attribute.create_numeric("externalsascore.feature")
    d_att14 = Attribute.create_numeric("scriptjavascript.feature")
    d_att15 = Attribute.create_numeric("scriptonclick.feature")
    d_att16 = Attribute.create_numeric("scriptpopup.feature")
    d_att17 = Attribute.create_numeric("scriptstatuschange.feature")
    d_att18 = Attribute.create_numeric("scriptunmodalload.feature")
    d_att19 = Attribute.create_numeric("senddiffreplyto.feature")
    d_att20 = Attribute.create_numeric("sendnumwords.feature")
    d_att21 = Attribute.create_numeric("sendunmodaldomain.feature")
    d_att22 = Attribute.create_numeric("subjectbankword.feature")
    d_att23 = Attribute.create_numeric("subjectdebitword.feature")
    d_att24 = Attribute.create_numeric("subjectfwdword.feature")
    d_att25 = Attribute.create_numeric("subjectnumchars.feature")
    d_att26 = Attribute.create_numeric("subjectnumwords.feature")
    d_att27 = Attribute.create_numeric("subjectreplyword.feature")
    d_att28 = Attribute.create_numeric("subjectrichness.feature")
    d_att29 = Attribute.create_numeric("subjectverifyword.feature")
    d_att30 = Attribute.create_numeric("urlatchar.feature")
    d_att31 = Attribute.create_numeric("urlbaglink.feature")
    d_att32 = Attribute.create_numeric("urlip.feature")
    d_att33 = Attribute.create_numeric("urlnumdomains.feature")
    d_att34 = Attribute.create_numeric("urlnumexternallink.feature")
    d_att35 = Attribute.create_numeric("urlnumimagelink.feature")
    d_att36 = Attribute.create_numeric("urlnuminternallink.feature")
    d_att37 = Attribute.create_numeric("urlnumip.feature")
    d_att38 = Attribute.create_numeric("urlnumlink.feature")
    d_att39 = Attribute.create_numeric("urlnumperiods.feature")
    d_att40 = Attribute.create_numeric("urlnumport.feature")
    d_att41 = Attribute.create_numeric("urlport.feature")
    d_att42 = Attribute.create_numeric("urltwodoains.feature")
    d_att43 = Attribute.create_numeric("urlunmodalbaglink.feature")
    d_att44 = Attribute.create_numeric("urlwordclicklink.feature")
    d_att45 = Attribute.create_numeric("urlwordherelink.feature")
    d_att46 = Attribute.create_numeric("urlwordloginlink.feature")
    d_att47 = Attribute.create_numeric("urlwordupdatelink.feature")
    d_att48 = Attribute.create_nominal("class", ['phish', 'ham'])  # list (not set) keeps label order deterministic
    #
    data_dir = settings.BASE_DIR + "/phishing/public/datasets/"
    #
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(data_dir + "dataset.arff")
    data.class_is_last()
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.options = ["-C", "0.3"]
    cls.build_classifier(data)

    serialization.write(data_dir + "out.model", cls)
    classifier = Classifier(jobject=serialization.read(data_dir + "out.model"))

    dataset = Instances.create_instances("test", [
        d_att1, d_att2, d_att3, d_att4, d_att5, d_att6, d_att7, d_att8, d_att9,
        d_att10, d_att11, d_att12, d_att13, d_att14, d_att15, d_att16, d_att17,
        d_att18, d_att19, d_att20, d_att21, d_att22, d_att23, d_att24, d_att25,
        d_att26, d_att27, d_att28, d_att29, d_att30, d_att31, d_att32, d_att33,
        d_att34, d_att35, d_att36, d_att37, d_att38, d_att39, d_att40, d_att41,
        d_att42, d_att43, d_att44, d_att45, d_att46, d_att47, d_att48
    ], 0)
    values = [
        0, 0, 0, 0, 890, 1, 124, 198, 0.22247191011236, 0, 0, 0, 0.0, 0, 0, 0,
        0, 0, 1, 4, 0, 0, 0, 0, 21, 4, 1, 0.19047619047619, 0, 0, 0, 0, 2, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        Instance.missing_value()
    ]
    inst = Instance.create_instance(values)
    dataset.add_instance(inst)
    dataset.class_is_last()
    # print(str(dataset))
    var = ''
    for inst1 in dataset:
        pred = classifier.classify_instance(inst1)
        var = inst1.class_attribute.value(int(pred))
        if var == 'ham':
            print('Not phishing')
            # do something
        else:
            print('Phishing')
            # do something

        print(var)

    jvm.stop()

    return HttpResponse(str(var))
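
# (sketch) besides the predicted label, the class distribution would give a
# confidence estimate; assuming the same `classifier` and `dataset` as above:
#
#     for inst1 in dataset:
#         dist = classifier.distribution_for_instance(inst1)
#         print(dict(zip(inst1.class_attribute.values, dist.tolist())))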
Example #36
class P1:

    # Constructor - initializes empty data fields; optionally loads the data/ ARFF files
    def __init__(self, load):

        self.cls = None  # Reusable classifier slot
        self.evl = None
        self.data = []
        self.IBK = {}
        self.J48 = {}
        if load:
            self.loader = Loader(classname="weka.core.converters.ArffLoader")
            self.parseARFF("data")

    def run(self):

        self.run1(1)
        self.run2()

        # for i in self.J48:
        #     print "J48 - "+str(i)
        #     for j in self.J48[i]:
        #         print str(j)

        self.graph()

    ###########################################################################
    #                           GRAPHING METHODS                              #
    ###########################################################################

    def graph(self):

        if os.path.exists(os.getcwd() + '/graphs'):
            shutil.rmtree(os.getcwd() + '/graphs')  # remove any existing folder first
        os.makedirs(os.getcwd() + '/graphs')

        plt1 = pd.DataFrame({
            'features': list(map(str, range(14, 95, 10))),
            'J48acc': self.J48[1],
            'IBKacc': self.IBK[1]
        })
        plt2 = pd.DataFrame({
            'examples': list(map(int, range(50, 501, 50))),
            'xaxis': list(map(float, range(0, 10))),
            'J48std14': self.J48["14AVG"]['stdev'],
            'J48acc14': self.J48["14AVG"]['acc'],
            'IBKstd14': self.IBK["14AVG"]['stdev'],
            'IBKacc14': self.IBK["14AVG"]['acc'],
            'J48std54': self.J48["54AVG"]['stdev'],
            'J48acc54': self.J48["54AVG"]['acc'],
            'IBKstd54': self.IBK["54AVG"]['stdev'],
            'IBKacc54': self.IBK["54AVG"]['acc']
        })

        self.graph1(plt1)
        self.graph2(plt2)

    def graph1(self, plt1):
        # 1st graph: J48 accuracy over # of features
        ax = plt1[['features', 'J48acc']].plot(x='features',
                                               linestyle='-',
                                               marker='+')
        plt1[['features', 'J48acc']].plot(x='features', kind='bar', ax=ax)
        plt.ylim(0.7, 0.8)
        plt.title("J48 accuracy over features")
        plt.ylabel("Accuracy")
        plt.xlabel("# Features")
        plt.show()
        plt.savefig('graphs/J48.png')
        plt.gcf().clear()
        # 2nd graph: IBK accuracy over # of features
        ax = plt1[['features', 'IBKacc']].plot(x='features',
                                               linestyle='-',
                                               marker='+')
        plt1[['features', 'IBKacc']].plot(x='features', kind='bar', ax=ax)
        plt.ylim(0.4, 0.8)
        plt.title("IBK accuracy over features")
        plt.ylabel("Accuracy")
        plt.xlabel("# Features")
        plt.show()
        plt.savefig('graphs/IBK.png')
        plt.gcf().clear()

    def graph2(self, plt2):
        # Setting the positions and width for the bars
        pos = list(range(len(plt2['IBKacc14'])))
        width = 0.2

        # Plotting the bars
        fig, ax = plt.subplots(figsize=(10, 5))

        # Create a bar with pre_score data,
        # in position pos,
        plt.bar(pos,
                plt2['IBKacc14'],
                width,
                alpha=0.5,
                color='#EE3224',
                yerr=plt2['IBKstd14'],
                label="IBKacc14")
        plt.bar([p + width for p in pos],
                plt2['IBKacc54'],
                width,
                alpha=0.5,
                color='#F78F1E',
                yerr=plt2['IBKstd54'],
                label="IBKacc54")
        plt.bar([p + width * 2 for p in pos],
                plt2['J48acc14'],
                width,
                alpha=0.5,
                color='#FFC222',
                yerr=plt2['J48std14'],
                label="J48acc14")
        plt.bar([p + width * 3 for p in pos],
                plt2['J48acc54'],
                width,
                alpha=0.5,
                color='green',
                yerr=plt2['J48std54'],
                label="J48acc54")

        # Set the position of the x ticks
        ax.set_xticks([p + 1.5 * width for p in pos])

        # Set the labels for the x ticks
        ax.set_xticklabels(plt2['examples'])

        # Setting the x-axis and y-axis limits
        plt.xlim(min(pos) - width, max(pos) + width * 4)
        plt.ylim(0.0, 1.3)

        plt.title("J48 v IBK Performance Over Training Sample Size")
        plt.ylabel("Accuracy")
        plt.xlabel("# Examples")
        # Shrink current axis by 20%
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        # Put a legend to the right of the current axis
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        plt.show()
        plt.savefig('graphs/14_54.png')
        plt.gcf().clear()

    ###########################################################################
    #                    DATA-GENERATING METHODS                              #
    ###########################################################################

    def run1(self, num):

        self.gen1(num)  # assumes the data/ ARFF files were loaded in __init__
        self.data1 = deepcopy(self.data)

    def run2(self):

        self.IBK["14AVG"] = {'acc': [0.0] * 10, 'stdev': [0.0] * 10}
        self.IBK["54AVG"] = {'acc': [0.0] * 10, 'stdev': [0.0] * 10}
        self.J48["14AVG"] = {'acc': [0.0] * 10, 'stdev': [0.0] * 10}
        self.J48["54AVG"] = {'acc': [0.0] * 10, 'stdev': [0.0] * 10}

        # Loops through algorithm 10 times and generates avg, stdev along folds
        for i in range(0, 10):
            self.data = self.data1
            self.gen2()

            for i, v in enumerate(self.IBK[14]):
                self.IBK["14AVG"]['acc'][i] += v['acc']
                self.IBK["14AVG"]['stdev'][i] += v['stdev']
            for i, v in enumerate(self.IBK[54]):
                self.IBK["54AVG"]['acc'][i] += v['acc']
                self.IBK["54AVG"]['stdev'][i] += v['stdev']
            for i, v in enumerate(self.J48[14]):
                self.J48["14AVG"]['acc'][i] += v['acc']
                self.J48["14AVG"]['stdev'][i] += v['stdev']
            for i, v in enumerate(self.J48[54]):
                self.J48["54AVG"]['acc'][i] += v['acc']
                self.J48["54AVG"]['stdev'][i] += v['stdev']

        self.IBK["14AVG"]['acc'] = [(i / 10.0)
                                    for i in self.IBK["14AVG"]['acc']]
        self.IBK["14AVG"]['stdev'] = [(i / 10.0)
                                      for i in self.IBK["14AVG"]['stdev']]
        self.J48["14AVG"]['acc'] = [(i / 10.0)
                                    for i in self.J48["14AVG"]['acc']]
        self.J48["14AVG"]['stdev'] = [(i / 10.0)
                                      for i in self.J48["14AVG"]['stdev']]
        self.IBK["54AVG"]['acc'] = [(i / 10.0)
                                    for i in self.IBK["54AVG"]['acc']]
        self.IBK["54AVG"]['stdev'] = [(i / 10.0)
                                      for i in self.IBK["54AVG"]['stdev']]
        self.J48["54AVG"]['acc'] = [(i / 10.0)
                                    for i in self.J48["54AVG"]['acc']]
        self.J48["54AVG"]['stdev'] = [(i / 10.0)
                                      for i in self.J48["54AVG"]['stdev']]

    def gen1(self, label):

        self.IBK[label] = []
        self.J48[label] = []

        for x in range(0, 9):

            # Train IBk
            self.train(True, self.data[x + 9]['data'])
            stash1 = self.test1(self.data[x]['data'])
            self.IBK[label].append(stash1)

            # Train J48
            self.train(False, self.data[x + 9]['data'])
            stash2 = self.test1(self.data[x]['data'])
            self.J48[label].append(stash2)

    def gen2(self):

        # Make temp folder to hold train- and test- .arffs
        if os.path.exists(os.getcwd() + '/temp'):
            shutil.rmtree(os.getcwd() + '/temp')  # remove any existing folder first
        os.makedirs(os.getcwd() + '/temp')

        # For the 14 feature and 54-feature sets ALONE:
        test = []
        for i in [0, 4]:

            test.append(self.data[i]['file'])  # Add file to test set
            train = self.data[i + 9]
            raw = self.getARFFpieces(train['file'])

            temp = {}
            temp['header'] = raw['header']
            for k in range(50, 501, 50):
                temp['train'] = self.getXrandom(k, raw['train'])
                self.genARFF(i, k, temp)

        self.parseARFF("temp")  # Changes self.data permanently

        freshtests = self.parseSomeARFF(test)

        storeall = self.data

        self.data = [i for i in self.data if "14" in i['file']]
        self.data.append(freshtests[0])  # Adds the 14-feature TEST set
        self.gen3(14)

        self.data = [i for i in storeall if "54" in i['file']]
        self.data.append(freshtests[1])  # Adds the 54-feature TEST set
        self.gen3(54)

    def gen3(self, label):

        self.IBK[label] = []
        self.J48[label] = []

        for x in range(0, 10):

            # Train IBk
            self.train(True, self.data[x]['data'])
            stash1 = self.test2(self.data[10]['data'])
            self.IBK[label].append(stash1)

            # Train J48
            self.train(False, self.data[x]['data'])
            stash2 = self.test2(self.data[10]['data'])
            self.J48[label].append(stash2)

    ###########################################################################
    #                    TRAINING & TESTING METHODS                           #
    ###########################################################################

    # Stashes newly trained classifier into self.cls attribute
    def train(self, IBk, xtrain):

        if IBk:
            self.cls = Classifier(
                classname="weka.classifiers.lazy.IBk")  # TODO - options?
        else:
            self.cls = Classifier(classname="weka.classifiers.trees.J48",
                                  options=["-B"])

        self.cls.build_classifier(xtrain)  #Builds with train set for next step

        self.evl = Evaluation(xtrain)

    def test1(self, xtest):
        if self.cls is None:
            print("Fail: no classifier to test with\n")
            return None
        self.evl.test_model(self.cls, xtest)
        return self.findacc(self.evl.confusion_matrix)

    def test2(self, xtest):
        if self.cls is None:
            print("Fail: no classifier to test with\n")
            return None
        self.evl.test_model(self.cls, xtest)
        return {
            'acc': self.findacc(self.evl.confusion_matrix),
            'stdev': self.evl.root_mean_squared_error
        }

    def findacc(self, cmatrix):
        # accuracy = (a + d) / (a + b + c + d) for a 2x2 confusion matrix
        x = list(map(
            float,
            filter(None,
                   [re.sub("[^0-9]", "", i) for i in str(cmatrix).split('.')])))
        return (x[0] + x[3]) / sum(x)
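
    # Note: recent python-weka-wrapper versions expose the accuracy directly on
    # the Evaluation object, so an equivalent (and simpler) helper could just be:
    #
    #     def findacc_direct(self):
    #         return self.evl.percent_correct / 100.0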

    ###########################################################################
    #                    ARFF PARSING & HANDLING METHODS                      #
    ###########################################################################

    # Pulls header and particular data examples apart (rms empty lines)
    # & returns as a dict of arrays
    def getARFFpieces(self, fname1):  # TODO - clean components

        temp = {'header': [], 'train': []}

        for file in glob.glob('data/*.arff'):
            if (file == fname1):  #TRAIN
                with open(os.getcwd() + '/' + file) as f:
                    rest = False
                    for line in f:
                        if line[0] == "@":
                            temp['header'].append(line)
                            if "@DATA" in line:
                                rest = True
                        elif rest and len(line) > 3:  # Cutoff for EOF lines
                            temp['train'].append(line)
        return temp

    # Create new ARFF files from which to import
    def genARFF(self, i, k, data):

        if i == 0:
            path = "14-"
        else:
            path = "54-"
        if k == 50:
            path += "0"
        path += str(k) + ".arff"

        with open((os.getcwd() + "/temp/Train" + path), "w") as train:
            for j in data['header']:
                train.write(j)
            for j in data['train']:
                train.write(j)

    # Get a random x # of documents from a list & return
    def getXrandom(self, x, arr):
        ret = []
        used = []
        max = len(arr) - 1  # Max randint value
        if (max + 1) < x:
            print("Err: can't take " + str(x) + " values from a set of " +
                  str(max + 1))
            return
        while len(ret) < x:
            new = randint(0, max)  # Shuffle functional
            if new not in used:  # make sure to not repeat
                used.append(new)
                ret.append(arr[new])  # Saves that line
        return ret

    # Stashes loaded Instances to data array given parameter for subdirectory
    def parseARFF(self, param):

        self.data = []  # clears from previous call - needed in special iteration case
        try:
            for file in glob.glob(str(param) + "/*.arff"):
                #with open(os.getcwd()+"/"+file, 'r') as f:
                temp = self.loader.load_file(file)
                temp.class_is_last()
                self.data.append({'file': file, 'data': temp})
        except Exception:
            print("Missing " + str(param) + "/ folder in cwd\n")

    def parseSomeARFF(self, paths):

        ret = []
        for path in paths:
            temp = self.loader.load_file(os.getcwd() + '/' + path)
            temp.class_is_last()
            ret.append({'file': path, 'data': temp})
        return ret
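
# A minimal driver for the class above (assumptions: python-weka-wrapper's JVM
# helper is available as weka.core.jvm and the ARFF files live in a local
# data/ folder, as parseARFF expects):
import weka.core.jvm as jvm

if __name__ == "__main__":
    jvm.start()
    try:
        P1(load=True).run()
    finally:
        jvm.stop()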
Example #37
# we'll set the class attribute after filtering

# apply NominalToBinary filter and set class attribute
fltr = Filter("weka.filters.unsupervised.attribute.NominalToBinary")
fltr.inputformat(data)
filtered = fltr.filter(data)
filtered.class_is_last()

# cross-validate LinearRegression on filtered data, display model
cls = Classifier(classname="weka.classifiers.functions.LinearRegression")
pout = PredictionOutput(classname="weka.classifiers.evaluation.output.prediction.PlainText")
evl = Evaluation(filtered)
evl.crossvalidate_model(cls, filtered, 10, Random(1), pout)
print("10-fold cross-validation:\n" + evl.summary())
print("Predictions:\n\n" + str(pout))
cls.build_classifier(filtered)
print("Model:\n\n" + str(cls))

# use AddClassification filter with LinearRegression on filtered data
print("Applying AddClassification to filtered data:\n")
fltr = Filter(
    classname="weka.filters.supervised.attribute.AddClassification",
    options=["-W", "weka.classifiers.functions.LinearRegression", "-classification"])
fltr.inputformat(filtered)
classified = fltr.filter(filtered)
print(classified)

# convert class back to nominal
fltr = Filter(classname="weka.filters.unsupervised.attribute.NumericToNominal", options=["-R", "9"])
fltr.inputformat(classified)
nominal = fltr.filter(classified)
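
# (sketch) the converted dataset could then be written back out to disk;
# the output path below is just a placeholder:
from weka.core.converters import Saver

saver = Saver(classname="weka.core.converters.ArffSaver")
saver.save_file(nominal, "output-nominal.arff")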
Example #38
print(train.num_instances)
print(test.num_instances)

# Check data in datasets
print(train.num_attributes)
print(test.num_attributes)


# Create classifier
from weka.classifiers import Classifier
cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")


# No options of interest to adjust
# Build classifier on training data
cls.build_classifier(train)
#       print(cls)

#import weka.plot.graph as graph  
#graph.plot_dot_graph(cls.graph)

from weka.classifiers import Evaluation
from weka.core.classes import Random
evl = Evaluation(train)
evl.crossvalidate_model(cls, train, 10, Random(1))

print ("Kappa Score")
print (evl.kappa) # 0.50 - Not bad
print ("Evaluation Summary")
print (evl.summary()) # Accuracy: 83%
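
# The Evaluation object also exposes per-class details and the confusion matrix:
print("Class details")
print(evl.class_details())
print("Confusion matrix")
print(evl.matrix())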
Example #39
    test_data.class_is_first()

    # load logistic model tree algorithm
    log_tree = Classifier(classname="weka.classifiers.trees.LMT")
    eval_test_obj = Evaluation(test_data)
    eval_test_obj.crossvalidate_model(classifier=log_tree,
                                      data=test_data,
                                      num_folds=5,
                                      rnd=Random(1))
    print("Test CV (10-folds) Error = %.2f%%" %
          (eval_test_obj.percent_incorrect))
    print(eval_test_obj.matrix())
    print("=================\"Summary\"====================")
    print(eval_test_obj.summary())

    log_tree.build_classifier(test_data)
    y_predict = eval_test_obj.test_model(log_tree, test_data)

    y_test = to_binary_numeric(y_test.head(500), classNeg="neg")

    falsePositiveRate, truePositiveRate, thresholds = roc_curve(y_test,
                                                                y_predict,
                                                                pos_label=0)
    # compute Area Under the Curve (AUC) using the trapezoidal rule
    area = auc(falsePositiveRate, truePositiveRate)

    plt.plot(falsePositiveRate,
             truePositiveRate,
             color='red',
             label='ROC = ' + str(area))
    plt.plot([0, 1], [0, 1], linestyle='dotted')
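
    # (sketch) finish and display the ROC figure started above:
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.legend(loc="lower right")
    plt.show()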
Example #40
File: c.py Project: tanayz/Kaggle
# load a dataset
iris_file = "HairEyeColor.csv"
print("Loading dataset: " + iris_file)
loader = Loader(classname="weka.core.converters.CSVLoader")
iris_data = loader.load_file(iris_file)
print(iris_data.num_attributes())
iris_data.set_class_index(iris_data.num_attributes() - 1)
                                            
# build a classifier and output model
print ("Training J48 classifier on iris")
classifier = Classifier(classname="weka.test.Regression")
#classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.5"])
# Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
# property of the J48 classifier itself. However, being of type float rather than double, we need
# to convert it to the correct type first using the double_to_float function:
#classifier.set_property("confidenceFactor", types.double_to_float(0.3))
classifier.build_classifier(iris_data)
print(classifier)
print(classifier.graph())
#plot_graph.plot_dot_graph(classifier.graph())
    

evaluation = Evaluation(iris_data)                     # initialize with priors
evaluation.crossvalidate_model(classifier, iris_data, 10, Random(42))  # 10-fold CV
print(evaluation.to_summary())

print("pctCorrect: " + str(evaluation.percent_correct()))
print("incorrect: " + str(evaluation.incorrect()))
jvm.stop()
Example #41
def run_multilayerPercepton(file, file2=None):
    # Get filename from Pathlib object
    filename = file.parts[-1]
    dir = file.parents[0]

    print("Running Multilayer Percepton on %s" % filename)

    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return

    # Removes '.arff' from filename
    filename_base = filename[:-5]

    print("loading data...")
    # Load data with class as first attr
    data = load_Arff_file(file)
    data.class_is_first()

    # If 2nd file load that data too
    if file2:
        print("Loading test...")
        test = load_Arff_file(file2)
        test.class_is_first()

    file_names = [
        "MP_N-500_default_H-1",
        "MP_N-500_H-3",
        "MP_N-500_H-5",
        "MP_N-500_H-7",
        "MP_N-500_H-3-5",
        "MP_N-500_H-5-3",
        "MP_N-500_H-3-5-7",
        "MP_N-500_H-7-3-5",
        "MP_N-500_H-5-7-3",
        "MP_N-500_L-01",
        "MP_N-500_L-02",
        "MP_N-500_L-04",
        "MP_N-500_L-05",
        "MP_N-500_M-01",
        "MP_N-500_M-03",
        "MP_N-500_M-04",
        "MP_N-500_M-05",
        "MP_N-500_E-5",
        "MP_N-500_E-10",
        "MP_N-500_E-15",
        "MP_N-500_E-25",
    ]

    options_list = [
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "1"
        ],  # DEFAULT
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "3"
        ],  # -H START
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "5"
        ],
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "7"
        ],
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "3, 5"
        ],
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "5, 3"
        ],
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "3, 5, 7"
        ],
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "7, 3, 5"
        ],
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "5, 7, 3"
        ],  # -H END
        [
            "-L", "0.1", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "1"
        ],  # -L START
        [
            "-L", "0.2", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "1"
        ],
        [
            "-L", "0.4", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "1"
        ],
        [
            "-L", "0.5", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "1"
        ],  # -L END
        [
            "-L", "0.3", "-M", "0.1", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "1"
        ],  # -M START
        [
            "-L", "0.3", "-M", "0.3", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "1"
        ],
        [
            "-L", "0.3", "-M", "0.4", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "1"
        ],
        [
            "-L", "0.3", "-M", "0.5", "-N", "500", "-V", "0", "-S", "0", "-E",
            "20", "-H", "1"
        ],  # -M END
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "5", "-H", "1"
        ],  # -E START
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "10", "-H", "1"
        ],
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "15", "-H", "1"
        ],
        [
            "-L", "0.3", "-M", "0.2", "-N", "500", "-V", "0", "-S", "0", "-E",
            "25", "-H", "1"
        ],  # -E END
    ]

    for i in range(len(options_list)):
        start = time.time()
        print("Beginning iteration " + str(i) + ": " + file_names[i])

        # Use MultilayerPerceptron and set options
        cls = Classifier(
            classname="weka.classifiers.functions.MultilayerPerceptron",
            options=options_list[i])
        # Build classifier with train data
        cls.build_classifier(data)

        # Predictions stored in pout
        pout = PredictionOutput(
            classname="weka.classifiers.evaluation.output.prediction.PlainText"
        )

        # Evaluate data on test data
        evaluation = Evaluation(data)
        evaluation.test_model(cls, test, output=pout)

        print(evaluation.summary())
        print(evaluation.class_details())
        print(evaluation.confusion_matrix)

        # Generate grid for ROC
        # plcls.plot_roc(evaluation, class_index=[0,1], wait=True)

        # mk dirs for output
        tempdir = dir / "Results/" / "MP-ALL_N-500_results/" / (file_names[i] +
                                                                "_results/")
        tempdir.mkdir(parents=True, exist_ok=True)

        # Save summary, class details and confusion matrix to file
        result_output = file_names[i] + "_results.txt"
        print(tempdir)
        print(result_output)
        print((tempdir / result_output).absolute())
        output_eval(evaluation, tempdir / result_output)

        # Save the predicted results to file
        prediction_output = file_names[i] + "_prediction.txt"
        output_pred(pout, tempdir / prediction_output)

        end = time.time()
        timetaken = round(end - start, 2)
        print("Time taken to run iteration " + str(i) + ": %s seconds" %
              (timetaken))

    print("Multilayer Percepton complete")
                    from weka.classifiers import Evaluation
                    from weka.core.classes import Random
                    from weka.classifiers import Classifier
                    if classifier == 0:
                        SMOTE = Filter(classname="weka.filters.supervised.instance.SMOTE", options=['-P', str(smote)])
                        SMOTE.inputformat(dataSlowTrain)
                        dataSlowTrain = SMOTE.filter(dataSlowTrain)

                        SMOTE.inputformat(dataFastTrain)
                        dataFastTrain = SMOTE.filter(dataFastTrain)
                        for kernel in range(0,1):
                            if kernel == 0:
                                mapper = Classifier(classname="weka.classifiers.misc.InputMappedClassifier", options=["-M","-W", "weka.classifiers.bayes.NaiveBayes"])
                                Class = 'NaiveBayes'
                                mapper.build_classifier(dataSlowTrain)
                                evaluation = Evaluation(dataSlowTrain)
                                evaluation.test_model(mapper,dataSlowTest)
                                roc_NB.append(evaluation.area_under_roc(1)*100)
                                recall_NB.append(evaluation.recall(yIndexSlow)*100)
                                precision_NB.append(evaluation.precision(yIndexSlow)*100)

                                mapper.build_classifier(dataFastTrain)
                                evaluation = Evaluation(dataFastTrain)
                                evaluation.test_model(mapper, dataFastTest)

                                roc_NB_Last.append(evaluation.area_under_roc(1) * 100)
                                recall_NB_Last.append(evaluation.recall(yIndexFast) * 100)
                                precision_NB_Last.append(evaluation.precision(yIndexFast) * 100)

                                mapper.build_classifier(dataNeutralTrain)
Example #43
from weka.core.converters import Loader
from weka.core.classes import Random
from weka.classifiers import Classifier, Evaluation

jvm.start()

# load diabetes
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "diabetes.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

# determine baseline with ZeroR
zeror = Classifier(classname="weka.classifiers.rules.ZeroR")
zeror.build_classifier(data)
evl = Evaluation(data)
evl.test_model(zeror, data)
print("Baseline accuracy (ZeroR): %0.1f%%" % evl.percent_correct())

print("\nHoldout 10%...")
# use seed 1-10 and perform random split with 90%
perc = []
for i in range(1, 11):
    evl = Evaluation(data)
    evl.evaluate_train_test_split(
        Classifier(classname="weka.classifiers.trees.J48"), data, 90.0, Random(i))
    perc.append(round(evl.percent_correct(), 1))
    print("Accuracy with seed %i: %0.1f%%" % (i, evl.percent_correct()))

# calculate mean and standard deviation
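# (the original snippet stops here; a minimal sketch of that calculation over
# the accuracies collected in `perc` above)
mean = sum(perc) / len(perc)
stdev = (sum((p - mean) ** 2 for p in perc) / (len(perc) - 1)) ** 0.5
print("mean accuracy: %0.1f%%, stdev: %0.2f" % (mean, stdev))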
HAM = 0.0
l1 = [0,0]
l2 = [0,0]
count_spams = 0
counts_hams = 0

l = sys.argv
tec = l[1]
if(tec == "1"):
	cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
elif(tec == "2"):
	cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25"])
else:
	cls = Classifier(classname="weka.classifiers.trees.RandomForest", options=["-I", "10"])

cls.build_classifier(data_train)

for index, inst in enumerate(data_test):
	pred = cls.classify_instance(inst)
    
	if(index <= 29):
		if(pred == SPAM):
			l1[0] = l1[0]+1
		else:
			l1[1] = l1[1]+1

	else:
		if(pred == SPAM):
			l2[0] = l2[0]+1
		else:
			l2[1] = l2[1]+1
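
# (sketch) report the spam/ham tallies collected above:
print("first 30 test instances  -> spam: %d, ham: %d" % (l1[0], l1[1]))
print("remaining test instances -> spam: %d, ham: %d" % (l2[0], l2[1]))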
Example #45
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)
data.set_class_index(data.num_attributes() - 1)

# 1a filter data
print("Filtering data...")
fltr = Filter("weka.filters.unsupervised.attribute.StringToWordVector")
fltr.set_inputformat(data)
filtered = fltr.filter(data)
filtered.set_class_index(0)

# 1b build classifier
print("Building/evaluating classifier...")
cls = Classifier(classname="weka.classifiers.trees.J48")
cls.build_classifier(filtered)
evl = Evaluation(filtered)
evl.test_model(cls, filtered)
print(evl.to_summary())
print(str(cls))
plg.plot_dot_graph(cls.graph())

# 2. filtered classifier
fname = data_dir + os.sep + "simpletext-test.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
test = loader.load_file(fname)
test.set_class_index(test.num_attributes() - 1)
print("Building/evaluating filtered classifier...")
cls = FilteredClassifier()
cls.set_classifier(Classifier(classname="weka.classifiers.trees.J48"))
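
# The snippet is truncated here; the filtered classifier would typically get the
# same StringToWordVector filter, be built on the raw training data and then be
# evaluated on the test set loaded above. Using the current property-based API
# (the lines above use the older set_* methods), that would look roughly like:
#
#     cls.filter = Filter(classname="weka.filters.unsupervised.attribute.StringToWordVector")
#     cls.build_classifier(data)
#     evl = Evaluation(data)
#     evl.test_model(cls, test)
#     print(evl.summary())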
Example #46
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline,
                                  classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(
        classname="weka.classifiers.functions.supportVector.RBFKernel",
        options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO",
                                  options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
    # property of the J48 classifier itself. However, being of type float rather than double, we need
    # to convert it to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", typeconv.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48",
                            options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0,
                                         Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(
        classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(
        classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())
    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())
    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText",
        options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier,
                                   diabetes_data,
                                   10,
                                   Random(42),
                                   output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " +
          str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " +
          str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " +
          str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " +
          str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " +
          str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " +
          str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " +
          str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " +
          str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " +
          str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " +
          str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " +
          str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " +
          str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " +
          str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    plot_cls.plot_roc(evaluation,
                      title="ROC diabetes",
                      class_index=range(
                          0, diabetes_data.class_attribute.num_values),
                      wait=False)
    plot_cls.plot_prc(evaluation,
                      title="PRC diabetes",
                      class_index=range(
                          0, diabetes_data.class_attribute.num_values),
                      wait=False)

    # train 2nd classifier on diabetes dataset
    classifier2 = Classifier(classname="weka.classifiers.trees.RandomForest")
    evaluation2 = Evaluation(diabetes_data)
    evaluation2.crossvalidate_model(classifier2, diabetes_data, 10, Random(42))
    plot_cls.plot_rocs({
        "NB": evaluation,
        "RF": evaluation2
    },
                       title="ROC diabetes",
                       class_index=0,
                       wait=False)
    plot_cls.plot_prcs({
        "NB": evaluation,
        "RF": evaluation2
    },
                       title="PRC diabetes",
                       class_index=0,
                       wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(
        classname="weka.classifiers.functions.LinearRegression",
        options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(
            str(index + 1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # train 2nd classifier and show errors in same plot
    classifier2 = Classifier(classname="weka.classifiers.functions.SMOreg")
    evaluation2 = Evaluation(bolts_data)
    evaluation2.crossvalidate_model(classifier2, bolts_data, 10, Random(42))
    plot_cls.plot_classifier_errors(
        {
            "LR": evaluation.predictions,
            "SMOreg": evaluation2.predictions
        },
        wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    ]
    plot_cls.plot_learning_curve(cls,
                                 diabetes_data,
                                 increments=0.05,
                                 label_template="[#] !",
                                 metric="percent_correct",
                                 wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()

    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
Example #47
for train_index, test_index in sss:
    print "Iter", itr,
    X_train, X_test = X[train_index], X[test_index]
    X_test[:,-1] = classes[0]       # mask the test class labels so they are not used
    y_test = Y[test_index]
    write_to_weka('train.arff', 'training_data', data.columns, X_train, classes)
    write_to_weka('test.arff', 'testing_data', data.columns, X_test, classes)

    loader = Loader(classname="weka.core.converters.ArffLoader")
    trdata = loader.load_file("train.arff")
    trdata.class_is_last()

    classifier = Classifier(classname="weka.classifiers.lazy.IBk")
    classifier.options = ["-K", "10", "-W", "0", "-I", "-A",
                          "weka.core.neighboursearch.LinearNNSearch -A \"weka.core.ManhattanDistance -R first-last\""]
    classifier.build_classifier(trdata)

    tedata = loader.load_file("test.arff")
    tedata.class_is_last()

    for index, inst in enumerate(tedata):
        result = classifier.classify_instance(inst)
        Ypred[test_index[index]] = classes[int(result)]

    accuracy = float(np.sum(y_test == Ypred[test_index])) / float(y_test.shape[0])
    print " => Accuracy = ", accuracy
    itr += 1
accuracy = float(np.sum(Y == Ypred)) / float(Y.shape[0])
print "Total accuracy = ", accuracy

os.remove('train.arff')
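
# write_to_weka is not part of python-weka-wrapper; a minimal sketch of what
# such a helper is assumed to do here -- dump a feature matrix whose last
# column holds the class label into a plain ARFF file -- could look like this
# (all attribute names/types are assumptions):
def write_to_weka(path, relation, columns, X, classes):
    with open(path, "w") as f:
        f.write("@RELATION %s\n\n" % relation)
        for col in list(columns)[:-1]:
            f.write("@ATTRIBUTE %s NUMERIC\n" % col)
        f.write("@ATTRIBUTE %s {%s}\n\n" % (list(columns)[-1], ",".join(classes)))
        f.write("@DATA\n")
        for row in X:
            f.write(",".join(str(v) for v in row) + "\n")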
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_data = loader.load_file(iris_file)
    iris_data.class_is_last()

    # classifier help
    helper.print_title("Creating help string")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    print(classifier.to_help())

    # partial classname
    helper.print_title("Creating classifier from partial classname")
    clsname = ".J48"
    classifier = Classifier(classname=clsname)
    print(clsname + " --> " + classifier.classname)

    # classifier from commandline
    helper.print_title("Creating SMO from command-line string")
    cmdline = 'weka.classifiers.functions.SMO -K "weka.classifiers.functions.supportVector.NormalizedPolyKernel -E 3.0"'
    classifier = from_commandline(cmdline, classname="weka.classifiers.Classifier")
    classifier.build_classifier(iris_data)
    print("input: " + cmdline)
    print("output: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # kernel classifier
    helper.print_title("Creating SMO as KernelClassifier")
    kernel = Kernel(classname="weka.classifiers.functions.supportVector.RBFKernel", options=["-G", "0.001"])
    classifier = KernelClassifier(classname="weka.classifiers.functions.SMO", options=["-M"])
    classifier.kernel = kernel
    classifier.build_classifier(iris_data)
    print("classifier: " + classifier.to_commandline())
    print("model:\n" + str(classifier))

    # build a classifier and output model
    helper.print_title("Training J48 classifier on iris")
    classifier = Classifier(classname="weka.classifiers.trees.J48")
    # Instead of using 'options=["-C", "0.3"]' in the constructor, we can also set the "confidenceFactor"
    # property of the J48 classifier itself. However, being of type float rather than double, we need
    # to convert it to the correct type first using the double_to_float function:
    classifier.set_property("confidenceFactor", typeconv.double_to_float(0.3))
    classifier.build_classifier(iris_data)
    print(classifier)
    print(classifier.graph)
    print(classifier.to_source("MyJ48"))
    plot_graph.plot_dot_graph(classifier.graph)

    # evaluate model on test set
    helper.print_title("Evaluating J48 classifier on iris")
    evaluation = Evaluation(iris_data)
    evl = evaluation.test_model(classifier, iris_data)
    print(evl)
    print(evaluation.summary())

    # evaluate model on train/test split
    helper.print_title("Evaluating J48 classifier on iris (random split 66%)")
    classifier = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
    evaluation = Evaluation(iris_data)
    evaluation.evaluate_train_test_split(classifier, iris_data, 66.0, Random(1))
    print(evaluation.summary())

    # load a dataset incrementally and build classifier incrementally
    helper.print_title("Build classifier incrementally on iris")
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    iris_inc.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")
    classifier.build_classifier(iris_inc)
    for inst in loader:
        classifier.update_classifier(inst)
    print(classifier)

    # construct meta-classifiers
    helper.print_title("Meta classifiers")
    # generic FilteredClassifier instantiation
    print("generic FilteredClassifier instantiation")
    meta = SingleClassifierEnhancer(classname="weka.classifiers.meta.FilteredClassifier")
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.set_property("filter", flter.jobject)
    print(meta.to_commandline())
    # direct FilteredClassifier instantiation
    print("direct FilteredClassifier instantiation")
    meta = FilteredClassifier()
    meta.classifier = Classifier(classname="weka.classifiers.functions.LinearRegression")
    flter = Filter("weka.filters.unsupervised.attribute.Remove")
    flter.options = ["-R", "first"]
    meta.filter = flter
    print(meta.to_commandline())
    # generic Vote
    print("generic Vote instantiation")
    meta = MultipleClassifiersCombiner(classname="weka.classifiers.meta.Vote")
    classifiers = [
        Classifier(classname="weka.classifiers.functions.SMO"),
        Classifier(classname="weka.classifiers.trees.J48")
    ]
    meta.classifiers = classifiers
    print(meta.to_commandline())

    # cross-validate nominal classifier
    helper.print_title("Cross-validating NaiveBayes on diabetes")
    diabetes_file = helper.get_data_dir() + os.sep + "diabetes.arff"
    helper.print_info("Loading dataset: " + diabetes_file)
    loader = Loader("weka.core.converters.ArffLoader")
    diabetes_data = loader.load_file(diabetes_file)
    diabetes_data.class_is_last()
    classifier = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
    pred_output = PredictionOutput(
        classname="weka.classifiers.evaluation.output.prediction.PlainText", options=["-distribution"])
    evaluation = Evaluation(diabetes_data)
    evaluation.crossvalidate_model(classifier, diabetes_data, 10, Random(42), output=pred_output)
    print(evaluation.summary())
    print(evaluation.class_details())
    print(evaluation.matrix())
    print("areaUnderPRC/0: " + str(evaluation.area_under_prc(0)))
    print("weightedAreaUnderPRC: " + str(evaluation.weighted_area_under_prc))
    print("areaUnderROC/1: " + str(evaluation.area_under_roc(1)))
    print("weightedAreaUnderROC: " + str(evaluation.weighted_area_under_roc))
    print("avgCost: " + str(evaluation.avg_cost))
    print("totalCost: " + str(evaluation.total_cost))
    print("confusionMatrix: " + str(evaluation.confusion_matrix))
    print("correct: " + str(evaluation.correct))
    print("pctCorrect: " + str(evaluation.percent_correct))
    print("incorrect: " + str(evaluation.incorrect))
    print("pctIncorrect: " + str(evaluation.percent_incorrect))
    print("unclassified: " + str(evaluation.unclassified))
    print("pctUnclassified: " + str(evaluation.percent_unclassified))
    print("coverageOfTestCasesByPredictedRegions: " + str(evaluation.coverage_of_test_cases_by_predicted_regions))
    print("sizeOfPredictedRegions: " + str(evaluation.size_of_predicted_regions))
    print("falseNegativeRate: " + str(evaluation.false_negative_rate(1)))
    print("weightedFalseNegativeRate: " + str(evaluation.weighted_false_negative_rate))
    print("numFalseNegatives: " + str(evaluation.num_false_negatives(1)))
    print("trueNegativeRate: " + str(evaluation.true_negative_rate(1)))
    print("weightedTrueNegativeRate: " + str(evaluation.weighted_true_negative_rate))
    print("numTrueNegatives: " + str(evaluation.num_true_negatives(1)))
    print("falsePositiveRate: " + str(evaluation.false_positive_rate(1)))
    print("weightedFalsePositiveRate: " + str(evaluation.weighted_false_positive_rate))
    print("numFalsePositives: " + str(evaluation.num_false_positives(1)))
    print("truePositiveRate: " + str(evaluation.true_positive_rate(1)))
    print("weightedTruePositiveRate: " + str(evaluation.weighted_true_positive_rate))
    print("numTruePositives: " + str(evaluation.num_true_positives(1)))
    print("fMeasure: " + str(evaluation.f_measure(1)))
    print("weightedFMeasure: " + str(evaluation.weighted_f_measure))
    print("unweightedMacroFmeasure: " + str(evaluation.unweighted_macro_f_measure))
    print("unweightedMicroFmeasure: " + str(evaluation.unweighted_micro_f_measure))
    print("precision: " + str(evaluation.precision(1)))
    print("weightedPrecision: " + str(evaluation.weighted_precision))
    print("recall: " + str(evaluation.recall(1)))
    print("weightedRecall: " + str(evaluation.weighted_recall))
    print("kappa: " + str(evaluation.kappa))
    print("KBInformation: " + str(evaluation.kb_information))
    print("KBMeanInformation: " + str(evaluation.kb_mean_information))
    print("KBRelativeInformation: " + str(evaluation.kb_relative_information))
    print("SFEntropyGain: " + str(evaluation.sf_entropy_gain))
    print("SFMeanEntropyGain: " + str(evaluation.sf_mean_entropy_gain))
    print("SFMeanPriorEntropy: " + str(evaluation.sf_mean_prior_entropy))
    print("SFMeanSchemeEntropy: " + str(evaluation.sf_mean_scheme_entropy))
    print("matthewsCorrelationCoefficient: " + str(evaluation.matthews_correlation_coefficient(1)))
    print("weightedMatthewsCorrelation: " + str(evaluation.weighted_matthews_correlation))
    print("class priors: " + str(evaluation.class_priors))
    print("numInstances: " + str(evaluation.num_instances))
    print("meanAbsoluteError: " + str(evaluation.mean_absolute_error))
    print("meanPriorAbsoluteError: " + str(evaluation.mean_prior_absolute_error))
    print("relativeAbsoluteError: " + str(evaluation.relative_absolute_error))
    print("rootMeanSquaredError: " + str(evaluation.root_mean_squared_error))
    print("rootMeanPriorSquaredError: " + str(evaluation.root_mean_prior_squared_error))
    print("rootRelativeSquaredError: " + str(evaluation.root_relative_squared_error))
    print("prediction output:\n" + str(pred_output))
    plot_cls.plot_roc(
        evaluation, title="ROC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)
    plot_cls.plot_prc(
        evaluation, title="PRC diabetes",
        class_index=range(0, diabetes_data.class_attribute.num_values), wait=False)

    # load a numeric dataset
    bolts_file = helper.get_data_dir() + os.sep + "bolts.arff"
    helper.print_info("Loading dataset: " + bolts_file)
    loader = Loader("weka.core.converters.ArffLoader")
    bolts_data = loader.load_file(bolts_file)
    bolts_data.class_is_last()

    # build a classifier and output model
    helper.print_title("Training LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    classifier.build_classifier(bolts_data)
    print(classifier)

    # cross-validate numeric classifier
    helper.print_title("Cross-validating LinearRegression on bolts")
    classifier = Classifier(classname="weka.classifiers.functions.LinearRegression", options=["-S", "1", "-C"])
    evaluation = Evaluation(bolts_data)
    evaluation.crossvalidate_model(classifier, bolts_data, 10, Random(42))
    print(evaluation.summary())
    print("correlationCoefficient: " + str(evaluation.correlation_coefficient))
    print("errorRate: " + str(evaluation.error_rate))
    helper.print_title("Header - bolts")
    print(str(evaluation.header))
    helper.print_title("Predictions on bolts")
    for index, pred in enumerate(evaluation.predictions):
        print(str(index+1) + ": " + str(pred) + " -> error=" + str(pred.error))
    plot_cls.plot_classifier_errors(evaluation.predictions, wait=False)

    # learning curve
    cls = [
        Classifier(classname="weka.classifiers.trees.J48"),
        Classifier(classname="weka.classifiers.bayes.NaiveBayesUpdateable")]
    plot_cls.plot_learning_curve(
        cls, diabetes_data, increments=0.05, label_template="[#] !", metric="percent_correct", wait=True)

    # access classifier's Java API
    labor_file = helper.get_data_dir() + os.sep + "labor.arff"
    helper.print_info("Loading dataset: " + labor_file)
    loader = Loader("weka.core.converters.ArffLoader")
    labor_data = loader.load_file(labor_file)
    labor_data.class_is_last()

    helper.print_title("Using JRip's Java API to access rules")
    jrip = Classifier(classname="weka.classifiers.rules.JRip")
    jrip.build_classifier(labor_data)
    rset = jrip.jwrapper.getRuleset()
    for i in range(rset.size()):
        r = rset.get(i)
        print(str(r.toString(labor_data.class_attribute.jobject)))
def logit_boost(train_data):
    train_data.class_is_last()
    cls = Classifier(classname="weka.classifiers.meta.LogitBoost")
    cls.build_classifier(train_data)
    return cls
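
# A minimal usage sketch (not part of the original snippets): the small helper
# functions on this page (logit_boost, rotation_forest, flda, rep_tree and mlpc_10)
# all follow the same pattern; the iris.arff file under helper.get_data_dir() is an
# assumption.
loader = Loader(classname="weka.core.converters.ArffLoader")
dataset = loader.load_file(helper.get_data_dir() + os.sep + "iris.arff")
booster = logit_boost(dataset)
print(booster)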
class Weka(object): 

    data = None
    dataDir = None
    classifier = None

    def __init__(self, dataDir = '.'):
        self.dataDir = dataDir 
        
        jvm.start()
        

    # Initializes the data with the contents of an ARFF file
    def initData(self, arrfFile):
        loader = Loader(classname="weka.core.converters.ArffLoader")
        print(self.dataDir + '/' + arrfFile)
        self.data = loader.load_file(self.dataDir + '/' + arrfFile)
        self.data.class_is_last()

        print('Loading file ' + self.dataDir + '/' + arrfFile)
        # print(data)
                     

    # Trains the classifier
    def trainData(self, arrfFile = None, classname="weka.classifiers.trees.J48", options=["-C", "0.3"]):
        if arrfFile is not None:
            self.initData( arrfFile )
            
        if self.data is None:
            return 
            
        print('Building classifier ' + str(classname) + ' ' + ' '.join(options))
        self.classifier = Classifier(classname=classname, options=options)
        self.classifier.build_classifier(self.data)


    # Classifies the instances of an ARFF file
    def classify(self, predictFile):
            
        if self.data is None or self.classifier is None:
            return [-1]

        loader = Loader(classname="weka.core.converters.ArffLoader")
        predict_data = loader.load_file(self.dataDir + '/' + predictFile)
        predict_data.class_is_last()
        
        # Extract the nominal class labels from the attribute's string representation;
        # the slice offsets are tied to the exact attribute name and label format.
        values = str(predict_data.class_attribute)[19:-1].split(',')
        
        classes = []
        
        for index, inst in enumerate(predict_data):
            #pred = self.classifier.classify_instance(inst)
            prediction = self.classifier.distribution_for_instance(inst)
            cl = int(values[prediction.argmax()][7:])
            
            print('Class:', cl)
            classes.append(cl)

        return classes


    # Performs cross-validation and prints the results to standard output
    def crossValidate(self, arrfFile = None, classname="weka.classifiers.trees.J48", options=["-C", "0.3"]):
        
        if arrfFile is not None:
            self.initData( arrfFile )
            
        if self.data is None:
            return 

        print('Classifier ' + str(classname) + ' ' + ' '.join(options))
        cls = Classifier(classname=classname, options=options)
        
        evl = Evaluation(self.data)
        evl.crossvalidate_model(cls, self.data, 10, Random(1))

        print(evl.percent_correct)
        print(evl.summary())
        print(evl.class_details())
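
# A minimal usage sketch (not part of the original class); the data directory and
# ARFF file names are placeholders and need to exist for this to run.
weka = Weka(dataDir='.')
weka.trainData('train.arff')
print(weka.classify('predict.arff'))
weka.crossValidate('train.arff')
jvm.stop()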
def rotation_forest(train_data):
    train_data.class_is_last()
    cls = Classifier(classname="weka.classifiers.meta.RotationForest")
    cls.build_classifier(train_data)
    return cls
jvm.logger.setLevel(jvm.logging.WARNING)
jvm.start(packages=True, max_heap_size="512m")

# Each instance has nominal class and numeric attributes
loader = Loader(classname="weka.core.converters.ArffLoader")
trainData = loader.load_file('segment-challenge.arff')
trainData.class_is_last()
testData = loader.load_file('segment-test.arff')
testData.class_is_last()

# Default C4.5 tree
classifier = Classifier(classname="weka.classifiers.trees.J48")

# Build the classifier on the training data
classifier.build_classifier(trainData)

print("\n\n=========== Classifier information ================\n\n")
print(classifier.options)
print(classifier)

print("\n\n=========== Train results ================\n\n")
evaluation = Evaluation(trainData)
evaluation.test_model(classifier, trainData)
print(classifier.to_commandline())
print(evaluation.matrix())
print("Train recognition: %0.2f%%" % evaluation.percent_correct)

print("\n\n=========== Test results ================\n\n")
evaluation = Evaluation(testData)
evaluation.test_model(classifier, testData)
def flda(train_data):
    train_data.class_is_last()
    cls = Classifier(classname="weka.classifiers.functions.FLDA")
    cls.build_classifier(train_data)
    return cls
class WekaWrapper:

	def __init__(self, questionID, algorithm, classifier, parameters, modelParams, optimizer, predict = 0):
		self.questionID = questionID
		self.algorithm = algorithm
		self.classifier = classifier
		self.parameters = parameters
		self.modelParams = modelParams
		self.api = nemoApi()
		self.config = nemoConfig()
		self.optimizer = optimizer
		self.predict = predict
		self.prediction = None


	def retrieveData(self, id, dataset):
		query = self.api.getDataQuery(id, dataset)
		iquery = InstanceQuery()
		iquery.db_url = "jdbc:mysql://" + self.config.HOST + ":" + str(self.config.PORT) + "/" + self.config.DB
		iquery.user = self.config.USER
		iquery.password = self.config.PASS
		iquery.query = query
		data = iquery.retrieve_instances()
		data.class_is_last()
		return data

	def uploadData(self):
		# Upload file to database
		self.api.addModel(self.questionID, '?', self.acc, self.model, self.algorithm, False, self.matrix, self.optimizer)
		info = self.api.fetchQuestionInfo(self.questionID)
		modelID = info['ID']
		for mParam in self.modelParams:
			mParam.AIModel = modelID
			self.api.addAIModelParam(mParam)

	def uploadPrediction(self):
		# Upload best classifier prediction to database

		if self.prediction is not None:
			# Convert prediction to string
			predStr = 'No prediction'
			if (self.prediction == 1.0):
				predStr = "True"
			elif (self.prediction == 0.0):
				predStr = "False"
			print('Writing ' + predStr)
			self.api.updatePrediction(self.questionID, predStr)

	def addInstancesToDataset(self, source, dest):
		# Align the instances of a source dataset to destination's header and add them to the destination dataset
		i = 0
		while i < source.num_instances:
			values = source.get_instance(i).values
			it = np.nditer(values, flags=['f_index'], op_flags=['readwrite'])
			while not it.finished:
				if (source.attribute(it.index).is_nominal):
					stringVal = source.get_instance(i).get_string_value(it.index)
					# print stringVal
					if(stringVal != '?'):
						values[it.index] = dest.attribute(it.index).values.index(stringVal)
				it.iternext()
			dest.add_instance(Instance.create_instance(values))
			i = i + 1

	def buildPatientObject(self):
		# Build a patient to classify
		patient = self.api.fetchPatientJSON(self.questionID)
		if patient is not None:
			newPatient = {}
			demographics = ['race_cd', 'sex_cd', 'age_in_years_num']
			observation_fact_features = ['tval_char', 'nval_num']
			for demo in demographics:
				if demo not in patient:
					print "Patient definition missing" + demo + "."
					newPatient[demo] = float('nan')
				else:
					if patient[demo] is not None and patient[demo] != '':
						newPatient[demo] = patient[demo]
					else: 
						print "Demographic " + demo +  " for patient is empty"
						newPatient[demo] = float('nan')
			for obs in patient['observation_facts']:
				concept_cd = obs['concept_cd']
				for feat in observation_fact_features:
					if feat in obs:
						if obs[feat] is not None:
							newPatient[(concept_cd + feat)] = obs[feat]
						else:
							newPatient[(concept_cd + feat)] = float('nan')
					else:
						print "Feature " + concept_cd + feat + " missing from Patient definition, marking it None"
						newPatient[(concept_cd + feat)] = float('nan')
			return newPatient
		else:
			return None

	def addPatientNominals(self, patient, dataset):
		# Add the nominal values for the patient to the master header, in case they aren't already there
		# Loop and add patient's nominal values in case they aren't in masterDataset
		# newDataset will be the new master header
		# Waiting on prediction patient to be defined
		# Should be like {sex_cd: "m", ...}
		ignoreAttributes = ['readmitted']
		atts = []
		for a in dataset.attributes():
			if (not (a.is_nominal)) or (a.name in ignoreAttributes) :
				atts.append(a)
			else:
				newValues = list(a.values)
				#print a.name
				pvalue = patient[a.name]
				if(pvalue not in newValues):
					newValues.append(pvalue)
				atts.append(Attribute.create_nominal(a.name, newValues))
		newDataset = Instances.create_instances("Dataset", atts, 0)
		newDataset.class_is_last()
		return newDataset

	def addNominals(self, dataset):
		# Add the nominal values for all columns, in case a column has none
		ignoreAttributes = ['readmitted']
		atts = []
		for a in dataset.attributes():
			if (not (a.is_nominal)) or (a.name in ignoreAttributes) :
				atts.append(a)
			else:
				newValues = list(a.values)
				pvalue = 'DefaultNominal'
				if(pvalue not in newValues):
					newValues.append(pvalue)
				atts.append(Attribute.create_nominal(a.name, newValues))
		newDataset = Instances.create_instances("Dataset", atts, 0)
		newDataset.class_is_last()
		return newDataset
		
	def createPatientInstance(self, patient, dataset):
		# Create a patient instance to classify
		ignoreAttributes = ['readmitted']
		values = []
		for a in dataset.attributes():
			if not a.is_nominal:
				values.append(patient[a.name])
			elif a.name in ignoreAttributes:
				values.append(0)
			else:
				values.append(a.values.index(patient[a.name]))
		#print values
		newInst = Instance.create_instance(values)
		return newInst



	def run(self):
		# Attach JVM
		javabridge.attach()

		# Debug

		print "Classifier"
		print self.classifier
		print "Params"
		print self.parameters
		print "Model Params"
		print self.modelParams

		# Get data for testing and learning
		learnerData = self.retrieveData(self.questionID, "learner")
		testData = self.retrieveData(self.questionID, 'test')
		masterData = self.retrieveData(self.questionID, 'all')
		masterData = self.addNominals(masterData)

		# Check if there is enough correct data to run
		if (learnerData.num_instances < 1 or testData.num_instances < 1):
			self.status = self.config.NOT_ENOUGH_DATA
			return False

		# If this is a prediction and there is a valid patient, change masterData header
		patientObj = self.buildPatientObject()
		patientInstance = None
		if ((patientObj is not None) and (self.predict == 1)):
			masterData = self.addPatientNominals(patientObj, masterData)
			patientInstance = self.createPatientInstance(patientObj, masterData)
			masterData.add_instance(patientInstance)

		elif (patientObj is None) and (self.predict == 1):
			print('No patient defined for prediction. Exiting')
			return True
		# Fix dataset headers up to match and fix instances to match headers
		masterData.delete()
		learner = masterData.copy_instances(masterData, 0, 0)
		test = masterData.copy_instances(masterData, 0, 0)
		self.addInstancesToDataset(learnerData, learner)
		self.addInstancesToDataset(testData, test)

		# Comparison of data for testing purposes
		# print 'learnerData'
		# print learnerData

		# print 'learner'
		# print learner

		# print 'testData'
		# print testData

		# print 'test'
		# print test

		# pdb.set_trace()
		# Instantiate classifier
		self.cls = Classifier(classname=self.classifier, options=self.parameters)

		# Run classifier
		self.cls.build_classifier(learner)
		# for index, inst in enumerate(learnerData):
			# prediction = self.cls.classify_instance(inst)
			# distribution = self.cls.distribution_for_instance(inst)

		# Test classifier
		evl = Evaluation(learner)
		evl.test_model(self.cls, test)

		# Store information about matrix
		self.acc = evl.percent_correct
		self.val = None

		# Convert numpy array into simple array
		confusionMatrix = []
		confusionMatrix.append([evl.confusion_matrix[0][0], evl.confusion_matrix[0][1]])
		confusionMatrix.append([evl.confusion_matrix[1][0], evl.confusion_matrix[1][1]])

		# Convert matrix into json format
		self.matrix = json.dumps(confusionMatrix)

		
		# print 'Classifier: ', self.classifier
		# print 'ID: ', self.questionID
		# print 'ACC: ', self.acc
		# print(evl.summary())

		# If this is a prediction... make the prediction
		if ((patientObj is not None) and (self.predict == 1)):
			masterData.add_instance(patientInstance)
			print "Running prediction on patient: "
			print masterData.get_instance(0)
			self.prediction = self.cls.classify_instance(masterData.get_instance(0))
			#self.uploadPrediction()

		# Temporarily store file to serialize to
		fileName = str(self.questionID) + self.algorithm + ".model"
		serialization.write(fileName, self.cls)

		# Open that file and store it
		self.model = None
		with open(fileName, 'rb') as f:
			self.model = f.read()

		# Remove temporary file
		os.remove(fileName)

		# Set status to awaiting feedback
		self.status = self.config.AWAITING_FEEDBACK_STATUS
		return True
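
# A hedged usage sketch (not part of the original class): the question ID, algorithm
# label, classifier name and options are placeholders that the surrounding nemo
# application would normally supply, and a running JVM and database are assumed.
wrapper = WekaWrapper(42, "J48", "weka.classifiers.trees.J48", ["-C", "0.25"], [], "default")
if wrapper.run():
    wrapper.uploadData()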
def rep_tree(train_data):
    train_data.class_is_last()
    cls = Classifier(classname="weka.classifiers.trees.REPTree")
    cls.build_classifier(train_data)
    return cls
import os
import weka.core.jvm as jvm
from weka.core.converters import Loader
from weka.core.classes import Random
from weka.classifiers import Classifier, Evaluation
from weka.filters import Filter

# fall back to the local "data" directory if no data_dir was supplied
if data_dir is None:
    data_dir = "." + os.sep + "data"

jvm.start()

# load weather.nominal
loader = Loader(classname="weka.core.converters.ArffLoader")
fname = data_dir + os.sep + "weather.nominal.arff"
print("\nLoading dataset: " + fname + "\n")
data = loader.load_file(fname)
data.class_is_last()

# perform 10-fold cross-validation
cls = Classifier(classname="weka.classifiers.rules.OneR")
evl = Evaluation(data)
evl.crossvalidate_model(cls, data, 10, Random(1))
print("10-fold cross-validation:\n" + evl.to_summary())

# build model on full dataset and output it
cls.build_classifier(data)
print("Model:\n\n" + str(cls))

jvm.stop()
def mlpc_10(train_data):
    train_data.class_is_last()
    cls = Classifier(classname="weka.classifiers.functions.MLPClassifier",
                     options=["-N", "10"])
    cls.build_classifier(train_data)
    return cls