Example No. 1
    def train_and_predict_instances(self, trainingFile, classifier):

        loader = Loader(classname="weka.core.converters.ArffLoader")
        data = loader.load_file(trainingFile)
        data.class_is_last()
        classes = [str(code) for code in data.class_attribute.values]
        head = [className + " probability" for className in classes]
        head.append("Guess")

        cls = Classifier(classname=classifier)
        cls.build_classifier(data)

        predictions = [[0, 0]] * len(data)
        realLabels = [""] * len(data)
        guess = [0] * len(data)

        for index, inst in enumerate(data):
            pred = cls.classify_instance(inst)
            if inst.get_value(inst.class_index) == pred:
                guess[index] = 1.0
            else:
                guess[index] = 0.0
            dist = cls.distribution_for_instance(inst)
            predictions[index] = [p for p in dist]
            realLabels[index] = classes[int(inst.get_value(inst.class_index))]
            print(
                str(index + 1) + ": label index=" + str(pred) +
                ", class distribution=" + str(dist))

        return [predictions, guess, head, realLabels]
def main(args):
    """
    Trains a J48 classifier on a training set and outputs the predicted class and class distribution alongside the
    actual class from a test set. Class attribute is assumed to be the last attribute.
    :param args: the commandline arguments (train and test datasets)
    :type args: list
    """

    # load a dataset
    helper.print_info("Loading train: " + args[1])
    loader = Loader(classname="weka.core.converters.ArffLoader")
    train = loader.load_file(args[1])
    train.class_index = train.num_attributes - 1
    helper.print_info("Loading test: " + args[2])
    test = loader.load_file(args[2])
    test.class_is_last()

    # classifier
    cls = Classifier(classname="weka.classifiers.trees.J48")
    cls.build_classifier(train)

    # output predictions
    print("# - actual - predicted - error - distribution")
    for index, inst in enumerate(test):
        pred = cls.classify_instance(inst)
        dist = cls.distribution_for_instance(inst)
        print(
            "%d - %s - %s - %s  - %s" %
            (index+1,
             inst.get_string_value(inst.class_index),
             inst.class_attribute.value(int(pred)),
             "yes" if pred != inst.get_value(inst.class_index) else "no",
             str(dist.tolist())))
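
The driver above assumes a running JVM. A minimal way to invoke it, along the lines of the python-weka-wrapper examples, is sketched below; the two command-line arguments are the train and test ARFF paths:

if __name__ == "__main__":
    import sys
    import traceback
    import weka.core.jvm as jvm
    try:
        jvm.start()
        main(sys.argv)
    except Exception:
        print(traceback.format_exc())
    finally:
        jvm.stop()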
Example No. 3
    def predBtn_clicked(self):

        gender = self.gender_entry.get()
        age = int(self.age_entry.get())
        height = int(self.height_entry.get())
        weight = int(self.weight_entry.get())
        sociability = self.sociability_entry.get()
        stability = self.stability_entry.get()
        # Load the saved model and the dataset header
        objects = serialization.read_all("J48.model")

        cls = Classifier(jobject=objects[0])
        data = Instances(jobject=objects[1])
        # Build the test instance to be classified
        gender_values = ["Man", "Woman"]
        sociability_values = ["Introvert", "Extrovert"]
        stability_values = ["Stable", "Unstable"]

        values = [
            gender_values.index(gender), age, height, weight,
            self.BMI(weight, height),
            stability_values.index(stability),
            sociability_values.index(sociability),
            Instance.missing_value()
        ]

        inst = Instance.create_instance(values)
        inst.dataset = data
        # Classification
        prediction = int(cls.classify_instance(inst))
        self.controller.show_frame("Result").show(prediction)
        self.clear()
Example No. 4
def TestClassification(arff, modelInput, results):
    # Start the Java VM
    jvm.start()
    # Load the saved model
    objects = serialization.read_all(modelInput)
    clsf = Classifier(jobject=objects[0])
    print(clsf)
    # Load the test set
    loader = Loader(classname="weka.core.converters.ArffLoader")
    test = loader.load_file(arff)
    test.class_is_first()
    # Write out the results
    resultsFile = open(results, "w")
    resultsFile.write("ID\tActual\tPredicted\tBenign prob.\tMalignant prob.\n")
    print("ID\tActual\tPredicted\tBenign prob.\tMalignant prob.")
    for index, inst in enumerate(test):
        pred = clsf.classify_instance(inst)
        dist = clsf.distribution_for_instance(inst)
        sampleID = index + 1
        origin = inst.get_string_value(inst.class_index)
        prediction = inst.class_attribute.value(int(pred))
        sameAsOrigin = "yes" if pred != inst.get_value(
            inst.class_index) else "no"
        NRate = dist.tolist()[0]
        PRate = dist.tolist()[1]
        resultsFile.write(
            "%d\t%s\t%s\t%s\t%s" %
            (sampleID, origin, prediction, str(NRate), str(PRate)) + "\n")
        print("%d\t%s\t%s\t%s\t%s" %
              (sampleID, origin, prediction, str(NRate), str(PRate)))
    resultsFile.close()
    # Shut down the Java VM
    jvm.stop()
    print("检测完成")
def DecisionTree(data):

    classifier = Classifier(classname="weka.classifiers.trees.J48")
    classifier.build_classifier(data)

    print("")
    print("=== Decision Tree ===")
    print(classifier)

    count_class1 = 0
    count_class0 = 0
    print("Labeling income status of each instance. Please wait..")
    for index, inst in enumerate(data):
        pred = classifier.classify_instance(inst)
        # calculate no. of instances classified in class 1 and 0
        if str(pred) == "1.0":
            count_class1 += 1
        else:
            count_class0 += 1

        if index % 5000 == 0:
            print(".")

    print("No of instances in class '>50K' = " + str(count_class1))
    print("No of instances in class '<=50K' = " + str(count_class0))
class DecisionTreeBot(MinimaxBot):
    def __init__(self, number, name=None):
        """
        Minimax bot which uses a decision tree to score board states
        The decision tree outputs one of ten classes.  The higher the class number, the better the board state for X

        :param number:  Board.X for player1 or Board.O for player2
        :param name: A descriptive name for the Bot
        """
        if name is None:
            name = "DTree Bot"
        MinimaxBot.__init__(self, number, name=name)
        self.player_type = 'dtree minimax'

        objects = serialization.read_all(
            "models/game/bots/weka_models/j48_default.model")
        self.classifier = Classifier(jobject=objects[0])

    def compute_score(self, board):
        data_model = BoardDataModel(board)
        weka_instance = data_model.get_weka_instance(categorical=True)
        # for some reason, the j48 model occasionally throws indexoutofbounds exceptions when classifying new instances
        # this try/except block is a hacky way of handling those, so we can at least get some results
        try:
            category = self.classifier.classify_instance(
                weka_instance)  # category will be one of the ten classes
        except Exception:
            print("Error in Decision-Tree Classifier!!")
            category = 5
        #  converts the class value into a numeric score between -1 and 1.   E.g. class 1 gets converted to -0.90, class 3 is converted to -0.50, class 10 is converted to 0.90, etc.
        score = ((category - 5.0) / 5.0) - 0.1

        return score
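
A quick check of the class-to-score mapping described in the comment above; the category values below are illustrative only:

for category in (1, 3, 5, 10):
    print(category, ((category - 5.0) / 5.0) - 0.1)   # -0.9, -0.5, -0.1, 0.9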
def PredecirUnaTemporada(path):
    jvm.start()
    insta = CrearInstanciaParaPredecir(path)
    atributos = ""
    file = open('ModelData/wekaHeader.arff', 'r')
    atributos = file.readlines()
    file.close()

    file = open('ModelData/predictionFiles/inst.arff', 'w')
    file.writelines(atributos)
    file.write("\n" + insta + '\n')
    file.close()

    objects = serialization.read_all("ModelData/77PercentModelPaisajes.model")
    classifier = Classifier(jobject=objects[0])

    loader = Loader()
    data = loader.load_file("ModelData/predictionFiles/inst.arff")
    data.class_is_last()

    clases = ["invierno", "verano", "otono", "primavera"]
    prediccion = ""
    for index, inst in enumerate(data):
        pred = classifier.classify_instance(inst)
        dist = classifier.distribution_for_instance(inst)
        prediccion = clases[int(pred)]
    jvm.stop()
    return prediccion
Example No. 9
def test_single():
  #['long', 'nK', 'nK_delta', 'nT', 'nT_delta', 'str', 'strsum']:
  objs = ['olsize', 'ylsize']
  for obj in objs:
    c = Classifier(jobject=serialization.read(model_file('hash', obj)))
    values = [3.0, 192.0, 124.0, 192.0, 124.0, 6.0, 144.0]
    values.append(0) # should be obj
    ins = Instance.create_instance(values)
    prediction = c.classify_instance(ins)
    print(obj, prediction)
Example No. 10
class python_weka(object):
    def __init__(self, input_x, input_y, labels):
        self.input_x = input_x
        self.input_y = input_y
        self.labels = labels

    def write_arff(self, filename, relation, train_or_predict, input_x, input_y=None):
        f = open(filename, "w")
        f.write("@relation " + relation + "\n")
        for i in self.labels:
            train_or_predict += 1
            if train_or_predict == len(self.labels):
                break
            f.write("@attribute " + i + " " + self.labels[i] + "\n")
        f.write("\n")
        f.write("@data" + "\n")
        for i in range(len(input_x)):
            for j in input_x[i]:
                f.write(str(j) + "  ")
            if train_or_predict == 0:
                f.write(str(input_y[i]))
            else:
                f.write(str(0))
            f.write("\n")
        f.close()

    def train(self):
        filename = "train.arff"
        self.write_arff(filename, "train", 0, self.input_x, self.input_y)
        loader = Loader(classname="weka.core.converters.ArffLoader")
        data = loader.load_file(filename)
        data.class_is_last()
        self.cls = Classifier(classname="weka.classifiers.meta.Bagging", options=["-S", "5"])
        self.cls.build_classifier(data)
        os.remove(filename)

    def predict(self, test_data):
        filename = "test.arff"
        self.write_arff(filename, "test", 0, test_data)
        loader = Loader(classname="weka.core.converters.ArffLoader")
        data = loader.load_file(filename)
        data.class_is_last()
        # evl = Evaluation(data)
        # evl.evaluate_model(self.cls,data)
        # data.set_class_label(data.numAttributes() - 1)
        # data.setClassIndex(data.numAttributes() - 1)
        result = []
        for index, inst in enumerate(data):
            pred = self.cls.classify_instance(inst)
            dist = self.cls.distribution_for_instance(inst)
            result.append(dist[0])
            # print(str(index+1) + ": label index=" + str(pred) + ", class distribution=" + str(dist))
            # print str(index+1) + 'dist:'+ str(dist)
        os.remove(filename)
        return result
def predictWithWeka(csvFilenameWithInputToPredict, modelFilename):
    """
    #   Note: to use this without knowing the class, a dummy class can be supplied
    #   and the 'actual' and 'error' values of @return results ignored.
    #
    #   Note: the file named @csvFilenameWithInputToPredict must contain
    #   instances of both classes (spam and clean).
    #
    #   @csvFilenameWithInputToPredict : name of the csv file with the instances
    #                                   to predict.
    #
    #   @modelFilename : name of the model file generated by weka and
    #                    compatible with the input csv file
    #
    #   @return results : list of dictionaries with the following keys:
    #                      index, actual, predicted, error and distribution
    """
    loader = Loader(classname="weka.core.converters.CSVLoader")
    cls = Classifier(jobject=serialization.read(modelFilename))
    #print(cls)

    data = loader.load_file(csvFilenameWithInputToPredict)
    data.class_is_last()

    multi = MultiFilter()
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove",
                    options=["-R", "first"])
    numericToNom = Filter(
        classname="weka.filters.unsupervised.attribute.NumericToNominal",
        options=["-R", "8,11"])
    normalize = Filter(
        classname="weka.filters.unsupervised.attribute.Normalize",
        options=["-S", "1.0", "-T", "0.0"])
    multi.filters = [remove, numericToNom, normalize]
    multi.inputformat(data)
    test = multi.filter(data)

    results = []
    for index, inst in enumerate(test):
        result = dict()

        pred = cls.classify_instance(inst)
        dist = cls.distribution_for_instance(inst)

        result["index"] = index + 1
        result["actual"] = inst.get_string_value(inst.class_index)
        result["predicted"] = inst.class_attribute.value(int(pred))
        result["error"] = "yes" if pred != inst.get_value(
            inst.class_index) else "no"
        result["distribution"] = str(dist.tolist())

        results.append(result)
        #print result

    return results
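
A usage sketch with placeholder file names (the JVM must already be running, and the CSV columns have to match the filters hard-coded above):

import weka.core.jvm as jvm

jvm.start()
results = predictWithWeka("instances_to_predict.csv", "spam_filter.model")
for r in results:
    print(r["index"], r["actual"], r["predicted"], r["error"], r["distribution"])
jvm.stop()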
Example No. 12
def assign_classify(file_location, output="classified.out", model="naivebayes.model"):
    data = read_csv_file(file_location)
    jvm.start()
    # load the trained classifier model
    obj = serialization.read(model)
    classifier = Classifier(jobject=obj)
    # write the predicted label for each instance to the output file
    with open(output, 'w') as cluster_file:
        for index, attrs in enumerate(data):
            inst = Instance.create_instance(attrs[1:])
            pred = classifier.classify_instance(inst)
            cluster_file.write(str(index + 1) + ": label index=" + str(pred) + "\n")
            print(str(index + 1) + ": label index=" + str(pred))
    jvm.stop()
def playback_speed_checker(inputFile, dirRef):
    
    TRAINING_ARFF = 'dataset_playback.arff'
    inputRef = ""

    # Start the JVM once, with all options combined
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")
    
    # Find reference file
    for file in os.listdir(dirRef):
        if str(file).find(str(os.path.basename(inputFile))) != -1:
            inputRef = os.path.join(dirRef, file)
            break

    # Calculation distance
    (result, distance) = dtw_checker(inputFile, inputRef)

    # Loading data
    loader = Loader(classname="weka.core.converters.ArffLoader")    
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()                    # set class attribute

    # Train the classifier
    #cls = Classifier(classname="weka.classifiers.functions.SMO")
    cls = Classifier(classname="weka.classifiers.trees.J48", options = ["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Build the instance to classify (assumes distance is a single numeric feature)
    speed_instance = Instance.create_instance([distance], classname='weka.core.DenseInstance', weight=1.0)
    speed_instance.dataset = data
    
    # Classify instance
    speed_flag = cls.classify_instance(speed_instance)
    
    if distance == 0:
        speed_class = 'nominal'
    elif speed_flag == 0:
        speed_class = 'down_speed'
    else:
        speed_class = 'up_speed'
        
#    print os.path.basename(inputFile) + ' --- ' + speed_class
    
    # Stop JVM
    jvm.stop()    

    print "SPEED IS: " + speed_class

    return speed_class
Example No. 14
def predict(attributes):
    jvm.start()
    file_path = print_to_file(attributes)
    # load the saved model
    objects = serialization.read_all("/Users/hosyvietanh/Desktop/data_mining/trained_model.model")
    classifier = Classifier(jobject=objects[0])
    loader = Loader(classname="weka.core.converters.ArffLoader")
    data = loader.load_file(file_path)
    data.class_is_last()
    prediction = None
    for index, inst in enumerate(data):
        pred = classifier.classify_instance(inst)
        dist = classifier.distribution_for_instance(inst)
        prediction = int(pred)
        break
    jvm.stop()
    return prediction
Example No. 15
    def weka_predict(self):

        # grab WEKA model
        objects = serialization.read_all(self.weka_model)
        classifier = Classifier(jobject=objects[0])

        # load the dataset i.e. the .arff file generated for the supplied url
        loader = Loader(classname="weka.core.converters.ArffLoader")
        data = loader.load_file(self.dataset)
        data.class_is_last()

        # for each url tested predict whether "Phishy" or not by using the Random Tree model
        for item in data:
            self.prediction = classifier.classify_instance(item)
Example No. 16
def eval_one_split(traindata, testdata, obj):
  c = Classifier(classname="weka.classifiers.trees.M5P")
  c.build_classifier(traindata)
  objidx = traindata.attribute_by_name(obj).index
  preds = []
  reals = []
  for idx, ins in enumerate(testdata):
    pred = c.classify_instance(ins)
    real = ins.get_value(objidx)
    preds.append(pred)
    reals.append(real)
    header = ''
    for h in traindata.attributes(): header += ' ' + h.name
  return preds, reals
    def functionProcessamento(self, ca1_r, ca1_l, ca2_ca3_r, ca2_ca3_l, sub_r,
                              sub_l, sexo, id):
        jvm.start()
        path = os.path.dirname(os.path.abspath(__file__))
        # TODO: check the individual's sex so that the correct model is loaded
        modelo = path + "\\naive_bayes_feminino_novo.model"
        if (sexo == "Male"):
            print("É masculino")
            modelo = path + "\\naive_bayes_feminino_novo.model"
        objects = serialization.read_all(modelo)
        classifier = Classifier(jobject=objects[0])
        loader = Loader(classname="weka.core.converters.ArffLoader")
        arquivo = open(path + "\\novo_individuo.arff", "w")
        conteudo = list()
        conteudo.append("@relation alzheimer \n\n")
        conteudo.append("@attribute doente {SIM, NAO} \n")
        conteudo.append("@attribute ca1_right real \n")
        conteudo.append("@attribute ca1_left real \n")
        conteudo.append("@attribute ca2_ca3_right real\n")
        conteudo.append("@attribute ca2_ca3_left real \n")
        conteudo.append("@attribute subic_right real \n")
        conteudo.append("@attribute subic_left real \n\n")
        conteudo.append("@data \n")
        # pass the variables in here
        conteudo.append("SIM," + str(ca1_r) + "," + str(ca1_l) + "," +
                        str(ca2_ca3_r) + "," + str(ca2_ca3_l) + "," +
                        str(sub_r) + "," + str(sub_l))
        print(conteudo)
        arquivo.writelines(conteudo)
        arquivo.close()

        data = loader.load_file(path + "\\novo_individuo.arff")
        data.class_is_last()
        for index, inst in enumerate(data):
            pred = classifier.classify_instance(inst)
            dist = classifier.distribution_for_instance(inst)
            # dist[0] is the probability of the first class ("SIM", i.e. diseased)
            pc_doenca = round(dist[0] * 100, 2)
            pc_saudavel = round(100 - pc_doenca, 2)
            print(" Porcentagem de alzheimer=" + str(pc_doenca) +
                  "%, porcentagem saudavel=" + str(pc_saudavel) + "%")
            alzheimer = Alzheimer.objects.get(id=id)
            alzheimer.resultado_ad = pc_doenca
            alzheimer.resultado_cn = pc_saudavel
            alzheimer.status_seg = 2
            alzheimer.save()
        jvm.stop()
Example No. 18
def run():
    jvm.start()
    load_csv = Loader("weka.core.converters.CSVLoader")
    data_csv = load_csv.load_file(
        "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.csv"
    )

    saver = Saver("weka.core.converters.ArffSaver")
    saver.save_file(
        data_csv,
        "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.arff"
    )

    load_arff = Loader("weka.core.converters.ArffLoader")
    data_arff = load_arff.load_file(
        "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.arff"
    )
    data_arff.class_is_last()

    cls = Classifier(classname="weka.classifiers.trees.J48",
                     options=["-C", "0.5"])
    cls.build_classifier(data_arff)
    for index, inst in enumerate(data_arff):
        pred = cls.classify_instance(inst)
        dist = cls.distribution_for_instance(inst)

    # save the pruned tree to a text file
    saveFile = open(
        "/Users/imeiliasantoso/web_graduate_project5/register_page/bank-full_input.txt",
        "w")
    saveFile.write(str(cls))
    # print(cls)
    saveFile.close()

    global j48
    J48_class = Classifier(classname="weka.classifiers.trees.J48",
                           options=["-C", "0.25", "-M", "2"])
    J48_class.build_classifier(data_arff)
    evaluationj48 = Evaluation(data_arff)
    evaluationj48.crossvalidate_model(J48_class, data_arff, 10, Random(100))
    j48 = str(evaluationj48.percent_correct)
    jvm.stop()
    return j48
Example No. 19
	def predict(self, modelName, x, arffName, debug=False):
		# Load the ARFF to get the structure of the instances
		loader = Loader(classname="weka.core.converters.ArffLoader")
		data = loader.load_file(arffName)


		# The class is assumed to be the last attribute
		data.class_is_last()

		# Load the model generated in Weka
		objects = serialization.read_all(modelName)
		cls = Classifier(jobject=objects[0])
		if(debug):
			print("Loaded model...")
			print(cls)

		# Build the instance for the input and classify it
		if(debug): print("Input", x)

		# Add a dummy value for the instance's class
		if data.class_attribute.is_nominal:
			x.append('a')
		else:
			x.append(0)

		# Convert nominal values to the integer position they occupy in their value list
		#print data.num_attributes
		for i in range(0, data.num_attributes):
			attribute = data.attribute(i)
			if attribute.is_nominal:
				x[i] = attribute.index_of(x[i])
		# Make the prediction
		inst = Instance.create_instance(x)
		inst.dataset = data
		pred = cls.classify_instance(inst)
		if data.class_attribute.is_nominal:
			pred = data.class_attribute.value(int(pred))
		if(debug): print("Prediction", pred)

		return pred
Example No. 20
class TweetClassifier2:
    def __init__(self, twitter_user='******'):
        self.__dir = dirname(__file__)
        self.twitter_user = twitter_user
        self.training_data_file = join(
            dirname(__file__),
            '../{}_classified_data.arff'.format(twitter_user))

        self.loader = Loader(classname="weka.core.converters.ArffLoader")
        data = self.loader.load_file(self.training_data_file)
        data.class_is_last()

        # the classes / categories we are classifying tweets as
        self.categories = data.attribute(data.class_index).values

        self.classifier = Classifier(classname='weka.classifiers.trees.J48',
                                     options=['-C', '0.3'])
        self.classifier.build_classifier(data)

    def classify(self, document):
        """Use the trained classifier to determine the class of this
        new document

        :param document: <str> body of text to classify
        :return: <str> name of the class / category of the document
        """
        document = {
            'text': document,
            'category': ['?'],
        }
        # save as json
        fp = join(self.__dir, '../{}_new_doc.json'.format(self.twitter_user))
        json.dump([document], open(fp, 'w'))

        # transform to arff conforming to attributes from trained data set
        arff_file = transform(fp, 'dont_care', 80, self.training_data_file)

        data = self.loader.load_file(arff_file)
        data.class_is_last()
        predicted = self.classifier.classify_instance(data.get_instance(0))

        return self.categories[int(predicted)]
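
A hypothetical usage sketch; it assumes ../some_account_classified_data.arff exists, the external transform() helper is importable, and the JVM has been started:

import weka.core.jvm as jvm

jvm.start()
clf = TweetClassifier2(twitter_user="some_account")
print(clf.classify("Just tried the new release, the build passes again!"))
jvm.stop()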
Example No. 21
    def train_and_separate_validation(self, trainingSet, validationSet,
                                      validationInstancesNames, classifier):

        loader = Loader(classname="weka.core.converters.ArffLoader")
        data = loader.load_file(trainingSet)
        data.class_is_last()
        data2 = loader.load_file(validationSet)
        if not len(data2) == len(validationInstancesNames):
            print(
                "Theres a mismatch between the number of instances in the arff file and the list of instance names."
            )
            raise LookupError
        data2.class_is_last()
        classes = [str(code) for code in data.class_attribute.values]
        header = [[classifier, trainingSet, "", "", ""], ["Instance"] +
                  [className + " probability"
                   for className in classes] + ["Real Class", "Guess"]]

        cls = Classifier(classname=classifier)
        print("Training.")
        cls.build_classifier(data)
        print("Model done!")

        dataMatrix = [["", 0, 0, 0, ""] for i in range(len(data2))]

        print("Validating.")
        for index, inst in enumerate(data2):
            print("Instance: " + str(index + 1) + "/" + str(len(data2)))
            pred = cls.classify_instance(inst)
            if inst.get_value(inst.class_index) == pred:
                guessValue = 1.0
            else:
                guessValue = 0.0
            dist = cls.distribution_for_instance(inst)
            dataMatrix[index][0] = validationInstancesNames[index]
            dataMatrix[index][1:3] = [round(p, 2) for p in dist]
            dataMatrix[index][3] = classes[int(inst.get_value(
                inst.class_index))]
            dataMatrix[index][4] = guessValue

        print("Done\n")
        return [header, dataMatrix]
Example No. 22
def riaa_checker(inputFile):
    
    TRAINING_ARFF = r'C:\Users\ASUS\Desktop\IGNASI\SMC\Workspace\dataset_riaa.arff'

    # Start the JVM once, with all options combined
    jvm.start(system_cp=True, packages=True, max_heap_size="512m")

    # Calculation of bark bands information
    (absolute_bark, relative_bark, bark_ratios) = compute_bark_spectrum(inputFile)

    # Loading data
    loader = Loader(classname="weka.core.converters.ArffLoader")    
    data = loader.load_file(TRAINING_ARFF)
    data.class_is_last()                    # set class attribute

    # Train the classifier
    cls = Classifier(classname="weka.classifiers.functions.SMO")
    #cls = Classifier(classname="weka.classifiers.trees.J48", options = ["-C", "0.3", "-M", "10"])
    cls.build_classifier(data)

    # Classify instance
    bark_instance = Instance.create_instance(bark_ratios, classname='weka.core.DenseInstance', weight=1.0)
    bark_instance.dataset = data
    
    # Classify instance
    riaa_flag = cls.classify_instance(bark_instance)
    
    if riaa_flag == 0:
        riaa_class = 'riaa_ok'
    else:
        riaa_class = 'riaa_ko'
        
#    print os.path.basename(inputFile) + ' --- ' + riaa_class
    
    # Stop JVM
    jvm.stop()   

    print "RIAA FILTERING?: " + riaa_class

    return riaa_class
Example No. 23
def classify(train, test, name="RF", tuning=False):
    jvm.start()

    if isinstance(train, list) and isinstance(test, list):
        train = weka_instance(train)
        trn_data = converters.load_any_file(train)
        test = weka_instance(test)
        tst_data = converters.load_any_file(test)

    elif os.path.isfile(train) and os.path.isfile(test):
        trn_data = converters.load_any_file(train)
        tst_data = converters.load_any_file(test)

    else:
        trn = csv_as_ndarray(train)
        tst = csv_as_ndarray(test)

        trn_data = converters.ndarray_to_instances(trn, relation="Train")
        tst_data = converters.ndarray_to_instances(tst, relation="Test")

    trn_data.class_is_last()
    tst_data.class_is_last()

    # t = time()
    if tuning:
        opt = tune(train)
    else:
        opt = default_opt
    # print("Time to tune: {} seconds".format(time() - t))

    cls = Classifier(classname=classifiers[name.lower()], options=opt)

    cls.build_classifier(trn_data)

    distr = [cls.distribution_for_instance(inst)[1] for inst in tst_data]
    preds = [cls.classify_instance(inst) for inst in tst_data]

    jvm.stop()

    return preds, distr
Example No. 24
class ObjectiveClassifier:
    def __init__(self, model_path, senti_path, stop_words, ngrams_path):
        self.loader = Loader(classname="weka.core.converters.ArffLoader")
        self.features_calculator = FeaturesCalculator(ngrams_path)
        self.classifier = Classifier(jobject=serialization.read(model_path))
        self.normalizer = Preprocessor(senti_path)
        self.stop_words = stop_words

    def classify_tweet(self, tweet, polarity='"positive"'):
        tweet_normalized = self.normalizer.preprocess(tweet, self.stop_words,
                                                      "")
        self.features_calculator.calculateFeatures(
            tweet_normalized, "output/tweet_features_objective.arff", polarity)
        tweet_features = self.loader.load_file(
            "output/tweet_features_objective.arff")
        tweet_features.class_is_last()
        for index, inst in enumerate(tweet_features):
            pred = self.classifier.classify_instance(inst)
            dist = self.classifier.distribution_for_instance(inst)
            print("%d - %s - %s" %
                  (index + 1, inst.class_attribute.value(
                      int(pred)), str(dist.tolist())))
Example No. 25
def python_wrapper(mImage, prefix, file_name, pre_prefix, dir, permanent_dir,
                   model):
    # Initialization of weka machine learning library
    weka_machine_learning = WML.WekaMachineLearning()
    # tokenization of images
    token = re.split('RGB_|.png', mImage)
    ir_directory = token[0] + 'IR_' + token[1] + '.pgm'
    mat_directory = token[0] + 'Mat_' + token[1]

    # get mat and ir image
    image = segmentor.getImage(ir_directory)
    mat = segmentor.readMatFile(mat_directory)

    # image processing
    edges = segmentor.edgeDetector(image)
    type = segmentor.getTypeOfFruit(image)
    segmentation = segmentor.segmentation(image, type)
    filter = segmentor.filterImageFromSegmentation(image, segmentation)
    output_seg = segmentor.imageMapping(filter, mat['IR'])

    ####################-Anomaly Detection via INFLO-###################

    # file prefix creation for the csv file to save
    prefix_csv = prefix + "\\" + file_name

    # if the folder is not there then create it
    # and write the csv to the folder
    if not os.path.exists(prefix):
        os.mkdir(prefix)
        csv = segmentor.writeToCSV(output_seg, prefix_csv)
        print("file is written")

    # else simply write the csv to the folder
    else:
        csv = segmentor.writeToCSV(output_seg, prefix_csv)
        print("file is written")
    #call the INFLO.bat after segmenting the image
    #for anomaly detection
    run_batch_file("rapid_miner_pro_ifruitlfy.bat")
    ############################-Clustering-############################

    # image file directory is stored in ir_directory
    # mat file directory is stored in mat_directory
    # and need to get the INFLO file
    # directory for INFLO file is prefix_csv
    anomaly_file = prefix_csv + '.csv_INFLO.csv'
    # directory for the temporary files is made so
    # some results can be stored and processed auto-
    # matically by the rapid miner 5, this folder is

    demo_printing_picture(permanent_dir, prefix, mImage, pre_prefix, dir,
                          file_name)
    print(
        "END OF ANOMALY DETECTION CLICK TRAIN AND SHOW RESULT FOR PROCESSING")
    write_temp_dir = permanent_dir + "\\"
    print(prefix)
    print(file_name)
    # Clean the junk of the output files
    if os.path.exists(permanent_dir + "//output.csv"):
        os.remove(permanent_dir + "//output.csv")
    features = iFruitFly_clustering.cluster_analysis.cluster_analysis(
        ir_directory, permanent_dir + "\\output_INFLO.csv", mat_directory,
        dir + "\\" + file_name, prefix, file_name, permanent_dir)
    if features is None:
        print("Image can't be segmented due to poor calibration")

    # other files are stored for the user in the junk
    else:
        print("printing images->>>>>>> ", prefix + file_name)
        image_plotter(features, ir_directory, prefix + file_name)
    import csv
    # Weka Machine Learning Inclusion on 5/30/2017
    # adding one extra column
    with open(permanent_dir + "\\output.csv", 'r') as csvinput:
        with open(permanent_dir + "\\output_n.csv", 'w') as csvoutput:
            writer = csv.writer(csvoutput, lineterminator='\n')
            reader = csv.reader(csvinput)
            all = []
            row = next(reader)
            row.append('result')
            all.append(row)
            for row in reader:
                row.append(0)
                all.append(row)
            writer.writerows(all)
    #model = "J:\iFruitFly\Python Scripts\Model 1\\model.model"
    data_dir = permanent_dir + "\\output_n.csv"
    #data_dir_open = open(data_dir)
    #r = csv.reader(data_dir_open)

    jvm.start()
    loader = Loader(classname="weka.core.converters.CSVLoader")
    data = loader.load_file(data_dir)
    # using the serialization library for
    # opening the model
    objects = serialization.read_all(model)
    classifier = Classifier(jobject=objects[0])
    print "Model Classified"
    print classifier
    data.class_is_last()
    for index, inst in enumerate(data):
        pred = classifier.classify_instance(inst)
        dist = classifier.distribution_for_instance(inst)
        print(pred)
Example No. 26
def run_classifier(path, prot, sel, cols, prot_vals, beta):
        
    DIs = dict()
    jvm.start()

    for i in range(len(cols)-1):
        loader = Loader(classname="weka.core.converters.CSVLoader")
        data = loader.load_file(path)
    
        # remove selected attribute from the data
        # NOTE: options are ONE indexed, not ZERO indexed
        remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", \
                        options=["-R", str(sel[2]+1)])
        remove.inputformat(data)
        data = remove.filter(data)

        # if running for only one attribue, remove all others (except protected)
        if i > 0:
            for j in range(1, prot[2]+1):
                if i != j:
                    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", \
                                    options=["-R", ("1" if i>j else "2")])
                    remove.inputformat(data)
                    data = remove.filter(data)

        # set prot attribute as Class attribute
        data.class_is_last()
        
        # run classifier
        cls = Classifier(classname="weka.classifiers.bayes.NaiveBayes")
        cls.build_classifier(data)
    
        # count the number of each combination
        pos_and_pred = float(0.0)
        pos_and_not_pred = float(0.0)
        neg_and_pred = float(0.0)
        neg_and_not_pred = float(0.0)
        for ind, inst in enumerate(data):
            if cls.classify_instance(inst):
                if prot_vals[ind] == prot[1]:
                    pos_and_pred += 1
                else:
                    neg_and_pred += 1
            else:
                if prot_vals[ind] == prot[1]:
                    pos_and_not_pred += 1
                else:
                    neg_and_not_pred += 1

        # calculate DI
        BER = ((pos_and_not_pred / (pos_and_pred + pos_and_not_pred)) + \
               (neg_and_pred / (neg_and_pred + neg_and_not_pred))) * 0.5
        if BER > 0.5:
            BER = 1 - BER
        DI = 1 - ((1 - 2 * BER) / (beta + 1 - 2 * BER))

        if i == 0: # consider changing this to a 'code word' instead of 'all'
            DIs["all"] = DI
        else:
            DIs[cols[i-1]] = DI

    jvm.stop()

    return DIs
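
To make the BER/DI computation at the end of the loop above concrete, here is the same arithmetic on hypothetical counts (the counts and beta are placeholders, not results from any dataset):

pos_and_pred, pos_and_not_pred = 30.0, 10.0
neg_and_pred, neg_and_not_pred = 20.0, 40.0
beta = 0.1

BER = ((pos_and_not_pred / (pos_and_pred + pos_and_not_pred)) +
       (neg_and_pred / (neg_and_pred + neg_and_not_pred))) * 0.5   # ~0.292
if BER > 0.5:
    BER = 1 - BER
DI = 1 - ((1 - 2 * BER) / (beta + 1 - 2 * BER))                    # ~0.194
print(BER, DI)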
for classifier in classifiers:
    print("~~~~~~~~~~~~~~~~~~~")
    print(classifier)

data_test = loader.load_file(test_csv)
data_test.class_is_last()

actual_scores = []
with open(test_csv) as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        actual_scores.append(row['Score'])

# truncate the results file before writing
with open(result_csv, 'w') as csvfile:
    pass

print(data_test)

for index, inst in enumerate(data_test):
    print(index)
    knn_score = int(knn_classifier.classify_instance(inst))
    lin_score = int(lin_classifier.classify_instance(inst))
    svm_score = int(svm_classifier.classify_instance(inst))
    classifiers_scores = [["KNN", knn_score], ["Linear Regression", lin_score],
                          ["SVM", svm_score],
                          ["Actual Score", actual_scores[index]]]
    write_to_csv(classifiers_scores, result_csv)

jvm.stop()
Example No. 28
class WekaWrapper:

	def __init__(self, questionID, algorithm, classifier, parameters, modelParams, optimizer, predict = 0):
		self.questionID = questionID
		self.algorithm = algorithm
		self.classifier = classifier
		self.parameters = parameters
		self.modelParams = modelParams
		self.api = nemoApi()
		self.config = nemoConfig()
		self.optimizer = optimizer
		self.predict = predict
		self.prediction = None


	def retrieveData(self, id, dataset):
		query = self.api.getDataQuery(id, dataset)
		iquery = InstanceQuery()
		iquery.db_url = "jdbc:mysql://" + self.config.HOST + ":" + str(self.config.PORT) + "/" + self.config.DB
		iquery.user = self.config.USER
		iquery.password = self.config.PASS
		iquery.query = query
		data = iquery.retrieve_instances()
		data.class_is_last()
		return data

	def uploadData(self):
		# Upload file to database
		self.api.addModel(self.questionID, '?', self.acc, self.model, self.algorithm, False, self.matrix, self.optimizer)
		info = self.api.fetchQuestionInfo(self.questionID)
		modelID = info['ID']
		for mParam in self.modelParams:
			mParam.AIModel = modelID
			self.api.addAIModelParam(mParam)

	def uploadPrediction(self):
		# Upload best classifier prediction to database

		if self.prediction is not None:
			# Convert prediction to string
			predStr = 'No prediction'
			if (self.prediction == 1.0):
				predStr = "True"
			elif (self.prediction == 0.0):
				predStr = "False"
			print('Writing ' + predStr)
			self.api.updatePrediction(self.questionID, predStr)

	def addInstancesToDataset(self, source, dest):
		# Align the instances of a source dataset to destination's header and add them to the destination dataset
		i = 0
		while i < source.num_instances:
			values = source.get_instance(i).values
			it = np.nditer(values, flags=['f_index'], op_flags=['readwrite'])
			while not it.finished:
				(it[0], it.index),
				if (source.attribute(it.index).is_nominal):
					stringVal = source.get_instance(i).get_string_value(it.index)
					# print stringVal
					if(stringVal != '?'):
						values[it.index] = dest.attribute(it.index).values.index(stringVal)
				it.iternext()
			dest.add_instance(Instance.create_instance(values))
			i = i + 1

	def buildPatientObject(self):
		# Build a patient to classify
		patient = self.api.fetchPatientJSON(self.questionID)
		if patient is not None:
			newPatient = {}
			demographics = ['race_cd', 'sex_cd', 'age_in_years_num']
			observation_fact_features = ['tval_char', 'nval_num']
			for demo in demographics:
				if demo not in patient:
					print "Patient definition missing" + demo + "."
					newPatient[demo] = float('nan')
				else:
					if patient[demo] is not None and patient[demo] != '':
						newPatient[demo] = patient[demo]
					else: 
						print "Demographic " + demo +  " for patient is empty"
						newPatient[demo] = float('nan')
			for obs in patient['observation_facts']:
				concept_cd = obs['concept_cd']
				for feat in observation_fact_features:
					if feat in obs:
						if obs[feat] is not None:
							newPatient[(concept_cd + feat)] = obs[feat]
						else:
							newPatient[(concept_cd + feat)] = float('nan')
					else:
						print "Feature " + concept_cd + feat + " missing from Patient definition, marking it None"
						newPatient[(concept_cd + feat)] = float('nan')
			return newPatient
		else:
			return None

	def addPatientNominals(self, patient, dataset):
		# Add the nominal values for the patient to the master header, in case they aren't already there
		# Loop and add patient's nominal values in case they aren't in masterDataset
		# newDataset will be the new master header
		# Waiting on prediction patient to be defined
		# Should be like {sex_cd: "m", ...}
		ignoreAttributes = ['readmitted']
		atts = []
		for a in dataset.attributes():
			if (not (a.is_nominal)) or (a.name in ignoreAttributes) :
				atts.append(a)
			else:
				newValues = list(a.values)
				#print a.name
				pvalue = patient[a.name]
				if(pvalue not in newValues):
					newValues.append(pvalue)
				atts.append(Attribute.create_nominal(a.name, newValues))
		newDataset = Instances.create_instances("Dataset", atts, 0)
		newDataset.class_is_last()
		return newDataset

	def addNominals(self, dataset):
		# Add the nominal values for all columns, in case a column has none
		ignoreAttributes = ['readmitted']
		atts = []
		for a in dataset.attributes():
			if (not (a.is_nominal)) or (a.name in ignoreAttributes) :
				atts.append(a)
			else:
				newValues = list(a.values)
				pvalue = 'DefaultNominal'
				if(pvalue not in newValues):
					newValues.append(pvalue)
				atts.append(Attribute.create_nominal(a.name, newValues))
		newDataset = Instances.create_instances("Dataset", atts, 0)
		newDataset.class_is_last()
		return newDataset
		
	def createPatientInstance(self, patient, dataset):
		# Create a patient instance to classify
		ignoreAttributes = ['readmitted']
		values = []
		for a in dataset.attributes():
			if not a.is_nominal:
				values.append(patient[a.name])
			elif a.name in ignoreAttributes:
				values.append(0)
			else:
				values.append(a.values.index(patient[a.name]))
		#print values
		newInst = Instance.create_instance(values)
		return newInst



	def run(self):
		# Attach JVM
		javabridge.attach()

		# Debug

		print "Classifier"
		print self.classifier
		print "Params"
		print self.parameters
		print "Model Params"
		print self.modelParams

		# Get data for testing and learning
		learnerData = self.retrieveData(self.questionID, "learner")
		testData = self.retrieveData(self.questionID, 'test')
		masterData = self.retrieveData(self.questionID, 'all')
		masterData = self.addNominals(masterData)

		# Check if there is enough correct data to run
		if (learnerData.num_instances < 1 or testData.num_instances < 1):
			self.status = self.config.NOT_ENOUGH_DATA
			return False

		# If this is a prediction and there is a valid patient, change masterData header
		patientObj = self.buildPatientObject()
		patientInstance = None
		if ((patientObj is not None) and (self.predict == 1)):
			masterData = self.addPatientNominals(patientObj, masterData)
			patientInstance = self.createPatientInstance(patientObj, masterData)
			masterData.add_instance(patientInstance)

		elif (patientObj is None) and (self.predict == 1):
			print('No patient defined for prediction. Exiting')
			return True
		# Fix dataset headers up to match and fix instances to match headers
		masterData.delete()
		learner = masterData.copy_instances(masterData, 0, 0)
		test = masterData.copy_instances(masterData, 0, 0)
		self.addInstancesToDataset(learnerData, learner)
		self.addInstancesToDataset(testData, test)

		# Comparison of data for testing purposes
		# print 'learnerData'
		# print learnerData

		# print 'learner'
		# print learner

		# print 'testData'
		# print testData

		# print 'test'
		# print test

		# pdb.set_trace()
		# Instantiate classifier
		self.cls = Classifier(classname=self.classifier, options=self.parameters)

		# Run classifier
		self.cls.build_classifier(learner)
		# for index, inst in enumerate(learnerData):
			# prediction = self.cls.classify_instance(inst)
			# distribution = self.cls.distribution_for_instance(inst)

		# Test classifier
		evl = Evaluation(learner)
		evl.test_model(self.cls, test)

		# Store information about matrix
		self.acc = evl.percent_correct
		self.val = None

		# Convert numpy array into simple array
		confusionMatrix = []
		confusionMatrix.append([evl.confusion_matrix[0][0], evl.confusion_matrix[0][1]])
		confusionMatrix.append([evl.confusion_matrix[1][0], evl.confusion_matrix[1][1]])

		# Convert matrix into json format
		self.matrix = json.dumps(confusionMatrix)

		
		# print 'Classifier: ', self.classifier
		# print 'ID: ', self.questionID
		# print 'ACC: ', self.acc
		# print(evl.summary())

		# If this is a prediction... make the prediction
		if ((patientObj is not None) and (self.predict == 1)):
			masterData.add_instance(patientInstance)
			print "Running prediction on patient: "
			print masterData.get_instance(0)
			self.prediction = self.cls.classify_instance(masterData.get_instance(0))
			#self.uploadPrediction()

		# Temporarily store file to serialize to
		fileName = str(self.questionID) + self.algorithm + ".model"
		serialization.write(fileName, self.cls)

		# Open that file and store it
		self.model = None
		with open(fileName, 'rb') as f:
			self.model = f.read()

		# Remove temporary file
		os.remove(fileName)

		# Set status to awaiting feedback
		self.status = self.config.AWAITING_FEEDBACK_STATUS
		return True
Example No. 29
def evaluate_j48(datasets_path, intermediary_path):
    # for examples on how to use this function, refer to
    # http://pythonhosted.org/python-weka-wrapper/examples.html#build-classifier-on-dataset-output-predictions
    import weka.core.jvm as jvm
    from weka.core.converters import Loader
    from weka.classifiers import Classifier
    from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, f1_score

    from networkx.drawing.nx_agraph import graphviz_layout

    jvm.start()

    json_results = {
        'runs': {
            '1': dict()
        }
    }

    try:
        for dataset in os.listdir(datasets_path):
            dataset_name = dataset.split('.')[0]

            json_results['runs']['1'][dataset_name] = dict()

            loader = Loader(classname="weka.core.converters.ArffLoader")

            y_pred_all = []
            y_true_all = []
            heights = []
            n_nodes = []

            for n_fold in it.count():
                try:
                    train_s = loader.load_file(
                        os.path.join(intermediary_path, '%s_fold_%d_train.arff' % (dataset_name, n_fold)))
                    val_s = loader.load_file(
                        os.path.join(intermediary_path, '%s_fold_%d_val.arff' % (dataset_name, n_fold)))
                    test_s = loader.load_file(
                        os.path.join(intermediary_path, '%s_fold_%d_test.arff' % (dataset_name, n_fold)))

                    train_s.relationname = dataset_name
                    val_s.relationname = dataset_name
                    test_s.relationname = dataset_name

                    train_s.class_is_last()
                    val_s.class_is_last()
                    test_s.class_is_last()

                    warnings.warn('WARNING: appending validation set in training set.')
                    for inst in val_s:
                        train_s.add_instance(inst)

                    cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.25", "-M", "2"])
                    # cls = Classifier(classname="weka.classifiers.trees.REPTree",
                    # options=["-M", "2", "-V", "0.001", "-N", "3", "-S", "1", "-L", "-1", "-I", "0.0"])
                    cls.build_classifier(train_s)

                    warnings.warn('WARNING: will only work for binary splits!')
                    graph = cls.graph.encode('ascii')
                    out = StringIO.StringIO(graph)
                    G = nx.Graph(nx.nx_pydot.read_dot(out))

                    # TODO plotting!
                    # fig = plt.figure(figsize=(40, 30))
                    # pos = graphviz_layout(G, root='N0', prog='dot')
                    #
                    # edgelist = G.edges(data=True)
                    # nodelist = G.nodes(data=True)
                    #
                    # edge_labels = {(x1, x2): v['label'] for x1, x2, v in edgelist}
                    # node_colors = {node_id: ('#98FB98' if 'shape' in _dict else '#0099FF') for node_id, _dict in nodelist}
                    # node_colors['N0'] = '#FFFFFF'
                    # node_colors = node_colors.values()
                    #
                    # nx.draw_networkx_nodes(G, pos, node_color=node_colors)
                    # nx.draw_networkx_edges(G, pos, style='dashed', arrows=False)
                    # nx.draw_networkx_labels(G, pos, {k: v['label'] for k, v in G.node.iteritems()})
                    # nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
                    # plt.axis('off')
                    # plt.show()
                    # exit(0)
                    # TODO plotting!

                    heights += [max(map(len, nx.shortest_path(G, source='N0').itervalues()))]
                    n_nodes += [len(G.node)]

                    y_test_true = []
                    y_test_pred = []

                    # y_train_true = []
                    # y_train_pred = []

                    # for index, inst in enumerate(train_s):
                    #     y_train_true += [inst.get_value(inst.class_index)]
                    #     y_train_pred += [cls.classify_instance(inst)]

                    for index, inst in enumerate(test_s):
                        y_test_true += [inst.get_value(inst.class_index)]
                        y_test_pred += [cls.classify_instance(inst)]

                    y_true_all += y_test_true
                    y_pred_all += y_test_pred

                except Exception as e:
                    break

            json_results['runs']['1'][dataset_name] = {
                'confusion_matrix': confusion_matrix(y_true_all, y_pred_all).tolist(),
                'height': heights,
                'n_nodes': n_nodes,
            }

        # interprets
        json_results = json.load(open('/home/henry/Desktop/j48/j48_results.json', 'r'))

        n_runs = len(json_results['runs'].keys())
        some_run = json_results['runs'].keys()[0]
        n_datasets = len(json_results['runs'][some_run].keys())

        df = pd.DataFrame(
            columns=['run', 'dataset', 'test_acc', 'height mean', 'height std', 'n_nodes mean', 'n_nodes std'],
            index=np.arange(n_runs * n_datasets),
            dtype=np.float32
        )

        df['dataset'] = df['dataset'].astype(np.object)

        count_row = 0
        for n_run, run in json_results['runs'].iteritems():
            for dataset_name, dataset in run.iteritems():
                conf_matrix = np.array(dataset['confusion_matrix'], dtype=np.float32)

                test_acc = np.diag(conf_matrix).sum() / conf_matrix.sum()

                height_mean = np.mean(dataset['height'])
                height_std = np.std(dataset['height'])
                n_nodes_mean = np.mean(dataset['n_nodes'])
                n_nodes_std = np.std(dataset['n_nodes'])

                df.loc[count_row] = [
                    int(n_run), str(dataset_name), float(test_acc),
                    float(height_mean), float(height_std), float(n_nodes_mean), float(n_nodes_std)
                ]
                count_row += 1

        print(df)
        json.dump(json_results, open('j48_results.json', 'w'), indent=2)
        df.to_csv('j48_results.csv', sep=',', quotechar='\"', index=False)

    finally:
        jvm.stop()
def predictionFromModel():
    import weka.core.serialization as serialization
    from weka.classifiers import Classifier
    from weka.classifiers import Evaluation

    predictionsPath = outputPrediction
    models_dir = inputModel
    modelsList = os.listdir(inputModel)
    data_dir = input
    folderList = os.listdir(inputModel)
    i = 0
    loader = Loader(classname="weka.core.converters.ArffLoader")
    from weka.core.classes import Random
    from weka.core.dataset import Instances

    data = loader.load_file(os.path.join(inputModel, "genderTest.arff"))
    data.class_is_last()
    modelName = "GenderModel.model"
    objects = serialization.read_all(os.path.join(inputModel, modelName))
    trainedModel = Classifier(jobject=objects[0])
    genderFile = open(os.path.join(outputPrediction, 'Gender_Predictions.csv'),
                      'w')
    with genderFile:
        j = -1
        fieldnames = ['Test_Author_Profile_Id', 'Gender']
        writer = csv.DictWriter(genderFile, fieldnames=fieldnames)
        writer.writeheader()
        for index, inst in enumerate(data):
            j = j + 1
            pred = trainedModel.classify_instance(inst)
            dist = trainedModel.distribution_for_instance(inst)
            print(
                str(index + 1) + ": label index=" + str(pred) +
                ", class distribution=" + str(dist))
            if (str(pred) == '0.0'):
                writer.writerow({
                    'Test_Author_Profile_Id': my_list[j],
                    'Gender': 'male'
                })
            if (str(pred) == '1.0'):
                writer.writerow({
                    'Test_Author_Profile_Id': my_list[j],
                    'Gender': 'female'
                })

    data = loader.load_file(os.path.join(inputModel, "ageTest.arff"))
    data.class_is_last()
    modelName = "AgeModel.model"
    objects = serialization.read_all(os.path.join(inputModel, modelName))
    trainedModel = Classifier(jobject=objects[0])
    ageFile = open(os.path.join(outputPrediction, 'Age_Predictions.csv'), 'w')

    with ageFile:
        j = -1
        fieldnames = ['Test_Author_Profile_Id', 'Age']
        writer = csv.DictWriter(ageFile, fieldnames=fieldnames)
        writer.writeheader()
        for index, inst in enumerate(data):
            j = j + 1
            pred = trainedModel.classify_instance(inst)
            dist = trainedModel.distribution_for_instance(inst)
            print(
                str(index + 1) + ": label index=" + str(pred) +
                ", class distribution=" + str(dist))
            if (str(pred) == '0.0'):
                writer.writerow({
                    'Test_Author_Profile_Id': my_list[j],
                    'Age': '15-19'
                })
            if (str(pred) == '1.0'):
                writer.writerow({
                    'Test_Author_Profile_Id': my_list[j],
                    'Age': '20-24'
                })
            if (str(pred) == '2.0'):
                writer.writerow({
                    'Test_Author_Profile_Id': my_list[j],
                    'Age': '25-xx'
                })
    os._exit(0)
Example No. 31
    def runner(self, cdat, heap_size = 16384, seed = None, verbose = True):
        self.set_status(Pipeline.RUNNING)

        self.logs.append('Initializing Pipeline')

        para = self.config

        self.logs.append('Reading Pipeline Configuration')

        head = ''
        name = get_rand_uuid_str()

        self.logs.append('Reading Input File')

        for i, stage in enumerate(self.stages):
            if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
                self.stages[i].status = Pipeline.RUNNING
            if stage.code ==  'dat.fle':
                head    = os.path.abspath(stage.value.path)
                name, _ = os.path.splitext(stage.value.name)

        self.logs.append('Parsing to ARFF')

        path = os.path.join(head, '{name}.arff'.format(name = name))
        # This has a bug I haven't tracked down; using Config.schema instead.
        # cdat.toARFF(path, express_config = para.Preprocess.schema, verbose = verbose)

        for i, stage in enumerate(self.stages):
            if stage.code in ('dat.fle', 'prp.bgc', 'prp.nrm', 'prp.pmc', 'prp.sum'):
                self.stages[i].status = Pipeline.COMPLETE

        self.logs.append('Saved ARFF at {path}'.format(path = path))
        self.logs.append('Splitting to Training and Testing Sets')

        JVM.start(max_heap_size = '{size}m'.format(size = heap_size))

        load = Loader(classname = 'weka.core.converters.ArffLoader')
        # data = load.load_file(path)
        # save =  Saver(classname = 'weka.core.converters.ArffSaver')
        data = load.load_file(os.path.join(head, 'iris.arff')) # For Debugging Purposes Only
        data.class_is_last() # For Debugging Purposes Only
        # data.class_index = cdat.iclss

        for i, stage in enumerate(self.stages):
            if stage.code == 'prp.kcv':
                self.stages[i].status = Pipeline.RUNNING

        self.logs.append('Splitting Training Set')

        # TODO - Check if this seed is worth it.
        seed = assign_if_none(seed, random.randint(0, 1000))
        opts = ['-S', str(seed), '-N', str(para.Preprocess.FOLDS)]
        wobj = Filter(classname = 'weka.filters.supervised.instance.StratifiedRemoveFolds', options = opts + ['-V'])
        wobj.inputformat(data)

        tran = wobj.filter(data)

        self.logs.append('Splitting Testing Set')

        wobj.options = opts
        test = wobj.filter(data)
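        # Note on the two filter calls above: StratifiedRemoveFolds keeps a
        # single fold (by default the first of para.Preprocess.FOLDS folds);
        # the '-V' option inverts that selection. The first call (with '-V')
        # therefore keeps the remaining folds as the training set, while the
        # second call (without '-V') keeps the selected fold as the test set.
        # Both calls share the same seed, so the two sets are complementary.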

        for i, stage in enumerate(self.stages):
            if stage.code == 'prp.kcv':
                self.stages[i].status = Pipeline.COMPLETE

        self.logs.append('Performing Feature Selection')

        feat = [ ]
        for comb in para.FEATURE_SELECTION:
            if comb.USE:
                for i, stage in enumerate(self.stages):
                    if stage.code == 'ats':
                        search    = stage.value.search.name
                        evaluator = stage.value.evaluator.name

                        if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                            self.stages[i].status = Pipeline.RUNNING

                srch = ASSearch(
                    classname = 'weka.attributeSelection.{classname}'.format(classname = comb.Search.NAME),
                    options   = assign_if_none(comb.Search.OPTIONS, [ ])
                )
                ewal = ASEvaluation(
                    classname = 'weka.attributeSelection.{classname}'.format(classname = comb.Evaluator.NAME),
                    options   = assign_if_none(comb.Evaluator.OPTIONS, [ ])
                )

                attr = AttributeSelection()
                attr.search(srch)
                attr.evaluator(ewal)
                attr.select_attributes(tran)

                meta = addict.Dict()
                meta.search    = comb.Search.NAME
                meta.evaluator = comb.Evaluator.NAME
                meta.features  = [tran.attribute(index).name for index in attr.selected_attributes]

                feat.append(meta)

                for i, stage in enumerate(self.stages):
                    if stage.code == 'ats':
                        search    = stage.value.search.name
                        evaluator = stage.value.evaluator.name

                        if search == comb.Search.NAME and evaluator == comb.Evaluator.NAME:
                            self.stages[i].status = Pipeline.COMPLETE

        models = [ ]
        for model in para.MODEL:
            if model.USE:
                summary         = addict.Dict()

                self.logs.append('Modelling {model}'.format(model = model.LABEL))

                summary.label   = model.LABEL
                summary.name    = model.NAME
                summary.options = assign_if_none(model.OPTIONS, [ ])

                for i, stage in enumerate(self.stages):
                    if stage.code == 'lrn' and stage.value.name == model.NAME:
                        self.stages[i].status = Pipeline.RUNNING

                # All instances share the same header, so the class-index list
                # used later for the ROC/PRC plots only needs computing once.
                iclass = list(range(data.class_attribute.num_values))

                options    = assign_if_none(model.OPTIONS, [ ])
                classifier = Classifier(classname = 'weka.classifiers.{classname}'.format(classname = model.NAME), options = options)
                classifier.build_classifier(tran)
        
                serializer.write(os.path.join(head, '{name}.{classname}.model'.format(
                    name      = name,
                    classname = model.NAME
                )), classifier)

                self.logs.append('Testing model {model}'.format(model = model.LABEL))

                evaluation       = Evaluation(tran)
                evaluation.test_model(classifier, test)

                summary.summary  = evaluation.summary()

                frame  = pd.DataFrame(data = evaluation.confusion_matrix)
                axes   = sns.heatmap(frame, cbar = False, annot = True)
                b64str = get_b64_plot(axes)
                
                summary.confusion_matrix = addict.Dict({
                    'value': evaluation.confusion_matrix.tolist(),
                     'plot': b64str
                })

                self.logs.append('Plotting Learning Curve for {model}'.format(model = model.LABEL))

                buffer = io.BytesIO()
                plot_classifier_errors(evaluation.predictions, tran, test, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.learning_curve   = b64str

                buffer = io.BytesIO()
                plot_roc(evaluation, class_index = iclass, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.roc_curve        = b64str

                buffer = io.BytesIO()
                plot_prc(evaluation, class_index = iclass, outfile = buffer, wait = False)
                b64str = buffer_to_b64(buffer)

                summary.prc_curve        = b64str

                if classifier.graph:
                    summary.graph = classifier.graph

                # Per-instance predictions are computed here but not stored;
                # only the aggregate evaluation above ends up in the summary.
                for i, instance in enumerate(test):
                    prediction = classifier.classify_instance(instance)

                for i, stage in enumerate(self.stages):
                    if stage.code == 'lrn' and stage.value.name == model.NAME:
                        self.stages[i].status = Pipeline.COMPLETE

                models.append(summary)

        self.gist.models = models

        JVM.stop()

        JSON.write(os.path.join(head, '{name}.cgist'.format(name = name)), self.gist)

        self.logs.append('Pipeline Complete')

        self.set_status(Pipeline.COMPLETE)
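The models serialized inside the modelling loop above can be reloaded outside the pipeline while a JVM is running. A minimal sketch, assuming they were written with python-weka-wrapper's serialization helpers and using a placeholder file name (the real name depends on the dataset name and model.NAME):

# A sketch only: reload one of the classifiers saved by the pipeline above.
from weka.core import serialization
from weka.classifiers import Classifier

restored = Classifier(jobject=serialization.read("some_dataset.trees.J48.model"))  # placeholder path
print(restored)  # prints the trained model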
predictions = [0]*len(sensors)
stepCount = 0

try:
    while True:
        for i in range(len(sensors)):

            # acquire data
            # df = sensors[i].getFrame()
            df = dataManager.getNextWindow()

            # run the data frame (df) through the feature-extraction step; the
            # output should be a single-row weka Instance. That conversion was
            # never implemented in the original snippet (the iotdata class is
            # said to support streaming data), so a placeholder is used here
            # (a hedged sketch follows this snippet).
            inst = None  # TODO: build a weka Instance from df

            # classify
            pred = classifier.classify_instance(inst)

            # estimate step count
            stepCount += countSteps(df)

            # save prediction
            predictions[i] = pred

        # output predictions
        print(predictions, stepCount)

except KeyboardInterrupt:
    for sensor in sensors:
        sensor.closeDevice()

jvm.stop()
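The placeholder left in the loop above needs a weka Instance whose header matches the training data. A minimal sketch, assuming a plain list of numeric feature values extracted from the window and an Instances object called header loaded from the training ARFF (both names are assumptions, not part of the original snippet), run while the JVM is still up:

# A sketch only: build a dense Instance from raw feature values.
from weka.core.dataset import Instance

feature_values = [0.12, 3.4, 7.8, 0.0]            # placeholder feature vector
# If header defines a class attribute, append a slot for it,
# e.g. Instance.missing_value(), before creating the instance.
inst = Instance.create_instance(feature_values)
inst.dataset = header                             # attach the training header
pred = classifier.classify_instance(inst)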
tempList = list()

jvm.start()

data_dir = r"C:\Users\Softmints\Desktop\Diss\Code\WEKA"

from weka.core.converters import Loader
#Prepare ARFF Loader
loader = Loader(classname="weka.core.converters.ArffLoader")
#Assign and load ARFF data file
data = loader.load_file(data_dir + r"\TestDataEleventoTwentyTwo.arff")
data.class_is_last()

from weka.classifiers import Classifier
#Classify data using J48 classifier
cls = Classifier(classname="weka.classifiers.trees.J48", options=["-C", "0.3"])
cls.build_classifier(data)

for index, inst in enumerate(data):
    # Output prediction and distribution
    pred = cls.classify_instance(inst)
    dist = cls.distribution_for_instance(inst)
    print(str(index) + ": label index=" + str(pred) + ", class distribution=" + str(dist))

    if int(pred) == 0:
        tempList.append(str(index))

print(tempList)

jvm.stop()
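The J48 snippet above predicts on the same data it was trained on, so the printed labels say little about generalisation. A minimal sketch of a 10-fold cross-validation with the same data and classifier, using python-weka-wrapper's Evaluation class (it has to run before the jvm.stop() call above):

# A sketch only: cross-validate the J48 configuration built above.
from weka.classifiers import Evaluation
from weka.core.classes import Random

evaluation = Evaluation(data)
evaluation.crossvalidate_model(cls, data, 10, Random(1))
print(evaluation.summary())
print("Accuracy: %.2f%%" % evaluation.percent_correct)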
Exemplo n.º 34
0
    X_test[:, -1] = classes[0]       # mask the true test labels with a dummy class value
    y_test = Y[test_index]
    write_to_weka('train.arff', 'training_data', data.columns, X_train, classes)
    write_to_weka('test.arff', 'testing_data', data.columns, X_test, classes)

    loader = Loader(classname="weka.core.converters.ArffLoader")
    trdata = loader.load_file("train.arff")
    trdata.class_is_last()

    classifier = Classifier(classname="weka.classifiers.lazy.IBk")
    classifier.options = ["-K", "10", "-W", "0", "-I", "-A",
                          "weka.core.neighboursearch.LinearNNSearch -A \"weka.core.ManhattanDistance -R first-last\""]
    classifier.build_classifier(trdata)

    tedata = loader.load_file("test.arff")
    tedata.class_is_last()

    for index, inst in enumerate(tedata):
        result = classifier.classify_instance(inst)
        Ypred[test_index[index]] = classes[int(result)]

    accuracy = float(np.sum(y_test == Ypred[test_index])) / float(y_test.shape[0])
    print " => Accuracy = ", accuracy
    itr += 1
accuracy = float(np.sum(Y == Ypred)) / float(Y.shape[0])
print "Total accuracy = ", accuracy

os.remove('train.arff')
os.remove('test.arff')
jvm.stop()
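Besides the hard prediction used above, IBk can expose how its 10 neighbours vote, which helps when inspecting borderline instances. A minimal sketch, assuming the classifier, tedata and classes objects from the loop above and that the JVM is still running:

# A sketch only: inspect the neighbour vote distribution per test instance.
for index, inst in enumerate(tedata):
    pred = classifier.classify_instance(inst)
    dist = classifier.distribution_for_instance(inst)
    print(index, classes[int(pred)], [round(float(p), 3) for p in dist])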
Exemplo n.º 35
0
class WekaEstimator(BaseEstimator, OptionHandler, RegressorMixin, ClassifierMixin):
    """
    Wraps a Weka classifier (classifier/regressor) within the scikit-learn framework.
    """

    def __init__(self, jobject=None, classifier=None, classname=None, options=None,
                 nominal_input_vars=None, nominal_output_var=None,
                 num_nominal_input_labels=None, num_nominal_output_labels=None):
        """
        Initializes the estimator. Can be either instantiated via the following priority of parameters:
        1. JB_Object representing a Java Classifier object
        2. Classifier pww3 wrapper
        3. classname/options

        :param jobject: the JB_Object representing a Weka classifier to use
        :type jobject: JB_Object
        :param classifier: the classifier wrapper to use
        :type classifier: Classifier
        :param classname: the classname of the Weka classifier to instantiate
        :type classname: str
        :param options: the command-line options of the Weka classifier to instantiate
        :type options: list
        :param nominal_input_vars: the list of 0-based indices of attributes to convert to nominal or range string with 1-based indices
        :type nominal_input_vars: list or str
        :param nominal_output_var: whether to convert the output variable to a nominal one
        :type nominal_output_var: bool
        :param num_nominal_input_labels: the dictionary with the number of labels for the nominal input variables (key is 0-based attribute index)
        :type num_nominal_input_labels: dict
        :param num_nominal_output_labels: the number of labels for the output variable
        :type num_nominal_output_labels: int
        """
        if jobject is not None:
            _jobject = jobject
        elif classifier is not None:
            _jobject = classifier.jobject
        elif classname is not None:
            if options is None:
                options = []
            classifier = Classifier(classname=classname, options=options)
            _jobject = classifier.jobject
        else:
            raise Exception("At least Java classname must be provided!")

        if not is_instance_of(_jobject, "weka.classifiers.Classifier"):
            raise Exception("Java object does not implement weka.classifiers.Classifier!")

        super(WekaEstimator, self).__init__(_jobject)
        self._classifier = Classifier(jobject=_jobject)
        self.header_ = None
        self.classes_ = None
        # the following references are required for get_params/set_params
        self._classname = classname
        self._options = options
        self._nominal_input_vars = nominal_input_vars
        self._nominal_output_var = nominal_output_var
        self._num_nominal_input_labels = num_nominal_input_labels
        self._num_nominal_output_labels = num_nominal_output_labels

    @property
    def classifier(self):
        """
        Returns the underlying classifier object, if any.

        :return: the classifier object
        :rtype: Classifier
        """
        return self._classifier

    @property
    def header(self):
        """
        Returns the underlying dataset header, if any.

        :return: the dataset structure
        :rtype: Instances
        """
        return self.header_

    def fit(self, data, targets):
        """
        Trains the estimator.

        :param data: the input variables as matrix, array-like of shape (n_samples, n_features)
        :type data: ndarray
        :param targets: the class attribute column, array-like of shape (n_samples,)
        :type targets: ndarray
        :return: itself
        :rtype: WekaEstimator
        """
        data, targets = check_X_y(data, y=targets, dtype=None)
        if self._nominal_input_vars is not None:
            data = to_nominal_attributes(data, self._nominal_input_vars)
        if self._nominal_output_var is not None:
            targets = to_nominal_labels(targets)
        d = to_instances(data, targets,
                         num_nominal_labels=self._num_nominal_input_labels,
                         num_class_labels=self._num_nominal_output_labels)
        self._classifier.build_classifier(d)
        self.header_ = d.template_instances(d, 0)
        if d.class_attribute.is_nominal:
            self.classes_ = d.class_attribute.values
        else:
            self.classes_ = None
        return self

    def predict(self, data):
        """
        Performs predictions with the trained classifier.

        :param data: the data matrix to generate predictions for, array-like of shape (n_samples, n_features)
        :type data: ndarray
        :return: the score (or scores)
        :rtype: ndarray
        """
        check_is_fitted(self)
        if self._nominal_input_vars is not None:
            data = to_nominal_attributes(data, self._nominal_input_vars)
        data = check_array(data, dtype=None)
        result = []
        for d in data:
            inst = to_instance(self.header_, d, missing_value())
            if self.header_.class_attribute.is_nominal:
                result.append(self.header_.class_attribute.value(int(self._classifier.classify_instance(inst))))
            else:
                result.append(self._classifier.classify_instance(inst))
        return np.array(result)

    def predict_proba(self, data):
        """
        Performs predictions and returns class probabilities.

        :param data: the data matrix to generate predictions for, array-like of shape (n_samples, n_features)
        :type data: ndarray
        :return: the probabilities
        """
        check_is_fitted(self)
        if self._nominal_input_vars is not None:
            data = to_nominal_attributes(data, self._nominal_input_vars)
        data = check_array(data, dtype=None)
        result = []
        for d in data:
            inst = to_instance(self.header_, d, missing_value())
            result.append(self._classifier.distribution_for_instance(inst))
        return np.array(result)

    def get_params(self, deep=True):
        """
        Returns the parameters for this classifier, basically classname and options list.

        :param deep: ignored
        :type deep: bool
        :return: the dictionary with options
        :rtype: dict
        """
        result = dict()
        result["classname"] = self._classname
        result["options"] = self._options
        if self._nominal_input_vars is not None:
            result["nominal_input_vars"] = self._nominal_input_vars
        if self._nominal_output_var is not None:
            result["nominal_output_var"] = self._nominal_output_var
        if self._num_nominal_input_labels is not None:
            result["num_nominal_input_labels"] = self._num_nominal_input_labels
        if self._num_nominal_output_labels is not None:
            result["num_nominal_output_labels"] = self._num_nominal_output_labels
        return result

    def set_params(self, **params):
        """
        Sets the options for the classifier, expects 'classname' and 'options'.

        :param params: the parameter dictionary
        :type params: dict
        """
        if len(params) == 0:
            return
        if "classname" not in params:
            raise Exception("Cannot find 'classname' in parameters!")
        if "options" not in params:
            raise Exception("Cannot find 'options' in parameters!")
        self._classname = params["classname"]
        self._options = params["options"]
        self._classifier = Classifier(classname=self._classname, options=self._options)
        self._nominal_input_vars = None
        if "nominal_input_vars" in params:
            self._nominal_input_vars = params["nominal_input_vars"]
        self._nominal_output_var = None
        if "nominal_output_var" in params:
            self._nominal_output_var = params["nominal_output_var"]
        self._num_nominal_input_labels = None
        if "num_nominal_input_labels" in params:
            self._num_nominal_input_labels = params["num_nominal_input_labels"]
        self._num_nominal_output_labels = None
        if "num_nominal_output_labels" in params:
            self._num_nominal_output_labels = params["num_nominal_output_labels"]

    def __str__(self):
        """
        For printing the model.

        :return: the model representation, if any
        :rtype: str
        """
        if self._classifier is None:
            return self._classname + ": No model built yet"
        else:
            return str(self._classifier)

    def __copy__(self):
        """
        Creates a deep copy of itself.

        :return: the copy
        :rtype: WekaEstimator
        """
        result = WekaEstimator(jobject=deepcopy(self.jobject))
        result._classname = self._classname
        result._options = self._options[:]
        result._nominal_input_vars = None if (self._nominal_input_vars is None) else self._nominal_input_vars[:]
        result._nominal_output_var = self._nominal_output_var
        return result

    def __repr__(self, N_CHAR_MAX=700):
        """
        Returns a valid Python string using its classname and options.

        :param N_CHAR_MAX: ignored
        :type N_CHAR_MAX: int
        :return: the representation
        :rtype: str
        """
        if isinstance(self._nominal_input_vars, str):
            return "WekaEstimator(classname='%s', options=%s, nominal_input_vars='%s', nominal_output_var=%s)" % (self._classifier.classname, str(self._classifier.options), str(self._nominal_input_vars), str(self._nominal_output_var))
        else:
            return "WekaEstimator(classname='%s', options=%s, nominal_input_vars=%s, nominal_output_var=%s)" % (self._classifier.classname, str(self._classifier.options), str(self._nominal_input_vars), str(self._nominal_output_var))
Exemplo n.º 36
0
def test(objs, paras, testfile1, pred, real):
  testfile = preprocess(testfile1, True)
  xref = {'x_nT':1,'x_nT_delta':0,'x_nK':1,'x_nK_delta':0,'x_long':1,'x_str':0,'x_strsum':0}
  add_features(xref, 'x')
  zeroref = []
  for k in ['long', 'nK', 'nK_delta', 'nT', 'nT_delta', 'str', 'strsum']:
    zeroref.append(xref['x_%s' % k])
  zeroref.append(0) # should be obj
  for k in addf():
    zeroref.append(xref['x_%s' % k])

  with open(testfile) as fin:
    reader = csv.DictReader(fin)
    linecount = 0
    for line in reader:
      ops = []
      for h in line:
        if h.startswith('op'): ops.append(h[:h.find('_')])
      for op in ops: add_features(line, op)
      stats = {}
      valid = True
      real_line = {}
      for h in line:
        if h.startswith('op'):
          k = h[:h.find('_')]
          v = h[h.find('_')+1:]
          if k not in stats: stats[k] = {}
          stats[k][v] = pfloat(line[h])
          if stats[k][v] is None:
            valid = False
        elif h in objs:
          real_line[h] = pfloat(line[h])
          if real_line[h] is None:
            valid = False
      if not valid: continue
      linecount += 1
      if linecount > 250: continue
      #for k in stats:
      #  assert len(paras) == len(stats[k])
      #  for v in stats[k]:
      #    assert v in paras
      for obj in objs:
        c = Classifier(jobject=serialization.read(model_file('hash', obj)))
        zerovalue = c.classify_instance(Instance.create_instance(zeroref))
        #s = 0
        s = zerovalue
        for op in stats:
          values = []
          for k in ['long', 'nK', 'nK_delta', 'nT', 'nT_delta', 'str', 'strsum']:
            values.append(stats[op][k])
          values.append(0) # should be obj
          for k in addf():
            values.append(stats[op][k])
          ins = Instance.create_instance(values)
          prediction = c.classify_instance(ins)
          #print '   ', obj, op, values, prediction, prediction - zerovalue
          #s += pred
          s = s + max(prediction - zerovalue, 0)
        #print obj, 'real', real_line[obj], 'pred', s
        pred[obj].append(s)
        real[obj].append(real_line[obj])
  print('test', testfile, 'linecount', linecount)
  subprocess.call('rm %s' % testfile, shell=True)
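The per-object estimate above starts from the classifier's zero-input baseline and adds only the positive increment each operation contributes. A small self-contained sketch of that arithmetic with made-up numbers:

# A sketch only: delta-above-baseline aggregation as used in test() above.
def aggregate(zerovalue, op_predictions):
    total = zerovalue
    for p in op_predictions:
        total += max(p - zerovalue, 0)   # operations predicted below baseline add nothing
    return total

print(aggregate(2.0, [2.5, 1.7, 3.1]))   # 2.0 + 0.5 + 0.0 + 1.1 = 3.6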