def perform_KMeans(data, classes, k):

    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                          options=["-N", str(k)])
    clusterer.build_clusterer(data)
    purity = cluster_purity(clusterer, data, classes)
    return purity
def perform_HC(data, classes, k, link):

    clusterer = Clusterer(classname="weka.clusterers.HierarchicalClusterer",
                          options=["-N", str(k), "-L", link])
    clusterer.build_clusterer(data)
    purity = cluster_purity(clusterer, data, classes)
    return purity
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    full = loader.load_file(iris_file)
    full.class_is_last()

    # remove class attribute
    data = Instances.copy_instances(full)
    data.no_class()
    data.delete_last_attribute()

    # build a clusterer and output model
    helper.print_title("Training SimpleKMeans clusterer")
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    clusterer.build_clusterer(data)
    print("done")

    # classes to clusters
    evl = ClusterEvaluation()
    evl.set_model(clusterer)
    evl.test_model(full)
    helper.print_title("Cluster results")
    print(evl.cluster_results)
    helper.print_title("Classes to clusters")
    print(evl.classes_to_clusters)
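These main() examples assume the usual python-weka-wrapper3 imports and a running JVM. A minimal harness, sketched here under that assumption (the helper module is presumably the one shipped with the python-weka-wrapper examples project), would be:

# harness sketch -- assumes python-weka-wrapper3 is installed
import os
import traceback
import weka.core.jvm as jvm
import wekaexamples.helper as helper          # assumption: the examples' helper module
from weka.core.converters import Loader
from weka.core.dataset import Instances
from weka.clusterers import Clusterer, ClusterEvaluation

if __name__ == "__main__":
    try:
        jvm.start()    # the Weka wrappers need a running JVM
        main()
    except Exception:
        print(traceback.format_exc())
    finally:
        jvm.stop()     # always shut the JVM down again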
Example #4
 def simpleKMeansTrain(self, dataf, options, mname, temp=True):
     '''
     :param dataf: -> data to be clustered
     :param options: -> SimpleKMeans options
                   N -> number of clusters
                   A -> Distance function to use (ex: default is "weka.core.EuclideanDistance -R first-last")
                   l -> maximum number of iterations default 500
           num-slots -> number of execution slots, 1 means no parallelism
                   S -> Random number seed (default 10)
           example => ["-N", "10", "-S", "10"]
     :return:
     '''
     try:
         jvm.start(max_heap_size=self.wHeap)
         data = self.loadData(dataf, temp=temp)
         clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=options)
         clusterer.build_clusterer(data)
         print(clusterer)
         # cluster the data
         for inst in data:
             cl = clusterer.cluster_instance(inst)  # 0-based cluster index
             dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
             print("cluster=" + str(cl) + ", distribution=" + str(dist))
         self.saveModel(clusterer, 'skm', mname)
     except Exception as e:
         print(traceback.format_exc())
Example #5
 def emTrain(self, dataf, options, mname, temp=True):
     '''
     :param dataf: -> data to be clustered
     :param options: -> EM options
                   I -> number of iterations
                   N -> number of clusters
                   M -> Minimum standard deviation for normal density (default=1.0E-6)
           num-slots -> number of execution slots, 1 means no parallelism
                   S -> random seed (default=100)
             example => ["-I", "1000", "-N", "6", "-X", "10", "-max", "-1", "-ll-cv", "1.0E-6",
                                    "-ll-iter", "1.0E-6", "-M", "1.0E-6", "-num-slots", "1", "-S", "100"]
     :return:
     '''
     try:
         jvm.start(max_heap_size=self.wHeap)
         data = self.loadData(dataf, temp)
         clusterEM = Clusterer(classname="weka.clusterers.EM",
                           options=options)
         clusterEM.build_clusterer(data)
         print(clusterEM)
         self.saveModel(clusterEM, 'em', mname, )
     except Exception as e:
         print(traceback.format_exc())
     finally:
         jvm.stop()
Example #6
 def simpleKMeansTrain(self, dataf, options, mname, temp=True):
     '''
     :param dataf: -> data to be clustered
     :param options: -> SimpleKMeans options
                   N -> number of clusters
                   A -> Distance function to use (ex: default is "weka.core.EuclideanDistance -R first-last")
                   l -> maximum number of iterations default 500
           num-slots -> number of execution slots, 1 means no parallelism
                   S -> Random number seed (default 10)
           example => ["-N", "10", "-S", "10"]
     :return:
     '''
     try:
         jvm.start(max_heap_size=self.wHeap)
         data = self.loadData(dataf, temp=temp)
         clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=options)
         clusterer.build_clusterer(data)
         print(clusterer)
         # cluster the data
         for inst in data:
             cl = clusterer.cluster_instance(inst)  # 0-based cluster index
             dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
             print(("cluster=" + str(cl) + ", distribution=" + str(dist)))
         self.saveModel(clusterer, 'skm', mname)
     except Exception as e:
         print(traceback.format_exc())
     finally:
         jvm.stop()
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris_file)

    # remove class attribute
    data.delete_last_attribute()

    # build a clusterer and output model
    helper.print_title("Training SimpleKMeans clusterer")
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    clusterer.build_clusterer(data)
    print(clusterer)

    # cluster data
    helper.print_info("Clustering data")
    for index, inst in enumerate(data):
        cl = clusterer.cluster_instance(inst)
        dist = clusterer.distribution_for_instance(inst)
        print(str(index+1) + ": cluster=" + str(cl) + ", distribution=" + str(dist))
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    full = loader.load_file(iris_file)
    full.class_is_last()

    # remove class attribute
    data = Instances.copy_instances(full)
    data.no_class()
    data.delete_last_attribute()

    # build a clusterer and output model
    helper.print_title("Training SimpleKMeans clusterer")
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                          options=["-N", "3"])
    clusterer.build_clusterer(data)
    print("done")

    # classes to clusters
    evl = ClusterEvaluation()
    evl.set_model(clusterer)
    evl.test_model(full)
    helper.print_title("Cluster results")
    print(evl.cluster_results)
    helper.print_title("Classes to clusters")
    print(evl.classes_to_clusters)
def perform_DBScan(data, classes, e, min_points):

    clusterer = Clusterer(classname="weka.clusterers.DBSCAN",
                          options=["-E", str(e), "-M",
                                   str(min_points)])
    clusterer.build_clusterer(data)
    purity = cluster_purity(clusterer, data, classes)
    return purity
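The perform_KMeans, perform_HC and perform_DBScan helpers above all delegate to a cluster_purity function that is not part of this listing. A hypothetical sketch (an assumption, not the original author's code): purity as the fraction of instances that fall into their cluster's majority class.

from collections import Counter, defaultdict

def cluster_purity(clusterer, data, classes):
    # hypothetical helper -- 'classes' is assumed to be a sequence of class labels
    # aligned with the rows of 'data' (whose class attribute was removed)
    counts = defaultdict(Counter)
    for inst, label in zip(data, classes):
        counts[clusterer.cluster_instance(inst)][label] += 1
    majority = sum(c.most_common(1)[0][1] for c in counts.values())
    return majority / data.num_instances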
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris_file)

    # remove class attribute
    data.delete_last_attribute()

    # build a clusterer and output model
    helper.print_title("Training SimpleKMeans clusterer")
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    clusterer.build_clusterer(data)
    print(clusterer)
    helper.print_info("Evaluating on data")
    evaluation = ClusterEvaluation()
    evaluation.set_model(clusterer)
    evaluation.test_model(data)
    print("# clusters: " + str(evaluation.num_clusters))
    print("log likelihood: " + str(evaluation.log_likelihood))
    print("cluster assignments:\n" + str(evaluation.cluster_assignments))
    plc.plot_cluster_assignments(evaluation, data, inst_no=True)

    # using a filtered clusterer
    helper.print_title("Filtered clusterer")
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris_file)
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    fclusterer = FilteredClusterer()
    fclusterer.clusterer = clusterer
    fclusterer.filter = remove
    fclusterer.build_clusterer(data)
    print(fclusterer)

    # load a dataset incrementally and build clusterer incrementally
    helper.print_title("Incremental clusterer")
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    clusterer = Clusterer("weka.clusterers.Cobweb")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    remove.inputformat(iris_inc)
    iris_filtered = remove.outputformat()
    clusterer.build_clusterer(iris_filtered)
    for inst in loader:
        remove.input(inst)
        inst_filtered = remove.output()
        clusterer.update_clusterer(inst_filtered)
    clusterer.update_finished()
    print(clusterer.to_commandline())
    print(clusterer)
    print(clusterer.graph)
    plg.plot_dot_graph(clusterer.graph)
Example #11
def main():
    """
    Just runs some example code.
    """

    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris_file)

    # remove class attribute
    data.delete_last_attribute()

    # build a clusterer and output model
    helper.print_title("Training SimpleKMeans clusterer")
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    clusterer.build_clusterer(data)
    print(clusterer)
    helper.print_info("Evaluating on data")
    evaluation = ClusterEvaluation()
    evaluation.set_model(clusterer)
    evaluation.test_model(data)
    print("# clusters: " + str(evaluation.num_clusters))
    print("log likelihood: " + str(evaluation.log_likelihood))
    print("cluster assignments:\n" + str(evaluation.cluster_assignments))
    plc.plot_cluster_assignments(evaluation, data, inst_no=True)

    # using a filtered clusterer
    helper.print_title("Filtered clusterer")
    loader = Loader("weka.core.converters.ArffLoader")
    data = loader.load_file(iris_file)
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    fclusterer = FilteredClusterer()
    fclusterer.clusterer = clusterer
    fclusterer.filter = remove
    fclusterer.build_clusterer(data)
    print(fclusterer)

    # load a dataset incrementally and build clusterer incrementally
    helper.print_title("Incremental clusterer")
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(iris_file, incremental=True)
    clusterer = Clusterer("weka.clusterers.Cobweb")
    remove = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    remove.inputformat(iris_inc)
    iris_filtered = remove.outputformat()
    clusterer.build_clusterer(iris_filtered)
    for inst in loader:
        remove.input(inst)
        inst_filtered = remove.output()
        clusterer.update_clusterer(inst_filtered)
    clusterer.update_finished()
    print(clusterer.to_commandline())
    print(clusterer)
    print(clusterer.graph)
    plg.plot_dot_graph(clusterer.graph)
Example #12
    def run_cluster_simplek(self,
                            output_directory,
                            exc_class=False,
                            num_clusters=7):
        data = Instances.copy_instances(self.training_data)
        data.no_class()
        data.delete_first_attribute()

        # build a clusterer and output model
        print("\nBuilding Clusterer on training data.")
        buildTimeStart = time.time()
        clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                              options=["-N", "" + str(num_clusters)])
        clusterer.build_clusterer(data)

        resultsString = ""
        resultsString = self.print_both(str(clusterer), resultsString)

        buildTimeString = "Clusterer Built in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Evaluate Clusterer
        resultsString = self.print_both("\nClustering data.", resultsString)

        buildTimeStart = time.time()

        clsexc = ""
        if (exc_class):
            # no class attribute
            clsexc = "_NO_Class"
            evl = ClusterEvaluation()
            evl.set_model(clusterer)
            evl.test_model(data)
        else:
            # classes to clusters
            evl = ClusterEvaluation()
            evl.set_model(clusterer)
            evl.test_model(self.training_data)

        resultsString = self.print_both("\nCluster results\n", resultsString)
        resultsString = self.print_both(str(evl.cluster_results),
                                        resultsString)

        resultsString = self.print_both("\nClasses to clusters\n",
                                        resultsString)
        resultsString = self.print_both(str(evl.classes_to_clusters),
                                        resultsString)

        buildTimeString = "\nClustered data in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Save Results and Cleanup
        self.save_results("SimpleKM" + clsexc + "_", resultsString,
                          output_directory)
Example #13
    def run_clustering_task7_manual(self,
                                    output_directory,
                                    clusterer_name,
                                    num_clusters,
                                    seed=10):
        data = Instances.copy_instances(self.training_data)
        data.no_class()
        data.delete_first_attribute()

        clusterer_name_short = clusterer_name.replace("weka.clusterers.", "")
        # build a clusterer and output model
        print("\nBuilding " + clusterer_name_short +
              " Clusterer on training data.")
        buildTimeStart = time.time()
        clusterer = Clusterer(
            classname=clusterer_name,
            options=["-N", "" + str(num_clusters), "-S", "" + str(seed)])
        clusterer.build_clusterer(data)

        resultsString = ""
        resultsString = self.print_both(str(clusterer), resultsString)

        buildTimeString = "Clusterer Built in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Evaluate Clusterer
        resultsString = self.print_both("\nClustering data.", resultsString)

        buildTimeStart = time.time()

        evl = ClusterEvaluation()
        evl.set_model(clusterer)
        evl.test_model(self.training_data)

        resultsString = self.print_both("\nCluster results\n", resultsString)
        resultsString = self.print_both(str(evl.cluster_results),
                                        resultsString)

        resultsString = self.print_both("\nClasses to clusters\n",
                                        resultsString)
        resultsString = self.print_both(str(evl.classes_to_clusters),
                                        resultsString)

        buildTimeString = "\nClustered data in " + str(
            time.time() - buildTimeStart) + " secs.\n"
        resultsString = self.print_both(buildTimeString, resultsString)

        #Save Results and Cleanup
        self.save_results(
            clusterer_name_short + "_" + "N" + str(num_clusters) + "_S" +
            str(seed), resultsString, output_directory)
Example #14
    def run_SKMeans_137(self):

        # construct output paths
        output_prefix = os.path.split(self.input_path)[-1].split(".")[0]
        print(output_prefix)
        write_date = output_prefix + "." + str(datetime.now().date())
        SKMeans_dir = os.path.join(self.output_dir, "SKMeans")
        eval_path = os.path.join(SKMeans_dir, write_date + ".cl_eval.txt")
        clust_desc_path = os.path.join(SKMeans_dir, write_date + ".cl_descr.txt")
        clust_assign_path = os.path.join(SKMeans_dir, write_date + ".cl_assign.txt")

        # create output dir if it doesn't already exist
        if not os.path.exists(SKMeans_dir):
            os.makedirs(SKMeans_dir)

        # clone data and build clusters
        # data_clone = copy.deepcopy(self.data_loaded)
        data_clone = self.data_loaded
        clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "137"])
        clusterer.build_clusterer(data_clone)

        # cluster evaluation
        evaluation = ClusterEvaluation()
        evaluation.set_model(clusterer)
        evaluation.test_model(data_clone)
        with open(eval_path, 'w') as outfile:
            outfile.write("number of clusters: \t" + str(evaluation.num_clusters) + "\n")
            outfile.write("log likelihood: \t" + str(evaluation.log_likelihood) + "\n")
            outfile.write("cluster assignments: \t" + str(evaluation.cluster_assignments) + "\n")
            outfile.write("***********************\n")
            outfile.write("\t".join(["SKmeans Cluster Evaluation Results\n"]))  # header
            outfile.write(str(evaluation.cluster_results) + "\n")

        # description of the clusters (one Instance object per row)
        with open(clust_desc_path, 'w') as outfile:
            outfile.write(",".join(["cluster_num", "distribution\n"]))  # header
            for inst in data_clone:
                cl = clusterer.cluster_instance(inst)  # 0-based cluster index
                dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
                outfile.write(",".join([str(cl), str(dist)]))
                outfile.write("\n")

        # cluster assignment by row
        with open(clust_assign_path, 'w') as outfile:
            outfile.write(",".join(["row_num", "SKMeans\n"]))  # header
            for i, inst in enumerate(evaluation.cluster_assignments):
                outfile.write(",".join([str(i), str(inst)]))
                outfile.write("\n")

        return
        
Example #15
def create_cluster_model(arff_file, n=10, loader_type="csv", model="kmeans.model"):
    """ create cluster model """
    check_jvm()
    if loader_type == "csv":
        loader = converters.Loader(classname="weka.core.converters.CSVLoader")
    else :
        loader = conventers.Loader(classname="weka.core.converters.ArffLoader")

    data = loader.load_file(arff_file)
    clusterer = Clusterer(
        classname="weka.clusterers.SimpleKMeans", options=["-N", str(n)])
    clusterer.build_clusterer(data)
    serialization.write(model, clusterer)
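To reuse the serialized model later, it can be read back and rewrapped; a sketch assuming the same serialization module (this mirrors the read_all pattern commented out in Example #30 below):

def load_cluster_model(model="kmeans.model"):
    """ load a previously written cluster model """
    objects = serialization.read_all(model)   # returns the serialized Java objects
    return Clusterer(jobject=objects[0])      # rewrap the first one as a Clusterer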
Example #16
    def train_data(self):
        try:
            #helper.print_info("Loading dataset: " + self.datasetName)
            loader = Loader(classname="weka.core.converters.ArffLoader")
            data_train = loader.load_file(self.datasetName)
            data_train.delete_last_attribute()
            clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                                  options=["-N", "2"])
            clusterer.build_clusterer(data_train)
            return clusterer

        except Exception as e:
            print(traceback.format_exc())
            raise e
Example #17
def command():
    jvm.start()

    import weka.core.converters as converters
    clusters = request.form['clusternum']
    a1 = request.form['firstcol']
    a2 = request.form['secondcol']
    # print clusters
    # print a1
    # print a2
    if (a1 == 'B' and a2 == 'C'):
        data = converters.load_any_file("Data.csv")
    elif (a1 == 'B' and a2 == 'D'):
        data = converters.load_any_file("Data1.csv")
    elif (a1 == 'C' and a2 == 'D'):
        data = converters.load_any_file("Data2.csv")
    elif (a1 == 'C' and a2 == 'E'):
        data = converters.load_any_file("Data3.csv")
    elif (a1 == 'D' and a2 == 'E'):
        data = converters.load_any_file("Data4.csv")

    #data.class_is_last()

    print(data)

    # from weka.attribute_selection import ASSearch, ASEvaluation, AttributeSelection
    # search = ASSearch(classname="weka.attributeSelection.BestFirst", options=["-D", "1", "-N", "5"])
    # evaluator = ASEvaluation(classname="weka.attributeSelection.CfsSubsetEval", options=["-P", "2", "-E", "1"])
    # attsel = AttributeSelection()
    # attsel.search(search)
    # attsel.evaluator(evaluator)
    # attsel.select_attributes(data)
    f = open("filename.txt", "w")
    from weka.clusterers import Clusterer
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                          options=["-N", "{}".format(clusters)])
    clusterer.build_clusterer(data)

    print(clusterer)
    f.write(str(clusterer))
    # cluster the data
    for inst in data:
        cl = clusterer.cluster_instance(inst)  # 0-based cluster index
        dist = clusterer.distribution_for_instance(
            inst)  # cluster membership distribution
        print("cluster=" + str(cl) + ", distribution=" + str(dist))
        f.write("cluster=" + str(cl) + ", distribution=" + str(dist))

    f.close()
    return render_template("output.html")
Example #18
def run_clusterer(file):
    # Get filename from Pathlib object
    filename = file.parts[-1]
    dir = file.parents[0]

    print("Running Clusterer on %s" % filename)

    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return

    # Removes '.arff' from filename
    filename_base = filename[:-5]

    # Load data with class as first attr
    full = load_Arff_file(file)
    full.class_is_first()

    full_withoutclass = load_Arff_file(file)
    #data.delete_first_attribute()

    data = Instances.copy_instances(full)
    data.no_class()
    data.delete_first_attribute()

    dir = dir / "cluster_results_optimum"
    dir.mkdir(parents=True, exist_ok=True)
    # Init clusterer

    #"-N", "-1",
    n = "2"

    if (filename_base.startswith("fer2018_")):
        print("Changing number of clusters to 7")
        n = "7"

#clusterer = Clusterer(classname="weka.clusterers.EM", options=[ "-S", "10", "-N", n])
#clusterer = Clusterer(classname="weka.clusterers.FarthestFirst", options=[ "-S", "10", "-N", n])
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                          options=["-S", "10", "-N", n])
    clusterer.build_clusterer(data)

    evaluation = ClusterEvaluation()
    evaluation.set_model(clusterer)
    evaluation.test_model(full)

    str1 = str(filename_base) + "_cl_res.txt"

    output_results = dir / str1
    output_cluster(evaluation, output_results)
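run_clusterer calls two helpers, load_Arff_file and output_cluster, that are not shown in this listing; plausible sketches, inferred from how they are called (hypothetical, not the original implementations):

from weka.core.converters import Loader

def load_Arff_file(path):
    # hypothetical: load an ARFF file via the standard ArffLoader
    loader = Loader(classname="weka.core.converters.ArffLoader")
    return loader.load_file(str(path))

def output_cluster(evaluation, output_path):
    # hypothetical: dump the cluster evaluation summary to a text file
    with open(str(output_path), "w") as f:
        f.write(str(evaluation.cluster_results))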
Example #19
    def dbscanTrain(self, dataf, options, mname, temp=True):
        '''
        :param dataf: -> data to be clustered
        :param options: -> dbscan options
                      E -> epsilon (default = 0.9)
                      M -> minPoints (default = 6)
                      D -> default weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject
                      I -> index (database) used for DBSCAN (default = weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase)
                example => ["-E",  "0.9",  "-M", "6", "-I", "weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase", "-D", "weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject"]
        :return:
        '''

        try:
            jvm.start(max_heap_size=self.wHeap)
            data = self.loadData(dataf, temp)
            clusterDBSCAN = Clusterer(classname="weka.clusterers.DBSCAN", options=options)
            clusterDBSCAN.build_clusterer(data)
            print(clusterDBSCAN)
            self.saveModel(clusterDBSCAN, 'dbscan', mname)
            # cluster the data
        except Exception as e:
            print(traceback.format_exc())
Example #20
 def emTrain(self, dataf, options, mname, temp=True):
     '''
     :param dataf: -> data to be clustered
     :param options: -> EM options
                   I -> number of iterations
                   N -> number of clusters
                   M -> Minimum standard deviation for normal density (default=1.0E-6)
           num-slots -> number of execution slots, 1 means no parallelism
                   S -> random seed (default=100)
             example => ["-I", "1000", "-N", "6", "-X", "10", "-max", "-1", "-ll-cv", "1.0E-6",
                                    "-ll-iter", "1.0E-6", "-M", "1.0E-6", "-num-slots", "1", "-S", "100"]
     :return:
     '''
     try:
         jvm.start(max_heap_size=self.wHeap)
         data = self.loadData(dataf, temp)
         clusterEM = Clusterer(classname="weka.clusterers.EM",
                           options=options)
         clusterEM.build_clusterer(data)
         print(clusterEM)
         self.saveModel(clusterEM, 'em', mname, )
     except Exception as e:
         print(traceback.format_exc())
Example #21
    def dbscanTrain(self, dataf, options, mname, temp=True):
        '''
        :param dataf: -> data to be clustered
        :param options: -> dbscan options
                      E -> epsilon (default = 0.9)
                      M -> minPoints (default = 6)
                      D -> default weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject
                      I -> index (database) used for DBSCAN (default = weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase)
                example => ["-E",  "0.9",  "-M", "6", "-I", "weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase", "-D", "weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject"]
        :return:
        '''

        try:
            jvm.start(max_heap_size=self.wHeap)
            data = self.loadData(dataf, temp)
            clusterDBSCAN = Clusterer(classname="weka.clusterers.DBSCAN", options=options)
            clusterDBSCAN.build_clusterer(data)
            print(clusterDBSCAN)
            self.saveModel(clusterDBSCAN, 'dbscan', mname)
            # cluster the data
        except Exception as e:
            print(traceback.format_exc())
        finally:
            jvm.stop()
Example #22
# load iris
fname = data_dir + os.sep + "iris.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# remove class attribute
flt = Filter(classname="weka.filters.unsupervised.attribute.Remove",
             options=["-R", "last"])
flt.inputformat(data)
filtered = flt.filter(data)

# build KMeans
print("\n--> SimpleKMeans\n")
cl = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
cl.build_clusterer(filtered)
evl = ClusterEvaluation()
evl.set_model(cl)
evl.test_model(filtered)
print(evl.cluster_results)
plc.plot_cluster_assignments(evl, data, atts=[], inst_no=True, wait=True)

# use AddCluster filter
print("\n--> AddCluster filter\n")
flt = Filter(classname="weka.filters.unsupervised.attribute.AddCluster",
             options=["-W", "weka.clusterers.SimpleKMeans -N 3"])
flt.inputformat(filtered)
addcl = flt.filter(filtered)
print(addcl)

# classes-to-clusters evaluation
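The snippet is truncated at this comment; following the pattern of the earlier classes-to-clusters examples in this listing, it would plausibly continue along these lines:

# sketch of the missing step, modeled on the earlier classes-to-clusters examples
data.class_is_last()       # the original data still contains the class attribute
evl = ClusterEvaluation()
evl.set_model(cl)          # cl was built on the class-free 'filtered' data
evl.test_model(data)
print(evl.classes_to_clusters)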
Example #23
# load iris
fname = data_dir + os.sep + "iris.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# remove class attribute
flt = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
flt.set_inputformat(data)
filtered = flt.filter(data)

# build KMeans
print("\n--> SimpleKMeans\n")
cl = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
cl.build_clusterer(filtered)
evl = ClusterEvaluation()
evl.set_model(cl)
evl.test_model(filtered)
print(evl.get_cluster_results())
plc.plot_cluster_assignments(evl, data, atts=[], inst_no=True, wait=True)

# use AddCluster filter
print("\n--> AddCluster filter\n")
flt = Filter(classname="weka.filters.unsupervised.attribute.AddCluster",
             options=["-W", "weka.clusterers.SimpleKMeans -N 3"])
flt.set_inputformat(filtered)
addcl = flt.filter(filtered)
print(addcl)

# classes-to-clusters evaluation
Example #24
dataDir = os.path.join(os.path.dirname(os.path.abspath('')), 'data')
modelDir = os.path.join(os.path.dirname(os.path.abspath('')), 'models')

dformat = DataFormatter(dataDir)

dformat.dict2arff(os.path.join(dataDir, 'System.csv'),
                  os.path.join(dataDir, 'System.arff'))

#Arff_file = os.path.join(dataDir, 'System.arff')

jvm.start(packages=True)

data = converters.load_any_file(os.path.join(dataDir, 'System.arff'))
clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                      options=["-N", "10", "-S", "10"])
clusterer.build_clusterer(data)

# print clusterer
# cluster the data
# for inst in data:
#     cl = clusterer.cluster_instance(inst)  # 0-based cluster index
#     dist = clusterer.distribution_for_instance(inst)   # cluster membership distribution
#     print("cluster=" + str(cl) + ", distribution=" + str(dist))
#     print inst

# serialization.write(os.path.join(modelDir, 'SKM.model'), clusterer)

clusterEM = Clusterer(classname="weka.clusterers.EM",
                      options=[
                          "-I", "1000", "-N", "6", "-X", "10", "-max", "-1",
                          "-ll-cv", "1.0E-6", "-ll-iter", "1.0E-6", "-M",
Example #25
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# build KMeans
seeds = [-1, 11, 12]
for seed in seeds:
    if seed == -1:
        seedStr = "default"
    else:
        seedStr = str(seed)
    print("\n--> SimpleKMeans - seed " + seedStr + "\n")
    cl = Clusterer("weka.clusterers.SimpleKMeans")
    if seed != -1:
        cl.set_options(["-S", str(seed)])
    cl.build_clusterer(data)
    evl = ClusterEvaluation()
    evl.set_model(cl)
    evl.test_model(data)
    print(evl.get_cluster_results())

# build XMeans
print("\n--> XMeans\n")
flt = Filter(classname="weka.filters.unsupervised.attribute.RemoveType", options=["-T", "numeric", "-V"])
flt.set_inputformat(data)
filtered = flt.filter(data)
cl = Clusterer(classname="weka.clusterers.XMeans")
cl.build_clusterer(filtered)
evl = ClusterEvaluation()
evl.set_model(cl)
evl.test_model(filtered)
Example #26
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# build KMeans
seeds = [-1, 11, 12]
for seed in seeds:
    if seed == -1:
        seedStr = "default"
    else:
        seedStr = str(seed)
    print("\n--> SimpleKMeans - seed " + seedStr + "\n")
    cl = Clusterer("weka.clusterers.SimpleKMeans")
    if seed != -1:
        cl.options = ["-S", str(seed)]
    cl.build_clusterer(data)
    evl = ClusterEvaluation()
    evl.set_model(cl)
    evl.test_model(data)
    print(evl.cluster_results)

# build XMeans
print("\n--> XMeans\n")
flt = Filter(classname="weka.filters.unsupervised.attribute.RemoveType",
             options=["-T", "numeric", "-V"])
flt.inputformat(data)
filtered = flt.filter(data)
cl = Clusterer(classname="weka.clusterers.XMeans")
cl.build_clusterer(filtered)
evl = ClusterEvaluation()
evl.set_model(cl)
Example #27
##saver.save_file(data, "data_with_class_type.arff")


### Deletes the not required attributes 
data.delete_attribute(2)
data.delete_attribute(2)
#####Uncomment to save the file with has serviceId as class, forkV and ForkW as attributes
###saver.save_file(data, "data_with_class_serviceID.arff")
data.delete_attribute(2)

#saver.save_file(data,"data.arff")
num_clusters = "6"   #Number of clusters for k mean

##Performing clustering
clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", num_clusters])
clusterer.build_clusterer(data)

for inst in data:
    cl = clusterer.cluster_instance(inst)  # 0-based cluster index
    dist = clusterer.distribution_for_instance(inst)   # cluster membership distribution
    #print("cluster=" + str(cl) + ", distribution=" + str(dist))

#########Getting the data about the clustered instances
evaluation = ClusterEvaluation()
evaluation.set_model(clusterer)
evaluation.test_model(data)
print(evaluation.cluster_results)
#print("# clusters: " + str(evaluation.num_clusters))
#print("log likelihood: " + str(evaluation.log_likelihood))
#print("cluster assignments:\n" + str(evaluation.cluster_assignments))
#plc.plot_cluster_assignments(evaluation, data,[],True)
Example #28
class ClusterAgent (BustersAgent):


    def registerInitialState(self, gameState):
        BustersAgent.registerInitialState(self, gameState)
        self.distancer = Distancer(gameState.data.layout, False)

        # Whether to use the distance (true for v1 and v2, false for v3)
        self.dis = True

        # For computing the class values in the policies.
        self.clusters = 8
        self.classes = 4
        self.classCounts = [[0 for i in range(self.classes)]for j in range(self.clusters)]

        self.classIndex = 2
        self.clusterIndex = 3

        self.readInstances()

        # Used to store the training instances.
        self.numInstances = 52
        self.numAttributes = 4
        #self.instances = [[" " for i in range(self.numAttributes)] for j in range(self.numInstances)]
        self.ins = [" " for i in range(self.numInstances)]

        # The library requires the Java virtual machine (JVM) to be running
        jvm.start()

        # Build the model
        loader = Loader(classname="weka.core.converters.ArffLoader")
        data = loader.load_file("/home/dot/Escritorio/Universidad/Machine Learning/practica 2/Outputs/agent_header.arff")

        self.clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", str(self.clusters)])
        self.clusterer.build_clusterer(data)

        print(self.clusterer)

        # Apply the policy
        self.politicaMax()


    def readInstances(self):

        # Path of the agent file (instances without header).
        path = os.getcwd() + "/Outputs/agent.arff"

        f = open(path, 'r')

        index = 0

        # Read each instance
        for line in f:

            # Get the attribute values (String)
            values = line.split(",")

            # Get the class value, from North to West (0 - 3)
            classValue = 0
            classAtt = values[self.classIndex]
            if (classAtt == "East"):
                classValue = 1
            elif (classAtt == "South"):
                classValue = 2
            elif (classAtt == "West"):
                classValue = 3

            # Get the cluster value.
            cluster = values[self.clusterIndex]

            # Increment the class count for the cluster.
            self.classCounts[int(cluster[-2:]) - 1][classValue] += 1

        f.close()

    # Computes the majority class for each cluster
    def politicaMax(self):

        self.max = [0 for i in range(self.clusters)]

        for i in range(self.clusters):

            temp_max = 0
            class_index = 0

            for j in range(self.classes):

                if (self.classCounts[i][j] > temp_max):

                    temp_max = self.classCounts[i][j]
                    class_index = j

            self.max[i] = class_index
            #print(class_index)

        '''
        for i in range(self.clusters):
            print(self.max[i])
        '''

    def chooseAction(self, gameState):

        path = os.getcwd() + "/Outputs/newInstance.arff"

        f = open(path, 'w')

        if (self.dis):
            data = "@RELATION pacman\n" \
                    + "@ATTRIBUTE dis NUMERIC\n" \
                    + "@ATTRIBUTE relPos {-1,0,1,2,3,4,5,6,7,8}\n\n" \
                    + "@DATA\n"
        else:
            data = "@RELATION pacman\n" \
                   + "@ATTRIBUTE relPos {-1,0,1,2,3,4,5,6,7,8}\n\n" \
                   + "@DATA\n"


        # Get the pacman position (x, y)
        pos_pac = gameState.data.agentStates[0].getPosition()


        # Get the distances to the ghosts
        for i in range(1, gameState.getNumAgents()):

            # Compute the real (maze) distance to ghost i
            pos_ghost = gameState.data.agentStates[i].getPosition()

            distance = self.distancer.getDistance(pos_pac, pos_ghost)

            # A distance greater than 1000 means that ghost has already been eaten
            if (self.dis):
                if (distance > 1000):
                    data = data + ("-1,")
                else:
                    # Normalization: (distance - min)/(max - min): min = 1, max = 21
                    distance = (distance - 1) / (21 - 1)
                    data = data + str(distance) + ","


        # Get the relative positions of the ghosts with respect to the pacman
        for i in range(1, gameState.getNumAgents()):

            pos_ghost = gameState.data.agentStates[i].getPosition()

            if (pos_ghost[1] < 3):
                data = data + "-1,"
                continue

            # If the ghost is at the same position, mark it as 0
            if (pos_ghost == pos_pac):
                data = data + "0,"

            # Determine the relative positions
            # {NORTH = 1, NORTH_EAST = 2, EAST = 3, SOUTH_EAST = 4, SOUTH = 5, SOUTH_WEST = 6, WEST = 7, NORTH_WEST = 8}.
            if (pos_ghost[0] > pos_pac[0]):
                if (pos_ghost[1] > pos_pac[1]):
                    data = data + "2,"
                elif (pos_ghost[1] < pos_pac[1]):
                    data = data + "4,"
                else:
                    data = data + "3,"
            elif (pos_ghost[0] < pos_pac[0]):
                if (pos_ghost[1] > pos_pac[1]):
                    data = data + "8,"
                elif (pos_ghost[1] < pos_pac[1]):
                    data = data + "6,"
                else:
                    data = data + "7,"
            else:
                if (pos_ghost[1] > pos_pac[1]):
                    data = data + "1,"
                else:
                    data = data + "5,"

        data = data + "\n"

        #print(data)

        f.write(data)

        f.close()

        loader = Loader(classname="weka.core.converters.ArffLoader")
        newData = loader.load_file("/home/dot/Escritorio/Universidad/Machine Learning/practica 2/Outputs/newInstance.arff")

        dir = 4
        direction = Directions.STOP

        for inst in newData:
            cl = self.clusterer.cluster_instance(inst)
            #print(cl)
            dir = self.max[cl]
            #print(dir)


        if (dir == 0):
            direction = Directions.NORTH
        elif (dir == 1):
            direction = Directions.EAST
        elif (dir == 2):
            direction = Directions.SOUTH
        elif (dir == 3):
            direction = Directions.WEST

        #print(direction)
        return direction
Example #29
class WekaCluster(BaseEstimator, OptionHandler, ClusterMixin):
    """
    Wraps a Weka cluster within the scikit-learn framework.
    """
    def __init__(self,
                 jobject=None,
                 cluster=None,
                 classname=None,
                 options=None,
                 nominal_input_vars=None,
                 num_nominal_input_labels=None):
        """
        Initializes the estimator. It can be instantiated via one of the following parameters, in order of priority:
        1. JB_Object representing a Java Clusterer object
        2. Clusterer pww3 wrapper
        3. classname/options

        :param jobject: the JB_Object representing a Weka cluster to use
        :type jobject: JB_Object
        :param cluster: the cluster wrapper to use
        :type cluster: Clusterer
        :param classname: the classname of the Weka cluster to instantiate
        :type classname: str
        :param options: the command-line options of the Weka cluster to instantiate
        :type options: list
        :param nominal_input_vars: the input variables to treat as nominal (optional; list of 0-based indices or range string)
        :type nominal_input_vars: list or str
        :param num_nominal_input_labels: the dictionary with the number of labels for the nominal input variables (key is 0-based attribute index)
        :type num_nominal_input_labels: dict
        """
        if jobject is not None:
            _jobject = jobject
        elif cluster is not None:
            _jobject = cluster.jobject
        elif classname is not None:
            if options is None:
                options = []
            cluster = Clusterer(classname=classname, options=options)
            _jobject = cluster.jobject
        else:
            raise Exception("At least Java classname must be provided!")

        if not is_instance_of(_jobject, "weka.clusterers.Clusterer"):
            raise Exception(
                "Java object does not implement weka.clusterers.Clusterer!")

        super(WekaCluster, self).__init__(_jobject)
        self._cluster = Clusterer(jobject=_jobject)
        self.header_ = None
        # the following references are required for get_params/set_params
        self._classname = classname
        self._options = options
        self._nominal_input_vars = nominal_input_vars
        self._num_nominal_input_labels = num_nominal_input_labels

    @property
    def cluster(self):
        """
        Returns the underlying cluster object, if any.

        :return: the cluster object
        :rtype: Clusterer
        """
        return self._cluster

    @property
    def header(self):
        """
        Returns the underlying dataset header, if any.

        :return: the dataset structure
        :rtype: Instances
        """
        return self.header_

    def fit(self, data, targets=None):
        """
        Trains the cluster.

        :param data: the input variables as matrix, array-like of shape (n_samples, n_features)
        :type data: ndarray
        :param targets: ignored
        :type targets: ndarray
        :return: the cluster
        :rtype: WekaCluster
        """
        if self._nominal_input_vars is not None:
            data = to_nominal_attributes(data, self._nominal_input_vars)
        d = to_instances(data,
                         num_nominal_labels=self._num_nominal_input_labels)
        self._cluster.build_clusterer(d)
        self.header_ = d.template_instances(d, 0)
        return self

    def predict(self, data, targets=None):
        """
        Predicts cluster labels.

        :param data: the input variables as matrix, array-like of shape (n_samples, n_features)
        :type data: ndarray
        :param targets: ignored
        :type targets: ndarray
        :return: the cluster labels (of type int)
        :rtype: ndarray
        """
        check_is_fitted(self)
        if self._nominal_input_vars is not None:
            data = to_nominal_attributes(data, self._nominal_input_vars)
        result = []
        for d in data:
            inst = to_instance(self.header_, d)
            result.append(int(self._cluster.cluster_instance(inst)))
        return np.array(result)

    def fit_predict(self, data, targets=None):
        """
        Trains the cluster and returns the cluster labels.

        :param data: the input variables as matrix, array-like of shape (n_samples, n_features)
        :type data: ndarray
        :param targets: ignored
        :type targets: ndarray
        :return: the cluster labels (of type int)
        :rtype: ndarray
        """
        self.fit(data)
        return self.predict(data)

    def get_params(self, deep=True):
        """
        Returns the parameters for this cluster, basically classname and options list.

        :param deep: ignored
        :type deep: bool
        :return: the dictionary with options
        :rtype: dict
        """
        result = dict()
        result["classname"] = self._classname
        result["options"] = self._options
        if self._nominal_input_vars is not None:
            result["nominal_input_vars"] = self._nominal_input_vars
        if self._num_nominal_input_labels is not None:
            result["num_nominal_input_labels"] = self._num_nominal_input_labels
        return result

    def set_params(self, **params):
        """
        Sets the options for the cluster, expects 'classname' and 'options'.

        :param params: the parameter dictionary
        :type params: dict
        """
        if len(params) == 0:
            return
        if "classname" not in params:
            raise Exception("Cannot find 'classname' in parameters!")
        if "options" not in params:
            raise Exception("Cannot find 'options' in parameters!")
        self._classname = params["classname"]
        self._options = params["options"]
        self._cluster = Clusterer(classname=self._classname,
                                  options=self._options)
        self._nominal_input_vars = None
        if "nominal_input_vars" in params:
            self._nominal_input_vars = params["nominal_input_vars"]
        self._num_nominal_input_labels = None
        if "num_nominal_input_labels" in params:
            self._num_nominal_input_labels = params["num_nominal_input_labels"]

    def __str__(self):
        """
        For printing the model.

        :return: the model representation, if any
        :rtype: str
        """
        if self._cluster is None:
            return self._classname + ": No model built yet"
        else:
            return str(self._cluster)

    def __copy__(self):
        """
        Creates a deep copy of itself.

        :return: the copy
        :rtype: WekaCluster
        """
        result = WekaCluster(jobject=deepcopy(self.jobject))
        result._classname = self._classname
        result._options = self._options[:]
        return result

    def __repr__(self, N_CHAR_MAX=700):
        """
        Returns a valid Python string using its classname and options.

        :param N_CHAR_MAX: ignored
        :type N_CHAR_MAX: int
        :return: the representation
        :rtype: str
        """
        if isinstance(self._nominal_input_vars, str):
            return "WekaCluster(classname='%s', options=%s, nominal_input_vars='%s')" % (
                self._cluster.classname, str(
                    self._cluster.options), str(self._nominal_input_vars))
        else:
            return "WekaCluster(classname='%s', options=%s, nominal_input_vars=%s)" % (
                self._cluster.classname, str(
                    self._cluster.options), str(self._nominal_input_vars))
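A minimal usage sketch for this wrapper, assuming a running JVM and toy numeric data (the values are invented for illustration):

import numpy as np
import weka.core.jvm as jvm

jvm.start()
X = np.array([[1.0, 2.0], [1.1, 1.9], [8.0, 8.2], [7.9, 8.1]])   # toy data
wc = WekaCluster(classname="weka.clusterers.SimpleKMeans", options=["-N", "2"])
labels = wc.fit_predict(X)   # scikit-learn style: build the clusterer, then label each row
print(labels)                # e.g. [0 0 1 1]; the cluster numbering may differ
jvm.stop()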
Example #30
eca.drop('APROVADO', axis=1, inplace=True)

eca.to_csv('temp.csv', index=False)

from weka.clusterers import Clusterer
import weka.core.jvm as jvm
import weka.core.serialization as serialization

jvm.start()

# run the technique varying the number of clusters from 1 to 9
for i in range(1, 10):
    print('**************Number of clusters: ' + str(i))
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                          options=["-N", str(i)])
    clusterer.build_clusterer(eca)
    print(clusterer)

clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                      options=["-N", "4"])
clusterer.build_clusterer(eca)
print(clusterer)
serialization.write("model/kmeans_eca_reprovacao.model", clusterer)

# load the model
'''objects = serialization.read_all("cluster.model")
clusterer = Clusterer(jobject=objects[0])

data_aluno = loader.load_file("aluno_temp.csv")
for instancia in data_aluno:
    resultado = clusterer.cluster_instance(instancia) 
Example #31
class ClusteredAgent(BustersAgent):
    "An agent that charges the closest ghost."

    def __init__(self, index = 0, inference = "ExactInference", ghostAgents = None):
        BustersAgent.__init__(self, index, inference, ghostAgents)
        self.previousDistances = [0,0,0,0]
        jvm.start(max_heap_size="512m")
        self.loader = Loader(classname="weka.core.converters.ArffLoader")
        self.data = self.loader.load_file("data/game_toCluster.arff")
        self.data.delete_last_attribute()
        self.clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "10", "-S", "4", "-I", "500"])
        self.clusterer.build_clusterer(self.data)
        self.inst = ""
        self.data = self.loader.load_file("data/game_toCluster.arff")
        addCluster = Filter(classname="weka.filters.unsupervised.attribute.AddCluster", options=["-W", "weka.clusterers.SimpleKMeans -N 10 -S 4 -I 500", "-I", "last"])
        addCluster.inputformat(self.data)
        filtered = addCluster.filter(self.data)
        self.f = open('data/addCluster.arff', 'w+')
        self.f.write(str(filtered))
        self.clustered_data = self.classifyData('data/addCluster.arff')


    def classifyData(self, filename):
        self.data_clust = [[],[],[],[],[],[],[],[],[],[]]
        with open(filename, "r") as f:
            for line in f:
                if "@" not in line or line != "\n":
                    cluster_name = line.split(",")[-1]
                    if cluster_name == "cluster1\n":
                        self.data_clust[0].append(line)
                    elif cluster_name == "cluster2\n":
                        self.data_clust[1].append(line)
                    elif cluster_name == "cluster3\n":
                        self.data_clust[2].append(line)
                    elif cluster_name == "cluster4\n":
                        self.data_clust[3].append(line)
                    elif cluster_name == "cluster5\n":
                        self.data_clust[4].append(line)
                    elif cluster_name == "cluster6\n":
                        self.data_clust[5].append(line)
                    elif cluster_name == "cluster7\n":
                        self.data_clust[6].append(line)
                    elif cluster_name == "cluster8\n":
                        self.data_clust[7].append(line)
                    elif cluster_name == "cluster9\n":
                        self.data_clust[8].append(line)
                    elif cluster_name == "cluster10\n":
                        self.data_clust[9].append(line)
        return self.data_clust

    def registerInitialState(self, gameState):
        "Pre-computes the distance between every two points."
        BustersAgent.registerInitialState(self, gameState)

    def getInstance(self, gameState):

        headers = ""
        headers = headers + "@relation prueba\n\n"

        headers = headers + "@attribute score NUMERIC\n"

        headers = headers + "@attribute ghosts-living NUMERIC\n"

        headers = headers + "@attribute distance-ghost1 NUMERIC \n"
        headers = headers + "@attribute distance-ghost2 NUMERIC \n"
        headers = headers + "@attribute distance-ghost3 NUMERIC \n"
        headers = headers + "@attribute distance-ghost4 NUMERIC \n"

        headers = headers + "@attribute prev-distance-ghost1 NUMERIC \n"
        headers = headers + "@attribute prev-distance-ghost2 NUMERIC \n"
        headers = headers + "@attribute prev-distance-ghost3 NUMERIC \n"
        headers = headers + "@attribute prev-distance-ghost4 NUMERIC \n"

        headers = headers + "@attribute posX NUMERIC\n"
        headers = headers + "@attribute posY NUMERIC\n"

        headers = headers + "@attribute direction {North, South, East, West, Stop}\n"

        headers = headers + "@attribute wall-east {True, False}\n"
        headers = headers + "@attribute wall-south {True, False}\n"
        headers = headers + "@attribute wall-west {True, False}\n"
        headers = headers + "@attribute wall-north {True, False}\n"

        headers = headers + "@data\n\n\n"

        file = open('data/instances.arff', 'w+')
        file.write(headers)

        line = ""
        line = line + str(gameState.data.score) + ","


        livingGhosts = 0
        for i in gameState.livingGhosts[1:]:
            livingGhosts += 1
        line = line + str(livingGhosts) + ","

        # include the distances to the ghosts in the current turn
        for i in range(len(gameState.livingGhosts[1:])):
            if gameState.livingGhosts[i] is False:
                line = line + "0" + ","
            else:
                line = line +\
                str(self.distancer.getDistance(gameState.getPacmanPosition(), gameState.getGhostPosition(i))) + ","


        # include the distances to the ghosts in the previous turn
        for i in self.previousDistances:
            line = line + str(i) + ","

         # store the distances of this turn for the next one
        for i in range(len(gameState.livingGhosts[1:])):
            if gameState.livingGhosts[i] is False:
                self.previousDistances[i] = 0
            else:
                self.previousDistances[i] = self.distancer.getDistance(gameState.getPacmanPosition(), gameState.getGhostPosition(i))

        line = line +\
        str(gameState.data.agentStates[0].getPosition()[0]) + "," +\
        str(gameState.data.agentStates[0].getPosition()[1])+ "," +\
        str(gameState.data.agentStates[0].getDirection()) + "," +\
        str(gameState.hasWall(gameState.getPacmanPosition()[0] - 1, gameState.getPacmanPosition()[1])) + "," +\
        str(gameState.hasWall(gameState.getPacmanPosition()[0], gameState.getPacmanPosition()[1] - 1)) + "," +\
        str(gameState.hasWall(gameState.getPacmanPosition()[0] + 1, gameState.getPacmanPosition()[1])) + "," +\
        str(gameState.hasWall(gameState.getPacmanPosition()[0], gameState.getPacmanPosition()[1] + 1)) + ",?"


        file.write(line)
        file.close()

        loader = Loader(classname="weka.core.converters.ArffLoader")
        data = loader.load_file("data/instances.arff")
        data.class_is_last()   # set class attribute
        for index, inst in enumerate(data):
            pred = self.clusterer.cluster_instance(inst)
            self.inst = inst
        return pred

    def closeMove(self, move, option):

        if move == Directions.NORTH:
            if option == 0:
                return Directions.EAST
            elif option == 1:
                return Directions.WEST
            else:
                return Directions.SOUTH
        elif move == Directions.SOUTH:
            if option == 0:
                return Directions.EAST
            elif option == 1:
                return Directions.WEST
            else:
                return Directions.NORTH
        elif move == Directions.EAST:
            if option == 0:
                return Directions.NORTH
            elif option == 1:
                return Directions.SOUTH
            else:
                return Directions.WEST
        elif move == Directions.WEST:
            if option == 0:
                return Directions.NORTH
            elif option == 1:
                return Directions.SOUTH
            else:
                return Directions.EAST
        return Directions.SOUTH

    def chooseAction(self, gameState):
        start = self.startMeasuring(gameState)
        move = self.getMove(ClusteredAgent.getInstance(self, gameState))
        end = self.endMeasuring()
        self.f_stats.write(str(end - start) + "\n")
        if move in gameState.getLegalActions(0):
            return move

        # When chose an illegal action, try to round the obstacle
        rand = random.randint(0,1)
        closemove = self.closeMove(move, rand)
        if closemove in gameState.getLegalActions(0):
            return closemove
        closemove = self.closeMove(move, (rand+1)%2)
        if closemove in gameState.getLegalActions(0):
            return closemove

        # When this is not possible, we can only backtrack
        return self.closeMove(move, 2)

    def getMove(self, clusterNum):
        # get the closest instance
        values = []
        for instance in self.clustered_data[clusterNum]:
            values.append(self.getSimilarity(instance))

        inst = values.index(min(values))
        # return the movement
        return self.clustered_data[clusterNum][inst].split(",")[-2]

    def similarityFunc(self, attrs):
        # ghosts-living
        a = float(attrs[1]) * 0.2

        # distance-ghosts
        dist = 0
        for i in attrs[2:6]:
            dist += float(i)
        a += dist * 0.2

        # poxX and posY
        a += float(int(attrs[10]) + int(attrs[11])) * 0.2

        # direction
        a += float(move_to_num[attrs[12]]) * 0.2

        # walls
        wall = 0
        for i in attrs[13:17]:
            wall += (i.strip() == "True")  # the attribute values are the strings "True"/"False"
        a += wall * 0.2
        return a

    def getSimilarity(self, instance):
        attrs_known_inst = instance.split(",")
        attrs_new_inst = str(self.inst).split(",")

        a = self.similarityFunc(attrs_known_inst)
        b = self.similarityFunc(attrs_new_inst)

        return abs(a - b)