def main():
    """
    Just runs some example code.

    Trains SimpleKMeans (k=3) on the iris data without its class
    attribute, then runs a classes-to-clusters evaluation against the
    original, class-bearing dataset.
    """
    # locate and load the dataset, marking the last attribute as class
    arff_path = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + arff_path)
    loader = Loader("weka.core.converters.ArffLoader")
    dataset = loader.load_file(arff_path)
    dataset.class_is_last()

    # the clusterer must not see the class attribute, so train on a copy
    train = Instances.copy_instances(dataset)
    train.no_class()
    train.delete_last_attribute()

    # build the clusterer
    helper.print_title("Training SimpleKMeans clusterer")
    kmeans = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    kmeans.build_clusterer(train)
    print("done")

    # classes-to-clusters evaluation runs against the class-bearing data
    evaluation = ClusterEvaluation()
    evaluation.set_model(kmeans)
    evaluation.test_model(dataset)
    helper.print_title("Cluster results")
    print(evaluation.cluster_results)
    helper.print_title("Classes to clusters")
    print(evaluation.classes_to_clusters)
def main():
    """
    Just runs some example code.

    Demonstrates three ways of clustering the iris data:
    batch SimpleKMeans, a FilteredClusterer that strips the class
    attribute itself, and incremental Cobweb clustering.
    """
    # --- batch clustering -------------------------------------------------
    arff_path = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + arff_path)
    loader = Loader("weka.core.converters.ArffLoader")
    dataset = loader.load_file(arff_path)
    # clustering is unsupervised: drop the class attribute
    dataset.delete_last_attribute()

    helper.print_title("Training SimpleKMeans clusterer")
    kmeans = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    kmeans.build_clusterer(dataset)
    print(kmeans)

    helper.print_info("Evaluating on data")
    evaluation = ClusterEvaluation()
    evaluation.set_model(kmeans)
    evaluation.test_model(dataset)
    print("# clusters: " + str(evaluation.num_clusters))
    print("log likelihood: " + str(evaluation.log_likelihood))
    print("cluster assignments:\n" + str(evaluation.cluster_assignments))
    plc.plot_cluster_assignments(evaluation, dataset, inst_no=True)

    # --- filtered clusterer -----------------------------------------------
    # the Remove filter strips the class attribute inside the meta-clusterer
    helper.print_title("Filtered clusterer")
    loader = Loader("weka.core.converters.ArffLoader")
    dataset = loader.load_file(arff_path)
    kmeans = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    strip_class = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    fclusterer = FilteredClusterer()
    fclusterer.clusterer = kmeans
    fclusterer.filter = strip_class
    fclusterer.build_clusterer(dataset)
    print(fclusterer)

    # --- incremental clustering -------------------------------------------
    # load the data row by row and feed each filtered instance to Cobweb
    helper.print_title("Incremental clusterer")
    loader = Loader("weka.core.converters.ArffLoader")
    iris_inc = loader.load_file(arff_path, incremental=True)
    cobweb = Clusterer("weka.clusterers.Cobweb")
    strip_class = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
    strip_class.inputformat(iris_inc)
    iris_filtered = strip_class.outputformat()
    cobweb.build_clusterer(iris_filtered)
    for inst in loader:
        strip_class.input(inst)
        cobweb.update_clusterer(strip_class.output())
    cobweb.update_finished()
    print(cobweb.to_commandline())
    print(cobweb)
    print(cobweb.graph)
    plg.plot_dot_graph(cobweb.graph)
def evaluation_data(self, model):
    """
    Evaluate a built clusterer on the test dataset and dump per-instance
    labels to 'result_data.txt'.

    Each instance is written as "<instance>,anomaly" when it was assigned
    to cluster 0, otherwise "<instance>,normal".

    :param model: the trained clusterer to evaluate
    :return: the textual cluster results of the evaluation
    :raises Exception: re-raises any failure after logging the traceback
    """
    try:
        loader = Loader(classname="weka.core.converters.ArffLoader")
        data_test = loader.load_file(self.dataTestName)

        evaluation = ClusterEvaluation()
        evaluation.set_model(model)
        evaluation.test_model(data_test)
        cluster_ass = evaluation.cluster_assignments

        # context manager guarantees the file is closed even on error;
        # the original left the handle open if a write raised
        with open("result_data.txt", "w+") as f:
            for i, ins in enumerate(data_test):
                # cluster 0 is treated as the anomaly cluster here
                # NOTE(review): assumes cluster 0 == anomalies — confirm
                # against how the model was trained
                stt = "anomaly" if cluster_ass[i] == 0 else "normal"
                f.write(str(ins) + "," + stt + "\n")

        return evaluation.cluster_results
    except Exception:
        # log the traceback BEFORE re-raising; the original printed it
        # after `raise e`, which made the print unreachable
        print(traceback.format_exc())
        raise
def run_clustering_task7_manual(self, output_directory, clusterer_name, num_clusters, seed=10):
    """
    Build the named clusterer on the training data and run a
    classes-to-clusters evaluation, accumulating the report via
    print_both and saving it to the output directory.

    :param output_directory: where the results file is written
    :param clusterer_name: fully-qualified weka clusterer class name
    :param num_clusters: value for the clusterer's -N option
    :param seed: value for the clusterer's -S option
    """
    # cluster on a copy without the class / first attribute
    working = Instances.copy_instances(self.training_data)
    working.no_class()
    working.delete_first_attribute()

    clusterer_name_short = clusterer_name.replace("weka.clusterers.", "")

    # build the clusterer and record the model plus build time
    print("\nBuilding " + clusterer_name_short + " Clusterer on training data.")
    start = time.time()
    model = Clusterer(
        classname=clusterer_name,
        options=["-N", str(num_clusters), "-S", str(seed)])
    model.build_clusterer(working)
    results = ""
    results = self.print_both(str(model), results)
    results = self.print_both(
        "Clusterer Built in " + str(time.time() - start) + " secs.\n", results)

    # classes-to-clusters evaluation against the original training data
    results = self.print_both("\nClustering data.", results)
    start = time.time()
    evaluation = ClusterEvaluation()
    evaluation.set_model(model)
    evaluation.test_model(self.training_data)
    results = self.print_both("\nCluster results\n", results)
    results = self.print_both(str(evaluation.cluster_results), results)
    results = self.print_both("\nClasses to clusters\n", results)
    results = self.print_both(str(evaluation.classes_to_clusters), results)
    results = self.print_both(
        "\nClustered data in " + str(time.time() - start) + " secs.\n", results)

    # persist the accumulated report
    self.save_results(
        clusterer_name_short + "_" + "N" + str(num_clusters) + "_S" + str(seed),
        results, output_directory)
def run_SKMeans_137(self):
    """
    Cluster the loaded dataset with SimpleKMeans (k=137) and write three
    dated output files under <output_dir>/SKMeans:

    - *.cl_eval.txt   overall cluster evaluation
    - *.cl_descr.txt  per-instance cluster index + membership distribution
    - *.cl_assign.txt per-row cluster assignment

    :return: an empty tuple (kept for caller compatibility)
    """
    # construct output paths from the input file name and today's date
    output_prefix = os.path.split(self.input_path)[-1].split(".")[0]
    print(output_prefix)
    write_date = output_prefix + "." + str(datetime.now().date())
    SKMeans_dir = os.path.join(self.output_dir, "SKMeans")
    eval_path = os.path.join(SKMeans_dir, write_date + ".cl_eval.txt")
    clust_desc_path = os.path.join(SKMeans_dir, write_date + ".cl_descr.txt")
    clust_assign_path = os.path.join(SKMeans_dir, write_date + ".cl_assign.txt")

    # create output dir if it doesn't already exist
    if not os.path.exists(SKMeans_dir):
        os.makedirs(SKMeans_dir)

    # NOTE(review): operates on the loaded data directly (the deepcopy was
    # already disabled in the original)
    data_clone = self.data_loaded
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "137"])
    clusterer.build_clusterer(data_clone)

    # cluster evaluation
    evaluation = ClusterEvaluation()
    evaluation.set_model(clusterer)
    evaluation.test_model(data_clone)
    with open(eval_path, 'w') as outfile:
        outfile.write("number of clusters: \t" + str(evaluation.num_clusters) + "\n")
        # BUG FIX: this line previously wrote num_clusters again instead of
        # the log likelihood
        outfile.write("log likelihood: \t" + str(evaluation.log_likelihood) + "\n")
        outfile.write("cluster assignments: \t" + str(evaluation.cluster_assignments) + "\n")
        outfile.write("***********************\n")
        outfile.write("\t".join(["SKmeans Cluster Evaluation Results\n"]))  # header
        outfile.write(str(evaluation.cluster_results) + "\n")

    # description of clusters: one row per instance
    with open(clust_desc_path, 'w') as outfile:
        outfile.write(",".join(["cluster_num", "distribution\n"]))  # header
        for inst in data_clone:
            cl = clusterer.cluster_instance(inst)  # 0-based cluster index
            dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
            outfile.write(",".join([str(cl), str(dist)]))
            outfile.write("\n")

    # cluster assignment by row
    with open(clust_assign_path, 'w') as outfile:
        outfile.write(",".join(["row_num", "SKMeans\n"]))  # header
        for i, assignment in enumerate(evaluation.cluster_assignments):
            outfile.write(",".join([str(i), str(assignment)]))
            outfile.write("\n")

    return ()
def run_clusterer(file):
    """
    Run SimpleKMeans on an ARFF file and write a classes-to-clusters
    evaluation into a 'cluster_results_optimum' directory next to it.

    Uses 2 clusters by default, or 7 for files named 'fer2018_*'.

    :param file: pathlib.Path to the ARFF file (parameter name kept for
        callers, although it shadows the builtin)
    """
    # Get filename from Pathlib object
    filename = file.parts[-1]
    out_dir = file.parents[0]  # renamed from `dir`, which shadowed the builtin

    print("Running Clusterer on %s" % filename)
    if not filename.endswith(".arff"):
        print("%s not ARFF file." % filename)
        return

    # Removes '.arff' from filename
    filename_base = filename[:-5]

    # Load data with class as first attr
    full = load_Arff_file(file)
    full.class_is_first()
    # (a second, unused load of the same file was removed here)

    # clusterer must not see the class attribute: work on a copy
    data = Instances.copy_instances(full)
    data.no_class()
    data.delete_first_attribute()

    out_dir = out_dir / "cluster_results_optimum"
    out_dir.mkdir(parents=True, exist_ok=True)

    # number of clusters depends on the dataset
    n = "2"
    if filename_base.startswith("fer2018_"):
        print("Changing number of clusters to 7")
        n = "7"
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                          options=["-S", "10", "-N", n])
    clusterer.build_clusterer(data)

    # classes-to-clusters evaluation against the class-bearing data
    evaluation = ClusterEvaluation()
    evaluation.set_model(clusterer)
    evaluation.test_model(full)

    str1 = str(filename_base) + "_cl_res.txt"
    output_results = out_dir / str1
    output_cluster(evaluation, output_results)
def run_cluster_simplek(self, output_directory, exc_class=False, num_clusters=7):
    """
    Build a SimpleKMeans clusterer on the training data and evaluate it,
    either on the class-free copy (exc_class=True) or classes-to-clusters
    on the original training data, then save the accumulated report.

    :param output_directory: where the results file is written
    :param exc_class: evaluate without the class attribute when True
    :param num_clusters: value for the clusterer's -N option
    """
    # cluster on a copy without the class / first attribute
    working = Instances.copy_instances(self.training_data)
    working.no_class()
    working.delete_first_attribute()

    # build the clusterer and record the model plus build time
    print("\nBuilding Clusterer on training data.")
    start = time.time()
    model = Clusterer(classname="weka.clusterers.SimpleKMeans",
                      options=["-N", str(num_clusters)])
    model.build_clusterer(working)
    results = ""
    results = self.print_both(str(model), results)
    results = self.print_both(
        "Clusterer Built in " + str(time.time() - start) + " secs.\n", results)

    # evaluate either on the class-free data or classes-to-clusters
    results = self.print_both("\nClustering data.", results)
    start = time.time()
    evaluation = ClusterEvaluation()
    evaluation.set_model(model)
    if exc_class:
        clsexc = "_NO_Class"
        evaluation.test_model(working)
    else:
        clsexc = ""
        evaluation.test_model(self.training_data)
    results = self.print_both("\nCluster results\n", results)
    results = self.print_both(str(evaluation.cluster_results), results)
    results = self.print_both("\nClasses to clusters\n", results)
    results = self.print_both(str(evaluation.classes_to_clusters), results)
    results = self.print_both(
        "\nClustered data in " + str(time.time() - start) + " secs.\n", results)

    # persist the accumulated report
    self.save_results("SimpleKM" + clsexc + "_", results, output_directory)
# load iris fname = data_dir + os.sep + "iris.arff" print("\nLoading dataset: " + fname + "\n") loader = Loader(classname="weka.core.converters.ArffLoader") data = loader.load_file(fname) # remove class attribute flt = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"]) flt.set_inputformat(data) filtered = flt.filter(data) # build KMeans print("\n--> SimpleKMeans\n") cl = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"]) cl.build_clusterer(filtered) evl = ClusterEvaluation() evl.set_model(cl) evl.test_model(filtered) print(evl.get_cluster_results()) plc.plot_cluster_assignments(evl, data, atts=[], inst_no=True, wait=True) # use AddCluster filter print("\n--> AddCluster filter\n") flt = Filter(classname="weka.filters.unsupervised.attribute.AddCluster", options=["-W", "weka.clusterers.SimpleKMeans -N 3"]) flt.set_inputformat(filtered) addcl = flt.filter(filtered) print(addcl) # classes-to-clusters evaluation print("\n--> Classes to clusters\n")
# load iris fname = data_dir + os.sep + "iris.arff" print("\nLoading dataset: " + fname + "\n") loader = Loader(classname="weka.core.converters.ArffLoader") data = loader.load_file(fname) # remove class attribute flt = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"]) flt.inputformat(data) filtered = flt.filter(data) # build KMeans print("\n--> SimpleKMeans\n") cl = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"]) cl.build_clusterer(filtered) evl = ClusterEvaluation() evl.set_model(cl) evl.test_model(filtered) print(evl.cluster_results) plc.plot_cluster_assignments(evl, data, atts=[], inst_no=True, wait=True) # use AddCluster filter print("\n--> AddCluster filter\n") flt = Filter(classname="weka.filters.unsupervised.attribute.AddCluster", options=["-W", "weka.clusterers.SimpleKMeans -N 3"]) flt.inputformat(filtered) addcl = flt.filter(filtered) print(addcl) # classes-to-clusters evaluation print("\n--> Classes to clusters\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# build KMeans with several seeds; -1 means "use the clusterer's default"
seeds = [-1, 11, 12]
for seed in seeds:
    if seed == -1:
        seedStr = "default"
    else:
        seedStr = str(seed)
    print("\n--> SimpleKMeans - seed " + seedStr + "\n")
    cl = Clusterer("weka.clusterers.SimpleKMeans")
    if seed != -1:
        # only override -S when an explicit seed was requested
        cl.options = ["-S", str(seed)]
    cl.build_clusterer(data)
    evl = ClusterEvaluation()
    evl.set_model(cl)
    evl.test_model(data)
    print(evl.cluster_results)

# build XMeans — it only handles numeric attributes, so keep numeric only
print("\n--> XMeans\n")
flt = Filter(classname="weka.filters.unsupervised.attribute.RemoveType",
             options=["-T", "numeric", "-V"])
flt.inputformat(data)
filtered = flt.filter(data)
cl = Clusterer(classname="weka.clusterers.XMeans")
cl.build_clusterer(filtered)
evl = ClusterEvaluation()
evl.set_model(cl)
evl.test_model(filtered)
# Cluster the data with SimpleKMeans, print the evaluation, then hand off
# to WEKA tools via subprocess for visualization and accuracy measurement.
data.delete_attribute(2)
num_clusters = "6"  # number of clusters for k-means

# perform clustering
clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", num_clusters])
clusterer.build_clusterer(data)
# NOTE(review): cl/dist are computed but unused (the print was disabled);
# kept to preserve the original behavior
for inst in data:
    cl = clusterer.cluster_instance(inst)  # 0-based cluster index
    dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution

# get the data about the clustered instances
evaluation = ClusterEvaluation()
evaluation.set_model(clusterer)
evaluation.test_model(data)
# FIX: was a Python-2 print statement; print() is identical for a single
# argument and matches the rest of the file
print(evaluation.cluster_results)

# call the WEKA GUI to display the clusters
subprocess.call(["java", "-classpath", ".:weka.jar", "VisualizeClusterAssignments",
                 "-t", "data.arff", "-W", "weka.clusterers.SimpleKMeans -N 6"])  # change num_clusters here too

# accuracy for clustering when target is serviceID
subprocess.call(["python", "clusterers.py", "-t", "data_with_class_serviceID.arff",
                 "-c", "last", "weka.clusterers.SimpleKMeans", "-N", num_clusters])
# load iris
fname = data_dir + os.sep + "iris.arff"
print("\nLoading dataset: " + fname + "\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# remove class attribute (clustering is unsupervised)
flt = Filter(classname="weka.filters.unsupervised.attribute.Remove", options=["-R", "last"])
flt.inputformat(data)
filtered = flt.filter(data)

# build KMeans (3 clusters) and evaluate on the filtered data
print("\n--> SimpleKMeans\n")
cl = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
cl.build_clusterer(filtered)
evl = ClusterEvaluation()
evl.set_model(cl)
evl.test_model(filtered)
print(evl.cluster_results)
# plot assignments against the unfiltered data so the class shows up
plc.plot_cluster_assignments(evl, data, atts=[], inst_no=True, wait=True)

# use AddCluster filter: appends the cluster label as a new attribute
print("\n--> AddCluster filter\n")
flt = Filter(classname="weka.filters.unsupervised.attribute.AddCluster",
             options=["-W", "weka.clusterers.SimpleKMeans -N 3"])
flt.inputformat(filtered)
addcl = flt.filter(filtered)
print(addcl)

# classes-to-clusters evaluation
print("\n--> Classes to clusters\n")
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# build KMeans with several seeds; -1 means "use the clusterer's default"
# NOTE(review): this section uses the old setter/getter wrapper API
# (set_options / set_inputformat / get_cluster_results); other sections use
# the property-style API — confirm which wrapper version this script targets
seeds = [-1, 11, 12]
for seed in seeds:
    if seed == -1:
        seedStr = "default"
    else:
        seedStr = str(seed)
    print("\n--> SimpleKMeans - seed " + seedStr + "\n")
    cl = Clusterer("weka.clusterers.SimpleKMeans")
    if seed != -1:
        # only override -S when an explicit seed was requested
        cl.set_options(["-S", str(seed)])
    cl.build_clusterer(data)
    evl = ClusterEvaluation()
    evl.set_model(cl)
    evl.test_model(data)
    print(evl.get_cluster_results())

# build XMeans — it only handles numeric attributes, so keep numeric only
print("\n--> XMeans\n")
flt = Filter(classname="weka.filters.unsupervised.attribute.RemoveType",
             options=["-T", "numeric", "-V"])
flt.set_inputformat(data)
filtered = flt.filter(data)
cl = Clusterer(classname="weka.clusterers.XMeans")
cl.build_clusterer(filtered)
evl = ClusterEvaluation()
evl.set_model(cl)
evl.test_model(filtered)
print(evl.get_cluster_results())
loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file(fname)

# build KMeans with several seeds; -1 means "use the clusterer's default"
seeds = [-1, 11, 12]
for seed in seeds:
    if seed == -1:
        seedStr = "default"
    else:
        seedStr = str(seed)
    print("\n--> SimpleKMeans - seed " + seedStr + "\n")
    cl = Clusterer("weka.clusterers.SimpleKMeans")
    if seed != -1:
        # only override -S when an explicit seed was requested
        cl.options = ["-S", str(seed)]
    cl.build_clusterer(data)
    evl = ClusterEvaluation()
    evl.set_model(cl)
    evl.test_model(data)
    print(evl.cluster_results)

# build XMeans — it only handles numeric attributes, so keep numeric only
print("\n--> XMeans\n")
flt = Filter(classname="weka.filters.unsupervised.attribute.RemoveType",
             options=["-T", "numeric", "-V"])
flt.inputformat(data)
filtered = flt.filter(data)
cl = Clusterer(classname="weka.clusterers.XMeans")
cl.build_clusterer(filtered)
evl = ClusterEvaluation()
evl.set_model(cl)
evl.test_model(filtered)
print(evl.cluster_results)