def simpleKMeansTrain(self, dataf, options, mname, temp=True):
    """Train a SimpleKMeans clusterer on a dataset and save the model.

    :param dataf: data to be clustered (reference passed to self.loadData)
    :param options: SimpleKMeans options, e.g. ["-N", "10", "-S", "10"]
        -N -> number of clusters
        -A -> distance function to use
              (default "weka.core.EuclideanDistance -R first-last")
        -I -> maximum number of iterations (default 500)
        -num-slots -> number of execution slots, 1 means no parallelism
        -S -> random number seed (default 10)
    :param mname: model name forwarded to self.saveModel
    :param temp: whether the data file is temporary; forwarded to loadData
    :return: None
    """
    try:
        jvm.start(max_heap_size=self.wHeap)
        # BUG FIX: forward the caller's `temp` flag instead of hard-coding True
        data = self.loadData(dataf, temp=temp)
        clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=options)
        clusterer.build_clusterer(data)
        print(clusterer)
        # cluster the data
        for inst in data:
            cl = clusterer.cluster_instance(inst)  # 0-based cluster index
            dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
            print("cluster=" + str(cl) + ", distribution=" + str(dist))
        self.saveModel(clusterer, 'skm', mname)
    except Exception:
        print(traceback.format_exc())
    finally:
        jvm.stop()
def main():
    """Just runs some example code."""
    # load a dataset
    iris_file = helper.get_data_dir() + os.sep + "iris.arff"
    helper.print_info("Loading dataset: " + iris_file)
    arff_loader = Loader("weka.core.converters.ArffLoader")
    dataset = arff_loader.load_file(iris_file)

    # clustering is unsupervised, so drop the class attribute
    dataset.delete_last_attribute()

    # build a clusterer and output the model
    helper.print_title("Training SimpleKMeans clusterer")
    kmeans = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", "3"])
    kmeans.build_clusterer(dataset)
    print(kmeans)

    # assign every instance to a cluster and show the membership distribution
    helper.print_info("Clustering data")
    for row, inst in enumerate(dataset):
        assigned = kmeans.cluster_instance(inst)
        membership = kmeans.distribution_for_instance(inst)
        print(f"{row + 1}: cluster={assigned}, distribution={membership}")
def assign_cluster(file_location, file_out="clustered.csv", model="kmeans.model", last_filename=False):
    """Label every row of a CSV file with its cluster from a serialized model.

    Writes one line per row to `file_out`: identifier, predicted cluster,
    then the feature values. When `last_filename` is True the identifier is
    taken from the last column and the final two columns are excluded from
    the features; otherwise the first column is the identifier and the rest
    are features.
    """
    rows = read_csv_file(file_location)
    check_jvm()

    # deserialize the previously trained clusterer
    clusterer = Clusterer(jobject=serialization.read(model))

    # create file with cluster group
    with open(file_out, 'w') as output:
        for row_num, attrs in enumerate(rows):
            features = attrs[:-2] if last_filename else attrs[1:]
            identifier = attrs[-1] if last_filename else attrs[0]

            inst = Instance.create_instance(features)
            pred = clusterer.cluster_instance(inst)
            dist = clusterer.distribution_for_instance(inst)

            record = [identifier, pred]
            record.extend(features)

            print(f"{row_num + 1}: label index={pred}, class distribution={dist}")
            output.write(','.join(str(field) for field in record) + '\n')
def simpleKMeansTrain(self, dataf, options, mname, temp=True):
    """Train a SimpleKMeans clusterer on a dataset and save the model.

    :param dataf: data to be clustered (reference passed to self.loadData)
    :param options: SimpleKMeans options, e.g. ["-N", "10", "-S", "10"]
        -N -> number of clusters
        -A -> distance function to use
              (default "weka.core.EuclideanDistance -R first-last")
        -I -> maximum number of iterations (default 500)
        -num-slots -> number of execution slots, 1 means no parallelism
        -S -> random number seed (default 10)
    :param mname: model name forwarded to self.saveModel
    :param temp: whether the data file is temporary; forwarded to loadData
    :return: None
    """
    try:
        jvm.start(max_heap_size=self.wHeap)
        # BUG FIX: forward the caller's `temp` flag instead of hard-coding True
        data = self.loadData(dataf, temp=temp)
        clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=options)
        clusterer.build_clusterer(data)
        # BUG FIX: `print clusterer` / `except Exception, e` were Python 2
        # syntax (SyntaxError under Python 3); converted to Python 3 forms.
        print(clusterer)
        # cluster the data
        for inst in data:
            cl = clusterer.cluster_instance(inst)  # 0-based cluster index
            dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
            print("cluster=" + str(cl) + ", distribution=" + str(dist))
        self.saveModel(clusterer, 'skm', mname)
    except Exception:
        print(traceback.format_exc())
    finally:
        # consistent with the other simpleKMeansTrain variant: always stop the JVM
        jvm.stop()
def run_SKMeans_137(self, num_clusters=137):
    """Run SimpleKMeans on the loaded data and write three report files.

    Outputs, under <output_dir>/SKMeans, named after the input file and the
    current date:
      * .cl_eval.txt   - cluster-evaluation summary
      * .cl_descr.txt  - per-instance cluster index and membership distribution
      * .cl_assign.txt - per-row cluster assignment

    :param num_clusters: number of clusters for SimpleKMeans (default 137,
        preserving the original behavior implied by the method name)
    :return: empty tuple (kept for backward compatibility)
    """
    # construct output paths
    output_prefix = os.path.split(self.input_path)[-1].split(".")[0]
    print(output_prefix)
    write_date = output_prefix + "." + str(datetime.now().date())
    SKMeans_dir = os.path.join(self.output_dir, "SKMeans")
    eval_path = os.path.join(SKMeans_dir, write_date + ".cl_eval.txt")
    clust_desc_path = os.path.join(SKMeans_dir, write_date + ".cl_descr.txt")
    clust_assign_path = os.path.join(SKMeans_dir, write_date + ".cl_assign.txt")

    # create output dir if it doesn't already exist
    if not os.path.exists(SKMeans_dir):
        os.makedirs(SKMeans_dir)

    # build clusters on the already-loaded data
    # NOTE(review): a deepcopy was previously considered here; the data is
    # used as-is, so downstream code must not rely on it being untouched.
    data_clone = self.data_loaded
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                          options=["-N", str(num_clusters)])
    clusterer.build_clusterer(data_clone)

    # cluster evaluation
    evaluation = ClusterEvaluation()
    evaluation.set_model(clusterer)
    evaluation.test_model(data_clone)

    with open(eval_path, 'w') as outfile:
        outfile.write("number of clusters: \t" + str(evaluation.num_clusters) + "\n")
        # BUG FIX: previously wrote evaluation.num_clusters under this label
        outfile.write("log likelihood: \t" + str(evaluation.log_likelihood) + "\n")
        outfile.write("cluster assignments: \t" + str(evaluation.cluster_assignments) + "\n")
        outfile.write("***********************\n")
        outfile.write("\t".join(["SKmeans Cluster Evaluation Results\n"]))  # header
        outfile.write(str(evaluation.cluster_results) + "\n")

    # per-instance description of clusters
    with open(clust_desc_path, 'w') as outfile:
        outfile.write(",".join(["cluster_num", "distribution\n"]))  # header
        for inst in data_clone:
            cl = clusterer.cluster_instance(inst)  # 0-based cluster index
            dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
            outfile.write(",".join([str(cl), str(dist)]))
            outfile.write("\n")

    # cluster assignment by row
    with open(clust_assign_path, 'w') as outfile:
        outfile.write(",".join(["row_num", "SKMeans\n"]))  # header
        for i, inst in enumerate(evaluation.cluster_assignments):
            outfile.write(",".join([str(i), str(inst)]))
            outfile.write("\n")

    return ()
def command():
    """Flask view: cluster a pre-extracted CSV with SimpleKMeans.

    Reads the desired cluster count and column pair from the submitted form,
    loads the matching CSV, trains SimpleKMeans, dumps the model and the
    per-instance assignments to filename.txt, and renders the output page.
    """
    jvm.start()
    import weka.core.converters as converters

    clusters = request.form['clusternum']
    a1 = request.form['firstcol']
    a2 = request.form['secondcol']

    # map each supported column pair to its pre-extracted CSV file
    data_files = {
        ('B', 'C'): "Data.csv",
        ('B', 'D'): "Data1.csv",
        ('C', 'D'): "Data2.csv",
        ('C', 'E'): "Data3.csv",
        ('D', 'E'): "Data4.csv",
    }
    try:
        source = data_files[(a1, a2)]
    except KeyError:
        # BUG FIX: an unsupported pair previously fell through the if/elif
        # chain and crashed later with NameError on `data`; fail explicitly.
        raise ValueError("unsupported column pair: {} {}".format(a1, a2))
    data = converters.load_any_file(source)
    print(data)

    from weka.clusterers import Clusterer
    clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans",
                          options=["-N", "{}".format(clusters)])
    clusterer.build_clusterer(data)
    print(clusterer)

    # BUG FIX: the file was opened without ever being closed (the old
    # f.close() sat after the return statement); use a context manager.
    with open("filename.txt", "w") as f:
        f.write(str(clusterer))
        # cluster the data
        for inst in data:
            cl = clusterer.cluster_instance(inst)  # 0-based cluster index
            dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
            print("cluster=" + str(cl) + ", distribution=" + str(dist))
            f.write("cluster=" + str(cl) + ", distribution=" + str(dist))

    return render_template("output.html")
# (earlier, commented-out experiments with an EM clusterer were kept here:
#  cl2 = clusterEM.cluster_instance(inst)
#  dist2 = clusterEM.distribution_for_instance(inst)
#  print("cluster=" + str(cl2) + ", distribution=" + str(dist2)))

# Train a DBSCAN clusterer, persist it to disk, then reload the saved model
# and use the reloaded copy to label every instance.
clusterDBSCAN = Clusterer(
    classname="weka.clusterers.DBSCAN",
    options=[
        "-E", "0.9",
        "-M", "6",
        "-I", "weka.clusterers.forOPTICSAndDBScan.Databases.SequentialDatabase",
        "-D", "weka.clusterers.forOPTICSAndDBScan.DataObjects.EuclideanDataObject",
    ])
clusterDBSCAN.build_clusterer(data)

# round-trip the model through the serialized file
model_path = os.path.join(modelDir, "dbscan.model")
serialization.write(model_path, clusterDBSCAN)
cluster = Clusterer(jobject=serialization.read(model_path))

for inst in data:
    cl3 = cluster.cluster_instance(inst)
    dist3 = cluster.distribution_for_instance(inst)
    print("cluster=" + str(cl3) + ", distribution=" + str(dist3))

jvm.stop()
# Drop unwanted attributes before clustering (index 2 is removed repeatedly,
# so three consecutive columns starting at position 2 are deleted).
data.delete_attribute(2)
data.delete_attribute(2)
##### Uncomment to save the file which has serviceId as class, forkV and ForkW as attributes
### saver.save_file(data, "data_with_class_serviceID.arff")
data.delete_attribute(2)
# saver.save_file(data, "data.arff")

num_clusters = "6"  # number of clusters for k-means (passed as a string option)

## Performing clustering
clusterer = Clusterer(classname="weka.clusterers.SimpleKMeans", options=["-N", num_clusters])
clusterer.build_clusterer(data)
for inst in data:
    cl = clusterer.cluster_instance(inst)  # 0-based cluster index
    dist = clusterer.distribution_for_instance(inst)  # cluster membership distribution
    # print("cluster=" + str(cl) + ", distribution=" + str(dist))

######### Getting the data about the clustered instances
evaluation = ClusterEvaluation()
evaluation.set_model(clusterer)
evaluation.test_model(data)
# BUG FIX: `print evaluation.cluster_results` was Python 2 statement syntax;
# converted to a Python 3 print() call.
print(evaluation.cluster_results)
# print("# clusters: " + str(evaluation.num_clusters))
# print("log likelihood: " + str(evaluation.log_likelihood))
# print("cluster assignments:\n" + str(evaluation.cluster_assignments))
# plc.plot_cluster_assignments(evaluation, data, [], True)

#### Using WEKA files to get the required results by calling them through this script
######### Calling the WEKA GUI to display the clusters