def tensorflowProcess(clustersNumber, dataLessTarget, datasetName, runinfo=None, initialClusters=None):
    # Local import
    import tensorflow as tf

    '''
    https://www.tensorflow.org/api_docs/python/tf/contrib/factorization/KMeansClustering
    '''
    outputFile = datasetOutFile(datasetName, TENSORFLOW_ALGO, runinfo=runinfo)
    clustersOutputFile = datasetOutFile(datasetName, centroidFor(TENSORFLOW_ALGO), runinfo=runinfo)

    if os.path.exists(outputFile) and os.path.exists(clustersOutputFile):
        print("tensorflow skipped")
        return

    points = dataLessTarget.values

    def input_fn():
        return tf.train.limit_epochs(
            tf.convert_to_tensor(points, dtype=tf.float32), num_epochs=1)

    if initialClusters is None:
        kmeans = tf.contrib.factorization.KMeansClustering(
            num_clusters=clustersNumber, use_mini_batch=False)
    else:
        kmeans = tf.contrib.factorization.KMeansClustering(
            num_clusters=clustersNumber, initial_clusters=initialClusters,
            use_mini_batch=False)

    # Train
    num_iterations = 10
    for _ in range(num_iterations):
        kmeans.train(input_fn)

    with open(outputFile, 'w') as csvfile:
        filewriter = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
        cluster_indices = list(kmeans.predict_cluster_index(input_fn))
        for index, point in enumerate(points):
            cluster_index = cluster_indices[index]
            filewriter.writerow([index, cluster_index])

    # Clusters saving
    with open(clustersOutputFile, 'w') as clusterFile:
        filewriter = csv.writer(clusterFile, quoting=csv.QUOTE_MINIMAL)
        for row in kmeans.cluster_centers():
            filewriter.writerow(row.tolist())
def sklearnProcess(clustersNumber, dataLessTarget, datasetName, runinfo=None, initialClusters=None, zeroTolerance=False):
    # Local import
    import sklearn.cluster

    selectedAlgo = SKLEARN_TOL0_ALGO if zeroTolerance else SKLEARN_ALGO
    outputFile = datasetOutFile(datasetName, selectedAlgo, runinfo=runinfo)
    clustersOutputFile = datasetOutFile(datasetName, centroidFor(selectedAlgo), runinfo=runinfo)

    print("File in {}".format(outputFile))
    print("Clusters in {}".format(clustersOutputFile))

    if os.path.exists(outputFile) and os.path.exists(clustersOutputFile):
        print("sklearn skipped")
        return

    # Create a KMeans model.
    if initialClusters is None:
        # random_state is the seed to be used.
        # By default, k-means++ initialisation is used.
        if zeroTolerance:
            sklearnKmeanModel = sklearn.cluster.KMeans(n_clusters=clustersNumber, tol=0)
        else:
            sklearnKmeanModel = sklearn.cluster.KMeans(n_clusters=clustersNumber)
    else:
        if zeroTolerance:
            sklearnKmeanModel = sklearn.cluster.KMeans(
                n_clusters=clustersNumber, init=initialClusters, tol=0)
        else:
            sklearnKmeanModel = sklearn.cluster.KMeans(
                n_clusters=clustersNumber, init=initialClusters)

    # sklearnKmeanModel = sklearn.cluster.KMeans(n_clusters=clustersNumber, init=initialClusters, tol=0, algorithm='full', max_iter=100)
    sklearnKmeanModel.fit(dataLessTarget)

    with open(outputFile, 'w') as csvfile:
        filewriter = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
        for index, row in dataLessTarget.iterrows():
            filewriter.writerow([index, sklearnKmeanModel.labels_[index]])

    with open(clustersOutputFile, 'w') as clusterFile:
        filewriter = csv.writer(clusterFile, quoting=csv.QUOTE_MINIMAL)
        for row in sklearnKmeanModel.cluster_centers_:
            filewriter.writerow(row.tolist())
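# A minimal, self-contained sketch (not part of the pipeline) of the core
# sklearn.cluster.KMeans call that sklearnProcess wraps: explicit initial
# centroids plus tol=0. The toy data and column names below are invented
# purely for illustration.
def _sklearnKMeansUsageSketch():
    import numpy as np
    import pandas as pd
    import sklearn.cluster

    toyData = pd.DataFrame({"x": [0.0, 0.1, 5.0, 5.1],
                            "y": [0.0, 0.2, 5.0, 4.9]})
    initialCentroids = np.array([[0.0, 0.0], [5.0, 5.0]])

    # n_init=1 because the starting centroids are fixed, so repeated restarts are pointless
    model = sklearn.cluster.KMeans(n_clusters=2, init=initialCentroids, n_init=1, tol=0)
    model.fit(toyData)

    print(model.labels_)           # cluster index assigned to each row
    print(model.cluster_centers_)  # one row of coordinates per centroid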
def shogunProcess(clustersNumber, dataLessTarget, datasetName, runinfo=None, initialClusters=None):
    # Local import
    import shogun

    outputFile = datasetOutFile(datasetName, SHOGUN_ALGO, runinfo=runinfo)
    clustersOutputFile = datasetOutFile(datasetName, centroidFor(SHOGUN_ALGO), runinfo=runinfo)

    if os.path.exists(outputFile) and os.path.exists(clustersOutputFile):
        print("shogun skipped")
        return

    train_features = shogun.RealFeatures(dataLessTarget.values.astype("float64").transpose())

    # Distance metric over feature matrix - Euclidean distance
    distance = shogun.EuclideanDistance(train_features, train_features)

    # KMeans object created
    kmeans = shogun.KMeans(clustersNumber, distance)

    if initialClusters is None:
        # Set KMeans++ flag
        kmeans.set_use_kmeanspp(True)
    else:
        # Set new initial centers
        kmeans.set_initial_centers(initialClusters.astype("float64").transpose())

    # KMeans training
    kmeans.train()

    # Cluster centers
    centers = kmeans.get_cluster_centers()

    # Labels for data points
    result = kmeans.apply()

    with open(outputFile, 'w') as csvfile:
        filewriter = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
        for index, row in dataLessTarget.iterrows():
            filewriter.writerow([index, result[index].item(0)])

    with open(clustersOutputFile, 'w') as clusterFile:
        filewriter = csv.writer(clusterFile, quoting=csv.QUOTE_MINIMAL)
        for row in centers.transpose():
            filewriter.writerow(row.tolist())
def tensorflowProcess(clustersNumber, dataLessTarget, datasetName, runinfo=None, initialClusters=None):
    # Local import
    import tensorflow as tf
    from tensorflow.python.framework import constant_op
    import numpy as np

    '''
    https://www.tensorflow.org/api_docs/python/tf/contrib/factorization/KMeansClustering
    '''
    outputFile = datasetOutFile(datasetName, TENSORFLOW_ALGO, runinfo=runinfo)
    clustersOutputFile = datasetOutFile(datasetName, centroidFor(TENSORFLOW_ALGO), runinfo=runinfo)

    if os.path.exists(outputFile) and os.path.exists(clustersOutputFile):
        print("tensorflow skipped")
        return

    points = dataLessTarget.values

    def get_input_fn():
        def input_fn():
            return constant_op.constant(points.astype(np.float32)), None
        return input_fn

    if initialClusters is None:
        gmm = tf.contrib.factorization.GMM(num_clusters=clustersNumber)
    else:
        gmm = tf.contrib.factorization.GMM(num_clusters=clustersNumber,
                                           initial_clusters=initialClusters)

    gmm.fit(input_fn=get_input_fn(), steps=1)

    with open(outputFile, 'w') as csvfile:
        filewriter = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
        cluster_indices = list(gmm.predict_assignments())
        for index, point in enumerate(points):
            cluster_index = cluster_indices[index]
            filewriter.writerow([index, cluster_index])

    # Clusters saving
    with open(clustersOutputFile, 'w') as clusterFile:
        filewriter = csv.writer(clusterFile, quoting=csv.QUOTE_MINIMAL)
        for row in gmm.cluster_centers():
            filewriter.writerow(row.tolist())
def shogunProcess(clustersNumber, dataLessTarget, datasetName, runinfo=None, initialClusters=None):
    # Local import
    import shogun

    outputFile = datasetOutFile(datasetName, SHOGUN_ALGO, runinfo=runinfo)

    if os.path.exists(outputFile):
        print("shogun skipped")
        return

    train_features = shogun.RealFeatures(dataLessTarget.values.astype("float64").transpose())

    # Distance metric over feature matrix - Euclidean distance
    distance = shogun.EuclideanDistance(train_features, train_features)

    hierarchical = shogun.Hierarchical(clustersNumber, distance)
    hierarchical.train()

    # TODO: makes the Python process die!!!???!!!
    d = hierarchical.get_merge_distances()
    cp = hierarchical.get_cluster_pairs()

    with open(outputFile, 'w') as csvfile:
        filewriter = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
        for index, row in dataLessTarget.iterrows():
            # NOTE: `result` is never computed above; extracting flat labels from the
            # merge distances / cluster pairs is still missing here.
            filewriter.writerow([index, result[index].item(0)])
def sklearnProcess(clustersNumber, dataLessTarget, datasetName, runinfo=None):
    # Local import
    import sklearn.cluster
    import numpy as np

    selectedAlgo = SKLEARN_ALGO
    outputFile = datasetOutFile(datasetName, selectedAlgo, runinfo=runinfo)

    if os.path.exists(outputFile):
        print("sklearn skipped")
        return

    # Spectral clustering can fail on ill-conditioned affinity matrices, so retry until it succeeds.
    check = False
    while not check:
        try:
            builtModel = sklearn.cluster.SpectralClustering(n_clusters=clustersNumber)
            builtModel.fit(dataLessTarget)
            check = True
        except np.linalg.LinAlgError:
            continue
        except AssertionError:
            continue

    with open(outputFile, 'w') as csvfile:
        filewriter = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
        for index, row in dataLessTarget.iterrows():
            filewriter.writerow([index, builtModel.labels_[index]])
def sklearnProcess(clustersNumber, dataLessTarget, datasetName, runinfo=None, zeroTolerance=False):
    # Local import
    import sklearn.mixture

    selectedAlgo = SKLEARN_TOL0_ALGO if zeroTolerance else SKLEARN_ALGO
    outputFile = datasetOutFile(datasetName, selectedAlgo, runinfo=runinfo)

    if os.path.exists(outputFile):
        print("sklearn {}skipped".format("Zero Tol. " if zeroTolerance else ""))
        return

    if zeroTolerance:
        builtModel = sklearn.mixture.GaussianMixture(n_components=clustersNumber, tol=0)
    else:
        builtModel = sklearn.mixture.GaussianMixture(n_components=clustersNumber)

    builtModel.fit(dataLessTarget)
    predictedLabels = builtModel.predict(dataLessTarget)

    with open(outputFile, 'w') as csvfile:
        filewriter = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
        for index, row in dataLessTarget.iterrows():
            filewriter.writerow([index, predictedLabels[index]])
def sklearnProcess(clustersNumber, dataLessTarget, datasetName, runinfo=None):
    # Local import
    import sklearn.cluster

    selectedAlgo = SKLEARN_ALGO
    outputFile = datasetOutFile(datasetName, selectedAlgo, runinfo=runinfo)

    if os.path.exists(outputFile):
        print("sklearn skipped")
        return

    # Derive the DBSCAN parameters from the run number embedded in runinfo
    i = re.fullmatch("[^0-9]*?([0-9]+)", runinfo)
    i = int(i.group(1))

    eps_value = 0.33 * i
    sample_value = i % 10
    if sample_value == 0:
        sample_value = 10

    builtModel = sklearn.cluster.DBSCAN(eps=eps_value, min_samples=sample_value)
    builtModel.fit(dataLessTarget)

    with open(outputFile, 'w') as csvfile:
        filewriter = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
        for index, row in dataLessTarget.iterrows():
            filewriter.writerow([index, builtModel.labels_[index]])
def matlabProcess(clustersNumber, dataLessTarget, datasetName, runinfo=None):
    outputFile = datasetOutFile(datasetName, MATLAB_ALGO, runinfo=runinfo)

    if os.path.exists(outputFile):
        print("matlab skipped")
        return

    tempFile = dumpDataOnCleanCsv(dataLessTarget)

    # Fit a Gaussian mixture model and assign each point to a component
    matlabCmd = "cluster(fitgmdist(csvread('{}'),{},'RegularizationValue',0.1), csvread('{}'))".format(
        tempFile, str(clustersNumber), tempFile)

    command_parts = [
        MATLAB_EXE, "-nodisplay", "-nosplash", "-nodesktop",
        "-r \"rng('shuffle'); {}idx = {}; disp(idx); exit;\" ".format(
            matlabRedirectTempFolder(TEMPFOLDER), matlabCmd)
    ]
    print(" ".join(command_parts))

    result = subprocess.run(command_parts, stdout=subprocess.PIPE)
    res = result.stdout

    # Parse the cluster index printed on each line of MATLAB's output
    i = 0
    resultFile = open(outputFile, 'w')
    for line in res.decode().split("\n"):
        matches = re.fullmatch(" ?([0-9]+)", line)
        if matches is not None:
            resultFile.write("{},{}\n".format(i, matches.group(1)))
            i += 1
    resultFile.close()

    os.unlink(tempFile)
def opencvProcess(clustersNumber, dataLessTarget, datasetName, runinfo=None, initialClusters=None):
    # Local import
    import cv2
    import numpy as np

    outputFile = datasetOutFile(datasetName, OPENCV_ALGO, runinfo=runinfo)
    clustersOutputFile = datasetOutFile(datasetName, centroidFor(OPENCV_ALGO), runinfo=runinfo)

    if os.path.exists(outputFile) and os.path.exists(clustersOutputFile):
        print("opencv skipped")
        return

    # Define criteria = (type, max_iter=100, epsilon=0.0)
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 100, 0.0)

    if initialClusters is None:
        ret = cv2.kmeans(np.float32(dataLessTarget.values), clustersNumber, None,
                         criteria, 10, flags=cv2.KMEANS_PP_CENTERS)
    else:
        print(initialClusters)
        ret = cv2.kmeans(np.float32(dataLessTarget.values), clustersNumber, initialClusters,
                         criteria, 10, flags=cv2.KMEANS_USE_INITIAL_LABELS)

    # ret = (compactness, labels, centers)
    with open(outputFile, 'w') as csvfile:
        filewriter = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
        for index, row in dataLessTarget.iterrows():
            filewriter.writerow([index, ret[1][index].item(0)])

    with open(clustersOutputFile, 'w') as clusterFile:
        filewriter = csv.writer(clusterFile, quoting=csv.QUOTE_MINIMAL)
        for row in ret[2]:
            filewriter.writerow(row.tolist())
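# Hedged, standalone illustration (not from the original code) of the cv2.kmeans
# call pattern opencvProcess relies on: samples must be float32, criteria is a
# (type, max_iter, epsilon) tuple, and the call returns (compactness, labels,
# centers). The toy points below are made up.
def _opencvKMeansUsageSketch():
    import cv2
    import numpy as np

    toyPoints = np.float32([[0.0, 0.0], [0.1, 0.2], [5.0, 5.0], [5.1, 4.9]])
    criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 100, 0.0)

    compactness, labels, centers = cv2.kmeans(
        toyPoints, 2, None, criteria, 10, flags=cv2.KMEANS_PP_CENTERS)

    print(labels.ravel())  # cluster index per point
    print(centers)         # one centroid per row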
def wekaProcess(inFile, datasetName, runinfo=None):
    outputFile = datasetOutFile(datasetName, WEKA_ALGO, runinfo=runinfo)
    clustersOutputFile = datasetOutFile(datasetName, centroidFor(WEKA_ALGO), runinfo=runinfo)

    if os.path.exists(outputFile) and os.path.exists(clustersOutputFile):
        print("weka skipped")
        return

    command_parts = [
        JAVA_EXE, "-classpath", JAVA_CLASSPATH, "WekaRun",
        inFile, outputFile, clustersOutputFile
    ]
    print(" ".join(command_parts))
    subprocess.call(command_parts)
def mlpackProcess(clustersNumber, dataLessTarget, datasetName, runinfo=None, initialClustersCsvFile=None):
    outputFile = datasetOutFile(datasetName, MLPACK_ALGO, runinfo=runinfo)
    clustersOutputFile = datasetOutFile(datasetName, centroidFor(MLPACK_ALGO), runinfo=runinfo)

    if os.path.exists(outputFile) and os.path.exists(clustersOutputFile):
        print("mlpack skipped")
        return

    tempFile = dumpDataOnCleanCsv(dataLessTarget)
    tempFile2 = "{}/{}b.csv".format(TEMPFOLDER, int(time.time()))

    # -a naive = Lloyd's algorithm
    if initialClustersCsvFile is not None:
        command_parts = [
            "{}/mlpack_kmeans".format(MLPACK_BIN),
            "--clusters", str(clustersNumber),
            "-i", tempFile,
            "-I", initialClustersCsvFile,
            "-a", "naive",
            "-o", tempFile2,
            "-C", clustersOutputFile
        ]
    else:
        command_parts = [
            "{}/mlpack_kmeans".format(MLPACK_BIN),
            "--clusters", str(clustersNumber),
            "-i", tempFile,
            "-o", tempFile2,
            "-a", "naive",
            "-C", clustersOutputFile
        ]

    print(" ".join(command_parts))
    subprocess.call(command_parts)

    # mlpack writes the assigned cluster as the last column of each row; keep only (index, cluster)
    with open(tempFile2, 'r') as csvfile:
        with open(outputFile, 'w') as resultFile:
            i = 0
            resultReader = csv.reader(csvfile)
            for row in resultReader:
                resultFile.write("{},{}\n".format(i, int(float(row[-1]))))
                i += 1

    os.unlink(tempFile)
    os.unlink(tempFile2)
def rProcess(srcFile, datasetName, runinfo=None, initialClustersCsvFile=None, hundredIters=False):
    # Local import
    import pandas

    if initialClustersCsvFile is None and hundredIters:
        print("The R kcca function does not have an iteration parameter!")
        return

    selectedAlgo = R_100ITER_ALGO if hundredIters else R_ALGO
    outputFile = datasetOutFile(datasetName, selectedAlgo, runinfo=runinfo)
    clustersOutputFile = datasetOutFile(datasetName, centroidFor(selectedAlgo), runinfo=runinfo)

    if os.path.exists(outputFile) and os.path.exists(clustersOutputFile):
        print("R skipped")
        return

    if initialClustersCsvFile is None:
        command_parts = [
            R_BIN, "--no-save", "--quiet",
            os.path.join(R_SCRIPT_BASE_DIR, "kmeans_test.R"),
            srcFile, outputFile, clustersOutputFile
        ]
    else:
        command_parts = [
            R_BIN, "--no-save", "--quiet",
            os.path.join(R_SCRIPT_BASE_DIR,
                         "kmeans_test_init_clusters_100it.R" if hundredIters
                         else "kmeans_test_init_clusters.R"),
            srcFile, outputFile, clustersOutputFile, initialClustersCsvFile
        ]

    print(" ".join(command_parts))
    subprocess.call(command_parts)

    # Strip the first column (row names) and the header from the centroid file written by R
    dta = pandas.read_csv(clustersOutputFile)
    dta.drop(dta.columns[[0]], axis=1).to_csv(clustersOutputFile, index=False, header=False)
def rProcess(srcFile, datasetName, runinfo=None):
    selectedAlgo = R_ALGO
    outputFile = datasetOutFile(datasetName, selectedAlgo, runinfo=runinfo)

    if os.path.exists(outputFile):
        print("R skipped")
        return

    command_parts = [
        R_BIN, "--no-save", "--quiet",
        os.path.join(R_SCRIPT_BASE_DIR, "hierarchical_test.R"),
        srcFile, outputFile
    ]
    subprocess.call(command_parts)
def wekaProcess(inFile, datasetName, runinfo=None):
    outputFile = datasetOutFile(datasetName, WEKA_ALGO, runinfo=runinfo)

    command_parts = [
        JAVA_EXE, "-Xmx100g", "-classpath", JAVA_CLASSPATH, "HierarchicalWekaRun",
        inFile, outputFile
    ]
    print(" ".join(command_parts))

    if os.path.exists(outputFile):
        print("weka skipped")
        return

    subprocess.call(command_parts)
def sklearnProcess(clustersNumber, dataLessTarget, datasetName, runinfo=None):
    # Local import
    import sklearn.cluster

    selectedAlgo = SKLEARN_ALGO
    outputFile = datasetOutFile(datasetName, selectedAlgo, runinfo=runinfo)

    if os.path.exists(outputFile):
        print("sklearn skipped")
        return

    builtModel = sklearn.cluster.AgglomerativeClustering(n_clusters=clustersNumber)
    builtModel.fit(dataLessTarget)

    with open(outputFile, 'w') as csvfile:
        filewriter = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
        for index, row in dataLessTarget.iterrows():
            filewriter.writerow([index, builtModel.labels_[index]])
def preprocessRun(self, groundTruthClustersId, data, dataLessTarget, datasetName, RUN_INFO):
    self.drawnInitialClustersFeatures = datasetOutFile(
        datasetName, "{}.init_set_clusters".format(RUN_INFO))

    if os.path.exists(self.drawnInitialClustersFeatures):
        print("Loading selected clusters")
        self.initialClusters = pandas.read_csv(
            self.drawnInitialClustersFeatures, header=None, dtype='float32').values
    else:
        # At this stage, we draw one random sample from EACH ground-truth cluster
        # (this will be the starting point for *ALL* algorithms).
        self.initialClusters = list()
        for i in groundTruthClustersId:
            found = False
            selectedSample = None
            '''
            In some datasets (e.g. titanic) the randomly drawn cluster centroids may be
            identical for two clusters. To avoid this, we redraw as long as there is a conflict.
            '''
            while not found:
                selectedSample = data[data.target == i].sample(1)
                selectedSample = selectedSample.loc[:, data.columns != 'target'].iloc[0].values

                found = True
                for anInitialClusterPreviouslyInserted in self.initialClusters:
                    if False not in (selectedSample == anInitialClusterPreviouslyInserted):
                        found = False
                        break

            self.initialClusters.append(selectedSample)

        self.initialClusters = numpy.asarray(self.initialClusters)

        print("Saving initial clusters for this project...")
        pandas.DataFrame(self.initialClusters).to_csv(
            path_or_buf=self.drawnInitialClustersFeatures, index=False, header=False)

        # Reread to get float32 type (required by TF)
        self.initialClusters = pandas.read_csv(
            self.drawnInitialClustersFeatures, header=None, dtype='float32').values
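# A small, self-contained sketch (toy data and invented column names, not part of
# the original class) of the redraw-until-unique logic preprocessRun uses to pick
# one starting centroid per ground-truth cluster, so no two clusters start from
# the same drawn point.
def _drawInitialCentroidsSketch():
    import numpy as np
    import pandas as pd

    data = pd.DataFrame({"f1": [0.0, 0.0, 5.0, 5.0],
                         "f2": [1.0, 1.0, 6.0, 6.0],
                         "target": [0, 0, 1, 1]})

    centroids = []
    for clusterId in data.target.unique():
        while True:
            candidate = (data[data.target == clusterId]
                         .sample(1)
                         .loc[:, data.columns != "target"]
                         .iloc[0].values)
            # Redraw if an identical centroid was already selected for another cluster
            if not any(np.array_equal(candidate, previous) for previous in centroids):
                centroids.append(candidate)
                break

    return np.asarray(centroids, dtype="float32")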
def sklearnProcess(clustersNumber, dataLessTarget, datasetName, runinfo=None):
    # Local import
    import sklearn.cluster

    selectedAlgo = SKLEARN_ALGO
    outputFile = datasetOutFile(datasetName, selectedAlgo, runinfo=runinfo)

    if os.path.exists(outputFile):
        print("sklearn skipped")
        return

    # Derive the damping factor from the run number embedded in runinfo
    i = re.fullmatch("[^0-9]*?([0-9]+)", runinfo)
    i = int(i.group(1))
    damping_value = 0.016 * i + 0.5

    builtModel = sklearn.cluster.AffinityPropagation(damping=damping_value)
    builtModel.fit(dataLessTarget)

    with open(outputFile, 'w') as csvfile:
        filewriter = csv.writer(csvfile, quoting=csv.QUOTE_MINIMAL)
        for index, row in dataLessTarget.iterrows():
            filewriter.writerow([index, builtModel.labels_[index]])
def matlabProcess(clustersNumber, dataLessTarget, datasetName, runinfo=None, initialClusters=None):
    outputFile = datasetOutFile(datasetName, MATLAB_ALGO, runinfo=runinfo)
    clustersOutputFile = datasetOutFile(datasetName, centroidFor(MATLAB_ALGO), runinfo=runinfo)

    if os.path.exists(outputFile) and os.path.exists(clustersOutputFile):
        print("matlab skipped")
        return

    tempFile = dumpDataOnCleanCsv(dataLessTarget)

    if initialClusters is None:
        matlabKmeanCommand = "kmeans(csvread('{}'), {})".format(tempFile, str(clustersNumber))
    else:
        # Serialise the initial centroids as a MATLAB matrix literal:
        # rows separated by ';', values separated by ','
        initialClustersMatlabStringMatrix = []
        for aClusterFeatures in initialClusters:
            initialClustersMatlabStringMatrix.append(
                ",".join(map(lambda x: str(x), aClusterFeatures.tolist())))
        initialClustersMatlabStringMatrix = ";".join(initialClustersMatlabStringMatrix)

        matlabKmeanCommand = "kmeans(csvread('{}'), {}, 'Start', [{}])".format(
            tempFile, str(clustersNumber), initialClustersMatlabStringMatrix)

    command_parts = [
        MATLAB_EXE, "-nodisplay", "-nosplash", "-nodesktop",
        "-r \"rng('shuffle'); {}[idx,C] = {}; disp(idx); disp('===C==='); disp(num2str(C)); exit;\" "
        .format(matlabRedirectTempFolder(TEMPFOLDER), matlabKmeanCommand)
    ]
    print(" ".join(command_parts))

    result = subprocess.run(command_parts, stdout=subprocess.PIPE)
    res = result.stdout

    # MATLAB prints the per-point cluster indices first, then the '===C===' marker,
    # then the centroid matrix; parse both into their respective output files.
    readingIdx = True
    i = 0
    resultFile = open(outputFile, 'w')
    for line in res.decode().split("\n"):
        if readingIdx and line == "===C===":
            resultFile.close()
            resultFile = open(clustersOutputFile, 'w')
            readingIdx = False
        else:
            if readingIdx:
                matches = re.fullmatch(" ?([0-9]+)", line)
                if matches is not None:
                    resultFile.write("{},{}\n".format(i, matches.group(1)))
                    i += 1
            else:
                if len(line.strip()) > 0:
                    resultFile.write(",".join(re.split(" +", line.strip())))
                    resultFile.write("\n")
    resultFile.close()

    os.unlink(tempFile)