def fit(self, X, y):
    """Fit the embedding, cluster the embedded data and map clusters to targets.

    A ``LocallyLinearEmbedding`` is fitted on ``X``; the embedded training
    points are clustered with ``KMeans``; finally each cluster is paired
    with one target through a Munkres assignment computed on the confusion
    matrix of ``y`` against the cluster labels.

    Args:
        X: Training samples handed to the embedding.
        y: Target of every training sample.

    Side effects:
        Sets ``self.model`` (fitted embedding), ``self.centroids`` (fitted
        KMeans) and ``self.mapping`` (cluster -> target, saved for
        predictions).
    """
    # Creating a manifold on training data.
    self.model = LocallyLinearEmbedding(
        method=self.method,
        n_neighbors=self.n_neighbors,
        n_components=self.n_components,
        reg=self.reg,
        eigen_solver=self.eigen_solver,
        random_state=self.random_state,
    ).fit(X, y)

    # Embed the training data once and reuse it both for fitting the
    # clustering and for labelling the points (previously computed twice).
    X_embedded = self.model.transform(X)

    # Determining centroids for the given points.
    self.centroids = KMeans(n_clusters=self.n_clusters,
                            random_state=self.random_state).fit(X_embedded)
    # Every point is assigned to a certain cluster.
    labels = self.centroids.predict(X_embedded)

    # Assigning each centroid to the correct cluster.
    # NOTE(review): the pairs below are confusion-matrix row/column
    # positions, so this presumably expects targets encoded as
    # 0..n_clusters-1 -- confirm against callers.
    confusion_m = confusion_matrix(y, labels)
    cost_m = make_cost_matrix(confusion_m)
    target_cluster = Munkres().compute(cost_m)  # (target, cluster) pairs.

    # Saving mapping for predictions.
    self.mapping = {
        cluster: target
        for target, cluster in dict(target_cluster).items()
    }
def evaluate(self, individual, X, y, random_state):
    """Score one individual with the fitness selected by ``self.fitness_function``.

    The data is first reduced with ``self.reduce(individual, X)``; the
    reduced data is then scored according to the configured fitness:

    * ``"silhouette"`` / ``"calinski_harabasz"``: the respective score of a
      KMeans clustering, or ``-1`` when all points share one cluster.
    * ``"adjusted_rand_score"``: adjusted Rand score of ``y`` against the
      KMeans labels.
    * ``"kmeans"``: balanced accuracy after pairing clusters with targets
      through a Munkres assignment.
    * ``"nn"``: balanced accuracy of a vote among the ``self.k`` nearest
      neighbours of each point.
    * ``"angles"``: balanced accuracy of the angular slice each point
      falls into.

    Args:
        individual: Individual passed to ``self.reduce``.
        X: Samples to reduce.
        y: Targets; indexed with an integer array in the ``"nn"`` branch.
        random_state: Accepted but not read here; ``self.random_state`` is
            used for KMeans.

    Returns:
        The fitness value, or ``None`` for an unrecognised fitness name.
    """
    X_new = self.reduce(individual, X)
    fitness_name = self.fitness_function

    if fitness_name == "nn":
        # One extra neighbour is requested because the class of the point
        # itself is not taken into account (first column is dropped).
        searcher = NearestNeighbors(n_neighbors=self.k + 1).fit(X_new)
        neighbour_idx = searcher.kneighbors(X_new,
                                            return_distance=False)[:, 1:]
        neighbour_classes = y[neighbour_idx]
        y_pred = mode(neighbour_classes, axis=1)[0].reshape(len(y))
        return balanced_accuracy_score(y, y_pred)

    if fitness_name == "angles":

        def polar_angle(row):
            return math.atan2(row[1], row[0])

        angles = np.apply_along_axis(polar_angle, axis=1, arr=X_new)
        # Mapping from (-pi,pi) to (0, 2*pi)
        is_negative = angles < 0
        is_positive = angles > 0
        angles = (2 * np.pi + angles) * is_negative + angles * is_positive
        y_pred = [
            check_in_which_slice(angle, self.n_clusters, self.slices)
            for angle in angles
        ]
        return balanced_accuracy_score(y, y_pred)

    clustering_fitnesses = ("kmeans", "silhouette", "adjusted_rand_score",
                            "calinski_harabasz")
    if fitness_name not in clustering_fitnesses:
        return None

    # Clustering of the reduced dataset.
    kmeans = KMeans(n_clusters=self.n_clusters,
                    random_state=self.random_state).fit(X_new)
    # Every point is assigned to a certain cluster.
    labels = kmeans.labels_

    if fitness_name == "adjusted_rand_score":
        return adjusted_rand_score(y, labels)

    if fitness_name == "kmeans":
        # NOTE(review): pairs are confusion-matrix positions; presumably
        # targets are encoded as 0..n_clusters-1 -- confirm.
        cost_m = make_cost_matrix(confusion_matrix(y, labels))
        assignment = Munkres().compute(cost_m)  # (target, cluster) pairs.
        target_to_cluster = dict(assignment)
        cluster_to_target = {
            cluster: target
            for target, cluster in target_to_cluster.items()
        }
        y_pred = [cluster_to_target.get(label) for label in labels]
        return balanced_accuracy_score(y, y_pred)

    # Remaining fitnesses: "silhouette" and "calinski_harabasz". When all
    # points share one cluster no score is computed and -1 is returned.
    if len(set(labels)) == 1:
        return -1
    if fitness_name == "silhouette":
        return silhouette_score(X_new, labels)
    return calinski_harabasz_score(X_new, labels)
def fit(self, X, y, sample_weight=None, groups=None):
    """Learn features with the external GP-MaL Java tool, then cluster them.

    The training data is written to a ``<dataset>.<random_state>-train``
    file, the Java program is run on it, the constructed features are
    parsed from the tail of its output, and a KMeans clustering of those
    features is paired with the targets through a Munkres assignment.

    Args:
        X: Training samples; ``X.shape[0]`` is used to locate the result
            rows in the Java output.
        y: Target of every training sample.
        sample_weight: Accepted but not used.
        groups: Accepted but not used.

    Returns:
        ``self``.

    Raises:
        subprocess.CalledProcessError: If the Java process exits non-zero
            (raised by ``subprocess.check_output``).

    Side effects:
        Rewrites ``self.dataset`` to its last ``/``-separated component,
        sets ``self.n_clusters``, ``self.centroids`` and ``self.mapping``,
        writes the training file and prints the Java output and the
        parsed features.
    """
    self.dataset = self.dataset.split("/")[-1]
    self.n_clusters = len(Counter(y))

    data = pd.DataFrame(X)
    data['target'] = y

    train_name = self.dataset + "." + str(self.random_state) + "-train"
    train_path = self.path + "/" + train_name

    # Header line first; the context manager closes the handle even if the
    # write fails. The data rows are then appended by pandas.
    with open(train_path, "w") as handle:
        handle.write("classLast," + str(data.shape[1] - 1) + "," +
                     str(self.n_components) + ',comma\n')
    data.to_csv(train_path, header=None, index=None, mode='a', sep=',')

    # NOTE(review): the file is written under self.path but the Java tool
    # is pointed at os.getcwd(); these agree only when self.path is the
    # working directory -- confirm.
    z = subprocess.check_output([
        'java', '-cp', 'GPMaL/gp-mal-eurogp-19-bin.jar',
        'featureLearn.RunGPMaL',
        'dataset=' + os.getcwd() + "/" + train_name,
        'numtrees=2', 'preprocessing=none',
        'logPrefix=' + self.dataset + '-train', 'treeDepth=8',
        'featureMin=0', 'featureMax=1', 'normalisePostCreation=false',
        'scalePostCreation=false', 'roundPostCreation=true',
        'featureLearnParamFile=GPMaL/flNeighboursFG.params', 'doNNs=false',
        'n_jobs=' + str(self.n_jobs),
        'random_state=' + str(self.random_state)
    ]).decode("utf-8")
    print(str(z))

    # The Java code returns many lines and the result is at the end: keep
    # the header plus X.shape[0] rows, skipping the two trailing lines.
    result_lines = z.split('\n')[-X.shape[0] - 3:-2]

    # Loading the data into a pandas DataFrame.
    X_new = pd.read_csv(StringIO('\n'.join(result_lines)), sep=',')
    X_new.drop('class', axis=1, inplace=True)

    # Drop the leading F0..F(k-1) columns so that only the last
    # n_components columns remain.
    n_leading = X_new.shape[1] - self.n_components
    leading_columns = ['F' + str(i) for i in range(0, n_leading)]
    X_new.drop(leading_columns, axis=1, inplace=True)
    print(X_new)

    self.centroids = KMeans(n_clusters=self.n_clusters,
                            random_state=self.random_state).fit(X_new)
    labels = self.centroids.predict(X_new)

    # NOTE(review): pairs are confusion-matrix positions; presumably
    # targets are encoded as 0..n_clusters-1 -- confirm.
    confusion_m = confusion_matrix(y, labels)
    cost_m = make_cost_matrix(confusion_m)
    target_cluster = Munkres().compute(cost_m)  # (target, cluster) pairs.
    self.mapping = {
        cluster: target
        for target, cluster in dict(target_cluster).items()
    }
    return self
def fit(self,X,y):
    """Embed the training data with t-SNE, cluster it and map clusters to targets.

    Args:
        X: Training samples; also stored unchanged as ``self.X_train``.
        y: Target of every training sample.

    Side effects:
        Sets ``self.X_train``, ``self.model`` (the embedded training
        points returned by ``fit_transform``, not an estimator),
        ``self.centroids`` (fitted KMeans), ``self.labels`` (cluster of
        every training point) and ``self.mapping`` (cluster -> target).
    """
    self.X_train = X

    # Creating a manifold on training data.
    tsne = TSNE(n_iter=self.n_iter,
                n_components=self.n_components,
                perplexity=self.perplexity)
    self.model = tsne.fit_transform(X, y)

    # Determining centroids for given classes.
    kmeans = KMeans(n_clusters=self.n_clusters,
                    random_state=self.random_state)
    self.centroids = kmeans.fit(self.model)
    # Every point is assigned to a certain cluster.
    self.labels = self.centroids.predict(self.model)

    # Assigning each centroid to the correct cluster.
    # NOTE(review): pairs are confusion-matrix positions; presumably
    # targets are encoded as 0..n_clusters-1 -- confirm.
    cost_m = make_cost_matrix(confusion_matrix(y, self.labels))
    assignment = Munkres().compute(cost_m)  # (target, cluster) pairs.
    target_to_cluster = dict(assignment)

    # Saving mapping for predictions.
    self.mapping = {
        cluster: target
        for target, cluster in target_to_cluster.items()
    }
def fit(self,X,y):
    """Evolve a feature-constructing individual, then cluster its output.

    A population is evolved for ``self.n_iter`` generations: each
    generation keeps the best ``self.pop_size`` individuals, adds
    crossover children and mutants whose trees respect
    ``self.max_tree_height``, and evaluates every individual without a
    valid fitness. The best final individual becomes ``self.model``; the
    data reduced by it is clustered with KMeans and the clusters are
    paired with targets through a Munkres assignment.

    Args:
        X: Training samples; ``X.shape[1]`` sets the primitive-set arity.
        y: Target of every training sample.

    Side effects:
        Sets ``self.toolbox``, ``self.n_clusters``, ``self.slices``, the
        counters ``self.rejected`` / ``self.cx_count`` / ``self.mut_count``,
        ``self.best_fitness``, ``self.model``, ``self.centroids``,
        ``self.mapping``, ``self.neighbors`` and ``self.y_train``.
    """
    pset = create_pset(in_type=float,
                       in_types_length=X.shape[1],
                       out_type=float)
    self.toolbox = create_toolbox(weights=self.weights,
                                  pset=pset,
                                  min_tree_height=self.min_tree_height,
                                  max_tree_height=self.max_tree_height,
                                  n_components=self.n_components)
    toolbox = self.toolbox
    toolbox.register("evaluate",
                     self.evaluate,
                     X=X,
                     y=y,
                     random_state=self.random_state)

    population = toolbox.population(self.pop_size)
    best_individuals = []

    self.n_clusters = len(Counter(y))
    # Equal angular slices covering [0, 2*pi), one per cluster.
    self.slices = [(k*2*math.pi/self.n_clusters,
                    (k+1)*2*math.pi/self.n_clusters)
                   for k in range(self.n_clusters)]
    self.rejected = 0
    self.cx_count = 0
    self.mut_count = 0

    def too_tall(candidate):
        # True (and counted once in self.rejected) as soon as one of the
        # candidate's trees exceeds the height limit.
        for tree_index in range(self.n_components):
            if get_height(candidate[tree_index]) > self.max_tree_height:
                self.rejected += 1
                return True
        return False

    def admit(candidate, pool):
        # Accepted candidates lose their stale fitness and join the pool.
        if too_tall(candidate):
            return
        del candidate.fitness.values
        pool.append(candidate)

    for _ in range(self.n_iter):
        population = toolbox.selectBest(population, self.pop_size)
        best_individuals.append(toolbox.selectBest(population, 1)[0])

        # Crossover between consecutive pairs of the shuffled population.
        random.shuffle(population)
        evens = population[::2]
        odds = population[1::2]
        for parent1, parent2 in zip(evens, odds):
            if not random.random() < self.cxpb:
                continue
            self.cx_count += 1
            child1 = toolbox.clone(parent1)
            child2 = toolbox.clone(parent2)
            for tree_index in range(self.n_components):
                toolbox.mate(child1[tree_index], child2[tree_index])
            admit(child1, population)
            admit(child2, population)

        # Mutation over a snapshot, so new mutants are not revisited.
        for individual in population.copy():
            if not random.random() < self.mutpb:
                continue
            self.mut_count += 1
            mutant = toolbox.clone(individual)
            for tree_index in range(self.n_components):
                toolbox.mutate(mutant[tree_index])
            admit(mutant, population)

        # Evaluate everything that has no valid fitness yet.
        pending = [member for member in population
                   if not member.fitness.valid]
        scores = [toolbox.evaluate(member) for member in pending]
        for member, score in zip(pending, scores):
            member.fitness.values = (score,)

    best_individuals.append(toolbox.selectBest(population, 1)[0])
    champion = best_individuals[-1]
    self.best_fitness = champion.fitness.values[0]
    self.model = champion

    X_new = self.reduce(self.model, X)
    kmeans = KMeans(n_clusters=self.n_clusters,
                    random_state=self.random_state)
    self.centroids = kmeans.fit(X_new)
    labels = self.centroids.predict(X_new)

    # NOTE(review): pairs are confusion-matrix positions; presumably
    # targets are encoded as 0..n_clusters-1 -- confirm.
    cost_m = make_cost_matrix(confusion_matrix(y, labels))
    assignment = Munkres().compute(cost_m)  # (target, cluster) pairs.
    target_to_cluster = dict(assignment)
    self.mapping = {
        cluster: target
        for target, cluster in target_to_cluster.items()
    }

    # Nearest neighbors.
    self.neighbors = NearestNeighbors(n_neighbors=1).fit(X_new)
    self.y_train = y