示例#1
0
 def fit(self, X, y):
     """Fit an LLE embedding, cluster it and pair clusters with targets.

     A ``LocallyLinearEmbedding`` is fitted on the training data, KMeans is
     fitted on the embedded training points, and Munkres is run on a cost
     matrix built from the confusion matrix of ``y`` against the cluster
     labels to pair every cluster with a target.

     Sets ``self.model`` (fitted embedding), ``self.centroids`` (fitted
     KMeans) and ``self.mapping`` (cluster -> target).

     Args:
         X: Training samples.
         y: Target labels of the training samples.

     Returns:
         ``self``, so that calls can be chained.
     """
     # Creating a manifold on training data.
     self.model = LocallyLinearEmbedding(
         method=self.method,
         n_neighbors=self.n_neighbors,
         n_components=self.n_components,
         reg=self.reg,
         eigen_solver=self.eigen_solver,
         random_state=self.random_state).fit(X, y)

     # Embed the training data once; the result feeds both the KMeans fit
     # and the label prediction below.
     embedded = self.model.transform(X)

     # Determining centroids for the embedded points.
     self.centroids = KMeans(n_clusters=self.n_clusters,
                             random_state=self.random_state).fit(embedded)
     # Every point is assigned to a certain cluster.
     labels = self.centroids.predict(embedded)

     # Assigning each centroid to the correct target.
     cost_m = make_cost_matrix(confusion_matrix(y, labels))
     # (target, cluster) assignment pairs.
     # NOTE(review): these are row/column indices of the confusion matrix;
     # presumably targets are encoded as 0..n_clusters-1 - confirm.
     target_cluster = Munkres().compute(cost_m)

     # Saving the mapping for predictions.
     self.mapping = {cluster: target for target, cluster in target_cluster}
     return self
示例#2
0
    def evaluate(self, individual, X, y, random_state):
        """Score an individual by the quality of the reduced data it yields.

        ``X`` is reduced with ``self.reduce(individual, X)`` and the result is
        scored according to ``self.fitness_function``:

        * ``"silhouette"`` / ``"calinski_harabasz"``: the respective score of
          a KMeans clustering of the reduced data, or ``-1`` when all points
          fall into a single cluster.
        * ``"adjusted_rand_score"``: adjusted Rand score between ``y`` and the
          KMeans labels.
        * ``"kmeans"``: balanced accuracy after pairing KMeans clusters with
          targets via Munkres.
        * ``"nn"``: balanced accuracy of the most frequent class among the
          ``self.k`` nearest neighbours of every point.
        * ``"angles"``: balanced accuracy of the classes returned by
          ``check_in_which_slice`` for the polar angle of every point.

        Args:
            individual: Individual passed to ``self.reduce``.
            X: Samples to reduce.
            y: Target labels of ``X``.
            random_state: Not used by this method; KMeans is seeded from
                ``self.random_state``. Kept so existing callers keep working.

        Returns:
            The fitness value of ``individual``.

        Raises:
            ValueError: If ``self.fitness_function`` is not one of the names
                listed above.
        """
        fitness_function = self.fitness_function
        clustering_based = ("kmeans", "silhouette", "adjusted_rand_score",
                            "calinski_harabasz")
        # Fail fast: falling through would hand ``None`` back as a fitness.
        if fitness_function not in clustering_based + ("nn", "angles"):
            raise ValueError("Unknown fitness function: " +
                             repr(fitness_function))

        X_new = self.reduce(individual, X)

        if fitness_function in clustering_based:
            # Clustering of the reduced dataset; every point is assigned to
            # a certain cluster.
            labels = KMeans(n_clusters=self.n_clusters,
                            random_state=self.random_state).fit(X_new).labels_

            if fitness_function == "adjusted_rand_score":
                return adjusted_rand_score(y, labels)

            if fitness_function == "kmeans":
                cost_m = make_cost_matrix(confusion_matrix(y, labels))
                # (target, cluster) assignment pairs.
                target_cluster = Munkres().compute(cost_m)
                cluster_target = {
                    cluster: target
                    for target, cluster in target_cluster
                }
                y_pred = [cluster_target.get(label) for label in labels]
                return balanced_accuracy_score(y, y_pred)

            # Silhouette and Calinski-Harabasz: a single cluster gets the
            # fixed score -1 instead of being passed to the metric.
            if len(set(labels)) == 1:
                return -1
            if fitness_function == "silhouette":
                return silhouette_score(X_new, labels)
            return calinski_harabasz_score(X_new, labels)

        if fitness_function == "nn":
            # One extra neighbour is requested and the first column dropped,
            # because the class of the point itself is not taken into account.
            neighbors = NearestNeighbors(n_neighbors=self.k + 1).fit(X_new)
            nearest_neighbors = neighbors.kneighbors(
                X_new, return_distance=False)[:, 1:]
            # np.asarray lets list-like targets be indexed by the 2-D
            # neighbour index array as well.
            classes = np.asarray(y)[nearest_neighbors]
            y_pred = mode(classes, axis=1)[0].reshape(len(y), )
            return balanced_accuracy_score(y, y_pred)

        # fitness_function == "angles"
        points = np.asarray(X_new, dtype=float)
        angles = np.arctan2(points[:, 1], points[:, 0])
        # Mapping from (-pi, pi) to (0, 2*pi).
        angles = np.where(angles < 0, angles + 2 * np.pi, angles)
        y_pred = [
            check_in_which_slice(angle, self.n_clusters, self.slices)
            for angle in angles
        ]
        return balanced_accuracy_score(y, y_pred)
示例#3
0
    def fit(self, X, y, sample_weight=None, groups=None):
        """Learn features with the external GPMaL jar and cluster them.

        The training data is written to a ``-train`` file (one header line
        followed by the samples with the target as last column), the GPMaL
        Java program is run on it, the learned features are parsed from the
        tail of its standard output, and KMeans plus Munkres are used to pair
        clusters with targets.

        Sets ``self.dataset`` (reduced to its last ``/``-separated part),
        ``self.n_clusters``, ``self.centroids`` and ``self.mapping``.

        Args:
            X: Training samples; must expose ``shape``.
            y: Target labels of the training samples.
            sample_weight: Not used by this method.
            groups: Not used by this method.

        Returns:
            ``self``.

        Raises:
            subprocess.CalledProcessError: If the Java process exits with a
                non-zero status.
        """
        self.dataset = self.dataset.split("/")[-1]
        self.n_clusters = len(Counter(y))

        train_name = self.dataset + "." + str(self.random_state) + "-train"
        train_path = self.path + "/" + train_name

        data = pd.DataFrame(X)
        data['target'] = y
        # Header line first; the context manager closes the file even when
        # the write fails.
        with open(train_path, "w") as header_file:
            header_file.write("classLast," + str(data.shape[1] - 1) + "," +
                              str(self.n_components) + ',comma\n')
        # The samples are appended below the header line.
        data.to_csv(train_path,
                    header=None,
                    index=None,
                    mode='a',
                    sep=',')

        # NOTE(review): the file is written under self.path, but the Java
        # program is pointed at the current working directory; the two only
        # agree when self.path refers to the cwd - confirm.
        output = subprocess.check_output([
            'java', '-cp', 'GPMaL/gp-mal-eurogp-19-bin.jar',
            'featureLearn.RunGPMaL',
            'dataset=' + os.getcwd() + "/" + train_name,
            'numtrees=2', 'preprocessing=none',
            'logPrefix=' + self.dataset + '-train', 'treeDepth=8',
            'featureMin=0', 'featureMax=1', 'normalisePostCreation=false',
            'scalePostCreation=false', 'roundPostCreation=true',
            'featureLearnParamFile=GPMaL/flNeighboursFG.params', 'doNNs=false',
            'n_jobs=' + str(self.n_jobs),
            'random_state=' + str(self.random_state)
        ]).decode("utf-8")
        print(output)

        # The Java code returns many lines and the result sits at the end:
        # keep the X.shape[0] + 1 lines (header plus, presumably, one row per
        # sample) that precede the last two lines of the output.
        result_lines = output.split('\n')[-X.shape[0] - 3:-2]
        # Loading the data into a pandas DataFrame.
        X_new = pd.read_csv(StringIO('\n'.join(result_lines)), sep=',')

        X_new.drop('class', axis=1, inplace=True)
        # Drop the leading 'F<i>' columns so that n_components columns remain.
        surplus = [
            'F' + str(i) for i in range(X_new.shape[1] - self.n_components)
        ]
        X_new.drop(surplus, axis=1, inplace=True)
        print(X_new)

        self.centroids = KMeans(n_clusters=self.n_clusters,
                                random_state=self.random_state).fit(X_new)
        labels = self.centroids.predict(X_new)
        cost_m = make_cost_matrix(confusion_matrix(y, labels))
        # (target, cluster) assignment pairs.
        target_cluster = Munkres().compute(cost_m)
        self.mapping = {cluster: target for target, cluster in target_cluster}

        return self
示例#4
0
 def fit(self,X,y):
   """Embed the training data with t-SNE, cluster it and map clusters to targets.

   Sets ``self.X_train`` (the training samples), ``self.model`` (the
   embedded training points returned by ``TSNE.fit_transform``, not an
   estimator), ``self.centroids`` (fitted KMeans), ``self.labels`` (cluster
   label of every training point) and ``self.mapping`` (cluster -> target).

   Args:
     X: Training samples.
     y: Target labels of the training samples.

   Returns:
     ``self``, so that calls can be chained.
   """
   self.X_train = X
   # Creating a manifold on training data.
   self.model = TSNE(n_iter=self.n_iter,
                     n_components=self.n_components,
                     perplexity=self.perplexity).fit_transform(X, y)
   # Determining centroids for the embedded points.
   self.centroids = KMeans(n_clusters=self.n_clusters,
                           random_state=self.random_state).fit(self.model)
   # Every point is assigned to a certain cluster.
   self.labels = self.centroids.predict(self.model)
   # Assigning each centroid to the correct target.
   cost_m = make_cost_matrix(confusion_matrix(y, self.labels))
   # (target, cluster) assignment pairs.
   target_cluster = Munkres().compute(cost_m)
   # Saving the mapping for predictions.
   self.mapping = {cluster: target for target, cluster in target_cluster}
   return self
  def fit(self,X,y):
    """Evolve a feature-constructing individual and fit the predictors on it.

    A population created by the toolbox is evolved for ``self.n_iter``
    generations. In every generation the best ``self.pop_size`` individuals
    are kept and shuffled, adjacent pairs are crossed over with probability
    ``self.cxpb``, every individual is mutated with probability
    ``self.mutpb``, and offspring are added to the population unless one of
    their ``self.n_components`` trees is taller than
    ``self.max_tree_height``. Individuals without a valid fitness are then
    evaluated with ``self.evaluate``.

    The best individual of the final population is stored in ``self.model``
    and the data reduced by it is used to fit KMeans (``self.centroids``),
    the cluster -> target ``self.mapping`` and a 1-nearest-neighbour index
    (``self.neighbors``). Also sets ``self.toolbox``, ``self.n_clusters``,
    ``self.slices``, ``self.best_fitness``, ``self.y_train`` and the
    counters ``self.rejected``, ``self.cx_count`` and ``self.mut_count``.

    Args:
      X: Training samples; must expose ``shape``.
      y: Target labels of the training samples.

    Returns:
      ``self``, so that calls can be chained.
    """
    pset = create_pset(in_type=float, in_types_length=X.shape[1], out_type=float)
    self.toolbox = create_toolbox(weights=self.weights,
                                  pset=pset,
                                  min_tree_height=self.min_tree_height,
                                  max_tree_height=self.max_tree_height,
                                  n_components=self.n_components)

    self.toolbox.register("evaluate", self.evaluate, X=X, y=y, random_state=self.random_state)

    population = self.toolbox.population(self.pop_size)
    best_individuals = []
    self.n_clusters = len(Counter(y))
    # Equal angular slices of the full circle, one per cluster.
    self.slices = [(i*2*math.pi/self.n_clusters, (i+1)*2*math.pi/self.n_clusters) for i in range(self.n_clusters)]
    self.rejected = 0
    self.cx_count = 0
    self.mut_count = 0

    def admit(candidate, pool):
      # Append candidate to pool with its fitness invalidated, unless one of
      # its trees is too tall; a rejected candidate is counted once.
      too_tall = any(
        get_height(candidate[i]) > self.max_tree_height
        for i in range(self.n_components))
      if too_tall:
        self.rejected += 1
        return
      del candidate.fitness.values
      pool.append(candidate)

    # NOTE(review): this method does not evaluate the initial population
    # before the first selectBest call; presumably the toolbox copes with
    # individuals that have no fitness yet - confirm.
    for _generation in range(self.n_iter):
      population = self.toolbox.selectBest(population, self.pop_size)
      # Only the last entry of best_individuals is used after the loop.
      best_individuals.append(self.toolbox.selectBest(population, 1)[0])
      random.shuffle(population)

      # Crossover between adjacent pairs of the shuffled population.
      for parent1, parent2 in zip(population[::2], population[1::2]):
        if random.random() >= self.cxpb:
          continue
        self.cx_count += 1
        child1 = self.toolbox.clone(parent1)
        child2 = self.toolbox.clone(parent2)
        for i in range(self.n_components):
          self.toolbox.mate(child1[i], child2[i])
        admit(child1, population)
        admit(child2, population)

      # Mutation; iterate over a snapshot because mutants are appended to
      # the population inside the loop.
      for individual in population.copy():
        if random.random() >= self.mutpb:
          continue
        self.mut_count += 1
        mutant = self.toolbox.clone(individual)
        for i in range(self.n_components):
          self.toolbox.mutate(mutant[i])
        admit(mutant, population)

      # Evaluate every individual whose fitness is missing or invalidated.
      invalid_ind = [ind for ind in population if not ind.fitness.valid]
      fitnesses = [self.toolbox.evaluate(ind) for ind in invalid_ind]
      for ind, score in zip(invalid_ind, fitnesses):
        ind.fitness.values = (score,)

    best_individuals.append(self.toolbox.selectBest(population, 1)[0])
    self.best_fitness = best_individuals[-1].fitness.values[0]
    self.model = best_individuals[-1]

    X_new = self.reduce(self.model, X)
    self.centroids = KMeans(n_clusters=self.n_clusters, random_state=self.random_state).fit(X_new)
    labels = self.centroids.predict(X_new)
    cost_m = make_cost_matrix(confusion_matrix(y, labels))
    # (target, cluster) assignment pairs.
    target_cluster = Munkres().compute(cost_m)
    self.mapping = {cluster: target for target, cluster in target_cluster}

    # Nearest neighbors.
    self.neighbors = NearestNeighbors(n_neighbors=1).fit(X_new)
    self.y_train = y
    return self