def _construct_graph(self, n_neighbor=None, weight=False):
    """Build the symmetric kNN affinity matrix and cache it on ``self``.

    Calls ``self._preprocess_neighbors()`` so that ``self.neighbors`` and
    ``self.neighbors_weight`` are populated, turns the first ``n_neighbor``
    columns of every row into directed edges of a CSR matrix, symmetrizes
    it by adding its transpose, and finally resets every stored entry to
    1.0. Also refreshes ``self.train_y`` from ``self.get_train_label()``.

    Args:
        n_neighbor: number of neighbor columns used per instance. ``None``
            (the default) or a value larger than the number of columns in
            ``self.neighbors`` means "use every available column".
        weight: when true, the directed graph is initially filled from
            ``self.neighbors_weight`` instead of ones.
            NOTE(review): every stored entry is overwritten with 1.0 after
            symmetrization, so these weights do not survive into the
            returned matrix - confirm whether that is intended.

    Returns:
        The ``scipy.sparse.csr_matrix`` that was stored in
        ``self.affinity_matrix``.
    """
    # create neighbors buffer
    self._preprocess_neighbors()
    neighbors = self.neighbors
    instance_num = neighbors.shape[0]

    self.train_y = np.array(self.get_train_label())
    print("train_y", self.train_y.shape)

    # The original default (None) crashed on ``i * None``; an oversized value
    # made indptr disagree with the number of indices actually available.
    available = neighbors.shape[1]
    if n_neighbor is None or n_neighbor > available:
        n_neighbor = available

    # get knn graph in a csr form: row i owns the slice
    # [i * n_neighbor, (i + 1) * n_neighbor) of indices/data.
    indptr = np.arange(instance_num + 1) * n_neighbor
    logger.info("get indptr")
    indices = neighbors[:, :n_neighbor].reshape(-1)
    logger.info("get indices")
    if weight:
        data = self.neighbors_weight[:, :n_neighbor].reshape(-1)
    else:
        logger.info("get data")
        data = np.ones(indices.shape[0], dtype=float)
    logger.info("get data in connectivity")

    affinity_matrix = sparse.csr_matrix(
        (data, indices, indptr), shape=(instance_num, instance_num))
    # Symmetrize, then binarize: keep the sparsity pattern of A + A^T but
    # set every stored value to one.
    affinity_matrix = affinity_matrix + affinity_matrix.T
    affinity_matrix = sparse.csr_matrix(
        (np.ones(len(affinity_matrix.data)),
         affinity_matrix.indices,
         affinity_matrix.indptr),
        shape=(instance_num, instance_num))
    # Optional post-processing (modify_graph / correct_unconnected_nodes)
    # is currently disabled.

    logger.info("affinity_matrix construction finished!!")
    self.affinity_matrix = affinity_matrix
    return affinity_matrix
def get_graph(self, n_neighbor=None, rebuild=False):
    """Return a copy of the affinity matrix, building it first if needed.

    The graph is (re)built through ``self._construct_graph(n_neighbor)``
    when no matrix is cached or when ``rebuild`` is exactly ``True``.
    As a diagnostic, the number of connected components and the number of
    instances living in components without any labeled instance
    (label > -1) are logged.

    Args:
        n_neighbor: forwarded to ``_construct_graph`` when a build happens.
        rebuild: pass ``True`` to force reconstruction.

    Returns:
        A copy of ``self.affinity_matrix``, so callers may modify the
        result without touching the cached graph.
    """
    if self.affinity_matrix is None or rebuild is True:
        self._construct_graph(n_neighbor)

    n_components, labels = sparse.csgraph.connected_components(
        csgraph=self.affinity_matrix, return_labels=True)
    logger.info("n_components: {}".format(n_components))

    # Mark every component that owns at least one labeled instance, then
    # count the instances whose component was never marked. This replaces a
    # per-component scan over all instances with a single vectorized pass.
    train_y = np.asarray(self.get_train_label())
    component_has_label = np.zeros(n_components, dtype=bool)
    component_has_label[labels[train_y > -1]] = True
    unlabeled_instance_num = int(
        np.count_nonzero(~component_has_label[labels]))
    logger.info(
        "connected components without labeled data - instance num: {}".
        format(unlabeled_instance_num))
    return self.affinity_matrix.copy()
def correct_unconnected_nodes(self, affinity_matrix):
    """Add edges until ``_find_unconnected_nodes`` reports no node.

    Works on a copy of ``affinity_matrix``. In every round the nodes
    reported by ``self._find_unconnected_nodes(matrix, labeled_ids)`` are
    fetched; one of them is drawn at random and linked to the first of its
    neighbors (from ``self.get_neighbors(k_neighbors=100)``) that is not
    itself in the reported set. The process repeats until the reported set
    is empty.

    NOTE: this reseeds NumPy's global random generator with 123 on every
    call. Only the entry ``[corrected_id, neighbor_id]`` is written; the
    transposed entry is not set here.

    Args:
        affinity_matrix: sparse matrix supporting ``copy()`` and item
            assignment; it is not modified.

    Returns:
        The corrected copy. If none of the remaining reported nodes has a
        usable neighbor, a warning is logged and the partially corrected
        copy is returned (the previous implementation looped forever in
        that situation).
    """
    logger.info("begin correct unconnected nodes...")
    np.random.seed(123)
    affinity_matrix = affinity_matrix.copy()
    labeled_ids = np.where(self.get_train_label() > -1)[0]
    iter_cnt = 0
    neighbors = self.get_neighbors(k_neighbors=100)

    while True:
        unconnected_ids = self._find_unconnected_nodes(
            affinity_matrix, labeled_ids)
        if unconnected_ids.shape[0] == 0:
            logger.info(
                "No correcnted nodes after {} iteration. Correction finished."
                .format(iter_cnt))
            return affinity_matrix

        # Set lookup instead of an O(n) array scan per neighbor.
        unconnected = set(unconnected_ids.tolist())
        # Nodes drawn in this round whose neighbors are all unconnected too.
        dead_ends = set()
        linked = False
        while not linked:
            corrected_id = np.random.choice(unconnected_ids)
            for neighbor_id in neighbors[corrected_id]:
                if neighbor_id not in unconnected:
                    iter_cnt += 1
                    affinity_matrix[corrected_id, neighbor_id] = 1
                    linked = True
                    break
            if linked:
                break
            dead_ends.add(int(corrected_id))
            if len(dead_ends) == len(unconnected):
                # Every candidate has been tried without success; drawing
                # again can never make progress.
                logger.warning(
                    "{} unconnected nodes have no connected neighbor; "
                    "stopping correction after {} iteration.".format(
                        len(unconnected), iter_cnt))
                return affinity_matrix
def update_graph(self, deleted_idxs):
    """Shrink the affinity matrix to the instances that are still kept.

    Each index from ``self.get_rest_idxs()`` is translated to its rank
    within the sorted union of the kept indices and ``deleted_idxs``; those
    ranks are then used to select rows and columns of
    ``self.affinity_matrix``. This presumes the matrix is currently ordered
    by that sorted union (i.e. it still contains the deleted instances) -
    verify against the caller. Afterwards ``self._preprocess_neighbors()``
    is called to refresh the neighbor buffers.

    Args:
        deleted_idxs: indices that were just deleted. Any iterable of ints
            is accepted (list, tuple, ndarray); it must not overlap with
            the kept indices.

    Raises:
        AssertionError: if a deleted index is still among the kept ones.
    """
    logger.info("begin update graph according to editing info")
    rest_idxs = self.get_rest_idxs()
    remove_idxs = self.get_removed_idxs()

    # Coerce to plain lists so that ``+`` below is always concatenation;
    # with an ndarray it would silently become elementwise addition.
    rest_list = np.asarray(rest_idxs).tolist()
    deleted_list = list(deleted_idxs)
    overlap = set(rest_list).intersection(deleted_list)
    assert len(overlap) == 0, \
        "deleted idxs are still marked as kept: {}".format(sorted(overlap))

    # Position of every index in the matrix as it was before the deletion.
    last_rest_idxs = np.sort(rest_list + deleted_list)
    last_map = {idx: pos for pos, idx in enumerate(last_rest_idxs)}
    rest_idxs = [last_map[idx] for idx in rest_idxs]
    logger.info("total len: {}".format(len(rest_idxs) + len(remove_idxs)))

    self.affinity_matrix = self.affinity_matrix[rest_idxs, :]
    self.affinity_matrix = self.affinity_matrix[:, rest_idxs]

    # update neighbors info
    self._preprocess_neighbors()
    logger.info("affinity_matrix shape after updating: {}".format(
        str(self.affinity_matrix.shape)))
def _load_data(self):
    """Load the processed dataset and choose the training subset.

    Reads the pickled dataset at
    ``os.path.join(self.data_root, config.processed_dataname)`` and stores
    its fields on ``self`` (``X``, ``y``, the train/valid/test and
    labeled/unlabeled index arrays, ``class_names``, ``add_info``) and
    resets ``self.actions``.

    The working subset is identified by ``selected_labeled_num``,
    ``selected_total_num`` and ``seed``; when both counts are ``None`` the
    three values fall back to the defaults stored in ``add_info``. The
    subset lives in ``<data_root>/labeled-<n>.total-<m>.seed-<s>/``:

    * if ``idx_info.pkl`` exists there, ``train_idx`` and
      ``selected_labeled_idx`` are restored from it;
    * otherwise a new subset is sampled (class-balanced labeled instances
      plus random unlabeled ones) and written to ``idx_info.pkl``.

    In both cases ``self.rest_idxs`` is initialised to
    ``0 .. len(train_idx) - 1``.

    NOTE(review): ``self.seed`` only names the directory here; the
    ``np.random`` draws in this method are not seeded by it.

    Raises:
        AssertionError: if the labeled/total counts are still unknown after
            applying the defaults.
    """
    processed_data_filename = os.path.join(self.data_root,
                                           config.processed_dataname)
    processed_data = pickle_load_data(processed_data_filename)
    self.processed_data = processed_data
    self.X = processed_data[config.X_name]
    self.y = np.array(processed_data[config.y_name]).astype(int)
    if self.dataname.lower() == "oct":
        # wrong label
        self.y[564] = 3
    self.train_idx = processed_data[config.train_idx_name]
    self.valid_idx = processed_data[config.valid_idx_name]
    self.test_idx = processed_data[config.test_idx_name]
    self.labeled_idx = processed_data[config.labeled_idx_name]
    self.unlabeled_idx = processed_data[config.unlabeled_idx_name]
    self.class_names = processed_data[config.class_name]
    self.add_info = processed_data[config.add_info_name]
    self.actions = []

    if self.selected_labeled_num is None and self.selected_total_num is None:
        self.selected_labeled_num = self.add_info.get(
            "default_selected_labeled_num", None)
        self.selected_total_num = self.add_info.get(
            "default_selected_total_num", None)
        self.seed = self.add_info.get("default_seed", 123)

    # produce unlabeled data
    assert (self.selected_labeled_num is not None
            and self.selected_total_num is not None)
    dir_name = "labeled-" + str(self.selected_labeled_num) + \
        ".total-" + str(self.selected_total_num) + ".seed-" + str(self.seed)
    logger.info(dir_name)
    dir_path = os.path.join(self.data_root, dir_name)
    # presumably creates the directory when missing - verify check_dir
    check_dir(dir_path)
    self.selected_dir = dir_path

    idx_info_path = os.path.join(dir_path, "idx_info.pkl")
    if os.path.exists(idx_info_path):
        logger.info("idx info exists in: {}".format(idx_info_path))
        idx_info = pickle_load_data(idx_info_path)
        self.train_idx = idx_info["train_idx"]
        self.selected_labeled_idx = idx_info["selected_labeled_idx"]
        if self.dataname.lower() == "stl":
            # relabel: swap two hard-coded labeled instances for two others;
            # the added ones are placed first.
            removed_idx = [self.train_idx[39], self.train_idx[33]]
            added_idx = [self.train_idx[9081], self.train_idx[7427]]
            tmp_labeled_idx = added_idx
            for old_idx in self.selected_labeled_idx:
                if old_idx not in removed_idx:
                    tmp_labeled_idx.append(old_idx)
            self.selected_labeled_idx = np.array(tmp_labeled_idx)
        self.rest_idxs = np.arange(len(self.train_idx))
        return

    if len(self.labeled_idx) == self.selected_labeled_num:
        # Exactly as many labeled instances as requested: take them all.
        selected_labeled_idx = np.array(self.labeled_idx)
        selected_labeled_idx.sort()
    else:
        # class balance selection: every class gets the same quota and the
        # remainder is handed out to randomly chosen classes.
        class_num = len(self.class_names)
        num_per_class = self.selected_labeled_num // class_num
        selected_labeled_num_in_each_class = np.full(
            class_num, num_per_class, dtype=int)
        rest_num = self.selected_labeled_num - num_per_class * class_num
        if rest_num > 0:
            idx = np.random.choice(class_num, rest_num, replace=False)
            selected_labeled_num_in_each_class[idx] += 1

        selected_labeled_idx = []
        labeled_y = self.y[self.labeled_idx]
        for i in range(class_num):
            labeled_idx_in_this_class = self.labeled_idx[labeled_y == i]
            selected_labeled_idx_in_this_class = np.random.choice(
                labeled_idx_in_this_class,
                selected_labeled_num_in_each_class[i],
                replace=False)
            selected_labeled_idx.extend(
                selected_labeled_idx_in_this_class.tolist())
        selected_labeled_idx = np.array(selected_labeled_idx)
        selected_labeled_idx.sort()

    # get unlabeled idx: fill the subset up to selected_total_num
    rest_selected_labeled_num = \
        self.selected_total_num - self.selected_labeled_num
    rest_selected_labeled_idx = np.random.choice(self.unlabeled_idx,
                                                 rest_selected_labeled_num,
                                                 replace=False)
    train_idx = np.hstack((selected_labeled_idx, rest_selected_labeled_idx))
    train_idx.sort()
    self.train_idx = train_idx
    self.selected_labeled_idx = selected_labeled_idx
    # Previously only the cached path set rest_idxs, so the very first run
    # left it undefined while every later run had it.
    self.rest_idxs = np.arange(len(self.train_idx))

    idx_info = {
        "selected_labeled_idx": selected_labeled_idx,
        "train_idx": train_idx
    }
    pickle_save_data(idx_info_path, idx_info)
def _preprocess_neighbors(self, rebuild=False, save=True):
    """Load or compute the nearest-neighbor buffers for the current step.

    Cache files live in ``self.selected_dir`` and carry
    ``self.model.step`` in their names. When the cache is complete and
    neither ``rebuild`` nor the module-level ``DEBUG`` flag is set,
    ``self.neighbors``, ``self.neighbors_weight`` and
    ``self.test_neighbors`` are loaded from disk. Otherwise a
    ``NearestNeighbors`` model is fitted on the full training features,
    distance-mode kNN graphs are computed for the training and test
    features, and both are converted with ``self.csr_to_impact_matrix``.
    ``self.max_neighbors`` is clamped to the number of training instances.

    Args:
        rebuild: recompute even if cache files exist.
        save: write the fitted model and the computed arrays to disk.

    Returns:
        ``(self.neighbors, self.test_neighbors)`` on both the cached and
        the freshly computed path.
    """
    step_suffix = "-step" + str(self.model.step)
    neighbors_model_path = os.path.join(
        self.selected_dir, "neighbors_model" + step_suffix + ".pkl")
    neighbors_path = os.path.join(
        self.selected_dir, "neighbors" + step_suffix + ".npy")
    neighbors_weight_path = os.path.join(
        self.selected_dir, "neighbors_weight" + step_suffix + ".npy")
    test_neighbors_path = os.path.join(
        self.selected_dir, "test_neighbors" + step_suffix + ".npy")
    test_neighbors_weight_path = os.path.join(
        self.selected_dir, "test_neighbors_weight" + step_suffix + ".npy")

    # Every file that is loaded below must exist; the weight file used to be
    # loaded without being checked.
    required_files = (neighbors_model_path, neighbors_path,
                      neighbors_weight_path, test_neighbors_path)
    if not rebuild and not DEBUG and \
            all(os.path.exists(path) for path in required_files):
        logger.info("neighbors and neighbor_weight exist!!!")
        self.neighbors = np.load(neighbors_path)
        self.neighbors_weight = np.load(neighbors_weight_path)
        self.test_neighbors = np.load(test_neighbors_path)
        return self.neighbors, self.test_neighbors

    logger.info("neighbors and neighbor_weight "
                "do not exist, preprocessing!")
    train_X = self.get_full_train_X()
    train_num = train_X.shape[0]
    train_y = np.array(self.get_full_train_label())
    test_X = self.get_test_X()
    test_num = test_X.shape[0]
    # Cannot ask for more neighbors than there are training instances.
    self.max_neighbors = min(len(train_y), self.max_neighbors)
    logger.info("data shape: {}, labeled_num: {}".format(
        str(train_X.shape), np.count_nonzero(train_y != -1)))

    # n_neighbors must be passed by keyword on current scikit-learn. The
    # value 7 is only the model default; every query below passes
    # self.max_neighbors explicitly. n_jobs=-4 follows joblib's convention
    # for negative values (all CPUs but three).
    nn_fit = NearestNeighbors(n_neighbors=7, n_jobs=-4).fit(train_X)
    logger.info("nn construction finished!")
    # The training data is passed explicitly as the query set.
    neighbor_result = nn_fit.kneighbors_graph(
        train_X, n_neighbors=self.max_neighbors, mode="distance")
    test_neighbors_result = nn_fit.kneighbors_graph(
        test_X, n_neighbors=self.max_neighbors, mode="distance")
    logger.info("neighbor_result got!")

    self.neighbors, self.neighbors_weight = self.csr_to_impact_matrix(
        neighbor_result, train_num, self.max_neighbors)
    self.test_neighbors, test_neighbors_weight = self.csr_to_impact_matrix(
        test_neighbors_result, test_num, self.max_neighbors)
    logger.info("preprocessed neighbors got!")

    # save neighbors information
    if save:
        pickle_save_data(neighbors_model_path, nn_fit)
        np.save(neighbors_path, self.neighbors)
        np.save(neighbors_weight_path, self.neighbors_weight)
        np.save(test_neighbors_path, self.test_neighbors)
        np.save(test_neighbors_weight_path, test_neighbors_weight)
    return self.neighbors, self.test_neighbors
def remove_instance(self, idxs):
    """Remove the given instances from the set of kept instances.

    Does nothing when ``idxs`` is empty. Otherwise records a
    ``"remove-node"`` entry in ``self.actions``, drops every index in
    ``idxs`` from ``self.rest_idxs`` (order of the remaining indices is
    preserved), appends ``idxs`` to ``self.removed_idxs`` and logs the
    number of remaining instances.

    Args:
        idxs: sized collection of instance indices to remove.
    """
    if len(idxs) == 0:
        return
    self.actions.append("remove-node")
    rest_idxs = np.asarray(self.rest_idxs)
    # Boolean mask instead of a per-element ``in`` scan; it also keeps the
    # integer dtype when every remaining index is removed (building the
    # array from an empty list would yield float64).
    self.rest_idxs = rest_idxs[~np.isin(rest_idxs, list(idxs))]
    self.removed_idxs += idxs
    logger.info("rest data: {}".format(len(self.rest_idxs)))