def main():
    """Train the embedding net on concentric-circle data, then spectrally
    cluster the learned affinity matrix and plot the 2-D spectral embedding.
    """
    # Training settings
    batch_size = 10
    num_epochs = 1
    learning_rate = 0.1
    log_interval = 2
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    transform = ToTensor()
    # Three circle datasets with (n_points, radius) pairs; k = cluster count.
    Circles = [Circle(n, r, transform=transform)
               for n, r in zip((2, 3, 5), (1, 2, 3))]
    k = len(Circles)
    train_dataset = ConcatDataset(Circles)
    train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

    model = Net()
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = StepLR(optimizer, step_size=1, gamma=0.1)
    for epoch in range(num_epochs):
        train(model, device, train_loader, optimizer, epoch, k, log_interval)
        scheduler.step()

    # BUG FIX: the original ran eval() twice and discarded the first result;
    # a single pass is enough.
    N = eval(model, device, train_loader)
    L = laplacian(N)
    # torch.symeig was removed in torch >= 1.13; torch.linalg.eigh returns
    # eigenvalues/eigenvectors in the same ascending order.
    eigval, eigvec = torch.linalg.eigh(L)
    # Drop the trivial first eigenvector; keep the next two as the embedding.
    EigMat = eigvec[:, 1:3]
    cluster_ids_EigMat_new, cluster_centers = kmeans(EigMat, num_clusters=3,
                                                     distance='euclidean',
                                                     device=device)

    # plot the spectral embedding coloured by cluster, centroids in white
    plt.figure(figsize=(4, 3), dpi=160)
    plt.scatter(EigMat[:, 0], EigMat[:, 1], c=cluster_ids_EigMat_new, cmap='cool')
    plt.scatter(
        cluster_centers[:, 0], cluster_centers[:, 1],
        c='white', alpha=0.6, edgecolors='black', linewidths=2
    )
    plt.axis([-1, 1, -1, 1])
    plt.tight_layout()
    plt.show()
def forward(self, qk):
    """Build a block-diagonal additive attention bias from k-means clusters.

    qk: (batch, seq_len, dim) tensor; it is detached, so no gradients flow
    through the clustering. Returns a (batch, n_heads, seq_len, seq_len)
    float32 tensor that is 0 where query and key fall in the same cluster
    and -10000 elsewhere.
    """
    qk = qk.detach().to(torch.float16)
    batch_size, seq_len, dim, device = *qk.shape, qk.device  # e.g. [6, 12, 384, 64]
    # Random orthogonal projection before clustering.
    W_R = torch.empty(batch_size, dim, dim, device=device)
    nn.init.orthogonal_(W_R)
    W_R = W_R.to(torch.float16)
    R = torch.matmul(qk, W_R).reshape(-1, dim)
    K = int(seq_len ** 0.5)  # number of clusters ~ sqrt(sequence length)
    # BUG FIX: removed a leftover `import pdb; pdb.set_trace()` breakpoint
    # that halted every forward pass.
    cluster_idx, centroid = kmeans(X=R, num_clusters=K, distance='cosine',
                                   device=device)
    cluster_idx = cluster_idx.reshape(batch_size, seq_len).unsqueeze(1).expand(-1, self.n_heads, -1)
    result = torch.zeros(batch_size, self.n_heads, seq_len, seq_len, device=device)
    # r1 broadcasts cluster ids along rows, r2 along columns; equality gives
    # the same-cluster mask.
    r1 = result.to(torch.long) + cluster_idx.unsqueeze(-1).to(device)  # [0, 0, 0, 0, ...]
    r2 = result.to(torch.long) + cluster_idx.unsqueeze(-2).to(device)  # [0, 1, 2, 3, ...]
    result = (r1 == r2).to(torch.float32)
    # 0 for same-cluster pairs, -10000 otherwise (additive attention mask).
    result = 10000. * result - 10000.
    return result.detach()
def attn_mask(self, x):
    """Binarise the attention map `x` into a 2-cluster mask of x's shape."""
    h, w = x.shape[2], x.shape[3]
    # One scalar per spatial location.
    flat = x.view(h * w, 1)
    # Tiny Gaussian jitter keeps k-means from degenerating on flat maps.
    jitter = torch.randn(flat.shape, device='cuda') * 0.0001
    labels, _ = kmeans(flat + jitter, num_clusters=2,
                       device=torch.device('cuda:0'))
    return labels.view(x.shape).type(torch.cuda.FloatTensor)
def update_kmeans(class_data, batch_size, sample_size, pretrained):
    """Pick `sample_size` exemplar indices for a class: k-means the class
    features into `sample_size` clusters, then return the index of the
    feature closest to each centroid."""
    feats = get_features(class_data, batch_size, sample_size, pretrained)
    device = torch.device('cuda:' + str(args.gpu_id))
    _, centroids = kmeans(X=feats, num_clusters=sample_size,
                          distance='euclidean', device=device)
    centroids = centroids.cuda()
    # Row i holds distances from centroid i to every feature.
    distances = dist_matrix(centroids, feats)
    return torch.argmin(distances, dim=1)
def coexist_multiclass(to_class):
    """Cluster the wide_total image dump into `to_class` groups, chunk by
    chunk, and copy each image into a per-cluster folder.

    The file list is split into 50 roughly equal chunks; only chunks 24..49
    are processed here (earlier chunks presumably done in a prior run —
    TODO confirm).
    """
    path = '/home/ubuntu/data/Workspace/Soobin/wide_total/'
    path = glob.glob(path + '*')
    tot_cnt = len(path)
    print(tot_cnt)
    length = int(tot_cnt / 50)  # chunk size; last chunk absorbs the remainder
    for attempt in range(24, 50):
        csv = []  # NOTE(review): only used by the commented-out CSV export below
        if attempt == 49:
            st_idx = length * attempt
            end_idx = tot_cnt
        else:
            st_idx = length * attempt
            end_idx = length * (attempt + 1)
        print(st_idx, " ", end_idx)
        # One row per image: flattened |FFT| of the 200x100x3 resized image.
        transformation = np.zeros((end_idx - st_idx, 20000 * 3), dtype='float32')
        for i, idx in enumerate(list(range(st_idx, end_idx))):
            im = cv2.imread(path[idx])
            im = np.array(cv2.resize(im, dsize=(200, 100)), dtype='float32')
            # h, edges = np.histogramdd(im.reshape(-1,3),8,normed=True,range=[(0,255),(0,255),(0,255)])
            transform = np.fft.fft2(im)
            transformation[i] = np.abs(transform.flatten())
        res = torch.from_numpy(transformation)
        print("start clustering")
        labels, _ = kmeans(X=res, num_clusters=to_class, distance='euclidean',
                           device=torch.device('cuda:1'))
        cluster_map = pandas.DataFrame()
        cluster_map['cluster'] = labels
        # Copy every image of this chunk into attempt<k>/img<cluster>/.
        for i, idx in enumerate(list(range(st_idx, end_idx))):
            im_path = path[idx]
            im = cv2.imread(im_path)
            name = path[idx].split('/')[7]  # filename component of the absolute path
            cluster = cluster_map['cluster'][i]
            savepath = '/home/ubuntu/data/Workspace/Soobin/attempt' + str(
                attempt) + '/img'
            savepath = savepath + str(cluster) + '/'
            os.makedirs(savepath, exist_ok=True)
            cv2.imwrite(savepath + name, im)
            # item = []
            # item.append(path)
            # item.append(name)
            # item.append(str(cluster))
            # csv.append(item)
        # csv_file = pandas.DataFrame(csv, columns=['file_path', 'file_name','cluster'])
        # csv_file.to_csv('/home/ubuntu/data/Workspace/Soobin/attempt%d/'%(attempt)+'cluster.csv', index=False, encoding='cp949')
    return
def main():
    """Sweep k-means over reduced MNIST for centroid counts 2..30 (step 2),
    average the SSE over several attempts per count, and print the per-count
    mean SSE."""
    batch_train = 60000  # whole training set in one batch
    batch_test = 1000
    attempt_kmeans = 3  # number of k-means attempt
    centers = range(2, 31, 2)  # number of centroids to be tested
    mnist_root = '../../../data'
    # CUDA
    cuda_flag = torch.cuda.is_available()
    device = torch.device("cuda" if cuda_flag else "cpu")
    device_cpu = torch.device("cpu")
    dataloader_kwargs = {'pin_memory': True} if cuda_flag else {}
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # dataset
    train_dataset = Reduced_MNIST(root=mnist_root, train=True)
    test_dataset = Reduced_MNIST(root=mnist_root, train=False)
    train_loader = DataLoader(train_dataset, batch_size=batch_train, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_test, shuffle=True)
    data_train, index_train = next(iter(train_loader))
    # plt.imshow((data_train * 0.3081 + 0.1307).view(data_train.__len__(), 28, 28).detach().numpy()[0], cmap='gray')
    print(index_train)
    data_test, index_test = next(iter(test_loader))
    # plt.imshow((data_test * 0.3081 + 0.1307).view(data_test.__len__(), 28, 28).detach().numpy()[0], cmap='gray')
    print(index_test)
    sse_all = []  # mean SSE per centroid count
    sse_temp = np.zeros(attempt_kmeans)  # per-attempt SSE at the current count
    # execute k-means clustering
    with torch.no_grad():
        data = data_train  # reverse normalization to get original MNIST
        for i_centers in centers:
            for i_loop in range(attempt_kmeans):
                cluster_index, cluster_centers = kmeans(X=data,
                                                        num_clusters=i_centers,
                                                        distance='euclidean',
                                                        device=device)
                sse = eval_kmeans(data, cluster_index, cluster_centers, device)
                print(
                    str(cluster_centers.shape[0]) + " centers, sse:" +
                    "{:.4f}".format(sse))
                sse_temp[i_loop] = sse
            sse_all.append(sse_temp.mean())
            sse_temp = np.zeros(attempt_kmeans)
    # Decimal commas, e.g. for a German-locale spreadsheet.
    print(str([round(num, 3) for num in sse_all]).replace(".", ","))
def update_kmeans(class_data, batch_size, sample_size, pretrained):
    """Select `sample_size` representative indices for a class: k-means the
    class features into `sample_size` clusters, then for each centroid take
    the index of its nearest feature vector.

    NOTE(review): the device 'cuda:3' is hard-coded here while a sibling
    variant uses args.gpu_id — confirm which is intended.
    """
    class_features = get_features(class_data, batch_size, sample_size, pretrained)
    _, center_features = kmeans(X=class_features, num_clusters=sample_size,
                                distance='euclidean',
                                device=torch.device('cuda:3'))
    center_features = center_features.cuda()
    '''
    print('device of class features: %d'%class_features.get_device())
    print('device of center features: %d'%center_features.get_device())
    print(center_features.shape)
    '''
    # dist[i, j] = distance from centroid i to feature j; argmin picks the
    # closest feature per centroid.
    dist = dist_matrix(center_features, class_features)
    inds = torch.argmin(dist, dim=1)
    return inds
def get_clusters_from_latent(args, g_ema, device, mean_latent, t_dict_list, yaml_config, layer_channel_dims, latent, noise):
    """Generate activations for one latent, cluster each layer's per-channel
    feature vectors, and dump {layer: [{feature_index, cluster_index}]} to
    cluster_dict.yaml."""
    print("get clusters")
    with torch.no_grad():
        g_ema.eval()
        # Use only the first latent row, with the batch dimension restored.
        # (Also fixes the original `slce_latent` typo.)
        slice_latent = latent[0, :].unsqueeze(0)
        print(slice_latent.size())
        # BUG FIX: the original passed the undefined name `noises`; the
        # function parameter is `noise`, so this raised NameError.
        sample, activation_maps = g_ema([slice_latent], input_is_latent=True,
                                        noise=noise,
                                        transform_dict_list=t_dict_list,
                                        return_activation_maps=True)
        print(len(activation_maps))
        feature_cluster_dict = {}
        for index, activations in enumerate(activation_maps):
            true_index = index + 1  # per-layer checkpoints are 1-indexed
            classifier = FeatureClassifier(true_index)
            classifier_str = args.classifier_ckpts + "/" + str(
                true_index) + "/classifier" + str(true_index) + "_final.pt"
            classifier_state_dict = torch.load(classifier_str)
            classifier.load_state_dict(classifier_state_dict)
            classifier.to(device)
            layer_activation_maps = activation_maps[index]
            # One (N, 1, H, W) map per channel.
            a_map_array = list(torch.split(layer_activation_maps, 1, 1))
            dict_list = []
            latent_list = []
            for i, map in enumerate(a_map_array):
                map = map.to(device)
                feat_vec, class_prob = classifier(map)
                latent_list.append(feat_vec)
            # NOTE(review): cluster_layer_dict is assumed to be a module-level
            # mapping of layer index -> cluster count — confirm it is defined.
            cluster_ids_x, cluster_centers = kmeans(
                X=torch.stack(latent_list),
                num_clusters=cluster_layer_dict[true_index],
                distance='euclidean',
                device=torch.device('cuda'))
            for i, id in enumerate(cluster_ids_x):
                cluster_dict = {
                    "feature_index": int(i),
                    "cluster_index": int(id)
                }
                dict_list.append(cluster_dict)
            feature_cluster_dict[true_index] = dict_list
        with open(r'cluster_dict.yaml', 'w') as file:
            documents = yaml.dump(feature_cluster_dict, file)
def get_cluster_idx(self, i_node, clustering_ratio):
    """Return the indices of rows to prune from this node's layer.

    The layer weights are SVD-projected, cosine-clustered into
    n - int(clustering_ratio * n) groups, and the row nearest each cluster
    centre is kept; all remaining indices are returned for removal.
    Returns [] when clustering_ratio <= 0 (nothing to prune).
    """
    weights = i_node['layer'].weight.clone()
    if clustering_ratio <= 0:
        return []
    n = len(weights)
    out_channels = weights.size()[0]
    weights = weights.view(-1, out_channels)
    # Project rows into the singular-vector basis scaled by singular values.
    i_u, i_s, i_vh = torch.svd(weights)
    n_to_cluster = n - int(clustering_ratio * n)
    i_sv = torch.matmul(i_vh, torch.diag(i_s))
    # kmeans (cosine distance) over the projected rows.
    cluster_ids_x, cluster_centers = kmeans(X=i_sv,
                                            num_clusters=n_to_cluster,
                                            distance='cosine')
    # For each cluster, track the row closest to its centre.
    # FIX: group_to_id now holds ints (it is used as a set of indices below);
    # the original float array relied on float/int hash equality.
    group_to_mindist = 1e6 * np.ones(n_to_cluster)
    group_to_id = np.ones(n_to_cluster, dtype=int)
    for idx in range(len(cluster_ids_x)):
        i_cluster = cluster_ids_x[idx]
        i_center = cluster_centers[i_cluster, :]
        i_val = i_sv[idx, :]
        dist = torch.dist(i_val, i_center)
        if group_to_mindist[i_cluster] > dist:
            group_to_mindist[i_cluster] = dist
            group_to_id[i_cluster] = idx
    # Keep one representative per cluster; everything else is prunable.
    indices = {x for x in range(n)} - set(group_to_id)
    return list(indices)
def tot_dat():
    """Cluster 20 batches of topcon features into 5 groups each, write the
    images per cluster, and dump one cluster CSV per attempt."""
    # BUG FIX: `csv` was assigned inside the loop, making it function-local,
    # so the DataFrame line raised UnboundLocalError on the first iteration.
    # It appears to be a module-level row list that topcon_imwrite() appends
    # to — TODO confirm.
    global csv
    for j in range(0, 20):
        topcon = create_tempfeatures(j)
        print("start clustering")
        labels, _ = kmeans(X=topcon, num_clusters=5, distance='euclidean',
                           device=torch.device('cuda:0'))
        cluster_map = pandas.DataFrame()
        cluster_map['cluster'] = labels
        # Write each image into its cluster folder (appends rows to `csv`).
        for i in range(0, len(cluster_map['cluster'])):
            topcon_imwrite(cluster_map['cluster'][i], j, i)
        csv_file = pandas.DataFrame(
            csv, columns=['file_path', 'file_name', 'cluster'])
        csv = []  # reset the row buffer for the next attempt
        csv_file.to_csv('./topcon_candidates_5class_higher/attempt%d/' % (j) + 'cluster.csv',
                        index=False, encoding='cp949')
        print("one attempt done!")
def get_cluster_exemplars(self, features, num_clusters):
    """Cluster a 3000-point random subsample of `features` and return one
    exemplar per cluster (the point nearest each centroid), plus a map from
    cluster number (as str) back to the exemplar's index."""
    logging.info("total number of features: " + str(len(features)))
    # A random subsample keeps k-means tractable on large feature sets.
    pick = torch.randperm(len(features))[:3000]
    sample = features[pick]
    assignments, centroids = kmeans(
        X=sample,
        num_clusters=num_clusters,
        distance='euclidean',
        device=self._device()
    )
    exemplars = []
    idx_by_cluster = {}
    for c, center in enumerate(centroids):
        members, member_idxs = self.get_all_points_in_cluster(sample, assignments, c)
        best, best_idx = self.get_point_nearest_centroid(center, members, member_idxs)
        exemplars.append(best)
        # maps back to idx in entire dataset of features
        idx_by_cluster[str(c)] = best_idx
    return torch.stack(exemplars), idx_by_cluster
def select_round_workers_actvSAMP(self, workers, poisoned_workers, clients, kwargs):
    """Active sampling: cluster clients by their label distribution into
    NUM_WORKERS_PER_ROUND groups and draw one worker uniformly at random
    from each group."""
    distributions = [client.get_client_distribution() for client in clients]
    num_clusters = kwargs["NUM_WORKERS_PER_ROUND"]
    X = torch.from_numpy(np.array(distributions))
    labels, _ = kmeans(X=X, num_clusters=num_clusters, distance='euclidean')
    # Group client indices by their assigned cluster.
    groups = [
        [i for i in range(len(labels)) if labels[i] == c]
        for c in range(num_clusters)
    ]
    # One uniformly-random representative per cluster.
    return [random.sample(group, 1)[0] for group in groups]
def kmeans_content(text_list, tokenizer, model, num_clusters=20):
    """Group sentences into `num_clusters` topics by k-means over their
    embeddings; returns the {cluster: [sentences]} mapping produced by
    bulid_pre_dict."""
    presentence_embedding, text_list, labels = get_embedding(
        text_list, [], tokenizer, model)
    # BUG FIX: the original cast the embeddings with dtype=torch.long, which
    # truncates every float component toward zero and destroys the geometry
    # euclidean k-means relies on; use float32 instead.
    presentence_embedding = torch.tensor(presentence_embedding,
                                         dtype=torch.float32)
    cluster_ids_x, cluster_centers = kmeans(X=presentence_embedding,
                                            num_clusters=num_clusters,
                                            distance='euclidean',
                                            device=torch.device('cpu'),
                                            tol=1e-8)
    klist = bulid_pre_dict(text_list, cluster_ids_x.tolist())
    return klist
def update_robust_kmeans(class_data, batch_size, sample_size, pretrained):
    """Robust variant of update_kmeans: drop the 10% of features farthest
    from the class mean, cluster the rest into `sample_size` groups, and
    return, per centroid, the index (into class_data) of its nearest
    remaining feature."""
    # get the distance to the feature mean and filter out the outliers
    class_features = get_features(class_data, batch_size, sample_size, pretrained)
    average_feature = torch.mean(class_features, dim=0, keepdim=True)
    dist = dist_matrix(class_features, average_feature).squeeze()
    sorted_inds = torch.argsort(dist, descending=False)
    class_size = class_data.size(0)
    # Keep the closest 90% as clustering candidates.
    candidate_size = int(class_size * 9 / 10)
    candidate_inds = sorted_inds[:candidate_size]
    candidate_features = class_features[candidate_inds]
    _, center_features = kmeans(X=candidate_features, num_clusters=sample_size,
                                distance='euclidean',
                                device=torch.device('cuda:' + str(args.gpu_id)))
    center_features = center_features.cuda()
    # Nearest candidate per centroid, mapped back to original indices.
    dist = dist_matrix(center_features, candidate_features)
    inds = torch.argmin(dist, dim=1)
    final_inds = candidate_inds[inds]
    return final_inds
# Stream the selected rows out of the database in batches, deserialize them
# into (dbid, tensor) pairs, then k-means the stacked tensors into 100 cosine
# clusters and save the centroids.
# NOTE(review): relies on names defined earlier in the file (c, temp_table,
# temp_insert, select_stmt, dbid_tups, bsize, deserialize, dbid_lst,
# tensor_lst) — not visible in this chunk.
print('Selected random dbids')
c.execute(temp_table)
c.executemany(temp_insert, dbid_tups)
c.execute(select_stmt)


def batch_enumerate(bs):
    """Yield successive cursor fetchmany() batches of size `bs` until empty."""
    batch = c.fetchmany(bs)
    while (len(batch)):
        yield batch
        batch = c.fetchmany(bs)


batches = batch_enumerate(bsize)
for i, batch in enumerate(batches):
    sub_lst = list(map(deserialize, batch))
    sub_dbids, sub_tensors = zip(*sub_lst)
    dbid_lst.extend(sub_dbids)
    tensor_lst.extend(sub_tensors)
    print('Batch {} completed. Processed {}. {}'.format(
        i + 1, len(dbid_lst), datetime.now()))
x = torch.stack(tensor_lst)
cluster_ids, cluster_centers = kmeans(X=x, num_clusters=100, distance='cosine',
                                      device=torch.device('cpu'))
torch.save(cluster_centers, 'centroids.tensor')
def forward(self, x):
    """ResNet-style forward pass that builds a binary spatial mask by
    2-way k-means on the post-stem feature map and injects it (bilinearly
    resized) into every residual stage as an additive gating term.

    Returns (output_4, output_4, mask).
    """
    x = self.conv1(x)
    x = self.bn1(x)
    output_0 = self.relu(x)
    x = self.maxpool(output_0)

    # Per-sample 2-cluster k-means over the 64-channel descriptors of the
    # 64x48 feature map; the cluster containing the centre pixel (32, 24)
    # becomes the foreground (mask value 1).
    mask = torch.ones(x.size()[0], 64, 48)
    for i in range(x.size()[0]):
        input_mask = x[i].reshape(64, -1).permute(1, 0).reshape(-1, 64)
        cluster_ids_x, cluster_centers = kmeans(X=input_mask, num_clusters=2,
                                                distance='euclidean',
                                                tqdm_flag=False,
                                                device=torch.device('cuda:0'))
        idx = cluster_ids_x.reshape(64, 48)
        # XOR with the centre pixel's label: same cluster -> 0 -> mask 1.
        mask[i] = 1 - (idx ^ (idx[32, 24].unsqueeze(0).unsqueeze(1)))
    mask = mask.unsqueeze(1).float().cuda()

    output_1 = self.layer1(x)
    # BUG FIX: removed a dangling `F.softmax(x_cov...)` expression left over
    # from a commented-out experiment; `x_cov` was never defined, so the
    # statement raised NameError on every forward pass. The large blocks of
    # commented-out alternative mask experiments were dropped as well.
    output_1 = output_1 * F.interpolate(mask, size=[64, 48], mode="bilinear") + output_1
    output_2 = self.layer2(output_1)
    output_2 = output_2 * F.interpolate(mask, size=[32, 24], mode="bilinear") + output_2
    output_3 = self.layer3(output_2)
    output_3 = output_3 * F.interpolate(mask, size=[16, 12], mode="bilinear") + output_3
    output_4 = self.layer4(output_3)
    output_4 = output_4 * F.interpolate(mask, size=[8, 6], mode="bilinear") + output_4
    return output_4, output_4, mask
def get_clusters_from_generated_greedy(args, g_ema, device, mean_latent, t_dict_list, yaml_config, layer_channel_dims):
    """Sample args.num_samples latents, collect per-channel feature vectors
    for every generator layer, k-means each layer's vectors, and dump the
    per-channel cluster assignments to cluster_dict.yaml."""
    print("get clusters")
    with torch.no_grad():
        g_ema.eval()
        # latent_ll[layer] = list of feature vectors; feature_ll[layer] = the
        # channel index each vector came from (parallel lists).
        latent_ll = []
        feature_ll = []
        feature_cluster_sum_dict = {}
        feature_cluster_dict = {}
        for i in tqdm(range(args.n_layers)):
            true_index = i + 1  # layers are 1-indexed in the dicts/checkpoints
            latent_list = []
            feature_list = []
            latent_ll.append(latent_list)
            feature_ll.append(feature_list)
            feature_cluster_sum_dict[true_index] = {}
            for j in tqdm(range(layer_channel_dims[true_index])):
                feature_cluster_sum_dict[true_index][j] = []
        for i in tqdm(range(args.num_samples)):
            print("processing sample: " + str(i))
            sample_z = torch.randn(1, args.latent, device=device)
            sample, activation_maps = g_ema([sample_z],
                                            truncation=args.truncation,
                                            truncation_latent=mean_latent,
                                            transform_dict_list=t_dict_list,
                                            return_activation_maps=True)
            for index, activations in enumerate(activation_maps):
                true_index = index + 1
                # NOTE(review): the classifier checkpoint is re-loaded from
                # disk for every sample; hoisting this out of the sample loop
                # would save num_samples-1 loads per layer.
                classifier = FeatureClassifier(true_index)
                classifier_str = args.classifier_ckpts + "/" + str(
                    true_index) + "/classifier" + str(true_index) + "_final.pt"
                classifier_state_dict = torch.load(classifier_str)
                classifier.load_state_dict(classifier_state_dict)
                classifier.to(device)
                layer_activation_maps = activation_maps[index]
                # One (N, 1, H, W) map per channel.
                a_map_array = list(torch.split(layer_activation_maps, 1, 1))
                for j, map in enumerate(a_map_array):
                    map = map.to(device)
                    feat_vec, class_prob = classifier(map)
                    latent_ll[index].append(feat_vec)
                    feature_ll[index].append(j)
        for i in tqdm(range(args.n_layers)):
            true_index = i + 1
            print("generating clusters for layer: " + str(i))
            cluster_ids_x, cluster_centers = kmeans(
                X=torch.stack(latent_ll[i]),
                num_clusters=cluster_layer_dict[true_index],
                distance='euclidean',
                device=torch.device('cuda'))
            # Collect, per channel, the cluster id it received on each sample.
            for j, id in enumerate(cluster_ids_x):
                feature_cluster_sum_dict[true_index][feature_ll[i][j]].append(
                    id)
            dict_list = []
            for j in tqdm(range(layer_channel_dims[true_index])):
                # NOTE(review): max() picks the highest cluster id seen, not
                # the most frequent one — if a majority vote was intended this
                # should be the mode (e.g. Counter(...).most_common(1)).
                cluster_id = max(feature_cluster_sum_dict[true_index][j])
                cluster_dict = {
                    "feature_index": int(j),
                    "cluster_index": int(cluster_id)
                }
                dict_list.append(cluster_dict)
            feature_cluster_dict[true_index] = dict_list
        with open(r'cluster_dict.yaml', 'w') as file:
            documents = yaml.dump(feature_cluster_dict, file)
x = torch.Tensor([  # tail of getDataPair (its def line is above this chunk):
    data['latitude'].values,  # stack the seven incident columns into (N, 7)
    data['longitude'].values,
    data['month'].values,
    data['day'].values,
    data['n_killed'].values,
    data['n_injured'].values,
    data['n_guns_involved'].values,
]).transpose(0, 1)
return x

# 90/10 chronological train/test split of the incident table.
train_data = data.iloc[:int(data.shape[0] * 0.9), :]
test_data = data.iloc[int(data.shape[0] * 0.9):, :]
train_x = getDataPair(train_data)
test_x = getDataPair(test_data)
# Cluster the training incidents and persist the centroids as a CSV under
# <BASE_DIR>/media/cluster-<NUM_CLUSTERS>.csv.
cluster_ids_x, cluster_centers = kmeans(X=train_x, num_clusters=NUM_CLUSTERS,
                                        distance='euclidean')
df = pd.DataFrame(cluster_centers.data.tolist(), columns=[
    'lat', 'lng', 'month', 'day', 'n_killed', 'n_injured', 'n_guns_involved'
])
fpath = os.path.join(os.path.join(BASE_DIR, "media"), f"cluster-{NUM_CLUSTERS}.csv")
df.to_csv(fpath, index=False)
def kmeans():
    """Cluster the module-level `imageTensor` into 64 euclidean groups.

    BUG FIX: this function shadows the imported kmeans_pytorch.kmeans under
    its own name, so the original body called *itself* (with keyword
    arguments it does not accept) — infinite recursion / TypeError. Bind the
    library function under a local alias before calling it.
    """
    from kmeans_pytorch import kmeans as _kmeans  # reach the library, not this wrapper
    num_clusters = 64
    # NOTE(review): `imageTensor` is assumed to be defined at module level —
    # confirm before calling.
    return _kmeans(X=imageTensor, num_clusters=num_clusters,
                   distance='euclidean')
# cluster_ids_x, cluster_centers = kmeans( # X=batch_1, num_clusters=num_clusters, distance='euclidean', device=device # ) num_clusters = 16 cluster_centers = [] for batch_id in range(num_batch): print(batch_id, num_batch) sift_descriptors = descriptors[batch_id*des_bs : (batch_id+1)*des_bs] sift_descriptors = np.array(list(itertools.chain.from_iterable(sift_descriptors))) # kmeans = KMeans(n_clusters=16, mode='euclidean', verbose=1) # kmeans_clusters = KMeans(n_clusters=k).fit(sift_descriptors) sift_descriptors = torch.FloatTensor(sift_descriptors).cuda() # kmeans.fit(sift_descriptors, centroids = kmeans.centroids) cluster_ids_x, cluster_centers = kmeans( X=sift_descriptors, num_clusters=num_clusters, cluster_centers = cluster_centers, distance='euclidean', device=device ) # prtv_score = torch.load('prtv_score.pt') for test_id in range(1000): print(test_id) index_ids = idx_list[test_id] test_img_id = test_list[test_id][0] test_img_name = test_img_id+'.jpg' test_img_path = os.path.join(test_root, test_img_name) test_img = cv2.imread(test_img_path) test_img = cv2.resize(test_img, (224,224))
import torch
import numpy as np
from kmeans_pytorch import kmeans
from Sphere_Data import Sphere, ToTensor
import matplotlib.pyplot as plt

# Demo: cluster 1000 random 2-D points into 3 groups and scatter-plot them.
# data
# train_dataset = Sphere([100,150,200],[1, 2, 3], transform=ToTensor())
data_size, dims, num_clusters = 1000, 2, 3
x = np.random.randn(data_size, dims) / 6  # tight Gaussian blob around the origin
x = torch.from_numpy(x)
# KMEANS = kmeans(x, 3)

# kmeans
cluster_ids_x, cluster_centers = kmeans(X=x, num_clusters=num_clusters,
                                        distance='euclidean')
fig, ax = plt.subplots(figsize=(9, 7))
ax.set_title('Encoded Data', fontsize=18, fontweight='demi')
ax.scatter(x[:, 0], x[:, 1], c=cluster_ids_x, s=None, cmap=None)
plt.show()
presentence_embedding = torch.from_numpy(presentence_embedding) #为numpy类型 # print( "presentence_embedding",presentence_embedding.size()) return presentence_embedding # 训练 tt=tkitText.Text() text_list=tt.sentence_segmentation_v1(text) presentence_embedding=get_embedding(text_list,tokenizer,model) num_clusters=10 # print('x',x ) # # # kmeans cluster_ids_x, cluster_centers = kmeans( X=presentence_embedding, num_clusters=num_clusters, distance='euclidean', device=torch.device('cpu'),tol=1e-8 ) print('cluster_ids_x',cluster_ids_x) print("cluster_centers",cluster_centers) output_dir='./' # torch.save(cluster_centers, os.path.join(output_dir, 'Kmeanpytroch_model.bin')) cluster_centers=torch.load(os.path.join(output_dir, 'Kmeanpytroch_model.bin'))
def kmeans(data: torch.Tensor, nr_clusters: int, nr_iterations: int = 20, distance: str = 'euclidean', device=None, verbose=False):
    """Thin wrapper around kmeans_pytorch.kmeans using this project's
    argument names; the device defaults to the input tensor's own."""
    # Local aliased import avoids the name clash with this wrapper.
    from kmeans_pytorch import kmeans as _lib_kmeans
    target_device = data.device if device is None else device
    return _lib_kmeans(
        X=data,
        num_clusters=nr_clusters,
        distance=distance,
        device=target_device,
        tqdm_flag=verbose,
        iter_limit=nr_iterations,
    )
def k_means(imagesTensor):
    """Flatten the image batch into per-pixel vectors via pixelsForm and
    cluster them into 64 euclidean groups; returns kmeans' (ids, centers)."""
    num_clusters = 64
    pixels = pixelsForm(imagesTensor)
    return kmeans(X=pixels, num_clusters=num_clusters, distance='euclidean')
def qbc(n_model, n_train, batch_size, idx_ratio, dataset):
    """Query-by-committee active learning with XGBoost models.

    Clusters the training set (cosine k-means, n_cluster groups), seeds each
    of n_model committee members with a random batch, then repeatedly picks
    new training indices by vote entropy + cluster coverage, retrains, and
    logs per-batch model/committee accuracy to xgb_qbc.csv. Models and index
    libraries are pickled after every batch.
    """
    # parameters
    n_model = n_model
    n_train = n_train
    batch_size = batch_size
    idx_ratio = idx_ratio
    n_cluster = 20
    dataset = dataset.lower()
    # 'reduced_f_mnist', 'reduced_mnist','unreduced_f_mnist','unreduced_mnist',
    text = (('n_model: ' + str(n_model)) + (', n_train: ' + str(n_train)) +
            (', batch_size: ' + str(batch_size)) +
            (', idx_ratio: ' + str(idx_ratio)) +
            (', n_cluster: ' + str(n_cluster)) + (', dataset: ' + dataset))
    print(text)
    # paths
    model_path = os.path.join(dr(dr(abspath(__file__))), 'results', dataset)
    csv_path = os.path.join(model_path, 'xgb_qbc.csv')
    # CUDA
    cuda_flag = torch.cuda.is_available()
    device = torch.device("cuda" if cuda_flag else "cpu")
    device_cpu = torch.device("cpu")
    dataloader_kwargs = {'pin_memory': True} if cuda_flag else {}
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    # load dataset
    if dataset == 'reduced_f_mnist':
        data_train, target_train = datasets_preset.provide_reduced_f_mnist(train=True)
        data_test, target_test = datasets_preset.provide_reduced_f_mnist(train=False)
    elif dataset == 'reduced_mnist':
        data_train, target_train = datasets_preset.provide_reduced_mnist(train=True)
        data_test, target_test = datasets_preset.provide_reduced_mnist(train=False)
    elif dataset == 'unreduced_f_mnist':
        data_train, target_train = datasets_preset.provide_unreduced_f_mnist(train=True)
        data_test, target_test = datasets_preset.provide_unreduced_f_mnist(train=False)
    elif dataset == 'unreduced_mnist':
        data_train, target_train = datasets_preset.provide_unreduced_mnist(train=True)
        data_test, target_test = datasets_preset.provide_unreduced_mnist(train=False)
    # execute kmeans-clustering for entire training dataset
    cluster_index, cluster_centers = kmeans(X=torch.from_numpy(data_train),
                                            num_clusters=n_cluster,
                                            distance='cosine', device=device)
    # show clustering result, document data per cluster
    n_data_cr = np.zeros(n_cluster, dtype=int)  # data count per cluster
    idx_data_cr = []  # training indices belonging to each cluster
    for i_cluster in range(n_cluster):
        n_data_cr[i_cluster] = np.sum(cluster_index.numpy() == i_cluster)
        idx_data_cr.append(np.argwhere(cluster_index == i_cluster).numpy())
        print("Cluster " + str(i_cluster) + ": " + str(n_data_cr[i_cluster]) +
              " data, or " + "{:.4f}".format(
                  n_data_cr[i_cluster] / cluster_index.__len__() * 100) + "%")
    print("Cluster data size variance: " +
          "{:.4f}".format(n_data_cr.var() ** 0.5) + ", (smaller is better)")
    # to document training process, create directory, etc
    train_text = [str(x) for x in range(batch_size, n_train + 1, batch_size)]
    dir_name = 'run_'
    dir_number = 1
    # Find the first unused run_NNN directory name.
    while os.path.exists(os.path.join(model_path, (dir_name + '{:03d}'.format(dir_number)))):
        dir_number += 1
    run_path = os.path.join(model_path, (dir_name + '{:03d}'.format(dir_number)))
    os.makedirs(run_path)  # make run_* dir
    f = open(os.path.join(run_path, 'info.txt'), 'w+')  # write .txt file
    f.write(text)
    f.close()
    # create models and index library
    models = []
    tree_method = "auto"  # "gpu_hist" if cuda_flag else "auto"
    print('Tree creation method: ' + tree_method)
    # idx_library[m] = training indices model m has been fitted on so far.
    idx_library = [np.array([]).astype(int) for x in range(n_model)]
    for i_model in range(n_model):
        xgbc = XGBClassifier(max_depth=8,
                             objective='objective=multi:softmax',
                             n_estimators=1, n_jobs=32, reg_lambda=1, gamma=2,
                             learning_rate=1, num_classes=10,
                             tree_method=tree_method)
        models.append(xgbc)
    print(str(n_model) + " xgboost models created")
    # training and test process, 1st batch
    output_list_test = np.zeros((n_model, data_test.__len__())).astype(int)  # n_models x n_data x n_classes
    for i_model in range(n_model):
        # Each model starts from its own random batch of training indices.
        random_index = np.array(random.sample(range(data_train.__len__()), k=batch_size))
        idx_library[i_model] = np.append(idx_library[i_model], random_index)
        models[i_model].fit(data_train[random_index], target_train[random_index])
        output_list_test[i_model, :] = models[i_model].predict(data_test)
    # Document first batch
    acc_models = qbc_preset.each_model_acc(output_list_test, target_test)
    acc_committee = qbc_preset.committee_vote(output_list_test,
                                              target_test)  # committee vote
    train_text[0] = train_text[0] + ' '.join([";" + "{:.4f}".format(elem) for elem in acc_models])
    train_text[0] = train_text[0] + '; ' + "{:.3f}".format(acc_committee * 100) + '%'  # committee vote
    print("First batch added!")
    print("Batch " + str(0) + ": average acc of models is " +
          "{:.3f}".format(acc_models.mean() * 100) + "%")
    print("Batch " + str(0) + ": acc of committee is " +
          "{:.3f}".format(acc_committee * 100) + "%")
    # NOTE(review): the comprehension below iterates `x` but indexes with
    # `i_model`, so it prints the same shape n_model times — likely a typo.
    print("Library sizes, after first batch:" +
          str([np.unique(idx_library[i_model]).shape for x in range(n_model)]))
    pickle.dump(models, open(os.path.join(run_path, ('models_batch_' + "{0:0=3d}".format(0) + '.pkl')), 'wb'))
    pickle.dump(idx_library, open(os.path.join(run_path, ('indices_batch_' + "{0:0=3d}".format(0) + '.pkl')), 'wb'))
    # training process, n-th batch
    for i_batch in range(1, train_text.__len__()):
        print("Starting Batch " + str(i_batch))
        output_list_train = np.zeros((n_model, data_train.__len__())).astype(int)
        # calculate entropy & acc of current data
        for i_model in range(n_model):
            output_list_train[i_model, :] = models[i_model].predict(data_train)
        acc_models = qbc_preset.each_model_acc(output_list_train, target_train)
        acc_target = qbc_preset.each_target_acc(output_list_train, target_train)
        entropy = qbc_preset.vote_entropy_xgb(output_list_train, target_train)
        # qbc_preset.get_entropy_acc(entropy, output_list_train, target_train)
        # show entropy, show committee acc, 3 highest guess, entropy value, show 8 of it?
        # qbc_preset.show_entropy_result(acc_models, entropy, output_list, data_train, target_train)
        # qbc_preset.plot_ugly(output_list_train, data_train, target_train)
        print("Library sizes:" +
              str([np.unique(idx_library[i_model]).shape for x in range(n_model)]))
        # Spot-check pairwise index overlap between two random models.
        index_1 = np.random.choice(range(n_model))
        index_2 = np.random.choice(np.setdiff1d(range(0, n_model), index_1))
        print("Overlap size:" +
              str(np.intersect1d(idx_library[index_1], idx_library[index_2]).__len__()) +
              ", overlap ideal: " +
              str(int((idx_library[index_2].__len__() - batch_size) * (idx_ratio[0] + idx_ratio[1]))) +
              ", library size: " + str(idx_library[index_2].__len__()) +
              ", dataset: " + dataset + ", idx_ratio: " + str(idx_ratio))
        # train and test for each model and each batch
        for i_model in range(n_model):
            # indexes
            idx_library[i_model] = \
                qbc_preset.get_next_indices(idx_library[i_model], entropy,
                                            idx_data_cr, batch_size, idx_ratio,
                                            data_train.__len__())
            # train model
            models[i_model].fit(data_train[idx_library[i_model]], target_train[idx_library[i_model]])
            # test model
            output_list_test[i_model, :] = models[i_model].predict(data_test)
            print('Model ' + str(i_model))
        # check committee vote
        acc_models = qbc_preset.each_model_acc(output_list_test, target_test)
        acc_committee = qbc_preset.committee_vote(output_list_test, target_test)  # committee vote method
        print("Batch " + str(i_batch) + ": average acc of models is " +
              "{:.3f}".format(acc_models.mean() * 100) + "%")
        print("Batch " + str(i_batch) + ": acc of committee is " +
              "{:.3f}".format(acc_committee * 100) + "%")
        # Document training progress
        train_text[i_batch] = train_text[i_batch] + ' '.join([";" + "{:.4f}".format(elem) for elem in acc_models])
        train_text[i_batch] = train_text[i_batch] + '; ' + "{:.3f}".format(
            acc_committee * 100) + '%'  # committee vote method
        # save models and indices
        pickle.dump(models, open(os.path.join(run_path, ('models_batch_' + "{0:0=3d}".format(i_batch) + '.pkl')), 'wb'))
        pickle.dump(idx_library, open(os.path.join(run_path,
                                                   ('indices_batch_' + "{0:0=3d}".format(i_batch) + '.pkl')), 'wb'))
    # write text to csv
    title = ["New Vote, Results for n_model = " + str(n_model) +
             ", idx_ratio: " + str(idx_ratio) + ", n_cluster: " + str(n_cluster) +
             ", with highest entropy, avg and var documented"]
    with open(csv_path, mode='a+') as test_file:
        test_writer = csv.writer(test_file, delimiter=',')
        test_writer.writerow(title)
    # loop through train_text
    for i_text in range(0, train_text.__len__()):
        text = train_text[i_text].split(";")
        # Append mean and stddev (as percents) of the per-model accuracies.
        mean = statistics.mean([float(i) for i in text[1:-2]])
        var = statistics.variance([float(i) for i in text[1:-2]]) ** 0.5
        text.append("{:.3f}".format(mean * 100) + "%")
        text.append("{:.3f}".format(var * 100) + "%")
        with open(csv_path, mode='a+') as test_file:
            test_writer = csv.writer(test_file, delimiter=';')
            test_writer.writerow(text)
def project(
        G,
        target_image: torch.Tensor,  # [C,H,W] and dynamic range [0,255], W & H must match G output resolution
        target_text,
        *,
        num_steps=300,
        w_avg_samples=8192,
        initial_learning_rate=0.02,
        initial_latent=None,
        initial_noise_factor=0.01,
        lr_rampdown_length=0.10,
        lr_rampup_length=0.5,
        noise_ramp_length=0.75,
        latent_range=2.0,
        max_noise=0.5,
        min_threshold=0.6,
        use_vgg=True,
        use_clip=True,
        use_pixel=True,
        use_penalty=True,
        use_center=True,
        regularize_noise_weight=1e5,
        kmeans=True,
        kmeans_clusters=64,
        verbose=False,
        device: torch.device) -> torch.Tensor:
    """Project a target image and/or text prompt into G's W latent space.

    Optimizes a single W latent so the synthesized image matches the targets
    under a combination of VGG/LPIPS, CLIP, and pixel losses (each clamped
    through ``F.relu(... - min_threshold)``), plus an L1 penalty toward the
    starting latent and a noise-map regularizer. When no ``initial_latent``
    is given and a text target is present, the start point can be seeded from
    k-means cluster centers of sampled W's, scored against the text via CLIP.

    NOTE(review): this function relies on module-level ``image_mean``,
    ``image_std``, ``score_images``, ``dnnlib``, ``copy``, ``np``, and ``F``
    defined elsewhere in the file.

    Returns:
        Tensor of shape ``[num_steps, num_ws, w_dim]`` -- the optimized W
        (broadcast across all style layers) recorded at every step.
    """
    if target_image is not None:
        assert target_image.shape == (G.img_channels, G.img_resolution, G.img_resolution)
    else:
        # No image target: image-based losses cannot be computed.
        use_vgg = False
        use_pixel = False

    # reduce errors unless using clip
    if use_clip:
        import clip

    def logprint(*args):
        # Print only when verbose mode is on.
        if verbose:
            print(*args)

    G = copy.deepcopy(G).eval().requires_grad_(False).to(device)  # type: ignore

    # Compute w stats.
    logprint(f'Computing W midpoint and stddev using {w_avg_samples} samples...')
    z_samples = np.random.RandomState(123).randn(w_avg_samples, G.z_dim)
    labels = None
    if (G.mapping.c_dim):
        # Conditional generator: sample random (scaled) label vectors too.
        labels = torch.from_numpy(0.5 * np.random.RandomState(123).randn(
            w_avg_samples, G.mapping.c_dim)).to(device)
    w_samples = G.mapping(torch.from_numpy(z_samples).to(device), labels)  # [N, L, C]
    w_samples = w_samples.cpu().numpy().astype(np.float32)  # [N, L, C]
    # One W vector per sample (first style layer only), kept for k-means seeding.
    w_samples_1d = w_samples[:, :1, :].astype(np.float32)
    w_avg = np.mean(w_samples, axis=0, keepdims=True)  # [1, L, C]
    # Scalar spread of W around its mean; scales the per-step latent noise.
    w_std = (np.sum((w_samples - w_avg)**2) / w_avg_samples)**0.5

    kmeans_latents = None
    if initial_latent is not None:
        # Caller-supplied starting point wins over everything else.
        w_avg = initial_latent
    else:
        if kmeans and use_clip and target_text is not None:
            # Shadows the boolean `kmeans` parameter with the clustering function.
            from kmeans_pytorch import kmeans
            # data (data_size and dims are assigned but not used below)
            data_size, dims, num_clusters = w_avg_samples, G.z_dim, kmeans_clusters
            x = w_samples_1d
            x = torch.from_numpy(x)
            # kmeans
            logprint(
                f'Performing kmeans clustering using {w_avg_samples} latents into {kmeans_clusters} clusters...'
            )
            cluster_ids_x, cluster_centers = kmeans(X=x,
                                                    num_clusters=num_clusters,
                                                    distance='euclidean',
                                                    device=device)
            # logprint(f'\nGenerating images from kmeans latents...')
            kmeans_latents = torch.tensor(cluster_centers,
                                          dtype=torch.float32,
                                          device=device,
                                          requires_grad=True)

    # Setup noise inputs.
    noise_bufs = {
        name: buf
        for (name, buf) in G.synthesis.named_buffers() if 'noise_const' in name
    }

    # Load VGG16 feature detector.
    if use_vgg:
        url = 'https://nvlabs-fi-cdn.nvidia.com/stylegan2-ada-pytorch/pretrained/metrics/vgg16.pt'
        with dnnlib.util.open_url(url) as f:
            vgg16 = torch.jit.load(f).eval().to(device)

    # Load CLIP
    if use_clip:
        model, transform = clip.load("ViT-B/32", device=device)

    # Features for target image.
    if target_image is not None:
        target_images = target_image.unsqueeze(0).to(device).to(torch.float32)
        # 64x64 thumbnail for the coarse pixel loss.
        small_target = F.interpolate(target_images, size=(64, 64), mode='area')
        if use_center:
            # 224x224 crop of the image center (from a 448x448 resize).
            center_target = F.interpolate(target_images, size=(448, 448),
                                          mode='area')[:, :, 112:336, 112:336]
        target_images = F.interpolate(target_images, size=(256, 256), mode='area')
        target_images = target_images[:, :, 16:240, 16:240]  # 256 -> 224, center crop
        if use_vgg:
            vgg_target_features = vgg16(target_images, resize_images=False, return_lpips=True)
            if use_center:
                vgg_target_center = vgg16(center_target, resize_images=False, return_lpips=True)

    if use_clip:
        if target_image is not None:
            with torch.no_grad():
                # CLIP expects normalized [0,1] images; image_mean/image_std
                # are module-level constants defined elsewhere in the file.
                clip_target_features = model.encode_image(
                    ((target_images / 255.0) - image_mean[None, :, None, None]) /
                    image_std[None, :, None, None]).float()
                if use_center:
                    clip_target_center = model.encode_image(
                        ((center_target / 255.0) - image_mean[None, :, None, None]) /
                        image_std[None, :, None, None]).float()

    if kmeans_latents is not None and use_clip and target_text is not None:
        # Score every cluster center against the text and start from the
        # median of the 4 best-scoring (lowest-score) latents.
        scores, kmeans_images = score_images(G, model, target_text,
                                             kmeans_latents.repeat([1, G.mapping.num_ws, 1]),
                                             device=device)
        ind = np.argpartition(scores, 4)[:4]
        w_avg = torch.median(kmeans_latents[ind], dim=0,
                             keepdim=True)[0].repeat([1, G.mapping.num_ws, 1])

    w_opt = torch.tensor(w_avg, dtype=torch.float32, device=device,
                         requires_grad=True)  # pylint: disable=not-callable
    # Anchor for the L1 penalty: the starting latent, detached from updates.
    w_avg_tensor = w_opt.clone()
    w_out = torch.zeros([num_steps] + list(w_opt.shape[1:]),
                        dtype=torch.float32, device=device)
    optimizer = torch.optim.AdamW([w_opt] + list(noise_bufs.values()),
                                  betas=(0.9, 0.999),
                                  lr=initial_learning_rate)

    # Init noise.
    for buf in noise_bufs.values():
        buf[:] = torch.randn_like(buf)
        buf.requires_grad = True

    for step in range(num_steps):
        # Learning rate schedule.
        t = step / num_steps
        w_noise_scale = max_noise * w_std * initial_noise_factor * max(
            0.0, 1.0 - t / noise_ramp_length)**2
        lr_ramp = min(1.0, (1.0 - t) / lr_rampdown_length)
        lr_ramp = 0.5 - 0.5 * np.cos(lr_ramp * np.pi)
        lr_ramp = lr_ramp * min(1.0, t / lr_rampup_length)
        lr = initial_learning_rate * lr_ramp
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        # Synth images from opt_w.
        w_noise = torch.randn_like(w_opt) * w_noise_scale
        ws = w_opt + w_noise
        synth_images = G.synthesis(torch.clamp(ws, -latent_range, latent_range),
                                   noise_mode='const')

        # Downsample image to 256x256 if it's larger than that. CLIP was built for 224x224 images.
        synth_images = (torch.clamp(synth_images, -1, 1) + 1) * (255 / 2)
        small_synth = F.interpolate(synth_images, size=(64, 64), mode='area')
        if use_center:
            center_synth = F.interpolate(synth_images, size=(448, 448),
                                         mode='area')[:, :, 112:336, 112:336]
        synth_images = F.interpolate(synth_images, size=(256, 256), mode='area')

        # Features for synth images.
        synth_images = synth_images[:, :, 16:240, 16:240]  # 256 -> 224, center crop
        dist = 0
        if use_vgg:
            vgg_synth_features = vgg16(synth_images, resize_images=False, return_lpips=True)
            vgg_dist = (vgg_target_features - vgg_synth_features).square().sum()
            if use_center:
                vgg_synth_center = vgg16(center_synth, resize_images=False, return_lpips=True)
                vgg_dist += (vgg_target_center - vgg_synth_center).square().sum()
            # NOTE(review): original indentation was ambiguous here; this
            # scaling is assumed to apply whether or not use_center is set.
            vgg_dist *= 6
            dist += F.relu(vgg_dist * vgg_dist - min_threshold)
        if use_clip:
            clip_synth_image = (
                (synth_images / 255.0) - image_mean[None, :, None, None]
            ) / image_std[None, :, None, None]
            clip_synth_features = model.encode_image(clip_synth_image).float()
            # adj_center down-weights the CLIP terms when the extra center
            # crop also contributes (1.0 with center, 2.0 without).
            adj_center = 2.0
            if use_center:
                clip_cynth_center_image = (
                    (center_synth / 255.0) - image_mean[None, :, None, None]
                ) / image_std[None, :, None, None]
                adj_center = 1.0
                clip_synth_center = model.encode_image(clip_cynth_center_image).float()
            if target_image is not None:
                clip_dist = (clip_target_features - clip_synth_features).square().sum()
                if use_center:
                    clip_dist += (clip_target_center - clip_synth_center).square().sum()
                dist += F.relu(0.5 + adj_center * clip_dist - min_threshold)
            if target_text is not None:
                # model(image, text)[0] is the CLIP image-text logit
                # (similarity * 100), so this maps similarity into [0, 1]-ish.
                clip_text = 1 - model(clip_synth_image, target_text)[0].sum() / 100
                if use_center:
                    clip_text += 1 - model(clip_cynth_center_image, target_text)[0].sum() / 100
                dist += 2 * F.relu(adj_center * clip_text * clip_text - min_threshold / adj_center)
        if use_pixel:
            pixel_dist = (target_images - synth_images).abs().sum() / 2000000.0
            if use_center:
                pixel_dist += (center_target - center_synth).abs().sum() / 2000000.0
            pixel_dist += (small_target - small_synth).square().sum() / 800000.0
            pixel_dist /= 4
            # Pixel loss is ramped in with the learning-rate schedule.
            dist += F.relu(lr_ramp * pixel_dist - min_threshold)
        if use_penalty:
            # L1 pull toward the starting latent, also ramped with lr.
            l1_penalty = (w_opt - w_avg_tensor).abs().sum() / 5000.0
            dist += F.relu(lr_ramp * l1_penalty - min_threshold)

        # Noise regularization.
        reg_loss = 0.0
        for v in noise_bufs.values():
            noise = v[None, None, :, :]  # must be [1,1,H,W] for F.avg_pool2d()
            while True:
                # Penalize spatial autocorrelation at every pyramid level.
                reg_loss += (noise * torch.roll(noise, shifts=1, dims=3)).mean()**2
                reg_loss += (noise * torch.roll(noise, shifts=1, dims=2)).mean()**2
                if noise.shape[2] <= 8:
                    break
                noise = F.avg_pool2d(noise, kernel_size=2)
        # print(vgg_dist, clip_dist, pixel_dist, l1_penalty, reg_loss * regularize_noise_weight)
        loss = dist + reg_loss * regularize_noise_weight

        # Step
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        logprint(
            f'step {step+1:>4d}/{num_steps}: dist {dist:<4.2f} loss {float(loss):<5.2f}'
        )
        with torch.no_grad():
            # Keep the latent inside the allowed range, in place.
            torch.clamp(w_opt, -latent_range, latent_range, out=w_opt)

        # Save projected W for each optimization step.
        w_out[step] = w_opt.detach()[0]

        # Normalize noise.
        with torch.no_grad():
            for buf in noise_bufs.values():
                buf -= buf.mean()
                buf *= buf.square().mean().rsqrt()

    return w_out
import torch from kmeans_pytorch import kmeans import open3d as o3d import numpy as np # data data_size, dims, num_clusters = 1000, 2, 10 cloud = o3d.io.read_point_cloud("/home/llg/dataset_paper/camp001_l3.ply") cloud_xyz = np.array(cloud.points) x = torch.from_numpy(cloud_xyz) # kmeans cluster_ids_x, cluster_centers = kmeans(X=x, num_clusters=num_clusters, distance='euclidean', device=torch.device('cuda:0'))
def get_anchor(all_data, anchor_num, device): cluster_ids_x, cluster_centers = kmeans( X=all_data, num_clusters=anchor_num, distance='euclidean', device=device ) return cluster_centers
# Load img img_path = config.trainDataPath fileList = utilits.getAllName(img_path) # It is a little experiment, hence I only use one image. file = fileList[0] img = plt.imread(file) imgCV = cv2.imread(file) imgWriteable = np.array(img) imgWriteable = imgWriteable.reshape(-1, 3) imgTensor = torch.from_numpy(imgWriteable) labels, clusterCenters = kmeans(X=imgTensor, num_clusters=config.K, distance='euclidean', device=torch.device('cuda:0')) imgTensor = imgTensor.view((config.imgSize[0], config.imgSize[1], 3)) colorFeatureList = imgprocess.regionColorFeatures(imgTensor, labels) textureFeatureList = imgprocess.regionTextureFeatures(imgCV, labels) edgeFeatureList = imgprocess.regionEdgeFeatures(imgCV, labels) spatialFeatureList = imgprocess.regionSpatialFeatures(labels) featureList = torch.cat((colorFeatureList, textureFeatureList, edgeFeatureList, spatialFeatureList), dim=1) num_sample = len(featureList) X = featureList.cuda() if config.use_cuda else featureList
# if os.path.exists(os.path.join(output_path, sample_name)) == False: # os.mkdir(os.path.join(output_path, sample_name)) if os.path.exists(os.path.join(output_path, sample_name + ".pickle")) != False: print("{} is processed before.".format(sample_name)) continue print("Processing: {}".format(sample_name)) #Calculate clusters # features_AB = torch.randn(10000, 2, dtype=torch.float32) / 6 + .5 features_AB = features_AB.squeeze(0).transpose(0, 1).reshape( 2, -1).transpose(0, 1).contiguous().type(torch.float32) cluster_ids_x, cluster_centers = kmeans(X=features_AB, num_clusters=num_clusters, distance='euclidean', device=torch.device('cuda'), tol=0.0000005) cluster_ids_x = cluster_ids_x.reshape(-1, 1, 32, 32) now = datetime.now() data_obj = { # 'features_L': features_L.squeeze(0), 'clusters': cluster_ids_x, 'centers': cluster_centers, 'number_of_objects': S, 'class_name': sample_name, 'timeofday': now.strftime("%d/%m/%y %H:%M") } # #View the labels