def get_subgraph(self, i):
    nbr = self.nbrs[i]
    dist = self.dists[i]
    idxs = np.where(self.confs[nbr] > self.confs[i])[0]

    if len(idxs) == 0:
        return None
    elif len(idxs) == 1 or i in self.ignore_set:
        nbr_lst = []
        dist_lst = []
        for j in idxs[:self.max_conn]:
            nbr_lst.append(nbr[j])
            dist_lst.append(self.dists[i, j])
        return i, nbr_lst, dist_lst

    if self.use_candidate_set:
        nbr = nbr[idxs]
        dist = dist[idxs]

    # represent the `direction` relative to the center node
    feat = self.features[nbr] - self.features[i]
    adj = self.adj[nbr, :][:, nbr]
    adj = row_normalize(adj).toarray().astype(np.float32)

    if not self.ignore_label:
        lb = [int(self.idx2lb[i] == self.idx2lb[n]) for n in nbr]
    else:
        lb = [0 for _ in nbr]  # dummy labels
    lb = np.array(lb)

    return i, nbr, dist, feat, adj, lb
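# Tiny standalone illustration (hypothetical values) of the candidate selection used in
# get_subgraph above: only neighbors whose confidence exceeds the center node's are kept.
import numpy as np

confs = np.array([0.6, 0.8, 0.5, 0.9])   # per-node confidence
nbr = np.array([1, 2, 3])                # neighbors of center node i = 0
i = 0
idxs = np.where(confs[nbr] > confs[i])[0]
print(nbr[idxs])                         # [1 3] -- neighbors more confident than node 0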
def propagate(self, adj, labels, idx_train):
    # Equivalent dense formulation, kept for reference:
    # row_sums = adj.sum(axis=1).A1
    # row_sum_diag_mat = np.diag(row_sums)
    # adj_rw = np.linalg.inv(row_sum_diag_mat).dot(adj)
    adj_rw = row_normalize(self.adj.asfptype())  # note: uses self.adj, not the `adj` argument
    Y = np.zeros(labels.shape)
    for id in idx_train:
        Y[id] = labels[id]
    for i in range(0, 1000):
        Y = adj_rw.dot(Y)
        for id in idx_train:
            Y[id] = labels[id]  # clamp the training labels after every propagation step
    return Y.round()
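# A minimal sketch of the row_normalize helper relied on throughout these snippets, assuming
# a scipy.sparse input; the repo's own helper may differ in detail (e.g. some variants also
# return the row sums, as in _build_graph further below).
import numpy as np
import scipy.sparse as sp

def row_normalize(mx):
    """Scale each row of a sparse matrix so that it sums to 1 (zero rows are left as-is)."""
    rowsum = np.asarray(mx.sum(axis=1)).flatten()
    r_inv = np.zeros_like(rowsum, dtype=np.float64)
    nz = rowsum != 0
    r_inv[nz] = 1.0 / rowsum[nz]
    return sp.diags(r_inv).dot(mx)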
def load_geom_datasets(dataset, seed):
    dataset_split = 'splits/%s_split_0.6_0.2_%s.npz' % (dataset, seed % 10)  # seed % 10 --> which split
    print('loading %s' % dataset_split)
    adj, features, labels, train_mask, val_mask, test_mask, num_features, num_labels = load_data(
        dataset, dataset_split, None, None, 'ExperimentTwoAll')
    # print(adj.nnz)
    # print((adj + adj.transpose()).nnz)
    idx = np.arange(len(labels))
    idx_train, idx_val, idx_test = (idx[train_mask.astype(bool)],
                                    idx[val_mask.astype(bool)],
                                    idx[test_mask.astype(bool)])
    features = row_normalize(features)
    return adj, adj, features, features, labels, idx_train, idx_val, idx_test, None, None
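# Tiny illustration of the mask-to-index conversion above. Note that np.bool was removed from
# recent NumPy releases; the builtin bool behaves identically here.
import numpy as np

labels = np.array([0, 1, 1, 0, 2])
train_mask = np.array([1, 0, 1, 0, 0])
idx = np.arange(len(labels))
idx_train = idx[train_mask.astype(bool)]
print(idx_train)  # [0 2]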
def _build_graph(self, features, cluster_features, labels, density, knns):
    adj = fast_knns2spmat(knns, self.k)
    adj, adj_row_sum = row_normalize(adj)
    indices, values, shape = sparse_mx_to_indices_values(adj)

    g = dgl.graph((indices[1], indices[0]))
    g.ndata['features'] = torch.FloatTensor(features)
    g.ndata['cluster_features'] = torch.FloatTensor(cluster_features)
    g.ndata['labels'] = torch.LongTensor(labels)
    g.ndata['density'] = torch.FloatTensor(density)
    g.edata['affine'] = torch.FloatTensor(values)
    # A bipartite graph from the DGL sampler does not store the global eid,
    # so we explicitly save it here.
    g.edata['global_eid'] = g.edges(form='eid')
    g.ndata['norm'] = torch.FloatTensor(adj_row_sum)
    g.apply_edges(lambda edges: {'raw_affine': edges.data['affine'] / edges.dst['norm']})
    g.apply_edges(lambda edges: {'labels_conn': (edges.src['labels'] == edges.dst['labels']).long()})
    g.apply_edges(lambda edges: {'mask_conn': (edges.src['density'] > edges.dst['density']).bool()})
    return g
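# A self-contained sketch (toy graph, hypothetical values) of the apply_edges pattern used in
# _build_graph: each edge's affinity is divided by the row-sum norm stored on its destination
# node, mirroring the 'raw_affine' computation. Requires dgl and torch.
import dgl
import torch

src = torch.tensor([0, 1, 2])
dst = torch.tensor([1, 2, 0])
g = dgl.graph((src, dst))
g.ndata['norm'] = torch.tensor([2.0, 4.0, 8.0])
g.edata['affine'] = torch.tensor([1.0, 1.0, 1.0])
g.apply_edges(lambda edges: {'raw_affine': edges.data['affine'] / edges.dst['norm']})
print(g.edata['raw_affine'])  # tensor([0.2500, 0.1250, 0.5000])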
def populate_clustering(G: HIN,
                        n_clusters: int,
                        WT_clusters: List[Dict[str, float]],
                        damping=0.8) -> Tuple[np.ndarray, np.ndarray]:
    """Populate clustering results from terms to the whole graph by random walk with restart.

    Args:
        G: The HIN.
        n_clusters: Number of clusters.
        WT_clusters: A list of initial weights of terms in each cluster.
            These weights will be populated to the whole graph.
        damping: The damping factor for the random walk. Larger means more restart probability.

    Returns:
        ranking: The ranking distribution over ALL nodes. Shape (n_nodes, n_clusters).
        clustering_probs: The clustering distribution of all nodes. Shape (n_nodes, n_clusters).
    """
    clustering_probs = np.zeros((G.num_nodes(), n_clusters), dtype=np.float64)
    for k in range(n_clusters):
        # get the initial distribution using T_score
        T_score = list(WT_clusters[k].items())  # P_Ti
        # T_score = take_topk(WT_clusters[k], 20, return_tuple=True)
        phrases, scores = list(zip(*T_score))
        z = sum(WT_clusters[k].values())  # normalizer
        dist = np.zeros((G.num_nodes(), ), dtype=np.float64)
        aligned_nids = G.find_by_entity_ids("K", phrases)
        for i in range(len(scores)):
            dist[aligned_nids[i]] = scores[i] / z
        # use random walk to populate clustering probabilities
        pr = G.ppr(damping=damping, init_probs=dist)
        clustering_probs[:, k] = pr
    ranking = clustering_probs
    clustering_probs = utils.row_normalize(clustering_probs)
    return ranking, clustering_probs
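# A generic power-iteration sketch of personalized PageRank with restart, included only to
# clarify what G.ppr is expected to return; the HIN class's own implementation and its exact
# damping convention may differ. Following the docstring above, a larger `damping` means a
# larger restart probability.
import numpy as np

def ppr(adj_rw, init_probs, damping=0.8, n_iter=100):
    """adj_rw: row-normalized (n, n) transition matrix; init_probs: restart distribution."""
    p = init_probs.copy()
    for _ in range(n_iter):
        p = damping * init_probs + (1.0 - damping) * adj_rw.T.dot(p)
    return p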
def __init__(self, cfg):
    feat_path = cfg['feat_path']
    label_path = cfg.get('label_path', None)
    knn_graph_path = cfg.get('knn_graph_path', None)

    self.k = cfg['k']
    self.feature_dim = cfg['feature_dim']
    self.is_norm_feat = cfg.get('is_norm_feat', True)
    self.th_sim = cfg.get('th_sim', 0.)
    self.max_conn = cfg.get('max_conn', 1)
    self.ignore_ratio = cfg.get('ignore_ratio', 0.8)
    self.ignore_small_confs = cfg.get('ignore_small_confs', True)
    self.use_candidate_set = cfg.get('use_candidate_set', True)
    self.nproc = cfg.get('nproc', 1)
    self.max_qsize = cfg.get('max_qsize', int(1e5))

    with Timer('read meta and feature'):
        if label_path is not None:
            self.lb2idxs, self.idx2lb = read_meta(label_path)
            self.inst_num = len(self.idx2lb)
            self.gt_labels = intdict2ndarray(self.idx2lb)
            self.ignore_label = False
        else:
            self.inst_num = -1
            self.ignore_label = True
        self.features = read_probs(feat_path, self.inst_num, self.feature_dim)
        if self.is_norm_feat:
            self.features = l2norm(self.features)
        if self.inst_num == -1:
            self.inst_num = self.features.shape[0]
        self.size = self.inst_num
        assert self.size == self.features.shape[0]

    print('feature shape: {}, k: {}, norm_feat: {}'.format(
        self.features.shape, self.k, self.is_norm_feat))

    with Timer('read knn graph'):
        if knn_graph_path is not None:
            knns = np.load(knn_graph_path)['data']
        else:
            prefix = osp.dirname(feat_path)
            name = rm_suffix(osp.basename(feat_path))
            # find the root folder of `features`
            prefix = osp.dirname(prefix)
            knn_prefix = osp.join(prefix, 'knns', name)
            knns = build_knns(knn_prefix, self.features, cfg.knn_method, cfg.knn)
        assert self.inst_num == len(knns), "{} vs {}".format(self.inst_num, len(knns))

        adj = fast_knns2spmat(knns, self.k, self.th_sim, use_sim=True)
        # build symmetric adjacency matrix
        adj = build_symmetric_adj(adj, self_loop=True)
        self.adj = row_normalize(adj)

        # convert knns to (dists, nbrs)
        self.dists, self.nbrs = knns2ordered_nbrs(knns, sort=True)

        if cfg.pred_confs != '':
            print('read estimated confidence from {}'.format(cfg.pred_confs))
            self.confs = np.load(cfg.pred_confs)['pred_confs']
        else:
            print('use unsupervised density as confidence')
            assert self.radius
            from vegcn.confidence import density
            self.confs = density(self.dists, radius=self.radius)

        assert 0 <= self.ignore_ratio <= 1
        if self.ignore_ratio == 1:
            self.ignore_set = set(np.arange(len(self.confs)))
        else:
            num = int(len(self.confs) * self.ignore_ratio)
            confs = self.confs
            if not self.ignore_small_confs:
                confs = -confs
            self.ignore_set = set(np.argpartition(confs, num)[:num])

    print('ignore_ratio: {}, ignore_small_confs: {}, use_candidate_set: {}'.format(
        self.ignore_ratio, self.ignore_small_confs, self.use_candidate_set))
    print('#ignore_set: {} / {} = {:.3f}'.format(
        len(self.ignore_set), self.inst_num,
        1. * len(self.ignore_set) / self.inst_num))

    with Timer('Prepare sub-graphs'):
        # construct subgraphs with larger confidence
        self.peaks = {i: [] for i in range(self.inst_num)}
        self.dist2peak = {i: [] for i in range(self.inst_num)}

        if self.nproc > 1:  # multi-process
            import multiprocessing as mp
            pool = mp.Pool(self.nproc)
            results = []
            num = int(self.inst_num / self.max_qsize) + 1
            for i in tqdm(range(num)):
                beg = int(i * self.max_qsize)
                end = min(beg + self.max_qsize, self.inst_num)
                lst = [j for j in range(beg, end)]
                results.extend(
                    list(tqdm(pool.map(self.get_subgraph, lst), total=len(lst))))
            pool.close()
            pool.join()
        else:
            results = [self.get_subgraph(i) for i in tqdm(range(self.inst_num))]

        self.adj_lst = []
        self.feat_lst = []
        self.lb_lst = []
        self.subset_gt_labels = []
        self.subset_idxs = []
        self.subset_nbrs = []
        self.subset_dists = []
        for result in results:
            if result is None:
                continue
            elif len(result) == 3:
                i, nbr, dist = result
                self.peaks[i].extend(nbr)
                self.dist2peak[i].extend(dist)
                continue
            i, nbr, dist, feat, adj, lb = result
            self.subset_idxs.append(i)
            self.subset_nbrs.append(nbr)
            self.subset_dists.append(dist)
            self.feat_lst.append(feat)
            self.adj_lst.append(adj)
            if not self.ignore_label:
                self.subset_gt_labels.append(self.idx2lb[i])
            self.lb_lst.append(lb)
        self.subset_gt_labels = np.array(self.subset_gt_labels)

        self.size = len(self.feat_lst)
        assert self.size == len(self.adj_lst)
        if not self.ignore_label:
            assert self.size == len(self.lb_lst)
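# Standalone illustration (hypothetical confidences) of how the ignore set is built above:
# np.argpartition selects the `num` lowest-confidence nodes when ignore_small_confs is True;
# negating the confidences flips this to the `num` highest-confidence nodes.
import numpy as np

confs = np.array([0.9, 0.1, 0.5, 0.3, 0.7])
ignore_ratio = 0.4
num = int(len(confs) * ignore_ratio)               # -> 2
ignore_set = set(np.argpartition(confs, num)[:num])
print(ignore_set)                                  # {1, 3}: the two lowest-confidence nodes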
def __init__(self, cfg):
    feat_path = cfg['feat_path']
    label_path = cfg.get('label_path', None)
    knn_graph_path = cfg.get('knn_graph_path', None)

    self.k = cfg['k']
    self.feature_dim = cfg['feature_dim']
    self.is_norm_feat = cfg.get('is_norm_feat', True)
    self.save_decomposed_adj = cfg.get('save_decomposed_adj', False)

    self.th_sim = cfg.get('th_sim', 0.)
    self.max_conn = cfg.get('max_conn', 1)
    self.conf_metric = cfg.get('conf_metric')

    with Timer('read meta and feature'):
        if label_path is not None:
            self.lb2idxs, self.idx2lb = read_meta(label_path)
            self.inst_num = len(self.idx2lb)
            self.gt_labels = intdict2ndarray(self.idx2lb)
            self.ignore_label = False
        else:
            self.inst_num = -1
            self.ignore_label = True
        self.features = read_probs(feat_path, self.inst_num, self.feature_dim)

        if self.is_norm_feat:
            self.features = l2norm(self.features)
        if self.inst_num == -1:
            self.inst_num = self.features.shape[0]
        self.size = 1  # take the entire graph as input

    with Timer('read knn graph'):
        if knn_graph_path is not None and os.path.isfile(knn_graph_path):
            knns = np.load(knn_graph_path)['data']
        else:
            if knn_graph_path is not None:
                print('knn_graph_path does not exist: {}'.format(knn_graph_path))
            prefix = osp.dirname(feat_path)
            name = rm_suffix(osp.basename(feat_path))
            # find the root folder of `features`
            prefix = osp.dirname(prefix)
            knn_prefix = osp.join(prefix, 'knns', name)
            knns = build_knns(knn_prefix, self.features, cfg.knn_method, cfg.knn)

        adj = fast_knns2spmat(knns, self.k, self.th_sim, use_sim=True)

        # build symmetric adjacency matrix
        adj = build_symmetric_adj(adj, self_loop=True)
        adj = row_normalize(adj)
        if self.save_decomposed_adj:
            adj = sparse_mx_to_indices_values(adj)
            self.adj_indices, self.adj_values, self.adj_shape = adj
        else:
            self.adj = adj

        # convert knns to (dists, nbrs)
        self.dists, self.nbrs = knns2ordered_nbrs(knns)

    print('feature shape: {}, k: {}, norm_feat: {}'.format(
        self.features.shape, self.k, self.is_norm_feat))

    if not self.ignore_label:
        with Timer('Prepare ground-truth label'):
            self.labels = confidence(feats=self.features,
                                     dists=self.dists,
                                     nbrs=self.nbrs,
                                     metric=self.conf_metric,
                                     idx2lb=self.idx2lb,
                                     lb2idxs=self.lb2idxs)
            if cfg.eval_interim:
                _, self.peaks = confidence_to_peaks(
                    self.dists, self.nbrs, self.labels, self.max_conn)
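# A minimal sketch of what sparse_mx_to_indices_values is assumed to do when
# save_decomposed_adj is set: decompose a scipy sparse matrix into COO index pairs, values,
# and the dense shape. The repo's actual helper may differ (e.g. it may return torch tensors).
import numpy as np
import scipy.sparse as sp

def sparse_mx_to_indices_values(sparse_mx):
    sparse_mx = sparse_mx.tocoo().astype(np.float32)
    indices = np.vstack((sparse_mx.row, sparse_mx.col))  # shape (2, nnz)
    values = sparse_mx.data                              # shape (nnz,)
    shape = np.array(sparse_mx.shape)
    return indices, values, shape

adj = sp.csr_matrix(np.array([[0., 0.5], [1., 0.]]))
indices, values, shape = sparse_mx_to_indices_values(adj)
print(indices.shape, values, shape)  # (2, 2) [0.5 1. ] [2 2]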
def __init__(self, cfg):
    feat_path = cfg['feat_path']
    label_path = cfg.get('label_path', None)
    knn_graph_path = cfg.get('knn_graph_path', None)

    self.k = cfg['k']
    self.feature_dim = cfg['feature_dim']
    self.is_norm_feat = cfg.get('is_norm_feat', True)
    self.save_decomposed_adj = cfg.get('save_decomposed_adj', False)

    self.th_sim = cfg.get('th_sim', 0.)
    self.max_conn = cfg.get('max_conn', 1)
    self.conf_metric = cfg.get('conf_metric')
    self.num_process = cfg.get('num_process', 16)

    with Timer('read meta and feature'):
        if label_path is not None:
            self.lb2idxs, self.idx2lb = read_meta(label_path)
            self.inst_num = len(self.idx2lb)
            self.gt_labels = intdict2ndarray(self.idx2lb)
            self.ignore_label = False
        else:
            self.inst_num = -1
            self.ignore_label = True
        self.features = read_probs(feat_path, self.inst_num, self.feature_dim)

        if self.is_norm_feat:
            self.features = l2norm(self.features)
        if self.inst_num == -1:
            self.inst_num = self.features.shape[0]
        self.size = 1  # take the entire graph as input

    with Timer('read knn graph'):
        if knn_graph_path is not None and os.path.isfile(knn_graph_path):
            knns = np.load(knn_graph_path)['data']  # num_imgs * 2 * k
        else:
            if knn_graph_path is not None:
                print('knn_graph_path does not exist: {}'.format(knn_graph_path))
            knn_prefix = os.path.join(cfg.prefix, 'knns', cfg.name)
            # k-NN search via faiss; the author's faiss_gpu variant may be problematic here,
            # and large-scale faiss search on CPU is still slow. faiss also offers memory and
            # speed optimizations (PQ, IVF, etc.); see the faiss documentation.
            knns = build_knns(knn_prefix, self.features, cfg.knn_method,
                              cfg.knn, self.num_process)

        # build the adjacency matrix from the k-NN search results
        adj = fast_knns2spmat(knns, self.k, self.th_sim, use_sim=True)

        # build symmetric adjacency matrix
        adj = build_symmetric_adj(adj, self_loop=True)
        adj = row_normalize(adj)
        if self.save_decomposed_adj:
            adj = sparse_mx_to_indices_values(adj)
            self.adj_indices, self.adj_values, self.adj_shape = adj
        else:
            self.adj = adj

        # convert knns to (dists, nbrs)
        self.dists, self.nbrs = knns2ordered_nbrs(knns)  # num_imgs * k

    print('feature shape: {}, k: {}, norm_feat: {}'.format(
        self.features.shape, self.k, self.is_norm_feat))

    if not self.ignore_label:
        with Timer('Prepare ground-truth label'):
            self.labels = confidence(feats=self.features,
                                     dists=self.dists,
                                     nbrs=self.nbrs,
                                     metric=self.conf_metric,
                                     idx2lb=self.idx2lb,
                                     lb2idxs=self.lb2idxs)
            if cfg.eval_interim:
                _, self.peaks = confidence_to_peaks(
                    self.dists, self.nbrs, self.labels, self.max_conn)
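# Hypothetical illustration of the knn array layout noted in the comments above: knns has
# shape (num_imgs, 2, k), where knns[:, 0, :] holds neighbor indices and knns[:, 1, :] holds
# the corresponding distances/similarities (knns2ordered_nbrs splits these apart).
import numpy as np

num_imgs, k = 4, 3
knns = np.zeros((num_imgs, 2, k), dtype=np.float32)
nbrs = knns[:, 0, :].astype(np.int64)   # neighbor ids, shape (num_imgs, k)
dists = knns[:, 1, :]                   # distances,    shape (num_imgs, k)
print(nbrs.shape, dists.shape)          # (4, 3) (4, 3)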
def train_gcn(model, cfg, logger):
    # prepare dataset
    for k, v in cfg.model['kwargs'].items():
        setattr(cfg.train_data, k, v)
    dataset = build_dataset(cfg.model['type'], cfg.train_data)
    pre_features = torch.FloatTensor(dataset.features)
    print('Have loaded the training data.')

    inst_num = dataset.inst_num
    feature_dim = dataset.feature_dim
    lb2idxs = dataset.lb2idxs
    center_fea = dataset.center_fea.astype('float32')
    cls_num, dim = center_fea.shape
    labels = torch.LongTensor(dataset.gt_labels)

    HEAD1 = HEAD(nhid=512)
    HEAD_test1 = HEAD_test(nhid=512)

    # load parameters from the pretrained model
    # model.load_state_dict(torch.load('./'))
    # HEAD1.load_state_dict(torch.load('./'), False)

    OPTIMIZER = optim.SGD([{'params': model.parameters(), 'weight_decay': 1e-5},
                           {'params': HEAD1.parameters(), 'weight_decay': 1e-5}],
                          lr=0.01, momentum=0.9)
    print('the learning rate is 0.01')

    # model.load_state_dict(torch.load(''))
    # HEAD1.load_state_dict(torch.load(''))
    print("Have loaded the pretrained model.")

    cfg.cuda = True
    model = model.cuda()
    HEAD1 = HEAD1.cuda()

    MODEL_ROOT = './src/train_model'
    print('the model save path is', MODEL_ROOT)

    # prepare the test data
    target = "part1_test"
    knn_path = "./data/knns/" + target + "/faiss_k_80.npz"
    knns = np.load(knn_path, allow_pickle=True)['data']
    inst_num = knns.shape[0]
    k_num = knns.shape[2]
    nbrs = knns[:, 0, :]
    pair_a = []
    pair_b = []
    for i in range(inst_num):
        pair_a.extend([i] * k_num)
        pair_b.extend(nbrs[i])

    for epoch in range(cfg.total_epochs):
        if epoch == cfg.STAGES[0]:
            # adjust the LR for each training stage after warm-up; you can also choose to
            # adjust the LR manually (with slight modification) once a plateau is observed
            schedule_lr(OPTIMIZER)
        if epoch == cfg.STAGES[1]:
            schedule_lr(OPTIMIZER)
        if epoch == cfg.STAGES[2]:
            schedule_lr(OPTIMIZER)

        model.train()
        HEAD1.train()

        index = faiss.IndexFlatIP(dim)
        index.add(center_fea)
        # search for the cfg.cluster_num + 200 nearest cluster centers
        sims, cluster_id = index.search(center_fea, k=(cfg.cluster_num + 200))
        # sims, cluster_id = index.search(center_fea, k=cfg.cluster_num)
        print('Have selected the cluster ids.')

        for batch in range(cls_num):
            # for batch in range(20):
            # 0. select ids
            sample_cluster_id = random.sample(list(cluster_id[batch]), cfg.cluster_num)
            # sample_cluster_id = list(cluster_id[batch])
            sample_id = []  # the idx of the samples in this batch
            for i in range(len(sample_cluster_id)):
                sample_id.extend(random.sample(lb2idxs[sample_cluster_id[i]],
                                               int(len(lb2idxs[sample_cluster_id[i]]) * 0.9)))
                # sample_id.extend(lb2idxs[sample_cluster_id[i]])
            # sample_id.sort()
            sample_num = len(sample_id)
            # id = list(np.arange(0, sample_num, 1))
            # sample2sort = dict(zip(sample_id, id))
            if (sample_num > 100000) | (sample_num < 100):
                print('[too many samples] continue.')
                continue

            # 1. create selected labels and images
            batch_labels = labels[sample_id]
            feature = pre_features[sample_id]
            print(sample_num)

            # 2. create knn for this batch
            with Timer('build knn:'):
                knn_prefix = os.path.join("./data/rebuild_knn")
                if not os.path.exists(knn_prefix):
                    os.makedirs(knn_prefix)
                if os.path.exists(os.path.join(knn_prefix, 'faiss_k_80.npz')):
                    os.remove(os.path.join(knn_prefix, 'faiss_k_80.npz'))
                if os.path.exists(os.path.join(knn_prefix, 'faiss_k_80.index')):
                    os.remove(os.path.join(knn_prefix, 'faiss_k_80.index'))
                knns = build_knns(knn_prefix,
                                  # l2norm(feature.clone().detach().cpu().numpy()),
                                  l2norm(feature.numpy()),
                                  "faiss",
                                  80,
                                  is_rebuild=True)
            batch_adj = fast_knns2spmat(knns, 80, 0, use_sim=True)
            batch_adj = build_symmetric_adj(batch_adj, self_loop=True)
            batch_adj = row_normalize(batch_adj)
            batch_adj = sparse_mx_to_torch_sparse_tensor(batch_adj, return_idx=False)

            # 3. put the selected features and labels on the GPU
            batch_labels = batch_labels.cuda()
            feature = feature.cuda()
            batch_adj = batch_adj.cuda()
            train_data = [feature, batch_adj, batch_labels]
            # x = model(train_data)

            # 4. train the model
            train_id_inst = batch_adj._indices().size()[1]
            # print('train_id_inst:', train_id_inst)
            # print('sample_num:', sample_num)
            # train_id_inst = sample_num
            rad_id = random.sample(range(0, train_id_inst), train_id_inst) + \
                     random.sample(range(0, train_id_inst), train_id_inst)
            patch_num = 40
            for i in range(patch_num * 2):
                id = rad_id[i * int(train_id_inst / patch_num):(i + 1) * int(train_id_inst / patch_num)]
                x = model(train_data)
                loss = HEAD1(x, train_data, id)
                OPTIMIZER.zero_grad()
                loss.backward()
                OPTIMIZER.step()
                print(datetime.datetime.now())
                print('epoch:{}/{}, batch:{}/{}, batch2:{}/{}, loss:{}'.format(
                    epoch, cfg.total_epochs, batch, cls_num, i, patch_num * 2, loss))

            if (batch + 1) % 100 == 0:
                if not os.path.exists(MODEL_ROOT):
                    os.makedirs(MODEL_ROOT)
                print('save model in epoch:{} batch:{} to {}'.format(epoch, batch, MODEL_ROOT))
                torch.save(model.state_dict(),
                           os.path.join(MODEL_ROOT, "Backbone_Epoch_{}_batch_{}.pth".format(epoch + 1, batch)))
                torch.save(HEAD1.state_dict(),
                           os.path.join(MODEL_ROOT, "Head_Epoch_{}_batch_{}.pth".format(epoch + 1, batch)))

            if (batch + 1) % 300 == 0:
                avg_acc = perform_val(model, HEAD1, HEAD_test1, cfg, feature_dim, pair_a, pair_b)
                print('the avg testing acc in epoch:{} batch:{} is:'.format(epoch, batch), avg_acc)
                model.train()
                HEAD1.train()

        # 5. test
        avg_acc = perform_val(model, HEAD1, HEAD_test1, cfg, feature_dim, pair_a, pair_b)
        print('the avg testing acc in epoch:{} batch:{} is:'.format(epoch, batch), avg_acc)

        # 6. save model
        if not os.path.exists(MODEL_ROOT):
            os.makedirs(MODEL_ROOT)
        print('save model in epoch:{} batch:{} to {}'.format(epoch, batch, MODEL_ROOT))
        torch.save(model.state_dict(),
                   os.path.join(MODEL_ROOT, "Backbone_Epoch_{}_batch_{}.pth".format(epoch + 1, batch)))
        torch.save(HEAD1.state_dict(),
                   os.path.join(MODEL_ROOT, "Head_Epoch_{}_batch_{}.pth".format(epoch + 1, batch)))
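# Standalone illustration (toy sizes) of the id-patching scheme in the training loop above:
# two shuffled passes over all edge ids are concatenated and then split into patch_num * 2
# chunks, so every edge id is visited twice per batch.
import random

train_id_inst, patch_num = 10, 2
rad_id = random.sample(range(0, train_id_inst), train_id_inst) + \
         random.sample(range(0, train_id_inst), train_id_inst)
chunk = int(train_id_inst / patch_num)
patches = [rad_id[i * chunk:(i + 1) * chunk] for i in range(patch_num * 2)]
print([len(p) for p in patches])  # [5, 5, 5, 5]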
def __init__(self, cfg):
    feat_path = cfg['feat_path']
    label_path = cfg.get('label_path', None)
    knn_graph_path = cfg.get('knn_graph_path', None)

    self.k = cfg['k']
    self.feature_dim = cfg['feature_dim']
    self.is_norm_feat = cfg.get('is_norm_feat', True)
    self.save_decomposed_adj = cfg.get('save_decomposed_adj', False)

    self.th_sim = cfg.get('th_sim', 0.)
    self.conf_metric = cfg.get('conf_metric')

    with Timer('read meta and feature'):
        if label_path is not None:
            self.lb2idxs, self.idx2lb = read_meta(label_path)
            self.inst_num = len(self.idx2lb)
            self.cls_num = len(self.lb2idxs)
            self.gt_labels = intdict2ndarray(self.idx2lb)
            self.ignore_label = False
        else:
            self.inst_num = -1
            self.ignore_label = True
        self.features = read_probs(feat_path, self.inst_num, self.feature_dim)

        if self.is_norm_feat:
            self.features = l2norm(self.features)
        if self.inst_num == -1:
            self.inst_num = self.features.shape[0]
        self.size = 1  # take the entire graph as input

    with Timer('Compute center feature'):
        self.center_fea = np.zeros((self.cls_num, self.features.shape[1]))
        for i in range(self.cls_num):
            self.center_fea[i] = np.mean(self.features[self.lb2idxs[i]], 0)
        self.center_fea = l2norm(self.center_fea)

    with Timer('read knn graph'):
        if knn_graph_path is not None and os.path.isfile(knn_graph_path):
            print("load knns from the knn_path")
            self.knns = np.load(knn_graph_path)['data']
        else:
            if knn_graph_path is not None:
                print('knn_graph_path does not exist: {}'.format(knn_graph_path))
            knn_prefix = os.path.join(cfg.prefix, 'knns', cfg.name)
            self.knns = build_knns(knn_prefix, self.features, cfg.knn_method, cfg.knn)

        adj = fast_knns2spmat(self.knns, self.k, self.th_sim, use_sim=True)

        # build symmetric adjacency matrix
        adj = build_symmetric_adj(adj, self_loop=True)
        # print('adj before norm')
        # print(adj)
        adj = row_normalize(adj)
        if self.save_decomposed_adj:
            adj = sparse_mx_to_indices_values(adj)
            self.adj_indices, self.adj_values, self.adj_shape = adj
        else:
            self.adj = adj

        # convert knns to (dists, nbrs)
        self.dists, self.nbrs = knns2ordered_nbrs(self.knns)

    print('feature shape: {}, k: {}, norm_feat: {}'.format(
        self.features.shape, self.k, self.is_norm_feat))
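# Standalone sketch of the class-center computation above, with a hypothetical l2norm helper
# (assumed to be plain row-wise L2 normalization, as elsewhere in these snippets).
import numpy as np

def l2norm(x):
    return x / np.maximum(np.linalg.norm(x, axis=1, keepdims=True), 1e-12)

features = np.random.rand(6, 4).astype(np.float32)
lb2idxs = {0: [0, 1, 2], 1: [3, 4, 5]}           # class id -> member indices
center_fea = np.zeros((len(lb2idxs), features.shape[1]))
for i in range(len(lb2idxs)):
    center_fea[i] = np.mean(features[lb2idxs[i]], 0)
center_fea = l2norm(center_fea)
print(center_fea.shape)  # (2, 4)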