def construct_graph(p_p_g, a_a_g, p_a_g):
    """Assemble a DGL heterograph over 'paper' and 'author' node types.

    Args:
        p_p_g: Object exposing ``edge_index`` for paper-paper edges.
        a_a_g: Object exposing ``edge_index`` for author-author edges.
        p_a_g: Object exposing ``edge_index`` for paper-author edges.

    Returns:
        The result of ``dgl.hetero_from_relations`` over the four relation
        graphs 'aa', 'ap', 'pa' and 'pp'.
    """
    # Same-type edges: sort, symmetrise, then drop self loops.
    pp = utils.remove_self_loops(
        utils.to_undirected(utils.sort_edge_index(p_p_g.edge_index)[0]))[0]
    aa = utils.remove_self_loops(
        utils.to_undirected(utils.sort_edge_index(a_a_g.edge_index)[0]))[0]
    # Cross-type edges are only sorted and stripped of self loops.
    pa = utils.remove_self_loops(
        utils.sort_edge_index(p_a_g.edge_index)[0])[0]

    g_pp = dgl.graph((pp[0], pp[1]), 'paper', 'pp')
    g_aa = dgl.graph((aa[0], aa[1]), 'author', 'aa')

    # The bipartite graphs take their node counts from the homogeneous ones.
    n_paper = g_pp.number_of_nodes()
    n_author = g_aa.number_of_nodes()

    g_pa = dgl.bipartite(
        (pa[0], pa[1]), 'paper', 'pa', 'author',
        num_nodes=(n_paper, n_author))
    # Reverse relation: same edges with source and destination swapped.
    g_ap = dgl.bipartite(
        (pa[1], pa[0]), 'author', 'ap', 'paper',
        num_nodes=(n_author, n_paper))

    return dgl.hetero_from_relations([g_aa, g_ap, g_pa, g_pp])
def test_sort_edge_index():
    """Check ``sort_edge_index`` with and without edge attributes."""
    expected_index = [[0, 1, 1, 2], [1, 0, 2, 1]]
    expected_attr = [[4], [3], [2], [1]]

    edge_index = torch.tensor([[2, 1, 1, 0], [1, 2, 0, 1]])
    edge_attr = torch.tensor([[1], [2], [3], [4]])

    # Without attributes the call result is used directly as the index.
    index_only = sort_edge_index(edge_index)
    assert index_only.tolist() == expected_index

    # With attributes the result is indexed as (edge_index, edge_attr).
    with_attr = sort_edge_index(edge_index, edge_attr)
    assert with_attr[0].tolist() == expected_index
    assert with_attr[1].tolist() == expected_attr
def process_graph(self, triple_path, feature_path, embeddings):
    """Load a triple file plus per-entity text and build padded features.

    Args:
        triple_path: Path handed to ``read_txt_array`` (tab separated, long
            dtype); its transpose is unpacked as (subject, relation, object).
        feature_path: Text file with one ``<id>\\t<text>`` record per line.
        embeddings: Mapping from lower-cased token to a tensor; must contain
            the key ``'**UNK**'``, which is used for unknown tokens.

    Returns:
        Tuple ``(x, edge_index, rel, assoc)``: the padded feature tensor,
        the sorted edge index in remapped ids, the relations permuted along
        with it, and the raw-id -> dense-position lookup (``-1`` where an
        id never appeared in ``feature_path``).
    """
    triples = read_txt_array(triple_path, sep='\t', dtype=torch.long)
    subj, rel, obj = triples.t()

    features_by_id = {}
    with open(feature_path, 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) != 2:
                # Records without exactly two fields get a placeholder text.
                fields = fields + ['**UNK**']
            vectors = []
            for token in fields[1].lower().split():
                vectors.append(embeddings.get(token, embeddings['**UNK**']))
            features_by_id[int(fields[0])] = torch.stack(vectors, dim=0)

    idx = torch.tensor(list(features_by_id.keys()))
    num_entities = idx.size(0)

    # Dense remapping: raw id -> position in `idx`.
    assoc = torch.full((idx.max().item() + 1, ), -1, dtype=torch.long)
    assoc[idx] = torch.arange(num_entities)

    edge_index = torch.stack([assoc[subj], assoc[obj]], dim=0)
    edge_index, rel = sort_edge_index(edge_index, rel)

    ordered = [None] * num_entities
    for raw_id, feats in features_by_id.items():
        ordered[assoc[raw_id]] = feats
    x = torch.nn.utils.rnn.pad_sequence(ordered, batch_first=True)

    return x, edge_index, rel, assoc
def augment_adj(self, edge_index, edge_weight, num_nodes):
    """Return the edge index of the adjacency multiplied by itself.

    The (index, weight) pair is sorted and then passed as both operands of
    ``spspmm``. Only the resulting edge index is returned, moved to the
    module-level ``device``; the resulting weights are discarded.
    """
    sorted_index, sorted_weight = sort_edge_index(
        edge_index, edge_weight, num_nodes)
    product_index, _ = spspmm(
        sorted_index, sorted_weight,
        sorted_index, sorted_weight,
        num_nodes, num_nodes, num_nodes)
    return product_index.to(device)
def process_graph(self, triple_path, feature_path):
    """Load a triple file plus per-entity names.

    Args:
        triple_path: Path handed to ``read_txt_array`` (tab separated, long
            dtype); its transpose is unpacked as (subject, relation, object).
        feature_path: Text file with one ``<id>\\t<name>`` record per line.

    Returns:
        Tuple ``(edge_index, rel, assoc, names)``: the sorted edge index in
        remapped ids, the relations permuted along with it, the raw-id ->
        dense-position lookup (``-1`` where an id never appeared), and the
        list of names ordered by dense position.
    """
    triples = read_txt_array(triple_path, sep='\t', dtype=torch.long)
    subj, rel, obj = triples.t()

    name_by_id = {}
    with open(feature_path, 'r') as f:
        for line in f:
            fields = line.strip().split('\t')
            if len(fields) != 2:
                # Records without exactly two fields get an empty name.
                fields = fields + ['']
            # A name that is empty after punctuation removal becomes '<unk>'.
            cleaned = remove_punc(fields[1]).strip() or '<unk>'
            name_by_id[int(fields[0])] = cleaned

    idx = torch.tensor(list(name_by_id.keys()))
    num_entities = idx.size(0)

    # Dense remapping: raw id -> position in `idx`.
    assoc = torch.full((idx.max().item() + 1,), -1, dtype=torch.long)
    assoc[idx] = torch.arange(num_entities)

    edge_index = torch.stack([assoc[subj], assoc[obj]], dim=0)
    edge_index, rel = sort_edge_index(edge_index, rel)

    names = [None] * num_entities
    for raw_id, entity_name in name_by_id.items():
        names[assoc[raw_id]] = entity_name

    return edge_index, rel, assoc, names
def augment_adj(self, edge_index, edge_weight, num_nodes):
    """Coalesce and sort the edges, then multiply the adjacency by itself.

    Returns:
        The ``(edge_index, edge_weight)`` pair produced by ``spspmm`` when
        the coalesced, sorted pair is passed as both operands.
    """
    merged_index, merged_weight = coalesce(
        edge_index, edge_weight, num_nodes, num_nodes)
    sorted_index, sorted_weight = sort_edge_index(
        merged_index, merged_weight, num_nodes)
    return spspmm(
        sorted_index, sorted_weight,
        sorted_index, sorted_weight,
        num_nodes, num_nodes, num_nodes)
def augment_adj(self, edge_index, edge_weight, num_nodes):
    """Multiply the self-looped adjacency by itself, then drop self loops.

    Self loops are added before the ``spspmm`` product and removed from its
    result.

    Returns:
        The ``(edge_index, edge_weight)`` pair returned by
        ``remove_self_loops``.
    """
    looped_index, looped_weight = add_self_loops(
        edge_index, edge_weight, num_nodes=num_nodes)
    sorted_index, sorted_weight = sort_edge_index(
        looped_index, looped_weight, num_nodes)
    product_index, product_weight = spspmm(
        sorted_index, sorted_weight,
        sorted_index, sorted_weight,
        num_nodes, num_nodes, num_nodes)
    return remove_self_loops(product_index, product_weight)
def is_undirected(
    edge_index: Tensor,
    edge_attr: Optional[Union[Tensor, List[Tensor]]] = None,
    num_nodes: Optional[int] = None,
) -> bool:
    r"""Returns :obj:`True` if the graph given by :attr:`edge_index` is
    undirected.

    The edges are sorted once by row and once by column; the graph is
    reported as undirected when the two orderings are transposes of each
    other and every edge attribute is identical under both orderings.

    Args:
        edge_index (LongTensor): The edge indices.
        edge_attr (Tensor or List[Tensor], optional): Edge weights or multi-
            dimensional edge features.
            If given as a list, will check for equivalence in all its
            entries. (default: :obj:`None`)
        num_nodes (int, optional): The number of nodes, *i.e.*
            :obj:`max_val + 1` of :attr:`edge_index`. (default: :obj:`None`)

    :rtype: bool
    """
    num_nodes = maybe_num_nodes(edge_index, num_nodes)

    # Normalise the attributes to a (possibly empty) list of tensors.
    if edge_attr is None:
        attrs = []
    elif isinstance(edge_attr, Tensor):
        attrs = [edge_attr]
    else:
        attrs = edge_attr

    by_row_index, by_row_attrs = sort_edge_index(
        edge_index,
        attrs,
        num_nodes=num_nodes,
        sort_by_row=True,
    )
    by_col_index, by_col_attrs = sort_edge_index(
        by_row_index,
        by_row_attrs,
        num_nodes=num_nodes,
        sort_by_row=False,
    )

    # Row-sorted sources must equal column-sorted targets, and vice versa.
    if not bool(torch.all(by_row_index[0] == by_col_index[1])):
        return False
    if not bool(torch.all(by_row_index[1] == by_col_index[0])):
        return False

    return all([
        torch.all(attr == attr_t)
        for attr, attr_t in zip(by_row_attrs, by_col_attrs)
    ])
def get_agreement_dist(edge_index: torch.Tensor, y: torch.Tensor, with_self_loops=True, return_agree_dist_sum=False, epsilon=1e-11) -> List[torch.Tensor] or (List, List):
    """
    Build, for every node, a normalised label-agreement distribution over
    the sources of its incoming edges.

    :param edge_index: tensor the shape of which is [2, E]
    :param y: tensor the shape of which is [N] (squeezed on entry); a
        per-node label with more than zero dimensions is handled as the
        multi-label case.
    :param with_self_loops: if True, existing self-loops are removed and
        one self-loop per node is added before computing the distributions.
    :param return_agree_dist_sum: whether to additionally return, per node,
        the sum of the raw (un-normalised) agreement values.
    :param epsilon: small float written in place of zero entries before
        normalisation, for stability.
    :return: Tensor list L the length of which is N.
        L[i] = tensor([..., a(y_j, y_i), ...]) for every edge e_{ji} in E
        - if at least one neighbour agrees, agreeing entries carry (almost
          all of) the mass and the remaining entries carry epsilon, with
          the whole vector normalised to sum to one;
        - if the integer-truncated raw sum is zero, L[i] is uniform over
          the neighbours.
        When ``return_agree_dist_sum`` is True, a second list with the raw
        sums (Python floats) is returned as well.
    """
    y = y.squeeze()
    num_nodes = y.size(0)
    # Add self-loops and sort by index
    if with_self_loops:
        edge_index, _ = remove_self_loops(edge_index)
        edge_index, _ = add_self_loops(edge_index, num_nodes=num_nodes)  # [2, E + N]
    edge_index, _ = sort_edge_index(edge_index, num_nodes=num_nodes)
    agree_dist_list = []
    agree_dist_sum_list = []
    for node_idx, label in enumerate(tqdm(y)):
        # Select the columns whose target is this node; unpacking the
        # [2, k] slice yields the source row and the (unused) target row.
        neighbors, _ = edge_index[:, edge_index[1] == node_idx]
        y_neighbors = y[neighbors]
        if len(label.size()) == 0:
            agree_dist = (y_neighbors == label).float()
        else:  # multi-label case
            agree_dist = (y_neighbors * label).float().sum(dim=1)
        if return_agree_dist_sum:
            agree_dist_sum_list.append(agree_dist.sum().item())
        if int(agree_dist.sum()) != 0:
            agree_dist[agree_dist == 0] = epsilon  # For KLD
            agree_dist = agree_dist / agree_dist.sum()
        else:
            # No agreement at all: fall back to a uniform distribution.
            agree_dist[:] = 1.0
            agree_dist = agree_dist / agree_dist.sum()
        agree_dist_list.append(agree_dist)
    if not return_agree_dist_sum:
        return agree_dist_list  # [N, #neighbors]
    else:
        return agree_dist_list, agree_dist_sum_list  # [N, #neighbors], [N]
def __call__(self, data: Union[Data, HeteroData]):
    """Attach a transposed ``SparseTensor`` (``adj_t``) to every edge store.

    Stores without an ``edge_index`` are skipped. For the others, the edge
    index and all edge-level attributes are sorted with
    ``sort_by_row=False`` and written back before ``adj_t`` is built from
    the swapped index rows.

    Returns:
        The same ``data`` object, modified in place.
    """
    for store in data.edge_stores:
        if 'edge_index' not in store:
            continue

        # Collect every edge-level attribute so it is permuted together
        # with the edge index.
        edge_level = [
            (name, tensor) for name, tensor in store.items()
            if name != 'edge_index' and store.is_edge_attr(name)
        ]
        attr_names = [name for name, _ in edge_level]
        attr_values = [tensor for _, tensor in edge_level]

        store.edge_index, attr_values = sort_edge_index(
            store.edge_index, attr_values, sort_by_row=False)
        for name, tensor in zip(attr_names, attr_values):
            store[name] = tensor

        # Use the configured attribute as values only when it is present.
        if self.attr is None or self.attr not in store:
            adj_value = None
        else:
            adj_value = store[self.attr]

        store.adj_t = SparseTensor(
            row=store.edge_index[1],
            col=store.edge_index[0],
            value=adj_value,
            sparse_sizes=store.size()[::-1],
            is_sorted=True,
            trust_data=True)

        if self.remove_edge_index:
            del store['edge_index']
            if self.attr is not None and self.attr in store:
                del store[self.attr]

        if self.fill_cache:
            # Pre-process some important attributes.
            store.adj_t.storage.rowptr()
            store.adj_t.storage.csr2csc()

    return data
def count_motifs(data, is_direct):
    """Run the external ``./orca`` binary on the graph and return its output.

    The sorted edge list is written to ``graph<...>.in`` in the current
    directory, with the header row holding ``data.x.shape[0]`` and the
    number of edges. ``./orca 4 <in> <out>`` is then invoked and the
    resulting ``.out`` file is parsed as space-separated values.

    Args:
        data: Object exposing ``edge_index`` (shape ``[2, E]``) and ``x``.
        is_direct: When equal to ``True`` the edge index is passed through
            ``to_undirected`` first.

    Returns:
        A NumPy array with the parsed output, or ``None`` when the graph has
        more than 500000 edges or when writing, running or reading fails.
    """
    if data.edge_index.shape[1] > 500000:
        return None

    # Explicit comparison kept so non-bool flags behave exactly as before.
    if is_direct == True:  # noqa: E712
        edge_index = to_undirected(data.edge_index)
    else:
        edge_index = data.edge_index.clone()
    edge_index, _ = sort_edge_index(edge_index)
    edge_index = edge_index.numpy()

    k = pd.DataFrame(edge_index.T).reset_index(drop=True)
    # The column labels become the "<num_nodes> <num_edges>" header line.
    k.rename(columns={0: data.x.shape[0], 1: edge_index.shape[1]},
             inplace=True)

    try:
        name = 'graph' + str(data.x.shape[0]) + str(data.x.shape[1]) + str(
            data.edge_index.shape[0]) + str(data.edge_index.shape[1])
        name1 = name + '.in'
        name2 = name + '.out'
        path = './' + name + '.in'
        k.to_csv(path, sep=' ', index=False)
        os.system('./orca 4 ' + name1 + ' ' + name2)
        k = pd.read_csv('./' + name2, sep=' ', header=None)
    except Exception:
        # Best effort: any failure yields None. KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        return None
    return k.to_numpy()
def forward(self, x, edge_index, train_mask, is_debug=False):
    """Propagate ``x`` for ``self.K`` steps and blend it with its input.

    Args:
        x: Input tensor; a softmax over its last dimension is used as the
            class distribution.
        edge_index: Edge index passed to ``compute_transP``, ``degree`` and
            ``propagate``.
        train_mask: Not used in this method body.
        is_debug: When True, intermediate tensors are collected and returned.

    Returns:
        Tuple ``(x, entropy, debug_tensor)`` where ``debug_tensor`` is
        ``None`` unless ``is_debug`` is set, in which case it is the list
        ``[sort_edge_index(edge_index, transP), cd, sum_pipj, gamma]``.
    """
    # Step 1: Class Distribution & Entropy Regularization
    cd = F.softmax(x, dim=-1)
    EPS = 1e-15  # keeps log() finite where a class probability is zero
    entropy = -(cd * torch.log(cd + EPS)).sum(dim=-1)
    # Step 2: Compute a transition matrix: transP
    transP, sum_pipj = self.compute_transP(cd, edge_index)
    # Step 3: gamma
    with torch.no_grad():
        deg = degree(edge_index[0])
        deg[deg == 0] = 1  # avoid division by zero below
        cont_i = sum_pipj / deg
        gamma = self.beta + (1 - self.beta) * cont_i
    # Step 4: Aggregate features
    x = F.dropout(x, p=self.dropout, training=self.training)
    H = x  # dropped-out input, kept for the final blend
    for k in range(self.K):
        x = self.propagate(edge_index, x=x, transP=transP)
    # Per-row convex combination of the input and the propagated features.
    x = (1 - gamma.unsqueeze(dim=-1)) * H + gamma.unsqueeze(dim=-1) * x
    if is_debug:
        debug_tensor = []
        with torch.no_grad():
            debug_tensor.append(sort_edge_index(edge_index, transP))
            debug_tensor.append(cd)
            debug_tensor.append(sum_pipj)
            debug_tensor.append(gamma)
    else:
        debug_tensor = None
    return x, entropy, debug_tensor
def add_direct_edge(data):
    """Run the external ``./edge`` binary on the weighted edge list.

    The sorted edge list and its weights are written to ``edge<...>.in`` in
    the current directory, with the header row
    ``<data.x.shape[0]> <num_edges> 1``. ``./edge 4 <in> <out>`` is then
    invoked and the resulting ``.out`` file is parsed as space-separated
    values.

    Args:
        data: Object exposing ``to``, ``edge_index``, ``edge_weight`` and
            ``x``; it is moved to the CPU first.

    Returns:
        ``(k[:, 0:2], k[:, 2])`` taken from the parsed output (presumably
        the resulting edges and weights; confirm against the ``./edge``
        tool), or ``(None, None)`` when writing, running or reading fails.
    """
    data = data.to('cpu')

    # Sort the weights together with the edges. Sorting only the index
    # would pair each sorted edge with the weight of a different edge.
    edge_index, edge_weight = sort_edge_index(
        data.edge_index.detach().clone(), data.edge_weight.detach())
    edge_index = edge_index.numpy()

    k = pd.DataFrame(edge_index.T).reset_index(drop=True)
    # The column labels become the header line of the input file.
    k.rename(columns={0: data.x.shape[0], 1: edge_index.shape[1]},
             inplace=True)
    k['1'] = edge_weight.numpy()

    try:
        name = 'edge' + str(data.x.shape[0]) + str(data.x.shape[1]) + str(
            data.edge_index.shape[0]) + str(data.edge_index.shape[1])
        name1 = name + '.in'
        name2 = name + '.out'
        path = './' + name + '.in'
        k.to_csv(path, sep=' ', index=False)
        os.system('./edge 4 ' + name1 + ' ' + name2)
        k = pd.read_csv('./' + name2, sep=' ', header=None)
    except Exception:
        # Best effort: any failure yields (None, None). KeyboardInterrupt
        # and SystemExit are no longer swallowed.
        return None, None

    k = k.to_numpy()
    return k[:, 0:2], k[:, 2]
def generate_pyg_data(self, data):
    """Convert the raw dataset dictionary into a ``Data`` graph object.

    Reads ``data['fea_table']``, ``data['edge_file']``,
    ``data['train_indices']``, ``data['train_label']`` and
    ``data['test_indices']``. Builds node features (with a best-effort
    fallback if feature construction raises), the sorted edge index and
    weights, the label vector and the train/valid/test masks.

    Side effects: sets ``self._num_nodes``, ``self.y_train``,
    ``self._origin_graph_data_indices``, ``self._train_indices``,
    ``self._train_mask``, ``self._test_mask``, ``self._sampler`` and, when
    ``self.config.use_valid`` is set, ``self._valid_indices`` and
    ``self._valid_mask``.

    Returns:
        The populated ``Data`` object.
    """
    # get x feature table
    x = data['fea_table'].copy()
    df = data['edge_file']
    edges = df[['src_idx', 'dst_idx', 'edge_weight']]

    # get indices first
    train_indices = data['train_indices']
    if self.config.use_valid:
        train_indices, valid_indices = train_test_split(train_indices, test_size=0.2, shuffle=False)

    try:
        if x.shape[1] == 1:  # 0-dimensional feature
            x = x.set_index(keys="node_index")
            x = feat_engineering(
                x,
                edges=edges,
                num_nodes=self.metadata["n_node"].iloc[0]
            )
        else:
            x_feat = x.drop('node_index', axis=1).to_numpy()
            conf_name = self.config.filename.split("/")[-1].split(".")[0]
            is_only_one_zero = not ((x_feat != 0) & (x_feat != 1)).any()
            logger.info("use {} config".format(conf_name))
            logger.info(
                "feature only contains zero: {}, only one and zero: {}".format((x_feat == 0).all(), is_only_one_zero))

            if conf_name in self.citation_configs:  # Judge whether it is a citation graph
                if is_only_one_zero:
                    logger.info("Normalize features")
                    normal_feat = feat_row_sum_inv_normalize(x_feat)
                    normal_df = pd.DataFrame(data=normal_feat)
                    normal_df["node_index"] = x["node_index"]
                    x = normal_df

                pre_feat = prepredict(data, train_indices=train_indices, use_valid=self.config.use_valid, use_ohe=False)
                x = x.set_index(keys="node_index")
                x_index = x.index.tolist()
                lpa_preds, lpa_train_acc = lpa_predict(data, n_class=self._n_class, train_indices=train_indices, use_valid=self.config.use_valid)
                # Label-propagation predictions are appended only when they
                # fit the training labels well.
                if not np.isnan(lpa_train_acc) and lpa_train_acc > 0.8:
                    logger.info("Use LPA predicts")
                    x = pd.concat([x, pre_feat, lpa_preds], axis=1).values[x_index]
                else:
                    x = pd.concat([x, pre_feat], axis=1).values[x_index]
            else:
                x = x.set_index(keys="node_index")
                x = feat_engineering(
                    x,
                    edges=edges,
                    num_nodes=self.metadata["n_node"].iloc[0]
                )
    except Exception as e:
        # Best-effort fallback: use whatever table `x` currently is, or a
        # zero matrix when it has no feature columns.
        logger.error(e)
        if x.shape[1] == 0:
            # Builtin `float` replaces the removed `np.float` alias.
            x = np.zeros((x.shape[0], 64), dtype=float)
        else:
            x = x.to_numpy()

    logger.info("x shape: {}".format(x.shape))
    node_index = torch.tensor(data['fea_table']['node_index'].to_numpy(), dtype=torch.long)
    x = torch.tensor(x, dtype=torch.float)

    # get edge_index, edge_weight
    edges = edges.to_numpy()
    # Builtin `int` replaces the removed `np.int` alias.
    edge_index = edges[:, :2].astype(int)
    # transpose from [edge_num, 2] to [2, edge_num] which is required by PyG
    edge_index = torch.tensor(edge_index, dtype=torch.long).transpose(0, 1)
    edge_weight = edges[:, 2]
    edge_weight = torch.tensor(edge_weight, dtype=torch.float32)
    undirected = gtils.is_undirected(edge_index)

    edge_index, edge_weight = gtils.sort_edge_index(edge_index, edge_weight)
    logger.info(f"is undirected ? {undirected}")
    logger.info(f"edge index {edge_index.shape}, edge weight {edge_weight.shape}")

    # get train/test mask
    num_nodes = x.size(0)
    self._num_nodes = num_nodes
    y = torch.zeros(num_nodes, dtype=torch.long)
    inds = data['train_label'][['node_index']].to_numpy()
    train_y = data['train_label'][['label']].to_numpy()
    self.y_train = train_y
    y[inds] = torch.tensor(train_y, dtype=torch.long)

    self._origin_graph_data_indices = copy.deepcopy(data['train_indices'])
    if self.config.use_valid:
        # Restrict the stored training labels to the reduced training split.
        self.y_train = data['train_label'].set_index('node_index').loc[train_indices][['label']].to_numpy()
    test_indices = data['test_indices']

    data = Data(x=x, node_index=node_index, edge_index=edge_index, y=y, edge_weight=edge_weight)
    data.num_nodes = num_nodes

    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    train_mask[train_indices] = 1
    data.train_indices = np.asarray(train_indices)
    data.train_mask = train_mask
    self._train_indices = np.asarray(train_indices)
    self._train_mask = train_mask

    if self.config.use_valid:
        valid_mask = torch.zeros(num_nodes, dtype=torch.bool)
        valid_mask[valid_indices] = 1
        data.valid_indices = valid_indices
        data.valid_mask = valid_mask
        self._valid_indices = valid_indices
        self._valid_mask = valid_mask

    # Builtin `bool` replaces the removed `np.bool` alias.
    self._test_mask = np.zeros(num_nodes, dtype=bool)
    self._test_mask[test_indices] = True
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask[test_indices] = 1
    data.test_mask = test_mask
    data.test_indices = np.asarray(test_indices)

    self._sampler = Sampler(data, self.metadata["n_edge"].iloc[0], self.device)

    return data
def process(self):
    """Build one graph per sample from the JSON dumps and save them.

    Reads ``objects.json`` and ``relationships.json`` under ``self.root``,
    assigns every first object name an integer id, embeds those ids with a
    freshly created ``nn.Embedding``, and turns each of the first 5000
    samples that has at least one object into a ``Data`` graph. The
    collated result is saved to ``self.processed_paths[0]``.
    """
    graphs = []

    with open(self.root + '/objects.json', 'r') as f:
        raw_objects = f.read()
    with open(self.root + '/relationships.json', 'r') as f:
        raw_links = f.read()
    object_entry = json.loads(raw_objects)
    link_entry = json.loads(raw_links)

    num_samples = 5000
    feature_size = 500

    # First pass: vocabulary over the primary name of every object. The
    # next free id always equals the current vocabulary size.
    word_dict = {}
    for sample_idx in tqdm(range(num_samples)):
        for obj in object_entry[sample_idx]["objects"]:
            word_dict.setdefault(obj["names"][0], len(word_dict))

    embeds = nn.Embedding(len(word_dict), feature_size)
    eb = embeds(Variable(torch.arange(0, len(word_dict)).long()))

    # Second pass: one graph per non-empty sample.
    for sample_idx in tqdm(range(num_samples)):
        objs = object_entry[sample_idx]["objects"]
        if not objs:
            continue

        local_id = {}
        node_features = []
        for position, obj in enumerate(objs):
            node_features.append(eb[word_dict[obj["names"][0]]])
            local_id[obj["object_id"]] = position
            # Merged ids point at the same node as the object itself.
            for merged in obj["merged_object_ids"]:
                local_id[merged] = position

        x = torch.stack(node_features)
        print()
        print(x.shape)

        sources, targets = [], []
        for rel in link_entry[sample_idx]["relationships"]:
            object_id = rel["object"]["object_id"]
            subject_id = rel["subject"]["object_id"]
            if object_id in local_id and subject_id in local_id:
                sources.append(local_id[object_id])
                targets.append(local_id[subject_id])

        edge_index = torch.tensor([sources, targets], dtype=torch.long)
        edge_index, _ = sort_edge_index(edge_index, None, x.shape[0])
        if sources:
            edge_index, _ = coalesce(edge_index, None, x.shape[0], x.shape[0])
        print(edge_index.shape)
        print()

        graphs.append(Data(x=x, edge_index=edge_index))

    print(len(word_dict))
    data, slices = self.collate(graphs)
    torch.save((data, slices), self.processed_paths[0])