def _find_exclude_eids_with_reverse_id(g, eids, reverse_eid_map):
    """Extend a set of edge IDs with the IDs looked up in a reverse-edge map.

    Parameters
    ----------
    g : graph
        Only used to canonicalize edge-type keys via ``g.to_canonical_etype``
        when ``eids`` is a mapping.
    eids : Tensor or Mapping
        Either a single ID tensor, or a mapping from edge type to ID tensor.
    reverse_eid_map : Tensor or Mapping
        Row-indexed by edge ID through ``F.gather_row``.  When ``eids`` is a
        mapping it is indexed by canonical edge type first.

    Returns
    -------
    Tensor or dict
        The input IDs followed by the gathered rows, concatenated along
        axis 0; a dict keyed by canonical edge type when ``eids`` is a mapping.
    """
    # Homogeneous case: a single tensor of edge IDs.
    if not isinstance(eids, Mapping):
        gathered = F.gather_row(reverse_eid_map, eids)
        return F.cat([eids, gathered], 0)

    # Heterogeneous case: canonicalize every key first, then extend each
    # per-type ID tensor with its gathered counterpart.
    canonical_eids = {}
    for etype, ids in eids.items():
        canonical_eids[g.to_canonical_etype(etype)] = ids

    exclude_eids = {}
    for etype, ids in canonical_eids.items():
        gathered = F.gather_row(reverse_eid_map[etype], ids)
        exclude_eids[etype] = F.cat([ids, gathered], 0)
    return exclude_eids
def check_topk_score2(score_model, exclude_mode):
    """Build a small synthetic graph and hand it to ``_check_topk_score2``.

    The graph has 40 nodes.  Every node gets one edge to its successor and
    one to its predecessor (with wrap-around at both ends), and that edge
    set is repeated once per relation type.  The relation type of each edge
    is stored in ``g.edata['tid']``.
    """
    num_entity = 40
    num_rels = 4

    nodes = F.arange(0, num_entity)
    # Successor of every node; the last node wraps around to node 0.
    successors = nodes + 1
    successors[num_entity - 1] = 0
    # Predecessor of every node; node 0 wraps around to the last node.
    predecessors = nodes - 1
    predecessors[0] = num_entity - 1

    src = F.cat([nodes, nodes], dim=0)
    dst = F.cat([successors, predecessors], dim=0)

    # Repeat the same edge list once per relation type.
    src = F.cat([src] * num_rels, dim=0)
    dst = F.cat([dst] * num_rels, dim=0)
    etype = F.cat(
        [th.full((num_entity * 2,), rel, dtype=th.long)
         for rel in range(num_rels)],
        dim=0)

    g = dgl.graph((src, dst))
    g.edata['tid'] = etype

    _check_topk_score2(score_model, g, num_entity, num_rels, exclude_mode)
def _merge_msg(self, msg_list):
    """Concatenate the payloads of several messages into one matrix.

    Parameters
    ----------
    msg_list : list
        A list of KVStoreMsg.  It is sorted IN PLACE with
        ``self._sort_func`` as the key before the payloads are merged.

    Return
    ------
    tensor (mx.ndarray or torch.tensor)
        The ``data`` fields of the sorted messages, concatenated along
        axis 0.
    """
    # The in-place sort is deliberate: callers observe the reordered list.
    msg_list.sort(key=self._sort_func)
    payloads = []
    for msg in msg_list:
        payloads.append(msg.data)
    return F.cat(payloads, 0)
def pull_model(self, client, pos_g, neg_g):
    """Refresh local embeddings with the rows pulled from ``client``.

    The unique node IDs of ``pos_g`` and ``neg_g`` and the unique edge IDs
    of ``pos_g`` are collected; the matching rows of ``"entity_emb"`` and
    ``"relation_emb"`` are pulled through ``client.pull`` and written into
    ``self.entity_emb.emb`` / ``self.relation_emb.emb``.  Entity IDs are
    translated with ``client.get_local2global()`` before pulling; relation
    IDs are sent unchanged.  Everything runs under ``th.no_grad()``.
    """
    with th.no_grad():
        all_entity_ids = F.cat(
            seq=[pos_g.ndata["id"], neg_g.ndata["id"]], dim=0)
        # De-duplicate through numpy so every row is pulled only once.
        entity_id = F.tensor(np.unique(F.asnumpy(all_entity_ids)))
        relation_id = F.tensor(np.unique(F.asnumpy(pos_g.edata["id"])))

        local_to_global = client.get_local2global()
        entity_data = client.pull(
            name="entity_emb", id_tensor=local_to_global[entity_id])
        relation_data = client.pull(
            name="relation_emb", id_tensor=relation_id)

        # The local tables are indexed by the local (untranslated) IDs.
        self.entity_emb.emb[entity_id] = entity_data
        self.relation_emb.emb[relation_id] = relation_data
def segmented_knn_graph(x, k, segs):
    """Build a k-nearest-neighbor graph separately for each segment of ``x``.

    ``x`` holds several point sets stacked along the first axis, and
    ``segs`` gives the size of each set.  Within every set, each point is
    connected to its k nearest neighbors (by pairwise squared distance);
    the neighbors become the predecessors of the point.  The per-set graphs
    are returned as one graph over all points.

    Parameters
    ----------
    x : Tensor
        The input tensor.
    k : int
        The number of neighbors
    segs : iterable of int
        Number of points of each point set.
        Must sum up to the number of rows in ``x``.

    Returns
    -------
    DGLGraph
        The graph.  The node IDs are in the same order as ``x``.
    """
    n_total_points, _ = F.shape(x)
    # offset[i] is the row index in ``x`` where segment ``i`` starts.
    offset = np.insert(np.cumsum(segs), 0, 0)

    neighbor_blocks = []
    for seg_idx, seg_points in enumerate(F.split(x, segs, 0)):
        distances = pairwise_squared_distance(seg_points)
        # Indices of the k smallest distances per row, shifted from
        # segment-local numbering to global row numbering.
        nearest = F.argtopk(distances, k, 1, descending=False)
        neighbor_blocks.append(nearest + offset[seg_idx])
    dst = F.cat(neighbor_blocks, 0)

    # Every point ID repeated k times, lined up with its k neighbors.
    src = F.arange(0, n_total_points).unsqueeze(1).expand(n_total_points, k)

    dst = F.reshape(dst, (-1, ))
    src = F.reshape(src, (-1, ))
    # NOTE(review): the original code carried a "!!! fix shape" marker on the
    # adjacency construction below -- intent unclear, confirm with the author.
    edge_values = F.asnumpy(F.zeros_like(dst) + 1)
    adj = sparse.csr_matrix(
        (edge_values, (F.asnumpy(dst), F.asnumpy(src))),
        shape=(n_total_points, n_total_points))

    return DGLGraph(adj, readonly=True)
def predict_neg_score(self, pos_g, neg_g, to_device=None, gpu_id=-1,
                      trace=False, neg_deg_sample=False):
    """Calculate the negative score.

    Parameters
    ----------
    pos_g : DGLGraph
        Graph holding positive edges.
    neg_g : DGLGraph
        Graph holding negative edges.
    to_device : func
        Function to move data into device.
    gpu_id : int
        Which gpu to move data to.
    trace : bool
        If True, trace the computation. This is required in training.
        If False, do not trace the computation.
        Default: False
    neg_deg_sample : bool
        If True, we use the head and tail nodes of the positive edges to
        construct negative edges.  As a side effect
        ``neg_g.neg_sample_size`` is overwritten with the enlarged size.
        Default: False

    Returns
    -------
    tensor
        The negative score
    """
    num_chunks = neg_g.num_chunks
    chunk_size = neg_g.chunk_size
    neg_sample_size = neg_g.neg_sample_size

    # Sized for the enlarged (neg_deg_sample) layout; only used in that mode.
    mask_shape = (num_chunks, chunk_size * (neg_sample_size + chunk_size))
    mask = F.ones(mask_shape, dtype=F.float32,
                  ctx=F.context(pos_g.ndata['emb']))
    move_ids = to_device is not None and gpu_id >= 0

    if neg_g.neg_head:
        # Corrupt the head side: keep tails/relations of the positive edges.
        neg_head_ids = neg_g.ndata['id'][neg_g.head_nid]
        neg_head = self.entity_emb(neg_head_ids, gpu_id, trace)
        head_ids, tail_ids = pos_g.all_edges(order='eid')
        if move_ids:
            tail_ids = to_device(tail_ids, gpu_id)
        tail = pos_g.ndata['emb'][tail_ids]
        rel = pos_g.edata['emb']

        # When we train a batch, we could use the head nodes of the positive
        # edges to construct negative edges: every positive head of a chunk
        # is paired with every positive tail of that chunk.  Exactly one of
        # those pairs per edge is the positive edge itself, so it is masked.
        if neg_deg_sample:
            pos_head = pos_g.ndata['emb'][head_ids]
            pos_head = pos_head.reshape(num_chunks, chunk_size, -1)
            neg_head = neg_head.reshape(num_chunks, neg_sample_size, -1)
            neg_head = F.cat([pos_head, neg_head], 1)
            enlarged_size = chunk_size + neg_sample_size
            neg_sample_size = enlarged_size
            # Stride of (row length + 1) hits entry (i, i) of each chunk.
            mask[:, 0::(neg_sample_size + 1)] = 0

        neg_head = neg_head.reshape(num_chunks * neg_sample_size, -1)
        neg_head, tail = self.head_neg_prepare(
            pos_g.edata['id'], num_chunks, neg_head, tail, gpu_id, trace)
        neg_score = self.head_neg_score(
            neg_head, rel, tail, num_chunks, chunk_size, neg_sample_size)
    else:
        # Corrupt the tail side: keep heads/relations of the positive edges.
        neg_tail_ids = neg_g.ndata['id'][neg_g.tail_nid]
        neg_tail = self.entity_emb(neg_tail_ids, gpu_id, trace)
        head_ids, tail_ids = pos_g.all_edges(order='eid')
        if move_ids:
            head_ids = to_device(head_ids, gpu_id)
        head = pos_g.ndata['emb'][head_ids]
        rel = pos_g.edata['emb']

        # Same construction as above, with the roles of head/tail swapped.
        if neg_deg_sample:
            pos_tail = pos_g.ndata['emb'][tail_ids]
            pos_tail = pos_tail.reshape(num_chunks, chunk_size, -1)
            neg_tail = neg_tail.reshape(num_chunks, neg_sample_size, -1)
            neg_tail = F.cat([pos_tail, neg_tail], 1)
            enlarged_size = chunk_size + neg_sample_size
            neg_sample_size = enlarged_size
            mask[:, 0::(neg_sample_size + 1)] = 0

        neg_tail = neg_tail.reshape(num_chunks * neg_sample_size, -1)
        head, neg_tail = self.tail_neg_prepare(
            pos_g.edata['id'], num_chunks, head, neg_tail, gpu_id, trace)
        neg_score = self.tail_neg_score(
            head, rel, neg_tail, num_chunks, chunk_size, neg_sample_size)

    if not neg_deg_sample:
        return neg_score

    # Publish the enlarged sample size and zero out the positive pairs.
    neg_g.neg_sample_size = neg_sample_size
    mask = mask.reshape(num_chunks, chunk_size, neg_sample_size)
    return neg_score * mask
def score(self, head, rel, tail, triplet_wise=False):
    """Score (head, rel, tail) combinations in batches of ``self.batch_size``.

    Parameters
    ----------
    head, rel, tail : tensor
        IDs fed to ``self.entity_emb`` / ``self.relation_emb``.
    triplet_wise : bool
        If True, the i-th head, relation and tail are scored together
        through ``self.score_func.edge_func``; all three inputs are sliced
        with bounds derived from the number of heads, so they are presumably
        expected to have equal length -- verify against callers.
        If False, batches of heads and tails are scored against all
        relations through ``self.score_func.infer``.

    Returns
    -------
    tensor
        Scores copied to CPU.  In triplet-wise mode the per-batch results
        are concatenated along axis 0; otherwise the result is flattened to
        ``(num_head * num_rel * num_tail,)``.
    """
    head_emb = self.entity_emb(head)
    rel_emb = self.relation_emb(rel)
    tail_emb = self.entity_emb(tail)

    num_head = F.shape(head)[0]
    num_rel = F.shape(rel)[0]
    num_tail = F.shape(tail)[0]

    batch_size = self.batch_size
    head_batches = (num_head + batch_size - 1) // batch_size
    score = []

    if triplet_wise:
        class FakeEdge(object):
            """Minimal stand-in exposing ``src``/``dst``/``data`` dicts."""

            def __init__(self, head_emb, rel_emb, tail_emb):
                self._hobj = {'emb': head_emb}
                self._robj = {'emb': rel_emb}
                self._tobj = {'emb': tail_emb}

            @property
            def src(self):
                return self._hobj

            @property
            def dst(self):
                return self._tobj

            @property
            def data(self):
                return self._robj

        for i in range(head_batches):
            lo = i * batch_size
            # The same window (bounded by num_head) is used for all three.
            hi = min((i + 1) * batch_size, num_head)
            edata = FakeEdge(head_emb[lo:hi], rel_emb[lo:hi], tail_emb[lo:hi])
            batch_score = self.score_func.edge_func(edata)['score']
            score.append(F.copy_to(batch_score, F.cpu()))
        return F.cat(score, dim=0)

    tail_batches = (num_tail + batch_size - 1) // batch_size
    for i in range(head_batches):
        h_lo = i * batch_size
        h_hi = min((i + 1) * batch_size, num_head)
        sh_emb = head_emb[h_lo:h_hi]

        s_score = []
        for j in range(tail_batches):
            t_lo = j * batch_size
            t_hi = min((j + 1) * batch_size, num_tail)
            st_emb = tail_emb[t_lo:t_hi]
            block = self.score_func.infer(sh_emb, rel_emb, st_emb)
            s_score.append(F.copy_to(block, F.cpu()))
        # Tail batches are stitched along axis 2, head batches along axis 0.
        score.append(F.cat(s_score, dim=2))
    score = F.cat(score, dim=0)
    return F.reshape(score, (num_head * num_rel * num_tail, ))
def topK(self, head=None, tail=None, bcast=False, pair_ws=False, k=10):
    """Return the ``k`` highest-scoring (head, tail) pairs by ``self.sim_func``.

    Parameters
    ----------
    head, tail : array-like or None
        IDs indexing ``self.emb``.  ``None`` means every row of ``self.emb``.
    bcast : bool
        Only consulted when ``pair_ws`` is not ``True``.  If exactly
        ``False``, the top ``k`` pairs over the whole head x tail score
        matrix are returned as one tuple.  Otherwise one tuple is produced
        per head, holding that head's top ``k`` tails.
    pair_ws : bool
        If exactly ``True``, ``head[i]`` is scored only against ``tail[i]``
        (``sim_func`` is called with ``pw=True``); both slices are bounded
        by the number of heads, so the inputs are presumably of equal
        length -- verify against callers.
    k : int
        Maximum number of results per tuple.

    Returns
    -------
    list of tuple
        Each tuple is ``(head_ids, tail_ids, scores)`` as numpy arrays,
        sorted by descending score.  The three arrays of a tuple always
        have the same length, which is smaller than ``k`` when fewer
        candidates exist.
    """
    if head is None:
        head = F.arange(0, self.emb.shape[0])
    else:
        head = F.tensor(head)
    if tail is None:
        tail = F.arange(0, self.emb.shape[0])
    else:
        tail = F.tensor(tail)

    head_emb = self.emb[head]
    tail_emb = self.emb[tail]
    batch_size = self.batch_size
    num_head = head.shape[0]

    # NOTE: the identity checks (``is True`` / ``is False``) are kept on
    # purpose so that non-bool arguments select the same branch as before.
    if pair_ws is True:
        result = []
        # Score aligned (head[i], tail[i]) pairs chunk by chunk.
        score = []
        for i in range((num_head + batch_size - 1) // batch_size):
            lo = i * batch_size
            hi = min((i + 1) * batch_size, num_head)
            sh_emb = F.copy_to(head_emb[lo:hi], self.device)
            st_emb = F.copy_to(tail_emb[lo:hi], self.device)
            score.append(
                F.copy_to(self.sim_func(sh_emb, st_emb, pw=True), F.cpu()))
        score = F.cat(score, dim=0)

        sidx = F.argsort(score, dim=0, descending=True)
        sidx = sidx[:k]
        score = score[sidx]
        result.append((F.asnumpy(head[sidx]),
                       F.asnumpy(tail[sidx]),
                       F.asnumpy(score)))
    else:
        num_tail = tail.shape[0]
        # Build the full head x tail score matrix chunk by chunk.
        score = []
        for i in range((num_head + batch_size - 1) // batch_size):
            h_lo = i * batch_size
            h_hi = min((i + 1) * batch_size, num_head)
            sh_emb = F.copy_to(head_emb[h_lo:h_hi], self.device)

            s_score = []
            for j in range((num_tail + batch_size - 1) // batch_size):
                t_lo = j * batch_size
                t_hi = min((j + 1) * batch_size, num_tail)
                st_emb = F.copy_to(tail_emb[t_lo:t_hi], self.device)
                s_score.append(
                    F.copy_to(self.sim_func(sh_emb, st_emb), F.cpu()))
            score.append(F.cat(s_score, dim=1))
        score = F.cat(score, dim=0)

        if bcast is False:
            result = []
            idx = F.arange(0, num_head * num_tail)
            score = F.reshape(score, (num_head * num_tail, ))

            sidx = F.argsort(score, dim=0, descending=True)
            sidx = sidx[:k]
            score = score[sidx]
            # Decode the flat (row-major) position into (head, tail) offsets.
            idx = idx[sidx]
            tail_idx = idx % num_tail
            idx = floor_divide(idx, num_tail)
            head_idx = idx % num_head

            result.append((F.asnumpy(head[head_idx]),
                           F.asnumpy(tail[tail_idx]),
                           F.asnumpy(score)))
        else:
            # Broadcast at head: an independent top-k list for every head.
            result = []
            for i in range(num_head):
                i_score = score[i]

                sidx = F.argsort(i_score, dim=0, descending=True)
                idx = F.arange(0, num_tail)
                i_idx = sidx[:k]
                i_score = i_score[i_idx]
                idx = idx[i_idx]

                # Fix: size the head column by the number of hits actually
                # returned.  It used to be hard-coded to ``k``, which made it
                # longer than the tail/score columns when num_tail < k.
                num_hits = i_idx.shape[0]
                result.append((np.full((num_hits,), F.asnumpy(head[i])),
                               F.asnumpy(tail[idx]),
                               F.asnumpy(i_score)))

    return result
def predict_neg_score(self, pos_g, neg_g, to_device=None, gpu_id=-1,
                      trace=False, neg_deg_sample=False):
    """Score the negative edges described by ``neg_g`` against ``pos_g``.

    Depending on ``neg_g.neg_head`` either the head or the tail side of the
    positive edges is replaced by the negative nodes of ``neg_g``; the other
    side and the relation embeddings come from ``pos_g``.

    Parameters
    ----------
    pos_g : graph
        Positive edges; provides ``ndata['emb']``, ``edata['emb']`` and
        ``edata['id']``.
    neg_g : graph
        Negative edges; provides ``num_chunks``, ``chunk_size``,
        ``neg_sample_size``, ``neg_head``, ``head_nid`` / ``tail_nid`` and
        ``ndata['id']``.
    to_device : callable or None
        If given and ``gpu_id >= 0``, called as ``to_device(ids, gpu_id)``
        on the node IDs of the uncorrupted side.
    gpu_id : int
        Forwarded to ``self.entity_emb`` and the ``*_neg_prepare`` hooks.
    trace : bool
        Forwarded to ``self.entity_emb`` and the ``*_neg_prepare`` hooks.
    neg_deg_sample : bool
        If True, the positive nodes of the corrupted side are prepended to
        the negative ones, the entry pairing each edge with its own positive
        node is masked to zero, and ``neg_g.neg_sample_size`` is overwritten
        with the enlarged size.

    Returns
    -------
    tensor
        The negative score (multiplied by the mask when ``neg_deg_sample``).
    """
    n_chunks = neg_g.num_chunks
    per_chunk = neg_g.chunk_size
    n_neg = neg_g.neg_sample_size

    # Allocated up front for the enlarged layout; only used when
    # neg_deg_sample is set.
    mask = F.ones(
        (n_chunks, per_chunk * (n_neg + per_chunk)),
        dtype=F.float32, ctx=F.context(pos_g.ndata['emb']))

    corrupt_head = neg_g.neg_head

    # Embeddings of the negative nodes on the corrupted side.
    if corrupt_head:
        neg_ids = neg_g.ndata['id'][neg_g.head_nid]
    else:
        neg_ids = neg_g.ndata['id'][neg_g.tail_nid]
    neg_emb = self.entity_emb(neg_ids, gpu_id, trace)

    # "kept" is the side taken unchanged from the positive edges,
    # "swapped" is the side that gets replaced by negatives.
    head_ids, tail_ids = pos_g.all_edges(order='eid')
    if corrupt_head:
        kept_ids, swapped_ids = tail_ids, head_ids
    else:
        kept_ids, swapped_ids = head_ids, tail_ids
    if to_device is not None and gpu_id >= 0:
        kept_ids = to_device(kept_ids, gpu_id)
    kept_emb = pos_g.ndata['emb'][kept_ids]
    rel = pos_g.edata['emb']

    # When we train a batch, we could use the positive nodes of the corrupted
    # side to construct extra negative edges: each of them is paired with
    # every positive edge of its chunk.  One of those pairs per edge is the
    # positive edge itself, so it has to be masked out.
    if neg_deg_sample:
        pos_emb = pos_g.ndata['emb'][swapped_ids]
        pos_emb = pos_emb.reshape(n_chunks, per_chunk, -1)
        neg_emb = neg_emb.reshape(n_chunks, n_neg, -1)
        neg_emb = F.cat([pos_emb, neg_emb], 1)
        n_neg = per_chunk + n_neg
        # Stride of (row length + 1) selects entry (i, i) of every chunk.
        mask[:, 0::(n_neg + 1)] = 0

    neg_emb = neg_emb.reshape(n_chunks * n_neg, -1)
    if corrupt_head:
        neg_emb, kept_emb = self.head_neg_prepare(
            pos_g.edata['id'], n_chunks, neg_emb, kept_emb, gpu_id, trace)
        neg_score = self.head_neg_score(
            neg_emb, rel, kept_emb, n_chunks, per_chunk, n_neg)
    else:
        kept_emb, neg_emb = self.tail_neg_prepare(
            pos_g.edata['id'], n_chunks, kept_emb, neg_emb, gpu_id, trace)
        neg_score = self.tail_neg_score(
            kept_emb, rel, neg_emb, n_chunks, per_chunk, n_neg)

    if neg_deg_sample:
        neg_g.neg_sample_size = n_neg
        mask = mask.reshape(n_chunks, per_chunk, n_neg)
        return neg_score * mask
    return neg_score
def process(self):
    """Parse the raw dataset files in ``self.raw_dir`` into flat Python lists.

    Reads three files:

    * ``gdb9.sdf.csv`` -- per-molecule targets.  Columns 1..19 of every
      data row are taken, the first three of them are moved to the end, and
      the result is scaled by the module-level ``conversion`` tensor.
    * ``uncharacterized.txt`` -- 1-based indices of molecules to skip (the
      first 9 and last 2 lines of the file are ignored).
    * ``gdb9.sdf`` -- the molecules themselves, read with RDKit.

    The results are stored on ``self`` as concatenated lists: ``N`` (atoms
    per molecule), ``R`` (positions), ``Z`` (atomic numbers), ``H``
    (hybridization codes), ``A`` (aromatic flags), ``NE`` (bonds per
    molecule), ``E`` (bond endpoints, begin/end interleaved), ``B`` (bond
    type codes, each repeated twice), ``T`` (targets), plus the prefix sums
    ``N_cumsum`` and ``NE_cumsum``.
    """
    with open(os.path.join(self.raw_dir, "gdb9.sdf.csv"), 'r') as f:
        # Drop the header row and the empty string after the last newline.
        target = f.read().split('\n')[1:-1]
    target = [[float(x) for x in line.split(',')[1:20]] for line in target]
    target = F.tensor(target, dtype=F.data_type_dict['float32'])
    # Rotate the columns: the first three go to the end.
    target = F.cat([target[:, 3:], target[:, :3]], dim=-1)
    target = (target * conversion.view(1, -1)).tolist()

    with open(os.path.join(self.raw_dir, "uncharacterized.txt"), 'r') as f:
        # A set gives O(1) membership tests in the per-molecule loop below
        # (this used to be a list that was scanned once per molecule).
        # The file uses 1-based indices, hence the ``- 1``.
        skip = {int(x.split()[0]) - 1 for x in f.read().split('\n')[9:-2]}

    suppl = Chem.SDMolSupplier(os.path.join(self.raw_dir, "gdb9.sdf"),
                               removeHs=False, sanitize=False)

    Ns = []
    R = []
    Z = []
    H = []
    A = []
    NE = []
    E = []
    B = []
    T = []
    for i, mol in enumerate(suppl):
        if i in skip:
            continue

        N = mol.GetNumAtoms()

        # Atom coordinates are read from the raw SDF text: N lines starting
        # at line 4 of the record, first three fields of each line.
        pos = suppl.GetItemText(i).split('\n')[4:4 + N]
        pos = [[float(x) for x in line.split()[:3]] for line in pos]

        type_idx = []
        atomic_number = []
        aromatic = []
        hybr = []
        for atom in mol.GetAtoms():
            type_idx.append(self.types[atom.GetSymbol()])
            atomic_number.append(atom.GetAtomicNum())
            aromatic.append(1 if atom.GetIsAromatic() else 0)
            hybr.append(self.hybr_types[atom.GetHybridization()])

        row, edge_type = [], []
        for bond in mol.GetBonds():
            start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            row += [start, end]
            # One code per endpoint, so it lines up with ``row``.
            edge_type += 2 * [self.bonds[bond.GetBondType()]]

        Ns.append(N)
        R += pos
        Z += atomic_number
        H += hybr
        A += aromatic
        NE.append(len(mol.GetBonds()))
        E += row
        B += edge_type
        # Targets are indexed by the molecule's position in the SDF file,
        # skipped molecules included.
        T += target[i]

    self.N = Ns
    self.R = R
    self.Z = Z
    self.H = H
    self.A = A
    self.NE = NE
    self.E = E
    self.B = B
    self.T = T
    self.N_cumsum = np.concatenate([[0], np.cumsum(self.N)])
    self.NE_cumsum = np.concatenate([[0], np.cumsum(self.NE)])
def pull(self, name, id_tensor):
    """Pull message from KVServer.

    The requested IDs are grouped by owning server.  Groups owned by a
    local server are read through ``self._pull_handler`` (unless shared
    memory has been closed); the others are requested with a PULL message
    and their replies awaited.  The pieces are then merged and restored to
    the order of the original ``id_tensor``.

    Parameters
    ----------
    name : str
        data name
    id_tensor : tensor (mx.ndarray or torch.tensor)
        a vector storing the ID list

    Returns
    -------
    tensor
        a data tensor with the same row size of id_tensor.
    """
    assert len(name) > 0, 'name cannot be empty.'
    assert F.ndim(id_tensor) == 1, 'ID must be a vector.'

    # Partition data (this part could be moved into the C-api if needed).
    server_id = self._data_store[name + '-part-'][id_tensor]
    # Order the request by owning server ...
    sorted_id = np.argsort(F.asnumpy(server_id))
    # ... and remember the inverse permutation to undo it at the end.
    back_sorted_id = F.tensor(np.argsort(sorted_id))
    id_tensor = id_tensor[F.tensor(sorted_id)]
    server, count = np.unique(F.asnumpy(server_id), return_counts=True)

    # Walk the server groups in server order.
    start = 0
    pull_count = 0
    local_data = {}
    for target_server, num_ids in zip(server, count):
        end = start + num_ids
        if start == end:
            # Nothing is requested from this server.
            continue
        partial_id = id_tensor[start:end]

        if target_server in self._local_server_id and self._close_shared_mem == False:
            # Served locally; translate global to local IDs when a
            # '-g2l-' mapping has been registered for this name.
            if (name + '-g2l-') in self._has_data:
                local_id = self._data_store[name + '-g2l-'][partial_id]
            else:
                local_id = partial_id
            local_data[target_server] = self._pull_handler(
                name + '-data-', local_id, self._data_store)
        else:
            request = KVStoreMsg(
                type=KVMsgType.PULL,
                rank=self._client_id,
                name=name,
                id=partial_id,
                data=None)
            _send_kv_msg(self._sender, request, target_server)
            pull_count += 1

        start += num_ids

    # Wrap the locally served pieces in PULL_BACK messages so they can be
    # sorted together with the remote replies.
    msg_list = []
    for owner, data in local_data.items():
        msg_list.append(KVStoreMsg(
            type=KVMsgType.PULL_BACK,
            rank=owner,
            name=name,
            id=None,
            data=data))

    # Wait for one reply per remote request.
    for _ in range(pull_count):
        msg_list.append(_recv_kv_msg(self._receiver))

    # Sort messages by server id so the rows line up with the sorted IDs.
    msg_list.sort(key=self._takeId)
    data_tensor = F.cat(seq=[msg.data for msg in msg_list], dim=0)

    # Return data with original index order.
    return data_tensor[back_sorted_id]