def reduced_graph(graph: DGLGraph, center_node: int, paths: dict, node_attr_name, edge_attr_name):
    """
    Collapse ``graph`` into a star-like graph around ``center_node``.

    Every node listed in ``paths`` is linked to the center node in both
    directions. The weight of that link is the product, along the node's
    path, of the original edge weights, where the edge at position ``index``
    of the path is additionally scaled by ``exp(-index)``. A self loop with
    raw weight 1 is then added on every node, and all edge weights are
    normalised together with a softmax over the edge dimension.

    :param graph: the graph to be reduced
    :param center_node: the reference node every other node is attached to
    :param paths: maps a node to its path, a sequence of edges whose first
        two items are the endpoints handed to ``graph.edge_id``
    :param node_attr_name: node feature key copied to the new graph
    :param edge_attr_name: edge feature key holding the weights
    :return: new_graph
    """
    star = DGLGraph()
    star.add_nodes(num=graph.number_of_nodes())
    star.ndata[node_attr_name] = graph.ndata[node_attr_name]

    for target, hops in paths.items():
        weight = torch.tensor([1.])
        for depth, hop in enumerate(hops):
            edge_id = graph.edge_id(hop[0], hop[1])
            decay = math.exp(-depth)
            weight *= graph.edata[edge_attr_name][edge_id] * decay
        # The same accumulated weight is used for both directions.
        star.add_edge(center_node, target, data={edge_attr_name: weight})
        star.add_edge(target, center_node, data={edge_attr_name: weight})

    # Self loops on every node, each with raw weight 1.
    self_loop_weights = torch.ones(star.number_of_nodes(), )
    star.add_edges(star.nodes(), star.nodes(),
                   data={edge_attr_name: self_loop_weights})

    # Normalise across all edges at once.
    normalised = star.edata[edge_attr_name].softmax(dim=0)
    star.edata[edge_attr_name] = normalised
    return star
def generate_graph_old(grad=False):
    """Build the fixed 10-node / 17-edge graph used by the tests.

    Node 0 is the source and node 9 the sink: each of the nodes 1..8 gets an
    edge from the source and an edge to the sink, and a single back edge
    9 -> 0 is added last. Random node features go under ``'h'`` and random
    edge features under ``'w'``; both default initializers are set to the
    zero initializer.

    :param grad: when true, both feature tensors are passed through
        ``F.attach_grad`` before being stored
    :return: the populated graph, moved to ``F.ctx()``
    """
    num_nodes, num_edges = 10, 17
    source, sink = 0, num_nodes - 1

    g = DGLGraph()
    g.add_nodes(num_nodes)
    # Two-hop path source -> middle -> sink for every middle node.
    for middle in range(source + 1, sink):
        g.add_edge(source, middle)
        g.add_edge(middle, sink)
    # Back flow from the sink to the source.
    g.add_edge(sink, source)
    g = g.to(F.ctx())

    node_feat = F.randn((num_nodes, D))
    edge_feat = F.randn((num_edges, D))
    if grad:
        node_feat = F.attach_grad(node_feat)
        edge_feat = F.attach_grad(edge_feat)
    g.ndata['h'] = node_feat
    g.edata['w'] = edge_feat

    for install in (g.set_n_initializer, g.set_e_initializer):
        install(dgl.init.zero_initializer)
    return g
def Get_DGL(path='/home/student/raw_data/entity/triple2id4.txt'):
    """Build a DGLGraph from a tab-separated triple file.

    Each non-blank line is split on tabs; the first field is read as the
    source node id and the third field as the destination node id. The graph
    gets ``max_id + 1`` nodes (a single node when the file holds no triples)
    and one directed edge per line, in file order.

    :param path: location of the triple file; defaults to the path that used
        to be hard-coded, so existing ``Get_DGL()`` calls behave as before
    :return: the constructed DGLGraph
    :raises ValueError: if a field that should be a node id is not an integer
    :raises IndexError: if a non-blank line has fewer than three fields
    """
    key1 = []
    key2 = []
    maximum = 0
    # The context manager closes the file; no explicit close() is needed.
    with open(path, 'r') as ft:
        for line in ft:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            fields = line.split('\t')
            nod1 = int(fields[0])
            nod2 = int(fields[2])
            maximum = max(maximum, nod1, nod2)
            key1.append(nod1)
            key2.append(nod2)

    c = DGLGraph()
    c.add_nodes(maximum + 1)
    c.add_edges(key1, key2)
    return c
def generate_er_graph(n, p):
    """Sample an Erdos-Renyi style random graph as a DGLGraph.

    For ``0 < p < 1`` candidate node pairs ``(v, w)`` with ``w < v`` are
    visited by drawing geometric skip lengths, so the work is proportional
    to the number of edges produced. Every selected pair is inserted in both
    directions.

    Degenerate inputs that previously raised are handled explicitly:
    ``p <= 0`` gives a graph without edges (the old code divided by
    ``log(1) == 0``), ``p >= 1`` gives the complete graph (the old code
    evaluated ``log(0)``), and an empty selection no longer calls
    ``add_edges`` without arguments.

    :param n: number of nodes; node ids are ``0 .. n-1``
    :param p: probability of each undirected pair being connected
    :return: DGLGraph with ``n`` nodes and two directed edges per chosen pair
    """
    G = DGLGraph()
    G.add_nodes(n)

    edges_list = []
    if p >= 1:
        # Every pair is selected; same (v, w) ordering as the sampling loop.
        for v in range(1, n):
            for w in range(v):
                edges_list.extend([(v, w), (w, v)])
    elif p > 0:
        lp = math.log(1.0 - p)
        w = -1
        # Nodes in graph are from 0,n-1 (start with v as the first node index).
        v = 1
        while v < n:
            lr = math.log(1.0 - random.random())
            # Jump ahead by a geometrically distributed number of pairs.
            w = w + 1 + int(lr / lp)
            # Wrap the running offset into the row of the next node(s).
            while w >= v and v < n:
                w = w - v
                v = v + 1
            if v < n:
                edges_list.extend([(v, w), (w, v)])

    if edges_list:
        src, dst = zip(*edges_list)
        G.add_edges(src, dst)
    return G
def load_npz(file_name):
    """Load a graph stored in an ``.npz`` archive into a DGLGraph.

    The adjacency matrix is rebuilt from the CSR arrays ``adj_data``,
    ``adj_indices``, ``adj_indptr`` and ``adj_shape``. Node attributes come
    from the CSR arrays ``attr_*`` or from a dense ``attr_matrix``; labels
    come from the CSR arrays ``labels_*`` or from a dense ``labels`` array.
    Each stored adjacency entry is inserted in both directions.

    Attributes and labels are optional in the archive. When one of them is
    absent the corresponding ``'feat'`` / ``'label'`` node field is left
    unset instead of assigning ``None`` to ``g.ndata``.

    :param file_name: path of the ``.npz`` archive
    :return: DGLGraph with optional ``'feat'`` and ``'label'`` node data
    """
    # NOTE(security): allow_pickle=True lets numpy unpickle object arrays,
    # which can execute arbitrary code. Only load archives you trust.
    with np.load(file_name, allow_pickle=True) as archive:
        # Materialise every array so nothing depends on the open file.
        arrays = dict(archive)

    num_nodes = arrays['adj_shape'][0]
    adj_matrix = sp.csr_matrix(
        (arrays['adj_data'], arrays['adj_indices'], arrays['adj_indptr']),
        shape=arrays['adj_shape']).tocoo()

    if 'attr_data' in arrays:
        # Attributes are stored as a sparse CSR matrix
        attr_matrix = sp.csr_matrix(
            (arrays['attr_data'], arrays['attr_indices'],
             arrays['attr_indptr']),
            shape=arrays['attr_shape']).todense()
    elif 'attr_matrix' in arrays:
        # Attributes are stored as a (dense) np.ndarray
        attr_matrix = arrays['attr_matrix']
    else:
        attr_matrix = None

    if 'labels_data' in arrays:
        # Labels are stored as a CSR matrix
        labels = sp.csr_matrix(
            (arrays['labels_data'], arrays['labels_indices'],
             arrays['labels_indptr']),
            shape=arrays['labels_shape']).todense()
    elif 'labels' in arrays:
        # Labels are stored as a numpy array
        labels = arrays['labels']
    else:
        labels = None

    g = DGLGraph()
    g.add_nodes(num_nodes)
    # Insert each stored entry in both directions.
    g.add_edges(adj_matrix.row, adj_matrix.col)
    g.add_edges(adj_matrix.col, adj_matrix.row)
    if attr_matrix is not None:
        g.ndata['feat'] = attr_matrix
    if labels is not None:
        g.ndata['label'] = labels
    return g
def test_recv_0deg_newfld():
    """recv on a zero in-degree node when the reducer writes a new field.

    The graph has the single edge 0 -> 1, so node 0 never receives a
    message. The reducer and the apply function write to ``'h1'`` while the
    input lives in ``'h'``.
    """
    g = DGLGraph()
    g.add_nodes(2)
    g.add_edge(0, 1)

    def _copy_src(edges):
        return {'m' : edges.src['h']}

    def _sum_into_h1(nodes):
        return {'h1' : nodes.data['h'] + F.sum(nodes.mailbox['m'], 1)}

    def _double_h1(nodes):
        return {'h1' : nodes.data['h1'] * 2}

    def _fill_twos(shape, dtype, ctx, ids):
        return 2 + F.zeros(shape, dtype=dtype, ctx=ctx)

    g.register_message_func(_copy_src)
    g.register_reduce_func(_sum_into_h1)
    g.register_apply_node_func(_double_h1)

    # Case 1: recv on both the 0-degree and the non-0-degree node.
    old = F.randn((2, 5))
    g.set_n_initializer(_fill_twos, 'h1')
    g.ndata['h'] = old
    g.send((0, 1))
    g.recv([0, 1])
    new = g.ndata.pop('h1')
    # Node 0: filled by the initializer (2), then doubled by apply.
    expected_zero_deg = F.full_1d(5, 4, dtype=F.float32)
    assert F.allclose(new[0], expected_zero_deg)
    # Node 1: own feature plus the message from node 0, doubled.
    expected_receiver = F.sum(old, 0) * 2
    assert F.allclose(new[1], expected_receiver)

    # Case 2: recv on the 0-degree node only.
    old = F.randn((2, 5))
    g.ndata['h'] = old
    # 'h1' has to exist beforehand for this case.
    g.ndata['h1'] = F.full((2, 5), -1, F.int64)
    g.send((0, 1))
    g.recv(0)
    new = g.ndata.pop('h1')
    # Node 0: falls back to apply only, so -1 becomes -2.
    expected_zero_deg = F.full_1d(5, -2, F.int64)
    assert F.allclose(new[0], expected_zero_deg)
    # Node 1: left unchanged.
    expected_receiver = F.full_1d(5, -1, F.int64)
    assert F.allclose(new[1], expected_receiver)
def _move_tokens_to_leaves(graph: DGLGraph, pad_token_index: int, pad_type_index: int) -> DGLGraph:
    """Move tokens of typed nodes onto newly created leaf nodes.

    ``graph.ndata['token']`` is read as a 2-D array (one row per node) and
    ``graph.ndata['type']`` as one value per node. Every entry that is not
    ``pad_token_index`` and sits on a node whose type is not
    ``pad_type_index`` gets its own new node, connected by an edge from the
    original node. The graph is modified in place and also returned.

    After the call ``ndata['token']`` has a single column: original nodes
    whose type equals ``pad_type_index`` keep their first token, every other
    original node holds ``pad_token_index``, and each new node holds the
    token that was moved onto it. New nodes get type ``pad_type_index``.

    :param graph: graph with ``'token'`` and ``'type'`` node data
    :param pad_token_index: token id treated as padding
    :param pad_type_index: type id treated as padding
    :return: the same ``graph`` object, extended with the new leaves
    """
    old_token = graph.ndata['token'].numpy()
    n_old_nodes = old_token.shape[0]

    # Broadcast the per-node "has a real type" flag over the token columns.
    type_mask = graph.ndata['type'].numpy() != pad_type_index
    type_mask = np.tile(type_mask.reshape(-1, 1), old_token.shape[1])

    # Entries to move: real token on a node with a real type.
    mask = np.logical_and(old_token != pad_token_index, type_mask)
    n_new_nodes = mask.sum()

    # ``np.int`` was only an alias of the builtin ``int`` and has been removed
    # from NumPy; using ``int`` directly selects the same dtype.
    new_token = np.full((n_old_nodes + n_new_nodes, 1), pad_token_index, dtype=int)
    new_token[:n_old_nodes] = np.where(~type_mask, old_token, pad_token_index)[:, [0]]
    new_token[n_old_nodes:] = old_token[mask].reshape(-1, 1)

    # Row index of each moved token is the node the new leaf hangs off.
    us, _ = np.nonzero(mask)
    vs = np.arange(n_new_nodes) + n_old_nodes

    graph.add_nodes(n_new_nodes)
    graph.add_edges(us, vs)
    graph.ndata['type'][n_old_nodes:] = pad_type_index
    graph.ndata['token'] = new_token
    return graph
def fromText(self, fileName):
    '''
    The text file should be of the following format:
        [several rows of description here]
        [The first valid line should give nodes number and edges number:
         e.g. 'N180 E1999']
        [Rows of data, with each row being '{fromID} {toID}']
    And this function will return a graph generated by the given information.

    Side effect: ``self.size`` is set to ``[num_nodes, num_edges]`` as
    declared by the header line.

    Rows after the header that do not contain two integers (for example
    blank lines) are ignored. The number of parsed rows must equal the
    declared edge count.

    :param fileName: path of the text file
    :return: the DGLGraph, or ``None`` when the file has no header line
    :raises ValueError: if the number of edge rows differs from the header
    '''
    header_pat = re.compile(r'N(\d+)\s+E(\d+)')
    edge_pat = re.compile(r'(\d+)\s+(\d+)')

    with open(fileName, 'r') as f:
        # Skip the free-form description until the 'N<nodes> E<edges>' line.
        for line in f:
            match = header_pat.search(line)
            if match:
                # Number of nodes and edges
                self.size = [int(i) for i in match.groups()]
                break
        else:
            # Reached end of file without finding a header.
            return None

        # retrieve edges
        from_ids, to_ids = [], []
        for line in f:
            match = edge_pat.search(line)
            if match is None:
                continue
            src, dst = (int(j) for j in match.groups())
            from_ids.append(src)
            to_ids.append(dst)

    if len(from_ids) != self.size[1]:
        # Previously this either indexed out of bounds or silently left
        # uninitialised values in the edge arrays.
        raise ValueError(
            'expected {} edges but found {} edge rows in {!r}'.format(
                self.size[1], len(from_ids), fileName))

    Fromlist = np.array(from_ids, dtype=np.int64)
    Tolist = np.array(to_ids, dtype=np.int64)

    G = DGLGraph()
    G.add_nodes(self.size[0])
    G.add_edges(Fromlist, Tolist)
    return G
def test_recv_0deg():
    """recv on a node with zero in-degree, reducer writing back to 'h'.

    The graph has the single edge 0 -> 1, so node 0 never receives a
    message.
    """
    g = DGLGraph()
    g.add_nodes(2)
    g.add_edge(0, 1)

    def _copy_src(edges):
        return {'m' : edges.src['h']}

    def _sum_into_h(nodes):
        return {'h' : nodes.data['h'] + F.sum(nodes.mailbox['m'], 1)}

    def _double_h(nodes):
        return {'h' : nodes.data['h'] * 2}

    def _fill_twos(shape, dtype, ctx, ids):
        return 2 + F.zeros(shape, dtype, ctx)

    g.register_message_func(_copy_src)
    g.register_reduce_func(_sum_into_h)
    g.register_apply_node_func(_double_h)
    g.set_n_initializer(_fill_twos, 'h')

    # Case 1: recv on both the 0-degree and the non-0-degree node.
    old = F.randn((2, 5))
    g.ndata['h'] = old
    g.send((0, 1))
    g.recv([0, 1])
    new = g.ndata.pop('h')
    # Node 0: filled by the initializer (2), then doubled by apply.
    expected_zero_deg = F.full_1d(5, 4, F.float32)
    assert F.allclose(new[0], expected_zero_deg)
    # Node 1: own feature plus the message from node 0, doubled.
    expected_receiver = F.sum(old, 0) * 2
    assert F.allclose(new[1], expected_receiver)

    # Case 2: recv on the 0-degree node only behaves like apply.
    old = F.randn((2, 5))
    g.ndata['h'] = old
    g.send((0, 1))
    g.recv(0)
    new = g.ndata.pop('h')
    # Node 0: same result as applying the node function.
    doubled_first = 2 * old[0]
    assert F.allclose(new[0], doubled_first)
    # Node 1: untouched.
    assert F.allclose(new[1], old[1])
def test_update_all_0deg():
    """update_all on graphs that contain zero in-degree nodes.

    First graph: nodes 1..4 each send to node 0, so nodes 1..4 have no
    incoming edge. Second graph: five nodes and no edge at all.
    """
    def _copy_src(edges):
        return {'m' : edges.src['h']}

    def _sum_into_h(nodes):
        return {'h' : nodes.data['h'] + F.sum(nodes.mailbox['m'], 1)}

    def _double_h(nodes):
        return {'h' : nodes.data['h'] * 2}

    def _fill_twos(shape, dtype, ctx, ids):
        return 2 + F.zeros(shape, dtype, ctx)

    # Case 1: star graph pointing at node 0.
    g = DGLGraph()
    g = g.to(F.ctx())
    g.add_nodes(5)
    for sender in (1, 2, 3, 4):
        g.add_edge(sender, 0)
    g.set_n_initializer(_fill_twos, 'h')

    old_repr = F.randn((5, 5))
    g.ndata['h'] = old_repr
    g.update_all(_copy_src, _sum_into_h, _double_h)
    new_repr = g.ndata['h']
    # The 0-degree nodes are filled by the initializer and then passed
    # through the apply function; row 0 is twice the sum of all node
    # features.
    expected_isolated = 2*(2+F.zeros((4,5)))
    assert F.allclose(new_repr[1:], expected_isolated)
    expected_hub = 2 * F.sum(old_repr, 0)
    assert F.allclose(new_repr[0], expected_hub)

    # Case 2: graph with no edge, which falls back to apply only.
    g = DGLGraph()
    g = g.to(F.ctx())
    g.add_nodes(5)
    g.set_n_initializer(_fill_twos, 'h')
    g.ndata['h'] = old_repr
    g.update_all(_copy_src, _sum_into_h, _double_h)
    new_repr = g.ndata['h']
    assert F.allclose(new_repr, 2*old_repr)
def create_g(file_path, use_cuda=False):
    """Create a relational DGLGraph from an ``.npz`` archive.

    The archive is read for the keys ``labels``, ``fts_node``,
    ``edge_type``, ``edge_norm``, ``edges`` and ``nums``. ``nums`` is the
    number of nodes; labels and node features are truncated to that length.
    ``edges`` is indexed as a two-column array of (source, destination).

    Edge data ``'rel_type'`` and ``'norm'`` (float, with a trailing unit
    dimension) and node data ``'id'`` (integer node features) are attached
    to the graph.

    :param file_path: path of the ``.npz`` archive
    :param use_cuda: move labels, edge data and node features to the GPU
    :return: list ``[g, labels, fts_nodes]``
    """
    # NOTE(security): allow_pickle=True can execute arbitrary code while
    # unpickling; only load archives you trust.
    # The context manager closes the archive, which was previously leaked.
    with np.load(file_path, allow_pickle=True) as npz:
        labels = npz['labels']
        fts_nodes = npz['fts_node']
        edge_type = npz['edge_type'].tolist()
        edge_norm = npz['edge_norm'].tolist()
        edges = npz['edges']
        # num_nodes is number of nodes in the graph
        num_nodes = int(npz['nums'])

    labels = labels[0:num_nodes]
    fts_nodes = fts_nodes[0:num_nodes]

    g = DGLGraph()
    g.add_nodes(num_nodes)
    # adding edges from numpy files
    g.add_edges(edges[:, 0], edges[:, 1])

    # Rebuild plain arrays from the python lists before handing them to torch.
    edge_type = torch.from_numpy(np.array(edge_type))
    edge_norm = torch.from_numpy(np.array(edge_norm)).unsqueeze(1)
    edge_norm = edge_norm.float()

    fts_nodes = torch.from_numpy(fts_nodes.astype(int))
    labels = torch.from_numpy(labels)

    if use_cuda:
        labels = labels.cuda()
        edge_type = edge_type.cuda()
        edge_norm = edge_norm.cuda()
        fts_nodes = fts_nodes.cuda()

    g.edata.update({'rel_type': edge_type, 'norm': edge_norm})
    g.ndata['id'] = fts_nodes
    return [g, labels, fts_nodes]
def process(self, mol: Mol, atom_map):
    """Turn a molecule into a graph with an extra master node.

    Node 0 is the extra node; atom ``k`` of ``mol`` becomes node ``k + 1``.
    Edges are added in this order: a self loop on every node, one edge from
    each atom node to node 0, and both directions of every bond. The edges
    from node 0 back to the atoms are not added.

    Features are one row of zeros for node 0 followed by one-hot rows of
    width ``self.feature_dim``, where the class of an atom is
    ``atom_map[atomic number]``.

    :param mol: molecule providing atoms and bonds
    :param atom_map: mapping from atomic number to one-hot class index
    :return: ``GCNData(n, graph, feature)`` with ``n`` = atom count + 1
    """
    node_count = mol.GetNumAtoms() + 1

    graph = DGLGraph()
    graph.add_nodes(node_count)
    graph.add_edges(graph.nodes(), graph.nodes())
    graph.add_edges(range(1, node_count), 0)
    for bond in mol.GetBonds():
        # Shift by one because node 0 is reserved for the master node.
        begin = bond.GetBeginAtomIdx() + 1
        end = bond.GetEndAtomIdx() + 1
        graph.add_edge(begin, end)
        graph.add_edge(end, begin)

    master_row = torch.zeros((1, self.feature_dim), device=self.device)
    atom_classes = torch.tensor(
        [atom_map[atom.GetAtomicNum()] for atom in mol.GetAtoms()],
        device=self.device)
    atom_rows = torch.nn.functional.one_hot(
        atom_classes, num_classes=self.feature_dim).to(torch.float)
    feature = torch.cat([master_row, atom_rows])

    return GCNData(node_count, graph, feature)
def main(args):
    """Run ``EglGCNConvTest`` once on a small synthetic graph on the GPU.

    The graph has ``args.num_nodes`` nodes; every node has an edge to node 0
    and an edge to node 1. Random per-node ``'norm'`` values and random
    input features that require gradients are created, then the module's
    forward pass is executed.

    :param args: namespace with ``num_nodes``, ``num_hidden``, ``in_feats``,
        ``gpu`` and ``dropout``
    """
    node_total = args.num_nodes
    hidden_size = args.num_hidden
    input_size = args.in_feats
    torch.cuda.set_device(args.gpu)

    g = DGLGraph()
    g.add_nodes(node_total)
    # Fan every node into node 0, then into node 1.
    for hub in (0, 1):
        g.add_edges(list(range(node_total)), hub)

    g.ndata['norm'] = torch.rand((node_total, 1)).cuda()

    feat_src = torch.rand((node_total, input_size))
    feat_src.requires_grad = True
    feat_src = feat_src.cuda()

    conv_test = EglGCNConvTest(
        g,
        input_size,
        hidden_size,
        activation=torch.nn.functional.relu,
        dropout=args.dropout,
        bias=True,
    )
    conv_test.cuda()
    dgl_rst, egl_rst = conv_test.forward(feat_src)
def table_to_dgl_graph(par_tab_nums, foreign_keys, col_enc, tab_enc):
    """Build the table/column schema graph.

    Tables occupy node ids ``0 .. T-1`` with ``T = max(par_tab_nums) + 1``;
    column ``i`` occupies node id ``T + i``. Edge types stored under
    ``'rel_type'``:

    * 0 - table self loop
    * 1 - column self loop
    * 2 / 3 - table -> column / column -> table for columns whose parent
      table is not ``-1``
    * 4 / 5 - foreign key source -> destination / destination -> source

    Fix: column self loops used to be added on node ids ``0 .. C-1``, which
    are table ids and low column ids, instead of the column ids
    ``T .. T+C-1``. They are now placed on the column nodes. The number of
    edges and of edge-type entries is unchanged, but the graph topology
    differs from the one produced before this fix.

    :param par_tab_nums: parent table number for every column (``-1`` for
        columns without a parent table)
    :param foreign_keys: iterable of ``(column, column)`` pairs, may be empty
    :param col_enc: column encodings; the first ``C`` rows are used
    :param tab_enc: table encodings; the first ``T`` rows are used
    :return: DGLGraph with ``'rel_type'`` edge data and ``'h'`` node data
    """
    g = DGLGraph()
    col_id_offset = max(par_tab_nums) + 1
    num_cols = len(par_tab_nums)
    g.add_nodes(num_cols + col_id_offset)

    # column id: max table num + 1 + original column num
    table_id_list = range(col_id_offset)
    col_id_list = range(col_id_offset, col_id_offset + num_cols)
    g.add_edges(table_id_list, table_id_list)
    g.add_edges(col_id_list, col_id_list)
    edge_types = [0] * len(table_id_list) + [1] * len(col_id_list)

    # Table <-> column membership edges.
    table_children_src = []
    table_children_dst = []
    for idx, par_tab_num in enumerate(par_tab_nums):
        if par_tab_num != -1:
            table_children_src.append(par_tab_num)
            table_children_dst.append(idx + col_id_offset)
    g.add_edges(table_children_src, table_children_dst)
    g.add_edges(table_children_dst, table_children_src)
    edge_types += [2] * len(table_children_src) + [3] * len(table_children_dst)

    # Foreign key edges between column nodes, in both directions.
    if foreign_keys:
        foreign_key_srcs, foreign_key_dsts = zip(*foreign_keys)
        foreign_key_srcs = [col_num + col_id_offset
                            for col_num in foreign_key_srcs]
        foreign_key_dsts = [col_num + col_id_offset
                            for col_num in foreign_key_dsts]
        g.add_edges(foreign_key_srcs, foreign_key_dsts)
        g.add_edges(foreign_key_dsts, foreign_key_srcs)
        edge_types += [4] * len(foreign_key_srcs) + [5] * len(foreign_key_dsts)

    edge_types = torch.from_numpy(np.array(edge_types))
    if torch.cuda.is_available():
        edge_types = edge_types.cuda()
    g.edata.update({'rel_type': edge_types})
    g.ndata['h'] = torch.cat(
        (tab_enc[:col_id_offset], col_enc[:num_cols]))
    return g
def construct_complete_graph_from_mol(mol, add_self_loop=False):
    """Construct a complete graph with topology only for the molecule

    The **i** th atom in the molecule, i.e. ``mol.GetAtomWithIdx(i)``,
    corresponds to the **i** th node in the returned DGLGraph.

    Edges are added grouped by their first endpoint ``i`` (the first
    argument passed to ``add_edges``), and within a group by increasing
    second endpoint ``j``: (0, 0), (0, 1), (0, 2), ... (1, 0), (1, 1), ...
    If self loops are not created, the pairs with ``i == j`` are left out.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.

    Returns
    -------
    g : DGLGraph
        Empty complete graph topology of the molecule
    """
    num_atoms = mol.GetNumAtoms()

    first_endpoints = []
    second_endpoints = []
    for i in range(num_atoms):
        for j in range(num_atoms):
            if add_self_loop or i != j:
                first_endpoints.append(i)
                second_endpoints.append(j)

    g = DGLGraph()
    g.add_nodes(num_atoms)
    g.add_edges(first_endpoints, second_endpoints)
    return g
def convert_mol_to_graph(
    mol: Mol,
    conformer: Optional[Conformer],
    atom_rdkit_features: Sequence[RDKitFeature],
    bond_rdkit_features: Sequence[RDKitFeature],
    one_hot_encoding: bool = True,
    master_node: bool = True,
) -> DGLGraph:
    """Convert a molecule into a DGLGraph via the generic graph dictionary.

    All arguments are forwarded unchanged to
    ``convert_mol_to_generic_graph``. From its result, ``'node_attr'`` and
    ``'node_pos'`` become the node data ``'attr'`` and ``'pos'``. Each row
    of ``'edge_index'`` is inserted twice, once per direction, and both
    insertions carry ``'edge_attr'``.

    :return: the populated DGLGraph
    """
    _graph_dict: Dict[str, torch.Tensor] = convert_mol_to_generic_graph(
        mol=mol,
        conformer=conformer,
        atom_rdkit_features=atom_rdkit_features,
        bond_rdkit_features=bond_rdkit_features,
        one_hot_encoding=one_hot_encoding,
        master_node=master_node,
    )

    node_attr = _graph_dict['node_attr']
    dgl_graph = DGLGraph()
    dgl_graph.add_nodes(num=len(node_attr), )
    dgl_graph.nodes.data['attr'] = node_attr
    dgl_graph.nodes.data['pos'] = _graph_dict['node_pos']

    # all DGL graphs are directional, so need to add edges twice
    # ref: https://docs.dgl.ai/api/python/graph.html
    edge_index = _graph_dict['edge_index']
    edge_attr = _graph_dict['edge_attr']
    heads, tails = edge_index[:, 0], edge_index[:, 1]
    for first, second in ((heads, tails), (tails, heads)):
        dgl_graph.add_edges(first, second, edge_attr)

    return dgl_graph
def get_graph_from_smile(molecule_smile):
    """
    Method that constructs a molecular graph with nodes being the atoms
    and bonds being the edges.

    Node features are stored under ``G.ndata['x']`` and edge features under
    ``G.edata['w']``. Every bond contributes two directed edges, i -> j and
    j -> i, because all ordered atom pairs are scanned.

    :param molecule_smile: SMILE sequence
    :return: DGL graph object carrying the node and edge features
    :raises ValueError: if the SMILES string cannot be parsed
    """
    molecule = Chem.MolFromSmiles(molecule_smile)
    if molecule is None:
        # MolFromSmiles signals a parse failure by returning None; fail here
        # with a clear message instead of deep inside the feature code.
        raise ValueError(
            'could not parse SMILES string: {!r}'.format(molecule_smile))

    G = DGLGraph()
    features = rdDesc.GetFeatureInvariants(molecule)
    num_atoms = molecule.GetNumAtoms()

    # Chirality label per atom index; 0 for atoms that are not reported.
    stereo = Chem.FindMolChiralCenters(molecule)
    chiral_centers = [0] * num_atoms
    for center in stereo:
        chiral_centers[center[0]] = center[1]

    G.add_nodes(num_atoms)
    node_features = []
    edge_features = []
    for i in range(num_atoms):
        atom_i = molecule.GetAtomWithIdx(i)
        node_features.append(
            get_atom_features(atom_i, chiral_centers[i], features[i]))
        # Scan in index order so edge ids stay aligned with edge_features.
        for j in range(num_atoms):
            bond_ij = molecule.GetBondBetweenAtoms(i, j)
            if bond_ij is not None:
                G.add_edge(i, j)
                edge_features.append(get_bond_features(bond_ij))

    G.ndata['x'] = np.array(node_features)
    G.edata['w'] = np.array(edge_features)
    return G
def test_send_multigraph():
    """send / recv on a multigraph with three parallel edges 0 -> 1.

    Edge ids: 0, 1 and 2 are the parallel edges 0 -> 1, edge 3 is 2 -> 1.
    The reducer takes the element-wise maximum over the mailbox.
    """
    g = DGLGraph(multigraph=True)
    g.add_nodes(3)
    for _ in range(3):
        g.add_edge(0, 1)
    g.add_edge(2, 1)

    def _send_plain(edges):
        return {'a': edges.data['a']}

    def _send_tripled(edges):
        return {'a': edges.data['a'] * 3}

    def _take_max(nodes):
        return {'a': F.max(nodes.mailbox['a'], 1)}

    def _expected_max(*rows):
        return F.max(F.stack(rows, 0), 0)

    old_repr = F.randn((4, 5))

    def _reset_features():
        # Zero node features and restore the edge features before a scenario.
        g.ndata['a'] = F.zeros((3, 5))
        g.edata['a'] = old_repr

    # send by eid
    _reset_features()
    g.send([0, 2], message_func=_send_plain)
    g.recv(1, _take_max)
    new_repr = g.ndata['a']
    assert F.allclose(new_repr[1], _expected_max(old_repr[0], old_repr[2]))

    _reset_features()
    g.send([0, 2, 3], message_func=_send_plain)
    g.recv(1, _take_max)
    new_repr = g.ndata['a']
    assert F.allclose(new_repr[1], _expected_max(old_repr[0], old_repr[2], old_repr[3]))

    # send on multigraph
    _reset_features()
    g.send(([0, 2], [1, 1]), _send_plain)
    g.recv(1, _take_max)
    new_repr = g.ndata['a']
    assert F.allclose(new_repr[1], F.max(old_repr, 0))

    # consecutive send and send_on
    _reset_features()
    g.send((2, 1), _send_plain)
    g.send([0, 1], message_func=_send_tripled)
    g.recv(1, _take_max)
    new_repr = g.ndata['a']
    assert F.allclose(new_repr[1], _expected_max(old_repr[0] * 3, old_repr[1] * 3, old_repr[3]))

    # consecutive send_on
    _reset_features()
    g.send(0, message_func=_send_plain)
    g.send(1, message_func=_send_tripled)
    g.recv(1, _take_max)
    new_repr = g.ndata['a']
    assert F.allclose(new_repr[1], _expected_max(old_repr[0], old_repr[1] * 3))

    # send_and_recv_on
    _reset_features()
    g.send_and_recv([0, 2, 3], message_func=_send_plain, reduce_func=_take_max)
    new_repr = g.ndata['a']
    assert F.allclose(new_repr[1], _expected_max(old_repr[0], old_repr[2], old_repr[3]))
    # Nodes 0 and 2 receive nothing and keep their zero features.
    assert F.allclose(new_repr[[0, 2]], F.zeros((2, 5)))
############################################################################### # Create graph and model # ~~~~~~~~~~~~~~~~~~~~~~~ # configurations n_hidden = 16 # number of hidden units n_bases = -1 # use number of relations as number of bases n_hidden_layers = 0 # use 1 input layer, 1 output layer, no hidden layer n_epochs = 25 # epochs to train lr = 0.01 # learning rate l2norm = 0 # L2 norm coefficient # create graph g = DGLGraph() g.add_nodes(num_nodes) g.add_edges(data.edge_src, data.edge_dst) g.edata.update({'rel_type': edge_type, 'norm': edge_norm}) # create model model = Model(len(g), n_hidden, num_classes, num_rels, num_bases=n_bases, num_hidden_layers=n_hidden_layers) ############################################################################### # Training loop # ~~~~~~~~~~~~~~~~
def main(args):
    """Train an ``EglAPPNP`` model on a dataset stored as ``.npy`` files.

    Reads ``edges.npy``, ``features.npy``, ``train_mask.npy`` and
    ``labels.npy`` from ``./dataset/<args.dataset>/``, builds the graph
    (with a self loop on every node), trains for ``args.num_epochs`` epochs
    and prints the training accuracy, the time and the peak GPU memory of
    each epoch. The last line printed is the peak memory in GiB and the mean
    epoch time, where the first three epochs are excluded as warm-up.

    CUDA-only calls (synchronisation, memory statistics) are issued only
    when ``args.gpu >= 0``, and the memory statistic is read from the
    selected device instead of always device 0.

    :param args: namespace with ``dataset``, ``gpu``, ``hidden_sizes``,
        ``in_drop``, ``edge_drop``, ``alpha``, ``k``, ``lr``,
        ``weight_decay`` and ``num_epochs``
    """
    # load and preprocess dataset
    path = './dataset/' + str(args.dataset) + '/'
    edges = np.load(path + 'edges.npy')
    features = np.load(path + 'features.npy')
    train_mask = np.load(path + 'train_mask.npy')
    labels = np.load(path + 'labels.npy')

    num_edges = edges.shape[0]
    num_nodes = features.shape[0]
    num_feats = features.shape[1]
    # Plain int rather than a numpy scalar, as in the GCN training script.
    n_classes = int(max(labels) - min(labels) + 1)

    assert train_mask.shape[0] == num_nodes

    print('dataset {}'.format(args.dataset))
    print('# of edges : {}'.format(num_edges))
    print('# of nodes : {}'.format(num_nodes))
    print('# of features : {}'.format(num_feats))

    features = torch.FloatTensor(features)
    labels = torch.LongTensor(labels)
    if hasattr(torch, 'BoolTensor'):
        train_mask = torch.BoolTensor(train_mask)
    else:
        train_mask = torch.ByteTensor(train_mask)

    cuda = args.gpu >= 0
    if cuda:
        torch.cuda.set_device(args.gpu)
        features = features.cuda()
        labels = labels.cuda()
        train_mask = train_mask.cuda()

    u = edges[:, 0]
    v = edges[:, 1]

    # initialize a DGL graph
    g = DGLGraph()
    g.add_nodes(num_nodes)
    g.add_edges(u, v)
    # add self loop
    g.add_edges(g.nodes(), g.nodes())
    g.set_n_initializer(dgl.init.zero_initializer)
    g.set_e_initializer(dgl.init.zero_initializer)

    # create APPNP model
    model = EglAPPNP(g, num_feats, args.hidden_sizes, n_classes, F.relu,
                     args.in_drop, args.edge_drop, args.alpha, args.k)
    if cuda:
        model.cuda()
    loss_fcn = torch.nn.CrossEntropyLoss()

    # use optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    dur = []
    Used_memory = 0
    for epoch in range(args.num_epochs):
        # Synchronise so the timer only covers this epoch's GPU work.
        if cuda:
            torch.cuda.synchronize()
        model.train()
        t0 = time.time()

        # forward
        logits = model(features)
        loss = loss_fcn(logits[train_mask], labels[train_mask])
        now_mem = torch.cuda.max_memory_allocated(args.gpu) if cuda else 0
        Used_memory = max(now_mem, Used_memory)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if cuda:
            torch.cuda.synchronize()
        run_time_this_epoch = time.time() - t0

        # The first three epochs are treated as warm-up.
        if epoch >= 3:
            dur.append(run_time_this_epoch)

        train_acc = accuracy(logits[train_mask], labels[train_mask])
        print(
            'Epoch {:05d} | Time(s) {:.4f} | train_acc {:.6f} | Used_Memory {:.6f} mb'
            .format(epoch, run_time_this_epoch, train_acc,
                    (now_mem * 1.0 / (1024**2))))

    Used_memory /= (1024**3)
    # No timed epochs means there is no mean; report nan without a warning.
    mean_dur = np.mean(dur) if dur else float('nan')
    print('^^^{:6f}^^^{:6f}'.format(Used_memory, mean_dur))
def reversed_graph(g):
    """Return a new DGLGraph with every edge of ``g`` reversed.

    The result has as many nodes as ``g`` and one edge ``v -> u`` for every
    edge ``u -> v`` of ``g``. Only the topology is built; no node or edge
    data is copied.
    """
    sources, destinations = g.all_edges()
    flipped = DGLGraph()
    flipped.add_nodes(g.number_of_nodes())
    flipped.add_edges(destinations, sources)
    return flipped
def main(args):
    """Train an ``EglGCN`` model on a dataset stored as ``.npy`` files.

    Reads ``edges.npy``, ``features.npy``, ``train_mask.npy`` and
    ``labels.npy`` from ``./dataset/<args.dataset>/``, builds the graph
    (optionally with self loops), stores the symmetric normalisation factor
    ``in_degree ** -0.5`` under ``g.ndata['norm']``, trains for
    ``args.num_epochs`` epochs and prints the training accuracy, the time
    and the peak GPU memory of each epoch. The last line printed is the peak
    memory in GiB and the mean epoch time, where the first three epochs are
    excluded as warm-up.

    CUDA-only calls (synchronisation, memory statistics) are issued only
    when ``args.gpu >= 0``, and the memory statistic is read from the
    selected device instead of always device 0.

    :param args: namespace with ``dataset``, ``gpu``, ``self_loop``,
        ``num_hidden``, ``num_layers``, ``dropout``, ``lr``,
        ``weight_decay`` and ``num_epochs``
    """
    # load and preprocess dataset
    path = './dataset/' + str(args.dataset) + '/'
    edges = np.load(path + 'edges.npy')
    features = np.load(path + 'features.npy')
    train_mask = np.load(path + 'train_mask.npy')
    labels = np.load(path + 'labels.npy')

    num_edges = edges.shape[0]
    num_nodes = features.shape[0]
    num_feats = features.shape[1]
    n_classes = int(max(labels) - min(labels) + 1)

    assert train_mask.shape[0] == num_nodes

    print('dataset {}'.format(args.dataset))
    print('# of edges : {}'.format(num_edges))
    print('# of nodes : {}'.format(num_nodes))
    print('# of features : {}'.format(num_feats))

    features = torch.FloatTensor(features)
    labels = torch.LongTensor(labels)
    if hasattr(torch, 'BoolTensor'):
        train_mask = torch.BoolTensor(train_mask)
    else:
        train_mask = torch.ByteTensor(train_mask)

    cuda = args.gpu >= 0
    if cuda:
        torch.cuda.set_device(args.gpu)
        features = features.cuda()
        labels = labels.cuda()
        train_mask = train_mask.cuda()

    u = edges[:, 0]
    v = edges[:, 1]
    g = DGLGraph()
    g.add_nodes(num_nodes)
    g.add_edges(u, v)
    # add self loop
    if args.self_loop:
        g = transform.add_self_loop(g)

    # normalization
    degs = g.in_degrees().float()
    norm = torch.pow(degs, -0.5)
    # Nodes without incoming edges would otherwise get an infinite factor.
    norm[torch.isinf(norm)] = 0
    if cuda:
        norm = norm.cuda()
    g.ndata['norm'] = norm.unsqueeze(1)

    model = EglGCN(g, num_feats, args.num_hidden, n_classes, args.num_layers,
                   F.relu, args.dropout)
    if cuda:
        model.cuda()
    loss_fcn = torch.nn.CrossEntropyLoss()

    # use optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    dur = []
    Used_memory = 0
    for epoch in range(args.num_epochs):
        model.train()
        # Synchronise so the timer only covers this epoch's GPU work.
        if cuda:
            torch.cuda.synchronize()
        t0 = time.time()

        # forward
        logits = model(features)
        loss = loss_fcn(logits[train_mask], labels[train_mask])
        now_mem = torch.cuda.max_memory_allocated(args.gpu) if cuda else 0
        Used_memory = max(now_mem, Used_memory)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if cuda:
            torch.cuda.synchronize()
        run_time_this_epoch = time.time() - t0

        # The first three epochs are treated as warm-up.
        if epoch >= 3:
            dur.append(run_time_this_epoch)

        train_acc = accuracy(logits[train_mask], labels[train_mask])
        print(
            'Epoch {:05d} | Time(s) {:.4f} | train_acc {:.6f} | Used_Memory {:.6f} mb'
            .format(epoch, run_time_this_epoch, train_acc,
                    (now_mem * 1.0 / (1024**2))))

    Used_memory /= (1024**3)
    # No timed epochs means there is no mean; report nan without a warning.
    mean_dur = np.mean(dur) if dur else float('nan')
    print('^^^{:6f}^^^{:6f}'.format(Used_memory, mean_dur))
def main(args):
    """Train and evaluate ``EntityClassify`` on an entity classification set.

    The dataset is loaded with ``load_data``. When ``args.validation`` is
    set, the first fifth of the training indices is held out for
    validation; otherwise the training indices double as validation
    indices. Node ids are used as input features. After training, the test
    loss and accuracy are printed together with the mean forward and
    backward times, computed over the last three quarters of the epochs.

    :param args: namespace with ``dataset``, ``bfs_level``, ``relabel``,
        ``validation``, ``gpu``, ``n_hidden``, ``n_bases``, ``n_layers``,
        ``dropout``, ``use_self_loop``, ``lr``, ``l2norm`` and ``n_epochs``
    """
    # load graph data
    data = load_data(args.dataset, bfs_level=args.bfs_level,
                     relabel=args.relabel)
    num_nodes = data.num_nodes
    num_rels = data.num_rels
    num_classes = data.num_classes
    labels = data.labels
    train_idx = data.train_idx
    test_idx = data.test_idx

    # split dataset into train, validate, test
    if args.validation:
        holdout = len(train_idx) // 5
        val_idx = train_idx[:holdout]
        train_idx = train_idx[holdout:]
    else:
        val_idx = train_idx

    # since the nodes are featureless, the input feature is then the node id.
    feats = torch.arange(num_nodes)

    # edge type and normalization factor
    edge_type = torch.from_numpy(data.edge_type)
    edge_norm = torch.from_numpy(data.edge_norm).unsqueeze(1)
    labels = torch.from_numpy(labels).view(-1)

    # check cuda
    use_cuda = args.gpu >= 0 and torch.cuda.is_available()
    if use_cuda:
        torch.cuda.set_device(args.gpu)
        feats = feats.cuda()
        edge_type = edge_type.cuda()
        edge_norm = edge_norm.cuda()
        labels = labels.cuda()

    def _accuracy(scores, idx):
        # Fraction of the nodes in idx whose arg-max class equals the label.
        hits = torch.sum(scores[idx].argmax(dim=1) == labels[idx]).item()
        return hits / len(idx)

    # create graph
    g = DGLGraph()
    g.add_nodes(num_nodes)
    g.add_edges(data.edge_src, data.edge_dst)

    # create model
    model = EntityClassify(len(g),
                           args.n_hidden,
                           num_classes,
                           num_rels,
                           num_bases=args.n_bases,
                           num_hidden_layers=args.n_layers - 2,
                           dropout=args.dropout,
                           use_self_loop=args.use_self_loop,
                           use_cuda=use_cuda)
    if use_cuda:
        model.cuda()

    # optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr,
                                 weight_decay=args.l2norm)

    # training loop
    print("start training...")
    forward_time = []
    backward_time = []
    model.train()
    for epoch in range(args.n_epochs):
        optimizer.zero_grad()

        started = time.time()
        logits = model(g, feats, edge_type, edge_norm)
        loss = F.cross_entropy(logits[train_idx], labels[train_idx])
        forward_done = time.time()
        loss.backward()
        optimizer.step()
        backward_done = time.time()

        forward_time.append(forward_done - started)
        backward_time.append(backward_done - forward_done)
        print(
            "Epoch {:05d} | Train Forward Time(s) {:.4f} | Backward Time(s) {:.4f}"
            .format(epoch, forward_time[-1], backward_time[-1]))

        train_acc = _accuracy(logits, train_idx)
        val_loss = F.cross_entropy(logits[val_idx], labels[val_idx])
        val_acc = _accuracy(logits, val_idx)
        print(
            "Train Accuracy: {:.4f} | Train Loss: {:.4f} | Validation Accuracy: {:.4f} | Validation loss: {:.4f}"
            .format(train_acc, loss.item(), val_acc, val_loss.item()))
    print()

    # evaluation on the test split
    model.eval()
    logits = model.forward(g, feats, edge_type, edge_norm)
    test_loss = F.cross_entropy(logits[test_idx], labels[test_idx])
    test_acc = _accuracy(logits, test_idx)
    print("Test Accuracy: {:.4f} | Test loss: {:.4f}".format(
        test_acc, test_loss.item()))
    print()

    # The first quarter of the epochs is left out of the timing averages.
    timed_forward = forward_time[len(forward_time) // 4:]
    timed_backward = backward_time[len(backward_time) // 4:]
    print("Mean forward time: {:4f}".format(np.mean(timed_forward)))
    print("Mean backward time: {:4f}".format(np.mean(timed_backward)))
class GraphRecommender:
    """Rapidly trains similarity embeddings for graphs and generates recommendations.

    Attributes
    ----------
    G : DGL Graph object
        Current DGL graph holding everything added through ``add_nodes`` and
        ``add_edges``.
    node_ids : pandas data frame
        Contains mapping from user provided nodeids to DGL and faiss compatible
        integer ids. Also contains various flags which identify properties and
        classes of the nodes.
    """

    def __init__(self, embedding_dim,
                 feature_dim=None,
                 hidden_dim=None,
                 hidden_layers=2,
                 dropout=0,
                 agg_type='gcn',
                 distance='cosine',
                 torch_device='cpu',
                 faiss_gpu=False,
                 inference_batch_size=10000,
                 p_train=1,
                 train_faiss_index=False):
        """Generates embeddings for graph data such that embeddings close by a given
        distance metric are 'similar'. Embeddings can be used to predict which nodes
        belong to the same class. The embeddings can be trained with triplet loss in a
        fully supervised, semi-supervised or fully unsupervised manner. GraphSage is
        used to allow minibatch training. A faiss index is used for nearest-neighbor
        queries over the embeddings.

        Args
        ----
        embedding_dim : int
            the dimension of the final output embedding used for similarity search
        feature_dim : int
            the dimension of the input node features, currently only allowed to be
            a trainable embedding. Defaults to 2*hidden_dim.
        hidden_dim : int
            the dimension of the intermediate hidden layers. Defaults to
            4*embedding_dim.
        hidden_layers : int
            number of hidden layers. Defaults to 2.
        dropout : float
            dropout passed to the GraphSage network. Defaults to 0.
        agg_type : str
            aggregation function passed to the GraphSage network. Defaults to 'gcn'.
        distance : str
            distance metric to use for similarity search. Valid options are 'l2'
            and 'cosine'. Defaults to 'cosine'.
        torch_device : str
            computation device to place pytorch tensors on. Defaults to 'cpu'.
        faiss_gpu : bool
            whether to move the faiss index to gpu.
        inference_batch_size : int
            number of nodes to compute per batch when computing all embeddings
            with self.net.inference. Defaults to 10000.
        p_train : float
            the proportion of nodes with known class labels to use for training.
            Must lie in [0, 1]. Defaults to 1.
        train_faiss_index : bool
            whether to wrap the flat faiss index in a trained IVF index.

        Raises
        ------
        ValueError
            if ``distance`` is not one of the implemented metrics.
        """
        self.embedding_dim = embedding_dim
        self.device = torch_device
        self.inference_batch_size = inference_batch_size
        assert 0 <= p_train <= 1
        self.p_train = p_train
        self.faiss_gpu = faiss_gpu
        self.train_faiss = train_faiss_index

        self.distance_metric = distance
        # reduction='none' is the non-deprecated spelling of reduce=False.
        if self.distance_metric == 'cosine':
            self.distance_function = lambda t1, t2: F.cosine_embedding_loss(
                t1, t2, th.ones(t1.shape[0]).to(self.device), reduction='none')
        elif self.distance_metric == 'l2':
            self.distance_function = lambda t1, t2: th.sum(
                F.mse_loss(t1, t2, reduction='none'), dim=1)
        else:
            # The attribute is distance_metric; the previous message read the
            # non-existent self.distance and died with AttributeError instead.
            raise ValueError('distance {} is not implemented'.format(self.distance_metric))

        hidden_dim = embedding_dim * 4 if hidden_dim is None else hidden_dim
        feature_dim = hidden_dim * 2 if feature_dim is None else feature_dim
        self.feature_dim = feature_dim
        self.net = SAGE(feature_dim, hidden_dim, embedding_dim, hidden_layers,
                        F.relu, dropout, agg_type)
        self.net.to(self.device)

        # Trainable input features; created lazily by set_masks (or by load).
        self.embed = None
        self.features = None

        self._embeddings = None
        self._index = None
        self._masks_set = False

        self.node_ids = pd.DataFrame(columns=['id', 'intID', 'classid', 'feature_flag'])
        self.G = DGLGraph()

        # hold init args in memory in case needed to save to disk for restoring later
        self.initargs = (embedding_dim, feature_dim, hidden_dim, hidden_layers,
                         dropout, agg_type, distance, torch_device, faiss_gpu,
                         inference_batch_size, p_train, train_faiss_index)

    def _invalidate_caches(self):
        """Marks masks, cached embeddings and the cached faiss index as stale."""
        self._masks_set = False
        self._embeddings = None
        self._index = None

    def _make_graph_mutable(self):
        """Converts a read-only graph back into one that accepts new nodes/edges."""
        if self.G.is_readonly:
            self.G = dgl.as_immutable_graph(self.G)
            self.G.readonly(False)

    def add_nodes(self, nodearray, skip_duplicates=False):
        """Define nodes by passing an array (or array like object). Nodes
        can be identified by any data type (even mixed data types), but each
        node must be unique. Each node is mapped to a unique integer id based
        on the order they are added.

        Args
        ----
        nodearray : numpy array (or array-like object)
            array containing the identifiers of each node to be added
        skip_duplicates : bool
            if true, ignore nodes which have already been added. If False, raise error.

        Raises
        ------
        ValueError
            if ``nodearray`` contains repeated ids, or if some ids already exist
            and ``skip_duplicates`` is False.
        """
        nodedf = pd.DataFrame(nodearray, columns=['id'])

        if len(nodedf) != len(nodedf.drop_duplicates()):
            raise ValueError('Provided nodeids are not unique. Please pass an array of unique identifiers.')

        nodes_already_exist = nodedf.merge(self.node_ids, on='id', how='inner')
        if len(nodes_already_exist) > 0:
            if not skip_duplicates:
                raise ValueError(
                    'Some provided nodes have already been added to the graph. See node_ids.ids.')
            # get rid of the duplicates
            nodes_already_exist['dropflag'] = True
            nodedf = nodedf.merge(nodes_already_exist, on='id', how='left')
            nodedf['dropflag'] = ~pd.isna(nodedf.dropflag)
            nodedf = nodedf.drop(nodedf[nodedf.dropflag].index)
            nodedf = nodedf[['id']].copy()

        num_new_nodes = len(nodedf)
        # max() of the empty intID column is NaN, which marks the first insertion.
        start = self.node_ids.intID.max() + 1
        # int() guards against intID having been upcast to float, which range() rejects.
        start = 0 if np.isnan(start) else int(start)
        end = start + num_new_nodes

        nodedf['intID'] = range(start, end)
        nodedf['classid'] = None
        nodedf['feature_flag'] = False

        self.node_ids = pd.concat([self.node_ids, nodedf])

        self._make_graph_mutable()
        self.G.add_nodes(num_new_nodes)

        self._invalidate_caches()

    def add_edges(self, n1, n2):
        """Adds edges to the DGL graph. Nodes must be previously defined by
        add_nodes or an exception is raised. Edges are directed. To define
        an undirected graph, include both n1->n2 and n2->n1 in the graph.

        Args
        ----
        n1 : numpy array (or array-like object)
            first node in the edge (n1->n2)
        n2 : numpy array (or array-like object)
            second node in the edge (n1->n2)

        Raises
        ------
        ValueError
            if an endpoint is not a known node id.
        """
        edgedf_all = pd.DataFrame(n1, columns=['n1'])
        edgedf_all['n2'] = n2

        # Edges are inserted in chunks of roughly MAX_ADD_EDGES rows.
        chunks = int(max(len(edgedf_all) // MAX_ADD_EDGES, 1))
        edge_chunks = np.array_split(edgedf_all, chunks)

        pbar = tqdm.tqdm(total=chunks) if chunks > 1 else None
        try:
            for _ in range(chunks):
                edgedf = edge_chunks.pop()
                edgedf = edgedf.merge(self.node_ids, left_on='n1', right_on='id', how='left')
                edgedf = edgedf.merge(self.node_ids, left_on='n2', right_on='id',
                                      how='left', suffixes=('', '2'))
                edgedf = edgedf[['intID', 'intID2']]

                if len(edgedf) != len(edgedf.dropna()):
                    raise ValueError('Some edges do not correspond to any known node. Please add with add_nodes method first.')

                self._make_graph_mutable()
                self.G.add_edges(edgedf.intID, edgedf.intID2)

                if pbar is not None:
                    pbar.update(1)
        finally:
            # Earlier chunks may already be in the graph when a later one fails,
            # so always close the bar and drop anything derived from the old graph.
            if pbar is not None:
                pbar.close()
            self._invalidate_caches()

    def _update_node_ids(self, datadf):
        """Overwrites existing information about nodes with new info contained in a dataframe.
        Temporarily sets id as the index to use built in pandas update method aligned on index.

        Args
        ----
        datadf : data frame
            has the same structure as self.node_ids
        """
        # Index a copy so the caller's frame is left untouched.
        datadf = datadf.set_index('id', drop=True)
        self.node_ids.set_index('id', inplace=True, drop=True)
        self.node_ids.update(datadf, overwrite=True)
        self.node_ids.reset_index(inplace=True)

    def _update_node_column(self, values, column):
        """Writes ``values`` (node id -> value) into ``column`` of self.node_ids.

        Raises
        ------
        ValueError
            if any node id in ``values`` is unknown.
        """
        updatedf = pd.DataFrame(list(values.items()), columns=['id', column])
        # The new values keep the plain column name; the stored ones get suffix '2'.
        updatedf = updatedf.merge(self.node_ids, on='id', how='left', suffixes=('', '2'))
        if updatedf['intID'].isna().sum() > 0:
            raise ValueError('Some nodes in update do not exist in graph. Add them first with add_nodes.')
        updatedf = updatedf[['id', 'intID', 'classid', 'feature_flag']]
        self._update_node_ids(updatedf)
        self._invalidate_caches()

    def update_labels(self, labels):
        """Updates nodes by adding a label (or class). Existing class label is overridden if one already exists.
        Any node which does not have a known class has a label of None. All nodes included in the update must be
        previously defined by add_nodes or an exception is raised.

        Args
        ----
        labels : dictionary or pandas series
            maps node ids to label, i.e. classid. If pandas series the index acts as the dictionary key.

        Raises
        ------
        ValueError
            if a node id in ``labels`` has not been added to the graph.
        """
        self._update_node_column(labels, 'classid')

    def update_feature_flag(self, flags):
        """Updates node by adding a feature flag. This can be True or False.
        Nodes whose feature flag is True are excluded from the entity mask and
        therefore from the faiss index, so they are never returned as a neighbor.
        They remain in the graph.

        Args
        ----
        flags : dictionary or pandas series
            maps node ids to feature flag. If pandas series the index acts as the dictionary key.

        Raises
        ------
        ValueError
            if a node id in ``flags`` has not been added to the graph.
        """
        self._update_node_column(flags, 'feature_flag')

    def set_masks(self):
        """Sets train, test, and relevance masks. Needs to be called once after data has been added to graph.
        self.train and self.evaluate automatically check if this needs to be called and will call it,
        but it can also be called manually. Can be called a second time manually to reroll the random generation
        of the train and test sets."""

        self.node_ids = self.node_ids.sort_values('intID')
        self.labels = self.node_ids.classid.to_numpy()

        # is relevant mask indicates the nodes which we know the class of
        self.is_relevant_mask = np.logical_not(pd.isna(self.node_ids.classid).to_numpy())

        # entity_mask indicates the nodes which we want to include in the faiss index
        # (builtin bool: the np.bool alias was removed in NumPy 1.24)
        self.entity_mask = np.logical_not(self.node_ids.feature_flag.to_numpy().astype(bool))

        self.train_mask = np.random.choice(
            a=[False, True], size=(len(self.node_ids)), p=[1 - self.p_train, self.p_train])

        # test set is all nodes other than the train set unless train set is all
        # nodes and then test set is the same as train set.
        if self.p_train != 1:
            self.test_mask = np.logical_not(self.train_mask)
        else:
            self.test_mask = self.train_mask

        # do not include any node without a classid in either set
        self.train_mask = np.logical_and(self.train_mask, self.is_relevant_mask)
        self.train_mask = np.logical_and(self.train_mask, self.entity_mask)
        self.test_mask = np.logical_and(self.test_mask, self.is_relevant_mask)
        self.test_mask = np.logical_and(self.test_mask, self.entity_mask)

        if not self.G.is_readonly:
            # The graph changed since the last call: start from fresh features.
            self.embed = nn.Embedding(len(self.node_ids), self.feature_dim)
            self.G.readonly()
            self.G = dgl.as_heterograph(self.G)
            self.G.ndata['features'] = self.embed.weight
        elif self.embed is None or self.embed.num_embeddings != len(self.node_ids):
            # Graph restored read-only from disk (load_graph_data) without a
            # matching embedding table: create one so training/inference can run.
            self.embed = nn.Embedding(len(self.node_ids), self.feature_dim)
            self.G.ndata['features'] = self.embed.weight

        # Always (re)bind features: after load() the embedding exists but this
        # attribute was never assigned.
        self.embed.to(self.device)
        self.features = self.embed.weight

        self._masks_set = True

    @property
    def embeddings(self):
        """Updates all node embeddings if needed and returns the embeddings.
        Simple implementation of a cached property.

        Returns
        -------
        embeddings : numpy array
            result of self.net.inference over the whole graph"""

        if self._embeddings is None:
            if not self._masks_set:
                self.set_masks()
            print('computing embeddings for all nodes...')
            with th.no_grad():
                self._embeddings = self.net.inference(
                    self.G, self.features, self.inference_batch_size,
                    self.device).detach().cpu().numpy()
        return self._embeddings

    @property
    def index(self):
        """Creates a faiss index for similarity searches over the node embeddings.
        Simple implementation of a cached property.

        Returns
        -------
        a faiss index with input embeddings added and optionally trained"""

        if self._index is None:
            if not self._masks_set:
                self.set_masks()

            if self.distance_metric == 'cosine':
                index = faiss.IndexFlatIP(self.embedding_dim)
                metric = faiss.METRIC_INNER_PRODUCT
                # normalize_L2 operates in place so np.copy any views into a new array before using.
                embeddings = np.copy(self.embeddings[self.entity_mask])
                faiss.normalize_L2(embeddings)
            elif self.distance_metric == 'l2':
                index = faiss.IndexFlatL2(self.embedding_dim)
                metric = faiss.METRIC_L2
                embeddings = self.embeddings[self.entity_mask]

            if self.train_faiss:
                training_points = min(
                    len(self.node_ids) // FAISS_NODES_TO_CLUSTERS + 1,
                    MAXIMUM_FAISS_CLUSTERS)
                # Pass the metric explicitly: IndexIVFFlat defaults to L2, which
                # would make a trained cosine index report different distances
                # from the untrained flat index.
                index = faiss.IndexIVFFlat(index, self.embedding_dim, training_points, metric)
                index.train(embeddings)

            index.add(embeddings)

            if self.faiss_gpu:
                GPU = faiss.StandardGpuResources()
                index = faiss.index_cpu_to_gpu(GPU, 0, index)

            # Cache only once the index is fully built.
            self._index = index

        return self._index

    def _search_index(self, inputs, k):
        """Directly searches the faiss index and
        returns the k nearest neighbors of inputs

        Args
        ----
        inputs : numpy array np.float
            the vectors to search against the faiss index
        k : int
            how many neighbors to lookup

        Returns
        -------
        D, I distance numpy array and neighbors array from faiss"""

        if self.distance_metric == 'cosine':
            # normalize a copy so the caller's array is not modified in place
            inputs = np.copy(inputs)
            faiss.normalize_L2(inputs)
        D, I = self.index.search(inputs, k)
        return D, I

    def _get_intID(self, nodelist):
        """Accepts a list of nodeids and converts them to the internally used sequential integer id.

        Args
        ----
        nodelist : List
            node identifiers to convert

        Returns
        -------
        list of integer identifiers"""

        relevant_nodes = self.node_ids.loc[self.node_ids.id.isin(nodelist)]
        try:
            intids = [relevant_nodes.loc[relevant_nodes.id == node].intID.iloc[0]
                      for node in nodelist]
        except IndexError:
            # Retry with the identifiers coerced to int (e.g. ids given as strings).
            intids = [relevant_nodes.loc[relevant_nodes.id == int(node)].intID.iloc[0]
                      for node in nodelist]

        return intids

    def get_embeddings(self, nodelist):
        """Looks up the embedding for a specific list of nodes based on their nodeid.

        Args
        ----
        nodelist : List
            list of node identifiers to get the embedding of

        Returns
        -------
        numpy array of final embeddings"""

        intids = self._get_intID(nodelist)
        return self.embeddings[intids, :]

    def _faiss_ids_to_nodeids(self, I, return_labels):
        """Takes an output from faiss index and maps the faissids back to nodeids
        and optionally node class labels

        Args
        ----
        I : numpy array
            array returned from a faiss index search
        return_labels : bool
            whether to lookup labels

        Returns
        -------
        I : list of lists with ids mapped to nodeids
        L : list of lists with ids mapped to node class labels,
            or None if return_labels is false"""

        # Position in the faiss index == position among the entity_mask rows.
        faissid_to_nodeid = self.node_ids.id.to_numpy()[self.entity_mask].tolist()
        L = None
        if return_labels:
            faissid_to_label = self.node_ids.classid.to_numpy()[self.entity_mask].tolist()
            L = [[faissid_to_label[neighbor] for neighbor in neighbors] for neighbors in I]
        I = [[faissid_to_nodeid[neighbor] for neighbor in neighbors] for neighbors in I]
        return I, L

    def query_neighbors(self, nodelist, k, return_labels=False):
        """For each query node in nodelist, return the k closest neighbors in the
        embedding space.

        Args
        ----
        nodelist : list
            list of node identifiers to query
        k : int
            number of neighbors to return
        return_labels : bool
            if true, includes the node label of all neighbors returned

        Returns
        -------
        dictionary of neighbors for each querynode and corresponding distance"""

        if not self._masks_set:
            self.set_masks()

        inputs = self.get_embeddings(nodelist)
        D, I = self._search_index(inputs, k)
        I, L = self._faiss_ids_to_nodeids(I, return_labels)
        if return_labels:
            output = {node: {'neighbors': i, 'neighbor labels': l, 'distances': d.tolist()}
                      for node, d, i, l in zip(nodelist, D, I, L)}
        else:
            output = {node: {'neighbors': i, 'distances': d.tolist()}
                      for node, d, i in zip(nodelist, D, I)}
        return output

    def evaluate(self, test_levels=(5, 1), test_only=False):
        """Evaluates performance of current embeddings

        Args
        ----
        test_levels : sequence of ints
            each entry is a number of nearest neighbors and we will test if at least one of the neighbors at each level
            contains a correct neighbor based on node labels. We also test the total share of the neighbors that have a
            correct label.
        test_only : bool
            whether to only test the performance on the test set. If false,
            all nodes with known class will be tested.

        Returns
        -------
        dictionary containing details of the performance of the model at each level
        """

        self.net.eval()

        if not self._masks_set:
            self.set_masks()

        mask = self.test_mask if test_only else self.is_relevant_mask
        test_labels = self.labels[mask]
        faiss_labels = self.labels[self.entity_mask]

        test_embeddings = self.embeddings[mask]

        # we need to return the maximum number of neighbors that we want to test
        # plus 1 since the top neighbor of each node is expected to be itself,
        # which we exclude.
        _, I = self._search_index(test_embeddings, max(test_levels) + 1)

        performance = {level: [] for level in test_levels}
        performance_share = {level: [] for level in test_levels}
        for node, neighbors in enumerate(I):
            label = test_labels[node]
            neighbor_labels = [faiss_labels[n] for n in neighbors[1:]]
            for level in test_levels:
                correct_labels = np.sum([label == nl for nl in neighbor_labels[:level]])
                # at least one label in the neighbors was correct
                performance[level].append(correct_labels > 0)
                # share of labels in the neighbors that was correct
                performance_share[level].append(correct_labels / level)

        return {f'Top {level} neighbors':
                {'Share >=1 correct neighbor': np.mean(performance[level]),
                 'Share of correct neighbors': np.mean(performance_share[level])}
                for level in test_levels}

    @staticmethod
    def setup_pairwise_loss_tensors(labelsnp):
        """Accepts a list of labels and sets up indexers which can be used
        in a triplet loss function along with whether each pair is a positive or
        negative example.

        Args
        ----
        labelsnp : numpy array
            Class labels of each node, labelsnp[i] = class of node with intid i

        Returns
        -------
        idx1 : indexer list for left side comparison
        idx2 : indexer list for right side comparison
        target : list with 1 where left and right side share a label, else -1"""

        idx1 = []
        idx2 = []
        target = []
        count = len(labelsnp)
        # Every unordered pair (i, j) with i < j, in row-major order.
        for i in range(count):
            for j in range(i + 1, count):
                idx1.append(i)
                idx2.append(j)
                target.append(1 if labelsnp[j] == labelsnp[i] else -1)

        return idx1, idx2, target

    def triplet_loss(self, embeddings, labels):
        """For a given tensor of embeddings and corresponding labels,
        returns a triplet loss maximizing distance between negative examples
        and minimizing distance between positive examples

        Args
        ----
        embeddings : pytorch tensor torch.float32
            embeddings to be trained
        labels : numpy array
            Class labels of each node, labelsnp[i] = class of node with intid i

        Returns
        -------
        scalar pytorch tensor with the loss over all labelled pairs"""

        # Nodes without a known class cannot form positive/negative pairs.
        batch_relevant_nodes = [i for i, l in enumerate(labels) if not pd.isna(l)]
        embeddings = embeddings[batch_relevant_nodes]
        labels = labels[batch_relevant_nodes]
        idx1, idx2, target = self.setup_pairwise_loss_tensors(labels)

        losstarget = th.tensor(target).to(self.device)

        if self.distance_metric == 'cosine':
            input1 = embeddings[idx1]
            input2 = embeddings[idx2]
            loss = F.cosine_embedding_loss(input1, input2, losstarget, margin=0.5)

        elif self.distance_metric == 'l2':
            idx1_pos = [a for a, t in zip(idx1, target) if t == 1]
            idx1_neg = [a for a, t in zip(idx1, target) if t == -1]
            idx2_pos = [b for b, t in zip(idx2, target) if t == 1]
            idx2_neg = [b for b, t in zip(idx2, target) if t == -1]

            input1_pos = embeddings[idx1_pos]
            input2_pos = embeddings[idx2_pos]
            input1_neg = embeddings[idx1_neg]
            input2_neg = embeddings[idx2_neg]

            loss_pos = F.mse_loss(input1_pos, input2_pos)
            # Hinge on negatives: only pairs closer than the 0.25 margin are penalised.
            neg_distance = th.sum(F.mse_loss(input1_neg, input2_neg, reduction='none'), dim=1)
            loss_neg = th.mean(th.max(
                th.zeros(input1_neg.shape[0]).to(self.device), 0.25 - neg_distance))

            loss = loss_pos + loss_neg

        else:
            raise ValueError('distance {} is not implemented'.format(self.distance_metric))

        return loss

    @staticmethod
    def _format_performance(perf, test_levels):
        """Renders an evaluate() result as 'Test TopK ... | Test TopK Total ... '."""
        hit_parts = [
            "Test Top{} {:.4f}".format(
                level, perf[f'Top {level} neighbors']['Share >=1 correct neighbor'])
            for level in test_levels]
        share_parts = [
            "Test Top{} Total {:.4f}".format(
                level, perf[f'Top {level} neighbors']['Share of correct neighbors'])
            for level in test_levels]
        return " | ".join(hit_parts + share_parts) + " "

    def train(self, epochs, batch_size, test_every_n_epochs=1,
              unsupervised=False, learning_rate=1e-2, fanouts=(10, 25),
              neg_samples=1, return_intermediate_embeddings=False, test_levels=(5, 1)):
        """Trains the network weights to improve the embeddings. Can train via supervised learning with triplet loss,
        semisupervised learning with triplet loss, or fully unsupervised learning.

        Args
        ----
        epochs : int
            number of training passes over the data
        batch_size : int
            number of seed nodes for building the training graph
        test_every_n_epochs : int
            how often to do a full evaluation of the embeddings, expensive for large graphs
        unsupervised : bool
            whether to train completely unsupervised
        learning_rate : float
            learning rate to use in the adam optimizer
        fanouts : sequence of int
            number of neighbors to sample at each layer for GraphSage
        neg_samples : int
            number of negative samples to use in unsupervised loss
        return_intermediate_embeddings : bool
            if true, the full embedding matrix is recomputed and stored before
            training and after every step, and returned as a third value
        test_levels : sequence of ints
            passed to self.evaluate, number of neighbors to use for testing accuracy

        Returns
        -------
        loss_history, perf_history, and additionally all_embeddings when
        return_intermediate_embeddings is true

        Raises
        ------
        ValueError
            if there are fewer sample nodes than ``batch_size`` so that no
            batch would ever be produced.
        """

        if not self._masks_set:
            self.set_masks()

        optimizer = th.optim.Adam(
            it.chain(self.net.parameters(), self.embed.parameters()), lr=learning_rate)

        if not unsupervised:
            sampler = NeighborSampler(self.G, [int(fanout) for fanout in fanouts])
            sampledata = np.nonzero(self.train_mask)[0]
        else:
            sampler = UnsupervisedNeighborSampler(
                self.G, [int(fanout) for fanout in fanouts], neg_samples)
            sampledata = list(range(len(self.node_ids)))
            unsup_loss_fn = CrossEntropyLoss()
            unsup_loss_fn.to(self.device)

        # drop_last=True below discards a short final batch; with fewer samples
        # than batch_size nothing would be trained and the end-of-epoch report
        # would hit an unbound loss.
        if epochs > 0 and len(sampledata) < batch_size:
            raise ValueError(
                'batch_size {} exceeds the {} available training nodes, so no batch would be produced'.format(
                    batch_size, len(sampledata)))

        dataloader = DataLoader(
            dataset=sampledata,
            batch_size=batch_size,
            collate_fn=sampler.sample_blocks,
            shuffle=True,
            drop_last=True,
            num_workers=0)

        perf = self.evaluate(test_levels=test_levels, test_only=True)
        # Report whatever levels were requested instead of assuming 5 and 1.
        print(self._format_performance(perf, test_levels))

        loss_history = []
        perf_history = [perf]
        if return_intermediate_embeddings:
            all_embeddings = []
            all_embeddings.append(self.embeddings)

        for epoch in range(1, epochs + 1):

            for step, data in enumerate(dataloader):
                self.net.train()

                # these names are confusing because "seeds" are the input
                # to neighbor generation but the output in the sense that we
                # output their embeddings based on their neighbors...
                # the neighbors are the inputs in the sense that they are what we
                # use to generate the embedding for the seeds.
                if not unsupervised:
                    sup_blocks = data
                    sup_input_nodes = sup_blocks[0].srcdata[dgl.NID]
                    sup_seeds = sup_blocks[-1].dstdata[dgl.NID]

                    sup_batch_inputs = self.features[sup_input_nodes].to(self.device)
                    sup_batch_labels = self.labels[sup_seeds]

                    sup_embeddings = self.net(sup_blocks, sup_batch_inputs)

                    loss = self.triplet_loss(sup_embeddings, sup_batch_labels)
                else:
                    pos_graph, neg_graph, unsup_blocks = data
                    unsup_input_nodes = unsup_blocks[0].srcdata[dgl.NID]

                    # Read from self.features like the supervised branch so the
                    # inputs are the embedding weights the optimizer updates.
                    unsup_batch_inputs = self.features[unsup_input_nodes].to(self.device)

                    unsup_embeddings = self.net(unsup_blocks, unsup_batch_inputs)
                    loss = unsup_loss_fn(unsup_embeddings, pos_graph, neg_graph)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # once the parameters change we no longer know the new embeddings for all nodes
                self._embeddings = None
                self._index = None

                print("Epoch {:05d} | Step {:0.1f} | Loss {:.8f}".format(
                    epoch, step, loss.item()))
                if return_intermediate_embeddings:
                    all_embeddings.append(self.embeddings)
                loss_history.append(loss.item())

            if epoch % test_every_n_epochs == 0 or epoch == epochs:
                perf = self.evaluate(test_levels=test_levels, test_only=True)
                print("Epoch {:05d} | Loss {:.8f} | ".format(epoch, loss.item())
                      + self._format_performance(perf, test_levels))
                perf_history.append(perf)

        if return_intermediate_embeddings:
            return loss_history, perf_history, all_embeddings
        else:
            return loss_history, perf_history

    def start_api(self, *args, **kwargs):
        """Launches a fastapi to query this class in its current state.

        The current state is saved to a ``production_model`` folder next to this
        file, and ``args``/``kwargs`` are forwarded to ``uvicorn.run``."""
        package_path = os.path.dirname(os.path.abspath(__file__))
        production_path = package_path + '/production_model'
        pathlib.Path(production_path).mkdir(exist_ok=True)
        self.save(production_path)
        os.environ['FASTREC_DEPLOY_PATH'] = production_path
        # this import cant be at the top level to prevent circular dependency
        from RecAPI import app
        uvicorn.run(app, *args, **kwargs)

    def save(self, filepath):
        """Save all information necessary to recover current state of the current instance of
        this object to a folder. Initialization args, graph data, node ids, current trained embedding,
        and current torch parameters are all saved.

        Args
        ----
        filepath : str
            path on disk to save files"""

        # Resolve the embeddings first: this runs set_masks when needed, so that
        # self.embed exists and matches the graph before it is written out.
        embeddings = self.embeddings

        outg = dgl.as_immutable_graph(self.G)
        dgl.data.utils.save_graphs(f'{filepath}/dgl.bin', outg)

        self.node_ids.to_csv(f'{filepath}/node_ids.csv', index=False)

        th.save(self.embed, f'{filepath}/embed.torch')
        th.save(self.net.state_dict(), f'{filepath}/model_weights.torch')
        np.save(f'{filepath}/final_embed.npy', embeddings, allow_pickle=False)

        with open(f'{filepath}/initargs.pkl', 'wb') as pklf:
            pickle.dump(self.initargs, pklf)

    def load_graph_data(self, filepath):
        """Restore graph data from disk, but not network parameters or trained embeddings.
        Useful for changing network parameters if you don't want to reconstruct the graph.

        Args
        ----
        filepath : str
            path to where you previously saved the GraphRecommender
        """
        # Previously this method referenced an undefined name (restored_self)
        # and raised NameError; it now operates on self.
        graphs, _ = dgl.data.utils.load_graphs(f'{filepath}/dgl.bin')
        self.G = graphs[0]
        self.G.readonly()
        self.G = dgl.as_heterograph(self.G)

        self.node_ids = pd.read_csv(f'{filepath}/node_ids.csv')
        self._invalidate_caches()

    @classmethod
    def load(cls, filepath, device=None, faiss_gpu=None):
        """Restore a previous instance of this class from disk.

        Args
        ----
        filepath : str
            path on disk to load from
        device : str
            optionally override the pytorch device
        faiss_gpu : bool
            optionally override whether faiss uses gpu

        Returns
        -------
        the restored GraphRecommender"""

        # NOTE(security): pickle.load and th.load execute arbitrary code from the
        # file; only load folders written by a trusted save() call.
        with open(f'{filepath}/initargs.pkl', 'rb') as pklf:
            (embedding_dim, feature_dim, hidden_dim, hidden_layers, dropout,
             agg_type, distance, torch_device, faiss_gpu_loaded,
             inference_batch_size, p_train, train_faiss_index) = pickle.load(pklf)

        if device is not None:
            torch_device = device

        if faiss_gpu is not None:
            faiss_gpu_loaded = faiss_gpu

        restored_self = cls(embedding_dim, feature_dim, hidden_dim, hidden_layers,
                            dropout, agg_type, distance, torch_device, faiss_gpu_loaded,
                            inference_batch_size, p_train, train_faiss_index)

        restored_self.load_graph_data(filepath)

        restored_self.embed = th.load(
            f'{filepath}/embed.torch', map_location=th.device(torch_device))
        restored_self.net.load_state_dict(
            th.load(f'{filepath}/model_weights.torch', map_location=th.device(torch_device)))

        embeddings = np.load(f'{filepath}/final_embed.npy', allow_pickle=False)
        restored_self._embeddings = embeddings

        return restored_self
def main(args):
    """Benchmark full-batch training of an ``EGLRGCNModel`` and report time/memory.

    The dataset is fetched with ``load_data``, the model is trained with Adam for
    ``args.num_epochs`` epochs and then scored on the test split. The final
    line printed has the form ``^^^<peak GiB>^^^<mean fwd+bwd seconds>``.

    Args
    ----
    args : namespace-like object
        must expose ``dataset``, ``bfs_level``, ``relabel``, ``validation``,
        ``gpu``, ``hidden_size``, ``num_bases``, ``dropout``, ``lr``,
        ``l2norm`` and ``num_epochs``.
    """
    # load graph data
    data = load_data(args.dataset, bfs_level=args.bfs_level, relabel=args.relabel)
    num_nodes = data.num_nodes
    num_rels = data.num_rels
    num_classes = data.num_classes
    labels = data.labels
    train_idx = data.train_idx
    test_idx = data.test_idx

    # split dataset into train, validate, test
    if args.validation:
        val_idx = train_idx[:len(train_idx) // 5]
        train_idx = train_idx[len(train_idx) // 5:]
    else:
        val_idx = train_idx

    # since the nodes are featureless, the input feature is then the node id.
    feats = torch.arange(num_nodes)

    # edge type and normalization factor
    edge_type = torch.from_numpy(data.edge_type).long()
    edge_norm = torch.from_numpy(data.edge_norm).unsqueeze(1).float()
    labels = torch.from_numpy(labels).view(-1).long()

    # check cuda
    use_cuda = args.gpu >= 0 and torch.cuda.is_available()
    if use_cuda:
        torch.cuda.set_device(args.gpu)
        feats = feats.cuda()
        edge_type = edge_type.cuda()
        edge_norm = edge_norm.cuda()
        labels = labels.cuda()

    # create graph
    g = DGLGraph()
    g.add_nodes(num_nodes)
    g.add_edges_with_type(data.edge_src, data.edge_dst, data.edge_type)

    model = EGLRGCNModel(num_nodes,
                         args.hidden_size,
                         num_classes,
                         num_rels,
                         edge_type.size(0),
                         num_bases=args.num_bases,
                         activation=F.relu,
                         dropout=args.dropout)

    if use_cuda:
        model.cuda()

    # optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.l2norm)

    # training loop
    print("start training...")
    forward_time = []
    backward_time = []
    model.train()
    train_labels = labels[train_idx]
    train_idx = list(train_idx)
    for epoch in range(args.num_epochs):
        optimizer.zero_grad()
        t0 = time.time()
        logits = model(g, feats, edge_type, edge_norm)
        train_logits = logits[train_idx]
        loss = F.cross_entropy(train_logits, train_labels)
        t1 = time.time()
        loss.backward()
        optimizer.step()
        # Wait for queued kernels so t2 covers the whole backward pass. Only
        # valid with CUDA; calling it unconditionally fails on CPU-only runs.
        if use_cuda:
            torch.cuda.synchronize()
        t2 = time.time()
        # The first three epochs are warm-up and excluded from the statistics.
        # The timing line lives inside the guard because forward_time is still
        # empty before the first recorded epoch.
        if epoch >= 3:
            forward_time.append(t1 - t0)
            backward_time.append(t2 - t1)
            print(
                "Epoch {:05d} | Train Forward Time(s) {:.4f} | Backward Time(s) {:.4f}"
                .format(epoch, forward_time[-1], backward_time[-1]))
        train_acc = torch.sum(logits[train_idx].argmax(
            dim=1) == labels[train_idx]).item() / len(train_idx)
        val_loss = F.cross_entropy(logits[val_idx], labels[val_idx])
        val_acc = torch.sum(logits[val_idx].argmax(
            dim=1) == labels[val_idx]).item() / len(val_idx)
        print(
            "Train Accuracy: {:.4f} | Train Loss: {:.4f} | Validation Accuracy: {:.4f} | Validation loss: {:.4f}"
            .format(train_acc, loss.item(), val_acc, val_loss.item()))

    # Peak memory of the current (selected) device; 0 when running on CPU.
    peak_bytes = torch.cuda.max_memory_allocated() if use_cuda else 0
    print('max memory allocated', peak_bytes)

    model.eval()
    logits = model.forward(g, feats, edge_type, edge_norm)
    test_loss = F.cross_entropy(logits[test_idx], labels[test_idx])
    test_acc = torch.sum(logits[test_idx].argmax(
        dim=1) == labels[test_idx]).item() / len(test_idx)
    print("Test Accuracy: {:.4f} | Test loss: {:.4f}".format(
        test_acc, test_loss.item()))
    print()

    # The first quarter of the recorded epochs is dropped from the averages.
    mean_forward = np.mean(forward_time[len(forward_time) // 4:])
    mean_backward = np.mean(backward_time[len(backward_time) // 4:])
    print("Mean forward time: {:4f}".format(mean_forward))
    print("Mean backward time: {:4f}".format(mean_backward))

    # Query the device chosen with set_device rather than hard-coding device 0.
    Used_memory = (torch.cuda.max_memory_allocated() if use_cuda else 0) / (1024**3)
    avg_run_time = mean_forward + mean_backward
    # output we need
    print('^^^{:6f}^^^{:6f}'.format(Used_memory, avg_run_time))
def _bert_encode_in_chunks(token_ids, attention_masks, chunk_size=50):
    """Run the module-level ``bert_model`` over the rows in fixed-size chunks.

    Parameters
    ----------
    token_ids : torch.LongTensor
        Token ids, one sequence per row.
    attention_masks : torch.LongTensor
        Input masks with the same layout as ``token_ids``.
    chunk_size : int
        Maximum number of rows passed to the model per call.

    Returns
    -------
    torch.Tensor
        The per-chunk encodings moved to CPU and concatenated along dim 0.
    """
    encoded_chunks = []
    # The outputs are detached right away, so there is no point in recording
    # an autograd graph for the forward passes.
    with torch.no_grad():
        for start in range(0, token_ids.size(0), chunk_size):
            stop = start + chunk_size
            encoding, _ = bert_model(token_ids[start:stop], None,
                                     attention_masks[start:stop])
            encoded_chunks.append(encoding.detach().cpu())
    return torch.cat(encoded_chunks, dim=0)


def _bert_edge_evidence_inputs(evidences, tokenizer, num_edges, max_seq_length):
    """Tokenize up to ``num_edges`` evidence sentences into padded edge tensors.

    Parameters
    ----------
    evidences : list
        Evidence texts attached to one (question entity, candidate) pair.
    tokenizer
        Passed through to ``text_tokenize``.
    num_edges : int
        Number of sentence slots per edge; extra sentences are dropped.
    max_seq_length : int
        Passed through to ``text_tokenize``; also the width of each slot.

    Returns
    -------
    tuple
        ``(edge_features, edge_feature_masks, edge_sent_mask)`` where the first
        two are zero-initialised LongTensors of shape
        ``(1, num_edges, max_seq_length)`` and the last is a ByteTensor of shape
        ``(1, num_edges)`` holding 0 for filled slots and 1 for unused ones.
    """
    evidence_tokens = list()
    evidence_masks = list()
    for evi_text in evidences[:num_edges]:
        input_ids, input_mask = text_tokenize(evi_text, tokenizer, max_seq_length)
        evidence_tokens.append(input_ids)
        evidence_masks.append(input_mask)

    edge_features = torch.LongTensor(1, num_edges, max_seq_length).zero_()
    edge_feature_masks = torch.LongTensor(1, num_edges, max_seq_length).zero_()
    edge_sent_mask = torch.ByteTensor(1, num_edges).fill_(1)
    # With an empty evidence list there is nothing to copy; leave every slot
    # marked as unused instead of copying from an empty tensor.
    if evidence_tokens:
        num_filled = len(evidence_tokens)
        edge_features[0, :num_filled, :].copy_(torch.LongTensor(evidence_tokens))
        edge_feature_masks[0, :num_filled, :].copy_(torch.LongTensor(evidence_masks))
        edge_sent_mask[0, :num_filled].fill_(0)
    return edge_features, edge_feature_masks, edge_sent_mask


def vectorize_qanta(ex, tokenizer, device, istrain, max_seq_length=64):
    """Build a DGLGraph with BERT-encoded features for one QANTA example.

    Nodes are laid out as: all question entities first, then the positive
    candidate, then the negative candidates. Edges run from a question entity
    to a candidate whenever the candidate lists evidence for that entity.

    Parameters
    ----------
    ex : dict
        One example. The keys read here are ``'text'``, ``'pos_et'``,
        ``'neg_ets'`` and ``'q_et'``. Each candidate dict provides ``'et'``,
        ``'first_sent'`` and ``'evidence'`` (entity -> list of evidence texts);
        each ``'q_et'`` item provides ``'text'`` and ``'entity'`` (a list of
        dicts with ``'et'`` and ``'first_sent'``).
    tokenizer
        Passed through to ``text_tokenize``.
    device
        Device on which the BERT forward passes are run.
    istrain
        Unused in this function; kept for interface compatibility.
    max_seq_length : int
        Passed through to ``text_tokenize``.

    Returns
    -------
    DGLGraph
        Graph with node fields ``'first_sent'``, ``'first_sent_mask'``,
        ``'question'``, ``'question_mask'`` and ``'label'`` (-1 for question
        entities, 1 for the positive candidate, 0 for negatives) and, when at
        least one edge exists, edge fields ``'evidence'``, ``'evidence_mask'``
        and ``'evidence_sent_mask'``.
    """
    bert_model.eval()
    text = ex['text']
    positive_entity = ex['pos_et']
    negative_entities = ex['neg_ets']
    ## In QANTA setting, we limit the maximum sentences as three
    ## (for efficient training and evaluation)
    num_edges = 3
    g = DGLGraph()
    question_node_list = list()
    candidate_node_list = list()
    first_sent_tokens = list()
    first_sent_masks = list()
    question_tokens = list()
    question_masks = list()

    # Slot 0 of the question lists always holds the full question text.
    input_ids, input_mask = text_tokenize(text, tokenizer, max_seq_length)
    question_tokens.append(input_ids)
    question_masks.append(input_mask)

    # node_sub_questions[i] is the index into question_tokens used by node i.
    node_sub_questions = list()
    for sup_q in ex['q_et']:
        sub_question = sup_q['text']
        input_ids, input_mask = text_tokenize(sub_question, tokenizer, max_seq_length)
        question_tokens.append(input_ids)
        question_masks.append(input_mask)
        for et in sup_q['entity']:
            topic = et['et']
            node_first_sent = et['first_sent']
            if topic is None:
                continue
            question_node_list.append(topic)
            question_idx = len(question_tokens) - 1
            node_sub_questions.append(question_idx)
            input_ids, input_mask = text_tokenize(node_first_sent, tokenizer, max_seq_length)
            first_sent_tokens.append(input_ids)
            first_sent_masks.append(input_mask)

    candidate_node_list.append(normalize(positive_entity['et']))
    input_ids, input_mask = text_tokenize(positive_entity['first_sent'], tokenizer,
                                          max_seq_length)
    first_sent_tokens.append(input_ids)
    first_sent_masks.append(input_mask)
    node_sub_questions.append(0)

    for neg_et in negative_entities:
        candidate_node_list.append(normalize(neg_et['et']))
        input_ids, input_mask = text_tokenize(neg_et['first_sent'], tokenizer,
                                              max_seq_length)
        first_sent_tokens.append(input_ids)
        first_sent_masks.append(input_mask)
        node_sub_questions.append(0)

    num_question_nodes = len(question_node_list)
    num_nodes = num_question_nodes + len(candidate_node_list)
    g.add_nodes(num_nodes)
    num_questions = len(question_tokens)

    ### combine question and first sentence
    all_tokens = question_tokens + first_sent_tokens
    all_masks = question_masks + first_sent_masks
    all_tensor = torch.LongTensor(all_tokens).to(device)
    all_masks_tensor = torch.LongTensor(all_masks).to(device)
    all_encodings = _bert_encode_in_chunks(all_tensor, all_masks_tensor)
    all_masks_tensor = all_masks_tensor.cpu()

    # Rows after the questions line up one-to-one with the graph nodes.
    g.ndata['first_sent'] = all_encodings[num_questions:].cpu()
    g.ndata['first_sent_mask'] = all_masks_tensor[num_questions:].cpu().eq(0)

    for i in range(num_question_nodes):
        sub_q_num = node_sub_questions[i]
        g.nodes[i].data['question'] = all_encodings[sub_q_num].unsqueeze(0)
        g.nodes[i].data['question_mask'] = all_masks_tensor[sub_q_num].unsqueeze(0).eq(0)
        g.nodes[i].data['label'] = torch.tensor(-1).unsqueeze(0)

    # The positive candidate is the first node after the question entities.
    positive_node = num_question_nodes
    g.nodes[positive_node].data['question'] = all_encodings[0].unsqueeze(0)
    g.nodes[positive_node].data['question_mask'] = all_masks_tensor[0].unsqueeze(0).eq(0)
    g.nodes[positive_node].data['label'] = torch.tensor(1).unsqueeze(0)

    #### for candidates, we only use the full question sentence
    for i in range(num_question_nodes + 1, num_nodes):
        g.nodes[i].data['question'] = all_encodings[0].unsqueeze(0)
        g.nodes[i].data['question_mask'] = all_masks_tensor[0].unsqueeze(0).eq(0)
        g.nodes[i].data['label'] = torch.tensor(0).unsqueeze(0)

    #### add positive edges
    for k_entity in positive_entity['evidence']:
        normalized_k_entity = normalize(k_entity)
        if normalized_k_entity in question_node_list:
            s_id = question_node_list.index(normalized_k_entity)
            g.add_edge(s_id, positive_node)
            edge_features, edge_feature_masks, edge_sent_mask = _bert_edge_evidence_inputs(
                positive_entity['evidence'][k_entity], tokenizer, num_edges, max_seq_length)
            g.edges[s_id, positive_node].data['evidence'] = edge_features
            g.edges[s_id, positive_node].data['evidence_mask'] = edge_feature_masks
            g.edges[s_id, positive_node].data['evidence_sent_mask'] = edge_sent_mask

    #### add negative edges
    for neg_et in negative_entities:
        target_id = num_question_nodes + candidate_node_list.index(normalize(neg_et['et']))
        for k_entity in neg_et['evidence']:
            normalized_k_entity = normalize(k_entity)
            if normalized_k_entity in question_node_list:
                s_id = question_node_list.index(normalized_k_entity)
                g.add_edge(s_id, target_id)
                # Look the evidence up with the raw key being iterated; the
                # normalized form is only valid for matching question nodes.
                edge_features, edge_feature_masks, edge_sent_mask = _bert_edge_evidence_inputs(
                    neg_et['evidence'][k_entity], tokenizer, num_edges, max_seq_length)
                g.edges[s_id, target_id].data['evidence'] = edge_features
                g.edges[s_id, target_id].data['evidence_mask'] = edge_feature_masks
                g.edges[s_id, target_id].data['evidence_sent_mask'] = edge_sent_mask

    ### Batch the sentences and get BERT embeddings
    if 'evidence' in g.edata:
        evi = g.edata['evidence'].to(device)
        evi_mask = g.edata['evidence_mask'].to(device)
        batch_size, sent_max_len, word_max_len = evi.size(0), evi.size(1), evi.size(2)
        # Flatten (edge, sentence) into one batch axis for the encoder.
        evi = evi.view(batch_size * sent_max_len, word_max_len)
        evi_mask = evi_mask.view(batch_size * sent_max_len, word_max_len)
        g.edata['evidence'] = _bert_encode_in_chunks(evi, evi_mask).view(
            batch_size, sent_max_len, word_max_len, -1)
        g.edata['evidence_mask'] = g.edata['evidence_mask'].eq(0)
    return g
class MoleculeEnv(object):
    """MDP environment in which molecules are generated step by step.

    Parameters
    ----------
    atom_types : list
        E.g. ['C', 'N']
    bond_types : list
        E.g. [Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE,
        Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC]
    """

    def __init__(self, atom_types, bond_types):
        super(MoleculeEnv, self).__init__()

        self.atom_types = atom_types
        self.bond_types = bond_types

        # Reverse lookups: type -> position in the corresponding list.
        self.atom_type_to_id = {
            a_type: index for index, a_type in enumerate(atom_types)}
        self.bond_type_to_id = {
            b_type: index for index, b_type in enumerate(bond_types)}

    def get_decision_sequence(self, mol, atom_order):
        """Extract the decisions DGMG has to take to generate ``mol`` when
        atoms are added in the given order.

        Parameters
        ----------
        mol : Chem.rdchem.Mol
        atom_order : list
            Mapping between original and new atom indices: the atom
            atom_order[i] is re-labeled as i.

        Returns
        -------
        decisions : list
            decisions[i] is a 2-tuple (i, j)
            - If i = 0, j is either the type of the atom to add,
              self.atom_types[j], or termination with j = len(self.atom_types)
            - If i = 1, j is either the type of the bond to add,
              self.bond_types[j], or termination with j = len(self.bond_types)
            - If i = 2, j is the destination atom id for the bond to add.
              With the formulation of DGMG, j must be created before the decision.
        """
        decisions = []
        relabeled = dict()
        stop_adding_bonds = (1, len(self.bond_types))

        for new_idx, old_idx in enumerate(atom_order):
            atom = mol.GetAtomWithIdx(old_idx)
            decisions.append((0, self.atom_type_to_id[atom.GetSymbol()]))

            for bond in atom.GetBonds():
                begin = bond.GetBeginAtomIdx()
                end = bond.GetEndAtomIdx()
                # Pick the endpoint on the far side of the bond from this atom.
                neighbor = begin if end == old_idx else end
                if neighbor not in relabeled:
                    # The neighbor has not been generated yet; the bond is
                    # emitted later, when the neighbor itself is visited.
                    continue
                decisions.append((1, self.bond_type_to_id[bond.GetBondType()]))
                decisions.append((2, relabeled[neighbor]))

            decisions.append(stop_adding_bonds)
            relabeled[old_idx] = new_idx

        decisions.append((0, len(self.atom_types)))
        return decisions

    def reset(self, rdkit_mol=False):
        """Prepare the environment for generating a new molecule.

        Parameters
        ----------
        rdkit_mol : bool
            Whether to also maintain a Chem.rdchem.Mol object mirroring the
            molecule being generated.
        """
        graph = DGLGraph()
        # Features of newly added nodes and edges are initialised to zeros
        # whenever node/edge features already exist.
        graph.set_n_initializer(dgl.frame.zero_initializer)
        graph.set_e_initializer(dgl.frame.zero_initializer)
        self.dgl_graph = graph

        # RWMol is the editable molecule class.
        self.mol = Chem.RWMol(Chem.MolFromSmiles('')) if rdkit_mol else None

    def num_atoms(self):
        """Return the number of atoms in the molecule built so far.

        Returns
        -------
        int
        """
        return self.dgl_graph.number_of_nodes()

    def add_atom(self, type):
        """Append one atom of the given type.

        Parameters
        ----------
        type : int
            Should be in the range of [0, len(self.atom_types) - 1]
        """
        self.dgl_graph.add_nodes(1)
        if self.mol is None:
            return
        self.mol.AddAtom(Chem.Atom(self.atom_types[type]))

    def add_bond(self, u, v, type, bi_direction=True):
        """Connect atoms u and v with a bond of the given type.

        Parameters
        ----------
        u : int
            Index of the first atom
        v : int
            Index of the second atom
        type : int
            Index of the bond type
        bi_direction : bool
            Whether to add edges for both directions in the DGLGraph.
            If not, only the edge (u, v) is added.
        """
        if not bi_direction:
            self.dgl_graph.add_edge(u, v)
        else:
            self.dgl_graph.add_edges([u, v], [v, u])

        if self.mol is not None:
            self.mol.AddBond(u, v, self.bond_types[type])

    def get_current_smiles(self):
        """Return the molecule generated so far as a SMILES string.

        Returns
        -------
        s : str
            SMILES
        """
        assert self.mol is not None, 'Expect a Chem.rdchem.Mol object initialized.'
        return Chem.MolToSmiles(self.mol)
def _test_nx_conversion():
    """Round-trip a DGLGraph through networkx and check sizes and features."""

    def _check_nx_feature(nxg, nf, ef):
        """Assert that ``nxg`` carries the node features ``nf`` and the edge
        features ``ef``; used to validate the output of ``to_networkx``."""
        node_count = len(nxg)
        edge_count = nxg.size()

        if node_count > 0:
            gathered_nodes = ddict(list)
            for nid, attr in nxg.nodes(data=True):
                assert len(attr) == len(nf)
                for key in nxg.nodes[nid]:
                    gathered_nodes[key].append(F.unsqueeze(attr[key], 0))
            for key, rows in gathered_nodes.items():
                assert F.allclose(F.cat(rows, 0), nf[key])
        else:
            assert len(nf) == 0

        if edge_count > 0:
            # One slot per edge id so rows end up in edge-id order.
            gathered_edges = ddict(lambda: [0] * edge_count)
            for _, _, attr in nxg.edges(data=True):
                assert len(attr) == len(ef) + 1  # extra id
                edge_id = attr['id']
                for key in ef:
                    gathered_edges[key][edge_id] = F.unsqueeze(attr[key], 0)
            for key, rows in gathered_edges.items():
                assert F.allclose(F.cat(rows, 0), ef[key])
        else:
            assert len(ef) == 0

    # Random features, drawn in the order n1, n2, n3, e1, e2.
    n1, n2, n3 = (F.randn((5, width)) for width in (3, 10, 4))
    e1, e2 = (F.randn((4, width)) for width in (5, 7))

    graph = DGLGraph()
    graph.add_nodes(5)
    graph.add_edges([0, 1, 3, 4], [2, 4, 0, 3])
    graph.ndata.update({'n1': n1, 'n2': n2, 'n3': n3})
    graph.edata.update({'e1': e1, 'e2': e2})

    # DGLGraph -> networkx, exporting only a subset of the node features
    nx_graph = graph.to_networkx(node_attrs=['n1', 'n3'], edge_attrs=['e1', 'e2'])
    assert len(nx_graph) == 5
    assert nx_graph.size() == 4
    _check_nx_feature(nx_graph, {'n1': n1, 'n3': n3}, {'e1': e1, 'e2': e2})

    # networkx -> DGLGraph; the nx graph has an id in its edge features,
    # and that id feature exercises the non-tensor copy path
    graph = dgl.from_networkx(nx_graph, node_attrs=['n1'], edge_attrs=['e1', 'id'])
    # graph size
    assert graph.number_of_nodes() == 5
    assert graph.number_of_edges() == 4
    # number of features
    assert len(graph.ndata) == 1
    assert len(graph.edata) == 2
    # feature values
    assert F.allclose(graph.ndata['n1'], n1)
    # with id in the nx edge features, e1 keeps the original order
    assert F.allclose(graph.edata['e1'], e1)
    assert F.array_equal(F.astype(graph.edata['id'], F.int64),
                         F.copy_to(F.arange(0, 4), F.cpu()))

    # conversion after modifying the DGLGraph
    # drop id so it does not have to be supplied when adding edges
    graph.edata.pop('id')
    extra_nodes = F.randn((2, 3))
    extra_edges = F.randn((3, 5))
    graph.add_nodes(2, data={'n1': extra_nodes})
    # three new edges, one of which is a multi-edge
    graph.add_edges([3, 6, 0], [4, 5, 2], data={'e1': extra_edges})
    n1 = F.cat((n1, extra_nodes), 0)
    e1 = F.cat((e1, extra_edges), 0)

    # DGLGraph -> networkx again
    nx_graph = graph.to_networkx(node_attrs=['n1'], edge_attrs=['e1'])
    assert len(nx_graph) == 7
    assert nx_graph.size() == 7
    _check_nx_feature(nx_graph, {'n1': n1}, {'e1': e1})

    # networkx -> DGLGraph without id in the edge features:
    # strip the id from every edge first
    for _, _, attr in nx_graph.edges(data=True):
        attr.pop('id')
    graph = dgl.from_networkx(nx_graph, node_attrs=['n1'], edge_attrs=['e1'])
    # graph size
    assert graph.number_of_nodes() == 7
    assert graph.number_of_edges() == 7
    # number of features
    assert len(graph.ndata) == 1
    assert len(graph.edata) == 1
    # feature values
    assert F.allclose(graph.ndata['n1'], n1)
    # edge feature order follows nx_graph.edges()
    expected_e1 = F.cat(
        [F.unsqueeze(attr['e1'], 0) for _, _, attr in nx_graph.edges(data=True)], 0)
    assert F.allclose(graph.edata['e1'], expected_e1)

    # Conversion from a networkx graph whose nodes are not labeled
    # with consecutive integers.
    nx_graph = nx.cycle_graph(5)
    nx_graph.remove_nodes_from([0, 4])
    for node in nx_graph.nodes():
        nx_graph.nodes[node]['h'] = F.tensor([node])
    for src, dst, attr in nx_graph.edges(data=True):
        attr['h'] = F.tensor([src, dst])

    graph = dgl.from_networkx(nx_graph, node_attrs=['h'], edge_attrs=['h'])
    assert graph.number_of_nodes() == 3
    assert graph.number_of_edges() == 4
    assert graph.has_edge_between(0, 1)
    assert graph.has_edge_between(1, 2)
    assert F.allclose(graph.ndata['h'], F.tensor([[1.], [2.], [3.]]))
    assert F.allclose(graph.edata['h'],
                      F.tensor([[1., 2.], [1., 2.], [2., 3.], [2., 3.]]))
def mol_to_nearest_neighbor_graph(mol,
                                  coordinates,
                                  neighbor_cutoff,
                                  max_num_neighbors=None,
                                  p_distance=2,
                                  add_self_loop=False,
                                  node_featurizer=None,
                                  edge_featurizer=None,
                                  canonical_atom_order=True,
                                  keep_dists=False,
                                  dist_field='dist',
                                  explicit_hydrogens=False):
    """Convert an RDKit molecule into a nearest neighbor graph and featurize for it.

    Different from bigraph and complete graph, the nearest neighbor graph
    may not be symmetric since i is the closest neighbor of j does not
    necessarily suggest the other way.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than neighbor_cutoff,
        they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski (:math:`l_p`)
        distance. When ``p_distance = 1``, Minkowski distance is equivalent to
        Manhattan distance. When ``p_distance = 2``, Minkowski distance is
        equivalent to the standard Euclidean distance. Default to 2.
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed. Default
        to True.
    keep_dists : bool
        Whether to store the distance between neighboring atoms in ``edata`` of the
        constructed DGLGraphs. Default to False.
    dist_field : str
        Field for storing distance between neighboring atoms in ``edata``. This comes
        into effect only when ``keep_dists=True``. Default to ``'dist'``.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. Default to False.

    Returns
    -------
    g : DGLGraph
        Nearest neighbor DGLGraph for the molecule

    Raises
    ------
    AssertionError
        If the number of atoms (after adding/removing hydrogens) differs from
        ``coordinates.shape[0]``, or if ``keep_dists=True`` and ``dist_field``
        is already present in the edge features produced by ``edge_featurizer``.

    Examples
    --------
    >>> from dgllife.utils import mol_to_nearest_neighbor_graph
    >>> from rdkit import Chem
    >>> from rdkit.Chem import AllChem

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25)
    >>> print(g)
    DGLGraph(num_nodes=23, num_edges=6,
             ndata_schemes={}
             edata_schemes={})

    Quite often we will want to use the distance between end atoms of edges, this can be
    achieved with

    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25, keep_dists=True)
    >>> print(g.edata['dist'])
    tensor([[1.2024],
            [1.2024],
            [1.2270],
            [1.2270],
            [1.2259],
            [1.2259]])

    By default, we do not explicitly represent hydrogens as nodes, which can be done as follows.

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> mol = Chem.AddHs(mol)
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25,
    ...                                   explicit_hydrogens=True)
    >>> print(g)
    DGLGraph(num_nodes=41, num_edges=42,
             ndata_schemes={}
             edata_schemes={})

    See Also
    --------
    get_mol_3d_coordinates
    k_nearest_neighbors
    smiles_to_nearest_neighbor_graph
    """
    if explicit_hydrogens:
        mol = Chem.AddHs(mol)
    else:
        mol = Chem.RemoveHs(mol)

    # The check runs after hydrogens were added/removed, so the coordinates
    # must describe the molecule in that final form.
    num_atoms = mol.GetNumAtoms()
    num_coords = coordinates.shape[0]
    assert num_atoms == num_coords, \
        'Expect the number of atoms to match the first dimension of coordinates, ' \
        'got {:d} and {:d}'.format(num_atoms, num_coords)

    if canonical_atom_order:
        new_order = rdmolfiles.CanonicalRankAtoms(mol)
        mol = rdmolops.RenumberAtoms(mol, new_order)
        # NOTE(review): only the molecule is renumbered here; ``coordinates`` is
        # used below in its original row order. Confirm that featurizer output
        # and coordinate-derived edges still refer to the same atoms.

    srcs, dsts, dists = k_nearest_neighbors(coordinates=coordinates,
                                            neighbor_cutoff=neighbor_cutoff,
                                            max_num_neighbors=max_num_neighbors,
                                            p_distance=p_distance,
                                            self_loops=add_self_loop)
    g = DGLGraph()

    # Add nodes first since some nodes may be completely isolated
    g.add_nodes(num_atoms)

    # Add edges
    g.add_edges(srcs, dsts)

    if node_featurizer is not None:
        g.ndata.update(node_featurizer(mol))

    if edge_featurizer is not None:
        g.edata.update(edge_featurizer(mol))

    if keep_dists:
        # The featurizer must not have claimed the field used for distances.
        assert dist_field not in g.edata, \
            'Expect {} to be reserved for distance between neighboring atoms.'.format(
                dist_field)
        g.edata[dist_field] = torch.tensor(dists).float().reshape(-1, 1)

    return g
def _qanta_edge_evidence_tensors(evidences, word_dict, num_edges, max_seq_length):
    """Tokenize up to ``num_edges`` evidence sentences into padded edge tensors.

    Parameters
    ----------
    evidences : list
        Evidence texts attached to one (question entity, candidate) pair.
    word_dict
        Vocabulary passed through to ``text_tokenize``.
    num_edges : int
        Number of sentence slots per edge; extra sentences are dropped.
    max_seq_length : int
        Passed through to ``text_tokenize``; also the width of each slot.

    Returns
    -------
    tuple
        ``(edge_features, edge_feature_masks, edge_sent_mask)`` where the first
        two are zero-initialised LongTensors of shape
        ``(1, num_edges, max_seq_length)`` and the last is a ByteTensor of shape
        ``(1, num_edges)`` holding 0 for filled slots and 1 for unused ones.
    """
    evidence_tokens = list()
    evidence_masks = list()
    for evi_text in evidences[:num_edges]:
        input_ids, input_mask = text_tokenize(evi_text, word_dict, max_seq_length)
        evidence_tokens.append(input_ids)
        evidence_masks.append(input_mask)

    edge_features = torch.LongTensor(1, num_edges, max_seq_length).zero_()
    edge_feature_masks = torch.LongTensor(1, num_edges, max_seq_length).zero_()
    edge_sent_mask = torch.ByteTensor(1, num_edges).fill_(1)
    # With an empty evidence list there is nothing to copy; leave every slot
    # marked as unused instead of copying from an empty tensor.
    if evidence_tokens:
        num_filled = len(evidence_tokens)
        edge_features[0, :num_filled, :].copy_(torch.LongTensor(evidence_tokens))
        edge_feature_masks[0, :num_filled, :].copy_(torch.LongTensor(evidence_masks))
        edge_sent_mask[0, :num_filled].fill_(0)
    return edge_features, edge_feature_masks, edge_sent_mask


def vectorize_qanta(ex, model, istrain, max_seq_length=128):
    """Build a DGLGraph of token-id features for one QANTA example.

    Nodes are laid out as: all question entities first, then the positive
    candidate, then the negative candidates. Edges run from a question entity
    to a candidate whenever the candidate lists evidence for that entity.

    Parameters
    ----------
    ex : dict
        One example. The keys read here are ``'text'``, ``'pos_et'``,
        ``'neg_ets'`` and ``'q_et'``. Each candidate dict provides ``'et'``,
        ``'first_sent'`` and ``'evidence'`` (entity -> list of evidence texts);
        each ``'q_et'`` item provides ``'text'`` and ``'entity'`` (a list of
        dicts with ``'et'`` and ``'first_sent'``).
    model
        Object whose ``word_dict`` attribute is passed to ``text_tokenize``.
    istrain
        Unused in this function; kept for interface compatibility.
    max_seq_length : int
        Passed through to ``text_tokenize``.

    Returns
    -------
    DGLGraph
        Graph with node fields ``'first_sent'``, ``'first_sent_mask'``,
        ``'question'``, ``'question_mask'`` and ``'label'`` (-1 for question
        entities, 1 for the positive candidate, 0 for negatives) and, when at
        least one edge exists, edge fields ``'evidence'``, ``'evidence_mask'``
        and ``'evidence_sent_mask'``.
    """
    text = ex['text']
    positive_entity = ex['pos_et']
    negative_entities = ex['neg_ets']
    word_dict = model.word_dict
    ### Maximum 3 sentences per edge
    num_edges = 3
    g = DGLGraph()
    question_node_list = list()
    candidate_node_list = list()
    first_sent_tokens = list()
    first_sent_masks = list()
    question_tokens = list()
    question_masks = list()

    # Slot 0 of the question lists always holds the full question text.
    input_ids, input_mask = text_tokenize(text, word_dict, max_seq_length)
    question_tokens.append(input_ids)
    question_masks.append(input_mask)

    # node_sub_questions[i] is the index into question_tokens used by node i.
    node_sub_questions = list()
    for sup_q in ex['q_et']:
        sub_question = sup_q['text']
        # NOTE(review): only sub-questions go through word_tokenize before
        # text_tokenize; the other texts are passed as-is. Confirm intended.
        input_ids, input_mask = text_tokenize(word_tokenize(sub_question), word_dict,
                                              max_seq_length)
        question_tokens.append(input_ids)
        question_masks.append(input_mask)
        for et in sup_q['entity']:
            topic = et['et']
            node_first_sent = et['first_sent']
            if topic is None:
                continue
            question_node_list.append(topic)
            question_idx = len(question_tokens) - 1
            node_sub_questions.append(question_idx)
            input_ids, input_mask = text_tokenize(node_first_sent, word_dict, max_seq_length)
            first_sent_tokens.append(input_ids)
            first_sent_masks.append(input_mask)

    candidate_node_list.append(normalize(positive_entity['et']))
    input_ids, input_mask = text_tokenize(positive_entity['first_sent'], word_dict,
                                          max_seq_length)
    first_sent_tokens.append(input_ids)
    first_sent_masks.append(input_mask)
    node_sub_questions.append(0)

    for neg_et in negative_entities:
        input_ids, input_mask = text_tokenize(neg_et['first_sent'], word_dict, max_seq_length)
        candidate_node_list.append(normalize(neg_et['et']))
        first_sent_tokens.append(input_ids)
        first_sent_masks.append(input_mask)
        node_sub_questions.append(0)

    num_question_nodes = len(question_node_list)
    num_nodes = num_question_nodes + len(candidate_node_list)
    g.add_nodes(num_nodes)
    num_questions = len(question_tokens)

    ### combine question and first sentence
    all_tokens = question_tokens + first_sent_tokens
    all_masks = question_masks + first_sent_masks
    all_tensor = torch.LongTensor(all_tokens)
    all_masks_tensor = torch.LongTensor(all_masks)

    #### add node features
    # Rows after the questions line up one-to-one with the graph nodes.
    g.ndata['first_sent'] = all_tensor[num_questions:].cpu()
    g.ndata['first_sent_mask'] = all_masks_tensor[num_questions:].eq(0)

    for i in range(num_question_nodes):
        sub_q_num = node_sub_questions[i]
        g.nodes[i].data['question'] = all_tensor[sub_q_num].unsqueeze(0)
        g.nodes[i].data['question_mask'] = all_masks_tensor[sub_q_num].unsqueeze(0).eq(0)
        g.nodes[i].data['label'] = torch.tensor(-1).unsqueeze(0)

    # The positive candidate is the first node after the question entities.
    positive_node = num_question_nodes
    g.nodes[positive_node].data['question'] = all_tensor[0].unsqueeze(0)
    g.nodes[positive_node].data['question_mask'] = all_masks_tensor[0].unsqueeze(0).eq(0)
    g.nodes[positive_node].data['label'] = torch.tensor(1).unsqueeze(0)

    #### for candidates, we only use the full question sentence
    for i in range(num_question_nodes + 1, num_nodes):
        g.nodes[i].data['question'] = all_tensor[0].unsqueeze(0)
        g.nodes[i].data['question_mask'] = all_masks_tensor[0].unsqueeze(0).eq(0)
        g.nodes[i].data['label'] = torch.tensor(0).unsqueeze(0)

    #### add positive edges
    for k_entity in positive_entity['evidence']:
        normalized_k_entity = normalize(k_entity)
        if normalized_k_entity in question_node_list:
            s_id = question_node_list.index(normalized_k_entity)
            g.add_edge(s_id, positive_node)
            edge_features, edge_feature_masks, edge_sent_mask = _qanta_edge_evidence_tensors(
                positive_entity['evidence'][k_entity], word_dict, num_edges, max_seq_length)
            g.edges[s_id, positive_node].data['evidence'] = edge_features
            g.edges[s_id, positive_node].data['evidence_mask'] = edge_feature_masks.eq(0)
            g.edges[s_id, positive_node].data['evidence_sent_mask'] = edge_sent_mask

    #### add negative edges
    for neg_et in negative_entities:
        t_id = num_question_nodes + candidate_node_list.index(normalize(neg_et['et']))
        for k_entity in neg_et['evidence']:
            normalized_k_entity = normalize(k_entity)
            if normalized_k_entity in question_node_list:
                s_id = question_node_list.index(normalized_k_entity)
                g.add_edge(s_id, t_id)
                # Look the evidence up with the raw key being iterated; the
                # normalized form is only valid for matching question nodes.
                edge_features, edge_feature_masks, edge_sent_mask = _qanta_edge_evidence_tensors(
                    neg_et['evidence'][k_entity], word_dict, num_edges, max_seq_length)
                g.edges[s_id, t_id].data['evidence'] = edge_features
                g.edges[s_id, t_id].data['evidence_mask'] = edge_feature_masks.eq(0)
                g.edges[s_id, t_id].data['evidence_sent_mask'] = edge_sent_mask
    return g