def sample_frontier(self, block_id, g, seed_nodes):
    # Number of neighbors to sample per edge type for each GNN layer,
    # starting from the first layer.
    fanout = self.fanouts[block_id] if self.fanouts is not None else None
    g = dgl.in_subgraph(g, seed_nodes)
    # Only keep edges that happened no later than the current timestamp.
    g.remove_edges(torch.where(g.edata['timestamp'] > self.ts)[0])
    if self.args.valid_path:
        if block_id != self.args.n_layer - 1:
            g.dstdata['sample_time'] = self.frontiers[block_id + 1].srcdata['sample_time']
        g.apply_edges(self.sample_prob)
        g.remove_edges(torch.where(g.edata['timespan'] < 0)[0])
        g_re = dgl.reverse(g, copy_edata=True, copy_ndata=True)
        g_re.update_all(self.sample_time, fn.max('st', 'sample_time'))
        g = dgl.reverse(g_re, copy_edata=True, copy_ndata=True)
    if fanout is None:
        frontier = g
    elif block_id == self.args.n_layer - 1 and self.args.bandit:
        # Bandit sampling: draw neighbors with probability proportional to 'q_ij'.
        frontier = dgl.sampling.sample_neighbors(g, seed_nodes, fanout, prob='q_ij')
    else:
        frontier = dgl.sampling.sample_neighbors(g, seed_nodes, fanout)
    self.frontiers[block_id] = frontier
    return frontier
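# A minimal, self-contained sketch of the core step above: restrict to edges
# that happened no later than a cutoff timestamp, then sample neighbors. The
# graph, feature name, and fanout below are illustrative assumptions, not
# taken from the original code.
import dgl
import torch

def temporal_sample(g, seed_nodes, ts, fanout):
    # Induced subgraph on the in-edges of the seed nodes.
    sg = dgl.in_subgraph(g, seed_nodes)
    # Drop edges newer than the cutoff timestamp.
    sg.remove_edges(torch.where(sg.edata['timestamp'] > ts)[0])
    return dgl.sampling.sample_neighbors(sg, seed_nodes, fanout)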
def track_time(graph_name, format):
    device = utils.get_bench_device()
    graph = utils.get_graph(graph_name, format)
    graph = graph.to(device)
    graph = graph.formats([format])

    # dry run
    dgl.reverse(graph)

    # timing
    with utils.Timer() as t:
        for i in range(10):
            gg = dgl.reverse(graph)

    return t.elapsed_secs / 10
def track_time(graph_name, format):
    device = utils.get_bench_device()
    graph = utils.get_graph(graph_name, format)
    graph = graph.to(device)
    graph = graph.formats([format])

    # dry run
    dgl.reverse(graph)

    # timing
    t0 = time.time()
    for i in range(10):
        gg = dgl.reverse(graph)
    t1 = time.time()

    return (t1 - t0) / 10
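# Both benchmarks above time dgl.reverse with wall-clock timers. On a GPU
# device, kernels launch asynchronously, so elapsed wall time can under-report
# the true cost. A hedged variant (a sketch assuming a PyTorch backend, not
# part of the benchmark suite) synchronizes before reading the clock:
import time
import torch
import dgl

def timed_reverse(graph, n_iters=10):
    dgl.reverse(graph)  # dry run, as above
    if graph.device.type == 'cuda':
        torch.cuda.synchronize()
    t0 = time.time()
    for _ in range(n_iters):
        dgl.reverse(graph)
    if graph.device.type == 'cuda':
        torch.cuda.synchronize()
    return (time.time() - t0) / n_iters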
def collate(self, items):
    if self.args.eventdrop > 1e-6 and self.mode == 'train':
        # Event dropout: drop a random contiguous chunk of events as augmentation.
        len_items = len(items)
        n_drop = int(len_items * self.args.eventdrop)
        s_idx = np.random.randint(len_items)
        e_idx = min(len_items, s_idx + n_drop)
        items = items[:s_idx] + items[e_idx:]
    # Only sample edges before the current timestamp.
    current_ts = self.g.edata['timestamp'][items[-1]]
    self.block_sampler.ts = current_ts
    neg_pair_graph = None
    if self.negative_sampler is None:
        input_nodes, pair_graph, blocks = self._collate(items)
    else:
        input_nodes, pair_graph, neg_pair_graph, blocks = \
            self._collate_with_negative_sampling(items)
    for i in range(self.args.n_layer - 1):
        self.block_sampler.frontiers[0].add_edges(
            *self.block_sampler.frontiers[i + 1].edges())
    frontier = dgl.reverse(self.block_sampler.frontiers[0])
    return input_nodes, pair_graph, neg_pair_graph, blocks, frontier, current_ts
def forward(self, g, l_g):
    g.apply_edges(self.edge_transfer)

    # In the line graph l_g, nodes correspond to edges of the original graph
    # and edges correspond to nodes of the original graph.
    # node features: d, rbf, o, rbf_env, x_kj, x_ji
    for k, v in g.edata.items():
        l_g.ndata[k] = v

    l_g_reverse = dgl.reverse(l_g, copy_edata=True)
    l_g_reverse.update_all(self.msg_func, fn.sum('x_kj', 'm_update'))

    g.edata['m_update'] = self.up_projection(l_g_reverse.ndata['m_update'])
    if self.activation is not None:
        g.edata['m_update'] = self.activation(g.edata['m_update'])

    # Transformations before skip connection
    g.edata['m_update'] = g.edata['m_update'] + g.edata['x_ji']
    for layer in self.layers_before_skip:
        g.edata['m_update'] = layer(g.edata['m_update'])
    g.edata['m_update'] = self.final_before_skip(g.edata['m_update'])
    if self.activation is not None:
        g.edata['m_update'] = self.activation(g.edata['m_update'])

    # Skip connection
    g.edata['m'] = g.edata['m'] + g.edata['m_update']

    # Transformations after skip connection
    for layer in self.layers_after_skip:
        g.edata['m'] = layer(g.edata['m'])
    return g
def inference(self, inputs):
    input_lengths, inputs_padded, words_padded, mapping_padded, g_batch = inputs
    input_lengths = input_lengths.data
    inputs_padded = to_gpu(inputs_padded).long()
    words_padded = to_gpu(words_padded).long()
    mapping_padded = to_gpu(mapping_padded).long()
    g_batch = to_gpu_graph(g_batch)
    batch_size = input_lengths.size(0)

    bert_inputs = self.bert(words_padded)[0][0]
    g_nfeat = torch.reshape(bert_inputs, [-1, self.bert_dim])
    g_etype = g_batch.edata['type']
    if hparams.dep_graph_type.startswith('bi'):
        # Bidirectional: run one GNN on the graph and another on its reverse.
        g_output = self.gnn_uni(g_batch, g_nfeat, g_etype) + self.gnn_rev(
            dgl.reverse(g_batch), g_nfeat, g_etype)
    else:
        g_output = self.gnn(g_batch, g_nfeat, g_etype)

    bert_inputs_batch = to_gpu(
        torch.FloatTensor(batch_size, input_lengths[0], self.bert_dim)).float()
    for i in range(batch_size):
        for j in range(input_lengths[0]):
            bert_inputs_batch[i][j] = g_output[mapping_padded[i][j]]

    tacotron2_inputs = inputs_padded
    tacotron2_outputs = self.tacotron2.inference(tacotron2_inputs, bert_inputs_batch)
    return tacotron2_outputs
def forward(self, inputs):
    (input_lengths, inputs_padded, words_padded, mapping_padded, g_batch,
     mel_padded, max_len, output_lengths) = inputs
    input_lengths, output_lengths = input_lengths.data, output_lengths.data
    batch_size = input_lengths.size(0)
    max_len = inputs_padded.shape[1]

    bert_inputs = self.bert(words_padded)[0][0]
    g_nfeat = torch.reshape(bert_inputs, [-1, self.bert_dim])
    g_etype = g_batch.edata['type']
    if hparams.dep_graph_type.startswith('bi'):
        # Bidirectional: run one GNN on the graph and another on its reverse.
        g_output = self.gnn_uni(g_batch, g_nfeat, g_etype) + self.gnn_rev(
            dgl.reverse(g_batch), g_nfeat, g_etype)
    else:
        g_output = self.gnn(g_batch, g_nfeat, g_etype)

    bert_inputs_batch = to_gpu(
        torch.FloatTensor(batch_size, max_len, self.bert_dim)).float()
    for i in range(batch_size):
        for j in range(max_len):
            bert_inputs_batch[i][j] = g_output[mapping_padded[i][j]]

    tacotron2_inputs = (inputs_padded, input_lengths, mel_padded, max_len,
                        output_lengths)
    tacotron2_outputs = self.tacotron2(tacotron2_inputs, bert_inputs_batch)
    return tacotron2_outputs
def process(self):
    super().process()
    g = self.g
    g.nodes['paper'].data['citation'] = \
        g.nodes['paper'].data['citation'].float().log1p()
    # Sum paper citation scores onto authors via the reversed 'writes' relation.
    apg = dgl.reverse(g['author', 'writes', 'paper'])
    apg.update_all(fn.copy_u('citation', 'c'), fn.sum('c', 'c'))
    author_citation = apg.nodes['author'].data['c']

    # Keep only authors whose summed citation score is at least 10.
    keep_authors = (author_citation >= 10).nonzero(as_tuple=True)[0].tolist()
    drop_authors = torch.tensor(
        list(set(range(g.num_nodes('author'))) - set(keep_authors)))
    g = dgl.remove_nodes(g, drop_authors, 'author', True)
    nid = {'author': g.nodes['author'].data[dgl.NID]}
    for ntype, etype in [('institution', 'affiliated_with'),
                         ('paper', 'writes'), ('field', 'has_field'),
                         ('venue', 'published_at')]:
        drop_nodes = torch.nonzero(
            g.in_degrees(etype=etype) == 0, as_tuple=True)[0]
        g = dgl.remove_nodes(g, drop_nodes, ntype, store_ids=True)
        nid[ntype] = g.nodes[ntype].data[dgl.NID]
    g.ndata[dgl.NID] = nid
    for etype in g.etypes:
        del g.edges[etype].data[dgl.EID]
    self.g = g
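# Minimal sketch of the reverse-and-aggregate pattern used in process():
# reversing the 'writes' relation lets update_all sum a per-paper value onto
# its authors. The toy graph below is illustrative only.
import dgl
import dgl.function as fn
import torch

toy = dgl.heterograph({
    ('author', 'writes', 'paper'): (torch.tensor([0, 0, 1]),
                                    torch.tensor([0, 1, 1])),
})
toy.nodes['paper'].data['citation'] = torch.tensor([3.0, 5.0])
toy_apg = dgl.reverse(toy)  # 'writes' now points paper -> author
toy_apg.update_all(fn.copy_u('citation', 'c'), fn.sum('c', 'c'))
print(toy_apg.nodes['author'].data['c'])  # tensor([8., 5.])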
def forward(self, g, feat):
    rg = dgl.reverse(g, copy_ndata=False, copy_edata=False)
    feat = self.ggnn(g, rg, feat)
    last_nodes = g.filter_nodes(lambda nodes: nodes.data['last'] == 1)
    ct_l = feat[last_nodes]
    ct_g = self.readout(g, feat, last_nodes)
    sr = th.cat([ct_g, ct_l], dim=1)
    return sr
def interactive(path_to_function: str, path_to_model: str):
    fix_seed()
    device = get_device()
    print(f"using {device} device")

    # convert function to dot format
    print("prepare ast...")
    create_folder(TMP_FOLDER)
    if not build_ast(path_to_function):
        return
    ast_folder = os.path.join(TMP_FOLDER, 'java', 'asts')
    ast = os.listdir(ast_folder)
    if len(ast) == 0:
        print("didn't find any functions in given file")
        return
    if len(ast) > 1:
        print("too many functions in given file, "
              "for interactive prediction you need only one")
        return
    dgl_ast = convert_dot_to_dgl(os.path.join(ast_folder, ast[0]))

    ast_desc = pd.read_csv(os.path.join(TMP_FOLDER, 'java', 'description.csv'))
    ast_desc['token'].fillna('NAN', inplace=True)
    with open(vocab_path, 'rb') as pkl_file:
        vocab = pkl_load(pkl_file)
        token_to_id, type_to_id = vocab['token_to_id'], vocab['type_to_id']
    ast_desc = transform_keys(ast_desc, token_to_id, type_to_id)
    batched_graph, labels, paths = prepare_batch(ast_desc, ['ast_0.dot'],
                                                 lambda: [dgl_ast])
    # `share_ndata` is the old (pre-0.5) dgl.reverse argument; newer DGL uses `copy_ndata`.
    batched_graph = dgl.batch(
        list(map(lambda g: dgl.reverse(g, share_ndata=True),
                 dgl.unbatch(batched_graph))))

    # load model
    print("loading model...")
    model, _ = load_model(path_to_model, device)
    criterion = nn.CrossEntropyLoss(
        ignore_index=model.decoder.pad_index).to(device)
    info = LearningInfo()

    print("forward pass...")
    batch_info, prediction = eval_on_batch(model, criterion, batched_graph,
                                           labels, device)
    info.accumulate_info(batch_info)
    id_to_sublabel = {v: k for k, v in model.decoder.label_to_id.items()}
    label = ''
    for cur_sublabel in prediction:
        if cur_sublabel.item() == model.decoder.label_to_id[EOS]:
            break
        label += '|' + id_to_sublabel[cur_sublabel.item()]
    label = label[1:]
    print(f"Predicted function name is\n{label}")
    print(f"Calculated metrics with respect to '{labels[0]}' name\n"
          f"{info.get_state_dict()}")
def collate(self, items):
    # Only sample edges before the current timestamp.
    current_ts = self.g.edata['timestamp'][items[0]]
    self.block_sampler.ts = current_ts
    neg_pair_graph = None
    if self.negative_sampler is None:
        input_nodes, pair_graph, blocks = self._collate(items)
    else:
        input_nodes, pair_graph, neg_pair_graph, blocks = \
            self._collate_with_negative_sampling(items)
    if self.args.n_layer > 1:
        self.block_sampler.frontiers[0].add_edges(
            *self.block_sampler.frontiers[1].edges())
    frontier = dgl.reverse(self.block_sampler.frontiers[0])
    return input_nodes, pair_graph, neg_pair_graph, blocks, frontier, current_ts
def forward(self, g):
    with g.local_scope():
        g.edata['tmp'] = g.edata['m'] * self.dense_rbf(g.edata['rbf'])
        # Reversing the graph makes update_all sum each node's outgoing edges.
        g_reverse = dgl.reverse(g, copy_edata=True)
        g_reverse.update_all(fn.copy_e('tmp', 'x'), fn.sum('x', 't'))
        g.ndata['t'] = self.up_projection(g_reverse.ndata['t'])

        for layer in self.dense_layers:
            g.ndata['t'] = layer(g.ndata['t'])
            if self.activation is not None:
                g.ndata['t'] = self.activation(g.ndata['t'])
        g.ndata['t'] = self.dense_final(g.ndata['t'])
        return dgl.readout_nodes(g, 't', op='sum' if self.extensive else 'mean')
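# The reverse-then-sum trick above aggregates each node's *outgoing* edge
# features of the original graph. A tiny illustrative check (toy graph only):
import dgl
import dgl.function as fn
import torch

demo = dgl.graph((torch.tensor([0, 0, 1]), torch.tensor([1, 2, 2])))
demo.edata['tmp'] = torch.tensor([1.0, 2.0, 4.0])
demo_rev = dgl.reverse(demo, copy_edata=True)
demo_rev.update_all(fn.copy_e('tmp', 'x'), fn.sum('x', 't'))
print(demo_rev.ndata['t'])  # tensor([3., 4., 0.]): out-edge sums of nodes 0, 1, 2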
def forward(self, g, feat, feat_u):
    rg = dgl.reverse(g, copy_ndata=False, copy_edata=False)
    if g.number_of_edges() > 0:
        edge_weight = g.edata['w']
        # Normalize edge weights by in-degree on g and by out-degree on rg.
        in_deg = F.copy_e_sum(g, edge_weight)
        g.edata['iw'] = F.e_div_v(g, edge_weight, in_deg)
        out_deg = F.copy_e_sum(rg, edge_weight)
        rg.edata['ow'] = F.e_div_v(rg, edge_weight, out_deg)
    feat = self.pwggnn(g, rg, feat)
    last_nodes = g.filter_nodes(lambda nodes: nodes.data['last'] == 1)
    ct_l = feat[last_nodes]
    ct_g = self.readout(g, feat, feat_u, last_nodes)
    sr = th.cat((ct_l, ct_g), dim=1)
    return sr
def collate(self, items):
    # Only sample edges before the last timestamp in the batch.
    current_ts = self.g.edata['timestamp'][items[-1]]
    self.block_sampler.ts = current_ts
    neg_pair_graph = None
    if self.negative_sampler is None:
        input_nodes, pair_graph, blocks = self._collate(items)
    else:
        input_nodes, pair_graph, neg_pair_graph, blocks = \
            self._collate_with_negative_sampling(items)
    for i in range(self.args.n_layer - 1):
        self.block_sampler.frontiers[0].add_edges(
            *self.block_sampler.frontiers[i + 1].edges())
    frontier = dgl.reverse(self.block_sampler.frontiers[0])
    return input_nodes, pair_graph, neg_pair_graph, blocks, frontier, current_ts
def forward(self, batch, g, h, c):
    """Compute tree-lstm prediction given a batch.

    Parameters
    ----------
    batch : dgl.data.SSTBatch
        The data batch.
    g : DGLGraph
        The batched tree graph.
    h : Tensor
        Initial hidden state.
    c : Tensor
        Initial cell state.

    Returns
    -------
    out
    """
    # Bottom-up pass on g, then top-down pass on the reversed graph.
    h_bottom_up = self.propagate(g, self.cell_bottom_up, batch.X, h, c)
    g_rev = dgl.reverse(g)
    h_top_down = self.propagate(g_rev, self.cell_top_down,
                                th.cat([batch.X, h_bottom_up], dim=1), h, c)

    root_ids = th.nonzero(batch.isroot, as_tuple=False).flatten()
    root_h_bottom_up = th.index_select(h_bottom_up, 0, root_ids)

    # Average the top-down states of each tree's leaves.
    lims_ids = root_ids.tolist() + [g.number_of_nodes()]
    trees_h = [h_top_down[s:e, :] for s, e in zip(lims_ids[:-1], lims_ids[1:])]
    trees_isleaf = [batch.isleaf[s:e] for s, e in zip(lims_ids[:-1], lims_ids[1:])]
    leaves_h_top_down = th.cat([
        th.mean(th.index_select(
            tree, 0, th.nonzero(leaves, as_tuple=False).flatten()),
            dim=0).view(1, -1)
        for (tree, leaves) in zip(trees_h, trees_isleaf)
    ], dim=0)

    out = th.cat([root_h_bottom_up, leaves_h_top_down], dim=1)
    return out
def __getitem__(self, item) -> Tuple[BatchedDGLGraph, List[str]]:
    batch_basename, batch_slice = self.batch_desc[item]

    # read the file only if the previous one wasn't the same
    if self.loaded_batch_basename != batch_basename:
        with open(path_join(self.batched_graphs_path, batch_basename),
                  'rb') as pkl_file:
            self.loaded_batched_graph = pkl_load(pkl_file)
        self.loaded_batch_basename = batch_basename

    graphs = unbatch(self.loaded_batched_graph['batched_graph'])
    graphs_for_batch = graphs[batch_slice]
    if self.invert_edges:
        # `share_ndata` is the old (pre-0.5) dgl.reverse argument; newer DGL uses `copy_ndata`.
        graphs_for_batch = list(
            map(lambda g: reverse(g, share_ndata=True), graphs_for_batch))
    batched_graph = batch(graphs_for_batch)
    batched_labels = self.loaded_batched_graph['labels'][batch_slice]
    return batched_graph, batched_labels
def __reverse_dgl_batch__(t):
    # dgl.reverse resets batch information, so restore it explicitly.
    t_rev = dgl.reverse(t, copy_edata=True, copy_ndata=True)
    t_rev.set_batch_num_nodes(t.batch_num_nodes())
    t_rev.set_batch_num_edges(t.batch_num_edges())
    return t_rev
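# A small usage sketch of the helper above on a toy batch (illustrative only):
# the reversed graph keeps the same per-graph node/edge split as the input.
import dgl
import torch

g1 = dgl.graph((torch.tensor([0]), torch.tensor([1])))
g2 = dgl.graph((torch.tensor([0, 1]), torch.tensor([1, 2])))
bg = dgl.batch([g1, g2])
rev = __reverse_dgl_batch__(bg)
print(rev.batch_num_nodes())  # tensor([2, 3]), same split as bg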
def handle(self, *args, **options):
    batch_size = options['batch_size']
    raw_path = DATA_DIR / 'oag/cs'

    print('Importing venue data...')
    Venue.objects.bulk_create([
        Venue(id=i, name=v['name'])
        for i, v in enumerate(iter_json(raw_path / 'mag_venues.txt'))
    ], batch_size=batch_size)
    vid_map = {
        v['id']: i
        for i, v in enumerate(iter_json(raw_path / 'mag_venues.txt'))
    }

    print('Importing institution data...')
    Institution.objects.bulk_create([
        Institution(id=i, name=o['name'])
        for i, o in enumerate(iter_json(raw_path / 'mag_institutions.txt'))
    ], batch_size=batch_size)
    oid_map = {
        o['id']: i
        for i, o in enumerate(iter_json(raw_path / 'mag_institutions.txt'))
    }

    print('Importing field data...')
    Field.objects.bulk_create([
        Field(id=i, name=f['name'])
        for i, f in enumerate(iter_json(raw_path / 'mag_fields.txt'))
    ], batch_size=batch_size)

    data = OAGCSDataset()
    g = data[0]
    # Sum paper citations onto authors via the reversed 'writes' relation.
    apg = dgl.reverse(g['author', 'writes', 'paper'], copy_ndata=False)
    apg.nodes['paper'].data['c'] = g.nodes['paper'].data['citation'].float()
    apg.update_all(fn.copy_u('c', 'm'), fn.sum('m', 'c'))
    author_citation = apg.nodes['author'].data['c'].int().tolist()

    print('Importing author data...')
    Author.objects.bulk_create([
        Author(id=i, name=a['name'], n_citation=author_citation[i],
               institution_id=oid_map[a['org']] if a['org'] is not None else None)
        for i, a in enumerate(iter_json(raw_path / 'mag_authors.txt'))
    ], batch_size=batch_size)

    print('Importing paper data...')
    Paper.objects.bulk_create([
        Paper(id=i, title=p['title'], venue_id=vid_map[p['venue']],
              year=p['year'], abstract=p['abstract'],
              n_citation=p['n_citation'])
        for i, p in enumerate(iter_json(raw_path / 'mag_papers.txt'))
    ], batch_size=batch_size)

    print('Importing paper relation data (very slow)...')
    print('writes')
    u, v = g.edges(etype='writes')
    order = g.edges['writes'].data['order']
    edges = list(zip(u.tolist(), v.tolist(), order.tolist()))
    for i in trange(0, len(edges), batch_size):
        Writes.objects.bulk_create([
            Writes(author_id=a, paper_id=p, order=r)
            for a, p, r in edges[i:i + batch_size]
        ])

    print('has_field')
    u, v = g.edges(etype='has_field')
    edges = list(zip(u.tolist(), v.tolist()))
    HasField = Paper.fos.through
    for i in trange(0, len(edges), batch_size):
        HasField.objects.bulk_create([
            HasField(paper_id=p, field_id=f)
            for p, f in edges[i:i + batch_size]
        ])

    print('cites')
    u, v = g.edges(etype='cites')
    edges = list(zip(u.tolist(), v.tolist()))
    Cites = Paper.references.through
    for i in trange(0, len(edges), batch_size):
        Cites.objects.bulk_create([
            Cites(from_paper_id=p, to_paper_id=r)
            for p, r in edges[i:i + batch_size]
        ])
    print('Import complete')