Example #1
    def sample_frontier(self, block_id, g, seed_nodes):
        fanout = self.fanouts[block_id] if self.fanouts is not None else None
        # List of neighbors to sample per edge type for each GNN layer, starting from the first layer.
        g = dgl.in_subgraph(g, seed_nodes)
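        # Drop edges later than the current batch timestamp so the sampler
        # never peeks into the future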
        g.remove_edges(torch.where(g.edata['timestamp'] > self.ts)[0])
        if self.args.valid_path:
            if block_id != self.args.n_layer - 1:
                g.dstdata['sample_time'] = self.frontiers[block_id + 1].srcdata['sample_time']
                g.apply_edges(self.sample_prob)
                g.remove_edges(torch.where(g.edata['timespan'] < 0)[0])
            # Propagate sample times from destination nodes back to their
            # sources: reverse the edges, take the max over incoming 'st',
            # then reverse back
            g_re = dgl.reverse(g, copy_edata=True, copy_ndata=True)
            g_re.update_all(self.sample_time, fn.max('st', 'sample_time'))
            g = dgl.reverse(g_re, copy_edata=True, copy_ndata=True)

        if fanout is None:
            frontier = g
        else:
            if block_id == self.args.n_layer - 1:
                if self.args.bandit:
                    # sample neighbors with the learned probabilities 'q_ij'
                    frontier = dgl.sampling.sample_neighbors(g, seed_nodes, fanout, prob='q_ij')
                else:
                    frontier = dgl.sampling.sample_neighbors(g, seed_nodes, fanout)
            else:
                frontier = dgl.sampling.sample_neighbors(g, seed_nodes, fanout)

        self.frontiers[block_id] = frontier
        return frontier
Example #2
def track_time(graph_name, format):
    device = utils.get_bench_device()
    graph = utils.get_graph(graph_name, format)
    graph = graph.to(device)
    graph = graph.formats([format])
    # dry run
    dgl.reverse(graph)

    # timing
    with utils.Timer() as t:
        for i in range(10):
            gg = dgl.reverse(graph)

    return t.elapsed_secs / 10
Example #3
def track_time(graph_name, format):
    device = utils.get_bench_device()
    graph = utils.get_graph(graph_name, format)
    graph = graph.to(device)
    graph = graph.formats([format])
    # dry run
    dgl.reverse(graph)

    # timing
    t0 = time.time()
    for i in range(10):
        gg = dgl.reverse(graph)
    t1 = time.time()

    return (t1 - t0) / 10
Example #4
    def collate(self, items):
        #print('before', self.block_sampler.ts)
        if self.args.eventdrop > 1e-6 and self.mode == 'train':
            # randomly drop a contiguous chunk of events as augmentation
            len_items = len(items)
            n_drop = int(len_items * self.args.eventdrop)
            s_idx = np.random.randint(len_items)
            e_idx = min(len_items, s_idx + n_drop)
            items = items[:s_idx] + items[e_idx:]

        current_ts = self.g.edata['timestamp'][items[-1]]  # only sample edges before current timestamp
        self.block_sampler.ts = current_ts
        neg_pair_graph = None
        if self.negative_sampler is None:
            input_nodes, pair_graph, blocks = self._collate(items)
        else:
            input_nodes, pair_graph, neg_pair_graph, blocks = self._collate_with_negative_sampling(
                items)

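        # merge every layer's frontier into the first one, then reverse the
        # merged frontier's edge directions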
        for i in range(self.args.n_layer - 1):
            self.block_sampler.frontiers[0].add_edges(
                *self.block_sampler.frontiers[i + 1].edges())
        frontier = dgl.reverse(self.block_sampler.frontiers[0])

        return input_nodes, pair_graph, neg_pair_graph, blocks, frontier, current_ts
Example #5
    def forward(self, g, l_g):
        g.apply_edges(self.edge_transfer)

        # nodes correspond to edges and edges correspond to nodes in the original graphs
        # node: d, rbf, o, rbf_env, x_kj, x_ji
        for k, v in g.edata.items():
            l_g.ndata[k] = v

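        # aggregate over reversed edges: each line-graph node sums the 'x_kj'
        # messages carried by its original outgoing edges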
        l_g_reverse = dgl.reverse(l_g, copy_edata=True)
        l_g_reverse.update_all(self.msg_func, fn.sum('x_kj', 'm_update'))

        g.edata['m_update'] = self.up_projection(l_g_reverse.ndata['m_update'])
        if self.activation is not None:
            g.edata['m_update'] = self.activation(g.edata['m_update'])
        # Transformations before skip connection
        g.edata['m_update'] = g.edata['m_update'] + g.edata['x_ji']
        for layer in self.layers_before_skip:
            g.edata['m_update'] = layer(g.edata['m_update'])
        g.edata['m_update'] = self.final_before_skip(g.edata['m_update'])
        if self.activation is not None:
            g.edata['m_update'] = self.activation(g.edata['m_update'])

        # Skip connection
        g.edata['m'] = g.edata['m'] + g.edata['m_update']

        # Transformations after skip connection
        for layer in self.layers_after_skip:
            g.edata['m'] = layer(g.edata['m'])

        return g
Example #6
    def inference(self, inputs):

        input_lengths, inputs_padded, words_padded, mapping_padded, g_batch = inputs
        input_lengths = input_lengths.data
        inputs_padded = to_gpu(inputs_padded).long()
        words_padded = to_gpu(words_padded).long()
        mapping_padded = to_gpu(mapping_padded).long()
        g_batch = to_gpu_graph(g_batch)

        batch_size = input_lengths.size(0)

        bert_inputs = self.bert(words_padded)[0][0]
        g_nfeat = torch.reshape(bert_inputs, [-1, self.bert_dim])
        g_etype = g_batch.edata['type']
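        # bidirectional dependency graph: one GNN over the original edges,
        # another over the reversed edges, summed together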
        if hparams.dep_graph_type.startswith('bi'):
            g_output = self.gnn_uni(g_batch, g_nfeat, g_etype) + self.gnn_rev(
                dgl.reverse(g_batch), g_nfeat, g_etype)
        else:
            g_output = self.gnn(g_batch, g_nfeat, g_etype)

        bert_inputs_batch = to_gpu(
            torch.FloatTensor(batch_size, input_lengths[0],
                              self.bert_dim)).float()
        for i in range(batch_size):
            for j in range(input_lengths[0]):
                bert_inputs_batch[i][j] = g_output[mapping_padded[i][j]]

        tacotron2_inputs = inputs_padded
        tacotron2_outputs = self.tacotron2.inference(tacotron2_inputs,
                                                     bert_inputs_batch)

        return tacotron2_outputs
Example #7
    def forward(self, inputs):
        input_lengths, inputs_padded, words_padded, mapping_padded, g_batch, mel_padded, max_len, output_lengths = inputs
        input_lengths, output_lengths = input_lengths.data, output_lengths.data

        batch_size = input_lengths.size(0)
        max_len = inputs_padded.shape[1]

        bert_inputs = self.bert(words_padded)[0][0]

        g_nfeat = torch.reshape(bert_inputs, [-1, self.bert_dim])
        g_etype = g_batch.edata['type']
        if hparams.dep_graph_type.startswith('bi'):
            g_output = self.gnn_uni(g_batch, g_nfeat, g_etype) + self.gnn_rev(
                dgl.reverse(g_batch), g_nfeat, g_etype)
        else:
            g_output = self.gnn(g_batch, g_nfeat, g_etype)

        bert_inputs_batch = to_gpu(
            torch.FloatTensor(batch_size, max_len, self.bert_dim)).float()

        for i in range(batch_size):
            for j in range(max_len):
                bert_inputs_batch[i][j] = g_output[mapping_padded[i][j]]

        tacotron2_inputs = (inputs_padded, input_lengths, mel_padded, max_len,
                            output_lengths)
        tacotron2_outputs = self.tacotron2(tacotron2_inputs, bert_inputs_batch)

        return tacotron2_outputs
Example #8
    def process(self):
        super().process()
        g = self.g
        g.nodes['paper'].data['citation'] = g.nodes['paper'].data['citation'].float().log1p()
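        # reverse 'writes' (author->paper becomes paper->author) so that
        # summing paper citations yields per-author citation counts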
        apg = dgl.reverse(g['author', 'writes', 'paper'])
        apg.update_all(fn.copy_u('citation', 'c'), fn.sum('c', 'c'))
        author_citation = apg.nodes['author'].data['c']

        keep_authors = (author_citation >= 10).nonzero(
            as_tuple=True)[0].tolist()
        drop_authors = torch.tensor(
            list(set(range(g.num_nodes('author'))) - set(keep_authors)))
        g = dgl.remove_nodes(g, drop_authors, 'author', True)
        nid = {'author': g.nodes['author'].data[dgl.NID]}

        for ntype, etype in [('institution', 'affiliated_with'),
                             ('paper', 'writes'), ('field', 'has_field'),
                             ('venue', 'published_at')]:
            drop_nodes = torch.nonzero(g.in_degrees(etype=etype) == 0,
                                       as_tuple=True)[0]
            g = dgl.remove_nodes(g, drop_nodes, ntype, store_ids=True)
            nid[ntype] = g.nodes[ntype].data[dgl.NID]
        g.ndata[dgl.NID] = nid
        for etype in g.etypes:
            del g.edges[etype].data[dgl.EID]
        self.g = g
Example #9
    def forward(self, g, feat):
        # reversed view of the graph, without copying node or edge features
        rg = dgl.reverse(g, False, False)
        feat = self.ggnn(g, rg, feat)
        # nodes flagged with 'last' == 1 mark the last item of each sequence
        last_nodes = g.filter_nodes(lambda nodes: nodes.data['last'] == 1)
        ct_l = feat[last_nodes]
        ct_g = self.readout(g, feat, last_nodes)
        sr = th.cat([ct_g, ct_l], dim=1)
        return sr
Example #10
def interactive(path_to_function: str, path_to_model: str):
    fix_seed()
    device = get_device()
    print(f"using {device} device")

    # convert function to dot format
    print("preparing AST...")
    create_folder(TMP_FOLDER)
    if not build_ast(path_to_function):
        return
    ast_folder = os.path.join(TMP_FOLDER, 'java', 'asts')
    ast = os.listdir(ast_folder)
    if len(ast) == 0:
        print("didn't find any functions in the given file")
        return
    if len(ast) > 1:
        print(
            "too many functions in the given file; interactive prediction needs exactly one"
        )
        return
    dgl_ast = convert_dot_to_dgl(os.path.join(ast_folder, ast[0]))
    ast_desc = pd.read_csv(os.path.join(TMP_FOLDER, 'java', 'description.csv'))
    ast_desc['token'].fillna('NAN', inplace=True)
    with open(vocab_path, 'rb') as pkl_file:
        vocab = pkl_load(pkl_file)
        token_to_id, type_to_id = vocab['token_to_id'], vocab['type_to_id']
    ast_desc = transform_keys(ast_desc, token_to_id, type_to_id)
    batched_graph, labels, paths = prepare_batch(ast_desc, ['ast_0.dot'],
                                                 lambda: [dgl_ast])
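    # reverse each AST in the batch (sharing node data) and re-batch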
    batched_graph = dgl.batch(
        list(
            map(lambda g: dgl.reverse(g, share_ndata=True),
                dgl.unbatch(batched_graph))))

    # load model
    print("loading model..")
    model, _ = load_model(path_to_model, device)
    criterion = nn.CrossEntropyLoss(
        ignore_index=model.decoder.pad_index).to(device)
    info = LearningInfo()

    print("forward pass...")
    batch_info, prediction = eval_on_batch(model, criterion, batched_graph,
                                           labels, device)

    info.accumulate_info(batch_info)
    id_to_sublabel = {v: k for k, v in model.decoder.label_to_id.items()}
    label = ''
    for cur_sublabel in prediction:
        if cur_sublabel.item() == model.decoder.label_to_id[EOS]:
            break
        label += '|' + id_to_sublabel[cur_sublabel.item()]
    label = label[1:]
    print(f"Predicted function name is\n{label}")
    print(
        f"Calculated metrics with respect to '{labels[0]}' name\n{info.get_state_dict()}"
    )
Example #11
    def collate(self, items):
        current_ts = self.g.edata['timestamp'][items[0]]  # only sample edges before current timestamp
        self.block_sampler.ts = current_ts
        neg_pair_graph = None
        if self.negative_sampler is None:
            input_nodes, pair_graph, blocks = self._collate(items)
        else:
            input_nodes, pair_graph, neg_pair_graph, blocks = self._collate_with_negative_sampling(items)
        if self.args.n_layer > 1:
            self.block_sampler.frontiers[0].add_edges(*self.block_sampler.frontiers[1].edges())
        frontier = dgl.reverse(self.block_sampler.frontiers[0])
        return input_nodes, pair_graph, neg_pair_graph, blocks, frontier, current_ts
Example #12
    def forward(self, g):
        with g.local_scope():
            g.edata['tmp'] = g.edata['m'] * self.dense_rbf(g.edata['rbf'])
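            # sum each edge's 'tmp' feature onto its source node by reversing
            # the edges first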
            g_reverse = dgl.reverse(g, copy_edata=True)
            g_reverse.update_all(fn.copy_e('tmp', 'x'), fn.sum('x', 't'))
            g.ndata['t'] = self.up_projection(g_reverse.ndata['t'])

            for layer in self.dense_layers:
                g.ndata['t'] = layer(g.ndata['t'])
                if self.activation is not None:
                    g.ndata['t'] = self.activation(g.ndata['t'])
            g.ndata['t'] = self.dense_final(g.ndata['t'])
            return dgl.readout_nodes(g,
                                     't',
                                     op='sum' if self.extensive else 'mean')
Example #13
    def forward(self, g, feat, feat_u):
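        # rg reverses the edges without copying features; in-degrees on rg
        # correspond to out-degrees on g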
        rg = dgl.reverse(g, False, False)
        if g.number_of_edges() > 0:
            edge_weight = g.edata['w']
            in_deg = F.copy_e_sum(g, edge_weight)
            g.edata['iw'] = F.e_div_v(g, edge_weight, in_deg)
            out_deg = F.copy_e_sum(rg, edge_weight)
            rg.edata['ow'] = F.e_div_v(rg, edge_weight, out_deg)

        feat = self.pwggnn(g, rg, feat)
        last_nodes = g.filter_nodes(lambda nodes: nodes.data['last'] == 1)
        ct_l = feat[last_nodes]
        ct_g = self.readout(g, feat, feat_u, last_nodes)
        sr = th.cat((ct_l, ct_g), dim=1)
        return sr
Example #14
    def collate(self, items):
        #print('before', self.block_sampler.ts)

        current_ts = self.g.edata['timestamp'][items[-1]]  # only sample edges before last timestamp in a batch
        self.block_sampler.ts = current_ts
        neg_pair_graph = None
        if self.negative_sampler is None:
            input_nodes, pair_graph, blocks = self._collate(items)
        else:
            input_nodes, pair_graph, neg_pair_graph, blocks = self._collate_with_negative_sampling(items)

        for i in range(self.args.n_layer - 1):
            self.block_sampler.frontiers[0].add_edges(*self.block_sampler.frontiers[i + 1].edges())
        frontier = dgl.reverse(self.block_sampler.frontiers[0])

        return input_nodes, pair_graph, neg_pair_graph, blocks, frontier, current_ts
Example #15
    def forward(self, batch, g, h, c):
        """Compute tree-lstm prediction given a batch.

        Parameters
        ----------
        batch : dgl.data.SSTBatch
            The data batch.
        h : Tensor
            Initial hidden state.
        c : Tensor
            Initial cell state.

        Returns
        -------
        out
        """
        h_bottom_up = self.propagate(g, self.cell_bottom_up, batch.X, h, c)

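        # reverse the tree edges so the second pass propagates information
        # from the root down to the leaves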
        g_rev = dgl.reverse(g)
        h_top_down = self.propagate(g_rev, self.cell_top_down,
                                    th.cat([batch.X, h_bottom_up], dim=1), h,
                                    c)

        root_ids = th.nonzero(batch.isroot, as_tuple=False).flatten()
        root_h_bottom_up = th.index_select(h_bottom_up, 0, root_ids)

        lims_ids = root_ids.tolist() + [g.number_of_nodes()]

        trees_h = [
            h_top_down[s:e, :] for s, e in zip(lims_ids[:-1], lims_ids[1:])
        ]
        trees_isleaf = [
            batch.isleaf[s:e] for s, e in zip(lims_ids[:-1], lims_ids[1:])
        ]
        leaves_h_top_down = th.cat([
            th.mean(
                th.index_select(tree, 0,
                                th.nonzero(leaves, as_tuple=False).flatten()),
                dim=0).view(1, -1)
            for (tree, leaves) in zip(trees_h, trees_isleaf)
        ], dim=0)

        out = th.cat([root_h_bottom_up, leaves_h_top_down], dim=1)
        return out
Example #16
    def __getitem__(self, item) -> Tuple[BatchedDGLGraph, List[str]]:
        batch_basename, batch_slice = self.batch_desc[item]

        # read file only if previous wasn't the same
        if self.loaded_batch_basename != batch_basename:
            with open(path_join(self.batched_graphs_path, batch_basename),
                      'rb') as pkl_file:
                self.loaded_batched_graph = pkl_load(pkl_file)
            self.loaded_batch_basename = batch_basename

        graphs = unbatch(self.loaded_batched_graph['batched_graph'])

        graphs_for_batch = graphs[batch_slice]
        if self.invert_edges:
            graphs_for_batch = list(
                map(lambda g: reverse(g, share_ndata=True), graphs_for_batch))

        batched_graph = batch(graphs_for_batch)
        batched_labels = self.loaded_batched_graph['labels'][batch_slice]

        return batched_graph, batched_labels
Example #17
    def __reverse_dgl_batch__(t):
        t_rev = dgl.reverse(t, copy_edata=True, copy_ndata=True)
        # dgl.reverse resets batch information, so restore it from the input
        t_rev.set_batch_num_nodes(t.batch_num_nodes())
        t_rev.set_batch_num_edges(t.batch_num_edges())
        return t_rev
Example #18
    def handle(self, *args, **options):
        batch_size = options['batch_size']
        raw_path = DATA_DIR / 'oag/cs'

        print('Importing venue data...')
        Venue.objects.bulk_create([
            Venue(id=i, name=v['name'])
            for i, v in enumerate(iter_json(raw_path / 'mag_venues.txt'))
        ],
                                  batch_size=batch_size)
        vid_map = {
            v['id']: i
            for i, v in enumerate(iter_json(raw_path / 'mag_venues.txt'))
        }

        print('Importing institution data...')
        Institution.objects.bulk_create([
            Institution(id=i, name=o['name'])
            for i, o in enumerate(iter_json(raw_path / 'mag_institutions.txt'))
        ],
                                        batch_size=batch_size)
        oid_map = {
            o['id']: i
            for i, o in enumerate(iter_json(raw_path / 'mag_institutions.txt'))
        }

        print('Importing field data...')
        Field.objects.bulk_create([
            Field(id=i, name=f['name'])
            for i, f in enumerate(iter_json(raw_path / 'mag_fields.txt'))
        ],
                                  batch_size=batch_size)

        data = OAGCSDataset()
        g = data[0]
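        # reverse 'writes' so edges run paper->author; summing paper citation
        # counts then gives each author's citation count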
        apg = dgl.reverse(g['author', 'writes', 'paper'], copy_ndata=False)
        apg.nodes['paper'].data['c'] = g.nodes['paper'].data['citation'].float()
        apg.update_all(fn.copy_u('c', 'm'), fn.sum('m', 'c'))
        author_citation = apg.nodes['author'].data['c'].int().tolist()

        print('Importing author data...')
        Author.objects.bulk_create([
            Author(id=i,
                   name=a['name'],
                   n_citation=author_citation[i],
                   institution_id=oid_map[a['org']]
                   if a['org'] is not None else None)
            for i, a in enumerate(iter_json(raw_path / 'mag_authors.txt'))
        ],
                                   batch_size=batch_size)

        print('Importing paper data...')
        Paper.objects.bulk_create([
            Paper(id=i,
                  title=p['title'],
                  venue_id=vid_map[p['venue']],
                  year=p['year'],
                  abstract=p['abstract'],
                  n_citation=p['n_citation'])
            for i, p in enumerate(iter_json(raw_path / 'mag_papers.txt'))
        ],
                                  batch_size=batch_size)

        print('Importing paper relation data (this is slow)...')
        print('writes')
        u, v = g.edges(etype='writes')
        order = g.edges['writes'].data['order']
        edges = list(zip(u.tolist(), v.tolist(), order.tolist()))
        for i in trange(0, len(edges), batch_size):
            Writes.objects.bulk_create([
                Writes(author_id=a, paper_id=p, order=r)
                for a, p, r in edges[i:i + batch_size]
            ])

        print('has_field')
        u, v = g.edges(etype='has_field')
        edges = list(zip(u.tolist(), v.tolist()))
        HasField = Paper.fos.through
        for i in trange(0, len(edges), batch_size):
            HasField.objects.bulk_create([
                HasField(paper_id=p, field_id=f)
                for p, f in edges[i:i + batch_size]
            ])

        print('cites')
        u, v = g.edges(etype='cites')
        edges = list(zip(u.tolist(), v.tolist()))
        Cites = Paper.references.through
        for i in trange(0, len(edges), batch_size):
            Cites.objects.bulk_create([
                Cites(from_paper_id=p, to_paper_id=r)
                for p, r in edges[i:i + batch_size]
            ])
        print('Import finished')