Example #1
def reduced_graph(graph: DGLGraph, center_node: int, paths: dict, node_attr_name, edge_attr_name):
    """
    Reduce the graph into a simpler star-shaped graph around a single center node.
    :param graph: the graph to be reduced
    :param center_node: the reference (center) node
    :param paths: the BFS traversal path from the center node to every other node
    :param node_attr_name: name of the node feature to copy into the new graph
    :param edge_attr_name: name of the edge weight feature
    :return: new_graph
    """
    new_graph = DGLGraph()
    new_graph.add_nodes(num=graph.number_of_nodes())
    new_graph.ndata[node_attr_name] = graph.ndata[node_attr_name]
    for node, path in paths.items():
        # multiply the edge weights along the path, decayed by the hop index
        path_weight = torch.tensor([1.])
        for index, edge in enumerate(path):
            path_weight *= graph.edata[edge_attr_name][graph.edge_id(edge[0], edge[1])] * math.exp(-index)
        new_graph.add_edge(center_node, node, data={edge_attr_name: path_weight})
        new_graph.add_edge(node, center_node, data={edge_attr_name: path_weight})
    # add self loops with unit weight, then normalize all edge weights with a softmax
    new_graph.add_edges(new_graph.nodes(), new_graph.nodes(),
                        data={edge_attr_name: torch.ones(new_graph.number_of_nodes())})
    new_graph.edata[edge_attr_name] = new_graph.edata[edge_attr_name].softmax(dim=0)
    return new_graph
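A minimal, hypothetical usage sketch for the function above, assuming the legacy mutable DGLGraph API (dgl <= 0.4) and a paths dict that maps each target node to the list of edges on its BFS path from the center node; the toy graph, feature names, and weights below are made up for illustration:

import torch
from dgl import DGLGraph

g = DGLGraph()
g.add_nodes(3)
g.add_edges([0, 1], [1, 2], data={'w': torch.tensor([0.5, 0.4])})  # a 3-node path 0 -> 1 -> 2
g.ndata['h'] = torch.randn(3, 4)

# BFS paths from center node 0: node 1 via edge (0, 1), node 2 via (0, 1) then (1, 2)
paths = {1: [(0, 1)], 2: [(0, 1), (1, 2)]}
star = reduced_graph(g, center_node=0, paths=paths, node_attr_name='h', edge_attr_name='w')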
Example #2
def generate_graph_old(grad=False):
    g = DGLGraph()
    g.add_nodes(10)  # 10 nodes
    # create a graph where 0 is the source and 9 is the sink
    # 17 edges
    for i in range(1, 9):
        g.add_edge(0, i)
        g.add_edge(i, 9)
    # add a back flow from 9 to 0
    g.add_edge(9, 0)
    g = g.to(F.ctx())
    ncol = F.randn((10, D))
    ecol = F.randn((17, D))
    if grad:
        ncol = F.attach_grad(ncol)
        ecol = F.attach_grad(ecol)

    g.ndata['h'] = ncol
    g.edata['w'] = ecol
    g.set_n_initializer(dgl.init.zero_initializer)
    g.set_e_initializer(dgl.init.zero_initializer)
    return g
Example #3
def Get_DGL():
    key1 = []
    key2 = []
    maximum = 0
    with open('/home/student/raw_data/entity/triple2id4.txt', 'r') as ft:
        readlines = ft.readlines()
        for line in readlines:
            lines = line.split('\t')
            nod1 = int(lines[0])
            nod2 = int(lines[2])
            maximum = max(maximum, nod1, nod2)
            key1.append(nod1)
            key2.append(nod2)
    #print(len(key1))
    c = DGLGraph()
    c.add_nodes(maximum + 1)
    c.add_edges(key1, key2)
    return c
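The loader above only uses columns 0 and 2 of each tab-separated line as the source and destination node ids. A hypothetical snippet showing the expected file layout (the path inside Get_DGL is hard-coded, so this is illustrative only):

sample = "0\t5\t1\n" \
         "1\t7\t2\n" \
         "2\t5\t0\n"
# each line: <src node id> \t <ignored middle column> \t <dst node id>
with open('triple2id4.txt', 'w') as ft:
    ft.write(sample)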
Example #4
def generate_er_graph(n, p):
    G = DGLGraph()
    G.add_nodes(n)

    w = -1
    lp = math.log(1.0 - p)

    # Nodes in graph are from 0,n-1 (start with v as the first node index).
    v = 1
    edges_list = []
    while v < n:
        lr = math.log(1.0 - random.random())
        w = w + 1 + int(lr / lp)
        while w >= v and v < n:
            w = w - v
            v = v + 1
        if v < n:
            edges_list.extend([(v, w), (w, v)])

    if edges_list:  # p can be small enough that no edges were sampled
        G.add_edges(*zip(*edges_list))

    return G
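generate_er_graph uses the geometric edge-skipping trick of fast G(n, p) samplers (the same idea as networkx's fast_gnp_random_graph): instead of testing every node pair, it jumps ahead by a geometrically distributed number of candidate pairs, so the expected cost grows with the number of edges rather than with n^2. A small, hypothetical usage sketch (assumes 0 < p < 1 so the logarithms are defined):

import random

random.seed(0)                      # reproducible sample
er = generate_er_graph(1000, 0.01)  # sparse Erdos-Renyi graph, each edge added in both directions
print(er.number_of_nodes(), er.number_of_edges())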
Example #5
    def load_npz(file_name):
        with np.load(file_name, allow_pickle=True) as loader:
            loader = dict(loader)
            num_nodes = loader['adj_shape'][0]
            adj_matrix = sp.csr_matrix(
                (loader['adj_data'], loader['adj_indices'],
                 loader['adj_indptr']),
                shape=loader['adj_shape']).tocoo()

            if 'attr_data' in loader:
                # Attributes are stored as a sparse CSR matrix
                attr_matrix = sp.csr_matrix(
                    (loader['attr_data'], loader['attr_indices'],
                     loader['attr_indptr']),
                    shape=loader['attr_shape']).todense()
            elif 'attr_matrix' in loader:
                # Attributes are stored as a (dense) np.ndarray
                attr_matrix = loader['attr_matrix']
            else:
                attr_matrix = None

            if 'labels_data' in loader:
                # Labels are stored as a CSR matrix
                labels = sp.csr_matrix(
                    (loader['labels_data'], loader['labels_indices'],
                     loader['labels_indptr']),
                    shape=loader['labels_shape']).todense()
            elif 'labels' in loader:
                # Labels are stored as a numpy array
                labels = loader['labels']
            else:
                labels = None
        g = DGLGraph()
        g.add_nodes(num_nodes)
        g.add_edges(adj_matrix.row, adj_matrix.col)
        g.add_edges(adj_matrix.col, adj_matrix.row)
        g.ndata['feat'] = attr_matrix
        g.ndata['label'] = labels
        return g
Example #6
def test_recv_0deg_newfld():
    # test recv with 0deg nodes; the reducer also creates a new field
    g = DGLGraph()
    g.add_nodes(2)
    g.add_edge(0, 1)
    def _message(edges):
        return {'m' : edges.src['h']}
    def _reduce(nodes):
        return {'h1' : nodes.data['h'] + F.sum(nodes.mailbox['m'], 1)}
    def _apply(nodes):
        return {'h1' : nodes.data['h1'] * 2}
    def _init2(shape, dtype, ctx, ids):
        return 2 + F.zeros(shape, dtype=dtype, ctx=ctx)
    g.register_message_func(_message)
    g.register_reduce_func(_reduce)
    g.register_apply_node_func(_apply)
    # test#1: recv both 0deg and non-0deg nodes
    old = F.randn((2, 5))
    g.set_n_initializer(_init2, 'h1')
    g.ndata['h'] = old
    g.send((0, 1))
    g.recv([0, 1])
    new = g.ndata.pop('h1')
    # 0deg check: initialized with the func and got applied
    assert F.allclose(new[0], F.full_1d(5, 4, dtype=F.float32))
    # non-0deg check
    assert F.allclose(new[1], F.sum(old, 0) * 2)

    # test#2: recv only 0deg node
    old = F.randn((2, 5))
    g.ndata['h'] = old
    g.ndata['h1'] = F.full((2, 5), -1, F.int64)  # this is necessary
    g.send((0, 1))
    g.recv(0)
    new = g.ndata.pop('h1')
    # 0deg check: fallback to apply
    assert F.allclose(new[0], F.full_1d(5, -2, F.int64))
    # non-0deg check: not changed
    assert F.allclose(new[1], F.full_1d(5, -1, F.int64))
Example #7
def _move_tokens_to_leaves(graph: DGLGraph, pad_token_index: int, pad_type_index: int) -> DGLGraph:
    old_token = graph.ndata['token'].numpy()
    n_old_nodes = old_token.shape[0]

    type_mask = graph.ndata['type'].numpy() != pad_type_index
    type_mask = np.tile(type_mask.reshape(-1, 1), old_token.shape[1])
    mask = np.logical_and(old_token != pad_token_index, type_mask)
    n_new_nodes = mask.sum()

    new_token = np.full((n_old_nodes + n_new_nodes, 1), pad_token_index, dtype=np.int64)
    new_token[:n_old_nodes] = np.where(~type_mask, old_token, pad_token_index)[:, [0]]
    new_token[n_old_nodes:] = old_token[mask].reshape(-1, 1)

    us, _ = np.nonzero(mask)
    vs = np.arange(n_new_nodes) + n_old_nodes

    graph.add_nodes(n_new_nodes)
    graph.add_edges(us, vs)

    graph.ndata['type'][n_old_nodes:] = pad_type_index
    graph.ndata['token'] = new_token
    return graph
Example #8
    def fromText(self, fileName):
        '''
        The text file should be of the following format:
        [several rows of description here]
        [The first valid line should give nodes number and edges number: e.g. 'N180 E1999']
        [Rows of data, with each row being '{fromID} {toID}']

        And this function will return a graph generated by the given information.
        '''

        with open(fileName, 'r') as f:
            while True:
                retr = f.readline()
                if retr == '':
                    return None

                match = re.search(r'N(\d+)\s+E(\d+)', retr)
                if match:
                    # Number of nodes and edges
                    self.size = [int(i) for i in match.groups()]
                    break
            # retrieve edges
            tmpData = f.readlines()
            pat = re.compile(r'(\d+)\s+(\d+)')

            Fromlist = np.empty(self.size[1], dtype=np.int64)
            Tolist = np.empty(self.size[1], dtype=np.int64)
            for i in range(len(tmpData)):
                match = pat.search(tmpData[i])
                Fromlist[i], Tolist[i] = [int(j) for j in match.groups()]

            G = DGLGraph()
            G.add_nodes(self.size[0])
            G.add_edges(Fromlist, Tolist)

            return G
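A hypothetical input file for fromText: description lines are skipped until a line matching 'N<nodes> E<edges>' is found, and every following line contributes one '<fromID> <toID>' edge. The file name and the enclosing-class name below are stand-ins:

sample = """a toy graph with 4 nodes and 3 edges
N4 E3
0 1
1 2
2 3
"""
with open('toy_graph.txt', 'w') as f:
    f.write(sample)
# g = SomeGraphLoader().fromText('toy_graph.txt')  # SomeGraphLoader is a placeholder for the enclosing class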
Example #9
def test_recv_0deg():
    # test recv with 0deg nodes;
    g = DGLGraph()
    g.add_nodes(2)
    g.add_edge(0, 1)
    def _message(edges):
        return {'m' : edges.src['h']}
    def _reduce(nodes):
        return {'h' : nodes.data['h'] + F.sum(nodes.mailbox['m'], 1)}
    def _apply(nodes):
        return {'h' : nodes.data['h'] * 2}
    def _init2(shape, dtype, ctx, ids):
        return 2 + F.zeros(shape, dtype, ctx)
    g.register_message_func(_message)
    g.register_reduce_func(_reduce)
    g.register_apply_node_func(_apply)
    g.set_n_initializer(_init2, 'h')
    # test#1: recv both 0deg and non-0deg nodes
    old = F.randn((2, 5))
    g.ndata['h'] = old
    g.send((0, 1))
    g.recv([0, 1])
    new = g.ndata.pop('h')
    # 0deg check: initialized with the func and got applied
    assert F.allclose(new[0], F.full_1d(5, 4, F.float32))
    # non-0deg check
    assert F.allclose(new[1], F.sum(old, 0) * 2)

    # test#2: recv only 0deg node is equal to apply
    old = F.randn((2, 5))
    g.ndata['h'] = old
    g.send((0, 1))
    g.recv(0)
    new = g.ndata.pop('h')
    # 0deg check: equal to apply_nodes
    assert F.allclose(new[0], 2 * old[0])
    # non-0deg check: untouched
    assert F.allclose(new[1], old[1])
Example #10
def test_update_all_0deg():
    # test#1
    g = DGLGraph()
    g = g.to(F.ctx())
    g.add_nodes(5)
    g.add_edge(1, 0)
    g.add_edge(2, 0)
    g.add_edge(3, 0)
    g.add_edge(4, 0)
    def _message(edges):
        return {'m' : edges.src['h']}
    def _reduce(nodes):
        return {'h' : nodes.data['h'] + F.sum(nodes.mailbox['m'], 1)}
    def _apply(nodes):
        return {'h' : nodes.data['h'] * 2}
    def _init2(shape, dtype, ctx, ids):
        return 2 + F.zeros(shape, dtype, ctx)
    g.set_n_initializer(_init2, 'h')
    old_repr = F.randn((5, 5))
    g.ndata['h'] = old_repr
    g.update_all(_message, _reduce, _apply)
    new_repr = g.ndata['h']
    # the first row of the new_repr should be the sum of all the node
    # features; while the 0-deg nodes should be initialized by the
    # initializer and applied with UDF.
    assert F.allclose(new_repr[1:], 2*(2+F.zeros((4,5))))
    assert F.allclose(new_repr[0], 2 * F.sum(old_repr, 0))

    # test#2: graph with no edge
    g = DGLGraph()
    g = g.to(F.ctx())
    g.add_nodes(5)
    g.set_n_initializer(_init2, 'h')
    g.ndata['h'] = old_repr
    g.update_all(_message, _reduce, _apply)
    new_repr = g.ndata['h']
    # should fallback to apply
    assert F.allclose(new_repr, 2*old_repr)
Example #11
def create_g(file_path, use_cuda=False):
    npz = np.load(file_path, allow_pickle=True)
    labels = npz['labels']
    fts_nodes = npz['fts_node']
    edge_type = npz['edge_type'].tolist()
    edge_norm = npz['edge_norm'].tolist()
    edges = npz['edges']

    #num_nodes is number of nodes in the graph
    num_nodes = int(npz['nums'])
    labels = labels[0:num_nodes]
    fts_nodes = fts_nodes[0:num_nodes]
    g = DGLGraph()
    g.add_nodes(num_nodes)

    edge_type = np.array(edge_type)
    edge_norm = np.array(edge_norm)
    #adding edges from numpy files
    g.add_edges(edges[:, 0], edges[:, 1])

    edge_type = torch.from_numpy(edge_type)
    edge_norm = torch.from_numpy(edge_norm).unsqueeze(1)
    edge_norm = edge_norm.float()

    fts_nodes = fts_nodes.astype(int)
    fts_nodes = torch.from_numpy(fts_nodes)

    labels = torch.from_numpy(labels)

    if (use_cuda):
        labels = labels.cuda()
        edge_type = edge_type.cuda()
        edge_norm = edge_norm.cuda()
        fts_nodes = fts_nodes.cuda()

    g.edata.update({'rel_type': edge_type, 'norm': edge_norm})
    g.ndata['id'] = fts_nodes
    return [g, labels, fts_nodes]
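A hypothetical round trip for create_g: the .npz keys below ('nums', 'edges', 'edge_type', 'edge_norm', 'fts_node', 'labels') mirror the keys the loader reads; the tiny graph and file name are made up for illustration:

import numpy as np

np.savez('tiny_rgcn_graph.npz',
         nums=3,                                # number of nodes
         edges=np.array([[0, 1], [1, 2]]),      # one edge per row: (src, dst)
         edge_type=np.array([0, 1]),
         edge_norm=np.array([1.0, 0.5]),
         fts_node=np.arange(3),                 # node feature ids
         labels=np.array([0, 1, 0]))
g, labels, fts_nodes = create_g('tiny_rgcn_graph.npz')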
Example #12
    def process(self, mol: Mol, atom_map):
        n = mol.GetNumAtoms() + 1

        graph = DGLGraph()
        graph.add_nodes(n)
        graph.add_edges(graph.nodes(), graph.nodes())
        graph.add_edges(range(1, n), 0)
        # graph.add_edges(0, range(1, n))
        for e in mol.GetBonds():
            u, v = e.GetBeginAtomIdx(), e.GetEndAtomIdx()
            graph.add_edge(u + 1, v + 1)
            graph.add_edge(v + 1, u + 1)

        feature = torch.cat([
            torch.zeros((1, self.feature_dim), device=self.device),  # node 0
            torch.nn.functional.one_hot(torch.tensor(
                [atom_map[u.GetAtomicNum()] for u in mol.GetAtoms()],
                device=self.device),
                                        num_classes=self.feature_dim).to(
                                            torch.float)
        ])

        return GCNData(n, graph, feature)
Example #13
def main(args):
    NUM_NODES = args.num_nodes
    NUM_HIDDEN = args.num_hidden
    IN_FEATS = args.in_feats
    torch.cuda.set_device(args.gpu)

    g = DGLGraph()
    g.add_nodes(NUM_NODES)
    g.add_edges([i for i in range(NUM_NODES)], 0)
    g.add_edges([i for i in range(NUM_NODES)], 1)
    norm = torch.rand((NUM_NODES, 1)).cuda()
    g.ndata['norm'] = norm
    feat_src = torch.rand((NUM_NODES, IN_FEATS))
    feat_src.requires_grad = True
    feat_src = feat_src.cuda()
    conv_test = EglGCNConvTest(g,
                               IN_FEATS,
                               NUM_HIDDEN,
                               activation=torch.nn.functional.relu,
                               dropout=args.dropout,
                               bias=True)
    conv_test.cuda()
    dgl_rst, egl_rst = conv_test.forward(feat_src)
Example #14
def table_to_dgl_graph(par_tab_nums, foreign_keys, col_enc, tab_enc):
    g = DGLGraph()
    col_id_offset = max(par_tab_nums) + 1
    g.add_nodes(len(par_tab_nums) + col_id_offset)
    # column id: max table num + 1 + original column num
    table_id_list = range(col_id_offset)
    col_id_list = range(len(par_tab_nums))
    g.add_edges(table_id_list, table_id_list)
    g.add_edges(col_id_list, col_id_list)
    edge_types = [0] * len(table_id_list) + [1] * len(col_id_list)
    table_children_src = []
    table_children_dst = []
    for idx, par_tab_num in enumerate(par_tab_nums):
        if par_tab_num != -1:
            table_children_src.append(par_tab_num)
            table_children_dst.append(idx + col_id_offset)
    g.add_edges(table_children_src, table_children_dst)
    g.add_edges(table_children_dst, table_children_src)
    edge_types += [2] * len(table_children_src) + [3] * len(table_children_dst)

    if foreign_keys:
        foreign_key_srcs, foreign_key_dsts = zip(*foreign_keys)
        foreign_key_srcs = list(
            map(lambda col_num: col_num + col_id_offset, foreign_key_srcs))
        foreign_key_dsts = list(
            map(lambda col_num: col_num + col_id_offset, foreign_key_dsts))
        g.add_edges(foreign_key_srcs, foreign_key_dsts)
        g.add_edges(foreign_key_dsts, foreign_key_srcs)
        edge_types += [4] * len(foreign_key_srcs) + [5] * len(foreign_key_dsts)

    edge_types = torch.from_numpy(np.array(edge_types))
    if torch.cuda.is_available():
        edge_types = edge_types.cuda()
    g.edata.update({'rel_type': edge_types})
    g.ndata['h'] = torch.cat(
        (tab_enc[:col_id_offset], col_enc[:len(par_tab_nums)]))
    return g
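A minimal, hypothetical call for the schema graph builder above. In the code, edge types 0 and 1 mark the table and column self loops, 2 and 3 the parent/child links between tables and their columns (one type per direction), and 4 and 5 the foreign-key links; the encoding size 16 and the toy schema are made up:

import torch

par_tab_nums = [0, 0, 1]       # columns 0 and 1 belong to table 0, column 2 to table 1
foreign_keys = [(1, 2)]        # column 1 references column 2
tab_enc = torch.randn(2, 16)   # one encoding per table
col_enc = torch.randn(3, 16)   # one encoding per column
g = table_to_dgl_graph(par_tab_nums, foreign_keys, col_enc, tab_enc)
print(g.number_of_nodes())     # 2 table nodes + 3 column nodes = 5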
Example #15
def construct_complete_graph_from_mol(mol, add_self_loop=False):
    """Construct a complete graph with topology only for the molecule

    The **i** th atom in the molecule, i.e. ``mol.GetAtomWithIdx(i)``, corresponds to the
    **i** th node in the returned DGLGraph.

    The edges are in the order of (0, 0), (1, 0), (2, 0), ... (0, 1), (1, 1), (2, 1), ...
    If self loops are not created, we will not have (0, 0), (1, 1), ...

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.

    Returns
    -------
    g : DGLGraph
        Empty complete graph topology of the molecule
    """
    g = DGLGraph()
    num_atoms = mol.GetNumAtoms()
    g.add_nodes(num_atoms)

    if add_self_loop:
        g.add_edges(
            [i for i in range(num_atoms) for j in range(num_atoms)],
            [j for i in range(num_atoms) for j in range(num_atoms)])
    else:
        g.add_edges(
            [i for i in range(num_atoms) for j in range(num_atoms - 1)], [
                j for i in range(num_atoms)
                for j in range(num_atoms) if i != j
            ])

    return g
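A hypothetical usage sketch, assuming RDKit is available: propane has three heavy atoms, so the complete graph without self loops has 3 * 2 = 6 directed edges.

from rdkit import Chem

mol = Chem.MolFromSmiles('CCC')             # propane, 3 heavy atoms
g = construct_complete_graph_from_mol(mol)
print(g.number_of_nodes(), g.number_of_edges())  # 3 6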
Example #16
def convert_mol_to_graph(
    mol: Mol,
    conformer: Optional[Conformer],
    atom_rdkit_features: Sequence[RDKitFeature],
    bond_rdkit_features: Sequence[RDKitFeature],
    one_hot_encoding: bool = True,
    master_node: bool = True,
) -> DGLGraph:

    _graph_dict: Dict[str, torch.Tensor] = convert_mol_to_generic_graph(
        mol=mol,
        conformer=conformer,
        atom_rdkit_features=atom_rdkit_features,
        bond_rdkit_features=bond_rdkit_features,
        one_hot_encoding=one_hot_encoding,
        master_node=master_node,
    )

    dgl_graph = DGLGraph()
    dgl_graph.add_nodes(num=len(_graph_dict['node_attr']), )
    dgl_graph.nodes.data['attr'] = _graph_dict['node_attr']
    dgl_graph.nodes.data['pos'] = _graph_dict['node_pos']

    # all DGL graphs are directional, so need to add edges twice
    # ref: https://docs.dgl.ai/api/python/graph.html
    dgl_graph.add_edges(
        _graph_dict['edge_index'][:, 0],
        _graph_dict['edge_index'][:, 1],
        _graph_dict['edge_attr'],
    )
    dgl_graph.add_edges(
        _graph_dict['edge_index'][:, 1],
        _graph_dict['edge_index'][:, 0],
        _graph_dict['edge_attr'],
    )

    return dgl_graph
Example #17
def get_graph_from_smile(molecule_smile):
    """
    Method that constructs a molecular graph with nodes being the atoms
    and bonds being the edges.
    :param molecule_smile: SMILES string of the molecule
    :return: DGL graph object with node features in ndata['x'] and edge features in edata['w']
    """

    G = DGLGraph()
    molecule = Chem.MolFromSmiles(molecule_smile)
    features = rdDesc.GetFeatureInvariants(molecule)

    stereo = Chem.FindMolChiralCenters(molecule)
    chiral_centers = [0] * molecule.GetNumAtoms()
    for i in stereo:
        chiral_centers[i[0]] = i[1]

    G.add_nodes(molecule.GetNumAtoms())
    node_features = []
    edge_features = []
    for i in range(molecule.GetNumAtoms()):

        atom_i = molecule.GetAtomWithIdx(i)
        atom_i_features = get_atom_features(atom_i, chiral_centers[i], features[i])
        node_features.append(atom_i_features)

        for j in range(molecule.GetNumAtoms()):
            bond_ij = molecule.GetBondBetweenAtoms(i, j)
            if bond_ij is not None:
                G.add_edge(i, j)
                bond_features_ij = get_bond_features(bond_ij)
                edge_features.append(bond_features_ij)

    G.ndata['x'] = np.array(node_features)
    G.edata['w'] = np.array(edge_features)
    return G
Example #18
def test_send_multigraph():
    g = DGLGraph(multigraph=True)
    g.add_nodes(3)
    g.add_edge(0, 1)
    g.add_edge(0, 1)
    g.add_edge(0, 1)
    g.add_edge(2, 1)

    def _message_a(edges):
        return {'a': edges.data['a']}

    def _message_b(edges):
        return {'a': edges.data['a'] * 3}

    def _reduce(nodes):
        return {'a': F.max(nodes.mailbox['a'], 1)}

    def answer(*args):
        return F.max(F.stack(args, 0), 0)

    # send by eid
    old_repr = F.randn((4, 5))
    g.ndata['a'] = F.zeros((3, 5))
    g.edata['a'] = old_repr
    g.send([0, 2], message_func=_message_a)
    g.recv(1, _reduce)
    new_repr = g.ndata['a']
    assert F.allclose(new_repr[1], answer(old_repr[0], old_repr[2]))

    g.ndata['a'] = F.zeros((3, 5))
    g.edata['a'] = old_repr
    g.send([0, 2, 3], message_func=_message_a)
    g.recv(1, _reduce)
    new_repr = g.ndata['a']
    assert F.allclose(new_repr[1], answer(old_repr[0], old_repr[2],
                                          old_repr[3]))

    # send on multigraph
    g.ndata['a'] = F.zeros((3, 5))
    g.edata['a'] = old_repr
    g.send(([0, 2], [1, 1]), _message_a)
    g.recv(1, _reduce)
    new_repr = g.ndata['a']
    assert F.allclose(new_repr[1], F.max(old_repr, 0))

    # consecutive send and send_on
    g.ndata['a'] = F.zeros((3, 5))
    g.edata['a'] = old_repr
    g.send((2, 1), _message_a)
    g.send([0, 1], message_func=_message_b)
    g.recv(1, _reduce)
    new_repr = g.ndata['a']
    assert F.allclose(new_repr[1],
                      answer(old_repr[0] * 3, old_repr[1] * 3, old_repr[3]))

    # consecutive send_on
    g.ndata['a'] = F.zeros((3, 5))
    g.edata['a'] = old_repr
    g.send(0, message_func=_message_a)
    g.send(1, message_func=_message_b)
    g.recv(1, _reduce)
    new_repr = g.ndata['a']
    assert F.allclose(new_repr[1], answer(old_repr[0], old_repr[1] * 3))

    # send_and_recv_on
    g.ndata['a'] = F.zeros((3, 5))
    g.edata['a'] = old_repr
    g.send_and_recv([0, 2, 3], message_func=_message_a, reduce_func=_reduce)
    new_repr = g.ndata['a']
    assert F.allclose(new_repr[1], answer(old_repr[0], old_repr[2],
                                          old_repr[3]))
    assert F.allclose(new_repr[[0, 2]], F.zeros((2, 5)))
Example #19
###############################################################################
# Create graph and model
# ~~~~~~~~~~~~~~~~~~~~~~~

# configurations
n_hidden = 16  # number of hidden units
n_bases = -1  # use number of relations as number of bases
n_hidden_layers = 0  # use 1 input layer, 1 output layer, no hidden layer
n_epochs = 25  # epochs to train
lr = 0.01  # learning rate
l2norm = 0  # L2 norm coefficient

# create graph
g = DGLGraph()
g.add_nodes(num_nodes)
g.add_edges(data.edge_src, data.edge_dst)
g.edata.update({'rel_type': edge_type, 'norm': edge_norm})

# create model
model = Model(len(g),
              n_hidden,
              num_classes,
              num_rels,
              num_bases=n_bases,
              num_hidden_layers=n_hidden_layers)

###############################################################################
# Training loop
# ~~~~~~~~~~~~~~~~
Example #20
def main(args):
    # load and preprocess dataset

    #FIRST, CHECK DATASET
    path = './dataset/' + str(args.dataset) + '/'
    '''
    edges = np.loadtxt(path + 'edges.txt')
    edges = edges.astype(int)

    features = np.loadtxt(path + 'features.txt')

    train_mask = np.loadtxt(path + 'train_mask.txt')
    train_mask = train_mask.astype(int)

    labels = np.loadtxt(path + 'labels.txt')
    labels = labels.astype(int)
    '''
    edges = np.load(path + 'edges.npy')
    features = np.load(path + 'features.npy')
    train_mask = np.load(path + 'train_mask.npy')
    labels = np.load(path + 'labels.npy')

    num_edges = edges.shape[0]
    num_nodes = features.shape[0]
    num_feats = features.shape[1]
    n_classes = max(labels) - min(labels) + 1

    assert train_mask.shape[0] == num_nodes

    print('dataset {}'.format(args.dataset))
    print('# of edges : {}'.format(num_edges))
    print('# of nodes : {}'.format(num_nodes))
    print('# of features : {}'.format(num_feats))

    features = torch.FloatTensor(features)
    labels = torch.LongTensor(labels)

    if hasattr(torch, 'BoolTensor'):
        train_mask = torch.BoolTensor(train_mask)

    else:
        train_mask = torch.ByteTensor(train_mask)

    if args.gpu < 0:
        cuda = False
    else:
        cuda = True
        torch.cuda.set_device(args.gpu)
        features = features.cuda()
        labels = labels.cuda()
        train_mask = train_mask.cuda()

    u = edges[:, 0]
    v = edges[:, 1]

    #initialize a DGL graph
    g = DGLGraph()
    g.add_nodes(num_nodes)
    g.add_edges(u, v)

    # graph preprocess and calculate normalization factor

    n_edges = g.number_of_edges()
    # add self loop
    g.add_edges(g.nodes(), g.nodes())
    g.set_n_initializer(dgl.init.zero_initializer)
    g.set_e_initializer(dgl.init.zero_initializer)

    # create APPNP model
    model = EglAPPNP(g, num_feats, args.hidden_sizes, n_classes, F.relu,
                     args.in_drop, args.edge_drop, args.alpha, args.k)

    if cuda:
        model.cuda()
    loss_fcn = torch.nn.CrossEntropyLoss()

    # use optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
    # initialize graph
    dur = []
    Used_memory = 0

    for epoch in range(args.num_epochs):

        torch.cuda.synchronize()
        model.train()
        t0 = time.time()
        # forward
        logits = model(features)
        loss = loss_fcn(logits[train_mask], labels[train_mask])
        now_mem = torch.cuda.max_memory_allocated(0)
        Used_memory = max(now_mem, Used_memory)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        torch.cuda.synchronize()
        t2 = time.time()
        run_time_this_epoch = t2 - t0

        if epoch >= 3:
            dur.append(run_time_this_epoch)

        train_acc = accuracy(logits[train_mask], labels[train_mask])

        print(
            'Epoch {:05d} | Time(s) {:.4f} | train_acc {:.6f} | Used_Memory {:.6f} mb'
            .format(epoch, run_time_this_epoch, train_acc,
                    (now_mem * 1.0 / (1024**2))))

    Used_memory /= (1024**3)
    print('^^^{:6f}^^^{:6f}'.format(Used_memory, np.mean(dur)))
Example #21
def reversed_graph(g):
    ret = DGLGraph()
    ret.add_nodes(g.number_of_nodes())
    u, v = g.all_edges()
    ret.add_edges(v, u)
    return ret
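A small sanity check for reversed_graph, assuming the legacy mutable DGLGraph API: every edge (u, v) of the input appears as (v, u) in the result.

from dgl import DGLGraph

g = DGLGraph()
g.add_nodes(3)
g.add_edges([0, 1], [1, 2])   # edges 0 -> 1 and 1 -> 2
rg = reversed_graph(g)
print(rg.all_edges())         # edges of rg are 1 -> 0 and 2 -> 1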
Example #22
def main(args):
    # load and preprocess dataset
    path = './dataset/' + str(args.dataset) + '/'
    '''
    edges = np.loadtxt(path + 'edges.txt')
    edges = edges.astype(int)

    features = np.loadtxt(path + 'features.txt')

    train_mask = np.loadtxt(path + 'train_mask.txt')
    train_mask = train_mask.astype(int)

    labels = np.loadtxt(path + 'labels.txt')
    labels = labels.astype(int)
    '''
    edges = np.load(path + 'edges.npy')
    features = np.load(path + 'features.npy')
    train_mask = np.load(path + 'train_mask.npy')
    labels = np.load(path + 'labels.npy')

    num_edges = edges.shape[0]
    num_nodes = features.shape[0]
    num_feats = features.shape[1]
    n_classes = int(max(labels) - min(labels) + 1)

    assert train_mask.shape[0] == num_nodes

    print('dataset {}'.format(args.dataset))
    print('# of edges : {}'.format(num_edges))
    print('# of nodes : {}'.format(num_nodes))
    print('# of features : {}'.format(num_feats))

    features = torch.FloatTensor(features)
    labels = torch.LongTensor(labels)

    if hasattr(torch, 'BoolTensor'):
        train_mask = torch.BoolTensor(train_mask)

    else:
        train_mask = torch.ByteTensor(train_mask)

    if args.gpu < 0:
        cuda = False
    else:
        cuda = True
        torch.cuda.set_device(args.gpu)
        features = features.cuda()
        labels = labels.cuda()
        train_mask = train_mask.cuda()
    '''
    # graph preprocess and calculate normalization factor
    g = data.graph
    # add self loop
    if args.self_loop:
        g.remove_edges_from(nx.selfloop_edges(g))
        g.add_edges_from(zip(g.nodes(), g.nodes()))
    g = DGLGraph(g)
    n_edges = g.number_of_edges()
    '''
    u = edges[:, 0]
    v = edges[:, 1]
    g = DGLGraph()
    g.add_nodes(num_nodes)
    g.add_edges(u, v)
    # add self loop
    if args.self_loop:
        g = transform.add_self_loop(g)

    # normalization
    degs = g.in_degrees().float()
    norm = torch.pow(degs, -0.5)
    norm[torch.isinf(norm)] = 0
    if cuda:
        norm = norm.cuda()
    g.ndata['norm'] = norm.unsqueeze(1)

    model = EglGCN(g, num_feats, args.num_hidden, n_classes, args.num_layers,
                   F.relu, args.dropout)

    if cuda:
        model.cuda()
    loss_fcn = torch.nn.CrossEntropyLoss()

    # use optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)

    # initialize graph
    dur = []
    Used_memory = 0

    for epoch in range(args.num_epochs):
        model.train()
        torch.cuda.synchronize()
        t0 = time.time()
        # forward
        logits = model(features)
        loss = loss_fcn(logits[train_mask], labels[train_mask])
        now_mem = torch.cuda.max_memory_allocated(0)
        Used_memory = max(now_mem, Used_memory)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        torch.cuda.synchronize()
        run_time_this_epoch = time.time() - t0

        if epoch >= 3:
            dur.append(run_time_this_epoch)

        train_acc = accuracy(logits[train_mask], labels[train_mask])
        print(
            'Epoch {:05d} | Time(s) {:.4f} | train_acc {:.6f} | Used_Memory {:.6f} mb'
            .format(epoch, run_time_this_epoch, train_acc,
                    (now_mem * 1.0 / (1024**2))))

    Used_memory /= (1024**3)
    print('^^^{:6f}^^^{:6f}'.format(Used_memory, np.mean(dur)))
Example #23
def main(args):
    # load graph data
    data = load_data(args.dataset,
                     bfs_level=args.bfs_level,
                     relabel=args.relabel)
    num_nodes = data.num_nodes
    num_rels = data.num_rels
    num_classes = data.num_classes
    labels = data.labels
    train_idx = data.train_idx
    test_idx = data.test_idx

    # split dataset into train, validate, test
    if args.validation:
        val_idx = train_idx[:len(train_idx) // 5]
        train_idx = train_idx[len(train_idx) // 5:]
    else:
        val_idx = train_idx

    # since the nodes are featureless, the input feature is then the node id.
    feats = torch.arange(num_nodes)

    # edge type and normalization factor
    edge_type = torch.from_numpy(data.edge_type)
    edge_norm = torch.from_numpy(data.edge_norm).unsqueeze(1)
    labels = torch.from_numpy(labels).view(-1)

    # check cuda
    use_cuda = args.gpu >= 0 and torch.cuda.is_available()
    if use_cuda:
        torch.cuda.set_device(args.gpu)
        feats = feats.cuda()
        edge_type = edge_type.cuda()
        edge_norm = edge_norm.cuda()
        labels = labels.cuda()

    # create graph
    g = DGLGraph()
    g.add_nodes(num_nodes)
    g.add_edges(data.edge_src, data.edge_dst)

    # create model
    model = EntityClassify(len(g),
                           args.n_hidden,
                           num_classes,
                           num_rels,
                           num_bases=args.n_bases,
                           num_hidden_layers=args.n_layers - 2,
                           dropout=args.dropout,
                           use_self_loop=args.use_self_loop,
                           use_cuda=use_cuda)

    if use_cuda:
        model.cuda()

    # optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.l2norm)

    # training loop
    print("start training...")
    forward_time = []
    backward_time = []
    model.train()
    for epoch in range(args.n_epochs):
        optimizer.zero_grad()
        t0 = time.time()
        logits = model(g, feats, edge_type, edge_norm)
        loss = F.cross_entropy(logits[train_idx], labels[train_idx])
        t1 = time.time()
        loss.backward()
        optimizer.step()
        t2 = time.time()

        forward_time.append(t1 - t0)
        backward_time.append(t2 - t1)
        print(
            "Epoch {:05d} | Train Forward Time(s) {:.4f} | Backward Time(s) {:.4f}"
            .format(epoch, forward_time[-1], backward_time[-1]))
        train_acc = torch.sum(logits[train_idx].argmax(
            dim=1) == labels[train_idx]).item() / len(train_idx)
        val_loss = F.cross_entropy(logits[val_idx], labels[val_idx])
        val_acc = torch.sum(logits[val_idx].argmax(
            dim=1) == labels[val_idx]).item() / len(val_idx)
        print(
            "Train Accuracy: {:.4f} | Train Loss: {:.4f} | Validation Accuracy: {:.4f} | Validation loss: {:.4f}"
            .format(train_acc, loss.item(), val_acc, val_loss.item()))
    print()

    model.eval()
    logits = model.forward(g, feats, edge_type, edge_norm)
    test_loss = F.cross_entropy(logits[test_idx], labels[test_idx])
    test_acc = torch.sum(logits[test_idx].argmax(
        dim=1) == labels[test_idx]).item() / len(test_idx)
    print("Test Accuracy: {:.4f} | Test loss: {:.4f}".format(
        test_acc, test_loss.item()))
    print()

    print("Mean forward time: {:4f}".format(
        np.mean(forward_time[len(forward_time) // 4:])))
    print("Mean backward time: {:4f}".format(
        np.mean(backward_time[len(backward_time) // 4:])))
Example #24
class GraphRecommender:
    """Rapidly trains similarity embeddings for graphs and generates recomendations

    Attributes
    ----------
    G : DGL Graph object
        Current DGL graph for all added data with self.add_data
    node_ids : pandas data frame
        Contains mapping from user-provided nodeids to DGL- and faiss-compatible integer ids.
        Also contains various flags which identify properties and classes of the nodes.
    """

    def __init__(self, embedding_dim,
                        feature_dim = None,
                        hidden_dim = None,
                        hidden_layers = 2,
                        dropout = 0,
                        agg_type = 'gcn',
                        distance = 'cosine',
                        torch_device = 'cpu',
                        faiss_gpu = False,
                        inference_batch_size = 10000,
                        p_train = 1,
                        train_faiss_index = False):
        """Generates embeddings for graph data such that embeddings close by a given distance metric are
        'similar'. Embeddings can be used to predict which nodes belong to the same class. The embeddings can be
        trained with triplet loss in a fully supervised, semi-supervised or fully unsupervised manner. GraphSage
        is used to allow minibatch training. Uses faiss index to allow extremely fast query times for most similar
        nodes to a query node even for graphs with billions of nodes. Memory is likely to be the limiting factor before
        query times. 

        Args
        ----
        embedding_dim : int
            the dimension of the final output embedding used for similarity search
        feature_dim : int
            the dimension of the input node features, currently only allowed to be 
            a trainable embedding. In the future should allow external node features.
            Defaults to 2*hidden_dim.
        hidden_dim : int
            the dimension of the intermediate hidden layers. Defaults to 4*embedding_dim.
        hidden_layers : int
            number of hidden layers. Embeddings can collapse to a single value if this
            is set too high. Defaults to 2.
        dropout : float
            dropout probability applied after the hidden layers of GraphSage. Defaults to 0,
            which means no dropout is applied.
        agg_type : str
            aggregation function to apply to GraphSage. Valid options are 'mean', 'lstm', and 'gcn'
            aggregation. See GraphSage paper for implementation details. Defaults to gcn which performs
            well for untrained networks.
        distance : str
            distance metric to use for similarity search. Valid options are l2 and cosine. Defaults to cosine.
        torch_device : str
            computation device to place pytorch tensors on. Valid options are any valid pytorch device. Defaults 
            to cpu.
        faiss_gpu : bool
            whether to use gpu to accelerate faiss searching. Note that it will compete with pytorch for gpu memory.
        inference_batch_size : int
            number of nodes to compute per batch when computing all embeddings with self.net.inference.
            Defaults to 10000, which should comfortably fit on most gpus and be reasonably efficient on cpu.
        p_train : float
            the proportion of nodes with known class labels to use for training. Defaults to 1.
        train_faiss_index : bool
            whether to train the faiss index for faster searches. Not recommended for training since brute force
            will actually be faster than retraining the index at each test iteration. Can be used for api to speed
            up response times.
        """
        self.embedding_dim = embedding_dim
        self.device = torch_device 
        self.inference_batch_size = inference_batch_size
        assert p_train<=1 and p_train>=0
        self.p_train = p_train
        self.faiss_gpu = faiss_gpu
        self.train_faiss = train_faiss_index

        self.distance_metric = distance
        if self.distance_metric == 'cosine':
            self.distance_function = lambda t1,t2 : F.cosine_embedding_loss(t1,
                                                t2,
                                                th.ones(t1.shape[0]).to(self.device),reduce=False)
        elif self.distance_metric == 'l2':
            self.distance_function = lambda t1,t2 : th.sum(F.mse_loss(t1,t2,reduce=False),dim=1)
        else:
            raise ValueError('distance {} is not implemented'.format(self.distance_metric))

        hidden_dim = embedding_dim*4 if hidden_dim is None else hidden_dim
        feature_dim = hidden_dim*2 if feature_dim is None else feature_dim
        self.feature_dim = feature_dim
        self.net = SAGE(feature_dim, hidden_dim, embedding_dim, hidden_layers, F.relu, dropout, agg_type)
        self.net.to(self.device)

        self._embeddings = None 
        self._index = None 
        self._masks_set = False

        self.node_ids = pd.DataFrame(columns=['id','intID','classid','feature_flag'])
        self.G = DGLGraph()

        #hold init args in memory in case needed to save to disk for restoring later
        self.initargs = (embedding_dim,
                        feature_dim,
                        hidden_dim,
                        hidden_layers,
                        dropout,
                        agg_type,
                        distance,
                        torch_device,
                        faiss_gpu,
                        inference_batch_size,
                        p_train,
                        train_faiss_index)


    def add_nodes(self, nodearray, skip_duplicates=False):
        """Define nodes by passing an array (or array like object). Nodes
        can be identified by any data type (even mixed data types), but each
        node must be unique. An exception is raised if the nodes are not all unique,
        including when the same node is added in two separate calls to this
        method. Each node is mapped to a unique integer id based on the order
        they are added.

        Args
        ----
        nodearray : numpy array (or array-like object)
            array containing the identifiers of each node to be added
        skip_duplicates : bool
            if true, ignore nodes which have already been added. If False, raise error.
        """
        
        ninputnodes = len(nodearray)
        nodedf = pd.DataFrame(nodearray, columns=['id'])

        if len(nodedf) != len(nodedf.drop_duplicates()):
            raise ValueError('Provided nodeids are not unique. Please pass an array of unique identifiers.')

        nodes_already_exist = nodedf.merge(self.node_ids,on='id',how='inner')
        if len(nodes_already_exist)>0 and not skip_duplicates:
            raise ValueError(
            'Some provided nodes have already been added to the graph. See node_ids.ids.')
        elif len(nodes_already_exist)>0 and skip_duplicates:
            #get rid of the duplicates
            nodes_already_exist['dropflag'] = True 
            nodedf = nodedf.merge(nodes_already_exist,on='id',how='left')
            nodedf['dropflag'] = ~pd.isna(nodedf.dropflag)
            nodedf = nodedf.drop(nodedf[nodedf.dropflag].index)
            nodedf = nodedf[['id']]
            

        current_maximum_id = self.node_ids.intID.max()
        num_new_nodes = len(nodedf)

        start = (current_maximum_id+1)
        if np.isnan(start):
            start = 0
        end = start + num_new_nodes

        nodedf['intID'] = range(start,end)
        nodedf['classid'] = None 
        nodedf['feature_flag'] = False

        self.node_ids = pd.concat([self.node_ids,nodedf])

        self._masks_set = False

        if self.G.is_readonly:
            self.G = dgl.as_immutable_graph(self.G)
            self.G.readonly(False)
        self.G.add_nodes(num_new_nodes)

        self._masks_set = False
        self._embeddings = None 
        self._index = None       


    def add_edges(self, n1, n2):
        """Adds edges to the DGL graph. Nodes must be previously defined by
        add_nodes or an exception is raised. Edges are directed. To define
        an undirected graph, include both n1->n2 and n2->n1 in the graph.

        Args
        ----
        n1 : numpy array (or array-like object)
            first node in the edge (n1->n2)
        n2 : numpy array (or array-like object)
            second node in the edge (n1->n2)
        """
        edgedf_all = pd.DataFrame(n1,columns=['n1'])
        edgedf_all['n2'] = n2

        chunks = int(max(len(edgedf_all)//MAX_ADD_EDGES,1))
        edgedf_all = np.array_split(edgedf_all, chunks)

        if chunks>1:
            pbar = tqdm.tqdm(total=chunks)

        for i in range(chunks):
            edgedf = edgedf_all.pop()
            edgedf = edgedf.merge(self.node_ids,left_on='n1',right_on='id',how='left')
            edgedf = edgedf.merge(self.node_ids,left_on='n2',right_on='id',how='left',suffixes=('','2'))
            edgedf = edgedf[['intID','intID2']]

            if len(edgedf) != len(edgedf.dropna()):
                raise ValueError('Some edges do not correspond to any known node. Please add with add_nodes method first.')

            if self.G.is_readonly:
                self.G = dgl.as_immutable_graph(self.G)
                self.G.readonly(False)

            self.G.add_edges(edgedf.intID,edgedf.intID2)

            if chunks>1:
                pbar.update(1)

        if chunks>1:
            pbar.close()

        self._masks_set = False
        self._embeddings = None 
        self._index = None     

    def _update_node_ids(self,datadf):
        """Overwrites existing information about nodes with new info
        contained in a dataframe. Temporarily sets id as the index to use
        built in pandas update method aligned on index.

        Args
        ----
        datadf : data frame
            has the same structure as self.node_ids
        """

        datadf.set_index('id',inplace=True,drop=True)
        self.node_ids.set_index('id',inplace=True,drop=True)
        self.node_ids.update(datadf, overwrite=True)
        self.node_ids.reset_index(inplace=True)

    def update_labels(self,labels):
        
        """Updates nodes by adding a label (or class). Existing class label
        is overridden if one already exists. Any node which does not have a 
        known class has a label of None. Any data type can be a valid class 
        label except for None which is reserved for unknown class. All nodes
        included in the update must be previously defined by add_nodes or
        an exception is raised.

        Args
        ----
        labels : dictionary or pandas series
            maps node ids to label, i.e. classid. If pandas series the index acts as the dictionary key."""

        labeldf = pd.DataFrame(labels.items(), columns=['id','classid'])
        labeldf = labeldf.merge(self.node_ids,on='id',how='left',suffixes=('','2'))

        if labeldf['intID'].isna().sum() > 0:
            raise ValueError('Some nodes in update do not exist in graph. Add them first with add_nodes.')

        labeldf = labeldf[['id','intID','classid','feature_flag']]
        self._update_node_ids(labeldf)

        self._masks_set = False
        self._embeddings = None 
        self._index = None     

    def update_feature_flag(self,flags):
        """Updates node by adding a feature flag. This can be True or False.
        If the feature flag is True, the node will not be included in any 
        recommender index. It will still be included in the graph to enrich
        the embeddings of the other nodes, but it will never be returned as
        a recommendation as a similar node. I.e. if True, this node is a feature
        of other nodes only and not interesting as an entity in its own right.

        Args
        ----
        flags : dictionary or pandas series
            maps node ids to feature flag. If pandas series the index acts as the dictionary key."""

        featuredf = pd.DataFrame(flags.items(), columns=['id','feature_flag'])
        featuredf = featuredf.merge(self.node_ids,on='id',how='left',suffixes=('','2'))

        if featuredf['intID'].isna().sum() > 0:
            raise ValueError('Some nodes in update do not exist in graph. Add them first with add_nodes.')

        featuredf = featuredf[['id','intID','classid','feature_flag']]
        self._update_node_ids(featuredf)

        self._masks_set = False
        self._embeddings = None 
        self._index = None     

    def set_masks(self):
        """Sets train, test, and relevance masks. Needs to be called once after data as been added to graph.
        self.train and self.evaluate automatically check if this needs to be called and will call it, but
        it can also be called manually. Can be called a second time manually to reroll the random generation
        of the train and test sets."""

        self.node_ids = self.node_ids.sort_values('intID')
        self.labels = self.node_ids.classid.to_numpy()

        #is relevant mask indicates the nodes which we know the class of
        self.is_relevant_mask = np.logical_not(pd.isna(self.node_ids.classid).to_numpy())

        #entity_mask indicates the nodes which we want to include in the faiss index
        self.entity_mask = np.logical_not(self.node_ids.feature_flag.to_numpy().astype(bool))

        self.train_mask =  np.random.choice(
        a=[False,True],size=(len(self.node_ids)),p=[1-self.p_train,self.p_train])

        #test set is all nodes other than the train set unless train set is all
        #nodes and then test set is the same as train set.
        if self.p_train != 1:
            self.test_mask = np.logical_not(self.train_mask)
        else:
            self.test_mask = self.train_mask

        #do not include any node without a classid in either set
        self.train_mask = np.logical_and(self.train_mask,self.is_relevant_mask)
        self.train_mask = np.logical_and(self.train_mask,self.entity_mask)
        self.test_mask = np.logical_and(self.test_mask,self.is_relevant_mask)
        self.test_mask = np.logical_and(self.test_mask,self.entity_mask)

        if not self.G.is_readonly:
            self.embed = nn.Embedding(len(self.node_ids),self.feature_dim)
            self.G.readonly()
            self.G = dgl.as_heterograph(self.G)
            self.G.ndata['features'] = self.embed.weight

        self.features = self.embed.weight
        self.features.to(self.device)
        self.embed.to(self.device)

        self._masks_set = True

    @property
    def embeddings(self):
        """Updates all node embeddings if needed and returns the embeddings.
        Simple implementation of a cached property.

        Returns
        -------
        embeddings node x embedding_dim tensor"""

        if self._embeddings is None:
            if not self._masks_set:
                self.set_masks()
            print('computing embeddings for all nodes...')
            with th.no_grad():
                self._embeddings = self.net.inference(
                    self.G, self.features,self.inference_batch_size,self.device).detach().cpu().numpy()
        return self._embeddings

    @property
    def index(self):
        """Creates a faiss index for similarity searches over the node embeddings.
        Simple implementation of a cached property.

        Returns
        -------
        a faiss index with input embeddings added and optionally trained"""

        if self._index is None:
            if not self._masks_set:
                self.set_masks()
            if self.distance_metric=='cosine':
                self._index  = faiss.IndexFlatIP(self.embedding_dim)
                embeddings = np.copy(self.embeddings[self.entity_mask])
                #this function operates in place so np.copy any views into a new array before using.
                faiss.normalize_L2(embeddings)
            elif self.distance_metric=='l2':
                self._index = faiss.IndexFlatL2(self.embedding_dim)
                embeddings = self.embeddings[self.entity_mask]
            
            if self.train_faiss:
                training_points = min(
                    len(self.node_ids)//FAISS_NODES_TO_CLUSTERS+1,
                    MAXIMUM_FAISS_CLUSTERS)
                self._index = faiss.IndexIVFFlat(self._index, self.embedding_dim, training_points)
                self._index.train(embeddings)

            self._index.add(embeddings)

            if self.faiss_gpu:
                GPU = faiss.StandardGpuResources()
                self._index = faiss.index_cpu_to_gpu(GPU, 0, self._index)


        return self._index

    def _search_index(self,inputs,k):
        """Directly searches the faiss index and 
        returns the k nearest neighbors of inputs

        Args
        ----
        inputs : numpy array np.float
            the vectors to search against the faiss index
        k : int
            how many neighbors to lookup

        Returns
        -------
        D, I distance numpy array and neighbors array from faiss"""

        if self.distance_metric == 'cosine':
            inputs = np.copy(inputs)
            faiss.normalize_L2(inputs)
        D, I = self.index.search(inputs,k)
        return D,I

    def _get_intID(self,nodelist):
        """Accepts a list of nodeids and converts them to internally used
        sequential integer id. 

        Args
        ----
        nodelist : List
            node identifiers to convert
            
        Returns
        -------
        list of integer identifiers"""

        relevant_nodes = self.node_ids.loc[self.node_ids.id.isin(nodelist)]
        try:
            intids = [relevant_nodes.loc[relevant_nodes.id == node].intID.iloc[0]
                        for node in nodelist]
        except IndexError:
            intids = [relevant_nodes.loc[relevant_nodes.id == int(node)].intID.iloc[0]
                        for node in nodelist]

        return intids

    def get_embeddings(self,nodelist):
        """Looks up the embedding for a specific list of nodes based on
        their nodeid.

        Args
        ----
        nodelist : List
            list of node identifiers to get the embedding of

        Returns
        -------
        numpy array of final embeddings"""

        intids = self._get_intID(nodelist)
        return self.embeddings[intids,:]

    def _faiss_ids_to_nodeids(self, I, return_labels):
        """Takes an output from faiss index and maps the faissids back to nodeids
        and optionally node class labels

        Args
        ----
        I : numpy array
            array returned from a faiss index search
        return_labels : bool
            whether to lookup labels

        Returns
        -------
        I : array with ids mapped to nodeids
        L : optionally second array with ids mapped to node class labels,
            if return_labels is false, is None"""

        faissid_to_nodeid = self.node_ids.id.to_numpy()[self.entity_mask].tolist()
        if return_labels:
            faissid_to_label = self.node_ids.classid.to_numpy()[self.entity_mask].tolist()
            L = [[faissid_to_label[neighbor] for neighbor in neighbors] for neighbors in I]
            I = [[faissid_to_nodeid[neighbor] for neighbor in neighbors] for neighbors in I]
        else:
            I = [[faissid_to_nodeid[neighbor] for neighbor in neighbors] for neighbors in I]
            L = None
        return I, L


    def query_neighbors(self, nodelist, k, return_labels=False):
        """For each query node in nodelist, return the k closest neighbors in the 
        embedding space.

        Args
        ----
        nodelist : list
            list of node identifiers to query
        k : int
            number of neighbors to return
        return_labels : bool
            if true, includes the node label of all neighbors returned

        Returns
        -------
        dictionary of neighbors for each querynode and corresponding distance"""

        if not self._masks_set:
            self.set_masks()

        inputs = self.get_embeddings(nodelist)

        D, I = self._search_index(inputs,k)
        I,L = self._faiss_ids_to_nodeids(I,return_labels)
        if return_labels:
            output = {node:{'neighbors':i,'neighbor labels':l,'distances':d.tolist()} for node, d, i, l in zip(nodelist,D,I,L)}
        else:
            output = {node:{'neighbors':i,'distances':d.tolist()} for node, d, i in zip(nodelist,D,I)}
        return output

    def evaluate(self, test_levels=[5,1], test_only=False):
        """Evaluates performance of current embeddings

        Args
        ----
        test_only : bool
            whether to only test the performance on the test set. If 
            false, all nodes with known class will be tested.
        test_levels : list of ints
            each entry is a number of nearest neighbors and we will test
            if at least one of the neighbors at each level contains a correct
            neighbor based on node labels. We also test the 
            total share of the neighbors that have a correct label.

        Returns
        -------
        dictionary containing details of the performance of the model at each level
        """

        self.net.eval()

        if not self._masks_set:
            self.set_masks()

        mask = self.test_mask if test_only else self.is_relevant_mask
        test_labels = self.labels[mask]
        faiss_labels = self.labels[self.entity_mask]

        test_embeddings = self.embeddings[mask]

        #we need to return the maximum number of neighbors that we want to test
        #plus 1 since the top neighbor of each node will always be itself, which
        #we exclude.
        _, I = self._search_index(test_embeddings,max(test_levels)+1)

        performance = {level:[] for level in test_levels}
        performance_share = {level:[] for level in test_levels}
        for node, neighbors in enumerate(I):
            label = test_labels[node]
            neighbor_labels = [faiss_labels[n] for n in neighbors[1:]]
            for level in test_levels:
                correct_labels = np.sum([label==nl for nl in neighbor_labels[:level]])
                #at least one label in the neighbors was correct
                performance[level].append(correct_labels>0)
                #share of labels in the neighbors that was correct
                performance_share[level].append(correct_labels/level)

        return {f'Top {level} neighbors':
                {'Share >=1 correct neighbor':np.mean(performance[level]),
                'Share of correct neighbors':np.mean(performance_share[level])}
            for level in test_levels}

    @staticmethod
    def setup_pairwise_loss_tensors(labelsnp):
        """Accepts a list of labels and sets up indexers which can be used
        in a triplet loss function along with whether each pair is a positive or
        negative example.

        Args
        ----
        labelsnp : numpy array 
            Class labels of each node, labelsnp[i] = class of node with intid i

        Returns
        -------
        idx1 : indexer array for left side comparison
        idx2 : indexer array for right side comparison
        target : array indicating whether left and right side are the same or different"""

        idx1 = []
        idx2 = []
        target = []
        for i,l in enumerate(labelsnp):
            ids = list(range(len(labelsnp)))
            for j,other in zip(ids[i+1:],labelsnp[i+1:]):
                if other==l:
                    idx1.append(i)
                    idx2.append(j)
                    target.append(1)
                else:
                    idx1.append(i)
                    idx2.append(j)
                    target.append(-1)

        return idx1, idx2, target

    def triplet_loss(self,embeddings,labels):
        """For a given tensor of embeddings and corresponding labels, 
        returns a triplet loss maximizing distance between negative examples
        and minimizing distance between positive examples

        Args
        ----
        embeddings : pytorch tensor torch.float32
            embeddings to be trained
        labels : numpy array
            Class labels of each node in the batch, labels[i] = class of node i"""
        
        batch_relevant_nodes = [i for i,l in enumerate(labels) if not pd.isna(l)]
        embeddings = embeddings[batch_relevant_nodes]
        labels = labels[batch_relevant_nodes]
        idx1,idx2,target = self.setup_pairwise_loss_tensors(labels)


        losstarget = th.tensor(target).to(self.device)

        if self.distance_metric=='cosine':
            input1 = embeddings[idx1]
            input2 = embeddings[idx2]
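            # F.cosine_embedding_loss with target 1 penalizes 1 - cos(x1, x2);
            # with target -1 it penalizes max(0, cos(x1, x2) - margin), so positive
            # pairs are pulled together and negative pairs pushed below cosine 0.5.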
            loss = F.cosine_embedding_loss(input1,
                                            input2,
                                            losstarget,
                                            margin=0.5)
        elif self.distance_metric=='l2':
            idx1_pos = [idx for i,idx in enumerate(idx1) if target[i]==1]
            idx1_neg = [idx for i,idx in enumerate(idx1) if target[i]==-1]

            idx2_pos = [idx for i,idx in enumerate(idx2) if target[i]==1]
            idx2_neg = [idx for i,idx in enumerate(idx2) if target[i]==-1]

            input1_pos = embeddings[idx1_pos]
            input2_pos = embeddings[idx2_pos]

            input1_neg = embeddings[idx1_neg]
            input2_neg = embeddings[idx2_neg]

            loss_pos = F.mse_loss(input1_pos, input2_pos)
            # hinge-style term: push each negative pair at least 0.25 apart in summed squared error
            neg_dists = th.sum(F.mse_loss(input1_neg, input2_neg, reduction='none'), dim=1)
            loss_neg = th.mean(th.max(th.zeros(input1_neg.shape[0]).to(self.device), 0.25 - neg_dists))

            loss = loss_pos + loss_neg

        else:
            raise ValueError('distance {} is not implemented'.format(self.distance_metric))

        return loss 
       

    def train(self,epochs,
                    batch_size,
                    test_every_n_epochs = 1,
                    unsupervised = False,
                    learning_rate = 1e-2,
                    fanouts = [10,25],
                    neg_samples = 1,
                    return_intermediate_embeddings = False,
                    test_levels=[5,1]):
        """Trains the network weights to improve the embeddings. Can train via supervised learning with triplet loss,
        semisupervised learning with triplet loss, or fully unsupervised learning.

        Args
        ----
        epochs : int
            number of training passes over the data
        batch_size : int
            number of seed nodes for building the training graph
        test_every_n_epochs : int
            how often to do a full evaluation of the embeddings, expensive for large graphs
        unsupervised : bool
            whether to train completely unsupervised
        learning_rate : float
            learning rate to use in the adam optimizer
        fanouts : list of int
            number of neighbors to sample at each layer for GraphSage
        neg_samples : int
            number of negative samples to use in unsupervised loss
        return_intermediate_embeddings : bool
            whether to also return the embeddings computed after every epoch
        test_levels : list of ints
            passed to self.evaluate, number of neighbors to use for testing accuracy"""

        if not self._masks_set:
            self.set_masks()

        optimizer = th.optim.Adam(it.chain(self.net.parameters(),self.embed.parameters()), lr=learning_rate)

        if not unsupervised:
            sampler = NeighborSampler(self.G, [int(fanout) for fanout in fanouts])
            sampledata = np.nonzero(self.train_mask)[0]
        else:
            sampler = UnsupervisedNeighborSampler(self.G, [int(fanout) for fanout in fanouts],neg_samples)
            sampledata = list(range(len(self.node_ids)))
            unsup_loss_fn = CrossEntropyLoss()
            unsup_loss_fn.to(self.device)

        dataloader = DataLoader(
                            dataset=sampledata,
                            batch_size=batch_size,
                            collate_fn=sampler.sample_blocks,
                            shuffle=True,
                            drop_last=True,
                            num_workers=0)

        

        
        # initial evaluation before any training
        # NOTE: the reporting below assumes test_levels includes 5 and 1
        perf = self.evaluate(test_levels=test_levels,test_only=True)

        testtop5, testtop1 = perf['Top 5 neighbors']['Share >=1 correct neighbor'], \
                                perf['Top 1 neighbors']['Share >=1 correct neighbor']

        testtop5tot, testtop1tot = perf['Top 5 neighbors']['Share of correct neighbors'], \
                                perf['Top 1 neighbors']['Share of correct neighbors']

        print("Test Top5 {:.4f} | Test Top1 {:.4f} | Test Top5 Total {:.4f} | Test Top1 Total {:.4f} ".format(
                testtop5,testtop1,testtop5tot, testtop1tot))

        loss_history = []
        perf_history = [perf]
        if return_intermediate_embeddings:
            all_embeddings = []
            all_embeddings.append(self.embeddings)

        for epoch in range(1,epochs+1):
            
            for step,data in enumerate(dataloader):
                #sup_blocks, unsupervised_data = data 
                #pos_graph, neg_graph, unsup_blocks = unsupervised_data


                self.net.train()

                # naming note: the "seeds" are the input to neighbor sampling, but they
                # are the output nodes in the sense that we produce their embeddings;
                # the sampled neighbors are the inputs used to compute those embeddings.
                if not unsupervised:
                    sup_blocks = data
                    sup_input_nodes = sup_blocks[0].srcdata[dgl.NID]
                    sup_seeds = sup_blocks[-1].dstdata[dgl.NID]

                    #sup_batch_inputs = self.G.ndata['features'][sup_input_nodes].to(self.device)
                    sup_batch_inputs = self.features[sup_input_nodes].to(self.device)
                    sup_batch_labels = self.labels[sup_seeds]
                    #nodeids = [self.node_ids.loc[self.node_ids.intID==i].id.iloc[0] for i in sup_seeds]

                    #print(sup_batch_labels,nodeids)

                    sup_embeddings = self.net(sup_blocks, sup_batch_inputs)



                    loss = self.triplet_loss(sup_embeddings,sup_batch_labels)
                else:
                    pos_graph, neg_graph, unsup_blocks = data
                    unsup_input_nodes = unsup_blocks[0].srcdata[dgl.NID]
                    unsup_seeds = unsup_blocks[-1].dstdata[dgl.NID]

                    unsup_batch_inputs = self.G.ndata['features'][unsup_input_nodes].to(self.device)

                    unsup_embeddings = self.net(unsup_blocks, unsup_batch_inputs)
                    loss = unsup_loss_fn(unsup_embeddings, pos_graph, neg_graph)
                
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                #once the parameters change we no longer know the new embeddings for all nodes
                self._embeddings = None 
                self._index = None
                

                print("Epoch {:05d} | Step {:0.1f} | Loss {:.8f}".format(
                        epoch, step, loss.item()))
            if return_intermediate_embeddings:
                all_embeddings.append(self.embeddings)
            loss_history.append(loss.item())
            if epoch % test_every_n_epochs == 0 or epoch==epochs:

                perf = self.evaluate(test_levels=test_levels,test_only=True)

                testtop5, testtop1 = perf['Top 5 neighbors']['Share >=1 correct neighbor'], \
                                        perf['Top 1 neighbors']['Share >=1 correct neighbor']

                testtop5tot, testtop1tot = perf['Top 5 neighbors']['Share of correct neighbors'], \
                                        perf['Top 1 neighbors']['Share of correct neighbors']

                print("Epoch {:05d} | Loss {:.8f} | Test Top5 {:.4f} | Test Top1 {:.4f} | Test Top5 Total {:.4f} | Test Top1 Total {:.4f} ".format(
                        epoch, loss.item(),testtop5,testtop1,testtop5tot, testtop1tot))

                perf_history.append(perf)

        if return_intermediate_embeddings:
            return loss_history,perf_history,all_embeddings     
        else:
            return loss_history,perf_history

    def start_api(self,*args,**kwargs):
        """Launches a fastapi to query this class in its current state."""
        package_path = os.path.dirname(os.path.abspath(__file__))
        production_path = package_path + '/production_model'
        pathlib.Path(production_path).mkdir(exist_ok=True)
        self.save(production_path)
        os.environ['FASTREC_DEPLOY_PATH'] = production_path
        # this import can't be at the top level, to prevent a circular dependency
        from RecAPI import app
        uvicorn.run(app,*args,**kwargs)


    def save(self, filepath):
        """Save all information neccessary to recover current state of the current instance of
        this object to a folder. Initialization args, graph data, node ids, current trained embedding,
        and current torch paramters are all saved.

        Args
        ----
        filepath : str 
            path on disk to save files"""


        outg = dgl.as_immutable_graph(self.G)
        dgl.data.utils.save_graphs(f'{filepath}/dgl.bin',outg)

        self.node_ids.to_csv(f'{filepath}/node_ids.csv',index=False)

        th.save(self.embed,f'{filepath}/embed.torch')
        th.save(self.net.state_dict(),f'{filepath}/model_weights.torch')
        embeddings = self.embeddings
        np.save(f'{filepath}/final_embed.npy',embeddings,allow_pickle=False)

        with open(f'{filepath}/initargs.pkl','wb') as pklf:
            pickle.dump(self.initargs,pklf)

    def load_graph_data(self,filepath):
        """Restore graph data from disk, but not network parameters
        or trained embeddings. Useful for changing network parameters
        if you don't want to reconstruct the graph.

        Args
        ----
        filepath : str
            path to where you previously saved the GraphRecommender
        """

        self.G,_ = dgl.data.utils.load_graphs(f'{filepath}/dgl.bin')
        self.G = self.G[0]
        self.G.readonly()
        self.G = dgl.as_heterograph(self.G)

        self.node_ids = pd.read_csv(f'{filepath}/node_ids.csv')

        self._masks_set = False
        self._embeddings = None 
        self._index = None 


    @classmethod
    def load(cls, filepath, device=None, faiss_gpu=None):
        """Restore a previous instance of this class from disk.

        Args
        ----
        filepath : str 
            path on disk to load from
        device : str
            optionally override the pytorch device
        faiss_gpu : str
            optionally override whether faiss uses gpu"""

        with open(f'{filepath}/initargs.pkl','rb') as pklf:
            (embedding_dim,
            feature_dim,
            hidden_dim,
            hidden_layers,
            dropout,
            agg_type,
            distance,
            torch_device,
            faiss_gpu_loaded,
            inference_batch_size,
            p_train,
            train_faiss_index) = pickle.load(pklf)

        if device is not None:
            torch_device=device

        if faiss_gpu is not None:
            faiss_gpu_loaded = faiss_gpu

        restored_self = cls(embedding_dim,
                            feature_dim,
                            hidden_dim,
                            hidden_layers,
                            dropout,
                            agg_type,
                            distance,
                            torch_device,
                            faiss_gpu_loaded,
                            inference_batch_size,
                            p_train,
                            train_faiss_index)

        restored_self.G,_ = dgl.data.utils.load_graphs(f'{filepath}/dgl.bin')
        restored_self.G = restored_self.G[0]
        restored_self.G.readonly()
        restored_self.G = dgl.as_heterograph(restored_self.G)

        restored_self.node_ids = pd.read_csv(f'{filepath}/node_ids.csv')

        restored_self.embed = th.load(f'{filepath}/embed.torch',map_location=th.device(torch_device))
        restored_self.net.load_state_dict(th.load(f'{filepath}/model_weights.torch',map_location=th.device(torch_device)))
        embeddings = np.load(f'{filepath}/final_embed.npy',allow_pickle=False)
        restored_self._embeddings = embeddings

        return restored_self
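
# A minimal, hypothetical usage sketch for the class above (assumed to be the
# GraphRecommender referenced in its docstrings); the save directory, device, and
# training hyperparameters are illustrative placeholders, not values from the original code.
if __name__ == '__main__':
    rec = GraphRecommender.load('/tmp/recsys_model', device='cpu', faiss_gpu=False)
    loss_hist, perf_hist = rec.train(epochs=5, batch_size=256, test_every_n_epochs=1)
    print(rec.evaluate(test_levels=[5, 1], test_only=True))
    rec.save('/tmp/recsys_model')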
Пример #25
0
def main(args):
    # load graph data
    data = load_data(args.dataset,
                     bfs_level=args.bfs_level,
                     relabel=args.relabel)
    num_nodes = data.num_nodes
    num_rels = data.num_rels
    num_classes = data.num_classes
    labels = data.labels
    train_idx = data.train_idx
    test_idx = data.test_idx

    # split dataset into train, validate, test
    if args.validation:
        val_idx = train_idx[:len(train_idx) // 5]
        train_idx = train_idx[len(train_idx) // 5:]
    else:
        val_idx = train_idx

    # since the nodes are featureless, the input feature is then the node id.
    feats = torch.arange(num_nodes)

    # edge type and normalization factor
    edge_type = torch.from_numpy(data.edge_type).long()
    edge_norm = torch.from_numpy(data.edge_norm).unsqueeze(1).float()
    labels = torch.from_numpy(labels).view(-1).long()

    # check cuda
    use_cuda = args.gpu >= 0 and torch.cuda.is_available()
    if use_cuda:
        torch.cuda.set_device(args.gpu)
        feats = feats.cuda()
        edge_type = edge_type.cuda()
        edge_norm = edge_norm.cuda()
        labels = labels.cuda()

    # create graph
    g = DGLGraph()
    g.add_nodes(num_nodes)
    g.add_edges_with_type(data.edge_src, data.edge_dst, data.edge_type)
    #tu_forward = sorted(list(zip(data.edge_src, data.edge_dst, data.edge_type)), key=lambda x : (x[1], x[2]))
    #tu_backward = sorted(list(zip(data.edge_dst, data.edge_src,  data.edge_type)), key=lambda x : (x[1], x[2]))
    #def compute_e_to_distict_t(tu):
    #    num_edges = len(tu)
    #    all_node_distinct_types = 0
    #    cur_node = tu[0][1]
    #    type_set = set()
    #    type_set.add(tu[0][2])
    #    for i in range(1, len(tu)):
    #        if tu[i][1] == cur_node:
    #            type_set.add(tu[i][2])
    #        else:
    #            all_node_distinct_types += len(type_set)
    #            cur_node = tu[i][1]
    #            type_set.clear()
    #            type_set.add(tu[i][2])
    #    all_node_distinct_types += len(type_set)
    #    type_set.clear()
    #    #print('\n'.join([str(t) for t in tu]))
    #    print('num_edges:', num_edges, 'node distinct types', all_node_distinct_types)
    #    return num_edges/all_node_distinct_types
    #r_forward = compute_e_to_distict_t(tu_forward)
    #r_backward = compute_e_to_distict_t(tu_backward)
    #print('ratio forward:', r_forward, 'ratio_backward:', r_backward)
    model = EGLRGCNModel(num_nodes,
                         args.hidden_size,
                         num_classes,
                         num_rels,
                         edge_type.size(0),
                         num_bases=args.num_bases,
                         activation=F.relu,
                         dropout=args.dropout)

    if use_cuda:
        model.cuda()

    # optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.l2norm)

    # training loop
    print("start training...")
    forward_time = []
    backward_time = []
    model.train()
    train_labels = labels[train_idx]
    train_idx = list(train_idx)
    for epoch in range(args.num_epochs):
        optimizer.zero_grad()
        t0 = time.time()
        logits = model(g, feats, edge_type, edge_norm)
        tb = time.time()
        train_logits = logits[train_idx]
        ta = time.time()
        loss = F.cross_entropy(train_logits, train_labels)
        t1 = time.time()
        loss.backward()
        optimizer.step()
        torch.cuda.synchronize()
        t2 = time.time()
        if epoch >= 3:
            forward_time.append(t1 - t0)
            backward_time.append(t2 - t1)
            print(
                "Epoch {:05d} | Train Forward Time(s) {:.4f} | Backward Time(s) {:.4f}"
                .format(epoch, forward_time[-1], backward_time[-1]))
        train_acc = torch.sum(logits[train_idx].argmax(
            dim=1) == labels[train_idx]).item() / len(train_idx)
        val_loss = F.cross_entropy(logits[val_idx], labels[val_idx])
        val_acc = torch.sum(logits[val_idx].argmax(
            dim=1) == labels[val_idx]).item() / len(val_idx)
        print(
            "Train Accuracy: {:.4f} | Train Loss: {:.4f} | Validation Accuracy: {:.4f} | Validation loss: {:.4f}"
            .format(train_acc, loss.item(), val_acc, val_loss.item()))
    print('max memory allocated', torch.cuda.max_memory_allocated())

    model.eval()
    logits = model.forward(g, feats, edge_type, edge_norm)
    test_loss = F.cross_entropy(logits[test_idx], labels[test_idx])
    test_acc = torch.sum(logits[test_idx].argmax(
        dim=1) == labels[test_idx]).item() / len(test_idx)
    print("Test Accuracy: {:.4f} | Test loss: {:.4f}".format(
        test_acc, test_loss.item()))
    print()

    print("Mean forward time: {:4f}".format(
        np.mean(forward_time[len(forward_time) // 4:])))
    print("Mean backward time: {:4f}".format(
        np.mean(backward_time[len(backward_time) // 4:])))

    Used_memory = torch.cuda.max_memory_allocated(0) / (1024**3)
    avg_run_time = np.mean(forward_time[len(forward_time) // 4:]) + np.mean(
        backward_time[len(backward_time) // 4:])
    #output we need
    print('^^^{:6f}^^^{:6f}'.format(Used_memory, avg_run_time))
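
# A hypothetical command-line entry point for main(); only the attribute names
# (args.dataset, args.gpu, ...) come from the code above, while the flag spellings
# and defaults below are illustrative guesses.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='R-GCN entity classification')
    parser.add_argument('--dataset', type=str, required=True)
    parser.add_argument('--bfs_level', type=int, default=2)
    parser.add_argument('--relabel', action='store_true')
    parser.add_argument('--validation', action='store_true')
    parser.add_argument('--gpu', type=int, default=-1)
    parser.add_argument('--hidden_size', type=int, default=16)
    parser.add_argument('--num_bases', type=int, default=-1)
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--lr', type=float, default=1e-2)
    parser.add_argument('--l2norm', type=float, default=5e-4)
    parser.add_argument('--num_epochs', type=int, default=50)
    main(parser.parse_args())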
Пример #26
0
def vectorize_qanta(ex, tokenizer, device, istrain, max_seq_length=64):
    bert_model.eval()
    q_id = ex['id']  # question id (the name t_id is reused later as a target node index)
    text = ex['text']
    positive_entity = ex['pos_et']
    negative_entities = ex['neg_ets']
    ## In the QANTA setting, we limit evidence to at most three sentences per edge (for efficient training and evaluation)
    num_edges = 3
    g = DGLGraph()
    question_node_list = list()
    candidate_node_list = list()
    first_sent_tokens = list()
    first_sent_masks = list()
    question_tokens = list()
    question_masks = list()

    input_ids, input_mask = text_tokenize(text, tokenizer, max_seq_length)
    question_tokens.append(input_ids)
    question_masks.append(input_mask)
    node_sub_questions = list()

    for sup_q in ex['q_et']:
        sub_question = sup_q['text']
        input_ids, input_mask = text_tokenize(sub_question, tokenizer,
                                              max_seq_length)
        question_tokens.append(input_ids)
        question_masks.append(input_mask)

        for et in sup_q['entity']:
            topic = et['et']
            node_first_sent = et['first_sent']
            if topic is None:
                continue

            question_node_list.append(topic)
            question_idx = len(question_tokens) - 1
            node_sub_questions.append(question_idx)
            input_ids, input_mask = text_tokenize(node_first_sent, tokenizer,
                                                  max_seq_length)

            first_sent_tokens.append(input_ids)
            first_sent_masks.append(input_mask)

    candidate_node_list.append(normalize(positive_entity['et']))
    input_ids, input_mask = text_tokenize(positive_entity['first_sent'],
                                          tokenizer, max_seq_length)
    first_sent_tokens.append(input_ids)
    first_sent_masks.append(input_mask)
    node_sub_questions.append(0)

    for neg_et in negative_entities:
        candidate_node_list.append(normalize(neg_et['et']))
        input_ids, input_mask = text_tokenize(neg_et['first_sent'], tokenizer,
                                              max_seq_length)

        first_sent_tokens.append(input_ids)
        first_sent_masks.append(input_mask)
        node_sub_questions.append(0)

    num_nodes = len(question_node_list) + len(candidate_node_list)
    g.add_nodes(num_nodes)

    num_questions = len(question_tokens)

    ### combine question and first sentence
    all_tokens = question_tokens + first_sent_tokens
    all_masks = question_masks + first_sent_masks

    all_tensor = torch.LongTensor(all_tokens).to(device)
    all_masks_tensor = torch.LongTensor(all_masks).to(device)
    all_encodings = list()
    num_exs = 50
    for iii in range(int(all_tensor.size(0) / num_exs)):
        encoding, _ = bert_model(
            all_tensor[iii * num_exs:(iii + 1) * num_exs], None,
            all_masks_tensor[iii * num_exs:(iii + 1) * num_exs])
        encoding = encoding.detach().cpu()
        all_encodings.append(encoding)
    if all_tensor.size(0) % num_exs > 0:
        encoding, _ = bert_model(
            all_tensor[int(all_tensor.size(0) / num_exs) * num_exs:], None,
            all_masks_tensor[int(all_tensor.size(0) / num_exs) * num_exs:])
        encoding = encoding.detach().cpu()
        all_encodings.append(encoding)
    all_encodings = torch.cat(all_encodings, dim=0)

    all_masks_tensor = all_masks_tensor.cpu()

    g.ndata['first_sent'] = all_encodings[num_questions:].cpu()
    g.ndata['first_sent_mask'] = all_masks_tensor[num_questions:].cpu().eq(0)

    for i in range(len(question_node_list)):
        sub_q_num = node_sub_questions[i]

        g.nodes[i].data['question'] = all_encodings[sub_q_num].unsqueeze(0)
        g.nodes[i].data['question_mask'] = all_masks_tensor[
            sub_q_num].unsqueeze(0).eq(0)
        g.nodes[i].data['label'] = torch.tensor(-1).unsqueeze(0)

    g.nodes[len(
        question_node_list)].data['question'] = all_encodings[0].unsqueeze(0)
    g.nodes[len(question_node_list)].data['question_mask'] = all_masks_tensor[
        0].unsqueeze(0).eq(0)
    g.nodes[len(question_node_list)].data['label'] = torch.tensor(1).unsqueeze(
        0)
    #### for candidates, we only use the full question sentence
    for i in range(
            len(question_node_list) + 1,
            len(question_node_list) + len(candidate_node_list)):
        g.nodes[i].data['question'] = all_encodings[0].unsqueeze(0)
        g.nodes[i].data['question_mask'] = all_masks_tensor[0].unsqueeze(0).eq(
            0)
        g.nodes[i].data['label'] = torch.tensor(0).unsqueeze(0)

    for k_entity in positive_entity['evidence']:
        normalized_k_entity = normalize(k_entity)
        if normalized_k_entity in question_node_list:
            s_id = question_node_list.index(normalized_k_entity)
            g.add_edge(question_node_list.index(normalized_k_entity),
                       len(question_node_list))
            evidence_tokens = list()
            evidence_masks = list()
            evidence_ids = list()
            all_evidences = positive_entity['evidence'][k_entity]

            for evi_text in all_evidences[:num_edges]:
                input_ids, input_mask = text_tokenize(evi_text, tokenizer,
                                                      max_seq_length)

                evidence_tokens.append(input_ids)
                evidence_masks.append(input_mask)

            evidence_tensor = torch.LongTensor(evidence_tokens)
            evidence_masks_tensor = torch.LongTensor(evidence_masks)

            edge_features = torch.LongTensor(1, num_edges,
                                             max_seq_length).zero_()
            edge_feature_masks = torch.LongTensor(1, num_edges,
                                                  max_seq_length).zero_()
            egde_sent_mask = torch.ByteTensor(1, num_edges).fill_(1)
            edge_features[0, :len(evidence_tokens), :].copy_(evidence_tensor)
            edge_feature_masks[0, :len(evidence_tokens), :].copy_(
                evidence_masks_tensor)
            egde_sent_mask[0, :len(evidence_tokens)].fill_(0)

            g.edges[s_id,
                    len(question_node_list)].data['evidence'] = edge_features
            g.edges[s_id, len(question_node_list
                              )].data['evidence_mask'] = edge_feature_masks
            g.edges[s_id, len(question_node_list
                              )].data['evidence_sent_mask'] = egde_sent_mask

    for neg_et in negative_entities:
        for k_entity in neg_et['evidence']:
            normalized_k_entity = normalize(k_entity)
            if normalized_k_entity in question_node_list:
                s_id = question_node_list.index(normalized_k_entity)
                t_id = len(question_node_list) + candidate_node_list.index(
                    normalize(neg_et['et']))
                g.add_edge(s_id, t_id)

                evidence_tokens = list()
                evidence_masks = list()
                evidence_ids = list()
                all_evidences = neg_et['evidence'][normalized_k_entity]
                for evi_text in all_evidences[:num_edges]:
                    input_ids, input_mask = text_tokenize(
                        evi_text, tokenizer, max_seq_length)
                    evidence_tokens.append(input_ids)
                    evidence_masks.append(input_mask)

                evidence_tensor = torch.LongTensor(evidence_tokens)
                evidence_masks_tensor = torch.LongTensor(evidence_masks)

                edge_features = torch.LongTensor(1, num_edges,
                                                 max_seq_length).zero_()
                edge_feature_masks = torch.LongTensor(1, num_edges,
                                                      max_seq_length).zero_()
                egde_sent_mask = torch.ByteTensor(1, num_edges).fill_(1)
                edge_features[0, :len(evidence_tokens), :].copy_(
                    evidence_tensor)
                edge_feature_masks[0, :len(evidence_tokens), :].copy_(
                    evidence_masks_tensor)
                egde_sent_mask[0, :len(evidence_tokens)].fill_(0)

                g.edges[s_id, t_id].data['evidence'] = edge_features
                g.edges[s_id, t_id].data['evidence_mask'] = edge_feature_masks
                g.edges[s_id, t_id].data['evidence_sent_mask'] = egde_sent_mask

    ### Batch the sentences and get BERT embeddings
    if 'evidence' in g.edata:
        evi = g.edata['evidence'].to(device)
        evi_mask = g.edata['evidence_mask'].to(device)
        batch_size, sent_max_len, word_max_len = evi.size(0), evi.size(
            1), evi.size(2)
        evi = evi.view(batch_size * sent_max_len, word_max_len)
        evi_mask = evi_mask.view(batch_size * sent_max_len, word_max_len)
        all_encodings = list()
        num_exs = 50
        for iii in range(int(evi.size(0) / num_exs)):
            encoding, _ = bert_model(
                evi[iii * num_exs:(iii + 1) * num_exs], None,
                evi_mask[iii * num_exs:(iii + 1) * num_exs])
            encoding = encoding.detach().cpu()
            all_encodings.append(encoding)
        if evi.size(0) % num_exs > 0:
            encoding, _ = bert_model(
                evi[int(evi.size(0) / num_exs) * num_exs:], None,
                evi_mask[int(evi.size(0) / num_exs) * num_exs:])
            encoding = encoding.detach().cpu()
            all_encodings.append(encoding)

        g.edata['evidence'] = torch.cat(all_encodings,
                                        dim=0).view(batch_size, sent_max_len,
                                                    word_max_len, -1)
        g.edata['evidence_mask'] = g.edata['evidence_mask'].eq(0)

    return g
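
# A hypothetical helper (not part of the original code): each call to vectorize_qanta()
# yields one DGLGraph, and several of them can be merged with dgl.batch for batched
# training; `examples` is a placeholder for an iterable of QANTA examples.
def batch_qanta_examples(examples, tokenizer, device, istrain=True):
    import dgl  # local import in case only DGLGraph was imported at module level
    graphs = [vectorize_qanta(ex, tokenizer, device, istrain) for ex in examples]
    # dgl.batch concatenates node/edge features ('question', 'first_sent', 'evidence', ...)
    return dgl.batch(graphs)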
Пример #27
0
class MoleculeEnv(object):
    """MDP environment for generating molecules.

    Parameters
    ----------
    atom_types : list
        E.g. ['C', 'N']
    bond_types : list
        E.g. [Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE,
        Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC]
    """
    def __init__(self, atom_types, bond_types):
        super(MoleculeEnv, self).__init__()

        self.atom_types = atom_types
        self.bond_types = bond_types

        self.atom_type_to_id = dict()
        self.bond_type_to_id = dict()

        for id, a_type in enumerate(atom_types):
            self.atom_type_to_id[a_type] = id

        for id, b_type in enumerate(bond_types):
            self.bond_type_to_id[b_type] = id

    def get_decision_sequence(self, mol, atom_order):
        """Extract a decision sequence with which DGMG can generate the
        molecule with a specified atom order.

        Parameters
        ----------
        mol : Chem.rdchem.Mol
        atom_order : list
            Specifies a mapping between the original atom
            indices and the new atom indices. In particular,
            atom_order[i] is re-labeled as i.

        Returns
        -------
        decisions : list
            decisions[i] is a 2-tuple (i, j)
            - If i = 0, j specifies either the type of the atom to add
              self.atom_types[j] or termination with j = len(self.atom_types)
            - If i = 1, j specifies either the type of the bond to add
              self.bond_types[j] or termination with j = len(self.bond_types)
            - If i = 2, j specifies the destination atom id for the bond to add.
              With the formulation of DGMG, atom j must already exist when this decision is made.
        """
        decisions = []
        old2new = dict()

        for new_id, old_id in enumerate(atom_order):
            atom = mol.GetAtomWithIdx(old_id)
            a_type = atom.GetSymbol()
            decisions.append((0, self.atom_type_to_id[a_type]))
            for bond in atom.GetBonds():
                u = bond.GetBeginAtomIdx()
                v = bond.GetEndAtomIdx()
                if v == old_id:
                    u, v = v, u
                if v in old2new:
                    decisions.append(
                        (1, self.bond_type_to_id[bond.GetBondType()]))
                    decisions.append((2, old2new[v]))
            decisions.append((1, len(self.bond_types)))
            old2new[old_id] = new_id
        decisions.append((0, len(self.atom_types)))
        return decisions

    def reset(self, rdkit_mol=False):
        """Setup for generating a new molecule

        Parameters
        ----------
        rdkit_mol : bool
            Whether to keep a Chem.rdchem.Mol object so
            that we know what molecule is being generated
        """
        self.dgl_graph = DGLGraph()
        # If there are some features for nodes and edges,
        # zero tensors will be set for those of new nodes and edges.
        self.dgl_graph.set_n_initializer(dgl.frame.zero_initializer)
        self.dgl_graph.set_e_initializer(dgl.frame.zero_initializer)

        self.mol = None
        if rdkit_mol:
            # RWMol is a molecule class that is intended to be edited.
            self.mol = Chem.RWMol(Chem.MolFromSmiles(''))

    def num_atoms(self):
        """Get the number of atoms for the current molecule.

        Returns
        -------
        int
        """
        return self.dgl_graph.number_of_nodes()

    def add_atom(self, type):
        """Add an atom of the specified type.

        Parameters
        ----------
        type : int
            Should be in the range of [0, len(self.atom_types) - 1]
        """
        self.dgl_graph.add_nodes(1)
        if self.mol is not None:
            self.mol.AddAtom(Chem.Atom(self.atom_types[type]))

    def add_bond(self, u, v, type, bi_direction=True):
        """Add a bond of the specified type between atom u and v.

        Parameters
        ----------
        u : int
            Index for the first atom
        v : int
            Index for the second atom
        type : int
            Index for the bond type
        bi_direction : bool
            Whether to add edges for both directions in the DGLGraph.
            If not, we will only add the edge (u, v).
        """
        if bi_direction:
            self.dgl_graph.add_edges([u, v], [v, u])
        else:
            self.dgl_graph.add_edge(u, v)

        if self.mol is not None:
            self.mol.AddBond(u, v, self.bond_types[type])

    def get_current_smiles(self):
        """Get the generated molecule in SMILES

        Returns
        -------
        s : str
            SMILES
        """
        assert self.mol is not None, 'Expect a Chem.rdchem.Mol object initialized.'
        s = Chem.MolToSmiles(self.mol)
        return s
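
# A hypothetical round-trip sketch (not part of the original class): replay the decision
# sequence produced by get_decision_sequence() to rebuild the same molecule in a fresh
# environment; `mol` and the atom/bond type lists below are placeholders.
def replay_decisions(env, decisions):
    env.reset(rdkit_mol=True)
    pending_bond_type = None
    for step, choice in decisions:
        if step == 0 and choice < len(env.atom_types):
            env.add_atom(choice)                      # add-atom decision
        elif step == 1 and choice < len(env.bond_types):
            pending_bond_type = choice                # remember bond type until the destination is chosen
        elif step == 2:
            # the bond goes from the newest atom to the previously created atom `choice`
            env.add_bond(env.num_atoms() - 1, choice, pending_bond_type)
    return env.get_current_smiles()

# env = MoleculeEnv(['C', 'O'], [Chem.rdchem.BondType.SINGLE])
# decisions = env.get_decision_sequence(mol, list(range(mol.GetNumAtoms())))
# print(replay_decisions(env, decisions))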
Пример #28
0
def _test_nx_conversion():
    # check conversion between networkx and DGLGraph

    def _check_nx_feature(nxg, nf, ef):
        # check node and edge feature of nxg
        # this is used to check to_networkx
        num_nodes = len(nxg)
        num_edges = nxg.size()
        if num_nodes > 0:
            node_feat = ddict(list)
            for nid, attr in nxg.nodes(data=True):
                assert len(attr) == len(nf)
                for k in nxg.nodes[nid]:
                    node_feat[k].append(F.unsqueeze(attr[k], 0))
            for k in node_feat:
                feat = F.cat(node_feat[k], 0)
                assert F.allclose(feat, nf[k])
        else:
            assert len(nf) == 0
        if num_edges > 0:
            edge_feat = ddict(lambda: [0] * num_edges)
            for u, v, attr in nxg.edges(data=True):
                assert len(attr) == len(ef) + 1  # extra id
                eid = attr['id']
                for k in ef:
                    edge_feat[k][eid] = F.unsqueeze(attr[k], 0)
            for k in edge_feat:
                feat = F.cat(edge_feat[k], 0)
                assert F.allclose(feat, ef[k])
        else:
            assert len(ef) == 0

    n1 = F.randn((5, 3))
    n2 = F.randn((5, 10))
    n3 = F.randn((5, 4))
    e1 = F.randn((4, 5))
    e2 = F.randn((4, 7))
    g = DGLGraph()
    g.add_nodes(5)
    g.add_edges([0, 1, 3, 4], [2, 4, 0, 3])
    g.ndata.update({'n1': n1, 'n2': n2, 'n3': n3})
    g.edata.update({'e1': e1, 'e2': e2})

    # convert to networkx
    nxg = g.to_networkx(node_attrs=['n1', 'n3'], edge_attrs=['e1', 'e2'])
    assert len(nxg) == 5
    assert nxg.size() == 4
    _check_nx_feature(nxg, {'n1': n1, 'n3': n3}, {'e1': e1, 'e2': e2})

    # convert to DGLGraph, nx graph has id in edge feature
    # use id feature to test non-tensor copy
    g = dgl.from_networkx(nxg, node_attrs=['n1'], edge_attrs=['e1', 'id'])
    # check graph size
    assert g.number_of_nodes() == 5
    assert g.number_of_edges() == 4
    # check number of features
    # test with existing dglgraph (so existing features should be cleared)
    assert len(g.ndata) == 1
    assert len(g.edata) == 2
    # check feature values
    assert F.allclose(g.ndata['n1'], n1)
    # with id in nx edge feature, e1 should follow original order
    assert F.allclose(g.edata['e1'], e1)
    assert F.array_equal(F.astype(g.edata['id'], F.int64),
                         F.copy_to(F.arange(0, 4), F.cpu()))

    # test conversion after modifying DGLGraph
    g.edata.pop(
        'id')  # pop id so we don't need to provide id when adding edges
    new_n = F.randn((2, 3))
    new_e = F.randn((3, 5))
    g.add_nodes(2, data={'n1': new_n})
    # add three edges, one is a multi-edge
    g.add_edges([3, 6, 0], [4, 5, 2], data={'e1': new_e})
    n1 = F.cat((n1, new_n), 0)
    e1 = F.cat((e1, new_e), 0)
    # convert to networkx again
    nxg = g.to_networkx(node_attrs=['n1'], edge_attrs=['e1'])
    assert len(nxg) == 7
    assert nxg.size() == 7
    _check_nx_feature(nxg, {'n1': n1}, {'e1': e1})

    # now test convert from networkx without id in edge feature
    # first pop id in edge feature
    for _, _, attr in nxg.edges(data=True):
        attr.pop('id')
    # test with a new graph
    g = dgl.from_networkx(nxg, node_attrs=['n1'], edge_attrs=['e1'])
    # check graph size
    assert g.number_of_nodes() == 7
    assert g.number_of_edges() == 7
    # check number of features
    assert len(g.ndata) == 1
    assert len(g.edata) == 1
    # check feature values
    assert F.allclose(g.ndata['n1'], n1)
    # edge feature order follows nxg.edges()
    edge_feat = []
    for _, _, attr in nxg.edges(data=True):
        edge_feat.append(F.unsqueeze(attr['e1'], 0))
    edge_feat = F.cat(edge_feat, 0)
    assert F.allclose(g.edata['e1'], edge_feat)

    # Test converting from a networkx graph whose nodes are
    # not labeled with consecutive-integers.
    nxg = nx.cycle_graph(5)
    nxg.remove_nodes_from([0, 4])
    for u in nxg.nodes():
        nxg.nodes[u]['h'] = F.tensor([u])
    for u, v, d in nxg.edges(data=True):
        d['h'] = F.tensor([u, v])

    g = dgl.from_networkx(nxg, node_attrs=['h'], edge_attrs=['h'])
    assert g.number_of_nodes() == 3
    assert g.number_of_edges() == 4
    assert g.has_edge_between(0, 1)
    assert g.has_edge_between(1, 2)
    assert F.allclose(g.ndata['h'], F.tensor([[1.], [2.], [3.]]))
    assert F.allclose(g.edata['h'],
                      F.tensor([[1., 2.], [1., 2.], [2., 3.], [2., 3.]]))
Пример #29
0
def mol_to_nearest_neighbor_graph(mol,
                                  coordinates,
                                  neighbor_cutoff,
                                  max_num_neighbors=None,
                                  p_distance=2,
                                  add_self_loop=False,
                                  node_featurizer=None,
                                  edge_featurizer=None,
                                  canonical_atom_order=True,
                                  keep_dists=False,
                                  dist_field='dist',
                                  explicit_hydrogens=False):
    """Convert an RDKit molecule into a nearest neighbor graph and featurize for it.

    Different from the bigraph and the complete graph, the nearest neighbor graph
    may not be symmetric, since i being among the nearest neighbors of j does not
    necessarily imply that j is among the nearest neighbors of i.

    Parameters
    ----------
    mol : rdkit.Chem.rdchem.Mol
        RDKit molecule holder
    coordinates : numpy.ndarray of shape (N, D)
        The coordinates of atoms in the molecule. N for the number of atoms
        and D for the dimensions of the coordinates.
    neighbor_cutoff : float
        If the distance between a pair of nodes is larger than neighbor_cutoff,
        they will not be considered as neighboring nodes.
    max_num_neighbors : int or None.
        If not None, then this specifies the maximum number of neighbors
        allowed for each atom. Default to None.
    p_distance : int
        We compute the distance between neighbors using Minkowski (:math:`l_p`)
        distance. When ``p_distance = 1``, Minkowski distance is equivalent to
        Manhattan distance. When ``p_distance = 2``, Minkowski distance is
        equivalent to the standard Euclidean distance. Default to 2.
    add_self_loop : bool
        Whether to add self loops in DGLGraphs. Default to False.
    node_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for nodes like atoms in a molecule, which can be used to update
        ndata for a DGLGraph. Default to None.
    edge_featurizer : callable, rdkit.Chem.rdchem.Mol -> dict
        Featurization for edges like bonds in a molecule, which can be used to update
        edata for a DGLGraph. Default to None.
    canonical_atom_order : bool
        Whether to use a canonical order of atoms returned by RDKit. Setting it
        to true might change the order of atoms in the graph constructed. Default
        to True.
    keep_dists : bool
        Whether to store the distance between neighboring atoms in ``edata`` of the
        constructed DGLGraphs. Default to False.
    dist_field : str
        Field for storing distance between neighboring atoms in ``edata``. This comes
        into effect only when ``keep_dists=True``. Default to ``'dist'``.
    explicit_hydrogens : bool
        Whether to explicitly represent hydrogens as nodes in the graph. Default to False.

    Returns
    -------
    g : DGLGraph
        Nearest neighbor DGLGraph for the molecule

    Examples
    --------
    >>> from dgllife.utils import mol_to_nearest_neighbor_graph
    >>> from rdkit import Chem
    >>> from rdkit.Chem import AllChem

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25)
    >>> print(g)
    DGLGraph(num_nodes=23, num_edges=6,
             ndata_schemes={}
             edata_schemes={})

    Quite often we will want to use the distance between end atoms of edges, this can be
    achieved with

    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25, keep_dists=True)
    >>> print(g.edata['dist'])
    tensor([[1.2024],
            [1.2024],
            [1.2270],
            [1.2270],
            [1.2259],
            [1.2259]])

    By default, we do not explicitly represent hydrogens as nodes, which can be done as follows.

    >>> mol = Chem.MolFromSmiles('CC1(C(N2C(S1)C(C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C')
    >>> mol = Chem.AddHs(mol)
    >>> AllChem.EmbedMolecule(mol)
    >>> AllChem.MMFFOptimizeMolecule(mol)
    >>> coords = get_mol_3d_coordinates(mol)
    >>> g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25,
    >>>                                   explicit_hydrogens=True)
    >>> print(g)
    DGLGraph(num_nodes=41, num_edges=42,
             ndata_schemes={}
             edata_schemes={})

    See Also
    --------
    get_mol_3d_coordinates
    k_nearest_neighbors
    smiles_to_nearest_neighbor_graph
    """
    if explicit_hydrogens:
        mol = Chem.AddHs(mol)
    else:
        mol = Chem.RemoveHs(mol)

    num_atoms = mol.GetNumAtoms()
    num_coords = coordinates.shape[0]
    assert num_atoms == num_coords, \
        'Expect the number of atoms to match the first dimension of coordinates, ' \
        'got {:d} and {:d}'.format(num_atoms, num_coords)

    if canonical_atom_order:
        new_order = rdmolfiles.CanonicalRankAtoms(mol)
        mol = rdmolops.RenumberAtoms(mol, new_order)

    srcs, dsts, dists = k_nearest_neighbors(
        coordinates=coordinates,
        neighbor_cutoff=neighbor_cutoff,
        max_num_neighbors=max_num_neighbors,
        p_distance=p_distance,
        self_loops=add_self_loop)
    g = DGLGraph()

    # Add nodes first since some nodes may be completely isolated
    g.add_nodes(num_atoms)

    # Add edges
    g.add_edges(srcs, dsts)

    if node_featurizer is not None:
        g.ndata.update(node_featurizer(mol))

    if edge_featurizer is not None:
        g.edata.update(edge_featurizer(mol))

    if keep_dists:
        assert dist_field not in g.edata, \
            'Expect {} to be reserved for distance between neighboring atoms.'.format(dist_field)
        g.edata[dist_field] = torch.tensor(dists).float().reshape(-1, 1)

    return g
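
# A hypothetical toy featurizer (not from dgllife): node_featurizer can be any callable
# mapping an RDKit Mol to a dict of per-atom tensors used to update g.ndata.
def atomic_number_featurizer(mol):
    nums = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    return {'atomic_number': torch.tensor(nums).float().reshape(-1, 1)}

# g = mol_to_nearest_neighbor_graph(mol, coords, neighbor_cutoff=1.25,
#                                   node_featurizer=atomic_number_featurizer)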
Пример #30
0
def vectorize_qanta(ex, model, istrain, max_seq_length=128):
    q_id = ex['id']
    text = ex['text']
    positive_entity = ex['pos_et']
    negative_entities = ex['neg_ets']
    ### Maximum 3 sentences per edge
    num_edges = 3

    g = DGLGraph()

    question_node_list = list()
    candidate_node_list = list()
    first_sent_tokens = list()
    first_sent_masks = list()
    question_tokens = list()
    question_masks = list()

    input_ids, input_mask = text_tokenize(text, model.word_dict, max_seq_length)
    question_tokens.append(input_ids)
    question_masks.append(input_mask)
    node_sub_questions = list()
    

    for sup_q in ex['q_et']:
        
        sub_question = sup_q['text']
        input_ids, input_mask = text_tokenize(word_tokenize(sub_question), model.word_dict, max_seq_length)
        question_tokens.append(input_ids)
        question_masks.append(input_mask)

        for et in sup_q['entity']:
            topic = et['et']
            node_first_sent = et['first_sent']
            if topic is None:
                continue

            question_node_list.append(topic)
            question_idx = len(question_tokens) - 1
            node_sub_questions.append(question_idx)
            input_ids, input_mask = text_tokenize(node_first_sent, model.word_dict, max_seq_length)
            
            first_sent_tokens.append(input_ids)
            first_sent_masks.append(input_mask)


    candidate_node_list.append(normalize(positive_entity['et']))
    input_ids, input_mask = text_tokenize(positive_entity['first_sent'], model.word_dict, max_seq_length)
    first_sent_tokens.append(input_ids)
    first_sent_masks.append(input_mask)
    node_sub_questions.append(0)
    for neg_et in negative_entities:
        input_ids, input_mask = text_tokenize(neg_et['first_sent'], model.word_dict, max_seq_length)
        
        candidate_node_list.append(normalize(neg_et['et']))

        first_sent_tokens.append(input_ids)
        first_sent_masks.append(input_mask)
        node_sub_questions.append(0)

    num_nodes = len(question_node_list) + len(candidate_node_list)
    g.add_nodes(num_nodes)

    num_questions = len(question_tokens)

    ### combine question and first sentence
    all_tokens = question_tokens + first_sent_tokens
    all_masks = question_masks + first_sent_masks

    all_tensor = torch.LongTensor(all_tokens)
    all_masks_tensor = torch.LongTensor(all_masks)

    #### add node features
    g.ndata['first_sent'] = all_tensor[num_questions:].cpu()
    g.ndata['first_sent_mask'] = all_masks_tensor[num_questions:].eq(0)
  

    for i in range(len(question_node_list)):
        sub_q_num = node_sub_questions[i]
        
        g.nodes[i].data['question'] = all_tensor[sub_q_num].unsqueeze(0)
        g.nodes[i].data['question_mask'] = all_masks_tensor[sub_q_num].unsqueeze(0).eq(0)
        g.nodes[i].data['label'] = torch.tensor(-1).unsqueeze(0)
        
    g.nodes[len(question_node_list)].data['question'] = all_tensor[0].unsqueeze(0)
    g.nodes[len(question_node_list)].data['question_mask'] = all_masks_tensor[0].unsqueeze(0).eq(0)
    g.nodes[len(question_node_list)].data['label'] = torch.tensor(1).unsqueeze(0)
    #### for candidates, we only use the full question sentence
    for i in range(len(question_node_list) + 1, len(question_node_list) + len(candidate_node_list)):
        g.nodes[i].data['question'] = all_tensor[0].unsqueeze(0)
        g.nodes[i].data['question_mask'] = all_masks_tensor[0].unsqueeze(0).eq(0)
        g.nodes[i].data['label'] = torch.tensor(0).unsqueeze(0)
        

    #### add positive edges
        
    for k_entity in positive_entity['evidence']:
        normalized_k_entity = normalize(k_entity)
        if normalized_k_entity in question_node_list:
            s_id = question_node_list.index(normalized_k_entity)
            g.add_edge(question_node_list.index(normalized_k_entity), len(question_node_list))
            evidence_tokens = list()
            evidence_masks = list()
            all_evidences = positive_entity['evidence'][k_entity]
            
            for evi_text in all_evidences[:num_edges]:
                input_ids, input_mask = text_tokenize(evi_text, model.word_dict, max_seq_length)
                
                evidence_tokens.append(input_ids)
                evidence_masks.append(input_mask)

            evidence_tensor = torch.LongTensor(evidence_tokens)
            evidence_masks_tensor = torch.LongTensor(evidence_masks)

            edge_features = torch.LongTensor(1, num_edges, max_seq_length).zero_()
            edge_feature_masks = torch.LongTensor(1, num_edges, max_seq_length).zero_()
            egde_sent_mask = torch.ByteTensor(1, num_edges).fill_(1)
            edge_features[0, :len(evidence_tokens), :].copy_(evidence_tensor)
            edge_feature_masks[0, :len(evidence_tokens), :].copy_(evidence_masks_tensor)
            egde_sent_mask[0, :len(evidence_tokens)].fill_(0)
     
            g.edges[s_id, len(question_node_list)].data['evidence'] = edge_features
            g.edges[s_id, len(question_node_list)].data['evidence_mask'] = edge_feature_masks.eq(0)
            g.edges[s_id, len(question_node_list)].data['evidence_sent_mask'] = egde_sent_mask


        
    for neg_et in negative_entities:
        #### 
        for k_entity in neg_et['evidence']:
            normalized_k_entity = normalize(k_entity)
            if normalized_k_entity in question_node_list:
                s_id = question_node_list.index(normalized_k_entity)
                t_id = len(question_node_list) + candidate_node_list.index(normalize(neg_et['et']))
                g.add_edge(s_id, t_id)
                    
                evidence_tokens = list()
                evidence_masks = list()
                all_evidences = neg_et['evidence'][normalized_k_entity]
                for evi_text in all_evidences[:num_edges]:
                    input_ids, input_mask = text_tokenize(evi_text, model.word_dict, max_seq_length)
                    evidence_tokens.append(input_ids)
                    evidence_masks.append(input_mask)
                evidence_tensor = torch.LongTensor(evidence_tokens)
                evidence_masks_tensor = torch.LongTensor(evidence_masks)

                edge_features = torch.LongTensor(1, num_edges, max_seq_length).zero_()
                edge_feature_masks = torch.LongTensor(1, num_edges, max_seq_length).zero_()
                egde_sent_mask = torch.ByteTensor(1, num_edges).fill_(1)
                edge_features[0, :len(evidence_tokens), :].copy_(evidence_tensor)
                edge_feature_masks[0, :len(evidence_tokens), :].copy_(evidence_masks_tensor)
                egde_sent_mask[0, :len(evidence_tokens)].fill_(0)
     
                g.edges[s_id, t_id].data['evidence'] = edge_features
                g.edges[s_id, t_id].data['evidence_mask'] = edge_feature_masks.eq(0)
                g.edges[s_id, t_id].data['evidence_sent_mask'] = egde_sent_mask

    return g