Example #1
    def process(self, xd1, xd2, xt1, xt2, y, train_mixed, smile_graph):
        smiles1 = xd1
        target1 = xt1

        smiles2 = xd2
        target2 = xt2

        labels = y

        # convert SMILES to molecular representation using rdkit
        c_size1, features1, edge_index1 = smile_graph[smiles1]
        c_size2, features2, edge_index2 = smile_graph[smiles2]
        # make the graph ready for PyTorch Geometric's GCN algorithms:
        GCNData1 = DATA.Data(
            x=torch.Tensor(features1),
            edge_index=torch.LongTensor(edge_index1).transpose(1, 0),
            y=torch.FloatTensor([labels]))
        GCNData1.target = torch.LongTensor([target1])
        GCNData1.train_mixed = torch.LongTensor([train_mixed])
        GCNData1.__setitem__('c_size', torch.LongTensor([c_size1]))

        GCNData2 = DATA.Data(
            x=torch.Tensor(features2),
            edge_index=torch.LongTensor(edge_index2).transpose(1, 0),
            y=torch.FloatTensor([labels]))
        GCNData2.target = torch.LongTensor([target2])
        GCNData2.train_mixed = torch.LongTensor([train_mixed])
        GCNData2.__setitem__('c_size', torch.LongTensor([c_size2]))

        return GCNData1, GCNData2
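
A recurring pattern: this `process` method (and several later ones) looks up a precomputed `smile_graph` dict mapping each SMILES string to a `(c_size, features, edge_index)` tuple. A minimal sketch of how such an entry might be built with RDKit and networkx; the one-value-per-atom featurizer is a placeholder, real projects such as GraphDTA use a much richer `atom_features` helper:

import networkx as nx
from rdkit import Chem

def smile_to_graph(smile):
    mol = Chem.MolFromSmiles(smile)
    c_size = mol.GetNumAtoms()
    # placeholder featurizer: one number (the atomic number) per atom
    features = [[atom.GetAtomicNum()] for atom in mol.GetAtoms()]
    edges = [(b.GetBeginAtomIdx(), b.GetEndAtomIdx()) for b in mol.GetBonds()]
    g = nx.Graph(edges).to_directed()  # store both edge directions, as GCNs expect
    edge_index = [[e1, e2] for e1, e2 in g.edges]
    return c_size, features, edge_index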
Example #2
    def process(self, xd, target_key, y, smile_graph, target_graph):
        assert (len(xd) == len(target_key) and len(xd)
                == len(y)), 'The three lists must be the same length!'
        data_list_mol = []
        data_list_pro = []
        data_len = len(xd)
        for i in range(data_len):
            smiles = xd[i]
            tar_key = target_key[i]
            labels = y[i]
            # convert SMILES to molecular representation using rdkit
            c_size, features, edge_index = smile_graph[smiles]
            target_size, target_features, target_edge_index = target_graph[
                tar_key]
            # print(np.array(features).shape, np.array(edge_index).shape)
            # print(target_features.shape, target_edge_index.shape)
            # make the graph ready for PyTorch Geometric's GCN algorithms:
            GCNData_mol = DATA.Data(
                x=torch.Tensor(features),
                edge_index=torch.LongTensor(edge_index).transpose(1, 0),
                y=torch.FloatTensor([labels]))
            GCNData_mol.__setitem__('c_size', torch.LongTensor([c_size]))

            GCNData_pro = DATA.Data(
                x=torch.Tensor(target_features),
                edge_index=torch.LongTensor(target_edge_index).transpose(1, 0),
                y=torch.FloatTensor([labels]))
            GCNData_pro.__setitem__('target_size',
                                    torch.LongTensor([target_size]))
            # print(GCNData.target.size(), GCNData.target_edge_index.size(), GCNData.target_x.size())
            data_list_mol.append(GCNData_mol)
            data_list_pro.append(GCNData_pro)

        if self.pre_filter is not None:
            data_list_mol = [
                data for data in data_list_mol if self.pre_filter(data)
            ]
            data_list_pro = [
                data for data in data_list_pro if self.pre_filter(data)
            ]
        if self.pre_transform is not None:
            data_list_mol = [
                self.pre_transform(data) for data in data_list_mol
            ]
            data_list_pro = [
                self.pre_transform(data) for data in data_list_pro
            ]
        self.data_mol = data_list_mol
        self.data_pro = data_list_pro
Example #3
    def _build_compound_graph_data(self, atoms, edges, target=None):
        atoms = np.reshape(atoms, (-1, 1))  #m x 1
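        # clamp out-of-vocabulary atom ids to the last index (presumably an unknown-atom bucket)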
        atoms[atoms >= self.atom_vocab_size] = self.atom_vocab_size - 1

        edges = np.array(edges)
        if edges.shape[0] != 0:
            edges = np.transpose(edges)
        edges = torch.LongTensor(edges)

        if target is not None:
            return gDATA.Data(x=torch.LongTensor(atoms),
                              edge_index=edges,
                              y=torch.LongTensor([target]))

        return gDATA.Data(x=torch.LongTensor(atoms), edge_index=edges)
Example #4
    def use(self, smiles: list, model_filename=None) -> list:

        # Figure out what to use
        if self._model is None and model_filename is None:
            raise RuntimeError(
                'Model not previously built, or model not supplied')
        if model_filename is not None:
            self._model = torch.load(model_filename)
            self._model.eval()

        # Prepare data
        data = []
        for idx, smi in enumerate(smiles):
            a, b = self._ce.encode(smi)
            data.append(
                gdata.Data(x=a,
                           edge_index=self._ce.connectivity(smi),
                           edge_attr=b).to(self._config['device']))
        loader_test = gdata.DataLoader(data, batch_size=1, shuffle=False)

        # Get results
        results = []
        for batch in loader_test:
            _, res = self._model(batch)
            results.append(res.detach().numpy()[0])
        return results
Example #5
    def process(self, xd, xt, y, smile_graph):
        assert (len(xd) == len(xt) and len(xt) == len(y)), "The three lists must be the same length!"
        data_list = []
        data_len = len(xd)
        for i in range(data_len):
            # print('Converting SMILES to graph: {}/{}'.format(i+1, data_len))
            smiles = xd[i]
            target = xt[i]
            labels = y[i]
            # convert SMILES to molecular representation using rdkit
            c_size, features, edge_index = smile_graph[smiles]
            # make the graph ready for PyTorch Geometric's GCN algorithms:
            GCNData = DATA.Data(x=torch.Tensor(features),
                                edge_index=torch.LongTensor(edge_index).transpose(1, 0),
                                y=torch.FloatTensor([labels]))
            GCNData.target = torch.LongTensor([target])
            GCNData.__setitem__('c_size', torch.LongTensor([c_size]))
            # append graph, label and target sequence to data list
            data_list.append(GCNData)

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]
        print('Graph construction done. Saving to file.')
        data, slices = self.collate(data_list)
        # save preprocessed data:
        torch.save((data, slices), self.processed_paths[0])
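
The `(data, slices)` pair saved here follows PyTorch Geometric's `InMemoryDataset` convention: `self.collate` packs the list of graphs into one big `Data` object plus slice indices. A minimal sketch of a dataset class that would reload the saved file; the class name, `root`, and file name are assumptions:

import torch
from torch_geometric.data import InMemoryDataset

class SmilesDataset(InMemoryDataset):
    def __init__(self, root='data', transform=None, pre_transform=None):
        super().__init__(root, transform, pre_transform)
        # reload the tensors that process() collated and saved
        self.data, self.slices = torch.load(self.processed_paths[0])

    @property
    def processed_file_names(self):
        return ['smiles_data.pt']  # hypothetical file name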
Example #6
    def process(self, xd, xt, y, smile_graph):
        assert (len(xd) == len(xt) and len(xt) == len(y))
        data_list = []
        data_len = len(xd)
        for i in range(data_len):
            smiles = xd[i]
            target = xt[i]
            labels = y[i]
            c_size, features, edge_index = smile_graph[smiles]
            GCNData = DATA.Data(
                x=torch.Tensor(features),
                edge_index=torch.LongTensor(edge_index).transpose(1, 0),
                y=torch.FloatTensor([labels]))
            GCNData.target = torch.LongTensor([target])
            GCNData.__setitem__('c_size', torch.LongTensor([c_size]))
            data_list.append(GCNData)

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]
        print('Graph construction done. Saving to file.')
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])
Example #7
 def binarized_data(self):
     probs = F.softmax(self.edge_attr, dim=-1)
     ids = probs.argmax(-1)
     edge_attr = torch.zeros_like(probs)
     edge_attr.scatter_(-1, ids.view(-1, 1), 1)
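     # straight-through estimator: the forward pass uses the hard one-hot, gradients flow through the soft probs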
     edge_attr = edge_attr - probs.detach() + probs
     return gd.Data(x=self.nodes,
                    edge_index=self.edge_index,
                    edge_attr=edge_attr)
Example #8
def prepped_to_tensor(data):

    x, position, depth, depth_count, edge_index = data
    x = th.LongTensor(x)
    position = th.LongTensor(position)
    depth = th.LongTensor(depth)
    edge_index = th.LongTensor(edge_index)
    return th_data.Data(
        x=x, position=position, depth_mask=depth, depth_count=th.LongTensor(depth_count),
        edge_index=edge_index
    )
Example #9
File: gat.py  Project: xduan7/MoReL
    def forward(self, data: pyg_data.Data):

        out = []
        for i in range(self.__edge_attr_dim):
            # New graph that corresponds to the edge attributes
            _mask = data.edge_attr[:, i].bool()
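            # keep only the edges whose i-th attribute is set; the 1-D mask broadcasts over both rows of edge_index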
            _edge_index = torch.masked_select(data.edge_index,
                                              mask=_mask).view(2, -1)
            _data = pyg_data.Data(x=data.x, edge_index=_edge_index)

            out.append(self.__gat_nets[i](_data))

        return torch.cat(tuple(out), dim=1)
Example #10
def get_datasets(args):
    """
    Gets the dataset for hateful Twitter users
    """
    # mask not multiple datasets
    datadir = 'hate_with_refex' if args.use_refex else 'hate'
    feat_data, labels, edges = load_hate(args, datadir)

    dataset = pyg_d.Data(x=feat_data,
                         edge_index=edges,
                         y=labels,
                         batch=feat_data[:, 0])
    return dataset
Example #11
File: utils.py  Project: EliHei2/scPotter
def get_dataloader(graph, X, y, batch_size=1, undirected=True, shuffle=True):
    """
        Converts a graph and a dataset to a dataloader.
        
        Parameters:
        ----------
        graph : igraph object
            The underlying graph to be fed to the graph neural networks.

        X : numpy ndarray
            Input dataset with columns as features and rows as observations.

        y : numpy ndarray
            Class labels.

        batch_size: int, default=1
            The batch size.

        undirected: boolean
            if the input graph is undirected (symmetric adjacency matrix).

        Returns:
        --------
        dataloader : a pytorch-geometric dataloader. All of the graphs will have the same connectivity (given by the input graph),
        but the node features will be the features from X.
    """
    n_obs, n_features = X.shape
    rows, cols = np.where(graph == 1)
    edges      = zip(rows.tolist(), cols.tolist())
    sources    = []
    targets    = []
    for edge in edges:
        sources.append(edge[0])
        targets.append(edge[1])
        if undirected:
            # also store the reverse edge so both directions are present
            sources.append(edge[1])
            targets.append(edge[0])
    edge_index = torch.tensor([sources, targets], dtype=torch.long)

    list_graphs = []
    y = y.tolist()
    # print(y)
    for i in range(n_obs):
        y_tensor = torch.tensor(y[i])
        X_tensor = torch.tensor(X[i,:]).view(X.shape[1], 1).float()
        data     = geo_dt.Data(x=X_tensor, edge_index=edge_index, y=y_tensor)
        list_graphs.append(data.coalesce())

    dataloader = geo_dt.DataLoader(list_graphs, batch_size=batch_size, shuffle=shuffle)
    return dataloader
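
A usage sketch for the loader above, with random placeholder data (the adjacency matrix, features, and labels are illustrative only):

import numpy as np

adj = np.array([[0, 1, 0],
                [1, 0, 1],
                [0, 1, 0]])  # symmetric 3-node path graph
X = np.random.rand(100, 3)   # 100 observations, 3 features
y = np.random.randint(0, 2, size=100)  # binary labels
loader = get_dataloader(adj, X, y, batch_size=8)
for batch in loader:
    print(batch.x.shape, batch.edge_index.shape, batch.y.shape)
    break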
Example #12
def to_torch_geom(adj, features, graph_labels, debug=True):
    graphs = []
    for i in range(len(adj)):  # graphs of a given size
        batch_i = []
        for j in range(adj[i].shape[0]):  # number of graphs of that size
            graph_adj = adj[i][j]  # dense adjacency matrix
            # dense_to_sparse returns (edge_index, edge_attribute)
            graph = data.Data(x=features[i][j],
                              edge_index=dense_to_sparse(graph_adj)[0],
                              y=graph_labels[i][j].unsqueeze(0)).to(device)
            # , pos=node_labels[i][j])
            if not debug:
                batch_i.append(graph)
        if debug:
            batch_i.append(graph)
        graphs.append(batch_i)
    return graphs
Example #13
File: util.py  Project: vijaydwivedi75/SMP
def to_torch_geom(adj, features, node_labels, graph_labels, device, debug):
    graphs = {}
    for key in adj.keys():  # train, val, test
        graphs[key] = []
        for i in range(len(adj[key])):  # Graph of a given size
            batch_i = []
            for j in range(adj[key][i].shape[0]):  # Number of graphs
                graph_adj = adj[key][i][j]
                graph = data.Data(x=features[key][i][j],
                                  edge_index=dense_to_sparse(graph_adj)[0],
                                  y=graph_labels[key][i][j].unsqueeze(0),
                                  pos=node_labels[key][i][j])
                if not debug:
                    batch_i.append(graph)
            if debug:
                batch_i.append(graph)
            graphs[key].append(batch_i)
    return graphs
Example #14
 def deal_with_mat(self):
     """
     将.mat 转化为 [Data]
     :return: DataList: [Data]
     """
     print("dealing with mat...")
     m = loadmat(self.raw_paths[0])
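     # from_scipy_sparse_matrix returns an (edge_index, edge_weight) tuple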
     A = utils.from_scipy_sparse_matrix(m['network'])
     att = torch.from_numpy(m['attributes'].todense().astype(np.float32))
     y = torch.from_numpy(m['labels'].reshape(-1)).to(torch.long)
     # if the minimum of y is not 0, assume the labels are 1-indexed
     if int(torch.min(y)) != 0:
         y -= 1
     dt = tgd.Data(x=att,
                   edge_index=A[0],
                   edge_weight=A[1].to(torch.float32),
                   y=y)
     # print(dt)
     return [dt]
Example #15
 def parse(self, line):
     match = json.loads(line)
     radiant_heroes = [
         int(h) for h in match['radiant_team'].split(',')
     ]
     dire_heroes = [
         int(h) for h in match['dire_team'].split(',')
     ]
     onehot = np.zeros((10, 130))
     for i, h in enumerate(radiant_heroes):
         onehot[i][h] = 1
     for i, h in enumerate(dire_heroes):
         onehot[i + 5][h] = 1
     nodes = torch.tensor(onehot, dtype=torch.float)
     edges = self.fixed_edges
     edge_attrs = self.match_edge_features(
         radiant_heroes, dire_heroes, self.stats)
     label = float(match['radiant_win'])
     return geodata.Data(x=nodes, edge_index=edges, edge_attr=edge_attrs, y=label)
Example #16
def idx2data(provider, idx: int):
    struct = gt.Structure.str2fullstructure(provider[idx])

    edge_index = [[], []]
    edge_attr = []
    nodes = [[1]] * (1 + len(struct))
    for node, ops in enumerate(struct):
        for op, pre in ops:
            edge_index[0].append(pre)
            edge_index[1].append(node + 1)
            edge_attr.append(torch.eye(len(OP_IDX))[OP_IDX[op]])
    
    x = gd.Data(
        x=tensor(nodes, dtype=torch.float),
        edge_index=tensor(edge_index, dtype=torch.long),
        edge_attr=torch.stack(edge_attr),
        y=tensor([idx2acc(provider, idx)])
    )
    return x
Example #17
File: data_utils.py  Project: JAOP1/GO
def loadGeometricDataset(datapath, graphpath, encoder):
    json_graph = get_json(graphpath)
    #Getting data.
    data = LoadDataset(datapath, encoder, False)
    inputs, labels = [element[0] for element in data], [element[1] for element in data]

    #Getting edges.
    edges = json_graph["edges"]
    edges2 = [[v,u] for u,v in edges]
    edges += edges2

    #geometric data.
    edge_index = torch.tensor(edges, dtype=torch.long)
    data_list = []
    for i in range(len(data)):
        x = torch.tensor(inputs[i], dtype=torch.float)
        y = torch.tensor(labels[i], dtype=torch.float)
        data_list.append(geodata.Data(x=x, edge_index=edge_index.t().contiguous(), y=y))
    return data_list
Example #18
def train_model(log=False):
    extract = utils.from_scipy_sparse_matrix(mat)

    G = data.Data(edge_index=extract[0], edge_attr=extract[1], x=x, y=y)
    edge_index = G.edge_index

    num_feat = 5
    num_graph_conv_layers = 2
    graph_conv_embed_sizes = 256
    num_lin_layers = 3
    lin_hidden_sizes = 256
    num_classes = 2

    model = GCN(num_feat, num_graph_conv_layers, graph_conv_embed_sizes,
                num_lin_layers, lin_hidden_sizes, num_classes)
    model.load_state_dict(
        torch.load(load_model_file, map_location=torch.device('cpu')))
    model.eval()

    return model, x, y, edge_index
Example #19
def graph_data(data_helper):

    pmi, edges_matrix, edg_nums = cal_PMI(window_size=15, mode='train')
    index = 0
    data_list = []
    seq_edge_w = torch.zeros((edg_nums, 1))

    for content, label, _ in data_helper.batch_iter(batch_size=1, num_epoch=1):

        vocab_c = data_helper.vocab

        print("content lenth each data", len(vocab_c))

        ###----------this is for the original operating mode  --------#########

        f = feature_etr(vocab_c)

        ##### --------- ---------------  #######
        print('file no', index)
        index += 1

        e, n, _, edg_ar = graphcon(content, label, edges_matrix, pmi)

        ##--------------------------------------------------##
        edges1 = [np.array([edge[0], edge[1]]) for edge in e]
        edge_index = torch.tensor(np.array(edges1).T,
                                  dtype=torch.long)  #.cuda()
        edge_attr = torch.tensor(seq_edge_w[edg_ar],
                                 dtype=torch.float)  #.cuda()

        # print("edge atr", edge_attr.size())
        # edge_index, _ = add_remaining_self_loops (edge_index, edge_attr)
        print("edge index size", edge_index.size())
        ####-------------------------------###

        ft = torch.tensor(f, dtype=torch.float)  #.cuda()
        y = torch.tensor(label, dtype=torch.float)
        data_list.append(
            data.Data(x=ft, edge_index=edge_index, edge_attr=edge_attr, y=y))

    return data_list
Example #20
    def process(self, xd, xt, y, smile_graph):
        assert (len(xd) == len(xt) and len(xt)
                == len(y)), "The three lists must be the same length!"
        data_list = []
        data_len = len(xd)
        for i in range(data_len):
            smiles = xd[i]
            target = xt[i]
            labels = y[i]
            # convert SMILES to molecular representation using rdkit
            c_size, features, edge_index = smile_graph[smiles]
            # make the graph ready for PyTorch Geometric's GCN algorithms:
            GCNData = DATA.Data(
                x=torch.Tensor(features),
                edge_index=torch.LongTensor(edge_index).transpose(1, 0),
                y=torch.FloatTensor([labels]))
            GCNData.target = torch.LongTensor([target])
            GCNData.__setitem__('c_size', torch.LongTensor([c_size]))

            data_list.append(GCNData)
        return data_list
Example #21
def arch2data(arch: Union[str, gt.Structure], acc=None):
    if isinstance(arch, str):
        struct = gt.Structure.str2fullstructure(arch)
    else:
        struct = arch
    edge_index = [[], []]
    edge_attr = []
    nodes = [[1.] for _ in range(1 + len(struct))]
    for idx, ops in enumerate(struct):
        for op, pre in ops:
            edge_index[0].append(pre)
            edge_index[1].append(idx + 1)
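            # one-hot encode the operation: select row OP_IDX[op] of an identity matrix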
            edge_attr.append(torch.eye(len(OP_IDX))[OP_IDX[op]])

    x = gd.Data(x=tensor(nodes),
                edge_index=tensor(edge_index, dtype=torch.long),
                edge_attr=torch.stack(edge_attr))
    if acc is not None:
        x.y = tensor([acc])

    return x
Example #22
    def process(self, groups, xd, xt, y, smile_graph):
        """Customize the process method to fit the task of drug-target affinity prediction.

        Args:
            groups: List of group values, one per sample.
            xd: List of SMILES.
            xt: List of encoded targets (categorical or one-hot).
            y: List of labels.
            smile_graph: Dict mapping each SMILES string to its graph
                representation (c_size, features, edge_index).

        Returns:
            PyTorch-Geometric format processed data.
        """
        assert (len(xd) == len(xt) and len(xt)
                == len(y)), "The three lists must be the same length!"
        data_list = []
        data_len = len(xd)
        for i in range(data_len):
            smiles = xd[i]
            target = xt[i]
            labels = y[i]
            group = groups[i]
            # Convert SMILES to molecular representation using rdkit
            c_size, features, edge_index = smile_graph[smiles]
            # Make the graph ready for PyTorch Geometric's GCN algorithms
            GCNData = DATA.Data(
                g=torch.FloatTensor([group]),
                x=torch.Tensor(features),
                edge_index=torch.LongTensor(edge_index).transpose(1, 0),
                y=torch.FloatTensor([labels]))
            GCNData.target = torch.LongTensor([target])
            GCNData.__setitem__('c_size', torch.LongTensor([c_size]))
            # Append graph, label and target sequence to data list
            data_list.append(GCNData)

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]
        print('Graph construction done. Saving to file.')
        self.data, self.slices = self.collate(data_list)
Example #23
    def use(self,
            smiles: List[str],
            model_filename: str = None) -> List[List[float]]:
        """
        Uses a pre-trained CompoundGCN, either trained in-session or recalled
        from a file, for use on new data

        Args:
            smiles (list[str]): SMILES strings to predict for
            model_filename (str, optional): filename/path of model to load,
                default = None (model trained in-session used)

        Returns:
            list[list[float]]: predicted values of shape [n_samples, n_targets]
        """

        # Figure out what to use
        if self._model is None and model_filename is None:
            raise RuntimeError(
                'Model not previously built, or model not supplied')
        if model_filename is not None:
            self._model = torch.load(model_filename)
            self._model.eval()

        # Prepare data
        data = []
        for idx, smi in enumerate(smiles):
            a, b, c = self._ce.encode(smi)
            data.append(
                gdata.Data(x=a, edge_index=c, edge_attr=b).to(self._device))
        loader_test = gdata.DataLoader(data, batch_size=1, shuffle=False)

        # Get results
        results = []
        for batch in loader_test:
            res, _, _ = self._model(batch)
            results.append(res.detach().numpy().tolist()[0])
        return results
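
A call sketch for the method above; `op` (an instance of the surrounding class) and the model file name are assumptions:

preds = op.use(['CCO', 'c1ccncc1'], model_filename='gcn.pt')
print(len(preds), len(preds[0]))  # n_samples, n_targets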
Example #24
import torch
import torch.nn as nn
import torch.nn.functional as F

from dataset import cora_data, num_features, num_classes
from config import device, lr, weight_decay, hidden_features


# a plain two-layer fully-connected baseline (i.e. the same config with A -> I), trained again for comparison.
class TwoLayersFC(nn.Module):
    def __init__(self):
        super(TwoLayersFC, self).__init__()
        self.fc_1 = nn.Linear(num_features, hidden_features)
        self.fc_2 = nn.Linear(hidden_features, num_classes)
    def forward(self, data):
        x = data.x
        x = F.dropout(x, training=self.training)
        x = F.relu(self.fc_1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc_2(x)
        x = F.relu(x)
        return F.log_softmax(x, dim=1)

if __name__ == '__main__':
    # import torch_geometric
    import torch_geometric.data as gdata
    x = torch.randn(3, num_features)
    edge_index = torch.tensor([
        [0,0,1,1,2],
        [1,2,0,2,1],
    ])
    test_data = gdata.Data(x=x, edge_index=edge_index)
    f = TwoLayersFC()
    y = f(test_data)
Example #25
    def process(self, xd, xt_mut, xt_meth, xt_ge, y, smile_graph):
        assert (len(xd) == len(xt_mut) and len(xt_mut)
                == len(y)) and len(y) == len(xt_meth) and len(xt_meth) == len(
                    xt_ge), "The five lists must be the same length!"
        data_list = []
        data_len = len(xd)
        for i in range(data_len):
            print('Converting SMILES to graph: {}/{}'.format(i + 1, data_len))
            smiles = xd[i]
            target_mut = xt_mut[i]
            target_meth = xt_meth[i]
            target_ge = xt_ge[i]
            labels = y[i]
            # convert SMILES to molecular representation using rdkit
            c_size, features, edge_index = smile_graph[smiles]
            # make the graph ready for PyTorch Geometric's GCN algorithms:
            GCNData = DATA.Data(
                x=torch.Tensor(features),
                edge_index=torch.LongTensor(edge_index).transpose(1, 0),
                y=torch.FloatTensor([labels]))

            # require_grad of cell-line for saliency map
            if self.saliency_map:
                GCNData.target_mut = torch.tensor([target_mut],
                                                  dtype=torch.float,
                                                  requires_grad=True)
                GCNData.target_meth = torch.tensor([target_meth],
                                                   dtype=torch.float,
                                                   requires_grad=True)
                GCNData.target_ge = torch.tensor([target_ge],
                                                 dtype=torch.float,
                                                 requires_grad=True)
            else:
                GCNData.target_mut = torch.FloatTensor([target_mut])
                GCNData.target_meth = torch.FloatTensor([target_meth])
                GCNData.target_ge = torch.FloatTensor([target_ge])

            GCNData.__setitem__('c_size', torch.LongTensor([c_size]))
            # append graph, label and target sequence to data list
            data_list.append(GCNData)


        # for xt_meth (disabled)
        """for i in range(data_len):
            print('Converting SMILES to graph: {}/{}'.format(i + 1, data_len))
            smiles = xd[i]
            target = xt_meth[i]
            labels = y[i]
            # convert SMILES to molecular representation using rdkit
            c_size, features, edge_index = smile_graph[smiles]
            # make the graph ready for PyTorch Geometrics GCN algorithms:
            GCNData = DATA.Data(x=torch.Tensor(features),
                                edge_index=torch.LongTensor(edge_index).transpose(1, 0),
                                y=torch.FloatTensor([labels]))

            # require_grad of cell-line for saliency map
            if self.saliency_map == True:
                GCNData.target = torch.tensor([target], dtype=torch.float, requires_grad=True)
            else:
                GCNData.target = torch.FloatTensor([target])

            GCNData.__setitem__('c_size', torch.LongTensor([c_size]))
            # append graph, label and target sequence to data list
            data_list_meth.append(GCNData)

        #append data_list_mut and data_list_meth together
        for x in data_list_meth:
            data_list.append(x)
"""
        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]

        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]
        print('Graph construction done. Saving to file.')
        data, slices = self.collate(data_list)
        # save preprocessed data:
        torch.save((data, slices), self.processed_paths[0])
Example #26
def wrapper(func, *args, **kwargs):
    """wrapper to measure functions execution time through timeit.

    :param function func: user defined function
    :param args: `*args` of function
    :param kwargs: `**kwargs` of function
    :return: wrapped function with no arguments needed.
    :rtype: wrapped_function

    """
    def wrapped():
        return func(*args, **kwargs)
    return wrapped


if __name__ == '__main__': 

    row = 22
    col = 26
    wrap = wrapper(create_graph, row, col)
    print(timeit.timeit(wrap, number=100000))
    
    indx_out, indx_in = create_graph(row, col)
    coo = torch.tensor([indx_out, indx_in], dtype=torch.long)
    graph = data.Data(edge_index=coo)
    G = utils.to_networkx(graph)
    draw_graph(G)
    print(indx_out, '\n', indx_in)

Example #27
# train-test split edges
genes = torch.arange(len(node_classes))[node_classes == 0]
diseases = torch.arange(len(node_classes))[node_classes == 1]
validation_genes_mask = torch.randint(0,
                                      100,
                                      size=(len(node_classes) -
                                            torch.sum(node_classes).item(), ))
validation_genes = torch.arange(0,
                                len(node_classes) - torch.sum(node_classes),
                                dtype=torch.long)[validation_genes_mask < 20]

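# Data accepts arbitrary keyword attributes; edge_types, feats and
# node_classes below are simply stored on the graph object as-is.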
full_graph = gdata.Data(
    edge_index=edge_index,
    edge_types=edge_types,
    feats=features,
    node_classes=node_classes,
    num_nodes=len(node_classes),
)

# positive train/val
pos_val = torch.logical_or(
    torch.logical_and(
        torch.BoolTensor(np.isin(full_graph.edge_index[0],
                                 validation_genes), ),
        full_graph.edge_types == 1,
    ),
    torch.logical_and(
        torch.BoolTensor(np.isin(full_graph.edge_index[1],
                                 validation_genes), ),
        full_graph.edge_types == 1,
Example #28
    def train(self,
              smiles: list,
              target: list,
              model_filename: str = None,
              model_config: dict = None):
        ''' GraphOperator.train: trains a graph neural network given SMILES
        strings, target values, supplied config (i.e. architecture, hyper-
        parameters)

        Args:
            smiles (list): list of SMILES strings (str)
            target (list): list of target values (1d, float)
            model_filename (str): if not None, saves model to this location
            model_config (dict): configuration dict; if none supplied, default
                is used

        Returns:
            None
        '''

        # Check for inequality in length of input, target data
        if len(smiles) != len(target):
            raise ValueError(
                'Supplied SMILES and targets not the same length: {}, {}'.
                format(len(smiles), len(target)))

        # Prepare data
        self._ce = CompoundEncoder(smiles)
        data = []
        for idx, smi in enumerate(smiles):
            a, b = self._ce.encode(smi)
            data.append(
                gdata.Data(x=a,
                           edge_index=self._ce.connectivity(smi),
                           edge_attr=b,
                           y=torch.tensor(target[idx]).type(torch.float)).to(
                               self._config['device']))

        # Split data into training, validation subsets
        data_train, data_valid = train_test_split(
            data, test_size=self._config['valid_size'])
        loader_train = gdata.DataLoader(data_train,
                                        batch_size=self._config['batch_size'],
                                        shuffle=True)
        loader_valid = gdata.DataLoader(data_valid,
                                        batch_size=self._config['batch_size'],
                                        shuffle=True)

        # Create model
        self._model = MessagePassingNet(self._ce.ATOM_DIM,
                                        len(target[0]),
                                        task=self._config['task'],
                                        config=model_config)
        self._model.construct()
        self._model.to(self._config['device'])
        optimizer = torch.optim.Adam(self._model.parameters(),
                                     lr=self._config['learning_rate'])

        # Setup callbacks
        CBO = CallbackOperator()
        _lrdecay = LRDecayLinear(self._config['learning_rate'],
                                 self._config['lr_decay'], optimizer)
        _validator = Validator(loader_valid, self._model,
                               self._config['valid_epoch_iter'],
                               self._config['valid_patience'])
        CBO.add_cb(_lrdecay)
        CBO.add_cb(_validator)

        # TRAIN BEGIN
        CBO.on_train_begin()

        # Begin training loop
        for epoch in range(self._config['epochs']):

            # EPOCH BEGIN
            if not CBO.on_epoch_begin(epoch):
                break

            train_loss = 0.0
            self._model.train()

            for b_idx, batch in enumerate(loader_train):

                # BATCH BEGIN
                if not CBO.on_batch_begin(b_idx):
                    break

                optimizer.zero_grad()
                embedding, pred = self._model(batch)
                target = batch.y
                if self._config['task'] == 'node':
                    pred = pred[batch.train_mask]
                    target = target[batch.train_mask]

                # BATCH END, LOSS BEGIN
                if not CBO.on_batch_end(b_idx):
                    break
                if not CBO.on_loss_begin(b_idx):
                    break

                loss = self._model.loss(pred, target)
                loss.backward()

                # LOSS END, STEP BEGIN
                if not CBO.on_loss_end(b_idx):
                    break
                if not CBO.on_step_begin(b_idx):
                    break

                optimizer.step()
                train_loss += loss.detach().item() * batch.num_graphs

                # STEP END
                if not CBO.on_step_end(b_idx):
                    break

            train_loss /= len(loader_train.dataset)

            # EPOCH END
            if not CBO.on_epoch_end(epoch):
                break

            if self._config['verbose']:
                print('Epoch: {} | Train Loss: {} | Valid Loss: {}'.format(
                    epoch, train_loss, _validator._best_loss))

        # TRAIN END
        CBO.on_train_end()

        if model_filename is not None:
            torch.save(self._model, model_filename)
Example #29
    def train(self,
              smiles: List[str],
              target: List[List[float]],
              model_config: dict = None,
              valid_size: float = 0.2,
              valid_epoch_iter: int = 1,
              valid_patience: int = 16,
              batch_size: int = 1,
              lr: float = 0.001,
              lr_decay: float = 0.0,
              epochs: int = 128,
              verbose: int = 0,
              random_state: int = None,
              shuffle: bool = False,
              **kwargs) -> Tuple[List[float], List[float]]:
        """
        Trains a CompoundGCN using the supplied SMILES strings and target values

        Args:
            smiles (list[str]): list of SMILES strings, one per compound
            target (list[list[float]]): list of target values, shape
                [n_samples, n_targets], one per compound
            model_config (dict, optional): if not supplied, uses default model
                architecture:
                {
                    'n_messages': 1,
                    'n_hidden': 1,
                    'hidden_dim': 32,
                    'dropout': 0.00
                }
            valid_size (float, optional): proportion of training set used for
                periodic validation, default = 0.2
            valid_epoch_iter (int, optional): validation set performance is
                measured every `this` epochs, default = 1 epochs
            valid_patience (int, optional): if lower validation set loss not
                encountered after `this` many epochs, terminate to avoid
                overfitting, default = 16
            batch_size (int, optional): size of each batch during training,
                default = 1
            lr (float, optional): learning rate for Adam opt, default = 0.001
            lr_decay (float, optional): linear rate of decay of learning rate
                per epoch, default = 0.0
            epochs (int, optional): number of training epochs, default = 128
            verbose (int, optional): training and validation loss printed to
                console every `this` epochs, default = 0 (no printing)
            random_state (int, optional): if not `None`, seeds validation
                subset randomized selection with this value
            shuffle (bool, optional): if True, shuffles training and validation
                subsets between training epochs, default = False
            **kwargs: additional arguments passed to torch.optim.Adam

        Returns:
            tuple[list[float], list[float]]: (training losses, validation
                losses) over all training epochs
        """

        # Check for inequality in length of input, target data
        if len(smiles) != len(target):
            raise ValueError(
                'Supplied SMILES and targets not the same length: {}, {}'.
                format(len(smiles), len(target)))

        # Prepare data
        self._ce = CompoundEncoder(smiles)
        data = []
        for idx, smi in enumerate(smiles):
            a, b, c = self._ce.encode(smi)
            data.append(
                gdata.Data(x=a,
                           edge_index=c,
                           edge_attr=b,
                           y=torch.tensor(
                               target[idx]).type(torch.float).reshape(
                                   1, len(target[idx]))).to(self._device))

        # Split data into training, validation subsets
        data_train, data_valid = train_test_split(data,
                                                  test_size=valid_size,
                                                  random_state=random_state)
        loader_train = gdata.DataLoader(data_train,
                                        batch_size=batch_size,
                                        shuffle=True)
        loader_valid = gdata.DataLoader(data_valid,
                                        batch_size=batch_size,
                                        shuffle=True)

        # Create model
        if model_config is None:
            self._model = CompoundGCN(self._ce.ATOM_DIM, self._ce.BOND_DIM,
                                      len(target[0]))
        else:
            self._model = CompoundGCN(self._ce.ATOM_DIM, self._ce.BOND_DIM,
                                      len(target[0]),
                                      model_config['n_messages'],
                                      model_config['n_hidden'],
                                      model_config['hidden_dim'],
                                      model_config['dropout'])
        self._model.to(self._device)
        optimizer = torch.optim.Adam(self._model.parameters(), lr=lr, **kwargs)

        # Setup callbacks
        CBO = CallbackOperator()
        _lrdecay = LRDecayLinear(lr, lr_decay, optimizer)
        _validator = Validator(loader_valid, self._model, valid_epoch_iter,
                               valid_patience)
        CBO.add_cb(_lrdecay)
        CBO.add_cb(_validator)

        # Record loss for return
        train_losses = []
        valid_losses = []

        # TRAIN BEGIN
        CBO.on_train_begin()

        # Begin training loop
        for epoch in range(epochs):

            # EPOCH BEGIN
            if not CBO.on_epoch_begin(epoch):
                break

            if shuffle:
                data_train, data_valid = train_test_split(
                    data, test_size=valid_size, random_state=random_state)
                loader_train = gdata.DataLoader(data_train,
                                                batch_size=batch_size,
                                                shuffle=True)
                loader_valid = gdata.DataLoader(data_valid,
                                                batch_size=batch_size,
                                                shuffle=True)

            train_loss = 0.0
            self._model.train()

            for b_idx, batch in enumerate(loader_train):

                # BATCH BEGIN
                if not CBO.on_batch_begin(b_idx):
                    break

                optimizer.zero_grad()
                pred, _, _ = self._model(batch)
                target = batch.y

                # BATCH END, LOSS BEGIN
                if not CBO.on_batch_end(b_idx):
                    break
                if not CBO.on_loss_begin(b_idx):
                    break

                loss = self._model.loss(pred, target)
                loss.backward()

                # LOSS END, STEP BEGIN
                if not CBO.on_loss_end(b_idx):
                    break
                if not CBO.on_step_begin(b_idx):
                    break

                optimizer.step()
                train_loss += loss.detach().item() * batch.num_graphs

                # STEP END
                if not CBO.on_step_end(b_idx):
                    break

            train_loss /= len(loader_train.dataset)

            # EPOCH END
            if not CBO.on_epoch_end(epoch):
                break

            if verbose > 0:
                if epoch % verbose == 0:
                    print('Epoch: {} | Train Loss: {} | Valid Loss: {}'.format(
                        epoch, train_loss, _validator._most_recent_loss))

            train_losses.append(train_loss)
            valid_losses.append(_validator._most_recent_loss.detach().item())

        # TRAIN END
        CBO.on_train_end()

        return (train_losses, valid_losses)
Example #30
    def process(self):
        if osp.exists(
                os.path.join(self.processed_dir,
                             'Decagon-{}-multi.pt'.format(self.datatype))):
            return

        data_list = []

        # >>> Obtain One-Hot Encoding for Side-Effects
        json_dict = {
            literal_eval(k): v
            for k, v in self.json_load[self.datatype].items()
        }
        total = len(json_dict)

        for idx, (smiles1, smiles2) in enumerate(json_dict):
            printProgress(idx + 1, total,
                          '{} dataset preparation: '.format(self.datatype),
                          ' ', 2, 50)
            mol1 = MolFromSmiles(smiles1)
            mol2 = MolFromSmiles(smiles2)
            label = np.array(json_dict[(smiles1, smiles2)])
            #print(len(label[label == 1]))
            #print(len(label[label == 0]))
            #print("\n{}-[{},{},{}:{}] : {}".format(mode, smiles1, smiles2, se, target_dict[se], label))

            if mol1 is None or mol2 is None:
                print("There is a missing drug from the pair (%s,%s)" %
                      (smiles1, smiles2))
                continue

            ######################################################################
            # >>> Get pairwise graph G1, G2
            c1_size = mol1.GetNumAtoms()
            c2_size = mol2.GetNumAtoms()

            if c1_size == 0 or c2_size == 0:
                print("There is a size error from pair (%s,%s)" % (mol1, mol2))
                continue

            atoms1 = mol1.GetAtoms()
            atoms2 = mol2.GetAtoms()
            bonds1 = mol1.GetBonds()
            bonds2 = mol2.GetBonds()

            features, edges = [], []

            for atom in atoms1:
                feature = atom_features(atom)
                features.append(feature / sum(feature))  # normalize
            for atom in atoms2:
                feature = atom_features(atom)
                features.append(feature / sum(feature))  # normalize
            for bond in bonds1:
                edges.append([bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()])
            for bond in bonds2:
                edges.append([
                    bond.GetBeginAtomIdx() + c1_size,
                    bond.GetEndAtomIdx() + c1_size
                ])

            if len(edges) == 0:
                continue

            G = nx.Graph(edges).to_directed()
            edge_index = [[e1, e2] for e1, e2 in G.edges]

            GraphSiameseData = DATA.Data(
                x=torch.Tensor(features),
                edge_index=torch.LongTensor(edge_index).transpose(1, 0),
                y=torch.Tensor(label).view(1, -1))
            GraphSiameseData.__setitem__('c1_size',
                                         torch.LongTensor([c1_size]))
            GraphSiameseData.__setitem__('c2_size',
                                         torch.LongTensor([c2_size]))
            data_list.append(GraphSiameseData)
            ###########################################################################

        if self.pre_filter is not None:
            data_list = [data for data in data_list if self.pre_filter(data)]
        if self.pre_transform is not None:
            data_list = [self.pre_transform(data) for data in data_list]

        # check this function
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])