示例#1
0
    def use(self, smiles: list, model_filename=None) -> list:
        """
        Predicts values for new compounds with the held model or a saved one.

        Args:
            smiles (list): SMILES strings to predict for
            model_filename (str, optional): filename/path of a saved model to
                load with `torch.load`; default = None (the model already held
                in `self._model` is used)

        Returns:
            list: one entry per SMILES string, each the first row of the
                model's second output converted to a numpy array

        Raises:
            RuntimeError: if no model is held and no filename is supplied
        """

        # Figure out what to use
        if self._model is None and model_filename is None:
            raise RuntimeError(
                'Model not previously built, or model not supplied')
        if model_filename is not None:
            # NOTE(review): torch.load unpickles the file; only load model
            # files that come from a trusted source.
            self._model = torch.load(model_filename)
        # Predict in eval mode regardless of where the model came from, so
        # train-only layers (e.g. dropout) do not perturb the predictions.
        self._model.eval()

        # Prepare data: one graph per SMILES string, moved to the configured
        # device
        device = self._config['device']
        data = []
        for smi in smiles:
            atom_attr, bond_attr = self._ce.encode(smi)
            data.append(
                gdata.Data(x=atom_attr,
                           edge_index=self._ce.connectivity(smi),
                           edge_attr=bond_attr).to(device))
        loader_test = gdata.DataLoader(data, batch_size=1, shuffle=False)

        # Get results; no gradients are needed for inference
        results = []
        with torch.no_grad():
            for batch in loader_test:
                _, res = self._model(batch)
                # .cpu() is required before .numpy() when the output lives on
                # a non-CPU device
                results.append(res.detach().cpu().numpy()[0])
        return results
示例#2
0
def valid(valid_data, model, batch_size):
    """Run ``model`` over ``valid_data`` and collect predictions and labels.

    Args:
        valid_data: dataset accepted by ``gd.DataLoader``; each batch must
            expose its labels as ``.y``.
        model: ``torch.nn.Module`` called as ``model(batch)``.
        batch_size: number of graphs per batch.

    Returns:
        tuple: ``(pred, gt)`` -- flattened 1-D numpy arrays of the model
        outputs and of the ``.y`` labels, in dataset order.
    """
    loader = gd.DataLoader(valid_data, batch_size, shuffle=False)

    # Feed batches to whatever device the model lives on instead of assuming
    # CUDA; fall back to the historical 'cuda' for parameter-less models.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else 'cuda'

    pred = []
    gt = []
    model.eval()
    # Evaluation only: skip building the autograd graph.
    with torch.no_grad():
        for X in loader:
            # Grab labels on the CPU so the final .numpy() always works.
            gt.append(X.y.detach().cpu())
            X = X.to(device)
            pred.append(model(X).detach().cpu())
    pred = torch.cat(pred, dim=0).view(-1).numpy()
    gt = torch.cat(gt, dim=0).view(-1).numpy()
    return pred, gt
示例#3
0
    def train_node_classifier(model_name, dataset, **model_kwargs):
        """Train (or load) a ``NodeLevelGNN`` and report its accuracies.

        Args:
            model_name: name forwarded to ``NodeLevelGNN``; also used to build
                the log directory and the pretrained checkpoint filename.
            dataset: dataset wrapped in a single-item-batch data loader; must
                provide ``num_node_features`` and ``num_classes``.
            **model_kwargs: extra keyword arguments forwarded to
                ``NodeLevelGNN``.

        Returns:
            tuple: ``(model, result)`` where ``result`` maps ``"train"``,
            ``"val"`` and ``"test"`` to the corresponding accuracies.
        """
        pl.seed_everything(42)
        loader = geom_data.DataLoader(dataset, batch_size=1)

        # Set up the Lightning trainer, keeping the checkpoint with the best
        # validation accuracy.
        log_dir = os.path.join(CHECKPOINT_PATH, "NodeLevel" + model_name)
        os.makedirs(log_dir, exist_ok=True)
        best_val_checkpoint = ModelCheckpoint(save_weights_only=True,
                                              mode="max",
                                              monitor="val_acc")
        trainer = pl.Trainer(
            default_root_dir=log_dir,
            callbacks=[best_val_checkpoint],
            gpus=AVAIL_GPUS,
            max_epochs=200,
            # 0 because epoch size is 1.
            progress_bar_refresh_rate=0,
        )
        # Optional logging argument that we don't need.
        trainer.logger._default_hp_metric = None

        # Train only when no pretrained checkpoint is available.
        ckpt_path = os.path.join(CHECKPOINT_PATH,
                                 "NodeLevel%s.ckpt" % model_name)
        if not os.path.isfile(ckpt_path):
            pl.seed_everything()
            model = NodeLevelGNN(model_name=model_name,
                                 c_in=dataset.num_node_features,
                                 c_out=dataset.num_classes,
                                 **model_kwargs)
            # The same loader serves as training and validation loader.
            trainer.fit(model, loader, loader)
            best_path = trainer.checkpoint_callback.best_model_path
            model = NodeLevelGNN.load_from_checkpoint(best_path)
        else:
            print("Found pretrained model, loading...")
            model = NodeLevelGNN.load_from_checkpoint(ckpt_path)

        # Evaluate the selected model on the test mask, then on the train and
        # validation masks of the first batch.
        test_result = trainer.test(model,
                                   test_dataloaders=loader,
                                   verbose=False)
        batch = next(iter(loader)).to(model.device)
        _, train_acc = model.forward(batch, mode="train")
        _, val_acc = model.forward(batch, mode="val")
        return model, {
            "train": train_acc,
            "val": val_acc,
            "test": test_result[0]["test_acc"],
        }
示例#4
0
def visualize_all_rank(model, valid_data, fn, batch_size):
    """Plot predictions against labels for ``valid_data`` and save the figure.

    Args:
        model: ``torch.nn.Module`` called as ``model(batch)``.
        valid_data: dataset accepted by ``gd.DataLoader``; each batch must
            expose its labels as ``.y``.
        fn: path passed to ``fig.savefig``.
        batch_size: number of graphs per batch.
    """
    loader = gd.DataLoader(valid_data, batch_size, shuffle=False)

    # Feed batches to whatever device the model lives on instead of assuming
    # CUDA; fall back to the historical 'cuda' for parameter-less models.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else 'cuda'

    pred = []
    gt = []
    model.eval()
    # Evaluation only: skip building the autograd graph.
    with torch.no_grad():
        for X in loader:
            # Grab labels on the CPU so the final .numpy() always works.
            gt.append(X.y.detach().cpu())
            X = X.to(device)
            pred.append(model(X).detach().cpu())
    pred = torch.cat(pred, dim=0).view(-1).numpy()
    gt = torch.cat(gt, dim=0).view(-1).numpy()

    # plot_rank builds the figure from the flattened predictions and labels.
    fig = plot_rank(pred, gt)
    fig.savefig(fn)
示例#5
0
文件: utils.py 项目: EliHei2/scPotter
def get_dataloader(graph, X, y, batch_size=1,undirected=True, shuffle=True):
    """
        Converts a graph and a dataset to a dataloader.

        Parameters:
        ----------
        graph : 2-D array
            Adjacency matrix of the underlying graph; every entry equal to 1
            is read as an edge (row index -> column index).

        X : numpy ndarray
            Input dataset with columns as features and rows as observations.
            The number of columns becomes the number of nodes of each graph.

        y : numpy ndarray
            Class labels, one per observation.

        batch_size: int, default=1
            The batch size.

        undirected: boolean
            If True, the reverse of every edge is added as well, so the
            resulting connectivity is symmetric even when only one direction
            is present in `graph`. Duplicates are removed by `coalesce()`.

        shuffle: boolean, default=True
            Whether the dataloader shuffles the observations.

        Returns:
        --------
        dataloader : a pytorch-geometric dataloader. All of the graphs will have the same connectivity (given by the input graph),
        but the node features will be the features from X.
    """
    n_obs, n_features = X.shape

    # Edge list straight from the adjacency matrix.
    rows, cols = np.where(graph == 1)
    if undirected:
        # Append the reversed edges (target -> source); coalesce() below drops
        # the duplicates this creates for an already symmetric matrix.
        rows, cols = np.concatenate([rows, cols]), np.concatenate([cols, rows])
    edge_index = torch.tensor(np.stack([rows, cols]), dtype=torch.long)

    labels = y.tolist()
    list_graphs = []
    for i in range(n_obs):
        y_tensor = torch.tensor(labels[i])
        # One scalar feature per node: observation i as an (n_features, 1) column.
        X_tensor = torch.tensor(X[i,:]).view(n_features, 1).float()
        data     = geo_dt.Data(x=X_tensor, edge_index=edge_index, y=y_tensor)
        list_graphs.append(data.coalesce())

    dataloader = geo_dt.DataLoader(list_graphs, batch_size=batch_size, shuffle=shuffle)
    return dataloader
示例#6
0
    def use(self,
            smiles: List[str],
            model_filename: str = None) -> List[List[float]]:
        """
        Uses a pre-trained CompoundGCN, either trained in-session or recalled
        from a file, for use on new data

        Args:
            smiles (list[str]): SMILES strings to predict for
            model_filename (str, optional): filename/path of model to load,
                default = None (model trained in-session used)

        Returns:
            list[list[float]]: predicted values of shape [n_samples, n_targets]

        Raises:
            RuntimeError: if no model was trained in-session and no filename
                is supplied
        """

        # Figure out what to use
        if self._model is None and model_filename is None:
            raise RuntimeError(
                'Model not previously built, or model not supplied')
        if model_filename is not None:
            # NOTE(review): torch.load unpickles the file; only load model
            # files that come from a trusted source.
            self._model = torch.load(model_filename)
        # Predict in eval mode regardless of where the model came from, so
        # train-only layers (e.g. dropout) do not perturb the predictions.
        self._model.eval()

        # Prepare data: one graph per SMILES string on the working device
        data = []
        for smi in smiles:
            atom_attr, bond_attr, connectivity = self._ce.encode(smi)
            data.append(
                gdata.Data(x=atom_attr,
                           edge_index=connectivity,
                           edge_attr=bond_attr).to(self._device))
        loader_test = gdata.DataLoader(data, batch_size=1, shuffle=False)

        # Get results; no gradients are needed for inference
        results = []
        with torch.no_grad():
            for batch in loader_test:
                res, _, _ = self._model(batch)
                # .cpu() is required before .numpy() when the output lives on
                # a non-CPU device
                results.append(res.detach().cpu().numpy().tolist()[0])
        return results
示例#7
0
    experiment.add_tags(tags=[
        f'graph_model={graph_model}',
        f'graph_attention_pooling={graph_attention_pooling}',
    ])

    # Everything in the experiment will be put inside this try statement
    # If exception happens, clean things up and move to the next experiment
    try:
        # Dataloaders
        dataloader_kwargs = {
            'pin_memory': True,
            'batch_size': batch_size,
            'num_workers': num_workers,
        }
        trn_loader = pyg_data.DataLoader(trn_dset,
                                         shuffle=True,
                                         **dataloader_kwargs)
        tst_loader = pyg_data.DataLoader(tst_dset, **dataloader_kwargs)

        # Construct graph model, might run into CUDA memory error
        graph_model_kwargs = {
            'node_attr_dim': node_attr_dim,
            'edge_attr_dim': edge_attr_dim,
            'state_dim': graph_state_dim,
            'num_conv': graph_num_conv,
            'out_dim': graph_out_dim,
            'attention_pooling': graph_attention_pooling,
        }

        if graph_model == 'gcn':
            drug_tower = EdgeGCNEncoder(**graph_model_kwargs)
示例#8
0
文件: data_utils.py 项目: JAOP1/GO
def make_GeometricDataloader(dataset, batch_size_ = 4, shuffle_ = True):
    """Wrap ``dataset`` in a ``geodata.DataLoader``.

    Args:
        dataset: dataset handed to the loader unchanged.
        batch_size_: batch size forwarded as ``batch_size`` (default 4).
        shuffle_: forwarded as ``shuffle`` (default True).

    Returns:
        The constructed ``geodata.DataLoader``.
    """
    loader_options = {'batch_size': batch_size_, 'shuffle': shuffle_}
    return geodata.DataLoader(dataset, **loader_options)
示例#9
0
    def train(self,
              smiles: List[str],
              target: List[List[float]],
              model_config: dict = None,
              valid_size: float = 0.2,
              valid_epoch_iter: int = 1,
              valid_patience: int = 16,
              batch_size: int = 1,
              lr: float = 0.001,
              lr_decay: float = 0.0,
              epochs: int = 128,
              verbose: int = 0,
              random_state: int = None,
              shuffle: bool = False,
              **kwargs) -> Tuple[List[float], List[float]]:
        """
        Trains a CompoundGCN using supplied SMILES strings, target values

        A new CompoundEncoder and a new CompoundGCN are created on every call
        and stored in `self._ce` / `self._model`.

        Args:
            smiles (list[str]): list of SMILES strings, one per compound
            target (list[list[float]]): list of target values, shape
                [n_samples, n_targets], one per compound
            model_config (dict, optional): if supplied, must contain all of
                the keys 'n_messages', 'n_hidden', 'hidden_dim' and 'dropout'
                (they are read by index); if not supplied, CompoundGCN's own
                defaults are used, which the original documentation gives as:
                {
                    'n_messages': 1,
                    'n_hidden': 1,
                    'hidden_dim': 32,
                    'dropout': 0.00
                }
            valid_size (float, optional): proportion of training set used for
                periodic validation, default = 0.2
            valid_epoch_iter (int, optional): validation set performance is
                measured every `this` epochs, default = 1 epochs
            valid_patience (int, optional): if lower validation set loss not
                encountered after `this` many epochs, terminate to avoid
                overfitting, default = 16
            batch_size (int, optional): size of each batch during training,
                default = 1
            lr (float, optional): learning rate for Adam opt, default = 0.001
            lr_decay (float, optional): linear rate of decay of learning rate
                per epoch, default = 0.0
            epochs (int, optional): number of training epochs, default = 128
            verbose (int, optional): training and validation loss printed to
                console every `this` epochs, default = 0 (no printing)
            random_state (int, optional): passed to `train_test_split` as its
                `random_state` when selecting the validation subset
            shuffle (bool, optional): if True, the training/validation split
                and both loaders are rebuilt at the start of every epoch,
                default = False
            **kwargs: additional arguments passed to torch.optim.Adam

        Returns:
            tuple[list[float], list[float]]: (training losses, validation
                losses), one entry per completed epoch

        Raises:
            ValueError: if `smiles` and `target` differ in length
        """

        # Check for inequality in length of input, target data
        if len(smiles) != len(target):
            raise ValueError(
                'Supplied SMILES and targets not the same length: {}, {}'.
                format(len(smiles), len(target)))

        # Prepare data: one graph per compound, with its target row stored as
        # a float tensor of shape [1, n_targets] in `y`
        self._ce = CompoundEncoder(smiles)
        data = []
        for idx, smi in enumerate(smiles):
            a, b, c = self._ce.encode(smi)
            data.append(
                gdata.Data(x=a,
                           edge_index=c,
                           edge_attr=b,
                           y=torch.tensor(
                               target[idx]).type(torch.float).reshape(
                                   1, len(target[idx]))).to(self._device))

        # Split data into training, validation subsets
        data_train, data_valid = train_test_split(data,
                                                  test_size=valid_size,
                                                  random_state=random_state)
        loader_train = gdata.DataLoader(data_train,
                                        batch_size=batch_size,
                                        shuffle=True)
        loader_valid = gdata.DataLoader(data_valid,
                                        batch_size=batch_size,
                                        shuffle=True)

        # Create model; the output dimension is the number of targets of the
        # first sample
        if model_config is None:
            self._model = CompoundGCN(self._ce.ATOM_DIM, self._ce.BOND_DIM,
                                      len(target[0]))
        else:
            self._model = CompoundGCN(self._ce.ATOM_DIM, self._ce.BOND_DIM,
                                      len(target[0]),
                                      model_config['n_messages'],
                                      model_config['n_hidden'],
                                      model_config['hidden_dim'],
                                      model_config['dropout'])
        self._model.to(self._device)
        optimizer = torch.optim.Adam(self._model.parameters(), lr=lr, **kwargs)

        # Setup callbacks: learning-rate decay and periodic validation
        CBO = CallbackOperator()
        _lrdecay = LRDecayLinear(lr, lr_decay, optimizer)
        _validator = Validator(loader_valid, self._model, valid_epoch_iter,
                               valid_patience)
        CBO.add_cb(_lrdecay)
        CBO.add_cb(_validator)

        # Record loss for return
        train_losses = []
        valid_losses = []

        # TRAIN BEGIN
        CBO.on_train_begin()

        # Begin training loop; any callback hook returning a falsy value
        # aborts the loop it is checked in
        for epoch in range(epochs):

            # EPOCH BEGIN
            if not CBO.on_epoch_begin(epoch):
                break

            if shuffle:
                # NOTE(review): the split reuses the same `random_state`, so a
                # non-None value presumably reproduces the same split every
                # epoch -- confirm intent.
                # NOTE(review): `_validator` was constructed with the original
                # `loader_valid`; the loader rebuilt here is not handed to it
                # anywhere in this method -- confirm the validator should not
                # see the new split.
                data_train, data_valid = train_test_split(
                    data, test_size=valid_size, random_state=random_state)
                loader_train = gdata.DataLoader(data_train,
                                                batch_size=batch_size,
                                                shuffle=True)
                loader_valid = gdata.DataLoader(data_valid,
                                                batch_size=batch_size,
                                                shuffle=True)

            train_loss = 0.0
            self._model.train()

            for b_idx, batch in enumerate(loader_train):

                # BATCH BEGIN
                if not CBO.on_batch_begin(b_idx):
                    break

                optimizer.zero_grad()
                pred, _, _ = self._model(batch)
                # Rebinds the `target` parameter to this batch's labels; the
                # original list is no longer needed at this point.
                target = batch.y

                # BATCH END, LOSS BEGIN
                if not CBO.on_batch_end(b_idx):
                    break
                if not CBO.on_loss_begin(b_idx):
                    break

                loss = self._model.loss(pred, target)
                loss.backward()

                # LOSS END, STEP BEGIN
                if not CBO.on_loss_end(b_idx):
                    break
                if not CBO.on_step_begin(b_idx):
                    break

                optimizer.step()
                # Weight the batch loss by its number of graphs so the epoch
                # total can be averaged per sample below
                train_loss += loss.detach().item() * batch.num_graphs

                # STEP END
                if not CBO.on_step_end(b_idx):
                    break

            # Average over the whole training subset, even if the batch loop
            # above was cut short by a callback
            train_loss /= len(loader_train.dataset)

            # EPOCH END
            if not CBO.on_epoch_end(epoch):
                break

            if verbose > 0:
                if epoch % verbose == 0:
                    print('Epoch: {} | Train Loss: {} | Valid Loss: {}'.format(
                        epoch, train_loss, _validator._most_recent_loss))

            train_losses.append(train_loss)
            # assumes Validator keeps its latest loss as a tensor in
            # `_most_recent_loss` -- TODO confirm against Validator
            valid_losses.append(_validator._most_recent_loss.detach().item())

        # TRAIN END
        CBO.on_train_end()

        return (train_losses, valid_losses)
示例#10
0
def simple_gnn_tutorial():
    AVAIL_GPUS = min(1, torch.cuda.device_count())
    BATCH_SIZE = 256 if AVAIL_GPUS else 64
    # Path to the folder where the datasets are/should be downloaded.
    DATASET_PATH = os.environ.get("PATH_DATASETS", "data/")
    # Path to the folder where the pretrained models are saved.
    CHECKPOINT_PATH = os.environ.get("PATH_CHECKPOINT", "saved_models/GNNs/")

    # Setting the seed.
    pl.seed_everything(42)

    # Ensure that all operations are deterministic on GPU (if used) for reproducibility.
    # The flag must be spelled `deterministic`; assigning to any other
    # attribute name leaves cuDNN's deterministic mode switched off.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Github URL where saved models are stored for this tutorial.
    base_url = "https://raw.githubusercontent.com/phlippe/saved_models/main/tutorial7/"
    # Files to download.
    pretrained_files = [
        "NodeLevelMLP.ckpt", "NodeLevelGNN.ckpt", "GraphLevelGraphConv.ckpt"
    ]

    # Create checkpoint path if it doesn't exist yet.
    os.makedirs(CHECKPOINT_PATH, exist_ok=True)

    # For each file, check whether it already exists. If not, try downloading it.
    for file_name in pretrained_files:
        file_path = os.path.join(CHECKPOINT_PATH, file_name)
        if "/" in file_name:
            os.makedirs(file_path.rsplit("/", 1)[0], exist_ok=True)
        if not os.path.isfile(file_path):
            file_url = base_url + file_name
            print("Downloading %s..." % file_url)
            try:
                urllib.request.urlretrieve(file_url, file_path)
            except HTTPError as e:
                print(
                    "Something went wrong. Please try to download the file from the GDrive folder,"
                    " or contact the author with the full output including the following error:\n",
                    e,
                )

    #--------------------
    # Graph convolutions.

    class GCNLayer(nn.Module):
        """Graph convolution: project node features, then average them over
        each node's neighbourhood as given by the adjacency matrix."""

        def __init__(self, c_in, c_out):
            """
            Args:
                c_in: Dimensionality of the input node features
                c_out: Dimensionality of the output node features
            """
            super().__init__()
            self.projection = nn.Linear(c_in, c_out)

        def forward(self, node_feats, adj_matrix):
            """
            Args:
                node_feats: Tensor with node features of shape [batch_size, num_nodes, c_in]
                adj_matrix: Batch of adjacency matrices of shape
                            [batch_size, num_nodes, num_nodes]. Non-symmetric
                            matrices (directed edges) are supported. Self
                            connections are expected to be present already.

            Returns:
                Tensor of shape [batch_size, num_nodes, c_out]: for every node,
                the sum of the projected features selected by its adjacency
                row, divided by that row's sum.
            """
            # Normaliser: row sums of the adjacency matrix, kept broadcastable.
            neighbour_count = adj_matrix.sum(dim=-1, keepdims=True)
            projected = self.projection(node_feats)
            aggregated = torch.bmm(adj_matrix, projected)
            return aggregated / neighbour_count

    node_feats = torch.arange(8, dtype=torch.float32).view(1, 4, 2)
    adj_matrix = torch.Tensor([[[1, 1, 0, 0], [1, 1, 1, 1], [0, 1, 1, 1],
                                [0, 1, 1, 1]]])

    print("Node features:\n", node_feats)
    print("\nAdjacency matrix:\n", adj_matrix)

    layer = GCNLayer(c_in=2, c_out=2)
    layer.projection.weight.data = torch.Tensor([[1.0, 0.0], [0.0, 1.0]])
    layer.projection.bias.data = torch.Tensor([0.0, 0.0])

    with torch.no_grad():
        out_feats = layer(node_feats, adj_matrix)

    print("Adjacency matrix", adj_matrix)
    print("Input features", node_feats)
    print("Output features", out_feats)

    #--------------------
    # Graph attention.

    class GATLayer(nn.Module):
        """Multi-head graph attention layer operating on dense adjacency matrices."""

        def __init__(self,
                     c_in,
                     c_out,
                     num_heads=1,
                     concat_heads=True,
                     alpha=0.2):
            """
            Args:
                c_in: Dimensionality of input features
                c_out: Dimensionality of output features
                num_heads: Number of heads, i.e. attention mechanisms to apply in parallel. The
                            output features are equally split up over the heads if concat_heads=True.
                concat_heads: If True, the output of the different heads is concatenated instead of averaged.
                alpha: Negative slope of the LeakyReLU activation.
            """
            super().__init__()
            self.num_heads = num_heads
            self.concat_heads = concat_heads
            if self.concat_heads:
                # With concatenation each head only produces c_out / num_heads
                # features, so the concatenated result has c_out features.
                assert c_out % num_heads == 0, "Number of output features must be a multiple of the count of heads."
                c_out = c_out // num_heads

            # Sub-modules and parameters needed in the layer.
            self.projection = nn.Linear(c_in, c_out * num_heads)
            self.a = nn.Parameter(torch.Tensor(num_heads,
                                               2 * c_out))  # One attention vector per head.
            self.leakyrelu = nn.LeakyReLU(alpha)

            # Initialization from the original implementation.
            nn.init.xavier_uniform_(self.projection.weight.data, gain=1.414)
            nn.init.xavier_uniform_(self.a.data, gain=1.414)

        def forward(self, node_feats, adj_matrix, print_attn_probs=False):
            """
            Args:
                node_feats: Input features of the nodes. Shape: [batch_size, num_nodes, c_in]
                adj_matrix: Adjacency matrix including self-connections. Shape: [batch_size, num_nodes, num_nodes]
                print_attn_probs: If True, the attention weights are printed during the forward pass
                                (for debugging purposes)

            Returns:
                Tensor of shape [batch_size, num_nodes, num_heads * c_head] when
                heads are concatenated, otherwise [batch_size, num_nodes, c_head],
                where c_head is the per-head feature size.
            """
            batch_size, num_nodes = node_feats.size(0), node_feats.size(1)

            # Apply linear layer and split the projected features by head:
            # [batch_size, num_nodes, num_heads, c_head].
            node_feats = self.projection(node_feats)
            node_feats = node_feats.view(batch_size, num_nodes, self.num_heads,
                                         -1)

            # We need to calculate the attention logits for every edge in the adjacency matrix.
            # Doing this on all possible combinations of nodes is very expensive
            # => Create a tensor of [W*h_i||W*h_j] with i and j being the indices of all edges.
            # Returns indices where the adjacency matrix is not 0 => edges.
            # Each row of `edges` is (batch index, i, j).
            edges = adj_matrix.nonzero(as_tuple=False)
            node_feats_flat = node_feats.view(batch_size * num_nodes,
                                              self.num_heads, -1)
            # Flat node positions (batch * num_nodes + node) of both edge ends.
            edge_indices_row = edges[:, 0] * num_nodes + edges[:, 1]
            edge_indices_col = edges[:, 0] * num_nodes + edges[:, 2]
            a_input = torch.cat(
                [
                    torch.index_select(
                        input=node_feats_flat, index=edge_indices_row, dim=0),
                    torch.index_select(
                        input=node_feats_flat, index=edge_indices_col, dim=0),
                ],
                dim=-1,
            )  # Index select returns a tensor with node_feats_flat being indexed at the desired positions.

            # Calculate attention MLP output (independent for each head).
            attn_logits = torch.einsum("bhc,hc->bh", a_input, self.a)
            attn_logits = self.leakyrelu(attn_logits)

            # Map list of attention values back into a matrix. Non-edges keep a
            # very negative value so they vanish in the softmax below. The
            # boolean-mask assignment relies on the mask enumerating entries in
            # the same order as `nonzero()` produced the edges above.
            attn_matrix = attn_logits.new_zeros(
                adj_matrix.shape + (self.num_heads, )).fill_(-9e15)
            attn_matrix[adj_matrix[..., None].repeat(1, 1, 1, self.num_heads)
                        == 1] = attn_logits.reshape(-1)

            # Weighted average of attention: normalise over dim 2 (the j index).
            attn_probs = F.softmax(attn_matrix, dim=2)
            if print_attn_probs:
                print("Attention probs\n", attn_probs.permute(0, 3, 1, 2))
            node_feats = torch.einsum("bijh,bjhc->bihc", attn_probs,
                                      node_feats)

            # If heads should be concatenated, we can do this by reshaping. Otherwise, take mean.
            if self.concat_heads:
                node_feats = node_feats.reshape(batch_size, num_nodes, -1)
            else:
                node_feats = node_feats.mean(dim=2)

            return node_feats

    layer = GATLayer(2, 2, num_heads=2)
    layer.projection.weight.data = torch.Tensor([[1.0, 0.0], [0.0, 1.0]])
    layer.projection.bias.data = torch.Tensor([0.0, 0.0])
    layer.a.data = torch.Tensor([[-0.2, 0.3], [0.1, -0.1]])

    with torch.no_grad():
        out_feats = layer(node_feats, adj_matrix, print_attn_probs=True)

    print("Adjacency matrix", adj_matrix)
    print("Input features", node_feats)
    print("Output features", out_feats)

    #--------------------
    # PyTorch Geometric.

    gnn_layer_by_name = {
        "GCN": geom_nn.GCNConv,
        "GAT": geom_nn.GATConv,
        "GraphConv": geom_nn.GraphConv
    }

    #--------------------
    # Node-level tasks: Semi-supervised node classification.

    cora_dataset = torch_geometric.datasets.Planetoid(root=DATASET_PATH,
                                                      name="Cora")
    print(cora_dataset[0])

    class GNNModel(nn.Module):
        """Stack of graph layers with ReLU and dropout between them."""

        def __init__(
            self,
            c_in,
            c_hidden,
            c_out,
            num_layers=2,
            layer_name="GCN",
            dp_rate=0.1,
            **kwargs,
        ):
            """
            Args:
                c_in: Dimension of input features
                c_hidden: Dimension of hidden features
                c_out: Dimension of the output features. Usually number of classes in classification
                num_layers: Number of graph layers in total; all but the last
                            are followed by ReLU and dropout
                layer_name: Key into `gnn_layer_by_name` selecting the graph layer class
                dp_rate: Dropout rate to apply throughout the network
                kwargs: Additional arguments for the graph layer (e.g. number of heads for GAT)
            """
            super().__init__()
            conv_cls = gnn_layer_by_name[layer_name]

            modules = []
            width = c_in
            # Hidden blocks: graph layer -> ReLU -> dropout.
            for _ in range(num_layers - 1):
                modules.append(
                    conv_cls(in_channels=width,
                             out_channels=c_hidden,
                             **kwargs))
                modules.append(nn.ReLU(inplace=True))
                modules.append(nn.Dropout(dp_rate))
                width = c_hidden
            # Output graph layer without activation.
            modules.append(
                conv_cls(in_channels=width, out_channels=c_out, **kwargs))
            self.layers = nn.ModuleList(modules)

        def forward(self, x, edge_index):
            """
            Args:
                x: Input features per node
                edge_index: List of vertex index pairs representing the edges in the graph (PyTorch geometric notation)

            Returns:
                Output of the last layer.
            """
            for layer in self.layers:
                # Only graph layers (subclasses of "MessagePassing") take the
                # "edge_index" tensor; activations and dropout get x alone.
                takes_edges = isinstance(layer, geom_nn.MessagePassing)
                x = layer(x, edge_index) if takes_edges else layer(x)
            return x

    class MLPModel(nn.Module):
        """Plain multi-layer perceptron applied to every node independently."""

        def __init__(self, c_in, c_hidden, c_out, num_layers=2, dp_rate=0.1):
            """
            Args:
                c_in: Dimension of input features
                c_hidden: Dimension of hidden features
                c_out: Dimension of the output features. Usually number of classes in classification
                num_layers: Number of linear layers in total; all but the last
                            are followed by ReLU and dropout
                dp_rate: Dropout rate to apply throughout the network
            """
            super().__init__()
            modules = []
            width = c_in
            # Hidden blocks: linear -> ReLU -> dropout.
            for _ in range(num_layers - 1):
                modules.extend((
                    nn.Linear(width, c_hidden),
                    nn.ReLU(inplace=True),
                    nn.Dropout(dp_rate),
                ))
                width = c_hidden
            # Output projection without activation.
            modules.append(nn.Linear(width, c_out))
            self.layers = nn.Sequential(*modules)

        def forward(self, x, *args, **kwargs):
            """
            Args:
                x: Input features per node
                *args, **kwargs: accepted and ignored, so the model can be
                    called like a graph model (e.g. with an edge index)
            """
            return self.layers(x)

    class NodeLevelGNN(pl.LightningModule):
        """Lightning wrapper for node classification with an MLP or a GNN."""

        def __init__(self, model_name, **model_kwargs):
            """
            Args:
                model_name: "MLP" selects `MLPModel`; any other value selects
                    `GNNModel`
                **model_kwargs: keyword arguments forwarded to the chosen model
            """
            super().__init__()
            # Saving hyperparameters.
            self.save_hyperparameters()

            backbone_cls = MLPModel if model_name == "MLP" else GNNModel
            self.model = backbone_cls(**model_kwargs)
            self.loss_module = nn.CrossEntropyLoss()

        def forward(self, data, mode="train"):
            """Return ``(loss, accuracy)`` on the nodes selected by ``mode``.

            Args:
                data: graph object providing `x`, `edge_index`, `y` and the
                    `train_mask` / `val_mask` / `test_mask` attributes
                mode: one of "train", "val" or "test"
            """
            logits = self.model(data.x, data.edge_index)

            # Only calculate the loss on the nodes corresponding to the mask.
            if mode == "train":
                mask = data.train_mask
            elif mode == "val":
                mask = data.val_mask
            elif mode == "test":
                mask = data.test_mask
            else:
                assert False, "Unknown forward mode: %s" % mode

            masked_logits = logits[mask]
            labels = data.y[mask]
            loss = self.loss_module(masked_logits, labels)
            n_correct = (masked_logits.argmax(dim=-1) == labels).sum().float()
            acc = n_correct / mask.sum()
            return loss, acc

        def configure_optimizers(self):
            """Return the SGD optimizer used for training."""
            # We use SGD here, but Adam works as well.
            return optim.SGD(self.parameters(),
                             lr=0.1,
                             momentum=0.9,
                             weight_decay=2e-3)

        def training_step(self, batch, batch_idx):
            """Log training loss and accuracy; return the loss."""
            loss, acc = self.forward(batch, mode="train")
            self.log("train_loss", loss)
            self.log("train_acc", acc)
            return loss

        def validation_step(self, batch, batch_idx):
            """Log the accuracy on the validation mask."""
            _, val_acc = self.forward(batch, mode="val")
            self.log("val_acc", val_acc)

        def test_step(self, batch, batch_idx):
            """Log the accuracy on the test mask."""
            _, test_acc = self.forward(batch, mode="test")
            self.log("test_acc", test_acc)

    def train_node_classifier(model_name, dataset, **model_kwargs):
        """Train (or load) a ``NodeLevelGNN`` and report its accuracies.

        Args:
            model_name: name forwarded to ``NodeLevelGNN``; also used to build
                the log directory and the pretrained checkpoint filename.
            dataset: dataset wrapped in a single-item-batch data loader; must
                provide ``num_node_features`` and ``num_classes``.
            **model_kwargs: extra keyword arguments forwarded to
                ``NodeLevelGNN``.

        Returns:
            tuple: ``(model, result)`` where ``result`` maps ``"train"``,
            ``"val"`` and ``"test"`` to the corresponding accuracies.
        """
        pl.seed_everything(42)
        loader = geom_data.DataLoader(dataset, batch_size=1)

        # Set up the Lightning trainer, keeping the checkpoint with the best
        # validation accuracy.
        log_dir = os.path.join(CHECKPOINT_PATH, "NodeLevel" + model_name)
        os.makedirs(log_dir, exist_ok=True)
        best_val_checkpoint = ModelCheckpoint(save_weights_only=True,
                                              mode="max",
                                              monitor="val_acc")
        trainer = pl.Trainer(
            default_root_dir=log_dir,
            callbacks=[best_val_checkpoint],
            gpus=AVAIL_GPUS,
            max_epochs=200,
            # 0 because epoch size is 1.
            progress_bar_refresh_rate=0,
        )
        # Optional logging argument that we don't need.
        trainer.logger._default_hp_metric = None

        # Train only when no pretrained checkpoint is available.
        ckpt_path = os.path.join(CHECKPOINT_PATH,
                                 "NodeLevel%s.ckpt" % model_name)
        if not os.path.isfile(ckpt_path):
            pl.seed_everything()
            model = NodeLevelGNN(model_name=model_name,
                                 c_in=dataset.num_node_features,
                                 c_out=dataset.num_classes,
                                 **model_kwargs)
            # The same loader serves as training and validation loader.
            trainer.fit(model, loader, loader)
            best_path = trainer.checkpoint_callback.best_model_path
            model = NodeLevelGNN.load_from_checkpoint(best_path)
        else:
            print("Found pretrained model, loading...")
            model = NodeLevelGNN.load_from_checkpoint(ckpt_path)

        # Evaluate the selected model on the test mask, then on the train and
        # validation masks of the first batch.
        test_result = trainer.test(model,
                                   test_dataloaders=loader,
                                   verbose=False)
        batch = next(iter(loader)).to(model.device)
        _, train_acc = model.forward(batch, mode="train")
        _, val_acc = model.forward(batch, mode="val")
        return model, {
            "train": train_acc,
            "val": val_acc,
            "test": test_result[0]["test_acc"],
        }

    def print_results(result_dict):
        """Print the accuracies stored in ``result_dict`` as percentages.

        The "train" and "val" lines are printed only when those keys are
        present; the "test" entry is always read and printed.
        """
        optional_rows = (
            ("train", "Train accuracy: %4.2f%%"),
            ("val", "Val accuracy:   %4.2f%%"),
        )
        for key, template in optional_rows:
            if key in result_dict:
                print(template % (100.0 * result_dict[key]))
        print("Test accuracy:  %4.2f%%" % (100.0 * result_dict["test"]))

    # Both node-level runs share the dataset and these hyperparameters.
    shared_node_kwargs = dict(dataset=cora_dataset,
                              c_hidden=16,
                              num_layers=2,
                              dp_rate=0.1)

    # Run with model_name="MLP".
    node_mlp_model, node_mlp_result = train_node_classifier(
        model_name="MLP", **shared_node_kwargs)

    print_results(node_mlp_result)

    # Run with model_name="GNN" using the "GCN" layer.
    node_gnn_model, node_gnn_result = train_node_classifier(
        model_name="GNN", layer_name="GCN", **shared_node_kwargs)
    print_results(node_gnn_result)

    #--------------------
    # Edge-level tasks: Link prediction.

    #--------------------
    # Graph-level tasks: Graph classification.

    tu_dataset = torch_geometric.datasets.TUDataset(root=DATASET_PATH,
                                                    name="MUTAG")

    print("Data object:", tu_dataset.data)
    print("Length:", len(tu_dataset))
    print("Average label: %4.2f" % (tu_dataset.data.y.float().mean().item()))

    # Dataset.shuffle() returns a shuffled copy instead of shuffling in
    # place, so the result has to be re-bound; otherwise the split below is
    # taken from the dataset in its stored order.
    # NOTE(review): checkpoints trained on the previous (unshuffled) split
    # will now be evaluated on a different train/test partition.
    torch.manual_seed(42)
    tu_dataset = tu_dataset.shuffle()
    train_dataset = tu_dataset[:150]
    test_dataset = tu_dataset[150:]

    # The validation loader reuses the test split (the dataset is too small
    # for a separate validation set). Loaders are built exactly once.
    graph_train_loader = geom_data.DataLoader(train_dataset,
                                              batch_size=BATCH_SIZE,
                                              shuffle=True)
    graph_val_loader = geom_data.DataLoader(
        test_dataset,
        batch_size=BATCH_SIZE)  # Additional loader for a larger datasets.
    graph_test_loader = geom_data.DataLoader(test_dataset,
                                             batch_size=BATCH_SIZE)

    # Peek at one batch to show how several graphs are merged together.
    batch = next(iter(graph_test_loader))
    print("Batch:", batch)
    print("Labels:", batch.y[:10])
    print("Batch indices:", batch.batch[:40])

    class GraphGNNModel(nn.Module):
        """GNN encoder, per-graph mean pooling, then a dropout + linear head."""

        def __init__(self,
                     c_in,
                     c_hidden,
                     c_out,
                     dp_rate_linear=0.5,
                     **kwargs):
            """
            Args:
                c_in: Dimension of input features
                c_hidden: Dimension of hidden features
                c_out: Dimension of output features (usually number of classes)
                dp_rate_linear: Dropout rate before the linear layer
                    (usually much higher than inside the GNN)
                kwargs: Additional arguments for the GNNModel object
            """
            super().__init__()
            # The encoder emits c_hidden features per node; the actual
            # prediction is produced by the head below.
            encoder = GNNModel(c_in=c_in,
                               c_hidden=c_hidden,
                               c_out=c_hidden,
                               **kwargs)
            self.GNN = encoder
            head_layers = [
                nn.Dropout(dp_rate_linear),
                nn.Linear(c_hidden, c_out),
            ]
            self.head = nn.Sequential(*head_layers)

        def forward(self, x, edge_index, batch_idx):
            """
            Args:
                x: Input features per node
                edge_index: List of vertex index pairs representing the edges
                    in the graph (PyTorch geometric notation)
                batch_idx: Index of batch element for each node
            """
            node_features = self.GNN(x, edge_index)
            # Average the node features belonging to the same batch element.
            graph_features = geom_nn.global_mean_pool(node_features, batch_idx)
            return self.head(graph_features)

    class GraphLevelGNN(pl.LightningModule):
        """Lightning wrapper around ``GraphGNNModel`` for graph classification."""

        def __init__(self, **model_kwargs):
            """Store the hyperparameters and build the model and loss.

            Args:
                **model_kwargs: Keyword arguments for ``GraphGNNModel``; they
                    are also saved as hyperparameters (``c_out`` is read back
                    from them).
            """
            super().__init__()
            # Saving hyperparameters.
            self.save_hyperparameters()

            self.model = GraphGNNModel(**model_kwargs)
            # A single output is treated as a binary logit, anything else as
            # multi-class logits.
            if self.hparams.c_out == 1:
                self.loss_module = nn.BCEWithLogitsLoss()
            else:
                self.loss_module = nn.CrossEntropyLoss()

        def forward(self, data, mode="train"):
            """Compute loss and accuracy for one batch.

            Args:
                data: Batch providing ``x``, ``edge_index``, ``batch`` and
                    ``y``. In the single-output case ``data.y`` is replaced
                    by its float version.
                mode: Not used inside this method.

            Returns:
                Tuple ``(loss, acc)``.
            """
            logits = self.model(data.x, data.edge_index, data.batch)
            logits = logits.squeeze(dim=-1)

            is_binary = self.hparams.c_out == 1
            if is_binary:
                # Threshold the logit at 0; the loss needs float targets.
                data.y = data.y.float()
                preds = (logits > 0).float()
            else:
                preds = logits.argmax(dim=-1)

            loss = self.loss_module(logits, data.y)
            num_correct = (preds == data.y).sum().float()
            return loss, num_correct / preds.shape[0]

        def configure_optimizers(self):
            """Return the AdamW optimizer used for training."""
            # High lr because of small dataset and small model.
            return optim.AdamW(self.parameters(), lr=1e-2, weight_decay=0.0)

        def training_step(self, batch, batch_idx):
            """Log ``train_loss`` / ``train_acc`` and return the loss."""
            train_loss, train_acc = self.forward(batch, mode="train")
            self.log("train_loss", train_loss)
            self.log("train_acc", train_acc)
            return train_loss

        def validation_step(self, batch, batch_idx):
            """Log the batch accuracy as ``val_acc``."""
            _, val_acc = self.forward(batch, mode="val")
            self.log("val_acc", val_acc)

        def test_step(self, batch, batch_idx):
            """Log the batch accuracy as ``test_acc``."""
            _, test_acc = self.forward(batch, mode="test")
            self.log("test_acc", test_acc)

    def train_graph_classifier(model_name, **model_kwargs):
        """Fit (or load) a ``GraphLevelGNN`` and evaluate it.

        Args:
            model_name: Used in the checkpoint directory / file name.
            **model_kwargs: Extra keyword arguments for ``GraphLevelGNN``.

        Returns:
            Tuple ``(model, result)``; ``result`` maps "test" and "train" to
            the ``test_acc`` measured on the test and training loaders.
        """
        pl.seed_everything(42)

        # Trainer that keeps the checkpoint with the highest "val_acc".
        checkpoint_dir = os.path.join(CHECKPOINT_PATH,
                                      "GraphLevel" + model_name)
        os.makedirs(checkpoint_dir, exist_ok=True)
        keep_best = ModelCheckpoint(save_weights_only=True,
                                    mode="max",
                                    monitor="val_acc")
        trainer = pl.Trainer(
            default_root_dir=checkpoint_dir,
            callbacks=[keep_best],
            gpus=AVAIL_GPUS,
            max_epochs=500,
            progress_bar_refresh_rate=0,
        )
        trainer.logger._default_hp_metric = None

        # Train only when no pretrained checkpoint is available on disk.
        pretrained_filename = os.path.join(CHECKPOINT_PATH,
                                           "GraphLevel%s.ckpt" % model_name)
        if not os.path.isfile(pretrained_filename):
            pl.seed_everything(42)
            # Two classes are modelled with a single output.
            num_classes = tu_dataset.num_classes
            model = GraphLevelGNN(
                c_in=tu_dataset.num_node_features,
                c_out=1 if num_classes == 2 else num_classes,
                **model_kwargs,
            )
            trainer.fit(model, graph_train_loader, graph_val_loader)
            best_path = trainer.checkpoint_callback.best_model_path
            model = GraphLevelGNN.load_from_checkpoint(best_path)
        else:
            print("Found pretrained model, loading...")
            model = GraphLevelGNN.load_from_checkpoint(pretrained_filename)

        # Evaluate the selected model on the training set, then the test set.
        train_result = trainer.test(model,
                                    test_dataloaders=graph_train_loader,
                                    verbose=False)
        test_result = trainer.test(model,
                                   test_dataloaders=graph_test_loader,
                                   verbose=False)
        result = {}
        result["test"] = test_result[0]["test_acc"]
        result["train"] = train_result[0]["test_acc"]
        return model, result

    # Hyperparameters for the "GraphConv" graph-level run.
    graph_conv_kwargs = dict(c_hidden=256,
                             layer_name="GraphConv",
                             num_layers=3,
                             dp_rate_linear=0.5,
                             dp_rate=0.0)
    model, result = train_graph_classifier(model_name="GraphConv",
                                           **graph_conv_kwargs)

    # Report both accuracies as percentages.
    train_percent = 100.0 * result["train"]
    print("Train performance: %4.2f%%" % train_percent)
    test_percent = 100.0 * result["test"]
    print("Test performance:  %4.2f%%" % test_percent)
示例#11
0
文件: gat.py 项目: xduan7/MoReL
    ###########################################################################
    # Dataset and dataloader
    dataset_kwargs = {
        'target_list': TARGET_LIST,
        'cid_smiles_dict': cid_smiles_dict,
        'cid_dscrptr_dict': cid_dscrptr_dict
    }
    dataset = GraphToDscrptrDataset(cid_list=cid_list, **dataset_kwargs)

    # Worker processes are only used on CUDA runs. A positive `timeout` is
    # only meaningful with workers: PyTorch's single-process loader iterator
    # asserts `timeout == 0`, so the CPU path must not request one.
    num_workers = 2 if use_cuda else 0
    dataloader_kwargs = {
        'batch_size': 32,
        'timeout': 1 if num_workers > 0 else 0,
        'pin_memory': bool(use_cuda),
        'num_workers': num_workers,
    }
    dataloader = pyg_data.DataLoader(dataset,
                                     shuffle=True,
                                     **dataloader_kwargs)

    model = EdgeGATEncoder(node_attr_dim=dataset.node_attr_dim,
                           edge_attr_dim=dataset.edge_attr_dim,
                           out_dim=len(TARGET_LIST)).to(device)

    # Smoke test: push a single batch through the model (in training mode)
    # and print the shape of what comes out.
    model.train()
    data = next(iter(dataloader))

    print(f'The input batch data is {data}')
    data = data.to(device)
    print(f'The output shape is {model(data).shape}')
示例#12
0
    def train(self,
              smiles: list,
              target: list,
              model_filename: str = None,
              model_config: dict = None):
        ''' GraphOperator.train: fits a message-passing network on the
        supplied SMILES strings and their target values, driven by the
        operator's stored configuration and a set of training callbacks

        Args:
            smiles (list): list of SMILES strings (str)
            target (list): one entry of target values per SMILES string; the
                length of the first entry is used as the model output size
            model_filename (str): if not None, saves model to this location
            model_config (dict): configuration dict handed to the model; if
                none supplied, default is used

        Returns:
            None

        Raises:
            ValueError: if `smiles` and `target` differ in length
        '''

        # Inputs and targets must pair up one-to-one
        n_smiles, n_target = len(smiles), len(target)
        if n_smiles != n_target:
            raise ValueError(
                'Supplied SMILES and targets not the same length: {}, {}'.
                format(n_smiles, n_target))

        config = self._config

        # Encode every compound into a graph sample placed on the device
        self._ce = CompoundEncoder(smiles)
        data = []
        for smi, smi_target in zip(smiles, target):
            node_attr, edge_attr = self._ce.encode(smi)
            sample = gdata.Data(x=node_attr,
                                edge_index=self._ce.connectivity(smi),
                                edge_attr=edge_attr,
                                y=torch.tensor(smi_target).type(torch.float))
            data.append(sample.to(config['device']))

        # Training / validation subsets and their loaders
        data_train, data_valid = train_test_split(
            data, test_size=config['valid_size'])
        loader_train = gdata.DataLoader(data_train,
                                        batch_size=config['batch_size'],
                                        shuffle=True)
        loader_valid = gdata.DataLoader(data_valid,
                                        batch_size=config['batch_size'],
                                        shuffle=True)

        # Model and optimizer
        self._model = MessagePassingNet(self._ce.ATOM_DIM,
                                        len(target[0]),
                                        task=config['task'],
                                        config=model_config)
        self._model.construct()
        self._model.to(config['device'])
        optimizer = torch.optim.Adam(self._model.parameters(),
                                     lr=config['learning_rate'])

        # Callbacks: learning-rate decay and validation
        CBO = CallbackOperator()
        _lrdecay = LRDecayLinear(config['learning_rate'], config['lr_decay'],
                                 optimizer)
        _validator = Validator(loader_valid, self._model,
                               config['valid_epoch_iter'],
                               config['valid_patience'])
        for callback in (_lrdecay, _validator):
            CBO.add_cb(callback)

        # TRAIN BEGIN
        CBO.on_train_begin()

        # Every callback hook returning a falsy value stops the loop it is
        # called from.
        for epoch in range(config['epochs']):

            # EPOCH BEGIN
            if not CBO.on_epoch_begin(epoch):
                break

            train_loss = 0.0
            self._model.train()

            for b_idx, batch in enumerate(loader_train):

                # BATCH BEGIN
                if not CBO.on_batch_begin(b_idx):
                    break

                optimizer.zero_grad()
                _, pred = self._model(batch)
                batch_target = batch.y
                if config['task'] == 'node':
                    # Node task: keep only entries selected by the train mask
                    mask = batch.train_mask
                    pred = pred[mask]
                    batch_target = batch_target[mask]

                # BATCH END, LOSS BEGIN
                if not (CBO.on_batch_end(b_idx)
                        and CBO.on_loss_begin(b_idx)):
                    break

                loss = self._model.loss(pred, batch_target)
                loss.backward()

                # LOSS END, STEP BEGIN
                if not (CBO.on_loss_end(b_idx)
                        and CBO.on_step_begin(b_idx)):
                    break

                optimizer.step()
                # Weight the batch loss by its graph count
                train_loss += loss.detach().item() * batch.num_graphs

                # STEP END
                if not CBO.on_step_end(b_idx):
                    break

            # Average over the whole training set
            train_loss /= len(loader_train.dataset)

            # EPOCH END
            if not CBO.on_epoch_end(epoch):
                break

            if config['verbose']:
                print('Epoch: {} | Train Loss: {} | Valid Loss: {}'.format(
                    epoch, train_loss, _validator._best_loss))

        # TRAIN END
        CBO.on_train_end()

        if model_filename is not None:
            torch.save(self._model, model_filename)
示例#13
0
def main():
    """Train and evaluate a graph model that predicts Dragon7 descriptors.

    Steps: parse the command-line arguments, load the CID->SMILES mapping and
    the descriptor table, standardize the descriptors, split the CIDs into
    training / validation / test sets, build the selected model, and train it
    with RMSprop plus a plateau-based learning-rate schedule. Validation
    metrics are printed every epoch; test metrics are recomputed whenever the
    validation R2 improves.
    """

    def _int_or_float(text):
        """Parse a split size given either as an integer or as a float."""
        try:
            return int(text)
        except ValueError:
            return float(text)

    parser = argparse.ArgumentParser(
        description='Graph Model for Dragon7 Descriptor Prediction')

    parser.add_argument('--model_type',
                        type=str,
                        default='mpnn',
                        help='type of convolutional graph model',
                        choices=['mpnn', 'gcn', 'gat'])
    parser.add_argument('--pooling',
                        type=str,
                        default='set2set',
                        help='global pooling layer for graph model',
                        choices=['set2set', 'attention'])
    parser.add_argument('--state_dim',
                        type=int,
                        default=256,
                        help='hidden state dimension for conv layers')
    parser.add_argument('--num_conv',
                        type=int,
                        default=3,
                        help='number of convolution operations')
    parser.add_argument('--num_dscrptr',
                        type=int,
                        default=100,
                        help='number of dragon7 descriptors for prediction')

    parser.add_argument('--init_lr', type=float, default=5e-4)
    parser.add_argument('--weight_decay',
                        type=float,
                        default=1e-5,
                        help='L2 regularization for nn weights')
    parser.add_argument('--lr_decay_patience',
                        type=int,
                        default=8,
                        help='decay patience for learning rate')
    parser.add_argument('--lr_decay_factor',
                        type=float,
                        default=0.5,
                        help='decay factor for learning rate')
    parser.add_argument('--max_num_epochs',
                        type=int,
                        default=100,
                        help='maximum number of epochs')

    # `type=int or float` evaluates to plain `int`, which rejects fractional
    # sizes; the helper accepts both forms and passes them to the splitter.
    parser.add_argument('--val_size', type=_int_or_float, default=10000)
    parser.add_argument('--tst_size', type=_int_or_float, default=10000)

    parser.add_argument('--no_cuda',
                        action='store_true',
                        help='disables CUDA training')
    parser.add_argument('--cuda_device',
                        type=int,
                        default=0,
                        help='CUDA device ID')
    parser.add_argument('--rand_state',
                        type=int,
                        default=0,
                        help='random state of numpy/sklearn/pytorch')

    args = parser.parse_args()
    print('Training Arguments:\n' + json.dumps(vars(args), indent=4))

    # Constants and initializations ###########################################
    use_cuda = torch.cuda.is_available() and (not args.no_cuda)
    # The device string must not contain a space ('cuda:0', not 'cuda: 0').
    device = torch.device(f'cuda:{args.cuda_device}' if use_cuda else 'cpu')
    print(f'Training on device {device}')

    # It seems that NVidia Apex is not compatible with PyG
    # amp_handle = amp.init(enabled=False)

    seed_random_state(args.rand_state)

    target_list = c.TARGET_D7_DSCRPTR_NAMES[:args.num_dscrptr]

    # Get the trn/val/tst dataset and dataloaders #############################
    print('Preparing CID-SMILES dictionary ... ')
    cid_smiles_csv_path = c.PCBA_CID_SMILES_CSV_PATH
    cid_smiles_df = pd.read_csv(cid_smiles_csv_path,
                                sep='\t',
                                header=0,
                                index_col=0,
                                dtype=str)
    cid_smiles_df.index = cid_smiles_df.index.map(str)
    cid_smiles_dict = cid_smiles_df.to_dict()['SMILES']
    del cid_smiles_df

    print('Preparing CID-dscrptr dictionary ... ')
    # The descriptor table is read in chunks; the chunks are stacked once at
    # the end instead of growing one array inside the loop.
    cid_list = []
    dscrptr_chunks = []
    for chunk_cid_dscrptr_df in pd.read_csv(
            c.PCBA_CID_TARGET_D7DSCPTR_CSV_PATH,
            sep='\t',
            header=0,
            index_col=0,
            usecols=['CID'] + target_list,
            dtype={
                'CID': str,
                **{t: np.float32
                   for t in target_list}
            },
            chunksize=2**16):

        chunk_cid_dscrptr_df.index = chunk_cid_dscrptr_df.index.map(str)
        cid_list.extend(chunk_cid_dscrptr_df.index)
        # `usecols` does not control column order, so select the columns
        # explicitly to line them up with `target_list`.
        dscrptr_chunks.append(chunk_cid_dscrptr_df[target_list].values)

    if dscrptr_chunks:
        dscrptr_array = np.vstack(dscrptr_chunks)
    else:
        dscrptr_array = np.empty((0, len(target_list)), dtype=np.float32)
    del dscrptr_chunks

    # Perform STD normalization for multi-target regression
    dscrptr_mean = np.mean(dscrptr_array, axis=0)
    dscrptr_std = np.std(dscrptr_array, axis=0)
    # A constant descriptor has zero spread; dividing by it would fill the
    # targets with NaN, so such columns are only centered.
    dscrptr_std[dscrptr_std == 0] = 1.0
    dscrptr_array = (dscrptr_array - dscrptr_mean) / dscrptr_std

    assert len(cid_list) == len(dscrptr_array)
    cid_dscrptr_dict = dict(zip(cid_list, dscrptr_array))

    print('Preparing datasets and dataloaders ... ')
    # List of CIDs for training, validation, and testing
    # Only CIDs present in both dictionaries are kept
    cid_list = sorted(cid_smiles_dict.keys() & cid_dscrptr_dict.keys(),
                      key=int)

    trn_cid_list, tst_cid_list = \
        train_test_split(cid_list,
                         test_size=args.tst_size,
                         random_state=args.rand_state)
    trn_cid_list, val_cid_list = \
        train_test_split(trn_cid_list,
                         test_size=args.val_size,
                         random_state=args.rand_state)

    # Datasets and dataloaders
    dataset_kwargs = {
        'target_list': target_list,
        'cid_smiles_dict': cid_smiles_dict,
        'cid_dscrptr_dict': cid_dscrptr_dict,
    }
    trn_dataset = GraphToDscrptrDataset(cid_list=trn_cid_list,
                                        **dataset_kwargs)
    val_dataset = GraphToDscrptrDataset(cid_list=val_cid_list,
                                        **dataset_kwargs)
    tst_dataset = GraphToDscrptrDataset(cid_list=tst_cid_list,
                                        **dataset_kwargs)

    # Worker processes are only used on CUDA runs. A positive `timeout` is
    # only meaningful with workers: PyTorch's single-process loader iterator
    # asserts `timeout == 0`, so the CPU path must not request one.
    num_workers = 4 if use_cuda else 0
    dataloader_kwargs = {
        'batch_size': 32,
        'timeout': 1 if num_workers > 0 else 0,
        'pin_memory': bool(use_cuda),
        'num_workers': num_workers,
    }
    trn_loader = pyg_data.DataLoader(trn_dataset,
                                     shuffle=True,
                                     **dataloader_kwargs)
    val_loader = pyg_data.DataLoader(val_dataset, **dataloader_kwargs)
    tst_loader = pyg_data.DataLoader(tst_dataset, **dataloader_kwargs)

    # Model, optimizer, and scheduler #########################################
    attention_pooling = (args.pooling == 'attention')

    # All three encoders take the same constructor arguments; anything that
    # is not GCN or GAT falls back to the MPNN.
    model_classes = {'GCN': EdgeGCNEncoder, 'GAT': EdgeGATEncoder}
    model_class = model_classes.get(args.model_type.upper(), MPNN)
    model = model_class(node_attr_dim=trn_dataset.node_attr_dim,
                        edge_attr_dim=trn_dataset.edge_attr_dim,
                        state_dim=args.state_dim,
                        num_conv=args.num_conv,
                        out_dim=len(target_list),
                        attention_pooling=attention_pooling).to(device)

    num_params = count_parameters(model)
    print(f'Model Summary (Number of Parameters: {num_params})\n{model}')

    optimizer = torch.optim.RMSprop(model.parameters(), lr=args.init_lr)
    # The scheduler is stepped with the validation R2, where larger is
    # better, so it must run in 'max' mode (the default 'min' would treat
    # every improvement as a plateau).
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer,
        mode='max',
        factor=args.lr_decay_factor,
        patience=args.lr_decay_patience,
        min_lr=1e-6)

    def train(loader):
        """Run one training epoch on `loader`; return the mean loss per graph."""
        model.train()
        loss_all = 0

        for data in loader:
            data = data.to(device)
            optimizer.zero_grad()
            loss = F.mse_loss(model(data), data.y.view(-1, len(target_list)))
            loss.backward()
            loss_all += loss.item() * data.num_graphs
            optimizer.step()
        # Normalize by the dataset that was actually iterated.
        return loss_all / len(loader.dataset)

    def test(loader):
        """Evaluate on `loader`; print per-target R2 / MAE and return their means.

        Predictions and targets are mapped back to the original descriptor
        scale before the metrics are computed.
        """
        model.eval()
        num_targets = len(target_list)
        mae_array = np.zeros(shape=(num_targets))
        # Start from an empty float64 block so the stacked result keeps the
        # same dtype and shape even when the loader yields nothing.
        trgt_chunks = [np.zeros(shape=(0, num_targets))]
        pred_chunks = [np.zeros(shape=(0, num_targets))]

        # No gradients are needed for evaluation.
        with torch.no_grad():
            for data in loader:

                data = data.to(device)
                pred = model(data)

                trgt = data.y.cpu().numpy().reshape(-1, num_targets)
                pred = pred.detach().cpu().numpy().reshape(-1, num_targets)

                # Undo the standardization
                trgt = trgt * dscrptr_std + dscrptr_mean
                pred = pred * dscrptr_std + dscrptr_mean

                trgt_chunks.append(trgt)
                pred_chunks.append(pred)
                mae_array += np.sum(np.abs(trgt - pred), axis=0)

        trgt_array = np.vstack(trgt_chunks)
        pred_array = np.vstack(pred_chunks)
        mae_array = mae_array / len(loader.dataset)

        r2_array = np.array([
            r2_score(y_pred=pred_array[:, i], y_true=trgt_array[:, i])
            for i in range(num_targets)
        ])

        for i, target in enumerate(target_list):
            print(f'Target Descriptor Name: {target:15s}, '
                  f'R2: {r2_array[i]:.4f}, MAE: {mae_array[i]:.4f}')

        return np.mean(r2_array), np.mean(mae_array)

    print('Training started.')
    best_val_r2 = None
    for epoch in range(1, args.max_num_epochs + 1):

        # Learning rate in effect for this epoch (read before stepping)
        lr = scheduler.optimizer.param_groups[0]['lr']
        loss = train(trn_loader)
        print('Validation ' + '#' * 80)
        val_r2, val_mae = test(val_loader)
        print('#' * 80)
        scheduler.step(val_r2)

        # The first epoch always sets the test metrics; afterwards they are
        # refreshed only when the validation R2 improves.
        if best_val_r2 is None or val_r2 > best_val_r2:
            best_val_r2 = val_r2
            print('Testing ' + '#' * 80)
            tst_r2, tst_mae = test(tst_loader)
            print('#' * 80)

        print(f'Epoch: {epoch:03d}, LR: {lr:6f}, Loss: {loss:.4f}, ',
              f'Validation R2: {val_r2:.4f} MAE: {val_mae:.4f}; ',
              f'Testing R2: {tst_r2:.4f} MAE: {tst_mae:.4f};')
示例#14
0
def main():
    """Entry point: run the task selected by the configuration.

    Depending on ``opts.task`` this trains a model ("train"), runs the
    traditional survival analysis ("tradition"), or evaluates a previously
    saved model (any other task value is handled as evaluation when it equals
    "eval"). Results are written below the ``opts.to`` directory.
    """
    # load config
    config = Config()
    opts = config.initialize()
    config.save(os.path.join(opts.to, "config.json"))
    print(config)

    # prepare dataset
    pre_transform = processes_dict[opts.processed_name]

    def _make_dataset(split_name):
        """Build the GenomicsData object for one split name."""
        return GenomicsData(opts.root_name,
                            opts.source_files,
                            pre_transform,
                            split_name,
                            opts.processed_name,
                            val_prop=opts.split[1],
                            test_prop=opts.split[2],
                            random_seed=opts.random_seed)

    datasets = {}
    if opts.task in ["train", "tradition"]:
        datasets["train"] = _make_dataset("train")
        # Test / validation splits exist only for a positive proportion.
        if opts.split[2] > 0.0:
            datasets["test"] = _make_dataset("test")
        if opts.split[1] > 0.0:
            datasets["val"] = _make_dataset("val")
    else:
        # NOTE(review): the evaluation dataset is built with the "train"
        # split name - confirm this is intended.
        datasets["eval"] = _make_dataset("train")

    # In evaluation mode there is no "train" entry, so the dimensions are
    # read from the "eval" dataset instead of raising a KeyError.
    reference = datasets["train"] if "train" in datasets else datasets["eval"]
    in_dim = reference.features_dim
    num_nodes = reference.num_nodes
    dataloaders = {
        k: pyg_data.DataLoader(dat,
                               batch_size=opts.batch_size,
                               shuffle=(k == "train"))
        for k, dat in datasets.items()
    }

    # traditional evaluation:
    if opts.task == "tradition":
        train_scores, test_scores = traditional_surv_analysis(datasets, opts)
        save_generally([train_scores, test_scores],
                       os.path.join(opts.to, "tradition.json"))
        print("")
        print("train:")
        print(train_scores)
        print("test:")
        print(test_scores)
        return

    # networks
    if opts.load_model is not None:
        model = torch.load(opts.load_model)
    else:
        model = DeepDynGNN(in_dim, num_nodes, opts)

    # criterion
    kwargs = {}
    if opts.criterion.lower() == "svm_loss":
        kwargs["r"] = opts.svm_loss_r
    criterion = losses_dict[opts.criterion.lower()](**kwargs)

    # scores
    scores = {s: scores_dict[s]() for s in opts.scores}

    if opts.task == "train":
        # Optimizer name -> factory taking (parameters, learning_rate).
        # "adammax" is the spelling this option has always used.
        optimizer_factories = {
            "adam": optim.Adam,
            "adamw": optim.AdamW,
            "adammax": optim.Adamax,
            "rms": optim.RMSprop,
            "momentum": lambda params, lr: optim.SGD(params, lr, momentum=0.9),
        }
        optimizer_name = opts.optimizer.lower()
        if optimizer_name not in optimizer_factories:
            raise NotImplementedError("%s is not implemented." %
                                      opts.optimizer)
        optimizer = optimizer_factories[optimizer_name](model.parameters(),
                                                        opts.learning_rate)

        # train model
        model, hist = train_val_test(model, criterion, optimizer, dataloaders,
                                     scores, opts)

        # save model
        save_generally(model, os.path.join(opts.to, "model.pth"))
        save_generally(hist, os.path.join(opts.to, "hist.json"))

    elif opts.task == "eval":
        assert opts.load_model is not None
        # predict model
        eval_loss, eval_scores, pred, target = eval_one_epoch(
            model, criterion, dataloaders["eval"], scores, opts.device)
        print("eval loss is : %.4f" % eval_loss)
        for k, v in eval_scores.items():
            print("eval %s is : %.4f" % (k, v))
        # save pred
        eval_scores.update({"loss": eval_loss})
        save_generally(eval_scores, os.path.join(opts.to, "eval_res.json"))
        save_generally(pred, os.path.join(opts.to, "pred.txt"))
        save_generally(target, os.path.join(opts.to, "target.txt"))
示例#15
0
def search(args):
    """Run a predictor-guided architecture search on NAS-Bench-101 or -201.

    The procedure is:

    1. Seed the random number generators and set up logging under
       ``args.log_dir``.
    2. Draw ``args.pool_size`` distinct random architectures and evaluate
       each of them through the benchmark wrapper.
    3. Fit the predictor for ``args.epochs`` epochs on the data returned by
       the benchmark wrapper's ``history_data()``.
    4. Score every enumerated architecture that is not in the initial pool
       and evaluate them in ascending order of predicted score until the
       wrapper's ``total_cost`` reaches ``args.time_budget`` or no
       candidates remain.
    5. Write every evaluated architecture and its lookup result to
       ``<log_dir>/selections``.

    Args:
        args: Namespace-like object (must provide ``_get_kwargs()``) with the
            attributes ``benchmark`` (``'101'`` or ``'201'``), ``seed``,
            ``log_dir``, ``nas_bench_path``, ``p_lr``, ``weight_decay``,
            ``pool_size``, ``train_batch_size``, ``epochs``, ``regression``,
            ``grad_clip``, ``step_batch_size`` and ``time_budget``; the
            ``'101'`` benchmark additionally reads ``average_all``.

    Returns:
        A tuple ``(best_arch, valid_acc, test_acc)`` where ``best_arch`` is
        what the benchmark wrapper's ``choose_best`` picks among all
        evaluated architectures, and the two accuracies come from the
        wrapper's ``valid_acc`` / ``test_acc`` for that architecture.

    Raises:
        ValueError: If ``args.benchmark`` is neither ``'101'`` nor ``'201'``.
    """
    # Fail fast: an unknown benchmark would otherwise only surface later as
    # a NameError on `predictor`, after log files were already created.
    if args.benchmark not in ('101', '201'):
        raise ValueError("unsupported benchmark: %r (expected '101' or '201')"
                         % (args.benchmark, ))

    # Benchmark-specific helpers are imported lazily so that only the
    # selected benchmark's dependencies need to be installed.
    if args.benchmark == '101':
        from search.nas_101_utils import spec2data, NASBench, Architect, OPS_FULL, N_NODES, MAX_EDGES
        from nasbench import api
    else:
        import nas_201_api as nb
        from search.nas_201_utils import train_and_eval, CifarBench, Architect, arch2data
        OP_SPACE = cell.SearchSpaceNames['nas-bench-201']
        N_NODES = 4

    def initialize_pool(bench, size):
        """Sample and evaluate ``size`` distinct random architectures.

        Returns the list of sampled ``Architect`` objects and the set of
        their benchmark architecture strings.
        """
        arch_pool = []
        seen_arch = set()

        while len(seen_arch) < size:
            arch = Architect().randomize_()
            arch_str = bench.arch_str(arch.struct)
            # Re-sample until the benchmark yields a string for the
            # architecture and that string has not been drawn before.
            while arch_str is None or arch_str in seen_arch:
                arch.randomize_()
                arch_str = bench.arch_str(arch.struct)
            seen_arch.add(arch_str)
            bench.eval_arch(arch_str)
            arch_pool.append(arch)

        return arch_pool, seen_arch

    # Seed every RNG in use and force deterministic cuDNN kernels.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    logdir = args.log_dir
    writer = SummaryWriter(log_dir=logdir)

    logger = get_logger(os.path.join(logdir, 'log'))
    logger.info('Arguments : -------------------------------')
    for name, value in args._get_kwargs():
        logger.info('{:16} : {:}'.format(name, value))

    if args.benchmark == '101':
        nas_bench = api.NASBench(args.nas_bench_path)
        cifar_bench = NASBench(nas_bench, average_all=args.average_all)
        predictor = Predictor(t_edge=1,
                              t_node=len(OPS_FULL),
                              n_node=N_NODES,
                              h_dim=64,
                              n_out=1).to('cuda')

        def enum_arch_data():
            """Yield ``(hash, graph_data)`` for every benchmark entry."""
            for h in cifar_bench._all_hashes:
                yield h, spec2data(cifar_bench.hash2spec(h))

    else:
        nas_bench = nb.NASBench201API(args.nas_bench_path)
        predictor = Predictor(len(OP_SPACE), 1, N_NODES, 64, 1).to('cuda')
        cifar_bench = CifarBench(nas_bench)

        def enum_arch_data():
            """Yield ``(arch_str, graph_data)`` once per unique structure."""
            duplicated = set()
            for idx in range(len(nas_bench)):
                archstr = nas_bench[idx]
                struct = gt.Structure.str2structure(archstr)
                unique_str = struct.to_unique_str(True)
                # Skip entries whose unique string was already yielded.
                if unique_str not in duplicated:
                    duplicated.add(unique_str)
                    yield archstr, arch2data(archstr)

    optim_p = torch.optim.Adam(predictor.parameters(),
                               args.p_lr,
                               weight_decay=args.weight_decay)

    logger.info("params size = %fM" % (count_parameters(predictor) / 1e6))
    logger.info("\n")

    logger.info("initialize arch pool")
    arch_pool, seen_arch = initialize_pool(cifar_bench, args.pool_size)

    # `history` records every evaluated architecture in evaluation order.
    history = [cifar_bench.arch_str(a.struct) for a in arch_pool]

    # logging initial samples
    best_arch_seen = cifar_bench.choose_best(seen_arch)
    logger.info("init pool: %d, seen arch: %d" %
                (len(arch_pool), len(seen_arch)))
    logger.info("simulated time cost: %f" % cifar_bench.total_cost)
    logger.info("best initial arch:")
    cifar_bench.log_arch(best_arch_seen, 0, 'acc_best', logger, writer)

    logger.info('start training predictor')

    train_loader = gd.DataListLoader(cifar_bench.history_data(),
                                     args.train_batch_size,
                                     shuffle=True)

    for epoch in tqdm(range(args.epochs)):
        loss = predictor.fit(train_loader, optim_p, 0, None, args.regression,
                             args.grad_clip, 0)
        writer.add_scalar('loss_r', loss, epoch)

    logger.info('preparing valid data')
    candidates = list(
        tqdm(filter(lambda v: v[0] not in seen_arch, enum_arch_data())))

    if candidates:
        all_arch, all_data = zip(*candidates)
        pred_results = []
        pred_loader = gd.DataLoader(all_data,
                                    batch_size=args.step_batch_size)
        with torch.no_grad():
            for batch in tqdm(pred_loader, total=len(pred_loader)):
                batch = batch.to('cuda')
                pred_results.append(predictor(batch).cpu().numpy())
        pred_results = np.concatenate(pred_results, axis=0).flatten()
        # Ascending order: the candidate with the smallest predicted value
        # is evaluated first.
        # NOTE(review): presumably the predictor output is "lower is
        # better" -- confirm against Predictor.fit.
        arg_rank = np.argsort(pred_results)
    else:
        # Nothing left to rank; unpacking/concatenating an empty candidate
        # list would raise, so skip prediction entirely.
        all_arch = ()
        arg_rank = np.empty(0, dtype=int)

    # Evaluate candidates by rank until the budget is spent. The position
    # bound prevents an IndexError when candidates run out first.
    next_pos = 0
    while (cifar_bench.total_cost < args.time_budget
           and next_pos < len(arg_rank)):
        cur_index = arg_rank[next_pos]
        next_pos += 1
        logger.info("current time cost: %f" % cifar_bench.total_cost)
        logger.info("arch to eval: %s" % all_arch[cur_index])
        cifar_bench.eval_arch(all_arch[cur_index])
        history.append(all_arch[cur_index])

    if cifar_bench.total_cost < args.time_budget:
        logger.info("all candidates evaluated before reaching time budget")

    best_arch_seen = cifar_bench.choose_best(history)

    with open(os.path.join(logdir, 'selections'), 'w') as f:
        for a in history:
            f.write(a + ',' + str(cifar_bench.lookup(a)))
            f.write('\n')

    # Flush pending TensorBoard events and release the event file.
    writer.close()

    return best_arch_seen, cifar_bench.valid_acc(
        best_arch_seen), cifar_bench.test_acc(best_arch_seen)