示例#1
0
def load_MolecueNet(data_args):
    """Load the MoleculeNet dataset described by ``data_args``.

    The requested name is matched case-insensitively against the keys of
    ``MoleculeNet.names``. Node features are cast to float; labels are
    squeezed and cast to long.

    Note: multi-task datasets are not handled yet.

    Args:
        data_args: object exposing ``dataset_dir`` (passed to ``MoleculeNet``
            as ``root``) and ``dataset_name``.

    Returns:
        The ``MoleculeNet`` dataset, with ``node_type_dict`` and
        ``node_color`` both set to ``None``.

    Raises:
        KeyError: if the lower-cased ``dataset_name`` matches no known name.
    """
    # Map lower-cased names back to the spelling MoleculeNet uses as keys.
    canonical_names = {}
    for known_name in MoleculeNet.names.keys():
        canonical_names[known_name.lower()] = known_name

    root_dir = data_args.dataset_dir
    canonical = canonical_names[data_args.dataset_name.lower()]
    dataset = MoleculeNet(root=root_dir, name=canonical)

    features = dataset.data.x
    dataset.data.x = features.float()

    labels = dataset.data.y
    dataset.data.y = labels.squeeze().long()

    dataset.node_type_dict = None
    dataset.node_color = None
    return dataset
示例#2
0
def load_MolecueNet(dataset_dir, dataset_name, task=None):
    """Load a MoleculeNet dataset by (case-insensitive) name.

    Node features are cast to float. Labels are cast to long after either
    squeezing them (``task is None``) or indexing them with ``task``.

    Note: multi-task datasets are not fully handled yet.

    Args:
        dataset_dir: directory passed to ``MoleculeNet`` as ``root``.
        dataset_name: dataset name, matched case-insensitively against the
            keys of ``MoleculeNet.names``.
        task: optional index applied as ``dataset.data.y[task]``.

    Returns:
        The ``MoleculeNet`` dataset, with ``node_type_dict`` and
        ``node_color`` both set to ``None``.

    Raises:
        KeyError: if the lower-cased ``dataset_name`` matches no known name.
    """
    # Map lower-cased names back to the spelling MoleculeNet uses as keys.
    canonical_names = {}
    for known_name in MoleculeNet.names.keys():
        canonical_names[known_name.lower()] = known_name

    dataset = MoleculeNet(root=dataset_dir,
                          name=canonical_names[dataset_name.lower()])

    features = dataset.data.x
    dataset.data.x = features.float()

    labels = dataset.data.y
    if task is not None:
        # NOTE(review): this indexes the FIRST dimension of y. If y is laid
        # out as (num_graphs, num_tasks), `labels[:, task]` may have been
        # intended -- confirm against callers before changing.
        dataset.data.y = labels[task].long()
    else:
        dataset.data.y = labels.squeeze().long()

    dataset.node_type_dict = None
    dataset.node_color = None
    return dataset
示例#3
0
def get_split(dataset_name, split, experiment):
    """Build the dataset object for ``dataset_name`` and fill it with a saved split.

    The dataset object is constructed first; its ``data`` and ``slices`` are
    then replaced by the tuple stored at
    ``runs/<dataset_name lower-cased>/<experiment>/splits/<split>.pth``.

    Args:
        dataset_name: one of ``'tox21'``, ``'esol'`` or ``'cycliq'``
            (case-insensitive).
        split: file stem of the saved split (e.g. the name it was saved under).
        experiment: experiment directory name under ``runs/<dataset>/``.

    Returns:
        The dataset object holding the loaded split.

    Raises:
        ValueError: if ``dataset_name`` is not a supported dataset. Previously
            this case fell through and crashed with ``UnboundLocalError``.
    """
    key = dataset_name.lower()

    if key == 'tox21':
        ds = TUDataset('data/tox21',
                       name='Tox21_AhR_testing',
                       pre_transform=lambda sample: pre_transform(sample, 2))
    elif key == 'esol':
        ds = MoleculeNet('data/esol', name='ESOL')
    elif key == 'cycliq':
        ds = CYCLIQ('data/cycliq-evo', name=dataset_name.upper())
    else:
        # Fail early and clearly instead of hitting an unbound `ds` below.
        raise ValueError(
            f"Unsupported dataset {dataset_name!r}; "
            "expected one of 'tox21', 'esol', 'cycliq'."
        )

    ds.data, ds.slices = torch.load(f"runs/{key}/{experiment}/splits/{split}.pth")

    return ds
示例#4
0
 def _molenet(self, task):
     """Load a MoleculeNet task and standardize its targets.

     The targets ``dataset.data.y`` are replaced by ``(y - mean) / std``,
     where mean and std are taken over all target values.

     Args:
         task: dataset name passed to ``MoleculeNet``.

     Returns:
         Tuple ``(dataset, std, 9, 3)`` where ``std`` is the target standard
         deviation as a Python number. The constants 9 and 3 are returned
         as-is; their meaning is not visible here (presumably feature
         sizes -- confirm against the caller).
     """
     dataset = MoleculeNet('data/MolNet', task, transform=MolNetTransformer())

     targets = dataset.data.y
     mean, std = targets.mean(), targets.std()
     dataset.data.y = (targets - mean) / std

     return dataset, std.item(), 9, 3
示例#5
0
def _preprocess_esol(experiment_name, batch_size):
    """Shuffle ESOL, split it into train/val/test and persist the splits.

    With ``n = len(dataset) // 10``, the first ``n`` shuffled samples form the
    validation set, the next ``n`` the test set and the remainder the training
    set. Each split's ``(data, slices)`` tuple is saved under
    ``runs/esol/<experiment_name>/splits/``.

    Args:
        experiment_name: directory name used in the output paths.
        batch_size: batch size for the three returned loaders.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader, train, val, test,
        num_features, num_classes)`` where ``num_features`` is the maximum
        over the three splits and ``num_classes`` comes from the train split.
    """
    dataset = MoleculeNet('data/esol', name='ESOL')

    samples = []
    for idx in range(len(dataset)):
        samples.append(dataset.get(idx))
    random.shuffle(samples)

    tenth = len(samples) // 10
    val_data = samples[:tenth]
    test_data = samples[tenth:2 * tenth]
    train_data = samples[2 * tenth:]

    # The copies are taken before `dataset` itself is overwritten with the
    # training portion (train is the very same object as dataset).
    train, val, test = dataset, dataset.copy(), dataset.copy()

    for subset, graphs in ((train, train_data), (val, val_data), (test, test_data)):
        subset.data, subset.slices = train.collate(graphs)

    outputs = (
        (train, f'runs/esol/{experiment_name}/splits/train.pth'),
        (val, f'runs/esol/{experiment_name}/splits/val.pth'),
        (test, f'runs/esol/{experiment_name}/splits/test.pth'),
    )
    for subset, target_path in outputs:
        torch.save((subset.data, subset.slices), target_path)

    loaders = [DataLoader(subset, batch_size=batch_size) for subset in (train, val, test)]
    feature_dim = max(train.num_features, val.num_features, test.num_features)

    return (*loaders, train, val, test, feature_dim, train.num_classes)
示例#6
0
import torch
from torch_geometric.data import Data
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
from torch_geometric.datasets import TUDataset, Planetoid, MoleculeNet
from torch_geometric.data import DataLoader

# Load the ESOL dataset, rooted at the current directory.
dataset = MoleculeNet(root=".", name="ESOL")

# Summary of the dataset and of its first sample.
print("Dataset features: ", dataset.num_features)
print("Dataset target: ", dataset.num_classes)
# `dataset.len` is a method; printing it showed the bound-method repr rather
# than the number of samples, so use len(dataset) instead.
print("Dataset length: ", len(dataset))
print("Dataset sample: ", dataset[0])
print("Sample  nodes: ", dataset[0].num_nodes)
print("Sample  edges: ", dataset[0].num_edges)
print("Edges indexes: ", dataset[0].edge_index.t())

# Node feature matrix of the first sample.
print(dataset[0].x)

# Edge list of the first sample, one (source, target) pair per row.
print(dataset[0].edge_index.t())

from rdkit import Chem
from rdkit.Chem import Draw
# Rebuild the first molecule from its SMILES string and render it to an image.
molecule = Chem.MolFromSmiles(dataset[0]["smiles"])

fig = Draw.MolToImage(molecule, size=(360, 360))

fig.save('/home/anaconda3/work//molecule_first.png')
示例#7
0
def _record_feature_dims(dataset):
    """Cast node features to float32 and store the feature sizes on ``data_args``."""
    dataset.data.x = dataset.data.x.to(torch.float32)
    data_args.dim_node = dataset.num_node_features
    data_args.dim_edge = dataset.num_edge_features


def _select_target_and_split(dataset):
    """Keep only the ``data_args.target_idx`` label column and randomly split ``dataset``."""
    assert data_args.target_idx != -1, 'Explaining on multi tasks is meaningless.'
    # Valid column indices are 0 .. shape[1] - 1, hence the strict comparison
    # (the former `<=` let an out-of-range index through to an IndexError).
    assert data_args.target_idx < dataset.data.y.shape[
        1], 'No such target in the dataset.'

    dataset.data.y = dataset.data.y[:, data_args.target_idx]
    data_args.num_targets = 1

    dataset_len = len(dataset)
    train_len = int(dataset_len * data_args.dataset_split[0])
    val_len = int(dataset_len * data_args.dataset_split[1])
    # The test split takes the remainder so the three lengths sum to dataset_len.
    test_len = dataset_len - train_len - val_len
    train_set, val_set, test_set = \
        random_split(dataset, [train_len, val_len, test_len])

    return {'train': train_set, 'val': val_set, 'test': test_set}


def _graph_level_splits(dataset):
    """Configure ``data_args`` for a graph-level dataset and return its random splits."""
    _record_feature_dims(dataset)
    # num_classes is read BEFORE the label column is selected below.
    data_args.num_targets = dataset.num_classes  # This so-called classes are actually targets.

    # Define models' output shape.
    if Metric.cur_task == 'bcs':
        data_args.num_classes = 2
    elif Metric.cur_task == 'reg':
        data_args.num_classes = 1

    return _select_target_and_split(dataset)


def load_dataset(name: str) -> dict:
    """
    Load a dataset and split it into train / validation / test parts.

    Side effects: sets the loss and score functions on ``Metric`` from the
    dataset name and fills several fields of the module-level ``data_args``
    (dataset type, model level, feature sizes, number of targets/classes).
    For an unknown name, an error line is printed and the process exits with
    status 1.

    :param name: dataset's name (case-insensitive). Possible options:
    ("ESOL", "FreeSolv", "Lipo", "PCBA", "MUV", "HIV", "BACE", "BBBP",
    "Tox21", "ToxCast", "SIDER", "ClinTox", "ba_lrp", "ba_shape")
    :return: dict with keys 'train', 'val' and 'test'. For graph-level
    datasets the values come from ``random_split``; for 'ba_shape' they are
    copies of the dataset whose ``mask`` selects the respective nodes.
    """
    molecule_set = tuple(x.lower() for x in (
        "ESOL", "FreeSolv", "Lipo", "PCBA", "MUV", "HIV", "BACE", "BBBP",
        "Tox21", "ToxCast", "SIDER", "ClinTox"
    ))
    name = name.lower()

    # set Metrics: loss and score based on dataset's name
    Metric.set_loss_func(name)
    Metric.set_score_func(name)

    # TODO: use transform to augment data
    if name in molecule_set:
        data_args.dataset_type = 'mol'
        data_args.model_level = 'graph'

        dataset = MoleculeNet(root=os.path.abspath(
            os.path.join(ROOT_DIR, '..', 'datasets')),
                              name=name)
        return _graph_level_splits(dataset)

    elif name == 'ba_lrp':
        data_args.dataset_type = 'syn'
        data_args.model_level = 'graph'

        dataset = BA_LRP(root=os.path.join(ROOT_DIR, '..', 'datasets',
                                           'ba_lrp'),
                         num_per_class=10000)
        return _graph_level_splits(dataset)

    elif name == 'ba_shape':
        data_args.dataset_type = 'syn'
        data_args.model_level = 'node'

        dataset = BA_Shape(root=os.path.join(ROOT_DIR, '..', 'datasets',
                                             'ba_shape'),
                           num_base_node=300,
                           num_shape=80)
        _record_feature_dims(dataset)
        data_args.num_targets = 1

        # Define models' output shape.
        if Metric.cur_task == 'bcs':
            data_args.num_classes = 2
        elif Metric.cur_task == 'reg':
            data_args.num_classes = 1
        else:
            data_args.num_classes = dataset.num_classes

        assert data_args.target_idx != -1, 'Explaining on multi tasks is meaningless.'

        # model_level was just set to 'node', so only the mask-based
        # node-level path applies: every split shares the same graph and
        # differs only in which mask is exposed as `mask`.
        train_set = dataset
        val_set = copy.deepcopy(dataset)
        test_set = copy.deepcopy(dataset)
        train_set.data.mask = train_set.data.train_mask
        train_set.slices['mask'] = train_set.slices['train_mask']
        val_set.data.mask = val_set.data.val_mask
        val_set.slices['mask'] = val_set.slices['val_mask']
        test_set.data.mask = test_set.data.test_mask
        test_set.slices['mask'] = test_set.slices['test_mask']
        return {'train': train_set, 'val': val_set, 'test': test_set}

    print(f'#E#Dataset {name} does not exist.')
    sys.exit(1)
示例#8
0
                [single, double, triple, aromatic, conjugation, ring] + stereo)

            edge_attrs += [edge_attr, edge_attr]

        if len(edge_attrs) == 0:
            data.edge_index = torch.zeros((2, 0), dtype=torch.long)
            data.edge_attr = torch.zeros((0, 10), dtype=torch.float)
        else:
            data.edge_index = torch.tensor(edge_indices).t().contiguous()
            data.edge_attr = torch.stack(edge_attrs, dim=0)

        return data


# Dataset directory: <directory of this file>/../data/AFP_Mol
path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'AFP_Mol')
dataset = MoleculeNet(path, name='ESOL', pre_transform=GenFeatures())
dataset = dataset.shuffle()

# First tenth -> validation, second tenth -> test, remainder -> training.
N = len(dataset) // 10
val_dataset, test_dataset, train_dataset = (
    dataset[:N],
    dataset[N:2 * N],
    dataset[2 * N:],
)

# Only the training loader shuffles; every loader uses batches of 200.
train_loader = DataLoader(train_dataset, batch_size=200, shuffle=True)
val_loader, test_loader = (
    DataLoader(val_dataset, batch_size=200),
    DataLoader(test_dataset, batch_size=200),
)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = AttentiveFP(in_channels=39,
                    hidden_channels=200,
                    out_channels=1,
                    edge_dim=10,
示例#9
0
 def __init__(self, task, f1_alpha, f2_alpha, f3_alpha):
     """Create the sampler over the MoleculeNet dataset named ``task``.

     The dataset is loaded from ``data/MolNet`` and handed to the parent
     initializer together with the three alpha values, in the same order.

     Args:
         task: dataset name passed to ``MoleculeNet``.
         f1_alpha: forwarded unchanged to the parent initializer.
         f2_alpha: forwarded unchanged to the parent initializer.
         f3_alpha: forwarded unchanged to the parent initializer.
     """
     molnet = MoleculeNet('data/MolNet', task)
     alphas = (f1_alpha, f2_alpha, f3_alpha)
     super(MoleNetSampler, self).__init__(molnet, *alphas)