def load_MolecueNet(data_args):
    """Attention: multi-task problems are not handled yet."""
    molecule_net_dataset_names = {name.lower(): name for name in MoleculeNet.names.keys()}
    dataset = MoleculeNet(root=data_args.dataset_dir,
                          name=molecule_net_dataset_names[data_args.dataset_name.lower()])
    # Chem.PeriodicTable.GetElementSymbol()
    dataset.data.x = dataset.data.x.float()
    dataset.data.y = dataset.data.y.squeeze().long()
    dataset.node_type_dict = None
    dataset.node_color = None
    return dataset
def load_MolecueNet(dataset_dir, dataset_name, task=None):
    """Attention: multi-task problems are not handled yet."""
    molecule_net_dataset_names = {name.lower(): name for name in MoleculeNet.names.keys()}
    dataset = MoleculeNet(root=dataset_dir,
                          name=molecule_net_dataset_names[dataset_name.lower()])
    dataset.data.x = dataset.data.x.float()
    if task is None:
        dataset.data.y = dataset.data.y.squeeze().long()
    else:
        # y has shape [num_graphs, num_tasks]: select the task column, not a row.
        dataset.data.y = dataset.data.y[:, task].long()
    dataset.node_type_dict = None
    dataset.node_color = None
    return dataset
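A minimal usage sketch for the loader above; the `data/molnet` root and the task index are illustrative assumptions, not values from the source.

# Hypothetical usage of load_MolecueNet; the paths and task index are assumptions.
tox21 = load_MolecueNet('data/molnet', 'tox21', task=0)  # one Tox21 task column
esol = load_MolecueNet('data/molnet', 'esol')            # single-task dataset
print(tox21.data.y.shape, esol.data.y.shape)
# Note: Tox21 contains unlabeled (NaN) entries, which .long() does not handle.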
def get_split(dataset_name, split, experiment):
    if dataset_name.lower() == 'tox21':
        ds = TUDataset('data/tox21', name='Tox21_AhR_testing',
                       pre_transform=lambda sample: pre_transform(sample, 2))
    elif dataset_name.lower() == 'esol':
        ds = MoleculeNet('data/esol', name='ESOL')
    elif dataset_name.lower() == 'cycliq':
        ds = CYCLIQ('data/cycliq-evo', name=dataset_name.upper())
    # Overwrite the dataset contents with the previously saved split.
    ds.data, ds.slices = torch.load(f"runs/{dataset_name.lower()}/{experiment}/splits/{split}.pth")
    return ds
def _molenet(self, task):
    dataset = MoleculeNet('data/MolNet', task, transform=MolNetTransformer())
    # Standardize regression targets to zero mean and unit variance.
    mean = dataset.data.y.mean()
    std = dataset.data.y.std()
    dataset.data.y = (dataset.data.y - mean) / std
    # Returns the dataset, the target std (for un-scaling predictions), and
    # what appear to be the node/edge feature dimensions (PyG's raw MoleculeNet
    # encoding uses 9 atom features and 3 bond features).
    return dataset, std.item(), 9, 3
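Since only the std is returned, a metric computed in standardized space can be converted back to the original target units by multiplying; a minimal sketch, assuming a trained `model` and a `loader` exist (neither comes from the source):

# Hypothetical evaluation: un-scale a normalized RMSE back to the target's units.
# Only `std` comes from _molenet(); `model` and `loader` are assumptions.
import torch

def rmse_in_original_units(model, loader, std):
    errors = []
    for batch in loader:
        pred = model(batch)  # predictions in standardized space
        errors.append((pred.view(-1) - batch.y.view(-1)) ** 2)
    rmse_norm = torch.cat(errors).mean().sqrt()
    return (rmse_norm * std).item()  # multiply by std to undo the scaling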
def _preprocess_esol(experiment_name, batch_size):
    dataset = MoleculeNet('data/esol', name='ESOL')
    data_list = [dataset.get(idx) for idx in range(len(dataset))]
    random.shuffle(data_list)

    # 80/10/10 split: the first tenth is validation, the next tenth
    # (carved out of the shuffled remainder) is test, the rest is train.
    n = len(data_list) // 10
    train_data = data_list[n:]
    val_data = data_list[:n]
    test_data = train_data[:n]
    train_data = train_data[n:]

    train = dataset  # reuses the original dataset object in place
    val = dataset.copy()
    test = dataset.copy()
    train.data, train.slices = train.collate(train_data)
    val.data, val.slices = train.collate(val_data)
    test.data, test.slices = train.collate(test_data)

    torch.save((train.data, train.slices), f'runs/esol/{experiment_name}/splits/train.pth')
    torch.save((val.data, val.slices), f'runs/esol/{experiment_name}/splits/val.pth')
    torch.save((test.data, test.slices), f'runs/esol/{experiment_name}/splits/test.pth')

    return (
        DataLoader(train, batch_size=batch_size),
        DataLoader(val, batch_size=batch_size),
        DataLoader(test, batch_size=batch_size),
        train,
        val,
        test,
        max(train.num_features, val.num_features, test.num_features),
        train.num_classes,
    )
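The files saved here are exactly what `get_split` above reloads; a minimal round-trip sketch, assuming the `runs/esol/exp1/splits/` directory exists (the experiment name and batch size are illustrative):

# Hypothetical round trip: preprocess once, then reload a split by name.
train_loader, val_loader, test_loader, *rest = _preprocess_esol('exp1', batch_size=64)
val_set = get_split('esol', 'val', 'exp1')  # reloads runs/esol/exp1/splits/val.pth
print(len(val_set))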
import torch
from torch_geometric.datasets import MoleculeNet
from rdkit import Chem
from rdkit.Chem import Draw

dataset = MoleculeNet(root=".", name="ESOL")
print("Dataset features: ", dataset.num_features)
print("Dataset target: ", dataset.num_classes)
print("Dataset length: ", len(dataset))
print("Dataset sample: ", dataset[0])
print("Sample nodes: ", dataset[0].num_nodes)
print("Sample edges: ", dataset[0].num_edges)
print("Edges indexes: ", dataset[0].edge_index.t())
print(dataset[0].x)

# Draw the first molecule from its SMILES string.
molecule = Chem.MolFromSmiles(dataset[0]["smiles"])
fig = Draw.MolToImage(molecule, size=(360, 360))
fig.save('/home/anaconda3/work/molecule_first.png')
def load_dataset(name: str) -> dict:
    """
    Load dataset.

    :param name: dataset name. Possible options: ("ESOL", "FreeSolv", "Lipo", "PCBA",
        "MUV", "HIV", "BACE", "BBBP", "Tox21", "ToxCast", "SIDER", "ClinTox")
    :return: dict of train/val/test torch_geometric dataset splits
    """
    molecule_set = [
        "ESOL", "FreeSolv", "Lipo", "PCBA", "MUV", "HIV",
        "BACE", "BBBP", "Tox21", "ToxCast", "SIDER", "ClinTox",
    ]
    molecule_set = [x.lower() for x in molecule_set]
    name = name.lower()

    # Set metrics: loss and score based on the dataset's name.
    Metric.set_loss_func(name)
    Metric.set_score_func(name)

    # TODO: use a transform to augment the data.
    if name in molecule_set:
        data_args.dataset_type = 'mol'
        data_args.model_level = 'graph'

        dataset = MoleculeNet(root=os.path.abspath(os.path.join(ROOT_DIR, '..', 'datasets')),
                              name=name)
        dataset.data.x = dataset.data.x.to(torch.float32)
        data_args.dim_node = dataset.num_node_features
        data_args.dim_edge = dataset.num_edge_features
        data_args.num_targets = dataset.num_classes  # These so-called classes are actually targets.

        # Define the model's output shape.
        if Metric.cur_task == 'bcs':
            data_args.num_classes = 2
        elif Metric.cur_task == 'reg':
            data_args.num_classes = 1

        # Pick one target column out of the (possibly multi-task) label matrix.
        assert data_args.target_idx != -1, 'Explaining on multiple tasks is meaningless.'
        assert data_args.target_idx < dataset.data.y.shape[1], 'No such target in the dataset.'
        dataset.data.y = dataset.data.y[:, data_args.target_idx]
        data_args.num_targets = 1

        dataset_len = len(dataset)
        dataset_split = [int(dataset_len * data_args.dataset_split[0]),
                         int(dataset_len * data_args.dataset_split[1]),
                         0]
        dataset_split[2] = dataset_len - dataset_split[0] - dataset_split[1]
        train_set, val_set, test_set = random_split(dataset, dataset_split)

        return {'train': train_set, 'val': val_set, 'test': test_set}

    elif name == 'ba_lrp':
        data_args.dataset_type = 'syn'
        data_args.model_level = 'graph'

        dataset = BA_LRP(root=os.path.join(ROOT_DIR, '..', 'datasets', 'ba_lrp'),
                         num_per_class=10000)
        dataset.data.x = dataset.data.x.to(torch.float32)
        data_args.dim_node = dataset.num_node_features
        data_args.dim_edge = dataset.num_edge_features
        data_args.num_targets = dataset.num_classes  # These so-called classes are actually targets.

        # Define the model's output shape.
        if Metric.cur_task == 'bcs':
            data_args.num_classes = 2
        elif Metric.cur_task == 'reg':
            data_args.num_classes = 1

        assert data_args.target_idx != -1, 'Explaining on multiple tasks is meaningless.'
        assert data_args.target_idx < dataset.data.y.shape[1], 'No such target in the dataset.'
        dataset.data.y = dataset.data.y[:, data_args.target_idx]
        data_args.num_targets = 1

        dataset_len = len(dataset)
        dataset_split = [int(dataset_len * data_args.dataset_split[0]),
                         int(dataset_len * data_args.dataset_split[1]),
                         0]
        dataset_split[2] = dataset_len - dataset_split[0] - dataset_split[1]
        train_set, val_set, test_set = random_split(dataset, dataset_split)

        return {'train': train_set, 'val': val_set, 'test': test_set}

    elif name == 'ba_shape':
        data_args.dataset_type = 'syn'
        data_args.model_level = 'node'

        dataset = BA_Shape(root=os.path.join(ROOT_DIR, '..', 'datasets', 'ba_shape'),
                           num_base_node=300, num_shape=80)
        dataset.data.x = dataset.data.x.to(torch.float32)
        data_args.dim_node = dataset.num_node_features
        data_args.dim_edge = dataset.num_edge_features
        data_args.num_targets = 1

        # Define the model's output shape.
        if Metric.cur_task == 'bcs':
            data_args.num_classes = 2
        elif Metric.cur_task == 'reg':
            data_args.num_classes = 1
        else:
            data_args.num_classes = dataset.num_classes

        assert data_args.target_idx != -1, 'Explaining on multiple tasks is meaningless.'
        if data_args.model_level != 'node':
            assert data_args.target_idx < dataset.data.y.shape[1], 'No such target in the dataset.'
            dataset.data.y = dataset.data.y[:, data_args.target_idx]
            data_args.num_targets = 1

            dataset_len = len(dataset)
            dataset_split = [int(dataset_len * data_args.dataset_split[0]),
                             int(dataset_len * data_args.dataset_split[1]),
                             0]
            dataset_split[2] = dataset_len - dataset_split[0] - dataset_split[1]
            train_set, val_set, test_set = random_split(dataset, dataset_split)
            return {'train': train_set, 'val': val_set, 'test': test_set}
        else:
            # Node-level task: reuse the full graph three times and expose the
            # appropriate boolean mask under a common 'mask' key.
            train_set = dataset
            val_set = copy.deepcopy(dataset)
            test_set = copy.deepcopy(dataset)
            train_set.data.mask = train_set.data.train_mask
            train_set.slices['mask'] = train_set.slices['train_mask']
            val_set.data.mask = val_set.data.val_mask
            val_set.slices['mask'] = val_set.slices['val_mask']
            test_set.data.mask = test_set.data.test_mask
            test_set.slices['mask'] = test_set.slices['test_mask']
            return {'train': train_set, 'val': val_set, 'test': test_set}

    print(f'#E#Dataset {name} does not exist.')
    sys.exit(1)
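A minimal sketch of driving `load_dataset`, assuming `data_args` has been populated elsewhere and the same PyG imports as above; the field values below are illustrative, not from the source:

# Hypothetical driver for load_dataset; the data_args values are assumptions.
data_args.target_idx = 0
data_args.dataset_split = [0.8, 0.1]  # train/val fractions; test gets the rest
splits = load_dataset('bbbp')
train_loader = DataLoader(splits['train'], batch_size=32, shuffle=True)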
# (Excerpt: the start of GenFeatures' bond-featurization loop is cut off above.)
                [single, double, triple, aromatic, conjugation, ring] + stereo)
            edge_attrs += [edge_attr, edge_attr]

        if len(edge_attrs) == 0:
            data.edge_index = torch.zeros((2, 0), dtype=torch.long)
            data.edge_attr = torch.zeros((0, 10), dtype=torch.float)
        else:
            data.edge_index = torch.tensor(edge_indices).t().contiguous()
            data.edge_attr = torch.stack(edge_attrs, dim=0)

        return data


path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'AFP_Mol')
dataset = MoleculeNet(path, name='ESOL', pre_transform=GenFeatures()).shuffle()

# 10%/10%/80% val/test/train split.
N = len(dataset) // 10
val_dataset = dataset[:N]
test_dataset = dataset[N:2 * N]
train_dataset = dataset[2 * N:]

train_loader = DataLoader(train_dataset, batch_size=200, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=200)
test_loader = DataLoader(test_dataset, batch_size=200)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AttentiveFP(in_channels=39, hidden_channels=200, out_channels=1,
                    edge_dim=10, num_layers=2, num_timesteps=2,
                    dropout=0.2).to(device)  # call was truncated in the source;
                                             # num_layers/num_timesteps/dropout are
                                             # assumed from the reference AttentiveFP example
def __init__(self, task, f1_alpha, f2_alpha, f3_alpha):
    dataset = MoleculeNet('data/MolNet', task)
    super(MoleNetSampler, self).__init__(dataset, f1_alpha, f2_alpha, f3_alpha)