def __init__(self, data, vocab, training=True):
    dir = get_download_dir()
    _url = _get_dgl_url('dataset/jtnn.zip')
    zip_file_path = '{}/jtnn.zip'.format(dir)
    download(_url, path=zip_file_path)
    extract_archive(zip_file_path, '{}/jtnn'.format(dir))

    print('Loading data...')
    if data in ['train', 'test']:
        # ZINC subset
        data_file = '{}/jtnn/{}.txt'.format(dir, data)
    else:
        # New dataset
        data_file = data
    with open(data_file) as f:
        self.data = [line.strip("\r\n ").split()[0] for line in f]
    self.vocab = vocab
    print('Loading finished')
    print('\t# samples:', len(self.data))

    self.training = training
    self.atom_featurizer_enc = get_atom_featurizer_enc()
    self.bond_featurizer_enc = get_bond_featurizer_enc()
    self.atom_featurizer_dec = get_atom_featurizer_dec()
    self.bond_featurizer_dec = get_bond_featurizer_dec()
def __init__(self, smiles_to_graph=smiles_to_bigraph,
             node_featurizer=None,
             edge_featurizer=None,
             load=False,
             log_every=1000,
             cache_file_path='./tox21_dglgraph.bin',
             n_jobs=1):
    self._url = 'dataset/tox21.csv.gz'
    data_path = get_download_dir() + '/tox21.csv.gz'
    download(_get_dgl_url(self._url), path=data_path, overwrite=False)
    df = pd.read_csv(data_path)
    self.id = df['mol_id']
    df = df.drop(columns=['mol_id'])
    self.load_full = False

    super(Tox21, self).__init__(df, smiles_to_graph, node_featurizer, edge_featurizer,
                                "smiles", cache_file_path,
                                load=load, log_every=log_every, n_jobs=n_jobs)

    self.id = [self.id[i] for i in self.valid_ids]
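# Hedged usage sketch for the Tox21 dataset above. It assumes this class is
# exposed as dgllife.data.Tox21 and that CanonicalAtomFeaturizer is available
# from dgllife.utils; adjust the imports if the actual package layout differs.
from dgllife.data import Tox21
from dgllife.utils import CanonicalAtomFeaturizer

dataset = Tox21(node_featurizer=CanonicalAtomFeaturizer())
# With the default setting, each item is
# (SMILES string, DGLGraph, label tensor, mask tensor).
smiles, g, labels, mask = dataset[0]
print('Number of molecules:', len(dataset))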
def __init__(self, smiles_to_graph=smiles_to_bigraph,
             node_featurizer=None,
             edge_featurizer=None,
             load=False,
             log_every=1000,
             cache_file_path='./sider_dglgraph.bin',
             n_jobs=1):
    self._url = 'dataset/sider.zip'
    data_path = get_download_dir() + '/sider.zip'
    dir_path = get_download_dir() + '/sider'
    download(_get_dgl_url(self._url), path=data_path, overwrite=False)
    extract_archive(data_path, dir_path)
    df = pd.read_csv(dir_path + '/sider.csv')

    super(SIDER, self).__init__(df=df,
                                smiles_to_graph=smiles_to_graph,
                                node_featurizer=node_featurizer,
                                edge_featurizer=edge_featurizer,
                                smiles_column='smiles',
                                cache_file_path=cache_file_path,
                                load=load,
                                log_every=log_every,
                                init_mask=True,
                                n_jobs=n_jobs)
def __init__(self, hidden_size, latent_size, depth, vocab_file=None):
    super(DGLJTNNVAE, self).__init__()

    if vocab_file is None:
        default_dir = get_download_dir()
        vocab_file = '{}/jtvae/{}.txt'.format(default_dir, 'vocab')
        zip_file_path = '{}/jtvae.zip'.format(default_dir)
        download(_get_dgl_url('dataset/jtvae.zip'), path=zip_file_path)
        extract_archive(zip_file_path, '{}/jtvae'.format(default_dir))

    with open(vocab_file, 'r') as f:
        self.vocab = Vocab([x.strip("\r\n ") for x in f])

    self.hidden_size = hidden_size
    self.latent_size = latent_size
    self.depth = depth

    self.embedding = nn.Embedding(self.vocab.size(), hidden_size)
    self.mpn = DGLMPN(hidden_size, depth)
    self.jtnn = DGLJTNNEncoder(self.vocab, hidden_size, self.embedding)
    self.decoder = DGLJTNNDecoder(self.vocab, hidden_size, latent_size // 2, self.embedding)
    self.jtmpn = DGLJTMPN(hidden_size, depth)

    self.T_mean = nn.Linear(hidden_size, latent_size // 2)
    self.T_var = nn.Linear(hidden_size, latent_size // 2)
    self.G_mean = nn.Linear(hidden_size, latent_size // 2)
    self.G_var = nn.Linear(hidden_size, latent_size // 2)

    self.atom_featurizer_enc = get_atom_featurizer_enc()
    self.bond_featurizer_enc = get_bond_featurizer_enc()
    self.atom_featurizer_dec = get_atom_featurizer_dec()
    self.bond_featurizer_dec = get_bond_featurizer_dec()
def __init__(self, hidden_size, latent_size, depth, vocab=None, vocab_file=None):
    super(DGLJTNNVAE, self).__init__()

    if vocab is None:
        if vocab_file is None:
            default_dir = get_download_dir()
            vocab_file = '{}/jtnn/{}.txt'.format(default_dir, 'vocab')
            zip_file_path = '{}/jtnn.zip'.format(default_dir)
            download(_get_dgl_url('dataset/jtnn.zip'), path=zip_file_path)
            extract_archive(zip_file_path, '{}/jtnn'.format(default_dir))
        with open(vocab_file) as f:
            self.vocab = Vocab([x.strip("\r\n ") for x in f])
    else:
        self.vocab = vocab

    self.hidden_size = hidden_size
    self.latent_size = latent_size
    self.depth = depth

    self.embedding = nn.Embedding(self.vocab.size(), hidden_size)
    self.mpn = DGLMPN(hidden_size, depth)
    self.jtnn = DGLJTNNEncoder(self.vocab, hidden_size, self.embedding)
    self.decoder = DGLJTNNDecoder(
        self.vocab, hidden_size, latent_size // 2, self.embedding)
    self.jtmpn = DGLJTMPN(hidden_size, depth)

    self.T_mean = nn.Linear(hidden_size, latent_size // 2)
    self.T_var = nn.Linear(hidden_size, latent_size // 2)
    self.G_mean = nn.Linear(hidden_size, latent_size // 2)
    self.G_var = nn.Linear(hidden_size, latent_size // 2)

    self.n_nodes_total = 0
    self.n_passes = 0
    self.n_edges_total = 0
    self.n_tree_nodes_total = 0
def __init__(self, file_path=None):
    if file_path is None:
        from dgl.data.utils import get_download_dir, download, _get_dgl_url, extract_archive

        default_dir = get_download_dir()
        vocab_file = '{}/jtvae/vocab.txt'.format(default_dir)
        zip_file_path = '{}/jtvae.zip'.format(default_dir)
        download(_get_dgl_url('dataset/jtvae.zip'), path=zip_file_path, overwrite=False)
        extract_archive(zip_file_path, '{}/jtvae'.format(default_dir))
        with open(vocab_file, 'r') as f:
            self.vocab = [x.strip("\r\n ") for x in f]
    else:
        # Prepare a vocabulary from scratch
        vocab = set()
        with open(file_path, 'r') as f:
            for line in f:
                smiles = line.split()[0]
                mol = MolTree(smiles)
                for i in mol.nodes_dict:
                    vocab.add(mol.nodes_dict[i]['smiles'])
        self.vocab = list(vocab)

    self.vmap = {x: i for i, x in enumerate(self.vocab)}
    self.slots = [get_slots(smiles) for smiles in self.vocab]
def __init__(self, smiles_to_graph=smiles_to_bigraph,
             node_featurizer=None,
             edge_featurizer=None,
             load=False,
             log_every=1000,
             cache_file_path='./muv_dglgraph.bin',
             n_jobs=1):
    self._url = 'dataset/muv.zip'
    data_path = get_download_dir() + '/muv.zip'
    dir_path = get_download_dir() + '/muv'
    download(_get_dgl_url(self._url), path=data_path, overwrite=False)
    extract_archive(data_path, dir_path)
    df = pd.read_csv(dir_path + '/muv.csv')

    self.ids = df['mol_id'].tolist()
    self.load_full = False
    df = df.drop(columns=['mol_id'])

    super(MUV, self).__init__(df=df,
                              smiles_to_graph=smiles_to_graph,
                              node_featurizer=node_featurizer,
                              edge_featurizer=edge_featurizer,
                              smiles_column='smiles',
                              cache_file_path=cache_file_path,
                              load=load,
                              log_every=log_every,
                              init_mask=True,
                              n_jobs=n_jobs)

    self.ids = [self.ids[i] for i in self.valid_ids]
def __init__(self, smiles_to_graph=smiles_to_bigraph,
             node_featurizer=None,
             edge_featurizer=None,
             load=True,
             log_every=1000,
             cache_file_path='lipophilicity_dglgraph.bin'):
    self._url = 'dataset/lipophilicity.zip'
    data_path = get_download_dir() + '/lipophilicity.zip'
    dir_path = get_download_dir() + '/lipophilicity'
    download(_get_dgl_url(self._url), path=data_path)
    extract_archive(data_path, dir_path)
    df = pd.read_csv(dir_path + '/Lipophilicity.csv')

    # ChEMBL ids
    self.chembl_ids = df['CMPD_CHEMBLID'].tolist()
    self.load_full = False

    super(Lipophilicity, self).__init__(df=df,
                                        smiles_to_graph=smiles_to_graph,
                                        node_featurizer=node_featurizer,
                                        edge_featurizer=edge_featurizer,
                                        smiles_column='smiles',
                                        cache_file_path=cache_file_path,
                                        task_names=['exp'],
                                        load=load,
                                        log_every=log_every,
                                        init_mask=False)
def __init__(self, raw_dir=None, force_reload=False, verbose=True):
    url = _get_dgl_url('dataset/wn18.tgz')
    super(WN18Dataset, self).__init__('wn18',
                                      url=url,
                                      raw_dir=raw_dir,
                                      force_reload=force_reload,
                                      verbose=verbose)
def __init__(self, subset, load_binding_pocket=True,
             sanitize=False, calc_charges=False,
             remove_hs=False, use_conformation=True,
             construct_graph_and_featurize=ACNN_graph_construction_and_featurization,
             zero_padding=True, num_processes=64):
    self.task_names = ['-logKd/Ki']
    self.n_tasks = len(self.task_names)

    self._url = 'dataset/pdbbind_v2015.tar.gz'
    root_dir_path = get_download_dir()
    data_path = root_dir_path + '/pdbbind_v2015.tar.gz'
    extracted_data_path = root_dir_path + '/pdbbind_v2015'
    download(_get_dgl_url(self._url), path=data_path, overwrite=False)
    extract_archive(data_path, extracted_data_path)

    if subset == 'core':
        index_label_file = extracted_data_path + '/v2015/INDEX_core_data.2013'
    elif subset == 'refined':
        index_label_file = extracted_data_path + '/v2015/INDEX_refined_data.2015'
    else:
        raise ValueError(
            'Expect the subset to be either '
            'core or refined, got {}'.format(subset))

    self._preprocess(extracted_data_path, index_label_file, load_binding_pocket,
                     sanitize, calc_charges, remove_hs, use_conformation,
                     construct_graph_and_featurize, zero_padding, num_processes)
def __init__(self, smiles_to_graph=smiles_to_bigraph,
             node_featurizer=None,
             edge_featurizer=None,
             load=True,
             log_every=1000,
             cache_file_path='AstraZeneca_chembl_solubility_graph.bin',
             log_of_values=True):
    self._url = 'dataset/AstraZeneca_ChEMBL_Solubility.csv'
    data_path = get_download_dir() + '/AstraZeneca_ChEMBL_Solubility.csv'
    download(_get_dgl_url(self._url), path=data_path)
    df = pd.read_csv(data_path)

    # ChEMBL ids
    self.chembl_ids = df['Molecule ChEMBL ID'].tolist()
    # Molecular weight
    self.mol_weight = df['Molecular Weight'].tolist()
    self.load_full = False

    super(AstraZenecaChEMBLSolubility, self).__init__(
        df=df,
        smiles_to_graph=smiles_to_graph,
        node_featurizer=node_featurizer,
        edge_featurizer=edge_featurizer,
        smiles_column='Smiles',
        cache_file_path=cache_file_path,
        task_names=['Solubility'],
        load=load,
        log_every=log_every,
        init_mask=False)

    if log_of_values:
        self.labels = self.labels.log()
def download_and_load_checkpoint(model_name, model, model_postfix,
                                 local_pretrained_path='pre_trained.pth', log=True):
    """Download a pretrained model checkpoint.

    The model will be loaded to CPU.

    Parameters
    ----------
    model_name : str
        Name of the model.
    model : nn.Module
        Instantiated model instance.
    model_postfix : str
        Postfix for the pretrained model checkpoint.
    local_pretrained_path : str
        Local name for the downloaded model checkpoint.
    log : bool
        Whether to print progress for model loading.

    Returns
    -------
    model : nn.Module
        Pretrained model.
    """
    url_to_pretrained = _get_dgl_url(model_postfix)
    local_pretrained_path = '_'.join([model_name, local_pretrained_path])
    download(url_to_pretrained, path=local_pretrained_path, log=log)
    checkpoint = torch.load(local_pretrained_path, map_location='cpu')
    model.load_state_dict(checkpoint['model_state_dict'])
    if log:
        print('Pretrained model loaded')

    return model
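# Hedged usage sketch for download_and_load_checkpoint. The model class and
# checkpoint postfix below are illustrative placeholders (assumptions), not
# paths guaranteed to exist on the DGL download server.
#
#   model = GCNPredictor(in_feats=74)  # any instantiated nn.Module
#   model = download_and_load_checkpoint(
#       'GCN_Tox21', model, 'pre_trained/gcn_tox21.pth')
#   model.eval()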
def __init__(self, subset, mol_to_graph=mol_to_bigraph,
             node_featurizer=default_node_featurizer,
             edge_featurizer=default_edge_featurizer,
             atom_pair_featurizer=default_atom_pair_featurizer,
             load=True):
    assert subset in ['train', 'val', 'test'], \
        'Expect subset to be "train" or "val" or "test", got {}'.format(subset)
    print('Preparing {} subset of USPTO'.format(subset))
    self._subset = subset
    if subset == 'val':
        subset = 'valid'

    self._url = 'dataset/uspto.zip'
    data_path = get_download_dir() + '/uspto.zip'
    extracted_data_path = get_download_dir() + '/uspto'
    download(_get_dgl_url(self._url), path=data_path)
    extract_archive(data_path, extracted_data_path)

    super(USPTO, self).__init__(
        raw_file_path=extracted_data_path + '/{}.txt'.format(subset),
        mol_graph_path=extracted_data_path + '/{}_mol_graphs.bin'.format(subset),
        mol_to_graph=mol_to_graph,
        node_featurizer=node_featurizer,
        edge_featurizer=edge_featurizer,
        atom_pair_featurizer=atom_pair_featurizer,
        load=load)
def __init__(self, smiles_to_graph=smiles_to_bigraph,
             node_featurizer=None,
             edge_featurizer=None,
             load=True,
             log_every=1000,
             cache_file_path='freesolv_dglgraph.bin'):
    self._url = 'dataset/FreeSolv.zip'
    data_path = get_download_dir() + '/FreeSolv.zip'
    dir_path = get_download_dir() + '/FreeSolv'
    download(_get_dgl_url(self._url), path=data_path)
    extract_archive(data_path, dir_path)
    df = pd.read_csv(dir_path + '/SAMPL.csv')

    # IUPAC names
    self.iupac_names = df['iupac'].tolist()
    # Calculated hydration free energy
    self.calc_energy = df['calc'].tolist()
    self.load_full = False

    super(FreeSolv, self).__init__(df=df,
                                   smiles_to_graph=smiles_to_graph,
                                   node_featurizer=node_featurizer,
                                   edge_featurizer=edge_featurizer,
                                   smiles_column='smiles',
                                   cache_file_path=cache_file_path,
                                   task_names=['expt'],
                                   load=load,
                                   log_every=log_every,
                                   init_mask=False)
def __init__(self, mode='dev', mol_to_graph=mol_to_complete_graph,
             node_featurizer=alchemy_nodes,
             edge_featurizer=alchemy_edges,
             load=True):
    if mode == 'test':
        raise ValueError('The test mode is not supported before '
                         'the Alchemy contest finishes.')
    assert mode in ['dev', 'valid', 'test'], \
        'Expect mode to be dev, valid or test, got {}.'.format(mode)

    self.mode = mode
    # Construct DGLGraphs from raw data or use the preprocessed data
    self.load = load
    file_dir = osp.join(get_download_dir(), 'Alchemy_data')

    if load:
        file_name = "{}_processed_dgl".format(mode)
    else:
        file_name = "{}_single_sdf".format(mode)
    self.file_dir = pathlib.Path(file_dir, file_name)

    self._url = 'dataset/alchemy/'
    self.zip_file_path = pathlib.Path(file_dir, file_name + '.zip')
    download(_get_dgl_url(self._url + file_name + '.zip'),
             path=str(self.zip_file_path))
    if not os.path.exists(str(self.file_dir)):
        archive = zipfile.ZipFile(self.zip_file_path)
        archive.extractall(file_dir)
        archive.close()

    self._load(mol_to_graph, node_featurizer, edge_featurizer)
def __init__(self, smiles_to_graph=smiles_to_bigraph,
             node_featurizer=None,
             edge_featurizer=None,
             load=False,
             log_every=1000,
             cache_file_path='./bbbp_dglgraph.bin',
             n_jobs=1):
    self._url = 'dataset/bbbp.zip'
    data_path = get_download_dir() + '/bbbp.zip'
    dir_path = get_download_dir() + '/bbbp'
    download(_get_dgl_url(self._url), path=data_path, overwrite=False)
    extract_archive(data_path, dir_path)
    df = pd.read_csv(dir_path + '/BBBP.csv')

    super(BBBP, self).__init__(df=df,
                               smiles_to_graph=smiles_to_graph,
                               node_featurizer=node_featurizer,
                               edge_featurizer=edge_featurizer,
                               smiles_column='smiles',
                               cache_file_path=cache_file_path,
                               task_names=['p_np'],
                               load=load,
                               log_every=log_every,
                               init_mask=True,
                               n_jobs=n_jobs)

    self.load_full = False
    self.names = df['name'].tolist()
    self.names = [self.names[i] for i in self.valid_ids]
def load_acm_raw(remove_self_loop):
    assert not remove_self_loop
    url = 'dataset/ACM.mat'
    data_path = get_download_dir() + '/ACM.mat'
    download(_get_dgl_url(url), path=data_path)

    data = sio.loadmat(data_path)
    p_vs_l = data['PvsL']       # paper-field?
    p_vs_a = data['PvsA']       # paper-author
    p_vs_t = data['PvsT']       # paper-term, bag of words
    p_vs_c = data['PvsC']       # paper-conference, labels come from that

    # We assign
    # (1) KDD papers as class 0 (data mining),
    # (2) SIGMOD and VLDB papers as class 1 (database),
    # (3) SIGCOMM and MOBICOMM papers as class 2 (communication)
    conf_ids = [0, 1, 9, 10, 13]
    label_ids = [0, 1, 2, 2, 1]

    p_vs_c_filter = p_vs_c[:, conf_ids]
    p_selected = (p_vs_c_filter.sum(1) != 0).A1.nonzero()[0]
    p_vs_l = p_vs_l[p_selected]
    p_vs_a = p_vs_a[p_selected]
    p_vs_t = p_vs_t[p_selected]
    p_vs_c = p_vs_c[p_selected]

    # Note: transpose() must be called; the original had p_vs_a.transpose.nonzero()
    hg = dgl.heterograph({
        ('paper', 'pa', 'author'): p_vs_a.nonzero(),
        ('author', 'ap', 'paper'): p_vs_a.transpose().nonzero(),
        ('paper', 'pf', 'field'): p_vs_l.nonzero(),
        ('field', 'fp', 'paper'): p_vs_l.transpose().nonzero()
    })

    features = torch.FloatTensor(p_vs_t.toarray())

    pc_p, pc_c = p_vs_c.nonzero()
    labels = np.zeros(len(p_selected), dtype=np.int64)
    for conf_id, label_id in zip(conf_ids, label_ids):
        labels[pc_p[pc_c == conf_id]] = label_id
    labels = torch.LongTensor(labels)

    num_classes = 3

    float_mask = np.zeros(len(pc_p))
    for conf_id in conf_ids:
        pc_c_mask = (pc_c == conf_id)
        float_mask[pc_c_mask] = np.random.permutation(
            np.linspace(0, 1, pc_c_mask.sum()))
    train_idx = np.where(float_mask <= 0.2)[0]
    val_idx = np.where((float_mask > 0.2) & (float_mask <= 0.3))[0]
    test_idx = np.where(float_mask > 0.3)[0]

    num_nodes = hg.number_of_nodes('paper')
    train_mask = get_binary_mask(num_nodes, train_idx)
    val_mask = get_binary_mask(num_nodes, val_idx)
    test_mask = get_binary_mask(num_nodes, test_idx)

    return hg, features, labels, num_classes, train_idx, val_idx, test_idx, \
        train_mask, val_mask, test_mask
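# Usage sketch for load_acm_raw above: it returns the heterograph, paper
# features, labels and the train/val/test splits in the order of its final
# return statement.
hg, features, labels, num_classes, train_idx, val_idx, test_idx, \
    train_mask, val_mask, test_mask = load_acm_raw(remove_self_loop=False)
print(hg)
print('Number of paper nodes:', hg.number_of_nodes('paper'))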
def test_jtvae():
    # Test DGLMolTree
    smiles = 'CC1([C@@H](N2[C@H](S1)[C@@H](C2=O)NC(=O)CC3=CC=CC=C3)C(=O)O)C'
    tree = DGLMolTree(smiles)
    assert tree.treesize() == 17
    tree.assemble()
    assert tree._recover_node(0, tree.mol) == 'C[CH3:15]'
    tree.recover()

    # Test JTVAEDataset
    smiles = [
        'CCCCCCC1=NN2C(=N)/C(=C\c3cc(C)n(-c4ccc(C)cc4C)c3C)C(=O)N=C2S1',
        'COCC[C@@H](C)C(=O)N(C)Cc1ccc(O)cc1'
    ]
    with open('data.txt', 'w') as f:
        for smi in smiles:
            f.write(smi + '\n')

    default_dir = get_download_dir()
    vocab_file = '{}/jtnn/{}.txt'.format(default_dir, 'vocab')
    zip_file_path = '{}/jtnn.zip'.format(default_dir)
    download(_get_dgl_url('dataset/jtnn.zip'), path=zip_file_path, overwrite=False)
    extract_archive(zip_file_path, '{}/jtnn'.format(default_dir))
    with open(vocab_file, 'r') as f:
        vocab = Vocab([x.strip("\r\n ") for x in f])

    dataset = JTVAEDataset('data.txt', vocab)
    assert len(dataset) == 2
    assert set(dataset[0].keys()) == {
        'cand_graphs', 'mol_graph', 'mol_tree', 'stereo_cand_graphs',
        'stereo_cand_label', 'tree_mess_src_e', 'tree_mess_tgt_e',
        'tree_mess_tgt_n', 'wid'
    }
    dataset.training = False
    assert set(dataset[0].keys()) == {'mol_graph', 'mol_tree', 'wid'}

    dataset.training = True
    collate_fn = JTVAECollator(training=True)
    loader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn)
    for _, batch_data in enumerate(loader):
        assert set(batch_data.keys()) == {
            'cand_batch_idx', 'cand_graph_batch', 'mol_graph_batch', 'mol_trees',
            'stereo_cand_batch_idx', 'stereo_cand_graph_batch', 'stereo_cand_labels',
            'stereo_cand_lengths', 'tree_mess_src_e', 'tree_mess_tgt_e',
            'tree_mess_tgt_n'
        }

    dataset.training = False
    collate_fn = JTVAECollator(training=False)
    loader = DataLoader(dataset, batch_size=2, collate_fn=collate_fn)
    for _, batch_data in enumerate(loader):
        assert set(batch_data.keys()) == {'mol_graph_batch', 'mol_trees'}

    remove_file('data.txt')
    remove_file(zip_file_path)
    remove_dir(default_dir + '/jtnn')
def __init__(self, smiles_to_graph=smiles_to_bigraph,
             node_featurizer=None,
             edge_featurizer=None,
             load=True,
             log_every=1000):
    self._url = 'dataset/pubchem_bioassay_aromaticity.csv'
    data_path = get_download_dir() + '/pubchem_bioassay_aromaticity.csv'
    download(_get_dgl_url(self._url), path=data_path)
    df = pd.read_csv(data_path)

    super(PubChemBioAssayAromaticity, self).__init__(
        df, smiles_to_graph, node_featurizer, edge_featurizer,
        "cano_smiles", "pubchem_aromaticity_dglgraph.bin",
        load=load, log_every=log_every)
def _download_babi_data():
    download_dir = get_download_dir()
    zip_file_path = os.path.join(download_dir, 'babi_data.zip')
    data_url = _get_dgl_url('models/ggnn_babi_data.zip')
    download(data_url, path=zip_file_path)

    extract_dir = os.path.join(download_dir, 'babi_data')
    if not os.path.exists(extract_dir):
        extract_archive(zip_file_path, extract_dir)
def __init__(self, data_name, raw_dir=None, force_reload=False, verbose=False):
    _url = _get_dgl_url(f"dataset/{data_name}.zip")
    super(ExtDataset, self).__init__(name=data_name,
                                     url=_url,
                                     raw_dir=raw_dir,
                                     force_reload=force_reload,
                                     verbose=verbose)
def load_acm_raw():
    from dgl.data.utils import download, get_download_dir, _get_dgl_url
    from scipy import io as sio

    url = 'dataset/ACM.mat'
    data_path = get_download_dir() + '/ACM.mat'
    download(_get_dgl_url(url), path=data_path)

    data = sio.loadmat(data_path)
    p_vs_l = data['PvsL']       # paper-field?
    p_vs_a = data['PvsA']       # paper-author
    p_vs_t = data['PvsT']       # paper-term, bag of words
    p_vs_c = data['PvsC']       # paper-conference, labels come from that

    # We assign
    # (1) KDD papers as class 0 (data mining),
    # (2) SIGMOD and VLDB papers as class 1 (database),
    # (3) SIGCOMM and MOBICOMM papers as class 2 (communication)
    conf_ids = [0, 1, 9, 10, 13]
    label_ids = [0, 1, 2, 2, 1]

    p_vs_c_filter = p_vs_c[:, conf_ids]
    p_selected = (p_vs_c_filter.sum(1) != 0).A1.nonzero()[0]
    p_vs_l = p_vs_l[p_selected]
    p_vs_a = p_vs_a[p_selected]
    p_vs_t = p_vs_t[p_selected]
    p_vs_c = p_vs_c[p_selected]

    pa = dgl.bipartite(p_vs_a, 'paper', 'pa', 'author')
    pl = dgl.bipartite(p_vs_l, 'paper', 'pf', 'field')
    gs = [pa, pl]
    hg = dgl.hetero_from_relations(gs)

    features = torch.FloatTensor(p_vs_t.toarray())

    pc_p, pc_c = p_vs_c.nonzero()
    labels = np.zeros(len(p_selected), dtype=np.int64)
    for conf_id, label_id in zip(conf_ids, label_ids):
        labels[pc_p[pc_c == conf_id]] = label_id
    labels = torch.LongTensor(labels)

    num_classes = 3

    float_mask = np.zeros(len(pc_p))
    for conf_id in conf_ids:
        pc_c_mask = (pc_c == conf_id)
        float_mask[pc_c_mask] = np.random.permutation(
            np.linspace(0, 1, pc_c_mask.sum()))
    train_idx = np.where(float_mask <= 0.2)[0]
    val_idx = np.where((float_mask > 0.2) & (float_mask <= 0.3))[0]
    test_idx = np.where(float_mask > 0.3)[0]

    hg.nodes["paper"].data["feat"] = features

    return hg, labels, num_classes, train_idx, val_idx, test_idx
def readFragmentScores(name='fpscores'):
    import gzip
    global _fscores

    fname = '{}.pkl.gz'.format(name)
    download(_get_dgl_url(os.path.join('dataset', fname)), path=fname)
    _fscores = cPickle.load(gzip.open(fname))
    outDict = {}
    for i in _fscores:
        for j in range(1, len(i)):
            outDict[i[j]] = float(i[0])
    _fscores = outDict
def __init__(self, name, raw_dir=None, random_seed=717, train_size=0.7, val_size=0.1):
    assert name in ['gos', 'pol'], "Only supports 'gos' or 'pol'."
    self.seed = random_seed
    self.train_size = train_size
    self.val_size = val_size
    url = _get_dgl_url(self.file_urls[name])
    super(GASDataset, self).__init__(name=name, url=url, raw_dir=raw_dir)
def __init__(self, label_keys=None, raw_dir=None, force_reload=False, verbose=True):
    self.label_keys = label_keys
    self._url = _get_dgl_url('dataset/qm9_ver2.zip')
    super(QM9Dataset_v2, self).__init__(name='qm9_v2',
                                        url=self._url,
                                        raw_dir=raw_dir,
                                        force_reload=force_reload,
                                        verbose=verbose)
def __init__(self, mode='train', vocab_file=None):
    self.mode = mode
    self.dir = get_download_dir()
    self.zip_file_path = '{}/sst.zip'.format(self.dir)
    self.pretrained_file = 'glove.840B.300d.txt' if mode == 'train' else ''
    self.pretrained_emb = None
    self.vocab_file = '{}/sst/vocab.txt'.format(self.dir) if vocab_file is None else vocab_file

    download(_get_dgl_url(_urls['sst']), path=self.zip_file_path)
    extract_archive(self.zip_file_path, '{}/sst'.format(self.dir))

    self.trees = []
    self.num_classes = 5
    print('Preprocessing...')
    self._load()
    print('Dataset creation finished. #Trees:', len(self.trees))
def test_acnn():
    remove_dir('tmp1')
    remove_dir('tmp2')

    url = _get_dgl_url('dgllife/example_mols.tar.gz')
    local_path = 'tmp1/example_mols.tar.gz'
    download(url, path=local_path)
    extract_archive(local_path, 'tmp2')
    pocket_mol, pocket_coords = load_molecule(
        'tmp2/example_mols/example.pdb', remove_hs=True)
    ligand_mol, ligand_coords = load_molecule(
        'tmp2/example_mols/example.pdbqt', remove_hs=True)
    remove_dir('tmp1')
    remove_dir('tmp2')

    if torch.cuda.is_available():
        device = torch.device('cuda:0')
    else:
        device = torch.device('cpu')

    g1 = ACNN_graph_construction_and_featurization(ligand_mol,
                                                   pocket_mol,
                                                   ligand_coords,
                                                   pocket_coords)

    model = ACNN()
    model.to(device)
    g1.to(device)
    assert model(g1).shape == torch.Size([1, 1])

    bg = dgl.batch_hetero([g1, g1])
    bg.to(device)
    assert model(bg).shape == torch.Size([2, 1])

    model = ACNN(hidden_sizes=[1, 2],
                 weight_init_stddevs=[1, 1],
                 dropouts=[0.1, 0.],
                 features_to_use=torch.tensor([6., 8.]),
                 radial=[[12.0], [0.0, 2.0], [4.0]])
    model.to(device)
    g1.to(device)
    assert model(g1).shape == torch.Size([1, 1])

    bg = dgl.batch_hetero([g1, g1])
    bg.to(device)
    assert model(bg).shape == torch.Size([2, 1])
def download_data(dataset, fname):
    """Download a dataset if built-in support exists.

    Parameters
    ----------
    dataset : str
        Dataset name.
    fname : str
        Name of the dataset file.
    """
    if dataset not in ['ChEMBL', 'ZINC']:
        # Datasets without built-in support should be processed locally.
        return

    data_path = fname
    download(_get_dgl_url(os.path.join('dataset', fname)), path=data_path)
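# Hedged usage sketch for download_data. The file name below is an
# illustrative placeholder, not necessarily the actual file hosted for the
# ZINC dataset.
#
#   download_data('ZINC', 'ZINC_example.txt')   # fetches dataset/ZINC_example.txt
#   download_data('my_dataset', 'my_data.txt')  # no-op: no built-in support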
def __init__(self, data, vocab, training=True):
    self.dir = get_download_dir()
    self.zip_file_path = '{}/jtnn.zip'.format(self.dir)
    download(_get_dgl_url('dgllife/jtnn.zip'), path=self.zip_file_path)
    extract_archive(self.zip_file_path, '{}/jtnn'.format(self.dir))

    print('Loading data...')
    data_file = '{}/jtnn/{}.txt'.format(self.dir, data)
    with open(data_file) as f:
        self.data = [line.strip("\r\n ").split()[0] for line in f]
    self.vocab_file = '{}/jtnn/{}.txt'.format(self.dir, vocab)
    print('Loading finished.')
    print('\tNum samples:', len(self.data))
    print('\tVocab file:', self.vocab_file)

    self.training = training
    with open(self.vocab_file) as f:
        self.vocab = Vocab([x.strip("\r\n ") for x in f])
def load_acm(remove_self_loop):
    filename = 'ACM3025.pkl'
    url = 'dataset/' + filename
    data_path = get_download_dir() + '/' + filename
    if osp.exists(data_path):
        print(f'Using existing file {filename}', file=sys.stderr)
    else:
        download(_get_dgl_url(url), path=data_path)

    with open(data_path, 'rb') as f:
        data = pickle.load(f)

    labels, features = torch.from_numpy(data['label'].todense()).long(), \
        torch.from_numpy(data['feature'].todense()).float()
    num_classes = labels.shape[1]
    labels = labels.nonzero()[:, 1]

    if remove_self_loop:
        num_nodes = data['label'].shape[0]
        data['PAP'] = sparse.csr_matrix(data['PAP'] - np.eye(num_nodes))
        data['PLP'] = sparse.csr_matrix(data['PLP'] - np.eye(num_nodes))

    # Adjacency matrices for meta path based neighbors
    # (Mufei): I verified both of them are binary adjacency matrices with self loops
    author_g = dgl.from_scipy(data['PAP'])
    subject_g = dgl.from_scipy(data['PLP'])
    gs = [author_g, subject_g]

    train_idx = torch.from_numpy(data['train_idx']).long().squeeze(0)
    val_idx = torch.from_numpy(data['val_idx']).long().squeeze(0)
    test_idx = torch.from_numpy(data['test_idx']).long().squeeze(0)

    num_nodes = author_g.number_of_nodes()
    train_mask = get_binary_mask(num_nodes, train_idx)
    val_mask = get_binary_mask(num_nodes, val_idx)
    test_mask = get_binary_mask(num_nodes, test_idx)

    print('dataset loaded')
    pprint({
        'dataset': 'ACM',
        'train': train_mask.sum().item() / num_nodes,
        'val': val_mask.sum().item() / num_nodes,
        'test': test_mask.sum().item() / num_nodes
    })

    return gs, features, labels, num_classes, train_idx, val_idx, test_idx, \
        train_mask, val_mask, test_mask