def get_molnet_dataframe(dataset_name):
    """Downloads, caches and gets the DataFrame of a MoleculeNet dataset.

    Args:
        dataset_name (str): MoleculeNet dataset name. If you want to know the
            details of MoleculeNet, please refer to the
            `official site <http://moleculenet.ai/datasets-1>`_.
            If you would like to know which dataset_name values are available
            for chainer_chemistry, please refer to `molnet_config.py`.

    Returns (pandas.DataFrame or tuple):
        DataFrame of the dataset without any preprocessing. When the files of
        the dataset are separated, this function returns multiple DataFrames.
    """
    if dataset_name not in molnet_default_config:
        raise ValueError(
            "We don't support {} dataset. Please choose from {}".format(
                dataset_name, list(molnet_default_config.keys())))
    dataset_config = molnet_default_config[dataset_name]
    if dataset_config['dataset_type'] == 'one_file_csv':
        df = pandas.read_csv(get_molnet_filepath(dataset_name))
        return df
    elif dataset_config['dataset_type'] == 'separate_csv':
        train_df = pandas.read_csv(get_molnet_filepath(dataset_name, 'train'))
        valid_df = pandas.read_csv(get_molnet_filepath(dataset_name, 'valid'))
        test_df = pandas.read_csv(get_molnet_filepath(dataset_name, 'test'))
        return train_df, valid_df, test_df
    else:
        raise ValueError('dataset_type={} is not supported'.format(
            dataset_config['dataset_type']))
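# Hedged usage sketch for get_molnet_dataframe() above, assuming it is
# exposed as chainer_chemistry.datasets.molnet.get_molnet_dataframe and that
# 'bbbp' is configured as a 'one_file_csv' dataset in molnet_config.py, so a
# single DataFrame is returned (separate-CSV datasets return a tuple instead).
from chainer_chemistry.datasets import molnet

df = molnet.get_molnet_dataframe('bbbp')
print(df.shape)
print(df.columns.tolist())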
def parse_arguments():
    # Lists of supported preprocessing methods/models.
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn',
                   'gin', 'nfp_gwm', 'ggnn_gwm', 'rsgcn_gwm', 'gin_gwm']
    # scale_list = ['standardize', 'none']
    dataset_names = list(molnet_default_config.keys())

    # Set up the argument parser.
    parser = argparse.ArgumentParser(description='Prediction on Molnet.')
    parser.add_argument('--dataset', '-d', type=str, choices=dataset_names,
                        default='bbbp',
                        help='name of the dataset that training is run on')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        help='method name', default='nfp')
    parser.add_argument('--label', '-l', type=str, default='',
                        help='target label for regression; empty string means '
                             'predicting all properties at once')
    # parser.add_argument('--scale', type=str, choices=scale_list,
    #                     help='label scaling method', default='standardize')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='id of gpu to use; negative value means running '
                             'the code on cpu')
    parser.add_argument('--in-dir', '-i', type=str, default='result',
                        help='directory to load model data from')
    parser.add_argument('--num-data', type=int, default=-1,
                        help='amount of data to be parsed; -1 indicates '
                             'parsing all data.')
    return parser.parse_args()
def parse_arguments():
    # Lists of supported preprocessing methods/models.
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn',
                   'relgat', 'gin', 'gnnfilm', 'megnet',
                   'nfp_gwm', 'ggnn_gwm', 'rsgcn_gwm', 'gin_gwm']
    # scale_list = ['standardize', 'none']
    dataset_names = list(molnet_default_config.keys())

    # Set up the argument parser.
    parser = argparse.ArgumentParser(description='Prediction on Molnet.')
    parser.add_argument('--dataset', '-d', type=str, choices=dataset_names,
                        default='bbbp',
                        help='name of the dataset that training is run on')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        help='method name', default='nfp')
    parser.add_argument('--label', '-l', type=str, default='',
                        help='target label for regression; empty string means '
                             'predicting all properties at once')
    # parser.add_argument('--scale', type=str, choices=scale_list,
    #                     help='label scaling method', default='standardize')
    parser.add_argument(
        '--device', type=str, default='-1',
        help='Device specifier. Either ChainerX device specifier or an '
             'integer. If non-negative integer, CuPy arrays with specified '
             'device id are used. If negative integer, NumPy arrays are used')
    parser.add_argument('--in-dir', '-i', type=str, default='result',
                        help='directory to load model data from')
    parser.add_argument('--num-data', type=int, default=-1,
                        help='amount of data to be parsed; -1 indicates '
                             'parsing all data.')
    return parser.parse_args()
def parse_arguments():
    # Lists of supported preprocessing methods/models and datasets.
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn',
                   'relgat', 'gin', 'gnnfilm', 'megnet',
                   'nfp_gwm', 'ggnn_gwm', 'rsgcn_gwm', 'gin_gwm']
    dataset_names = list(molnet_default_config.keys())
    scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(description='molnet example')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        help='method name', default='nfp')
    parser.add_argument('--label', '-l', type=str, default='',
                        help='target label for regression; empty string means '
                             'predicting all properties at once')
    parser.add_argument('--conv-layers', '-c', type=int, default=4,
                        help='number of convolution layers')
    parser.add_argument('--batchsize', '-b', type=int, default=32,
                        help='batch size')
    parser.add_argument(
        '--device', type=str, default='-1',
        help='Device specifier. Either ChainerX device specifier or an '
             'integer. If non-negative integer, CuPy arrays with specified '
             'device id are used. If negative integer, NumPy arrays are used')
    parser.add_argument('--out', '-o', type=str, default='result',
                        help='path to save the computed model to')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='number of epochs')
    parser.add_argument('--unit-num', '-u', type=int, default=16,
                        help='number of units in one layer of the model')
    parser.add_argument('--dataset', '-d', type=str, choices=dataset_names,
                        default='bbbp',
                        help='name of the dataset that training is run on')
    parser.add_argument('--protocol', type=int, default=2,
                        help='pickle protocol version')
    parser.add_argument('--num-data', type=int, default=-1,
                        help='amount of data to be parsed; -1 indicates '
                             'parsing all data.')
    parser.add_argument('--scale', type=str, choices=scale_list,
                        help='label scaling method', default='standardize')
    return parser.parse_args()
def parse_arguments():
    # Lists of supported preprocessing methods/models and datasets.
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn',
                   'relgat', 'gin', 'nfp_gwm', 'ggnn_gwm', 'rsgcn_gwm',
                   'gin_gwm']
    dataset_names = list(molnet_default_config.keys())
    # scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(description='molnet example')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        help='method name', default='nfp')
    parser.add_argument('--label', '-l', type=str, default='',
                        help='target label for regression; empty string means '
                             'predicting all properties at once')
    parser.add_argument('--conv-layers', '-c', type=int, default=4,
                        help='number of convolution layers')
    parser.add_argument('--batchsize', '-b', type=int, default=32,
                        help='batch size')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='id of gpu to use; negative value means running '
                             'the code on cpu')
    parser.add_argument('--out', '-o', type=str, default='result',
                        help='path to save the computed model to')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='number of epochs')
    parser.add_argument('--unit-num', '-u', type=int, default=16,
                        help='number of units in one layer of the model')
    parser.add_argument('--dataset', '-d', type=str, choices=dataset_names,
                        default='bbbp',
                        help='name of the dataset that training is run on')
    parser.add_argument('--protocol', type=int, default=2,
                        help='pickle protocol version')
    parser.add_argument('--num-data', type=int, default=-1,
                        help='amount of data to be parsed; -1 indicates '
                             'parsing all data.')
    # parser.add_argument('--scale', type=str, choices=scale_list,
    #                     help='label scaling method', default='standardize')
    return parser.parse_args()
def get_molnet_dataframe(dataset_name, pdbbind_subset=None):
    """Downloads, caches and gets the DataFrame of a MoleculeNet dataset.

    Args:
        dataset_name (str): MoleculeNet dataset name. If you want to know the
            details of MoleculeNet, please refer to the
            `official site <http://moleculenet.ai/datasets-1>`_.
            If you would like to know which dataset_name values are available
            for chainer_chemistry, please refer to `molnet_config.py`.
        pdbbind_subset (str): PDBbind dataset subset name. If you want to know
            the details of the subsets, please refer to the
            `official site <http://www.pdbbind.org.cn/download/pdbbind_2017_intro.pdf>`_.

    Returns (pandas.DataFrame or tuple):
        DataFrame of the dataset without any preprocessing. When the files of
        the dataset are separated, this function returns multiple DataFrames.
    """
    if dataset_name not in molnet_default_config:
        raise ValueError("We don't support {} dataset. Please choose from {}"
                         .format(dataset_name,
                                 list(molnet_default_config.keys())))
    if dataset_name == 'pdbbind_grid':
        raise ValueError('pdbbind_grid dataset is not supported. Please '
                         'choose pdbbind_smiles dataset.')
    dataset_config = molnet_default_config[dataset_name]
    if dataset_config['dataset_type'] == 'one_file_csv':
        df = pandas.read_csv(get_molnet_filepath(
            dataset_name, pdbbind_subset=pdbbind_subset))
        return df
    elif dataset_config['dataset_type'] == 'separate_csv':
        train_df = pandas.read_csv(get_molnet_filepath(dataset_name, 'train'))
        valid_df = pandas.read_csv(get_molnet_filepath(dataset_name, 'valid'))
        test_df = pandas.read_csv(get_molnet_filepath(dataset_name, 'test'))
        return train_df, valid_df, test_df
    else:
        raise ValueError('dataset_type={} is not supported'
                         .format(dataset_config['dataset_type']))
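# Hedged usage sketch for the PDBbind-aware get_molnet_dataframe() above,
# assuming 'pdbbind_smiles' is configured as a 'one_file_csv' dataset in
# molnet_config.py. The subset name 'core' is an assumption taken from the
# PDBbind documentation linked in the docstring, not from this code.
from chainer_chemistry.datasets import molnet

pdbbind_df = molnet.get_molnet_dataframe('pdbbind_smiles',
                                         pdbbind_subset='core')
print(pdbbind_df.head())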
def parse_arguments():
    # Lists of supported preprocessing methods/models and datasets.
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn', 'relgcn']
    dataset_names = list(molnet_default_config.keys())
    # scale_list = ['standardize', 'none']

    parser = argparse.ArgumentParser(description='molnet example')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        help='method name', default='nfp')
    parser.add_argument('--label', '-l', type=str, default='',
                        help='target label for regression; empty string means '
                             'predicting all properties at once')
    parser.add_argument('--conv-layers', '-c', type=int, default=4,
                        help='number of convolution layers')
    parser.add_argument('--batchsize', '-b', type=int, default=32,
                        help='batch size')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='id of gpu to use; negative value means running '
                             'the code on cpu')
    parser.add_argument('--out', '-o', type=str, default='result',
                        help='path to save the computed model to')
    parser.add_argument('--epoch', '-e', type=int, default=20,
                        help='number of epochs')
    parser.add_argument('--unit-num', '-u', type=int, default=16,
                        help='number of units in one layer of the model')
    parser.add_argument('--dataset', '-d', type=str, choices=dataset_names,
                        default='bbbp',
                        help='name of the dataset that training is run on')
    parser.add_argument('--protocol', type=int, default=2,
                        help='pickle protocol version')
    parser.add_argument('--num-data', type=int, default=-1,
                        help='amount of data to be parsed; -1 indicates '
                             'parsing all data.')
    # parser.add_argument('--scale', type=str, choices=scale_list,
    #                     help='label scaling method', default='standardize')
    return parser.parse_args()
def get_molnet_dataset(dataset_name, preprocessor=None, labels=None,
                       split=None, frac_train=.8, frac_valid=.1, frac_test=.1,
                       seed=777, return_smiles=False, target_index=None,
                       task_index=0, **kwargs):
    """Downloads, caches and preprocesses a MoleculeNet dataset.

    Args:
        dataset_name (str): MoleculeNet dataset name. If you want to know the
            details of MoleculeNet, please refer to the
            `official site <http://moleculenet.ai/datasets-1>`_.
            If you would like to know which dataset_name values are available
            for chainer_chemistry, please refer to `molnet_config.py`.
        preprocessor (BasePreprocessor): Preprocessor. It should be chosen
            based on the network to be trained. If it is None, the default
            `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        split (str or BaseSplitter or None): How to split the dataset into
            train, validation and test. If `None`, this function uses the
            splitter that is recommended by MoleculeNet. Additionally, you
            can pass an instance of BaseSplitter or choose one of 'random',
            'stratified' and 'scaffold'.
        return_smiles (bool): If set to ``True``, the smiles array is also
            returned.
        target_index (list or None): target index list to partially extract
            the dataset. If `None` (default), all examples are parsed.
        task_index (int): Target task index in the dataset for stratification.
            (Stratified Splitter only)

    Returns (dict):
        Dictionary that contains the dataset already split into train, valid
        and test datasets, and a 1-d numpy array with dtype=object(string)
        which is a vector of smiles for each example, or `None`.
    """
    if dataset_name not in molnet_default_config:
        raise ValueError(
            "We don't support {} dataset. Please choose from {}".format(
                dataset_name, list(molnet_default_config.keys())))

    dataset_config = molnet_default_config[dataset_name]
    labels = labels or dataset_config['tasks']
    if isinstance(labels, str):
        labels = [labels, ]

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()

    if dataset_config['task_type'] == 'regression':
        def postprocess_label(label_list):
            return numpy.asarray(label_list, dtype=numpy.float32)
    elif dataset_config['task_type'] == 'classification':
        def postprocess_label(label_list):
            label_list = numpy.asarray(label_list)
            label_list[numpy.isnan(label_list)] = -1
            return label_list.astype(numpy.int32)

    parser = CSVFileParser(preprocessor, labels=labels,
                           smiles_col=dataset_config['smiles_columns'],
                           postprocess_label=postprocess_label)
    if dataset_config['dataset_type'] == 'one_file_csv':
        split = dataset_config['split'] if split is None else split

        if isinstance(split, str):
            splitter = split_method_dict[split]()
        elif isinstance(split, BaseSplitter):
            splitter = split
        else:
            raise TypeError("split must be None, str or instance of"
                            " BaseSplitter, but got {}".format(type(split)))

        if isinstance(splitter, ScaffoldSplitter):
            get_smiles = True
        else:
            get_smiles = return_smiles

        result = parser.parse(get_molnet_filepath(dataset_name),
                              return_smiles=get_smiles,
                              target_index=target_index, **kwargs)
        dataset = result['dataset']
        smiles = result['smiles']

        train_ind, valid_ind, test_ind = \
            splitter.train_valid_test_split(dataset, smiles_list=smiles,
                                            task_index=task_index,
                                            frac_train=frac_train,
                                            frac_valid=frac_valid,
                                            frac_test=frac_test, **kwargs)
        train = NumpyTupleDataset(*dataset.features[train_ind])
        valid = NumpyTupleDataset(*dataset.features[valid_ind])
        test = NumpyTupleDataset(*dataset.features[test_ind])

        result['dataset'] = (train, valid, test)
        if return_smiles:
            train_smiles = smiles[train_ind]
            valid_smiles = smiles[valid_ind]
            test_smiles = smiles[test_ind]
            result['smiles'] = (train_smiles, valid_smiles, test_smiles)
        else:
            result['smiles'] = None
    elif dataset_config['dataset_type'] == 'separate_csv':
        result = {}
        train_result = parser.parse(get_molnet_filepath(dataset_name, 'train'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        valid_result = parser.parse(get_molnet_filepath(dataset_name, 'valid'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        test_result = parser.parse(get_molnet_filepath(dataset_name, 'test'),
                                   return_smiles=return_smiles,
                                   target_index=target_index)
        result['dataset'] = (train_result['dataset'], valid_result['dataset'],
                             test_result['dataset'])
        result['smiles'] = (train_result['smiles'], valid_result['smiles'],
                            test_result['smiles'])
    else:
        raise ValueError('dataset_type={} is not supported'.format(
            dataset_config['dataset_type']))

    return result
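# Hedged usage sketch for the splitter-based get_molnet_dataset() above,
# assuming it is exposed as chainer_chemistry.datasets.molnet and that 'bbbp'
# is a single-file ('one_file_csv') classification dataset in molnet_config.py;
# the preprocessor import path follows chainer_chemistry's public API and may
# differ between versions.
from chainer_chemistry.dataset.preprocessors import AtomicNumberPreprocessor
from chainer_chemistry.datasets import molnet

result = molnet.get_molnet_dataset(
    'bbbp',
    preprocessor=AtomicNumberPreprocessor(),
    split='scaffold',        # None would fall back to the MoleculeNet default
    return_smiles=True)
train, valid, test = result['dataset']
train_smiles, valid_smiles, test_smiles = result['smiles']
print(len(train), len(valid), len(test))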
def get_molnet_dataset(dataset_name, preprocessor=None, labels=None,
                       split='random', frac_train=.8, frac_valid=.1,
                       frac_test=.1, seed=777, return_smiles=False,
                       target_index=None):
    """Downloads, caches and preprocesses a MoleculeNet dataset.

    Args:
        dataset_name (str): MoleculeNet dataset name. If you want to know the
            details of MoleculeNet, please refer to the
            `official site <http://moleculenet.ai/datasets-1>`_.
            If you would like to know which dataset_name values are available
            for chainer_chemistry, please refer to `molnet_config.py`.
        preprocessor (BasePreprocessor): Preprocessor. It should be chosen
            based on the network to be trained. If it is None, the default
            `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        return_smiles (bool): If set to ``True``, the smiles array is also
            returned.
        target_index (list or None): target index list to partially extract
            the dataset. If `None` (default), all examples are parsed.

    Returns (dict):
        Dictionary that contains the dataset already split into train, valid
        and test datasets, and a 1-d numpy array with dtype=object(string)
        which is a vector of smiles for each example, or `None`.
    """
    from chainer_chemistry.dataset.parsers.csv_file_parser import \
        CSVFileParser

    if dataset_name not in molnet_default_config:
        raise ValueError(
            "We don't support {} dataset. Please choose from {}".format(
                dataset_name, list(molnet_default_config.keys())))

    dataset_config = molnet_default_config[dataset_name]
    labels = labels or dataset_config['tasks']
    if isinstance(labels, str):
        labels = [labels, ]

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()

    if dataset_config['task_type'] == 'regression':
        def postprocess_label(label_list):
            return numpy.asarray(label_list, dtype=numpy.float32)
    elif dataset_config['task_type'] == 'classification':
        def postprocess_label(label_list):
            label_list = numpy.asarray(label_list)
            label_list[numpy.isnan(label_list)] = -1
            return label_list.astype(numpy.int32)

    parser = CSVFileParser(preprocessor, labels=labels,
                           smiles_col=dataset_config['smiles_columns'],
                           postprocess_label=postprocess_label)
    if dataset_config['dataset_type'] == 'one_file_csv':
        result = parser.parse(get_molnet_filepath(dataset_name),
                              return_smiles=return_smiles,
                              target_index=target_index)
        # TODO(motoki): splitting function or class
        dataset = result['dataset']
        if split == 'random':
            perm = numpy.random.permutation(len(dataset))
            dataset = NumpyTupleDataset(*dataset.features[perm])
            train_data_size = int(len(dataset) * frac_train)
            valid_data_size = int(len(dataset) * frac_valid)
            train = NumpyTupleDataset(*dataset.features[:train_data_size])
            valid = NumpyTupleDataset(
                *dataset.features[train_data_size:train_data_size
                                  + valid_data_size])
            test = NumpyTupleDataset(
                *dataset.features[train_data_size + valid_data_size:])

            result['dataset'] = (train, valid, test)
            if return_smiles:
                smiles = result['smiles'][perm]
                train_smiles = smiles[:train_data_size]
                valid_smiles = smiles[train_data_size:train_data_size
                                      + valid_data_size]
                test_smiles = smiles[train_data_size + valid_data_size:]
                result['smiles'] = (train_smiles, valid_smiles, test_smiles)
            else:
                result['smiles'] = None
        else:
            raise NotImplementedError
    elif dataset_config['dataset_type'] == 'separate_csv':
        result = {}
        train_result = parser.parse(get_molnet_filepath(dataset_name, 'train'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        valid_result = parser.parse(get_molnet_filepath(dataset_name, 'valid'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        test_result = parser.parse(get_molnet_filepath(dataset_name, 'test'),
                                   return_smiles=return_smiles,
                                   target_index=target_index)
        result['dataset'] = (train_result['dataset'], valid_result['dataset'],
                             test_result['dataset'])
        result['smiles'] = (train_result['smiles'], valid_result['smiles'],
                            test_result['smiles'])
    else:
        raise NotImplementedError

    return result
def main():
    method_list = ['nfp', 'ggnn', 'schnet', 'weavenet', 'rsgcn']
    dataset_names = list(molnet_default_config.keys())

    parser = argparse.ArgumentParser(description='molnet example')
    parser.add_argument('--method', '-m', type=str, choices=method_list,
                        default='nfp')
    parser.add_argument('--label', '-l', type=str, default='',
                        help='target label for regression; empty string means '
                             'to predict all properties at once')
    parser.add_argument('--conv-layers', '-c', type=int, default=4)
    parser.add_argument('--batchsize', '-b', type=int, default=32)
    parser.add_argument('--gpu', '-g', type=int, default=-1)
    parser.add_argument('--out', '-o', type=str, default='result')
    parser.add_argument('--epoch', '-e', type=int, default=20)
    parser.add_argument('--unit-num', '-u', type=int, default=16)
    parser.add_argument('--dataset', '-d', type=str, choices=dataset_names,
                        default='bbbp')
    parser.add_argument('--protocol', type=int, default=2)
    parser.add_argument('--model-filename', type=str, default='regressor.pkl')
    parser.add_argument('--num-data', type=int, default=-1,
                        help='Number of data to be parsed from parser; '
                             '-1 indicates to parse all data.')
    args = parser.parse_args()

    dataset_name = args.dataset
    method = args.method
    num_data = args.num_data
    n_unit = args.unit_num
    conv_layers = args.conv_layers
    print('Use {} dataset'.format(dataset_name))

    if args.label:
        labels = args.label
        cache_dir = os.path.join(
            'input', '{}_{}_{}'.format(dataset_name, method, labels))
        class_num = len(labels) if isinstance(labels, list) else 1
    else:
        labels = None
        cache_dir = os.path.join(
            'input', '{}_{}_all'.format(dataset_name, method))
        class_num = len(molnet_default_config[args.dataset]['tasks'])

    # Dataset preparation
    def get_dataset_paths(cache_dir, num_data):
        filepaths = []
        for filetype in ['train', 'valid', 'test']:
            filename = filetype + '_data'
            if num_data >= 0:
                filename += '_' + str(num_data)
            filename += '.npz'
            filepath = os.path.join(cache_dir, filename)
            filepaths.append(filepath)
        return filepaths

    filepaths = get_dataset_paths(cache_dir, num_data)
    if all([os.path.exists(fpath) for fpath in filepaths]):
        datasets = []
        for fpath in filepaths:
            print('load from cache {}'.format(fpath))
            datasets.append(NumpyTupleDataset.load(fpath))
    else:
        print('preprocessing dataset...')
        preprocessor = preprocess_method_dict[method]()
        # Only use the first `num_data` examples for debugging
        # if num_data >= 0.
        target_index = numpy.arange(num_data) if num_data >= 0 else None
        datasets = D.molnet.get_molnet_dataset(dataset_name, preprocessor,
                                               labels=labels,
                                               target_index=target_index)
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)
        datasets = datasets['dataset']
        for i, fpath in enumerate(filepaths):
            NumpyTupleDataset.save(fpath, datasets[i])

    train, val, _ = datasets

    # Network
    if method == 'nfp':
        print('Train NFP model...')
        predictor = GraphConvPredictor(
            NFP(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'ggnn':
        print('Train GGNN model...')
        predictor = GraphConvPredictor(
            GGNN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'schnet':
        print('Train SchNet model...')
        predictor = GraphConvPredictor(
            SchNet(out_dim=class_num, hidden_dim=n_unit,
                   n_layers=conv_layers),
            None)
    elif method == 'weavenet':
        print('Train WeaveNet model...')
        n_atom = 20
        n_sub_layer = 1
        weave_channels = [50] * conv_layers
        predictor = GraphConvPredictor(
            WeaveNet(weave_channels=weave_channels, hidden_dim=n_unit,
                     n_sub_layer=n_sub_layer, n_atom=n_atom),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    elif method == 'rsgcn':
        print('Train RSGCN model...')
        predictor = GraphConvPredictor(
            RSGCN(out_dim=n_unit, hidden_dim=n_unit, n_layers=conv_layers),
            MLP(out_dim=class_num, hidden_dim=n_unit))
    else:
        raise ValueError('[ERROR] Invalid method {}'.format(method))

    train_iter = iterators.SerialIterator(train, args.batchsize)
    val_iter = iterators.SerialIterator(val, args.batchsize, repeat=False,
                                        shuffle=False)

    metrics_fun = molnet_default_config[dataset_name]['metrics']
    loss_fun = molnet_default_config[dataset_name]['loss']
    task_type = molnet_default_config[dataset_name]['task_type']
    if task_type == 'regression':
        model = Regressor(predictor, lossfun=loss_fun,
                          metrics_fun=metrics_fun, device=args.gpu)
        # TODO(nakago): Use standard scaler for regression task
    elif task_type == 'classification':
        model = Classifier(predictor, lossfun=loss_fun,
                           metrics_fun=metrics_fun, device=args.gpu)
    else:
        raise NotImplementedError(
            'Not implemented task_type = {}'.format(task_type))

    optimizer = optimizers.Adam()
    optimizer.setup(model)

    updater = training.StandardUpdater(train_iter, optimizer, device=args.gpu,
                                       converter=concat_mols)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(E.Evaluator(val_iter, model, device=args.gpu,
                               converter=concat_mols))
    trainer.extend(E.snapshot(), trigger=(args.epoch, 'epoch'))
    trainer.extend(E.LogReport())

    print_report_targets = ['epoch', 'main/loss', 'validation/main/loss']
    if metrics_fun is not None and type(metrics_fun) == dict:
        for m_k in metrics_fun.keys():
            print_report_targets.append('main/' + m_k)
            print_report_targets.append('validation/main/' + m_k)
    if task_type == 'classification':
        # Evaluation for train data takes time, skip for now.
        # trainer.extend(ROCAUCEvaluator(
        #     train_iter, model, device=args.gpu, eval_func=predictor,
        #     converter=concat_mols, name='train', raise_value_error=False))
        # print_report_targets.append('train/main/roc_auc')
        trainer.extend(ROCAUCEvaluator(
            val_iter, model, device=args.gpu, eval_func=predictor,
            converter=concat_mols, name='val', raise_value_error=False))
        print_report_targets.append('val/main/roc_auc')
    print_report_targets.append('elapsed_time')
    trainer.extend(E.PrintReport(print_report_targets))
    trainer.extend(E.ProgressBar())
    trainer.run()

    # --- save model ---
    protocol = args.protocol
    model.save_pickle(os.path.join(args.out, args.model_filename),
                      protocol=protocol)
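# Hedged inference sketch complementing main() above: the Classifier/Regressor
# saved via save_pickle() can usually be restored with the matching
# load_pickle() classmethod and applied to a converted batch. The variable
# `val` stands for any NumpyTupleDataset split prepared as in main(); the
# default paths and the exact forward-call pattern are assumptions, not the
# library's official predict example.
import os

from chainer_chemistry.dataset.converters import concat_mols
from chainer_chemistry.models import Classifier

model = Classifier.load_pickle(os.path.join('result', 'regressor.pkl'),
                               device=-1)          # load on CPU
inputs = concat_mols(val[:16], device=-1)          # e.g. (atoms, adjs, labels)
pred = model.predictor(*inputs[:-1])               # drop labels, forward pass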
def get_molnet_dataset(dataset_name, preprocessor=None, labels=None,
                       split=None, frac_train=.8, frac_valid=.1, frac_test=.1,
                       seed=777, return_smiles=False, return_pdb_id=False,
                       target_index=None, task_index=0, **kwargs):
    """Downloads, caches and preprocesses a MoleculeNet dataset.

    Args:
        dataset_name (str): MoleculeNet dataset name. If you want to know the
            details of MoleculeNet, please refer to the
            `official site <http://moleculenet.ai/datasets-1>`_.
            If you would like to know which dataset_name values are available
            for chainer_chemistry, please refer to `molnet_config.py`.
        preprocessor (BasePreprocessor): Preprocessor. It should be chosen
            based on the network to be trained. If it is None, the default
            `AtomicNumberPreprocessor` is used.
        labels (str or list): List of target labels.
        split (str or BaseSplitter or None): How to split the dataset into
            train, validation and test. If `None`, this function uses the
            splitter that is recommended by MoleculeNet. Additionally, you
            can pass an instance of BaseSplitter or choose one of 'random',
            'stratified' and 'scaffold'.
        return_smiles (bool): If set to ``True``, the smiles array is also
            returned.
        return_pdb_id (bool): If set to ``True``, the PDB ID array is also
            returned. This argument is only used when you select
            'pdbbind_smiles'.
        target_index (list or None): target index list to partially extract
            the dataset. If `None` (default), all examples are parsed.
        task_index (int): Target task index in the dataset for stratification.
            (Stratified Splitter only)

    Returns (dict):
        Dictionary that contains the dataset already split into train, valid
        and test datasets, and a 1-d numpy array with dtype=object(string)
        which is a vector of smiles for each example, or `None`.
    """
    if dataset_name not in molnet_default_config:
        raise ValueError("We don't support {} dataset. Please choose from {}"
                         .format(dataset_name,
                                 list(molnet_default_config.keys())))
    if dataset_name == 'pdbbind_grid':
        pdbbind_subset = kwargs.get('pdbbind_subset')
        return get_pdbbind_grid(pdbbind_subset, split=split,
                                frac_train=frac_train, frac_valid=frac_valid,
                                frac_test=frac_test, task_index=task_index)
    if dataset_name == 'pdbbind_smiles':
        pdbbind_subset = kwargs.get('pdbbind_subset')
        time_list = kwargs.get('time_list')
        return get_pdbbind_smiles(pdbbind_subset, preprocessor=preprocessor,
                                  labels=labels, split=split,
                                  frac_train=frac_train,
                                  frac_valid=frac_valid, frac_test=frac_test,
                                  return_smiles=return_smiles,
                                  return_pdb_id=return_pdb_id,
                                  target_index=target_index,
                                  task_index=task_index, time_list=time_list)

    dataset_config = molnet_default_config[dataset_name]
    labels = labels or dataset_config['tasks']
    if isinstance(labels, str):
        labels = [labels, ]

    if preprocessor is None:
        preprocessor = AtomicNumberPreprocessor()

    if dataset_config['task_type'] == 'regression':
        def postprocess_label(label_list):
            return numpy.asarray(label_list, dtype=numpy.float32)
    elif dataset_config['task_type'] == 'classification':
        def postprocess_label(label_list):
            label_list = numpy.asarray(label_list)
            label_list[numpy.isnan(label_list)] = -1
            return label_list.astype(numpy.int32)

    parser = CSVFileParser(preprocessor, labels=labels,
                           smiles_col=dataset_config['smiles_columns'],
                           postprocess_label=postprocess_label)
    if dataset_config['dataset_type'] == 'one_file_csv':
        split = dataset_config['split'] if split is None else split

        if isinstance(split, str):
            splitter = split_method_dict[split]()
        elif isinstance(split, BaseSplitter):
            splitter = split
        else:
            raise TypeError("split must be None, str or instance of"
                            " BaseSplitter, but got {}".format(type(split)))

        if isinstance(splitter, ScaffoldSplitter):
            get_smiles = True
        else:
            get_smiles = return_smiles

        result = parser.parse(get_molnet_filepath(dataset_name),
                              return_smiles=get_smiles,
                              target_index=target_index, **kwargs)
        dataset = result['dataset']
        smiles = result['smiles']

        train_ind, valid_ind, test_ind = \
            splitter.train_valid_test_split(dataset, smiles_list=smiles,
                                            task_index=task_index,
                                            frac_train=frac_train,
                                            frac_valid=frac_valid,
                                            frac_test=frac_test, **kwargs)
        train = NumpyTupleDataset(*dataset.features[train_ind])
        valid = NumpyTupleDataset(*dataset.features[valid_ind])
        test = NumpyTupleDataset(*dataset.features[test_ind])

        result['dataset'] = (train, valid, test)
        if return_smiles:
            train_smiles = smiles[train_ind]
            valid_smiles = smiles[valid_ind]
            test_smiles = smiles[test_ind]
            result['smiles'] = (train_smiles, valid_smiles, test_smiles)
        else:
            result['smiles'] = None
    elif dataset_config['dataset_type'] == 'separate_csv':
        result = {}
        train_result = parser.parse(get_molnet_filepath(dataset_name, 'train'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        valid_result = parser.parse(get_molnet_filepath(dataset_name, 'valid'),
                                    return_smiles=return_smiles,
                                    target_index=target_index)
        test_result = parser.parse(get_molnet_filepath(dataset_name, 'test'),
                                   return_smiles=return_smiles,
                                   target_index=target_index)
        result['dataset'] = (train_result['dataset'], valid_result['dataset'],
                             test_result['dataset'])
        result['smiles'] = (train_result['smiles'], valid_result['smiles'],
                            test_result['smiles'])
    else:
        raise ValueError('dataset_type={} is not supported'
                         .format(dataset_config['dataset_type']))

    return result
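# Hedged usage sketch for the PDBbind-aware get_molnet_dataset() above:
# 'pdbbind_smiles' delegates to get_pdbbind_smiles() and takes the subset name
# through **kwargs. The subset value 'core' and the import paths are
# assumptions (taken from the PDBbind documentation and chainer_chemistry's
# public API), not guarantees from this code.
from chainer_chemistry.dataset.preprocessors import AtomicNumberPreprocessor
from chainer_chemistry.datasets import molnet

result = molnet.get_molnet_dataset(
    'pdbbind_smiles',
    preprocessor=AtomicNumberPreprocessor(),
    pdbbind_subset='core',
    return_smiles=True,
    return_pdb_id=True)
train, valid, test = result['dataset']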