def k_fold_split(self, dataset, k, directories=None, **kwargs):
    """
    Parameters
    ----------
    dataset: `dc.data.Dataset`
        Dataset to do a k-fold split.
    k: int
        Number of folds to split `dataset` into.
    directories: list[str]
        List of length 2*k filepaths to save the result disk-datasets.

    Returns
    -------
    List of length k tuples of (train, cv) where `train` and `cv` are both
    `Dataset`s.
    """
    logger.info("Computing K-fold split")
    if directories is None:
        directories = [tempfile.mkdtemp() for _ in range(2 * k)]
    else:
        assert len(directories) == 2 * k
    cv_datasets = []
    train_ds_base = None
    train_datasets = []
    # rem_dataset is the remaining portion of the dataset
    if isinstance(dataset, DiskDataset):
        rem_dataset = dataset
    else:
        rem_dataset = DiskDataset.from_numpy(dataset.X, dataset.y, dataset.w,
                                             dataset.ids)
    for fold in range(k):
        # frac_fold starts at 1/k (fold == 0) and reaches 1 when fold == k-1.
        frac_fold = 1. / (k - fold)
        train_dir, cv_dir = directories[2 * fold], directories[2 * fold + 1]
        fold_inds, rem_inds, _ = self.split(rem_dataset,
                                            frac_train=frac_fold,
                                            frac_valid=1 - frac_fold,
                                            frac_test=0,
                                            **kwargs)
        cv_dataset = rem_dataset.select(fold_inds, select_dir=cv_dir)
        cv_datasets.append(cv_dataset)
        rem_dataset = rem_dataset.select(rem_inds)

        train_ds_to_merge = filter(lambda x: x is not None,
                                   [train_ds_base, rem_dataset])
        train_ds_to_merge = filter(lambda x: len(x) > 0, train_ds_to_merge)
        train_dataset = DiskDataset.merge(train_ds_to_merge, merge_dir=train_dir)
        train_datasets.append(train_dataset)

        update_train_base_merge = filter(lambda x: x is not None,
                                         [train_ds_base, cv_dataset])
        train_ds_base = DiskDataset.merge(update_train_base_merge)
    return list(zip(train_datasets, cv_datasets))
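# A minimal usage sketch for k_fold_split. It exercises the same base-class
# method through DeepChem's stock RandomSplitter on a tiny random dataset;
# the data and splitter choice are illustrative assumptions, not part of the
# pipeline above.
def _example_k_fold_split():
    import numpy as np
    import deepchem

    # Build a small in-memory dataset and wrap it as a DiskDataset.
    X = np.random.rand(20, 1024)
    y = np.random.rand(20, 1)
    dataset = deepchem.data.DiskDataset.from_numpy(X, y)

    splitter = deepchem.splits.RandomSplitter()
    # Each element of the returned list is a (train, cv) pair of Datasets.
    for train, cv in splitter.k_fold_split(dataset, 5):
        print(len(train), len(cv))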
def load_gpcr(dataset_file,
              featurizer='ECFP',
              transformers=True,
              reload=True,
              sep='OnePositiveSplit',
              K=5,
              frac_train=0.8,
              frac_valid=0.1,
              frac_test=0.1):
    # data_dir = os.path.dirname(dataset_file)
    save_dir = os.path.join(
        os.path.dirname(dataset_file),
        '.'.join(os.path.basename(dataset_file).split('.')[:-1]),
        "ecfp", "split")
    train = os.path.join(save_dir, 'train')
    valid = os.path.join(save_dir, 'valid')
    test = os.path.join(save_dir, 'test')

    # Read the task names from the CSV header (every column except SMILES).
    with open(dataset_file, "r") as fopen:
        header = fopen.readline()
    m = header.strip('\n').split(',')
    m.remove('SMILES')

    if os.path.isdir(save_dir) and reload:
        dataset = DiskDataset(data_dir=save_dir)
        train_dataset = DiskDataset(data_dir=train)
        valid_dataset = DiskDataset(data_dir=valid)
        test_dataset = DiskDataset(data_dir=test)
        transformers = [
            deepchem.trans.NormalizationTransformer(transform_w=True,
                                                    dataset=train_dataset)
        ]
        all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
        return m, all_dataset, transformers

    if featurizer == 'ECFP':
        featurizer = deepchem.feat.CircularFingerprint(size=1024)
    elif featurizer == 'GraphConv':
        featurizer = deepchem.feat.ConvMolFeaturizer()
    elif featurizer == 'Weave':
        featurizer = deepchem.feat.WeaveFeaturizer()
    elif featurizer == 'Raw':
        featurizer = deepchem.feat.RawFeaturizer()
    elif featurizer == 'AdjacencyConv':
        featurizer = deepchem.feat.AdjacencyFingerprint(max_n_atoms=150,
                                                        max_valence=6)
    elif featurizer == 'SelfDefine':
        # NOTE: `feature_field` must be defined by the surrounding module/caller.
        featurizer = deepchem.feat.UserDefinedFeaturizer(feature_field)

    loader = deepchem.data.CSVLoader(tasks=m,
                                     smiles_field="SMILES",
                                     featurizer=featurizer)
    dataset = loader.featurize(dataset_file, data_dir=save_dir, shard_size=8192)
    # dataset = loader.featurize(dataset_file, shard_size=8192)

    # Initialize transformers
    if transformers:
        transformers = [
            deepchem.trans.NormalizationTransformer(transform_w=True,
                                                    dataset=dataset)
        ]
        for transformer in transformers:
            dataset = transformer.transform(dataset)

    splitters = {
        'index': deepchem.splits.IndexSplitter(),
        'random': deepchem.splits.RandomSplitter(),
        'random_stratified': deepchem.splits.RandomStratifiedSplitter(),
        'scaffold': deepchem.splits.ScaffoldSplitter(),
        'butina': deepchem.splits.ButinaSplitter(),
        'task': deepchem.splits.TaskSplitter(),
        'Harmonious_positive': Harmonious_positive(),
        'OnePositiveSplit': OnePositiveSplit()
    }
    splitter = splitters[sep]

    if sep == 'task':
        fold_datasets = splitter.k_fold_split(dataset, K)
        all_dataset = fold_datasets
    elif sep == 'Harmonious_positive':
        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
            dataset)
        train_dataset = DiskDataset.from_numpy(train_dataset.X, train_dataset.y,
                                               train_dataset.w, train_dataset.ids,
                                               dataset.tasks, data_dir=train)
        valid_dataset = DiskDataset.from_numpy(valid_dataset.X, valid_dataset.y,
                                               valid_dataset.w, valid_dataset.ids,
                                               dataset.tasks, data_dir=valid)
        test_dataset = DiskDataset.from_numpy(test_dataset.X, test_dataset.y,
                                              test_dataset.w, test_dataset.ids,
                                              dataset.tasks, data_dir=test)
        all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
    elif sep == 'Harmonious_positive' and K:
        # NOTE: as written this branch is unreachable, because the
        # `sep == 'Harmonious_positive'` branch above always matches first.
        # train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        #     dataset,
        #     frac_train=frac_train,
        #     frac_valid=0,
        #     frac_test=1 - frac_train,
        # )
        # train_dataset = DiskDataset.from_numpy(train_dataset.X, train_dataset.y,
        #                                        train_dataset.w, train_dataset.ids,
        #                                        dataset.tasks, data_dir=train)
        # train_dataset.reshard(8192)
        # test_dataset = DiskDataset.from_numpy(test_dataset.X, test_dataset.y,
        #                                       test_dataset.w, test_dataset.ids,
        #                                       dataset.tasks, data_dir=test)
        # test_dataset.reshard(8192)
        # fold_dataset = splitter.k_fold_split(
        #     train_dataset, K,
        #     directories=[os.path.join(valid, str(i)) for i in range(K)],
        #     verbose=True)
        fold_dataset = splitter.k_fold_split(
            dataset,
            K,
            directories=[os.path.join(valid, str(i)) for i in range(K)],
            verbose=True)
        folds = []
        for i in range(K):
            print('merge fold dataset {}...'.format(i))
            train_fold = DiskDataset.merge(
                [fold_dataset[j] for j in range(K) if j != i],
                merge_dir=os.path.join(valid, str(i), 'train_fold'))
            test_fold = DiskDataset.merge(
                [fold_dataset[i]],
                merge_dir=os.path.join(valid, str(i), 'valid_fold'))
            folds.append([train_fold, test_fold])
        all_dataset = (dataset, [], folds, [])
    else:
        # frac_train/frac_valid/frac_test were previously undefined here; they
        # are now exposed as parameters with assumed 0.8/0.1/0.1 defaults.
        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
            dataset,
            train_dir=train,
            valid_dir=valid,
            test_dir=test,
            frac_train=frac_train,
            frac_valid=frac_valid,
            frac_test=frac_test)
        all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
    # else:
    #     train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    #         dataset, train_dir=train, valid_dir=valid, test_dir=test)
    #     all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
    # if reload:
    #     deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,
    #                                              transformers)

    return m, all_dataset, transformers
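# A hedged usage sketch for load_gpcr. The CSV path is hypothetical and the
# 'random' split is chosen only to exercise the generic train/valid/test
# branch; the CSV is expected to contain a 'SMILES' column plus one column per
# task, and the frac_* defaults are the ones assumed above.
def _example_load_gpcr():
    tasks, (dataset, train_dataset, valid_dataset, test_dataset), transformers = \
        load_gpcr('data/gpcr_activities.csv',
                  featurizer='ECFP',
                  sep='random',
                  reload=False)
    print(tasks)
    print(len(train_dataset), len(valid_dataset), len(test_dataset))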
def k_fold_split(self, dataset, k, directories=None, **kwargs):
    """
    Parameters
    ----------
    dataset: Dataset
        Dataset to do a k-fold split.
    k: int
        Number of folds.
    directories: list of str
        List of length 2*k filepaths to save the result disk-datasets.
    kwargs
        Extra keyword arguments forwarded to `self.split`.

    Returns
    -------
    List of length k tuples of (train, cv).
    """
    log("Computing K-fold split", self.verbose)
    if directories is None:
        directories = [tempfile.mkdtemp() for _ in range(2 * k)]
    else:
        assert len(directories) == 2 * k
    cv_datasets = []
    train_ds_base = None
    train_datasets = []
    # rem_dataset is the remaining portion of the dataset
    if isinstance(dataset, DiskDataset):
        rem_dataset = dataset
    else:
        rem_dataset = DiskDataset.from_numpy(dataset.X, dataset.y, dataset.w,
                                             dataset.ids)
    for fold in range(k):
        # frac_fold starts at 1/k (fold == 0) and reaches 1 when fold == k-1.
        frac_fold = 1. / (k - fold)
        train_dir, cv_dir = directories[2 * fold], directories[2 * fold + 1]
        fold_inds, rem_inds, _ = self.split(rem_dataset,
                                            frac_train=frac_fold,
                                            frac_valid=1 - frac_fold,
                                            frac_test=0,
                                            **kwargs)
        cv_dataset = rem_dataset.select(fold_inds, select_dir=cv_dir)
        cv_datasets.append(cv_dataset)
        rem_dataset = rem_dataset.select(rem_inds)

        train_ds_to_merge = filter(lambda x: x is not None,
                                   [train_ds_base, rem_dataset])
        train_ds_to_merge = filter(lambda x: len(x) > 0, train_ds_to_merge)
        train_dataset = DiskDataset.merge(train_ds_to_merge, merge_dir=train_dir)
        train_datasets.append(train_dataset)

        update_train_base_merge = filter(lambda x: x is not None,
                                         [train_ds_base, cv_dataset])
        train_ds_base = DiskDataset.merge(update_train_base_merge)
    return list(zip(train_datasets, cv_datasets))
def train_test_mtnn(train_task_csvs,
                    test_tasks_csvs,
                    tasks_nickname,
                    smiles_field,
                    y_field,
                    id_field,
                    tempdir,
                    num_epochs=40,
                    batch_size=128,
                    learning_rate=0.001,
                    graph_conv_sizes=(128, 128),
                    dense_size=256,
                    gpu=None):
    """
    Trains a multitask GCNN using the training sets in train_task_csvs and
    validates it using the test sets in test_tasks_csvs. Saves the trained
    model and the predictions under a folder named "train_test". Prints
    performance metrics (R2 and Spearman rho) after every epoch.

    NB: each task in the model should have corresponding training and test
    files, named similarly (e.g. task1_train.csv, task1_test.csv).

    :param train_task_csvs: list of csv files containing the training tasks
    :param test_tasks_csvs: list of csv files containing the test tasks
    :param tasks_nickname: how the model will be named
    :param smiles_field: in the csvs, name of the column containing the SMILES string of the compounds
    :param y_field: in the csvs, name of the column containing the activity data
    :param id_field: in the csvs, name of the column containing the molids
    :param tempdir: where to store the temporary files for the DiskDatasets (will be deleted later on)
    :param num_epochs: how many epochs to train for
    :param batch_size: number of molecules per minibatch
    :param learning_rate: learning rate
    :param graph_conv_sizes: tuple with output dimension for every GC layer
    :param dense_size: number of neurons in the last dense layer
    :param gpu: GPU to use for training (if None, only CPU will be used)
    :return: None
    """
    ensure_dir(tempdir)
    tasks, training_dset = load_training_data(train_task_csvs,
                                              smiles_field=smiles_field,
                                              y_field=y_field,
                                              id_field=id_field,
                                              tempdir=op.join(tempdir, 'train'),
                                              cv=False)
    tasks, test_dset = load_training_data(test_tasks_csvs,
                                          smiles_field=smiles_field,
                                          y_field=y_field,
                                          id_field=id_field,
                                          tempdir=op.join(tempdir, 'test'),
                                          cv=False)

    # Take care of outdir
    outdir = get_multitask_traintest_outdir(tasks_nickname)
    ensure_dir(outdir)

    # Have we already run that experiment?
    if op.exists(op.join(outdir, 'DONE.txt')):
        print('Model already trained and validated.')
    else:
        print('Training and validating multitask graph convolution model')
        # Merge to reduce the number of shards (necessary to avoid problems
        # with non-random minibatches).
        disk_dir_to_delete = tempfile.mkdtemp(prefix=tempdir + '/')
        training_dset = DiskDataset.merge([training_dset],
                                          merge_dir=disk_dir_to_delete)

        # Transformation (z-scaling)
        zscaling_dir_train = op.join(tempdir, 'zscaling', 'train')
        ensure_dir(zscaling_dir_train)
        zscaling_dir_test = op.join(tempdir, 'zscaling', 'test')
        ensure_dir(zscaling_dir_test)
        transfo_dir_to_delete_1 = tempfile.mkdtemp(prefix=zscaling_dir_train + '/')
        transfo_dir_to_delete_2 = tempfile.mkdtemp(prefix=zscaling_dir_test + '/')
        transformer = NormalizationTransformer(transform_y=True,
                                               dataset=training_dset)
        scaled_train = transformer.transform(training_dset,
                                             outdir=transfo_dir_to_delete_1)
        scaled_val = transformer.transform(test_dset,
                                           outdir=transfo_dir_to_delete_2)

        # Train the model
        scaled_train_y, yhattrain, scaled_train_w, scaled_test_y, yhattest, scaled_test_w = \
            train_and_validate_mtnn(scaled_train,
                                    n_tasks=len(tasks),
                                    outdir=outdir,
                                    graph_conv_sizes=graph_conv_sizes,
                                    dense_size=dense_size,
                                    batch_size=batch_size,
                                    learning_rate=learning_rate,
                                    num_epochs=num_epochs,
                                    pickle_file_name=tasks_nickname + '.pkl',
                                    test=scaled_val,
                                    transformer=transformer,
                                    test_unscaled=test_dset,
                                    gpu=gpu)

        # Compute metrics on the scaled predictions
        scaled_results_test = evaluate_multitask_gc(scaled_test_y, yhattest,
                                                    scaled_test_w)
        for k, vals in scaled_results_test.items():
            print(k)
            print(vals)

        # Reverse the transformation on the predictions and re-evaluate
        yhattest_untransf = undo_transforms(yhattest, [transformer])
        unscaled_results_test = evaluate_multitask_gc(test_dset.y,
                                                      yhattest_untransf,
                                                      test_dset.w)
        for k, vals in unscaled_results_test.items():
            print(k)
            print(vals)
        # hopefully the results are very similar

        # Remove transformation dirs
        shutil.rmtree(transfo_dir_to_delete_1)
        shutil.rmtree(transfo_dir_to_delete_2)

    # Get rid of the temporary directory structure
    shutil.rmtree(tempdir)
    print('Dataset folders removed!')
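# A hedged usage sketch for train_test_mtnn. The file names, column names and
# tempdir below are hypothetical; they only illustrate the expected call,
# with one *_train.csv / *_test.csv pair per task.
def _example_train_test_mtnn():
    train_csvs = ['task1_train.csv', 'task2_train.csv']
    test_csvs = ['task1_test.csv', 'task2_test.csv']
    train_test_mtnn(train_csvs,
                    test_csvs,
                    tasks_nickname='demo_mtnn',
                    smiles_field='SMILES',
                    y_field='pIC50',
                    id_field='molid',
                    tempdir='/tmp/mtnn_demo',
                    num_epochs=10,
                    gpu=None)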
def cross_validate_mtnn(task_csvs,
                        tasks_nickname,
                        smiles_field,
                        split_field,
                        y_field,
                        id_field,
                        tempdir,
                        num_epochs,
                        batch_size=128,
                        learning_rate=0.001,
                        graph_conv_sizes=(128, 128),
                        dense_size=256,
                        gpu=None):
    """
    Cross-validates a multitask GCNN using the training sets in task_csvs.
    Saves the trained models and the predictions under folders named "fold_i".
    Prints performance metrics (R2 and Spearman rho) after every epoch.

    NB: each task in the model should have a corresponding training file. A
    column with the fold assignment should be provided for the cross-validation.

    :param task_csvs: list of csv files containing the training tasks
    :param tasks_nickname: how the model will be named
    :param smiles_field: in the csvs, name of the column containing the SMILES string of the compounds
    :param split_field: in the csvs, name of the column containing the fold assignment for the cross-validation
    :param y_field: in the csvs, name of the column containing the activity data
    :param id_field: in the csvs, name of the column containing the molids
    :param tempdir: where to store the temporary files for the DiskDatasets (will be deleted later on)
    :param num_epochs: how many epochs to train for
    :param batch_size: number of molecules per minibatch
    :param learning_rate: learning rate
    :param graph_conv_sizes: tuple with output dimension for every GC layer
    :param dense_size: number of neurons in the last dense layer
    :param gpu: GPU to use for training (if None, only CPU will be used)
    :return: a pandas DataFrame with performance metrics for every fold
    """
    ensure_dir(tempdir)
    tasks, folds, fold_dirs = load_training_data(task_csvs,
                                                 smiles_field=smiles_field,
                                                 split_field=split_field,
                                                 y_field=y_field,
                                                 id_field=id_field,
                                                 tempdir=tempdir,
                                                 cv=True)

    fold_results = defaultdict(list)
    for i, fold in enumerate(folds):
        # Take care of outdir
        outdir = get_multitask_cv_outdir(tasks_nickname, i)
        ensure_dir(outdir)

        # Have we already run that fold?
        if op.exists(op.join(outdir, 'DONE.txt')):
            print('Fold %i already computed.' % i)
        else:
            print('Running graph convolution model for fold %i' % i)
            val = fold
            disk_dir_to_delete = tempfile.mkdtemp(prefix=tempdir + '/')
            train = DiskDataset.merge(folds[0:i] + folds[i + 1:],
                                      merge_dir=disk_dir_to_delete)

            # Transformation (z-scaling)
            zscaling_dir_train = op.join(tempdir, 'zscaling', 'train')
            ensure_dir(zscaling_dir_train)
            zscaling_dir_test = op.join(tempdir, 'zscaling', 'test')
            ensure_dir(zscaling_dir_test)
            transfo_dir_to_delete_1 = tempfile.mkdtemp(
                prefix=zscaling_dir_train + '/')
            transfo_dir_to_delete_2 = tempfile.mkdtemp(
                prefix=zscaling_dir_test + '/')
            transformer = NormalizationTransformer(transform_y=True,
                                                   dataset=train)
            scaled_train = transformer.transform(train,
                                                 outdir=transfo_dir_to_delete_1)
            scaled_val = transformer.transform(val,
                                               outdir=transfo_dir_to_delete_2)

            # Train and validate on this fold
            train_y, yhattrain, train_w, test_y, yhattest, test_w = \
                train_and_validate_mtnn(scaled_train,
                                        len(tasks),
                                        outdir=outdir,
                                        graph_conv_sizes=graph_conv_sizes,
                                        dense_size=dense_size,
                                        batch_size=batch_size,
                                        learning_rate=learning_rate,
                                        num_epochs=num_epochs,
                                        pickle_file_name=tasks_nickname + '_fold_%i.pkl' % i,
                                        test=scaled_val,
                                        test_unscaled=val,
                                        transformer=transformer,
                                        fold=i,
                                        gpu=gpu)

            # Compute metrics
            train_results = evaluate_multitask_gc(train_y, yhattrain, train_w)
            test_results = evaluate_multitask_gc(test_y, yhattest, test_w)

            # Populate the results dictionary
            for j, t in enumerate(tasks):
                fold_results['fold'].append(i)
                fold_results['task'].append(t)
                fold_results['train'].append(True)
                fold_results['r2'].append(train_results[j][0])
                fold_results['mse'].append(train_results[j][1])
                fold_results['mae'].append(train_results[j][2])
                fold_results['varex'].append(train_results[j][3])
                fold_results['spearman'].append(train_results[j][4])
                fold_results['fold'].append(i)
                fold_results['task'].append(t)
                fold_results['train'].append(False)
                fold_results['r2'].append(test_results[j][0])
                fold_results['mse'].append(test_results[j][1])
                fold_results['mae'].append(test_results[j][2])
                fold_results['varex'].append(test_results[j][3])
                fold_results['spearman'].append(test_results[j][4])

            # Clean the tempdirs
            shutil.rmtree(disk_dir_to_delete)
            shutil.rmtree(transfo_dir_to_delete_1)
            shutil.rmtree(transfo_dir_to_delete_2)
            print('folder removed!')

    # Get rid of the fold dirs
    for foldir in fold_dirs:
        shutil.rmtree(foldir)
    shutil.rmtree(tempdir)
    print('fold dataset folders removed!')

    return pd.DataFrame.from_dict(fold_results)
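# A hedged usage sketch for cross_validate_mtnn. The CSV names and the 'fold'
# column name are assumptions; each CSV must carry a fold-assignment column
# referenced by split_field.
def _example_cross_validate_mtnn():
    csvs = ['task1.csv', 'task2.csv']
    results_df = cross_validate_mtnn(csvs,
                                     tasks_nickname='demo_mtnn_cv',
                                     smiles_field='SMILES',
                                     split_field='fold',
                                     y_field='pIC50',
                                     id_field='molid',
                                     tempdir='/tmp/mtnn_cv_demo',
                                     num_epochs=10)
    # One row per (fold, task, train/test) combination with R2, MSE, MAE,
    # explained variance and Spearman rho.
    print(results_df.head())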
def k_fold_split(self, dataset, k, directories=None, **kwargs):
    """
    Parameters
    ----------
    dataset: Dataset
        Dataset to do a k-fold split.
    k: int
        Number of folds.
    directories: list of str
        List of length 2*k filepaths to save the result disk-datasets.
    kwargs
        Accepted for interface compatibility; not forwarded to `self.split`
        in this variant.

    Returns
    -------
    List of length k tuples of (train, cv).
    """
    log("Computing K-fold split", self.verbose)
    if directories is None:
        directories = [tempfile.mkdtemp() for _ in range(2 * k)]
    else:
        assert len(directories) == 2 * k
    cv_datasets = []
    train_ds_base = None
    train_datasets = []
    # rem_dataset is the remaining portion of the dataset
    rem_dataset = dataset
    for fold in range(k):
        # frac_fold starts at 1/k (fold == 0) and reaches 1 when fold == k-1.
        frac_fold = 1. / (k - fold)
        train_dir, cv_dir = directories[2 * fold], directories[2 * fold + 1]
        fold_inds, rem_inds, _ = self.split(rem_dataset,
                                            frac_train=frac_fold,
                                            frac_valid=1 - frac_fold,
                                            frac_test=0)
        cv_dataset = rem_dataset.select(fold_inds)
        cv_datasets.append(cv_dataset)
        rem_dataset = rem_dataset.select(rem_inds)

        train_ds_to_merge = filter(lambda x: x is not None,
                                   [train_ds_base, rem_dataset])
        train_ds_to_merge = filter(lambda x: len(x) > 0, train_ds_to_merge)
        train_dataset = DiskDataset.merge(train_ds_to_merge, merge_dir=train_dir)
        train_datasets.append(train_dataset)

        # Unlike the variant above, the running train base is merged into cv_dir.
        update_train_base_merge = filter(lambda x: x is not None,
                                         [train_ds_base, cv_dataset])
        train_ds_base = DiskDataset.merge(update_train_base_merge,
                                          merge_dir=cv_dir)
    return list(zip(train_datasets, cv_datasets))