def input_smi_to_csv(input_file, outdir, n_tasks):
    # 1. Read input
    molids = []
    smis = []
    molid2smi = {}
    with open(input_file, 'r') as reader:
        for line in reader:
            content = line.strip().split('\t')
            assert (len(content) == 2), 'Input file format does not seem to be correct. Expecting .smi format.'
            molids.append(content[1])
            smis.append(content[0])
            molid2smi[content[1]] = content[0]
    molids = [str(mid) for mid in molids]

    # 2. Write temporary csv file for DeepChem
    if outdir is None:
        outdir = '/tmp'
    ensure_dir(outdir)
    task_columns = ['Value_%i' % i for i in range(n_tasks)]
    input_data = pd.DataFrame.from_dict({'molid': molids, 'smiles': smis})
    for v in task_columns:
        input_data[v] = [0 for _ in range(len(molids))]
    temporary_file = op.join(tempfile.mkdtemp(dir=outdir), 'input.csv')
    input_data.to_csv(temporary_file)
    return molids, smis, temporary_file
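
# Usage sketch for input_smi_to_csv (hypothetical file names; the .smi file is expected to be
# tab-separated with the SMILES in the first column and the molecule id in the second):
#
#   molids, smis, csv_path = input_smi_to_csv('compounds.smi', outdir='/tmp/gcnn_input', n_tasks=3)
#   # csv_path now points to a temporary csv with 'molid', 'smiles' and dummy 'Value_0'..'Value_2' columns
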
def load_training_data(dataset_files, split_field='Fold', smiles_field='Smiles', y_field='Value',
                       id_field='Compound_No', tempdir=op.join(MODEL_DIR, 'datatemp'), cv=True):
    """
    Given a list of datasets in csv format, read them and prepare them for DeepChem (split if needed, etc.)
    :param dataset_files: paths to the csv files containing the training data for each task of interest
    :param split_field: column name in the csv giving the fold assignment for CV. Not used if cv=False
    :param smiles_field: column name in the csv giving the SMILES of the compounds
    :param y_field: column name in the csv giving the experimental value to learn
    :param id_field: column name in the csv giving the compound identifiers
    :param tempdir: directory where the temporary DiskDataset files are written
    :param cv: whether we are also splitting the data by split_field
    :return: if cv, the list of task names, the list of ConvMol datasets (one per group in split_field) and
             the list of fold directories; otherwise the list of task names and a single ConvMol dataset
    """
    ensure_dir(tempdir)
    df_trains = []
    for dataset_file in dataset_files:
        try:
            df_trains.append(pd.read_csv(dataset_file, sep=','))
        except IOError:  # no test split, for example
            # create an empty df for the missing task
            df = pd.DataFrame({id_field: [], y_field: [], smiles_field: []})
            df_trains.append(df)

    n_tasks = len(dataset_files)

    # Rename the y_field column so each task keeps its own value column after the merge
    df_trains = [df_train.rename(index=str, columns={y_field: y_field + '_%i' % i})
                 for i, df_train in enumerate(df_trains)]

    # Merge the individual tasks based on SMILES
    if cv:
        df_train = reduce(lambda x, y: pd.merge(x, y, on=[id_field, smiles_field, split_field], how='outer'),
                          df_trains)
    else:
        df_train = reduce(lambda x, y: pd.merge(x, y, on=[id_field, smiles_field], how='outer'), df_trains)

    # Save the merged train csv in a temporary place
    dataset_file = op.join(tempdir, 'data.csv')
    df_train.to_csv(dataset_file, na_rep=np.nan, index=False)

    # Featurization
    featurizer = ConvMolFeaturizer()
    loader = CSVLoader(tasks=[y_field + '_%i' % i for i in range(n_tasks)], smiles_field=smiles_field,
                       featurizer=featurizer, id_field=id_field)
    dataset = loader.featurize(dataset_file, shard_size=8192, data_dir=tempdir)

    if cv:
        folds = np.unique(df_trains[0][split_field].tolist())
        # Separate in folds
        folds_datasets = []
        fold_dirs = []
        for f in folds:
            fold_dir = tempfile.mkdtemp(prefix=tempdir + '/')
            indices = np.flatnonzero(df_train[split_field] == f)
            folds_datasets.append(dataset.select(indices, select_dir=fold_dir))
            fold_dirs.append(fold_dir)
        return ['Value_%i' % i for i in range(n_tasks)], folds_datasets, fold_dirs

    return ['Value_%i' % i for i in range(n_tasks)], dataset
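
# Usage sketch for load_training_data (hypothetical csv and column names):
#
#   # Cross-validation setup: one csv per task, each with a 'Fold' column
#   tasks, fold_dsets, fold_dirs = load_training_data(['task1_train.csv', 'task2_train.csv'],
#                                                     split_field='Fold', smiles_field='Smiles',
#                                                     y_field='Value', id_field='Compound_No',
#                                                     tempdir='/tmp/gcnn_cv', cv=True)
#
#   # Plain training setup: no fold column needed, a single merged DiskDataset is returned
#   tasks, train_dset = load_training_data(['task1_train.csv', 'task2_train.csv'],
#                                          smiles_field='Smiles', y_field='Value',
#                                          id_field='Compound_No', tempdir='/tmp/gcnn_train', cv=False)
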
def easy_predict(dataset, model, n_tasks, tempdir, smiles_field='Canonical_Smiles', id_field='Compound_No'):
    """
    Featurizes the input data, runs the trained model on it and returns the predictions together with the
    ids of the molecules that could be processed.
    """
    ensure_dir(tempdir)
    # 1. Get the data into the DeepChem DiskDataset object format
    _, dset = load_inference_data(dataset, n_tasks, tempdir, smiles_field, id_field)
    molids_processed = dset.ids
    predictions = model.predict(dset)
    return predictions, molids_processed
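
# Usage sketch for easy_predict (hypothetical objects; assumes `model` is a trained DeepChem graph
# convolution model and `dataset` is whatever input load_inference_data expects as its first argument):
#
#   predictions, molids = easy_predict(dataset, model, n_tasks=3, tempdir='/tmp/gcnn_inference',
#                                      smiles_field='Canonical_Smiles', id_field='Compound_No')
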
def train_and_validate_mtnn(train, n_tasks, outdir, graph_conv_sizes, dense_size, batch_size, learning_rate,
                            num_epochs, pickle_file_name, test=None, test_unscaled=None, transformer=None,
                            fold=None, gpu=None):
    """
    :param train: DeepChem dataset object, y appropriately scaled already
    :param n_tasks: number of tasks in the data
    :param outdir: where to store the outputs
    :param graph_conv_sizes: tuple with output dimension for every GC layer
    :param dense_size: size of the dense layer
    :param batch_size: number of examples per minibatch
    :param learning_rate: initial learning rate
    :param num_epochs: number of epochs to perform training
    :param pickle_file_name: name of the file that will contain ytrain, yhattrain, etc.
    :param test: optional. Can be a DeepChem dataset object in case we want to validate the model, with y
    already scaled as needed. If not provided, only training set fitting will be monitored.
    :param test_unscaled: optional. Can be a DeepChem dataset object with y as in the original dataset.
    :param transformer: optional. Transformer object used to transform train and test (normally, a z-scaler for the y).
    :param fold: fold number in case we are doing CV. Will be used as a suffix for pickle files
    :param gpu: which GPU to use. If None, training will happen on CPU (very slow!)
    :return: y_true, y_pred, and weights for the training set (and also for the test set if provided)
    """
    # 0. GPU management
    if gpu is None:
        import os
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        config = tf.ConfigProto(device_count={'GPU': 0, 'CPU': 1})
    else:
        import os
        os.environ['CUDA_VISIBLE_DEVICES'] = '%i' % gpu
        config = tf.ConfigProto(gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.75),
                                device_count={'GPU': 1})

    # 1. Define the model
    model_dir = op.join(outdir, 'model')
    ensure_dir(model_dir)
    model = define_gc_regression_model(n_tasks, graph_conv_sizes=graph_conv_sizes, dense_size=dense_size,
                                       batch_size=batch_size, learning_rate=learning_rate,
                                       model_dir=model_dir, config=config)

    # 2. Define the metrics
    r2 = Metric(r2_score, np.mean)
    spearman = Metric(spearmanr, np.mean, mode='regression', name='spearman_rho')

    # 3. Train the model
    for epoch in range(num_epochs):
        print('EPOCH %i' % epoch)
        model.fit(train, nb_epoch=1)
        # at every epoch, stop and evaluate on the training set
        model.evaluate(train, [r2, spearman])
        if test is not None:
            try:
                model.evaluate(test, [r2, spearman])
            except TypeError:  # nan in one of the tasks, for example
                print('No validation performance available')

    # 4. Obtain the final predictions
    yhattrain = model.predict(train)
    if test is not None:
        yhattest = model.predict(test)

    # 5. Save the model and the predictions
    print('Saving results...')
    # save the ys in a pickle file (also include the non-scaled y for the test set so we can revert back and compare)
    with open(op.join(outdir, pickle_file_name), 'wb') as writer:
        if test is not None:
            pickle.dump([train.y, yhattrain, train.w, test.y, yhattest, test.w, test_unscaled.y],
                        writer, protocol=pickle.HIGHEST_PROTOCOL)
        else:
            pickle.dump([train.y, yhattrain, train.w], writer, protocol=pickle.HIGHEST_PROTOCOL)
    model.save()

    # save the transformer for inference time...
    if fold is not None:
        transformer_file = op.join(outdir, 'transformer_fold_%i.pkl' % fold)
    else:
        transformer_file = op.join(outdir, 'transformer.pkl')
    with open(transformer_file, 'wb') as writer:
        pickle.dump(transformer, writer, protocol=pickle.HIGHEST_PROTOCOL)

    # save the molids...
    if fold is not None:
        molids_file = op.join(outdir, 'molids_fold_%i.pkl' % fold)
    else:
        molids_file = op.join(outdir, 'molids.pkl')
    with open(molids_file, 'wb') as writer:
        if test is not None:
            pickle.dump([train.ids, test.ids], writer, protocol=pickle.HIGHEST_PROTOCOL)
        else:
            pickle.dump(train.ids, writer, protocol=pickle.HIGHEST_PROTOCOL)

    # Signal that this training is over by creating an empty DONE.txt file
    open(op.join(outdir, 'DONE.txt'), 'a').close()

    if test is not None:
        return train.y, yhattrain, train.w, test.y, yhattest, test.w
    return train.y, yhattrain, train.w
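
# Usage sketch for train_and_validate_mtnn (hypothetical values; `scaled_train` and `scaled_val` are
# DiskDatasets whose y has already been z-scaled with `transformer`, `test_dset` holds the unscaled y):
#
#   ytrain, yhattrain, wtrain, ytest, yhattest, wtest = \
#       train_and_validate_mtnn(scaled_train, n_tasks=3, outdir='results/train_test',
#                               graph_conv_sizes=(128, 128), dense_size=256, batch_size=128,
#                               learning_rate=0.001, num_epochs=40, pickle_file_name='mytasks.pkl',
#                               test=scaled_val, test_unscaled=test_dset, transformer=transformer, gpu=0)
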
def train_test_mtnn(train_task_csvs, test_tasks_csvs, tasks_nickname, smiles_field, y_field, id_field, tempdir,
                    num_epochs=40, batch_size=128, learning_rate=0.001, graph_conv_sizes=(128, 128),
                    dense_size=256, gpu=None):
    """
    Trains a multitask GCNN using the training sets in train_task_csvs and validates it using the test sets
    in test_tasks_csvs. Saves the trained model and the predictions under a folder named "train_test".
    Prints performance metrics (R2 and Spearman rho) after every epoch.
    NB: each task in the model should have corresponding training and test files, named similarly
    (ex: task1_train.csv, task1_test.csv).
    :param train_task_csvs: list of csv files containing the training tasks
    :param test_tasks_csvs: list of csv files containing the test tasks
    :param tasks_nickname: how the model will be named
    :param smiles_field: in the csvs, name of the column containing the smiles string of the cpds
    :param y_field: in the csvs, name of the column containing the activity data
    :param id_field: in the csvs, name of the column containing the molids
    :param tempdir: where to store the temporary files for the DiskDatasets (will be deleted later on)
    :param num_epochs: how many epochs to train for
    :param batch_size: number of molecules per minibatch
    :param learning_rate: learning rate
    :param graph_conv_sizes: tuple with output dimension for every GC layer
    :param dense_size: nb of neurons in the last dense layer
    :param gpu: GPU to use for training (if None, only CPU will be used)
    :return: None
    """
    ensure_dir(tempdir)
    tasks, training_dset = load_training_data(train_task_csvs, smiles_field=smiles_field, y_field=y_field,
                                              id_field=id_field, tempdir=op.join(tempdir, 'train'), cv=False)
    tasks, test_dset = load_training_data(test_tasks_csvs, smiles_field=smiles_field, y_field=y_field,
                                          id_field=id_field, tempdir=op.join(tempdir, 'test'), cv=False)

    # Take care of outdir
    outdir = get_multitask_traintest_outdir(tasks_nickname)
    ensure_dir(outdir)

    # Have we already run that experiment?
    if op.exists(op.join(outdir, 'DONE.txt')):
        print('Model already trained and validated.')
    else:
        print('Training and validating multitask graph convolution model')

        # Merge to reduce the number of shards (necessary to avoid problems with non-random minibatches)
        disk_dir_to_delete = tempfile.mkdtemp(prefix=tempdir + '/')
        training_dset = DiskDataset.merge([training_dset], merge_dir=disk_dir_to_delete)

        # Transformation (z-scaling)
        zscaling_dir_train = op.join(tempdir, 'zscaling', 'train')
        ensure_dir(zscaling_dir_train)
        zscaling_dir_test = op.join(tempdir, 'zscaling', 'test')
        ensure_dir(zscaling_dir_test)
        transfo_dir_to_delete_1 = tempfile.mkdtemp(prefix=zscaling_dir_train + '/')
        transfo_dir_to_delete_2 = tempfile.mkdtemp(prefix=zscaling_dir_test + '/')
        transformer = NormalizationTransformer(transform_y=True, dataset=training_dset)
        scaled_train = transformer.transform(training_dset, outdir=transfo_dir_to_delete_1)
        scaled_val = transformer.transform(test_dset, outdir=transfo_dir_to_delete_2)

        # Train the model
        scaled_train_y, yhattrain, scaled_train_w, scaled_test_y, yhattest, scaled_test_w = \
            train_and_validate_mtnn(scaled_train, n_tasks=len(tasks), outdir=outdir,
                                    graph_conv_sizes=graph_conv_sizes, dense_size=dense_size,
                                    batch_size=batch_size, learning_rate=learning_rate,
                                    num_epochs=num_epochs, pickle_file_name=tasks_nickname + '.pkl',
                                    test=scaled_val, transformer=transformer, test_unscaled=test_dset, gpu=gpu)

        # Compute metrics on the scaled predictions
        scaled_results_test = evaluate_multitask_gc(scaled_test_y, yhattest, scaled_test_w)
        for k, vals in scaled_results_test.items():
            print(k)
            print(vals)

        # Reverse the transformation on the predictions and evaluate in the original scale
        yhattest_untransf = undo_transforms(yhattest, [transformer])
        unscaled_results_test = evaluate_multitask_gc(test_dset.y, yhattest_untransf, test_dset.w)
        for k, vals in unscaled_results_test.items():
            print(k)
            print(vals)  # hopefully the results are very similar

        # Remove the transformation dirs
        shutil.rmtree(transfo_dir_to_delete_1)
        shutil.rmtree(transfo_dir_to_delete_2)

    # Get rid of the temporary directory structure
    shutil.rmtree(tempdir)
    print('Dataset folders removed!')
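
# Usage sketch for train_test_mtnn (hypothetical file and column names):
#
#   train_test_mtnn(['task1_train.csv', 'task2_train.csv'], ['task1_test.csv', 'task2_test.csv'],
#                   tasks_nickname='mytasks', smiles_field='Canonical_Smiles', y_field='Value',
#                   id_field='Compound_No', tempdir='/tmp/gcnn_traintest', num_epochs=40, gpu=0)
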
def train_multitask_gc(train_task_csvs, tasks_nickname, smiles_field, y_field, id_field, tempdir, num_epochs,
                       batch_size=128, learning_rate=0.001, graph_conv_sizes=(128, 128), dense_size=256,
                       gpu=None):
    """
    We assemble all the data we have on all tasks for a final training run.
    :param train_task_csvs: csv files of the training sets
    :param tasks_nickname: how to name the model (ex: 'PCtasks')
    :param smiles_field: in the csv, name of the column containing the smiles string of the cpds
    :param y_field: in the csv, name of the column containing the activity data
    :param id_field: in the csv, name of the column containing the molids
    :param tempdir: where to store the temporary files for the DiskDatasets (will be deleted later on)
    :param num_epochs: how many epochs to train for
    :param batch_size: number of molecules per minibatch
    :param learning_rate: learning rate
    :param graph_conv_sizes: tuple with output dimension for every GC layer
    :param dense_size: nb of neurons in the last dense layer
    :param gpu: GPU to use for training (if None, only CPU will be used)
    :return: None
    """
    ensure_dir(tempdir)

    # Get and merge the data
    tasks, training_dset = load_training_data(train_task_csvs, smiles_field=smiles_field, y_field=y_field,
                                              id_field=id_field, tempdir=op.join(tempdir, 'train'), cv=False)

    # Take care of outdir
    outdir = get_multitask_outdir(tasks_nickname)
    ensure_dir(outdir)

    # Have we already run that experiment?
    if op.exists(op.join(outdir, 'DONE.txt')):
        print('Model already trained and validated.')
    else:
        print('Training the final multitask graph convolution model')

        # Transformation (z-scaling)
        zscaling_dir_train = op.join(tempdir, 'zscaling')
        ensure_dir(zscaling_dir_train)
        transfo_dir_to_delete = tempfile.mkdtemp(prefix=zscaling_dir_train + '/')
        transformer = NormalizationTransformer(transform_y=True, dataset=training_dset)
        scaled_train = transformer.transform(training_dset, outdir=transfo_dir_to_delete)

        train_y, yhattrain, train_w = train_and_validate_mtnn(scaled_train, len(tasks), outdir,
                                                              graph_conv_sizes=graph_conv_sizes,
                                                              dense_size=dense_size, batch_size=batch_size,
                                                              learning_rate=learning_rate,
                                                              num_epochs=num_epochs,
                                                              pickle_file_name=tasks_nickname + '.pkl',
                                                              test=None, test_unscaled=None,
                                                              transformer=transformer, fold=None, gpu=gpu)

        # compute metrics
        train_results = evaluate_multitask_gc(train_y, yhattrain, train_w)
        for k, vals in train_results.items():
            print(k)
            print(vals)

        # Remove temporary directory for transformer
        shutil.rmtree(transfo_dir_to_delete)

    # Get rid of the whole temporary directory structure
    shutil.rmtree(tempdir)
    print('Dataset folders removed!')
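
# Usage sketch for train_multitask_gc (hypothetical file and column names; trains on all the data,
# no held-out validation set):
#
#   train_multitask_gc(['task1_train.csv', 'task2_train.csv'], tasks_nickname='mytasks',
#                      smiles_field='Canonical_Smiles', y_field='Value', id_field='Compound_No',
#                      tempdir='/tmp/gcnn_final', num_epochs=40, gpu=0)
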
def cross_validate_mtnn(task_csvs, tasks_nickname, smiles_field, split_field, y_field, id_field, tempdir,
                        num_epochs, batch_size=128, learning_rate=0.001, graph_conv_sizes=(128, 128),
                        dense_size=256, gpu=None):
    """
    Cross-validates a multitask GCNN using the training sets in task_csvs. Saves the trained models and the
    predictions under folders named "fold_i". Prints performance metrics (R2 and Spearman rho) after every epoch.
    NB: each task in the model should have a corresponding training file. A column with the fold assignment
    should be provided for the cross-validation.
    :param task_csvs: list of csv files containing the training tasks
    :param tasks_nickname: how the model will be named
    :param smiles_field: in the csvs, name of the column containing the smiles string of the cpds
    :param split_field: in the csvs, name of the column containing the fold assignment for the cross-validation
    :param y_field: in the csvs, name of the column containing the activity data
    :param id_field: in the csvs, name of the column containing the molids
    :param tempdir: where to store the temporary files for the DiskDatasets (will be deleted later on)
    :param num_epochs: how many epochs to train for
    :param batch_size: number of molecules per minibatch
    :param learning_rate: learning rate
    :param graph_conv_sizes: tuple with output dimension for every GC layer
    :param dense_size: nb of neurons in the last dense layer
    :param gpu: GPU to use for training (if None, only CPU will be used)
    :return: a pandas dataframe with performance metrics for every fold
    """
    ensure_dir(tempdir)
    tasks, folds, fold_dirs = load_training_data(task_csvs, smiles_field=smiles_field, split_field=split_field,
                                                 y_field=y_field, id_field=id_field, tempdir=tempdir, cv=True)
    fold_results = defaultdict(list)

    for i, fold in enumerate(folds):
        # Take care of outdir
        outdir = get_multitask_cv_outdir(tasks_nickname, i)
        ensure_dir(outdir)

        # Have we already run that fold?
        if op.exists(op.join(outdir, 'DONE.txt')):
            print('Fold %i already computed.' % i)
        else:
            print('Running graph convolution model for fold %i' % i)
            val = fold
            disk_dir_to_delete = tempfile.mkdtemp(prefix=tempdir + '/')
            train = DiskDataset.merge(folds[0:i] + folds[i + 1:], merge_dir=disk_dir_to_delete)

            # Transformation (z-scaling)
            zscaling_dir_train = op.join(tempdir, 'zscaling', 'train')
            ensure_dir(zscaling_dir_train)
            zscaling_dir_test = op.join(tempdir, 'zscaling', 'test')
            ensure_dir(zscaling_dir_test)
            transfo_dir_to_delete_1 = tempfile.mkdtemp(prefix=zscaling_dir_train + '/')
            transfo_dir_to_delete_2 = tempfile.mkdtemp(prefix=zscaling_dir_test + '/')
            transformer = NormalizationTransformer(transform_y=True, dataset=train)
            scaled_train = transformer.transform(train, outdir=transfo_dir_to_delete_1)
            scaled_val = transformer.transform(val, outdir=transfo_dir_to_delete_2)

            train_y, yhattrain, train_w, test_y, yhattest, test_w = \
                train_and_validate_mtnn(scaled_train, len(tasks), outdir=outdir,
                                        graph_conv_sizes=graph_conv_sizes, dense_size=dense_size,
                                        batch_size=batch_size, learning_rate=learning_rate,
                                        num_epochs=num_epochs,
                                        pickle_file_name=tasks_nickname + '_fold_%i.pkl' % i,
                                        test=scaled_val, test_unscaled=val, transformer=transformer,
                                        fold=i, gpu=gpu)

            # compute metrics
            train_results = evaluate_multitask_gc(train_y, yhattrain, train_w)
            test_results = evaluate_multitask_gc(test_y, yhattest, test_w)

            # Populate the results dictionary
            for j, t in enumerate(tasks):
                fold_results['fold'].append(i)
                fold_results['task'].append(t)
                fold_results['train'].append(True)
                fold_results['r2'].append(train_results[j][0])
                fold_results['mse'].append(train_results[j][1])
                fold_results['mae'].append(train_results[j][2])
                fold_results['varex'].append(train_results[j][3])
                fold_results['spearman'].append(train_results[j][4])
                fold_results['fold'].append(i)
                fold_results['task'].append(t)
                fold_results['train'].append(False)
                fold_results['r2'].append(test_results[j][0])
                fold_results['mse'].append(test_results[j][1])
                fold_results['mae'].append(test_results[j][2])
                fold_results['varex'].append(test_results[j][3])
                fold_results['spearman'].append(test_results[j][4])

            # Clean the tempdirs
            shutil.rmtree(disk_dir_to_delete)
            shutil.rmtree(transfo_dir_to_delete_1)
            shutil.rmtree(transfo_dir_to_delete_2)
            print('folder removed!')

    # Get rid of the foldirs
    for foldir in fold_dirs:
        shutil.rmtree(foldir)
    shutil.rmtree(tempdir)
    print('fold dataset folders removed!')

    return pd.DataFrame.from_dict(fold_results)
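
# Usage sketch for cross_validate_mtnn (hypothetical file and column names; the csvs must contain a
# fold assignment column, here assumed to be called 'Fold'):
#
#   cv_results = cross_validate_mtnn(['task1_train.csv', 'task2_train.csv'], tasks_nickname='mytasks',
#                                    smiles_field='Canonical_Smiles', split_field='Fold', y_field='Value',
#                                    id_field='Compound_No', tempdir='/tmp/gcnn_cv', num_epochs=40, gpu=0)
#   print(cv_results.groupby(['task', 'train'])[['r2', 'spearman']].mean())
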