Example #1
def input_smi_to_csv(input_file, outdir, n_tasks):
    """
    Read a tab-separated .smi file (SMILES, molid) and write a temporary csv with dummy task
    columns so it can be loaded by DeepChem.
    :param input_file: path to the .smi file (one compound per line: SMILES<tab>molid)
    :param outdir: directory under which a temporary folder for the csv is created (/tmp if None)
    :param n_tasks: number of dummy Value_i columns to add
    :return: the list of molids, the list of smiles and the path to the temporary csv
    """

    # 1. Read input
    molids = []
    smis = []
    molid2smi = {}
    with open(input_file, 'r') as reader:
        for line in reader:
            content = line.strip().split('\t')
            assert (len(content) == 2), 'Input file format does not seem to be correct. Expecting .smi format.'
            molids.append(content[1])
            smis.append(content[0])
            molid2smi[content[1]] = content[0]
    molids = [str(mid) for mid in molids]

    # 2. Write temporary csv file for DeepChem
    if outdir is None:
        outdir = '/tmp'
    ensure_dir(outdir)

    task_columns = ['Value_%i' % i for i in range(n_tasks)]
    input_data = pd.DataFrame.from_dict({'molid': molids, 'smiles': smis})
    for v in task_columns:
        input_data[v] = [0 for _ in range(len(molids))]
    temporary_file = op.join(tempfile.mkdtemp(dir=outdir), 'input.csv')
    input_data.to_csv(temporary_file)
    return molids, smis, temporary_file
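
A minimal usage sketch for input_smi_to_csv (hypothetical file names; it assumes the module-level helpers the function relies on, such as ensure_dir, pandas as pd, os.path as op and tempfile, are importable):

# Hypothetical two-compound .smi file: SMILES<tab>molid, one compound per line.
with open('/tmp/example.smi', 'w') as writer:
    writer.write('CCO\tmol_001\n')
    writer.write('c1ccccc1\tmol_002\n')

# Convert it into the temporary csv expected downstream by the DeepChem CSVLoader.
molids, smis, csv_path = input_smi_to_csv('/tmp/example.smi', outdir='/tmp', n_tasks=2)
print(molids)    # ['mol_001', 'mol_002']
print(csv_path)  # .../input.csv with columns molid, smiles, Value_0, Value_1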
Example #2
def load_training_data(dataset_files, split_field='Fold', smiles_field='Smiles', y_field='Value',
                       id_field='Compound_No', tempdir=op.join(MODEL_DIR, 'datatemp'), cv=True):
    """
    Given a list of datasets in csv format, read them and prepare them for DeepChem (split if needed, etc.)
    :param dataset_files: paths to the csv files containing the training data for each task of interest
    :param split_field: column name in the csv giving the fold assignment for CV. Not used if cv=False
    :param smiles_field: column name in the csv giving the SMILES of the compounds
    :param y_field: column name in the csv giving the experimental value to learn
    :param id_field: column name in the csv giving the compound identifiers
    :param tempdir: directory where the temporary DiskDataset files are written
    :param cv: whether to also split the data into folds according to split_field
    :return: if cv=True, the list of task names, the list of ConvMol datasets (one per fold in split_field)
    and the list of fold directories; if cv=False, the list of task names and a single merged dataset
    """
    ensure_dir(tempdir)

    df_trains = []
    for dataset_file in dataset_files:
        try:
            df_trains.append(pd.read_csv(dataset_file, sep=','))
        except IOError:  # no test split for example
            df = pd.DataFrame(
                {id_field: [], y_field: [], smiles_field: []})  # create an empty df for missing task
            df_trains.append(df)

    n_tasks = len(dataset_files)
    # Rename the y_field column
    df_trains = [df_train.rename(index=str, columns={y_field: y_field + '_%i' % i}) for i, df_train in
                 enumerate(df_trains)]

    # Merge the individual tasks based on Smiles
    if cv:
        df_train = reduce(lambda x, y: pd.merge(x, y, on=[id_field, smiles_field, split_field], how='outer'),
                          df_trains)
    else:
        df_train = reduce(lambda x, y: pd.merge(x, y, on=[id_field, smiles_field], how='outer'), df_trains)
    # Save the merged train csv in a temporary place
    dataset_file = op.join(tempdir, 'data.csv')
    df_train.to_csv(dataset_file, na_rep=np.nan, index=False)

    # Featurization
    featurizer = ConvMolFeaturizer()
    loader = CSVLoader(tasks=[y_field + '_%i' % i for i in range(n_tasks)], smiles_field=smiles_field,
                       featurizer=featurizer, id_field=id_field)
    dataset = loader.featurize(dataset_file, shard_size=8192, data_dir=tempdir)

    if cv:
        folds = np.unique(df_trains[0][split_field].tolist())

        # Separate in folds
        folds_datasets = []
        fold_dirs = []
        for f in folds:
            fold_dir = tempfile.mkdtemp(prefix=tempdir + '/')
            indices = np.flatnonzero(df_train[split_field] == f)
            folds_datasets.append(dataset.select(indices, select_dir=fold_dir))
            fold_dirs.append(fold_dir)

        return ['Value_%i' % i for i in range(n_tasks)], folds_datasets, fold_dirs

    return ['Value_%i' % i for i in range(n_tasks)], dataset
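
A hypothetical usage sketch for load_training_data (placeholder csv and column names; each csv is assumed to contain the id, SMILES, value and, for the CV case, fold columns):

# Cross-validation: one featurized ConvMol DiskDataset per fold, plus the fold directories.
tasks, fold_datasets, fold_dirs = load_training_data(
    ['task1_train.csv', 'task2_train.csv'],
    split_field='Fold', smiles_field='Smiles', y_field='Value',
    id_field='Compound_No', tempdir='/tmp/datatemp_cv', cv=True)

# Plain training: a single merged, featurized dataset.
tasks, train_dset = load_training_data(
    ['task1_train.csv', 'task2_train.csv'],
    smiles_field='Smiles', y_field='Value',
    id_field='Compound_No', tempdir='/tmp/datatemp_full', cv=False)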
Example #3
def easy_predict(dataset,
                 model,
                 n_tasks,
                 tempdir,
                 smiles_field='Canonical_Smiles',
                 id_field='Compound_No'):
    """
    Featurize the input compounds and predict with an already trained DeepChem model.
    :return: the predictions and the ids of the molecules that were successfully processed
    """
    ensure_dir(tempdir)
    # 1. Get the data into the DeepChem DiskDataset object format
    _, dset = load_inference_data(dataset, n_tasks, tempdir, smiles_field,
                                  id_field)
    molids_processed = dset.ids

    predictions = model.predict(dset)

    return predictions, molids_processed
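
A hypothetical usage sketch for easy_predict, assuming model is an already trained DeepChem graph convolution model and dataset is whatever input load_inference_data (not shown here) expects:

predictions, molids = easy_predict(dataset, model, n_tasks=2,
                                   tempdir='/tmp/inference',
                                   smiles_field='Canonical_Smiles',
                                   id_field='Compound_No')
# One prediction row per successfully featurized molecule, in the order given by molids.
for molid, pred in zip(molids, predictions):
    print(molid, pred)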
Example #4
def train_and_validate_mtnn(train,
                            n_tasks,
                            outdir,
                            graph_conv_sizes,
                            dense_size,
                            batch_size,
                            learning_rate,
                            num_epochs,
                            pickle_file_name,
                            test=None,
                            test_unscaled=None,
                            transformer=None,
                            fold=None,
                            gpu=None):
    """
    :param train: DeepChem dataset object, with y already scaled appropriately
    :param n_tasks: number of tasks in the data
    :param outdir: where to store the outputs
    :param batch_size: number of examples per minibatch
    :param learning_rate: initial learning rate
    :param graph_conv_sizes: tuple with output dimension for every GC layer
    :param dense_size: size of the dense layer
    :param num_epochs: number of epochs to perform training
    :param pickle_file_name: name of the pickle file that will contain ytrain, yhattrain, etc.
    :param test: optional. DeepChem dataset object used to validate the model, with y already scaled
    as needed. If not provided, only training set fitting will be monitored.
    :param test_unscaled: optional. Can be a DeepChem dataset object with y as in the original dataset.
    :param transformer: optional. transformer object used to transform train and test (normally, z-scaler for the y).
    :param fold: fold number in case we are doing CV. Will be used as a suffix for pickle files
    :param gpu: which GPU to use. If None, training will happen on CPU (very slow!)
    :return: y_true, y_pred, and weights for the training (and also for test if provided)
    """

    # 0. GPU management
    if gpu is None:
        import os
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
        config = tf.ConfigProto(device_count={'GPU': 0, 'CPU': 1})
    else:
        import os
        os.environ['CUDA_VISIBLE_DEVICES'] = '%i' % gpu
        config = tf.ConfigProto(
            gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=0.75),
            device_count={'GPU': 1})

    # 1. Define the model
    model_dir = op.join(outdir, 'model')
    ensure_dir(model_dir)
    model = define_gc_regression_model(n_tasks,
                                       graph_conv_sizes=graph_conv_sizes,
                                       dense_size=dense_size,
                                       batch_size=batch_size,
                                       learning_rate=learning_rate,
                                       model_dir=model_dir,
                                       config=config)

    # 2. Define the metrics
    r2 = Metric(r2_score, np.mean)
    spearman = Metric(spearmanr,
                      np.mean,
                      mode='regression',
                      name='spearman_rho')

    # 3. Train the model
    for epoch in range(num_epochs):
        print('EPOCH %i' % epoch)
        model.fit(
            train,
            nb_epoch=1)  # at every epoch, stop and evaluate on training set
        model.evaluate(train, [r2, spearman])
        if test is not None:
            try:
                model.evaluate(test, [r2, spearman])
            except TypeError:  # nan in one of the tasks, for ex
                print('No validation performance available')

    # 4. Obtain final performance
    yhattrain = model.predict(train)
    if test is not None:
        yhattest = model.predict(test)

    # 5. Save the model and the predictions
    print('Saving results...')
    # save the ys in a pickle file (also include the non scaled y for test set so we can revert back and compare)
    with open(op.join(outdir, pickle_file_name), 'wb') as writer:
        if test is not None:
            pickle.dump([
                train.y, yhattrain, train.w, test.y, yhattest, test.w,
                test_unscaled.y
            ],
                        writer,
                        protocol=pickle.HIGHEST_PROTOCOL)
        else:
            pickle.dump([train.y, yhattrain, train.w],
                        writer,
                        protocol=pickle.HIGHEST_PROTOCOL)

    model.save()

    # save the transformer for inference time...
    if fold is not None:
        transformer_file = op.join(outdir, 'transformer_fold_%i.pkl' % fold)
    else:
        transformer_file = op.join(outdir, 'transformer.pkl')
    with open(transformer_file, 'wb') as writer:
        pickle.dump(transformer, writer, protocol=pickle.HIGHEST_PROTOCOL)

    # save the molids...
    if fold is not None:
        molids_file = op.join(outdir, 'molids_fold_%i.pkl' % fold)
    else:
        molids_file = op.join(outdir, 'molids.pkl')
    with open(molids_file, 'wb') as writer:
        if test is not None:
            pickle.dump([train.ids, test.ids],
                        writer,
                        protocol=pickle.HIGHEST_PROTOCOL)
        else:
            pickle.dump(train.ids, writer, protocol=pickle.HIGHEST_PROTOCOL)

    # Signal that this training is over by creating an empty DONE.txt file
    open(op.join(outdir, 'DONE.txt'), 'a').close()

    if test is not None:
        return train.y, yhattrain, train.w, test.y, yhattest, test.w
    else:
        return train.y, yhattrain, train.w
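
A hypothetical call sketch for train_and_validate_mtnn: scaled_train, scaled_test, test_dset and transformer are assumed to have been prepared as in the train_test_mtnn wrapper below (z-scaled DiskDatasets plus the NormalizationTransformer that scaled them):

y_tr, yhat_tr, w_tr, y_te, yhat_te, w_te = train_and_validate_mtnn(
    scaled_train, n_tasks=3, outdir='results/run1',
    graph_conv_sizes=(128, 128), dense_size=256,
    batch_size=128, learning_rate=0.001, num_epochs=40,
    pickle_file_name='run1.pkl',
    test=scaled_test, test_unscaled=test_dset,
    transformer=transformer, fold=None, gpu=0)
# With test=None, only (train.y, yhattrain, train.w) would be returned.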
Example #5
def train_test_mtnn(train_task_csvs,
                    test_tasks_csvs,
                    tasks_nickname,
                    smiles_field,
                    y_field,
                    id_field,
                    tempdir,
                    num_epochs=40,
                    batch_size=128,
                    learning_rate=0.001,
                    graph_conv_sizes=(128, 128),
                    dense_size=256,
                    gpu=None):
    """
    Trains a multitask GCNN using the training sets in train_tasks_csvs and validates it using the test sets in
    test_tasks_csvs. Saves the trained model and the predictions under a folder named "train_test". Prints performance
    metrics (R2 and Spearman rho) after every epoch.
    NB: each task in the model should have corresponding training and test files, named consistently (ex: task1_train.csv,
    task1_test.csv).
    :param train_task_csvs: list of csv files containing the training tasks
    :param test_tasks_csvs: list of csv files containing the test tasks
    :param tasks_nickname: how the model will be named
    :param smiles_field: in the csvs, name of the column containing the smiles string of the cpds
    :param y_field: in the csv, name of the column containing the activity data
    :param id_field: in the csv, name of the column containing the molids
    :param tempdir: where to store the temporary files for the DiskDatasets (will be deleted later on)
    :param num_epochs: how many epochs to train for
    :param batch_size: number of molecules per minibatch
    :param learning_rate: learning rate
    :param graph_conv_sizes: tuple with output dimension for every GC layer
    :param dense_size: nb of neurons in the last dense layer
    :param gpu: GPU to use for training (if None, only CPU will be used)
    :return: None
    """

    ensure_dir(tempdir)
    tasks, training_dset = load_training_data(train_task_csvs,
                                              smiles_field=smiles_field,
                                              y_field=y_field,
                                              id_field=id_field,
                                              tempdir=op.join(
                                                  tempdir, 'train'),
                                              cv=False)
    tasks, test_dset = load_training_data(test_tasks_csvs,
                                          smiles_field=smiles_field,
                                          y_field=y_field,
                                          id_field=id_field,
                                          tempdir=op.join(tempdir, 'test'),
                                          cv=False)

    # Take care of outdir
    outdir = get_multitask_traintest_outdir(tasks_nickname)
    ensure_dir(outdir)

    # Have we already run that experiment?
    if op.exists(op.join(outdir, 'DONE.txt')):
        print('Model already trained and validated.')

    else:
        print('Training and validating multitask graph convolution model')

        # Merge to reduce the number of shards (very necessary to avoid weird problems of non-random minibatches)
        disk_dir_to_delete = tempfile.mkdtemp(prefix=tempdir + '/')
        training_dset = DiskDataset.merge([training_dset],
                                          merge_dir=disk_dir_to_delete)

        # Transformation (z-scaling)
        zscaling_dir_train = op.join(tempdir, 'zscaling', 'train')
        ensure_dir(zscaling_dir_train)
        zscaling_dir_test = op.join(tempdir, 'zscaling', 'test')
        ensure_dir(zscaling_dir_test)
        transfo_dir_to_delete_1 = tempfile.mkdtemp(prefix=zscaling_dir_train +
                                                   '/')
        transfo_dir_to_delete_2 = tempfile.mkdtemp(prefix=zscaling_dir_test +
                                                   '/')
        transformer = NormalizationTransformer(transform_y=True,
                                               dataset=training_dset)
        scaled_train = transformer.transform(training_dset,
                                             outdir=transfo_dir_to_delete_1)
        scaled_val = transformer.transform(test_dset,
                                           outdir=transfo_dir_to_delete_2)

        # Train the model
        scaled_train_y, yhattrain, scaled_train_w, scaled_test_y, yhattest, scaled_test_w = \
            train_and_validate_mtnn(scaled_train, n_tasks=len(tasks), outdir=outdir, graph_conv_sizes=graph_conv_sizes,
                                    dense_size=dense_size, batch_size=batch_size, learning_rate=learning_rate,
                                    num_epochs=num_epochs, pickle_file_name=tasks_nickname + '.pkl', test=scaled_val,
                                    transformer=transformer, test_unscaled=test_dset, gpu=gpu)

        # compute metrics
        scaled_results_test = evaluate_multitask_gc(scaled_test_y, yhattest,
                                                    scaled_test_w)
        for k, vals in scaled_results_test.items():
            print(k)
            print(vals)

        # let's reverse the transformation from the predictions
        yhattest_untransf = undo_transforms(yhattest, [transformer])
        unscaled_results_test = evaluate_multitask_gc(test_dset.y,
                                                      yhattest_untransf,
                                                      test_dset.w)
        for k, vals in unscaled_results_test.items():
            print(k)
            print(vals)
        # hopefully the results are very similar

        # Remove transfo dir
        shutil.rmtree(transfo_dir_to_delete_1)
        shutil.rmtree(transfo_dir_to_delete_2)

    # Get rid of the temporary directory structure
    shutil.rmtree(tempdir)

    print('Dataset folders removed!')
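
A hypothetical end-to-end sketch for train_test_mtnn, assuming one training/test csv pair per task with matching names, as the docstring requires:

train_test_mtnn(train_task_csvs=['task1_train.csv', 'task2_train.csv'],
                test_tasks_csvs=['task1_test.csv', 'task2_test.csv'],
                tasks_nickname='PCtasks',
                smiles_field='Smiles', y_field='Value', id_field='Compound_No',
                tempdir='/tmp/mtnn_traintest',
                num_epochs=40, batch_size=128, learning_rate=0.001,
                graph_conv_sizes=(128, 128), dense_size=256, gpu=0)
# The trained model, the pickled predictions and a DONE.txt marker end up under the
# directory returned by get_multitask_traintest_outdir('PCtasks').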
Example #6
def train_multitask_gc(train_task_csvs,
                       tasks_nickname,
                       smiles_field,
                       y_field,
                       id_field,
                       tempdir,
                       num_epochs,
                       batch_size=128,
                       learning_rate=0.001,
                       graph_conv_sizes=(128, 128),
                       dense_size=256,
                       gpu=None):
    """
    We assemble all the data we have on all tasks for a final training run.
    :param train_task_csvs: csv files of the training sets
    :param tasks_nickname: how to name the model (ex: 'PCtasks')
    :param smiles_field: in the csv, name of the column containing the smiles string of the cpds
    :param y_field: in the csv, name of the column containing the activity data
    :param id_field: in the csv, name of the column containing the molids
    :param tempdir: where to store the temporary files for the DiskDatasets (will be deleted later on)
    :param num_epochs: how many epochs to train for
    :param batch_size: number of molecules per minibatch
    :param learning_rate: learning rate
    :param graph_conv_sizes: tuple with output dimension for every GC layer
    :param dense_size: nb of neurons in the last dense layer
    :param gpu: GPU to use for training (if None, only CPU will be used)
    :return: None
    """
    ensure_dir(tempdir)

    # Get and merge the data
    tasks, training_dset = load_training_data(train_task_csvs,
                                              smiles_field=smiles_field,
                                              y_field=y_field,
                                              id_field=id_field,
                                              tempdir=op.join(
                                                  tempdir, 'train'),
                                              cv=False)

    # Take care of outdir
    outdir = get_multitask_outdir(tasks_nickname)
    ensure_dir(outdir)

    # Have we already run that experiment?
    if op.exists(op.join(outdir, 'DONE.txt')):
        print('Model already trained and validated.')

    else:
        print('Training the final multitask graph convolution model')

        # Transformation (z-scaling)
        zscaling_dir_train = op.join(tempdir, 'zscaling')
        ensure_dir(zscaling_dir_train)
        transfo_dir_to_delete = tempfile.mkdtemp(prefix=zscaling_dir_train +
                                                 '/')
        transformer = NormalizationTransformer(transform_y=True,
                                               dataset=training_dset)
        scaled_train = transformer.transform(training_dset,
                                             outdir=transfo_dir_to_delete)

        train_y, yhattrain, train_w = train_and_validate_mtnn(
            scaled_train,
            len(tasks),
            outdir,
            graph_conv_sizes=graph_conv_sizes,
            dense_size=dense_size,
            batch_size=batch_size,
            learning_rate=learning_rate,
            num_epochs=num_epochs,
            pickle_file_name=tasks_nickname + '.pkl',
            test=None,
            test_unscaled=None,
            transformer=transformer,
            fold=None,
            gpu=gpu)
        # compute metrics
        train_results = evaluate_multitask_gc(train_y, yhattrain, train_w)
        for k, vals in train_results.items():
            print(k)
            print(vals)

        # Remove temporary directory for transformer
        shutil.rmtree(transfo_dir_to_delete)

    # Get rid of the whole temporary directory structure
    shutil.rmtree(tempdir)

    print('Dataset folders removed!')
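
A hypothetical call sketch for the final full-data training run with train_multitask_gc (placeholder file names):

train_multitask_gc(train_task_csvs=['task1_full.csv', 'task2_full.csv'],
                   tasks_nickname='PCtasks',
                   smiles_field='Smiles', y_field='Value', id_field='Compound_No',
                   tempdir='/tmp/mtnn_final',
                   num_epochs=40, batch_size=128, learning_rate=0.001,
                   graph_conv_sizes=(128, 128), dense_size=256, gpu=0)
# The model, the y-scaler (transformer.pkl) and the molids are saved under
# get_multitask_outdir('PCtasks').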
Example #7
def cross_validate_mtnn(task_csvs,
                        tasks_nickname,
                        smiles_field,
                        split_field,
                        y_field,
                        id_field,
                        tempdir,
                        num_epochs,
                        batch_size=128,
                        learning_rate=0.001,
                        graph_conv_sizes=(128, 128),
                        dense_size=256,
                        gpu=None):
    """
    Cross-validates a multitask GCNN using the training sets in train_tasks_csvs. Saves the trained models and the
    predictions under folders named "fold_i". Prints performance metrics (R2 and Spearman rho) after every epoch.
    NB: each task in the model should have a corresponding training file. A column with the fold assignment must be
    provided for the cross-validation.
    :param task_csvs: list of csv files containing the training tasks
    :param tasks_nickname: how the model will be named
    :param smiles_field: in the csvs, name of the column containing the smiles string of the cpds
    :param split_field: in the csvs, name of the column containing the fold assignment for the cross-validation
    :param y_field: in the csvs, name of the column containing the activity data
    :param id_field: in the csv, name of the column containing the molids
    :param tempdir: where to store the temporary files for the DiskDatasets (will be deleted later on)
    :param num_epochs: how many epochs to train for
    :param batch_size: number of molecules per minibatch
    :param learning_rate: learning rate
    :param graph_conv_sizes: tuple with output dimension for every GC layer
    :param dense_size: nb of neurons in the last dense layer
    :param gpu: GPU to use for training (if None, only CPU will be used)
    :return: A pandas dataframe with performance metrics for every fold
    """

    ensure_dir(tempdir)
    tasks, folds, fold_dirs = load_training_data(task_csvs,
                                                 smiles_field=smiles_field,
                                                 split_field=split_field,
                                                 y_field=y_field,
                                                 id_field=id_field,
                                                 tempdir=tempdir,
                                                 cv=True)

    fold_results = defaultdict(list)

    for i, fold in enumerate(folds):

        # Take care of outdir
        outdir = get_multitask_cv_outdir(tasks_nickname, i)
        ensure_dir(outdir)

        # Have we already run that fold?
        if op.exists(op.join(outdir, 'DONE.txt')):
            print('Fold %i already computed.' % i)

        else:
            print('Running graph convolution model for fold %i' % i)
            val = fold
            disk_dir_to_delete = tempfile.mkdtemp(prefix=tempdir + '/')
            train = DiskDataset.merge(folds[0:i] + folds[i + 1:],
                                      merge_dir=disk_dir_to_delete)

            # Transformation (z-scaling)
            zscaling_dir_train = op.join(tempdir, 'zscaling', 'train')
            ensure_dir(zscaling_dir_train)
            zscaling_dir_test = op.join(tempdir, 'zscaling', 'test')
            ensure_dir(zscaling_dir_test)
            transfo_dir_to_delete_1 = tempfile.mkdtemp(
                prefix=zscaling_dir_train + '/')
            transfo_dir_to_delete_2 = tempfile.mkdtemp(
                prefix=zscaling_dir_test + '/')
            transformer = NormalizationTransformer(transform_y=True,
                                                   dataset=train)
            scaled_train = transformer.transform(
                train, outdir=transfo_dir_to_delete_1)
            scaled_val = transformer.transform(val,
                                               outdir=transfo_dir_to_delete_2)

            train_y, yhattrain, train_w, test_y, yhattest, test_w = \
                train_and_validate_mtnn(scaled_train, len(tasks), outdir=outdir, graph_conv_sizes=graph_conv_sizes,
                                        dense_size=dense_size, batch_size=batch_size, learning_rate=learning_rate,
                                        num_epochs=num_epochs, pickle_file_name=tasks_nickname + '_fold_%i.pkl' % i,
                                        test=scaled_val, test_unscaled=val, transformer=transformer, fold=i, gpu=gpu)

            # compute metrics
            train_results = evaluate_multitask_gc(train_y, yhattrain, train_w)
            test_results = evaluate_multitask_gc(test_y, yhattest, test_w)

            # Populate the results dictionary
            for j, t in enumerate(tasks):
                fold_results['fold'].append(i)
                fold_results['task'].append(t)
                fold_results['train'].append(True)
                fold_results['r2'].append(train_results[j][0])
                fold_results['mse'].append(train_results[j][1])
                fold_results['mae'].append(train_results[j][2])
                fold_results['varex'].append(train_results[j][3])
                fold_results['spearman'].append(train_results[j][4])
                fold_results['fold'].append(i)
                fold_results['task'].append(t)
                fold_results['train'].append(False)
                fold_results['r2'].append(test_results[j][0])
                fold_results['mse'].append(test_results[j][1])
                fold_results['mae'].append(test_results[j][2])
                fold_results['varex'].append(test_results[j][3])
                fold_results['spearman'].append(test_results[j][4])

            # Clean the tempdirs
            shutil.rmtree(disk_dir_to_delete)
            shutil.rmtree(transfo_dir_to_delete_1)
            shutil.rmtree(transfo_dir_to_delete_2)
            print('folder removed!')

    # Get rid of the foldirs
    for foldir in fold_dirs:
        shutil.rmtree(foldir)
    shutil.rmtree(tempdir)
    print('fold dataset folders removed!')

    return pd.DataFrame.from_dict(fold_results)
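
A hypothetical cross-validation sketch for cross_validate_mtnn; the csvs are assumed to carry a 'Fold' column with the fold assignment, and the returned dataframe holds one train row and one test row per (fold, task) pair:

cv_results = cross_validate_mtnn(task_csvs=['task1_train.csv', 'task2_train.csv'],
                                 tasks_nickname='PCtasks',
                                 smiles_field='Smiles', split_field='Fold',
                                 y_field='Value', id_field='Compound_No',
                                 tempdir='/tmp/mtnn_cv',
                                 num_epochs=40, batch_size=128, learning_rate=0.001,
                                 graph_conv_sizes=(128, 128), dense_size=256, gpu=0)
# Columns: fold, task, train, r2, mse, mae, varex, spearman
print(cv_results.groupby(['task', 'train'])['r2'].mean())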