Example #1
    def k_fold_split(self, dataset, k, directories=None, **kwargs):
        """
    Parameters
    ----------
    dataset: `dc.data.Dataset`
      Dataset to do a k-fold split
    k: int
      Number of folds to split `dataset` into.
    directories: list[str]
      list of length 2*k filepaths to save the result disk-datasets

    Returns
    -------
    list of length k tuples of (train, cv) where `train` and `cv` are both
    lists of `Dataset`s.
    """
        logger.info("Computing K-fold split")
        if directories is None:
            directories = [tempfile.mkdtemp() for _ in range(2 * k)]
        else:
            assert len(directories) == 2 * k
        cv_datasets = []
        train_ds_base = None
        train_datasets = []
        # rem_dataset is remaining portion of dataset
        if isinstance(dataset, DiskDataset):
            rem_dataset = dataset
        else:
            rem_dataset = DiskDataset.from_numpy(dataset.X, dataset.y,
                                                 dataset.w, dataset.ids)
        for fold in range(k):
            # Note starts as 1/k since fold starts at 0. Ends at 1 since fold goes up
            # to k-1.
            frac_fold = 1. / (k - fold)
            train_dir, cv_dir = directories[2 * fold], directories[2 * fold +
                                                                   1]
            fold_inds, rem_inds, _ = self.split(rem_dataset,
                                                frac_train=frac_fold,
                                                frac_valid=1 - frac_fold,
                                                frac_test=0,
                                                **kwargs)
            cv_dataset = rem_dataset.select(fold_inds, select_dir=cv_dir)
            cv_datasets.append(cv_dataset)
            rem_dataset = rem_dataset.select(rem_inds)

            train_ds_to_merge = filter(lambda x: x is not None,
                                       [train_ds_base, rem_dataset])
            train_ds_to_merge = filter(lambda x: len(x) > 0, train_ds_to_merge)
            train_dataset = DiskDataset.merge(train_ds_to_merge,
                                              merge_dir=train_dir)
            train_datasets.append(train_dataset)

            update_train_base_merge = filter(lambda x: x is not None,
                                             [train_ds_base, cv_dataset])
            train_ds_base = DiskDataset.merge(update_train_base_merge)
        return list(zip(train_datasets, cv_datasets))
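A quick usage sketch (not from the original source; it assumes a standard DeepChem install): any concrete Splitter subclass exposes this k_fold_split, so a RandomSplitter on a small in-memory dataset is enough to see the (train, cv) tuples it returns.

# Illustrative only: 5-fold split with a RandomSplitter on random data.
import numpy as np
import deepchem as dc

X = np.random.rand(20, 4)
y = np.random.rand(20, 1)
dataset = dc.data.NumpyDataset(X, y)

splitter = dc.splits.RandomSplitter()
folds = splitter.k_fold_split(dataset, k=5)  # list of (train, cv) tuples
for i, (train, cv) in enumerate(folds):
    print('fold %d: %d train / %d cv samples' % (i, len(train), len(cv)))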
def load_gpcr(dataset_file,
              featurizer='ECFP',
              transformers=True,
              reload=True,
              sep='OnePositiveSplit',
              K=5):
    #    data_dir=os.path.dirname(dataset_file)

    save_dir = os.path.join(
        os.path.dirname(dataset_file),
        '.'.join(os.path.basename(dataset_file).split('.')[:-1]), "ecfp",
        "split")
    train, valid, test = os.path.join(save_dir, 'train'), os.path.join(
        save_dir, 'valid'), os.path.join(save_dir, 'test')
    # Read the task names from the CSV header (every column except SMILES).
    with open(dataset_file, "r") as fopen:
        header_line = fopen.readline()
    m = header_line.strip('\n').split(',')
    m.remove('SMILES')
    if os.path.isdir(save_dir):
        if reload:
            dataset, train_dataset, valid_dataset, test_dataset = DiskDataset(
                data_dir=save_dir), DiskDataset(data_dir=train), DiskDataset(
                    data_dir=valid), DiskDataset(data_dir=test)
            transformers = [
                deepchem.trans.NormalizationTransformer(transform_w=True,
                                                        dataset=train_dataset)
            ]
            all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
            return m, all_dataset, transformers
    if featurizer == 'ECFP':
        featurizer = deepchem.feat.CircularFingerprint(size=1024)
    elif featurizer == 'GraphConv':
        featurizer = deepchem.feat.ConvMolFeaturizer()
    elif featurizer == 'Weave':
        featurizer = deepchem.feat.WeaveFeaturizer()
    elif featurizer == 'Raw':
        featurizer = deepchem.feat.RawFeaturizer()
    elif featurizer == 'AdjacencyConv':
        featurizer = deepchem.feat.AdjacencyFingerprint(max_n_atoms=150,
                                                        max_valence=6)
    elif featurizer == 'SelfDefine':
        # NOTE: `feature_field` is not defined in this snippet; it must be
        # supplied elsewhere (e.g. a list of user-defined feature columns).
        featurizer = deepchem.feat.UserDefinedFeaturizer(feature_field)
    loader = deepchem.data.CSVLoader(tasks=m,
                                     smiles_field="SMILES",
                                     featurizer=featurizer)
    dataset = loader.featurize(dataset_file,
                               data_dir=save_dir,
                               shard_size=8192)
    #    dataset = loader.featurize(dataset_file, shard_size=8192)
    # Initialize transformers
    if transformers:
        transformers = [
            deepchem.trans.NormalizationTransformer(transform_w=True,
                                                    dataset=dataset)
        ]
        for transformer in transformers:
            dataset = transformer.transform(dataset)
    splitters = {
        'index': deepchem.splits.IndexSplitter(),
        'random': deepchem.splits.RandomSplitter(),
        'random_stratified': deepchem.splits.RandomStratifiedSplitter(),
        'scaffold': deepchem.splits.ScaffoldSplitter(),
        'butina': deepchem.splits.ButinaSplitter(),
        'task': deepchem.splits.TaskSplitter(),
        'Harmonious_positive': Harmonious_positive(),
        'OnePositiveSplit': OnePositiveSplit()
    }
    splitter = splitters[sep]
    if sep == 'task':
        fold_datasets = splitter.k_fold_split(dataset, K)
        all_dataset = fold_datasets
    elif sep == 'Harmonious_positive':
        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
            dataset)
        train_dataset = DiskDataset.from_numpy(train_dataset.X,
                                               train_dataset.y,
                                               train_dataset.w,
                                               train_dataset.ids,
                                               dataset.tasks,
                                               data_dir=train)
        valid_dataset = DiskDataset.from_numpy(valid_dataset.X,
                                               valid_dataset.y,
                                               valid_dataset.w,
                                               valid_dataset.ids,
                                               dataset.tasks,
                                               data_dir=valid)
        test_dataset = DiskDataset.from_numpy(test_dataset.X,
                                              test_dataset.y,
                                              test_dataset.w,
                                              test_dataset.ids,
                                              dataset.tasks,
                                              data_dir=test)
        all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)
    # NOTE: this branch is unreachable as written, because the plain
    # `sep == 'Harmonious_positive'` branch above always matches first; it is
    # kept to show the intended K-fold variant of that split.
    elif sep == 'Harmonious_positive' and K:
        #        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
        #                                dataset,
        #                                frac_train=frac_train,
        #                                frac_valid=0,
        #                                frac_test=1- frac_train,
        #                                )
        #        train_dataset = DiskDataset.from_numpy(train_dataset.X,train_dataset.y,train_dataset.w,train_dataset.ids,
        #                                               dataset.tasks,data_dir=train)
        #        train_dataset.reshard(8192)
        #        test_dataset  = DiskDataset.from_numpy(test_dataset.X,test_dataset.y,test_dataset.w,test_dataset.ids,
        #                                               dataset.tasks,data_dir=test)
        #        test_dataset.reshard(8192)
        #        fold_dataset = splitter.k_fold_split(
        #                train_dataset, K, directories=[os.path.join(valid,str(i)) for i in range(K)],verbose=True)
        # The custom splitter's k_fold_split is expected to return one dataset
        # per fold (the merging below indexes fold_dataset[j] directly), unlike
        # the base Splitter, which returns (train, cv) tuples and expects 2*K
        # directories.
        fold_dataset = splitter.k_fold_split(
            dataset,
            K,
            directories=[os.path.join(valid, str(i)) for i in range(K)],
            verbose=True)
        folds = []
        for i in range(K):
            print('merge fold dataset {}...'.format(i))
            train_fold = DiskDataset.merge(
                [fold_dataset[j] for j in range(K) if j != i],
                merge_dir=os.path.join(valid, str(i), 'train_fold'))
            test_fold = DiskDataset.merge([fold_dataset[i]],
                                          merge_dir=os.path.join(
                                              valid, str(i), 'valid_fold'))
            folds.append([train_fold, test_fold])
        all_dataset = (dataset, [], folds, [])
    else:
        # NOTE: frac_train/frac_valid/frac_test are not defined in this
        # snippet; they are presumably module-level constants (or should be
        # passed in as parameters).
        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
            dataset,
            train_dir=train,
            valid_dir=valid,
            test_dir=test,
            frac_train=frac_train,
            frac_valid=frac_valid,
            frac_test=frac_test)
        all_dataset = (dataset, train_dataset, valid_dataset, test_dataset)

#    else:
#        train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset,train_dir=train, valid_dir=valid, test_dir=test)
#        all_dataset = (dataset,train_dataset, valid_dataset, test_dataset)
#    if reload:
#        deepchem.utils.save.save_dataset_to_disk(save_dir, train, valid, test,transformers)
    return m, all_dataset, transformers
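A hedged call sketch follows (the CSV path, its column layout, and the custom Harmonious_positive splitter are assumptions carried over from the snippet above, not verified details):

# Illustrative only: featurize a GPCR bioactivity CSV (a SMILES column plus one
# column per task) and split it with the custom Harmonious_positive splitter.
tasks, (dataset, train, valid, test), transformers = load_gpcr(
    'data/gpcr_bioactivity.csv',
    featurizer='ECFP',
    sep='Harmonious_positive')
print('%d tasks, %d training compounds' % (len(tasks), len(train)))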
Example #3
  def k_fold_split(self, dataset, k, directories=None, **kwargs):
    """
    Parameters
    ----------
    dataset: Dataset
    Dataset to do a k-fold split

    k: int
    number of folds

    directories: list of str
    list of length 2*k filepaths to save the result disk-datasets

    kwargs

    Returns
    -------
    list of length k tuples of (train, cv)

    """
    """
    :param dataset:
    :param k:
    :param directories:
    :param kwargs:
    :return: list of length k tuples of (train, cv)
    """
    log("Computing K-fold split", self.verbose)
    if directories is None:
      directories = [tempfile.mkdtemp() for _ in range(2 * k)]
    else:
      assert len(directories) == 2 * k
    cv_datasets = []
    train_ds_base = None
    train_datasets = []
    # rem_dataset is remaining portion of dataset
    if isinstance(dataset, DiskDataset):
      rem_dataset = dataset
    else:
      rem_dataset = DiskDataset.from_numpy(dataset.X, dataset.y, dataset.w,
                                           dataset.ids)
    for fold in range(k):
      # Note starts as 1/k since fold starts at 0. Ends at 1 since fold goes up
      # to k-1.
      frac_fold = 1. / (k - fold)
      train_dir, cv_dir = directories[2 * fold], directories[2 * fold + 1]
      fold_inds, rem_inds, _ = self.split(
          rem_dataset,
          frac_train=frac_fold,
          frac_valid=1 - frac_fold,
          frac_test=0,
          **kwargs)
      cv_dataset = rem_dataset.select(fold_inds, select_dir=cv_dir)
      cv_datasets.append(cv_dataset)
      rem_dataset = rem_dataset.select(rem_inds)

      train_ds_to_merge = filter(lambda x: x is not None,
                                 [train_ds_base, rem_dataset])
      train_ds_to_merge = filter(lambda x: len(x) > 0, train_ds_to_merge)
      train_dataset = DiskDataset.merge(train_ds_to_merge, merge_dir=train_dir)
      train_datasets.append(train_dataset)

      update_train_base_merge = filter(lambda x: x is not None,
                                       [train_ds_base, cv_dataset])
      train_ds_base = DiskDataset.merge(update_train_base_merge)
    return list(zip(train_datasets, cv_datasets))
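The in-line comment about frac_fold deserves a worked example: taking 1/(k - fold) of the shrinking remainder carves out equal-sized folds. A small standalone illustration (not part of the original code):

# Why frac_fold = 1/(k - fold) yields equal folds: for k = 5 and 100 samples,
# each pass removes 20 samples from whatever is left.
remaining = 100.0
k = 5
for fold in range(k):
    frac_fold = 1.0 / (k - fold)       # 1/5, 1/4, 1/3, 1/2, 1
    fold_size = remaining * frac_fold  # 20 samples every time
    remaining -= fold_size
    print('fold %d: frac=%.2f size=%.0f remaining=%.0f'
          % (fold, frac_fold, fold_size, remaining))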
Example #4
def train_test_mtnn(train_task_csvs,
                    test_tasks_csvs,
                    tasks_nickname,
                    smiles_field,
                    y_field,
                    id_field,
                    tempdir,
                    num_epochs=40,
                    batch_size=128,
                    learning_rate=0.001,
                    graph_conv_sizes=(128, 128),
                    dense_size=256,
                    gpu=None):
    """
    Trains a multitask GCNN using the training sets in train_tasks_csvs and validates it using the test sets in
    test_tasks_csvs. Saves the trained model and the predictions under a folder named "train_test". Prints performance
    metrics (R2 and Spearman rho) after every epoch.
    NB: each task in the model should have a corresponding training and test files, named similarly (ex:task1_train.csv,
    task1_test.csv).
    :param train_task_csvs: list of csv files containing the training tasks
    :param test_tasks_csvs: list of csv files containing the test tasks
    :param tasks_nickname: how the model will be named
    :param smiles_field: in the csvs, name of the column containing the smiles string of the cpds
    :param y_field: in the csv, name of the column containing the activity data
    :param id_field: in the csv, name of the column containing the molids
    :param tempdir: where to store the temporary files for the DiskDatasets (will be deleted later on)
    :param num_epochs: how many epochs to train for
    :param batch_size: number of molecules per minibatch
    :param learning_rate: learning rate
    :param graph_conv_sizes: tuple with output dimension for every GC layer
    :param dense_size: nb of neurons in the last dense layer
    :param gpu: GPU to use for training (if None, only CPU will be used)
    :return: None
    """

    ensure_dir(tempdir)
    tasks, training_dset = load_training_data(train_task_csvs,
                                              smiles_field=smiles_field,
                                              y_field=y_field,
                                              id_field=id_field,
                                              tempdir=op.join(
                                                  tempdir, 'train'),
                                              cv=False)
    tasks, test_dset = load_training_data(test_tasks_csvs,
                                          smiles_field=smiles_field,
                                          y_field=y_field,
                                          id_field=id_field,
                                          tempdir=op.join(tempdir, 'test'),
                                          cv=False)

    # Take care of outdir
    outdir = get_multitask_traintest_outdir(tasks_nickname)
    ensure_dir(outdir)

    # Have we already run that experiment?
    if op.exists(op.join(outdir, 'DONE.txt')):
        print('Model already trained and validated.')

    else:
        print('Training and validating multitask graph convolution model')

        # Merge to reduce the number of shards (needed to avoid non-random minibatches caused by many small shards)
        disk_dir_to_delete = tempfile.mkdtemp(prefix=tempdir + '/')
        training_dset = DiskDataset.merge([training_dset],
                                          merge_dir=disk_dir_to_delete)

        # Transformation (z-scaling)
        zscaling_dir_train = op.join(tempdir, 'zscaling', 'train')
        ensure_dir(zscaling_dir_train)
        zscaling_dir_test = op.join(tempdir, 'zscaling', 'test')
        ensure_dir(zscaling_dir_test)
        transfo_dir_to_delete_1 = tempfile.mkdtemp(prefix=zscaling_dir_train +
                                                   '/')
        transfo_dir_to_delete_2 = tempfile.mkdtemp(prefix=zscaling_dir_test +
                                                   '/')
        transformer = NormalizationTransformer(transform_y=True,
                                               dataset=training_dset)
        scaled_train = transformer.transform(training_dset,
                                             outdir=transfo_dir_to_delete_1)
        scaled_val = transformer.transform(test_dset,
                                           outdir=transfo_dir_to_delete_2)

        # Train the model
        scaled_train_y, yhattrain, scaled_train_w, scaled_test_y, yhattest, scaled_test_w = \
            train_and_validate_mtnn(scaled_train, n_tasks=len(tasks), outdir=outdir, graph_conv_sizes=graph_conv_sizes,
                                    dense_size=dense_size, batch_size=batch_size, learning_rate=learning_rate,
                                    num_epochs=num_epochs, pickle_file_name=tasks_nickname + '.pkl', test=scaled_val,
                                    transformer=transformer, test_unscaled=test_dset, gpu=gpu)

        # compute metrics
        scaled_results_test = evaluate_multitask_gc(scaled_test_y, yhattest,
                                                    scaled_test_w)
        for k, vals in scaled_results_test.items():
            print(k)
            print(vals)

        # let's reverse the transformation from the predictions
        yhattest_untransf = undo_transforms(yhattest, [transformer])
        unscaled_results_test = evaluate_multitask_gc(test_dset.y,
                                                      yhattest_untransf,
                                                      test_dset.w)
        for k, vals in unscaled_results_test.items():
            print(k)
            print(vals)
        # hopefully the results are very similar

        # Remove transfo dir
        shutil.rmtree(transfo_dir_to_delete_1)
        shutil.rmtree(transfo_dir_to_delete_2)

    # Get rid of the temporary directory structure
    shutil.rmtree(tempdir)

    print('Dataset folders removed!')
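A hedged call sketch (the file names and column names below are placeholders, not taken from the original project):

# Illustrative only: two tasks, each with a *_train.csv / *_test.csv pair.
train_test_mtnn(['task1_train.csv', 'task2_train.csv'],
                ['task1_test.csv', 'task2_test.csv'],
                tasks_nickname='mtnn_demo',
                smiles_field='SMILES',
                y_field='activity',
                id_field='molid',
                tempdir='/tmp/mtnn_traintest',
                num_epochs=40,
                gpu=None)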
Example #5
def cross_validate_mtnn(task_csvs,
                        tasks_nickname,
                        smiles_field,
                        split_field,
                        y_field,
                        id_field,
                        tempdir,
                        num_epochs,
                        batch_size=128,
                        learning_rate=0.001,
                        graph_conv_sizes=(128, 128),
                        dense_size=256,
                        gpu=None):
    """
    Cross-validates a multitask GCNN using the training sets in train_tasks_csvs. Saves the trained models and the
    predictions under folders named "fold_i". Prints performance metrics (R2 and Spearman rho) after every epoch.
    NB: each task in the model should have a corresponding training file. A columns with fold assignment should be
    provided for the cross-validation.
    :param task_csvs: list of csv files containing the training tasks
    :param tasks_nickname: how the model will be named
    :param smiles_field: in the csvs, name of the column containing the smiles string of the cpds
    :param split_field: in the csvs, name of the column containing the fold assignment for the cross-validation
    :param y_field: in the csvs, name of the column containing the activity data
    :param id_field: in the csv, name of the column containing the molids
    :param tempdir: where to store the temporary files for the DiskDatasets (will be deleted later on)
    :param num_epochs: how many epochs to train for
    :param batch_size: number of molecules per minibatch
    :param learning_rate: learning rate
    :param graph_conv_sizes: tuple with output dimension for every GC layer
    :param dense_size: nb of neurons in the last dense layer
    :param gpu: GPU to use for training (if None, only CPU will be used)
    :return: A pandas dataframe with performance metrics for every fold
    """

    ensure_dir(tempdir)
    tasks, folds, fold_dirs = load_training_data(task_csvs,
                                                 smiles_field=smiles_field,
                                                 split_field=split_field,
                                                 y_field=y_field,
                                                 id_field=id_field,
                                                 tempdir=tempdir,
                                                 cv=True)

    fold_results = defaultdict(list)

    for i, fold in enumerate(folds):

        # Take care of outdir
        outdir = get_multitask_cv_outdir(tasks_nickname, i)
        ensure_dir(outdir)

        # Have we already run that fold?
        if op.exists(op.join(outdir, 'DONE.txt')):
            print('Fold %i already computed.' % i)

        else:
            print('Running graph convolution model for fold %i' % i)
            val = fold
            disk_dir_to_delete = tempfile.mkdtemp(prefix=tempdir + '/')
            train = DiskDataset.merge(folds[0:i] + folds[i + 1:],
                                      merge_dir=disk_dir_to_delete)

            # Transformation (z-scaling)
            zscaling_dir_train = op.join(tempdir, 'zscaling', 'train')
            ensure_dir(zscaling_dir_train)
            zscaling_dir_test = op.join(tempdir, 'zscaling', 'test')
            ensure_dir(zscaling_dir_test)
            transfo_dir_to_delete_1 = tempfile.mkdtemp(
                prefix=zscaling_dir_train + '/')
            transfo_dir_to_delete_2 = tempfile.mkdtemp(
                prefix=zscaling_dir_test + '/')
            transformer = NormalizationTransformer(transform_y=True,
                                                   dataset=train)
            scaled_train = transformer.transform(
                train, outdir=transfo_dir_to_delete_1)
            scaled_val = transformer.transform(val,
                                               outdir=transfo_dir_to_delete_2)

            train_y, yhattrain, train_w, test_y, yhattest, test_w = \
                train_and_validate_mtnn(scaled_train, len(tasks), outdir=outdir, graph_conv_sizes=graph_conv_sizes,
                                        dense_size=dense_size, batch_size=batch_size, learning_rate=learning_rate,
                                        num_epochs=num_epochs, pickle_file_name=tasks_nickname + '_fold_%i.pkl' % i,
                                        test=scaled_val, test_unscaled=val, transformer=transformer, fold=i, gpu=gpu)

            # compute metrics
            train_results = evaluate_multitask_gc(train_y, yhattrain, train_w)
            test_results = evaluate_multitask_gc(test_y, yhattest, test_w)

            # Populate the results dictionary
            for j, t in enumerate(tasks):
                fold_results['fold'].append(i)
                fold_results['task'].append(t)
                fold_results['train'].append(True)
                fold_results['r2'].append(train_results[j][0])
                fold_results['mse'].append(train_results[j][1])
                fold_results['mae'].append(train_results[j][2])
                fold_results['varex'].append(train_results[j][3])
                fold_results['spearman'].append(train_results[j][4])
                fold_results['fold'].append(i)
                fold_results['task'].append(t)
                fold_results['train'].append(False)
                fold_results['r2'].append(test_results[j][0])
                fold_results['mse'].append(test_results[j][1])
                fold_results['mae'].append(test_results[j][2])
                fold_results['varex'].append(test_results[j][3])
                fold_results['spearman'].append(test_results[j][4])

            # Clean the tempdirs
            shutil.rmtree(disk_dir_to_delete)
            shutil.rmtree(transfo_dir_to_delete_1)
            shutil.rmtree(transfo_dir_to_delete_2)
            print('folder removed!')

    # Get rid of the foldirs
    for foldir in fold_dirs:
        shutil.rmtree(foldir)
    shutil.rmtree(tempdir)
    print('fold dataset folders removed!')

    return pd.DataFrame.from_dict(fold_results)
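A hedged call sketch (file and column names are placeholders; the 'fold' column is assumed to hold integer fold assignments):

# Illustrative only: cross-validate two tasks whose CSVs carry a fold column.
results_df = cross_validate_mtnn(['task1.csv', 'task2.csv'],
                                 tasks_nickname='mtnn_demo_cv',
                                 smiles_field='SMILES',
                                 split_field='fold',
                                 y_field='activity',
                                 id_field='molid',
                                 tempdir='/tmp/mtnn_cv',
                                 num_epochs=40)
print(results_df.groupby(['task', 'train'])['r2'].mean())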
Example #6
    def k_fold_split(self, dataset, k, directories=None, **kwargs):
        """
    Parameters
    ----------
    dataset: Dataset
    Dataset to do a k-fold split

    k: int
    number of folds

    directories: list of str
    list of length 2*k filepaths to save the result disk-datasets

    kwargs

    Returns
    -------
    list of length k tuples of (train, cv)

    """
        """
    :param dataset: 
    :param k: 
    :param directories: 
    :param kwargs:
    :return: list of length k tuples of (train, cv)
    """
        log("Computing K-fold split", self.verbose)
        if directories is None:
            directories = [tempfile.mkdtemp() for _ in range(2 * k)]
        else:
            assert len(directories) == 2 * k
        cv_datasets = []
        train_ds_base = None
        train_datasets = []
        # rem_dataset is remaining portion of dataset
        rem_dataset = dataset
        for fold in range(k):
            # Note starts as 1/k since fold starts at 0. Ends at 1 since fold goes up
            # to k-1.
            frac_fold = 1. / (k - fold)
            train_dir, cv_dir = directories[2 * fold], directories[2 * fold +
                                                                   1]
            fold_inds, rem_inds, _ = self.split(rem_dataset,
                                                frac_train=frac_fold,
                                                frac_valid=1 - frac_fold,
                                                frac_test=0)
            cv_dataset = rem_dataset.select(fold_inds)
            cv_datasets.append(cv_dataset)
            rem_dataset = rem_dataset.select(rem_inds)

            train_ds_to_merge = filter(lambda x: x is not None,
                                       [train_ds_base, rem_dataset])
            train_ds_to_merge = filter(lambda x: len(x) > 0, train_ds_to_merge)
            train_dataset = DiskDataset.merge(train_ds_to_merge,
                                              merge_dir=train_dir)
            train_datasets.append(train_dataset)

            update_train_base_merge = filter(lambda x: x is not None,
                                             [train_ds_base, cv_dataset])
            train_ds_base = DiskDataset.merge(update_train_base_merge,
                                              merge_dir=cv_dir)
        return list(zip(train_datasets, cv_datasets))
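A hedged sketch of the directories argument (the paths below are illustrative; splitter and dataset come from a standard DeepChem install): the base implementation expects 2*k paths, alternating train_dir and cv_dir per fold, so each fold's DiskDataset lands in a predictable location instead of a scattered temp directory.

# Illustrative only: explicit train/cv directories for a 3-fold split.
import os
import numpy as np
import deepchem as dc

k = 3
base_dir = '/tmp/kfold_splits'
directories = []
for fold in range(k):
    directories.append(os.path.join(base_dir, 'fold_%d_train' % fold))
    directories.append(os.path.join(base_dir, 'fold_%d_cv' % fold))
for d in directories:
    os.makedirs(d, exist_ok=True)

dataset = dc.data.NumpyDataset(np.random.rand(12, 4), np.random.rand(12, 1))
splitter = dc.splits.RandomSplitter()
folds = splitter.k_fold_split(dataset, k, directories=directories)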