Example #1
    def __init__(self,
                 token_dict_path: str,
                 tokenized_data_path: str,
                 dataset_usage: str,
                 rand_state: int = 0,
                 tokenize_on: str = 'atom',
                 max_seq_length: int = 256):

        assert dataset_usage in ['training', 'validation', 'test']
        self.__rand_state = rand_state

        tasks, (trn, val, tst), transformers = \
            load_pcba(featurizer='Raw', split='random', reload=True)

        # (A scan over all SMILES lengths in trn/val/tst was used here
        # to pick a suitable max_seq_length)

        token_dict = get_smiles_token_dict(
            dict_path=token_dict_path,
            smiles_strings=(list(trn.ids) + list(val.ids) + list(tst.ids)),
            tokenize_on=tokenize_on)

        if dataset_usage == 'training':
            smiles = trn.ids
            targets = trn.y
        elif dataset_usage == 'validation':
            smiles = val.ids
            targets = val.y
        else:
            smiles = tst.ids
            targets = tst.y

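        # Collapse the per-task binary labels into a single class index;
        # np.argmax picks the first task holding the maximum label value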
        targets = np.argmax(targets, axis=1)

        self.__smiles, tokenized_smiles, targets = tokenize_smiles(
            data_path=tokenized_data_path,
            token_dict=token_dict,
            smiles_strings=smiles,
            targets=targets,
            max_seq_length=max_seq_length)

        # Create a mask for padding characters in each SMILES string
        self.token_dict = token_dict
        self.__padding_mask = (
            np.array(tokenized_smiles) != self.token_dict['<PAD>']
        ).astype(np.int64)

        # Convert the data and target type to int64 to work with PyTorch
        self.__data = np.array(tokenized_smiles).astype(np.int64)
        self.__targets = np.array(targets).astype(np.int64)

        self.__len = len(self.__data)
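
A minimal, self-contained sketch of the padding-mask step above (the token dictionary and token IDs are toy values for illustration only):

import numpy as np

# Toy vocabulary; only the '<PAD>' entry matters for the mask
token_dict = {'<PAD>': 0, 'C': 1, 'O': 2, '=': 3}
tokenized_smiles = np.array([[1, 2, 1, 0, 0],
                             [1, 3, 2, 1, 0]])

# 1 for real tokens, 0 for padding positions
padding_mask = (tokenized_smiles != token_dict['<PAD>']).astype(np.int64)
# padding_mask:
# [[1 1 1 0 0]
#  [1 1 1 1 0]]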
Example #2
"""
    Author:             Xiaotian Duan (xduan7)
    Email:              [email protected]
    Date:               5/5/19
    Python Version:     3.5.4
    File Description:   

"""
import numpy as np
from deepchem.molnet import load_pcba
from sklearn.decomposition import PCA

from featurizers import smiles_to_mols, mols_to_sim_mat, mols_to_ssm_mat, \
    FP_FUNC_DICT, SIM_FUNC_DICT

pcba_tasks, pcba_datasets, transformers = load_pcba(featurizer='Raw',
                                                    split='scaffold',
                                                    reload=True)
(train_dataset, valid_dataset, test_dataset) = pcba_datasets

trn_smiles = train_dataset.ids
trn_target = train_dataset.y
print('Data Loaded.')

# len(trn_smiles) = 350,000,
# which yields about 10 TB of features if each distance has 20 channels
# and is stored in float32, likely requiring incremental PCA

# However, a batch size of 32 generates only 0.896 GB of features per
# iPCA step. We still need a smaller dataset, and to eliminate some of
# the less important fingerprint distances.
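
A rough sketch of the incremental-PCA route discussed in the comments above. IncrementalPCA comes from scikit-learn; featurize_batch is a hypothetical stand-in (random features) for the fingerprint-distance featurization, there only to make the loop runnable:

from sklearn.decomposition import IncrementalPCA

def featurize_batch(smiles_batch, n_features=2048):
    # Placeholder featurizer: random float32 features per molecule
    return np.random.rand(len(smiles_batch), n_features).astype(np.float32)

ipca = IncrementalPCA(n_components=16)
batch_size = 32
for start in range(0, len(trn_smiles), batch_size):
    batch = trn_smiles[start:start + batch_size]
    # partial_fit needs at least n_components samples per batch
    if len(batch) >= ipca.n_components:
        ipca.partial_fit(featurize_batch(batch))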
Example #3
import os
import numpy as np
import shutil
from deepchem.molnet import load_pcba
from deepchem.utils.save import load_from_disk
from deepchem.data import Dataset
from deepchem import metrics
from deepchem.metrics import Metric
from deepchem.metrics import to_one_hot
from deepchem.utils.evaluate import Evaluator
from deepchem.models import MultiTaskClassifier
from deepchem.models.tensorgraph.optimizers import ExponentialDecay

np.random.seed(123)

pcba_tasks, pcba_datasets, transformers = load_pcba()
(train_dataset, valid_dataset, test_dataset) = pcba_datasets

metric = Metric(metrics.roc_auc_score, np.mean, mode="classification")

n_features = train_dataset.get_data_shape()[0]
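# Learning rate schedule: start at 0.001, decay by a factor of 0.8
# every 1000 steps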
rate = ExponentialDecay(0.001, 0.8, 1000)
model = MultiTaskClassifier(
    len(pcba_tasks),
    n_features,
    dropouts=[.25],
    learning_rate=rate,
    weight_init_stddevs=[.1],
    batch_size=64)

# Fit trained model
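# The snippet is truncated here; a typical continuation, assuming the
# standard DeepChem fit/evaluate API (the epoch count is arbitrary):
model.fit(train_dataset, nb_epoch=10)

train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
print("Train scores:", train_scores)
print("Validation scores:", valid_scores)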
Example #4
import os
import shutil

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from deepchem import metrics
from deepchem.metrics import Metric
from deepchem.molnet import load_pcba
from deepchem.models.multitask import SingletaskToMultitask
from deepchem.models.sklearn_models import SklearnModel
from deepchem.utils.evaluate import Evaluator

np.random.seed(123)

# Set some global variables up top
reload = True
is_verbose = False

base_dir = "/tmp/pcba_sklearn"
model_dir = os.path.join(base_dir, "model")
if os.path.exists(base_dir):
  shutil.rmtree(base_dir)
os.makedirs(base_dir)

pcba_tasks, pcba_datasets, transformers = load_pcba()
(train_dataset, valid_dataset, test_dataset) = pcba_datasets

classification_metric = Metric(
    metrics.roc_auc_score, np.mean, verbose=is_verbose, mode="classification")


def model_builder(model_dir):
  sklearn_model = RandomForestClassifier(
      class_weight="balanced", n_estimators=500)
  return SklearnModel(sklearn_model, model_dir)


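# SingletaskToMultitask fits one independent random forest per PCBA task
# using the model_builder above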
model = SingletaskToMultitask(pcba_tasks, model_builder, model_dir)

# Fit trained model
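# Truncated here as well; a typical continuation, assuming the standard
# DeepChem API:
model.fit(train_dataset)

evaluator = Evaluator(model, valid_dataset, transformers)
valid_scores = evaluator.compute_model_performance([classification_metric])
print("Validation scores:", valid_scores)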