def __init__(self,
             token_dict_path: str,
             tokenized_data_path: str,
             dataset_usage: str,
             rand_state: int = 0,
             tokenize_on: str = 'atom',
             max_seq_length: int = 256):

    assert dataset_usage in ['training', 'validation', 'test']
    self.__rand_state = rand_state

    # Load the raw PCBA dataset (SMILES strings and multi-task labels)
    tasks, (trn, val, tst), transformers = \
        load_pcba(featurizer='Raw', split='random', reload=True)

    # seq_length = 0
    # for i in (list(trn.ids) + list(val.ids) + list(tst.ids)):
    #     if len(i) > seq_length:
    #         seq_length = len(i)
    #
    # print(seq_length)

    # Build (or load) the SMILES token dictionary over all three splits
    token_dict = get_smiles_token_dict(
        dict_path=token_dict_path,
        smiles_strings=(list(trn.ids) + list(val.ids) + list(tst.ids)),
        tokenize_on=tokenize_on)

    if dataset_usage == 'training':
        smiles = trn.ids
        targets = trn.y
    elif dataset_usage == 'validation':
        smiles = val.ids
        targets = val.y
    else:
        smiles = tst.ids
        targets = tst.y
    targets = np.argmax(targets, axis=1)

    # Tokenize (or load the cached tokenization of) the selected split
    self.__smiles, tokenized_smiles, targets = tokenize_smiles(
        data_path=tokenized_data_path,
        token_dict=token_dict,
        smiles_strings=smiles,
        targets=targets,
        max_seq_length=max_seq_length)

    # Create a mask for padding characters in each SMILES string
    self.token_dict = token_dict
    self.__padding_mask = np.array(
        np.array(tokenized_smiles) != self.token_dict['<PAD>']).astype(np.int64)

    # Convert the data and target type to int64 to work with PyTorch
    self.__data = np.array(tokenized_smiles).astype(np.int64)
    self.__targets = np.array(targets).astype(np.int64)
    self.__len = len(self.__data)
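
# A minimal sketch (assumption: this __init__ belongs to a PyTorch-style
# Dataset class) of the accessor methods that would typically accompany it;
# the attribute names mirror the ones set in __init__ above.
def __len__(self):
    return self.__len

def __getitem__(self, index):
    # One sample: (tokenized SMILES, padding mask, target label)
    return self.__data[index], \
        self.__padding_mask[index], \
        self.__targets[index]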
Author:             Xiaotian Duan (xduan7)
Email:              [email protected]
Date:               5/5/19
Python Version:     3.5.4
File Description:

"""
import numpy as np
from deepchem.molnet import load_pcba
from sklearn.decomposition import PCA

from featurizers import smiles_to_mols, mols_to_sim_mat, mols_to_ssm_mat, \
    FP_FUNC_DICT, SIM_FUNC_DICT


pcba_tasks, pcba_datasets, transformers = \
    load_pcba(featurizer='Raw', split='scaffold', reload=True)
(train_dataset, valid_dataset, test_dataset) = pcba_datasets

trn_smiles = train_dataset.ids
trn_target = train_dataset.y
print('Data Loaded.')

# len(trn_smiles) = 350,000, which yields about 10 TB of features if each
# distance has 20 channels and is stored in float32; this would call for
# incremental PCA. A batch size of 32 still generates 0.896 GB of features
# per iPCA step, so we need to use a smaller dataset and drop some of the
# less important fingerprint distances.
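
# A minimal, self-contained sketch of the incremental-PCA idea noted above.
# The random matrix below stands in for one batch of fingerprint-distance
# features (which would really come from smiles_to_mols / mols_to_sim_mat);
# the batch size and n_components here are illustrative assumptions.
from sklearn.decomposition import IncrementalPCA

n_samples_demo, n_features_demo = 1024, 2048
batch_size, n_components = 64, 32

ipca = IncrementalPCA(n_components=n_components)
for start in range(0, n_samples_demo, batch_size):
    # Stand-in for featurizing one batch of SMILES strings
    batch_feat = np.random.rand(batch_size, n_features_demo).astype(np.float32)
    # Each partial_fit call only needs one batch in memory, so the full
    # multi-terabyte feature matrix is never materialized.
    ipca.partial_fit(batch_feat)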
import os
import shutil

import numpy as np
from deepchem.molnet import load_pcba
from deepchem.utils.save import load_from_disk
from deepchem.data import Dataset
from deepchem import metrics
from deepchem.metrics import Metric
from deepchem.metrics import to_one_hot
from deepchem.utils.evaluate import Evaluator
from deepchem.models import MultiTaskClassifier
from deepchem.models.tensorgraph.optimizers import ExponentialDecay

np.random.seed(123)

pcba_tasks, pcba_datasets, transformers = load_pcba()
(train_dataset, valid_dataset, test_dataset) = pcba_datasets

metric = Metric(metrics.roc_auc_score, np.mean, mode="classification")

n_features = train_dataset.get_data_shape()[0]
rate = ExponentialDecay(0.001, 0.8, 1000)
model = MultiTaskClassifier(
    len(pcba_tasks),
    n_features,
    dropouts=[.25],
    learning_rate=rate,
    weight_init_stddevs=[.1],
    batch_size=64)

# Fit trained model
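# A minimal sketch of the fit/evaluate step implied by the comment above,
# following the usual DeepChem pattern; the epoch count is an illustrative
# assumption, not taken from the source.
model.fit(train_dataset, nb_epoch=10)

train_scores = model.evaluate(train_dataset, [metric], transformers)
valid_scores = model.evaluate(valid_dataset, [metric], transformers)
print('Train scores:', train_scores)
print('Validation scores:', valid_scores)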
import os
import shutil

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from deepchem.molnet import load_pcba
from deepchem import metrics
from deepchem.metrics import Metric
from deepchem.models.sklearn_models import SklearnModel
from deepchem.models.multitask import SingletaskToMultitask
from deepchem.utils.evaluate import Evaluator

np.random.seed(123)

# Set some global variables up top
reload = True
is_verbose = False
base_dir = "/tmp/pcba_sklearn"
model_dir = os.path.join(base_dir, "model")

if os.path.exists(base_dir):
    shutil.rmtree(base_dir)
os.makedirs(base_dir)

pcba_tasks, pcba_datasets, transformers = load_pcba()
(train_dataset, valid_dataset, test_dataset) = pcba_datasets

classification_metric = Metric(
    metrics.roc_auc_score, np.mean,
    verbose=is_verbose, mode="classification")


def model_builder(model_dir):
    # One balanced random forest per PCBA task, wrapped as a DeepChem model
    sklearn_model = RandomForestClassifier(
        class_weight="balanced", n_estimators=500)
    return SklearnModel(sklearn_model, model_dir)


model = SingletaskToMultitask(pcba_tasks, model_builder, model_dir)

# Fit trained model
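# A minimal sketch of the fit/evaluate step implied by the comment above,
# following the Evaluator-based pattern of older DeepChem example scripts;
# exact method signatures may differ across DeepChem versions.
model.fit(train_dataset)
model.save()

train_evaluator = Evaluator(model, train_dataset, transformers)
train_scores = train_evaluator.compute_model_performance([classification_metric])
print('Train scores:', train_scores)

valid_evaluator = Evaluator(model, valid_dataset, transformers)
valid_scores = valid_evaluator.compute_model_performance([classification_metric])
print('Validation scores:', valid_scores)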