def in_silico_mutagenesis(model: Model,
                          encoded_sequences: np.ndarray) -> np.ndarray:
    """Computes in-silico-mutagenesis scores.

    Parameters
    ----------
    model: Model
        This can be any model that accepts inputs of the required shape and
        produces an output of shape `(N_sequences, N_tasks)`.
    encoded_sequences: np.ndarray
        A numpy array of shape `(N_sequences, N_letters, sequence_length, 1)`

    Returns
    -------
    np.ndarray
        A numpy array of ISM scores. The shape is
        `(N_tasks, N_sequences, N_letters, sequence_length, 1)`.
    """
    # Shape (N_sequences, num_tasks)
    wild_type_predictions = model.predict(NumpyDataset(encoded_sequences))
    # Check that the model returned an np.ndarray
    assert isinstance(wild_type_predictions, np.ndarray)
    num_tasks = wild_type_predictions.shape[1]
    # Shape (N_sequences, N_letters, sequence_length, 1, num_tasks)
    mutagenesis_scores = np.empty(
        encoded_sequences.shape + (num_tasks,), dtype=np.float32)
    # Shape (N_sequences, 1, 1, 1, num_tasks), so it broadcasts against the
    # per-sequence mutant predictions below.
    wild_type_predictions = wild_type_predictions[:, np.newaxis, np.newaxis,
                                                  np.newaxis]
    for sequence_index, (sequence, wild_type_prediction) in enumerate(
            zip(encoded_sequences, wild_type_predictions)):
        # Mutates every position of the sequence to every letter.
        # Shape (N_letters * sequence_length, N_letters, sequence_length, 1)
        # Breakdown: sequence[np.newaxis] has shape
        # (1, N_letters, sequence_length, 1)
        mutated_sequences = np.repeat(
            sequence[np.newaxis], np.prod(sequence.shape), axis=0)

        # Remove the wild-type letter at each mutated position.
        # len(arange) = N_letters * sequence_length
        arange = np.arange(len(mutated_sequences))
        # len(horizontal_cycle) = N_letters * sequence_length
        horizontal_cycle = np.tile(
            np.arange(sequence.shape[1]), sequence.shape[0])
        mutated_sequences[arange, :, horizontal_cycle, :] = 0

        # Add the mutant letter.
        vertical_repeat = np.repeat(
            np.arange(sequence.shape[0]), sequence.shape[1])
        mutated_sequences[arange, vertical_repeat, horizontal_cycle, :] = 1

        # Make mutant predictions.
        mutated_predictions = model.predict(NumpyDataset(mutated_sequences))
        # Check that the model returned an np.ndarray
        assert isinstance(mutated_predictions, np.ndarray)
        mutated_predictions = mutated_predictions.reshape(
            sequence.shape + (num_tasks,))
        mutagenesis_scores[
            sequence_index] = wild_type_prediction - mutated_predictions
    # Move the task axis to the front.
    rolled_scores = np.rollaxis(mutagenesis_scores, -1)
    return rolled_scores
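# The fancy indexing above is dense; the following is a minimal,
# self-contained sketch of the same mutation trick on a toy one-hot
# sequence (toy shapes only; no DeepChem model involved).
import numpy as np

N_letters, sequence_length = 4, 3
# One-hot encode a toy 3-letter sequence (rows = letters, columns = positions).
toy_sequence = np.zeros((N_letters, sequence_length, 1), dtype=np.float32)
toy_sequence[[0, 1, 2], [0, 1, 2], 0] = 1

# One copy of the sequence per (letter, position) pair.
mutated = np.repeat(toy_sequence[np.newaxis], N_letters * sequence_length,
                    axis=0)
arange = np.arange(len(mutated))
horizontal_cycle = np.tile(np.arange(sequence_length), N_letters)
vertical_repeat = np.repeat(np.arange(N_letters), sequence_length)
mutated[arange, :, horizontal_cycle, :] = 0  # erase the wild-type letter
mutated[arange, vertical_repeat, horizontal_cycle, :] = 1  # write the mutant

# Every column of every copy is still one-hot after the mutation.
assert (mutated.sum(axis=1) == 1).all()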
def fit_model(model_name, model_params, model_dir, data_dir):
    """Builds model from featurized data."""
    task_type = Model.get_task_type(model_name)
    train_dir = os.path.join(data_dir, "train-data")
    train = Dataset(train_dir)
    task_types = {task: task_type for task in train.get_task_names()}
    model_params["data_shape"] = train.get_data_shape()
    model = Model.model_builder(model_name, task_types, model_params)
    model.fit(train)
    model.save(model_dir)
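# Hypothetical invocation of fit_model; the directory paths and parameter
# values are illustrative only, and assume data_dir already contains a
# featurized "train-data" directory.
fit_model(model_name="multitask_deep_classifier",
          model_params={"nb_hidden": 10, "batch_size": 5, "nb_epoch": 2},
          model_dir="/tmp/model",
          data_dir="/tmp/data")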
def create_and_eval_model(train_dataset, test_dataset, task_type,
                          model_params, model_name, model_dir, tasks):
    """Helper method to create model for test."""
    # Fit model
    task_types = {task: task_type for task in tasks}
    model_params["data_shape"] = train_dataset.get_data_shape()
    print("Creating Model object.")
    import deepchem.models.deep
    model = Model.model_builder(model_name, task_types, model_params)
    print("About to fit model")
    model.fit(train_dataset)
    print("Done fitting, about to save...")
    model.save(model_dir)

    # Eval model on train
    evaluator = Evaluator(model, train_dataset, verbose=True)
    with tempfile.NamedTemporaryFile() as train_csv_out:
        with tempfile.NamedTemporaryFile() as train_stats_out:
            _, performance_df = evaluator.compute_model_performance(
                train_csv_out, train_stats_out)
    print("train_performance_df")
    print(performance_df)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, verbose=True)
    with tempfile.NamedTemporaryFile() as test_csv_out:
        with tempfile.NamedTemporaryFile() as test_stats_out:
            _, performance_df = evaluator.compute_model_performance(
                test_csv_out, test_stats_out)
    print("test_performance_df")
    print(performance_df)

    # .next() is Python 2 only; use the next() builtin so this runs under
    # Python 3 as well.
    return next(performance_df.iterrows())[1]["r2_score"]
def eval_trained_model(model_type, model_dir, data_dir, csv_out, stats_out):
    """Evaluates a trained model on specified data."""
    model = Model.load(model_type, model_dir)
    data = Dataset(data_dir)
    evaluator = Evaluator(model, data, verbose=True)
    _, perf_df = evaluator.compute_model_performance(csv_out, stats_out)
    print("Model Performance.")
    print(perf_df)
def predict(self, dataset, transformers=[], batch_size=None,
            pad_batches=False):
    """
    Uses self to make predictions on provided Dataset object.

    This is overridden to make sure the batch size is always valid for
    Tensorflow.

    Returns:
      y_pred: numpy ndarray of shape (n_samples,)
    """
    return Model.predict(self, dataset, transformers,
                         self.model_instance.batch_size, True)
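# Why pad_batches is forced to True above: the batch size must stay valid
# for the TensorFlow graph, so a final partial batch has to be padded back
# up to the full batch size. A toy sketch with a hypothetical pad_batch
# helper (not the library's implementation):
import numpy as np

def pad_batch(batch_size, X):
    """Tiles X until it has at least batch_size rows, then truncates."""
    reps = int(np.ceil(batch_size / len(X)))
    return np.concatenate([X] * reps)[:batch_size]

X_last = np.arange(6).reshape(2, 3)  # a final batch of only 2 samples
assert pad_batch(4, X_last).shape == (4, 3)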
def load(self, model_dir):
    """
    Load keras multitask DNN from disk.
    """
    filename = Model.get_model_filename(model_dir)
    filename, _ = os.path.splitext(filename)
    json_filename = "%s.%s" % (filename, "json")
    h5_filename = "%s.%s" % (filename, "h5")
    with open(json_filename) as file_obj:
        model = model_from_json(file_obj.read())
    model.load_weights(h5_filename)
    self.raw_model = model
def reload(self):
    """
    Load keras multitask DNN from disk.
    """
    filename = Model.get_model_filename(self.model_dir)
    filename, _ = os.path.splitext(filename)
    json_filename = "%s.%s" % (filename, "json")
    h5_filename = "%s.%s" % (filename, "h5")
    with open(json_filename) as file_obj:
        model = model_from_json(file_obj.read())
    model.load_weights(h5_filename)
    self.raw_model = model
def reload(self, custom_objects={}):
    """
    Load keras multitask DNN from disk.
    """
    filename = Model.get_model_filename(self.model_dir)
    filename, _ = os.path.splitext(filename)
    json_filename = "%s.%s" % (filename, "json")
    h5_filename = "%s.%s" % (filename, "h5")
    with open(json_filename) as file_obj:
        model = model_from_json(file_obj.read(),
                                custom_objects=custom_objects)
    model.load_weights(h5_filename)
    self.model_instance = model
def test_API(self):
    """Straightforward test of multitask deepchem classification API."""
    splittype = "scaffold"
    feature_types = ["ECFP"]
    output_transforms = []
    input_transforms = []
    task_type = "classification"
    # TODO(rbharath): There should be some automatic check to ensure that all
    # required model_params are specified.
    model_params = {"nb_hidden": 10,
                    "activation": "relu",
                    "dropout": .5,
                    "learning_rate": .01,
                    "momentum": .9,
                    "nesterov": False,
                    "decay": 1e-4,
                    "batch_size": 5,
                    "nb_epoch": 2}
    model_name = "multitask_deep_classifier"

    # Featurize input
    featurizer = DataFeaturizer(tasks=self.tasks,
                                smiles_field=self.smiles_field,
                                verbose=True)
    feature_files = featurizer.featurize(self.input_file, feature_types,
                                         self.feature_dir)

    # Transform data into arrays for ML
    samples = FeaturizedSamples(self.samplesdir, feature_files,
                                reload_data=False)

    # Split into train/test
    train_samples, test_samples = samples.train_test_split(
        splittype, self.train_dir, self.test_dir)
    train_dataset = Dataset(self.train_dir, train_samples, feature_types)
    test_dataset = Dataset(self.test_dir, test_samples, feature_types)

    # Transforming train/test data
    train_dataset.transform(input_transforms, output_transforms)
    test_dataset.transform(input_transforms, output_transforms)

    # Fit model
    task_types = {task: task_type for task in self.tasks}
    model_params["data_shape"] = train_dataset.get_data_shape()
    model = Model.model_builder(model_name, task_types, model_params)
    model.fit(train_dataset)
    model.save(self.model_dir)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, verbose=True)
    with tempfile.NamedTemporaryFile() as test_csv_out:
        with tempfile.NamedTemporaryFile() as test_stats_out:
            evaluator.compute_model_performance(test_csv_out, test_stats_out)
def save(self, out_dir):
    """
    Saves underlying keras model to disk.
    """
    super(KerasModel, self).save(out_dir)
    model = self.get_raw_model()
    filename, _ = os.path.splitext(Model.get_model_filename(out_dir))
    # Note that keras requires the model architecture and weights to be
    # stored separately. A json file is generated that specifies the model
    # architecture. The weights will be stored in an h5 file. The pkl.gz
    # file will store the target name.
    json_filename = "%s.%s" % (filename, "json")
    h5_filename = "%s.%s" % (filename, "h5")
    # Save architecture. to_json() returns a str, so the file must be opened
    # in text mode ("w"), not binary mode ("wb").
    json_string = model.to_json()
    with open(json_filename, "w") as file_obj:
        file_obj.write(json_string)
    model.save_weights(h5_filename, overwrite=True)
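# A minimal round-trip sketch of the json + h5 split used above, with plain
# Keras and hypothetical file names (assumes the tf.keras / Keras 2 API):
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential, model_from_json

toy = Sequential([Dense(1, input_shape=(10,))])
with open("toy.json", "w") as f:
    f.write(toy.to_json())          # architecture only
toy.save_weights("toy.h5")          # weights only

with open("toy.json") as f:
    restored = model_from_json(f.read())
restored.load_weights("toy.h5")     # both halves are needed to restore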
def _create_model(self, splittype, feature_types, input_transforms,
                  output_transforms, task_type, model_params, model_name,
                  input_file, tasks, protein_pdb_field=None,
                  ligand_pdb_field=None):
    """Helper method to create model for test."""
    # Featurize input
    input_file = os.path.join(self.current_dir, input_file)
    featurizer = DataFeaturizer(tasks=tasks,
                                smiles_field=self.smiles_field,
                                protein_pdb_field=protein_pdb_field,
                                ligand_pdb_field=ligand_pdb_field,
                                verbose=True)
    feature_files = featurizer.featurize(input_file, feature_types,
                                         self.feature_dir)

    # Transform data into arrays for ML
    samples = FeaturizedSamples(self.samplesdir, feature_files,
                                reload_data=False)

    # Split into train/test
    train_samples, test_samples = samples.train_test_split(
        splittype, self.train_dir, self.test_dir)
    train_dataset = Dataset(self.train_dir, train_samples, feature_types)
    test_dataset = Dataset(self.test_dir, test_samples, feature_types)

    # Transforming train/test data
    train_dataset.transform(input_transforms, output_transforms)
    test_dataset.transform(input_transforms, output_transforms)

    # Fit model
    task_types = {task: task_type for task in tasks}
    model_params["data_shape"] = train_dataset.get_data_shape()
    model = Model.model_builder(model_name, task_types, model_params)
    model.fit(train_dataset)
    model.save(self.model_dir)

    # Eval model on test
    evaluator = Evaluator(model, test_dataset, verbose=True)
    with tempfile.NamedTemporaryFile() as test_csv_out:
        with tempfile.NamedTemporaryFile() as test_stats_out:
            _, _ = evaluator.compute_model_performance(
                test_csv_out, test_stats_out)
def reload(self):
    """Loads sklearn model from joblib file on disk."""
    self.model_instance = load_from_disk(
        Model.get_model_filename(self.model_dir))
        nb_tasks = len(sorted_tasks)
        y_pred = np.zeros((nb_samples, nb_tasks))
        for ind, task in enumerate(sorted_tasks):
            task_type = self.task_types[task]
            taskname = "task%d" % ind
            if task_type == "classification":
                # Class probabilities are predicted for classification
                # outputs. Instead, output the most likely class.
                y_pred_task = np.squeeze(
                    np.argmax(y_pred_dict[taskname], axis=1))
            else:
                y_pred_task = np.squeeze(y_pred_dict[taskname])
            y_pred[:, ind] = y_pred_task
        y_pred = np.squeeze(y_pred)
        return y_pred

Model.register_model_type("multitask_deep_regressor", MultiTaskDNN)
Model.register_model_type("multitask_deep_classifier", MultiTaskDNN)

class SingleTaskDNN(MultiTaskDNN):
    """
    Single-task DNN model, implemented as a special case of MultiTaskDNN.
    """
    def __init__(self, model_type, task_types, model_params,
                 initialize_raw_model=True):
        super(SingleTaskDNN, self).__init__(model_type, task_types,
                                            model_params,
                                            initialize_raw_model)

Model.register_model_type("singletask_deep_regressor", SingleTaskDNN)
Model.register_model_type("singletask_deep_classifier", SingleTaskDNN)

def to_one_hot(y):
    """Transforms label vector into one-hot encoding.
        model.add(Activation('relu'))
        model.add(MaxPooling3D(pool_size=(nb_pool[2], nb_pool[2],
                                          nb_pool[2])))
        model.add(Flatten())
        # TODO(rbharath): If we change away from axis-size 32, this code
        # will break. Eventually figure out a more general rule that works
        # for all axis sizes.
        model.add(Dense(16, init='normal'))
        model.add(Activation('relu'))
        model.add(Dropout(0.5))
        model.add(Dense(1, init='normal'))
        # Note: despite the variable name, the optimizer here is RMSprop,
        # not SGD.
        sgd = RMSprop(lr=learning_rate, decay=1e-6, momentum=0.9,
                      nesterov=True)
        print("About to compile model")
        model.compile(loss=loss_function, optimizer=sgd)
        self.raw_model = model

    def fit_on_batch(self, X, y, w):
        X = shuffle_data(X)
        loss = self.raw_model.train_on_batch(X, y)
        print("Loss: %f" % loss)

    def predict_on_batch(self, X):
        if len(np.shape(X)) != 5:
            raise ValueError(
                "Tensorial datatype must be of shape "
                "(n_samples, N, N, N, n_channels).")
        X = shuffle_data(X)
        y_pred = self.raw_model.predict_on_batch(X)
        y_pred = np.squeeze(y_pred)
        return y_pred

Model.register_model_type("convolutional_3D_regressor", DockingDNN)
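# The predict_on_batch contract above expects a 5D voxel tensor; a toy batch
# of two side-16 grids with one channel (sizes are illustrative only):
import numpy as np

X_batch = np.zeros((2, 16, 16, 16, 1), dtype=np.float32)
assert len(np.shape(X_batch)) == 5  # passes the shape guard in predict_on_batch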
        for ind, task in enumerate(sorted_tasks):
            task_type = self.task_types[task]
            taskname = "task%d" % ind
            if task_type == "classification":
                # Class probabilities are predicted for classification
                # outputs. Instead, output the most likely class.
                y_pred_task = np.squeeze(
                    np.argmax(y_pred_dict[taskname], axis=1))
            else:
                y_pred_task = np.squeeze(y_pred_dict[taskname])
            y_pred[:, ind] = y_pred_task
        y_pred = np.squeeze(y_pred)
        return y_pred

Model.register_model_type(MultiTaskDNN)

class SingleTaskDNN(MultiTaskDNN):
    """
    Single-task DNN model, implemented as a special case of MultiTaskDNN.
    """
    def __init__(self, task_types, model_params, initialize_raw_model=True):
        super(SingleTaskDNN, self).__init__(
            task_types, model_params,
            initialize_raw_model=initialize_raw_model)

Model.register_model_type(SingleTaskDNN)
""" Makes predictions on dataset. """ # Sets batch_size which the default impl in Model expects #TODO(enf/rbharath): This is kludgy. Fix later. if "batch_size" not in self.model_params.keys(): self.model_params["batch_size"] = 32 return super(SklearnModel, self).predict(X) def save(self, out_dir): """Saves sklearn model to disk using joblib.""" super(SklearnModel, self).save(out_dir) save_to_disk(self.raw_model, self.get_model_filename(out_dir)) def load(self, model_dir): """Loads sklearn model from joblib file on disk.""" self.raw_model = load_from_disk(Model.get_model_filename(model_dir)) Model.register_model_type(SklearnModel) #TODO(enf/rbharath): deprecate the following if __init__.py functions as planned. ''' Model.register_model_type("logistic", SklearnModel) Model.register_model_type("rf_classifier", SklearnModel) Model.register_model_type("rf_regressor", SklearnModel) Model.register_model_type("linear", SklearnModel) Model.register_model_type("ridge", SklearnModel) Model.register_model_type("lasso", SklearnModel) Model.register_model_type("lasso_lars", SklearnModel) Model.register_model_type("elastic_net", SklearnModel) '''
        for (X, y, _, _) in numpy_dataset.itershards():
            Xs.append(X)
            ys.append(y)
        X = np.concatenate(Xs)
        y = np.concatenate(ys).ravel()
        self.raw_model.fit(X, y)

    def predict_on_batch(self, X):
        """
        Makes predictions on given batch of new data.
        """
        return self.raw_model.predict(X)

    def save(self, out_dir):
        """Saves sklearn model to disk using joblib."""
        super(SklearnModel, self).save(out_dir)
        save_to_disk(self.raw_model, self.get_model_filename(out_dir))

    def load(self, model_dir):
        """Loads sklearn model from joblib file on disk."""
        self.raw_model = load_from_disk(Model.get_model_filename(model_dir))

Model.register_model_type("logistic", SklearnModel)
Model.register_model_type("rf_classifier", SklearnModel)
Model.register_model_type("rf_regressor", SklearnModel)
Model.register_model_type("linear", SklearnModel)
Model.register_model_type("ridge", SklearnModel)
Model.register_model_type("lasso", SklearnModel)
Model.register_model_type("lasso_lars", SklearnModel)
Model.register_model_type("elastic_net", SklearnModel)
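# A toy sketch of the shard-concatenation pattern in the fit above: plain
# numpy arrays stand in for dataset shards, since sklearn estimators need
# the full (X, y) in memory at once.
import numpy as np

shards = [(np.ones((4, 8)), np.zeros((4, 1))),
          (np.ones((2, 8)), np.ones((2, 1)))]
Xs, ys = zip(*shards)
X = np.concatenate(Xs)           # shape (6, 8): all features at once
y = np.concatenate(ys).ravel()   # shape (6,): flat labels for sklearn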
def load(self, model_dir):
    """Loads sklearn model from joblib file on disk."""
    self.raw_model = load_from_disk(Model.get_model_filename(model_dir))
        nb_tasks = len(sorted_tasks)
        y_pred = np.zeros((nb_samples, nb_tasks))
        for ind, task in enumerate(sorted_tasks):
            task_type = self.task_types[task]
            taskname = "task%d" % ind
            if task_type == "classification":
                # Class probabilities are predicted for classification
                # outputs. Instead, output the most likely class.
                y_pred_task = np.squeeze(
                    np.argmax(y_pred_dict[taskname], axis=1))
            else:
                y_pred_task = np.squeeze(y_pred_dict[taskname])
            y_pred[:, ind] = y_pred_task
        y_pred = np.squeeze(y_pred)
        return y_pred

Model.register_model_type(MultiTaskDNN)

class SingleTaskDNN(MultiTaskDNN):
    """
    Single-task DNN model, implemented as a special case of MultiTaskDNN.
    """
    def __init__(self, task_types, model_params, initialize_raw_model=True):
        super(SingleTaskDNN, self).__init__(
            task_types, model_params,
            initialize_raw_model=initialize_raw_model)

Model.register_model_type(SingleTaskDNN)

def to_one_hot(y):
    """Transforms label vector into one-hot encoding.

    Turns y into vector of shape [n_samples, 2] (assuming binary labels).
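# The body of to_one_hot is truncated above; this is a minimal sketch of an
# implementation consistent with its docstring (an assumption, not the
# library's code), for binary labels:
import numpy as np

def to_one_hot_sketch(y):
    """Maps a label vector of 0s and 1s to an array of shape [n_samples, 2]."""
    y = np.asarray(y, dtype=int)
    one_hot = np.zeros((len(y), 2), dtype=np.float32)
    one_hot[np.arange(len(y)), y] = 1
    return one_hot

assert (to_one_hot_sketch([0, 1, 1]) ==
        np.array([[1, 0], [0, 1], [0, 1]], dtype=np.float32)).all()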