def deeplift(model, X, batch_size=200, target_layer_idx=-2):
    """Compute DeepLIFT contribution scores for task 0 against a 40% GC reference.

    Arguments:
        model -- object exposing ``save(path)``; it is dumped to a temporary
            HDF5 file because the deeplift converter wants a saved model
        X -- 4-dimensional input array whose second axis has length 1
        batch_size -- batch size handed to the deeplift scoring function
        target_layer_idx -- index of the layer whose output is explained

    Returns:
        ``np.asarray`` of whatever the deeplift scoring function returns.
        NOTE(review): the previous docstring advertised the shape
        (num_task, num_samples, 1, num_bases, sequence_length) -- confirm.

    Raises:
        AssertionError -- if X does not have the expected shape.
    """
    import os
    import tempfile

    assert len(np.shape(X)) == 4 and np.shape(X)[1] == 1
    from deeplift.conversion import kerasapi_conversion as kc

    # Use a private temporary file instead of a fixed 'tmp.hdf5' in the
    # working directory: no clobbering of user files, no collisions between
    # concurrent runs, and nothing left behind afterwards.
    fd, saved_model_path = tempfile.mkstemp(suffix='.hdf5')
    os.close(fd)
    try:
        model.save(saved_model_path)
        # convert to deeplift model and get scoring function
        deeplift_model = kc.convert_model_from_saved_files(saved_model_path,
                                                           verbose=False)
    finally:
        try:
            os.remove(saved_model_path)
        except OSError:
            # Best effort only: the file may still be held open on some
            # platforms; failing to delete it must not mask the real result.
            pass

    # get the deeplift score with respect to the logit
    score_func = deeplift_model.get_target_contribs_func(
        find_scores_layer_idx=0, target_layer_idx=target_layer_idx)

    # use a 40% GC reference
    input_references = [np.array([0.3, 0.2, 0.2, 0.3])[None, None, None, :]]

    # get deeplift scores
    deeplift_scores = score_func(task_idx=0,
                                 input_data_list=[X],
                                 batch_size=batch_size,
                                 progress_update=None,
                                 input_references_list=input_references)
    return np.asarray(deeplift_scores)
def deeplift(model,
             X,
             batch_size=200,
             target_layer_idx=-2,
             task_idx=0,
             num_refs_per_seq=10,
             reference="shuffled_ref",
             one_hot_func=None):
    """Compute DeepLIFT scores using a shuffled or GC-content reference.

    Arguments:
        model -- passed straight to
            ``kerasapi_conversion.convert_model_from_saved_files``, i.e. the
            location of a saved model file
        X -- input data; when ``one_hot_func`` is None it must already be a
            4-dimensional array whose second axis has length 1
        batch_size -- batch size forwarded to the reference-specific scorer
        target_layer_idx -- index of the layer whose output is explained
        task_idx -- task index forwarded to the scorer
        num_refs_per_seq -- forwarded to ``deeplift_shuffled_ref``
        reference -- "shuffled_ref" or "gc_ref"
        one_hot_func -- forwarded to ``deeplift_shuffled_ref``; None means the
            data is already encoded

    Returns:
        ``np.asarray`` of the scores produced by the selected helper.
        NOTE(review): the previous docstring advertised the shape
        (num_task, num_samples, 1, num_bases, sequence_length) -- confirm.

    Raises:
        AssertionError -- unsupported ``reference`` or wrongly shaped ``X``.
    """
    assert reference in ["shuffled_ref", "gc_ref"]
    if one_hot_func is None:
        # check that dataset has been one-hot-encoded
        assert len(np.shape(X)) == 4 and np.shape(X)[1] == 1
    from deeplift.conversion import kerasapi_conversion as kc
    deeplift_model = kc.convert_model_from_saved_files(model, verbose=False)

    # get the deeplift score with respect to the logit
    score_func = deeplift_model.get_target_contribs_func(
        find_scores_layer_idx=0, target_layer_idx=target_layer_idx)

    if reference == "shuffled_ref":
        deeplift_scores = deeplift_shuffled_ref(X,
                                                score_func,
                                                batch_size,
                                                task_idx,
                                                num_refs_per_seq,
                                                one_hot_func=one_hot_func)
    elif reference == "gc_ref":
        deeplift_scores = deeplift_gc_ref(X, score_func, batch_size, task_idx)
    else:
        # Only reachable when assertions are disabled (python -O).
        raise Exception(
            "supported DeepLIFT references are 'shuffled_ref' and 'gc_ref'")
    return np.asarray(deeplift_scores)
def test_convert_conv1d_model_compute_scores(self):
    """Contribution scores of the two-input model must match grad*input.

    The model is converted with the Rescale rule and scored against all-zero
    references; with zero biases and a zero reference DeepLIFT with the
    rescale rule is the same as gradient times input.
    """
    converted = kc.convert_model_from_saved_files(
        self.saved_file_path,
        nonlinear_mxts_mode=NonlinearMxtsMode.Rescale)
    contribs_func = converted.get_target_contribs_func(
        find_scores_layer_name=["inp1_0", "inp2_0"],
        pre_activation_target_layer_name="output_preact_0")

    grads_inp1, grads_inp2 = self.grad_func(self.inp1, self.inp2)

    inputs_by_name = {'inp1_0': self.inp1, 'inp2_0': self.inp2}
    zero_refs_by_name = {
        'inp1_0': np.zeros_like(self.inp1),
        'inp2_0': np.zeros_like(self.inp2),
    }
    actual = np.array(
        contribs_func(task_idx=0,
                      input_data_list=inputs_by_name,
                      input_references_list=zero_refs_by_name,
                      batch_size=10,
                      progress_update=None))
    expected = np.array([grads_inp1 * self.inp1, grads_inp2 * self.inp2])
    np.testing.assert_almost_equal(actual, expected, decimal=6)
def get_deeplift_scoring_function(model, target_layer_idx=-2, task_idx=0,
                                  reference="shuffled_ref", sequential=True):
    """Build a DeepLIFT scoring function for a saved model.

    Arguments:
        model -- a string containing the path to the hdf5 exported model
        target_layer_idx -- should be -2 for classification; -1 for regression
        task_idx -- kept for backward compatibility; the task is chosen when
            the returned scoring function is called, so it is not used here
        reference -- one of 'shuffled_ref','gc_ref','zero_ref'; only
            'shuffled_ref' changes the returned function (it is wrapped so
            that dinucleotide-shuffled references are generated)
        sequential -- True to address layers by index; False to look the
            input/target layers up by name and score hypothetical
            contributions for one-hot input

    Returns:
        deepLIFT scoring function
    """
    from deeplift.conversion import kerasapi_conversion as kc
    deeplift_model = kc.convert_model_from_saved_files(model, verbose=False)

    # get the deeplift score with respect to the logit
    if sequential:
        # Scores are computed for the input layer (index 0). The previous
        # code passed task_idx here, which silently selected a different
        # *layer* whenever a non-zero task index was requested.
        score_func = deeplift_model.get_target_contribs_func(
            find_scores_layer_idx=0, target_layer_idx=target_layer_idx)
    else:
        # Imported locally: only `kc` is bound above, so referring to
        # `deeplift.util` relied on a module-level `import deeplift`.
        from deeplift.util import get_hypothetical_contribs_func_onehot
        input_name = deeplift_model.get_input_layer_names()[0]
        layer_names = list(deeplift_model.get_name_to_layer().keys())
        target_layer_name = layer_names[target_layer_idx]
        multipliers_func = deeplift_model.get_target_multipliers_func(
            input_name, target_layer_name)
        score_func = get_hypothetical_contribs_func_onehot(multipliers_func)

    if reference == "shuffled_ref":
        from deeplift.util import get_shuffle_seq_ref_function
        from deeplift.dinuc_shuffle import dinuc_shuffle
        score_func = get_shuffle_seq_ref_function(
            score_computation_function=score_func,
            shuffle_func=dinuc_shuffle,
            one_hot_func=None)
    return score_func
def test_convert_conv1d_model_compute_scores(self):
    """Verify Rescale-mode contributions against gradient times input.

    Both named inputs are scored with zero references; when biases are 0 and
    the reference is 0, deeplift with the rescale rule is the same as
    grad*inp.
    """
    dl_model = kc.convert_model_from_saved_files(
        self.saved_file_path,
        nonlinear_mxts_mode=NonlinearMxtsMode.Rescale)
    score_fn = dl_model.get_target_contribs_func(
        find_scores_layer_name=["inp1_0", "inp2_0"],
        pre_activation_target_layer_name="output_preact_0")

    grads_inp1, grads_inp2 = self.grad_func(self.inp1, self.inp2)

    data = {'inp1_0': self.inp1, 'inp2_0': self.inp2}
    references = {name: np.zeros_like(arr) for name, arr in data.items()}
    observed = score_fn(task_idx=0,
                        input_data_list=data,
                        input_references_list=references,
                        batch_size=10,
                        progress_update=None)

    np.testing.assert_almost_equal(
        np.array(observed),
        np.array([grads_inp1 * self.inp1, grads_inp2 * self.inp2]),
        decimal=6)
def deeplift(model, X, batch_size=200, target_layer_idx=-2, task_idx=0,
             num_refs_per_seq=10, reference="shuffled_ref", one_hot_func=None):
    """Compute DeepLIFT scores using a shuffled, GC-content or zero reference.

    Arguments:
        model -- passed straight to
            ``kerasapi_conversion.convert_model_from_saved_files``, i.e. the
            location of a saved model file
        X -- input data; when ``one_hot_func`` is None it must already be a
            4-dimensional array whose second axis has length 1
        batch_size -- batch size forwarded to the reference-specific scorer
        target_layer_idx -- index of the layer whose output is explained
        task_idx -- task index forwarded to the scorer
        num_refs_per_seq -- forwarded to ``deeplift_shuffled_ref``
        reference -- "shuffled_ref", "gc_ref" or "zero_ref"
        one_hot_func -- forwarded to ``deeplift_shuffled_ref``; None means the
            data is already encoded

    Returns:
        ``np.asarray`` of the scores produced by the selected helper.
        NOTE(review): the previous docstring advertised the shape
        (num_task, num_samples, 1, num_bases, sequence_length) -- confirm.

    Raises:
        AssertionError -- unsupported ``reference`` or wrongly shaped ``X``.
    """
    assert reference in ["shuffled_ref", "gc_ref", "zero_ref"]
    if one_hot_func is None:
        # check that dataset has been one-hot-encoded
        assert len(np.shape(X)) == 4 and np.shape(X)[1] == 1
    from deeplift.conversion import kerasapi_conversion as kc
    deeplift_model = kc.convert_model_from_saved_files(model, verbose=False)

    # get the deeplift score with respect to the logit
    score_func = deeplift_model.get_target_contribs_func(
        find_scores_layer_idx=0, target_layer_idx=target_layer_idx)

    if reference == "shuffled_ref":
        deeplift_scores = deeplift_shuffled_ref(
            X, score_func, batch_size, task_idx, num_refs_per_seq,
            one_hot_func=one_hot_func)
    elif reference == "gc_ref":
        deeplift_scores = deeplift_gc_ref(X, score_func, batch_size, task_idx)
    elif reference == "zero_ref":
        deeplift_scores = deeplift_zero_ref(X, score_func, batch_size, task_idx)
    else:
        # Only reachable when assertions are disabled (python -O). The
        # message now lists every reference this function actually supports.
        raise Exception(
            "supported DeepLIFT references are 'shuffled_ref', 'gc_ref' "
            "and 'zero_ref'")
    return np.asarray(deeplift_scores)
def get_deeplift_scoring_function(model,
                                  target_layer_idx=-2,
                                  task_idx=0,
                                  num_refs_per_seq=10,
                                  reference="shuffled_ref",
                                  one_hot_func=None):
    """Return a DeepLIFT scoring function for a saved model.

    Arguments:
        model -- a string containing the path to the hdf5 exported model
        target_layer_idx -- layer in the model whose outputs will be
            interpreted. For classification models we interpret the logit
            (input to the sigmoid), which is the output of layer -2. For
            regression models we interpret the model output, which is the
            output of layer -1.
        task_idx, num_refs_per_seq -- accepted but not used when building the
            function
        reference -- one of 'shuffled_ref','gc_ref','zero_ref'
        one_hot_func -- one hot function to use for encoding FASTA string
            inputs; if the inputs are already one-hot-encoded, use the
            default of None

    Returns:
        deepLIFT scoring function; for 'shuffled_ref' it is wrapped so that
        dinucleotide-shuffled references are generated.
    """
    assert reference in ["shuffled_ref", "gc_ref", "zero_ref"]
    from deeplift.conversion import kerasapi_conversion as kc

    converted = kc.convert_model_from_saved_files(model, verbose=False)
    # contributions of the input layer with respect to the chosen target layer
    contribs_func = converted.get_target_contribs_func(
        find_scores_layer_idx=0, target_layer_idx=target_layer_idx)

    if reference != "shuffled_ref":
        return contribs_func

    from deeplift.util import get_shuffle_seq_ref_function
    from deeplift.dinuc_shuffle import dinuc_shuffle
    return get_shuffle_seq_ref_function(
        score_computation_function=contribs_func,
        shuffle_func=dinuc_shuffle,
        one_hot_func=one_hot_func)
def inp_deeplift(PathOutput, Data_2, Labels_2, preds, n_model):
    """Build per-sample DeepLIFT (RevealCancel) heatmaps for the predicted class.

    Arguments:
        PathOutput -- prefix to which 'best_model.hd5' is appended to locate
            the saved model
        Data_2 -- input array; indexed as a 5-dimensional array below, the
            mean over axis 0 is used as the reference input
        Labels_2 -- unused; kept so existing callers keep working
        preds -- per-sample predictions; the argmax of each row selects which
            task's scores are copied into the heatmap (must be 0 or 1, since
            only two tasks are scored)
        n_model -- model variant, one of 0, 1, 2, 3

    Returns:
        Array of shape ``Data_2.shape[0:4]`` holding, for every sample, channel
        0 of the contribution scores of its predicted class.

    Raises:
        ValueError -- if ``n_model`` is not a supported variant (previously
            this surfaced later as an unbound-variable error).
    """
    if n_model not in (0, 1, 2, 3):
        raise ValueError(
            "n_model must be one of 0, 1, 2 or 3, got %r" % (n_model,))

    # All supported variants use the same DeepLIFT input/target layer names.
    deeplift_layer0 = "input1_0"
    deeplift_layer1 = "dense_2_0"

    saved_model_file = PathOutput + 'best_model.hd5'
    revealcancel_model = kc.convert_model_from_saved_files(
        h5_file=saved_model_file,
        nonlinear_mxts_mode=NonlinearMxtsMode.RevealCancel)
    revealcancel_func = revealcancel_model.get_target_contribs_func(
        find_scores_layer_name=deeplift_layer0,
        pre_activation_target_layer_name=deeplift_layer1)

    print("Computing scores for:", 'revealcancel')
    # Reference input: the average sample (hoisted, it does not depend on
    # the task being scored).
    mean_reference = np.average(Data_2, axis=0)
    task_to_scores = {}
    for task_idx in range(2):
        print("\tComputing scores for task: " + str(task_idx))
        task_to_scores[task_idx] = np.array(
            revealcancel_func(task_idx=task_idx,
                              input_data_list=[Data_2],
                              input_references_list=[mean_reference],
                              batch_size=4,
                              progress_update=None))

    # Generate the heatmap: for each sample keep the scores of the class the
    # model actually predicted.
    Data_2_map = np.zeros(Data_2.shape[0:4])
    for nn in range(Data_2.shape[0]):
        class_idx = np.argmax(preds[nn])
        Data_2_map[nn] = task_to_scores[class_idx][nn, :, :, :, 0]
    return Data_2_map
def test_convert_conv2d_model_forward_prop(self):
    """Forward pass of the converted model must match the Keras output."""
    converted = kc.convert_model_from_saved_files(self.saved_file_path)

    # compile a function from the first layer's activations to the last's
    input_vars = converted.get_layers()[0].get_activation_vars()
    output_vars = converted.get_layers()[-1].get_activation_vars()
    fprop = compile_func([input_vars], output_vars)

    actual = fprop(self.inp)
    expected = self.keras_output_fprop_func([self.inp, 0])
    np.testing.assert_almost_equal(actual, expected, decimal=6)
def test_convert_conv2d_model_forward_prop(self):
    """Compare the deeplift forward propagation with the Keras one."""
    dl_model = kc.convert_model_from_saved_files(self.saved_file_path)
    first_layer = dl_model.get_layers()[0]
    fprop_inputs = [first_layer.get_activation_vars()]
    last_layer = dl_model.get_layers()[-1]
    forward = compile_func(fprop_inputs, last_layer.get_activation_vars())

    np.testing.assert_almost_equal(
        forward(self.inp),
        self.keras_output_fprop_func([self.inp, 0]),
        decimal=6,
    )
def getDeepliftScores(self,
                      weight_path,
                      yaml_path,
                      data=None,
                      sequence_length=150,
                      use_references=True):
    """Compute contribution and hypothetical-contribution DeepLIFT scores.

    Both score sets are computed for task 0 against dinucleotide-shuffled
    references (10 per sequence).

    Arguments:
        weight_path -- saved weights file handed to the deeplift converter
        yaml_path -- second positional argument of the converter.
            NOTE(review): it is passed positionally; confirm that the
            converter's second parameter is the one meant for this file.
        data -- sequences to score; defaults to ``self.X[:, 0, :, :]``
        sequence_length -- unused; kept for backward compatibility
        use_references -- unused; kept for backward compatibility

    Returns:
        ``(scores, hypothetical_scores)``: contribution scores and
        hypothetical contribution scores.
    """
    if data is None:
        data = self.X[:, 0, :, :]

    deeplift_model = kc.convert_model_from_saved_files(
        weight_path,
        yaml_path,
        nonlinear_mxts_mode=deeplift.layers.NonlinearMxtsMode.
        DeepLIFT_GenomicsDefault)

    deeplift_contribs_func = deeplift_model.get_target_contribs_func(
        find_scores_layer_idx=0, target_layer_idx=-1)
    multipliers_func = deeplift_model.get_target_multipliers_func(
        find_scores_layer_idx=0, target_layer_idx=-1)
    hypothetical_contribs_func = get_hypothetical_contribs_func_onehot(
        multipliers_func)

    # Wrap both scorers so they are evaluated against shuffled references.
    contribs_many_refs_func = get_shuffle_seq_ref_function(
        score_computation_function=deeplift_contribs_func,
        shuffle_func=dinuc_shuffle)
    hypothetical_contribs_many_refs_func = get_shuffle_seq_ref_function(
        score_computation_function=hypothetical_contribs_func,
        shuffle_func=dinuc_shuffle)

    num_refs_per_seq = 10

    # `scores` used to be referenced in the return statement without ever
    # being assigned (its computation lived in a disabled block), so every
    # call failed with NameError. It is now computed from the contribution
    # function that was already being built above.
    # NOTE(review): confirm both calls draw the same shuffled references.
    scores = contribs_many_refs_func(
        task_idx=0,
        input_data_sequences=data,
        num_refs_per_seq=num_refs_per_seq,
        batch_size=50,
        progress_update=1000,
    )
    hypothetical_scores = hypothetical_contribs_many_refs_func(
        task_idx=0,
        input_data_sequences=data,
        num_refs_per_seq=num_refs_per_seq,
        batch_size=50,
        progress_update=1000,
    )
    return scores, hypothetical_scores
def test_convert_conv1d_model_forward_prop(self):
    """Forward pass of the Gradient-mode conversion must match Keras."""
    converted = kc.convert_model_from_saved_files(
        self.saved_file_path,
        nonlinear_mxts_mode=NonlinearMxtsMode.Gradient)

    input_vars = converted.get_layers()[0].get_activation_vars()
    output_vars = converted.get_layers()[-1].get_activation_vars()
    fprop = compile_func(inputs=[input_vars], outputs=output_vars)

    actual = fprop(self.inp)
    expected = self.keras_output_fprop_func([self.inp, 0])
    np.testing.assert_almost_equal(actual, expected, decimal=6)
def test_batch_norm_convert_model_fprop(self):
    """Forward pass of the converted batch-norm model must match Keras.

    Agreement is only required to 5 decimal places.
    """
    converted = kc.convert_model_from_saved_files(
        self.saved_file_path,
        nonlinear_mxts_mode=NonlinearMxtsMode.Rescale)

    input_vars = converted.get_layers()[0].get_activation_vars()
    output_vars = converted.get_layers()[-1].get_activation_vars()
    fprop = compile_func([input_vars], output_vars)

    actual = fprop(self.inp)
    expected = self.keras_output_fprop_func([self.inp, 0])
    np.testing.assert_almost_equal(actual, expected, decimal=5)
def test_convert_conv1d_model_forward_prop(self):
    """Compare the Rescale-mode deeplift forward pass with the Keras one."""
    dl_model = kc.convert_model_from_saved_files(
        self.saved_file_path,
        nonlinear_mxts_mode=NonlinearMxtsMode.Rescale)
    first_layer = dl_model.get_layers()[0]
    fprop_inputs = [first_layer.get_activation_vars()]
    last_layer = dl_model.get_layers()[-1]
    forward = compile_func(inputs=fprop_inputs,
                           outputs=last_layer.get_activation_vars())

    np.testing.assert_almost_equal(
        forward(self.inp),
        self.keras_output_fprop_func([self.inp, 0]),
        decimal=6,
    )
def test_convert_conv1d_model_forward_prop(self):
    """Forward pass of the two-input converted model must match Keras.

    Layers are addressed by name ('inp1_0', 'inp2_0' -> 'output_postact_0').
    """
    converted = kc.convert_model_from_saved_files(
        self.saved_file_path,
        nonlinear_mxts_mode=NonlinearMxtsMode.Rescale)
    # show the available layer names to ease debugging of name mismatches
    print(converted.get_name_to_layer().keys())

    inp1_vars = converted.get_name_to_layer()['inp1_0'].get_activation_vars()
    inp2_vars = converted.get_name_to_layer()['inp2_0'].get_activation_vars()
    out_vars = (
        converted.get_name_to_layer()['output_postact_0'].get_activation_vars())
    fprop = compile_func([inp1_vars, inp2_vars], out_vars)

    actual = fprop([self.inp1, self.inp2])
    expected = self.keras_output_fprop_func(self.inp1, self.inp2)
    np.testing.assert_almost_equal(actual, expected, decimal=6)
def test_convert_conv2d_model_compute_scores(self):
    """Contribution scores must equal gradient times input.

    No reference is supplied to the scoring function; when biases are 0 and
    the reference is 0, deeplift is the same as grad*inp.
    """
    converted = kc.convert_model_from_saved_files(self.saved_file_path)
    contribs_func = converted.get_target_contribs_func(
        find_scores_layer_idx=0, target_layer_idx=-2)

    actual = contribs_func(task_idx=0,
                           input_data_list=[self.inp],
                           batch_size=10,
                           progress_update=None)
    expected = self.grad_func([self.inp, 0]) * self.inp
    np.testing.assert_almost_equal(actual, expected, decimal=6)
def test_batch_norm_convert_model_backprop(self):
    """Multipliers of the converted batch-norm model must match the gradient.

    Agreement is only required to 5 decimal places.
    """
    converted = kc.convert_model_from_saved_files(
        self.saved_file_path,
        nonlinear_mxts_mode=NonlinearMxtsMode.Rescale)
    multipliers_func = converted.get_target_multipliers_func(
        find_scores_layer_idx=0, target_layer_idx=-1)

    actual = multipliers_func(task_idx=0,
                              input_data_list=[self.inp],
                              batch_size=10,
                              progress_update=None)
    expected = self.grad_func([self.inp, 0])
    np.testing.assert_almost_equal(actual, expected, decimal=5)
def test_convert_conv2d_model_compute_scores(self):
    """Check deeplift contributions of the conv2d model against grad*inp."""
    dl_model = kc.convert_model_from_saved_files(self.saved_file_path)
    score_fn = dl_model.get_target_contribs_func(find_scores_layer_idx=0,
                                                 target_layer_idx=-2)
    scoring_kwargs = dict(task_idx=0,
                          input_data_list=[self.inp],
                          batch_size=10,
                          progress_update=None)

    # when biases are 0 and ref is 0, deeplift is the same as grad*inp
    np.testing.assert_almost_equal(
        score_fn(**scoring_kwargs),
        self.grad_func([self.inp, 0]) * self.inp,
        decimal=6,
    )
def test_convert_conv1d_model_compute_scores(self):
    """Gradient-mode contribution scores must equal grad_func output * input."""
    converted = kc.convert_model_from_saved_files(
        self.saved_file_path,
        nonlinear_mxts_mode=NonlinearMxtsMode.Gradient)
    contribs_func = converted.get_target_contribs_func(
        find_scores_layer_idx=0, target_layer_idx=-2)

    actual = contribs_func(task_idx=0,
                           input_data_list=[self.inp],
                           batch_size=10,
                           progress_update=None)
    expected = self.grad_func([self.inp, 0]) * self.inp
    np.testing.assert_almost_equal(actual, expected, decimal=6)
def __init__(self):
    """Load the packaged model and prepare its DeepLIFT importance function.

    The model file shipped inside the 'iseeu' package is loaded once as a
    regular model and once converted to a deeplift model (RevealCancel
    rule). Importance is computed for the first input layer with respect to
    the 'preact_fc2_0' layer.
    """
    packaged_model_path = pkg_resources.resource_filename(
        'iseeu', 'models/kfold4_best.hdf5')
    print(f"****{packaged_model_path}*****")

    self._model = load_model(packaged_model_path)
    self._deeplift_model = kc.convert_model_from_saved_files(
        h5_file=packaged_model_path,
        nonlinear_mxts_mode=NonlinearMxtsMode.RevealCancel,
        verbose=False)

    first_input_name = self._deeplift_model.get_input_layer_names()[0]
    self._importance_func = self._deeplift_model.get_target_contribs_func(
        find_scores_layer_name=first_input_name,
        pre_activation_target_layer_name='preact_fc2_0')
def test_batch_norm_convert_model_backprop(self):
    """Compare deeplift multipliers of the batch-norm model with grad_func."""
    dl_model = kc.convert_model_from_saved_files(
        self.saved_file_path,
        nonlinear_mxts_mode=NonlinearMxtsMode.Rescale)
    mxts_fn = dl_model.get_target_multipliers_func(find_scores_layer_idx=0,
                                                   target_layer_idx=-1)
    scoring_kwargs = dict(task_idx=0,
                          input_data_list=[self.inp],
                          batch_size=10,
                          progress_update=None)

    np.testing.assert_almost_equal(
        mxts_fn(**scoring_kwargs),
        self.grad_func([self.inp, 0]),
        decimal=5,
    )
def _deeplift_contribs_generator(hdf5_path, x_test, process_x_func,
                                 num_feature, num_class, batch_size):
    """Generator which yields DeepLIFT contribution scores.

    Applies vectorization batch-by-batch to avoid memory overflow.

    Arguments:
        hdf5_path: str
            path to saved HDF5 Keras Model
        x_test: sized sequence
            raw samples; split into chunks of ``batch_size`` via ``chunks``
        process_x_func: function
            function for vectorizing feature data
        num_feature: int
            number of features present in the dataset
        num_class: int
            number of classes
        batch_size: int
            batch size

    Yields:
        One array of shape (num_class, len(chunk), num_feature) per chunk,
        holding the contribution scores of every class.
    """
    # convert Keras model, and get relevant function
    deeplift_model = kc.convert_model_from_saved_files(
        hdf5_path, nonlinear_mxts_mode=NonlinearMxtsMode.RevealCancel)

    # input layer is 0, since we have a softmax layer the target layer is -2
    get_deeplift_contribs = deeplift_model.get_target_contribs_func(
        find_scores_layer_idx=0, target_layer_idx=-2)

    # Ceiling division: a trailing partial chunk is still a batch. round()
    # under-reported the total whenever the remainder was at most half a batch.
    num_batch = -(-len(x_test) // batch_size)

    # yield a 3D array detailing the DeepLIFT contrib scores
    for batch_idx, raw_chunk in enumerate(chunks(x_test, batch_size)):
        start = time.time()
        x = process_x_func(raw_chunk)
        # Kept in its own name so the batch_size parameter is not shadowed.
        chunk_len = len(x)

        all_batch_contribs = np.zeros((num_class, chunk_len, num_feature))
        for c in range(num_class):
            # One all-zero reference for the single input; the list has one
            # entry per model input, not one per sample.
            all_batch_contribs[c] = get_deeplift_contribs(
                task_idx=c,
                input_data_list=[x],
                input_references_list=[0.0],
                batch_size=1024,
                progress_update=None)

        if not batch_idx % 10:
            print('{}/{} in {:.2f} s'.format(batch_idx, num_batch,
                                             time.time() - start))

        yield all_batch_contribs
def test_convert_conv1d_model_forward_prop(self):
    """Compare the two-input deeplift forward pass with the Keras one."""
    dl_model = kc.convert_model_from_saved_files(
        self.saved_file_path,
        nonlinear_mxts_mode=NonlinearMxtsMode.Rescale)
    print(dl_model.get_name_to_layer().keys())

    # activation variables of both inputs, in the order the data is fed below
    fprop_inputs = [
        dl_model.get_name_to_layer()[layer_name].get_activation_vars()
        for layer_name in ('inp1_0', 'inp2_0')
    ]
    fprop_output = (
        dl_model.get_name_to_layer()['output_postact_0'].get_activation_vars())
    forward = compile_func(fprop_inputs, fprop_output)

    np.testing.assert_almost_equal(
        forward([self.inp1, self.inp2]),
        self.keras_output_fprop_func(self.inp1, self.inp2),
        decimal=6,
    )
def run_deeplift_comparison(model_file, target_example, reference_example):
    """Score one example against one reference with DeepLIFT (RevealCancel).

    Arguments:
        model_file -- saved model handed to the deeplift converter
        target_example -- the single example to explain; it is wrapped into a
            batch of one before scoring
        reference_example -- reference input the example is compared against

    Returns:
        ``np.array`` of the contribution scores for task 0 with respect to
        the last layer.
    """
    # start from a clean backend session before converting the model
    backend.clear_session()
    deeplift_model = kc.convert_model_from_saved_files(
        model_file,
        nonlinear_mxts_mode=deeplift.layers.NonlinearMxtsMode.RevealCancel)
    deeplift_contribs_func = deeplift_model.get_target_contribs_func(
        find_scores_layer_idx=0, target_layer_idx=-1)
    # The data argument used to reference an undefined name `target`, so the
    # call raised NameError and `target_example` was never used.
    deeplift_results = np.array(
        deeplift_contribs_func(task_idx=0,
                               input_data_list=[[target_example]],
                               input_references_list=[reference_example],
                               batch_size=1,
                               progress_update=1))
    return deeplift_results
for j in range(1,15): print('======================model:CNN_1D_exclude_transcript_%f==================='%j) model = load_model('CNN_1D_exclude_transcript_'+str(j)+'.h5') predict = model.predict_classes(encoded_label1).astype('int') print(predict) """ for j in range(8, 9): print( '======================model:CNN_1D_exclude_transcript_%f======================' % j) deeplift_model = kc.convert_model_from_saved_files( 'CNN_1D_exclude_transcript_' + str(j) + '.h5', nonlinear_mxts_mode=deeplift.layers.NonlinearMxtsMode. DeepLIFT_GenomicsDefault) find_scores_layer_idx = 0 deeplift_contribs_func = deeplift_model.get_target_contribs_func( find_scores_layer_idx=find_scores_layer_idx, target_layer_idx=-2) background = OrderedDict([('A', 0.3), ('C', 0.2), ('G', 0.2), ('T', 0.3), ('N', 0)]) scores = np.array( deeplift_contribs_func(task_idx=1, input_data_list=[encoded_label1], input_references_list=[ np.array([ background['A'], background['C'], background['G'], background['T'],
else: dset["VBF"][vars] = scaler.transform(dset["VBF"][vars]) dset["Top"][vars] = scaler.transform(dset["Top"][vars]) if "WW" in cfg["samples"]: dset["WW"][vars] = scaler.transform(dset["WW"][vars]) ######################################################## # Deeplift initializations find_scores_layer_idx = 0 target_layer_idx = -2 n_vars = len(cfg["training_variables"]) # load model to deeplift deeplift_model = kc.convert_model_from_saved_files( model_input_path, nonlinear_mxts_mode=deeplift.layers.NonlinearMxtsMode. DeepLIFT_GenomicsDefault) deeplift_contribs_func = deeplift_model.get_target_contribs_func( \ find_scores_layer_idx=find_scores_layer_idx, \ target_layer_idx=target_layer_idx) ######################################################## # get inputs and calculate deeplift scores! if "filepathData" in cfg.keys(): inputs = dset["Data"][cfg["training_variables"]].values else: inputs = dset["VBF"][cfg["training_variables"]].append( dset["Top"][cfg["training_variables"]]).values # Cut on DNN output?
# One-hot encode the promoters and their shuffled counterparts, adding a new
# axis at position 3 to each encoded array.
encoded_seq = np.expand_dims(
    np.array([one_hot(prom) for prom in prom_seq]), 3)
encoded_shuf_seq = np.expand_dims(
    np.array([one_hot(prom) for prom in prom_shuf]), 3)
categories = np_utils.to_categorical(label, 2)

# Predict with the trained model and report its accuracy.
model = models.load_model(
    '/nam-99/ablage/nam/peleke/Thesis_models/model2020-10-06073328.h5')
predictions = np.argmax(model.predict(encoded_seq), axis=1)
actual = np.argmax(categories, axis=1)
print(predictions)
print(accuracy_score(label, predictions))

# compute deeplift scores
deeplift_model = kc.convert_model_from_saved_files(
    '/nam-99/ablage/nam/peleke/Thesis_models/model2020-10-06073328.h5',
    nonlinear_mxts_mode=NonlinearMxtsMode.DeepLIFT_GenomicsDefault)
deeplift_contrib_func = deeplift_model.get_target_contribs_func(
    find_scores_layer_idx=0, target_layer_idx=-2)

# True positive predictions: keep each sequence (and its shuffled partner)
# that is both predicted and labelled as class 1.
tp = []
tp_shuf = []
for pred, true, enc_seq, enc_shuf_seq in zip(predictions, actual,
                                             encoded_seq, encoded_shuf_seq):
    if not (pred == 1 and true == 1):
        continue
    tp.append(enc_seq)
    tp_shuf.append(enc_shuf_seq)
tp_data = np.array(tp)
def DeepLIFT(json_file, weight_file, onehot):
    """Score `onehot` with several attribution methods built on deeplift.

    The saved model (architecture in `json_file`, weights in `weight_file`)
    is converted once per nonlinearity mode. A contribution scoring function
    is compiled for each converted model, and a 10-interval integrated
    gradients scorer is built from the multipliers of the Gradient model.
    Every scorer is run for task 0 against a fixed background of
    A=0.3, C=0.2, G=0.2, T=0.3.

    Returns:
        OrderedDict: method name -> OrderedDict: task index -> scores summed
        over axis 2 (the axis asserted to have length 4).
    """
    from collections import OrderedDict

    import deeplift
    import deeplift.layers
    import deeplift.conversion.kerasapi_conversion
    import deeplift.conversion.kerasapi_conversion as kc
    from deeplift.layers import NonlinearMxtsMode

    named_modes = [
        # The genomics default = rescale on conv layers, revealcancel on
        # fully-connected
        ('rescale_conv_revealcancel_fc',
         NonlinearMxtsMode.DeepLIFT_GenomicsDefault),
        ('rescale_all_layers', NonlinearMxtsMode.Rescale),
        ('revealcancel_all_layers', NonlinearMxtsMode.RevealCancel),
        ('grad_times_inp', NonlinearMxtsMode.Gradient),
        ('guided_backprop', NonlinearMxtsMode.GuidedBackprop),
    ]
    method_to_model = OrderedDict()
    for method_name, mode in named_modes:
        method_to_model[method_name] = kc.convert_model_from_saved_files(
            h5_file=weight_file,
            json_file=json_file,
            nonlinear_mxts_mode=mode)

    print("Compiling scoring functions")
    method_to_scoring_func = OrderedDict()
    for method, converted in method_to_model.items():
        print("Compiling scoring function for: "+method)
        method_to_scoring_func[method] = converted.get_target_contribs_func(
            find_scores_layer_idx=0, target_layer_idx=-2)

    # To get a function that just gives the gradients, we use the
    # multipliers of the Gradient model
    grad_model = method_to_model['grad_times_inp']
    gradient_func = grad_model.get_target_multipliers_func(
        find_scores_layer_idx=0, target_layer_idx=-2)
    print("Compiling integrated gradients scoring functions")
    method_to_scoring_func['integrated_gradients10'] = (
        deeplift.util.get_integrated_gradients_function(
            gradient_computation_function=gradient_func,
            num_intervals=10))

    background = OrderedDict([('A', 0.3), ('C', 0.2), ('G', 0.2), ('T', 0.3)])

    method_to_task_to_scores = OrderedDict()
    for method_name, score_func in method_to_scoring_func.items():
        print("on method",method_name)
        task_to_scores = OrderedDict()
        method_to_task_to_scores[method_name] = task_to_scores
        for task_idx in [0]:
            reference = np.array([background['A'], background['C'],
                                  background['G'], background['T']])
            scores = np.array(
                score_func(task_idx=task_idx,
                           input_data_list=[onehot],
                           input_references_list=[reference[None, None, :]],
                           batch_size=200,
                           progress_update=None))
            assert scores.shape[2]==4
            # collapse the length-4 axis
            task_to_scores[task_idx] = np.sum(scores, axis=2)
    return method_to_task_to_scores
prepared_proms = np.expand_dims(np.array(encoded_proms, dtype=np.float32), axis=3) prepared_shuff_proms = np.expand_dims(np.array(encoded_shuff_proms, dtype=np.float32), axis=3) print(prepared_proms.shape) print(prepared_shuff_proms.shape) print(classes.shape) model = models.load_model('/nam-99/ablage/nam/peleke/Models/model2020-07-30150217.h5') predictions = np.argmax(model.predict(prepared_proms), axis=1) print(predictions) actual = np.argmax(classes, axis=1) print('Predictions done') deeplift_model =\ kc.convert_model_from_saved_files('/nam-99/ablage/nam/peleke/Models/model2020-07-30150217.h5', nonlinear_mxts_mode=NonlinearMxtsMode.DeepLIFT_GenomicsDefault) deeplift_contrib_func = deeplift_model.get_target_contribs_func(find_scores_layer_idx=0, target_layer_idx=-2) # Calculate contributions scores for tps and tns tp = [] tp_shuff = [] tn = [] tn_shuff = [] for pred, true, enc_seq, enc_shuf_seq in zip(predictions, actual, prepared_proms, prepared_shuff_proms): if pred == 1 and true == 1: tp.append(enc_seq) tp_shuff.append(enc_shuf_seq) elif pred == 0 and true == 0: tn.append(enc_seq)
def compute_deeplift_scores(dataset, X, Y, keras_model_file, reference_label,
                            non_reference_label, base_neuron_label, mask,
                            gpu_id, threshold, percentage_cutoff):
    """Compute DeepLIFT scores and write them, reshaped and thresholded, to CSV.

    Samples labelled `non_reference_label` are scored for task
    `base_neuron_label` against the 'average' reference produced by
    `get_reference` for `reference_label`. The per-sample scores are summed,
    padded back via `get_padded_data`, and mirrored from the strict upper
    triangle into a 264 x 264 matrix. Three files are written under
    './important_features/': the raw scores, the reshaped matrix and the
    features selected by `get_crucial_features_abs`.

    Side effects: sets CUDA_DEVICE_ORDER, CUDA_VISIBLE_DEVICES and
    TF_CPP_MIN_LOG_LEVEL in the process environment and creates the output
    folder when missing.

    Returns:
        (path of the selected-features file,
         flatten_data(selected feature matrix))
    """
    os.environ.update({
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": str(gpu_id),
        'TF_CPP_MIN_LOG_LEVEL': '3',
    })

    X_masked, mapping = get_masked_data(X, mask)
    Y = np.argmax(Y, axis=1)
    reference = get_reference('average', reference_label, X_masked, Y)

    deeplift_model = kc.convert_model_from_saved_files(
        keras_model_file,
        nonlinear_mxts_mode=deeplift.layers.NonlinearMxtsMode.
        DeepLIFT_GenomicsDefault)
    deeplift_contribs_func = deeplift_model.get_target_contribs_func(
        find_scores_layer_idx=0, target_layer_idx=-2)

    non_reference_samples = X_masked[Y == non_reference_label]
    scores = np.array(
        deeplift_contribs_func(task_idx=base_neuron_label,
                               input_references_list=reference,
                               input_data_list=[non_reference_samples],
                               batch_size=10,
                               progress_update=10))

    # Accumulate sample by sample (same order as before) so the floating
    # point result is unchanged.
    sum_scores = np.zeros(X_masked.shape[1])
    for sample_scores in scores:
        sum_scores += sample_scores
    padded_sum_scores = get_padded_data(sum_scores, mapping)

    print("Reshaping scores ")
    n_nodes = 264
    upper = np.zeros((n_nodes, n_nodes))
    upper[np.triu_indices(n_nodes, 1)] = padded_sum_scores
    lower = upper.T
    # mirror the strict upper triangle into a symmetric matrix
    full_matrix = upper + lower - np.diag(np.diag(lower))

    if not os.path.isdir('./important_features/'):
        print("Folder that will store the results cannot be found.")
        print("Creating the results folder in " + './important_features/')
        os.makedirs('./important_features/')

    # shared tail of all three output file names
    name_tail = str(threshold) + '_t_' + str(percentage_cutoff) + '.csv'

    np.savetxt('./important_features/' + dataset +
               '_scores_deeplift_reduced_r_' + name_tail,
               np.array(scores).T,
               delimiter=',')

    print('Writing reshaped scores')
    np.savetxt('./important_features/' + dataset +
               '_scores_reshaped_reduced_r_' + name_tail,
               full_matrix,
               delimiter=",")

    selected_features_matrix = get_crucial_features_abs(full_matrix,
                                                        percentage_cutoff)
    np.fill_diagonal(selected_features_matrix, 0)
    selected_features_file = ('./important_features/' + dataset +
                              '_deeplift_features_nodes_r_' + name_tail)
    np.savetxt(selected_features_file, selected_features_matrix)
    return selected_features_file, flatten_data(selected_features_matrix)
keras_model.load_weights(keras_model_weights) # ## Prepare the deeplift models # # ### Model conversion # # Convert the keras models to a deeplift model capable of computing importance # scores using DeepLIFT. # In[5]: deeplift_model = kc.convert_model_from_saved_files( h5_file=keras_model_weights, json_file=keras_model_json) ''' # ### Sanity checks # To ensure that the conversion happend correctly, ensure that the models give # identical predictions # # If you are using a functional model, see this issue for how to adapt the # code: https://github.com/kundajelab/deeplift/issues/54 # In[6]: #make sure predictions are the same as the original model from deeplift.util import compile_func
def __init__(self,
             model,
             output_layer,
             task_idx,
             preact=True,
             mxts_mode='rescale_conv_revealcancel_fc',
             batch_size=32):
    """
    Args:
      model: Kipoi model
      output_layer (int): selected Keras layer with respect to which the scores should be calculated
      task_idx (int): Node/Neuron within the selected layer with respect to which the score should be calculated
      preact: !NOT YET IMPLEMENTED! Use values prior to activation - for now the default is True!
      mxts_mode: Selected score; one of 'rescale_conv_revealcancel_fc',
        'revealcancel_all_layers', 'rescale_all_layers', 'grad_times_inp',
        'guided_backprop'
      batch_size: Batch size for scoring

    Raises:
      Exception: if the model is not compatible with DeepLift or
        `output_layer` is not an integer
      KeyError: if `mxts_mode` is not one of the supported names
      IndexError: if no converted deeplift layer matches `output_layer`
    """
    import os
    import shutil
    from deeplift.conversion import kerasapi_conversion as kc
    from deeplift.layers import NonlinearMxtsMode

    if not preact:
        print("preact is not implemented yet")

    # Labels from examples:
    mxts_modes = {
        'rescale_conv_revealcancel_fc':
        NonlinearMxtsMode.DeepLIFT_GenomicsDefault,
        'revealcancel_all_layers': NonlinearMxtsMode.RevealCancel,
        'rescale_all_layers': NonlinearMxtsMode.Rescale,
        'grad_times_inp': NonlinearMxtsMode.Gradient,
        'guided_backprop': NonlinearMxtsMode.GuidedBackprop
    }

    self.model = model
    if not self.is_compatible(model):
        raise Exception("Model not compatible with DeepLift")

    self.task_idx = task_idx
    self.batch_size = batch_size

    # The model is round-tripped through files because the converter reads
    # from disk. A private directory from mkdtemp replaces tempfile.mktemp(),
    # whose returned names can be claimed by another process before they are
    # used, and lets both files be removed afterwards.
    tmp_dir = tempfile.mkdtemp()
    try:
        # No file extension, exactly like the names mktemp() produced.
        weight_f = os.path.join(tmp_dir, "weights")
        arch_f = os.path.join(tmp_dir, "arch")
        model.model.save_weights(weight_f)
        with open(arch_f, "w") as ofh:
            ofh.write(model.model.to_json())
        self.deeplift_model = kc.convert_model_from_saved_files(
            weight_f,
            json_file=arch_f,
            nonlinear_mxts_mode=mxts_modes[mxts_mode])
    finally:
        # Best-effort cleanup; never let it mask a conversion error.
        shutil.rmtree(tmp_dir, ignore_errors=True)

    # TODO this code may be useful for future when functional models can be handled too
    # (derive the input layer indices from the model's feed input names
    # instead of assuming a single input at index 0)
    self.input_layer_idxs = [0]
    self.output_layers_idxs = [-1]

    self.fwd_predict_fn = None

    # Now try to find the correct layer:
    if not isinstance(output_layer, int):
        raise Exception(
            "output_layer has to be an integer index of the Keras layer in the Keras model."
        )

    # TODO: DeepLIFT does not guarantee that the layer naming recapitulates the Keras layer order.
    if output_layer < 0:
        output_layer = len(model.model.layers) + output_layer
    matching_layer_idxs = [
        i for i, l in enumerate(self.deeplift_model.get_layers())
        if l.name == str(output_layer)
    ]
    if not matching_layer_idxs:
        # Same exception type as the bare [0] lookup used to raise, but with
        # a message that says what went wrong.
        raise IndexError(
            "No DeepLIFT layer named '%s' found for the requested "
            "output_layer" % output_layer)
    target_layer_idx = matching_layer_idxs[0]

    # Compile the function that computes the contribution scores
    # For sigmoid or softmax outputs, target_layer_idx should be -2 (the default)
    # (See "3.6 Choice of target layer" in https://arxiv.org/abs/1704.02685 for justification)
    # For regression tasks with a linear output, target_layer_idx should be -1
    # (which simply refers to the last layer)
    # If you want the DeepLIFT multipliers instead of the contribution scores, you can use get_target_multipliers_func
    self.deeplift_contribs_func = self.deeplift_model.get_target_contribs_func(
        find_scores_layer_idx=self.input_layer_idxs,
        target_layer_idx=target_layer_idx)
def plotPromoters():
    """Score promoter sequences with DeepLIFT and save one sequence logo per sequence.

    Command-line entry point; every input comes from `argparse` (see the
    argument help strings below). For each sequence in `--sequences` this

    * computes DeepLIFT contribution scores against `--N` different alignments
      of the `--background` sequences (the background array is rolled by one
      between rounds) and averages the rounds,
    * saves a logo weighted by those scores to
      `<outdir><fasta id>.<logoType>`, and
    * saves the averaged scores to `<outdir><fasta id>.txt`.

    `--outdir` is used as a plain string prefix, so it has to end with a path
    separator for the files to land inside that directory.

    Returns:
      None. Results are written to disk.
    """
    ########################
    #command line arguments#
    ########################

    parser = argparse.ArgumentParser()

    #PARAMETERS
    parser.add_argument(
        "--sequences",
        help="Full path to a fasta-file containing the promoter sequences.",
        type=str)
    parser.add_argument("--outdir", help="Full path to the output directory.")
    parser.add_argument(
        "--N",
        help=
        "How many references are used for averaging single signal sequence contributions.",
        type=int,
        default=10)
    parser.add_argument("--model",
                        help="Full path to the trained keras model.",
                        type=str,
                        default=None)
    parser.add_argument(
        "--background",
        help="Full path to a fasta-file containing the background sequences.",
        type=str)
    parser.add_argument("--target_layer",
                        help="Target layer index for deeplift (default=-3).",
                        type=int,
                        default=-3)
    parser.add_argument("--ylim",
                        help="Limits for y-axis.",
                        type=float,
                        nargs=2,
                        default=None)
    parser.add_argument(
        "--labels",
        help=
        "Full path to a file containing labels used as figure titles. If not given, use fasta IDs.",
        type=str,
        default=None)
    parser.add_argument("--logoType",
                        help="Logo image file extension (default=pdf).",
                        type=str,
                        default='pdf',
                        choices=['png', 'pdf'])

    args = parser.parse_args()

    #reading in the promoter sequences
    ids = []
    signal_seq = []
    for seq in pyfastx.Fasta(args.sequences):
        ids.append(seq.name)
        signal_seq.append(str(seq.seq).upper())
    #and one-hot encoding
    signal = np.array([vectorizeSequence(s) for s in signal_seq])

    #reading in the background sequences and one-hot encoding them
    bg = np.array([
        vectorizeSequence(str(seq.seq).upper())
        for seq in pyfastx.Fasta(args.background)
    ])

    #reading in labels if given
    if args.labels is not None:
        # One label per line, matched to the sequences by position. The line
        # terminator is stripped so it does not end up in the figure title.
        with open(args.labels, 'rt') as f:
            labels = [row.rstrip('\r\n') for row in f]
    else:
        labels = ids

    #initialize the deeplift model
    deeplift_model = kc.convert_model_from_saved_files(
        args.model,
        nonlinear_mxts_mode=deeplift.layers.NonlinearMxtsMode.
        DeepLIFT_GenomicsDefault)
    find_scores_layer_idx = 0  #computes importance scores for input layer input

    deeplift_contribs_func = deeplift_model.get_target_contribs_func(
        find_scores_layer_idx=find_scores_layer_idx,
        target_layer_idx=args.target_layer)

    #and then score each sequence against args.N different background sequences
    # NOTE(review): assumes the background file holds at least as many
    # sequences as the signal file - confirm with the data preparation step.
    scores = np.zeros(shape=(args.N, signal.shape[0], signal.shape[1]))
    for i in range(args.N):
        scores[i, :, :] = np.sum(deeplift_contribs_func(
            task_idx=1,
            input_data_list=[signal],
            input_references_list=[bg[:signal.shape[0], :, :]],
            batch_size=10,
            progress_update=None),
                                 axis=2)
        # shift the backgrounds so the next round pairs each sequence with a
        # different reference
        bg = np.roll(bg, 1, axis=0)
    scores = np.mean(scores, axis=0)

    #now the contributions have been calculated, next plotting the sequence logos weighted by the contributions
    for ind, seq in enumerate(signal_seq):
        matrix_df = lm.saliency_to_matrix(seq, scores[ind, :])
        logo = lm.Logo(df=matrix_df, color_scheme='classic')
        logo.ax.set_xlabel('position')
        logo.ax.set_ylabel('contribution')
        logo.ax.set_title(labels[ind])
        if args.ylim is not None:
            logo.ax.set_ylim(args.ylim)

        # Work on the figure the logo is drawn on and close exactly that one,
        # so open figures do not pile up across sequences.
        logo_fig = logo.ax.figure
        logo_fig.tight_layout()
        logo_fig.savefig(args.outdir + ids[ind] + '.' + args.logoType,
                         dpi=150,
                         bbox_inches='tight',
                         pad_inches=0)
        plt.close(logo_fig)

        #and then saving the importance scores to a file
        np.savetxt(args.outdir + ids[ind] + '.txt', scores[ind, :])
# compute_deeplift_scores: DeepLIFT-based feature selection followed by model reduction.
# Parameter and return documentation lives in the docstring inside the function.
#
# Reviewer notes on what the body does:
#   * Loads the Keras model from `keras_model_file` and converts the same file to a
#     DeepLIFT model (NonlinearMxtsMode.DeepLIFT_GenomicsDefault).
#   * Masks X via get_masked_data(X, mask), collapses Y to class indices with
#     np.argmax(Y, axis=1), and builds the reference with
#     get_reference('average', reference_label, X_masked, Y).
#   * For every DeepLIFT layer whose type name is 'Dense' or 'Input', scores the samples
#     with Y == non_reference_label (task_idx=base_neuron_label, target_layer_idx=-2,
#     batch_size=10), sums the per-sample scores, takes the absolute value and collects
#     the result in `layer_scores`. Layers named 'NoOp' or 'Softmax' contribute an empty
#     list.
#   * alpha = 1 - percentage_cutoff / threshold is handed to compute_new_reduced_model
#     together with `layer_scores`, `mapping`, `cluster_mask` and `flags`.
#   * Side effects: writes three CSV files under TARGET_DIRECTORY + './important_features/',
#     saves the reduced model under TARGET_DIRECTORY + './reduced_models/', and DELETES
#     `keras_model_file` with os.remove() - the deletion is not mentioned in the docstring.
#   * `gpu_id` is accepted but not referenced anywhere in the body.
#   * get_masked_data, get_reference, plot_hist, compute_new_reduced_model,
#     get_padded_data, create_matrix, mkdir and corr_mx_flatten_single are defined
#     outside this block.
# NOTE(review): line breaks and indentation were lost in this copy, so the nesting of the
# statements that follow `if sum_scores.shape[0] > 2:` and `if layer_idx == 0:` cannot be
# read off from here - confirm against the original layout before restructuring.
def compute_deeplift_scores( TARGET_DIRECTORY, dataset, X, Y, keras_model_file, reference_label, non_reference_label, base_neuron_label, mask, gpu_id, dropout, threshold, percentage_cutoff, cluster_mask, flags): """ Wrapper function for model reduction, called by main.py Uses DeepLIFT to compute saliency scores for feature selection, with the average data used as reference See https://github.com/kundajelab/deeplift for DeepLIFT implementation Inputs: - TARGET_DIRECTORY: general directory path to write files to (str) - dataset: choice of dataset, along with seed and fold number (str) - X: Numpy array containing data matrices - Y: Numpy array containing data labels - keras_model_file: name of existing model file (str) - reference_label: how the reference class is represented in Y, usually 0 (int) - non_reference_label: how the other class(es) is (are) represented in Y, usually 1 (int) - base_neuron_label: label to be used as the base, usually 0 (int) - mask: Numpy array, usually initialised as all 1s unless neurons are repeatedly removed - gpu_id: ID of GPU to use (int) - dropout: fraction of neurons to turn off (float) - threshold: usually set as 1.0, represents the previous percentage_cutoff when repeatedly removing neurons (float) - percentage_cutoff: usually set as 0.95 to keep 5% of the most significant features (float) - cluster_mask: Numpy array containing mask obtained from CLIP - flags: used to vary model settings, see main.py (dict) Returns: - new_model_file: directory path to the new model (str) - mask_2D_flattened: Numpy array of 1s and 0s, with 1 representing a selected feature """ keras_model = keras.models.load_model(keras_model_file) print(keras_model.summary()) # original model deeplift_model = kc.convert_model_from_saved_files( keras_model_file, nonlinear_mxts_mode=deeplift.layers.NonlinearMxtsMode.DeepLIFT_GenomicsDefault) print(deeplift_model.get_layers()) mode = 'average' X_masked, mapping = get_masked_data(X, mask) Y = np.argmax(Y, axis=1) 
reference = get_reference(mode, reference_label, X_masked, Y) print('+++++++++++++ Computing DeepLIFT scores ++++++++++++++') print('previous threshold', threshold, 'new threshold', percentage_cutoff) find_scores_layer_idx = 0 input_scores = np.zeros(X_masked[Y == non_reference_label].shape) layer_scores = [] task_id = base_neuron_label for layer_idx, layer in enumerate(deeplift_model.get_layers()): if type(layer).__name__ == 'Dense' or type(layer).__name__ == 'Input': deeplift_contribs_func = deeplift_model.get_target_contribs_func(find_scores_layer_idx=layer_idx, target_layer_idx=-2) scores = np.array(deeplift_contribs_func(task_idx=task_id, input_references_list=reference, input_data_list=[X_masked[Y == non_reference_label]], batch_size=10, progress_update=50)) sum_scores = np.zeros(scores.shape[1]) for score in scores: sum_scores += score sum_scores = np.absolute(sum_scores) if sum_scores.shape[0] > 2: plot_hist(TARGET_DIRECTORY, sum_scores, dataset + '_t_' + str(threshold) + '_layer_' + str(layer_idx)) print('layer', layer_idx, 'type is: ', type(layer), 'scores dimensions are: ', scores.shape, 'sum_scores', sum_scores.shape) layer_scores.append(sum_scores) if layer_idx == 0: input_scores = np.square(scores) layer_scores.append([]) elif type(layer).__name__ == 'NoOp' or type(layer).__name__ == 'Softmax': layer_scores.append([]) print('layer', layer_idx, 'type is: ', type(layer).__name__) alpha = (1 - (percentage_cutoff/threshold)) new_model, mask_2D = compute_new_reduced_model(keras_model, dropout, layer_scores, 2, alpha, mapping, cluster_mask, flags) input_sum_scores = np.zeros(X_masked.shape[1]) for input_score in input_scores: input_sum_scores += input_score padded_sum_scores = get_padded_data(input_sum_scores, mapping) full_matrix = create_matrix(padded_sum_scores) mkdir(TARGET_DIRECTORY + './important_features/') np.savetxt( TARGET_DIRECTORY + './important_features/' + dataset + '_scores_deeplift_reduced_r_' + str(threshold) + '_t_' + 
str(percentage_cutoff) + '.csv', np.transpose(np.array(input_scores)), delimiter= ',') np.savetxt( TARGET_DIRECTORY + './important_features/' + dataset + '_scores_reshaped_reduced_r_' + str(threshold) + '_t_' + str(percentage_cutoff) + '.csv', full_matrix, delimiter=",") np.savetxt( TARGET_DIRECTORY + './important_features/' + dataset + '_deeplift_features_nodes_r_' + str(threshold) + '_t_' + str(percentage_cutoff) + '.csv', mask_2D) mkdir(TARGET_DIRECTORY + './reduced_models/') new_model_file = TARGET_DIRECTORY + './reduced_models/' + dataset + '_from_' + str(threshold) + '_to_' + str(percentage_cutoff) + '.h5' new_model.save(new_model_file) os.remove(keras_model_file) mask_2D_flattened = corr_mx_flatten_single(mask_2D) return new_model_file, mask_2D_flattened
# MNIST walkthrough: load a trained Keras CNN, convert it to DeepLIFT under
# three different multiplier modes, and compile a forward-prediction function
# for the model that is currently selected.
# (Alternative checkpoint kept for reference: 'keras2_mnist_cnn_allconv.h5')
import numpy as np
from keras import backend as K
from keras.datasets import mnist

import deeplift
from deeplift.conversion import kerasapi_conversion as kc
from deeplift.layers import NonlinearMxtsMode
from deeplift.util import compile_func

saved_model_file = 'mnist_my_cnn_model.h5'
keras_model = keras.models.load_model(saved_model_file)
keras_model.summary()

# Add a trailing axis of length one to the test images.
(X_train, y_train), (X_test, y_test) = mnist.load_data()
X_test = X_test[..., np.newaxis]

#Three different models, one each for RevealCancel, Gradient and GuidedBackprop
revealcancel_model, grad_model, guided_backprop_model = (
    kc.convert_model_from_saved_files(h5_file=saved_model_file,
                                      nonlinear_mxts_mode=mxts_mode)
    for mxts_mode in (NonlinearMxtsMode.RevealCancel,
                      NonlinearMxtsMode.Gradient,
                      NonlinearMxtsMode.GuidedBackprop))

# Forward pass of the selected DeepLIFT model: activations of the first layer
# in, activations of the last layer out.
deeplift_model = revealcancel_model
_first_layer = deeplift_model.get_layers()[0]
_last_layer = deeplift_model.get_layers()[-1]
deeplift_prediction_func = compile_func(
    [_first_layer.get_activation_vars()],
    _last_layer.get_activation_vars())
import deeplift from deeplift.conversion import kerasapi_conversion as kc from deeplift.blobs import NonlinearMxtsMode from deeplift.util import get_integrated_gradients_function """ Created by Mohsen Naghipourfar on 6/14/18. Email : [email protected] or [email protected] Website: http://ce.sharif.edu/~naghipourfar Github: https://github.com/naghipourfar Skype: mn7697np """ # Code = keras.models.load_model("./classifier.h5") deeplift_model = kc.convert_model_from_saved_files( "./classifier-noBatchNorm-noGaussian.h5", nonlinear_mxts_mode=NonlinearMxtsMode.Gradient) print(deeplift_model.get_name_to_layer().keys()) gradient_function = deeplift_model.get_target_multipliers_func( find_scores_layer_name="input_1_0", pre_activation_target_layer_name="preact_dense_5_0") integrated_gradient_5 = get_integrated_gradients_function(gradient_function, 5) x = pd.read_csv("../Data/fpkm_normalized.csv", header=None) for task_idx in range(1): print("\tComputing scores for task: " + str(task_idx)) scores = np.array( integrated_gradient_5(task_idx=task_idx,
file = open('Results/PICKLE2020-05-26215517', 'rb') tested_IDs = pickle.load(file)[0] testing_indices = [geneIDs.index(gene) for gene in tested_IDs] test_data = one_hot_seq[testing_indices] test_data_shuffled = one_hot_dinuc_shuff_seq[testing_indices] test_categories = np.argmax(categories[testing_indices], axis=1) # Load model and make predictions model = models.load_model('Results/model2020-05-26215517.h5') predictions = np.argmax(model.predict(test_data), axis=1) print('Predictions done') print(predictions) deeplift_model =\ kc.convert_model_from_saved_files('Results/model2020-05-26215517.h5', nonlinear_mxts_mode=NonlinearMxtsMode.DeepLIFT_GenomicsDefault) deeplift_contribs_func = deeplift_model.get_target_contribs_func( find_scores_layer_idx=0, target_layer_idx=-2) print('deeplift sound') # calculate deeplift for each gene (only tp and tn) tp_data = [] tn_data = [] tp_shuf_data = [] tn_shuf_data = [] for j, indx in enumerate(testing_indices): test_gene = geneIDs[indx] if test_categories[j] == 1 and predictions[j] == 1: