def test_tf_eager():
    """ This is a basic eager example from keras. """
    tf = pytest.importorskip('tensorflow')
    if LooseVersion(tf.__version__) >= LooseVersion("2.4.0"):
        pytest.skip("Deep explainer does not work for TF 2.4 in eager mode.")

    x = pd.DataFrame({"B": np.random.random(size=(100,))})
    y = x.B
    y = y.map(lambda zz: chr(int(zz * 2 + 65))).str.get_dummies()

    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(10, input_shape=(x.shape[1],), activation="relu"))
    model.add(tf.keras.layers.Dense(y.shape[1], input_shape=(10,), activation="softmax"))
    model.summary()
    model.compile(loss="categorical_crossentropy", optimizer="Adam")
    model.fit(x.values, y.values, epochs=2)

    e = DeepExplainer(model, x.values[:1])
    sv = e.shap_values(x.values)
    assert np.abs(e.expected_value[0] + sv[0].sum(-1) - model(x.values)[:, 0]).max() < 1e-4
def test_tf_eager():
    """ This is a basic eager example from keras. """
    _skip_if_no_tensorflow()
    import pandas as pd
    import numpy as np
    import tensorflow as tf
    from shap import DeepExplainer

    x = pd.DataFrame({"B": np.random.random(size=(100,))})
    y = x.B
    y = y.map(lambda zz: chr(int(zz * 2 + 65))).str.get_dummies()

    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(10, input_shape=(x.shape[1],), activation="relu"))
    model.add(tf.keras.layers.Dense(y.shape[1], input_shape=(10,), activation="softmax"))
    model.summary()
    model.compile(loss="categorical_crossentropy", optimizer="Adam")
    model.fit(x.values, y.values, epochs=2)

    e = DeepExplainer(model, x.values[:1])
    sv = e.shap_values(x.values)
    assert np.abs(e.expected_value[0] + sv[0].sum(-1) - model(x.values)[:, 0]).max() < 1e-4
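Both variants of `test_tf_eager` above exercise the same additivity property of `DeepExplainer`: the base value plus the per-feature SHAP values should reconstruct the model output. A minimal standalone sketch of that check, assuming `e` is an already-fitted `DeepExplainer`, `model` is the Keras model it wraps, and `x` is a NumPy feature matrix (names are illustrative, not from the original tests):

```python
import numpy as np

def check_shap_additivity(e, model, x, output_idx=0, tol=1e-4):
    """Return True if expected_value + summed SHAP values reconstruct the model output."""
    sv = e.shap_values(x)  # list of arrays, one per model output
    reconstructed = e.expected_value[output_idx] + sv[output_idx].sum(axis=-1)
    model_out = np.asarray(model(x))[:, output_idx]
    return np.abs(reconstructed - model_out).max() < tol
```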
class ShapImageExplanier(ImageExplainer):
    def __init__(self, model, label, dataset):
        images = []
        length = len(dataset)
        # sample up to 100 images from the dataset as SHAP background data
        idxs = np.random.choice(length, min(100, length), replace=False)
        for i in idxs:
            images.append(dataset[i]['image'])
        background_data = torch.stack(images)
        if torch.cuda.is_available():
            background_data = background_data.cuda()
        self.label = label
        self.model = model
        self.explainer = DeepExplainer(self.model.model, background_data[:25])

    def explain(self, instance, budget):
        instance = instance.unsqueeze(0)
        _, c, w, h = instance.shape
        shap_values = self.explainer.shap_values(instance)[self.label][0]
        # collapse the channel axis into a single 2D importance map
        explanation_2d = np.sum(np.abs(shap_values), axis=0)
        top_percentile = np.percentile(explanation_2d, 100 - budget)
        # only return pixels above the percentile, as a binary mask
        explanation_2d[explanation_2d < top_percentile] = 0.0
        explanation_2d[explanation_2d >= top_percentile] = 1.0
        return explanation_2d.astype(np.float32)
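A hypothetical usage sketch for `ShapImageExplanier`; `wrapped_model`, `train_dataset`, and `sample_image` are illustrative names, and `train_dataset` is assumed to be indexable and return dicts with an `'image'` tensor, as `__init__` above expects:

```python
# Hypothetical usage (names are illustrative, not from the original code):
explainer = ShapImageExplanier(model=wrapped_model, label=3, dataset=train_dataset)
# Binary mask keeping roughly the top `budget` percent of pixels by |SHAP| magnitude
mask = explainer.explain(instance=sample_image, budget=5)
print(mask.shape, mask.sum())
```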
def fit(self, explainer, new_observation, shap_explainer_type=None, **kwargs):
    """Calculate the result of explanation

    Fit method makes calculations in place and changes the attributes.

    Parameters
    -----------
    explainer : Explainer object
        Model wrapper created using the Explainer class.
    new_observation : pd.Series or np.ndarray
        An observation for which a prediction needs to be explained.
    shap_explainer_type : {'TreeExplainer', 'DeepExplainer', 'GradientExplainer', 'LinearExplainer', 'KernelExplainer'}
        String name of the Explainer class (default is `None`, which automatically
        chooses an Explainer to use).
    kwargs : dict
        Keyword parameters passed to the `shap_values` method.

    Returns
    -----------
    None
    """
    from shap import TreeExplainer, DeepExplainer, GradientExplainer, LinearExplainer, KernelExplainer

    checks.check_compatibility(explainer)
    shap_explainer_type = checks.check_shap_explainer_type(shap_explainer_type, explainer.model)

    if self.type == 'predict_parts':
        new_observation = checks.check_new_observation_predict_parts(new_observation, explainer)

    if shap_explainer_type == "TreeExplainer":
        try:
            self.shap_explainer = TreeExplainer(explainer.model, explainer.data.values)
        except Exception:
            # https://github.com/ModelOriented/DALEX/issues/371
            self.shap_explainer = TreeExplainer(explainer.model)
    elif shap_explainer_type == "DeepExplainer":
        self.shap_explainer = DeepExplainer(explainer.model, explainer.data.values)
    elif shap_explainer_type == "GradientExplainer":
        self.shap_explainer = GradientExplainer(explainer.model, explainer.data.values)
    elif shap_explainer_type == "LinearExplainer":
        self.shap_explainer = LinearExplainer(explainer.model, explainer.data.values)
    elif shap_explainer_type == "KernelExplainer":
        self.shap_explainer = KernelExplainer(lambda x: explainer.predict(x), explainer.data.values)

    self.result = self.shap_explainer.shap_values(new_observation.values, **kwargs)

    self.new_observation = new_observation
    self.shap_explainer_type = shap_explainer_type
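A sketch of how this `fit` method might be driven end to end, assuming the surrounding class is dalex's `ShapWrapper` and that a scikit-learn model is being explained; the variable names and the training data are illustrative assumptions, not from the original code:

```python
# Illustrative only; assumes dalex and scikit-learn are installed,
# and that X_train / y_train already exist.
import dalex as dx
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier().fit(X_train, y_train)
exp = dx.Explainer(clf, X_train, y_train)
sw = ShapWrapper('predict_parts')   # assumed to be the class this fit() belongs to
sw.fit(exp, X_train.iloc[[0]], shap_explainer_type="KernelExplainer", nsamples=100)
print(sw.result)
```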
def get_partials(filter_id, model, conv_layer_idx, node, ref_samples, contribution_data,
                 samples_chunk, input_reads, intermediate_diff, pad_left, pad_right,
                 lstm=False, check_additivity=False):
    num_reads = len(input_reads)
    if filter_id is None:
        return [], []
    read_ids = []
    scores_pt_all = []
    print("Processing filter: {}".format(filter_id))
    if contribution_data[filter_id] is None or not (len(contribution_data[filter_id]) > 0):
        return [], []
    for seq_id in tqdm(range(num_reads)):
        read_id = re.search("seq_[0-9]+", input_reads[seq_id].id).group()
        read_id = int(read_id.replace("seq_", ""))
        read_ids.append(read_id)
        if contribution_data[filter_id][seq_id] is None or not (len(contribution_data[filter_id][seq_id]) > 0):
            scores_pt_all.append(None)
            continue
        out = model.get_layer(index=conv_layer_idx).get_output_at(node)
        if lstm:
            out = out[:, filter_id:filter_id + 1]
        else:
            out = out[:, contribution_data[filter_id][seq_id][1][0], filter_id:filter_id + 1]
        explainer_nt = DeepExplainer((model.get_layer(index=0).input, out), ref_samples)
        sample = samples_chunk[seq_id, :, :].reshape((1, ref_samples.shape[1], ref_samples.shape[2]))
        # Get difference in activation of the intermediate neuron
        if lstm:
            diff = intermediate_diff[seq_id, filter_id]
        else:
            diff = intermediate_diff[seq_id, contribution_data[filter_id][seq_id][1][0], filter_id]
        scores_nt = explainer_nt.shap_values(sample, check_additivity=check_additivity)[0]
        partials = np.asarray([phi_i * contribution_data[filter_id][seq_id][2][0]
                               for phi_i in scores_nt]) / diff
        partials = partials.reshape(partials.shape[1], partials.shape[2])
        # Sum along the channel (nt) axis and pad
        scores_pt_pad = np.sum(partials, axis=1)
        scores_pt_pad = np.pad(scores_pt_pad, (pad_left, pad_right), 'constant', constant_values=0.0)
        if node == 1:
            scores_pt_pad = scores_pt_pad[::-1]
        scores_pt_all.append(scores_pt_pad)
    return scores_pt_all, read_ids
def get_filter_contribs(args, allow_eager=False):
    """Calculate DeepLIFT contribution scores for all neurons in the convolutional layer
    and extract all motifs for which a filter neuron got a non-zero contribution score."""
    if tf.executing_eagerly() and not allow_eager:
        print("Using SHAP. Disabling eager execution...")
        tf.compat.v1.disable_v2_behavior()
        set_mem_growth()
    model = load_model(args.model)
    max_only = args.partial or args.easy_partial or not args.all_occurrences
    check_additivity = not args.no_check

    if args.w_norm and not args.do_lstm:
        print("Create model with mean-centered weight matrices ...")
        conv_layer_idx = [idx for idx, layer in enumerate(model.layers) if "Conv1D" in str(layer)][0]
        kernel_normed, bias_normed = normalize_filter_weights(
            model.get_layer(index=conv_layer_idx).get_weights()[0],
            model.get_layer(index=conv_layer_idx).get_weights()[1])
        model.get_layer(index=conv_layer_idx).set_weights([kernel_normed, bias_normed])
        path = args.model
        if re.search(r"\.h5$", path) is not None:
            path = re.sub(r"\.h5$", "", path)
        norm_path = path + "_w_norm.h5"
        model.save(norm_path)
        args.model = norm_path

    # extract some model information
    if args.do_lstm:
        conv_layer_idx = [idx for idx, layer in enumerate(model.layers)
                          if "Bidirectional" in str(layer)][args.inter_layer - 1]
        n_filters = model.get_layer(index=conv_layer_idx).get_output_at(0).shape[-1]
        input_layer_id = [idx for idx, layer in enumerate(model.layers) if "Input" in str(layer)][0]
        motif_length = model.get_layer(index=input_layer_id).get_output_at(0).shape[1]
        pad_left = 0
        pad_right = 0
    else:
        conv_layer_ids = [idx for idx, layer in enumerate(model.layers) if "Conv1D" in str(layer)]
        conv_layer_idx = conv_layer_ids[args.inter_layer - 1]
        motif_length = get_rf_size(model, conv_layer_idx)
        n_filters = model.get_layer(index=conv_layer_idx).get_weights()[0].shape[-1]
        pad_left = (motif_length - 1) // 2
        pad_right = motif_length - 1 - pad_left

    print(model.summary())

    print("Loading test data (.npy) ...")
    test_data_set_name = os.path.splitext(os.path.basename(args.test_data))[0]
    samples = np.load(args.test_data, mmap_mode='r')
    total_num_reads = samples.shape[0]
    len_reads = samples.shape[1]

    print("Loading test data (.fasta) ...")
    nonpatho_reads = list(SeqIO.parse(args.nonpatho_test, "fasta"))
    patho_reads = list(SeqIO.parse(args.patho_test, "fasta"))
    reads = nonpatho_reads + patho_reads
    for idx, r in enumerate(reads):
        r.id = test_data_set_name + "_seq_" + str(idx) + "_" + os.path.basename(r.id)
        r.description = test_data_set_name + "_seq_" + str(idx) + "_" + os.path.basename(r.description)
        r.name = test_data_set_name + "_seq_" + str(idx) + "_" + os.path.basename(r.name)

    print("Padding reads ...")
    reads = ["N" * pad_left + r + "N" * pad_right for r in reads]
    assert len(reads) == total_num_reads, \
        "Test data in .npy format and fasta files contain a different number of reads!"
    # create output directory and subdirectories
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    if not os.path.exists(args.out_dir + "/filter_scores/"):
        os.makedirs(args.out_dir + "/filter_scores/")
    if not os.path.exists(args.out_dir + "/fasta/"):
        os.makedirs(args.out_dir + "/fasta/")
    if (args.partial or args.easy_partial) and not os.path.exists(args.out_dir + "/nuc_scores/"):
        os.makedirs(args.out_dir + "/nuc_scores/")

    # load or create reference sequences
    ref_samples = get_reference_seqs(args, len_reads)
    num_ref_seqs = ref_samples.shape[0]

    print("Running DeepSHAP ...")
    chunk_size = args.chunk_size // num_ref_seqs
    i = 0

    if tf.executing_eagerly():
        intermediate_model = tf.keras.Model(
            model.inputs,
            (model.get_layer(index=conv_layer_idx).get_output_at(0),
             model.get_layer(index=conv_layer_idx).get_output_at(1)))

        def map2layer(input_samples):
            out = intermediate_model(input_samples, training=False)
            return out[0].numpy(), out[1].numpy()

        intermediate_ref_fwd, intermediate_ref_rc = map2layer(ref_samples)
    else:
        def map2layer(input_samples, layer, out_node):
            feed_dict = dict(zip([model.get_layer(index=0).input], [input_samples]))
            return tf.compat.v1.keras.backend.get_session().run(
                model.get_layer(index=layer).get_output_at(out_node), feed_dict)

        intermediate_ref_fwd = map2layer(ref_samples, conv_layer_idx, 0)
        intermediate_ref_rc = map2layer(ref_samples, conv_layer_idx, 1)

    intermediate_ref_fwd = intermediate_ref_fwd.mean(axis=0, keepdims=True)
    intermediate_ref_rc = intermediate_ref_rc.mean(axis=0, keepdims=True)

    explainer = DeepExplainer(
        ([model.get_layer(index=conv_layer_idx).get_output_at(0),
          model.get_layer(index=conv_layer_idx).get_output_at(1)],
         model.layers[-1].output),
        [intermediate_ref_fwd, intermediate_ref_rc])

    filter_range = range(n_filters)
    if args.inter_neuron is not None:
        filter_range = [None] * n_filters
        for n in args.inter_neuron:
            filter_range[n] = n

    while i < total_num_reads:
        print("Done " + str(i) + " from " + str(total_num_reads) + " sequences")
        samples_chunk = samples[i:i + chunk_size, :, :]
        reads_chunk = reads[i:i + chunk_size]

        if tf.executing_eagerly():
            # map the current chunk of samples to the intermediate layer
            intermediate_fwd, intermediate_rc = map2layer(samples_chunk)
        else:
            intermediate_fwd = map2layer(samples_chunk, conv_layer_idx, 0)
            intermediate_rc = map2layer(samples_chunk, conv_layer_idx, 1)
        inter_diff_fwd = intermediate_fwd - intermediate_ref_fwd
        inter_diff_rc = intermediate_rc - intermediate_ref_rc

        scores_filter = explainer.shap_values(
            [intermediate_fwd, intermediate_rc], check_additivity=check_additivity)
        # shape: [num_reads, len_reads, n_filters]
        scores_fwd, scores_rc = scores_filter[0]

        print("Getting data ...")
        # for each filter do:
        if args.do_lstm:
            dat_fwd = [get_lstm_data(i, scores_filter_avg=scores_fwd, input_reads=reads_chunk,
                                     motif_len=motif_length) for i in filter_range]
            dat_rc = [get_lstm_data(i, scores_filter_avg=scores_rc, input_reads=reads_chunk,
                                    motif_len=motif_length, rc=True) for i in filter_range]
        else:
            dat_fwd = [get_filter_data(i, scores_filter_avg=scores_fwd, input_reads=reads_chunk,
                                       motif_len=motif_length, max_only=max_only) for i in filter_range]
            dat_rc = [get_filter_data(i, scores_filter_avg=scores_rc, input_reads=reads_chunk,
                                      motif_len=motif_length, rc=True, max_only=max_only)
                      for i in filter_range]

        if max_only:
            dat_max = [get_max_strand(i, dat_fwd=dat_fwd, dat_rc=dat_rc) for i in filter_range]
            contrib_dat_fwd, motif_dat_fwd, contrib_dat_rc, motif_dat_rc = list(zip(*dat_max))
        else:
            contrib_dat_fwd, motif_dat_fwd = list(zip(*dat_fwd))
            contrib_dat_rc, motif_dat_rc = list(zip(*dat_rc))

        print("Saving data ...")
        if contrib_dat_fwd:
            for f in filter_range:
                write_filter_data(f, contribution_data=contrib_dat_fwd, motifs=motif_dat_fwd,
                                  out_dir=args.out_dir, data_set_name=test_data_set_name)
        if contrib_dat_rc:
            for f in filter_range:
                write_filter_data(f, contribution_data=contrib_dat_rc, motifs=motif_dat_rc,
                                  out_dir=args.out_dir, data_set_name=test_data_set_name)

        if args.partial:
            print("Getting partial data ...")
            partials_nt_fwd = [get_partials(i, model=model, conv_layer_idx=conv_layer_idx, node=0,
                                            ref_samples=ref_samples, contribution_data=contrib_dat_fwd,
                                            samples_chunk=samples_chunk, input_reads=reads_chunk,
                                            intermediate_diff=inter_diff_fwd, pad_left=pad_left,
                                            pad_right=pad_right, lstm=args.do_lstm,
                                            check_additivity=check_additivity) for i in filter_range]
            partials_nt_rc = [get_partials(i, model=model, conv_layer_idx=conv_layer_idx, node=1,
                                           ref_samples=ref_samples, contribution_data=contrib_dat_rc,
                                           samples_chunk=samples_chunk, input_reads=reads_chunk,
                                           intermediate_diff=inter_diff_rc, pad_left=pad_left,
                                           pad_right=pad_right, lstm=args.do_lstm,
                                           check_additivity=check_additivity) for i in filter_range]
        elif args.easy_partial:
            print("Getting partial data ...")
            partials_nt_fwd = [get_easy_partials(i, model=model, conv_layer_idx=conv_layer_idx, node=0,
                                                 contribution_data=contrib_dat_fwd,
                                                 samples_chunk=samples_chunk, input_reads=reads_chunk,
                                                 intermediate_diff=inter_diff_fwd, pad_left=pad_left,
                                                 pad_right=pad_right) for i in filter_range]
            partials_nt_rc = [get_easy_partials(i, model=model, conv_layer_idx=conv_layer_idx, node=1,
                                                contribution_data=contrib_dat_rc,
                                                samples_chunk=samples_chunk, input_reads=reads_chunk,
                                                intermediate_diff=inter_diff_rc, pad_left=pad_left,
                                                pad_right=pad_right) for i in filter_range]

        if args.partial or args.easy_partial:
            scores_nt_fwd, read_ids_fwd = list(zip(*partials_nt_fwd))
            scores_nt_rc, read_ids_rc = list(zip(*partials_nt_rc))
            print("Saving partial data ...")
            if scores_nt_fwd:
                for f in filter_range:
                    write_partial_data(f, read_ids=read_ids_fwd, contribution_data=contrib_dat_fwd,
                                       scores_input_pad=scores_nt_fwd, out_dir=args.out_dir,
                                       data_set_name=test_data_set_name, motif_len=motif_length)
            if scores_nt_rc:
                for f in filter_range:
                    write_partial_data(f, read_ids=read_ids_rc, contribution_data=contrib_dat_rc,
                                       scores_input_pad=scores_nt_rc, out_dir=args.out_dir,
                                       data_set_name=test_data_set_name, motif_len=motif_length)

        i += chunk_size
        print("Done " + str(min(i, total_num_reads)) + " from " + str(total_num_reads) + " sequences")
y_pred = model(train_feat)
y_pred = torch.sigmoid((y_pred - sig_cent) / damp_factor)
print('Training Score:')
print(criterion(y_pred, train_target))
print(y_pred)
t2 = time.time()
print('Time to Train: ', str(t2 - t1))

background_size = 10000
n_analyses = 250
background = train_feat[np.random.randint(0, len(train_feat), size=background_size)]
exp = DeepExplainer(model, background)
analyses = eval_feat[np.random.randint(0, len(eval_feat), size=n_analyses)]
shap_values = exp.shap_values(analyses)
df = pd.DataFrame(shap_values, columns=[
    'd_1', 'd_2', 'd_3', 'd_4', 'd_5',
    'f_64', 'f_256', 'f_1024', 'f_4096', 'f_16384',
    'r_64', 'r_256', 'r_1024', 'delta'
])
mins = []
maxes = []
means = []
meds = []
def nt_map(args, allow_eager=False):
    """Create bedgraph files per genome which show the pathogenicity prediction score
    over all genomic positions."""
    # create output directory
    if not os.path.exists(args.out_dir):
        os.makedirs(args.out_dir)
    ref_samples = get_reference_seqs(args, args.read_length)

    if tf.executing_eagerly() and not allow_eager:
        print("Using SHAP. Disabling eager execution...")
        tf.compat.v1.disable_v2_behavior()
        set_mem_growth()
    model = load_model(args.model)
    if args.gradient:
        explainer = GradientExplainer(model, ref_samples)
    else:
        explainer = DeepExplainer(model, ref_samples)
    check_additivity = not args.no_check

    # for each fragmented genome do
    for fragments_file in os.listdir(args.dir_fragmented_genomes):
        if fragments_file.endswith(".fasta") or fragments_file.endswith(".fna"):
            genome = os.path.splitext(os.path.basename(fragments_file))[0]
            print("Processing " + genome + " ...")

            # load fragments in fasta format
            tokenizer = Tokenizer(char_level=True)
            tokenizer.fit_on_texts('ACGT')
            fragments = list(SeqIO.parse(args.dir_fragmented_genomes + "/" + fragments_file, "fasta"))
            num_fragments = len(fragments)
            records = np.array([tokenizer.texts_to_matrix(record.seq).astype("int32")[:, 1:]
                                for record in fragments])

            chunk_size = args.chunk_size
            i = 0
            scores_nt_chunks = []
            while i < num_fragments:
                if args.gradient:
                    contribs_chunk = explainer.shap_values(records[i:i + chunk_size, :])[0]
                else:
                    contribs_chunk = explainer.shap_values(records[i:i + chunk_size, :],
                                                           check_additivity=check_additivity)[0]
                scores_nt_chunk = np.sum(contribs_chunk, axis=-1)
                scores_nt_chunks.append(scores_nt_chunk)
                i = i + chunk_size
                print("Done " + str(min(i, num_fragments)) + " from " + str(num_fragments) + " sequences")
            scores_nt = np.vstack(scores_nt_chunks)

            # load genome size
            genome_info_file = args.genomes_dir + "/" + re.split("_fragmented_genomes", genome)[0] + ".genome"
            if not os.path.isfile(genome_info_file):
                print("Skipping " + genome + " since .genome file is missing!")
                continue
            genome_info = pd.read_csv(genome_info_file, sep="\t", index_col=0, header=None)

            # prepare output table
            df = pd.DataFrame()
            # save pathogenicity score for each nucleotide of all contigs of that genome
            genome_patho_dict = OrderedDict()
            # count by how many reads each nucleotide is covered
            genome_read_counter_dict = OrderedDict()

            # build bedgraph file representing pathogenicity over the genome
            for fragment_idx in range(num_fragments):
                seq_name, start_f, end_f = re.split(r":|\.\.", fragments[fragment_idx].id)
                contig_len = int(genome_info.loc[seq_name])
                start = max(0, int(start_f))
                end = min(int(end_f), contig_len)
                if seq_name not in genome_patho_dict:
                    genome_patho_dict[seq_name] = np.zeros(contig_len)
                    genome_read_counter_dict[seq_name] = np.zeros(contig_len)
                try:
                    genome_patho_dict[seq_name][start:end] += \
                        scores_nt[fragment_idx, start - int(start_f):end - int(start_f)]
                except ValueError as err:
                    print(err)
                    print("Error. Please check if the genome length matches its description "
                          "in the .genome/.gff3 file.")
                    break
                genome_read_counter_dict[seq_name][start:end] += 1

            for seq_name, genome_read_counter in genome_read_counter_dict.items():
                # compute mean pathogenicity score per nucleotide
                genome_patho_dict[seq_name] /= genome_read_counter
                # convert the array of nucleotide pathogenicity scores to intervals (-> bedgraph format)
                scores = genome_patho_dict[seq_name]
                interval_starts = np.arange(scores.shape[0], dtype='int32')
                interval_ends = np.arange(scores.shape[0], dtype='int32') + 1
                df_s = pd.DataFrame(OrderedDict((('seq_name', [seq_name] * scores.shape[0]),
                                                 ('start', interval_starts),
                                                 ('end', interval_ends),
                                                 ('score', scores))))
                df = pd.concat([df, df_s], ignore_index=True)

            # save results
            out_file = args.out_dir + "/" + genome + "_nt_contribs_map.bedgraph"
            df[['start', 'end']] = df[['start', 'end']].astype(int)
            df.to_csv(out_file, sep="\t", index=False, header=False)