def score(self, input_batch): """ Args: input_batch: Input batch that should be scored. Returns: A list of length: len(`scores`). Every element of the list is a stacked list of depth D if the model input is D-dimensional with identcal shape. Every entry of that list then contains the scores of the model output selected by `output_sel_fn`. Values are `None` if the input_batch already had a `1` at that position. """ ref = self.model.predict_on_batch(input_batch) scores = [] for sample_i in range( len(get_model_input(input_batch, self.model_input))): # get the full set of model inputs for the selected sample sample_set = get_dataset_item(input_batch, sample_i) # get the reference output for this sample ref_sample_pred = get_dataset_item(ref, sample_i) # Apply the output selection function if defined if self.output_sel_fn is not None: ref_sample_pred = self.output_sel_fn(ref_sample_pred) # get the one-hot encoded reference input array input_sample = get_model_input(sample_set, input_id=self.model_input) # where we keep the scores - scores are lists (ordered by diff # method of ndarrays, lists or dictionaries - whatever is returned by the model score = np.empty(input_sample.shape, dtype=object) score[:] = None for alt_batch, alt_idxs in self._mutate_sample_batched( input_sample): num_samples = len(alt_batch) mult_set = numpy_collate([sample_set] * num_samples) mult_set = set_model_input(mult_set, numpy_collate(alt_batch), input_id=self.model_input) alt = self.model.predict_on_batch(mult_set) for alt_sample_i in range(num_samples): alt_sample = get_dataset_item(alt, alt_sample_i) # Apply the output selection function if defined if self.output_sel_fn is not None: alt_sample = self.output_sel_fn(alt_sample) # Apply scores across all model outputs for ref and alt output_scores = [ apply_within(ref_sample_pred, alt_sample, scr) for scr in self.scores ] score.__setitem__(alt_idxs[alt_sample_i], output_scores) scores.append(score.tolist()) return scores
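# Minimal runnable sketch of the score container used above (plain NumPy,
# nothing model-specific): an object-dtype array shaped like the one-hot
# input holds either `None` (the reference base) or the per-score list for
# a mutation, and `tolist()` converts it into the nested lists returned by
# `score`. The toy values below are illustrative only.
import numpy as np

input_sample = np.array([[1, 0, 0, 0],
                         [0, 1, 0, 0]])  # one-hot encoded 2 bp sequence
score = np.empty(input_sample.shape, dtype=object)
score[:] = None
score[0, 1] = [0.3]  # hypothetical score list for mutating position 0 to base 1
print(score.tolist())
# -> [[None, [0.3], None, None], [None, None, None, None]]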
@classmethod
def from_seqlet_imps(cls, seqlet_imps):
    from kipoi.data_utils import numpy_collate
    s1 = seqlet_imps[0]
    # tasks = s1.tasks()
    return cls(seq=np.stack([s.seq for s in seqlet_imps]),
               contrib=numpy_collate([s.contrib for s in seqlet_imps]),
               hyp_contrib=numpy_collate([s.hyp_contrib for s in seqlet_imps]),
               profile=numpy_collate([s.profile for s in seqlet_imps]),
               name=s1.name,
               attrs=s1.attrs)
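# Short runnable sketch of the collation step above: kipoi's `numpy_collate`
# stacks a list of matching nested containers (e.g. per-task dicts of arrays)
# along a new leading axis, which is how the per-seqlet `contrib`,
# `hyp_contrib` and `profile` containers are combined. The toy task name and
# shapes below are assumptions for illustration.
import numpy as np
from kipoi.data_utils import numpy_collate

a = {"task1": np.zeros((10, 4))}  # contributions of one seqlet
b = {"task1": np.ones((10, 4))}   # contributions of another seqlet
stacked = numpy_collate([a, b])
assert stacked["task1"].shape == (2, 10, 4)  # new leading seqlet axis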
def batch_iter(self, batch_size=32, **kwargs):
    # TODO - implement this in parallel - add `num_workers` argument
    # https://github.com/fchollet/keras/blob/master/keras/utils/data_utils.py#L589
    l = []
    for x in iter(self):
        l.append(x)
        if len(l) == batch_size:
            ret = numpy_collate(l)
            # remove all elements
            del l[:]
            yield ret
    # return the remaining samples as a final, smaller batch
    if len(l) > 0:
        yield numpy_collate(l)
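# Standalone sketch of the same chunking pattern, usable outside the class
# (assumes kipoi's `numpy_collate`; the helper name and toy samples are
# hypothetical):
import numpy as np
from kipoi.data_utils import numpy_collate

def batch_iter_list(samples, batch_size=32):
    buf = []
    for x in samples:
        buf.append(x)
        if len(buf) == batch_size:
            yield numpy_collate(buf)
            del buf[:]
    if buf:
        yield numpy_collate(buf)

samples = [{"inputs": np.arange(4)} for _ in range(70)]
batches = list(batch_iter_list(samples, batch_size=32))
# -> three batches with 32, 32 and 6 samples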
def get_example_data(example, layer, writer=None):
    example_dir = "examples/{0}".format(example)
    if INSTALL_REQ:
        install_model_requirements(example_dir, "dir", and_dataloaders=True)

    model = kipoi.get_model(example_dir, source="dir")
    # the preprocessor
    Dataloader = kipoi.get_dataloader_factory(example_dir, source="dir")

    with open(example_dir + "/example_files/test.json", "r") as ifh:
        dataloader_arguments = json.load(ifh)
    for k in dataloader_arguments:
        dataloader_arguments[k] = "example_files/" + dataloader_arguments[k]

    outputs = []
    with cd(model.source_dir):
        dl = Dataloader(**dataloader_arguments)
        it = dl.batch_iter(batch_size=32, num_workers=0)

        # loop through the data, make predictions, save the output
        for i, batch in enumerate(tqdm(it)):
            # make the prediction
            pred_batch = model.input_grad(batch['inputs'], avg_func="sum",
                                          layer=layer, final_layer=False)
            # write out the predictions and metadata (, inputs, targets);
            # always keep the inputs so that input*grad can be generated!
            output_batch = prepare_batch(batch, pred_batch, keep_inputs=True)
            if writer is not None:
                writer.batch_write(output_batch)
            outputs.append(output_batch)
    if writer is not None:
        writer.close()
    return numpy_collate(outputs)
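# Hedged usage sketch for the helper above. Streaming the gradient batches to
# disk via kipoi's HDF5BatchWriter is one plausible choice; the example name,
# layer name and output path are placeholders, not values from this repository.
from kipoi.writers import HDF5BatchWriter

writer = HDF5BatchWriter(file_path="example_grads.h5")  # hypothetical output path
res = get_example_data("rbp", layer="dense_1", writer=writer)  # placeholder layer
# the helper closes the writer itself; `res` also holds all collated batches
# in memory.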
def score(self, input_batch, input_ref):
    """
    Calculate DeepLIFT scores of a given input sequence.

    Args:
        input_batch: Model input data
        input_ref: Reference model input data (may be `None`)

    Returns:
        DeepLIFT scores in the same shape / same containers as the
        input batch.
    """
    x_standardized = self.model._batch_to_list(input_batch)
    ref_standardized = None
    if input_ref is not None:
        ref_standardized = self.model._batch_to_list(input_ref)
    scores = self.deeplift_contribs_func(task_idx=self.task_idx,
                                         input_data_list=x_standardized,
                                         input_references_list=ref_standardized,
                                         batch_size=self.batch_size,
                                         progress_update=1000)
    # TODO: DeepLIFT raises an error when using batched execution;
    # `run_function_in_batches` fails for:
    # scores = run_function_in_batches(
    #     func=self.deeplift_contribs_func,
    #     input_data_list=x_standardized,
    #     batch_size=self.batch_size,
    #     progress_update=1000,
    #     task_idx=self.task_idx)

    # DeepLIFT returns all samples as a list of individual samples
    scores = [numpy_collate(el) for el in scores]
    # re-format the list-type scores back to the containers of `input_batch`
    scores = self.model._match_to_input(scores, input_batch)
    return scores
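# Minimal runnable sketch of the post-processing above: per model input,
# DeepLIFT hands back a list of per-sample arrays, and `numpy_collate` stacks
# each list into a single batch array (assumes kipoi's `numpy_collate`; the
# toy shapes are illustrative).
import numpy as np
from kipoi.data_utils import numpy_collate

raw_scores = [[np.zeros((100, 4)), np.ones((100, 4))]]  # one input, two samples
stacked = [numpy_collate(el) for el in raw_scores]
assert stacked[0].shape == (2, 100, 4)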