예제 #1
0
def in_silico_mutagenesis(model: Model,
                          encoded_sequences: np.ndarray) -> np.ndarray:
  """Score every single-letter substitution of every input sequence.

  For each sequence, one mutant is built per (letter, position) pair: the
  position is cleared across all letters and then set to the chosen letter.
  The score of a mutant is the wild-type prediction minus the mutant
  prediction, computed separately for each task.

  Parameters
  ----------
  model: Model
    Any model whose `predict` accepts inputs of the required shape and
    produces an output of shape `(N_sequences, N_tasks)`.
  encoded_sequences: np.ndarray
    Array of shape `(N_sequences, N_letters, sequence_length, 1)`.

  Returns
  -------
  np.ndarray
    ISM scores of shape
    `(num_tasks, N_sequences, N_letters, sequence_length, 1)`.
  """
  # Predictions for the unmodified inputs, shape (N_sequences, num_tasks).
  baseline = model.predict(NumpyDataset(encoded_sequences))
  # Everything below relies on ndarray attributes (.shape, indexing).
  assert isinstance(baseline, np.ndarray)
  num_tasks = baseline.shape[1]

  # Output buffer with the task axis last:
  # (N_sequences, N_letters, sequence_length, 1, num_tasks).
  scores = np.empty((*encoded_sequences.shape, num_tasks), dtype=np.float32)

  # Insert three singleton axes after the sequence axis, giving
  # (N_sequences, 1, 1, 1, num_tasks), so that each row broadcasts against
  # the reshaped mutant predictions computed in the loop.
  baseline_expanded = baseline[:, None, None, None]

  for row, (wild_type, wild_type_score) in enumerate(
      zip(encoded_sequences, baseline_expanded)):
    # One copy of the wild type per candidate mutant, shape
    # (N_letters * sequence_length, N_letters, sequence_length, 1).
    n_mutants = np.prod(wild_type.shape)
    mutants = np.repeat(wild_type[None], n_mutants, axis=0)

    n_letters = wild_type.shape[0]
    seq_len = wild_type.shape[1]

    # Index arrays, each of length N_letters * sequence_length: mutant k
    # places letter `letters[k]` at position `positions[k]`. Positions cycle
    # fastest, letters change once per full sweep over the positions.
    mutant_ids = np.arange(len(mutants))
    positions = np.tile(np.arange(seq_len), n_letters)
    letters = np.repeat(np.arange(n_letters), seq_len)

    # Wipe the wild-type column at the mutated position, then write the
    # substituted letter into it.
    mutants[mutant_ids, :, positions, :] = 0
    mutants[mutant_ids, letters, positions, :] = 1

    # Predict on all mutants of this sequence at once.
    mutant_scores = model.predict(NumpyDataset(mutants))
    # Same ndarray requirement as for the wild-type predictions.
    assert isinstance(mutant_scores, np.ndarray)
    # Unflatten the mutant axis back to (N_letters, sequence_length, 1) and
    # keep the task axis last.
    mutant_scores = mutant_scores.reshape((*wild_type.shape, num_tasks))

    scores[row] = wild_type_score - mutant_scores

  # Bring the task axis to the front.
  return np.rollaxis(scores, -1)
예제 #2
0
파일: __init__.py 프로젝트: amoliu/deepchem
  def predict(self, dataset, transformers=None, batch_size=None,
              pad_batches=False):
    """
    Uses self to make predictions on provided Dataset object.

    This is overridden to make sure the batch size is always valid for
    Tensorflow: the `batch_size` and `pad_batches` arguments are accepted for
    signature compatibility but ignored. The call is always forwarded with
    `self.model_instance.batch_size` and with batch padding enabled.

    Args:
      dataset: Dataset object to make predictions on.
      transformers: Transformers forwarded unchanged to `Model.predict`.
        `None` (the default) is treated as an empty list.
      batch_size: Ignored; `self.model_instance.batch_size` is used instead.
      pad_batches: Ignored; `True` is always passed.

    Returns:
      y_pred: numpy ndarray of shape (n_samples,)
    """
    # Use a fresh list per call rather than a shared mutable default, so the
    # callee can never leak state between calls through the default value.
    transformers = [] if transformers is None else transformers
    return Model.predict(self, dataset, transformers,
                         self.model_instance.batch_size, True)