Example #1
    def __init__(self,
                 experiment_proto,
                 loss,
                 target_names=None,
                 additional_output=None):
        """Initialize a LatentAffinity output layer.

    Args:
      experiment_proto: selection_pb2.Experiment describing the experiment.
      loss: instance of an AbstractLoss subclass used for computing loss on this
        output layer.
      target_names: optional list of strings giving target names to train
        against.
      additional_output: optional list of strings containing all the
        additional output to predict.

    Raises:
      ValueError: if any target_names are not counts.
    """
        self.loss = loss

        affinity_names = selection.all_target_and_background_names(
            experiment_proto)
        additional_output = get_additional_output_names(
            experiment_proto, additional_output)
        target_names = get_target_names(experiment_proto, target_names)
        self.target_axis = lt.Axis('target', target_names)

        if additional_output:
            self.additional_output_axis = lt.Axis('additional_output',
                                                  additional_output)
        else:
            self.additional_output_axis = None
        self.logit_axis = lt.Axis('target', affinity_names + additional_output)
        self.affinity_axis = lt.Axis('affinity', affinity_names)
        self.all_target_axis = lt.Axis('target',
                                       target_names + additional_output)

        self.all_count_names = selection.all_count_names(experiment_proto)
        self.binding_arrays_map = _binding_arrays_map(experiment_proto)

        # Per-(affinity, target) selection signs derived from the experiment
        # proto; wrapped in a tf.constant below, so they are fixed, not trained.
        signs = _get_selection_signs(affinity_names, target_names,
                                     experiment_proto)
        self.selection_signs = lt.LabeledTensor(
            tf.constant(signs, dtype=tf.float32, name='selection_signs'),
            [self.affinity_axis, self.target_axis])

        # TODO(shoyer): consider if there's a sane way to make lt.Variable
        affinity_weights = tf.Variable(tf.ones_like(signs, dtype=tf.float32),
                                       name='affinity_weights')
        bias = tf.Variable(tf.zeros([self.target_axis.size]), name='bias')
        self.params = [affinity_weights, bias]

        self.affinity_weights = lt.LabeledTensor(
            tf.convert_to_tensor(affinity_weights),
            [self.affinity_axis, self.target_axis])
        self.bias = lt.LabeledTensor(tf.convert_to_tensor(bias),
                                     [self.target_axis])
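
A minimal usage sketch for the constructor above. The import paths and the SquaredError loss class are assumptions for illustration; only the LatentAffinity signature and the attributes it sets come from the example itself.

# Hypothetical usage; module paths and the loss class are placeholders.
from output_layers import LatentAffinity   # assumed module path
from losses import SquaredError            # assumed AbstractLoss subclass
import selection_pb2                       # proto describing the experiment

experiment_proto = selection_pb2.Experiment()  # populated elsewhere
layer = LatentAffinity(experiment_proto, loss=SquaredError())

# Attributes set by __init__ (as shown above):
# layer.logit_axis  -> 'target' axis over affinity + additional output names
# layer.params      -> [affinity_weights, bias] tf.Variables to train

Example #2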
  def __init__(self,
               experiment_proto,
               loss,
               affinity_target_map=None,
               target_names=None,
               additional_output=None):
    """Initialize a FullyObserved output layer.

    Args:
      experiment_proto: selection_pb2.Experiment describing the experiment.
      loss: instance of an AbstractLoss subclass used for computing loss on this
        output layer.
      affinity_target_map: optional key into
        config.DEFAULT_AFFINITY_TARGET_MAPS selecting a dictionary with one
        entry for each selection target molecule (e.g. protein) and the list of
        target output values to be used to calculate that target molecule's
        affinity. This map is optional to create this OutputLayer but is
        required to calculate affinity. (In other words, it is unnecessary
        during training but usually required for inference.)
      target_names: optional list of strings giving target names to train
        against.
      additional_output: optional list of strings containing all the
        additional outputs to predict.

    Raises:
      Error: if the affinity_target_map is invalid.
    """
    self.loss = loss

    target_names = get_target_names(experiment_proto, target_names)
    additional_output = get_additional_output_names(experiment_proto,
                                                    additional_output)
    if additional_output:
      self.additional_output_axis = lt.Axis('additional_output',
                                            additional_output)
    else:
      self.additional_output_axis = None
    self.count_axis = self.target_axis = lt.Axis('target', target_names)
    self.logit_axis = lt.Axis('target', target_names + additional_output)

    self.binding_arrays_map = _binding_arrays_map(experiment_proto)

    self.params = []

    self.affinity_target_axis = self.affinity_target_lt = None
    if affinity_target_map:
      affinity_target_map = config.DEFAULT_AFFINITY_TARGET_MAPS[
          affinity_target_map]
      # make sure that every target in the affinity_target_map is in the logits
      # (otherwise the target is silently ignored, could be dangerous)
      target_names = self.count_axis.labels
      affinity_names = list(affinity_target_map.keys())
      for (affinity,
           desired_target_names) in affinity_target_map.items():
        for desired_name in desired_target_names:
          if desired_name not in target_names:
            raise Error('The desired target name %s for the affinity molecule '
                        '%s is not found in the logit target names.\n'
                        'logit target names: %s\n' %
                        (desired_name, affinity, target_names))

      array = np.zeros((len(affinity_names), len(target_names)), dtype=int)
      for i, affinity in enumerate(affinity_names):
        for j, target in enumerate(target_names):
          if target in affinity_target_map[affinity]:
            array[i, j] = 1
      self.affinity_axis = lt.Axis('affinity', affinity_names)
      self.affinity_target_lt = lt.LabeledTensor(
          tf.constant(
              array, dtype=tf.float32, name='affinity_targets'),
          [self.affinity_axis, self.count_axis])
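
The nested loop above builds a {0, 1} indicator matrix with one row per affinity molecule and one column per target output, marking which targets feed each affinity. A standalone NumPy sketch of the same construction, using made-up affinity and target names:

import numpy as np

# Hypothetical map from each affinity molecule to the target outputs used
# to compute its affinity (names here are illustrative only).
affinity_target_map = {
    'protein_A': ['round2_positive', 'round3_positive'],
    'protein_B': ['round2_negative'],
}
target_names = ['round2_positive', 'round2_negative', 'round3_positive']
affinity_names = list(affinity_target_map.keys())

array = np.zeros((len(affinity_names), len(target_names)), dtype=int)
for i, affinity in enumerate(affinity_names):
    for j, target in enumerate(target_names):
        if target in affinity_target_map[affinity]:
            array[i, j] = 1

print(array)
# [[1 0 1]
#  [0 1 0]]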
Example #3
def create_input_and_outputs(feature_tensors,
                             experiment_proto,
                             input_features=(SEQUENCE_ONE_HOT, ),
                             skip_all_zero_counts=True,
                             kmer_k_max=4,
                             additional_output=None):
    """Create inputs and outputs from parsed features.

  Args:
    feature_tensors: Dict[str, tf.Tensor] with parsed featured created by
      `build_features`.
    experiment_proto: selection_pb2.Experiment describing the experiment.
    input_features: optional sequence of feature constants defined in this
      module.
    skip_all_zero_counts: some sequences have no counts, e.g., because they were
      created artificially for validation purposes on the binding array. We want
      to skip these sequences for training.
    kmer_k_max: optional integer giving the maximum kmer length to use if
      SEQUENCE_KMER_COUNT is in `input_features`.
    additional_output: optional list of strings contains additional outputs.

  Returns:
    inputs: LabeledTensor with dtype=float32 and axes
      [batch_axis, input_position_axis, input_channel_axis], of one-hot-encoded
      rasterized sequences for input into machine learning models.
    outputs: LabeledTensor with dtype=float32 and axes [batch_axis, output_axis]
      denoting possible output tensors, including counts and binding array
      measurements.
  """

    sequence_length = experiment_proto.sequence_length
    count_names = selection.all_count_names(experiment_proto)
    array_names = selection.binding_array_names(experiment_proto)

    sequence_tensor = feature_tensors['sequence']
    batch_axis = sequence_tensor.axes['batch']
    position_axis = ('position', list(range(sequence_length)))

    inputs = {}

    if SEQUENCE_ONE_HOT in input_features:
        seq_indices = custom_ops.dna_sequence_to_indices(
            sequence_tensor, sequence_length)
        tensor = tf.one_hot(seq_indices, depth=4, dtype=tf.float32)
        channel_axis = ('channel', list(dna.DNA_BASES))
        axes = [batch_axis, position_axis, channel_axis]
        one_hots = lt.LabeledTensor(tensor, axes)
        inputs[SEQUENCE_ONE_HOT] = one_hots

    # Count all kmers up to length kmer_k_max, then standardize the counts
    # with per-kmer means and standard deviations.
    if SEQUENCE_KMER_COUNT in input_features:
        raw_counts = custom_ops.count_all_dna_kmers(sequence_tensor,
                                                    kmer_k_max)
        kmer_axis = lt.Axis('kmer', _kmer_labels(kmer_k_max))
        counts = lt.LabeledTensor(raw_counts, [batch_axis, kmer_axis])
        means, stds = _all_kmer_mean_and_std(kmer_k_max, sequence_length)
        mean_count = lt.constant(means, tf.float32, axes=[kmer_axis])
        std_count = lt.constant(stds, tf.float32, axes=[kmer_axis])
        inputs[SEQUENCE_KMER_COUNT] = (
            (lt.cast(counts, tf.float32) - mean_count) / std_count)

    if STRUCTURE_PARTITION_FUNCTION in input_features:
        with tf.name_scope('structure_partition_fn'):
            raw_pf_tensor = lt.expand_dims(
                feature_tensors['partition_function'],
                ['batch', 'partition_fn_axis'])
            inputs[STRUCTURE_PARTITION_FUNCTION] = lt.log(raw_pf_tensor)

    output_names = count_names + array_names
    outputs = [lt.cast(feature_tensors[k], tf.float32) for k in output_names]

    if additional_output and additional_output[0]:
        outputs += [
            lt.cast(feature_tensors[k], tf.float32) for k in additional_output
        ]
        output_names += additional_output
    outputs = lt.pack(outputs, ('output', output_names), axis_position=1)

    # Drop sequences whose counts are all zero (e.g. sequences added only for
    # binding-array validation), masking inputs and outputs consistently.
    if skip_all_zero_counts:
        with tf.name_scope('counts_filtering'):
            counts = lt.select(outputs, {'output': count_names})
            keep = lt.reduce_any(lt.not_equal(counts, 0.0), 'output')
            inputs = {k: lt.boolean_mask(v, keep) for k, v in inputs.items()}
            outputs = lt.boolean_mask(outputs, keep)

    return inputs, outputs
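
The one-hot branch above depends on the project-specific op custom_ops.dna_sequence_to_indices. As a rough standalone illustration of the same encoding (an approximation, not the project's implementation; the 'ACGT' channel order is an assumption standing in for dna.DNA_BASES):

import numpy as np
import tensorflow as tf

DNA_BASES = 'ACGT'  # assumed channel order, standing in for dna.DNA_BASES
BASE_TO_INDEX = {base: i for i, base in enumerate(DNA_BASES)}

def dna_to_indices(sequences, sequence_length):
    # Stand-in for custom_ops.dna_sequence_to_indices: map each base of each
    # sequence to an integer index, one row per sequence.
    return np.array([[BASE_TO_INDEX[base] for base in seq[:sequence_length]]
                     for seq in sequences])

seq_indices = dna_to_indices(['ACGTACGT', 'TTTTAAAA'], sequence_length=8)
one_hots = tf.one_hot(seq_indices, depth=4, dtype=tf.float32)
print(one_hots.shape)  # (2, 8, 4) -> [batch, position, channel]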