def average_loss_per_target(self, logits, outputs, include_array=True):
  """Calculate loss averaged over examples; this is the loss used for training.

  If affinity loss is calculated and `include_array` is True, the count loss
  for the novel sequences included in the microarray and the affinity loss
  for the sequences not included in the microarray are excluded from the
  average loss calculation. Otherwise, the average count loss over all
  samples is returned.

  Args:
    logits: LabeledTensor with dtype=float32 and axes [batch, logit_axis].
    outputs: LabeledTensor with dtype=float32 and axes [batch, output_axis].
    include_array: Optional boolean indicating whether to also compute
      affinity loss against binding array data.

  Returns:
    LabeledTensor with dtype=float32 and axes [output_axis].
  """
  # Should be independent of mini-batch size.
  loss_matrix = self.loss_per_example_and_target(logits, outputs,
                                                 include_array)

  def _masked_batch_mean(target_labels):
    # Select the per-target losses, drop every example whose selected
    # outputs are all zero, and average the remainder over the batch
    # (ignoring NaNs).
    selected_loss = lt.select(loss_matrix, {'target': target_labels})
    keep_idx = lt.reduce_any(
        lt.not_equal(lt.select(outputs, {'output': target_labels}), 0.0),
        'output')
    kept_loss = lt.boolean_mask(selected_loss, keep_idx)
    return utils.reduce_nanmean(kept_loss, 'batch')

  array_targets_present = bool(
      set(self.binding_arrays_map.keys()) &
      set(outputs.axes['output'].labels))
  if array_targets_present and include_array:
    # Only the count loss for samples with at least one non-zero count
    # output is kept.
    count_part = _masked_batch_mean(list(self.target_axis.labels))
    # Only the affinity loss for samples with at least one non-zero
    # affinity output is kept.
    affinity_part = _masked_batch_mean(list(self.binding_arrays_map.keys()))
    # Count loss and affinity loss are concatenated.
    avg_loss = lt.concat([count_part, affinity_part], 'target')
    if self.additional_output_axis:
      # Only the additional-output loss for samples with at least one
      # non-zero output value is kept.
      extra_part = _masked_batch_mean(
          list(self.additional_output_axis.labels))
      avg_loss = lt.concat([avg_loss, extra_part], 'target')
  else:
    avg_loss = utils.reduce_nanmean(loss_matrix, 'batch')
  return avg_loss
def create_input_and_outputs(feature_tensors,
                             experiment_proto,
                             input_features=(SEQUENCE_ONE_HOT,),
                             skip_all_zero_counts=True,
                             kmer_k_max=4,
                             additional_output=None):
  """Create inputs and outputs from parsed features.

  Args:
    feature_tensors: Dict[str, tf.Tensor] with parsed features created by
      `build_features`.
    experiment_proto: selection_pb2.Experiment describing the experiment.
    input_features: optional sequence of feature constants defined in this
      module.
    skip_all_zero_counts: some sequences have no counts, e.g., because they
      were created artificially for validation purposes on the binding
      array. We want to skip these sequences for training.
    kmer_k_max: optional integer giving the maximum kmer length to use if
      SEQUENCE_KMER_COUNT is in `input_features`.
    additional_output: optional list of strings contains additional outputs.

  Returns:
    inputs: LabeledTensor with dtype=float32 and axes [batch_axis,
      input_position_axis, input_channel_axis], of one-hot-encoded
      rasterized sequences for input into machine learning models.
    outputs: LabeledTensor with dtype=float32 and axes [batch_axis,
      output_axis] denoting possible output tensors, including counts and
      binding array measurements.
  """
  sequence_length = experiment_proto.sequence_length
  count_names = selection.all_count_names(experiment_proto)
  array_names = selection.binding_array_names(experiment_proto)

  sequence_tensor = feature_tensors['sequence']
  batch_axis = sequence_tensor.axes['batch']
  position_axis = ('position', list(range(sequence_length)))

  inputs = {}

  if SEQUENCE_ONE_HOT in input_features:
    # Rasterize the DNA string into a [batch, position, channel] one-hot.
    base_indices = custom_ops.dna_sequence_to_indices(sequence_tensor,
                                                      sequence_length)
    one_hot_raw = tf.one_hot(base_indices, depth=4, dtype=tf.float32)
    channel_axis = ('channel', list(dna.DNA_BASES))
    inputs[SEQUENCE_ONE_HOT] = lt.LabeledTensor(
        one_hot_raw, [batch_axis, position_axis, channel_axis])

  if SEQUENCE_KMER_COUNT in input_features:
    # Standardize kmer counts with their analytic mean/std for this length.
    kmer_axis = lt.Axis('kmer', _kmer_labels(kmer_k_max))
    kmer_raw = custom_ops.count_all_dna_kmers(sequence_tensor, kmer_k_max)
    kmer_counts = lt.LabeledTensor(kmer_raw, [batch_axis, kmer_axis])
    means, stds = _all_kmer_mean_and_std(kmer_k_max, sequence_length)
    kmer_mean = lt.constant(means, tf.float32, axes=[kmer_axis])
    kmer_std = lt.constant(stds, tf.float32, axes=[kmer_axis])
    inputs[SEQUENCE_KMER_COUNT] = (
        (lt.cast(kmer_counts, tf.float32) - kmer_mean) / kmer_std)

  if STRUCTURE_PARTITION_FUNCTION in input_features:
    with tf.name_scope('structure_partition_fn'):
      raw_pf_tensor = lt.expand_dims(
          feature_tensors['partition_function'],
          ['batch', 'partition_fn_axis'])
      inputs[STRUCTURE_PARTITION_FUNCTION] = lt.log(raw_pf_tensor)

  # Assemble the output names first, then pack all outputs in one pass.
  output_names = count_names + array_names
  if additional_output and additional_output[0]:
    output_names = output_names + additional_output
  outputs = lt.pack(
      [lt.cast(feature_tensors[name], tf.float32) for name in output_names],
      ('output', output_names),
      axis_position=1)

  if skip_all_zero_counts:
    with tf.name_scope('counts_filtering'):
      # Drop examples whose count outputs are all zero (artificial
      # validation-only sequences).
      count_values = lt.select(outputs, {'output': count_names})
      keep = lt.reduce_any(lt.not_equal(count_values, 0.0), 'output')
      inputs = {
          name: lt.boolean_mask(tensor, keep)
          for name, tensor in inputs.items()
      }
      outputs = lt.boolean_mask(outputs, keep)

  return inputs, outputs