def transform(counts):
    """Standardize `counts` with statistics from the enclosing scope.

    Reads `log_transform`, `means` and `stddevs` from the enclosing scope.

    Args:
        counts: labeled tensor with a 'target' axis; the labels of that axis
            pick which entries of `means` and `stddevs` are used.

    Returns:
        `(counts - means) / stddevs` with the statistics aligned to the target
        labels of `counts`. When `log_transform` is truthy, `counts` is
        replaced by `log(1.0 + counts)` before standardizing.
    """
    values = lt.log(1.0 + counts) if log_transform else counts
    selector = {'target': list(values.axes['target'].labels)}
    center = lt.select(means, selector)
    scale = lt.select(stddevs, selector)
    return (values - center) / scale
def affinity_loss_per_example_and_target(self, logits, outputs):
    """Compute the per-example loss of the affinity predictions.

    Affinities are obtained from `self.predict_affinity(logits)`, mapped onto
    the binding arrays listed in `self.binding_arrays_map`, and scored against
    the matching columns of `outputs` with
    `self.loss.per_example_and_target_array`.

    Args:
        logits: LabeledTensor with dtype=float32 and axes [batch, logit_axis].
        outputs: LabeledTensor with dtype=float32 and axes
            [batch, output_axis]. These outputs should include everything
            from the preprocessing, whether it is used in the loss or not.

    Returns:
        LabeledTensor with dtype=float32 and axes [batch, target_axis] giving
        loss for each target.
    """
    array_names = list(self.binding_arrays_map.keys())
    predicted = lt.rename_axis(
        _affinities_to_binding_arrays(self.binding_arrays_map,
                                      self.predict_affinity(logits)),
        'output', 'target')
    observed = lt.rename_axis(
        lt.select(outputs, {'output': array_names}), 'output', 'target')
    return self.loss.per_example_and_target_array(predicted, observed)
def _normed_prev_round_counts(self, input_counts, input_counts_label='output'):
    """Create a Tensor with normalized counts from the previous round.

    For every target in `self.target_axis`, the counts of its parent (as given
    by `self.parent_count_names`) are looked up in `input_counts`. Targets
    whose parent is not present in `input_counts` get an all-zero column. The
    packed result is passed through `self.deps_normalize`.

    Args:
        input_counts: LabeledTensor with dtype=float32 and axes
            [batch, input_counts_label].
        input_counts_label: Name of the axis in input_counts that contains
            the count data to use. For LatentAffinityWithDeps that uses the
            actual count values, input_counts will be the outputs tensor and
            the label will be 'output'. For LatentAffinityWithPredDeps,
            input_counts will be the predictions for these counts and the
            axis will be 'target'.

    Returns:
        preds: LabeledTensor with dtype=float32 and axes [batch, target_axis].
    """
    # Membership has to be tested against the labels of the counts axis.
    # `input_counts.axes` is a mapping keyed by axis *name* (e.g. 'batch'), so
    # `parent in input_counts.axes` never matches a count name and would
    # silently replace every parent column with zeros.
    axis_labels = input_counts.axes[input_counts_label].labels
    available = set(axis_labels) if axis_labels is not None else set()
    parent_lookup = {
        k: lt.select(input_counts, {input_counts_label: parent})
        for k, parent in self.parent_count_names.items()
        if parent in available
    }
    # Fallback column for targets without an available parent.
    default_tensor = lt.LabeledTensor(
        tf.zeros_like(input_counts[:, 0]), [input_counts.axes['batch']])
    parent_tensors = [
        parent_lookup.get(k, default_tensor) for k in self.target_axis.labels
    ]
    parent_counts = lt.pack(parent_tensors, self.target_axis, axis_position=1)
    return self.deps_normalize(parent_counts)
def predict_additional_output(self, logits):
    """Select the additional-output entries of `logits`.

    Args:
        logits: labeled tensor with a 'target' axis that contains the labels
            of `self.additional_output_axis`.

    Returns:
        `logits` restricted along 'target' to the labels of
        `self.additional_output_axis`.

    Raises:
        Error: if `self.additional_output_axis` is falsy.
    """
    if self.additional_output_axis:
        wanted = list(self.additional_output_axis.labels)
        return lt.select(logits, {'target': wanted})
    raise Error(
        'Tries to calculate additional output while no such output specified')
def predict_outputs(self, logits, outputs=None):
    """Predict a score that should correlate with each output.

    Count predictions come first, followed by binding array predictions (when
    `self.binding_arrays_map` is non-empty) and additional outputs (when
    `self.additional_output_axis` is set), all concatenated along 'output'.

    Args:
        logits: LabeledTensor with dtype=float32 and axes [batch, logit_axis].
        outputs: optional LabeledTensor with dtype=float32 and axes
            [batch, output_axis]. Note that different output layers may not
            be directly comparable if they make use of `outputs` from prior
            rounds of selection in predictions.

    Returns:
        LabeledTensor with dtype=float32 and axes [batch, output_axis] giving
        predictions for each count and binding array.
    """
    preds = lt.rename_axis(
        self.predict_counts(logits, outputs), 'target', 'output')

    if self.binding_arrays_map:
        affinities = self.predict_affinity(logits)
        per_array = [
            lt.select(affinities, {'affinity': affinity_name})
            for affinity_name in self.binding_arrays_map.values()
        ]
        array_axis = ('output', list(self.binding_arrays_map.keys()))
        array_preds = lt.pack(per_array, array_axis, axis_position=1)
        preds = lt.concat([preds, array_preds], 'output')

    if self.additional_output_axis:
        extra = lt.rename_axis(
            self.predict_additional_output(logits), 'target', 'output')
        preds = lt.concat([preds, extra], 'output')

    return preds
def predict_counts(self, logits, outputs=None):  # pylint: disable=unused-argument
    """Make count predictions from logits and counts.

    Args:
        logits: LabeledTensor with dtype=float32 and axes [batch, logit_axis].
        outputs: LabeledTensor with dtype=float32 and axes
            [batch, output_axis]. Unused by the base class but in the
            signature for the benefit of subclasses that use counts from
            previous rounds to help predict future rounds. It is the
            responsibility of the implementation using `outputs` to ensure
            that this method respects the causal structure of the experiment.

    Returns:
        preds: LabeledTensor with dtype=float32 and axes [batch, target_axis].
    """
    # TODO(shoyer): consider using tf.nn.softplus instead of abs here
    weights = abs(self.affinity_weights) * self.selection_signs

    # With additional outputs present, only the affinity entries of the logits
    # take part in the count prediction.
    source = logits
    if self.additional_output_axis:
        source = lt.select(logits,
                           {'target': list(self.affinity_axis.labels)})
    affinity_logits = lt.rename_axis(source, 'target', 'affinity')

    return lt.matmul(affinity_logits, weights) + self.bias
def predict_affinity(self, logits):
    """See method on base class."""
    # When additional outputs share the logits, keep only the affinity entries
    # before relabelling the axis.
    source = logits
    if self.additional_output_axis:
        source = lt.select(logits,
                           {'target': list(self.affinity_axis.labels)})
    return lt.rename_axis(source, 'target', 'affinity')
def loss_per_example_and_target(self, logits, outputs, include_array=True):
    """See method on base class."""
    with tf.name_scope('predictions'):
        if not self.additional_output_axis:
            preds = self.predict_counts(logits, outputs)
        else:
            # Split the logits: affinity entries drive the count predictions,
            # additional-output entries are used as predictions directly.
            affinity_logits = lt.select(
                logits, {'target': list(self.affinity_axis.labels)})
            ao_logits = lt.select(
                logits, {'target': list(self.additional_output_axis.labels)})
            count_preds = self.predict_counts(affinity_logits, outputs)
            preds = lt.concat([count_preds, ao_logits], 'target')

    targets = _targets_from_outputs(outputs, self.all_target_axis)
    loss = self.loss.per_example_and_target(preds, targets)

    # The array loss is appended only when at least one binding array is
    # present in `outputs` and the caller asked for it.
    array_names = set(self.binding_arrays_map.keys())
    output_names = set(outputs.axes['output'].labels)
    if not (bool(array_names & output_names) and include_array):
        return loss

    affinity_loss = self.affinity_loss_per_example_and_target(logits, outputs)
    return lt.concat([loss, affinity_loss], 'target')
def __init__(self, hps, net, output_layer, experiment_proto, input_paths):
    """Build the training graph: input pipeline, loss, optimizer, summaries.

    Args:
        hps: hyperparameters. This method reads `mbsz`, `train_on_array`,
            `optimizer`, `learn_rate`, `loss_name` and, for the momentum
            optimizer, `momentum`.
        net: network exposing `fprop(inputs, mode=...)` and `params`.
        output_layer: output layer exposing `average_loss_per_target(...)`
            and `params`.
        experiment_proto: experiment description forwarded to
            `data.input_pipeline`.
        input_paths: input paths forwarded to `data.input_pipeline`.

    Raises:
        ValueError: if `hps.optimizer` is neither 'momentum' nor 'adam'.
    """
    inputs, outputs = data.input_pipeline(
        input_paths, experiment_proto, hps.mbsz, hps=hps, num_threads=8)
    with tf.name_scope('neural_net'):
        logits = net.fprop(inputs, mode='train')
    with tf.name_scope('output_layer'):
        loss_per_target = output_layer.average_loss_per_target(
            logits, outputs, include_array=hps.train_on_array)
        loss = utils.reduce_nanmean(loss_per_target)

    self.global_step = tf.Variable(0, name='global_step', trainable=False)

    # Honor the requested optimizer. A previous unconditional re-assignment to
    # the momentum optimizer after this chain made the 'adam' setting a no-op.
    if hps.optimizer == 'momentum':
        optimizer = tf.MomentumOptimizer(hps.learn_rate, hps.momentum)
    elif hps.optimizer == 'adam':
        optimizer = tf.AdamOptimizer(hps.learn_rate)
    else:
        raise ValueError('invalid optimizer: %s' % hps.optimizer)

    grads = optimizer.compute_gradients(loss,
                                        net.params + output_layer.params)
    opt_op = optimizer.apply_gradients(grads, global_step=self.global_step)
    # Evaluating train_op runs the update and yields the loss value.
    self.train_op = tf.with_dependencies([opt_op], loss)

    contrib_deprecated.scalar_summary('loss/mean', loss)
    for target in loss_per_target.axes['target'].labels:
        contrib_deprecated.scalar_summary(
            'loss/' + six.ensure_str(target),
            lt.select(loss_per_target, {'target': target}))
    with tf.name_scope('summarize_grads'):
        slim.learning.add_gradients_summaries(grads)

    tf.add_to_collection(tf.GraphKeys.GLOBAL_STEP, self.global_step)
    tf.add_to_collection('train_op', self.train_op)
    tf.add_to_collection('loss', loss)

    self.mbsz = hps.mbsz
    # The log Poisson loss implemented in TensorFlow may sometimes be negative.
    if hps.loss_name in (output_layers.LOSS_POISSON_LOSS,
                         output_layers.LOSS_ZERO_TRUNCATED_POISSON_LOSS):
        self.min_cost = -float('inf')
        self.min_is_inclusive = False
    else:
        self.min_cost = 0
        self.min_is_inclusive = True
def predict_counts(self, logits, outputs):
    """See method on base class."""
    preds = super(LatentAffinityWithCrossDeps, self).predict_counts(
        logits, outputs)
    interact_weights = abs(self.logit_by_prev_count) * self.selection_signs

    # Only the affinity entries of the logits interact with previous counts.
    source = logits
    if self.additional_output_axis:
        source = lt.select(logits,
                           {'target': list(self.affinity_axis.labels)})
    affinity_logits = lt.rename_axis(source, 'target', 'affinity')

    interaction = lt.matmul(affinity_logits, interact_weights)
    # We're calling _normed_prev_round_counts a second time here with the same
    # arguments, but that's actually OK because TensorFlow automatically
    # consolidates these calls.
    prev_counts = self._normed_prev_round_counts(outputs)
    preds += interaction * prev_counts
    return preds
def predict_affinity(self, logits):
    """See method on base class."""
    if not self.affinity_target_lt:
        raise Error(
            'No affinity_target_map has been designated. This FullyObserved '
            'layer cannot calculate the affinity. The FullyObserved layer '
            'must be initialized with an affinity_target_map to be capable '
            'of calculating affinity.')

    # Drop additional-output entries so that only count logits remain.
    count_logits = logits
    if self.additional_output_axis:
        count_logits = lt.select(logits,
                                 {'target': list(self.target_axis.labels)})

    # then do matrix multiple to turn (target) X (target by protein)
    # to a vector of length protein. For proteins with multiple targets, the
    # multiplication takes the sum of the values.
    return lt.matmul(count_logits, self.affinity_target_lt)
def predict_counts(self, logits, outputs=None):
    """See method on base class."""
    # Without additional outputs the logits already are the count predictions.
    if not self.additional_output_axis:
        return logits
    return lt.select(logits, {'target': list(self.target_axis.labels)})
def _affinities_to_binding_arrays(binding_arrays_map, affinities):
    """Pack per-array affinity predictions along a new 'output' axis.

    Args:
        binding_arrays_map: mapping whose keys become the labels of the new
            'output' axis and whose values are labels on the 'affinity' axis
            of `affinities`.
        affinities: labeled tensor with an 'affinity' axis.

    Returns:
        The selected affinity slices packed at axis position 1 under an
        'output' axis labelled with the keys of `binding_arrays_map`.
    """
    columns = []
    for affinity_name in binding_arrays_map.values():
        columns.append(lt.select(affinities, {'affinity': affinity_name}))
    output_axis = ('output', list(binding_arrays_map.keys()))
    return lt.pack(columns, output_axis, axis_position=1)
def _targets_from_outputs(outputs, target_axis):
    """Select the target columns of `outputs` and relabel them as targets.

    Args:
        outputs: labeled tensor with an 'output' axis containing the labels
            of `target_axis`.
        target_axis: axis whose labels choose the columns and which replaces
            the 'output' axis in the result.

    Returns:
        The selected columns with the 'output' axis reshaped to `target_axis`.
    """
    wanted = {'output': list(target_axis.labels)}
    return lt.reshape(lt.select(outputs, wanted), ['output'], [target_axis])
def average_loss_per_target(self, logits, outputs, include_array=True):
    """Calculate the loss averaged over examples.

    This is the loss to use for training. If affinity loss is calculated and
    `include_array` is set to True, the count loss for the novel sequences
    included in the microarray and the affinity loss for the sequences not
    included in the microarray are excluded from the average loss
    calculation. Otherwise, return the average count loss over all samples.

    Args:
        logits: LabeledTensor with dtype=float32 and axes [batch, logit_axis].
        outputs: LabeledTensor with dtype=float32 and axes
            [batch, output_axis].
        include_array: Optional boolean variable indicating whether to also
            compute affinity loss against binding array data.

    Returns:
        LabeledTensor with dtype=float32 holding one averaged loss per entry
        of the 'target' axis.
    """
    # should be independent of mini-batch size
    loss_matrix = self.loss_per_example_and_target(logits, outputs,
                                                   include_array)

    array_names = set(self.binding_arrays_map.keys())
    output_names = set(outputs.axes['output'].labels)
    if not (bool(array_names & output_names) and include_array):
        return utils.reduce_nanmean(loss_matrix, 'batch')

    def mean_over_observed(labels):
        """Average the loss for `labels` over examples with a non-zero output.

        Only examples with at least one non-zero value among the `labels`
        columns of `outputs` contribute to the average.
        """
        selected_loss = lt.select(loss_matrix, {'target': labels})
        observed = lt.reduce_any(
            lt.not_equal(lt.select(outputs, {'output': labels}), 0.0),
            'output')
        kept_loss = lt.boolean_mask(selected_loss, observed)
        return utils.reduce_nanmean(kept_loss, 'batch')

    # Count loss and affinity loss are concatenated
    count_mean = mean_over_observed(list(self.target_axis.labels))
    affinity_mean = mean_over_observed(list(self.binding_arrays_map.keys()))
    avg_loss = lt.concat([count_mean, affinity_mean], 'target')

    # Additional outputs are masked the same way and appended last.
    if self.additional_output_axis:
        additional_mean = mean_over_observed(
            list(self.additional_output_axis.labels))
        avg_loss = lt.concat([avg_loss, additional_mean], 'target')

    return avg_loss
def transform(counts):
    """Divide `counts` by the matching entries of the enclosing `total_counts`.

    Args:
        counts: labeled tensor with a 'target' axis; its labels pick the
            entries of `total_counts` (read from the enclosing scope).

    Returns:
        `counts` divided by the aligned totals cast to float32.
    """
    target_labels = list(counts.axes['target'].labels)
    denominators = lt.cast(
        lt.select(total_counts, {'target': target_labels}), tf.float32)
    return counts / denominators
def _split_outputs(self, outputs):
    """Split outputs into counts and binding array LabeledTensors.

    Args:
        outputs: labeled tensor with an 'output' axis containing both
            `self.all_count_names` and `self.binding_array_names`.

    Returns:
        Tuple `(counts, binding)` of the two selections along 'output'.
    """
    counts, binding = [
        lt.select(outputs, {'output': names})
        for names in (self.all_count_names, self.binding_array_names)
    ]
    return counts, binding
def compute_experiment_statistics(
        experiment_proto,
        input_paths,
        proto_w_stats_path,
        preprocess_mode=data.PREPROCESS_SKIP_ALL_ZERO_COUNTS,
        max_size=None,
        logdir=None,
        save_stats=False):
    """Calculate the mean and standard deviation of counts from input files.

    These statistics are used for normalization. If any statistic is missing
    or save_stats=True, compute the statistics. Save the statistics to
    proto_w_stats_path if save_stats=True.

    The input proto is not modified; a deep copy is updated and returned.

    Args:
        experiment_proto: selection_pb2.Experiment describing the experiment.
        input_paths: list of strings giving paths to sstables of input
            examples.
        proto_w_stats_path: string path to the validation proto file with
            stats.
        preprocess_mode: optional preprocess mode defined in the `data`
            module.
        max_size: optional number of examples to examine to compute
            statistics. By default, examines the entire dataset.
        logdir: optional path to a directory in which to log events.
        save_stats: optional boolean indicating whether to update all the
            statistics and save to proto_w_stats_path.

    Returns:
        selection_pb2.Experiment with computed statistics.
    """
    import os
    import tempfile

    experiment_proto = copy.deepcopy(experiment_proto)

    # Collect every named reads / additional-output message and note whether
    # any of them still lacks statistics.
    has_all_statistics = True

    all_reads = {}
    for round_proto in experiment_proto.rounds.values():
        for reads in [round_proto.positive_reads, round_proto.negative_reads]:
            if reads.name:
                all_reads[reads.name] = reads
                if not reads.HasField('statistics'):
                    has_all_statistics = False

    all_ao = {}
    for ao_proto in experiment_proto.additional_output:
        if ao_proto.name:
            all_ao[ao_proto.name] = ao_proto
            if not ao_proto.HasField('statistics'):
                has_all_statistics = False

    if has_all_statistics and not save_stats:
        logger.info('All the statistics exist. Nothing to compute')
        return experiment_proto

    with tf.Graph().as_default():
        logger.info('Setting up graph for statistics')
        # we only care about outputs, which don't rely on training hyper
        # parameters
        hps = tf.HParams(
            preprocess_mode=preprocess_mode,
            kmer_k_max=0,
            ratio_random_dna=0.0,
            total_reads_defining_positive=0,
            additional_output=','.join(
                [x.name for x in experiment_proto.additional_output]))
        _, outputs = data.input_pipeline(
            input_paths,
            experiment_proto,
            final_mbsz=100000,
            hps=hps,
            num_epochs=1,
            num_threads=1)
        # Number of examples in the current batch, used to honor `max_size`.
        size_op = tf.shape(outputs)[list(outputs.axes.keys()).index('batch')]

        all_update_ops = []
        all_value_ops = {}
        # Reads and additional outputs get the same four streaming
        # statistics; reads are processed first, then additional outputs.
        for name in list(all_reads) + list(all_ao):
            values = lt.select(outputs, {'output': name})
            log_values = lt.log(values + 1.0)
            ops = {
                'mean': contrib_metrics.streaming_mean(values),
                'std_dev': streaming_std(values),
                'mean_log_plus_one':
                    contrib_metrics.streaming_mean(log_values),
                'std_dev_log_plus_one': streaming_std(log_values),
            }
            value_ops, update_ops = contrib_metrics.aggregate_metric_map(ops)
            all_update_ops.extend(list(update_ops.values()))
            all_value_ops[name] = value_ops

        logger.info('Running statistics ops')
        sv = tf.train.Supervisor(logdir=logdir)
        with sv.managed_session() as sess:
            total = 0
            for results in run_until_exhausted(sv, sess,
                                               [size_op] + all_update_ops):
                total += results[0]
                if max_size is not None and total >= max_size:
                    break
            all_statistics = {
                k: sess.run(v) for k, v in all_value_ops.items()
            }

        for reads_name, reads in all_reads.items():
            for name, value in all_statistics[reads_name].items():
                setattr(reads.statistics, name, value.item())

        for ao_name, ao in all_ao.items():
            for name, value in all_statistics[ao_name].items():
                setattr(ao.statistics, name, value.item())

        logger.info('Computed statistics: %r', all_statistics)

        if save_stats:
            logger.info('Save the proto with statistics to %s',
                        proto_w_stats_path)
            # Stage through a uniquely named local file: a fixed path such as
            # /tmp/tmp.pbtxt can be clobbered by a concurrent run and would
            # be left behind afterwards.
            with tempfile.NamedTemporaryFile(
                    'w', suffix='.pbtxt', delete=False) as f:
                f.write(text_format.MessageToString(experiment_proto))
                tmp_path = f.name
            try:
                gfile.Copy(tmp_path, proto_w_stats_path, overwrite=True)
            finally:
                os.remove(tmp_path)

    return experiment_proto
def create_input_and_outputs(feature_tensors,
                             experiment_proto,
                             input_features=(SEQUENCE_ONE_HOT, ),
                             skip_all_zero_counts=True,
                             kmer_k_max=4,
                             additional_output=None):
    """Create inputs and outputs from parsed features.

    Args:
        feature_tensors: Dict[str, tf.Tensor] with parsed features created by
            `build_features`.
        experiment_proto: selection_pb2.Experiment describing the experiment.
        input_features: optional sequence of feature constants defined in
            this module.
        skip_all_zero_counts: some sequences have no counts, e.g., because
            they were created artificially for validation purposes on the
            binding array. We want to skip these sequences for training.
        kmer_k_max: optional integer giving the maximum kmer length to use if
            SEQUENCE_KMER_COUNT is in `input_features`.
        additional_output: optional list of strings naming additional
            outputs.

    Returns:
        inputs: dict keyed by the requested feature constants. The
            SEQUENCE_ONE_HOT entry is a LabeledTensor with dtype=float32 and
            axes [batch_axis, input_position_axis, input_channel_axis] of
            one-hot-encoded sequences; the SEQUENCE_KMER_COUNT entry holds
            standardized kmer counts; the STRUCTURE_PARTITION_FUNCTION entry
            holds the log of the partition function feature.
        outputs: LabeledTensor with dtype=float32 and axes
            [batch_axis, output_axis] denoting possible output tensors,
            including counts and binding array measurements.
    """
    sequence_length = experiment_proto.sequence_length
    count_names = selection.all_count_names(experiment_proto)
    array_names = selection.binding_array_names(experiment_proto)

    sequence_tensor = feature_tensors['sequence']
    batch_axis = sequence_tensor.axes['batch']
    position_axis = ('position', list(range(sequence_length)))

    inputs = {}

    if SEQUENCE_ONE_HOT in input_features:
        seq_indices = custom_ops.dna_sequence_to_indices(
            sequence_tensor, sequence_length)
        one_hot_tensor = tf.one_hot(seq_indices, depth=4, dtype=tf.float32)
        channel_axis = ('channel', list(dna.DNA_BASES))
        inputs[SEQUENCE_ONE_HOT] = lt.LabeledTensor(
            one_hot_tensor, [batch_axis, position_axis, channel_axis])

    if SEQUENCE_KMER_COUNT in input_features:
        raw_counts = custom_ops.count_all_dna_kmers(sequence_tensor,
                                                    kmer_k_max)
        kmer_axis = lt.Axis('kmer', _kmer_labels(kmer_k_max))
        kmer_counts = lt.LabeledTensor(raw_counts, [batch_axis, kmer_axis])
        means, stds = _all_kmer_mean_and_std(kmer_k_max, sequence_length)
        mean_count = lt.constant(means, tf.float32, axes=[kmer_axis])
        std_count = lt.constant(stds, tf.float32, axes=[kmer_axis])
        centered = lt.cast(kmer_counts, tf.float32) - mean_count
        inputs[SEQUENCE_KMER_COUNT] = centered / std_count

    if STRUCTURE_PARTITION_FUNCTION in input_features:
        with tf.name_scope('structure_partition_fn'):
            raw_pf_tensor = lt.expand_dims(
                feature_tensors['partition_function'],
                ['batch', 'partition_fn_axis'])
            inputs[STRUCTURE_PARTITION_FUNCTION] = lt.log(raw_pf_tensor)

    # Output columns are ordered: counts, binding arrays, additional outputs.
    output_names = count_names + array_names
    if additional_output and additional_output[0]:
        output_names += additional_output
    columns = [
        lt.cast(feature_tensors[name], tf.float32) for name in output_names
    ]
    outputs = lt.pack(columns, ('output', output_names), axis_position=1)

    if skip_all_zero_counts:
        with tf.name_scope('counts_filtering'):
            # Keep only examples with at least one non-zero count column.
            count_columns = lt.select(outputs, {'output': count_names})
            keep = lt.reduce_any(lt.not_equal(count_columns, 0.0), 'output')
            filtered_inputs = {}
            for feature, tensor in inputs.items():
                filtered_inputs[feature] = lt.boolean_mask(tensor, keep)
            inputs = filtered_inputs
            outputs = lt.boolean_mask(outputs, keep)

    return inputs, outputs