def random_dna_features(experiment_proto, size):
    """Build a dict of feature tensors for a batch of random DNA sequences.

    Every feature other than 'sequence' is filled with its default value,
    expanded with a 'batch' axis and tiled to `size` along that axis. The
    'sequence' feature holds the randomly generated sequences.

    Args:
        experiment_proto: selection_pb2.Experiment describing the experiment.
        size: scalar integer tf.Tensor giving the number of desired sequences.

    Returns:
        Dict[Any, labeled_tensor.LabeledTensor] providing generated features.
    """
    with tf.name_scope('preprocess_random_input'):
        template = selection.get_template_sequence(experiment_proto)
        feature_specs = build_features(experiment_proto)

        def _tiled_default(spec):
            # Default value of one feature, repeated `size` times on 'batch'.
            default = lt.constant(spec.default_value, spec.dtype, spec.axes)
            with_batch = lt.expand_dims(default, ['batch'] + spec.axes)
            return lt.tile(with_batch, {'batch': size})

        generated = {
            name: _tiled_default(spec)
            for name, spec in feature_specs.items()
            if name != 'sequence'
        }

        # The sequences are created last, after all default-valued features.
        random_sequences = random_dna_sequence(template, (size, ))
        generated['sequence'] = lt.LabeledTensor(random_sequences, ['batch'])
        return generated
def _get_measurement_statistics(experiment_proto, statistic):
    """Extract measurement statistics from a selection_pb2.Experiment.

    Args:
        experiment_proto: selection_pb2.Experiment proto with saved
            measurement statistics.
        statistic: string giving the name of the statistic to pull out of
            the proto.

    Returns:
        LabeledTensor giving the value of the desired statistic for each
        output, on a single 'target' axis labeled with the output names in
        sorted order.

    Raises:
        ValueError: if no values for `statistic` were extracted from the
            proto.
    """
    mapping = selection.extract_measurement_statistic(experiment_proto,
                                                      statistic)
    if not mapping:
        # Without this guard the unpacking below fails with an opaque
        # "not enough values to unpack" ValueError.
        raise ValueError(
            'no measurement statistics named %r found in experiment_proto'
            % (statistic,))

    # Sort by output name so labels and values stay aligned and the axis
    # order is deterministic.
    output_names, data = zip(*sorted(mapping.items()))
    return lt.constant(data, axes=[('target', list(output_names))])
def create_input_and_outputs(feature_tensors,
                             experiment_proto,
                             input_features=(SEQUENCE_ONE_HOT, ),
                             skip_all_zero_counts=True,
                             kmer_k_max=4,
                             additional_output=None):
    """Assemble model inputs and outputs from parsed features.

    Args:
        feature_tensors: dict of parsed features created by
            `build_features`, keyed by feature name. It must hold 'sequence'
            and every output name used below, plus 'partition_function' when
            STRUCTURE_PARTITION_FUNCTION is requested.
        experiment_proto: selection_pb2.Experiment describing the experiment.
        input_features: optional sequence of feature constants defined in
            this module; only the requested ones are built.
        skip_all_zero_counts: some sequences have no counts, e.g., because
            they were created artificially for validation purposes on the
            binding array. When true, rows whose count outputs are all zero
            are dropped from both inputs and outputs.
        kmer_k_max: optional integer giving the maximum kmer length to use
            if SEQUENCE_KMER_COUNT is in `input_features`.
        additional_output: optional list of strings naming additional
            outputs. It is ignored when empty or when its first entry is
            empty.

    Returns:
        inputs: dict mapping each requested feature constant to a
            LabeledTensor. SEQUENCE_ONE_HOT is a float32 one-hot encoding
            with axes [batch, position, channel]; SEQUENCE_KMER_COUNT holds
            float32 kmer counts with axes [batch, kmer], shifted and scaled
            by the values from `_all_kmer_mean_and_std`;
            STRUCTURE_PARTITION_FUNCTION is the log of the
            'partition_function' feature.
        outputs: float32 LabeledTensor packing the count outputs, binding
            array outputs and any additional outputs along an 'output' axis
            at position 1.
    """
    sequence_length = experiment_proto.sequence_length
    count_names = selection.all_count_names(experiment_proto)
    array_names = selection.binding_array_names(experiment_proto)

    sequences = feature_tensors['sequence']
    batch_axis = sequences.axes['batch']
    position_axis = ('position', list(range(sequence_length)))

    inputs = {}

    if SEQUENCE_ONE_HOT in input_features:
        base_indices = custom_ops.dna_sequence_to_indices(
            sequences, sequence_length)
        inputs[SEQUENCE_ONE_HOT] = lt.LabeledTensor(
            tf.one_hot(base_indices, depth=4, dtype=tf.float32),
            [batch_axis, position_axis, ('channel', list(dna.DNA_BASES))])

    if SEQUENCE_KMER_COUNT in input_features:
        raw_kmer_counts = custom_ops.count_all_dna_kmers(sequences, kmer_k_max)
        kmer_axis = lt.Axis('kmer', _kmer_labels(kmer_k_max))
        kmer_counts = lt.LabeledTensor(raw_kmer_counts,
                                       [batch_axis, kmer_axis])
        means, stds = _all_kmer_mean_and_std(kmer_k_max, sequence_length)
        mean_count = lt.constant(means, tf.float32, axes=[kmer_axis])
        std_count = lt.constant(stds, tf.float32, axes=[kmer_axis])
        # Standardize each kmer count with its mean and standard deviation.
        centered = lt.cast(kmer_counts, tf.float32) - mean_count
        inputs[SEQUENCE_KMER_COUNT] = centered / std_count

    if STRUCTURE_PARTITION_FUNCTION in input_features:
        with tf.name_scope('structure_partition_fn'):
            partition_fn = lt.expand_dims(
                feature_tensors['partition_function'],
                ['batch', 'partition_fn_axis'])
            inputs[STRUCTURE_PARTITION_FUNCTION] = lt.log(partition_fn)

    # Resolve the full, ordered list of output names first: counts, then
    # binding arrays, then any additional outputs.
    output_names = count_names + array_names
    if additional_output and additional_output[0]:
        output_names += additional_output

    float_outputs = [
        lt.cast(feature_tensors[name], tf.float32) for name in output_names
    ]
    outputs = lt.pack(float_outputs, ('output', output_names),
                      axis_position=1)

    if not skip_all_zero_counts:
        return inputs, outputs

    with tf.name_scope('counts_filtering'):
        count_outputs = lt.select(outputs, {'output': count_names})
        # Keep a row when at least one of its count outputs is non-zero.
        has_counts = lt.reduce_any(
            lt.not_equal(count_outputs, 0.0), 'output')
        kept_inputs = {
            name: lt.boolean_mask(tensor, has_counts)
            for name, tensor in inputs.items()
        }
        kept_outputs = lt.boolean_mask(outputs, has_counts)

    return kept_inputs, kept_outputs