def non_batched_dataset(train_dev_or_test,
                        label_vocab,
                        data_root_dir=DATA_ROOT_DIR):
    """Constructs a dataset of examples.

    Args:
      train_dev_or_test: one of DATA_FOLD_VALUES. The source examples to load
        into a dataset.
      label_vocab: list of string.
      data_root_dir: path to tfrecord examples.

    Returns:
      tf.data.Dataset, where each example is of form
      {
        SEQUENCE_KEY: one-hot of amino acid characters
        SEQUENCE_LENGTH_KEY: length of sequence
        SEQUENCE_ID_KEY: unique identifier for protein
        LABEL_KEY: rank-1 tensor of integer labels from label_vocab,
      }
    """
    if train_dev_or_test not in DATA_FOLD_VALUES:
        raise ValueError(('Only train, dev, test and * are supported datasets.'
                          ' Received {}.').format(train_dev_or_test))

    dataset_files = [
        os.path.join(data_root_dir, f)
        for f in tf.gfile.ListDirectory(data_root_dir)
        if train_dev_or_test in f and '.tfrecord' in f
    ]

    tfrecord_dataset = tf.data.TFRecordDataset(dataset_files)

    dataset = tfrecord_dataset.map(
        lambda record: tf.io.parse_single_example(  # pylint: disable=g-long-lambda
            record, DATASET_FEATURES))
    dataset = dataset.map(_add_sequence_length)
    dataset = dataset.filter(_is_sequence_short_enough_for_training)

    amino_acid_table = contrib_lookup.index_table_from_tensor(
        utils.AMINO_ACID_VOCABULARY,
        default_value=len(utils.AMINO_ACID_VOCABULARY))
    protein_class_table = contrib_lookup.index_table_from_tensor(
        mapping=label_vocab)

    dataset = dataset.map(
        lambda ex: _map_sequence_to_ints(ex, amino_acid_table))
    dataset = dataset.map(
        lambda ex: _map_labels_to_ints(ex, protein_class_table))
    dataset = dataset.map(_to_one_hot_sequence)

    if train_dev_or_test == TRAIN_FOLD:
        dataset = dataset.repeat()
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset
def embeddings(features, configs, embedding_size=None):
    assert (embedding_size is not None) and isinstance(embedding_size, int)
    embedded = {}
    with tf.variable_scope('embeddings'):
        for config in configs:
            embeddings = tf.get_variable(
                name=config.name,
                shape=[config.length, embedding_size],
                dtype=tf.float32,
                trainable=True,
                initializer=tf.random_uniform_initializer(
                    minval=-0.5, maxval=0.5, dtype=tf.float32),
            )
            if config.table is not None:
                lookup_table = lookup.index_table_from_tensor(
                    mapping=tf.constant(config.table, dtype=tf.string),
                    default_value=0)
                for field in config.fields:
                    embedded[field] = tf.nn.embedding_lookup(
                        embeddings, lookup_table.lookup(features[field]))
            else:
                for field in config.fields:
                    embedded[field] = tf.nn.embedding_lookup(
                        embeddings, features[field])
    return embedded
def create_vocab_lookup_tables(vocab):
    str_to_int = lookup.index_table_from_tensor(
        mapping=vocab,
        num_oov_buckets=0,
        default_value=OOV_TOKEN_ID,
        name='vocab_lookup_str_to_int')
    int_to_str = lookup.index_to_string_table_from_tensor(
        mapping=vocab,
        default_value=UNKNOWN_TOKEN,
        name='vocab_lookup_int_to_str')

    word2id = {w: i for i, w in enumerate(vocab)}

    vocab_lookup = {
        INT_TO_STR: int_to_str,
        STR_TO_INT: str_to_int,
        RAW_WORD2ID: word2id,
        RAW_ID2WORD: vocab,
    }

    graph_utils.add_dict_to_collection(vocab_lookup, VOCAB_LOOKUP_COLL_NAME)

    return vocab_lookup
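# A minimal usage sketch (not from the original source): assumes TF 1.x with
# `import tensorflow as tf` at module level, plus the STR_TO_INT and
# OOV_TOKEN_ID constants used above; the demo vocabulary is hypothetical.
# Contrib lookup tables start uninitialized, so tf.tables_initializer() must
# run before the first lookup.
def _create_vocab_lookup_tables_demo():
    vocab = ['the', 'cat', 'sat']  # hypothetical vocabulary
    tables = create_vocab_lookup_tables(vocab)
    ids = tables[STR_TO_INT].lookup(tf.constant(['cat', 'unseen']))
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        return sess.run(ids)  # -> [1, OOV_TOKEN_ID]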
def test_table_roundtrip(self):
    export_path = os.path.join(tempfile.mkdtemp(), 'export')

    with tf.Graph().as_default():
        with tf.Session().as_default() as session:
            input_string = tf.placeholder(tf.string)
            # Map string through a table, in this case based on a constant
            # tensor.
            table = lookup.index_table_from_tensor(
                tf.constant(['cat', 'dog', 'giraffe']))
            output = table.lookup(input_string)
            inputs = {'input': input_string}
            outputs = {'output': output}
            saved_transform_io.write_saved_transform_from_session(
                session, inputs, outputs, export_path)

    with tf.Graph().as_default():
        with tf.Session().as_default() as session:
            # Using a computed input gives confidence that the graphs are
            # fused.
            input_string = tf.constant('dog')
            inputs = {'input': input_string}
            outputs = saved_transform_io.apply_saved_transform(
                export_path, inputs)
            session.run(tf.tables_initializer())
            result = session.run(outputs['output'])
            self.assertEqual(1, result)
def __init__(self, speaker_list, file_pattern, T=2**12,
             batch_size=1, num_epoch=None, buffer_size=4000):
    '''
    `T`: sequence length
    '''
    with tf.device('/cpu'):
        with tf.name_scope('ByteInputPipeline'):
            self.speaker_list = tf.constant(speaker_list)
            self.table = index_table_from_tensor(mapping=self.speaker_list)
            self.T = T

            filenames = tf.gfile.Glob(file_pattern)
            if filenames:
                print('Data Loader: {} files found\n'.format(len(filenames)))
            else:
                raise ValueError('No files found: {}'.format(file_pattern))

            dataset = (
                tf.data.TFRecordDataset(filenames)
                .map(self._parse_function)
                .shuffle(buffer_size)
                .batch(batch_size)
                .repeat(num_epoch))

            self.iterator = dataset.make_initializable_iterator()
            self.x, self.y = self.iterator.get_next()
def testCreatePhasesWithUnwrappedTable(self):
    # Create a graph with a table that is not wrapped in `apply_function`.
    string_placeholder = tf.placeholder(tf.string, shape=(None,))
    table = lookup.index_table_from_tensor(['a', 'b'])
    table.lookup(string_placeholder)

    with self.assertRaisesRegexp(ValueError, 'Found table initializers'):
        impl_helper.create_phases()
def make_item2id(self, seq_list):
    # Note: `reduce` comes from functools on Python 3.
    count_dict = collections.Counter(
        reduce(lambda seq1, seq2: seq1 + seq2, seq_list))
    # Items rarer than min_count are replaced by the placeholder '0'.
    item_mapper = list(
        map(lambda i: i[0] if i[1] >= self.min_count else '0',
            count_dict.items()))
    item2id = index_table_from_tensor(mapping=item_mapper, default_value=0)
    # Note: size() returns a scalar int64 Tensor, not a Python int.
    self.vocab_size = item2id.size()
    return item2id
def add_vocab_lookups(self):
    with open(self.config.filename_words) as f:
        words = [word.strip() for word in f]
    with open(self.config.filename_tags) as f:
        labels = [label.strip() for label in f]
    with open(self.config.filename_chars) as f:
        chars = [char.strip() for char in f]

    self.label_list = tf.constant(labels)

    self.word_table = lookup.index_table_from_tensor(
        mapping=words, default_value=words.index(UNK))
    self.char_table = lookup.index_table_from_tensor(
        mapping=chars, default_value=-1)
    self.label_table = lookup.index_table_from_tensor(
        mapping=self.label_list, num_oov_buckets=1)
def fasta_indexer():
    """Get a function for converting tokenized protein strings to indices."""
    mapping = tf.constant(FULL_RESIDUE_VOCAB)
    table = contrib_lookup.index_table_from_tensor(mapping)

    def mapper(residues):
        return tf.ragged.map_flat_values(table.lookup, residues)

    return mapper
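# A minimal usage sketch (not from the original source): assumes TF 1.14+ for
# tf.ragged and the FULL_RESIDUE_VOCAB constant above; the residue strings are
# hypothetical. Only the flat values are mapped, so the ragged row structure
# of the input is preserved.
def _fasta_indexer_demo():
    mapper = fasta_indexer()
    residues = tf.ragged.constant([['M', 'K', 'V'], ['A']])
    ids = mapper(residues)  # ragged int64 ids with the same row lengths
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        return sess.run(ids)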
def __init__(self,
             split_name,
             preprocess_fn,
             num_epochs,
             shuffle,
             random_seed=None,
             filter_filename=None,
             drop_remainder=True,
             sup=False):
    """Initialize the dataset object.

    Args:
      split_name: A string split name, to load from the dataset.
      preprocess_fn: Preprocess a single example. The example is already
        parsed into a dictionary.
      num_epochs: An int. Number of epochs to cycle through the dataset
        before stopping. If set to `None` this will read samples
        indefinitely.
      shuffle: A boolean. Whether output data are shuffled.
      random_seed: Optional int. Random seed for shuffle operation.
      filter_filename: Optional filename to use for filtering.
      drop_remainder: If true, then the last incomplete batch is dropped.
      sup: A boolean. If true, read from the supervised dataset directory,
        otherwise from the unsupervised one.
    """
    # This is an instance-variable instead of a class-variable because it
    # depends on FLAGS, which is not parsed yet at class-parse-time.
    dataset_dir = FLAGS.sup_dataset_dir if sup else FLAGS.unsup_dataset_dir
    files = os.path.join(os.path.expanduser(dataset_dir), '%s@%i')
    filenames = {
        'train': generate_sharded_filenames(files % ('train', 1024))[:-40],
        'val': generate_sharded_filenames(files % ('train', 1024))[-40:],
        'trainval': generate_sharded_filenames(files % ('train', 1024)),
        'test': generate_sharded_filenames(files % ('validation', 128)),
    }
    super(DatasetWalmartFashion, self).__init__(
        filenames=filenames[split_name],
        reader=tf.data.TFRecordDataset,
        num_epochs=num_epochs,
        shuffle=shuffle,
        random_seed=random_seed,
        filter_fn=self.get_filter() if filter_filename is not None else None,
        drop_remainder=drop_remainder)
    self.split_name = split_name
    self.preprocess_fn = preprocess_fn
    self.filename_list = None
    if filter_filename is not None:
        with tf.gfile.Open(filter_filename, 'r') as f:
            filename_list = json.load(f)
        filename_list = tf.constant(filename_list['values'])
        filename_list = index_table_from_tensor(
            mapping=filename_list, num_oov_buckets=0, default_value=-1)
        self.filename_list = filename_list
def _map_to_int(x):
    """Maps string tensor into indexes using vocab.

    Args:
      x: a Tensor/SparseTensor of string.

    Returns:
      a Tensor/SparseTensor of indexes (int) of the same shape as x.
    """
    table = lookup.index_table_from_tensor(vocab, default_value=len(vocab))
    return table.lookup(x)
def get_vocab_lookup(vocab, name=None, reuse=None):
    with tf.variable_scope(name, 'vocab_lookup', reuse=reuse):
        vocab_lookup = lookup.index_table_from_tensor(
            mapping=vocab,
            num_oov_buckets=0,
            default_value=OOV_TOKEN_ID,
            name=name)
    return vocab_lookup
def convert_label(label):
    """Parses a string tensor into the label tensor.

    Args:
      label: Tensor of dtype string. Result of parsing the CSV column
        specified by LABEL_COLUMN.

    Returns:
      A Tensor of the same shape as label, containing int64 label indices
      for classification.
    """
    table = lookup.index_table_from_tensor(['<=50K', '>50K'])
    return table.lookup(label)
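# A minimal sketch of convert_label in a TF 1.x session (not from the original
# source). The index table assigns ids in vocabulary order, so '<=50K' -> 0
# and '>50K' -> 1; the table must be initialized before the lookup runs.
def _convert_label_demo():
    labels = convert_label(tf.constant(['<=50K', '>50K', '>50K']))
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        return sess.run(labels)  # -> [0, 1, 1]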
def get_word_embeddings(word_tensor, embedding_words, embedding_vectors):
    """Convert a string tensor of words into a float32 tensor of word
    embeddings."""
    word_lookup = lookup.index_table_from_tensor(embedding_words,
                                                 default_value=0)
    word_ids = word_lookup.lookup(word_tensor)
    # Row 0 (the unknown-word embedding) stays trainable; all other rows hold
    # frozen pre-trained vectors.
    word_embeddings = tf.concat((
        tf.get_variable(name="unknown_word_embedding",
                        initializer=embedding_vectors[:1],
                        trainable=True),
        tf.get_variable(name="known_word_embeddings",
                        initializer=embedding_vectors[1:],
                        trainable=False),
    ), axis=0)
    return tf.nn.embedding_lookup(word_embeddings, word_ids)
def string2index(feature_strings, feature):
    """
    Convert a `Tensor` of type `tf.string` to a corresponding Tensor of ids
    (`tf.int64`).

    :param feature_strings: string `Tensor`
    :param feature: feature extractor with string to index vocabulary
    :return: feature id Tensor
    """
    with variable_scope('lookup'):
        feats = list(feature.ordered_feats())
        lookup = index_table_from_tensor(mapping=tf.constant(feats),
                                         default_value=feature.unk_index())
        return lookup.lookup(feature_strings)
def __init__(self, speaker_list, filenames, num_epoch=1):
    with tf.device('/cpu'):
        with tf.name_scope('ByteInputPipeline'):
            self.speaker_list = tf.constant(speaker_list)
            self.table = index_table_from_tensor(mapping=self.speaker_list)

            print('{} files found'.format(len(filenames)))

            dataset = (
                tf.data.TFRecordDataset(filenames)
                .map(self._parse_function)
                .batch(1)
                .repeat(num_epoch))

            self.iterator = dataset.make_initializable_iterator()
            self.x, self.y, self.f, self.w, self.t = self.iterator.get_next()
def build_tensorize_text_fn(embeddings):
    """Builds a function to turn text into word/char ids."""
    tbl = contrib_lookup.index_table_from_tensor(
        mapping=embeddings.get_vocab(), num_oov_buckets=1)

    def fn(string_tensor):
        """Builds the output tensor dictionary."""
        out = {}
        if FLAGS.lowercase:
            string_tensor = ops.lowercase_op(string_tensor)
        out["wids"] = tf.to_int32(tbl.lookup(string_tensor))
        out["cids"] = char_utils.batch_word_to_char_ids(string_tensor, 50)
        out["len"] = tf.shape(string_tensor)[-1]
        return out

    return fn
def add_id_lookups(self):
    # Maps '' (padding) to 0 and any other string to 1 (the default value),
    # so a reduce_sum over lookups counts the non-padding entries.
    table = lookup.index_table_from_tensor(mapping=tf.constant(['']),
                                           default_value=1)

    sentences_shape = tf.shape(self.padded_sentences, out_type=tf.int64)

    removed_char_sentences = remove_unknown_chars(self.padded_sentences,
                                                  self.char_table)
    split_words = tf.string_split(tf.reshape(removed_char_sentences, [-1]),
                                  delimiter="")
    dense_split_words = tf.sparse_tensor_to_dense(split_words,
                                                  default_value='')

    max_word_len = tf.gather_nd(split_words.dense_shape, tf.constant([1]))
    chars_shape = tf.concat([sentences_shape, [max_word_len]], 0)
    chars = tf.reshape(dense_split_words, chars_shape)

    self.word_lengths = tf.reduce_sum(table.lookup(chars), 2)

    lowercase_sentences = lowercase(self.padded_sentences)
    sanitised_sentences = tf.regex_replace(lowercase_sentences,
                                           '^[0-9]+$', NUM)

    self.sequence_lengths = tf.reduce_sum(
        table.lookup(sanitised_sentences), 1)

    self.word_ids = self.word_table.lookup(sanitised_sentences)
    self.char_ids = self.char_table.lookup(chars)

    word_mask = tf.sequence_mask(self.sequence_lengths)
    char_mask = tf.sequence_mask(self.word_lengths)

    # Zero out ids at padded positions.
    self.word_ids = tf.where(word_mask, self.word_ids,
                             tf.zeros_like(self.word_ids))
    self.char_ids = tf.where(char_mask, self.char_ids,
                             tf.zeros_like(self.char_ids))

    label_lengths = tf.reduce_sum(table.lookup(self.label_codes), 1)
    labels_mask = tf.sequence_mask(label_lengths)
    self.labels = self.label_table.lookup(self.label_codes)
    self.labels = tf.where(labels_mask, self.labels,
                           tf.zeros_like(self.labels))
def provide_dataset(self):
    """Provides dataset (audio, labels) of nsynth."""
    length = 64000
    channels = 1

    pitch_counts = self.get_pitch_counts()
    pitches = sorted(pitch_counts.keys())
    label_index_table = contrib_lookup.index_table_from_tensor(
        sorted(pitches), dtype=tf.int64)

    def _parse_nsynth(record):
        """Parsing function for NSynth dataset."""
        features = {
            'pitch': tf.FixedLenFeature([1], dtype=tf.int64),
            'audio': tf.FixedLenFeature([length], dtype=tf.float32),
            'qualities': tf.FixedLenFeature([10], dtype=tf.int64),
            'instrument_source': tf.FixedLenFeature([1], dtype=tf.int64),
            'instrument_family': tf.FixedLenFeature([1], dtype=tf.int64),
        }

        example = tf.parse_single_example(record, features)
        wave, label = example['audio'], example['pitch']
        wave = spectral_ops.crop_or_pad(wave[tf.newaxis, :, tf.newaxis],
                                        length, channels)[0]
        one_hot_label = tf.one_hot(
            label_index_table.lookup(label), depth=len(pitches))[0]

        return wave, one_hot_label, label, example['instrument_source']

    dataset = self._get_dataset_from_path()
    dataset = dataset.map(_parse_nsynth, num_parallel_calls=4)

    # Filter just specified instrument sources.
    def _is_wanted_source(s):
        return tf.reduce_any(
            list(map(lambda q: tf.equal(s, q)[0], self._instrument_sources)))

    dataset = dataset.filter(lambda w, l, p, s: _is_wanted_source(s))
    # Filter just specified pitches.
    dataset = dataset.filter(
        lambda w, l, p, s: tf.greater_equal(p, self._min_pitch)[0])
    dataset = dataset.filter(
        lambda w, l, p, s: tf.less_equal(p, self._max_pitch)[0])
    dataset = dataset.map(lambda w, l, p, s: (w, l))
    return dataset
def string_to_int_mapper(keys_to_map, mapping, num_oov_buckets=1,
                         suffix="_id"):
    """Creates a mapping function to convert strings to ints in a
    tf.data.Dataset.

    For `dataset` outputs of type `str`, uses the list of strings in the
    given input `mapping` to look up the strings using tf.contrib.lookup and
    convert them to same-shape tensors of size tf.int32.

    Example:
      vocab = ['the', 'fox', 'jumped']
      dataset = dataset.map(string_to_int_mapper(['words'], mapping=vocab))
      dataset['words_id']  # <-- 'the' is mapped to 0, 'fox' to 1, etc...

    Args:
      keys_to_map: List of strings that are keys for tf.string Tensors to
        lookup.
      mapping: List of strings (or string tensors) to do the lookup. If the
        mapping is already a lookup table, then we directly use it.
      num_oov_buckets: Number of OOV buckets to use (default = 1).
      suffix: String to append to the given keys to indicate the mapped
        Tensors.

    Returns:
      _mapper: A mapping function that can be used with the tf.data.Dataset
        API.
    """
    if isinstance(mapping, LookupInterface):
        table = mapping
    else:
        table = contrib_lookup.index_table_from_tensor(
            mapping=mapping, num_oov_buckets=num_oov_buckets)

    def _mapper(dataset):
        for k in keys_to_map:
            dataset[k + suffix] = tf.to_int32(table.lookup(dataset[k]))
        return dataset

    return _mapper
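# A minimal end-to-end sketch for string_to_int_mapper (not from the original
# source; TF 1.x assumed, feature names hypothetical). A dataset that uses a
# lookup table needs an initializable iterator rather than a one-shot
# iterator, since the table is a stateful resource.
def _string_to_int_mapper_demo():
    vocab = ['the', 'fox', 'jumped']
    dataset = tf.data.Dataset.from_tensor_slices(
        {'words': [['the', 'fox'], ['jumped', 'over']]})
    dataset = dataset.map(string_to_int_mapper(['words'], mapping=vocab))
    iterator = dataset.make_initializable_iterator()
    element = iterator.get_next()
    with tf.Session() as sess:
        sess.run([tf.tables_initializer(), iterator.initializer])
        # First element -> [0, 1]; the next yields [2, 3], where 3 is the
        # single OOV bucket catching 'over'.
        return sess.run(element['words_id'])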
def make_iterator_from_text_dataset(text_dataset, batch_size, unit_dict,
                                    shuffle=False, bucket_width=-1,
                                    num_cores=4):
    from tensorflow.contrib.lookup import index_table_from_tensor
    table = index_table_from_tensor(mapping=list(unit_dict.values()))

    dataset = tf.data.TextLineDataset(text_dataset)
    dataset = dataset.map(
        lambda line: tf.string_split([line], delimiter='').values)
    dataset = dataset.map(lambda chars: (chars, tf.size(chars)))
    dataset = dataset.map(lambda chars, size: (table.lookup(chars), size))

    if shuffle:
        dataset = dataset.shuffle(buffer_size=1000000,
                                  reshuffle_each_iteration=True)

    def batching_fun(x):
        labels_shape = (
            tf.TensorShape([None]),
            tf.TensorShape([]),
        )
        return x.padded_batch(batch_size=batch_size,
                              padded_shapes=labels_shape)

    if bucket_width == -1:
        dataset = batching_fun(dataset)
    else:
        def key_func(labels, labels_len):
            # labels_len = tf.shape(labels)[0]
            bucket_id = labels_len // bucket_width
            return tf.cast(bucket_id, dtype=tf.int64)

        def reduce_func(unused_key, windowed_dataset):
            return batching_fun(windowed_dataset)

        dataset = dataset.apply(
            tf.data.experimental.group_by_window(key_func=key_func,
                                                 reduce_func=reduce_func,
                                                 window_size=batch_size))

    dataset = dataset.prefetch(128)
    iterator = dataset.make_initializable_iterator()
    labels, labels_len = iterator.get_next()

    return BatchedData(iterator_initializer=iterator.initializer,
                       inputs_filenames=None,
                       labels_filenames=None,
                       inputs=None,
                       payload=None,
                       inputs_length=None,
                       labels=labels,
                       labels_length=labels_len)
def decode_target(target_string):
    table = lookup.index_table_from_tensor(tf.constant(commons.TARGET_LABELS))
    return table.lookup(target_string)
def main():
    input = [["emersoN", "lAke", "aNd", "palmer"],
             ["i", "haVe", "a", "343yaCht123", "m%an", "2543"]]
    sentences_padded, _ = pad_sequences(input, '')
    sentences = tf.constant(sentences_padded)
    lowercase_sentences = lowercase(sentences)

    # Maps '' (padding) to 0 and everything else to 1, so the row sums give
    # the true sequence lengths.
    table = lookup.index_table_from_tensor(mapping=tf.constant(['']),
                                           default_value=1)
    sequence_lengths = tf.reduce_sum(table.lookup(sentences), 1)

    word_table = lookup.index_table_from_file(
        vocabulary_file="data/words.txt", num_oov_buckets=1)
    char_table = lookup.index_table_from_file(
        vocabulary_file="data/chars.txt", default_value=-1)

    sentences_shape = tf.shape(sentences, out_type=tf.int64)

    # We need to remove chars not in vocab
    removed_char_sentences = remove_unknown_chars(sentences, char_table)

    split_words = tf.string_split(tf.reshape(removed_char_sentences, [-1]),
                                  delimiter="")
    dense_split_words = tf.sparse_tensor_to_dense(split_words,
                                                  default_value='')

    max_word_len = tf.gather_nd(split_words.dense_shape, [1])
    chars_shape = tf.concat([sentences_shape, [max_word_len]], 0)
    chars = tf.reshape(dense_split_words, chars_shape)

    word_lengths = tf.reduce_sum(table.lookup(chars), 2)

    word_ids = word_table.lookup(sentences)
    char_ids = char_table.lookup(chars)

    word_mask = tf.sequence_mask(sequence_lengths)
    word_ids = tf.where(word_mask, word_ids, tf.zeros_like(word_ids))

    char_mask = tf.sequence_mask(word_lengths)
    char_ids = tf.where(char_mask, char_ids, tf.zeros_like(char_ids))

    config = Config()

    # build model
    model = NERModel(config)
    model.build()

    dev = CoNLLDataset(config.filename_dev, max_iter=config.max_iter)
    train = CoNLLDataset(config.filename_train, max_iter=config.max_iter)

    batch_size = model.config.batch_size

    # iterate over dataset
    for i, (words, labels) in enumerate(minibatches(train, batch_size)):
        print("Start")
        fd, _ = model.get_feed_dict(words, labels, model.config.lr,
                                    model.config.dropout)
        _, train_loss = model.sess.run([model.train_op, model.loss],
                                       feed_dict=fd)
        print("train loss", train_loss)

        metrics = model.run_evaluate(dev)
        msg = " - ".join(
            ["{} {:04.2f}".format(k, v) for k, v in metrics.items()])
        print(msg)
def construct_input(sequence_feature_map, categorical_values,
                    categorical_seq_feature, feature_value, mode, normalize,
                    momentum, min_value, max_value, input_keep_prob):
    """Constructs the input tensors for the model.

    Args:
      sequence_feature_map: A dictionary of (Sparse)Tensors of dense shape
        [batch_size, max_sequence_length, None] keyed by the feature name.
      categorical_values: Potential values of the categorical_seq_feature.
      categorical_seq_feature: Name of feature of observation code.
      feature_value: Name of feature of observation value.
      mode: The execution mode, as defined in tf.estimator.ModeKeys.
      normalize: Whether to normalize each lab test.
      momentum: For the batch normalization mean and variance will be updated
        as momentum*old_value + (1-momentum) * new_value.
      min_value: Observation values smaller than this will be capped to
        min_value.
      max_value: Observation values larger than this will be capped to
        max_value.
      input_keep_prob: Keep probability for input observation values.

    Returns:
      - diff_delta_time: Tensor of shape [batch_size, max_seq_length, 1] with
        the time difference between consecutive observations, in hours.
      - obs_values: A dense representation of the observation_values where
        obs_values[b, t, :] has at most one non-zero value at the position of
        the corresponding lab test from obs_code_ids, holding the value of
        the lab result. A padded Tensor of shape
        [batch_size, max_sequence_length, vocab_size] of type float32 of
        possibly normalized observation values.
      - indicator: A one-hot encoding of whether a value in obs_values comes
        from observation_values or is just filled in to be 0. A Tensor of
        shape [batch_size, max_sequence_length, vocab_size] and type float32.
    """
    with tf.variable_scope('input'):
        sequence_feature_map = {
            k: tf.sparse_reorder(s) if isinstance(s, tf.SparseTensor) else s
            for k, s in sequence_feature_map.items()
        }
        # Filter out invalid values.
        # For invalid observation values we do this through a sparse retain.
        # This makes sure that the invalid values will not be considered in
        # the normalization.
        observation_values = sequence_feature_map[feature_value]
        observation_code_sparse = sequence_feature_map[categorical_seq_feature]
        # Future work: Create a flag for the missing value indicator.
        valid_values = tf.abs(
            observation_values.values - 9999999.0) > TOLERANCE

        # apply input dropout
        if input_keep_prob < 1.0:
            random_tensor = input_keep_prob
            random_tensor += tf.random_uniform(
                tf.shape(observation_values.values))
            # 0. if [input_keep_prob, 1.0) and 1. if [1.0, 1.0 +
            # input_keep_prob)
            dropout_mask = tf.floor(random_tensor)
            if mode == tf.estimator.ModeKeys.TRAIN:
                valid_values = tf.to_float(valid_values) * dropout_mask
                valid_values = valid_values > 0.5

        sequence_feature_map[feature_value] = tf.sparse_retain(
            observation_values, valid_values)
        sequence_feature_map[categorical_seq_feature] = tf.sparse_retain(
            observation_code_sparse, valid_values)

        # 1. Construct the sequence of observation values to feed into the
        #    RNN and their indicator.
        # We assign each observation code an id from 0 to vocab_size-1. At
        # each timestep we will lookup the id for the observation code, take
        # the value of the lab test and construct a vector with all zeros but
        # the id-th position set to the lab test value.
        obs_code = sequence_feature_map[categorical_seq_feature]
        obs_code_dense_ids = contrib_lookup.index_table_from_tensor(
            tuple(categorical_values), num_oov_buckets=0,
            name='vocab_lookup').lookup(obs_code.values)
        obs_code_sparse = tf.SparseTensor(
            values=obs_code_dense_ids,
            indices=obs_code.indices,
            dense_shape=obs_code.dense_shape)
        obs_code_sparse = tf.sparse_reorder(obs_code_sparse)

        observation_values = sequence_feature_map[feature_value]
        observation_values = tf.sparse_reorder(observation_values)

        vocab_size = len(categorical_values)
        obs_values, indicator = combine_observation_code_and_values(
            obs_code_sparse, observation_values, vocab_size, mode, normalize,
            momentum, min_value, max_value)

        # 2. We compute the diff_delta_time as additional sequence feature.
        # Note, the LSTM is very sensitive to how you encode time.
        delta_time = sequence_feature_map['deltaTime']
        diff_delta_time = tf.concat(
            [delta_time[:, :1, :], delta_time[:, :-1, :]],
            axis=1) - delta_time
        diff_delta_time = tf.to_float(diff_delta_time) / (60.0 * 60.0)

    return (diff_delta_time, obs_values, indicator)
def get_label_ids(label_tensor, labels_names):
    """Convert a string tensor of label names into an int32 tensor of label
    indices (same shape)."""
    label_lookup = lookup.index_table_from_tensor(labels_names,
                                                  default_value=0)
    label_ids = label_lookup.lookup(label_tensor)
    return tf.cast(label_ids, tf.int32)
def _lookup_key(key, key_vocab):
    table = lookup.index_table_from_tensor(key_vocab, default_value=-1)
    key_indices = table.lookup(key)
    # default_value=-1 marks out-of-vocab keys; the assertion below turns any
    # such miss into a runtime error instead of a silent -1 index.
    with tf.control_dependencies([tf.assert_non_negative(key_indices)]):
        return tf.identity(key_indices)
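# A minimal behavior sketch for _lookup_key (not from the original source;
# TF 1.x assumed, vocab hypothetical). A known key returns its index; an
# out-of-vocab key would map to -1 and trip assert_non_negative at run time.
def _lookup_key_demo():
    indices = _lookup_key(tf.constant(['b']), tf.constant(['a', 'b']))
    with tf.Session() as sess:
        sess.run(tf.tables_initializer())
        # -> [1]; looking up ['z'] instead would raise InvalidArgumentError.
        return sess.run(indices)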
def create_sprites_dataset(characters, actions, directions, channels=3,
                           length=8, shuffle=False, fake_data=False):
    """Creates a tf.data pipeline for the sprites dataset.

    Args:
      characters: A list of (skin, hair, top, pants) tuples containing
        relative paths to the sprite png image for each attribute.
      actions: A list of Actions.
      directions: A list of Directions.
      channels: Number of image channels to yield.
      length: Desired length of the sequences.
      shuffle: Whether or not to shuffle the characters and sequences start
        frame.
      fake_data: Boolean for whether or not to yield synthetic data.

    Returns:
      A tf.data.Dataset yielding (seq, skin label index, hair label index,
      top label index, pants label index, action label index, skin label
      name, hair label name, top label name, pants label name, action label
      name) tuples.
    """
    if fake_data:
        dummy_image = tf.random.normal([HEIGHT, WIDTH, CHANNELS])
    else:
        basedir = download_sprites()

    action_names = [action.name for action in actions]
    action_metadata = [(action.start_row, action.frames)
                       for action in actions]
    direction_rows = [direction.row_offset for direction in directions]

    chars = tf.data.Dataset.from_tensor_slices(characters)
    act_names = tf.data.Dataset.from_tensor_slices(action_names).repeat()
    acts_metadata = tf.data.Dataset.from_tensor_slices(
        action_metadata).repeat()
    dir_rows = tf.data.Dataset.from_tensor_slices(direction_rows).repeat()

    if shuffle:
        chars = chars.shuffle(len(characters))

    dataset = tf.data.Dataset.zip((chars, act_names, acts_metadata, dir_rows))

    skin_table = contrib_lookup.index_table_from_tensor(sorted(SKIN_COLORS))
    hair_table = contrib_lookup.index_table_from_tensor(sorted(HAIRSTYLES))
    top_table = contrib_lookup.index_table_from_tensor(sorted(TOPS))
    pants_table = contrib_lookup.index_table_from_tensor(sorted(PANTS))
    action_table = contrib_lookup.index_table_from_tensor(
        sorted(action_names))

    def process_example(attrs, act_name, act_metadata, dir_row_offset):
        """Processes a dataset row."""
        skin_name = attrs[0]
        hair_name = attrs[1]
        top_name = attrs[2]
        pants_name = attrs[3]

        if fake_data:
            char = dummy_image
        else:
            skin = read_image(basedir + os.sep + skin_name)
            hair = read_image(basedir + os.sep + hair_name)
            top = read_image(basedir + os.sep + top_name)
            pants = read_image(basedir + os.sep + pants_name)
            char = create_character(skin, hair, top, pants)

        if shuffle:
            seq = create_random_seq(char, act_metadata, dir_row_offset,
                                    length)
        else:
            seq = create_seq(char, act_metadata, dir_row_offset, length)
        seq = seq[..., :channels]  # limit output channels

        skin_idx = skin_table.lookup(skin_name)
        hair_idx = hair_table.lookup(hair_name)
        top_idx = top_table.lookup(top_name)
        pants_idx = pants_table.lookup(pants_name)
        act_idx = action_table.lookup(act_name)
        return (seq, skin_idx, hair_idx, top_idx, pants_idx, act_idx,
                skin_name, hair_name, top_name, pants_name, act_name)

    dataset = dataset.map(process_example)
    return dataset
def preprocessing_fn(inputs):
    table = lookup.index_table_from_tensor(['a', 'b'])
    integerized = table.lookup(inputs['x'])
    return {'integerized': integerized}
def convert_label(label):
    table = lookup.index_table_from_tensor(['>50K', '<=50K'])
    return table.lookup(label)
def init_predict_graph(self):
    """
    init predict model graph
    :return:
    """
    # split 1-D String dense Tensor to words SparseTensor
    self.input_sentences = tf.placeholder(dtype=tf.string, shape=[None],
                                          name='input_sentences')
    sparse_words = tf.string_split(self.input_sentences, delimiter=' ')

    # slice SparseTensor
    valid_indices = tf.less(sparse_words.indices,
                            tf.constant([self.num_steps], dtype=tf.int64))
    valid_indices = tf.reshape(
        tf.split(valid_indices, [1, 1], axis=1)[1], [-1])
    valid_sparse_words = tf.sparse_retain(sparse_words, valid_indices)

    excess_indices = tf.greater_equal(
        sparse_words.indices, tf.constant([self.num_steps], dtype=tf.int64))
    excess_indices = tf.reshape(
        tf.split(excess_indices, [1, 1], axis=1)[1], [-1])
    excess_sparse_words = tf.sparse_retain(sparse_words, excess_indices)

    # compute sentences lengths
    int_values = tf.ones(shape=tf.shape(valid_sparse_words.values),
                         dtype=tf.int64)
    int_valid_sparse_words = tf.SparseTensor(
        indices=valid_sparse_words.indices,
        values=int_values,
        dense_shape=valid_sparse_words.dense_shape)
    input_sentences_lengths = tf.sparse_reduce_sum(int_valid_sparse_words,
                                                   axis=1)

    # sparse to dense
    default_padding_word = self.data_utils._START_VOCAB[0]
    words = tf.sparse_to_dense(
        sparse_indices=valid_sparse_words.indices,
        output_shape=[valid_sparse_words.dense_shape[0], self.num_steps],
        sparse_values=valid_sparse_words.values,
        default_value=default_padding_word)

    # dict words to ids
    with open(os.path.join(self.vocab_path, 'words_vocab.txt'),
              encoding='utf-8', mode='rt') as data_file:
        words_table_list = [
            line.strip() for line in data_file if line.strip()
        ]
    words_table_tensor = tf.constant(words_table_list, dtype=tf.string)
    words_table = lookup.index_table_from_tensor(
        mapping=words_table_tensor,
        default_value=self.data_utils._START_VOCAB_ID[3])
    # words_table = lookup.index_table_from_file(os.path.join(vocab_path, 'words_vocab.txt'), default_value=3)
    words_ids = words_table.lookup(words)

    # blstm model predict
    with tf.variable_scope('model', reuse=None):
        logits = self.sequence_labeling_model.inference(
            words_ids, input_sentences_lengths, self.num_classes,
            is_training=False)

    if self.use_crf:
        logits = tf.reshape(logits,
                            shape=[-1, self.num_steps, self.num_classes])
        transition_params = tf.get_variable(
            "transitions", [self.num_classes, self.num_classes])
        input_sentences_lengths = tf.to_int32(input_sentences_lengths)
        predict_labels_ids, sequence_scores = crf.crf_decode(
            logits, transition_params, input_sentences_lengths)
        predict_labels_ids = tf.to_int64(predict_labels_ids)
        sequence_scores = tf.reshape(sequence_scores, shape=[-1, 1])
        normalized_sequence_scores = self.tensorflow_utils.score_normalize(
            sequence_scores)
        predict_scores = tf.matmul(
            normalized_sequence_scores,
            tf.ones(shape=[1, self.num_steps], dtype=tf.float32))
    else:
        props = tf.nn.softmax(logits)
        max_prop_values, max_prop_indices = tf.nn.top_k(props, k=1)
        predict_labels_ids = tf.reshape(max_prop_indices,
                                        shape=[-1, self.num_steps])
        predict_labels_ids = tf.to_int64(predict_labels_ids)
        predict_scores = tf.reshape(max_prop_values,
                                    shape=[-1, self.num_steps])
    predict_scores = tf.as_string(predict_scores, precision=3)

    # dict ids to labels
    with open(os.path.join(self.vocab_path, 'labels_vocab.txt'),
              encoding='utf-8', mode='rt') as data_file:
        labels_table_list = [
            line.strip() for line in data_file if line.strip()
        ]
    labels_table_tensor = tf.constant(labels_table_list, dtype=tf.string)
    labels_table = lookup.index_to_string_table_from_tensor(
        mapping=labels_table_tensor, default_value=self.default_label)
    # labels_table = lookup.index_to_string_table_from_file(os.path.join(vocab_path, 'labels_vocab.txt'), default_value='O')
    predict_labels = labels_table.lookup(predict_labels_ids)

    sparse_predict_labels = self.tensorflow_utils.sparse_concat(
        predict_labels, valid_sparse_words, excess_sparse_words,
        self.default_label)
    sparse_predict_scores = self.tensorflow_utils.sparse_concat(
        predict_scores, valid_sparse_words, excess_sparse_words, '0.0')

    self.format_predict_labels = self.tensorflow_utils.sparse_string_join(
        sparse_predict_labels, 'predict_labels')
    self.format_predict_scores = self.tensorflow_utils.sparse_string_join(
        sparse_predict_scores, 'predict_scores')

    saver = tf.train.Saver()
    tables_init_op = tf.tables_initializer()
    self.sess = tf.Session()
    self.sess.run(tables_init_op)
    ckpt = tf.train.get_checkpoint_state(self.checkpoint_path)
    if ckpt and ckpt.model_checkpoint_path:
        print('read model from {}'.format(ckpt.model_checkpoint_path))
        saver.restore(self.sess, ckpt.model_checkpoint_path)
    else:
        print('No checkpoint file found at %s' % self.checkpoint_path)
        return