Example #1
def non_batched_dataset(train_dev_or_test,
                        label_vocab,
                        data_root_dir=DATA_ROOT_DIR):
    """Constructs a dataset of examples.

  Args:
    train_dev_or_test: one of DATA_FOLD_VALUES. The source examples to load into
      a dataset.
    label_vocab: list of string.
    data_root_dir: path to tfrecord examples.

  Returns:
    tf.data.Dataset, where each example is of form
    {
        SEQUENCE_KEY: one-hot of amino acid characters
        SEQUENCE_LENGTH_KEY: length of sequence
        SEQUENCE_ID_KEY: unique identifier for protein
        LABEL_KEY: rank-1 tensor of integer labels from label_vocab,
    }
  """
    if train_dev_or_test not in DATA_FOLD_VALUES:
        raise ValueError(('Only train, dev, test and * are supported datasets.'
                          ' Received {}.').format(train_dev_or_test))
    dataset_files = [
        os.path.join(data_root_dir, f)
        for f in tf.gfile.ListDirectory(data_root_dir)
        if train_dev_or_test in f and ".tfrecord" in f
    ]

    tfrecord_dataset = tf.data.TFRecordDataset(dataset_files)

    dataset = tfrecord_dataset.map(lambda record: tf.io.parse_single_example(  # pylint: disable=g-long-lambda
        record, DATASET_FEATURES))
    dataset = dataset.map(_add_sequence_length)
    dataset = dataset.filter(_is_sequence_short_enough_for_training)

    amino_acid_table = contrib_lookup.index_table_from_tensor(
        utils.AMINO_ACID_VOCABULARY,
        default_value=len(utils.AMINO_ACID_VOCABULARY))
    protein_class_table = contrib_lookup.index_table_from_tensor(
        mapping=label_vocab)

    dataset = dataset.map(
        lambda ex: _map_sequence_to_ints(ex, amino_acid_table))
    dataset = dataset.map(
        lambda ex: _map_labels_to_ints(ex, protein_class_table))
    dataset = dataset.map(_to_one_hot_sequence)

    if train_dev_or_test == TRAIN_FOLD:
        dataset = dataset.repeat()

    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset
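Note how the `amino_acid_table` above maps any residue outside AMINO_ACID_VOCABULARY to the extra index `len(AMINO_ACID_VOCABULARY)`: `default_value` controls what out-of-vocabulary keys return, and the table must be initialized before it is used. A minimal, self-contained sketch of that behavior (assumes TF 1.x with tf.contrib available; the vocabulary here is made up, not the one from the example):

import tensorflow as tf
from tensorflow.contrib import lookup

vocab = ['A', 'C', 'G']  # hypothetical vocabulary
table = lookup.index_table_from_tensor(vocab, default_value=len(vocab))
ids = table.lookup(tf.constant(['C', 'X']))  # 'X' is out of vocabulary

with tf.Session() as sess:
    # Lookup tables are graph resources and must be initialized before use.
    sess.run(tf.tables_initializer())
    print(sess.run(ids))  # [1 3] -- the unknown key falls back to index 3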
Example #2
def embeddings(features, configs, embedding_size=None):
    assert (embedding_size is not None) and isinstance(embedding_size, int)

    embedded = {}
    with tf.variable_scope('embeddings'):
        for config in configs:
            embeddings = tf.get_variable(
                name=config.name,
                shape=[config.length, embedding_size],
                dtype=tf.float32,
                trainable=True,
                initializer=tf.random_uniform_initializer(minval=-0.5,
                                                          maxval=0.5,
                                                          dtype=tf.float32),
            )

            if config.table is not None:
                lookup_table = lookup.index_table_from_tensor(
                    mapping=tf.constant(config.table, dtype=tf.string),
                    default_value=0)

                for field in config.fields:
                    embedded[field] = tf.nn.embedding_lookup(
                        embeddings, lookup_table.lookup(features[field]))

            else:
                for field in config.fields:
                    embedded[field] = tf.nn.embedding_lookup(
                        embeddings, features[field])

    return embedded
Example #3
def create_vocab_lookup_tables(vocab):
    str_to_int = lookup.index_table_from_tensor(
        mapping=vocab,
        num_oov_buckets=0,
        default_value=OOV_TOKEN_ID,
        name='vocab_lookup_str_to_int'
    )

    int_to_str = lookup.index_to_string_table_from_tensor(
        mapping=vocab,
        default_value=UNKNOWN_TOKEN,
        name='vocab_lookup_int_to_str'
    )

    word2id = {w: i for i, w in enumerate(vocab)}

    vocab_lookup = {
        INT_TO_STR: int_to_str,
        STR_TO_INT: str_to_int,
        RAW_WORD2ID: word2id,
        RAW_ID2WORD: vocab
    }

    graph_utils.add_dict_to_collection(vocab_lookup, VOCAB_LOOKUP_COLL_NAME)

    return vocab_lookup
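Example #3 pairs a string-to-index table with the reverse index-to-string table built from the same vocabulary, so ids can be decoded back to tokens. A small round-trip sketch (assumes TF 1.x with tf.contrib; the vocab and token names are illustrative, not from the example):

import tensorflow as tf
from tensorflow.contrib import lookup

vocab = tf.constant(['<unk>', 'hello', 'world'])  # hypothetical vocabulary
str_to_int = lookup.index_table_from_tensor(mapping=vocab, default_value=0)
int_to_str = lookup.index_to_string_table_from_tensor(mapping=vocab,
                                                      default_value='<unk>')

ids = str_to_int.lookup(tf.constant(['world', 'hello']))
words = int_to_str.lookup(ids)

with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    print(sess.run(ids))    # [2 1]
    print(sess.run(words))  # [b'world' b'hello']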
Example #4
    def test_table_roundtrip(self):
        export_path = os.path.join(tempfile.mkdtemp(), 'export')

        with tf.Graph().as_default():
            with tf.Session().as_default() as session:
                input_string = tf.placeholder(tf.string)
                # Map string through a table, in this case based on a constant tensor.
                table = lookup.index_table_from_tensor(
                    tf.constant(['cat', 'dog', 'giraffe']))
                output = table.lookup(input_string)
                inputs = {'input': input_string}
                outputs = {'output': output}
                saved_transform_io.write_saved_transform_from_session(
                    session, inputs, outputs, export_path)

        with tf.Graph().as_default():
            with tf.Session().as_default() as session:
                # Using a computed input gives confidence that the graphs are fused.
                input_string = tf.constant('dog')
                inputs = {'input': input_string}
                outputs = saved_transform_io.apply_saved_transform(
                    export_path, inputs)
                session.run(tf.tables_initializer())
                result = session.run(outputs['output'])
                self.assertEqual(1, result)
Example #5
    def __init__(self,
                 speaker_list,
                 file_pattern,
                 T=2**12,
                 batch_size=1,
                 num_epoch=None,
                 buffer_size=4000):
        ''' `T`: sequence length '''
        with tf.device('/cpu'):
            with tf.name_scope('ByteInputPipeline'):
                self.speaker_list = tf.constant(speaker_list)
                self.table = index_table_from_tensor(mapping=self.speaker_list)
                self.T = T

                filenames = tf.gfile.Glob(file_pattern)
                if filenames:
                    print('Data Loader: {} files found\n'.format(
                        len(filenames)))
                else:
                    raise ValueError('No files found: {}'.format(file_pattern))

                dataset = (tf.data.TFRecordDataset(filenames).map(
                    self._parse_function).shuffle(buffer_size).batch(
                        batch_size).repeat(num_epoch))
                self.iterator = dataset.make_initializable_iterator()
                self.x, self.y = self.iterator.get_next()
Example #6
    def testCreatePhasesWithUnwrappedTable(self):
        # Create a graph with a table that is not wrapped in `apply_function`.
        string_placeholder = tf.placeholder(tf.string, shape=(None, ))
        table = lookup.index_table_from_tensor(['a', 'b'])
        table.lookup(string_placeholder)

        with self.assertRaisesRegexp(ValueError, 'Found table initializers'):
            impl_helper.create_phases()
Example #7
 def make_item2id(self, seq_list):
     count_dict = collections.Counter(
         reduce(lambda seq1, seq2: seq1 + seq2, seq_list))
     item_mapper = list(
         map(lambda i: i[0] if i[1] >= self.min_count else '0',
             count_dict.items()))
     item2id = index_table_from_tensor(mapping=item_mapper, default_value=0)
     self.vocab_size = item2id.size()
     return item2id
Example #8
    def add_vocab_lookups(self):
        with open(self.config.filename_words) as f:
            words = [word.strip() for word in f]

        with open(self.config.filename_tags) as f:
            labels = [label.strip() for label in f]

        with open(self.config.filename_chars) as f:
            chars = [char.strip() for char in f]

        self.label_list = tf.constant(labels)

        self.word_table = lookup.index_table_from_tensor(
            mapping=words, default_value=words.index(UNK))
        self.char_table = lookup.index_table_from_tensor(mapping=chars,
                                                         default_value=-1)
        self.label_table = lookup.index_table_from_tensor(
            mapping=self.label_list, num_oov_buckets=1)
Example #9
def fasta_indexer():
  """Get a function for converting tokenized protein strings to indices."""
  mapping = tf.constant(FULL_RESIDUE_VOCAB)
  table = contrib_lookup.index_table_from_tensor(mapping)

  def mapper(residues):
    return tf.ragged.map_flat_values(table.lookup, residues)

  return mapper
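Example #9 applies the lookup through `tf.ragged.map_flat_values`, which runs the table over the flat values of a RaggedTensor while keeping the ragged row structure. A sketch of the same pattern (assumes a TF 1.x build that ships both tf.contrib and tf.ragged; the four-letter vocab below is a made-up stand-in for FULL_RESIDUE_VOCAB):

import tensorflow as tf
from tensorflow.contrib import lookup as contrib_lookup

# Hypothetical stand-in for FULL_RESIDUE_VOCAB.
table = contrib_lookup.index_table_from_tensor(tf.constant(['A', 'C', 'G', 'T']))
residues = tf.ragged.constant([['A', 'C'], ['T']])

# map_flat_values applies the lookup to the flat values only, so the row
# partitioning of the ragged input is preserved in the output.
ids = tf.ragged.map_flat_values(table.lookup, residues)

with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    print(sess.run(ids))  # rows [[0, 1], [3]]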
Example #10
    def __init__(self,
                 split_name,
                 preprocess_fn,
                 num_epochs,
                 shuffle,
                 random_seed=None,
                 filter_filename=None,
                 drop_remainder=True,
                 sup=False):
        """Initialize the dataset object.

    Args:
      split_name: A string split name, to load from the dataset.
      preprocess_fn: Preprocess a single example. The example is already
        parsed into a dictionary.
      num_epochs: An int, defaults to `None`. Number of epochs to cycle
        through the dataset before stopping. If set to `None` this will read
        samples indefinitely.
      shuffle: A boolean, defaults to `False`. Whether output data are
        shuffled.
      random_seed: Optional int. Random seed for shuffle operation.
      filter_filename: Optional filename to use for filtering.
      drop_remainder: If true, then the last incomplete batch is dropped.
    """
        # This is an instance-variable instead of a class-variable because it
        # depends on FLAGS, which is not parsed yet at class-parse-time.
        dataset_dir = FLAGS.sup_dataset_dir if sup else FLAGS.unsup_dataset_dir
        files = os.path.join(os.path.expanduser(dataset_dir), '%s@%i')
        filenames = {
            'train': generate_sharded_filenames(files % ('train', 1024))[:-40],
            'val': generate_sharded_filenames(files % ('train', 1024))[-40:],
            'trainval': generate_sharded_filenames(files % ('train', 1024)),
            'test': generate_sharded_filenames(files % ('validation', 128))
        }

        super(DatasetWalmartFashion,
              self).__init__(filenames=filenames[split_name],
                             reader=tf.data.TFRecordDataset,
                             num_epochs=num_epochs,
                             shuffle=shuffle,
                             random_seed=random_seed,
                             filter_fn=self.get_filter()
                             if filter_filename is not None else None,
                             drop_remainder=drop_remainder)
        self.split_name = split_name
        self.preprocess_fn = preprocess_fn

        self.filename_list = None
        if filter_filename is not None:
            with tf.gfile.Open(filter_filename, 'r') as f:
                filename_list = json.load(f)
                filename_list = tf.constant(filename_list['values'])
                filename_list = index_table_from_tensor(mapping=filename_list,
                                                        num_oov_buckets=0,
                                                        default_value=-1)
            self.filename_list = filename_list
Example #11
    def _map_to_int(x):
        """Maps string tensor into indexes using vocab.

    Args:
      x : a Tensor/SparseTensor of string.
    Returns:
      a Tensor/SparseTensor of indexes (int) of the same shape as x.
    """
        table = lookup.index_table_from_tensor(vocab, default_value=len(vocab))
        return table.lookup(x)
Example #12
def get_vocab_lookup(vocab, name=None, reuse=None):
    with tf.variable_scope(name, 'vocab_lookup', reuse=reuse):
        vocab_lookup = lookup.index_table_from_tensor(
            mapping=vocab,
            num_oov_buckets=0,
            default_value=OOV_TOKEN_ID,
            name=name
        )

    return vocab_lookup
Example #13
 def convert_label(label):
   """Parses a string tensor into the label tensor
   Args:
     label_string_tensor: Tensor of dtype string. Result of parsing the
     CSV column specified by LABEL_COLUMN
   Returns:
     A Tensor of the same shape as label_string_tensor, should return
     an int64 Tensor representing the label index for classification tasks
   """
   table = lookup.index_table_from_tensor(['<=50K', '>50K'])
   return table.lookup(label)
Example #14
def get_word_embeddings(word_tensor, embedding_words, embedding_vectors):
    """Convert a string tensor of words into a float32 tensor of word embeddings."""
    word_lookup = lookup.index_table_from_tensor(embedding_words, default_value=0)
    word_ids = word_lookup.lookup(word_tensor)

    word_embeddings = tf.concat((
        tf.get_variable(name="unknown_word_embedding", initializer=embedding_vectors[:1], trainable=True),
        tf.get_variable(name="known_word_embeddings", initializer=embedding_vectors[1:], trainable=False)
    ), axis=0)

    return tf.nn.embedding_lookup(word_embeddings, word_ids)
Example #15
def convert_label(label):
    """Parses a string label tensor into an integer label tensor.

    Args:
      label: Tensor of dtype string. Result of parsing the CSV column
        specified by LABEL_COLUMN.

    Returns:
      An int64 Tensor of the same shape as `label`, containing the label
      index for classification tasks.
    """
    table = lookup.index_table_from_tensor(['<=50K', '>50K'])
    return table.lookup(label)
Example #16
  def _map_to_int(x):
    """Maps string tensor into indexes using vocab.

    Args:
      x : a Tensor/SparseTensor of string.
    Returns:
      a Tensor/SparseTensor of indexes (int) of the same shape as x.
    """
    table = lookup.index_table_from_tensor(
        vocab,
        default_value=len(vocab))
    return table.lookup(x)
Example #17
def string2index(feature_strings, feature):
    """
    Convert a `Tensor` of type `tf.string` to a corresponding Tensor of ids (`tf.int32`)
    :param feature_strings: string `Tensor`
    :param feature: feature extractor with string to index vocabulary
    :return: feature id Tensor
    """
    with variable_scope('lookup'):
        feats = list(feature.ordered_feats())
        lookup = index_table_from_tensor(mapping=tf.constant(feats),
                                         default_value=feature.unk_index())
        return lookup.lookup(feature_strings)
Example #18
    def __init__(self, speaker_list, filenames, num_epoch=1):
        with tf.device('/cpu'):
            with tf.name_scope('ByteInputPipeline'):
                self.speaker_list = tf.constant(speaker_list)
                self.table = index_table_from_tensor(mapping=self.speaker_list)

                print('{} files found'.format(len(filenames)))
                dataset = (tf.data.TFRecordDataset(filenames).map(
                    self._parse_function).batch(1).repeat(num_epoch))

                self.iterator = dataset.make_initializable_iterator()
                self.x, self.y, self.f, self.w, self.t = self.iterator.get_next(
                )
Example #19
def build_tensorize_text_fn(embeddings):
    """Builds a function to turn text into word/char ids."""
    tbl = contrib_lookup.index_table_from_tensor(
        mapping=embeddings.get_vocab(), num_oov_buckets=1)

    def fn(string_tensor):
        """Builds the output tensor dictionary."""
        out = {}
        if FLAGS.lowercase:
            string_tensor = ops.lowercase_op(string_tensor)
        out["wids"] = tf.to_int32(tbl.lookup(string_tensor))
        out["cids"] = char_utils.batch_word_to_char_ids(string_tensor, 50)
        out["len"] = tf.shape(string_tensor)[-1]
        return out

    return fn
Example #20
    def add_id_lookups(self):
        table = lookup.index_table_from_tensor(mapping=tf.constant(['']),
                                               default_value=1)

        sentences_shape = tf.shape(self.padded_sentences, out_type=tf.int64)

        removed_char_sentences = remove_unknown_chars(self.padded_sentences,
                                                      self.char_table)
        split_words = tf.string_split(tf.reshape(removed_char_sentences, [-1]),
                                      delimiter="")
        dense_split_words = tf.sparse_tensor_to_dense(split_words,
                                                      default_value='')

        max_word_len = tf.gather_nd(split_words.dense_shape, tf.constant([1]))
        chars_shape = tf.concat([sentences_shape, [max_word_len]], 0)

        chars = tf.reshape(dense_split_words, chars_shape)

        self.word_lengths = tf.reduce_sum(table.lookup(chars), 2)

        lowercase_sentences = lowercase(self.padded_sentences)
        sanitised_sentences = tf.regex_replace(lowercase_sentences, '^[0-9]+$',
                                               NUM)

        self.sequence_lengths = tf.reduce_sum(
            table.lookup(sanitised_sentences), 1)

        self.word_ids = self.word_table.lookup(sanitised_sentences)
        self.char_ids = self.char_table.lookup(chars)

        word_mask = tf.sequence_mask(self.sequence_lengths)
        char_mask = tf.sequence_mask(self.word_lengths)

        self.word_ids = tf.where(word_mask, self.word_ids,
                                 tf.zeros_like(self.word_ids))
        self.char_ids = tf.where(char_mask, self.char_ids,
                                 tf.zeros_like(self.char_ids))

        label_lengths = tf.reduce_sum(table.lookup(self.label_codes), 1)
        labels_mask = tf.sequence_mask(label_lengths)
        self.labels = self.label_table.lookup(self.label_codes)
        self.labels = tf.where(labels_mask, self.labels,
                               tf.zeros_like(self.labels))
Example #21
  def provide_dataset(self):
    """Provides dataset (audio, labels) of nsynth."""
    length = 64000
    channels = 1

    pitch_counts = self.get_pitch_counts()
    pitches = sorted(pitch_counts.keys())
    label_index_table = contrib_lookup.index_table_from_tensor(
        sorted(pitches), dtype=tf.int64)

    def _parse_nsynth(record):
      """Parsing function for NSynth dataset."""
      features = {
          'pitch': tf.FixedLenFeature([1], dtype=tf.int64),
          'audio': tf.FixedLenFeature([length], dtype=tf.float32),
          'qualities': tf.FixedLenFeature([10], dtype=tf.int64),
          'instrument_source': tf.FixedLenFeature([1], dtype=tf.int64),
          'instrument_family': tf.FixedLenFeature([1], dtype=tf.int64),
      }

      example = tf.parse_single_example(record, features)
      wave, label = example['audio'], example['pitch']
      wave = spectral_ops.crop_or_pad(wave[tf.newaxis, :, tf.newaxis],
                                      length,
                                      channels)[0]
      one_hot_label = tf.one_hot(
          label_index_table.lookup(label), depth=len(pitches))[0]
      return wave, one_hot_label, label, example['instrument_source']

    dataset = self._get_dataset_from_path()
    dataset = dataset.map(_parse_nsynth, num_parallel_calls=4)

    # Filter just specified instrument sources
    def _is_wanted_source(s):
      return tf.reduce_any(list(map(lambda q: tf.equal(s, q)[0], self._instrument_sources)))
    dataset = dataset.filter(lambda w, l, p, s: _is_wanted_source(s))
    # Filter just specified pitches
    dataset = dataset.filter(lambda w, l, p, s: tf.greater_equal(p, self._min_pitch)[0])
    dataset = dataset.filter(lambda w, l, p, s: tf.less_equal(p, self._max_pitch)[0])
    dataset = dataset.map(lambda w, l, p, s: (w, l))
    return dataset
Example #22
def string_to_int_mapper(keys_to_map,
                         mapping,
                         num_oov_buckets=1,
                         suffix="_id"):
    """Creates a mapping function to convert strings to ints in a tf.data.Dataset.

  For `dataset` outputs of type `str`, uses the list of strings in the given
  input `mapping` to look up the strings using tf.contrib.lookup and convert
  them to same-shape tensors of size tf.int32.

  Example:
    vocab = ['the', 'fox', 'jumped']
    dataset = dataset.map(string_to_int_mapper(['words'], mapping=vocab))
    dataset['words_id']  # <-- 'the' is mapped to 0, 'fox' to 1, etc...

  Args:
    keys_to_map: List of strings that are keys for tf.string Tensors to lookup.
    mapping: List of strings (or string tensors) to do the lookup. If the
        mapping is already a lookup table, then we directly use it.
    num_oov_buckets: Number of OOV buckets to use (default = 1).
    suffix: String to append to the given keys to indicate the mapped Tensors.

  Returns:
    _mapper: A mapping function that can be used with the tf.data.Dataset API.
  """
    if isinstance(mapping, LookupInterface):
        table = mapping
    else:
        table = contrib_lookup.index_table_from_tensor(
            mapping=mapping, num_oov_buckets=num_oov_buckets)

    def _mapper(dataset):
        for k in keys_to_map:
            dataset[k + suffix] = tf.to_int32(table.lookup(dataset[k]))
        return dataset

    return _mapper
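As the docstring of Example #22 notes, with `num_oov_buckets > 0` unknown strings are not mapped to a single default id; they are hashed into the bucket range [len(vocab), len(vocab) + num_oov_buckets), so any embedding matrix indexed by these ids must have len(vocab) + num_oov_buckets rows. A short sketch of that sizing rule (assumes TF 1.x with tf.contrib; the vocab and variable names are illustrative):

import tensorflow as tf
from tensorflow.contrib import lookup as contrib_lookup

vocab = ['the', 'fox', 'jumped']  # hypothetical mapping
num_oov_buckets = 1
table = contrib_lookup.index_table_from_tensor(mapping=vocab,
                                               num_oov_buckets=num_oov_buckets)
ids = tf.to_int32(table.lookup(tf.constant(['fox', 'quick'])))

# The embedding matrix needs len(vocab) + num_oov_buckets rows so that the
# hashed OOV ids stay in range.
embeddings = tf.get_variable('embeddings',
                             shape=[len(vocab) + num_oov_buckets, 8])
vectors = tf.nn.embedding_lookup(embeddings, ids)

with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    print(sess.run(ids))  # [1 3] -- 'quick' hashes into the single OOV bucket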
Example #23
def make_iterator_from_text_dataset(text_dataset,
                                    batch_size,
                                    unit_dict,
                                    shuffle=False,
                                    bucket_width=-1,
                                    num_cores=4):

    from tensorflow.contrib.lookup import index_table_from_tensor
    table = index_table_from_tensor(mapping=list(unit_dict.values()))

    dataset = tf.data.TextLineDataset(text_dataset)
    dataset = dataset.map(
        lambda str: tf.string_split([str], delimiter='').values)
    dataset = dataset.map(lambda chars: (chars, tf.size(chars)))
    dataset = dataset.map(lambda chars, size: (table.lookup(chars), size))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=1000000,
                                  reshuffle_each_iteration=True)

    def batching_fun(x):

        labels_shape = (
            tf.TensorShape([None]),
            tf.TensorShape([]),
        )

        return x.padded_batch(batch_size=batch_size,
                              padded_shapes=(labels_shape))

    if bucket_width == -1:
        dataset = batching_fun(dataset)
    else:

        def key_func(labels, labels_len):
            # labels_len = tf.shape(labels)[0]
            bucket_id = labels_len // bucket_width
            return tf.cast(bucket_id, dtype=tf.int64)

        def reduce_func(unused_key, windowed_dataset):
            return batching_fun(windowed_dataset)

        dataset = tf.data.Dataset.apply(
            dataset,
            tf.data.experimental.group_by_window(key_func=key_func,
                                                 reduce_func=reduce_func,
                                                 window_size=batch_size))

    dataset = dataset.prefetch(128)

    iterator = dataset.make_initializable_iterator()

    labels, labels_len = iterator.get_next()

    return BatchedData(iterator_initializer=iterator.initializer,
                       inputs_filenames=None,
                       labels_filenames=None,
                       inputs=None,
                       payload=None,
                       inputs_length=None,
                       labels=labels,
                       labels_length=labels_len)
Example #24
def decode_target(target_string):
    table = lookup.index_table_from_tensor(tf.constant(commons.TARGET_LABELS))
    return table.lookup(target_string)
Example #25
def main():
    input = [["emersoN", "lAke", "aNd", "palmer"],
             ["i", "haVe", "a", "343yaCht123", "m%an", "2543"]]

    sentences_padded, _ = pad_sequences(input, '')

    sentences = tf.constant(sentences_padded)
    lowercase_sentences = lowercase(sentences)

    table = lookup.index_table_from_tensor(mapping=tf.constant(['']),
                                           default_value=1)

    sequence_lengths = tf.reduce_sum(table.lookup(sentences), 1)

    word_table = lookup.index_table_from_file(vocabulary_file="data/words.txt",
                                              num_oov_buckets=1)

    char_table = lookup.index_table_from_file(vocabulary_file="data/chars.txt",
                                              default_value=-1)

    sentences_shape = tf.shape(sentences, out_type=tf.int64)

    # We need to remove chars not in vocab
    removed_char_sentences = remove_unknown_chars(sentences, char_table)

    split_words = tf.string_split(tf.reshape(removed_char_sentences, [-1]),
                                  delimiter="")
    dense_split_words = tf.sparse_tensor_to_dense(split_words,
                                                  default_value='')

    max_word_len = tf.gather_nd(split_words.dense_shape, [1])
    chars_shape = tf.concat([sentences_shape, [max_word_len]], 0)

    chars = tf.reshape(dense_split_words, chars_shape)

    word_lengths = tf.reduce_sum(table.lookup(chars), 2)

    word_ids = word_table.lookup(sentences)
    char_ids = char_table.lookup(chars)

    word_mask = tf.sequence_mask(sequence_lengths)
    word_ids = tf.where(word_mask, word_ids, tf.zeros_like(word_ids))

    char_mask = tf.sequence_mask(word_lengths)
    char_ids = tf.where(char_mask, char_ids, tf.zeros_like(char_ids))

    config = Config()

    # build model
    model = NERModel(config)
    model.build()
    dev = CoNLLDataset(config.filename_dev, max_iter=config.max_iter)
    train = CoNLLDataset(config.filename_train, max_iter=config.max_iter)

    batch_size = model.config.batch_size

    # iterate over dataset
    for i, (words, labels) in enumerate(minibatches(train, batch_size)):
        print "Start"

        fd, _ = model.get_feed_dict(words, labels, model.config.lr,
                                    model.config.dropout)

        _, train_loss = model.sess.run([model.train_op, model.loss],
                                       feed_dict=fd)

        print "train loss", train_loss

        metrics = model.run_evaluate(dev)
        msg = " - ".join(
            ["{} {:04.2f}".format(k, v) for k, v in metrics.items()])
        print(msg)
Example #26
def construct_input(sequence_feature_map, categorical_values,
                    categorical_seq_feature, feature_value, mode, normalize,
                    momentum, min_value, max_value, input_keep_prob):
  """Returns a function to build the model.

  Args:
    sequence_feature_map: A dictionary of (Sparse)Tensors of dense shape
      [batch_size, max_sequence_length, None] keyed by the feature name.
    categorical_values: Potential values of the categorical_seq_feature.
    categorical_seq_feature: Name of feature of observation code.
    feature_value: Name of feature of observation value.
    mode: The execution mode, as defined in tf.estimator.ModeKeys.
    normalize: Whether to normalize each lab test.
    momentum: For the batch normalization mean and variance will be updated as
      momentum*old_value + (1-momentum) * new_value.
    min_value: Observation values smaller than this will be capped to min_value.
    max_value: Observation values larger than this will be capped to max_value.
    input_keep_prob: Keep probability for input observation values.

  Returns:
    - diff_delta_time: Tensor of shape [batch_size, max_seq_length, 1]
      with the time difference (in hours) to the previous event.
    - obs_values: A dense representation of the observation_values with
                  obs_values[b, t, :] has at most one non-zero value at the
                  position of the corresponding lab test from obs_code_ids with
                  the value of the lab result. A padded Tensor of shape
                  [batch_size, max_sequence_length, vocab_size] of type float32
                  of possibly normalized observation values.
    - indicator: A one-hot encoding of whether a value in obs_values comes from
                 observation_values or is just filled in to be 0. A Tensor of
                 shape [batch_size, max_sequence_length, vocab_size] and type
                 float32.
  """
  with tf.variable_scope('input'):
    sequence_feature_map = {
        k: tf.sparse_reorder(s) if isinstance(s, tf.SparseTensor) else s
        for k, s in sequence_feature_map.items()
    }
    # Filter out invalid values.
    # For invalid observation values we do this through a sparse retain.
    # This makes sure that the invalid values will not be considered in the
    # normalization.
    observation_values = sequence_feature_map[feature_value]
    observation_code_sparse = sequence_feature_map[categorical_seq_feature]
    # Future work: Create a flag for the missing value indicator.
    valid_values = tf.abs(observation_values.values - 9999999.0) > TOLERANCE
    # apply input dropout
    if input_keep_prob < 1.0:
      random_tensor = input_keep_prob
      random_tensor += tf.random_uniform(tf.shape(observation_values.values))
      # 0. if [input_keep_prob, 1.0) and 1. if [1.0, 1.0 + input_keep_prob)
      dropout_mask = tf.floor(random_tensor)
      if mode == tf.estimator.ModeKeys.TRAIN:
        valid_values = tf.to_float(valid_values) * dropout_mask
        valid_values = valid_values > 0.5
    sequence_feature_map[feature_value] = tf.sparse_retain(
        observation_values, valid_values)
    sequence_feature_map[categorical_seq_feature] = tf.sparse_retain(
        observation_code_sparse, valid_values)

    # 1. Construct the sequence of observation values to feed into the RNN
    #    and their indicator.
    # We assign each observation code an id from 0 to vocab_size-1. At each
    # timestep we will lookup the id for the observation code and take the value
    # of the lab test and a construct a vector with all zeros but the id-th
    # position is set to the lab test value.
    obs_code = sequence_feature_map[categorical_seq_feature]
    obs_code_dense_ids = contrib_lookup.index_table_from_tensor(
        tuple(categorical_values), num_oov_buckets=0,
        name='vocab_lookup').lookup(obs_code.values)
    obs_code_sparse = tf.SparseTensor(
        values=obs_code_dense_ids,
        indices=obs_code.indices,
        dense_shape=obs_code.dense_shape)
    obs_code_sparse = tf.sparse_reorder(obs_code_sparse)
    observation_values = sequence_feature_map[feature_value]
    observation_values = tf.sparse_reorder(observation_values)
    vocab_size = len(categorical_values)
    obs_values, indicator = combine_observation_code_and_values(
        obs_code_sparse, observation_values, vocab_size, mode, normalize,
        momentum, min_value, max_value)

    # 2. We compute the diff_delta_time as additional sequence feature.
    # Note, the LSTM is very sensitive to how you encode time.
    delta_time = sequence_feature_map['deltaTime']
    diff_delta_time = tf.concat(
        [delta_time[:, :1, :], delta_time[:, :-1, :]], axis=1) - delta_time
    diff_delta_time = tf.to_float(diff_delta_time) / (60.0 * 60.0)

  return (diff_delta_time, obs_values, indicator)
Example #27
def get_label_ids(label_tensor, labels_names):
    """Convert a string tensor of string label names into an int32 tensor of label indices (same shape)."""
    label_lookup = lookup.index_table_from_tensor(labels_names, default_value=0)
    label_ids = label_lookup.lookup(label_tensor)

    return tf.cast(label_ids, tf.int32)
Example #28
def _lookup_key(key, key_vocab):
    table = lookup.index_table_from_tensor(key_vocab, default_value=-1)
    key_indices = table.lookup(key)
    with tf.control_dependencies([tf.assert_non_negative(key_indices)]):
        return tf.identity(key_indices)
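Example #28 turns silent out-of-vocabulary lookups (which return -1 with the default `default_value`) into a hard runtime error by gating the result behind `tf.assert_non_negative` inside a control dependency. A minimal sketch of the failure mode it guards against (assumes TF 1.x with tf.contrib; keys below are made up):

import tensorflow as tf
from tensorflow.contrib import lookup

table = lookup.index_table_from_tensor(['a', 'b'], default_value=-1)
indices = table.lookup(tf.constant(['a', 'z']))  # 'z' maps to -1

# The assertion only runs when something downstream depends on `checked`.
with tf.control_dependencies([tf.assert_non_negative(indices)]):
    checked = tf.identity(indices)

with tf.Session() as sess:
    sess.run(tf.tables_initializer())
    try:
        print(sess.run(checked))
    except tf.errors.InvalidArgumentError:
        print('lookup produced a negative index, i.e. an unknown key')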
Example #29
def create_sprites_dataset(characters, actions, directions, channels=3,
                           length=8, shuffle=False, fake_data=False):
  """Creates a tf.data pipeline for the sprites dataset.

  Args:
    characters: A list of (skin, hair, top, pants) tuples containing
      relative paths to the sprite png image for each attribute.
    actions: A list of Actions.
    directions: A list of Directions.
    channels: Number of image channels to yield.
    length: Desired length of the sequences.
    shuffle: Whether or not to shuffle the characters and sequences
      start frame.
    fake_data: Boolean for whether or not to yield synthetic data.

  Returns:
    A tf.data.Dataset yielding (seq, skin label index, hair label index,
    top label index, pants label index, action label index, skin label
    name, hair label_name, top label name, pants label name, action
    label name) tuples.
  """
  if fake_data:
    dummy_image = tf.random.normal([HEIGHT, WIDTH, CHANNELS])
  else:
    basedir = download_sprites()

  action_names = [action.name for action in actions]
  action_metadata = [(action.start_row, action.frames) for action in actions]

  direction_rows = [direction.row_offset for direction in directions]

  chars = tf.data.Dataset.from_tensor_slices(characters)
  act_names = tf.data.Dataset.from_tensor_slices(action_names).repeat()
  acts_metadata = tf.data.Dataset.from_tensor_slices(action_metadata).repeat()
  dir_rows = tf.data.Dataset.from_tensor_slices(direction_rows).repeat()

  if shuffle:
    chars = chars.shuffle(len(characters))

  dataset = tf.data.Dataset.zip((chars, act_names, acts_metadata, dir_rows))

  skin_table = contrib_lookup.index_table_from_tensor(sorted(SKIN_COLORS))
  hair_table = contrib_lookup.index_table_from_tensor(sorted(HAIRSTYLES))
  top_table = contrib_lookup.index_table_from_tensor(sorted(TOPS))
  pants_table = contrib_lookup.index_table_from_tensor(sorted(PANTS))
  action_table = contrib_lookup.index_table_from_tensor(sorted(action_names))

  def process_example(attrs, act_name, act_metadata, dir_row_offset):
    """Processes a dataset row."""
    skin_name = attrs[0]
    hair_name = attrs[1]
    top_name = attrs[2]
    pants_name = attrs[3]

    if fake_data:
      char = dummy_image
    else:
      skin = read_image(basedir + os.sep + skin_name)
      hair = read_image(basedir + os.sep + hair_name)
      top = read_image(basedir + os.sep + top_name)
      pants = read_image(basedir + os.sep + pants_name)
      char = create_character(skin, hair, top, pants)

    if shuffle:
      seq = create_random_seq(char, act_metadata, dir_row_offset, length)
    else:
      seq = create_seq(char, act_metadata, dir_row_offset, length)
    seq = seq[..., :channels]  # limit output channels

    skin_idx = skin_table.lookup(skin_name)
    hair_idx = hair_table.lookup(hair_name)
    top_idx = top_table.lookup(top_name)
    pants_idx = pants_table.lookup(pants_name)
    act_idx = action_table.lookup(act_name)

    return (seq, skin_idx, hair_idx, top_idx, pants_idx, act_idx,
            skin_name, hair_name, top_name, pants_name, act_name)

  dataset = dataset.map(process_example)
  return dataset
Example #30
 def preprocessing_fn(inputs):
   table = lookup.index_table_from_tensor(['a', 'b'])
   integerized = table.lookup(inputs['x'])
   return {'integerized': integerized}
Example #31
 def convert_label(label):
     table = lookup.index_table_from_tensor(['>50K', '<=50K'])
     return table.lookup(label)
Example #32
    def init_predict_graph(self):
        """
        init predict model graph
        :return:
        """
        # split 1-D String dense Tensor to words SparseTensor
        self.input_sentences = tf.placeholder(dtype=tf.string,
                                              shape=[None],
                                              name='input_sentences')
        sparse_words = tf.string_split(self.input_sentences, delimiter=' ')

        # slice SparseTensor
        valid_indices = tf.less(sparse_words.indices,
                                tf.constant([self.num_steps], dtype=tf.int64))
        valid_indices = tf.reshape(
            tf.split(valid_indices, [1, 1], axis=1)[1], [-1])
        valid_sparse_words = tf.sparse_retain(sparse_words, valid_indices)

        excess_indices = tf.greater_equal(
            sparse_words.indices, tf.constant([self.num_steps],
                                              dtype=tf.int64))
        excess_indices = tf.reshape(
            tf.split(excess_indices, [1, 1], axis=1)[1], [-1])
        excess_sparse_words = tf.sparse_retain(sparse_words, excess_indices)

        # compute sentences lengths
        int_values = tf.ones(shape=tf.shape(valid_sparse_words.values),
                             dtype=tf.int64)
        int_valid_sparse_words = tf.SparseTensor(
            indices=valid_sparse_words.indices,
            values=int_values,
            dense_shape=valid_sparse_words.dense_shape)
        input_sentences_lengths = tf.sparse_reduce_sum(int_valid_sparse_words,
                                                       axis=1)

        # sparse to dense
        default_padding_word = self.data_utils._START_VOCAB[0]
        words = tf.sparse_to_dense(
            sparse_indices=valid_sparse_words.indices,
            output_shape=[valid_sparse_words.dense_shape[0], self.num_steps],
            sparse_values=valid_sparse_words.values,
            default_value=default_padding_word)

        # dict words to ids
        with open(os.path.join(self.vocab_path, 'words_vocab.txt'),
                  encoding='utf-8',
                  mode='rt') as data_file:
            words_table_list = [
                line.strip() for line in data_file if line.strip()
            ]
        words_table_tensor = tf.constant(words_table_list, dtype=tf.string)
        words_table = lookup.index_table_from_tensor(
            mapping=words_table_tensor,
            default_value=self.data_utils._START_VOCAB_ID[3])
        # words_table = lookup.index_table_from_file(os.path.join(vocab_path, 'words_vocab.txt'), default_value=3)
        words_ids = words_table.lookup(words)

        # blstm model predict
        with tf.variable_scope('model', reuse=None):
            logits = self.sequence_labeling_model.inference(
                words_ids,
                input_sentences_lengths,
                self.num_classes,
                is_training=False)

        if self.use_crf:
            logits = tf.reshape(logits,
                                shape=[-1, self.num_steps, self.num_classes])
            transition_params = tf.get_variable(
                "transitions", [self.num_classes, self.num_classes])
            input_sentences_lengths = tf.to_int32(input_sentences_lengths)
            predict_labels_ids, sequence_scores = crf.crf_decode(
                logits, transition_params, input_sentences_lengths)
            predict_labels_ids = tf.to_int64(predict_labels_ids)
            sequence_scores = tf.reshape(sequence_scores, shape=[-1, 1])
            normalized_sequence_scores = self.tensorflow_utils.score_normalize(
                sequence_scores)
            predict_scores = tf.matmul(
                normalized_sequence_scores,
                tf.ones(shape=[1, self.num_steps], dtype=tf.float32))
        else:
            props = tf.nn.softmax(logits)
            max_prop_values, max_prop_indices = tf.nn.top_k(props, k=1)
            predict_labels_ids = tf.reshape(max_prop_indices,
                                            shape=[-1, self.num_steps])
            predict_labels_ids = tf.to_int64(predict_labels_ids)
            predict_scores = tf.reshape(max_prop_values,
                                        shape=[-1, self.num_steps])
        predict_scores = tf.as_string(predict_scores, precision=3)

        # dict ids to labels
        with open(os.path.join(self.vocab_path, 'labels_vocab.txt'),
                  encoding='utf-8',
                  mode='rt') as data_file:
            labels_table_list = [
                line.strip() for line in data_file if line.strip()
            ]
        labels_table_tensor = tf.constant(labels_table_list, dtype=tf.string)
        labels_table = lookup.index_to_string_table_from_tensor(
            mapping=labels_table_tensor, default_value=self.default_label)
        # labels_table = lookup.index_to_string_table_from_file(os.path.join(vocab_path, 'labels_vocab.txt'), default_value='O')
        predict_labels = labels_table.lookup(predict_labels_ids)

        sparse_predict_labels = self.tensorflow_utils.sparse_concat(
            predict_labels, valid_sparse_words, excess_sparse_words,
            self.default_label)
        sparse_predict_scores = self.tensorflow_utils.sparse_concat(
            predict_scores, valid_sparse_words, excess_sparse_words, '0.0')

        self.format_predict_labels = self.tensorflow_utils.sparse_string_join(
            sparse_predict_labels, 'predict_labels')
        self.format_predict_scores = self.tensorflow_utils.sparse_string_join(
            sparse_predict_scores, 'predict_scores')

        saver = tf.train.Saver()
        tables_init_op = tf.tables_initializer()

        self.sess = tf.Session()
        self.sess.run(tables_init_op)
        ckpt = tf.train.get_checkpoint_state(self.checkpoint_path)
        if ckpt and ckpt.model_checkpoint_path:
            print('read model from {}'.format(ckpt.model_checkpoint_path))
            saver.restore(self.sess, ckpt.model_checkpoint_path)
        else:
            print('No checkpoint file found at %s' % self.checkpoint_path)
            return
Example #33
def decode_target(target_string):
    table = lookup.index_table_from_tensor(tf.constant(commons.TARGET_LABELS))
    return table.lookup(target_string)
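All of the examples above rely on tf.contrib.lookup, which was removed in TensorFlow 2.x. A rough equivalent of `index_table_from_tensor` can be built with the tf.lookup module; this is a sketch assuming TF >= 2.0 (eager mode), not code from any of the examples:

import tensorflow as tf

keys = tf.constant(['cat', 'dog', 'giraffe'])
values = tf.range(tf.size(keys, out_type=tf.int64), dtype=tf.int64)
init = tf.lookup.KeyValueTensorInitializer(keys, values)

# StaticHashTable: unknown keys fall back to default_value, like
# index_table_from_tensor(..., default_value=-1).
table = tf.lookup.StaticHashTable(init, default_value=-1)
print(table.lookup(tf.constant(['dog', 'unicorn'])).numpy())  # [ 1 -1]

# StaticVocabularyTable: unknown keys hash into OOV buckets, like
# index_table_from_tensor(..., num_oov_buckets=1).
oov_table = tf.lookup.StaticVocabularyTable(init, num_oov_buckets=1)
print(oov_table.lookup(tf.constant(['dog', 'unicorn'])).numpy())  # e.g. [1 3]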