import json
import random
import threading
import traceback

import numpy as np
import tensorflow as tf

# NOTE: `Symbol` is assumed to come from a local module of this project;
# adjust the import path to match the actual layout.
from symbol import Symbol


def test_symbol(vocab_path, data_path, poly_dict_path):
    '''Prints the encoded input/target sequences and poly mask for every example.'''
    symbol = Symbol(vocab_path, poly_dict_path)
    with open(data_path, "r") as json_file:
        data = json.load(json_file)
    metadata = list(zip(data["features"], data["labels"]))
    for meta in metadata:
        input_data = np.asarray(symbol.feature_to_sequence(meta[0]), dtype=np.float32)
        target_data = np.asarray(symbol.label_to_sequence(meta[1]), dtype=np.float32)
        poly_mask = symbol.poly_mask(meta[0])
        print(meta[0])
        print(meta[1])
        print("input_data" + "=" * 50)
        print(input_data)
        print("target_data" + "=" * 50)
        print(target_data)
        print("poly_mask" + "=" * 50)
        print(poly_mask)
        print("sequence to feature value " + "=" * 50)
        print(symbol.input_to_word_value(input_data))
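# Example invocation of the sanity check above (the paths are hypothetical;
# substitute the project's actual vocab, data, and poly-dict files):
#
#   test_symbol("data/vocab.txt", "data/train.json", "data/poly_dict.json")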
class DataFeeder(threading.Thread):
    '''Feeds batches of data into a queue on a background thread.'''

    def __init__(self, coordinator, hparams, shuffle):
        super(DataFeeder, self).__init__()
        self._coord = coordinator
        self._hparams = hparams
        hp = self._hparams
        self._offset = 0
        self._symbol = Symbol(hp.vocab_path, hp.poly_dict_path)
        self.input_dim = self._symbol.input_dim
        self.num_class = self._symbol.num_class
        self.shuffle = shuffle

        with open(hp.data_path, "r") as json_file:
            data = json.load(json_file)
        # Hold out the first `eval_size` examples for evaluation; train on the rest.
        self._eval_feature = data["features"][:hp.eval_size]
        self._eval_label = data["labels"][:hp.eval_size]
        self._metadata = list(
            zip(data["features"][hp.eval_size:], data["labels"][hp.eval_size:]))
        self.num_samples = len(self._metadata)

        self._placeholders = [
            tf.placeholder(tf.float32, [None, None, self.input_dim], 'inputs'),
            tf.placeholder(tf.int32, [None], 'target_lengths'),
            tf.placeholder(tf.float32, [None, None, self.num_class], 'targets'),
            tf.placeholder(tf.float32, [None, None, self.num_class], 'poly_mask')
        ]

        # Create queue for buffering data:
        self.queue = tf.FIFOQueue(
            hp.queue_capacity,
            [tf.float32, tf.int32, tf.float32, tf.float32],
            name='input_queue')
        self._enqueue_op = self.queue.enqueue(self._placeholders)

    def start_in_session(self, session):
        self._session = session
        self.start()

    def run(self):
        try:
            while not self._coord.should_stop():
                self._enqueue_next_group()
        except Exception as e:
            traceback.print_exc()
            self._coord.request_stop(e)

    def dequeue(self):
        (inputs, target_lengths, targets, poly_mask) = self.queue.dequeue()
        inputs.set_shape(self._placeholders[0].shape)
        target_lengths.set_shape(self._placeholders[1].shape)
        targets.set_shape(self._placeholders[2].shape)
        poly_mask.set_shape(self._placeholders[3].shape)
        return inputs, target_lengths, targets, poly_mask

    def _enqueue_next_group(self):
        # Read a group of examples:
        batch_size = self._hparams.batch_size
        batches_per_group = self._hparams.queue_capacity
        examples = [
            self._get_next_example()
            for i in range(batch_size * batches_per_group)
        ]

        # Sort by input sequence length so each batch buckets similarly sized
        # examples (less padding), then shuffle the batches themselves:
        if self.shuffle:
            examples.sort(key=lambda x: x[-2])
        batches = [
            examples[i:i + batch_size]
            for i in range(0, len(examples), batch_size)
        ]
        if self.shuffle:
            random.shuffle(batches)

        for batch in batches:
            feed_dict = dict(
                zip(self._placeholders, _prepare_batch(batch, self.shuffle)))
            self._session.run(self._enqueue_op, feed_dict=feed_dict)

    def _get_next_example(self):
        '''Loads a single example (input_data, target_data, input_length, poly_mask)
        from the in-memory metadata.'''
        if self._offset >= len(self._metadata):
            self._offset = 0
            if self.shuffle:
                random.shuffle(self._metadata)
        meta = self._metadata[self._offset]
        self._offset += 1

        # TODO
        input_data = np.asarray(self._symbol.feature_to_sequence(meta[0]), dtype=np.float32)
        target_data = np.asarray(self._symbol.label_to_sequence(meta[1]), dtype=np.float32)
        poly_mask = np.asarray(self._symbol.poly_mask(meta[0]), dtype=np.float32)
        return (input_data, target_data, input_data.shape[0], poly_mask)
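# `_prepare_batch` is called in `_enqueue_next_group` but not defined in this
# module. Below is a minimal sketch, assuming the Tacotron-style convention of
# zero-padding every example in a batch to the longest sequence; the helper
# name `_pad` and the padding scheme are assumptions, not the project's
# confirmed implementation.
def _prepare_batch(batch, shuffle):
    '''Collates a list of (input_data, target_data, input_length, poly_mask)
    tuples into padded arrays, in the order expected by `self._placeholders`.'''
    if shuffle:
        random.shuffle(batch)
    input_lengths = np.asarray([x[2] for x in batch], dtype=np.int32)
    max_len = max(input_lengths)
    # Each per-example array is [T, D]; pad the time axis to max_len and
    # stack into [batch, max_len, D].
    inputs = np.stack([_pad(x[0], max_len) for x in batch])
    targets = np.stack([_pad(x[1], max_len) for x in batch])
    poly_masks = np.stack([_pad(x[3], max_len) for x in batch])
    return (inputs, input_lengths, targets, poly_masks)


def _pad(x, length):
    '''Zero-pads a [T, D] array along the time axis up to `length` steps.'''
    return np.pad(x, [(0, length - x.shape[0]), (0, 0)], mode='constant')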