def eval_places(self, dataset, data_mode, gpu_frac, workers, mem_dequeing,
                steps=20):
    """
    Runs the model with pretrained weights over individual examples of
    the given data subset, calling test_places_output once per example.

    Args:
        Other arguments can be seen in _train_network
        data_mode: Data subset to use for evaluation
        steps: Number of examples the model is evaluated on.
    Raises:
        RuntimeError: If there are no assign operations to load the
            pretrained weights.
    """
    # Create reader to get TFRecords
    reader = DataReader(dataset)

    with tf.Graph().as_default() as g:
        # Read data one example at a time (batch size 1) and define
        # operations. No metrics are tracked here.
        data, labels = reader.read_batch(1,
                                         data_mode,
                                         mem_dequeing,
                                         workers,
                                         shuffle=True,
                                         train_mode=False)
        main_run = self._build_run_settings(data, labels, [])

        # Operations that load the pretrained weights
        assign_ops = self._load_pretrained()

        # Supervisor is only used to manage the session
        supervisor = tf.train.Supervisor(graph=g)
        session_conf = mu.get_session_config(gpu_frac, log_placement=False)

        with supervisor.managed_session(config=session_conf) as sess:
            # Load model, if existing. Otherwise start from scratch.
            # The returned step and start flag are not needed here.
            # NOTE(review): is_training=True is kept from the original
            # even though this is an evaluation routine -- confirm.
            self._initialize_model(sess,
                                   saver=mu.get_saver(),
                                   is_training=True)

            # This routine is meaningless without pretrained weights
            if len(assign_ops) == 0:
                raise RuntimeError(
                    "Unexpected error: no assign operations to load weights")
            logger.info('Assigning pretrained values to models ...')
            sess.run(assign_ops)

            queues_started = False
            try:
                # Let queues start dequeing examples
                coord, threads = mu.initialize_queues(sess)
                queues_started = True
                for _ in range(steps):
                    self.test_places_output(sess, data, main_run)
            except tf.errors.OutOfRangeError:
                logger.info('Queue run out of evaluation instances')
            finally:
                # Stop the queue threads on every exit path, including
                # input exhaustion, so they are not left running.
                if queues_started:
                    try:
                        mu.finalize_queues(coord, threads)
                    except RuntimeError as e:
                        logger.warning('Error stopping coordinator: %s', e)
def _train_network(self, dataset, logs_path, batch_size, metrics,
                   track_summaries, gpu_frac, workers, mem_dequeing,
                   track_models=None, steps=None, max_steps=None,
                   log_steps=10):
    """
    Runs training on the network defined for the given number of steps
    and stores a checkpoint at the end.

    Args:
        See train_network for other arguments.
        logs_path: Path where to store the network stats
        track_summaries: Steps between stored summaries, None to disable
        track_models: Steps between intermediate checkpoints, None to
            disable. No checkpoint is stored at step 0.
        steps: Number of steps to run on top of the restored step.
        max_steps: Absolute step at which training stops. When both
            steps and max_steps are given, the smaller limit applies.
        log_steps: The training run is asked to log every log_steps
            steps.
    Returns
        step: Step at which training has stopped
        loss: Mean loss in the process
        metrics: Mean metrics values in the process
    """
    # Create reader to get TFRecords
    reader = DataReader(dataset)

    with tf.Graph().as_default() as g:
        # Read data and define operations
        data, labels = reader.read_batch(batch_size, DataMode.TRAINING,
                                         mem_dequeing, workers)
        main_run = self._build_run_settings(data, labels, metrics)

        # Load weights ops for warm start on training
        assign_ops = self._load_pretrained()

        # Prepare logging for Tensorboard
        saver, summary_ops, writer = mu.prepare_logging(logs_path, g)

        # Supervisor is only used to manage the session
        supervisor = tf.train.Supervisor(graph=g)
        session_conf = mu.get_session_config(gpu_frac, log_placement=False)

        with supervisor.managed_session(config=session_conf) as sess:
            # Load model, if existing. Otherwise start from scratch
            step_value, start = self._initialize_model(sess,
                                                       saver=saver,
                                                       is_training=True)

            # Get stopping condition according to mode.
            # NOTE(review): if both steps and max_steps are None the
            # limit is None and the comparison below fails on Python 3;
            # callers are expected to provide at least one of them.
            if steps is not None:
                step_limit = step_value + steps
            else:
                step_limit = max_steps
            if max_steps is not None:
                step_limit = min(step_limit, max_steps)
            stop = step_value >= step_limit

            # Assign weights only if model started from scratch
            if start is True and len(assign_ops) > 0:
                logger.info('Assigning pretrained values to models ...')
                sess.run(assign_ops)

            queues_started = False
            try:
                # Let queues start dequeing examples
                coord, threads = mu.initialize_queues(sess)
                queues_started = True

                while not stop:
                    # Run network
                    log_run = step_value % log_steps == 0
                    main_res = main_run.training_run(sess,
                                                     summary_ops,
                                                     log=log_run)

                    # Track summaries if needed
                    if track_summaries is not None \
                            and step_value % track_summaries == 0:
                        mu.store_summaries(writer, step_value,
                                           main_res.summary_str)

                    # Track models if needed
                    if track_models is not None \
                            and step_value % track_models == 0 \
                            and step_value != 0:
                        mu.store_checkpoint(sess, saver, step_value,
                                            logs_path)

                    # Update current step and stop condition
                    step_value = main_res.step
                    stop = step_value >= step_limit

                # Store model at exit
                mu.store_checkpoint(sess, saver, step_value, logs_path)
            except tf.errors.OutOfRangeError as e:
                logger.warning(
                    'Input queue exhausted due to ' +
                    'unexpected reason: %s.', e)
            finally:
                # Stop the queue threads on every exit path, including
                # input exhaustion, so they are not left running.
                if queues_started:
                    try:
                        mu.finalize_queues(coord, threads)
                    except RuntimeError as e:
                        logger.warning('Error stopping coordinator: %s', e)

        return (step_value, main_run.loss_average(),
                main_run.metrics_average())
def _eval_network(self, dataset, data_mode, logs_path, batch_size, metrics,
                  gpu_frac, workers, mem_dequeing, track_summaries=50,
                  steps=None):
    """
    Runs and tracks an evaluation of the model on the given data subset

    Args:
        Other arguments can be seen in train_network
        data_mode: Data subset to use for evaluation
        track_summaries: Steps between Tensorboard summaries. Only used
            if steps is None
        steps: Number of batches the model is evaluated on. If not None,
            a single summary is created at the end. If None, the model
            is evaluated until the input queue is exhausted and it
            tracks a summary periodically.
    Returns
        loss: Average loss, as given by the run settings
        metrics: Average metrics values, as given by the run settings
    """
    # Create reader to get TFRecords
    reader = DataReader(dataset)

    with tf.Graph().as_default() as g:
        # Read data and define operations
        data, labels = reader.read_batch(batch_size,
                                         data_mode,
                                         mem_dequeing,
                                         workers,
                                         shuffle=True,
                                         train_mode=False)
        main_run = self._build_run_settings(data, labels, metrics)

        # Prepare logging for Tensorboard
        saver, summary_ops, writer = mu.prepare_logging(logs_path, g)

        # Supervisor is only used to manage the session
        supervisor = tf.train.Supervisor(graph=g)
        session_conf = mu.get_session_config(gpu_frac, log_placement=False)

        with supervisor.managed_session(config=session_conf) as sess:
            # Load model, if existing. Otherwise start from scratch
            step_value, _ = self._initialize_model(sess,
                                                   saver=saver,
                                                   is_training=False)

            queues_started = False
            try:
                # Let queues start dequeing examples
                coord, threads = mu.initialize_queues(sess)
                queues_started = True

                # Number of batches evaluated so far
                batches_done = 0
                stop = steps is not None and batches_done >= steps

                while not stop:
                    # Run evaluation
                    res = main_run.test_run(sess,
                                            summary_ops,
                                            step_value,
                                            data_mode=data_mode,
                                            log=True)

                    if steps is None:
                        # Periodic storage of summaries, indexed by the
                        # zero-based position of the batch just evaluated
                        if batches_done % track_summaries == 0:
                            mu.store_summaries(writer, batches_done,
                                               res.summary_str)
                        batches_done += 1
                    else:
                        # Count the batch before comparing so exactly
                        # `steps` batches are evaluated, not steps + 1
                        batches_done += 1
                        if batches_done >= steps:
                            # Reached max steps, store summary and stop
                            main_run.manual_log(writer, step_value)
                            stop = True
            except tf.errors.OutOfRangeError:
                # Expected way out when evaluating the whole dataset
                logger.info('Queue run out of evaluation instances')
            finally:
                # Stop the queue threads on every exit path, including
                # input exhaustion, so they are not left running.
                if queues_started:
                    try:
                        mu.finalize_queues(coord, threads)
                    except RuntimeError as e:
                        logger.warning('Error stopping coordinator: %s', e)

        return main_run.loss_average(), main_run.metrics_average()
FLAGS = tf.app.flags.FLAGS if __name__ == '__main__': with tf.Session() as sess: # Airbnb settings dataset = AirbnbSettings(dataset_location=FLAGS.data_location, image_specs=get_alexnet_specs( FLAGS.batch_size, random_crop=True)) # Read batches from dataset reader = DataReader(dataset) features, label = reader.read_batch( batch_size=FLAGS.batch_size, data_mode=DataMode.TRAINING, # Use whatever here, e.g. training memory_factor=FLAGS.memory_factor, reader_threads=FLAGS.reader_threads, train_mode=False) # Initi all vars sess.run(tf.global_variables_initializer()) sess.run(tf.local_variables_initializer()) # Define coordinator to handle all threads coord = tf.train.Coordinator() threads = tf.train.start_queue_runners(coord=coord, sess=sess) example, l = sess.run([features, label]) # Print first instance in batch idx = 0