Example #1
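The `__init__` method of an `ActorLearner` for a value-learning (DQN-style) agent. It resets the default graph, stores the run configuration, builds a learning network and a target network, delegates construction of the training operation to `q_network.train_operation`, and sets up the session, saver, and gradient summaries.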
    def __init__(self, network_creator, environment_creator, args):

        super(ActorLearner, self).__init__()

        # Start from a clean default graph so repeated construction in the
        # same process does not accumulate ops.
        tf.reset_default_graph()

        self.global_step = 0

        self.environment_creator = environment_creator
        self.network_creator = network_creator

        self.n_steps = args.n_steps
        self.state_shape = args.state_shape
        self.num_actions = args.num_actions
        self.initial_lr = args.initial_lr
        self.lr_annealing_steps = args.lr_annealing_steps
        self.n_emulators_per_emulator_runner = args.n_emulators_per_emulator_runner
        self.n_emulator_runners = args.n_emulator_runners
        self.device = args.device
        self.debugging_folder = args.debugging_folder
        self.network_checkpoint_folder = os.path.join(self.debugging_folder,
                                                      'checkpoints/')
        self.optimizer_checkpoint_folder = os.path.join(
            self.debugging_folder, 'optimizer_checkpoints/')
        self.last_saving_step = 0
        self.summary_writer = tf.summary.FileWriter(
            os.path.join(self.debugging_folder, 'tf'))

        self.max_global_steps = args.max_global_steps
        self.gamma = args.gamma
        self.game = args.game

        self.arch = args.arch
        # Learning network plus a target network that is periodically
        # synchronized with it (every target_update_freq steps).
        self.network = network_creator(name='value_learning')
        self.target_network = network_creator(name='value_target',
                                              learning_network=self.network)
        self.target_update_freq = args.target_update_freq

        # q_network.train_operation (not shown here) builds the optimizer and
        # returns the train op, the flattened raw and clipped gradients, the
        # gradient global norm, and the learning-rate tensor.
        (self.train_step, flat_raw_gradients, flat_clipped_gradients,
         global_norm, self.learning_rate) = q_network.train_operation(
             self.network, args)

        config = tf.ConfigProto()
        if 'gpu' in self.device:
            logger.debug('Dynamic gpu mem allocation')
            config.gpu_options.allow_growth = True
        self.session = tf.Session(config=config)

        self.network_saver = tf.train.Saver()

        # Summaries
        variable_summaries(flat_raw_gradients, 'raw_gradients')
        variable_summaries(flat_clipped_gradients, 'clipped_gradients')
        tf.summary.scalar('global_norm', global_norm)
        tf.summary.scalar("Weighted_TD_loss", self.network.weighted_td_loss)
Example #2
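A variant of the `ActorLearner` constructor that takes an exploration policy and creates its emulators up front. Unlike Example #1, it builds the RMSProp optimizer and the gradient-clipping graph inline rather than delegating to a helper, and it additionally saves the optimizer variables with a dedicated `Saver`.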
    def __init__(self, network_creator, environment_creator, explo_policy,
                 args):

        super(ActorLearner, self).__init__()

        # Folder and debug settings
        self.checkpoint_interval = args.checkpoint_interval
        self.debugging_folder = args.debugging_folder
        self.network_checkpoint_folder = os.path.join(self.debugging_folder,
                                                      'checkpoints/')
        self.optimizer_checkpoint_folder = os.path.join(
            self.debugging_folder, 'optimizer_checkpoints/')
        self.last_saving_step = 0
        self.device = args.device

        # Reinforcement learning settings
        self.game = args.game
        self.global_step = 0
        self.max_global_steps = args.max_global_steps
        self.max_local_steps = args.max_local_steps
        self.num_actions = args.num_actions

        self.explo_policy = explo_policy

        self.gamma = args.gamma
        self.initial_lr = args.initial_lr
        self.lr_annealing_steps = args.lr_annealing_steps

        self.emulator_counts = args.emulator_counts
        # Instantiate all emulators up front, one per parallel environment.
        self.emulators = np.asarray([
            environment_creator.create_environment(i)
            for i in range(self.emulator_counts)
        ])
        self.network = network_creator()

        with tf.name_scope('Optimizer'):
            self.learning_rate = tf.placeholder(tf.float32,
                                                shape=[],
                                                name='lr')
            # Optimizer
            optimizer_variable_names = 'OptimizerVariables'
            self.optimizer = tf.train.RMSPropOptimizer(
                self.learning_rate,
                decay=args.alpha,
                epsilon=args.e,
                name=optimizer_variable_names)
            grads_and_vars = self.optimizer.compute_gradients(
                self.network.loss)
            self.flat_raw_gradients = tf.concat(
                [tf.reshape(g, [-1]) for g, v in grads_and_vars], axis=0)

            # grads_and_vars is not an operation but a list of
            # (gradient, variable) pairs; calling run() on the gradient
            # Tensors evaluates them.
            if args.clip_norm_type == 'ignore':
                # Unclipped gradients
                global_norm = tf.global_norm([g for g, v in grads_and_vars],
                                             name='global_norm')
            elif args.clip_norm_type == 'global':
                # Clip network grads by network norm
                clipped_gradients, norm = tf.clip_by_global_norm(
                    [g for g, v in grads_and_vars], args.clip_norm)
                global_norm = tf.identity(norm, name='global_norm')
                grads_and_vars = list(
                    zip(clipped_gradients, [v for g, v in grads_and_vars]))
            elif args.clip_norm_type == 'local':
                # Clip layer grads by layer norm
                gradients = [
                    tf.clip_by_norm(g, args.clip_norm)
                    for g, v in grads_and_vars
                ]
                grads_and_vars = list(
                    zip(gradients, [v for g, v in grads_and_vars]))
                global_norm = tf.global_norm([g for g, v in grads_and_vars],
                                             name='global_norm')
            else:
                raise ValueError('Norm type not recognized')
            self.flat_clipped_gradients = tf.concat(
                [tf.reshape(g, [-1]) for g, v in grads_and_vars], axis=0)

            self.train_step = self.optimizer.apply_gradients(grads_and_vars)

        config = tf.ConfigProto(allow_soft_placement=True)
        if 'gpu' in self.device:
            logging.debug('Dynamic gpu mem allocation')
            config.gpu_options.allow_growth = True

        self.session = tf.Session(config=config)
        self.summary_writer = tf.summary.FileWriter(
            os.path.join(self.debugging_folder, 'tf'), self.session.graph)

        self.network_saver = tf.train.Saver()

        self.optimizer_variables = [
            var for var in tf.global_variables()
            if optimizer_variable_names in var.name
        ]
        self.optimizer_saver = tf.train.Saver(self.optimizer_variables,
                                              max_to_keep=1,
                                              name='OptimizerSaver')

        # Summaries
        variable_summaries(self.flat_raw_gradients, 'raw_gradients')
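Both constructors store `initial_lr` and `lr_annealing_steps`, and Example #2 feeds the learning rate through the `lr` placeholder on every update. A minimal sketch of the linear annealing this setup suggests (the function name and feed pattern are assumptions, not part of the examples):

    def get_lr(global_step, initial_lr, lr_annealing_steps):
        # Linearly anneal from initial_lr to zero over
        # lr_annealing_steps, then stay at zero.
        remaining = max(lr_annealing_steps - global_step, 0)
        return initial_lr * remaining / float(lr_annealing_steps)

    # Fed into the graph on each training step, e.g.:
    # session.run(train_step, feed_dict={learning_rate: get_lr(...), ...})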